航空发动机寿命预测

作者: AI信仰者 | 来源:发表于2023-03-07 20:19 被阅读0次

    航空发动机寿命预测

    该数据集的挑战在于,数据来自50或60米高的气象塔。 每个塔都有多个风速计,一个风向标和一个温度传感器。 每个传感器在10分钟内测量数据并报告10分钟内的平均值,标准偏差,最小值和最大值。 通常情况下,最高两级有一对风速计(例如60米高的塔上位于59米和49米处),然后是30米和/或10米处的单个或配对风速计。

    a)训练数据集
    提供的数据文件中有两个结构,第一个是“Train_EngineRun”,其中包含260台发动机。每台发动机对应一份完整的生命周期数据,从全新状态直到退役。数据包含24列,每行对应一次给定的航班。每次飞行数据都采集自六种飞行状态中的一种,飞行状态标签也有提供。“Variable_List”的结构包含对应于24列的传感器名称。每个发动机的行数(航班数)都不相同,因为一些发动机的寿命比其他发动机短或长。

    b)测试数据集
    测试数据文件以相同的方式配置,但每个引擎单元只会有一些降级,并且还有额外的航班,直到它应该退役。其目标是为每台发动机确定剩余寿命(航班数量)。测试数据集中有100个引擎,因此所有100个引擎都应该有一个寿命估算。

    数据文件已上传到我的下载:
    数据文件

    数据集来源:工业大数据产业创新平台
    需要登录注册后到数据集页面下载

    该平台收录了多种行业场景,包括加工制造、轨道交通、能源电力、半导体等行业,从不同层级收录了包括部件级、设备级、产线级的数据。

    简单思路如下:

    1. 将训练数据和标签连接起来
    2. 训练集与测试集共同进行归一化
    3. 特征选择,筛选掉不重要的特征
    4. 将svr、神经网络、岭回归、lgb等模型融合在一起,提高算法准确率

    优化方法:

    • 使用时间序列模型LSTM试试
    • 特征工程部分多多分析与优化、观察训练集与测试集是否在同一分布
    • 了解航空发动机行业背景知识,或许可以加入一些其他特征去优化

    训练和预测代码1:

    # -*- coding: utf-8 -*-
    import os
    
    import lightgbm
    import numpy as np
    import pandas as pd
    import xgboost
    from keras.layers import Dense
    from keras.models import Sequential
    from keras.wrappers.scikit_learn import KerasRegressor
    from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
    from sklearn.kernel_ridge import KernelRidge
    from sklearn.linear_model import ElasticNet, Lasso, LinearRegression
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import KFold, cross_val_score
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.svm import SVR
    
    def pdReadCsv(file, sep):
        """Read a delimited text file into a DataFrame, trying UTF-8 and then GBK.

        Args:
            file: Path of the file to read.
            sep: Field delimiter passed to ``pandas.read_csv``.

        Returns:
            pandas.DataFrame: The parsed table; rows with too many fields are
            skipped instead of aborting the read.

        Raises:
            UnicodeDecodeError: If the file decodes as neither UTF-8 nor GBK.
        """
        # ``error_bad_lines=False`` was removed in pandas 2.0; ``on_bad_lines='skip'``
        # (pandas >= 1.3) is its replacement and keeps the skip-bad-rows behaviour.
        options = dict(sep=sep, on_bad_lines='skip', engine='python')
        try:
            return pd.read_csv(file, encoding='utf-8', **options)
        except UnicodeDecodeError:
            # Only a decoding failure justifies the GBK retry; any other error
            # (missing file, bad arguments) should surface unchanged.
            return pd.read_csv(file, encoding='gbk', **options)
    
    # Switch to the data directory so the relative CSV names used in main() resolve.
    # The raw string ends in two backslashes because a raw literal cannot end in a
    # single one; the resulting path therefore carries a doubled trailing separator.
    os.chdir(r'E:\项目文件\航空发动机寿命预测\data\\')
    # Project root directory; not referenced by the rest of this script as shown.
    src = r'E:\项目文件\航空发动机寿命预测\\'
    
    # Seed passed to the XGBoost regressor and to the KFold built in rmsle_cv().
    seed = 2018
    
    
    # Stacking
    class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
        """Two-level stacking regressor trained on out-of-fold predictions.

        Every base model is cloned and fitted once per fold; its predictions on
        the held-out part of each fold become one column of the meta-features on
        which a clone of ``meta_model`` is trained.

        Args:
            base_models: Sequence of unfitted regressors used as the first level.
            meta_model: Unfitted regressor trained on the out-of-fold predictions.
            n_folds: Number of K-fold splits used to build the meta-features.
            random_state: Optional seed for the fold shuffling. ``None`` (the
                default) keeps the previous unseeded behaviour.
        """

        def __init__(self, base_models, meta_model, n_folds=5, random_state=None):
            self.base_models = base_models
            self.meta_model = meta_model
            self.n_folds = n_folds
            self.random_state = random_state

        def fit(self, X, y):
            """Fit per-fold clones of the base models, then the meta model.

            Args:
                X: Feature matrix (ndarray or DataFrame), one row per sample.
                y: Target values (ndarray or Series).

            Returns:
                self
            """
            # Fold indices are positional, so pandas input must be converted;
            # ``X[train_index]`` on a DataFrame would be a column lookup.
            X = np.asarray(X)
            y = np.asarray(y)
            self.base_models_ = [[] for _ in self.base_models]
            self.meta_model_ = clone(self.meta_model)
            kfold = KFold(n_splits=self.n_folds, shuffle=True,
                          random_state=self.random_state)

            # One column of out-of-fold predictions per base model.
            out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
            for i, base_model in enumerate(self.base_models):
                for train_index, holdout_index in kfold.split(X, y):
                    instance = clone(base_model)
                    self.base_models_[i].append(instance)
                    instance.fit(X[train_index], y[train_index])
                    # Flatten so estimators returning (n, 1) or 0-d output still fit the column.
                    y_pred = np.ravel(instance.predict(X[holdout_index]))
                    out_of_fold_predictions[holdout_index, i] = y_pred

            # The meta model learns to combine the base models' held-out predictions.
            self.meta_model_.fit(out_of_fold_predictions, y)
            return self

        def predict(self, X):
            """Predict by averaging each base model's fold clones and feeding the meta model.

            Args:
                X: Feature matrix (ndarray or DataFrame) with the columns used in ``fit``.

            Returns:
                The meta model's predictions for ``X``.
            """
            # Same array form as in fit(), so the clones see consistent input.
            X = np.asarray(X)
            meta_features = np.column_stack([
                np.column_stack([np.ravel(model.predict(X)) for model in fold_models]).mean(axis=1)
                for fold_models in self.base_models_])
            return self.meta_model_.predict(meta_features)
    
    
    # 简单模型融合
    class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
        """Ensemble regressor that returns the unweighted mean of its members' predictions."""

        def __init__(self, models):
            self.models = models

        def fit(self, X, y):
            """Clone every configured model, fit each clone on ``(X, y)`` and return self."""
            self.models_ = list(map(clone, self.models))
            for fitted in self.models_:
                fitted.fit(X, y)
            return self

        def predict(self, X):
            """Return the row-wise mean of all fitted clones' predictions for ``X``."""
            per_model = [member.predict(X) for member in self.models_]
            stacked = np.column_stack(per_model)
            return stacked.mean(axis=1)
    
    
    def build_nn(input_dim=18):
        """Build and compile the dense network used as the Keras regressor.

        Args:
            input_dim: Number of input features. Defaults to 18, the feature
                count that was previously hard-coded.

        Returns:
            A compiled ``Sequential`` model (MSE loss, Adam optimizer) with layer
            widths 128 -> 32 -> 8 -> 1.
        """
        # NOTE(review): every layer uses a linear activation, so the whole stack
        # is a composition of linear maps. Kept as-is to preserve behaviour.
        model = Sequential()
        model.add(Dense(units=128, activation='linear', input_dim=input_dim))
        for width in (32, 8, 1):
            model.add(Dense(units=width, activation='linear'))
        model.compile(loss='mse', optimizer='adam')
        return model
    
    
    def build_model():
        """Create the unfitted regressors used by the training script.

        Returns:
            tuple: ``(svr, line, lasso, ENet, KRR1, KRR2, lgbm, xgb, nn)`` where
            the first four are single-step pipelines (linear SVR, ordinary least
            squares, Lasso, ElasticNet), ``KRR1``/``KRR2`` are kernel ridge
            models (polynomial / linear kernel), ``lgbm`` and ``xgb`` are
            gradient-boosting regressors and ``nn`` wraps ``build_nn``.
        """
        svr = make_pipeline(SVR(kernel='linear'))
        line = make_pipeline(LinearRegression())
        lasso = make_pipeline(Lasso(alpha=0.0005, random_state=1))
        ENet = make_pipeline(ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
        # Alternative tried for KRR1: LinearSVR(C=2)
        KRR1 = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
        KRR2 = KernelRidge(alpha=1.5, kernel='linear', degree=2, coef0=2.5)
        # Alternative tried for lgbm: ExtraTreesRegressor(criterion='mse', n_estimators=500, max_depth=38)
        lgbm = lightgbm.LGBMRegressor(learning_rate=0.01, n_estimators=500, num_leaves=31)
        xgb = xgboost.XGBRegressor(booster='gbtree', colsample_bytree=0.8, gamma=0.1,
                                   learning_rate=0.02, max_depth=5,
                                   n_estimators=500, min_child_weight=0.8,
                                   reg_alpha=0, reg_lambda=1, subsample=0.8,
                                   random_state=seed, nthread=2)
        # Keras 2 (the API this file uses, e.g. ``Dense(units=...)``) names the
        # argument ``epochs``. The scikit-learn wrapper only forwards keywords
        # found in ``Sequential.fit``'s signature, so the old ``nb_epoch=500``
        # was dropped and the network trained for the default single epoch.
        nn = KerasRegressor(build_fn=build_nn, epochs=500, batch_size=32, verbose=2)
        return svr, line, lasso, ENet, KRR1, KRR2, lgbm, xgb, nn
    
    
    def rmsle_cv(model=None, X_train_head=None, y_train=None):
        """Return the per-fold RMSE of ``model`` under shuffled 5-fold cross-validation.

        Args:
            model: Unfitted scikit-learn compatible regressor.
            X_train_head: Training feature matrix.
            y_train: Training targets.

        Returns:
            numpy.ndarray: One root-mean-squared-error value per fold.
        """
        n_folds = 5
        # Pass the splitter itself. ``KFold(...).get_n_splits(X)`` returns the
        # integer 5, which made cross_val_score build its own unshuffled,
        # unseeded folds and silently ignore ``shuffle`` and ``random_state``.
        kf = KFold(n_folds, shuffle=True, random_state=seed)
        mse = -cross_val_score(model, X_train_head, y_train,
                               scoring="neg_mean_squared_error", cv=kf)
        # Callers label this value "rmse", so take the root of the fold MSE.
        return np.sqrt(mse)
    
    
    def _top_scored_columns(frame, feature_scoring, k):
        """Return the columns of ``frame`` named among the ``k`` best-scored features."""
        best = feature_scoring.sort_values('score', ascending=False).head(k)['feature']
        return frame[frame.columns[frame.columns.isin(best)]]

    def _report_cv(template, model, X, y):
        """Cross-validate ``model`` via rmsle_cv() and print mean and std with ``template``."""
        score = rmsle_cv(model, X, y)
        print(template.format(score.mean(), score.std()))

    def main():
        """Load the labelled CSVs, scale and select features, then evaluate all models.

        Reads ``train_label.csv`` and ``test_label.csv`` from the current
        directory, scales train and test together, ranks features with
        ``f_regression``, prints cross-validation scores for each base model,
        the averaging ensemble and the training-set MSE of the stacked model.
        Nothing is returned and no prediction file is written.
        """
        print("Load data from file......")
        file = 'train_label.csv'
        # Alternative training file: 'download_label.csv'
        test_file = 'test_label.csv'
        X_test = pdReadCsv(test_file, ',').drop(columns=["life"])
        train = pdReadCsv(file, ',')
        X_train, y_train = train.drop(columns=["life"]), train["life"]
        print("X_train shape", X_train.shape)
        print("X_test shape", X_test.shape)
        print("y_train shape", y_train.shape)
        all_data = pd.concat([X_train, X_test])
        print(all_data.shape)
        print("Load done.")

        # Scaling: min-max to [0, 1] first, then standardisation, both fitted on
        # train and test together (the article's stated approach).
        from sklearn import preprocessing
        scaler = MinMaxScaler(feature_range=(0, 1))
        all_data = pd.DataFrame(scaler.fit_transform(all_data), columns=all_data.columns)
        print("Scale done.")
        scaled = pd.DataFrame(preprocessing.scale(all_data), columns=all_data.columns)
        # ``scaled`` has a fresh 0..n-1 index, so a positional split recovers the parts.
        n_train = len(X_train)
        X_train = scaled.iloc[:n_train]
        X_test = scaled.iloc[n_train:]

        # Feature selection: score every feature, then keep the best-scored ones.
        from sklearn.feature_selection import SelectKBest
        from sklearn.feature_selection import f_regression
        X_scored = SelectKBest(score_func=f_regression, k='all').fit(X_train, y_train)
        feature_scoring = pd.DataFrame({
            'feature': X_train.columns,
            'score': X_scored.scores_
        })
        X_train_head = _top_scored_columns(X_train, feature_scoring, 18)
        # Only its shape is printed below; no test-set prediction is made here.
        X_test_head = _top_scored_columns(X_test, feature_scoring, 18)
        print(X_train_head.shape)
        print(y_train.shape)
        print(X_test_head.shape)

        print("Start training......")
        svr, line, lasso, ENet, KRR1, KRR2, lgbm, xgb, nn = build_model()
        _report_cv("SVR rmse: {:.4f} 标准差: {:.4f}\n", svr, X_train_head, y_train)
        svr.fit(X_train_head, y_train)
        _report_cv("Line rmse: {:.4f} 标准差: {:.4f}\n", line, X_train_head, y_train)
        _report_cv("Lasso rmse: {:.4f} 标准差: {:.4f}\n", lasso, X_train_head, y_train)
        _report_cv("ElasticNet rmse: {:.4f} 标准差: {:.4f}\n", ENet, X_train_head, y_train)
        _report_cv("Kernel Ridge1 rmse: {:.4f} 标准差: {:.4f}\n", KRR1, X_train_head, y_train)
        _report_cv("Kernel Ridge2 rmse: {:.4f} 标准差: {:.4f}\n", KRR2, X_train_head, y_train)
        KRR2.fit(X_train_head, y_train)

        # NOTE(review): the tree models are cross-validated on the top 22
        # features but fitted on the 18-feature matrix. Left unchanged because
        # it is unclear which of the two was intended.
        X_train_head3 = _top_scored_columns(X_train, feature_scoring, 22)
        _report_cv("Xgboost rmse: {:.4f} 标准差: {:.4f}\n", xgb, X_train_head3, y_train)
        xgb.fit(X_train_head, y_train)

        X_train_head4 = _top_scored_columns(X_train, feature_scoring, 22)
        _report_cv("LGBM 得分: {:.4f} ({:.4f})\n", lgbm, X_train_head4, y_train)
        lgbm.fit(X_train_head, y_train)

        X_train_head5 = _top_scored_columns(X_train, feature_scoring, 18)
        _report_cv("NN 得分: {:.4f} ({:.4f})\n", nn, X_train_head5, y_train)
        nn.fit(X_train_head, y_train)

        # Simple averaging ensemble of four base models.
        averaged_models = AveragingModels(models=(svr, KRR2, lgbm, nn))
        _report_cv("对基模型集成后的得分: {:.4f} ({:.4f})\n", averaged_models, X_train_head, y_train)
        averaged_models.fit(X_train_head, y_train)

        # Stacking: same base models with XGBoost as meta model.
        stacking_models = StackingAveragedModels(base_models=(svr, KRR2, lgbm, nn), meta_model=xgb)
        stacking_models.fit(X_train_head.values, y_train.values)
        # Predict on the same ndarray form the stacker was fitted on.
        stacked_train_pred = stacking_models.predict(X_train_head.values)
        # This is the MSE on the training rows themselves, not a held-out score.
        score = mean_squared_error(y_train.values, stacked_train_pred)
        print("Stacking Averaged models predict score: {:.4f}".format(score))
    
    
    # Run the pipeline only when executed as a script, so that importing this
    # module does not start a full training run.
    if __name__ == "__main__":
        main()
    
    

    训练和预测代码2:

    #!/usr/bin/env Python
    # coding=utf-8
    import warnings
    
    from sklearn.decomposition import PCA
    from sklearn.feature_selection import SelectKBest, mutual_info_regression
    from sklearn.model_selection import train_test_split
    
    from Aero_engine_life.data_model import get_train, build_model_lgb, build_model_etr, build_model_rf, write_mse, \
        score_model
    
    # Silence UserWarning messages whose text contains "MATPLOTLIBDATA"
    # ("(?s)" lets "." match across line breaks in the message).
    warnings.filterwarnings("ignore", "(?s).*MATPLOTLIBDATA.*", category=UserWarning)
    import numpy as np
    
    import pandas as pd
    from sklearn.metrics import mean_squared_error
    
    # Seed NumPy's global random generator before any model is built.
    # NOTE(review): presumably this is what makes the estimators created without
    # an explicit random_state repeatable — confirm.
    np.random.seed(2020)
    
    
    for k in [20]:
        print(k)
        X_data, Y_data = get_train()
        # Keep the k features ranked best by mutual information, then apply PCA
        # with the same number of components.
        selector = SelectKBest(mutual_info_regression, k=k)
        X_data = selector.fit_transform(X_data, Y_data)
        pca = PCA(n_components=k)
        X_data = pca.fit_transform(X_data)
        x_train, x_val, y_train, y_val = train_test_split(
            X_data, Y_data, test_size=0.02, random_state=20)

        # Stacking, level 1: fit the three base models (lgb, etr, rf in that order).
        model_lgb = build_model_lgb(x_train, y_train)
        model_etr = build_model_etr(x_train, y_train)
        model_rf = build_model_rf(x_train, y_train)

        val_lgb = model_lgb.predict(x_val)
        val_etr = model_etr.predict(x_val)
        val_rf = model_rf.predict(x_val)

        train_etr_pred = model_etr.predict(x_train)
        train_lgb_pred = model_lgb.predict(x_train)
        train_rf_pred = model_rf.predict(x_train)

        # Report and log each base model's training-set MSE.
        for tag, fitted_values in (('etr', train_etr_pred),
                                   ('lgb', train_lgb_pred),
                                   ('rf', train_rf_pred)):
            train_mse = mean_squared_error(y_train, fitted_values)
            print(tag + '训练集,mse:', train_mse)
            write_mse(tag, '训练集', train_mse)

        # The base-model predictions become the features of the second level.
        Stacking_X_train = pd.DataFrame({
            'Method_1': train_rf_pred,
            'Method_2': train_lgb_pred,
            'Method_3': train_etr_pred,
        })
        Stacking_X_val = pd.DataFrame({
            'Method_1': val_rf,
            'Method_2': val_lgb,
            'Method_3': val_etr,
        })

        # Level 2: an extra-trees model trained on the level-1 predictions.
        model_Stacking = build_model_etr(Stacking_X_train, y_train)

        for features, target, data_type in ((Stacking_X_train, y_train, '训练集'),
                                            (Stacking_X_val, y_val, '验证集')):
            stacked_pred = model_Stacking.predict(features)
            score_model(features, target, stacked_pred, model_Stacking, data_type)
    
    

    模型文件:

    import os
    
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    from lightgbm import LGBMRegressor
    from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
    from sklearn.model_selection import GridSearchCV
    
    from utils.read_write import writeOneCsv, pdReadCsv
    
    # Switch to the data directory so get_train() can open the CSV by bare name.
    os.chdir(r'E:\项目文件\航空发动机寿命预测\data\\')
    # Project root; the functions below append '调参记录.csv' to it to locate the tuning log.
    src = r'E:\项目文件\航空发动机寿命预测\\'
    
    
    def get_train():
        """Load ``train_label.csv`` and split it into features and target.

        Returns:
            tuple: ``(features, target)`` where ``features`` holds every column
            from the fourth up to but excluding the last, and ``target`` is the
            last column flattened to one dimension.
        """
        # Other inputs used during experiments: 'download_label.csv', 'test_label.csv'
        file = 'train_label.csv'
        table = pdReadCsv(file, ',').values
        features = table[:, 3:-1]
        target = table[:, -1:].ravel()
        return features, target
    
    
    def build_model_rf(x_train, y_train):
        """Grid-search a random-forest regressor and log the chosen parameters.

        Args:
            x_train: Training feature matrix.
            y_train: Training targets.

        Returns:
            The fitted ``GridSearchCV`` object (3-fold), usable directly for
            ``predict`` and ``score``.
        """
        # ``criterion='mse'`` was renamed to 'squared_error' in scikit-learn 1.0
        # and the old spelling removed in 1.2. Squared error is the default
        # loss, so omitting the argument works on old and new releases alike.
        estimator = RandomForestRegressor()
        # With a step of 9 each range yields a single value (33 and 73).
        param_grid = {
            'max_depth': range(33, 35, 9),
            'n_estimators': range(73, 77, 9),
        }
        model = GridSearchCV(estimator, param_grid, cv=3)
        model.fit(x_train, y_train)
        print('rf')
        print(model.best_params_)
        writeParams('rf', model.best_params_)
        return model
    
    
    def build_model_etr(x_train, y_train):
        """Grid-search an extra-trees regressor and log the chosen parameters.

        Args:
            x_train: Training feature matrix.
            y_train: Training targets.

        Returns:
            The fitted ``GridSearchCV`` object (default fold count), usable
            directly for ``predict`` and ``score``.
        """
        # ``criterion='mse'`` was renamed to 'squared_error' in scikit-learn 1.0
        # and the old spelling removed in 1.2. Squared error is the default
        # loss, so omitting the argument works on old and new releases alike.
        estimator = ExtraTreesRegressor()
        # ``n_estimators`` is the number of trees in the ensemble. With a step
        # of 9 each range yields a single value (33 and 96).
        param_grid = {
            'max_depth': range(33, 39, 9),
            'n_estimators': range(96, 99, 9),
        }
        model = GridSearchCV(estimator, param_grid)
        model.fit(x_train, y_train)
        print('etr')
        print(model.best_params_)
        writeParams('etr', model.best_params_)
        return model
    
    
    def build_model_lgb(x_train, y_train):
        """Grid-search a LightGBM regressor, log the chosen parameters and return the search.

        Args:
            x_train: Training feature matrix.
            y_train: Training targets; flattened with ``ravel()`` before fitting.

        Returns:
            The fitted ``GridSearchCV`` object.
        """
        # With a step of 9 each range yields a single value (77 and 59).
        search_space = dict(
            learning_rate=[0.1],
            n_estimators=range(77, 78, 9),
            num_leaves=range(59, 66, 9),
        )
        search = GridSearchCV(LGBMRegressor(), search_space)
        search.fit(x_train, y_train.ravel())
        chosen = search.best_params_
        print('lgb')
        print(chosen)
        writeParams('lgb', chosen)
        return search
    
    
    def scatter_line(y_val, y_pre):
        """Plot observed values as red points and predictions as an orange line.

        Args:
            y_val: Observed target values.
            y_pre: Predicted values, drawn against the same sample positions.
        """
        from matplotlib import pyplot as plt
        positions = range(len(y_val))
        plt.scatter(positions, y_val, color="red", label="Sample Point", linewidth=3)
        plt.plot(positions, y_pre, color="orange", label="Fitting Line", linewidth=2)
        plt.legend()
        plt.show()
    
    
    def score_model(train, test, predict, model, data_type):
        """Print R^2, MAE and MSE for a model and append each to the tuning log.

        Args:
            train: Feature matrix passed to ``model.score``.
            test: True target values (despite the parameter name).
            predict: Predictions to compare against ``test``.
            model: Fitted estimator exposing ``score``.
            data_type: Label for the data split, used in the output and log rows.
        """
        log_path = src + '调参记录.csv'
        # Each metric is computed lazily so it is evaluated, printed and logged
        # in turn, exactly one after the other.
        metrics = (
            ('R^2', lambda: round(model.score(train, test), 6)),
            ('MAE', lambda: mean_absolute_error(test, predict)),
            ('MSE', lambda: mean_squared_error(test, predict)),
        )
        for label, compute in metrics:
            value = compute()
            print(data_type + "," + label + ",", value)
            writeOneCsv(['staking', data_type, label, value], log_path)
    
    
    def writeParams(model, best):
        """Append the best grid-search parameters of ``model`` to the tuning log.

        Args:
            model: Model tag; ``'lgb'`` selects the LightGBM column layout, any
                other value the tree-ensemble layout.
            best: Mapping of best parameters as returned by the grid search.
        """
        if model == 'lgb':
            row = [model, best['num_leaves'], best['n_estimators'], best['learning_rate']]
        else:
            # Tree ensembles have no learning rate; the last column is filled with 0.
            row = [model, best['max_depth'], best['n_estimators'], 0]
        writeOneCsv(row, src + '调参记录.csv')
    
    
    def write_mse(model, data_type, mse):
        """Append one ``[model, data_type, 'mse', mse]`` row to the tuning log."""
        record = [model, data_type, 'mse', mse]
        log_path = src + '调参记录.csv'
        writeOneCsv(record, log_path)
    
    

    如果你是从平台下载的原始数据,就需要先经过下面的数据处理

    import os
    
    import pandas as pd
    
    from utils.read_write import pdReadCsv
    
    # Switch to the data directory; join_data() reads and writes CSVs by bare name.
    os.chdir(r'E:\项目文件\航空发动机寿命预测\data\\')
    # Project root directory; not referenced by the rest of this script as shown.
    src = r'E:\项目文件\航空发动机寿命预测\\'
    
    
    def join_data():
        """Merge the raw train/download/test tables with the label table and save them.

        All four raw CSVs are read first. Each data table is then joined to the
        labels on the ``Number`` column and written as ``train_label.csv``,
        ``download_label.csv`` and ``test_label.csv`` in the current directory.
        """
        sources = (
            ('train', 'Dataset_Aero_engine_life_prediction_train_2020_09_05.csv'),
            ('label', 'Dataset_Aero_engine_life_prediction_label_2020_09_05.csv'),
            ('download', 'Dataset_Aero_engine_life_prediction_download_2020_09_05.csv'),
            ('test', 'Dataset_Aero_engine_life_prediction_test_2020_09_05.csv'),
        )
        frames = {name: pdReadCsv(path, ',') for name, path in sources}
        labels = frames['label']
        # NOTE(review): to_csv is called with its defaults, so the row index is
        # written as a leading column — confirm that downstream readers expect it.
        for name in ('train', 'download', 'test'):
            merged = pd.merge(frames[name], labels, on='Number')
            merged.to_csv(name + '_label.csv')
    
    
    # Regenerate the merged CSVs only when executed as a script, so importing
    # this module does not rewrite the data files.
    if __name__ == "__main__":
        join_data()
    
    

    欢迎大家多多交流工业大数据创新应用

    相关文章

      网友评论

        本文标题:航空发动机寿命预测

        本文链接:https://www.haomeiwen.com/subject/pqcgldtx.html