航空发动机寿命预测
该数据集的挑战在于,数据来自50米或60米高的气象塔。每个塔都有多个风速计、一个风向标和一个温度传感器。每个传感器以10分钟为周期进行测量,并报告这10分钟内的平均值、标准偏差、最小值和最大值。通常情况下,最高的两个高度层各有一对风速计(例如60米塔上的59米和49米处),然后是30米和/或10米处的单个或成对风速计。
a)训练数据集
提供的数据文件中有两个结构,第一个是“Train_EngineRun”。这包含结构内的260个发动机。在每个结构中,该引擎的生命周期数据,从新引擎直到退役。数据包含24列,每行对应一个给定的航班。每次飞行数据都采集自六种飞行状态中的一种,飞行状态标签也有提供。“Variable_List”的结构包含对应于24列的传感器名称。每个发动机的行数(行程)都不相同,因为一些发动机的寿命比其他发动机短或长。
b)测试数据集
测试数据文件的结构与训练数据相同,但每台发动机只记录了部分退化过程的数据,在它应当退役之前还会有若干次额外的飞行。其目标是为每台发动机确定剩余寿命(剩余航班数量)。测试数据集中有100台发动机,因此这100台发动机都应该给出一个寿命估算。
数据文件已上传到我的下载:
数据文件
数据集来源:工业大数据产业创新平台
需要登录注册后到数据集页面下载
该平台收录了多种行业场景,包括加工制造、轨道交通、能源电力、半导体等行业,从不同层级收录了包括部件级、设备级、产线级的数据。
简单思路如下:
- 将训练数据和标签连接起来
- 训练集与测试集共同进行归一化
- 特征选择,筛选掉不重要的特征
- 将svr、神经网络、岭回归、lgb等模型融合在一起,提高算法准确率
优化方法:
- 使用时间序列模型LSTM试试
- 特征工程部分多多分析与优化、观察训练集与测试集是否在同一分布
- 了解航空发动机行业背景知识,或许可以加入一些其他特征去优化
训练和预测代码1:
# -*- coding: utf-8 -*-
import os
import lightgbm
import numpy as np
import pandas as pd
import xgboost
from keras.layers import Dense
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
def pdReadCsv(file, sep):
    """Read a delimited text file into a DataFrame, skipping malformed rows.

    The file is decoded as UTF-8 first; only if that fails with a decoding
    error is the read retried as GBK.

    Args:
        file: Path of the file handed to ``pandas.read_csv``.
        sep: Field delimiter.

    Returns:
        pandas.DataFrame holding the parsed rows.

    Raises:
        UnicodeDecodeError: If the file decodes as neither UTF-8 nor GBK.
        Exception: Any other error from ``pandas.read_csv`` (for example a
            missing file) propagates unchanged instead of triggering a
            pointless second read.
    """
    def _read(encoding):
        options = dict(sep=sep, encoding=encoding, engine='python')
        try:
            # Spelling used by pandas >= 1.3.
            return pd.read_csv(file, on_bad_lines='skip', **options)
        except TypeError:
            # Older pandas does not know ``on_bad_lines``; newer pandas (2.0+)
            # no longer accepts ``error_bad_lines``, hence this order.
            return pd.read_csv(file, error_bad_lines=False, **options)

    try:
        return _read('utf-8')
    except UnicodeDecodeError:
        return _read('gbk')
# Switch to the data directory so the CSV files can be opened by bare file name.
# NOTE: hard-coded, machine-specific Windows path.
os.chdir(r'E:\项目文件\航空发动机寿命预测\data\\')
# Project root directory (same machine-specific location, one level up).
src = r'E:\项目文件\航空发动机寿命预测\\'
# Seed used as random_state for the KFold splitter and the XGBoost regressor.
seed = 2018
# Stacking
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Two-level stacking regressor.

    Every base model is cloned and fitted once per cross-validation fold; the
    out-of-fold predictions of the base models become the training features
    of a cloned meta model.

    Args:
        base_models: Sequence of unfitted regressors (first level).
        meta_model: Unfitted regressor trained on the out-of-fold predictions.
        n_folds: Number of KFold splits used to build those predictions.
    """

    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        """Fit fold-wise clones of the base models, then the meta model.

        Args:
            X: Feature matrix (array-like or DataFrame), one row per sample.
            y: Target values (array-like or Series).

        Returns:
            self
        """
        # The fold indices below are positional. Indexing a DataFrame/Series
        # with them would select by label instead, so work on plain arrays.
        X = np.asarray(X)
        y = np.asarray(y)

        self.base_models_ = [[] for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True)

        # Train the cloned base models and collect their out-of-fold
        # predictions, which are what the meta model is trained on.
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(clf)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred

        # Now train the cloned meta model on the out-of-fold predictions.
        print(out_of_fold_predictions.shape)
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        """Predict with the meta model on fold-averaged base predictions.

        For each base model the predictions of its per-fold clones are
        averaged; the resulting columns are fed to the fitted meta model.
        """
        # Same array conversion as in fit, so the clones see the input type
        # they were trained on.
        X = np.asarray(X)
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in fold_models]).mean(axis=1)
            for fold_models in self.base_models_
        ])
        return self.meta_model_.predict(meta_features)
# Simple model blending
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Regressor that averages the predictions of several models.

    Args:
        models: Sequence of unfitted regressors; each one is cloned and
            fitted on the full training data.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Clone every configured model and fit each clone on (X, y)."""
        clones = []
        for prototype in self.models:
            clones.append(clone(prototype))
        self.models_ = clones
        for estimator in self.models_:
            estimator.fit(X, y)
        return self

    def predict(self, X):
        """Return the row-wise mean of all fitted models' predictions."""
        per_model = [estimator.predict(X) for estimator in self.models_]
        stacked = np.column_stack(per_model)
        return stacked.mean(axis=1)
def build_nn(input_dim=18):
    """Build and compile the dense network used as a regressor.

    The network is 128 -> 32 -> 8 -> 1 units, compiled with MSE loss and the
    Adam optimizer. Every layer uses a linear activation, so the stack
    composes affine maps only.

    Args:
        input_dim: Number of input features. Defaults to 18, the value that
            was previously hard-coded; it has to match the number of columns
            in the training matrix.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(units=128, activation='linear', input_dim=input_dim))
    model.add(Dense(units=32, activation='linear'))
    model.add(Dense(units=8, activation='linear'))
    model.add(Dense(units=1, activation='linear'))
    model.compile(loss='mse', optimizer='adam')
    return model
def build_model():
    """Create the unfitted candidate regressors.

    Returns:
        tuple: ``(svr, line, lasso, ENet, KRR1, KRR2, lgbm, xgb, nn)`` -
        linear SVR, ordinary least squares, Lasso, ElasticNet, a polynomial
        and a linear kernel ridge model, LightGBM, XGBoost and the Keras
        network wrapped for scikit-learn.
    """
    svr = make_pipeline(SVR(kernel='linear'))
    line = make_pipeline(LinearRegression())
    lasso = make_pipeline(Lasso(alpha=0.0005, random_state=1))
    ENet = make_pipeline(ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
    KRR1 = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
    KRR2 = KernelRidge(alpha=1.5, kernel='linear', degree=2, coef0=2.5)
    lgbm = lightgbm.LGBMRegressor(learning_rate=0.01, n_estimators=500, num_leaves=31)
    xgb = xgboost.XGBRegressor(booster='gbtree', colsample_bytree=0.8, gamma=0.1,
                               learning_rate=0.02, max_depth=5,
                               n_estimators=500, min_child_weight=0.8,
                               reg_alpha=0, reg_lambda=1, subsample=0.8,
                               random_state=seed, nthread=2)
    # ``epochs`` is the Keras 2 keyword. The former ``nb_epoch`` is the
    # Keras 1 spelling, which the Keras 2 scikit-learn wrapper accepts but
    # does not pass on to ``fit`` - training would then run for the default
    # number of epochs instead of 500.
    # NOTE(review): confirm against the installed Keras version.
    nn = KerasRegressor(build_fn=build_nn, epochs=500, batch_size=32, verbose=2)
    return svr, line, lasso, ENet, KRR1, KRR2, lgbm, xgb, nn
def rmsle_cv(model=None, X_train_head=None, y_train=None):
    """Cross-validate ``model`` with a shuffled, seeded 5-fold split.

    Args:
        model: Unfitted scikit-learn compatible regressor.
        X_train_head: Training feature matrix.
        y_train: Training targets.

    Returns:
        numpy.ndarray with one mean-squared-error value per fold. Despite the
        function name, no square root or logarithm is applied.
    """
    n_folds = 5
    # Hand the splitter itself to cross_val_score. Calling ``.get_n_splits``
    # here would reduce it to the integer 5 and silently drop both the
    # shuffling and the seed.
    kf = KFold(n_folds, shuffle=True, random_state=seed)
    mse = -cross_val_score(model, X_train_head, y_train, scoring="neg_mean_squared_error", cv=kf)
    return mse
def _top_k_columns(frame, feature_scoring, k):
    """Return the columns of ``frame`` whose features rank in the top ``k`` by score."""
    best = feature_scoring.sort_values('score', ascending=False).head(k)['feature']
    return frame[frame.columns[frame.columns.isin(best)]]


def main():
    """Load the data, scale it, select features, then evaluate and fit the models.

    Steps:
        1. Read ``train_label.csv`` / ``test_label.csv`` and split off the
           ``life`` target column.
        2. Scale train and test together (min-max to [0, 1], then
           standardization).
        3. Score every feature with ``f_regression`` and keep the best 18
           (linear models, network, ensembles) or 22 (XGBoost, LightGBM).
        4. Print 5-fold cross-validation scores for each model, then for the
           averaged ensemble, and finally the training-set MSE of the
           stacked ensemble.

    The values printed under the "rmse" / "得分" labels are the per-fold mean
    squared errors returned by ``rmsle_cv``.
    """
    print("Load data from file......")
    file = 'train_label.csv'  # alternative input: 'download_label.csv'
    test_file = 'test_label.csv'
    X_test = pdReadCsv(test_file, ',').drop(columns=["life"])
    train = pdReadCsv(file, ',')
    X_train, y_train = train.drop(columns=["life"]), train["life"]
    print("X_train shape", X_train.shape)
    print("X_test shape", X_test.shape)
    print("y_train shape", y_train.shape)
    n_train = len(X_train)
    all_data = pd.concat([X_train, X_test])
    print(all_data.shape)
    print("Load done.")

    # Scaling: min-max first, then zero-mean / unit-variance on the result.
    from sklearn import preprocessing
    scaler = MinMaxScaler(feature_range=(0, 1))
    all_data = pd.DataFrame(scaler.fit_transform(all_data), columns=all_data.columns)
    print("Scale done.")
    scaled = pd.DataFrame(preprocessing.scale(all_data), columns=all_data.columns)
    # ``scaled`` has a fresh 0..n-1 index, so a positional split restores
    # the original train / test partition.
    X_train = scaled.iloc[:n_train]
    X_test = scaled.iloc[n_train:]

    # Feature selection: score all features, then keep the best-ranked ones.
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_regression
    X_scored = SelectKBest(score_func=f_regression, k='all').fit(X_train, y_train)
    feature_scoring = pd.DataFrame({
        'feature': X_train.columns,
        'score': X_scored.scores_,
    })
    # 18 features feed the linear models, the network and both ensembles
    # (the network's default input width is 18 as well).
    X_train_head = _top_k_columns(X_train, feature_scoring, 18)
    X_test_head = _top_k_columns(X_test, feature_scoring, 18)
    # The boosted-tree models are evaluated on a wider set of 22 features.
    X_train_top22 = _top_k_columns(X_train, feature_scoring, 22)
    print(X_train_head.shape)
    print(y_train.shape)
    print(X_test_head.shape)

    print("Start training......")
    svr, line, lasso, ENet, KRR1, KRR2, lgbm, xgb, nn = build_model()

    score = rmsle_cv(svr, X_train_head, y_train)
    print("SVR rmse: {:.4f} 标准差: {:.4f}\n".format(score.mean(), score.std()))
    svr.fit(X_train_head, y_train)

    score = rmsle_cv(line, X_train_head, y_train)
    print("Line rmse: {:.4f} 标准差: {:.4f}\n".format(score.mean(), score.std()))

    score = rmsle_cv(lasso, X_train_head, y_train)
    print("Lasso rmse: {:.4f} 标准差: {:.4f}\n".format(score.mean(), score.std()))

    score = rmsle_cv(ENet, X_train_head, y_train)
    print("ElasticNet rmse: {:.4f} 标准差: {:.4f}\n".format(score.mean(), score.std()))

    score = rmsle_cv(KRR1, X_train_head, y_train)
    print("Kernel Ridge1 rmse: {:.4f} 标准差: {:.4f}\n".format(score.mean(), score.std()))

    score = rmsle_cv(KRR2, X_train_head, y_train)
    print("Kernel Ridge2 rmse: {:.4f} 标准差: {:.4f}\n".format(score.mean(), score.std()))
    KRR2.fit(X_train_head, y_train)

    # Fit each boosted model on the same 22 features it was cross-validated
    # on, so the printed score describes the model that is actually fitted.
    score = rmsle_cv(xgb, X_train_top22, y_train)
    print("Xgboost rmse: {:.4f} 标准差: {:.4f}\n".format(score.mean(), score.std()))
    xgb.fit(X_train_top22, y_train)

    score = rmsle_cv(lgbm, X_train_top22, y_train)
    print("LGBM 得分: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
    lgbm.fit(X_train_top22, y_train)

    score = rmsle_cv(nn, X_train_head, y_train)
    print("NN 得分: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
    nn.fit(X_train_head, y_train)

    # Both ensembles clone their member models, so the standalone fits above
    # do not carry over into them.
    averaged_models = AveragingModels(models=(svr, KRR2, lgbm, nn))
    score = rmsle_cv(averaged_models, X_train_head, y_train)
    print("对基模型集成后的得分: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
    averaged_models.fit(X_train_head, y_train)

    stacking_models = StackingAveragedModels(base_models=(svr, KRR2, lgbm, nn), meta_model=xgb)
    stacking_models.fit(X_train_head.values, y_train.values)
    # Predict on the same array representation the stacker was fitted on.
    # NOTE: this is the error on the training data, not a validation score.
    stacked_train_pred = stacking_models.predict(X_train_head.values)
    score = mean_squared_error(y_train.values, stacked_train_pred)
    print("Stacking Averaged models predict score: {:.4f}".format(score))


if __name__ == "__main__":
    main()
训练和预测代码2:
#!/usr/bin/env Python
# coding=utf-8
import warnings
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.model_selection import train_test_split
from Aero_engine_life.data_model import get_train, build_model_lgb, build_model_etr, build_model_rf, write_mse, \
score_model
warnings.filterwarnings("ignore", "(?s).*MATPLOTLIBDATA.*", category=UserWarning)
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
np.random.seed(2020)
for k in [20]:
    print(k)
    X_data, Y_data = get_train()

    # Keep the k features with the highest mutual information, then rotate
    # them with a PCA that keeps all k components.
    selector = SelectKBest(mutual_info_regression, k=k)
    X_data = selector.fit_transform(X_data, Y_data)
    pca = PCA(n_components=k)
    X_data = pca.fit_transform(X_data)

    # 2% of the rows are held out for validation.
    x_train, x_val, y_train, y_val = train_test_split(
        X_data, Y_data, test_size=0.02, random_state=20)

    # Fit the three first-layer models (LightGBM, extra trees, random forest)
    # and predict the validation rows with each of them.
    model_lgb = build_model_lgb(x_train, y_train)
    val_lgb = model_lgb.predict(x_val)
    model_etr = build_model_etr(x_train, y_train)
    val_etr = model_etr.predict(x_val)
    model_rf = build_model_rf(x_train, y_train)
    val_rf = model_rf.predict(x_val)

    # Stacking, first layer: training-set predictions and their MSE,
    # printed and logged per model.
    train_pred = {}
    for tag, fitted in (('etr', model_etr), ('lgb', model_lgb), ('rf', model_rf)):
        train_pred[tag] = fitted.predict(x_train)
        train_mse = mean_squared_error(y_train, train_pred[tag])
        print(tag + '训练集,mse:', train_mse)
        write_mse(tag, '训练集', train_mse)

    Stacking_X_train = pd.DataFrame({
        'Method_1': train_pred['rf'],
        'Method_2': train_pred['lgb'],
        'Method_3': train_pred['etr'],
    })
    Stacking_X_val = pd.DataFrame({
        'Method_1': val_rf,
        'Method_2': val_lgb,
        'Method_3': val_etr,
    })

    # Second layer: an extra-trees model on the first-layer predictions.
    # NOTE(review): its training features are in-sample predictions of
    # models fitted on the same rows.
    model_Stacking = build_model_etr(Stacking_X_train, y_train)
    train_pre_Stacking = model_Stacking.predict(Stacking_X_train)
    score_model(Stacking_X_train, y_train, train_pre_Stacking, model_Stacking, '训练集')
    val_pre_Stacking = model_Stacking.predict(Stacking_X_val)
    score_model(Stacking_X_val, y_val, val_pre_Stacking, model_Stacking, '验证集')
模型文件:
import os
from sklearn.metrics import mean_absolute_error, mean_squared_error
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV
from utils.read_write import writeOneCsv, pdReadCsv
# Switch to the data directory so 'train_label.csv' can be opened by bare name.
# NOTE: hard-coded, machine-specific Windows path.
os.chdir(r'E:\项目文件\航空发动机寿命预测\data\\')
# Project root; prefix of the '调参记录.csv' log file written by the helpers below.
src = r'E:\项目文件\航空发动机寿命预测\\'
def get_train():
    """Load ``train_label.csv`` and split it into features and target.

    Returns:
        tuple: ``(features, target)`` where ``features`` holds every column
        from position 3 up to (not including) the last one, and ``target``
        is the last column flattened to one dimension.
    """
    # Other inputs that were tried: 'download_label.csv', 'test_label.csv'.
    table = pdReadCsv('train_label.csv', ',').values
    features = table[:, 3:-1]
    target = table[:, -1:].ravel()
    return features, target
def build_model_rf(x_train, y_train):
    """Grid-search and fit a random forest regressor (3-fold CV).

    The chosen parameters are printed and appended to the tuning log via
    ``writeParams``.

    Args:
        x_train: Training feature matrix.
        y_train: Training targets.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    # Squared error is the default split criterion, so it is not passed
    # explicitly: the literal 'mse' was renamed to 'squared_error' and is
    # rejected by scikit-learn >= 1.2.
    estimator = RandomForestRegressor()
    # Each range yields a single candidate (33 and 73 respectively).
    param_grid = {
        'max_depth': range(33, 35, 9),
        'n_estimators': range(73, 77, 9),
    }
    model = GridSearchCV(estimator, param_grid, cv=3)
    model.fit(x_train, y_train)
    print('rf')
    print(model.best_params_)
    writeParams('rf', model.best_params_)
    return model
def build_model_etr(x_train, y_train):
    """Grid-search and fit an extra-trees regressor.

    The chosen parameters are printed and appended to the tuning log via
    ``writeParams``.

    Args:
        x_train: Training feature matrix.
        y_train: Training targets.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    # Squared error is the default split criterion, so it is not passed
    # explicitly: the literal 'mse' was renamed to 'squared_error' and is
    # rejected by scikit-learn >= 1.2.
    estimator = ExtraTreesRegressor()
    # n_estimators is the number of trees in the ensemble. Each range yields
    # a single candidate (33 and 96 respectively).
    param_grid = {
        'max_depth': range(33, 39, 9),
        'n_estimators': range(96, 99, 9),
    }
    model = GridSearchCV(estimator, param_grid)
    model.fit(x_train, y_train)
    print('etr')
    print(model.best_params_)
    writeParams('etr', model.best_params_)
    return model
def build_model_lgb(x_train, y_train):
    """Grid-search and fit a LightGBM regressor.

    The chosen parameters are printed and appended to the tuning log via
    ``writeParams``.

    Args:
        x_train: Training feature matrix.
        y_train: Training targets; flattened with ``ravel`` before fitting.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    # Each entry yields a single candidate value.
    search_space = {
        'learning_rate': [0.1],
        'n_estimators': range(77, 78, 9),
        'num_leaves': range(59, 66, 9)
    }
    search = GridSearchCV(LGBMRegressor(), search_space)
    search.fit(x_train, y_train.ravel())
    print('lgb')
    chosen = search.best_params_
    print(chosen)
    writeParams('lgb', chosen)
    return search
def scatter_line(y_val, y_pre):
    """Plot true values as red points and predictions as an orange line.

    Both series are drawn against the sample position 0..len(y_val)-1 and
    the figure is shown immediately.

    Args:
        y_val: True target values.
        y_pre: Predicted values, same length as ``y_val``.
    """
    import matplotlib.pyplot as plt
    positions = range(len(y_val))
    plt.scatter(positions, y_val, color="red", label="Sample Point", linewidth=3)
    plt.plot(positions, y_pre, color="orange", label="Fitting Line", linewidth=2)
    plt.legend()
    plt.show()
def score_model(train, test, predict, model, data_type):
    """Print and log R^2, MAE and MSE for a fitted model.

    Args:
        train: Feature matrix passed to ``model.score``.
        test: True target values for those features.
        predict: Predictions to compare against ``test``.
        model: Fitted estimator exposing ``score``.
        data_type: Label of the data split, used in the output rows.
    """
    r2 = round(model.score(train, test), 6)
    print(data_type + ",R^2,", r2)
    log_path = src + '调参记录.csv'
    writeOneCsv(['staking', data_type, 'R^2', r2], log_path)

    # The two error metrics share the same print-then-log pattern.
    for label, metric in (('MAE', mean_absolute_error), ('MSE', mean_squared_error)):
        value = metric(test, predict)
        print(data_type + ',' + label + ',', value)
        writeOneCsv(['staking', data_type, label, value], log_path)
def writeParams(model, best):
    """Append the selected hyper-parameters of ``model`` to the tuning log.

    Args:
        model: Model tag; ``'lgb'`` logs num_leaves, n_estimators and
            learning_rate, any other tag logs max_depth, n_estimators and 0.
        best: Mapping of parameter name to chosen value.
    """
    if model == 'lgb':
        row = [model, best['num_leaves'], best['n_estimators'], best['learning_rate']]
    else:
        row = [model, best['max_depth'], best['n_estimators'], 0]
    writeOneCsv(row, src + '调参记录.csv')
def write_mse(model, data_type, mse):
    """Append one ``[model, data_type, 'mse', mse]`` row to the tuning log."""
    row = [model, data_type, 'mse', mse]
    log_path = src + '调参记录.csv'
    writeOneCsv(row, log_path)
如果你是去平台下的原始数据就要经过数据处理
import os
import pandas as pd
from utils.read_write import pdReadCsv
# Switch to the data directory: the raw CSV files are read from it and the
# merged files are written to it by bare file name.
# NOTE: hard-coded, machine-specific Windows path.
os.chdir(r'E:\项目文件\航空发动机寿命预测\data\\')
# Project root directory; not referenced by join_data below.
src = r'E:\项目文件\航空发动机寿命预测\\'
def join_data():
    """Attach the life labels to the raw train / download / test tables.

    All four raw CSV files are read first. Each data table is then merged
    with the label table on the ``Number`` column and written to
    ``<name>_label.csv`` in the current directory. ``to_csv`` is called with
    its defaults, so the row index is written as well.
    """
    sources = {
        'train': 'Dataset_Aero_engine_life_prediction_train_2020_09_05.csv',
        'label': 'Dataset_Aero_engine_life_prediction_label_2020_09_05.csv',
        'download': 'Dataset_Aero_engine_life_prediction_download_2020_09_05.csv',
        'test': 'Dataset_Aero_engine_life_prediction_test_2020_09_05.csv',
    }
    # Load everything before any output file is written.
    tables = {name: pdReadCsv(path, ',') for name, path in sources.items()}

    for name in ('train', 'download', 'test'):
        merged = pd.merge(tables[name], tables['label'], on='Number')
        merged.to_csv(name + '_label.csv')


join_data()
欢迎大家多多交流工业大数据创新应用
网友评论