Neural Network Learning: Predicting CSI 300 Returns with an LSTM

Author: 黄yy家的jby | Published 2019-11-27 17:36

    1 - Importing libraries

    # Common libraries
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt
    import matplotlib
    import numpy as np
    import os
    # Data preprocessing: sts is for the ADF test, acf is for choosing a suitable lag order
    import statsmodels.tsa.stattools as sts
    from sklearn import preprocessing
    from sklearn.metrics import mean_squared_error
    from statsmodels.tsa.stattools import acf
    
    # Libraries needed to build the LSTM
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import LSTM
    from tensorflow.keras.layers import Dense
    from tensorflow.keras.layers import Dropout
    from tensorflow.keras.layers import BatchNormalization
    from tensorflow.keras.utils import to_categorical
    from tensorflow import keras
    from sklearn.metrics import classification_report
    
    

    2 - Loading the data

    df_hs : CSI 300 data: close, turnover, volume, dividend yield, daily amplitude
    df_marco : the macroeconomic indicators
    To respect disclosure timing, each indicator is lagged one period (quarterly data by 3 months, monthly data by one month); a toy sketch of the pattern follows.
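
    A minimal, self-contained sketch of the lag treatment (a hypothetical daily series; the real work happens in read_data below): aggregate to the indicator's native frequency, shift one period, then forward-fill back to daily.

    import pandas as pd
    import numpy as np

    idx = pd.date_range('2019-01-01', '2019-06-30', freq='D')
    s = pd.Series(np.arange(len(idx), dtype=float), index=idx)

    monthly = s.resample('M').mean()    # aggregate to month-end
    lagged = monthly.shift(1)           # one-month publication lag
    daily = lagged.resample('d').pad()  # broadcast back to daily frequency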

    def read_data(open_path):
        df_hs = pd.read_excel(open_path+'hs300_zhenfu.xlsx',index_col=0)
        df_hs.columns = ['g','close','vol','ts','high','low']
        df_hs['zhenfu'] = 100*(df_hs['high']-df_hs['low'])/df_hs['close']
        df_hs['chg'] = 100*df_hs['close'].pct_change(1)
        df_hs.loc[:,df_hs.columns != 'chg'] = df_hs.loc[:,df_hs.columns != 'chg'].shift(1)
        df_hs.dropna(inplace=True,axis=0)
    
        df_marco = pd.read_excel(open_path+'宏观数据.xls',index_col=0)
        df_marco.columns = ['gdp','ppi','cpi','pmi','industry_add','investment','current_deposit',\
                            'fixed_deposit_3','fixed_deposit_6','mlf','slf','shibor','rf_1','rf_10',\
                            'm1','m2','inter_borrow','dollar','dollar_rmb','yen_rmb','social_financing',\
                            'foreign_reserve','au','rare','aa+','aa']
        df_marco['m1_m2'] = df_marco['m1'] - df_marco['m2']
        df_marco['qixian'] = df_marco['rf_10'] - df_marco['rf_1']
        df_marco['xinyong_+'] = df_marco['aa+'] - df_marco['rf_1']
        df_marco['xinyong'] = df_marco['aa'] - df_marco['rf_1']
    
        # Lag one period: first resample to monthly, shift forward one month, then resample back to daily
        day_ = df_marco[['shibor','rf_1','rf_10','dollar','dollar_rmb','yen_rmb','au','aa+','aa','qixian','xinyong_+','xinyong']].copy()
        month_ = df_marco[['ppi','cpi','pmi','industry_add','investment','current_deposit','fixed_deposit_3','fixed_deposit_6',\
                           'slf','m1','m2','m1_m2','inter_borrow','social_financing','rare']].copy()
        quarter_ = df_marco[['gdp','foreign_reserve']].copy()
    
        day_[['dollar','dollar_rmb','yen_rmb','au']] = 100*day_[['dollar','dollar_rmb','yen_rmb','au']].pct_change(1)
        day_ = day_.shift(1)
    
        month_ = month_.resample('M').mean()
        month_.fillna(method='ffill',inplace=True)
        month_['pmi'] = 100*month_['pmi'].pct_change(12)
        month_['social_financing'] = 100*month_['social_financing'].pct_change(1)
        month_ = month_.shift(1)
        month_ = month_.resample('d').pad()
    
        quarter_ = quarter_.resample('3M').mean()
        quarter_.fillna(method='ffill',inplace=True)
        quarter_['foreign_reserve'] = 100*quarter_['foreign_reserve'].pct_change(1)
        quarter_ = quarter_.shift(1)
        quarter_ = quarter_.resample('d').pad()
    
        df = pd.concat([day_,month_,quarter_],axis=1)
        df = df.fillna(method='ffill')
        df = df.loc[df_hs.index]
        return df_hs, df,df_marco
    

    3 - Merging the data

    Compute the technical indicators.
    Define x (macro, asset-allocation, interest-rate, FX, and volume-price indicators plus lagged terms) and y (the CSI 300 daily return).
    Split y into 6 classes by its sextile quantiles (see the qcut note after this code block).

    def cal_ema(df, N):
        a = 2/(N+1)
        b = pd.DataFrame(columns = ['close'], index=df.index)
        for i in range(len(df)):
            if i == 0:
                b.iloc[i] = df['close'].iloc[i]
            else:
                b.iloc[i] = a * df['close'].iloc[i] + (1-a) * b.iloc[i-1]
        return b
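
    As an aside, pandas' built-in exponentially weighted mean reproduces the same recursion (alpha = 2/(N+1); adjust=False seeds with the first value), so cal_ema could be replaced by a one-liner:

    def cal_ema_ewm(df, N):
        # equivalent to the loop above: b_i = a*close_i + (1-a)*b_{i-1}
        return df[['close']].ewm(span=N, adjust=False).mean()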
    
    def cal_dea(df, short_t=12, long_t=26 ,avg_t=9):
        ema_short = cal_ema(df, short_t)
        ema_long = cal_ema(df, long_t)
        dif = ema_short - ema_long
        dea = cal_ema(dif, avg_t)
        df['macd'] = (dif-dea)*2
        return df
    
    def cal_adx(df, N=14, M=6):
        hd = df['high'].diff().dropna()
        ld = -df['low'].diff().dropna()
        dmp = pd.DataFrame({'dmp': [0] * len(hd)}, index=hd.index)
        dmp[(hd > 0) & (ld < 0)] = hd
        dmp = dmp.rolling(N).sum().dropna()
        dmm = pd.DataFrame({'dmm': [0] * len(ld)}, index=ld.index)
        dmm[(hd < 0) & (ld > 0)] = ld
        dmm = dmm.rolling(N).sum().dropna()
        temp = pd.concat([df['high'] - df['low'], abs(df['high'] - df['close'].shift(1)), \
                          abs(df['low'] - df['close'].shift(1))], axis=1).dropna()
        tr = temp.max(axis=1).dropna()
    
        s_index = dmm.index.intersection(tr.index).intersection(dmp.index)
        dmp = dmp.loc[s_index]
        dmm = dmm.loc[s_index]
        tr = tr.loc[s_index]
        pdi = 100 * dmp['dmp'] / tr
        mdi = dmm['dmm'] * 100 / tr
    
        dx = abs(pdi - mdi) / (pdi + mdi) * 100
        adx = dx.rolling(M).mean().dropna()
        adx = pd.DataFrame(adx, columns=['adx'])
        return adx
    
    def handle_hs(df_hs):
        df_hs = cal_dea(df_hs)
        df_hs['adx'] = cal_adx(df_hs)
        df_hs.dropna(inplace=True)
        return df_hs
    
    
    def handle_data(df_hs,df_marco):
        df_hs = handle_hs(df_hs)
        df_marco = df_marco.loc[df_hs.index]
    
        df_hs = df_hs[df_hs.index.year>=2009]
        df_marco = df_marco[df_marco.index.year>=2009]
    
        # Redefine y and x
        y = df_hs.loc[:,'chg']
        # Print the sextile boundaries used for the hard-coded cut-points below
        for i in range(1,6):
            print( y.quantile(i/6))
            print(i/6)
            print("")
        y_new = y.copy()
        y_new[1.24 <= y_new] = 5
        y_new[(0.46 <= y_new) & (y_new < 1.24)] = 4
        y_new[(0.06 <= y_new) & (y_new < 0.46)] = 3
        y_new[(-0.37 <= y_new) & (y_new < 0.06)] = 2
        y_new[(-1.04 <= y_new) & (y_new < -0.37)] = 1
        y_new[y_new < -1.04] = 0
    
        y_new.name = 'label'
        y_new_count = y_new.groupby(y_new).count()
        y_new_count =pd.concat([ pd.Series(['<-1.04','-1.04~-0.37','-0.37~-0.06',\
                                       '0.06~0.46','0.46~1.24','>1.24']),y_new_count],axis=1)
        y_new_count.columns = ['分位数定义','num']
        y_new_count.to_excel(save_path+'lstm的y分类.xls')
    
        # Drop factors with low correlation against y
        x = pd.concat([df_hs[['g','vol','ts','zhenfu','macd','adx']],df_marco],axis=1)
        x = x.astype('float')
        dic = {}
        for i in x.columns:
            temp = x[i]
            dic[i] = temp.corr(y)
            print(i)
        df = pd.DataFrame.from_dict(dic,orient='index')
        df.to_excel(save_path+'LSTM因子相关性.xls')
        df_temp = df[abs(df)>0.01]
        df_temp.dropna(inplace=True)
    
        target = ['g','vol','ts','zhenfu','macd','adx','shibor','rf_10','dollar','aa','xinyong','cpi','m1_m2']
        df_temp2 = df_temp.loc[target]
        df_temp2.to_excel(save_path+'lstm筛选后因子相关性.xls')
        x = x[target]
        return x,y_new
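
    A note on the bucketing: the six cut-points are hard-coded from the printed quantiles; pd.qcut can derive the same sextile labels directly (a sketch, assuming y is the daily return series from handle_data):

    y_label = pd.qcut(y, q=6, labels=False)  # integer classes 0..5, lowest sextile = 0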
    

    4 - Checking the data

    Correlation filtering -- unit-root (ADF) test -- PCA for the number of principal factors (free of collinearity)

    def again_handle_data(x,y,df_backup):
        def adf_test(x,y):
            dic = {}
            dic['y'] = sts.adfuller(y)[1]
            for i in x.columns:
                dic[i] = sts.adfuller(x[i])[1]
            df = pd.DataFrame.from_dict(dic,orient='index')
            target = list(df[df>0.01].dropna().index)
            if len(target)>0:
                print(target)
            else:
                print('All factors pass the unit-root test')
            return df,target
    
        def adf_data(x, y, df_backup):
            df1, target1 = adf_test(x, y)
            for i in target1:
                x[i] = x[i].diff(1)
            x.dropna(inplace=True)
            y = y[x.index]
    
            # Re-difference cpi and m1_m2, then merge them back into x
            month_ = df_backup[['cpi','m1_m2']]
            month_.dropna(inplace=True)
            month_ = month_.shift(1)
            month_[['cpi','m1_m2']] = month_[['cpi','m1_m2']].diff(1)
            month_.dropna(inplace=True)
            for i in month_.columns:
                print(sts.adfuller(month_[i])[1])
    
            a = list(x.columns)
            a.remove('cpi')
            a.remove('m1_m2')
            b = x[a]
            c = pd.concat([b,month_],axis=1)
            x = c.fillna(method='bfill')
            x = x[x.index.year>=2009]
            x.dropna(inplace=True)
            s_index = x.index.intersection(y.index)
            x = x.loc[s_index]
            y = y[s_index]
            # df2, target2 = adf_test(x, y)
            # df1.to_excel(save_path + 'lstm单位根检验.xls')
            # df2.to_excel(save_path + 'lstm差分后单位根检验.xls')
            return x, y
        x,y = adf_data(x,y,df_backup)
        x_new = preprocessing.scale(x,axis=0)
        x_new = pd.DataFrame(x_new,index=x.index,columns=x.columns)
        mat = np.dot(x_new.T, x_new)
        l,eig,r = np.linalg.svd(mat)
        eig = eig / eig.sum()
        eig = eig.cumsum()
    
        i = 0
        for e in eig:
            i += 1
            if e > 0.99:
                n_pca = i
                break
        print('Number of principal components needed: ' + str(n_pca))
    
        # b = pd.DataFrame(y)
        y_new = to_categorical(y)
        y_new = pd.DataFrame(y_new,index=y.index)
        return x_new,y_new
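
    Note that the function only reports n_pca and keeps the raw standardized factors. If one actually wanted to feed principal components to the LSTM, a sketch with scikit-learn (hypothetical, not part of the original pipeline):

    from sklearn.decomposition import PCA

    pca = PCA(n_components=0.99)  # keep enough components for 99% of the variance
    x_pca = pd.DataFrame(pca.fit_transform(x_new), index=x_new.index)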
    

    5 - Preparing the data

    The LSTM expects input with a specific 3-D shape (samples, timesteps, features); n_steps=2 was chosen from the ACF, as sketched below.
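
    A sketch of how that lag-order check might look (an assumption: acf applied to the daily return series before bucketing):

    from statsmodels.tsa.stattools import acf

    acf_vals = acf(df_hs['chg'], nlags=10)  # autocorrelations at lags 0..10
    print(acf_vals)  # choose n_steps where the autocorrelation dies out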

    def pre_data(x,y,n_steps=2):
        data = np.hstack([np.array(x),np.array(y)])
    
        n_feature = x.shape[1]
        train_pos = 0.9
    
        result = []
        for s in range(len(data)-n_steps):
            temp = data[s:s+n_steps]
            result.append(temp)
        result = np.array(result)
    
        row = round(train_pos * result.shape[0])
        x_train = result[:row, :, :-6]
        x_test = result[row:, :, :-6]
        y_train = result[:row, -1, -6:]
        y_test = result[row:, -1, -6:]
    
        x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], n_feature))
        x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], n_feature))
        return x_train, x_test, y_train, y_test
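
    A quick sanity check of the window shapes on toy data (hypothetical sizes: 100 observations, 13 factors, 6 classes):

    import numpy as np
    import pandas as pd
    from tensorflow.keras.utils import to_categorical

    x_toy = pd.DataFrame(np.random.randn(100, 13))
    y_toy = pd.DataFrame(to_categorical(np.random.randint(0, 6, size=100), num_classes=6))
    x_tr, x_te, y_tr, y_te = pre_data(x_toy, y_toy)
    print(x_tr.shape, y_tr.shape)  # (88, 2, 13) (88, 6)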
    

    6 - Building the model

    Because each LSTM layer after the first needs sequence input, a two-layer model requires return_sequences=True on the first LSTM layer.

    def bulid_model(x_train, neurons=[128,128,6], dropout=0.2):
        model = Sequential()
        model.add(LSTM(neurons[0], input_shape=(x_train.shape[1], x_train.shape[2]), return_sequences=False))
        model.add(Dropout(dropout))
        model.add(Dense(neurons[2], activation='softmax'))
        adam = keras.optimizers.Adam(decay=0.2)
        model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
        model.summary()
    
        # Two-layer variant:
        # model = Sequential()
        # model.add(LSTM(neurons[0], input_shape=(x_train.shape[1], x_train.shape[2]), return_sequences=True))
        # model.add(Dropout(dropout))
        # model.add(LSTM(neurons[1]))
        # model.add(Dropout(dropout))
        # model.add(Dense(neurons[2], activation='softmax'))
        # adam = keras.optimizers.Adam(decay=0.2)
        # model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
        # model.summary()
    
        return model
    

    7 - Running the model

    def run_model(x_train,y_train,save_path, neurons=[128,128,6], dropout=0.2):
        model = bulid_model(x_train, neurons=neurons, dropout=dropout)
        history = model.fit(x_train,y_train,epochs=32,batch_size=64,verbose=2)
    
        fig = plt.figure()
        ax = fig.add_subplot(111)
        pic1 = ax.plot(history.history['loss'], label='loss',color='b')
        ax2 = ax.twinx()
        pic2 = ax2.plot(history.history['accuracy'],label='acc',color='r')
        pic = pic1+pic2
        labs = [l.get_label() for l in pic]
        ax.legend(pic,labs,loc=2)
        plt.savefig(save_path+'LSTM_loss.jpg',dpi=300)
        plt.show()
        plt.close()
        return model
    
    
    def model_score(model, x_train, y_train, x_test, y_test):
        # evaluate returns [categorical cross-entropy loss, accuracy]
        trainScore = model.evaluate(x_train, y_train, verbose=0)
        print('Train loss: %.5f, accuracy: %.4f' % (trainScore[0], trainScore[1]))

        testScore = model.evaluate(x_test, y_test, verbose=0)
        print('Test loss: %.5f, accuracy: %.4f' % (testScore[0], testScore[1]))
        return trainScore, testScore
    

    8 - Prediction

    def predict_model(model,x_test,y_test,save_path,x,train_pos=0.9):
    
        y_pred_prob = model.predict(x_test)
        #y_pred = tran_porb(y_pred_prob)
        # predict_classes was removed from newer Keras; argmax over the
        # predicted probabilities gives the same class labels
        y_class = np.argmax(y_pred_prob, axis=1)
        y_pred = to_categorical(y_class, num_classes=6)
        target = ['<-1.04', '-1.04~-0.37', '-0.37~-0.06', '0.06~0.46', '0.46~1.24', '>1.24']
        print(classification_report(y_test, y_pred, target_names=target))    
    

    9 - Saving the model

    When loading the model back, importing load_model directly from keras.models may raise an error; this appears to be a Keras version issue, and importing from tensorflow.keras.models works.

    model.save(save_path+'lstm.h5')
    
    # Load the model back
    #from tensorflow.keras.models import load_model
    
    #model = load_model(save_path+'lstm.h5')
    

    Full code

    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt
    import matplotlib
    import numpy as np
    import os
    import statsmodels.tsa.stattools as sts
    from sklearn import preprocessing
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    from sklearn import linear_model
    from sklearn.metrics import mean_squared_error
    from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
    import statsmodels.api as sm
    # from statsmodels.tsa.api import VAR
    from statsmodels.tsa.vector_ar.var_model import VAR
    from statsmodels.tsa.stattools import acf
    from arch import arch_model
    
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import LSTM
    from tensorflow.keras.layers import Dense
    from tensorflow.keras.layers import Dropout
    from tensorflow.keras.layers import BatchNormalization
    from tensorflow.keras.utils import to_categorical
    from tensorflow import keras
    from sklearn.metrics import classification_report
    
    
    
    import ReadData as RD
    import PreHandle as PH
    import warnings
    
    warnings.filterwarnings('ignore')
    
    #%%
    # Load the raw data
    def read_data(open_path):
        df_hs = pd.read_excel(open_path+'hs300_zhenfu.xlsx',index_col=0)
        df_hs.columns = ['g','close','vol','ts','high','low']
        df_hs['zhenfu'] = 100*(df_hs['high']-df_hs['low'])/df_hs['close']
        df_hs['chg'] = 100*df_hs['close'].pct_change(1)
        df_hs.loc[:,df_hs.columns != 'chg'] = df_hs.loc[:,df_hs.columns != 'chg'].shift(1)
        df_hs.dropna(inplace=True,axis=0)
    
        df_marco = pd.read_excel(open_path+'宏观数据.xls',index_col=0)
        df_marco.columns = ['gdp','ppi','cpi','pmi','industry_add','investment','current_deposit',\
                            'fixed_deposit_3','fixed_deposit_6','mlf','slf','shibor','rf_1','rf_10',\
                            'm1','m2','inter_borrow','dollar','dollar_rmb','yen_rmb','social_financing',\
                            'foreign_reserve','au','rare','aa+','aa']
        df_marco['m1_m2'] = df_marco['m1'] - df_marco['m2']
        df_marco['qixian'] = df_marco['rf_10'] - df_marco['rf_1']
        df_marco['xinyong_+'] = df_marco['aa+'] - df_marco['rf_1']
        df_marco['xinyong'] = df_marco['aa'] - df_marco['rf_1']
    
        # Lag one period: first resample to monthly, shift forward one month, then resample back to daily
        day_ = df_marco[['shibor','rf_1','rf_10','dollar','dollar_rmb','yen_rmb','au','aa+','aa','qixian','xinyong_+','xinyong']].copy()
        month_ = df_marco[['ppi','cpi','pmi','industry_add','investment','current_deposit','fixed_deposit_3','fixed_deposit_6',\
                           'slf','m1','m2','m1_m2','inter_borrow','social_financing','rare']].copy()
        quarter_ = df_marco[['gdp','foreign_reserve']].copy()
    
        #day_.dropna(inplace=True)
        # day_['dollar'] = day_['dollar'].pct_change(1)
        # day_['dollar_rmb'] = day_['dollar_rmb'].pct_change(1)
        # day_['yen_rmb'] = day_['yen_rmb'].pct_change(1)
        day_[['dollar','dollar_rmb','yen_rmb','au']] = 100*day_[['dollar','dollar_rmb','yen_rmb','au']].pct_change(1)
        day_ = day_.shift(1)
        #day_.dropna(inplace=True)
    
        month_ = month_.resample('M').mean()
        month_.fillna(method='ffill',inplace=True)
        month_['pmi'] = 100*month_['pmi'].pct_change(12)
        month_['social_financing'] = 100*month_['social_financing'].pct_change(1)
        month_ = month_.shift(1)
        month_ = month_.resample('d').pad()
    
        quarter_ = quarter_.resample('3M').mean()
        quarter_.fillna(method='ffill',inplace=True)
        quarter_['foreign_reserve'] = 100*quarter_['foreign_reserve'].pct_change(1)
        quarter_ = quarter_.shift(1)
        quarter_ = quarter_.resample('d').pad()
    
        df = pd.concat([day_,month_,quarter_],axis=1)
        df = df.fillna(method='ffill')
        df = df.loc[df_hs.index]
        return df_hs, df,df_marco
    
    
    # Compute the technical indicators
    def cal_ema(df, N):
        a = 2/(N+1)
        b = pd.DataFrame(columns = ['close'], index=df.index)
        for i in range(len(df)):
            if i == 0:
                b.iloc[i] = df['close'].iloc[i]
            else:
                b.iloc[i] = a * df['close'].iloc[i] + (1-a) * b.iloc[i-1]
        return b
    
    def cal_dea(df, short_t=12, long_t=26 ,avg_t=9):
        ema_short = cal_ema(df, short_t)
        ema_long = cal_ema(df, long_t)
        dif = ema_short - ema_long
        dea = cal_ema(dif, avg_t)
        df['macd'] = (dif-dea)*2
        return df
    
    def cal_adx(df, N=14, M=6):
        hd = df['high'].diff().dropna()
        ld = -df['low'].diff().dropna()
        dmp = pd.DataFrame({'dmp': [0] * len(hd)}, index=hd.index)
        dmp[(hd > 0) & (ld < 0)] = hd
        dmp = dmp.rolling(N).sum().dropna()
        dmm = pd.DataFrame({'dmm': [0] * len(ld)}, index=ld.index)
        dmm[(hd < 0) & (ld > 0)] = ld
        dmm = dmm.rolling(N).sum().dropna()
        temp = pd.concat([df['high'] - df['low'], abs(df['high'] - df['close'].shift(1)), \
                          abs(df['low'] - df['close'].shift(1))], axis=1).dropna()
        tr = temp.max(axis=1).dropna()
    
        s_index = dmm.index.intersection(tr.index).intersection(dmp.index)
        dmp = dmp.loc[s_index]
        dmm = dmm.loc[s_index]
        tr = tr.loc[s_index]
        pdi = 100 * dmp['dmp'] / tr
        mdi = dmm['dmm'] * 100 / tr
    
        dx = abs(pdi - mdi) / (pdi + mdi) * 100
        adx = dx.rolling(M).mean().dropna()
        adx = pd.DataFrame(adx, columns=['adx'])
        return adx
    
    def handle_hs(df_hs):
        df_hs = cal_dea(df_hs)
        df_hs['adx'] = cal_adx(df_hs)
        df_hs.dropna(inplace=True)
        return df_hs
    
    
    
    def handle_data(df_hs,df_marco):
        df_hs = handle_hs(df_hs)
        df_marco = df_marco.loc[df_hs.index]
    
        df_hs = df_hs[df_hs.index.year>=2009]
        df_marco = df_marco[df_marco.index.year>=2009]
    
        # Redefine y and x
        y = df_hs.loc[:,'chg']
        # Print the sextile boundaries used for the hard-coded cut-points below
        for i in range(1,6):
            print( y.quantile(i/6))
            print(i/6)
            print("")
        y_new = y.copy()
        y_new[1.24 <= y_new] = 5
        y_new[(0.46 <= y_new) & (y_new < 1.24)] = 4
        y_new[(0.06 <= y_new) & (y_new < 0.46)] = 3
        y_new[(-0.37 <= y_new) & (y_new < 0.06)] = 2
        y_new[(-1.04 <= y_new) & (y_new < -0.37)] = 1
        y_new[y_new < -1.04] = 0
    
        y_new.name = 'label'
        y_new_count = y_new.groupby(y_new).count()
        y_new_count =pd.concat([ pd.Series(['<-1.04','-1.04~-0.37','-0.37~-0.06',\
                                       '0.06~0.46','0.46~1.24','>1.24']),y_new_count],axis=1)
        y_new_count.columns = ['分位数定义','num']
        y_new_count.to_excel(save_path+'lstm的y分类.xls')
    
        # Drop factors with low correlation against y
        x = pd.concat([df_hs[['g','vol','ts','zhenfu','macd','adx']],df_marco],axis=1)
        x = x.astype('float')
        dic = {}
        for i in x.columns:
            temp = x[i]
            dic[i] = temp.corr(y)
            print(i)
        df = pd.DataFrame.from_dict(dic,orient='index')
        df.to_excel(save_path+'LSTM因子相关性.xls')
        df_temp = df[abs(df)>0.01]
        df_temp.dropna(inplace=True)
    
        target = ['g','vol','ts','zhenfu','macd','adx','shibor','rf_10','dollar','aa','xinyong','cpi','m1_m2']
        df_temp2 = df_temp.loc[target]
        df_temp2.to_excel(save_path+'lstm筛选后因子相关性.xls')
        x = x[target]
        return x,y_new
    
    def again_handle_data(x,y,df_backup):
        def adf_test(x,y):
            dic = {}
            dic['y'] = sts.adfuller(y)[1]
            for i in x.columns:
                dic[i] = sts.adfuller(x[i])[1]
            df = pd.DataFrame.from_dict(dic,orient='index')
            target = list(df[df>0.01].dropna().index)
            if len(target)>0:
                print(target)
            else:
                print('All factors pass the unit-root test')
            return df,target
    
        def adf_data(x, y, df_backup):
            df1, target1 = adf_test(x, y)
            for i in target1:
                x[i] = x[i].diff(1)
            x.dropna(inplace=True)
            y = y[x.index]
    
            # Re-difference cpi and m1_m2, then merge them back into x
            month_ = df_backup[['cpi','m1_m2']]
            month_.dropna(inplace=True)
            month_ = month_.shift(1)
            month_[['cpi','m1_m2']] = month_[['cpi','m1_m2']].diff(1)
            month_.dropna(inplace=True)
            for i in month_.columns:
                print(sts.adfuller(month_[i])[1])
    
            a = list(x.columns)
            a.remove('cpi')
            a.remove('m1_m2')
            b = x[a]
            c = pd.concat([b,month_],axis=1)
            x = c.fillna(method='bfill')
            x = x[x.index.year>=2009]
            x.dropna(inplace=True)
            s_index = x.index.intersection(y.index)
            x = x.loc[s_index]
            y = y[s_index]
            # df2, target2 = adf_test(x, y)
            # df1.to_excel(save_path + 'lstm单位根检验.xls')
            # df2.to_excel(save_path + 'lstm差分后单位根检验.xls')
            return x, y
        x,y = adf_data(x,y,df_backup)
        x_new = preprocessing.scale(x,axis=0)
        x_new = pd.DataFrame(x_new,index=x.index,columns=x.columns)
        mat = np.dot(x_new.T, x_new)
        l,eig,r = np.linalg.svd(mat)
        eig = eig / eig.sum()
        eig = eig.cumsum()
    
        i = 0
        for e in eig:
            i += 1
            if e > 0.99:
                n_pca = i
                break
        print('Number of principal components needed: ' + str(n_pca))
    
        # b = pd.DataFrame(y)
        y_new = to_categorical(y)
        y_new = pd.DataFrame(y_new,index=y.index)
        return x_new,y_new
    
    def pre_data(x,y,n_steps=2):
        data = np.hstack([np.array(x),np.array(y)])
    
        n_feature = x.shape[1]
        train_pos = 0.9
    
        result = []
        for s in range(len(data)-n_steps):
            temp = data[s:s+n_steps]
            result.append(temp)
        result = np.array(result)
    
        row = round(train_pos * result.shape[0])
        x_train = result[:row, :, :-6]
        x_test = result[row:, :, :-6]
        y_train = result[:row, -1, -6:]
        y_test = result[row:, -1, -6:]
    
        x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], n_feature))
        x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], n_feature))
        return x_train, x_test, y_train, y_test
    
    def bulid_model(x_train, neurons=[128,128,6], dropout=0.2):
        model = Sequential()
        model.add(LSTM(neurons[0], input_shape=(x_train.shape[1], x_train.shape[2]), return_sequences=False))
        model.add(Dropout(dropout))
        model.add(Dense(neurons[2], activation='softmax'))
        adam = keras.optimizers.Adam(decay=0.2)
        model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
        model.summary()
        return model
    
    
    def run_model(x_train,y_train,save_path, neurons=[128,128,6], dropout=0.2):
        model = bulid_model(x_train, neurons=neurons, dropout=dropout)
        history = model.fit(x_train,y_train,epochs=32,batch_size=64,verbose=2)
    
        fig = plt.figure()
        ax = fig.add_subplot(111)
        pic1 = ax.plot(history.history['loss'], label='loss',color='b')
        ax2 = ax.twinx()
        pic2 = ax2.plot(history.history['accuracy'],label='acc',color='r')
        pic = pic1+pic2
        labs = [l.get_label() for l in pic]
        ax.legend(pic,labs,loc=2)
        plt.savefig(save_path+'LSTM_loss.jpg',dpi=300)
        plt.show()
        plt.close()
        return model
    
    
    def model_score(model, x_train, y_train, x_test, y_test):
        # evaluate returns [categorical cross-entropy loss, accuracy]
        trainScore = model.evaluate(x_train, y_train, verbose=0)
        print('Train loss: %.5f, accuracy: %.4f' % (trainScore[0], trainScore[1]))

        testScore = model.evaluate(x_test, y_test, verbose=0)
        print('Test loss: %.5f, accuracy: %.4f' % (testScore[0], testScore[1]))
        return trainScore, testScore
    
    def predict_model(model,x_test,y_test,save_path,x,train_pos=0.9):
    
        y_pred_prob = model.predict(x_test)
        #y_pred = tran_porb(y_pred_prob)
        # predict_classes was removed from newer Keras; argmax over the
        # predicted probabilities gives the same class labels
        y_class = np.argmax(y_pred_prob, axis=1)
        y_pred = to_categorical(y_class, num_classes=6)
        target = ['<-1.04', '-1.04~-0.37', '-0.37~-0.06', '0.06~0.46', '0.46~1.24', '>1.24']
        print(classification_report(y_test, y_pred, target_names=target))
    
    
    
        # Align the output index with the test windows (the last rows of x)
        s_index = x.index[-len(y_pred_prob):]
        y_pred_prob = pd.DataFrame(y_pred_prob,columns = target,index=s_index)
        y_pred = pd.DataFrame(y_pred,columns=target,index=s_index)
        y_test = pd.DataFrame(y_test,columns=target,index=s_index)
        y_class = pd.DataFrame(y_class,index=s_index)
        y_pred_prob.to_excel(save_path+'LSTM预测概率.xls')
        y_pred.to_excel(save_path+'LSTM预测类别dummy.xls')
        y_test.to_excel(save_path+'LSTM真实值类别.xls')
        y_class.to_excel(save_path+'LSTM预测类别class.xls')
        return y_pred_prob,y_pred
    
    def tran_porb(y_pred_prob):
        y_pred = np.zeros((y_pred_prob.shape[0],y_pred_prob.shape[1]))
        x_max = y_pred_prob.argmax(axis=1)
        for i in range(y_pred_prob.shape[0]):
            y_pred[i,x_max[i]] = 1
        return y_pred
    
    
    def pic_lstm(df_hs,y_class):
        y = df_hs['chg']
        y_new = y.copy()
        y_new[1.24 <= y_new] = 5
        y_new[(0.46 <= y_new) & (y_new < 1.24)] = 4
        y_new[(0.06 <= y_new) & (y_new < 0.46)] = 3
        y_new[(-0.37 <= y_new) & (y_new < 0.06)] = 2
        y_new[(-1.04 <= y_new) & (y_new < -0.37)] = 1
        y_new[y_new < -1.04] = 0
        y_new = y_new[y_class.index]
    
        df = pd.concat([y_new,y_class],axis=1)
        df.columns = ['真实值','预测值']
        df.to_excel(save_path+'预测汇总/lstm画图.xls')
    
        fig = plt.figure(figsize=(12,8))
        plt.plot(df['真实值'])
        plt.plot(df['预测值'])
        plt.show()
        plt.close()
    
    
    
    
    #%%
    if __name__ == '__main__':
        open_path = 'data1115/'
        view_path = 'view_path/'
        save_path = 'save_path/'
    
        # Chinese font settings
        matplotlib.rcParams['axes.unicode_minus'] = False
        plt.rcParams['font.sans-serif'] = ['SimHei']
    
        df_hs,df_marco,df_backup = read_data(open_path)
        x,y = handle_data(df_hs,df_marco)
        x,y = again_handle_data(x,y,df_backup)
    
        x_train, x_test, y_train, y_test = pre_data(x, y)
    
    
        # Train the model, then save the fitted network
        model = run_model(x_train, y_train, save_path)
        model.save(save_path+'lstm.h5')

        trainScore, testScore = model_score(model, x_train, y_train, x_test, y_test)
        y_pred_prob, y_pred = predict_model(model,x_test,y_test,save_path,x,train_pos=0.9)
    
    
