Feature Selection

Author: Ary_zz | Published 2020-01-10 11:59


    Pearson correlation coefficient

    r = Σ_i (x_i − x̄)(y_i − ȳ) / ( √(Σ_i (x_i − x̄)²) · √(Σ_i (y_i − ȳ)²) )

    This measures linear correlation: take the absolute value of the Pearson correlation coefficient between the target and each numerical feature in the dataset, and keep the top n features by this criterion.
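
    All of the snippets below operate on a feature matrix X (a pandas DataFrame), a target vector y, and a desired number of selected features num_feats, none of which the post defines. A minimal setup sketch, using sklearn's breast-cancer dataset as stand-in data (any numeric DataFrame and target would do; num_feats is an arbitrary choice):

    import numpy as np
    import pandas as pd
    from sklearn.datasets import load_breast_cancer

    # stand-in data: 30 numeric features, binary target
    data = load_breast_cancer()
    X = pd.DataFrame(data.data, columns=data.feature_names)
    y = data.target
    num_feats = 10  # how many features to keep (arbitrary)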

    def cor_selector(X, y, num_feats):
        cor_list = []
        feature_name = X.columns.tolist()
        # calculate the correlation with y for each feature
        for i in X.columns.tolist():
            cor = np.corrcoef(X[i], y)[0, 1]
            cor_list.append(cor)
        # replace NaN with 0
        cor_list = [0 if np.isnan(i) else i for i in cor_list]
        # names of the num_feats features with the largest |correlation|
        cor_feature = X.iloc[:, np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()
        # support mask: True for selected features, False otherwise
        cor_support = [True if i in cor_feature else False for i in feature_name]
        return cor_support, cor_feature

    cor_support, cor_feature = cor_selector(X, y, num_feats)
    print(str(len(cor_feature)), 'selected features')
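
    For reference, the same top-n selection can be written more compactly with pandas; a sketch, reusing the X, y, and num_feats defined above:

    import pandas as pd

    # |Pearson r| of each column against y, with NaNs treated as 0
    cors = X.corrwith(pd.Series(y, index=X.index)).abs().fillna(0)
    cor_feature = cors.nlargest(num_feats).index.tolist()
    print(len(cor_feature), 'selected features')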
    
    import numpy as np
    from sklearn.feature_selection import SelectKBest
    from scipy.stats import pearsonr
    from sklearn.datasets import load_iris

    iris = load_iris()
    # Select the K best features and return the reduced data.
    # The first argument of SelectKBest is the scoring function: it takes
    # the feature matrix and target vector and returns an array of
    # (score, p-value) pairs, where the i-th entry belongs to the i-th
    # feature. Here it computes Pearson correlation coefficients.
    # The parameter k is the number of features to select.
    def multivariate_pearsonr(X, y):
        scores, pvalues = [], []
        for ret in map(lambda x: pearsonr(x, y), X.T):
            scores.append(abs(ret[0]))
            pvalues.append(ret[1])
        return (np.array(scores), np.array(pvalues))

    transformer = SelectKBest(score_func=multivariate_pearsonr, k=2)
    Xt_pearson = transformer.fit_transform(iris.data, iris.target)
    print(Xt_pearson)
    

    Chi-squared test

    Applies only to classification targets (binary or multiclass), and the features must be non-negative.
    Compute the chi-squared statistic between the target and each variable, and keep only the variables with the largest chi-squared values.



    Suppose the independent variable takes N distinct values and the dependent variable takes M distinct values. For samples where the independent variable equals i and the dependent variable equals j, consider the gap between the observed frequency and its expected value, and construct the statistic:


    χ² = Σ_{i=1..N} Σ_{j=1..M} (A_ij − E_ij)² / E_ij

    where A_ij is the observed frequency of samples with independent variable i and dependent variable j, and E_ij is the corresponding expected frequency under independence.
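
    As a quick sanity check of the formula, a tiny worked example on a hypothetical 2×2 contingency table (the counts are made up):

    import numpy as np

    # hypothetical observed counts: rows = feature value, columns = class
    A = np.array([[30, 10],
                  [20, 40]])
    # expected counts under independence: row_total * col_total / grand_total
    E = np.outer(A.sum(axis=1), A.sum(axis=0)) / A.sum()
    chi2_stat = ((A - E) ** 2 / E).sum()
    print(chi2_stat)  # ~16.67; a large value suggests feature and target are dependent
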
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import chi2

    # Select the K best features and return the reduced data
    SelectKBest(chi2, k=2).fit_transform(iris.data, iris.target)
    
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import chi2
    from sklearn.preprocessing import MinMaxScaler
    # chi2 requires non-negative features, so scale everything to [0, 1] first
    X_norm = MinMaxScaler().fit_transform(X)
    chi_selector = SelectKBest(chi2, k=num_feats)
    chi_selector.fit(X_norm, y)
    chi_support = chi_selector.get_support()
    chi_feature = X.loc[:,chi_support].columns.tolist()
    print(str(len(chi_feature)), 'selected features')
    

    Recursive feature elimination

    Recursively eliminates the least important features according to feature importance.

    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression

    rfe_selector = RFE(estimator=LogisticRegression(),
                       n_features_to_select=num_feats, step=10, verbose=5)
    rfe_selector.fit(X_norm, y)
    rfe_support = rfe_selector.get_support()
    rfe_feature = X.loc[:, rfe_support].columns.tolist()
    print(str(len(rfe_feature)), 'selected features')
    
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression

    # Recursive feature elimination; returns the data after selection
    # estimator is the base model
    # n_features_to_select is the number of features to keep
    RFE(estimator=LogisticRegression(), n_features_to_select=2).fit_transform(iris.data, iris.target)
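
    If num_feats is not known in advance, sklearn's RFECV variant picks the number of features by cross-validation instead; a minimal sketch along the same lines:

    from sklearn.datasets import load_iris
    from sklearn.feature_selection import RFECV
    from sklearn.linear_model import LogisticRegression

    iris = load_iris()
    # step=1: drop one feature per round; cv=5: keep the feature count
    # that maximizes the 5-fold cross-validated score
    rfecv = RFECV(estimator=LogisticRegression(max_iter=1000), step=1, cv=5)
    rfecv.fit(iris.data, iris.target)
    print(rfecv.n_features_)  # number of features RFECV chose to keep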
    

    Lasso: SelectFromModel

    Lasso and random forests both come with built-in feature selection. The Lasso's L1 regularizer forces many feature weights to zero.

    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import LogisticRegression

    # the L1 penalty requires the liblinear (or saga) solver
    embeded_lr_selector = SelectFromModel(
        LogisticRegression(penalty="l1", solver="liblinear"),
        max_features=num_feats)
    embeded_lr_selector.fit(X_norm, y)

    embeded_lr_support = embeded_lr_selector.get_support()
    embeded_lr_feature = X.loc[:, embeded_lr_support].columns.tolist()
    print(str(len(embeded_lr_feature)), 'selected features')
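
    Note that the snippet above uses L1-regularized logistic regression (a classifier) rather than the Lasso itself. For a regression target, the same embedded selection works with sklearn's Lasso; a minimal sketch on the diabetes dataset (a stand-in regression problem):

    from sklearn.datasets import load_diabetes
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import Lasso

    X_reg, y_reg = load_diabetes(return_X_y=True)
    # alpha controls sparsity: a larger alpha zeroes out more coefficients
    lasso_selector = SelectFromModel(Lasso(alpha=0.1))
    lasso_selector.fit(X_reg, y_reg)
    print(lasso_selector.get_support())  # boolean mask of kept features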
    

    Tree-based: SelectFromModel

    Use a random forest and select features by their importance. Feature importance is computed from the node impurity in each decision tree; in a random forest, the final importance of a feature is the average of its importance across all the trees.

    from sklearn.feature_selection import SelectFromModel
    from sklearn.ensemble import RandomForestClassifier

    embeded_rf_selector = SelectFromModel(
        RandomForestClassifier(n_estimators=100), max_features=num_feats)
    embeded_rf_selector.fit(X, y)

    embeded_rf_support = embeded_rf_selector.get_support()
    embeded_rf_feature = X.loc[:, embeded_rf_support].columns.tolist()
    print(str(len(embeded_rf_feature)), 'selected features')
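
    The importances that SelectFromModel thresholds can also be inspected directly through the fitted estimator's feature_importances_ attribute; a short sketch, reusing X, y, and num_feats from the setup above:

    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier

    rf = RandomForestClassifier(n_estimators=100).fit(X, y)
    # impurity-based importances, averaged over all trees; they sum to 1
    importances = pd.Series(rf.feature_importances_, index=X.columns)
    print(importances.sort_values(ascending=False).head(num_feats))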
    

    Combining with a GBDT model

    from sklearn.feature_selection import SelectFromModel
    from sklearn.ensemble import GradientBoostingClassifier

    # Feature selection with GBDT as the base model
    SelectFromModel(GradientBoostingClassifier()).fit_transform(iris.data, iris.target)
    

    You can use a LightGBM or XGBoost model instead, as long as it exposes a feature_importances_ attribute.

    from sklearn.feature_selection import SelectFromModel
    from lightgbm import LGBMClassifier

    lgbc = LGBMClassifier(n_estimators=500, learning_rate=0.05,
                          num_leaves=32, colsample_bytree=0.2,
                          reg_alpha=3, reg_lambda=1, min_split_gain=0.01,
                          min_child_weight=40)

    embeded_lgb_selector = SelectFromModel(lgbc, max_features=num_feats)
    embeded_lgb_selector.fit(X, y)

    embeded_lgb_support = embeded_lgb_selector.get_support()
    embeded_lgb_feature = X.loc[:, embeded_lgb_support].columns.tolist()
    print(str(len(embeded_lgb_feature)), 'selected features')
    

    Summary

    Running all of the selectors together:

    # put all selections together
    feature_selection_df = pd.DataFrame({
        'Feature': feature_name, 'Pearson': cor_support, 'Chi-2': chi_support,
        'RFE': rfe_support, 'Logistics': embeded_lr_support,
        'Random Forest': embeded_rf_support, 'LightGBM': embeded_lgb_support})
    # count how many methods selected each feature
    feature_selection_df['Total'] = feature_selection_df.drop(columns='Feature').sum(axis=1)
    # sort and display the top num_feats features
    feature_selection_df = feature_selection_df.sort_values(
        ['Total', 'Feature'], ascending=False)
    feature_selection_df.index = range(1, len(feature_selection_df) + 1)
    feature_selection_df.head(num_feats)
    
    
