5. SelectFromModel and Pipelines

Author: 羽天驿 | Published 2020-04-06 16:09

    1. Processing data with a Pipeline

    A Pipeline runs feature selection and model fitting together as a single estimator, e.g.:
    pipe = Pipeline([('select', <feature selector>), ('model', <estimator>)])
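    A minimal runnable sketch of this idea (the RFE + Ridge pairing is my own choice for illustration; load_boston assumes the pre-1.2 scikit-learn this notebook was written against):

    from sklearn.pipeline import Pipeline
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LinearRegression, Ridge
    from sklearn import datasets

    X, y = datasets.load_boston(return_X_y=True)

    # step 1: RFE keeps 4 features; step 2: Ridge is fit on those columns
    pipe = Pipeline([
        ('select', RFE(LinearRegression(), n_features_to_select=4)),
        ('model', Ridge()),
    ])
    pipe.fit(X, y)
    print(pipe.score(X, y))  # R^2 of the whole select-then-fit workflow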

    2. Code examples

    import numpy as np
    
    # CV = cross-validation; RidgeCV was covered earlier
    from sklearn.linear_model import RidgeCV  # tunes its regularization strength via CV
    from sklearn.feature_selection import RFE,RFECV
    
    from sklearn import datasets
    from sklearn.linear_model import LogisticRegression,LinearRegression
    
    from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
    
    from sklearn.ensemble import AdaBoostClassifier,AdaBoostRegressor
    
    from xgboost import XGBClassifier, XGBRegressor
    
    
    X, y = datasets.load_boston(return_X_y=True)
    # RFE (recursive feature elimination): fit, drop the least important
    # feature, refit on the rest, and repeat until 4 features remain
    rfe = RFE(LinearRegression(), n_features_to_select=4)
    
    X2 = rfe.fit_transform(X, y)
    display(X2[:10])
    rfe.get_support()
    
    array([[ 0.   ,  0.538,  6.575, 15.3  ],
           [ 0.   ,  0.469,  6.421, 17.8  ],
           [ 0.   ,  0.469,  7.185, 17.8  ],
           [ 0.   ,  0.458,  6.998, 18.7  ],
           [ 0.   ,  0.458,  7.147, 18.7  ],
           [ 0.   ,  0.458,  6.43 , 18.7  ],
           [ 0.   ,  0.524,  6.012, 15.2  ],
           [ 0.   ,  0.524,  6.172, 15.2  ],
           [ 0.   ,  0.524,  5.631, 15.2  ],
           [ 0.   ,  0.524,  6.004, 15.2  ]])
    
    
    
    
    
    array([False, False, False,  True,  True,  True, False, False, False,
           False,  True, False, False])
    
    # RFE kept features 3, 4, 5 and 10 (the True entries above)
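    To see which columns those are by name (again assuming the pre-1.2 scikit-learn where load_boston still ships):

    feature_names = datasets.load_boston().feature_names
    print(feature_names[rfe.get_support()])  # should print ['CHAS' 'NOX' 'RM' 'PTRATIO']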
    
    X, y = datasets.load_boston(return_X_y=True)
    while True:
        s, f = X.shape
        l = list(np.arange(f))
        if f == 4:
            break
        linear = LinearRegression()
        linear.fit(X, y)  # the coefficients change after every elimination
        # note: this takes the smallest *signed* coefficient, whereas RFE
        # eliminates by smallest absolute value, so the result differs
        index_min = np.argmin(linear.coef_)
        print('-------------------', index_min)
        l.remove(index_min)
        X = X[:, l]  # drop that column and repeat
    X[:10]
    
    ------------------- 4
    ------------------- 6
    ------------------- 8
    ------------------- 9
    ------------------- 0
    ------------------- 4
    ------------------- 1
    ------------------- 4
    ------------------- 3
    
    
    
    
    
    array([[ 18.   ,   0.   ,   6.575, 396.9  ],
           [  0.   ,   0.   ,   6.421, 396.9  ],
           [  0.   ,   0.   ,   7.185, 392.83 ],
           [  0.   ,   0.   ,   6.998, 394.63 ],
           [  0.   ,   0.   ,   7.147, 396.9  ],
           [  0.   ,   0.   ,   6.43 , 394.12 ],
           [ 12.5  ,   0.   ,   6.012, 395.6  ],
           [ 12.5  ,   0.   ,   6.172, 396.9  ],
           [ 12.5  ,   0.   ,   5.631, 386.63 ],
           [ 12.5  ,   0.   ,   6.004, 386.71 ]])
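    Switching the criterion to the absolute coefficient (my adaptation) mimics what RFE actually does, and should recover the same four features RFE kept above:

    X, y = datasets.load_boston(return_X_y=True)
    kept = list(range(X.shape[1]))  # track the original column indices
    while len(kept) > 4:
        linear = LinearRegression().fit(X[:, kept], y)
        drop = np.argmin(np.abs(linear.coef_))  # smallest |coefficient|, as RFE uses
        kept.pop(drop)
    print(kept)  # expected to match rfe.get_support() above: [3, 4, 5, 10]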
    
    X, y = datasets.load_boston(return_X_y=True)
    result = []
    while True:
        s, f = X.shape
        l = list(np.arange(f))
        if f == 9:
            break
        linear = LinearRegression()
        linear.fit(X, y)  # refit after each removal
        index_max = np.argmax(linear.coef_)  # largest signed coefficient
        print('-------------------', index_max)
        l.remove(index_max)
        result.append(X[:, index_max])  # keep the selected column
        X = X[:, l]  # continue with the remaining features
    np.asarray(result).T
    
    ------------------- 5
    ------------------- 3
    ------------------- 6
    ------------------- 1
    
    
    
    
    
    array([[ 6.575,  0.   ,  1.   , 18.   ],
           [ 6.421,  0.   ,  2.   ,  0.   ],
           [ 7.185,  0.   ,  2.   ,  0.   ],
           ...,
           [ 6.976,  0.   ,  1.   ,  0.   ],
           [ 6.794,  0.   ,  1.   ,  0.   ],
           [ 6.03 ,  0.   ,  1.   ,  0.   ]])
    
    X, y = datasets.load_boston(return_X_y=True)
    linear = LinearRegression()
    linear.fit(X, y)
    linear.coef_
    
    array([-1.08011358e-01,  4.64204584e-02,  2.05586264e-02,  2.68673382e+00,
           -1.77666112e+01,  3.80986521e+00,  6.92224640e-04, -1.47556685e+00,
            3.06049479e-01, -1.23345939e-02, -9.52747232e-01,  9.31168327e-03,
           -5.24758378e-01])
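    The section title mentions SelectFromModel, the non-recursive counterpart: it fits the estimator once and keeps the features with the largest weights. A minimal sketch (my own example; threshold=-np.inf combined with max_features is the usual way to keep a fixed feature count, and this one-shot selection can differ from RFE's recursive result):

    from sklearn.feature_selection import SelectFromModel
    
    X, y = datasets.load_boston(return_X_y=True)
    # one fit, then keep the 4 columns with the largest |coefficient|
    sfm = SelectFromModel(LinearRegression(), max_features=4, threshold=-np.inf)
    X2 = sfm.fit_transform(X, y)
    print(sfm.get_support())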
    

    Recursive feature elimination on classification data

    import warnings
    warnings.filterwarnings('ignore')  # silence sklearn convergence warnings
    
    X, y = datasets.load_wine(return_X_y=True)
    rfe = RFE(LogisticRegression(), n_features_to_select=5)
    rfe.fit(X, y)
    X2 = rfe.transform(X)
    display(X2[:5], rfe.get_support())
    
    array([[14.23,  2.43,  3.06,  5.64,  3.92],
           [13.2 ,  2.14,  2.76,  4.38,  3.4 ],
           [13.16,  2.67,  3.24,  5.68,  3.17],
           [14.37,  2.5 ,  3.49,  7.8 ,  3.45],
           [13.24,  2.87,  2.69,  4.32,  2.93]])
    
    
    
    array([ True, False,  True, False, False, False,  True, False, False,
            True, False,  True, False])
    
    lr = LogisticRegression()
    lr.fit(X, y)  # 3 classes -> 3 coefficient vectors, one per class
    np.abs(lr.coef_).mean(axis=0)  # average importance across classes
    
    array([0.2524188 , 0.4722555 , 0.08869262, 0.17935718, 0.01994251,
           0.29348657, 0.57124269, 0.01622212, 0.24376119, 0.78520237,
           0.15345722, 0.47092866, 0.00561662])
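    For a 2-D coef_ like this, scikit-learn's RFE actually ranks features by the squared coefficients summed across the class rows, so the mean of |coef_| above is an approximation of its criterion (based on the sklearn source I've checked):

    scores = (lr.coef_ ** 2).sum(axis=0)  # RFE-style score per feature
    print(np.argsort(scores)[-5:])  # indices of the 5 strongest features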
    
    X, y = datasets.load_wine(return_X_y=True)
    rfe = RFE(DecisionTreeClassifier(), n_features_to_select=5)
    rfe.fit(X, y)
    X2 = rfe.transform(X)
    display(X2[:5], rfe.get_support())
    
    array([[1.423e+01, 3.060e+00, 1.040e+00, 3.920e+00, 1.065e+03],
           [1.320e+01, 2.760e+00, 1.050e+00, 3.400e+00, 1.050e+03],
           [1.316e+01, 3.240e+00, 1.030e+00, 3.170e+00, 1.185e+03],
           [1.437e+01, 3.490e+00, 8.600e-01, 3.450e+00, 1.480e+03],
           [1.324e+01, 2.690e+00, 1.040e+00, 2.930e+00, 7.350e+02]])
    
    
    
    array([ True, False, False, False, False, False,  True, False, False,
           False,  True,  True,  True])
    
    dt = DecisionTreeClassifier()
    dt.fit(X, y)
    dt.feature_importances_
    
    array([0.01257056, 0.03982485, 0.        , 0.        , 0.        ,
           0.        , 0.14144668, 0.        , 0.        , 0.0534598 ,
           0.05818509, 0.31204257, 0.38247045])
    
    dt = DecisionTreeClassifier()
    dt.fit(X, y)  # a second fit: the importances shift because ties between equally good splits are broken at random
    dt.feature_importances_
    
    array([0.02679372, 0.        , 0.        , 0.        , 0.03297845,
           0.        , 0.16704836, 0.        , 0.        , 0.02048135,
           0.05818509, 0.31204257, 0.38247045])
    
    X, y = datasets.load_wine(return_X_y=True)
    rfe = RFE(AdaBoostClassifier(n_estimators=100), n_features_to_select=2)
    rfe.fit(X, y)
    X2 = rfe.transform(X)
    display(X2[:5], rfe.get_support())
    
    array([[3.06, 1.04],
           [2.76, 1.05],
           [3.24, 1.03],
           [3.49, 0.86],
           [2.69, 1.04]])
    
    
    
    array([False, False, False, False, False, False,  True, False, False,
           False,  True, False, False])
    
    ada = AdaBoostClassifier(n_estimators=100)
    ada.fit(X, y)
    ada.feature_importances_
    
    array([0.  , 0.  , 0.  , 0.  , 0.  , 0.01, 0.49, 0.  , 0.  , 0.01, 0.24,
           0.24, 0.01])
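    A single fit's top two importances need not coincide with RFE's pair, since RFE refits after every elimination; a quick check (my own addition):

    # indices of the two largest one-shot importances; RFE chose 6 and 10 above
    print(np.argsort(ada.feature_importances_)[-2:])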
    
    # RFECV: recursive feature elimination with cross-validation
    X, y = datasets.load_wine(return_X_y=True)
    rfecv = RFECV(estimator=AdaBoostClassifier(), min_features_to_select=2, cv=5, n_jobs=-1)
    X2 = rfecv.fit_transform(X, y)
    display(X2[:5], rfecv.get_support())
    
    array([[2.800e+00, 3.060e+00, 2.290e+00, 5.640e+00, 1.040e+00, 3.920e+00,
            1.065e+03],
           [2.650e+00, 2.760e+00, 1.280e+00, 4.380e+00, 1.050e+00, 3.400e+00,
            1.050e+03],
           [2.800e+00, 3.240e+00, 2.810e+00, 5.680e+00, 1.030e+00, 3.170e+00,
            1.185e+03],
           [3.850e+00, 3.490e+00, 2.180e+00, 7.800e+00, 8.600e-01, 3.450e+00,
            1.480e+03],
           [2.800e+00, 2.690e+00, 1.820e+00, 4.320e+00, 1.040e+00, 2.930e+00,
            7.350e+02]])
    
    
    
    array([False, False, False, False, False,  True,  True, False,  True,
            True,  True,  True,  True])
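    RFECV also records what cross-validation decided; n_features_ and ranking_ are handy for inspection (a quick check, not from the original notebook):

    print(rfecv.n_features_)  # number of features CV kept (7 here)
    print(rfecv.ranking_)     # rank 1 marks every retained feature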
    
