1. Processing Data with a Pipeline
A Pipeline runs the two steps, feature selection and model prediction, together as one combined step.
pipe = Pipeline([('select', <feature selector>), ('model', <estimator>)])
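A minimal runnable sketch of this idea (the step names 'select' and 'model' are arbitrary labels chosen here for illustration, not fixed API names):
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
X, y = datasets.load_wine(return_X_y=True)
# feature selection and the final estimator are fit as one object
pipe = Pipeline([('select', RFE(LogisticRegression(), n_features_to_select=5)),
                 ('model', LogisticRegression())])
pipe.fit(X, y)           # RFE runs first, then the model is fit on the selected columns
print(pipe.score(X, y))  # prediction also passes through the selector automatically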
2. Code Example
import numpy as np
# CV = cross-validation; RidgeCV was covered earlier
from sklearn.linear_model import RidgeCV  # parameter selection
from sklearn.feature_selection import RFE,RFECV
from sklearn import datasets
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.ensemble import AdaBoostClassifier,AdaBoostRegressor
from xgboost import XGBClassifier,XGBRegressor
X, y = datasets.load_boston(True)
# RFE (recursive feature elimination): drop the least important feature,
# refit, and repeat, removing one feature per round
rfe = RFE(LinearRegression(),n_features_to_select=4)
X2 = rfe.fit_transform(X,y)
display(X2[:10])
rfe.get_support()
array([[ 0. , 0.538, 6.575, 15.3 ],
[ 0. , 0.469, 6.421, 17.8 ],
[ 0. , 0.469, 7.185, 17.8 ],
[ 0. , 0.458, 6.998, 18.7 ],
[ 0. , 0.458, 7.147, 18.7 ],
[ 0. , 0.458, 6.43 , 18.7 ],
[ 0. , 0.524, 6.012, 15.2 ],
[ 0. , 0.524, 6.172, 15.2 ],
[ 0. , 0.524, 5.631, 15.2 ],
[ 0. , 0.524, 6.004, 15.2 ]])
array([False, False, False, True, True, True, False, False, False,
False, True, False, False])
# With RFE (recursive feature elimination), the retained features are those at indices 3, 4, 5, and 10
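RFE also records the elimination order in its ranking_ attribute; a quick sketch, reusing the rfe object fitted above:
print(rfe.ranking_)              # 1 = retained; the highest rank was eliminated first
print(np.argsort(rfe.ranking_))  # feature indices ordered from kept to first-dropped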
X, y = datasets.load_boston(True)
while True:
    s, f = X.shape
    l = list(np.arange(f))
    if f == 4:
        break
    linear = LinearRegression()
    linear.fit(X, y)  # the weights change after each refit
    # NOTE: sklearn's RFE ranks by |coef_|; argmin on the raw coefficients,
    # as done here for the demo, removes the most negative coefficient instead
    index_min = np.argmin(linear.coef_)
    print('-------------------', index_min)
    l.remove(index_min)
    X = X[:, l]  # drop the column with the smallest weight (feature elimination)
X[:10]
------------------- 4
------------------- 6
------------------- 8
------------------- 9
------------------- 0
------------------- 4
------------------- 1
------------------- 4
------------------- 3
array([[ 18. , 0. , 6.575, 396.9 ],
[ 0. , 0. , 6.421, 396.9 ],
[ 0. , 0. , 7.185, 392.83 ],
[ 0. , 0. , 6.998, 394.63 ],
[ 0. , 0. , 7.147, 396.9 ],
[ 0. , 0. , 6.43 , 394.12 ],
[ 12.5 , 0. , 6.012, 395.6 ],
[ 12.5 , 0. , 6.172, 396.9 ],
[ 12.5 , 0. , 5.631, 386.63 ],
[ 12.5 , 0. , 6.004, 386.71 ]])
X, y = datasets.load_boston(True)
result = []
while True:
    s, f = X.shape
    l = list(np.arange(f))
    if f == 9:
        break
    linear = LinearRegression()
    linear.fit(X, y)  # the weights change after each refit
    index_max = np.argmax(linear.coef_)
    print('-------------------', index_max)
    l.remove(index_max)
    result.append(X[:, index_max])  # extract and keep the max-weight column
    X = X[:, l]  # continue with the remaining columns
np.asarray(result).T
------------------- 5
------------------- 3
------------------- 6
------------------- 1
array([[ 6.575, 0. , 1. , 18. ],
[ 6.421, 0. , 2. , 0. ],
[ 7.185, 0. , 2. , 0. ],
...,
[ 6.976, 0. , 1. , 0. ],
[ 6.794, 0. , 1. , 0. ],
[ 6.03 , 0. , 1. , 0. ]])
X,y = datasets.load_boston(True)
linear = LinearRegression()
linear.fit(X,y)
linear.coef_
array([-1.08011358e-01, 4.64204584e-02, 2.05586264e-02, 2.68673382e+00,
-1.77666112e+01, 3.80986521e+00, 6.92224640e-04, -1.47556685e+00,
3.06049479e-01, -1.23345939e-02, -9.52747232e-01, 9.31168327e-03,
-5.24758378e-01])
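The raw coefficients above mix sign and scale. For a linear estimator, RFE's actual elimination criterion is the absolute coefficient value, so a rough sketch of the first-round ranking (with the caveat that in practice the features should be scaled first for |coef_| to be comparable):
order = np.argsort(np.abs(linear.coef_))  # smallest |coef| first
print(order)  # order[0] is the first feature RFE would eliminate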
Recursive feature elimination on classification data
import warnings
warnings.filterwarnings('ignore')
X,y = datasets.load_wine(True)
rfe = RFE(LogisticRegression(),n_features_to_select = 5)
rfe.fit(X,y)
X2 = rfe.transform(X)
display(X2[:5],rfe.get_support())
array([[14.23, 2.43, 3.06, 5.64, 3.92],
[13.2 , 2.14, 2.76, 4.38, 3.4 ],
[13.16, 2.67, 3.24, 5.68, 3.17],
[14.37, 2.5 , 3.49, 7.8 , 3.45],
[13.24, 2.87, 2.69, 4.32, 2.93]])
array([ True, False, True, False, False, False, True, False, False,
True, False, True, False])
lr = LogisticRegression()
lr.fit(X, y)  # three classes, so three equations: coef_ has one row per class
np.abs(lr.coef_).mean(axis=0)
array([0.2524188 , 0.4722555 , 0.08869262, 0.17935718, 0.01994251,
0.29348657, 0.57124269, 0.01622212, 0.24376119, 0.78520237,
0.15345722, 0.47092866, 0.00561662])
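As a sanity check, the five largest mean-|coefficient| features can be compared with what RFE kept above; they need not match exactly, since RFE refits and re-ranks after every elimination:
top5 = np.argsort(np.abs(lr.coef_).mean(axis=0))[::-1][:5]
print(np.sort(top5))  # compare with the True positions in rfe.get_support()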
X,y = datasets.load_wine(True)
rfe = RFE(DecisionTreeClassifier(),n_features_to_select = 5)
rfe.fit(X,y)
X2 = rfe.transform(X)
display(X2[:5],rfe.get_support())
array([[1.423e+01, 3.060e+00, 1.040e+00, 3.920e+00, 1.065e+03],
[1.320e+01, 2.760e+00, 1.050e+00, 3.400e+00, 1.050e+03],
[1.316e+01, 3.240e+00, 1.030e+00, 3.170e+00, 1.185e+03],
[1.437e+01, 3.490e+00, 8.600e-01, 3.450e+00, 1.480e+03],
[1.324e+01, 2.690e+00, 1.040e+00, 2.930e+00, 7.350e+02]])
array([ True, False, False, False, False, False, True, False, False,
False, True, True, True])
dt = DecisionTreeClassifier()
dt.fit(X,y)
dt.feature_importances_
array([0.01257056, 0.03982485, 0. , 0. , 0. ,
0. , 0.14144668, 0. , 0. , 0.0534598 ,
0.05818509, 0.31204257, 0.38247045])
dt = DecisionTreeClassifier()
dt.fit(X, y)
dt.feature_importances_  # a second run gives different importances: the tree permutes features randomly at each split
array([0.02679372, 0. , 0. , 0. , 0.03297845,
0. , 0.16704836, 0. , 0. , 0.02048135,
0.05818509, 0.31204257, 0.38247045])
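Because of that randomness, fixing random_state makes the tree-based RFE result reproducible; a minimal sketch:
dt = DecisionTreeClassifier(random_state=0)
rfe = RFE(dt, n_features_to_select=5)
rfe.fit(X, y)
print(rfe.get_support())  # identical on every run now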
X,y = datasets.load_wine(True)
rfe = RFE(AdaBoostClassifier(n_estimators=100), n_features_to_select=2)
rfe.fit(X,y)
X2 = rfe.transform(X)
display(X2[:5],rfe.get_support())
array([[3.06, 1.04],
[2.76, 1.05],
[3.24, 1.03],
[3.49, 0.86],
[2.69, 1.04]])
array([False, False, False, False, False, False, True, False, False,
False, True, False, False])
ada = AdaBoostClassifier(n_estimators=100)
ada.fit(X,y)
ada.feature_importances_
array([0. , 0. , 0. , 0. , 0. , 0.01, 0.49, 0. , 0. , 0.01, 0.24,
0.24, 0.01])
# CV = cross-validation!
X, y = datasets.load_wine(True)
rfecv = RFECV(estimator=AdaBoostClassifier(), min_features_to_select=2, cv=5, n_jobs=-1)
X2 = rfecv.fit_transform(X,y)
display(X2[:5],rfecv.get_support())
array([[2.800e+00, 3.060e+00, 2.290e+00, 5.640e+00, 1.040e+00, 3.920e+00,
1.065e+03],
[2.650e+00, 2.760e+00, 1.280e+00, 4.380e+00, 1.050e+00, 3.400e+00,
1.050e+03],
[2.800e+00, 3.240e+00, 2.810e+00, 5.680e+00, 1.030e+00, 3.170e+00,
1.185e+03],
[3.850e+00, 3.490e+00, 2.180e+00, 7.800e+00, 8.600e-01, 3.450e+00,
1.480e+03],
[2.800e+00, 2.690e+00, 1.820e+00, 4.320e+00, 1.040e+00, 2.930e+00,
7.350e+02]])
array([False, False, False, False, False, True, True, False, True,
True, True, True, True])
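RFECV chose the feature count itself; the fitted object exposes how many features survived and the cross-validation score for each subset size (grid_scores_ assumes the older sklearn used in this post; newer releases expose cv_results_ instead):
print(rfecv.n_features_)   # number of features RFECV decided to keep
print(rfecv.ranking_)      # 1 = kept
print(rfecv.grid_scores_)  # mean CV score per feature-subset size (older sklearn)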