分类树
数据准备
# Classification tree on the wine dataset: train/test split, fit with the
# entropy (information-gain) criterion, then score and predict on the
# held-out set.
from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
wine = load_wine()
# Hold out 20% for evaluation (no random_state, so the split differs run to run).
X_train, X_test, Y_train, Y_test = train_test_split(wine.data, wine.target, test_size=0.2)
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(X_train, Y_train)
# Mean accuracy on the held-out test set.
score = clf.score(X_test, Y_test)
# Predicted class labels for the test samples.
pred = clf.predict(X_test)
查看特征名称
wine.feature_names  # names of the input features
查看类别名称
wine.target_names  # names of the target classes
查看特征重要性
clf.feature_importances_  # impurity-based importance of each feature in the fitted tree
查看特征对应的重要性
# Pair each feature name with its importance in the fitted tree.
list(zip(wine.feature_names, clf.feature_importances_))
决策树随机分支 splitter = best/random
最大深度 max_depth
最少节点分支 min_samples_split
最少子节点分支 min_samples_leaf
限制特征个数 max_features
# Re-create the classifier with explicit regularization settings:
# splitter='random' picks split points at random, max_depth caps tree depth,
# min_samples_split / min_samples_leaf control minimum node sizes, and
# max_features limits how many features each split may consider.
clf = tree.DecisionTreeClassifier(
    criterion='entropy',
    splitter='random',
    max_depth=3,
    min_samples_split=3,
    min_samples_leaf=2,
    max_features=10,
    random_state=0,
)
回归树
交叉验证 cross_val_score
负的均方误差 -MSE neg_mean_squared_error
# Regression tree evaluated with 10-fold cross-validation on the Boston
# housing data, scored as negative mean squared error (higher is better).
# NOTE(review): load_boston was removed in scikit-learn 1.2 — this snippet
# needs scikit-learn < 1.2 (or a replacement dataset) to run.
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score
boston=load_boston()
regr=DecisionTreeRegressor(random_state=0)
score=cross_val_score(regr,boston.data, boston.target, cv=10, scoring='neg_mean_squared_error')
# Average negative MSE across the 10 folds.
score.mean()
GridSearchCV使用 泰坦尼克号数据
# Hyper-parameter search for a decision tree on the Titanic training data.
import pandas as pd
# FIX: np.linspace is used in the parameter grid below, but numpy was only
# imported in a much later section of this file — import it here so this
# section runs on its own.
import numpy as np
from sklearn.tree import DecisionTreeClassifier
# train_test_split added so this section is self-contained.
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
# Load the data, drop columns that are unusable as-is, and encode the
# remaining categorical features as integers.
data=pd.read_csv(r'./train.csv', index_col=0)
data.drop(['Cabin','Name','Ticket'], inplace=True, axis=1)
# Fill missing ages with the column mean.
data['Age']=data['Age'].fillna(data['Age'].mean())
# Binary-encode sex: male -> 1, female -> 0.
data['Sex']=(data['Sex']=='male').astype('int')
# Integer-encode the port of embarkation by its position in unique().
labels=data['Embarked'].unique().tolist()
data['Embarked']=data['Embarked'].apply(lambda x:labels.index(x))
X=data.drop(columns='Survived')
y=data['Survived']
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2)
clf=DecisionTreeClassifier(random_state=0)
# Candidate hyper-parameters; GridSearchCV evaluates every combination
# with 10-fold cross-validation.
params={'splitter':('best', 'random'),
'criterion':('gini', 'entropy'),
'max_depth':[*range(1,10)],
'min_samples_leaf':[*range(1,50,5)],
'min_impurity_decrease':[*np.linspace(0,0.5,20)]}
GS=GridSearchCV(clf, params, cv=10)
GS.fit(X_train, y_train)
查看参数
GS.best_params_  # hyper-parameter combination with the best CV score
查看分数
GS.best_score_  # mean cross-validated score of best_params_
交叉验证
# Evaluate a tree with hand-picked hyper-parameters via 10-fold cross-validation.
clf=DecisionTreeClassifier(random_state=0, criterion='entropy', max_depth=5, min_samples_leaf=1, splitter='random')
score=cross_val_score(clf,X_train, y_train, cv=10)
# Mean accuracy across the 10 folds.
score.mean()
随机森林分类
# Random-forest classification on the wine dataset, 10-fold cross-validation.
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_wine
from sklearn.model_selection import cross_val_score
wine=load_wine()
rfc=RandomForestClassifier(n_estimators=25)
rfc_s=cross_val_score(rfc, wine.data, wine.target, cv=10)
# Mean accuracy across the 10 folds.
rfc_s.mean()
查看随机森林中树的参数
# Inspect the individual trees inside the forest.
# FIX: cross_val_score fits clones of the estimator, so rfc itself was
# never fitted and accessing estimators_ raised NotFittedError — fit it
# on the full dataset first.
rfc = rfc.fit(wine.data, wine.target)
rfc.estimators_       # list of the 25 fitted DecisionTreeClassifier objects
rfc.estimators_[0]    # the first tree in the ensemble
bootstrap默认True 采用有放回随机抽样技术
oob_score=True 使用袋外数据进行模型测试
# With bootstrap sampling (the default), some samples are left out of each
# tree; oob_score=True uses them as a built-in validation set.
rfc=RandomForestClassifier(n_estimators=25, oob_score=True)
rfc=rfc.fit(wine.data, wine.target)
# Accuracy estimated on the out-of-bag samples.
rfc.oob_score_
查看重要特征
# Pair each feature name with its importance in the fitted forest.
list(zip(wine.feature_names, rfc.feature_importances_))
查看样本概率
rfc.predict_proba(wine.data)  # per-class probability for every sample
随机森林回归
# Random-forest regression on the Boston housing data, 10-fold cross-validation.
# NOTE(review): load_boston was removed in scikit-learn 1.2 — this snippet
# needs scikit-learn < 1.2 (or a replacement dataset) to run.
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
boston=load_boston()
regressor=RandomForestRegressor(n_estimators=50, random_state=0)
# Default scorer for a regressor is R^2; returns one score per fold.
cross_val_score(regressor, boston.data, boston.target, cv=10)
SimpleImputer填充nan值
# Fill NaN values with SimpleImputer: once with the column mean, once with
# the constant 0.
# FIX: np.nan is used below, but numpy was only imported in a later section
# of this file — import it here so this section runs on its own.
import numpy as np
from sklearn.impute import SimpleImputer
# NOTE(review): X_missing is not defined anywhere in this file — it is
# presumably a feature matrix containing NaNs; define it before running.
SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(X_missing)
SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0).fit_transform(X_missing)
随机森林分类 GridSearch使用
# Random-forest hyper-parameter search with GridSearchCV on the breast
# cancer dataset.
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
data=load_breast_cancer()
rfc=RandomForestClassifier(n_estimators=50, random_state=10)
# NOTE(review): this grid has 2*9*10*8*5*25 = 180,000 combinations, each
# fitted 10 times by cv=10 — expect a very long runtime.
param_grid={'criterion':['gini', 'entropy'],
'max_depth':np.arange(1,10,1),
'n_estimators': np.arange(1,100,10),
'min_samples_leaf':np.arange(2,10,1),
'min_samples_split':np.arange(2,7,1),
'max_leaf_nodes':np.arange(25,50,1)}
GS=GridSearchCV(rfc, param_grid, cv=10)
GS.fit(data.data, data.target)
# Best parameter combination and its mean cross-validated accuracy.
GS.best_params_
GS.best_score_
网友评论