美文网首页
学习笔记:sklearn-决策树

学习笔记:sklearn-决策树

作者: zeolite | 来源:发表于2021-06-16 13:20 被阅读0次

    分类树
    数据准备

    # Classification tree on the sklearn wine dataset.
    from sklearn import tree
    from sklearn.datasets import load_wine
    from sklearn.model_selection import train_test_split

    wine = load_wine()

    # Hold out 20% of the samples as a test set.
    X_train, X_test, Y_train, Y_test = train_test_split(wine.data, wine.target, test_size=0.2)

    # Use information gain (entropy) as the split criterion.
    clf = tree.DecisionTreeClassifier(criterion='entropy')
    clf = clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)  # mean accuracy on the test set
    pred = clf.predict(X_test)  # predicted class labels for the test set
    

    查看特征名称

    # Feature names of the wine dataset.
    wine.feature_names
    

    查看类别名称

    # Class (target) names of the wine dataset.
    wine.target_names
    

    查看特征重要性

    # Impurity-based importance of each feature, learned by the fitted tree.
    clf.feature_importances_
    

    查看特征对应的重要性

    # Pair each feature name with its importance score.
    [*zip(wine.feature_names, clf.feature_importances_)]
    

    决策树随机分支 splitter = best/random
    最大深度 max_depth
    节点继续划分所需的最少样本数 min_samples_split
    叶节点所含的最少样本数 min_samples_leaf
    限制特征个数 max_features

    # splitter='best'/'random' controls how split points are searched;
    # max_depth, min_samples_leaf, min_samples_split and max_features
    # all limit tree growth to reduce overfitting.
    clf=tree.DecisionTreeClassifier(criterion='entropy', 
                                    random_state=0, 
                                    splitter='random',
                                    max_depth=3,
                                    min_samples_leaf=2,
                                    min_samples_split=3,
                                    max_features=10
                                   )
    

    回归树
    交叉验证 cross_val_score
    负的均方误差 -MSE neg_mean_squared_error

    # Regression tree scored with 10-fold cross-validation.
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.datasets import load_boston
    from sklearn.model_selection import cross_val_score

    # NOTE(review): load_boston was removed in scikit-learn 1.2 — this snippet
    # requires an older sklearn version; confirm before running.
    boston=load_boston()

    regr=DecisionTreeRegressor(random_state=0)
    # 'neg_mean_squared_error' yields -MSE, so values closer to 0 are better.
    score=cross_val_score(regr,boston.data, boston.target, cv=10, scoring='neg_mean_squared_error')
    score.mean()
    

    GridSearchCV使用 泰坦尼克号数据

    # Grid-search a decision tree on the Titanic training data.
    # FIX: the original snippet used np.linspace and train_test_split without
    # importing numpy or train_test_split, raising NameError.
    import numpy as np
    import pandas as pd
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split

    data=pd.read_csv(r'./train.csv', index_col=0)

    # Drop columns that are mostly missing or not directly usable as features.
    data.drop(['Cabin','Name','Ticket'], inplace=True, axis=1)

    # Fill missing ages with the mean age.
    data['Age']=data['Age'].fillna(data['Age'].mean())

    # Encode Sex as 0/1 (male -> 1).
    data['Sex']=(data['Sex']=='male').astype('int')

    # Encode Embarked categories as integer codes.
    labels=data['Embarked'].unique().tolist()
    data['Embarked']=data['Embarked'].apply(lambda x:labels.index(x))

    X=data.drop(columns='Survived')
    y=data['Survived']

    X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2)

    clf=DecisionTreeClassifier(random_state=0)

    # Hyper-parameter grid explored by GridSearchCV.
    params={'splitter':('best', 'random'),
           'criterion':('gini', 'entropy'),
           'max_depth':[*range(1,10)],
           'min_samples_leaf':[*range(1,50,5)],
           'min_impurity_decrease':[*np.linspace(0,0.5,20)]}

    GS=GridSearchCV(clf, params, cv=10)
    GS.fit(X_train, y_train)
    

    查看参数

    # Best hyper-parameter combination found by the grid search.
    GS.best_params_
    

    查看分数

    # Best mean cross-validated score of the grid search.
    GS.best_score_
    

    交叉验证

    # Refit a tree with the tuned hyper-parameters and report mean 10-fold CV accuracy.
    clf=DecisionTreeClassifier(random_state=0, criterion='entropy', max_depth=5, min_samples_leaf=1, splitter='random')
    score=cross_val_score(clf,X_train, y_train, cv=10)
    score.mean()
    

    随机森林分类

    # Random-forest classification on the wine dataset.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.datasets import load_wine
    from sklearn.model_selection import cross_val_score

    wine=load_wine()

    # Forest of 25 trees, scored by 10-fold cross-validation.
    # NOTE(review): cross_val_score fits clones, so `rfc` itself remains unfitted here.
    rfc=RandomForestClassifier(n_estimators=25)
    rfc_s=cross_val_score(rfc, wine.data, wine.target, cv=10)
    rfc_s.mean()
    

    查看随机森林中树的参数

    # FIX: cross_val_score fits clones of the estimator, so `rfc` itself is
    # still unfitted here and accessing `estimators_` raises NotFittedError.
    # Fit it on the full data first, then inspect the individual trees.
    rfc = rfc.fit(wine.data, wine.target)
    rfc.estimators_      # list of the 25 fitted decision trees
    rfc.estimators_[0]   # first tree in the forest
    

    bootstrap默认True 采用有放回随机抽样技术
    oob_score=True 使用袋外数据进行模型测试

    # bootstrap=True (default): each tree trains on a bootstrap sample;
    # oob_score=True evaluates the model on the out-of-bag samples.
    rfc=RandomForestClassifier(n_estimators=25, oob_score=True)
    rfc=rfc.fit(wine.data, wine.target)
    rfc.oob_score_  # out-of-bag accuracy estimate
    

    查看重要特征

    # Pair each feature name with the forest's importance score.
    [*zip(wine.feature_names, rfc.feature_importances_)]
    

    查看样本概率

    # Per-class probability estimates for every sample.
    rfc.predict_proba(wine.data)
    

    随机森林回归

    # Random-forest regression scored with 10-fold cross-validation.
    from sklearn.datasets import load_boston
    from sklearn.model_selection import cross_val_score
    from sklearn.ensemble import RandomForestRegressor

    # NOTE(review): load_boston was removed in scikit-learn 1.2 — this snippet
    # requires an older sklearn version; confirm before running.
    boston=load_boston()
    regressor=RandomForestRegressor(n_estimators=50, random_state=0)
    cross_val_score(regressor, boston.data, boston.target, cv=10)
    

    SimpleImputer填充nan值

    # Fill NaN values with SimpleImputer.
    # FIX: the original snippet used np.nan without importing numpy -> NameError.
    import numpy as np
    from sklearn.impute import SimpleImputer

    # NOTE(review): X_missing is a feature matrix containing NaNs, defined
    # elsewhere — confirm it is in scope before running.
    # Replace each NaN with its column mean...
    SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(X_missing)

    # ...or with a constant value (0 here).
    SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0).fit_transform(X_missing)
    

    随机森林分类 GridSearch使用

    # Random-forest classification tuned with GridSearchCV on breast-cancer data.
    from sklearn.datasets import load_breast_cancer
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import cross_val_score
    import pandas as pd
    import numpy as np

    data=load_breast_cancer()

    # n_estimators here is overridden by the 'n_estimators' entry in param_grid.
    rfc=RandomForestClassifier(n_estimators=50, random_state=10)
    param_grid={'criterion':['gini', 'entropy'],
               'max_depth':np.arange(1,10,1),
               'n_estimators': np.arange(1,100,10),
               'min_samples_leaf':np.arange(2,10,1),
               'min_samples_split':np.arange(2,7,1),
               'max_leaf_nodes':np.arange(25,50,1)}
    # NOTE(review): 2*9*10*8*5*25 = 180,000 combinations x 10 folds = 1.8M fits —
    # prohibitively slow; in practice tune one or two parameters at a time.
    GS=GridSearchCV(rfc, param_grid, cv=10)
    GS.fit(data.data, data.target)

    GS.best_params_  # best hyper-parameter combination found

    GS.best_score_   # best mean cross-validated score
    
    

    相关文章

      网友评论

          本文标题:学习笔记:sklearn-决策树

          本文链接:https://www.haomeiwen.com/subject/hlztyltx.html