Bagging与Boosting

作者: 阿发贝塔伽马 | 来源:发表于2018-06-29 23:53 被阅读0次

    加载数据

    # Download the UCI wine dataset (the file has no header row) and label its columns.
    import pandas as pd

    wine_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
    wine_columns = [
        'Class label',
        'Alcohol',
        'Malic acid',
        'Ash',
        'Alcalinity of ash',
        'Magnesium',
        'Total phenols',
        'Flavanoids',
        'Nonflavanoid phenols',
        'Proanthocyanins',
        'Color intensity',
        'Hue',
        'OD280/OD315 of diluted wines',
        'Proline',
    ]
    df_wine = pd.read_csv(wine_url, header=None)
    df_wine.columns = wine_columns
    # The first column is the target; keep it as a plain array for the models below.
    y = df_wine['Class label'].values
    
    

    特征选择

    为了方便后面可视化,我们只选取2个特征,通过各自变量与因变量y的相关系数来选择

    # pearsonr returns the correlation coefficient r together with its p-value.
    # A small p-value (e.g. p < 0.01) means the correlation is statistically
    # significant; how strong the relationship is, is given by |r|.
    from scipy.stats import pearsonr

    lable = df_wine.values[:, 0]
    # Pair each column's (r, p) result with its column index and order the pairs
    # ascending, i.e. from the most negative correlation to the most positive one.
    lr = sorted(
        [pearsonr(lable, column), idx]
        for idx, column in enumerate(df_wine.values.T)
    )
    # Column 0 is the label itself; its correlation with itself is 1, so it is
    # expected to sort last. The two features kept are therefore the first entry
    # (most negative r) and the second-to-last entry (most positive r).
    most_negative_idx = lr[0][1]
    most_positive_idx = lr[-2][1]
    X = df_wine.iloc[:, [most_negative_idx, most_positive_idx]].values
    

    还可以通过PCA降维来选择,本例降维后分类效果并不好

    # Reduce the feature columns to two dimensions with PCA.
    # (The snippet originally shown here was a verbatim copy of the Pearson
    # feature-selection code above and did not perform any PCA.)
    from sklearn.decomposition import PCA

    # Column 0 is the class label, so only the remaining columns are features.
    X = df_wine.iloc[:, 1:].values
    pca = PCA(n_components=2)
    # PCA is unsupervised: the projection is fitted on the features alone.
    X = pca.fit_transform(X)
    

    因为这里有标签,还可以通过LDA来降维选择,效果比较好,数据分类达到100%正确

    # Supervised dimensionality reduction: project the features onto the two
    # linear discriminants fitted against the class labels y.
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

    # Every column except column 0 (the class label) is a feature.
    X = df_wine.iloc[:, 1:].values
    lda = LinearDiscriminantAnalysis(n_components=2)
    X = lda.fit_transform(X, y)
    

    调参,这里只调一个决策树深度参数

    # Tune the decision-tree depth with a 10-fold cross-validated grid search.
    # train_test_split and GridSearchCV live in sklearn.model_selection; the old
    # sklearn.cross_validation / sklearn.grid_search modules were removed in
    # scikit-learn 0.20.
    from sklearn.preprocessing import LabelEncoder  # imported by the original snippet; not used in it
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import GridSearchCV
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import BaggingClassifier

    # Hold out 30% of the samples as the test set.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
    # Candidate depths 1..19.
    param_test1 = {'max_depth': list(range(1, 20))}
    gsearch1 = GridSearchCV(
        estimator=DecisionTreeClassifier(criterion="entropy", random_state=10),
        param_grid=param_test1,
        cv=10,
    )
    gsearch1.fit(X_train, y_train)
    # Per-candidate scores are available in gsearch1.cv_results_ (the old
    # grid_scores_ attribute no longer exists).
    print(gsearch1.best_params_)
    print(gsearch1.best_score_)
    

    输出

    {'max_depth': 8}
    0.822580645161
    
    度量单个决策树的准确性
    # Fit a single decision tree at the depth chosen by the grid search and
    # report its accuracy on the training and test sets.
    from sklearn.metrics import accuracy_score

    best_depth = gsearch1.best_params_['max_depth']
    tree = DecisionTreeClassifier(criterion="entropy", max_depth=best_depth)
    tree = tree.fit(X_train, y_train)
    y_train_pred, y_test_pred = tree.predict(X_train), tree.predict(X_test)

    tree_train, tree_test = (
        accuracy_score(y_train, y_train_pred),
        accuracy_score(y_test, y_test_pred),
    )
    print('Decision tree train/test accuracies %.3f/%.3f' % (tree_train, tree_test))
    
    Decision tree train/test accuracies 0.984/0.815
    
    # Build a bagging ensemble of 50 decision trees; see the official
    # scikit-learn documentation for the full list of parameters.
    # The base tree is passed positionally on purpose: the keyword for it was
    # `base_estimator` in older scikit-learn releases and is `estimator` in
    # newer ones, while the first positional argument is the base estimator in
    # both, so this call works across versions.
    bag = BaggingClassifier(tree, n_estimators=50,
                            max_samples=1.0, max_features=1.0,
                            bootstrap=True, bootstrap_features=False,
                            n_jobs=1, random_state=1)

    # Measure the accuracy of the bagging classifier on both splits.
    bag = bag.fit(X_train, y_train)
    y_train_pred = bag.predict(X_train)
    y_test_pred = bag.predict(X_test)
    bag_train = accuracy_score(y_train, y_train_pred)
    bag_test = accuracy_score(y_test, y_test_pred)
    print('Bagging train/test accuracies %.3f/%.3f' % (bag_train, bag_test))
    

    Bagging分类器的效果的确要比单个决策树的效果好,提高了一点

    Bagging train/test accuracies 1.000/0.852
    

    Boosting分类器:Bagging是投票平均模式,而Boosting是串行训练模式,每一轮都加大上一轮被错分样本的权重,最后把各个弱分类器加权组合

    # AdaBoostClassifier was never imported by the earlier snippets, so import
    # it here; without this the next line raises NameError.
    from sklearn.ensemble import AdaBoostClassifier

    # Boost 1000 rounds on top of the tuned decision tree. The base tree is
    # passed positionally because its keyword name differs between scikit-learn
    # releases (`base_estimator` in older ones, `estimator` in newer ones).
    ada = AdaBoostClassifier(tree, n_estimators=1000, learning_rate=0.1, random_state=0)
    ada = ada.fit(X_train, y_train)
    y_train_pred = ada.predict(X_train)
    y_test_pred = ada.predict(X_test)
    # Measure the accuracy of the boosted classifier on both splits.
    ada_train = accuracy_score(y_train, y_train_pred)
    ada_test = accuracy_score(y_test, y_test_pred)
    print('AdaBoost train/test accuracies %.3f/%.3f' % (ada_train, ada_test))
    

    相关文章

      网友评论

        本文标题:Bagging与Boosting

        本文链接:https://www.haomeiwen.com/subject/lkbayftx.html