美文网首页人生苦短,我用python
机器学习入门-模型验证

机器学习入门-模型验证

作者: 雷小厮 | 来源:发表于2017-07-13 12:27 被阅读92次

    混淆矩阵

    # Fit a logistic regression on the full iris dataset and inspect its
    # predictions with accuracy, a confusion matrix and a classification report.
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    iris = load_iris()
    clf = LogisticRegression()
    clf.fit(iris.data, iris.target)
    predicted = clf.predict(iris.data)
    # Build the logistic regression model
    # Accuracy by hand: fraction of labels that match the predictions.
    # (Printed explicitly — as a bare expression it is discarded outside a notebook.)
    print(sum(iris.target == predicted) / len(iris.target))
    from sklearn.metrics import accuracy_score
    print(accuracy_score(iris.target, predicted))
    # Same accuracy via sklearn's built-in accuracy_score.
    # NOTE: accuracy on the training data alone is not very meaningful.
    from sklearn.metrics import confusion_matrix
    m = confusion_matrix(iris.target, predicted)
    # Confusion matrix of the logistic regression model.
    # FIX: '%pylab inline' is an IPython magic and is a syntax error in plain
    # Python; import matplotlib explicitly and call show() instead.
    import matplotlib.pyplot as plt
    import seaborn
    seaborn.heatmap(m)  # visualize the confusion matrix as a heatmap
    plt.show()
    from sklearn.metrics import classification_report
    print(classification_report(iris.target, predicted))
    # Classification report: per-class precision, recall and F1 to judge the model
    # 分类报告,得到分类结果的准确率,召回率,F1,判断模型好坏
    

    交叉验证

    Holdout验证

    随机选取大部分数据作训练数据集,剩余数据做验证数据集

    # Holdout validation: hold out a random third of the data for evaluation.
    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import confusion_matrix
    iris = load_iris()
    X = iris.data
    y = iris.target  # again the bundled iris dataset
    # train_test_split divides X, y into training and validation sets;
    # test_size is the validation fraction, random_state is an arbitrary seed
    # (different seeds give different splits).
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=0.33, random_state=123)
    clf = DecisionTreeClassifier()
    clf.fit(train_X, train_y)  # train the decision tree on the training split
    predicted = clf.predict(test_X)
    accuracy_score(test_y, predicted)  # accuracy on the held-out split
    m = confusion_matrix(test_y, predicted)  # confusion matrix on held-out data
    print(m)
    
    交叉验证

    将数据随机分成N份,将N-1份作为训练数据,1份作为验证数据,重复N次后平均

    # K-fold cross validation: split the data into 10 folds, train on 9 and
    # validate on the remaining one, then average the 10 accuracies.
    from sklearn.model_selection import KFold
    kf = KFold(n_splits=10)  # 10 folds
    acc = []
    for train_idx, test_idx in kf.split(X):
        train_X, train_y = X[train_idx], y[train_idx]
        test_X, test_y = X[test_idx], y[test_idx]
        clf = DecisionTreeClassifier()
        clf.fit(train_X, train_y)
        predicted = clf.predict(test_X)
        acc.append(accuracy_score(test_y, predicted))
    print(sum(acc) / len(acc))  # mean validation accuracy over the folds
    

    另一种方法

    # The same result in a single call: cross_val_score runs the CV loop for us.
    from sklearn.model_selection import cross_val_score
    # cv=10 means 10-fold cross validation; the result is an array with one
    # accuracy per fold.
    acc = cross_val_score(clf, X=iris.data, y=iris.target, cv=10)
    print(acc.mean())
    
    留一验证

    N-1个数据做训练,1个数据做验证,重复N次(相当于交叉验证分成N(=数据量)份)

    # Leave-one-out validation: train on N-1 samples and test on the single
    # remaining one, repeated N times (K-fold with K equal to the dataset size).
    from sklearn.model_selection import LeaveOneOut
    res = []
    loo = LeaveOneOut()
    for train, test in loo.split(X):
        train_X, test_X, train_y, test_y = X[train], X[test], y[train], y[test]
        clf = DecisionTreeClassifier()
        clf.fit(train_X, train_y)
        predicted = clf.predict(test_X)
        # collect one True/False per left-out sample
        res.extend((predicted == test_y).tolist())
    # FIX: the original left 'sum(res)' as a bare expression, whose value is
    # silently discarded outside a notebook — print the correct-count instead.
    print(sum(res))
    

    ROC曲线评价分类模型

    生成ROC曲线

    # Plot the ROC curve of a decision tree on a binary sub-problem of iris.
    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier
    from sklearn import preprocessing
    iris = load_iris()
    X = iris.data[50:150, ]  # ROC needs a binary problem: keep only two classes
    le = preprocessing.LabelEncoder()
    # The selected targets are 1 and 2; re-encode them as 0 and 1
    y = le.fit_transform(iris.target[50:150])
    from sklearn.model_selection import train_test_split
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.33, random_state=123)
    clf = DecisionTreeClassifier()
    # FIX: the original called clf.fit twice (once standalone, once chained
    # before predict_proba), training the same model twice for no reason.
    clf.fit(train_X, train_y)
    probas_ = clf.predict_proba(test_X)
    from sklearn.metrics import roc_curve, auc
    # false positive rate and true positive rate at each probability threshold
    fpr, tpr, thresholds = roc_curve(test_y, probas_[:, 1])
    import matplotlib.pyplot as plt
    plt.plot(fpr, tpr, label='ROC curve')
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    plt.show()
    
    ROC Curve

    计算AUC(area under the curve)
    AUC越大,模型越准确

    # AUC (area under the ROC curve): the larger, the better the classifier.
    from sklearn.metrics import auc
    roc_auc = auc(fpr, tpr)
    print('Area under the curve:{}'.format(roc_auc))
    

    不同模型ROC曲线对比

    # Compare ROC curves (and AUC scores) of four classifiers on the same split.
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.svm import SVC
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    clf1 = DecisionTreeClassifier()
    clf2 = SVC(probability=True)  # probability=True is required for predict_proba
    clf3 = LogisticRegression()
    clf4 = RandomForestClassifier()
    from sklearn.metrics import roc_curve, auc
    plt.figure(figsize=[20, 10])
    # FIX: each model is fitted inside the loop below, so the separate fit
    # calls the original made before the loop trained every model twice;
    # those redundant pre-loop fits have been removed.
    for clf, title in zip([clf1, clf2, clf3, clf4],
                          ['Decision Tree', 'SVM', 'LogisticRegression', 'RandomForest']):
        probas_ = clf.fit(train_X, train_y).predict_proba(test_X)
        fpr, tpr, thresholds = roc_curve(test_y, probas_[:, 1])
        plt.plot(fpr, tpr, label='%s-AUC:%.2f' % (title, auc(fpr, tpr)))
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate', fontsize=20)
    plt.ylabel('True Positive Rate', fontsize=20)
    plt.title('ROC Curve', fontsize=20)
    plt.legend(loc='lower right', fontsize=20)
    plt.show()
    
    ROC曲线对比

    按模型中维度重要性排序

    # Rank the features of the fitted decision tree by importance.
    import numpy as np
    columns = np.array(iris.feature_names)  # list -> array so we can fancy-index it
    # clf1.feature_importances_ gives one importance score per feature;
    # argsort() returns the indices that sort it ascending, [::-1] reverses
    # that to descending, so 'importance' is the feature names ordered from
    # most to least important.
    importance = columns[clf1.feature_importances_.argsort()[::-1]]
    print(importance)
    # Visualize the importance ranking as a bar chart.
    import matplotlib.pyplot as plt
    # BUG FIX: the original assigned to 'featur_importance' (typo) and then
    # referenced 'feature_importance', which raised a NameError at runtime.
    feature_importance = clf1.feature_importances_
    plt.title('Feature Importance')
    plt.bar(range(0, len(importance)), feature_importance[feature_importance.argsort()[::-1]])
    plt.xticks(range(0, len(importance)), importance, rotation=90)
    plt.show()
    
    特征维度重要性排序

    相关文章

      网友评论

      本文标题:机器学习入门-模型验证

      本文链接:https://www.haomeiwen.com/subject/epgbhxtx.html