美文网首页
数据挖掘实践任务4

数据挖掘实践任务4

作者: 乌和兔 | 来源:发表于2019-08-14 08:00 被阅读0次

    任务4:

    记录5个模型(逻辑回归、SVM、决策树、随机森林、XGBoost)关于accuracy、precision、recall、F1-score和auc值的评分表格,并画出ROC曲线。

    时间:2天

    结果

    import pandas as pd
    import numpy as np
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import OrdinalEncoder
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from xgboost import XGBClassifier
    import matplotlib.pyplot as plt
    from sklearn.preprocessing import minmax_scale
    from sklearn.metrics import precision_score, roc_curve, recall_score, f1_score, roc_auc_score, accuracy_score
    import warnings
    
    # Silence all warnings for the remainder of the script.
    warnings.filterwarnings("ignore")

    # Columns removed from the raw table before any further processing.
    delete = [
        'Unnamed: 0',
        'custid',
        'trade_no',
        'bank_card_no',
        'id_name',
        'latest_query_time',
        'source',
        'loans_latest_time',
        'first_transaction_time',
        'student_feature',
    ]

    # Load the GBK-encoded CSV and drop the columns listed above.
    data = pd.read_csv('data.csv', encoding='gbk').drop(delete, axis=1)

    # Fill missing values, one column at a time, with that column's most
    # frequent value.
    mode_imputer = SimpleImputer(strategy='most_frequent')
    for col_pos, _col_name in enumerate(data.columns):
        # scikit-learn estimators require a 2-D feature matrix, hence the
        # reshape of the single column to shape (n_rows, 1).
        column_2d = data.iloc[:, col_pos].values.reshape(-1, 1)
        data.iloc[:, col_pos] = mode_imputer.fit_transform(column_2d)

    # Encode the categorical feature as ordinal codes.
    preference_2d = data['reg_preference_for_trad'].values.reshape(-1, 1)
    data['reg_preference_for_trad'] = OrdinalEncoder().fit_transform(preference_2d)
    
    # Split into features / target and hold out 30% of the rows for testing.
    x = data.drop('status', axis=1)
    y = data.status
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2018, shuffle=True)

    # Min-max normalisation.
    # The scaler is fitted on the training split ONLY and the same fitted
    # transform is then applied to the test split. Scaling each split with
    # its own min/max (as calling minmax_scale on each split does) maps the
    # test rows into a different feature space from the one the models are
    # trained in, which distorts every test-set metric.
    from sklearn.preprocessing import MinMaxScaler  # local import: not in the file's import list

    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # Instantiate the five classifiers first ...
    LR = LogisticRegression()
    svc = SVC(kernel='linear', probability=True)  # probability=True enables predict_proba
    DT = DecisionTreeClassifier(max_depth=6)
    RF = RandomForestClassifier()
    XGB = XGBClassifier()

    # ... then fit them on the training split, in the same order as listed.
    models = [LR, svc, DT, RF, XGB]
    for estimator in models:
        estimator.fit(X_train, y_train)

    # Display names, position-aligned with `models`.
    names = ["LR", "SVC", 'DT', "RF", "Xgb"]
    evaluates = ['accuracy', 'precision', 'recall', 'f1', 'auc']
    
    # Score every fitted model on both splits, print a one-line report per
    # split, and collect a 2-row (train/test) x 5-metric frame per model.
    df_list = []
    metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC-Score']
    # (row label, report line format, features, target) for each split.
    splits = (
        ('train', '{}  训练集: accuracy:{:.3},precision:{:.3}, recall:{:.3}, f1:{:.3}, auc:{:.3}', X_train, y_train),
        ('test', '{}  测试集: accuracy:{:.3},precision:{:.3}, recall:{:.3}, f1:{:.3}, auc:{:.3}', X_test, y_test),
    )
    for name, model in zip(names, models):
        score_rows = []
        for _row_label, report_format, features, target in splits:
            # Hard 0/1 labels drive accuracy / precision / recall / F1.
            predicted_labels = model.predict(features)
            # AUC is computed from the positive-class probability (a
            # continuous score), not from the hard labels, so that the ROC
            # curve is swept over many thresholds.
            positive_proba = model.predict_proba(features)[:, 1]

            scores = [
                accuracy_score(target, predicted_labels),
                precision_score(target, predicted_labels),
                recall_score(target, predicted_labels),
                f1_score(target, predicted_labels),
                roc_auc_score(target, positive_proba),
            ]
            print(report_format.format(name, *scores))
            score_rows.append(scores)
        print('\n')

        df = pd.DataFrame(score_rows,
                          index=[row_label for row_label, *_ in splits],
                          columns=metric_columns)
        df_list.append(df)

    # Keep and show the combined table (outer index level = model name);
    # a bare expression here would be discarded when run as a script.
    score_table = pd.concat(df_list, axis=0, keys=names)
    print(score_table)
    
    def draw_roc_curve(train_pre_proba, test_pre_proba, train_auc, test_auc, model_name):
        """Plot the training and test ROC curves of one model on a single figure.

        Args:
            train_pre_proba: 3-tuple for the training split whose first two
                items are the false-positive rates and true-positive rates
                (the order produced by ``roc_curve``); the third item is
                not used for plotting.
            test_pre_proba: Same 3-tuple layout, for the test split.
            train_auc: Area under the training curve, shown in the legend.
            test_auc: Area under the test curve, shown in the legend.
            model_name: Appended to the figure title.

        Raises:
            ValueError: If either tuple does not unpack into exactly three items.
        """
        # The third element (thresholds when coming from roc_curve) is ignored.
        train_fpr, train_tpr, _ = train_pre_proba
        test_fpr, test_tpr, _ = test_pre_proba

        plt.figure()
        lw = 2
        # Distinct legend labels so the two curves can be told apart.
        plt.plot(train_fpr, train_tpr, color='darkorange',
                 lw=lw, label='Train ROC curve (area = %0.2f)' % train_auc)
        plt.plot(test_fpr, test_tpr, color='red',
                 lw=lw, label='Test ROC curve (area = %0.2f)' % test_auc)
        # Diagonal reference line from (0, 0) to (1, 1).
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Roc example ' + model_name)
        plt.legend(loc="lower right")
        plt.show()
    
    # Draw one figure per model showing its train and test ROC curves.
    for name, model in zip(names, models):
        curves = []
        areas = []
        for features, target in ((X_train, y_train), (X_test, y_test)):
            # Column 1 of predict_proba is used as the score for the ROC sweep.
            positive_proba = model.predict_proba(features)[:, 1]
            curves.append(roc_curve(target, positive_proba))
            areas.append(roc_auc_score(target, positive_proba))

        train_roc, test_roc = curves
        train_auc, test_auc = areas
        draw_roc_curve(train_roc, test_roc, train_auc, test_auc, name)
    

    输出结果:

    LR  训练集: accuracy:0.801,precision:0.749, recall:0.312, f1:0.44, auc:0.808
    LR  测试集: accuracy:0.786,precision:0.627, recall:0.37, f1:0.466, auc:0.77
    
    
    SVC  训练集: accuracy:0.796,precision:0.787, recall:0.257, f1:0.387, auc:0.816
    SVC  测试集: accuracy:0.773,precision:0.645, recall:0.217, f1:0.325, auc:0.773
    
    
    DT  训练集: accuracy:0.843,precision:0.773, recall:0.53, f1:0.629, auc:0.836
    DT  测试集: accuracy:0.641,precision:0.359, recall:0.546, f1:0.433, auc:0.671
    
    
    RF  训练集: accuracy:0.983,precision:0.997, recall:0.936, f1:0.966, auc:0.999
    RF  测试集: accuracy:0.718,precision:0.427, recall:0.351, f1:0.385, auc:0.675
    
    
    Xgb  训练集: accuracy:0.849,precision:0.861, recall:0.476, f1:0.613, auc:0.917
    Xgb  测试集: accuracy:0.77,precision:0.564, recall:0.379, f1:0.453, auc:0.757
    
    LR RF SVC DT Xgb

    相关文章

      网友评论

          本文标题:数据挖掘实践任务4

          本文链接:https://www.haomeiwen.com/subject/cyfwjctx.html