美文网首页
06基于随机森林模型的心脏病患者预测分类

06基于随机森林模型的心脏病患者预测分类

作者: Jachin111 | 来源:发表于2022-12-31 19:37 被阅读0次

    导入库

    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.tree import export_graphviz
    from sklearn.metrics import roc_curve,auc
    from sklearn.metrics import classification_report
    from sklearn.metrics import confusion_matrix
    from sklearn.model_selection import train_test_split
    import eli5
    from eli5.sklearn import PermutationImportance
    import shap
    from pdpbox import pdp,info_plots
    
    np.random.seed(123)
    pd.options.mode.chained_assignment = None
    
    import warnings
    warnings.filterwarnings("ignore")
    

    数据探索EDA

    # 导入数据
    df = pd.read_excel("heart.xlsx")
    df.head()
    
    image.png
    # 缺失值情况
    df.isnull().sum()
    
    image.png

    字段转化

    # 转化编码
    df["sex"][df["sex"]==0] = "female"
    df["sex"][df["sex"]==1] = "male"
    
    df["cp"][df["cp"]==1] = "typical angina"
    df["cp"][df["cp"]==2] = "atypical angina"
    df["cp"][df["cp"]==3] = "non-anginal pain"
    df["cp"][df["cp"]==4] = "asymptomatic"
    
    df["fbs"][df["fbs"]==0] = "lower than 120mg/ml"
    df["fbs"][df["fbs"]==1] = "greater than 120mg/ml"
    
    df["restecg"][df["restecg"]==0] = "normal"
    df["restecg"][df["restecg"]==1] = "ST-T wave abnormality"
    df["restecg"][df["restecg"]==2] = "left ventricular hypertrophy"
    
    df["exang"][df["exang"]==0] = "no"
    df["exang"][df["exang"]==1] = "yes"
    
    df["slope"][df["slope"]==1] = "upsloping"
    df["slope"][df["slope"]==2] = "flat"
    df["slope"][df["slope"]==3] = "downsloping"
    
    df["thal"][df["thal"]==1] = "normal"
    df["thal"][df["thal"]==2] = "fixed defect"
    df["thal"][df["thal"]==3] = "reversable defect"
    
    # 字段类型转化
    df["sex"] = df["sex"].astype("object")
    df["cp"] = df["cp"].astype("object")
    df["fbs"] = df["fbs"].astype("object")
    df["restecg"] = df["restecg"].astype("object")
    df["exang"] = df["exang"].astype("object")
    df["slope"] = df["slope"].astype("object")
    df["thal"] = df["thal"].astype("object")
    
    # 生成哑变量
    df = pd.get_dummies(df,drop_first=True)
    df
    
    image.png

    随机森林RandomForest

    # 切分数据
    X = df.drop("target",1)
    y = df["target"]
    
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=10)
    X_train
    
    image.png
    # 建模
    rf = RandomForestClassifier(max_depth=5)
    rf.fit(X_train,y_train)
    
    image.png
    3个重要属性
    查看森林中树的状况:estimators_
    袋外估计准确率得分:oob_score_,必须是oob_score参数选择True的时候才可用
    变量的重要性:feature_importances_
    # 决策树可视化
    estimator = rf.estimators_[1]
    feature_names = [i for i in X_train.columns]
    
    y_train_str = y_train.astype("str")
    y_train_str[y_train_str=="0"] = "no disease"
    y_train_str[y_train_str=="1"] = "disease"
    y_train_str = y_train_str.values
    y_train_str[:5]
    
    image.png
    export_graphviz(
        estimator,   
        out_file='tree.dot',   
        feature_names = feature_names,  
        class_names = y_train_str,  
        rounded = True, 
        proportion = True, 
        label='root',
        precision = 2, 
        filled = True
    )
    
    from subprocess import call
    call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
    
    from IPython.display import Image
    Image(filename = 'tree.png')
    
    # 模型得分验证
    y_predict = rf.predict(X_test)
    y_pred_quant = rf.predict_proba(X_test)[:,1]
    y_pred_bin = rf.predict(X_test)
    
    confusion_matrix = confusion_matrix(y_test,y_pred_bin)
    confusion_matrix
    
    image.png
    total = sum(sum(confusion_matrix))
    sensitivity = confusion_matrix[0,0]/(confusion_matrix[0,0]+confusion_matrix[1,0])
    print("Sensitivity: ",sensitivity)
    
    image.png
    specificity = confusion_matrix[1,1]/(confusion_matrix[1,1]+confusion_matrix[0,1])
    print("Specificity: ",specificity)
    
    image.png
    # 绘制ROC曲线
    fpr, tpr, thresholds = roc_curve(y_test,y_pred_quant)
    
    fig, ax = plt.subplots()
    ax.plot(fpr,tpr)
    ax.plot([0,1],[0,1],transform=ax.transAxes,ls="--",c=".3")
    plt.xlim([0.0,1.0])
    plt.ylim([0.0,1.0])
    plt.rcParams["font.size"] = 12
    plt.title("ROC Curve")
    plt.xlabel("False Positive Rate (1 - Specificity)")
    plt.ylabel("True Positive Rate (Sensitivity)")
    plt.grid(True)
    
    image.png
    auc(fpr,tpr)
    
    image.png
    常见的评价指标:
    1、ACC:classification accuracy,描述分类器的分类准确率,计算公式为:ACC=(TP+TN)/(TP+FP+FN+TN)
    2、BER:balanced error rate 计算公式为:BER=1/2*(FPR+FN/(FN+TP))
    3、TPR:true positive rate,描述识别出的所有正例占所有正例的比例 计算公式为:TPR=TP / (TP+ FN)
    4、FPR:false positive rate,描述将负例识别为正例的情况占所有负例的比例 计算公式为:FPR= FP / (FP + TN)
    5、TNR:true negative rate,描述识别出的负例占所有负例的比例 计算公式为:TNR= TN / (FP + TN)
    6、PPV:Positive predictive value计算公式为:PPV=TP / (TP + FP)
    7、NPV:Negative predictive value计算公式:NPV=TN / (FN + TN)
    其中TPR即为敏感度(sensitivity),TNR即为特异度(specificity)

    可解释性

    # 排列重要性-Permutation Importance
    perm = PermutationImportance(rf,random_state=10).fit(X_test,y_test)
    perm
    
    image.png
    eli5.show_weights(perm,feature_names=X_test.columns.tolist())
    
    image.png

    部分依赖图(Partial dependence plots,PDP)

    Partial Dependence就是用来解释某个特征和目标值y的关系的,一般是通过画出Partial Dependence Plot(PDP)来体现。也就是说PDP在X1的值,就是把训练集中第一个变量换成X1之后,原模型预测出来的平均值。

    # 字段ca
    base_features = df.columns.values.tolist()
    base_features.remove("target")
    
    feat_name = "ca"
    pdp_dist = pdp.pdp_isolate(model=rf,dataset=X_test,model_features=base_features,feature=feat_name)
    pdp.pdp_plot(pdp_dist,feat_name)
    
    plt.show()
    
    image.png
    # 字段age
    feat_name = "age"
    
    pdp_dist = pdp.pdp_isolate(model=rf,dataset=X_test,model_features=base_features,feature=feat_name)
    pdp.pdp_plot(pdp_dist,feat_name)
    
    plt.show()
    
    image.png
    # 字段oldpeak
    feat_name = "oldpeak"
    
    pdp_dist = pdp.pdp_isolate(model=rf,dataset=X_test,model_features=base_features,feature=feat_name)
    pdp.pdp_plot(pdp_dist,feat_name)
    
    plt.show()
    
    image.png
    # 2D-PDP图
    # 查看的是 slope_upsloping 、slope_flat和 oldpeak的关系:
    inter1 = pdp.pdp_interact(model=rf,dataset=X_test,model_features=base_features,features=["slope_upsloping","oldpeak"])
    pdp.pdp_interact_plot(pdp_interact_out=inter1,feature_names=["slope_upsloping","oldpeak"],plot_type="contour")
    
    plt.show()
    
    image.png
    inter1 = pdp.pdp_interact(model=rf,dataset=X_test,model_features=base_features,features=["slope_flat","oldpeak"])
    pdp.pdp_interact_plot(pdp_interact_out=inter1,feature_names=["slope_flat","oldpeak"],plot_type="contour")
    
    plt.show()
    
    image.png

    SHAP可视化

    # Explainer
    explainer = shap.TreeExplainer(rf)
    shap_values = explainer.shap_values(X_test)
    shap_values
    
    image.png
    # Feature Importance
    shap.summary_plot(shap_values[1],X_test,plot_type="bar")
    
    image.png
    # summary_plot
    shap.summary_plot(shap_values[1],X_test)
    
    image.png
    # 个体差异
    # 查看单个病人的不同特征属性对其结果的影响
    def heart_disease_risk_factors(model,patient):
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(patient)
        shap.initjs()
        return shap.force_plot(explainer.expected_value[1],shap_values[1],patient)
    
    data_for_prediction = X_test.iloc[1,:].astype(float)
    heart_disease_risk_factors(rf,data_for_prediction)
    
    image.png
    data_for_prediction = X_test.iloc[3,:].astype(float)
    heart_disease_risk_factors(rf,data_for_prediction)
    
    image.png
    # dependence_plot
    # 为了理解单个feature如何影响模型的输出,我们可以将该feature的SHAP值与数据集中所有样本的feature值进行比较:
    ax2 = fig.add_subplot(224)
    shap.dependence_plot("ca",shap_values[1],X_test,interaction_index="oldpeak")
    
    image.png
    # 多样本可视化探索
    shap_values = explainer.shap_values(X_train.iloc[:50])
    shap.force_plot(explainer.expected_value[1],shap_values[1],X_test.iloc[:50])
    
    image.png

    相关文章

      网友评论

          本文标题:06基于随机森林模型的心脏病患者预测分类

          本文链接:https://www.haomeiwen.com/subject/lhsqcdtx.html