导入库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import eli5
from eli5.sklearn import PermutationImportance
import shap
from pdpbox import pdp,info_plots
np.random.seed(123)
pd.options.mode.chained_assignment = None
import warnings
warnings.filterwarnings("ignore")
数据探索EDA
# 导入数据
df = pd.read_excel("heart.xlsx")
df.head()
data:image/s3,"s3://crabby-images/c1adb/c1adbe5e42aef17b469fff835eb798dc89f9fca9" alt=""
# 缺失值情况
df.isnull().sum()
data:image/s3,"s3://crabby-images/103a6/103a685b07781c221a70c182f8acd2c8b98a93cf" alt=""
字段转化
# 转化编码
df["sex"][df["sex"]==0] = "female"
df["sex"][df["sex"]==1] = "male"
df["cp"][df["cp"]==1] = "typical angina"
df["cp"][df["cp"]==2] = "atypical angina"
df["cp"][df["cp"]==3] = "non-anginal pain"
df["cp"][df["cp"]==4] = "asymptomatic"
df["fbs"][df["fbs"]==0] = "lower than 120mg/ml"
df["fbs"][df["fbs"]==1] = "greater than 120mg/ml"
df["restecg"][df["restecg"]==0] = "normal"
df["restecg"][df["restecg"]==1] = "ST-T wave abnormality"
df["restecg"][df["restecg"]==2] = "left ventricular hypertrophy"
df["exang"][df["exang"]==0] = "no"
df["exang"][df["exang"]==1] = "yes"
df["slope"][df["slope"]==1] = "upsloping"
df["slope"][df["slope"]==2] = "flat"
df["slope"][df["slope"]==3] = "downsloping"
df["thal"][df["thal"]==1] = "normal"
df["thal"][df["thal"]==2] = "fixed defect"
df["thal"][df["thal"]==3] = "reversable defect"
# 字段类型转化
df["sex"] = df["sex"].astype("object")
df["cp"] = df["cp"].astype("object")
df["fbs"] = df["fbs"].astype("object")
df["restecg"] = df["restecg"].astype("object")
df["exang"] = df["exang"].astype("object")
df["slope"] = df["slope"].astype("object")
df["thal"] = df["thal"].astype("object")
# 生成哑变量
df = pd.get_dummies(df,drop_first=True)
df
data:image/s3,"s3://crabby-images/c6741/c6741cbc4c6ddf58270761dc68181332ccb1d2f8" alt=""
随机森林RandomForest
# 切分数据
X = df.drop("target",1)
y = df["target"]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=10)
X_train
data:image/s3,"s3://crabby-images/9800e/9800e81995d53bc3a1e85bfd31f87ab6636f23a9" alt=""
# 建模
rf = RandomForestClassifier(max_depth=5)
rf.fit(X_train,y_train)
data:image/s3,"s3://crabby-images/ccb24/ccb249e370bf3adfe6f3cd40f1ebb18a6974719b" alt=""
3个重要属性
查看森林中树的状况:estimators_
袋外估计准确率得分:oob_score_,必须是oob_score参数选择True的时候才可用
变量的重要性:feature_importances_
# 决策树可视化
estimator = rf.estimators_[1]
feature_names = [i for i in X_train.columns]
y_train_str = y_train.astype("str")
y_train_str[y_train_str=="0"] = "no disease"
y_train_str[y_train_str=="1"] = "disease"
y_train_str = y_train_str.values
y_train_str[:5]
data:image/s3,"s3://crabby-images/ded0b/ded0b55a4910f172284e138e4c77aadd602146ff" alt=""
export_graphviz(
estimator,
out_file='tree.dot',
feature_names = feature_names,
class_names = y_train_str,
rounded = True,
proportion = True,
label='root',
precision = 2,
filled = True
)
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
from IPython.display import Image
Image(filename = 'tree.png')
# 模型得分验证
y_predict = rf.predict(X_test)
y_pred_quant = rf.predict_proba(X_test)[:,1]
y_pred_bin = rf.predict(X_test)
confusion_matrix = confusion_matrix(y_test,y_pred_bin)
confusion_matrix
data:image/s3,"s3://crabby-images/af878/af87873be4b0140f59897c8694cd474405b269bb" alt=""
total = sum(sum(confusion_matrix))
sensitivity = confusion_matrix[0,0]/(confusion_matrix[0,0]+confusion_matrix[1,0])
print("Sensitivity: ",sensitivity)
data:image/s3,"s3://crabby-images/aca70/aca70330f4725debab3830dbb99c916d3249e928" alt=""
specificity = confusion_matrix[1,1]/(confusion_matrix[1,1]+confusion_matrix[0,1])
print("Specificity: ",specificity)
data:image/s3,"s3://crabby-images/a9e6a/a9e6a3a87177b0ee347d1d7c94554ba741661bc1" alt=""
# 绘制ROC曲线
fpr, tpr, thresholds = roc_curve(y_test,y_pred_quant)
fig, ax = plt.subplots()
ax.plot(fpr,tpr)
ax.plot([0,1],[0,1],transform=ax.transAxes,ls="--",c=".3")
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])
plt.rcParams["font.size"] = 12
plt.title("ROC Curve")
plt.xlabel("False Positive Rate (1 - Specificity)")
plt.ylabel("True Positive Rate (Sensitivity)")
plt.grid(True)
data:image/s3,"s3://crabby-images/80b03/80b0396669ecc54f579ec43644ee57a5646c68e5" alt=""
auc(fpr,tpr)
data:image/s3,"s3://crabby-images/cf793/cf79320a89b57698c884b103ebd1f4e5d11bdabf" alt=""
常见的评价指标:
1、ACC:classification accuracy,描述分类器的分类准确率,计算公式为:ACC=(TP+TN)/(TP+FP+FN+TN)
2、BER:balanced error rate 计算公式为:BER=1/2*(FPR+FN/(FN+TP))
3、TPR:true positive rate,描述识别出的所有正例占所有正例的比例 计算公式为:TPR=TP / (TP+ FN)
4、FPR:false positive rate,描述将负例识别为正例的情况占所有负例的比例 计算公式为:FPR= FP / (FP + TN)
5、TNR:true negative rate,描述识别出的负例占所有负例的比例 计算公式为:TNR= TN / (FP + TN)
6、PPV:Positive predictive value计算公式为:PPV=TP / (TP + FP)
7、NPV:Negative predictive value计算公式:NPV=TN / (FN + TN)
其中TPR即为敏感度(sensitivity),TNR即为特异度(specificity)
可解释性
# 排列重要性-Permutation Importance
perm = PermutationImportance(rf,random_state=10).fit(X_test,y_test)
perm
data:image/s3,"s3://crabby-images/7288c/7288ca6ebf62a2d7a8d6a45633b17ca7abb40f83" alt=""
eli5.show_weights(perm,feature_names=X_test.columns.tolist())
data:image/s3,"s3://crabby-images/d5888/d588805114e50b35ad2fa3f9ca94b712fac7b21c" alt=""
部分依赖图(Partial dependence plots,PDP)
Partial Dependence就是用来解释某个特征和目标值y的关系的,一般是通过画出Partial Dependence Plot(PDP)来体现。也就是说PDP在X1的值,就是把训练集中第一个变量换成X1之后,原模型预测出来的平均值。
# 字段ca
base_features = df.columns.values.tolist()
base_features.remove("target")
feat_name = "ca"
pdp_dist = pdp.pdp_isolate(model=rf,dataset=X_test,model_features=base_features,feature=feat_name)
pdp.pdp_plot(pdp_dist,feat_name)
plt.show()
data:image/s3,"s3://crabby-images/be9dd/be9dd1e57ea6138f4069a427de4349d1e75e012a" alt=""
# 字段age
feat_name = "age"
pdp_dist = pdp.pdp_isolate(model=rf,dataset=X_test,model_features=base_features,feature=feat_name)
pdp.pdp_plot(pdp_dist,feat_name)
plt.show()
data:image/s3,"s3://crabby-images/49edc/49edc55cccff36e01bb1a6c5b94aa875ed7ff8f6" alt=""
# 字段oldpeak
feat_name = "oldpeak"
pdp_dist = pdp.pdp_isolate(model=rf,dataset=X_test,model_features=base_features,feature=feat_name)
pdp.pdp_plot(pdp_dist,feat_name)
plt.show()
data:image/s3,"s3://crabby-images/94435/944352179c4d85d829639eff1f2cb63360f0e9e3" alt=""
# 2D-PDP图
# 查看的是 slope_upsloping 、slope_flat和 oldpeak的关系:
inter1 = pdp.pdp_interact(model=rf,dataset=X_test,model_features=base_features,features=["slope_upsloping","oldpeak"])
pdp.pdp_interact_plot(pdp_interact_out=inter1,feature_names=["slope_upsloping","oldpeak"],plot_type="contour")
plt.show()
data:image/s3,"s3://crabby-images/57b12/57b12f37b2c9fe8276acf92014d9fda665cc6bbf" alt=""
inter1 = pdp.pdp_interact(model=rf,dataset=X_test,model_features=base_features,features=["slope_flat","oldpeak"])
pdp.pdp_interact_plot(pdp_interact_out=inter1,feature_names=["slope_flat","oldpeak"],plot_type="contour")
plt.show()
data:image/s3,"s3://crabby-images/58d06/58d06455b30aa0881056790d0a1cbb80eeafb65e" alt=""
SHAP可视化
# Explainer
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)
shap_values
data:image/s3,"s3://crabby-images/97737/9773770f4e2c59bd142fdf3c473acd6ed6c04d25" alt=""
# Feature Importance
shap.summary_plot(shap_values[1],X_test,plot_type="bar")
data:image/s3,"s3://crabby-images/d5ed1/d5ed1eac39fef4df3236d1b0013ed4af24551f5e" alt=""
# summary_plot
shap.summary_plot(shap_values[1],X_test)
data:image/s3,"s3://crabby-images/396db/396db29912d8dff1e1b375b8081d101511138b7c" alt=""
# 个体差异
# 查看单个病人的不同特征属性对其结果的影响
def heart_disease_risk_factors(model,patient):
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(patient)
shap.initjs()
return shap.force_plot(explainer.expected_value[1],shap_values[1],patient)
data_for_prediction = X_test.iloc[1,:].astype(float)
heart_disease_risk_factors(rf,data_for_prediction)
data:image/s3,"s3://crabby-images/5af51/5af51a7705d5237ebfe0d8acd0f80f940c78df53" alt=""
data_for_prediction = X_test.iloc[3,:].astype(float)
heart_disease_risk_factors(rf,data_for_prediction)
data:image/s3,"s3://crabby-images/acc76/acc76acf37704f439302915fe5200d7e2f08d1fb" alt=""
# dependence_plot
# 为了理解单个feature如何影响模型的输出,我们可以将该feature的SHAP值与数据集中所有样本的feature值进行比较:
ax2 = fig.add_subplot(224)
shap.dependence_plot("ca",shap_values[1],X_test,interaction_index="oldpeak")
data:image/s3,"s3://crabby-images/c77eb/c77eb636e4f0527c21e3b95708cabd8479f4fe44" alt=""
# 多样本可视化探索
shap_values = explainer.shap_values(X_train.iloc[:50])
shap.force_plot(explainer.expected_value[1],shap_values[1],X_test.iloc[:50])
data:image/s3,"s3://crabby-images/b0792/b0792f5ce770d6ad2e101d4acd2292ca3b0f1d83" alt=""
网友评论