导入库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import eli5
from eli5.sklearn import PermutationImportance
import shap
from pdpbox import pdp,info_plots
np.random.seed(123)
pd.options.mode.chained_assignment = None
import warnings
warnings.filterwarnings("ignore")
数据探索EDA
# 导入数据
df = pd.read_excel("heart.xlsx")
df.head()

# 缺失值情况
df.isnull().sum()

字段转化
# 转化编码
df["sex"][df["sex"]==0] = "female"
df["sex"][df["sex"]==1] = "male"
df["cp"][df["cp"]==1] = "typical angina"
df["cp"][df["cp"]==2] = "atypical angina"
df["cp"][df["cp"]==3] = "non-anginal pain"
df["cp"][df["cp"]==4] = "asymptomatic"
df["fbs"][df["fbs"]==0] = "lower than 120mg/ml"
df["fbs"][df["fbs"]==1] = "greater than 120mg/ml"
df["restecg"][df["restecg"]==0] = "normal"
df["restecg"][df["restecg"]==1] = "ST-T wave abnormality"
df["restecg"][df["restecg"]==2] = "left ventricular hypertrophy"
df["exang"][df["exang"]==0] = "no"
df["exang"][df["exang"]==1] = "yes"
df["slope"][df["slope"]==1] = "upsloping"
df["slope"][df["slope"]==2] = "flat"
df["slope"][df["slope"]==3] = "downsloping"
df["thal"][df["thal"]==1] = "normal"
df["thal"][df["thal"]==2] = "fixed defect"
df["thal"][df["thal"]==3] = "reversable defect"
# 字段类型转化
df["sex"] = df["sex"].astype("object")
df["cp"] = df["cp"].astype("object")
df["fbs"] = df["fbs"].astype("object")
df["restecg"] = df["restecg"].astype("object")
df["exang"] = df["exang"].astype("object")
df["slope"] = df["slope"].astype("object")
df["thal"] = df["thal"].astype("object")
# 生成哑变量
df = pd.get_dummies(df,drop_first=True)
df

随机森林RandomForest
# 切分数据
X = df.drop("target",1)
y = df["target"]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=10)
X_train

# 建模
rf = RandomForestClassifier(max_depth=5)
rf.fit(X_train,y_train)

3个重要属性
查看森林中树的状况:estimators_
袋外估计准确率得分:oob_score_,必须是oob_score参数选择True的时候才可用
变量的重要性:feature_importances_
# 决策树可视化
estimator = rf.estimators_[1]
feature_names = [i for i in X_train.columns]
y_train_str = y_train.astype("str")
y_train_str[y_train_str=="0"] = "no disease"
y_train_str[y_train_str=="1"] = "disease"
y_train_str = y_train_str.values
y_train_str[:5]

export_graphviz(
estimator,
out_file='tree.dot',
feature_names = feature_names,
class_names = y_train_str,
rounded = True,
proportion = True,
label='root',
precision = 2,
filled = True
)
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
from IPython.display import Image
Image(filename = 'tree.png')
# 模型得分验证
y_predict = rf.predict(X_test)
y_pred_quant = rf.predict_proba(X_test)[:,1]
y_pred_bin = rf.predict(X_test)
confusion_matrix = confusion_matrix(y_test,y_pred_bin)
confusion_matrix

total = sum(sum(confusion_matrix))
sensitivity = confusion_matrix[0,0]/(confusion_matrix[0,0]+confusion_matrix[1,0])
print("Sensitivity: ",sensitivity)

specificity = confusion_matrix[1,1]/(confusion_matrix[1,1]+confusion_matrix[0,1])
print("Specificity: ",specificity)

# 绘制ROC曲线
fpr, tpr, thresholds = roc_curve(y_test,y_pred_quant)
fig, ax = plt.subplots()
ax.plot(fpr,tpr)
ax.plot([0,1],[0,1],transform=ax.transAxes,ls="--",c=".3")
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])
plt.rcParams["font.size"] = 12
plt.title("ROC Curve")
plt.xlabel("False Positive Rate (1 - Specificity)")
plt.ylabel("True Positive Rate (Sensitivity)")
plt.grid(True)

auc(fpr,tpr)

常见的评价指标:
1、ACC:classification accuracy,描述分类器的分类准确率,计算公式为:ACC=(TP+TN)/(TP+FP+FN+TN)
2、BER:balanced error rate 计算公式为:BER=1/2*(FPR+FN/(FN+TP))
3、TPR:true positive rate,描述识别出的所有正例占所有正例的比例 计算公式为:TPR=TP / (TP+ FN)
4、FPR:false positive rate,描述将负例识别为正例的情况占所有负例的比例 计算公式为:FPR= FP / (FP + TN)
5、TNR:true negative rate,描述识别出的负例占所有负例的比例 计算公式为:TNR= TN / (FP + TN)
6、PPV:Positive predictive value计算公式为:PPV=TP / (TP + FP)
7、NPV:Negative predictive value计算公式:NPV=TN / (FN + TN)
其中TPR即为敏感度(sensitivity),TNR即为特异度(specificity)
可解释性
# 排列重要性-Permutation Importance
perm = PermutationImportance(rf,random_state=10).fit(X_test,y_test)
perm

eli5.show_weights(perm,feature_names=X_test.columns.tolist())

部分依赖图(Partial dependence plots,PDP)
Partial Dependence就是用来解释某个特征和目标值y的关系的,一般是通过画出Partial Dependence Plot(PDP)来体现。也就是说PDP在X1的值,就是把训练集中第一个变量换成X1之后,原模型预测出来的平均值。
# 字段ca
base_features = df.columns.values.tolist()
base_features.remove("target")
feat_name = "ca"
pdp_dist = pdp.pdp_isolate(model=rf,dataset=X_test,model_features=base_features,feature=feat_name)
pdp.pdp_plot(pdp_dist,feat_name)
plt.show()

# 字段age
feat_name = "age"
pdp_dist = pdp.pdp_isolate(model=rf,dataset=X_test,model_features=base_features,feature=feat_name)
pdp.pdp_plot(pdp_dist,feat_name)
plt.show()

# 字段oldpeak
feat_name = "oldpeak"
pdp_dist = pdp.pdp_isolate(model=rf,dataset=X_test,model_features=base_features,feature=feat_name)
pdp.pdp_plot(pdp_dist,feat_name)
plt.show()

# 2D-PDP图
# 查看的是 slope_upsloping 、slope_flat和 oldpeak的关系:
inter1 = pdp.pdp_interact(model=rf,dataset=X_test,model_features=base_features,features=["slope_upsloping","oldpeak"])
pdp.pdp_interact_plot(pdp_interact_out=inter1,feature_names=["slope_upsloping","oldpeak"],plot_type="contour")
plt.show()

inter1 = pdp.pdp_interact(model=rf,dataset=X_test,model_features=base_features,features=["slope_flat","oldpeak"])
pdp.pdp_interact_plot(pdp_interact_out=inter1,feature_names=["slope_flat","oldpeak"],plot_type="contour")
plt.show()

SHAP可视化
# Explainer
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)
shap_values

# Feature Importance
shap.summary_plot(shap_values[1],X_test,plot_type="bar")

# summary_plot
shap.summary_plot(shap_values[1],X_test)

# 个体差异
# 查看单个病人的不同特征属性对其结果的影响
def heart_disease_risk_factors(model,patient):
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(patient)
shap.initjs()
return shap.force_plot(explainer.expected_value[1],shap_values[1],patient)
data_for_prediction = X_test.iloc[1,:].astype(float)
heart_disease_risk_factors(rf,data_for_prediction)

data_for_prediction = X_test.iloc[3,:].astype(float)
heart_disease_risk_factors(rf,data_for_prediction)

# dependence_plot
# 为了理解单个feature如何影响模型的输出,我们可以将该feature的SHAP值与数据集中所有样本的feature值进行比较:
ax2 = fig.add_subplot(224)
shap.dependence_plot("ca",shap_values[1],X_test,interaction_index="oldpeak")

# 多样本可视化探索
shap_values = explainer.shap_values(X_train.iloc[:50])
shap.force_plot(explainer.expected_value[1],shap_values[1],X_test.iloc[:50])

网友评论