1、数据下载
https://www.kaggle.com/c/titanic/data?select=train.csv
2、代码
import pandas as pd
import numpy as np
# (1) Load the data
train_path = './titanic/train.csv'  # location of the training CSV
# Read the CSV and discard the columns that are not used as features
# (PassengerId, Name, Ticket and Cabin).
df = pd.read_csv(train_path).drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin'],
)
print(df.head())
# (2) Fill missing values in Age and Embarked (the Cabin column is dropped
# before this point, so it needs no imputation).
# Assign the filled column back rather than calling fillna(inplace=True) on
# df[col]: the chained in-place form operates on an intermediate object, is
# deprecated in pandas 2.x and leaves df unchanged under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].mean())  # mean imputation
df['Embarked'] = df['Embarked'].fillna('S')  # constant fill with 'S'
# (3) Convert the categorical columns to numeric indicator (one-hot) columns.
df = pd.get_dummies(df, columns=['Sex', 'Embarked'])
# (4) Split into training and validation sets
# train_test_split was used without ever being imported (NameError); import it
# here, next to its only use, like the other mid-script imports in this file.
from sklearn.model_selection import train_test_split

df_x = df.iloc[:, 1:]  # every column after the first: the features
df_y = df.iloc[:, :1]  # the first column: the target label
# Force a float matrix: get_dummies in pandas >= 2.0 produces bool columns,
# and mixing bool with numeric columns would make to_numpy() return an
# object-dtype array instead of a numeric one.
X = df_x.to_numpy(dtype=float)
Y = df_y.iloc[:, 0].to_numpy()
# Hold out 20% of the rows for validation; fixed seed for reproducibility.
x_train, x_val, y_train, y_val = train_test_split(
    X, Y, test_size=0.2, random_state=0
)
# (5) Train the model
from sklearn.ensemble import RandomForestClassifier

# Random forest with 200 trees, using all CPU cores and a fixed seed.
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(
    random_state=0,
    n_jobs=-1,
    n_estimators=200,
).fit(x_train, y_train)
# (6) Model interpretability with LIME
import lime
from lime import lime_tabular

feature_names = list(df_x.columns)  # name of every feature column
# Build the explainer from the training data.
explainer = lime_tabular.LimeTabularExplainer(
    training_data=x_train,
    feature_names=feature_names,
    class_names=['died', 'survived'],  # display labels for classes 0 and 1
    mode='classification',
    random_state=0,  # seed the sampling so the explanation is reproducible
)
# Explain the model's prediction for the first validation sample.
exp = explainer.explain_instance(
    data_row=x_val[0],
    predict_fn=model.predict_proba,
)
print(y_val[0])  # true label of the explained sample, for comparison
exp.show_in_notebook(show_table=True)
3、实验结果

网友评论