正文

训练模型

# Create a logistic-regression classifier with scikit-learn's default settings
log_regress = linear_model.LogisticRegression()
# Fit it on the training split (features first, then the target labels)
log_regress.fit(train_features, train_label)


# Show the fitted parameters: the intercept first, then the
# per-feature coefficients
print(log_regress.intercept_)
print(log_regress.coef_)
[3.8742755]
[[-0.85532933 -2.30146606 -0.03444764 -0.29622237 -0.00644779  0.00482113
  -0.01987031]]

获取预测结果和预测概率

  # In[*]   

# Predicted class label for every row of the test features
preds = log_regress.predict(test_features)
print(preds)
 # In[*]   
 # Predict the probabilities
# One row per test sample, one column per class
# (columns follow the order of log_regress.classes_)
pred_probs = log_regress.predict_proba(test_features)
print(pred_probs)

获取预测生存状态与实际生存状态的交叉表（混淆矩阵）

# Contingency table: predicted labels form the rows, true labels the columns
print(pd.crosstab(index=preds, columns=test_label))
Survived   0   1
row_0           
0         92  24
1         14  48

获取预测准确率

 # In[*] 
 # get the accuracy of the prediction
# score() returns the mean accuracy of the classifier on the given data.
# Wrapped in print() so the value is shown when this file runs as a plain
# script, not only when a notebook echoes a cell's last expression.
print(log_regress.score(X=test_features,
                        y=test_label))
0.7865168539325843

除了使用crosstab()函数生成混淆矩阵之外，您还可以使用Scikit-learn的metrics模块中的confusion_matrix()函数：

 # In[*] 
from sklearn import metrics

# Confusion matrix: rows correspond to the true labels, columns to the
# predicted labels. Printed explicitly so the matrix is not silently
# discarded when this file runs as a script rather than as a notebook cell.
print(metrics.confusion_matrix(
    y_true=test_label,  # True labels
    y_pred=preds))      # Predicted labels

获取模型的其他评价指标

# Per-class precision, recall, F1-score and support on the test split
print(metrics.classification_report(test_label, preds))
             precision    recall  f1-score   support

          0       0.79      0.87      0.83       106
          1       0.77      0.67      0.72        72

avg / total       0.79      0.79      0.78       178

绘制ROC曲线

   # In[*]  
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Wrap the probability ndarray in a DataFrame so each column can be
# addressed by name.
# NOTE(review): the column labels assume predict_proba orders the classes
# as [0, 1] (see log_regress.classes_) -- confirm.
df_prob = pd.DataFrame(pred_probs, columns=['Death', 'Survived'])

# ROC points computed from the 'Survived' probability column, and the
# area under the resulting curve (AUC)
fpr, tpr, thresholds = roc_curve(test_label, df_prob['Survived'])
roc_auc = auc(fpr, tpr)

# Model ROC curve; its legend entry carries the AUC rounded to 2 decimals
plt.plot(fpr, tpr, 'black', label='AUC = %0.2f' % roc_auc)
# Dashed red diagonal from (0, 0) to (1, 1) as a visual reference
plt.plot([0, 1], [0, 1], 'r--')

# Extend both axes slightly beyond the [0, 1] range
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])

plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('Receiver Operating Characteristic Curve')
plt.legend(loc='lower right')
plt.show()