数据探索
# 导入库
import pandas as pd
import numpy as np
import random as rnd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC,LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings("ignore")
# Load the Kaggle Titanic data; train carries the Survived label, test does not.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
# (rows, columns) of the training set
train.shape
image.png
# (rows, columns) of the test set
test.shape
image.png
# Keep both frames in one list so later feature engineering can loop over them.
combine = [train,test]
# Column names of the training set.
train.columns
image.png
# Column dtypes.
train.dtypes
image.png
# Missing-value count per column (train).
train.isnull().sum()
image.png
# Missing-value count per column (test).
test.isnull().sum()
image.png
# Dtypes, non-null counts and memory usage in one view.
train.info()
image.png
数据假设
删除字段
本项目主要是考察其他字段和Survival字段的关系
重点关注字段:Age、Embarked
删除字段:对数据分析没有作用,直接删除的字段:Ticket(票号)、Cabin(客舱号)、PassengerId(乘客号)、Name(姓名)
修改、增加字段
增加Family:根据Parch(船上的父母小孩个数) 和 SibSp(船上的兄弟姐妹个数)
从Name字段中提取Title作为新特征
将年龄Age字段转成有序的分类特征
创建一个基于票价Fare 范围的特征
猜想
女人(Sex=female)更容易生还
小孩(Age<?)更容易生还
船舱等级高的乘客更容易生还(Pclass=1)
统计分析
# 1. Passenger class (1 = first, 2 = second, 3 = third):
# mean survival rate per class, best-surviving class first.
train.groupby("Pclass", as_index=False)["Survived"].mean().sort_values(
    by="Survived", ascending=False
)
image.png
# 2. Sex: mean survival rate per sex, highest first.
train.groupby("Sex", as_index=False)["Survived"].mean().sort_values(
    by="Survived", ascending=False
)
image.png
# 3. Siblings/spouses aboard: mean survival rate per count, highest first.
train.groupby("SibSp", as_index=False)["Survived"].mean().sort_values(
    by="Survived", ascending=False
)
image.png
# 4. Parents/children aboard: mean survival rate per count, highest first.
train.groupby("Parch", as_index=False)["Survived"].mean().sort_values(
    by="Survived", ascending=False
)
image.png
可视化分析
# Age distribution split by survival outcome.
g = sns.FacetGrid(train,col="Survived")
g.map(plt.hist,"Age",bins=20)
plt.show()
image.png
# Age histograms per (Pclass, Survived) cell.
grid = sns.FacetGrid(train,col="Survived",row="Pclass",aspect=1.6)
grid.map(plt.hist,"Age",alpha=0.5,bins=20)
grid.add_legend()
plt.show()
image.png
# Embarkation port, sex and survival rate per passenger class.
grid = sns.FacetGrid(train,row="Embarked",aspect=1.6)
# NOTE(review): passing "Sex" positionally as hue is deprecated in newer
# seaborn releases — confirm the installed version still accepts it.
grid.map(sns.pointplot,"Pclass","Survived","Sex",palette="deep")
grid.add_legend()
plt.show()
image.png
# Mean fare by sex, split by embarkation port and survival.
grid = sns.FacetGrid(train,row="Embarked",col="Survived",aspect=1.6)
# NOTE(review): ci=None is deprecated in seaborn >= 0.12 (use errorbar=None).
grid.map(sns.barplot,"Sex","Fare",alpha=.5,ci=None)
grid.add_legend()
plt.show()
image.png
删除无效字段
# Shapes before dropping columns (combine holds references to the same frames).
print("Before",train.shape,test.shape,combine[0].shape,combine[1].shape)
image.png
# Drop columns with no predictive signal (ticket number, cabin id).
cols_to_drop = ["Ticket", "Cabin"]
train = train.drop(columns=cols_to_drop)
test = test.drop(columns=cols_to_drop)
combine = [train, test]
print("After", train.shape, test.shape, combine[0].shape, combine[1].shape)
image.png
生成新特征
# Extract a Title feature (Mr/Mrs/Miss/...) from the Name column: the first
# alphabetic word that is immediately followed by a period.
# FIX: use a raw string — the non-raw '([A-Za-z]+)\.' pattern raises an
# invalid-escape-sequence warning on modern Python.
for dataset in combine:
    dataset["Title"] = dataset.Name.str.extract(r'([A-Za-z]+)\.', expand=False)
# Sanity check: title counts broken down by sex.
train.groupby(["Sex","Title"]).size().reset_index()
image.png
# Title counts cross-tabulated against sex.
pd.crosstab(train["Title"],train["Sex"])
image.png
# Collapse infrequent titles into "Rare" and map French/abbreviated variants
# onto their common English equivalents.
rare_titles = ["Lady", "Countess", "Capt", "Col", "Don", "Dr",
               "Major", "Rev", "Sir", "Jonkheer", "Dona"]
for dataset in combine:
    dataset["Title"] = dataset["Title"].replace(rare_titles, "Rare")
    dataset["Title"] = dataset["Title"].replace(
        {"Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs"}
    )
# Mean survival rate per title.
train[["Title","Survived"]].groupby("Title", as_index=False).mean()
image.png
# Encode titles as small integers.
title_mapping = {"Mr":1,"Miss":2,"Mrs":3,"Master":4,"Rare":5}
for dataset in combine:
    dataset["Title"] = dataset["Title"].map(title_mapping)
    # Titles absent from the mapping become NaN under map; keep them numeric as 0.
    dataset["Title"] = dataset["Title"].fillna(0)
train.head()
image.png
# Name has been mined for Title and PassengerId is only a row id; drop both
# from train. test keeps PassengerId for the submission file.
train = train.drop(columns=["Name", "PassengerId"])
test = test.drop(columns=["Name"])
combine = [train, test]
print(train.shape, test.shape)
image.png
# Encode Sex as an integer (female=1, male=0), then inspect age
# distributions per (Pclass, Sex) cell.
for dataset in combine:
    dataset["Sex"] = dataset["Sex"].map({"female":1,"male":0}).astype(int)
grid = sns.FacetGrid(train,row="Pclass",col="Sex",aspect=1.6)
grid.map(plt.hist,"Age",alpha=.5,bins=20)
grid.add_legend()
plt.show()
image.png
# Age: check how many values are missing before imputing them.
train.isnull().sum()
image.png
# Impute missing ages with the median age of the passenger's (Sex, Pclass)
# group, rounded to the nearest 0.5, then cast Age to int.
guess_ages = np.zeros((2,3))  # rows: Sex 0/1, cols: Pclass 1-3
for dataset in combine:
    # Pass 1: median age per (sex, class) group, from known ages only.
    for i in range(0,2):
        for j in range(0,3):
            guess_df = dataset[(dataset["Sex"]==i)&(dataset["Pclass"]==j+1)]["Age"].dropna()
            age_guess = guess_df.median()
            # Round to the nearest 0.5.
            # NOTE(review): raises if a (sex, class) group has no known ages —
            # confirm that never happens for this dataset.
            guess_ages[i,j] = int(age_guess/0.5 + 0.5)*0.5
    # Pass 2: fill the gaps with the group medians.
    for i in range(0,2):
        for j in range(0,3):
            dataset.loc[(dataset.Age.isnull())&(dataset.Sex==i)&(dataset.Pclass==j+1),"Age"] = guess_ages[i,j]
    dataset["Age"] = dataset["Age"].astype(int)
train.isnull().sum()
image.png
# Bin ages into 5 equal-width bands to inspect survival per band.
train["AgeBand"] = pd.cut(train["Age"],5)
train.head()
image.png
# Mean survival rate per age band, youngest band first.
train[["AgeBand","Survived"]].groupby(["AgeBand"],as_index=False).mean().sort_values(by="AgeBand",ascending=True)
image.png
# Map Age onto the ordinal band index 0-4. The cut points (16/32/48/64)
# mirror the AgeBand boundaries computed above.
for dataset in combine:
    dataset.loc[dataset["Age"]<=16,"Age"] = 0
    dataset.loc[(dataset["Age"]>16) & (dataset["Age"]<=32),"Age"] = 1
    dataset.loc[(dataset["Age"]>32) & (dataset["Age"]<=48),"Age"] = 2
    dataset.loc[(dataset["Age"]>48) & (dataset["Age"]<=64),"Age"] = 3
    dataset.loc[dataset["Age"]>64,"Age"] = 4
# The helper band column is no longer needed.
train = train.drop(["AgeBand"],axis=1)
combine = [train,test]
字段处理
# FamilySize = siblings/spouses + parents/children + the passenger themselves.
for dataset in combine:
    dataset["FamilySize"] = dataset["SibSp"] + dataset["Parch"] + 1
train[["FamilySize","Survived"]].groupby(["FamilySize"],as_index=False).mean().sort_values(by="Survived",ascending=False)
image.png
# IsAlone = 1 for passengers travelling without family, else 0.
for dataset in combine:
    dataset["IsAlone"] = (dataset["FamilySize"] == 1).astype(int)
train[["IsAlone","Survived"]].groupby(["IsAlone"],as_index=False).mean()
image.png
# The engineered IsAlone flag supersedes the raw family-count columns.
family_cols = ["Parch", "SibSp", "FamilySize"]
train = train.drop(columns=family_cols)
test = test.drop(columns=family_cols)
combine = [train, test]
train.head()
image.png
# Interaction feature: ordinal Age bucket multiplied by passenger class.
for dataset in combine:
    dataset["Age*Class"] = dataset["Age"] * dataset["Pclass"]
train.head()
image.png
# Embarked: check for missing values before filling them.
train.isnull().sum()
image.png
# Most common embarkation port, used below to fill the gaps.
freq_port = train["Embarked"].dropna().mode()[0]
freq_port
image.png
# Fill missing ports with the most frequent one, then check survival per port.
for dataset in combine:
    dataset["Embarked"] = dataset["Embarked"].fillna(freq_port)
train[["Embarked","Survived"]].groupby(["Embarked"],as_index=False).mean().sort_values(by="Survived",ascending=False)
image.png
# Encode the embarkation port as an integer category.
port_mapping = {"S": 0, "C": 1, "Q": 2}
for dataset in combine:
    dataset["Embarked"] = dataset["Embarked"].map(port_mapping).astype(int)
train.head()
image.png
# Fare: check missing values in train.
train.isnull().sum()
image.png
# Missing-value counts for test; the Fare gap is filled below.
test.isnull().sum()
image.png
# Fill the missing test fare(s) with the median fare.
# FIX: assign instead of chained fillna(..., inplace=True) — the chained
# in-place form is deprecated under pandas copy-on-write and stops working
# in pandas 3. (median() already skips NaN, so the dropna() was redundant.)
test["Fare"] = test["Fare"].fillna(test["Fare"].median())
test.head()
image.png
# Quartile-based fare bands and their survival rates, cheapest band first.
train["FareBand"] = pd.qcut(train["Fare"],4)
train[["FareBand","Survived"]].groupby(["FareBand"],as_index=False).mean().sort_values(by="FareBand",ascending=True)
image.png
# Map Fare onto the ordinal band 0-3; the cut points (7.91/14.454/31) are
# the quartile edges reported by qcut above.
for dataset in combine:
    dataset.loc[dataset["Fare"]<=7.91,"Fare"] = 0
    dataset.loc[(dataset["Fare"]>7.91) & (dataset["Fare"]<=14.454),"Fare"] = 1
    dataset.loc[(dataset["Fare"]>14.454) & (dataset["Fare"]<=31),"Fare"] = 2
    dataset.loc[dataset["Fare"]>31,"Fare"] = 3
    dataset["Fare"] = dataset["Fare"].astype(int)
# The helper band column is no longer needed.
train = train.drop(["FareBand"],axis=1)
combine = [train,test]
test.head()
image.png
建模
# Build the model matrices: features X and label Y.
X_train = train.drop("Survived",axis=1)
Y_train = train["Survived"]
# test keeps PassengerId for the submission file, so exclude it from features.
X_test = test.drop("PassengerId",axis=1).copy()
print(X_train.shape,Y_train.shape,X_test.shape)
image.png
# Model 1: logistic regression. Score is accuracy on the training data.
logreg = LogisticRegression().fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(100.0 * logreg.score(X_train, Y_train), 2)
acc_log
image.png
# Logistic-regression coefficients as a crude feature-importance ranking.
# NOTE(review): assumes "Survived" is the first column of train so that
# train.columns[1:] lines up with X_train's feature order — confirm upstream.
coeff_df = pd.DataFrame(train.columns[1:])
coeff_df.columns = ["Features"]
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])
coeff_df.sort_values(by="Correlation",ascending=False)
image.png
# Model 2: support vector machine (SVC defaults). Training-set accuracy.
svc = SVC().fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(100.0 * svc.score(X_train, Y_train), 2)
acc_svc
image.png
# Model 3: k-nearest neighbours with k=3. Training-set accuracy.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(100.0 * knn.score(X_train, Y_train), 2)
acc_knn
image.png
# Model 4: Gaussian naive Bayes. Training-set accuracy.
gaussian = GaussianNB().fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(100.0 * gaussian.score(X_train, Y_train), 2)
acc_gaussian
image.png
# Model 5: perceptron. Training-set accuracy.
perceptron = Perceptron().fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(100.0 * perceptron.score(X_train, Y_train), 2)
acc_perceptron
image.png
# Model 6: linear support vector classification. Training-set accuracy.
linear_svc = LinearSVC().fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(100.0 * linear_svc.score(X_train, Y_train), 2)
acc_linear_svc
image.png
# Model 7: stochastic gradient descent classifier. Training-set accuracy.
sgd = SGDClassifier().fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(100.0 * sgd.score(X_train, Y_train), 2)
acc_sgd
image.png
# Model 8: decision tree. Training-set accuracy (an unpruned tree can
# memorise the training data, so expect an optimistic score).
decision_tree = DecisionTreeClassifier().fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(100.0 * decision_tree.score(X_train, Y_train), 2)
acc_decision_tree
image.png
# Model 9: random forest with 100 trees. Training-set accuracy.
random_forest = RandomForestClassifier(n_estimators=100).fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
acc_random_forest = round(100.0 * random_forest.score(X_train, Y_train), 2)
acc_random_forest
image.png
模型对比
# Side-by-side comparison of the training accuracies, best model first.
model_names = [
    "Support Vector Machines", "KNN", "Logistic Regression",
    "Random Forest", "Naive Bayes", "Perceptron",
    "Stochastic Gradient Decent", "Linear SVC", "Decision Tree",
]
model_scores = [
    acc_svc, acc_knn, acc_log,
    acc_random_forest, acc_gaussian, acc_perceptron,
    acc_sgd, acc_linear_svc, acc_decision_tree,
]
models = pd.DataFrame({"Model": model_names, "Score": model_scores})
models.sort_values(by="Score", ascending=False)
image.png
网友评论