1、加载文件,查看:(两个数据集,train作为学习集进行数据建模,通过test测试集查看建模的情况。)
# Read the training and test sets, keeping text columns as character vectors.
train <- read.csv("C:/Users/Administrator/Desktop/train.csv", stringsAsFactors = FALSE)
test <- read.csv("C:/Users/Administrator/Desktop/test.csv", stringsAsFactors = FALSE)
# Add an all-NA Survived column to the test set so its columns match train.
test$Survived <- NA
# Stack train on top of test to build the complete data set.
full <- rbind(train, test)
# Inspect the first rows and the column types.
head(full)
str(full)
# describe(full)  # alternative summary; needs a package that provides
#                 # describe() (e.g. psych or Hmisc) -- TODO confirm which
2、数据清洗:
数据集中有多个变量存在缺失值,需要用合理的数据来填补这些缺失值:
# Age: fill every missing age with a random draw from the observed ages.
age <- full$Age
n <- length(age)
set.seed(123)  # make the random imputation reproducible
age_missing <- is.na(age)
observed_age <- age[!age_missing]
# Draw by index: sample(x, 1) would sample from 1:x if x had length one.
age[age_missing] <- observed_age[
  sample.int(length(observed_age), sum(age_missing), replace = TRUE)
]
# Check the imputation by comparing the age histograms side by side.
old_par <- par(mfrow = c(1, 2))
hist(full$Age, freq = FALSE, main = "before replacement", col = "lightblue",
     ylim = c(0, 0.04), xlab = "age")
hist(age, freq = FALSE, main = "after replacement", col = "darkblue",
     ylim = c(0, 0.04), xlab = "age")
par(old_par)  # restore the single-panel layout for later base plots
# Cabin: the cabin codes are considered to have little effect on survival,
# so derive the number of cabins listed per passenger instead.
cabin_raw <- full$Cabin
n <- length(cabin_raw)
# strsplit() returns character(0) for "", so an empty entry counts as 0.
# The result is an integer count (the loop version stored it as text).
cabin <- lengths(strsplit(cabin_raw, " ", fixed = TRUE))
# An NA entry splits to a single NA (length 1); count it as no cabin.
cabin[is.na(cabin_raw)] <- 0L
table(cabin)
Fare缺失值处理:——将第1044行缺失的Fare按同等级船舱(Pclass)、同一登船港口(Embarked)乘客票价的中位数进行填补
# Fill the fare of row 1044 with the median fare of passengers in class 3
# who embarked at "S"; other missing fares are ignored via na.rm.
full$Fare[1044] <- median(
  full$Fare[full$Pclass == 3 & full$Embarked == "S"],
  na.rm = TRUE
)
3、数据探索性分析和处理:
查看各个变量与Survived的关系
library(ggplot2)
# Pair the imputed ages of the first 891 rows with the training labels
# (assumes train has 891 rows and sits at the top of `full`).
d <- data.frame(Age = age[1:891], Survived = train$Survived)
# Histogram of age, filled by survival outcome.
ggplot(d, aes(Age, fill = factor(Survived))) +
  geom_histogram()
# Bin the ages at the break points hist() chooses for roughly 10 cells.
# include.lowest keeps an age equal to the first break from becoming NA.
cuts <- cut(d$Age, hist(d$Age, breaks = 10, plot = FALSE)$breaks,
            include.lowest = TRUE)
rate <- tapply(d$Survived, cuts, mean)  # mean of Survived per age bin
d2 <- data.frame(age = names(rate), rate)
# Bar chart of the survival rate per age bin, labelled with the bin ranges.
barplot(d2$rate, names.arg = d2$age, xlab = "age", ylab = "survival rate")
性别和存活率:
# Stacked bar chart of passenger counts per sex, filled by survival outcome.
# geom_bar() counts rows per category; geom_histogram() is meant for
# continuous variables.
ggplot(train, aes(Sex, fill = factor(Survived))) +
  geom_bar()
# Mean of Survived for each sex.
tapply(train$Survived, train$Sex, mean)
姓名和存活率:
# Extract each passenger's title from the name.
# Assumes names look like "Surname, Title. Given names" -- TODO confirm.
n <- length(full$Survived)
# Capture the text between the first comma and the next period.
title_pattern <- "^[^,]*,\\s*([^.]*)\\..*$"
title <- sub(title_pattern, "\\1", full$Name)
# Names that do not follow the pattern get no title rather than the raw name.
title[!grepl(title_pattern, full$Name)] <- NA
# Survival by title on the training rows (the first 891 rows of `full`).
d <- data.frame(title = title[1:891], Survived = train$Survived)
ggplot(d, aes(title, fill = factor(Survived))) +
  geom_bar()
tapply(d$Survived, d$title, mean)
# Merge every title other than the four common ones into "Rare";
# missing titles stay NA.
common_titles <- c("Mr", "Miss", "Mrs", "Master")
title[!is.na(title) & !(title %in% common_titles)] <- "Rare"
table(title)
……其余变量以此类推,通过堆积条形图和tapply()求各组的平均存活率
4、数据模型建立:
通过广义线性回归(logistic回归)、随机森林、决策树、支持向量机来建模预测,并验证模型的准确性。
建立新的训练集:
# Build the modelling frame from the prepared f.* vectors: the survived
# response plus one column per predictor.
new_train <- data.frame(
  survived = f.survived,
  age = f.age,
  fare = f.fare,
  sex = f.sex,
  embarked = f.embarked,
  family = f.family,
  title = f.title,
  cabin = f.cabin,
  pclass = f.pclass
)
logistic 回归
# Logistic regression of survival on all prepared predictors.
fit_logit <- glm(
  factor(survived) ~ age + fare + sex + embarked + family + title + cabin +
    pclass,
  data = new_train,
  family = binomial
)
# Round each fitted probability to the nearest class label (0 or 1).
ans_logit <- unname(round(fit_logit$fitted.values, 0))
# Fail early if the model dropped rows, instead of silently recycling.
stopifnot(length(ans_logit) == length(train$Survived))
# Share of training rows whose predicted label matches the true label.
mean(ans_logit == train$Survived)
table(ans_logit)
library(randomForest)
library(rpart)
# Random forest on the same formula; seed set first so the fit is repeatable.
set.seed(123)
fit_rf <- randomForest(
  factor(survived) ~ age + fare + sex + embarked + family + title + cabin +
    pclass,
  data = new_train
)
# Decision tree on the same formula.
fit_dt <- rpart(
  factor(survived) ~ age + fare + sex + embarked + family + title + cabin +
    pclass,
  data = new_train
)
library(e1071)  # NOTE(review): svm() is assumed to come from e1071 -- confirm
# Support vector machine on the same formula.
fit_svm <- svm(
  factor(survived) ~ age + fare + sex + embarked + family + title + cabin +
    pclass,
  data = new_train
)
svm.fitted <- predict(fit_svm)
# The predictions are presumably a factor with levels "0" and "1";
# as.integer() returns the level codes 1 and 2, so subtract 1.
ans_svm <- as.integer(svm.fitted) - 1
# Share of training rows whose predicted label matches the true label.
mean(ans_svm == train$Survived)
table(ans_svm)
5、模型评价:
# Cross-tabulate the logistic predictions against the true labels by hand.
a <- sum(ans_logit == 1 & f.survived == 1)  # predicted 1, actual 1
b <- sum(ans_logit == 1 & f.survived == 0)  # predicted 1, actual 0
c <- sum(ans_logit == 0 & f.survived == 1)  # predicted 0, actual 1
d <- sum(ans_logit == 0 & f.survived == 0)  # predicted 0, actual 0
# Show the four counts in a one-row data frame.
data.frame(a, b, c, d)
6、模型预测
# Build the test features with the same column names used for training.
test_data_set <- data.frame(
  age = t.age,
  fare = t.fare,
  sex = t.sex,
  embarked = t.embarked,
  family = t.family,
  title = t.title,
  cabin = t.cabin,
  pclass = t.pclass
)
# Predict with the fitted SVM and convert the factor codes (1/2) to 0/1.
svm_predict <- predict(fit_svm, newdata = test_data_set)
ans_svm_predict <- as.integer(svm_predict) - 1
table(ans_svm_predict)
网友评论