data:image/s3,"s3://crabby-images/c3cba/c3cba2ac733cc270b90dc1423ba59a3bffbbaaee" alt=""
library(tidyverse)
# Read chji.txt into data_sc. skip = 1 drops the file's first line (per the
# original note it holds the Chinese column headers), so English column names
# are supplied explicitly instead.
data_sc <- read_table(
  "chji.txt",
  skip = 1,
  col_names = c(
    "stuid",
    "W_politics", "W_chinese", "W_english",
    "L_math", "L_physics"
  )
)
data:image/s3,"s3://crabby-images/04c1b/04c1b1d69ed0f8424fff94a1003d3e9ff01dbac3" alt=""
用ggplot2画箱线图
# Box plot of the politics scores; the constant string mapped to x simply
# places all values in a single box.
data_sc %>%
  ggplot(aes(x = "W_politics", y = W_politics)) +
  geom_boxplot()
data:image/s3,"s3://crabby-images/ddb1c/ddb1cabaf4767b4feb99d2a9fdca49716c863073" alt=""
# Same politics box plot, plus a second box for the Chinese scores added as an
# extra layer that carries its own x/y mapping.
data_sc %>%
  ggplot(aes(x = "W_politics", y = W_politics)) +
  geom_boxplot() +
  geom_boxplot(aes(x = "W_chinese", y = W_chinese))
data:image/s3,"s3://crabby-images/8c615/8c61587f1ccc362e280e171bbbd808c465b91032" alt=""
把每个箱线图一层层叠加上去,一步步操作太麻烦了。换个方法:用tidyverse(tidyr)包里的gather函数,将宽型数据data_sc变成长型数据。
# gather: reshape wide data into long data. Every column except stuid is
# stacked into a key column `course` (the former column name) and a value
# column `score`; `-stuid` excludes stuid from the reshaping.
# NOTE(review): tidyr marks gather() as superseded by pivot_longer(); it is
# kept here because this walkthrough demonstrates gather().
reshape_data <- data_sc %>%
  gather(key = course, value = score, -stuid)
data:image/s3,"s3://crabby-images/d9d75/d9d759f310301bf3b02b486309676bb54f7cafa5" alt=""
插个话题:course列里的W_politics这种写法太丑了,如何把course列里的内容拆开呢?那就用separate函数。
# separate: split one column into several. The course column is cut at "_"
# and the resulting pieces are stored in the new columns subject and course.
separate_data <- reshape_data %>%
  separate(col = course, into = c("subject", "course"), sep = "_")
data:image/s3,"s3://crabby-images/fbfbb/fbfbb7e1f9a05eca0dad6a3680183d44b82139aa" alt=""
既然已经将course列拆分了,那么怎么把拆分后的两列再合并回去呢?
用unite函数来完成此任务。
# unite: merge columns. subject and course are pasted back together, joined by
# "_", into a single column named course.
combine_data <- separate_data %>%
  unite(col = course, subject, course, sep = "_")
data:image/s3,"s3://crabby-images/76364/76364a3a5ea95db21ded6ce6c8ea94e0e309c52f" alt=""
# Second unite example: paste course and score together, joined by ":", into a
# single column named score.
ab <- combine_data %>%
  unite(col = score, course, score, sep = ":")
data:image/s3,"s3://crabby-images/8dc6d/8dc6d701fc3f0c5015349a3e21d39958b0d011b7" alt=""
回到正题,现在宽型数据已经变成长型数据了,箱线图码起
# Compare scores between courses: one box per course, filled by course.
reshape_data %>%
  ggplot(aes(x = course, y = score, fill = course)) +
  geom_boxplot()
data:image/s3,"s3://crabby-images/60d8b/60d8b8e325a75c728d14b7d31f91dedf02453f0c" alt=""
# Compare scores between students: stuid is converted to a factor so that each
# student gets a separate box and fill colour.
reshape_data %>%
  ggplot(aes(x = factor(stuid), y = score, fill = factor(stuid))) +
  geom_boxplot()
data:image/s3,"s3://crabby-images/b69dd/b69dd13eb1c3ca66adc74a63942fa5dabb81775f" alt=""
图画完了,我想将长型数据变回宽型数据,spread函数走起
# spread: reshape long data back into wide data. The distinct values in the
# course column become column names, and the score values fill those columns.
# NOTE(review): tidyr marks spread() as superseded by pivot_wider(); it is kept
# here because this walkthrough demonstrates spread().
cc <- reshape_data %>%
  spread(key = course, value = score)
data:image/s3,"s3://crabby-images/21adc/21adcc889538fa23142a84d6cbf636ec79571462" alt=""
有时数据中含有NA缺失值,下面用drop_na函数,通过几种写法去除含缺失值的行。
# Read the member-information CSV; col_names = TRUE takes the column names
# from the first row of the file.
# TRUE is spelled out because T is an ordinary variable that can be reassigned,
# which would silently change how the file is parsed.
v_information <- read_csv("vip_information.csv", col_names = TRUE)
data:image/s3,"s3://crabby-images/c05e7/c05e70ec51bcf6eada80b67bbcded4b84ee9b9f1" alt=""
# Count the missing values in csny (date of birth, per the original note).
v_information$csny %>% is.na() %>% sum()
# drop_na: remove the rows whose csny is NA, then count again to confirm that
# no missing csny values remain.
drop_csny <- v_information %>% drop_na(csny)
drop_csny$csny %>% is.na() %>% sum()
data:image/s3,"s3://crabby-images/3a71d/3a71d9ae2dc75aeee73f518038333991ae05b41f" alt=""
# To remove rows with NA in any of several columns, list those columns one
# after another; then check that the class column has no missing values left.
drop_csny <- v_information %>% drop_na(csny, xb, class)
drop_csny$class %>% is.na() %>% sum()
data:image/s3,"s3://crabby-images/22b39/22b39eea42cd2994dc3186b1fa81fe10c4194a70" alt=""
用extract函数按正则表达式从一列中提取数据,并拆成多个新列
# Small demo data frame: a single column x holding one missing value and four
# "letter.letter" strings.
df <- data.frame(
  x = c(NA, "a.b", "a.d", "b.c", "d.e")
)
data:image/s3,"s3://crabby-images/c2209/c2209b862d32ff0cc16ee25dd7bec533e5ab008d" alt=""
# extract: pull the regex capture groups out of column x into new columns A
# and B. In the pattern, each ([a-z]) group matches one lowercase letter and
# \\. matches the literal dot that separates them.
df %>%
  extract(col = x, into = c("A", "B"), regex = "([a-z])\\.([a-z])")
data:image/s3,"s3://crabby-images/4c2a1/4c2a1dd2e1a4e84c3465f96776a8c97e96413b0f" alt=""
网友评论