tidyr dplyr stringr “R数据科学”
image.pngR1:tidyr 核心函数
gather-spread # 列的聚集与分散
separate-unite # 列的拆分与合并
image.png
image.png
处理NA:
drop_na(数据) #删除含有na的所有行
drop_na(数据名,x1) #删除 x1列中所有行
replace_na(x$x2,0) #吧=把数据x中的所有na替换成0,但是并未赋值
fill(x,x2) #按照上一行填充缺失值
R2:dplyr
基础
- mutate(), 新增列
- select(), 按列筛选 #按列取子集
- filter(), 按行筛选 #按列取子集
- arrange(), 按某一行对数据框进行排序
- summarise(), 汇总
进阶
*count()
输出结果为数据框,能统计数据及重复次数
*管道符号 %>% (ctr+shift+M)
上一步的输出作为下一步的输入
*处理关系数据的函数
tidyr
一、数据清理
rm(list = ls())
options(stringsAsFactors = F)
if(!require(tidyr))install.packages("tidyr")
library(tidyr)
test <- data.frame(geneid = paste0("gene",1:4),
sample1 = c(1,4,7,10),
sample2 = c(2,5,0.8,11),
sample3 = c(0.3,6,9,12))
test
geneid sample1 sample2 sample3
1 gene1 1 2.0 0.3
2 gene2 4 5.0 6.0
3 gene3 7 0.8 9.0
4 gene4 10 11.0 12.0
扁变长
test_gather <- gather(data = test,
key = sample_nm,
value = exp,
- geneid)
head(test_gather)
geneid sample_nm exp
1 gene1 sample1 1
2 gene2 sample1 4
3 gene3 sample1 7
4 gene4 sample1 10
5 gene1 sample2 2
6 gene2 sample2 5
长变扁
test_re <- spread(data = test_gather,
key = sample_nm,
value = exp)
head(test_re)
geneid sample1 sample2 sample3
1 gene1 1 2.0 0.3
2 gene2 4 5.0 6.0
3 gene3 7 0.8 9.0
4 gene4 10 11.0 12.0
二、分割和合并
原始数据
test <- data.frame(x = c( "a,b", "a,d", "b,c"));test
x
1 a,b
2 a,d
3 b,c
分割
test_seprate <- separate(test,x, c("X", "Y"),sep = ",");test_seprate
X Y
1 a b
2 a d
3 b c
合并
test_re <- unite(test_seprate,"x",X,Y,sep = ",");test_re
x
1 a,b
2 a,d
3 b,c
三、处理NA
原始数据
X<-data.frame(X1 = LETTERS[1:5],X2 = 1:5)
X[2,2] <- NA
X[4,1] <- NA;X
X1 X2
1 A 1
2 B NA
3 C 3
4 <NA> 4
5 E 5
1.去掉含有NA的行,可以选择只根据某一列来去除
drop_na(X) #删除x这个数据里的na
X1 X2
1 A 1
2 C 3
3 E 5
drop_na(X,X1) #删除x1列na
X1 X2
1 A 1
2 B NA
3 C 3
4 E 5
drop_na(X,X2)
X1 X2
1 A 1
2 C 3
3 <NA> 4
4 E 5
2.替换NA
replace_na(X$X2,0) #将x2里面的na替换成0
## [1] 1 0 3 4 5
3.用上一行的值填充NA
X
X1 X2
1 A 1
2 B NA
3 C 3
4 <NA> 4
5 E 5
fill(X,X2)
X1 X2 #把x的x2中na填充上一行的值
1 A 1
2 B 1
3 C 3
4 <NA> 4
5 E 5
dplyr
rm(list = ls())
if(!require(dplyr))install.packages("dplyr")
library(dplyr)
test <- iris[c(1:2,51:52,101:102),]
rownames(test) =NULL
五个基础函数
1.mutate(),新增列
mutate(test, new = Sepal.Length * Sepal.Width)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species new
## 1 5.1 3.5 1.4 0.2 setosa 17.85
## 2 4.9 3.0 1.4 0.2 setosa 14.70
## 3 7.0 3.2 4.7 1.4 versicolor 22.40
## 4 6.4 3.2 4.5 1.5 versicolor 20.48
## 5 6.3 3.3 6.0 2.5 virginica 20.79
## 6 5.8 2.7 5.1 1.9 virginica 15.66
2.select(),按列筛选
(1)按列号筛选
select(test,1)
## Sepal.Length
## 1 5.1
## 2 4.9
## 3 7.0
## 4 6.4
## 5 6.3
## 6 5.8
select(test,c(1,5))
## Sepal.Length Species
## 1 5.1 setosa
## 2 4.9 setosa
## 3 7.0 versicolor
## 4 6.4 versicolor
## 5 6.3 virginica
## 6 5.8 virginica
(2)按列名筛选
select(test,Sepal.Length)
## Sepal.Length
## 1 5.1
## 2 4.9
## 3 7.0
## 4 6.4
## 5 6.3
## 6 5.8
select(test, Petal.Length, Petal.Width)
## Petal.Length Petal.Width
## 1 1.4 0.2
## 2 1.4 0.2
## 3 4.7 1.4
## 4 4.5 1.5
## 5 6.0 2.5
## 6 5.1 1.9
vars <- c("Petal.Length", "Petal.Width")
select(test, one_of(vars))
## Petal.Length Petal.Width
## 1 1.4 0.2
## 2 1.4 0.2
## 3 4.7 1.4
## 4 4.5 1.5
## 5 6.0 2.5
## 6 5.1 1.9
(3)一组来自tidyselect的有用函数
select(test, starts_with("Petal"))
## Petal.Length Petal.Width
## 1 1.4 0.2
## 2 1.4 0.2
## 3 4.7 1.4
## 4 4.5 1.5
## 5 6.0 2.5
## 6 5.1 1.9
select(test, ends_with("Width"))
## Sepal.Width Petal.Width
## 1 3.5 0.2
## 2 3.0 0.2
## 3 3.2 1.4
## 4 3.2 1.5
## 5 3.3 2.5
## 6 2.7 1.9
select(test, contains("etal"))
## Petal.Length Petal.Width
## 1 1.4 0.2
## 2 1.4 0.2
## 3 4.7 1.4
## 4 4.5 1.5
## 5 6.0 2.5
## 6 5.1 1.9
select(test, matches(".t."))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 7.0 3.2 4.7 1.4
## 4 6.4 3.2 4.5 1.5
## 5 6.3 3.3 6.0 2.5
## 6 5.8 2.7 5.1 1.9
select(test, everything())
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 7.0 3.2 4.7 1.4 versicolor
## 4 6.4 3.2 4.5 1.5 versicolor
## 5 6.3 3.3 6.0 2.5 virginica
## 6 5.8 2.7 5.1 1.9 virginica
select(test, last_col())
## Species
## 1 setosa
## 2 setosa
## 3 versicolor
## 4 versicolor
## 5 virginica
## 6 virginica
select(test, last_col(offset = 1)) # offset 就是 ncol - 1那就是选取倒数第二列
## Petal.Width
## 1 0.2
## 2 0.2
## 3 1.4
## 4 1.5
## 5 2.5
## 6 1.9
(4)利用everything(),列名可以重排序
select(test,Species,everything())
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 setosa 5.1 3.5 1.4 0.2
## 2 setosa 4.9 3.0 1.4 0.2
## 3 versicolor 7.0 3.2 4.7 1.4
## 4 versicolor 6.4 3.2 4.5 1.5
## 5 virginica 6.3 3.3 6.0 2.5
## 6 virginica 5.8 2.7 5.1 1.9
3.filter()筛选行
filter(test, Species == "setosa")
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
filter(test, Species == "setosa"&Sepal.Length > 5 )
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
filter(test, Species %in% c("setosa","versicolor"))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 7.0 3.2 4.7 1.4 versicolor
## 4 6.4 3.2 4.5 1.5 versicolor
4.arrange(),按某一列对整个表格进行排序
arrange(test, Sepal.Length) #默认从小到大排序
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 4.9 3.0 1.4 0.2 setosa
## 2 5.1 3.5 1.4 0.2 setosa
## 3 5.8 2.7 5.1 1.9 virginica
## 4 6.3 3.3 6.0 2.5 virginica
## 5 6.4 3.2 4.5 1.5 versicolor
## 6 7.0 3.2 4.7 1.4 versicolor
arrange(test, desc(Sepal.Length)) #用desc从大到小
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 7.0 3.2 4.7 1.4 versicolor
## 2 6.4 3.2 4.5 1.5 versicolor
## 3 6.3 3.3 6.0 2.5 virginica
## 4 5.8 2.7 5.1 1.9 virginica
## 5 5.1 3.5 1.4 0.2 setosa
## 6 4.9 3.0 1.4 0.2 setosa
arrange(test, desc(Sepal.Width),Sepal.Length) #按照多列进行排序??
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 6.3 3.3 6.0 2.5 virginica
## 3 6.4 3.2 4.5 1.5 versicolor
## 4 7.0 3.2 4.7 1.4 versicolor
## 5 4.9 3.0 1.4 0.2 setosa
## 6 5.8 2.7 5.1 1.9 virginica
5.summarise():汇总
对数据进行汇总操作,结合group_by使用实用性强
#计算Sepal.Length的平均值和标准差:
summarise(test, mean(Sepal.Length), sd(Sepal.Length))
mean(Sepal.Length) sd(Sepal.Length)
1 5.916667 0.8084965
# 先按照Species分组,计算每组Sepal.Length的平均值和标准差
group_by(test, Species)
tmp = summarise(group_by(test, Species),mean(Sepal.Length), sd(Sepal.Length));tmp
image.png
两个实用技能
1:管道操作 %>% (cmd/ctr + shift + M)
library(dplyr)
x1 = filter(iris,Sepal.Width>3)
x2 = select(x1,c("Sepal.Length","Sepal.Width" ))
x3 = arrange(x2,Sepal.Length)
colnames(iris)
[1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
iris %>%
filter(Sepal.Width>3) %>%
select(c("Sepal.Length","Sepal.Width" ))%>%
arrange(Sepal.Length)
2:count统计某列的unique值
> count(test,Species)
Species n
1 setosa 2
2 versicolor 2
3 virginica 2
未讲内容:
处理关系数据:即将2个表进行连接,注意:不要引入factor
options(stringsAsFactors = F)
test1 <- data.frame(name = c('jimmy','nicker','doodle'),
blood_type = c("A","B","O"))
name blood_type
1 jimmy A
2 nicker B
3 doodle O
test2 <- data.frame(name = c('doodle','jimmy','nicker','tony'),
group = c("group1","group1","group2","group2"),
vision = c(4.2,4.3,4.9,4.5))
name group vision
1 doodle group1 4.2
2 jimmy group1 4.3
3 nicker group2 4.9
4 tony group2 4.5
test3 <- data.frame(NAME = c('doodle','jimmy','lucy','nicker'),
weight = c(140,145,110,138))
NAME weight
1 doodle 140
2 jimmy 145
3 lucy 110
4 nicker 138
merge(test1,test2,by="name") #tony这项并不重复的项目不存在了
name blood_type group vision
## 1 doodle O group1 4.2
## 2 jimmy A group1 4.3
## 3 nicker B group2 4.9
merge(test1,test3,by.x = "name",by.y = "NAME") #当相同列名格式不一致时
## name blood_type weight
## 1 doodle O 140
## 2 jimmy A 145
## 3 nicker B 138
1.內连inner_join,取交集
inner_join(test1, test2, by = "name") #跟merge一样效果
## name blood_type group vision
## 1 jimmy A group1 4.3
## 2 nicker B group2 4.9
## 3 doodle O group1 4.2
inner_join(test1,test3,by = c("name"="NAME"))
## name blood_type weight
## 1 jimmy A 145
## 2 nicker B 138
## 3 doodle O 140
2.左连left_join
left_join(test1, test2, by = 'name')
## name blood_type group vision
## 1 jimmy A group1 4.3
## 2 nicker B group2 4.9
## 3 doodle O group1 4.2
left_join(test2, test1, by = 'name') #以test2为准,此时tony存在
## name group vision blood_type
## 1 doodle group1 4.2 O
## 2 jimmy group1 4.3 A
## 3 nicker group2 4.9 B
## 4 tony group2 4.5 <NA>
3.全连full_join
full_join(test1, test2, by = 'name')
## name blood_type group vision
## 1 jimmy A group1 4.3
## 2 nicker B group2 4.9
## 3 doodle O group1 4.2
## 4 tony <NA> group2 4.5
4.半连接:返回能够与y表匹配的x表所有记录semi_join
semi_join(x = test1, y = test2, by = 'name')
## name blood_type
## 1 jimmy A
## 2 nicker B
## 3 doodle O
5.反连接:返回无法与y表匹配的x表的所记录anti_join
anti_join(x = test2, y = test1, by = 'name')
## name group vision
## 1 tony group2 4.5
6.数据的简单合并
在相当于base包里的cbind()函数和rbind()函数;注意,bind_rows()函数需要两个表格列数相同,而bind_cols()函数则需要两个数据框有相同的行数
test1 <- data.frame(x = c(1,2,3,4), y = c(10,20,30,40))
test1
## x y
## 1 1 10
## 2 2 20
## 3 3 30
## 4 4 40
test2 <- data.frame(x = c(5,6), y = c(50,60))
test2
## x y
## 1 5 50
## 2 6 60
test3 <- data.frame(z = c(100,200,300,400))
test3
## z
## 1 100
## 2 200
## 3 300
## 4 400
bind_rows(test1, test2)
## x y
## 1 1 10
## 2 2 20
## 3 3 30
## 4 4 40
## 5 5 50
## 6 6 60
bind_cols(test1, test3)
## x y z
## 1 1 10 100
## 2 2 20 200
## 3 3 30 300
## 4 4 40 400
练习6-1
1.将iris数据框的前4列gather,然后还原
tmp <- iris
tmp_gather <- tmp %>%
gather(key = bioinformation, value = number, -Species)
head(tmp_gather)
## Species bioinformation number
## 1 setosa Sepal.Length 5.1
## 2 setosa Sepal.Length 4.9
## 3 setosa Sepal.Length 4.7
## 4 setosa Sepal.Length 4.6
## 5 setosa Sepal.Length 5.0
## 6 setosa Sepal.Length 5.4
tmp_re <- tmp_gather %>%
group_by(bioinformation) %>%
mutate(id=1:n()) %>%
spread(bioinformation,number)
head(tmp_re)
## # A tibble: 6 x 6
## Species id Petal.Length Petal.Width Sepal.Length Sepal.Width
## <fct> <int> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 1 1.4 0.2 5.1 3.5
## 2 setosa 2 1.4 0.2 4.9 3
## 3 setosa 3 1.3 0.2 4.7 3.2
## 4 setosa 4 1.5 0.2 4.6 3.1
## 5 setosa 5 1.4 0.2 5 3.6
## 6 setosa 6 1.7 0.4 5.4 3.9
2.将第二列分成两列(以小数点为分隔符)然后合并
#### 点号表示任意字符
x=separate(test,
Sepal.Width,
into = c('a','b'),
sep = "\\.")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [2].
x$b <- replace_na(x$b,0);x
## Sepal.Length a b Petal.Length Petal.Width Species
## 1 5.1 3 5 1.4 0.2 setosa
## 2 4.9 3 0 1.4 0.2 setosa
## 3 7.0 3 2 4.7 1.4 versicolor
## 4 6.4 3 2 4.5 1.5 versicolor
## 5 6.3 3 3 6.0 2.5 virginica
## 6 5.8 2 7 5.1 1.9 virginica
x_re=unite(x,
"Sepal.Width",
a,b,sep = ".");x_re
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 7.0 3.2 4.7 1.4 versicolor
## 4 6.4 3.2 4.5 1.5 versicolor
## 5 6.3 3.3 6.0 2.5 virginica
## 6 5.8 2.7 5.1 1.9 virginica
x_re$Sepal.Width <- as.numeric(x_re$Sepal.Width)
str(x_re)
## 'data.frame': 6 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 7 6.4 6.3 5.8
## $ Sepal.Width : num 3.5 3 3.2 3.2 3.3 2.7
## $ Petal.Length: num 1.4 1.4 4.7 4.5 6 5.1
## $ Petal.Width : num 0.2 0.2 1.4 1.5 2.5 1.9
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 2 2 3 3
网友评论