美文网首页
12-1 tidyr dplyr stringr

12-1 tidyr dplyr stringr

作者: 小胡同学ime | 来源:发表于2021-10-15 21:06 被阅读0次

tidyr dplyr stringr “R数据科学”

image.png

R1:tidyr 核心函数

gather-spread # 列的聚集与分散 
separate-unite # 列的拆分与合并
image.png
image.png

处理NA:

drop_na(数据) #删除含有na的所有行
drop_na(数据名,x1) #删除 x1列中所有行
replace_na(x$x2,0)  #吧=把数据x中的所有na替换成0,但是并未赋值
fill(x,x2) #按照上一行填充缺失值

R2:dplyr

基础

  1. mutate(), 新增列
  2. select(), 按列筛选 #按列取子集
  3. filter(), 按行筛选 #按列取子集
  4. arrange(), 按某一行对数据框进行排序
  5. summarise(), 汇总

进阶
*count()
输出结果为数据框,能统计数据及重复次数
*管道符号 %>% (ctr+shift+M)
上一步的输出作为下一步的输入

image.png
*处理关系数据的函数

tidyr

一、数据清理

rm(list = ls())
options(stringsAsFactors = F)
if(!require(tidyr))install.packages("tidyr")
library(tidyr)

test <- data.frame(geneid = paste0("gene",1:4),
                 sample1 = c(1,4,7,10),
                 sample2 = c(2,5,0.8,11),
                 sample3 = c(0.3,6,9,12))
test
    geneid sample1 sample2 sample3
1  gene1       1           2.0       0.3
2  gene2       4           5.0       6.0
3  gene3       7           0.8       9.0
4  gene4      10         11.0      12.0

扁变长

test_gather <- gather(data = test,
                    key = sample_nm,
                    value = exp,
                    - geneid)
head(test_gather)
   geneid sample_nm exp
1  gene1   sample1   1
2  gene2   sample1   4
3  gene3   sample1   7
4  gene4   sample1  10
5  gene1   sample2   2
6  gene2   sample2   5

长变扁

test_re <- spread(data = test_gather,
                key = sample_nm,
                value = exp)
head(test_re)
   geneid sample1 sample2 sample3
1  gene1       1     2.0     0.3
2  gene2       4     5.0     6.0
3  gene3       7     0.8     9.0
4  gene4      10    11.0    12.0

二、分割和合并

原始数据

test <- data.frame(x = c( "a,b", "a,d", "b,c"));test
  x
1 a,b
2 a,d
3 b,c

分割

test_seprate <- separate(test,x, c("X", "Y"),sep = ",");test_seprate
  X Y
1 a b
2 a d
3 b c

合并

test_re <- unite(test_seprate,"x",X,Y,sep = ",");test_re  
     x
1 a,b
2 a,d
3 b,c

三、处理NA

原始数据

X<-data.frame(X1 = LETTERS[1:5],X2 = 1:5)
X[2,2] <- NA
X[4,1] <- NA;X
     X1 X2
1    A  1
2    B NA
3    C  3
4 <NA>  4
5    E  5

1.去掉含有NA的行,可以选择只根据某一列来去除

drop_na(X)   #删除x这个数据里的na
   X1 X2
1  A  1
2  C  3
3  E  5

drop_na(X,X1)   #删除x1列na
     X1 X2
1    A  1
2    B NA
3    C  3
4    E  5

drop_na(X,X2)
     X1 X2
1    A  1
2    C  3
3 <NA>  4
4    E  5

2.替换NA

replace_na(X$X2,0)    #将x2里面的na替换成0
## [1] 1 0 3 4 5

3.用上一行的值填充NA

X
      X1 X2
1    A  1
2    B NA
3    C  3
4 <NA>  4
5    E  5
fill(X,X2)
      X1 X2  #把x的x2中na填充上一行的值
1    A  1
2    B  1
3    C  3
4 <NA>  4
5    E  5

小抄

dplyr

rm(list = ls())
if(!require(dplyr))install.packages("dplyr")
library(dplyr)

test <- iris[c(1:2,51:52,101:102),]
rownames(test) =NULL

五个基础函数

1.mutate(),新增列

mutate(test, new = Sepal.Length * Sepal.Width)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species   new
## 1          5.1         3.5          1.4         0.2     setosa 17.85
## 2          4.9         3.0          1.4         0.2     setosa 14.70
## 3          7.0         3.2          4.7         1.4 versicolor 22.40
## 4          6.4         3.2          4.5         1.5 versicolor 20.48
## 5          6.3         3.3          6.0         2.5  virginica 20.79
## 6          5.8         2.7          5.1         1.9  virginica 15.66

2.select(),按列筛选

(1)按列号筛选
select(test,1)
##   Sepal.Length
## 1          5.1
## 2          4.9
## 3          7.0
## 4          6.4
## 5          6.3
## 6          5.8

select(test,c(1,5))
##   Sepal.Length    Species
## 1          5.1     setosa
## 2          4.9     setosa
## 3          7.0 versicolor
## 4          6.4 versicolor
## 5          6.3  virginica
## 6          5.8  virginica

(2)按列名筛选
select(test,Sepal.Length)
##   Sepal.Length
## 1          5.1
## 2          4.9
## 3          7.0
## 4          6.4
## 5          6.3
## 6          5.8

select(test, Petal.Length, Petal.Width)
##   Petal.Length Petal.Width
## 1          1.4         0.2
## 2          1.4         0.2
## 3          4.7         1.4
## 4          4.5         1.5
## 5          6.0         2.5
## 6          5.1         1.9

vars <- c("Petal.Length", "Petal.Width")
select(test, one_of(vars))
##   Petal.Length Petal.Width
## 1          1.4         0.2
## 2          1.4         0.2
## 3          4.7         1.4
## 4          4.5         1.5
## 5          6.0         2.5
## 6          5.1         1.9

(3)一组来自tidyselect的有用函数
select(test, starts_with("Petal"))
##   Petal.Length Petal.Width
## 1          1.4         0.2
## 2          1.4         0.2
## 3          4.7         1.4
## 4          4.5         1.5
## 5          6.0         2.5
## 6          5.1         1.9

select(test, ends_with("Width"))
##   Sepal.Width Petal.Width
## 1         3.5         0.2
## 2         3.0         0.2
## 3         3.2         1.4
## 4         3.2         1.5
## 5         3.3         2.5
## 6         2.7         1.9

select(test, contains("etal"))
##   Petal.Length Petal.Width
## 1          1.4         0.2
## 2          1.4         0.2
## 3          4.7         1.4
## 4          4.5         1.5
## 5          6.0         2.5
## 6          5.1         1.9

select(test, matches(".t."))
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3.0          1.4         0.2
## 3          7.0         3.2          4.7         1.4
## 4          6.4         3.2          4.5         1.5
## 5          6.3         3.3          6.0         2.5
## 6          5.8         2.7          5.1         1.9

select(test, everything())
##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1          5.1         3.5          1.4         0.2     setosa
## 2          4.9         3.0          1.4         0.2     setosa
## 3          7.0         3.2          4.7         1.4 versicolor
## 4          6.4         3.2          4.5         1.5 versicolor
## 5          6.3         3.3          6.0         2.5  virginica
## 6          5.8         2.7          5.1         1.9  virginica

select(test, last_col())
##      Species
## 1     setosa
## 2     setosa
## 3 versicolor
## 4 versicolor
## 5  virginica
## 6  virginica

select(test, last_col(offset = 1)) # offset 就是 ncol - 1那就是选取倒数第二列
##   Petal.Width
## 1         0.2
## 2         0.2
## 3         1.4
## 4         1.5
## 5         2.5
## 6         1.9
(4)利用everything(),列名可以重排序
select(test,Species,everything())
##      Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     setosa          5.1         3.5          1.4         0.2
## 2     setosa          4.9         3.0          1.4         0.2
## 3 versicolor          7.0         3.2          4.7         1.4
## 4 versicolor          6.4         3.2          4.5         1.5
## 5  virginica          6.3         3.3          6.0         2.5
## 6  virginica          5.8         2.7          5.1         1.9

3.filter()筛选行

filter(test, Species == "setosa")
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa

filter(test, Species == "setosa"&Sepal.Length > 5 )
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa

filter(test, Species %in% c("setosa","versicolor"))
##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1          5.1         3.5          1.4         0.2     setosa
## 2          4.9         3.0          1.4         0.2     setosa
## 3          7.0         3.2          4.7         1.4 versicolor
## 4          6.4         3.2          4.5         1.5 versicolor

4.arrange(),按某一列对整个表格进行排序

arrange(test, Sepal.Length) #默认从小到大排序
##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1          4.9         3.0          1.4         0.2     setosa
## 2          5.1         3.5          1.4         0.2     setosa
## 3          5.8         2.7          5.1         1.9  virginica
## 4          6.3         3.3          6.0         2.5  virginica
## 5          6.4         3.2          4.5         1.5 versicolor
## 6          7.0         3.2          4.7         1.4 versicolor

arrange(test, desc(Sepal.Length)) #用desc从大到小
##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1          7.0         3.2          4.7         1.4 versicolor
## 2          6.4         3.2          4.5         1.5 versicolor
## 3          6.3         3.3          6.0         2.5  virginica
## 4          5.8         2.7          5.1         1.9  virginica
## 5          5.1         3.5          1.4         0.2     setosa
## 6          4.9         3.0          1.4         0.2     setosa

arrange(test, desc(Sepal.Width),Sepal.Length) #按照多列进行排序??
##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1          5.1         3.5          1.4         0.2     setosa
## 2          6.3         3.3          6.0         2.5  virginica
## 3          6.4         3.2          4.5         1.5 versicolor
## 4          7.0         3.2          4.7         1.4 versicolor
## 5          4.9         3.0          1.4         0.2     setosa
## 6          5.8         2.7          5.1         1.9  virginica

5.summarise():汇总

对数据进行汇总操作,结合group_by使用实用性强
#计算Sepal.Length的平均值和标准差:
summarise(test, mean(Sepal.Length), sd(Sepal.Length))
     mean(Sepal.Length) sd(Sepal.Length)
1           5.916667        0.8084965

# 先按照Species分组,计算每组Sepal.Length的平均值和标准差
group_by(test, Species)
tmp = summarise(group_by(test, Species),mean(Sepal.Length), sd(Sepal.Length));tmp
image.png

两个实用技能

1:管道操作 %>% (cmd/ctr + shift + M)
library(dplyr)
x1 = filter(iris,Sepal.Width>3)
x2 = select(x1,c("Sepal.Length","Sepal.Width" ))
x3 = arrange(x2,Sepal.Length)

colnames(iris)
 [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"

iris %>% 
  filter(Sepal.Width>3) %>% 
  select(c("Sepal.Length","Sepal.Width" ))%>%
  arrange(Sepal.Length)

2:count统计某列的unique值

> count(test,Species)
     Species n
1     setosa 2
2 versicolor 2
3  virginica 2

未讲内容:

处理关系数据:即将2个表进行连接,注意:不要引入factor
options(stringsAsFactors = F)

test1 <- data.frame(name = c('jimmy','nicker','doodle'), 
                    blood_type = c("A","B","O"))
       name blood_type
1     jimmy          A
2     nicker         B
3     doodle         O

test2 <- data.frame(name = c('doodle','jimmy','nicker','tony'),
                    group = c("group1","group1","group2","group2"),
                    vision = c(4.2,4.3,4.9,4.5)) 
     name  group vision
1 doodle group1    4.2
2  jimmy group1    4.3
3 nicker group2    4.9
4   tony group2    4.5

test3 <- data.frame(NAME = c('doodle','jimmy','lucy','nicker'),
                    weight = c(140,145,110,138))
  NAME weight
1 doodle    140
2  jimmy    145
3   lucy    110
4 nicker    138

merge(test1,test2,by="name")  #tony这项并不重复的项目不存在了
name blood_type  group vision
## 1 doodle          O group1    4.2
## 2  jimmy          A group1    4.3
## 3 nicker          B group2    4.9

merge(test1,test3,by.x = "name",by.y = "NAME")  #当相同列名格式不一致时
##     name blood_type weight
## 1 doodle          O    140
## 2  jimmy          A    145
## 3 nicker          B    138
1.內连inner_join,取交集
inner_join(test1, test2, by = "name") #跟merge一样效果
##     name blood_type  group vision
## 1  jimmy          A group1    4.3
## 2 nicker          B group2    4.9
## 3 doodle          O group1    4.2

inner_join(test1,test3,by = c("name"="NAME"))
##     name blood_type weight
## 1  jimmy          A    145
## 2 nicker          B    138
## 3 doodle          O    140

2.左连left_join
left_join(test1, test2, by = 'name')
##     name blood_type  group vision
## 1  jimmy          A group1    4.3
## 2 nicker          B group2    4.9
## 3 doodle          O group1    4.2

left_join(test2, test1, by = 'name') #以test2为准,此时tony存在
##     name  group vision blood_type
## 1 doodle group1    4.2          O
## 2  jimmy group1    4.3          A
## 3 nicker group2    4.9          B
## 4   tony group2    4.5       <NA>
3.全连full_join
full_join(test1, test2, by = 'name')
##     name blood_type  group vision
## 1  jimmy          A group1    4.3
## 2 nicker          B group2    4.9
## 3 doodle          O group1    4.2
## 4   tony       <NA> group2    4.5

4.半连接:返回能够与y表匹配的x表所有记录semi_join
semi_join(x = test1, y = test2, by = 'name')
##     name blood_type
## 1  jimmy          A
## 2 nicker          B
## 3 doodle          O
5.反连接:返回无法与y表匹配的x表的所记录anti_join
anti_join(x = test2, y = test1, by = 'name')
##   name  group vision
## 1 tony group2    4.5
6.数据的简单合并
在相当于base包里的cbind()函数和rbind()函数;注意,bind_rows()函数需要两个表格列数相同,而bind_cols()函数则需要两个数据框有相同的行数
test1 <- data.frame(x = c(1,2,3,4), y = c(10,20,30,40))
test1
##   x  y
## 1 1 10
## 2 2 20
## 3 3 30
## 4 4 40
test2 <- data.frame(x = c(5,6), y = c(50,60))
test2
##   x  y
## 1 5 50
## 2 6 60
test3 <- data.frame(z = c(100,200,300,400))
test3
##     z
## 1 100
## 2 200
## 3 300
## 4 400

bind_rows(test1, test2)
##   x  y
## 1 1 10
## 2 2 20
## 3 3 30
## 4 4 40
## 5 5 50
## 6 6 60

bind_cols(test1, test3)
##   x  y   z
## 1 1 10 100
## 2 2 20 200
## 3 3 30 300
## 4 4 40 400

练习6-1

1.将iris数据框的前4列gather,然后还原
tmp <- iris
tmp_gather <- tmp %>% 
  gather(key = bioinformation, value = number, -Species)
head(tmp_gather)
##   Species bioinformation number
## 1  setosa   Sepal.Length    5.1
## 2  setosa   Sepal.Length    4.9
## 3  setosa   Sepal.Length    4.7
## 4  setosa   Sepal.Length    4.6
## 5  setosa   Sepal.Length    5.0
## 6  setosa   Sepal.Length    5.4

tmp_re <- tmp_gather %>%
  group_by(bioinformation) %>% 
  mutate(id=1:n()) %>%
  spread(bioinformation,number)
head(tmp_re)
## # A tibble: 6 x 6
##   Species    id Petal.Length Petal.Width Sepal.Length Sepal.Width
##   <fct>   <int>        <dbl>       <dbl>        <dbl>       <dbl>
## 1 setosa      1          1.4         0.2          5.1         3.5
## 2 setosa      2          1.4         0.2          4.9         3  
## 3 setosa      3          1.3         0.2          4.7         3.2
## 4 setosa      4          1.5         0.2          4.6         3.1
## 5 setosa      5          1.4         0.2          5           3.6
## 6 setosa      6          1.7         0.4          5.4         3.9

小洁老师本人的解答

2.将第二列分成两列(以小数点为分隔符)然后合并
#### 点号表示任意字符
x=separate(test,
           Sepal.Width,
           into = c('a','b'),
           sep = "\\.") 
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [2].
x$b <- replace_na(x$b,0);x 
##   Sepal.Length a b Petal.Length Petal.Width    Species
## 1          5.1 3 5          1.4         0.2     setosa
## 2          4.9 3 0          1.4         0.2     setosa
## 3          7.0 3 2          4.7         1.4 versicolor
## 4          6.4 3 2          4.5         1.5 versicolor
## 5          6.3 3 3          6.0         2.5  virginica
## 6          5.8 2 7          5.1         1.9  virginica

x_re=unite(x,
           "Sepal.Width",
           a,b,sep = ".");x_re
##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1          5.1         3.5          1.4         0.2     setosa
## 2          4.9         3.0          1.4         0.2     setosa
## 3          7.0         3.2          4.7         1.4 versicolor
## 4          6.4         3.2          4.5         1.5 versicolor
## 5          6.3         3.3          6.0         2.5  virginica
## 6          5.8         2.7          5.1         1.9  virginica

x_re$Sepal.Width <- as.numeric(x_re$Sepal.Width)
str(x_re)
## 'data.frame':    6 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 7 6.4 6.3 5.8
##  $ Sepal.Width : num  3.5 3 3.2 3.2 3.3 2.7
##  $ Petal.Length: num  1.4 1.4 4.7 4.5 6 5.1
##  $ Petal.Width : num  0.2 0.2 1.4 1.5 2.5 1.9
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 2 2 3 3

相关文章

网友评论

      本文标题:12-1 tidyr dplyr stringr

      本文链接:https://www.haomeiwen.com/subject/vbxamltx.html