tidyr 核心函数
一、数据清理
##原始数据
> test <- data.frame(geneid = paste0("gene",1:4),
+ sample1 = c(1,4,7,10),
+ sample2 = c(2,5,0.8,11),
+ sample3 = c(0.3,6,9,12))
扁变长(gather-spread)
> test_gather <- gather(data = test,
+ key = sample_nm,
+ value = exp,
+ - geneid)
长变扁
> test_re <- spread(data = test_gather,
+ key = sample_nm,
+ value = exp)
二、分割和合并(separate-unite)
#原始数据
> test <- data.frame(x = c( "a,b", "a,d", "b,c"))
分割
> test_seprate <- separate(test,x, c("X", "Y"),sep = ",")
合并
> test_re <- unite(test_seprate,"x",X,Y,sep = ",")
三、处理NA
### 原始数据
> X<-data.frame(X1 = LETTERS[1:5],X2 = 1:5)
> X[2,2] <- NA
> X[4,1] <- NA
> X
X1 X2
1 A 1
2 B NA
3 C 3
4 <NA> 4
5 E 5
去掉含有NA的行,可以选择只根据某一列来去除
> drop_na(X) #把所有带有NA的行都删掉
X1 X2
1 A 1
2 C 3
3 E 5
> drop_na(X,X1) #只对X1这一列分析,X1中有NA的这一行去掉,注意不是赋值,原本X没改变
X1 X2
1 A 1
2 B NA
3 C 3
4 E 5
替换NA
> replace_na(X$X2,0) #将X2这一列的NA值改为0
[1] 1 0 3 4 5
用上一行的值填充NA
> X
X1 X2
1 A 1
2 B NA
3 C 3
4 <NA> 4
5 E 5
> fill(X,X2) #X2的这一列的空值按上一行填充
X1 X2
1 A 1
2 B 1
3 C 3
4 <NA> 4
5 E 5
完整版见小抄 https://rstudio.com/resources/cheatsheets/
dplyr核心函数
数据准备
> library(dplyr)
> test <- iris[c(1:2,51:52,101:102),]
> rownames(test) =NULL
五个基础函数
1.mutate(),新增列
2.select(),按列筛选
3.filter()筛选行
4.arrange(),按某一列对整个表格进行排序
5.summarise():汇总
###1.mutate(),新增列
> mutate(test, new = Sepal.Length * Sepal.Width)
###2.select(),按列筛选
####(1)按列号筛选
> select(test,1) #筛选第一列
> select(test,c(1,5)) #筛选第一列和第五列
####(2)按列名筛选
> select(test,Sepal.Length)
> select(test, Petal.Length, Petal.Width)
> vars <- c("Petal.Length", "Petal.Width")
> select(test, one_of(vars))
> select(test, starts_with("Petal"))
> select(test, ends_with("Width"))
> select(test, contains("etal"))
> select(test, matches(".t."))
> select(test, everything())
> select(test, last_col())
> select(test, last_col(offset = 1))
####(3)利用everything(),列名可以重排序
> select(test,Species,everything())
###3.filter()筛选行
> filter(test, Species == "setosa")
> filter(test, Species == "setosa"&Sepal.Length > 5 )
> filter(test, Species %in% c("setosa","versicolor"))
###4.arrange(),按某一列对整个表格进行排序
> arrange(test, Sepal.Length)#默认从小到大排序
> arrange(test, desc(Sepal.Length))#用desc从大到小
###5.summarise():汇总
> summarise(test, mean(Sepal.Length), sd(Sepal.Length)) #计算Sepal.Length的平均值和标准差
mean(Sepal.Length) sd(Sepal.Length)
1 5.916667 0.8084965
> # 先按照Species分组,计算每组Sepal.Length的平均值和标准差
> group_by(test, Species)
> tmp = summarise(group_by(test, Species),mean(Sepal.Length), sd(Sepal.Length))
> tmp
# A tibble: 3 x 3
Species `mean(Sepal.Length)` `sd(Sepal.Length)`
* <fct> <dbl> <dbl>
1 setosa 5 0.141
2 versicolor 6.7 0.424
3 virginica 6.05 0.354
补充arrange()函数使用方法
> library(dplyr)
> test = iris[c(1,2,51,52,101,102),] #选取iris的第1,2,51,52,101,102行
> rownames(test) = NULL #去掉iris的行名
> arrange(test,Sepal.Length) #根据Sepal.Length从小到大排序
> arrange(test,desc(Sepal.Length)) #根据Sepal.Length从大到小排序
> arrange(test,Sepal.Length,Sepal.Width) #按照两列排序:先按第一列排序,第一列的值相同时,再按第二列排序
> o = order(test$Sepal.Length) #返回值是位置下标
> test$Sepal.Length[o]
[1] 4.9 5.1 5.8 6.3 6.4 7.0
> # x[order(x)]等同于sort(x),但是用order不仅可以对单列(向量)排序,还可以对整个数据框按行排序,例如:
> test[o,]
两个实用技能
1:管道操作 %>% (cmd/ctrl + shift + M)
> library(dplyr)
> x1 = filter(iris,Sepal.Width>3)
> x2 = select(x1,c("Sepal.Length","Sepal.Width" ))
> x3 = arrange(x2,Sepal.Length)
> iris %>%
+ filter(Sepal.Width>3) %>%
+ select(c("Sepal.Length","Sepal.Width" ))%>%
+ arrange(Sepal.Length)
2:count统计某列的unique值
> count(test,Species)
Species n
1 setosa 2
2 versicolor 2
3 virginica 2
处理关系数据:即将2个表进行连接,注意:不要引入factor
原始数据
> options(stringsAsFactors = F)
> test1 <- data.frame(name = c('jimmy','nicker','doodle'),
+ blood_type = c("A","B","O"))
> test1
name blood_type
1 jimmy A
2 nicker B
3 doodle O
> test2 <- data.frame(name = c('doodle','jimmy','nicker','tony'),
+ group = c("group1","group1","group2","group2"),
+ vision = c(4.2,4.3,4.9,4.5))
> test2
name group vision
1 doodle group1 4.2
2 jimmy group1 4.3
3 nicker group2 4.9
4 tony group2 4.5
> test3 <- data.frame(NAME = c('doodle','jimmy','lucy','nicker'),
+ weight = c(140,145,110,138))
> test3
NAME weight
1 doodle 140
2 jimmy 145
3 lucy 110
4 nicker 138
> merge(test1,test2,by="name")
name blood_type group vision
1 doodle O group1 4.2
2 jimmy A group1 4.3
3 nicker B group2 4.9
> merge(test1,test3,by.x = "name",by.y = "NAME")
name blood_type weight
1 doodle O 140
2 jimmy A 145
3 nicker B 138
1.内连接inner_join,取交集
> inner_join(test1, test2, by = "name")
name blood_type group vision
1 jimmy A group1 4.3
2 nicker B group2 4.9
3 doodle O group1 4.2
> inner_join(test1,test3,by = c("name"="NAME"))
name blood_type weight
1 jimmy A 145
2 nicker B 138
3 doodle O 140
2.左连left_join
> left_join(test1, test2, by = 'name')
name blood_type group vision
1 jimmy A group1 4.3
2 nicker B group2 4.9
3 doodle O group1 4.2
> left_join(test2, test1, by = 'name')
name group vision blood_type
1 doodle group1 4.2 O
2 jimmy group1 4.3 A
3 nicker group2 4.9 B
4 tony group2 4.5 <NA>
3.全连full_join
> full_join(test1, test2, by = 'name')
name blood_type group vision
1 jimmy A group1 4.3
2 nicker B group2 4.9
3 doodle O group1 4.2
4 tony <NA> group2 4.5
4.半连接:返回能够与y表匹配的x表所有记录semi_join
> semi_join(x = test1, y = test2, by = 'name')
name blood_type
1 jimmy A
2 nicker B
3 doodle O
5.反连接:返回无法与y表匹配的x表的所有记录anti_join
> anti_join(x = test2, y = test1, by = 'name')
name group vision
1 tony group2 4.5
6.数据的简单合并
相当于base包里的rbind()函数和cbind()函数;注意,bind_rows()函数按列名对齐两个表格(某个表格缺少的列会用NA填充),而bind_cols()函数则需要两个数据框有相同的行数
> test1 <- data.frame(x = c(1,2,3,4), y = c(10,20,30,40))
> test1
x y
1 1 10
2 2 20
3 3 30
4 4 40
> test2 <- data.frame(x = c(5,6), y = c(50,60))
> test2
x y
1 5 50
2 6 60
> test3 <- data.frame(z = c(100,200,300,400))
> test3
z
1 100
2 200
3 300
4 400
> bind_rows(test1, test2)
x y
1 1 10
2 2 20
3 3 30
4 4 40
5 5 50
6 6 60
> bind_cols(test1, test3)
x y z
1 1 10 100
2 2 20 200
3 3 30 300
4 4 40 400
stringr函数
> library(stringr)
> x <- "The birch canoe slid on the smooth planks."
> x
[1] "The birch canoe slid on the smooth planks."
1.检测字符串长度
> length(x)
[1] 1
> str_length(x) #含有多少个字符
[1] 42
2.字符串拆分与组合
> str_split(x," ") #按空格将字符串拆分
[[1]]
[1] "The" "birch" "canoe" "slid" "on"
[6] "the" "smooth" "planks."
> x2 = str_split(x," ")[[1]]
> str_c(x2,collapse = " ") #按空格组合
[1] "The birch canoe slid on the smooth planks."
> str_c(x2,1234,sep = "+")
[1] "The+1234" "birch+1234" "canoe+1234"
[4] "slid+1234" "on+1234" "the+1234"
[7] "smooth+1234" "planks.+1234"
3.提取字符串的一部分
> str_sub(x,5,9) #从第5位到第9位
[1] "birch"
4.大小写转换
> str_to_upper(x2) #将字符串改为大写
[1] "THE" "BIRCH" "CANOE" "SLID" "ON"
[6] "THE" "SMOOTH" "PLANKS."
> str_to_lower(x2) #将字符串改为小写
[1] "the" "birch" "canoe" "slid" "on"
[6] "the" "smooth" "planks."
> str_to_title(x2) #将首字母改为大写
[1] "The" "Birch" "Canoe" "Slid" "On"
[6] "The" "Smooth" "Planks."
5.字符串排序
> str_sort(x2)
[1] "birch" "canoe" "on" "planks." "slid"
[6] "smooth" "the" "The"
6.字符检测 --返回值为逻辑值
> str_detect(x2,"h") #字符串含有"h"
[1] TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE
> str_starts(x2,"T") #字符串以"T"开头
[1] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
> str_ends(x2,"e") #字符串以e结尾的
[1] TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE
###与sum和mean连用,可以统计匹配的个数和比例
> sum(str_detect(x2,"h"))
[1] 4
> mean(str_detect(x2,"h"))
[1] 0.5
7.提取匹配到的字符串
> str_subset(x2,"h")
[1] "The" "birch" "the" "smooth"
8.字符计数
> str_count(x," ") #统计x中空格的个数
[1] 7
> str_count(x2,"o")
[1] 0 0 1 0 1 0 2 0
9.字符串替换
> str_replace(x2,"o","A") #只替换每个字符串中第一个匹配到的o
[1] "The" "birch" "canAe" "slid" "An"
[6] "the" "smAoth" "planks."
> str_replace_all(x2,"o","A") #将所有的o替换为A
[1] "The" "birch" "canAe" "slid" "An"
[6] "the" "smAAth" "planks."
练习
#Bioinformatics is a new subject of genetic data collection,analysis and dissemination to the research community.
#1.将上面这句话作为一个长字符串,赋值给tmp
#2.拆分为一个由单词组成的向量,赋值给tmp2(注意标点符号)
#3.用函数返回这句话中有多少个单词。
#4.用函数返回这句话中每个单词由多少个字母组成。
#5.统计tmp2有多少个单词中含有字母"e"
> tmp <- "Bioinformatics is a new subject of genetic data collection,analysis and dissemination to the research community."
> tmp2 <- tmp %>%
+ str_replace(","," ")%>% #将逗号变为空格
+ str_remove("[.]")%>% #去掉句号;写成[.]是让.按字面意义匹配,而不是正则表达式里的"任意字符"
+ str_split(" ")
> tmp2 <- tmp2[[1]]
> length(tmp2)
[1] 16
> str_length(tmp2)
[1] 14 2 1 3 7 2 7 4 10 8 3 13 2 3 8 9
> table(str_detect(tmp2,"e"))
FALSE TRUE
9 7
> sum(str_detect(tmp2,"e"))
[1] 7
#str_count(tmp2,"e") #是指每个字符串里含有多少个e
条件语句和循环语句
一.条件语句
1.if(){ }
(1)只有if没有else,那么条件是FALSE时就什么都不做
> i = -1
> if (i<0) print('up')
[1] "up"
> if (i>0) print('up') #条件是FALSE
(2)有else
> i =1
> if (i>0){
+ cat('+')
+ } else {
+ print("-")
+ }
+ #输出为+(这是cat()打印的内容,不是返回值)
ifelse函数
ifelse有三个参数
ifelse(x,yes,no)
x:是逻辑值
yes:逻辑值为TRUE时的返回值
no:逻辑值为FALSE时的返回值
> i=c(0.11548,-5.123,2.654)
> ifelse(i>0,"+","-")
[1] "+" "-" "+"
> x=rnorm(10)
> x
[1] 0.6425792 -0.6829069 0.1632753 -0.2406404
[5] -0.3182894 -0.7686996 -0.1892211 -0.1442053
[9] 1.0053013 -1.4639149
> y=ifelse(x>0,"+","-")
> y
[1] "+" "-" "+" "-" "-" "-" "-" "-" "+" "-"
(3)多个条件
> i = 0
> if (i>0){
+ print('+')
+ } else if (i==0) {
+ print('0')
+ } else if (i< 0){
+ print('-')
+ }
[1] "0"
> ifelse(i>0,"+",ifelse((i<0),"-","0"))
[1] "0"
2.switch()
> cd = 3
> foo <- switch(EXPR = cd,
+ #EXPR = "aa",
+ aa=c(3.4,1),
+ bb=matrix(1:4,2,2),
+ cc=matrix(c(T,T,F,T,F,F),3,2),
+ dd="string here",
+ ee=matrix(c("red","green","blue","yellow")))
> foo
[,1] [,2]
[1,] TRUE TRUE
[2,] TRUE FALSE
[3,] FALSE FALSE
练习
#1.使用循环,查看"a",TRUE和3的数据类型
> a <- list("a",TRUE,3)
> for (i in 1:length(a)) {
+ print(class(a[[i]]))
+
+ }
[1] "character"
[1] "logical"
[1] "numeric"
#2.生成10个随机数,根据这10个随机数生成一个新向量,>中位数的值对应"A",<中位数的值对应"B"。
> b <- rnorm(10)
> ifelse(b>median(b),"A","B")
[1] "A" "B" "A" "B" "B" "A" "B" "B" "A" "A"
#3.根据上一练习题中的tmp2生成一个新向量,含有e的值对应"A",不含有e的值对应"B"
> tmp2 <- tmp %>%
+ str_replace(","," ") %>%
+ str_remove("[.]") %>%
+ str_split(" ")
> tmp2
[[1]]
[1] "Bioinformatics" "is"
[3] "a" "new"
[5] "subject" "of"
[7] "genetic" "data"
[9] "collection" "analysis"
[11] "and" "dissemination"
[13] "to" "the"
[15] "research" "community"
> tmp2 <- tmp2[[1]]
> ifelse(str_detect(tmp2,"e"),"A","B")
[1] "B" "B" "B" "A" "A" "B" "A" "B" "A" "B" "B" "A"
[13] "B" "A" "A" "B"
#4.加载deg.Rdata,根据a、b两列的值,按照以下条件生成向量x:
#a<1 且b<0.05,则x对应的值为down;
#a>1 且b<0.05,则x对应的值为up;
#其他情况,x对应的值为no
> load("deg.Rdata")
> k1 = deg$a<1 & deg$b<0.05
> k2 = deg$a>1 & deg$b<0.05
> x = ifelse(k1,"down",ifelse(k2,"up","no"))
# 5.统计x的重复值个数
> table(x)
x
down no up
3828 26094 853
# 6.将x添加到deg数据框中,成为新的一列
> deg$x <- x
二、循环语句
1.for循环
> x <- c(5,6,0,3)
> s=0
> for (i in x){
+ s=s+i
+ #if(i == 0) next
+ #if (i == 0) break
+ print(c(which(x==i),i,1/i,s))
+ }
[1] 1.0 5.0 0.2 5.0
[1] 2.0000000 6.0000000 0.1666667 11.0000000
[1] 3 0 Inf 11
[1] 4.0000000 3.0000000 0.3333333 14.0000000
> x <- c(5,6,0,3)
> s=0
> for (i in x){
+ s=s+i
+ if(i == 0) next
+ #if (i == 0) break
+ print(c(which(x==i),i,1/i,s))
+ }
[1] 1.0 5.0 0.2 5.0
[1] 2.0000000 6.0000000 0.1666667 11.0000000
[1] 4.0000000 3.0000000 0.3333333 14.0000000
> x <- c(5,6,0,3)
> s=0
> for (i in x){
+ s=s+i
+ #if(i == 0) next
+ if (i == 0) break
+ print(c(which(x==i),i,1/i,s))
+ }
[1] 1.0 5.0 0.2 5.0
[1] 2.0000000 6.0000000 0.1666667 11.0000000
#如何将结果存下来?
> s = 0
> result = list()
> for(i in 1:length(x)){
+ s=s+x[[i]]
+ result[[i]] = c(i,x[[i]],1/i,s)
+ }
> do.call(cbind,result)
[,1] [,2] [,3] [,4]
[1,] 1 2.0 3.0000000 4.00
[2,] 5 6.0 0.0000000 3.00
[3,] 1 0.5 0.3333333 0.25
[4,] 5 11.0 11.0000000 14.00
#练习4----
#1.使用循环,对iris的1到4列分别画点图(plot)
> par(mfrow = c(2,2)) #par()函数可以将绘图区分割成规则的几个部分,而且是先按行绘制,mfcol是先按列绘制
> for(i in 1:4){
+ plot(iris[,i],col = iris[,5])
+ }
#2.生成一个随机数(rnorm)组成的10行6列的矩阵,列名为sample1,sample2….sample6,行名为gene1,gene2…gene10,分组为sample1、2、3属于A组,sample4、5、6属于B组。用循环对每个基因画ggplot2箱线图,并尝试把10张图拼到一起。
> exp = matrix(rnorm(60),nrow = 10)
> colnames(exp) <- paste0("sample",1:6)
> rownames(exp) <- paste0("gene",1:10)
> exp[1:4,1:4]
sample1 sample2 sample3 sample4
gene1 0.3756800 -0.35824521 0.04884076 0.004333555
gene2 1.3406486 1.29023800 -0.18444678 -0.379581765
gene3 -0.2858732 -0.03525992 0.46980022 0.582935510
gene4 -1.2478246 -0.47409951 -0.72981205 1.374565803
> #dat = cbind(t(exp),group = rep(c("A","B"),each = 3))
> dat = data.frame(t(exp))
> dat = mutate(dat,group = rep(c("A","B"),each = 3))
> p = list()
> library(ggplot2)
> for(i in 1:(ncol(dat)-1)){
+ p[[i]] = ggplot(data = dat,aes_string(x = "group",y=colnames(dat)[i]))+ #批量出图时,需用到aes_string(),字符向量的循环
+ geom_boxplot(aes(color = group))+
+ geom_jitter(aes(color = group))+
+ theme_bw()
+ }
> library(patchwork)
> wrap_plots(p,nrow = 2,guides = "collect")
2.while 循环
> i = 0
> while (i < 5){
+ print(c(i,i^2))
+ i = i+1
+ }
[1] 0 0
[1] 1 1
[1] 2 4
[1] 3 9
[1] 4 16
3.repeat 语句
注意:必须有break
> i=0L
> s=0L
> repeat{
+ i = i + 1
+ s = s + i
+ print(c(i,s))
+ if(i==10) break
+ }
[1] 1 1
[1] 2 3
[1] 3 6
[1] 4 10
[1] 5 15
[1] 6 21
[1] 7 28
[1] 8 36
[1] 9 45
[1] 10 55
apply()族函数
1.apply 处理矩阵或数据框
apply(X, MARGIN, FUN, …)
其中X是数据框/矩阵名;
MARGIN为1表示取行,为2表示取列,FUN是函数
> test<- iris[,1:4]
> apply(test, 2, mean)
Sepal.Length Sepal.Width Petal.Length Petal.Width
5.843333 3.057333 3.758000 1.199333
> apply(test, 1, sum)
[1] 10.2 9.5 9.4 9.4 10.2 11.4 9.7 10.1 8.9
[10] 9.6 10.8 10.0 9.3 8.5 11.2 12.0 11.0 10.3
[19] 11.5 10.7 10.7 10.7 9.4 10.6 10.3 9.8 10.4
[28] 10.4 10.2 9.7 9.7 10.7 10.9 11.3 9.7 9.6
[37] 10.5 10.0 8.9 10.2 10.1 8.4 9.1 10.7 11.2
[46] 9.5 10.7 9.4 10.7 9.9 16.3 15.6 16.4 13.1
[55] 15.4 14.3 15.9 11.6 15.4 13.2 11.5 14.6 13.2
[64] 15.1 13.4 15.6 14.6 13.6 14.4 13.1 15.7 14.2
[73] 15.2 14.8 14.9 15.4 15.8 16.4 14.9 12.8 12.8
[82] 12.6 13.6 15.4 14.4 15.5 16.0 14.3 14.0 13.3
[91] 13.7 15.1 13.6 11.6 13.8 14.1 14.1 14.7 11.7
[100] 13.9 18.1 15.5 18.1 16.6 17.5 19.3 13.6 18.3
[109] 16.8 19.4 16.8 16.3 17.4 15.2 16.1 17.2 16.8
[118] 20.4 19.5 14.7 18.1 15.3 19.2 15.7 17.8 18.2
[127] 15.6 15.8 16.9 17.6 18.2 20.1 17.0 15.7 15.7
[136] 19.1 17.7 16.8 15.6 17.5 17.8 17.4 15.5 18.2
[145] 18.2 17.2 15.7 16.7 17.3 15.8
> res <- c()
> for(i in 1:nrow(test)){
+ res[[i]] <- sum(test[i,])
+ }
2.lapply(list, FUN, …)
对列表/向量中的每个元素(向量)实施相同的操作
> test <- list(x = 36:33,
+ y = 32:35,
+ z = 30:27)
返回值是列表,对列表中的每个元素(向量)求均值(试试方差var,分位数quantile)
> lapply(test,mean)
$x
[1] 34.5
$y
[1] 33.5
$z
[1] 28.5
> class(lapply(test,mean))
[1] "list"
> x <- unlist(lapply(test,mean));x
x y z
34.5 33.5 28.5
> class(x)
[1] "numeric"
3.sapply 处理列表,简化结果,直接返回向量或矩阵
sapply(X, FUN, …) 注意和lapply的区别,返回值不一样
> lapply(test,min)
$x
[1] 33
$y
[1] 32
$z
[1] 27
> sapply(test,min)
x y z
33 32 27
> lapply(test,range)
$x
[1] 33 36
$y
[1] 32 35
$z
[1] 27 30
> sapply(test,range)
x y z
[1,] 33 32 27
[2,] 36 35 30
> class(sapply(test,range))
[1] "matrix" "array"
网友评论