新建与读取数据框
新建数据框
> df <- data.frame(gene = paste0("gene",1:3),
+ sam = paste0("sample",1:3),
+ exp = c(32,34,45))
> df
gene sam exp
1 gene1 sample1 32
2 gene2 sample2 34
3 gene3 sample3 45
读取数据框
# Read gene.csv (path relative to the working directory) into a data frame
df2 <- read.csv(file = "gene.csv")
数据框的属性
# Number of rows and number of columns, returned together
dim(df)
# Number of rows
nrow(df)
# Number of columns
ncol(df)
# Row names
rownames(df)
# Column names
colnames(df)
数据框取子集
# Single cell: row 2, column 2
df[2, 2]
# Entire second row
df[2, ]
# Entire second column
df[, 2]
# Rows 1 and 3, columns 1 through 2
df[c(1, 3), 1:2]
# Columns selected by name
df[, "gene"]
df[, c("gene", "exp")]
# Last column
df[, ncol(df)]
# Every column except the last one
df[, -ncol(df)]
数据框编辑
# Change a single cell (row 3, column 3)
df[3, 3] <- 5
# Replace an entire column
df$exp <- c(12, 23, 50)
# Replace all row names (colnames(df) can be assigned in the same way)
rownames(df) <- c("r1", "r2", "r3")
# Rename just one row/column by indexing into the names vector
rownames(df)[2] <- "x"
两个表格的连接
# Join test1 and test2 on their shared "name" column and keep the result
tmp <- merge(x = test1, y = test2, by = "name")
# Key columns are named differently: "name" in test1, "NAME" in test3
merge(x = test1, y = test3, by.x = "name", by.y = "NAME")
删除
# Remove a single object from the workspace
rm(l)
# Remove several objects at once
rm(df,m)
# Remove every object listed by ls(), i.e. clear the current environment
rm(list = ls())
# Drop rows containing NA; the result is returned, df itself is not modified
na.omit(df)
元素的名字
x=1:10
> names(x)=letters[1:10]
> x
a b c d e f g h i j
1 2 3 4 5 6 7 8 9 10
> x["a"]
a
1
统计数据框中某一列有哪几个重复值,分别重复了多少次
# Frequency of each distinct value in the Species column
table(iris[["Species"]])
分割和合并
library(tidyr)
# Example data: one column whose values hold two comma-separated fields
test <- data.frame(x = c("a,b", "a,d", "b,c"))
test
# Split column x at the comma into two new columns, X and Y
test_seprate <- separate(test, x, c("X", "Y"), sep = ",")
test_seprate
# Paste X and Y back together into a single column named x
test_re <- unite(test_seprate, "x", X, Y, sep = ",")
表格拆分多次
# Split three columns in turn; each step works on the previous step's result
y_seprate <- y %>%
  separate(source_name_ch1, into = c("X", "Y"), sep = ";") %>%
  separate(characteristics_ch1.10, into = c("Z", "W"), sep = ":") %>%
  separate(W, into = c("A", "B"), sep = ";")
删除多余列
# Keep only columns Y and B; every other column is left out
y_pd <- y_seprate[, c("Y", "B")]
library(dplyr)
1.mutate(),新增列
# The dplyr examples need iris-style columns; without this line `test` is still
# the one-column data frame built for the separate() example.
# Six-row sample of iris: two rows from each species.
test <- iris[c(1:2, 51:52, 101:102), ]
# Append a new column computed from existing columns
mutate(test, new = Sepal.Length * Sepal.Width)
2.select(),按列筛选
(1)按列号筛选
# First column only
select(test, 1)
# First and fifth columns
select(test, c(1, 5))
(2)按列名筛选
# Bare column names
select(test, Petal.Length, Petal.Width)
# Column names stored in a character vector: wrap the vector in all_of(),
# which replaces the superseded one_of() and errors if a listed column is
# missing instead of only warning
vars <- c("Petal.Length", "Petal.Width")
select(test, all_of(vars))
(3)一组来自tidyselect的有用函数
# Columns whose names begin with "Petal"
select(test, starts_with("Petal"))
# Columns whose names end with "Width"
select(test, ends_with("Width"))
# Columns whose names contain the literal string "etal"
select(test, contains("etal"))
# Columns whose names match the regular expression ".t."
select(test, matches(".t."))
# All columns
select(test, everything())
# The last column
select(test, last_col())
# The column two positions before the last one
select(test, last_col(offset = 2))
(4)利用everything(),列名可以重排序
# Put Species first; everything() then supplies the remaining columns
select(test, Species, everything())
3.filter()筛选行
# Rows belonging to a single species
filter(test, Species == "setosa")
# Rows where both conditions hold
filter(test, Species == "setosa" & Sepal.Length > 5)
# Rows whose species is any of the listed values
filter(test, Species %in% c("setosa", "versicolor"))
4.arrange(),按某一列对整个表格进行排序
arrange(test, Sepal.Length) # ascending order is the default
arrange(test, desc(Sepal.Length)) # wrap the column in desc() for descending order
# Sort by Sepal.Length ascending, breaking ties by Sepal.Width descending
arrange(test, Sepal.Length, desc(Sepal.Width))
5.summarise():汇总
对数据进行汇总操作,结合group_by使用实用性强
# Mean and standard deviation of Sepal.Length over the whole table
summarise(test, mean(Sepal.Length), sd(Sepal.Length))
计算Sepal.Length的平均值和标准差:
先按照Species分组,计算每组Sepal.Length的平均值和标准差
# Grouping alone only marks the groups; the rows themselves are unchanged
group_by(test, Species)
# Summarising the grouped table yields the mean and sd of Sepal.Length per Species
summarise(group_by(test, Species),mean(Sepal.Length), sd(Sepal.Length))
两个实用技能
1:管道操作 %>% (快捷键:Cmd/Ctrl + Shift + M)
library(dplyr)
# Step-by-step version: each verb stores its result in a new variable
x1 <- filter(iris, Sepal.Width > 3)
x2 <- select(x1, c("Sepal.Length", "Sepal.Width"))
x3 <- arrange(x2, Sepal.Length)
colnames(iris)
# Piped version: the same three verbs chained, with no intermediate variables
iris %>%
  filter(Sepal.Width > 3) %>%
  select(c("Sepal.Length", "Sepal.Width")) %>%
  arrange(Sepal.Length)
2:count统计某列的unique值
# One row per distinct Species value, with the number of rows for each
count(test, Species)
处理关系数据:即将2个表进行连接,注意:不要引入factor
# Keep strings as character vectors rather than factors (this is already the
# default from R 4.0.0 onward; the option only matters on older versions).
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
options(stringsAsFactors = FALSE)
test1 <- data.frame(
  name = c("jimmy", "nicker", "doodle"),
  blood_type = c("A", "B", "O")
)
test1
test2 <- data.frame(
  name = c("doodle", "jimmy", "nicker", "tony"),
  group = c("group1", "group1", "group2", "group2"),
  vision = c(4.2, 4.3, 4.9, 4.5)
)
test2
# test3 names its key column "NAME" instead of "name"
test3 <- data.frame(
  NAME = c("doodle", "jimmy", "lucy", "nicker"),
  weight = c(140, 145, 110, 138)
)
# Base R joins: first on a shared key column, then on differently named keys
merge(test1, test2, by = "name")
merge(test1, test3, by.x = "name", by.y = "NAME")
1.内连接inner_join,取交集
# Key column has the same name in both tables
inner_join(x = test1, y = test2, by = "name")
# Key columns differ: match test1's "name" against test3's "NAME"
inner_join(x = test1, y = test3, by = c("name" = "NAME"))
2.左连接left_join
# Keep every row of test1; columns from test2 are NA where nothing matches
left_join(test1, test2, by = "name")
# Swap the roles: keep every row of test2 instead
left_join(test2, test1, by = "name")
3.全连接full_join
# Keep the rows of both tables, filling unmatched columns with NA
full_join(test1, test2, by = "name")
4.半连接:返回能够与y表匹配的x表所有记录semi_join
# Rows of test1 that have a match in test2; no columns from test2 are added
semi_join(x = test1, y = test2, by = "name")
5.反连接:返回无法与y表匹配的x表的所有记录anti_join
# Rows of test2 that have no match in test1
anti_join(x = test2, y = test1, by = "name")
6.数据的简单合并
相当于base包里的rbind()函数和cbind()函数;注意,bind_rows()函数按列名匹配两个表格的列(某一方缺少的列会用NA填充,并不要求列数相同),而bind_cols()函数则需要两个数据框有相同的行数
# Two tables with the same columns (x, y) but different numbers of rows
test1 <- data.frame(
  x = c(1, 2, 3, 4),
  y = c(10, 20, 30, 40)
)
test1
test2 <- data.frame(
  x = c(5, 6),
  y = c(50, 60)
)
test2
# A table with the same number of rows as test1 but a different column
test3 <- data.frame(
  z = c(100, 200, 300, 400)
)
test3
# Stack test2 underneath test1
bind_rows(test1, test2)
# Place test3 next to test1
bind_cols(test1, test3)
网友评论