第一步_安装和加载R包

01_设置镜像

# options() sets global options for the current R session.
# Here it points package downloads at mirrors: the Tsinghua (TUNA) mirror for
# CRAN and the USTC mirror for Bioconductor.
options(
  repos = c(CRAN = "https://mirrors.tuna.tsinghua.edu.cn/CRAN/"),
  BioC_mirror = "https://mirrors.ustc.edu.cn/bioc/"
)

02_安装R包

安装R包所用的函数取决于包的来源（CRAN 或 Bioconductor）：

# CRAN packages are installed with install.packages("pkg").
# Calling it without a package name fails when R is not interactive, so name
# the package explicitly and only install it when it is not available yet.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}

# Bioconductor packages are installed with BiocManager::install("pkg").
# BiocManager itself is a CRAN package and has to be present first.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
# Example (not run): BiocManager::install("limma")

03_加载R包

# library(pkg) attaches a package and stops with an error if it is missing,
# which is the behaviour wanted at the top of a script.
# Every example below uses dplyr verbs, so dplyr has to be attached here.
library(dplyr)
# require(pkg) also attaches a package, but returns FALSE (with a warning)
# instead of stopping when the package is missing; reserve it for conditional
# checks such as `if (!require(pkg)) ...`.

第二步_dplyr包的五个基础函数

# Small demo table: the first two rows of each of the three iris species.
test <- iris[c(1:2, 51:52, 101:102), ]

# mutate(): add a column computed from existing ones.
mutate(
  .data = test,
  new = Sepal.Length * Sepal.Width # new column `new`
)

# select(): keep a subset of columns.
select(test, 1) # first column, by position
select(test, c(1, 5)) # columns 1 and 5, by position
select(test, Sepal.Length) # a single column, by bare name

select(test, Petal.Length, Petal.Width) # two columns, by bare name

# Column names stored in a character vector must be wrapped in a selection
# helper. all_of() is strict (every name must exist); any_of() silently skips
# names that are absent. Both replace the superseded one_of().
vars <- c("Petal.Length", "Petal.Width")
select(test, all_of(vars))

# filter(): keep the rows that satisfy the given conditions.
filter(.data = test, Species == "setosa") # setosa rows only
# Comma-separated conditions are combined with `&`:
# setosa rows whose Sepal.Length is greater than 5.
filter(.data = test, Species == "setosa", Sepal.Length > 5)
# Rows whose Species is one of the listed values.
filter(.data = test, Species %in% c("setosa", "versicolor"))

# arrange(): reorder rows by the values of one or more columns.
arrange(.data = test, Sepal.Length) # ascending order is the default
arrange(.data = test, desc(Sepal.Length)) # desc() reverses it: largest first

# summarise(): collapse the table into summary statistics.
summarise(
  test,
  mean(Sepal.Length), # mean of Sepal.Length
  sd(Sepal.Length) # standard deviation of Sepal.Length
)

# group_by() splits the rows by Species (setosa, versicolor, virginica) ...
group_by(test, Species)
# ... so that summarise() returns one mean/sd row per species.
summarise(
  group_by(test, Species),
  mean(Sepal.Length),
  sd(Sepal.Length)
)

dplyr两个实用技能

## Pipe operator %>%: passes the left-hand result as the first argument of the
## next call. It is available once dplyr (or another package exporting it) is
## attached.
test %>%
  group_by(Species) %>%
  summarise(mean(Sepal.Length), sd(Sepal.Length))

## count(): number of rows for each unique value of a column.
test %>% count(Species)

dplyr处理数据关系

将两个表按共同的列（键）进行连接。注意：构建数据框时不要把字符列转换成因子（stringsAsFactors = FALSE）。

# Build two small data frames that share the key column `x`.
# stringsAsFactors is spelled FALSE rather than F: F is an ordinary variable
# that can be reassigned, FALSE is a reserved word. Keeping the columns as
# character (not factor) avoids factor handling in the joins below.
test1 <- data.frame(
  x = c("b", "e", "f", "x"),
  z = c("A", "B", "C", "D"),
  stringsAsFactors = FALSE
)

test2 <- data.frame(
  x = c("a", "b", "c", "d", "e", "f"),
  y = c(1, 2, 3, 4, 5, 6),
  stringsAsFactors = FALSE
)

# inner_join(): keep only the rows whose `x` value occurs in both tables.
inner_join(x = test1, y = test2, by = "x")

# left_join(): keep every row of the first table; columns coming from the
# second table are NA where no match on `x` exists.
left_join(x = test1, y = test2, by = "x") # test1 is the reference table
left_join(x = test2, y = test1, by = "x") # test2 is the reference table

# full_join(): keep every `x` value found in either table.
full_join(x = test1, y = test2, by = "x")

# semi_join(): rows of test1 whose `x` has a match in test2
# (only test1's columns are returned).
semi_join(x = test1, y = test2, by = "x")

# anti_join(): rows of test1 whose `x` has no match in test2.
anti_join(test1, test2, by = "x")

# Plain stacking, the dplyr counterparts of rbind() and cbind().
# bind_rows() matches columns by name (both tables here have x and y);
# bind_cols() requires the tables to have the same number of rows.
test1 <- data.frame(
  x = c(1, 2, 3, 4),
  y = c(10, 20, 30, 40)
) # 4 rows x 2 columns
test1
test2 <- data.frame(
  x = c(5, 6),
  y = c(50, 60)
) # 2 rows x 2 columns
test2
test3 <- data.frame(
  z = c(100, 200, 300, 400)
) # 4 rows x 1 column
test3
bind_rows(test1, test2) # 6 rows x 2 columns
bind_cols(test1, test3) # 4 rows x 3 columns