R作业初级

作者: 熊逸Byron | 来源:发表于2019-06-29 08:50 被阅读0次

R作业初级
R语言初级作业
R语言初级作业
R语言作业—初级
R语言作业·初级
2019-06-15 R语言作业（初级）
R 练习作业（初级）
R语言作业（初级·上）
R语言for循环练习
R语言初级题作业笔记

初级题目

正式开始我们的旅程

library(tidyverse)
library(ggpubr)

软件安装以及R包安装
参考：http://www.bio-info-trainee.com/3727.html

# # 先注释掉，避免在Rmarkdown中运行
# rm(list = ls()) 
# options()$repos 
# options()$BioC_mirror
# options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
# options("repos" = c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
# options()$repos 
# options()$BioC_mirror
# 
# # https://bioconductor.org/packages/release/bioc/html/GEOquery.html
# if (!requireNamespace("BiocManager", quietly = TRUE))
#  install.packages("BiocManager")
# BiocManager::install("KEGG.db",ask = F,update = F)
# BiocManager::install(c("GSEABase","GSVA","clusterProfiler" ),ask = F,update = F)
# BiocManager::install(c("GEOquery","limma","impute" ),ask = F,update = F)
# BiocManager::install(c("genefu","org.Hs.eg.db","hgu133plus2.db" ),ask = F,update = F)
# 
# # 下面代码被我注释了，意思是这些代码不需要运行，因为它过时了，很多旧教程就忽略
# # source("https://bioconductor.org/biocLite.R") 
# # library('BiocInstaller') 
# # options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/") 
# # BiocInstaller::biocLite("GEOquery")
# # BiocInstaller::biocLite(c("limma"))
# # BiocInstaller::biocLite(c("impute"))
# 
# options()$repos
# install.packages('WGCNA')
# install.packages(c("FactoMineR", "factoextra"))
# install.packages(c("ggplot2", "pheatmap","ggpubr"))
# library("FactoMineR")
# library("factoextra")
# 
# library(GSEABase)
# library(GSVA)
# library(clusterProfiler)
# library(genefu)
# library(ggplot2)
# library(ggpubr)
# library(hgu133plus2.db)
# library(limma)
# library(org.Hs.eg.db)
# library(pheatmap)

getwd()

3.新建6个向量，基于不同的原子类型。（重点是字符串，数值，逻辑值）

a1 <- 1:10
a1
class(a1)
a2 <- c('hello','the','world')
class(a2)
a3 <- c(T,T,T,F,F,T,T,F)
a3
class(a3)
a4 <- 607L
a4
class(a4)
a5 <- seq(from = 0, to = 20, by =2)
a5
class(a5)
a6 <- c(13.14,14)
a6
class(a6)

4.新建一些数据结构，比如矩阵，数组，数据框，列表等重点是数据框，矩阵）

c <- c(1,2,3)
list <- list(c(1,2,3),'3,14')
M = matrix( c('a','a','b','c','b','a'), nrow = 2, ncol = 3, byrow = TRUE)
a <- array(c('a','b'),dim = c(3,3,2))
f <- factor(c)
df <- data.frame(gene=paste0("gene",1:15),
                 s1=rnorm(n=15),s2=rnorm(n=15),s3=rnorm(n=15),s4=rnorm(n=15),s5=rnorm(n=15))
# 取df的第1，3行，取第4，6列
df[c(1,3),]
df[,c(4,6)]

5.使用data函数来加载R内置数据集 rivers，其他数据集：

data("rivers")
#Lengths of Major North American Rivers

head(rivers)
tail(rivers)

# 数据集的长
length(rivers)
# structure 显示对象内部结构
str(rivers) 
# 获取描述性统计量（最小值/最大值/四分位数/数值型变量/因子向量/逻辑型向量）
summary(rivers) 
rm(list = ls())

下载 https://www.ncbi.nlm.nih.gov/sra?term=SRP133642 里面的 RunInfo Table 文件读入到R里面，了解这个数据框，多少列，每一列都是什么属性的元素
打开链接https://www.ncbi.nlm.nih.gov/sra?term=SRP133642
点击Send results to Run selector链接
点击RunInfo Table按钮即可下载RunInfo Table文件

rm(list = ls())
options(stringsAsFactors = F)
Table <- read.table(file = "SraRunTable.txt",header = T,sep = '\t')
class(Table)
dim(Table) # 查看data.frame维度
ncol(Table) #查看data.frame列数
str(Table)

下载 https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE111229 里面的样本信息sample.csv读入到R里面，了解这个数据框，多少列，每一列都是什么属性的元素
7.1 打开GEO主页https://www.ncbi.nlm.nih.gov/geo/，点击Samples链接
7.2 在搜索栏中输入GSE111229，点击Search按钮，然后点击Export按钮
7.3 在弹出的对话框点击Export按钮，得到sample.csv

# 读取样本信息
sample <-read.csv("sample.csv")
colnames(sample)
dim(Table)
## [1] 768  31
dim(sample)
## [1] 768  12
d = merge(Table,sample,by.x = "Sample_Name",by.y = "Accession")
# merge() 函数 此时Table的Sample_Name列和sample的Accession列内容相同，合并这一列
dim(d)
library(tidyverse)
d = sample %>% 
  dplyr::rename("Sample_Name" = "Accession") %>%
  left_join(Table,by = "Sample_Name")
dim(d)

对前面读取的 RunInfo Table 文件在R里面探索其MBases列，包括箱线图(boxplot)和五分位数(fivenum)，还有频数图(hist)，以及密度图(density)

# Mbases样本的碱基数
# 举例：
boxplot(Table$MBases, main = "boxplot of MBases")
plot(fivenum(Table$MBases), main = "fivenum of MBases")
hist(Table$MBases, main = "hist of MBases")
plot(density(Table$MBases,na.rm=T), main = "density of MBases")

把前面读取的样本信息表格的样本名字根据下划线分割看第3列元素的统计情况。第三列代表该样本所在的plate

title = sample$Title
plate = unlist(lapply(title,function(x){stringr::str_split(x,"_")[[1]][3]}))
table(plate)

根据plate把关联到的 RunInfo Table 信息的MBases列分组检验是否有统计学显著的差异

# plate 指384孔PCR板，编号分别是48号和49号

t.test(Table$MBases~plate)

分组绘制箱线图(boxplot)，频数图(hist)，以及密度图(density)

boxplot(d$MBases~plate)
typeof(plate)
e = d[,c("MBases","Title")]
e$plate = plate
hist(e$MBases,freq = F, breaks = "sturges")
plot(density(e$MBases,na.rm=T))
# 比较简陋，可以尝试用ggplot2和ggpubr画图包

使用ggplot2把上面的图进行重新绘制

suppressMessages(library(ggplot2))
e$plate = plate
e$num=c(1:768)
colnames(e)
# 箱线图
ggplot(e,aes(x=plate,y=MBases)) + geom_boxplot()   
# 频数图
ggplot(e,aes(x=MBases)) + geom_histogram(fill="lightblue",colour="grey") + facet_grid(plate ~ .) # 频数图
ggplot(e,aes(x=MBases,fill=plate))+geom_histogram()
# 密度图
ggplot(e,aes(y=MBases,x=num)) + geom_point() + stat_density2d(aes(alpha=..density..),geom = "raster",contour = F)+ facet_grid(plate ~ .)  
ggplot(e,aes(x=MBases,fill=plate))+geom_density()

使用ggpubr把上面的图进行重新绘制

suppressMessages(library(ggpubr))
ggboxplot(e, x="plate", y="MBases", color = "plate", palette = "aaas",add = "jitter") + stat_compare_means(method = "t.test")
gghistogram(e, x="MBases", fill = "plate",palette = c("#f4424e", "#41a6f4"))
ggdensity(e, x="MBases", fill = "plate", color = "plate", add = "mean",palette = c("#f4424e", "#41a6f4"))

随机取384个MBases信息，跟前面的两个plate的信息组合成新的数据框，第一列是分组，第二列是MBases，总共是384*3行数据

# sample() 函数 随机抽样
data <- e[sample(nrow(e),384),][,c(3,1,2)]
str(data)

感谢

https://www.jianshu.com/p/c07e67e2c757
https://www.jianshu.com/p/2e5a5192f219
欢迎关注生信菜鸟团、生信技能树！！！

网友评论

本文标题：R作业初级

本文链接：https://www.haomeiwen.com/subject/cnsucctx.html

延伸阅读

深度阅读

您也可以注册成为美文阅读网的作者，发表您的原创作品、分享您的心情！

R作业初级

初级题目

正式开始我们的旅程

感谢

相关文章

R作业初级

R语言初级作业

R语言初级作业

R语言作业—初级

R语言作业·初级

2019-06-15 R语言作业（初级）

R 练习作业（初级）

R语言作业（初级·上）

R语言for循环练习

R语言初级题作业笔记

网友评论

延伸阅读

深度阅读

栏目导航

热点阅读