初级题目
正式开始我们的旅程
library(tidyverse)
library(ggpubr)
- 软件安装以及R包安装
参考:http://www.bio-info-trainee.com/3727.html
# # 先注释掉,避免在Rmarkdown中运行
# rm(list = ls())
# options()$repos
# options()$BioC_mirror
# options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
# options("repos" = c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
# options()$repos
# options()$BioC_mirror
#
# # https://bioconductor.org/packages/release/bioc/html/GEOquery.html
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("KEGG.db",ask = F,update = F)
# BiocManager::install(c("GSEABase","GSVA","clusterProfiler" ),ask = F,update = F)
# BiocManager::install(c("GEOquery","limma","impute" ),ask = F,update = F)
# BiocManager::install(c("genefu","org.Hs.eg.db","hgu133plus2.db" ),ask = F,update = F)
#
# # 下面代码被我注释了,意思是这些代码不需要运行,因为它过时了,很多旧教程就忽略
# # source("https://bioconductor.org/biocLite.R")
# # library('BiocInstaller')
# # options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
# # BiocInstaller::biocLite("GEOquery")
# # BiocInstaller::biocLite(c("limma"))
# # BiocInstaller::biocLite(c("impute"))
#
# options()$repos
# install.packages('WGCNA')
# install.packages(c("FactoMineR", "factoextra"))
# install.packages(c("ggplot2", "pheatmap","ggpubr"))
# library("FactoMineR")
# library("factoextra")
#
# library(GSEABase)
# library(GSVA)
# library(clusterProfiler)
# library(genefu)
# library(ggplot2)
# library(ggpubr)
# library(hgu133plus2.db)
# library(limma)
# library(org.Hs.eg.db)
# library(pheatmap)
- 打开 Rstudio 告诉我它的工作目录
getwd()
3.新建6个向量,基于不同的原子类型。(重点是字符串,数值,逻辑值)
a1 <- 1:10
a1
class(a1)
a2 <- c('hello','the','world')
class(a2)
a3 <- c(T,T,T,F,F,T,T,F)
a3
class(a3)
a4 <- 607L
a4
class(a4)
a5 <- seq(from = 0, to = 20, by =2)
a5
class(a5)
a6 <- c(13.14,14)
a6
class(a6)
4.新建一些数据结构,比如矩阵,数组,数据框,列表等重点是数据框,矩阵)
c <- c(1,2,3)
list <- list(c(1,2,3),'3,14')
M = matrix( c('a','a','b','c','b','a'), nrow = 2, ncol = 3, byrow = TRUE)
a <- array(c('a','b'),dim = c(3,3,2))
f <- factor(c)
df <- data.frame(gene=paste0("gene",1:15),
s1=rnorm(n=15),s2=rnorm(n=15),s3=rnorm(n=15),s4=rnorm(n=15),s5=rnorm(n=15))
# 取df的第1,3行,取第4,6列
df[c(1,3),]
df[,c(4,6)]
5.使用data函数来加载R内置数据集 rivers,其他数据集:
data("rivers")
#Lengths of Major North American Rivers
head(rivers)
tail(rivers)
# 数据集的长
length(rivers)
# structure 显示对象内部结构
str(rivers)
# 获取描述性统计量(最小值/最大值/四分位数/数值型变量/因子向量/逻辑型向量)
summary(rivers)
rm(list = ls())
- 下载 https://www.ncbi.nlm.nih.gov/sra?term=SRP133642 里面的 RunInfo Table 文件读入到R里面,了解这个数据框,多少列,每一列都是什么属性的元素
打开链接https://www.ncbi.nlm.nih.gov/sra?term=SRP133642
点击Send results to Run selector
链接
点击RunInfo Table
按钮即可下载RunInfo Table
文件
rm(list = ls())
options(stringsAsFactors = F)
Table <- read.table(file = "SraRunTable.txt",header = T,sep = '\t')
class(Table)
dim(Table) # 查看data.frame维度
ncol(Table) #查看data.frame列数
str(Table)
- 下载 https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE111229 里面的样本信息sample.csv读入到R里面,了解这个数据框,多少列,每一列都是什么属性的元素
7.1 打开GEO主页https://www.ncbi.nlm.nih.gov/geo/,点击Samples
链接
7.2 在搜索栏中输入GSE111229
,点击Search
按钮,然后点击Export
按钮
7.3 在弹出的对话框点击Export
按钮,得到sample.csv
# 读取样本信息
sample <-read.csv("sample.csv")
colnames(sample)
dim(Table)
## [1] 768 31
dim(sample)
## [1] 768 12
d = merge(Table,sample,by.x = "Sample_Name",by.y = "Accession")
# merge() 函数 此时Table的Sample_Name列和sample的Accession列内容相同,合并这一列
dim(d)
library(tidyverse)
d = sample %>%
dplyr::rename("Sample_Name" = "Accession") %>%
left_join(Table,by = "Sample_Name")
dim(d)
- 对前面读取的 RunInfo Table 文件在R里面探索其MBases列,包括 箱线图(boxplot)和五分位数(fivenum),还有频数图(hist),以及密度图(density)
# Mbases样本的碱基数
# 举例:
boxplot(Table$MBases, main = "boxplot of MBases")
plot(fivenum(Table$MBases), main = "fivenum of MBases")
hist(Table$MBases, main = "hist of MBases")
plot(density(Table$MBases,na.rm=T), main = "density of MBases")
- 把前面读取的样本信息表格的样本名字根据下划线分割看第3列元素的统计情况。第三列代表该样本所在的plate
title = sample$Title
plate = unlist(lapply(title,function(x){stringr::str_split(x,"_")[[1]][3]}))
table(plate)
- 根据plate把关联到的 RunInfo Table 信息的MBases列分组检验是否有统计学显著的差异
# plate 指384孔PCR板,编号分别是48号和49号
t.test(Table$MBases~plate)
- 分组绘制箱线图(boxplot),频数图(hist),以及密度图(density)
boxplot(d$MBases~plate)
typeof(plate)
e = d[,c("MBases","Title")]
e$plate = plate
hist(e$MBases,freq = F, breaks = "sturges")
plot(density(e$MBases,na.rm=T))
# 比较简陋,可以尝试用ggplot2和ggpubr画图包
- 使用ggplot2把上面的图进行重新绘制
suppressMessages(library(ggplot2))
e$plate = plate
e$num=c(1:768)
colnames(e)
# 箱线图
ggplot(e,aes(x=plate,y=MBases)) + geom_boxplot()
# 频数图
ggplot(e,aes(x=MBases)) + geom_histogram(fill="lightblue",colour="grey") + facet_grid(plate ~ .) # 频数图
ggplot(e,aes(x=MBases,fill=plate))+geom_histogram()
# 密度图
ggplot(e,aes(y=MBases,x=num)) + geom_point() + stat_density2d(aes(alpha=..density..),geom = "raster",contour = F)+ facet_grid(plate ~ .)
ggplot(e,aes(x=MBases,fill=plate))+geom_density()
- 使用ggpubr把上面的图进行重新绘制
suppressMessages(library(ggpubr))
ggboxplot(e, x="plate", y="MBases", color = "plate", palette = "aaas",add = "jitter") + stat_compare_means(method = "t.test")
gghistogram(e, x="MBases", fill = "plate",palette = c("#f4424e", "#41a6f4"))
ggdensity(e, x="MBases", fill = "plate", color = "plate", add = "mean",palette = c("#f4424e", "#41a6f4"))
- 随机取384个MBases信息,跟前面的两个plate的信息组合成新的数据框,第一列是分组,第二列是MBases,总共是384*3行数据
# sample() 函数 随机抽样
data <- e[sample(nrow(e),384),][,c(3,1,2)]
str(data)
感谢
https://www.jianshu.com/p/c07e67e2c757
https://www.jianshu.com/p/2e5a5192f219
欢迎关注生信菜鸟团、生信技能树!!!
网友评论