题目链接:https://mp.weixin.qq.com/s/4zr8mnqd1zDGHuBXAY8USA
参考优秀学徒的笔记:https://www.jianshu.com/p/fab27c63af94
基础包
读取数据
options(stringsAsFactors = F)
library(airway)
data(airway)
RNAseq_expr=assay(airway)
colnames(RNAseq_expr)
RNAseq_expr[1:4,1:4]
RNAseq_gl=colData(airway)[,3]
table(RNAseq_gl)
Q1: 对RNAseq_expr的每一列绘制boxplot图
boxplot(RNAseq_expr)
- 可以看到有较多的异常值
Q2: 对RNAseq_expr的每一列绘制density图
plot(density(RNAseq_expr)) #直接画发现0的比例太高了,应该先过滤一下
filtered=RNAseq_expr[apply(RNAseq_expr,1,function(x) sum(x>0)>1),]
dim(RNAseq_expr)
dim(filtered)
plot(density(filtered)) #但并不是这个问题,先往下做
Q3: 对RNAseq_expr的每一列绘制条形图
barplot(RNAseq_expr)
Q4: 对RNAseq_expr的每一列取log2后重新绘制boxplot图,density图和条形图
normalized=log2(filtered+1)
#此处使用借鉴来的更好的代码,顺便把Q5的颜色也加了
#箱线图
boxplot(normalized,main = 'Boxplot of RNAseq-expr', xlab = 'samples',ylab = 'expression',col=RNAseq_gl)
#8列一起画密度图
plot(density(normalized)) #看来之前密度图不太对是异常值过多导致的
#分开画密度图
opar <- par(no.readonly=T)
par(mfrow=c(3,3))
for (i in c(1:8)){
plot(density(normalized[,i]),col=as.integer(RNAseq_gl)[i],main = "Density")
}
par(opar)
#条形图
col = c("lightblue","lightgreen")
barplot(normalized, main = 'Barplot of RNAseq-expr', xlab = 'samples',ylab = 'expression',col = RNAseq_gl) #奇怪的是并没有加上颜色
Q5: 对Q4的3个图里面添加 trt 和 untrt 组颜色区分开来
略
Q6: 对RNAseq_expr的前两列画散点图并且计算线性回归方程
plot(RNAseq_expr[,1:2]) #原始数据
plot(normalized[,1:2]) #过滤、归一化后
Q7: 对RNAseq_expr的所有列两两之间计算相关系数,并且热图可视化。
M=cor(RNAseq_expr)
pheatmap::pheatmap(M)
Q8: 取RNAseq_expr第一行表达量绘制折线图
plot(RNAseq_expr[1,],type="l",xlab = "gene",ylab="expression",col="red")
Q9: 取RNAseq_expr表达量最高的10个基因的行绘制多行折线图
top10=RNAseq_expr[rownames(RNAseq_expr) %in% names(tail(sort(rowSums(RNAseq_expr)),10)),]
plot(top10[1,],type="b",xlab = "gene",ylab = "expression",pch = 1,
ylim=c(50000,550000)) #不调整y轴范围的话画出来的图会超过默认范围
for (i in c(2:10)){
lines(top10[i,],type="b",xlab = "gene",ylab = "expression",pch = i)
}
Q10: 运行代码
https://github.com/jmzeng1314/5years/blob/master/learn-R/tasks/2-chunjuan-600.R
ggplot
准备数据
- 由于ggplot的“映射”理念,我们需要先把原有的扁数据变成长数据,即每列为一个变量的数据。我们采用tidyr包实现这一需求。
if(!require(tidyr))install.packages("tidyr")
library(tidyr)
tmp=as.data.frame(RNAseq_expr)
tmp$geneid=rownames(tmp)
exp_gather <- gather(tmp,"sample","exp",-geneid) #这一行是转换的关键
gg=exp_gather
- 其他学徒的优秀代码
suppressMessages(library(reshape2)) #用的是reshape2这个包
box_e <- melt(filtered) #似乎是比gather函数简洁
colnames(box_e) <- c("gene","sample","expression")
#这里开始是添加分组信息,方便画图时填色
tmp=data.frame(group_list=RNAseq_gl)
rownames(tmp)=colnames(RNAseq_expr)
tmp$sample <- rownames(tmp)
d = merge(box_e,tmp,by="sample")
Q1
library(ggplot2)
ggplot(gg,aes(sample,exp,fill=group_list)) + geom_boxplot()
- 没做log之前图都是没法看的,这里就不放了
Q2
#先效仿优秀代码,增加一列分组信息
tmp=data.frame(group_list=RNAseq_gl)
rownames(tmp)=colnames(RNAseq_expr)
tmp$sample <- rownames(tmp)
gg = merge(gg,tmp,by="sample")
p2 <- ggplot(gg,aes(x=exp,colour=sample)) + geom_density()
Q3
p3 <- ggplot(gg,aes(sample,exp,fill=group_list)) + geom_bar(stat="identity")
Q4
#再处理一遍数据
if(!require(tidyr))install.packages("tidyr")
library(tidyr)
tmp=as.data.frame(normalized)
tmp$geneid=rownames(tmp)
exp_gather <- gather(tmp,"sample","exp",-geneid)
gg=exp_gather
tmp=data.frame(group_list=RNAseq_gl)
rownames(tmp)=colnames(RNAseq_expr)
tmp$sample <- rownames(tmp)
gg = merge(gg,tmp,by="sample")
#和之前一样的步骤画图
p1 <- ggplot(data = gg,aes(y=exp,x=sample,fill=group_list)) + geom_boxplot()
p2 <- ggplot(gg,aes(x=exp,colour=sample)) + geom_density()
p3 <- ggplot(gg,aes(sample,exp,fill=group_list)) + geom_bar(stat="identity")
Q5
略
Q6
p6 <- ggplot(as.data.frame(RNAseq_expr[,1:2]),aes(x=SRR1039508,y=SRR1039509)) +
geom_point() +
geom_smooth(method= "lm")
Q7
M=cor(RNAseq_expr)
melt_M <- melt(M)
p7 <- ggplot(data = melt_Q7, aes(x=Var1, y=Var2, fill=value)) + geom_tile()
Q8
#p8 <- ggplot(tmp,aes(x=sample, y=exp)) + geom_line() + geom_point()
#会报错geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
p8 <- ggplot(tmp,aes(x=sample, y=exp, group=1)) + geom_line() + geom_point()
Q9
top10=RNAseq_expr[rownames(RNAseq_expr) %in% names(tail(sort(rowSums(RNAseq_expr)),10)),]
tmp=melt(top10)
colnames(tmp)=c("geneid","sample","exp")
p9 <- ggplot(tmp,aes(x=sample,y=exp,colour=geneid,group=geneid)) + geom_line() + geom_point()
p9
- 其他学徒的优秀代码(完整)
# 箱型图
A <- ggplot(data = d,aes(y=expression,x=sample,fill=group_list)) + geom_boxplot() +
theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1,size=6 ))
# 密度图
B1 <- ggplot(box_e,aes(x=expression,colour=sample)) + geom_density() +
theme(legend.text = element_text(size = 8, face = "bold"),legend.key.size=unit(0.3,'cm'))
B2 <- ggplot(data= d,aes(expression,col=group_list)) + geom_density() +
theme(legend.text = element_text(size = 8, face = "bold"),legend.key.size=unit(0.3,'cm'))
# 条形图
C <- ggplot(data = d,aes(x=sample,y=expression,fill= group_list)) + geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1,size=6 ))
# factor() 函数将连续型变量转化为离散型变量
# 散点图
D <- ggplot(as.data.frame(RNAseq_expr[,1:2]),aes(x=SRR1039508,y=SRR1039509)) +
geom_point() +
geom_smooth(method = "lm")
# 热图
melt_Q7 <- melt(Q7)
E <- ggplot(data = melt_Q7, aes(x=Var1, y=Var2, fill=value)) + geom_tile()+
theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1,size= 4),
axis.text.y = element_text(size= 4)) + labs(x="",y="")
# 折线图
F1 <- ggplot(Q8_1,aes(x=sample, y=expression, group =1)) + geom_line() + geom_point() +
theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1,size=8 ))
Q8_2_m = melt(Q8_2)
colnames(Q8_2_m) = c("gene","sample","expression")
F2 <- ggplot(Q8_2_m,aes(x=sample,y=expression,colour=gene,group=gene)) + geom_line() + geom_point() +
theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1,size=8 ),
legend.text = element_text(size = 8, face = "bold"),legend.key.size=unit(0.3,'cm'))
multiplot(A,B1,B2, layout = matrix(c(1,1,2,3), nrow=2, byrow=TRUE))
multiplot(C,D,E, layout = matrix(c(1,1,2,3), nrow=2, byrow=TRUE))
multiplot(F1,F2)
- 最值得学习的地方
- 利用theme函数把图变得更好看了,尤其是样本名太长进而导致叠在一起的问题被解决了;
- 拼图功能以前没见过,感觉很不错。
生物信息学作图
Q2
tmp=apply(filtered,1,mad)
str(tmp)
z=t(scale(t(normalized))) #这里我认为标准化必须对整体做,不能取top100出来后再做。
top=z[rownames(z) %in% names(tail(sort(tmp),100)),]
pheatmap::pheatmap(top)
Q3
# PCA图应用z-score矩阵绘制
if(T){
library(ggfortify)
dat=z
df=as.data.frame(t(dat))
group_list=RNAseq_gl
df$group=group_list
autoplot(prcomp(df[,1:(ncol(df)-1)]), data = df, colour = 'group') #prcomp is the function of PCA
library("FactoMineR")
library("factoextra") #PCA diagram requires these two packages
df=as.data.frame(t(dat))
dat.pca <- PCA(df, graph = FALSE)
fviz_pca_ind(dat.pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = group_list, # color by groups
# palette = c("#00AFBB", "#E7B800"),
addEllipses = TRUE, # Concentration ellipses
legend.title = "Groups")
#if there are only two replications, ellipse cannot be calculated
}
Q4
#差异分析
suppressMessages(library(DESeq2))
colData <- data.frame(row.names=colnames(exprSet),group_list=group_list)
dds <- DESeqDataSetFromMatrix(countData = RNAseq_expr,
colData = colData,
design = ~ group_list)
dds <- DESeq(dds)
res <- results(dds,
contrast=c("group_list","trt","untrt"))
resOrdered <- res[order(res$padj),]
DEG=as.data.frame(resOrdered)
nrDEG=na.omit(DEG)
DEseq_DEG=nrDEG
nrDEG=DEseq_DEG[,c(2,6)]
colnames(nrDEG)=c('log2FoldChange','pvalue')
#火山图
if(T){
pretty_volcano <- function(resObject,title,alpha=0.05,lfc=2) {
# alpha is the threshold of p-value, default 0.05
# lfc is the threshold of -log10FoldChange, default 2
resObject$sig <- -log10(resObject$padj)
resObject[is.infinite(resObject$sig),"sig"] <- 350
# Select genes with a defined p-value (DESeq2 assigns NA to some genes)
genes.to.plot <- !is.na(resObject$padj)
# sum(genes.to.plot)
range(resObject[genes.to.plot, "log2FoldChange"])
## Volcano plot of adjusted p-values
cols <- densCols(resObject$log2FoldChange, resObject$sig)
cols[resObject$padj ==0] <- "purple"
resObject$pch <- 19
resObject$pch[resObject$padj ==0] <- 6
plot(resObject$log2FoldChange,
resObject$sig,
col=cols, panel.first=grid(),
main=title,
xlab="Effect size: log2(fold-change)",
ylab="-log10(adjusted p-value)",
pch=resObject$pch, cex=0.4)
abline(v=0)
abline(v=c(-lfc,lfc), col="brown")
abline(h=-log10(alpha), col="brown")
## Plot the names of a reasonable number of genes, by selecting those begin not only significant but also having a strong effect size
gn.selected <- abs(resObject$log2FoldChange) > lfc & resObject$padj < alpha
# text(resObject$log2FoldChange[gn.selected],
# -log10(resObject$padj)[gn.selected],
# lab=rownames(resObject)[gn.selected ], cex=0.6)
# # mtext(paste("padj =", alpha), side = 4, at = -log10(alpha), cex = 0.8, line = 0.5, las = 1)
# mtext(c(paste("-", lfc, "fold"), paste("+", lfc, "fold")), side = 3, at = c(-lfc, lfc), cex = 0.8, line = 0.5)
}
#The following two parameters can be adjusted according to actual needs
alpha=1e-10
lfc=2
pretty_volcano(DEseq_DEG, 'Treat vs Control', alpha = alpha, lfc = lfc)
}
Q5
plotMA(res,ylim=c(-5,5))
# lfcShrink 收缩log2 fold change
resLFC <- lfcShrink(dds,coef = 2,res=res)
plotMA(resLFC, ylim=c(-5,5))
Q6
choose_gene = rownames(nrDEG)[9]
choose_gene_d <- cbind(as.data.frame(RNAseq_expr[choose_gene,]),
as.data.frame(tmp$group_list))
choose_gene_d$sample = rownames(choose_gene_d)
colnames(choose_gene_d) = c("e","group","sample")
ggplot(data = choose_gene_d,aes(y=e,x=group)) + geom_boxplot() +
stat_compare_means(method = "t.test")
Q7
suppressMessages(library(org.Hs.eg.db))
CHR <- toTable(org.Hs.egCHR)
ensembl <- toTable(org.Hs.egENSEMBL)
table(rownames(RNAseq_expr) %in% ensembl$ensembl_id)
### FALSE TRUE
### 38372 25730
table(rownames(filtered) %in% ensembl$ensembl_id)
### FALSE TRUE
### 10583 18294
# 可以看到org.Hs.eg.db这个包的数据量还是比较有限的,超过一半的基因找不到。
ids=merge(CHR,ensembl,by="gene_id")
gene=as.data.frame(rownames(RNAseq_expr))
colnames(gene)="ensembl_id"
ids2=merge(ids,gene,by="ensembl_id")
ggplot(ids2,aes(chromosome)) +
geom_bar(fill="lightblue")
Q8
tmp=nrDEG[nrDEG$pvalue<0.05,]
DEG_gene=data.frame(rownames(tmp))
colnames(DEG_gene) <- c("ensembl_id")
ids3=merge(ids,DEG_gene,by="ensembl_id")
# 叠加
ids2$DEG <- as.factor(ifelse(ids2$ensembl_id %in% ids3$ensembl_id,'DEG','NOT'))
ggplot(ids2,aes(x=chromosome,y=..count..,fill=DEG)) + geom_bar(stat ='count')
Q9
- 基因名是CUL5。
rm(list = ls())
options(stringsAsFactors = F)
brca=read.csv("BRCA_8065_50_50.csv",header = T)
library(survival)
library(survminer)
colnames(brca)
colnames(brca)=c("patient","OS.time","OS","expression","group")
table(brca$OS)
brca$OS=ifelse(brca$OS=="Dead",1,0) #survfit函数要求生存状态是逻辑值,且存活对应0,死亡对应1.
sfit <- survfit(Surv(OS.time, OS)~group, data=brca)
ggsurvplot(sfit,palette = c("red", "black"),
risk.table =TRUE,pval =TRUE,
xlab ="Time in days",
ggtheme =theme_light())
Q10
tmp <- read.csv(file = 'denseDataOnlyDownload.tsv',sep="\t",header = T)
table(tmp$PAM50_mRNA_nature2012)
# 除了我们要的5种分型之外还有一些病人的分组是未知的,需要去除。
cul5 <- read.csv(file = 'denseDataOnlyDownload.tsv',sep="\t",header = T,na.strings="")
cul5<- cul5[complete.cases(cul5),]
library(RColorBrewer)
ggplot(data = cul5,aes(y=CUL5,x=PAM50_mRNA_nature2012,color=PAM50_mRNA_nature2012)) +
geom_boxplot() +
theme_light() +
stat_boxplot(geom = "errorbar",width=0.2) +
theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1,size=10))
网友评论