中级题链接:http://www.bio-info-trainee.com/3750.html
文末有彩蛋!
第一题
rm(list = ls())
options(stringsAsFactors = F)
library(org.Hs.eg.db)
g2s=toTable(org.Hs.egSYMBOL)
g2e=toTable(org.Hs.egENSEMBL) #获得gene id分别和symbol/ensemble对应的信息
a = read.table("e1.txt",header = F) #读入ensemble id
colnames(a) = c("ensembl_id")
a$ensembl_id = apply(a,1,function(x) unlist(strsplit(as.character(x),"[.]"))[1]) #将.后的无用部分去除,注意.需要用[]或//转译
tmp = merge(a,g2e,by='ensembl_id')
tmp = merge(tmp,g2s,by='gene_id') #利用merge将三种信息整合起来
第二题
rm(list = ls())
options(stringsAsFactors = F)
library(hgu133plus2.db)
ids=toTable(hgu133plus2SYMBOL) #依旧是常规的toTable
head(ids)
a=read.table("e2.txt")
colnames(a) = c("probe_id")
tmp=merge(a,ids,by="probe_id")
第三题
rm(list = ls())
options(stringsAsFactors = F)
suppressPackageStartupMessages(library(CLL))
data(sCLLex)
sCLLex
exprSet=exprs(sCLLex) #exprs获取表达矩阵
pd=pData(sCLLex) #获取分组信息
suppressPackageStartupMessages(library(hgu95av2.db))
ids=toTable(hgu95av2SYMBOL)
pro = ids[which(ids$symbol=='TP53'),][,1]
boxplot(exprSet[pro[1],] ~ pd$Disease)
boxplot(exprSet[pro[2],] ~ pd$Disease)
boxplot(exprSet[pro[3],] ~ pd$Disease) #取出TP53对应的探针作箱线图
suppressPackageStartupMessages(library(ggpubr))
sigprob = exprSet[pro[1],]
pd$expression = sigprob
ggboxplot(pd, x = "Disease", y = "expression",
color = "Disease", palette = "npg",
add = "jitter", shape = "Disease")
sigprob = exprSet[pro[2],]
pd$expression = sigprob
ggboxplot(pd, x = "Disease", y = "expression",
color = "Disease", palette = "npg",
add = "jitter", shape = "Disease")
sigprob = exprSet[pro[3],]
pd$expression = sigprob
ggboxplot(pd, x = "Disease", y = "expression",
color = "Disease", palette = "npg",
add = "jitter", shape = "Disease")
#palette分为三种:
#1.grey,Red等单纯的渐变色
#2.palette =c("#00AFBB", "#FC4E07")等直接用色号
#3.npg,aaas等期刊配色
两种图各取一张为例。


第四题
rm(list = ls())
options(stringsAsFactors = F)
a=read.table('e4-plot.txt',sep = '\t',fill = T,header = T) #读取数据前先打开文件看一下分隔符是什么
colnames(a)=c('id','subtype','expression','mut')
dat=a
suppressPackageStartupMessages(library(ggstatsplot))
ggbetweenstats(data = dat, x = subtype, y = expression) #按subtype分组作图
library(ggplot2)
ggsave('plot-again-BRCA1-TCGA-BRCA-cbioportal.png')

第五题
rm(list = ls())
options(stringsAsFactors = F)
a=read.table('BRCA_7157_50_50.csv',sep = ',',fill = T,header = T)
dat=a
library(ggplot2)
library(survival)
library(survminer)
table(dat$Status)
dat$Status=ifelse(dat$Status=='Dead',1,0)
sfit <- survfit(Surv(Days, Status)~Group, data=dat)
sfit
summary(sfit)
ggsurvplot(sfit, conf.int=F, pval=TRUE) #生存曲线的常规操作
ggsave('survival_TP53_in_BRCA_TCGA.png')
table(tmp$subtype)
#之后是对于每一个type做同样的事情
type='BRCA_LumB'
x=tmp[tmp$subtype==type,]
library(ggplot2)
library(survival)
library(survminer)
#table(x$Status)
x$Status=ifelse(x$Status=='Dead',1,0)
sfit <- survfit(Surv(Days, Status)~Group, data=x)
sfit
summary(sfit)
ggsurvplot(sfit, conf.int=F, pval=TRUE)
type='BRCA_Normal'
x=tmp[tmp$subtype==type,]
library(ggplot2)
library(survival)
library(survminer)
#table(x$Status)
x$Status=ifelse(x$Status=='Dead',1,0)
sfit <- survfit(Surv(Days, Status)~Group, data=x)
sfit
summary(sfit)
ggsurvplot(sfit, conf.int=F, pval=TRUE)
type='BRCA_Basal'
x=tmp[tmp$subtype==type,]
library(ggplot2)
library(survival)
library(survminer)
#table(x$Status)
x$Status=ifelse(x$Status=='Dead',1,0)
sfit <- survfit(Surv(Days, Status)~Group, data=x)
sfit
summary(sfit)
ggsurvplot(sfit, conf.int=F, pval=TRUE)
type='BRCA_Her2'
x=tmp[tmp$subtype==type,]
library(ggplot2)
library(survival)
library(survminer)
#table(x$Status)
x$Status=ifelse(x$Status=='Dead',1,0)
sfit <- survfit(Surv(Days, Status)~Group, data=x)
sfit
summary(sfit)
ggsurvplot(sfit, conf.int=F, pval=TRUE)
type='BRCA_LumA'
x=tmp[tmp$subtype==type,]
library(ggplot2)
library(survival)
library(survminer)
#table(x$Status)
x$Status=ifelse(x$Status=='Dead',1,0)
sfit <- survfit(Surv(Days, Status)~Group, data=x)
sfit
summary(sfit)
ggsurvplot(sfit, conf.int=F, pval=TRUE)

第六题
rm(list = ls())
options(stringsAsFactors = F)
#以下是从线上直接获取数据的代码
# f='GSE17215_eSet.Rdata'
#
# suppressPackageStartupMessages(library(GEOquery))
# path.package()
# if(!file.exists(f)){
# gset <- getGEO('GSE17215', destdir=".",
# AnnotGPL = F,
# getGPL = F)
# save(gset,file=f)
# }
load("GSE17215_eSet.Rdata")
class(gset)
head(gset)
# exprSet = assay(gset)
# exprSet = exprs(gset) #尝试了这两个方式获取表达矩阵未果,发现gset变量的第一项才能取表达矩阵
class(gset[[1]])
a=gset[[1]]
dat=exprs(a) #成功取出表达矩阵
dim(dat)
suppressPackageStartupMessages(library(hgu133a.db))
path.package() #检查一下包的加载情况
ids=toTable(hgu133aSYMBOL)
head(ids)
dat=dat[ids$probe_id,] #将没有收录在ids中的探针剔除
dat[1:4,1:4]
ids$median=apply(dat,1,median)
ids=ids[order(ids$symbol,ids$median,decreasing = T),]
ids=ids[!duplicated(ids$symbol),]
dat=dat[ids$probe_id,] #精髓操作,得到了每个基因对应探针中整体表达量(中位数)最大的探针
rownames(dat)=ids$symbol #由于基因和探针已经一一对应,行名可以改为基因symbol
dat[1:4,1:4]
dim(dat)
goi = as.vector(read.table('e6.txt',sep = ' ',fill = T)) #读入GOI,但其实直接用下面的方法更方便
#goi = 'ACTR3B ANLN BAG1 BCL2 BIRC5 BLVRA CCNB1 CCNE1 CDC20 CDC6 CDCA1 CDH3 CENPF CEP55 CXXC5 EGFR ERBB2 ESR1 EXO1 FGFR4 FOXA1 FOXC1 GPR160 GRB7 KIF2C KNTC2 KRT14 KRT17 KRT5 MAPT MDM2 MELK MIA MKI67 MLPH MMP11 MYBL2 MYC NAT1 ORC6L PGR PHGDH PTTG1 RRM2 SFRP1 SLC39A6 TMEM45B TYMS UBE2C UBE2T'
table(goi %in% rownames(dat))
goi = goi[goi %in% rownames(dat)] #将表达矩阵中没有的删去
table(rownames(dat) %in% goi)
dat = dat[rownames(dat) %in% goi,] #将表达矩阵中的goi提取出来
dat[1:4,1:4]
pheatmap::pheatmap(dat,scale = 'row')

第七题
rm(list = ls())
options(stringsAsFactors = F)
load('GSE24673_eSet.Rdata')
class(gset)
class(gset[[1]])
a = gset[[1]]
exprSet = exprs(a)
exprSet[1:4,1:4] #常规的获取表达矩阵方式
# nrow(exprSet)
# ncol(exprSet)
dim(exprSet) #用nrow ncol不如直接dim
pdat = pData(a)
str(pdat)
table(pdat$source_name_ch1)
pdat$source_name_ch1 #寻找分组信息
group_list = c('rbc','rbc','rbc',
'rbn','rbn','rbn',
'rbc','rbc','rbc',
'norm','norm')
c = cor(exprSet)
ann_col = data.frame(group = group_list)
rownames(ann_col) = colnames(c) #添加注释信息
pheatmap::pheatmap(c,annotation_col = ann_col)

第八题
options()$repos
options()$BioC_mirror
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
options("repos" = c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("KEGG.db",ask = F,update = F)
第九题
rm(list = ls())
options(stringsAsFactors = F)
load('GSE42872_eSet.Rdata')
a = gset[[1]]
dat = exprs(a)
dim(dat)
dat[1:4,1:4] #常规方法获取表达矩阵
boxplot(dat)
# df = data.frame(probe_id = rownames(dat))
# df$mean = apply(dat,1,function(x) mean(x))
# df$sym = ids[df$probe_id,]
# dat$mea = apply(dat,1,function(x){
# mean(x)})
#
# dat$med = apply(dat,1,function(x) median(x))
# suppressPackageStartupMessages(library(hugene10sttranscriptcluster.db))
# path.package()
# ids = toTable(hugene10sttranscriptclusterSYMBOL)
#一开始以为和第六题一样要过滤表达矩阵,后来发现只要找最高的探针就行了
sort(apply(dat,1,mean),decreasing = T)[1]
sort(apply(dat,1,sd),decreasing = T)[1]
sort(apply(dat,1,mad),decreasing = T)[1]
第十题
rm(list = ls())
options(stringsAsFactors = F)
load('GSE42872_eSet.Rdata') # 载入数据
a=gset[[1]]
dat=exprs(a)
dim(dat)
pd=pData(a)
group_list=unlist(lapply(pd$title,function(x){
strsplit(x,' ')[[1]][4]
}))
exprSet=dat
exprSet[1:4,1:4]
# DEG by limma
suppressMessages(library(limma))
design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
rownames(design)=colnames(exprSet)
design
contrast.matrix<-makeContrasts(paste0(unique(group_list),collapse = "-"),levels = design)
contrast.matrix<-makeContrasts("progres.-stable",levels = design)
contrast.matrix
#构建比较矩阵,声明在progres.组和stable组之间进行差异分析
#后面是差异分析的散步
##step1
fit <- lmFit(exprSet,design)
##step2
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
##eBayes() with trend=TRUE
##step3
tempOutput = topTable(fit2, coef=1, n=Inf)
nrDEG = na.omit(tempOutput)
#write.csv(nrDEG2,"limma_notrend.results.csv",quote = F)
head(nrDEG) #得到差异分析结果,后续可画火山图等
#后面是画火山图的代码
DEG=nrDEG
logFC_cutoff <- with(DEG,mean(abs( logFC)) + 2*sd(abs( logFC)) )
DEG$change = as.factor(ifelse(DEG$P.Value < 0.05 & abs(DEG$logFC) > logFC_cutoff,
ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')
)
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
'\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
'\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',])
)
this_tile
head(DEG)
g = ggplot(data=DEG, aes(x=logFC, y=-log10(P.Value), color=change)) +
geom_point(alpha=0.4, size=1.75) +
theme_set(theme_set(theme_bw(base_size=20)))+
xlab("log2 fold change") + ylab("-log10 p-value") +
ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
scale_colour_manual(values = c('blue','black','red')) ## corresponding to the levels(res$change)
print(g)

网友评论