美文网首页
visualization

visualization

作者: nvzhang | 来源:发表于2020-08-08 12:16 被阅读0次

    题目:http://www.bio-info-trainee.com/4387.html
    参考: https://www.jianshu.com/p/fab27c63af94

    options(stringsAsFactors = F)
    library(airway)
    data(airway)
    RNAseq_expr=assay(airway)
    #RNAseq_expr 是一个数值型矩阵,属于连续性变量,可以探索众数、分位数和平均数 ,极差,方差和标准差等统计学指标
    dex=colData(airway)[,3]
    #coldata是样本信息,提取coldata中的第三列:药物dex treat&untreat
    
    #Q1: 对RNAseq_expr的每一列绘制boxplot图
    boxplot(RNAseq_expr,main = 'Boxplot of RNAseq-expr',
            xlab = 'samples',ylab = 'expression')
    
    图片.png
    #Q4: 对RNAseq_expr的每一列log2后绘制boxplot图,density图和条形图
    #Q5: 对Q4的3个图里面添加 trt 和 untrt 组颜色区分开来
    RNAseq_filter = RNAseq_expr[apply(RNAseq_expr,1,function(x) sum(x)>1),] 
    ##过滤都是0的行
    RNA_log2 <- log2(RNAseq_filter+1)
    boxplot(RNA_log2,main = 'Boxplot of RNAseq_log2',
            xlab = 'samples',ylab = 'expression',col= c("blue","red"),fill = as.integer(dex))
    
    图片.png
    opar <- par(no.readonly=T)
    par(mfrow=c(2,4))
    for (i in c(1:8)) {
      plot(density(RNA_log2[,i]),col=as.integer(dex)[i],main = "Density")}
    par(opar)
    
    图片.png
    #?如何给条形图不同组设置不同颜色
    col = c("lightblue","lightgreen")
    par(mfrow=c(1,1))
    barplot(RNA_log2, main = 'Barplot of RNAseq-expr',
            xlab = 'samples',ylab = 'expression',col = as.integer(dex))
    
    图片.png
    #Q6:对RNAseq_expr的前两列画散点图并且计算线性回归方程
    pairs(RNAseq_expr[,1:2])
    RNAseq_df <- as.data.frame(RNAseq_expr)
    fit <- lm(RNAseq_df[,1] ~ RNAseq_df[,2],data = RNAseq_df)
    #lm()是最基本的拟合线性模型函数
    plot(RNAseq_df[,1],RNAseq_df[,2],xlab="SRR1039508",ylab="SRR1039509")
    abline(fit)
    
    图片.png
    #Q7:对RNAseq_expr的所有列两两之间计算相关系数,并且热图可视化
    cor = cor(RNAseq_filter)
    heatmap(cor,symm = TRUE )
    
    图片.png
    #Q8: 取RNAseq_expr第一行表达量绘制折线图
    plot(RNAseq_filter[1,],type="l",xlab = "gene",ylab="expression",col="red")
    
    图片.png
    #Q9: 取RNAseq_expr表达量最高的10个基因的行绘制多行折线图
    max10 <- rownames(as.data.frame(sort(rowSums(RNAseq_expr),decreasing=T)[1:10]))
    max10 <- RNAseq_filter[max10,]
    plot(max10[1,],type="b",xlab = "samples",ylab="expression",pch = 1,col =1)
    for (i in c(2:10)){
      lines(max10[i,],type="b",xlab = "samples",ylab="expression",pch = i, col=i)
    }
    
    图片.png
    #ggplot
    suppressMessages(library(reshape2))
    suppressMessages(library(ggplot2))
    suppressMessages(library("Rmisc"))
    suppressMessages(library("plyr"))
    #准备数据
    RNAseq_L <- melt(RNA_log2)
    colnames(RNAseq_L) <- c("gene","sample","expression")
    RNAseq_L$group=rep(dex,each=nrow(RNA_log2))
    #箱型图
    box=ggplot(RNAseq_L,aes(x=sample,y=expression,fill=group))+geom_boxplot() +
      theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1,size=6 ))
    print(box)
    # 密度图
    d <- ggplot(RNAseq_L,aes(x=expression,col=sample)) + 
                geom_density() +
      theme(legend.text = element_text(size = 8, face = "bold"),legend.key.size=unit(0.3,'cm'))
    print(d)
    d2 <- ggplot(RNAseq_L,aes(expression,col=group)) + 
          geom_density() +
      theme(legend.text = element_text(size = 8, face = "bold"),legend.key.size=unit(0.3,'cm'))
    print(d2)
    # 条形图
    bar <- ggplot(RNAseq_L,aes(x=sample,y=expression,fill= group)) + 
                  geom_bar(stat="identity") +
      theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1,size=6 ))
    print(bar)
    # 散点图
    s <- ggplot(as.data.frame(RNAseq_expr[,1:2]),aes(x=SRR1039508,y=SRR1039509)) + 
      geom_point() +
      geom_smooth(method = "lm")
    print(s)
    # 热图
    melt_cor <- melt(cor)
    h <- ggplot(melt_cor, aes(x=Var1, y=Var2, fill=value)) + 
      geom_tile()+
      theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1,size= 4),
            axis.text.y = element_text(size= 4)) + labs(x="",y="")
    print(h)
    
    #Q8:RNAseq_expr第一行表达量折线图
    df1 = data.frame(expression = RNAseq_filter[1,])
    df1$sample <- rownames(df1)
    l1 <- ggplot(df1,aes(x=sample, y=expression,group =1)) + geom_line() + 
      geom_point() +
      theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1,size=8 ))
    print(l1)
    
    #Q9: 取RNAseq_expr表达量最高的10个基因的行绘制多行折线图
    df10 = melt(max10)
    colnames(df10) <- c("gene","sample","expression")
    l10 <- ggplot(df10,aes(x=sample,y=expression,colour=gene,group=gene)) + geom_line() + geom_point() +
      theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1,size=8 ),
            legend.text = element_text(size = 8, face = "bold"),legend.key.size=unit(0.3,'cm'))
    print(l10)
    #多图展示
    multiplot(box,d,d2,bar,s,h, layout = matrix(c(1,2,3,4,5,6), nrow=3, byrow=TRUE))
    #c(1, 1, 2, 3):“1, 1”代表一张图占了两格,“2”和“3”各代表一张图
    multiplot(l1,l10)
    
    图片.png
    图片.png

    生物信息学绘图

    #Q2: 对RNAseq_expr挑选MAD值最大的100个基因的表达矩阵绘制热图
    top100_mad <- RNAseq_expr[names(sort(apply(RNAseq_expr,1,mad),decreasing=T)[1:100]),]
    top100_mad_standard = t(scale(t(top100_mad)))
    pheatmap::pheatmap(top100_mad_standard)
    
    图片.png
    #Q3: 对RNAseq_expr进行主成分分析并且绘图
    library(ggfortify)
    library(dplyr)
    df = as.data.frame(t(RNAseq_expr))
    df$group=dex
    autoplot(prcomp( df[,1:(ncol(df)-1)] ), data=df,colour = 'group')+theme_bw()
    #theme_bw()--ggtheme
    
    图片.png
    #Q4: 对RNAseq_expr进行差异分析并且绘制火山图
    ## DEG by DESeq2
    suppressMessages(library(DESeq2))
    #step1: 导入表达矩阵并设置分组
    dex_df=data.frame(dex=dex)
    dds <- DESeqDataSetFromMatrix(countData = RNAseq_expr,
                                  colData = dex_df,
                                  design = ~ dex)
    ##表达矩阵RNAseq_expr,样本信息coldata,
    ##design 表示怎样设计样本的模型(这里考虑药物dex,这项是coldata的因子型变量)
    dds$dex <- factor(dds$dex, levels = c("untrt","trt"))
    ##在design中设置factor level,如果不告诉DESeq2哪组作为对照组的话,它会自动根据字母表顺序设置!!!
    ##设置untrt为对照组!
    
    #step2:Pre-filtering
    ##这不是必须,只是为了减少后面计算量,主要就是去除表达量非常少的行,比如设置阈值为每行表达量为10
    dds_filter <- rowSums(counts(dds)) >= 10
    dds <- dds[dds_filter,]
    
    #step3
    dds <- DESeq(dds)
    res <- results(dds, contrast=c("dex","trt","untrt"))
    
    #step4 shrink
    ##用lfcShrink收缩log2_fold_change
    ##log2FC estimates do not account for the large dispersion we observe with low read counts.
    ##As with the shrinkage of dispersion estimates, LFC shrinkage uses information from all genes to generate more accurate estimates.
    library(apeglm)
    resLFC <- lfcShrink(dds,coef=resultsNames(dds)[2],res=res)
    
    #step5 logFC_cutoff
    resOrdered <- res[order(res$padj),]
    DEG =as.data.frame(resOrdered)  
    DEG = na.omit(DEG) ##返回删除NA后的向量a
    nrDEG=DEG[,c(2,6)]
    colnames(nrDEG)=c('log2FoldChange','pvalue') 
    logFC_cutoff <- with(nrDEG,mean(abs( log2FoldChange)) + 2*sd(abs( log2FoldChange)) )
    nrDEG$change = as.factor(ifelse(nrDEG$pvalue < 0.05 & abs(nrDEG$log2FoldChange) > logFC_cutoff,
                                    ifelse(nrDEG$log2FoldChange > logFC_cutoff ,'UP','DOWN'),'NOT'))
    
    ##step6 volcano map
    this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
                        '\nThe number of up gene is ',nrow(nrDEG[nrDEG$change =='UP',]) ,
                        '\nThe number of down gene is ',nrow(nrDEG[nrDEG$change =='DOWN',]))
    volcano = ggplot(data=nrDEG, 
      aes(x=log2FoldChange, y=-log10(pvalue), color=change)) +
      geom_point(alpha=0.4, size=1.75) +
      xlab("log2 fold change") + ylab("-log10 p-value") +
      ggtitle(this_tile) + theme(plot.title = element_text(size=15,hjust = 0.5)) +
      scale_colour_manual(values = c('blue','black','red'))
    volcano
    ggsave(volcano,filename = paste0('DEseq2_volcano.png'))
    
    图片.png
    ##step7 平均值VS变化倍数图
    ##M表示log fold change,衡量基因表达量变化,上调还是下调。A表示每个基因的count的均值。
    png("MA.png")
    plotMA(res,ylim=c(-5,5))
    #标记出表达差异最大的基因
    topGene <- rownames(res)[which.min(res$padj)]
    with(res[topGene, ], {
      points(baseMean, log2FoldChange, col="dodgerblue", cex=2, lwd=2)
      text(baseMean, log2FoldChange, topGene, pos=2, col="dodgerblue")
    })
    dev.off()
    png("MA2.png")
    MA2 <- plotMA(resLFC, ylim=c(-5,5))
    topGene <- rownames(resLFC)[which.min(res$padj)]
    with(resLFC[topGene, ], {
      points(baseMean, log2FoldChange, col="dodgerblue", cex=2, lwd=2)
      text(baseMean, log2FoldChange, topGene, pos=2, col="dodgerblue")})
    
    #identify(),交互函数,在plot中点击后显示该点索引(行数)
    idx <- identify(res$baseMean, res$log2FoldChange)
    dev.off()
    
    图片.png
    图片.png
    #Q6: 绘制其中一个差异基因在两个分组的表达量boxplot并且添加统计学显著性指标
    suppressMessages(library(ggpubr))
    choose_gene = rownames(nrDEG)[2]
    choose_gene_d <- cbind(as.data.frame(RNAseq_expr[choose_gene,]),as.data.frame(dex))
    choose_gene_d$sample = rownames(choose_gene_d)
    colnames(choose_gene_d) = c("expression","dex","sample")
    opar<-par(no.readonly=T)
    ggplot(data = choose_gene_d,aes(y=expression,x=dex)) + 
           geom_boxplot() +
           stat_compare_means(method = "t.test")
    
    图片.png
    #Q7: 通过org.Hs.eg.db包拿到RNAseq_expr所有基因的染色体信息,绘制染色体的基因数量条形图
    suppressMessages(library(org.Hs.eg.db))
    CHR <- toTable(org.Hs.egCHR)
    ensembl <- toTable(org.Hs.egENSEMBL)
    ensembl_id <- data.frame(rownames(RNAseq_expr))
    colnames(ensembl_id) <- c("ensembl_id")
    eg <- merge(ensembl_id,ensembl,by="ensembl_id")
    egc <- merge(eg,CHR,by="gene_id")
    ggplot(data = egc,aes(x=chromosome)) + geom_bar(fill="lightblue")
    
    图片.png
    #Q8: 在上面染色体的基因数量条形图并列叠加差异基因数量条形图
    DEG_ensembl <- data.frame(rownames(nrDEG))
    colnames(DEG_ensembl) <- c("ensembl_id")
    DEG_eg <- merge(DEG_ensembl,ensembl,by="ensembl_id")
    DEG_egc <- merge(DEG_eg,CHR,by="gene_id")
    colnames(DEG_egc) <- c("DEG_gene_id","DEG_ensembl_id","chromosome")
    egc$DEG <- as.factor(ifelse(egc$ensembl_id %in% DEG_egc$DEG_ensembl_id,'DEG','NOT'))
    ggplot(egc,aes(x=chromosome,fill=DEG)) + geom_bar(stat ='count')
    
    图片.png
    #Q9:在oncolnc网页工具拿到CUL5基因在BRCA数据集的表达量及病人生存资料自行本地绘制生存分析图 
    suppressMessages(library(ggstatsplot))
    CUL5 <- read.csv(file = 'BRCA_8065_50_50.csv',header = T)
    ggbetweenstats(data = CUL5, x='Status', y='Expression')
    suppressMessages(library(survival))
    suppressMessages(library(survminer))
    CUL5$Status = ifelse(CUL5$Status=='Dead',1,0)
    survf <- survfit(Surv(Days,Status)~Group,data = CUL5)
    ggsurvplot(survf, conf.int = F, pval=T)
    # complex survplot
    ggsurvplot(survf,palette = c("orange", "blue"),
               risk.table = T, pval = T,
               conf.int = T, xlab = "Time of days",
               ggtheme = theme_light(),
               ncensor.plot = T)
    
    图片.png
    图片.png
    #Q10. 在xena网页工具拿到CUL5基因在BRCA数据集的表达量及病人的PAM50分类并且绘制分类的boxplot
    cul5 <- read.csv(file = 'denseDataOnlyDownload.tsv',sep="\t",header = T,na.strings="")
    # 去除NA以后还有...读取的时候加上参数 na.strings=""
    #剔除第3列为空的行
    cul5<- cul5[complete.cases(cul5),]
    library(RColorBrewer)
    ggplot(data = cul5,aes(y=CUL5,x=PAM50_mRNA_nature2012,color=PAM50_mRNA_nature2012)) + 
           geom_boxplot() +
           theme_light() + stat_boxplot(geom = "errorbar",width=0.2) 
    
    图片.png

    相关文章

      网友评论

          本文标题:visualization

          本文链接:https://www.haomeiwen.com/subject/eyqhdktx.html