美文网首页生物信息学
30道可视化练习题笔记

30道可视化练习题笔记

作者: 大吉岭猹 | 来源:发表于2019-10-23 16:11 被阅读0次

    题目链接:https://mp.weixin.qq.com/s/4zr8mnqd1zDGHuBXAY8USA
    参考优秀学徒的笔记:https://www.jianshu.com/p/fab27c63af94

    基础包

    读取数据

    options(stringsAsFactors = F)
    library(airway)
    data(airway)
    RNAseq_expr=assay(airway)
    colnames(RNAseq_expr) 
    RNAseq_expr[1:4,1:4] 
    RNAseq_gl=colData(airway)[,3]
    table(RNAseq_gl)
    

    Q1: 对RNAseq_expr的每一列绘制boxplot图

    boxplot(RNAseq_expr)
    
    • 可以看到有较多的异常值

    Q2: 对RNAseq_expr的每一列绘制density图

    plot(density(RNAseq_expr)) #直接画发现0的比例太高了,应该先过滤一下
    filtered=RNAseq_expr[apply(RNAseq_expr,1,function(x) sum(x>0)>1),]
    dim(RNAseq_expr)
    dim(filtered)
    plot(density(filtered)) #但并不是这个问题,先往下做
    

    Q3: 对RNAseq_expr的每一列绘制条形图

    barplot(RNAseq_expr)
    

    Q4: 对RNAseq_expr的每一列取log2后重新绘制boxplot图,density图和条形图

    normalized=log2(filtered+1)
    #此处使用借鉴来的更好的代码,顺便把Q5的颜色也加了
    #箱线图
    boxplot(normalized,main = 'Boxplot of RNAseq-expr', xlab = 'samples',ylab = 'expression',col=RNAseq_gl)
    #8列一起画密度图
    plot(density(normalized)) #看来之前密度图不太对是异常值过多导致的
    #分开画密度图
    opar <- par(no.readonly=T)
    par(mfrow=c(3,3))
    for (i in c(1:8)){
      plot(density(normalized[,i]),col=as.integer(RNAseq_gl)[i],main = "Density")
      }
    par(opar)
    #条形图
    col = c("lightblue","lightgreen")
    barplot(normalized, main = 'Barplot of RNAseq-expr', xlab = 'samples',ylab = 'expression',col = RNAseq_gl) #奇怪的是并没有加上颜色
    



    Q5: 对Q4的3个图里面添加 trt 和 untrt 组颜色区分开来

    Q6: 对RNAseq_expr的前两列画散点图并且计算线性回归方程

    plot(RNAseq_expr[,1:2]) #原始数据
    plot(normalized[,1:2]) #过滤、归一化后
    

    Q7: 对RNAseq_expr的所有列两两之间计算相关系数,并且热图可视化。

    M=cor(RNAseq_expr)
    pheatmap::pheatmap(M)
    

    Q8: 取RNAseq_expr第一行表达量绘制折线图

    plot(RNAseq_expr[1,],type="l",xlab = "gene",ylab="expression",col="red")
    

    Q9: 取RNAseq_expr表达量最高的10个基因的行绘制多行折线图

    top10=RNAseq_expr[rownames(RNAseq_expr) %in% names(tail(sort(rowSums(RNAseq_expr)),10)),]
    plot(top10[1,],type="b",xlab = "gene",ylab = "expression",pch = 1,
         ylim=c(50000,550000)) #不调整y轴范围的话画出来的图会超过默认范围
    for (i in c(2:10)){
      lines(top10[i,],type="b",xlab = "gene",ylab = "expression",pch = i)
    }
    

    Q10: 运行代码

    https://github.com/jmzeng1314/5years/blob/master/learn-R/tasks/2-chunjuan-600.R

    ggplot

    准备数据

    • 由于ggplot的“映射”理念,我们需要先把原有的扁数据变成长数据,即每列为一个变量的数据。我们采用tidyr包实现这一需求。
    if(!require(tidyr))install.packages("tidyr")
    library(tidyr)
    tmp=as.data.frame(RNAseq_expr)
    tmp$geneid=rownames(tmp)
    exp_gather <- gather(tmp,"sample","exp",-geneid) #这一行是转换的关键
    gg=exp_gather
    
    • 其他学徒的优秀代码
    suppressMessages(library(reshape2)) #用的是reshape2这个包
    box_e <- melt(filtered) #似乎是比gather函数简洁
    colnames(box_e) <- c("gene","sample","expression")
    #这里开始是添加分组信息,方便画图时填色
    tmp=data.frame(group_list=RNAseq_gl)
    rownames(tmp)=colnames(RNAseq_expr)
    tmp$sample <- rownames(tmp)
    d = merge(box_e,tmp,by="sample")
    

    Q1

    library(ggplot2)
    ggplot(gg,aes(sample,exp,fill=group_list)) + geom_boxplot()
    
    • 没做log之前图都是没法看的,这里就不放了

    Q2

    #先效仿优秀代码,增加一列分组信息
    tmp=data.frame(group_list=RNAseq_gl)
    rownames(tmp)=colnames(RNAseq_expr)
    tmp$sample <- rownames(tmp)
    gg = merge(gg,tmp,by="sample")
    
    p2 <- ggplot(gg,aes(x=exp,colour=sample)) + geom_density()
    

    Q3

    p3 <- ggplot(gg,aes(sample,exp,fill=group_list)) + geom_bar(stat="identity")
    

    Q4

    #再处理一遍数据
    if(!require(tidyr))install.packages("tidyr")
    library(tidyr)
    tmp=as.data.frame(normalized)
    tmp$geneid=rownames(tmp)
    exp_gather <- gather(tmp,"sample","exp",-geneid)
    gg=exp_gather
    
    tmp=data.frame(group_list=RNAseq_gl)
    rownames(tmp)=colnames(RNAseq_expr)
    tmp$sample <- rownames(tmp)
    gg = merge(gg,tmp,by="sample")
    
    #和之前一样的步骤画图
    p1 <- ggplot(data = gg,aes(y=exp,x=sample,fill=group_list)) + geom_boxplot()
    p2 <- ggplot(gg,aes(x=exp,colour=sample)) + geom_density()
    p3 <- ggplot(gg,aes(sample,exp,fill=group_list)) + geom_bar(stat="identity")
    

    Q5

    Q6

    p6 <- ggplot(as.data.frame(RNAseq_expr[,1:2]),aes(x=SRR1039508,y=SRR1039509)) +
      geom_point() +
      geom_smooth(method= "lm")
    

    Q7

    M=cor(RNAseq_expr)
    melt_M <- melt(M)
    p7 <- ggplot(data = melt_Q7, aes(x=Var1, y=Var2, fill=value)) + geom_tile()
    

    Q8

    #p8 <- ggplot(tmp,aes(x=sample, y=exp)) + geom_line() + geom_point()
    #会报错geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
    p8 <- ggplot(tmp,aes(x=sample, y=exp, group=1)) + geom_line() + geom_point()
    

    Q9

    top10=RNAseq_expr[rownames(RNAseq_expr) %in% names(tail(sort(rowSums(RNAseq_expr)),10)),]
    tmp=melt(top10)
    colnames(tmp)=c("geneid","sample","exp")
    p9 <- ggplot(tmp,aes(x=sample,y=exp,colour=geneid,group=geneid)) + geom_line() + geom_point()
    p9
    
    • 其他学徒的优秀代码(完整)
    # 箱型图
    A <- ggplot(data = d,aes(y=expression,x=sample,fill=group_list)) + geom_boxplot() +
      theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1,size=6 ))
      
    
    # 密度图
    B1 <- ggplot(box_e,aes(x=expression,colour=sample)) + geom_density() +
      theme(legend.text = element_text(size = 8, face = "bold"),legend.key.size=unit(0.3,'cm'))
    B2 <- ggplot(data= d,aes(expression,col=group_list)) + geom_density() +
      theme(legend.text = element_text(size = 8, face = "bold"),legend.key.size=unit(0.3,'cm'))
      
    
    # 条形图
    C <- ggplot(data = d,aes(x=sample,y=expression,fill= group_list)) + geom_bar(stat="identity") +
      theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1,size=6 ))
    # factor() 函数将连续型变量转化为离散型变量
    
    # 散点图
    D <- ggplot(as.data.frame(RNAseq_expr[,1:2]),aes(x=SRR1039508,y=SRR1039509)) + 
      geom_point() +
      geom_smooth(method = "lm")
    
    # 热图
    melt_Q7 <- melt(Q7)
    E <- ggplot(data = melt_Q7, aes(x=Var1, y=Var2, fill=value)) + geom_tile()+
      theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1,size= 4),
            axis.text.y = element_text(size= 4)) + labs(x="",y="")
      
      
    
    # 折线图
    F1 <- ggplot(Q8_1,aes(x=sample, y=expression, group =1)) + geom_line() + geom_point() +
      theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1,size=8 ))
    Q8_2_m = melt(Q8_2)
    colnames(Q8_2_m) = c("gene","sample","expression")
    F2 <- ggplot(Q8_2_m,aes(x=sample,y=expression,colour=gene,group=gene)) + geom_line() + geom_point() +
       theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1,size=8 ),
             legend.text = element_text(size = 8, face = "bold"),legend.key.size=unit(0.3,'cm'))
    
    multiplot(A,B1,B2, layout = matrix(c(1,1,2,3), nrow=2, byrow=TRUE))
    multiplot(C,D,E, layout = matrix(c(1,1,2,3), nrow=2, byrow=TRUE))
    multiplot(F1,F2)
    
    • 最值得学习的地方
      1. 利用theme函数把图变得更好看了,尤其是样本名太长进而导致叠在一起的问题被解决了;
      2. 拼图功能以前没见过,感觉很不错。

    生物信息学作图

    Q2

    tmp=apply(filtered,1,mad)
    str(tmp)
    
    z=t(scale(t(normalized))) #这里我认为标准化必须对整体做,不能取top100出来后再做。
    top=z[rownames(z) %in% names(tail(sort(tmp),100)),]
    pheatmap::pheatmap(top)
    

    Q3

    # PCA图应用z-score矩阵绘制
    if(T){
      library(ggfortify)
      dat=z
      df=as.data.frame(t(dat))
      group_list=RNAseq_gl
      df$group=group_list
      autoplot(prcomp(df[,1:(ncol(df)-1)]), data = df, colour = 'group') #prcomp is the function of PCA
      
      library("FactoMineR")
      library("factoextra") #PCA diagram requires these two packages
      df=as.data.frame(t(dat))
      dat.pca <- PCA(df, graph = FALSE)
      fviz_pca_ind(dat.pca,
                   geom.ind = "point", # show points only (nbut not "text")
                   col.ind = group_list, # color by groups
                   # palette = c("#00AFBB", "#E7B800"),
                   addEllipses = TRUE, # Concentration ellipses
                   legend.title = "Groups")
      #if there are only two replications, ellipse cannot be calculated
    }
    

    Q4

    #差异分析
    suppressMessages(library(DESeq2))
    colData <- data.frame(row.names=colnames(exprSet),group_list=group_list)
    dds <- DESeqDataSetFromMatrix(countData = RNAseq_expr,
                                  colData = colData,
                                  design = ~ group_list)
    dds <- DESeq(dds)
    res <- results(dds, 
                   contrast=c("group_list","trt","untrt"))
    resOrdered <- res[order(res$padj),]
    
    DEG=as.data.frame(resOrdered)
    nrDEG=na.omit(DEG)
    DEseq_DEG=nrDEG
    nrDEG=DEseq_DEG[,c(2,6)]
    colnames(nrDEG)=c('log2FoldChange','pvalue') 
    
    #火山图
    if(T){
      pretty_volcano <- function(resObject,title,alpha=0.05,lfc=2) {
        # alpha is the threshold of p-value, default 0.05
        # lfc is the threshold of -log10FoldChange, default 2
        
        resObject$sig <- -log10(resObject$padj)
        resObject[is.infinite(resObject$sig),"sig"] <- 350
        
        # Select genes with a defined p-value (DESeq2 assigns NA to some genes)
        genes.to.plot <- !is.na(resObject$padj)
        # sum(genes.to.plot)
        range(resObject[genes.to.plot, "log2FoldChange"])
        
        ## Volcano plot of adjusted p-values
        cols <- densCols(resObject$log2FoldChange, resObject$sig)
        cols[resObject$padj ==0] <- "purple"
        resObject$pch <- 19
        resObject$pch[resObject$padj ==0] <- 6
        plot(resObject$log2FoldChange, 
             resObject$sig, 
             col=cols, panel.first=grid(),
             main=title, 
             xlab="Effect size: log2(fold-change)",
             ylab="-log10(adjusted p-value)",
             pch=resObject$pch, cex=0.4)
        abline(v=0)
        abline(v=c(-lfc,lfc), col="brown")
        abline(h=-log10(alpha), col="brown")
        
        ## Plot the names of a reasonable number of genes, by selecting those begin not only significant but also having a strong effect size
        gn.selected <- abs(resObject$log2FoldChange) > lfc & resObject$padj < alpha 
        # text(resObject$log2FoldChange[gn.selected],
        #      -log10(resObject$padj)[gn.selected],
        #      lab=rownames(resObject)[gn.selected ], cex=0.6)
        # # mtext(paste("padj =", alpha), side = 4, at = -log10(alpha), cex = 0.8, line = 0.5, las = 1) 
        # mtext(c(paste("-", lfc, "fold"), paste("+", lfc, "fold")), side = 3, at = c(-lfc, lfc), cex = 0.8, line = 0.5)
      }
      #The following two parameters can be adjusted according to actual needs
      alpha=1e-10
      lfc=2
      pretty_volcano(DEseq_DEG, 'Treat vs Control', alpha = alpha, lfc = lfc)
    }
    

    Q5

    plotMA(res,ylim=c(-5,5))
    # lfcShrink 收缩log2 fold change
    resLFC <- lfcShrink(dds,coef = 2,res=res)
    plotMA(resLFC, ylim=c(-5,5))
    

    Q6

    choose_gene = rownames(nrDEG)[9]
    choose_gene_d <- cbind(as.data.frame(RNAseq_expr[choose_gene,]),
                           as.data.frame(tmp$group_list))
    choose_gene_d$sample = rownames(choose_gene_d)
    colnames(choose_gene_d) = c("e","group","sample")
    ggplot(data = choose_gene_d,aes(y=e,x=group)) + geom_boxplot() +
      stat_compare_means(method = "t.test")
    

    Q7

    suppressMessages(library(org.Hs.eg.db))
    CHR <- toTable(org.Hs.egCHR)
    ensembl <- toTable(org.Hs.egENSEMBL)
    
    table(rownames(RNAseq_expr) %in% ensembl$ensembl_id)
    ### FALSE  TRUE 
    ### 38372 25730 
    table(rownames(filtered) %in% ensembl$ensembl_id)
    ### FALSE  TRUE 
    ### 10583 18294
    # 可以看到org.Hs.eg.db这个包的数据量还是比较有限的,超过一半的基因找不到。
    
    ids=merge(CHR,ensembl,by="gene_id")
    gene=as.data.frame(rownames(RNAseq_expr))
    colnames(gene)="ensembl_id"
    ids2=merge(ids,gene,by="ensembl_id")
    ggplot(ids2,aes(chromosome)) +
      geom_bar(fill="lightblue")
    

    Q8

    tmp=nrDEG[nrDEG$pvalue<0.05,]
    DEG_gene=data.frame(rownames(tmp))
    colnames(DEG_gene) <- c("ensembl_id")
    ids3=merge(ids,DEG_gene,by="ensembl_id")
    
    # 叠加
    ids2$DEG <- as.factor(ifelse(ids2$ensembl_id %in% ids3$ensembl_id,'DEG','NOT'))
    ggplot(ids2,aes(x=chromosome,y=..count..,fill=DEG)) + geom_bar(stat ='count')
    

    Q9

    • 基因名是CUL5。
    rm(list = ls())
    options(stringsAsFactors = F)
    brca=read.csv("BRCA_8065_50_50.csv",header = T)
    
    library(survival)
    library(survminer)
    colnames(brca)
    colnames(brca)=c("patient","OS.time","OS","expression","group")
    table(brca$OS)
    brca$OS=ifelse(brca$OS=="Dead",1,0) #survfit函数要求生存状态是逻辑值,且存活对应0,死亡对应1.
    
    
    sfit <- survfit(Surv(OS.time, OS)~group, data=brca)
    ggsurvplot(sfit,palette = c("red", "black"),
               risk.table =TRUE,pval =TRUE,
               xlab ="Time in days", 
               ggtheme =theme_light())
    

    Q10

    tmp <- read.csv(file = 'denseDataOnlyDownload.tsv',sep="\t",header = T)
    table(tmp$PAM50_mRNA_nature2012)
    # 除了我们要的5种分型之外还有一些病人的分组是未知的,需要去除。
    cul5 <- read.csv(file = 'denseDataOnlyDownload.tsv',sep="\t",header = T,na.strings="")
    cul5<- cul5[complete.cases(cul5),]
    library(RColorBrewer)
    ggplot(data = cul5,aes(y=CUL5,x=PAM50_mRNA_nature2012,color=PAM50_mRNA_nature2012)) + 
      geom_boxplot() +
      theme_light() + 
      stat_boxplot(geom = "errorbar",width=0.2) + 
      theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1,size=10))
    

    最后,向大家隆重推荐生信技能树的一系列干货!

    1. 生信技能树超级VIP入场券
    2. B站公益74小时生信工程师教学视频合辑
    3. 生信技能树简书账号
    4. 生信工程师入门最佳指南

    相关文章

      网友评论

        本文标题:30道可视化练习题笔记

        本文链接:https://www.haomeiwen.com/subject/axlkvctx.html