R试题 可视化

作者: 煎饼果子再来一套 | 来源:发表于2019-08-15 19:55 被阅读2次

    Q1: 一行行的运行:https://github.com/jmzeng1314/5years/blob/master/learn-R/tasks/top50ggplot.Rmd 代码

    Q2: 对RNAseq_expr挑选MAD值最大的100个基因的表达矩阵绘制热图

    load('../view_new.Rdata')
    tmp<- RNAseq_expr[rowSums(RNAseq_expr>0)>1,]
    tmp <-tmp[names(tail(sort(apply(tmp,1,mad)),100)),] 
    tmp <- t(scale(t(tmp)))
    pheatmap::pheatmap(tmp,filename = 'MAD_top100_heatmap.png')
    
    M <- cor(log2(tmp))
    ####加上annotation col
    annotation_col <- data.frame(group_ist = RNAseq_gl)
    annotation_col$sample <- colnames(tmp)
    rownames(annotation_col) <- colnames(tmp)
    annotation_col
    pheatmap::pheatmap(M,annotation_col = annotation_col,filename = 'correaltion_heatmap.png')
    
    correaltion_heatmap.png MAD_top100_heatmap.png

    Q3: 对RNAseq_expr进行主成分分析并且绘图

    q3 <- as.data.frame(t(e1))
    q3[1:4,1:4]
    q3$group <- RNAseq_gl
    q3[,c(1,ncol(q3))]
    qc_pca <- prcomp(q3)
    #Error in colMeans(x, na.rm = TRUE) : 'x'必需为数值
    q3_pca <- prcomp(q3[,1:(ncol(q3)-1)])
    par(mar=c(2,2,2,2))
    png(filename = 'pca.png',res = 100)
    autoplot(q3_pca,data = q3,colour = 'group')+theme_classic()
    dev.off()
    
    pca.png

    # Q4: 对RNAseq_expr进行差异分析并且绘制火山图

    setwd('limma /')
    rm(list = ls())
    suppressPackageStartupMessages(library(airway))
    data(airway)
    RNAseq_expr=assay(airway)   
    group_list=colData(airway)[,3]
    dim(RNAseq_expr)
    save(RNAseq_expr,group_list,file = 'rnaseq.Rdata')
    
    rm(list = ls())
    options(stringsAsFactors = F)
    load('rnaseq.Rdata')
    
    #### 三数据之一 ####
    ##### 表达矩阵 #####
    colnames(RNAseq_expr)
    pheatmap::pheatmap(cor(RNAseq_expr))
    group_list
    tmp=data.frame(group=group_list)
    rownames(tmp)=colnames(RNAseq_expr)
    # 组内的样本的相似性应该是要高于组间的!
    pheatmap::pheatmap(cor(RNAseq_expr),annotation_col = tmp)
    dim(RNAseq_expr)
    RNAseq_expr=RNAseq_expr[apply(RNAseq_expr,1, function(x) sum(x>1) > 5),]
    #RNAseq_expr=RNAseq_expr[rowSums(RNAseq_expr>1)>5,],等价于上一句
    dim(RNAseq_expr)
    
    RNAseq_expr=log(edgeR::cpm(RNAseq_expr)+1)
    
    suppressPackageStartupMessages(library(limma))
    
    ########## QC ############
    par(cex = 0.7)
    n.sample=ncol(RNAseq_expr)
    if(n.sample>40) par(cex = 0.5)
    cols <- rainbow(n.sample*1.2)
    boxplot(RNAseq_expr, col = cols,main="expression value",las=2)
    ########## QC ############
    
    #### 三数据之二 ####
    ###### 分组矩阵 #####
    ?factor
    ?model.matrix
    design <- model.matrix(~0+factor(group_list))
    colnames(design)=levels(factor(group_list))
    rownames(design)=colnames(RNAseq_expr)
    design
    
    #### 三数据之三 ####
    ##### 差异比较矩阵 #####
    contrast.matrix<-makeContrasts(paste0(unique(group_list),collapse = "-"),levels = design)
    contrast.matrix 
    
    ### 三步骤 ###
    ##step1_________lmFit
    fit <- lmFit(RNAseq_expr,design)
    ##step2_________eBayes
    fit2 <- contrasts.fit(fit, contrast.matrix) ##这一步很重要,大家可以自行看看效果
    fit2 <- eBayes(fit2)  ## default no trend !!!
    ##eBayes() with trend=TRUE
    ##step3_________topTable
    tempOutput = topTable(fit2, coef=1, n=Inf)
    nrDEG = na.omit(tempOutput) 
    #
    write.csv(nrDEG,"limma_notrend.results.csv",quote = F)
    head(nrDEG)
    
    source('../functions.R')###JM github 上下载
    
    DEseq_DEG=nrDEG
    nrDEG=DEseq_DEG[,c(1,4)]
    colnames(nrDEG)=c('log2FoldChange','pvalue')
    draw_h_v(RNAseq_expr,nrDEG,'limma')
    
    limma_need_DEG_top50_heatmap.png limma_volcano.png
    #####################DEseq2包进行差异分析 #######################
    #差异分析套路与limma差不多,DEseq2针对count类型的数据
    ## 参考:http://www.bio-info-trainee.com/bioconductor_China/software/DESeq2.html
    ### 三个数据 ——表达矩阵,分组矩阵,差异比较矩阵
    #### 表达矩阵,前面已经下载了,就直接载入了
    rm(list = ls())
    suppressPackageStartupMessages(library(DESeq2))
    suppressPackageStartupMessages(library(limma))
    load('./limma /rnaseq.Rdata')
    suppressPackageStartupMessages(library(airway))
    
    
    
    
    #### 三个步骤
    #### step1 构造DDS对象、####steo2 DESeq函数进行normalization处理、####step3 results函数来提取差异比较结果
    #### step1 构造DDS对象
    colData <- data.frame(row.names=colnames(RNAseq_expr), 
                          group_list=group_list
    )
    dds <- DESeqDataSetFromMatrix(countData = RNAseq_expr,
                                  colData = colData,
                                  design = ~ group_list)
    dds
    # class: DESeqDataSet 
    # dim: 64102 8 
    # metadata(1): version
    # assays(1): counts
    # rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
    # rowData names(0):
    #   colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
    # colData names(1): group_list
    
    #### 这里是jimmy提到的另一种构造dds的方法
    # library(airway)
    # data(airway)
    # suppressMessages(library(DESeq2))
    # dds  <-  DESeqDataSet(airway, design = ~ cell+ dex)
    # design(dds) <-  ~ dex
    ###
    # ## 构造好的对象还可以更改分组信息
    
    ####steo2 DESeq函数进行normalization处理
    suppressMessages(dds2 <- DESeq(dds)) 
    ##直接用DESeq函数即可
    ## 下面的代码如果你不感兴趣不需要运行,免得误导你
    ## 就是看看normalization前面的数据分布差异
    # rld <- rlogTransformation(dds2)  ## 得到经过DESeq2软件normlization的表达矩阵!
    # RNAseq_expr_new=assay(rld)
    # par(cex = 0.7)
    # n.sample=ncol(RNAseq_expr)
    # if(n.sample>40) par(cex = 0.5)
    # cols <- rainbow(n.sample*1.2)
    # par(mfrow=c(2,2))
    # boxplot(RNAseq_expr, col = cols,main="expression value",las=2)
    # boxplot(RNAseq_expr_new, col = cols,main="expression value",las=2)
    # hist(RNAseq_expr)
    # hist(RNAseq_expr_new)
    
    ####step3 results函数来提取差异比较结果
    resultsNames(dds2)
    #[1] "Intercept"               "group_list_untrt_vs_trt"
    ## 其实如果只分成了两组,并没有必要指定这个比较矩阵!
    #res <-  results(dds2, contrast=c("group_list","treated","untreated"))  
    ### 如果是第二种方法构建的dds,指定比较矩阵时,应注意对象
    res <-  results(dds2) 
    resOrdered <- res[order(res$padj),]
    resOrdered=as.data.frame(resOrdered)
    head(resOrdered)
    
    DEG <- as.data.frame(resOrdered)  
    table(is.na(DEG))
    DEG <-  na.omit(DEG) 
    nrDEG <- DEG
    DEseq_DEG <- nrDEG
    write.csv(DEseq_DEG,file = 'DESeq_DEG.csv',quote = F)
    
    ###直接下载jimmyGitHub上的functions.R,调用后,直接出图
    source('functions.R')
    draw_h_v(RNAseq_expr,DEseq_DEG,'DESeq2')
    ### 对比后,可以发现,limma和DESeq2得到的差异基因还是有一些区别的,据此画出的火山图也有较大的区别
    
    DESeq2_data_pre_nor vs data_post_nor.png DEseq2_need_DEG_top50_heatmap.png DEseq2_volcano.png
    #nrDEG=DEseq_DEG[,c(2,6)]
    #colnames(nrDEG)=c('log2FoldChange','pvalue')
    ## 土味火山图
    ###plot(DEseq_DEG$log2FoldChange, -log10(DEseq_DEG$pvalue))
    
    Juicy_natural_volcano.png
    #### 注入灵魂
    logFC_cutoff <- with(nrDEG,mean(abs( log2FoldChange)) + 2*sd(abs( log2FoldChange)) )
    nrDEG$change = as.factor(ifelse(nrDEG$pvalue < 0.05 & abs(nrDEG$log2FoldChange) > logFC_cutoff,
                                    ifelse(nrDEG$log2FoldChange > logFC_cutoff ,'UP','DOWN'),'NOT'))
    this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
                        '\nThe number of up gene is ',nrow(nrDEG[nrDEG$change =='UP',]) ,
                        '\nThe number of down gene is ',nrow(nrDEG[nrDEG$change =='DOWN',]))
    volcano = ggplot(data=nrDEG, 
                     aes(x=log2FoldChange, y=-log10(pvalue), 
                         color=change)) +
      geom_point(alpha=0.4, size=1.75) +
      xlab("log2 fold change") + ylab("-log10 p-value") +
      ggtitle(this_tile) + theme(plot.title = element_text(size=15,hjust = 0.5)) +
      scale_colour_manual(values = c('blue','black','red'))+
      theme_classic()
    
    manual_volcano_plot.png

    Q5: 对RNAseq_expr进行差异分析并且绘制(平均值VS变化倍数)图

    #DESeq2包里有这样一个函数 plotMA
    #A simple helper function that makes a so-called "MA-plot", i.e. a scatter plot of log2 fold changes (on the y-axis) 
    #versus the mean of normalized counts (on the x-axis).
    plotMA(res)
    
    average_vs_foldchange.png

    Q6: 绘制其中一个差异基因在两个分组的表达量boxplot并且添加统计学显著性指标

    choose_gene <- rownames(nrDEG)[1]
    suppressPackageStartupMessages(library(ggpubr))
    suppressPackageStartupMessages(library(reshape2))
    RNAseq_expr_m <- melt(RNAseq_expr)
    RNAseq_expr_m$Group <- rep(levels(group_list),each = nrow(RNAseq_expr))
    colnames(RNAseq_expr_m) <- c('Gene','Sample','Value', 'Group')
    choose_gene_d <- RNAseq_expr_m[RNAseq_expr_m$Gene==choose_gene,]
    ?stat_compare_means
    ggplot(choose_gene_d,aes(x=Group, y=Value,color=Group))+
      geom_boxplot()+
      theme_classic()+
      labs(title = 'Expression boxplot',subtitle = choose_gene)
      stat_compare_means(method = 't.test')
    
    expression_boxplot.png

    Q7: 通过org.Hs.eg.db包拿到RNAseq_expr所有基因的染色体信息,绘制染色体的基因数量条形图

    ls('package:org.Hs.eg.db')
    CHR <- toTable(org.Hs.egCHR)
    colnames(CHR)
    #[1] "gene_id"    "chromosome"
    ENSEMBL <- toTable(org.Hs.egENSEMBL)
    colnames(ENSEMBL)
    #[1] "gene_id"    "ensembl_id"
    ENSEMBL_ID <- data.frame(rownames(RNAseq_expr))
    colnames(ENSEMBL_ID) <- "ensembl_id"
    EID <- merge(ENSEMBL_ID,ENSEMBL,by="ensembl_id")
    EIDC <- merge(EID,CHR,by="gene_id")
    colnames(EIDC)
    #[1] "gene_id"    "ensembl_id" "chromosome"
    suppressPackageStartupMessages(library(ggplot2))
    ggplot(EIDC,aes(x=chromosome))+
      geom_bar()
    

    Q8: 在上面染色体的基因数量条形图并列叠加差异基因数量条形图

    #### id的转换
    DEG_ensembl <- data.frame(rownames(nrDEG))
    colnames(DEG_ensembl) <- c("ensembl_id")
    DEG_E <- merge(DEG_ensembl,ENSEMBL,by="ensembl_id")
    DEG_EC <- merge(DEG_E,CHR,by="gene_id")
    colnames(DEG_EC) <- c("DEG_gene_id","DEG_ensembl_id","Chromosome")
    str(DEG_EC)
    
    head(EIDC)
    EIDC$DEG <- as.factor(ifelse(EIDC$ensembl_id %in% DEG_EC$DEG_ensembl_id,'DEG','NOT'))
    ggplot(EIDC,aes(x=chromosome,fill=DEG)) + 
      geom_bar()+
      theme_classic2()
    
    q9_chromosome.png

    Q9: 在oncolnc网页工具拿到GUL5基因在BRCA数据集的表达量及病人生存资料自行本地绘制生存分析图

    ##### 提示没有GUL5这个基因。。。。。以GULP1代替
    rm(list = ls())
    GULP1 <- read.csv('BRCA_51454_50_50.csv')
    suppressPackageStartupMessages(library(survminer))
    suppressPackageStartupMessages(library(survival))
    suppressPackageStartupMessages(library(ggstatsplot))
    colnames(GULP1)
    ggbetweenstats(data = GULP1, x='Status', y='Expression')
    
    vignette('survival')
    vignette('survminer')
    str(GULP1)
    head(GULP1)
    table(GULP1$Status)
    # Alive  Dead 
    # 871   135 
    GULP1$Status <- ifelse(GULP1$Status=='Dead',1,0)
    head(GULP1)
    table(GULP1$Status)
    fit <- surv_fit(Surv(Days, Status) ~ Group,
                    data = GULP1)
    b <- surv_pvalue(fit)
    b$pval
    ggsurvplot(fit,data = GULP1,pval =round(b$pval,3))
    
    gulp1_violin_plot.png survival_plot.png

    Q10: 在xena网页工具拿到GUL5基因在BRCA数据集的表达量及病人的PAM50分类并且绘制分类的boxplot

    ### 同样以GULP1代替
    GULP1 <- read.table('limma /denseDataOnlyDownload.tsv',sep = '\t',header = T,na.strings = '')
    colnames(GULP1)
    table(GULP1$PAM50_mRNA_nature2012)
    # Basal-like HER2-enriched     Luminal A     Luminal B   Normal-like 
    # 98            58           231           127             8 
    tail(GULP1)
    # sample         samples PAM50_mRNA_nature2012 GULP1
    # 1242 TCGA-E2-A1IO-11 TCGA-E2-A1IO-11                  <NA>    NA
    # 1243 TCGA-E2-A1LI-11 TCGA-E2-A1LI-11                  <NA>    NA
    # 1244 TCGA-E9-A1N8-11 TCGA-E9-A1N8-11                  <NA>    NA
    # 1245 TCGA-E9-A1NE-11 TCGA-E9-A1NE-11                  <NA>    NA
    # 1246 TCGA-E9-A1NH-11 TCGA-E9-A1NH-11                  <NA>    NA
    # 1247 TCGA-XX-A899-11 TCGA-XX-A899-11                  <NA>    NA
    
    table(complete.cases(GULP1))
    GULP1<- GULP1[complete.cases(GULP1),]
    ggplot(GULP1,aes(y=GULP1,x= PAM50_mRNA_nature2012,fill = PAM50_mRNA_nature2012))+
      geom_boxplot()+
      stat_boxplot(geom = 'errorbar',width = 0.1)+
      theme_classic2()
    
    pam50.png

    相关文章

      网友评论

        本文标题:R试题 可视化

        本文链接:https://www.haomeiwen.com/subject/fietsctx.html