美文网首页
3. Ballgown 流程

3. Ballgown 流程

作者: 吴十三和小可爱的札记 | 来源:发表于2021-05-07 17:42 被阅读0次

    Loading data

    ballgown 通过正则表达和自带函数可以实现RNAseq文件夹下所有/部分样本stringtie结果读取。

    library(ballgown)
    library(tidyverse)
    file <- "path\\to\\RNAseq\\"
    # load the sample information
    pheno_data <- read.table(paste0(file, "geuvadis_phenodata.csv"), header = TRUE, sep =',')
    
    # create a ballgown object
    ballgown_obj <- ballgown(dataDir = paste0(file, "ballgown"),  samplePattern = "", pData = pheno_data)
    
    # gexpr(), texpr(), eexpr(), and iexpr()
    gene_data <- gexpr(ballgown_obj)
    
    # Load all attributes including gene name
    trans_data <-  texpr(ballgown_obj, 'all')
    

    Exploratory Analysis

    通过查看转录本数量、分布、结构和样本间重复性,对数据进行探索性分析,了解数据结构和内容。

    转录本数量分布

    counts = data.frame(table(trans_data[, "gene_id"]))
    colnames(counts) <- c("gene_id", "numbers")
    ggplot(data = counts) + 
     geom_histogram(aes(x = numbers)) + 
     labs(title = "Distribution of transcript count per gene") +
     xlab(label = "Transcripts per gene") +
     theme_bw()
    

    转录本长度分布

    ggplot() + 
     geom_histogram(aes(x = trans_data$length)) + 
     labs(title = "Distribution of transcript lengths") +
     xlab(label = "Transcript length (bp)") +
     theme_bw()
    
    # Summarize statistics of FPKM values 
    # summary(trans_data[, length(trans_data)])
    

    转录本在基因上的结构

    可视化转录本或外显子在基因组上的位置和表达水平。

    plotTranscripts(gene = 'NM_000489', gown = ballgown_obj, samples = 'ERR188044_chrX', 
     meas = 'FPKM', colorby = 'transcript', 
     main='transcripts from gene NM_000489\nsample ERR188044, FPKM')
    

    处理低表达量

    一般有两种方法处理处理低表达量:

    1. set minimum non-zero FPKM

      • grabbing a copy of all data values,

      • coverting 0's to NA

      • calculating the minimum or all non NA values

      • min_nonzero = minimum

    2. set min_nonzero = 1

    数据重复性

    选择技术重复的样本做散点图,一般来说重复性好的数据是按照对角线对称分割的。

    min_nonzero = 1
    fpkm = texpr(ballgown_obj, meas="FPKM")
    colnames(fpkm) <- gsub("_chrX", "", pheno_data$ids)
    
    ERR188044 <- log2(fpkm[, "ERR188044"] + min_nonzero)
    ERR188104 <- log2(fpkm[, "ERR188104"] + min_nonzero)
    
    ggplot() + 
     geom_hex(aes(x = ERR188044, y = ERR188104)) + 
     geom_abline(intercept = 0, slope = 1, size = 1, color = "red") +
     xlab(label = "ERR188044") + 
     ylab(label = "ERR188104") + 
     theme_bw()
    

    处理低表达

    转录组数据存在大量低表达和0值,一般可以直接设定阈值过滤或者结合MatrixGenerics 包进行variance过滤。

    # variance filter
    # library(MatrixGenerics)
    # ballgown_filtted <- subset(ballgown_obj, "rowVars(texpr(ballgown_obj)) >1", genomesubset=TRUE)
    
    # or remove the lower transcripts with a threshold value
    fpkm = data.frame(fpkm)
    fpkm[ ,"Row_Sum"] <- rowSums(fpkm)
    
    judge_sum <- fpkm[ ,"Row_Sum"] > 5
    ballgown_filtted <- subset(ballgown_obj, "judge_sum", genomesubset=TRUE)
    
    index = which(fpkm[ ,"Row_Sum"] > 5)
    

    距离分析

    热图

    data_mat <-  fpkm[index, -ncol(fpkm)]
    
    pheatmap(mat = log10(data_mat + 1), show_rownames = FALSE) 
    

    共有转录本数量

    library(UpSetR)
    
    judge_data <- function(x){
      ifelse(x == 0, 0, 1)
    }
    
    data_mat <- fpkm[index, -ncol(fpkm)]
    upset_data <- data.frame(apply(data_mat, 2, judge_data))
    col_names <- colnames(upset_data)
    
    
    upset(upset_data, sets = col_names, 
          order.by = c("freq"),
          main.bar.color = "#009688")
    

    样本间相关性

    library(corrplot)
    data_mat <- fpkm[index, -ncol(fpkm)]
    mat <- cor(data_mat)
    
    corrplot(corr = mat, is.corr = FALSE, type = "upper", 
             diag = FALSE)
    

    MDS Plot

    data_mat <- fpkm[index, -ncol(fpkm)]
    dist_mat <- dist(t(data_mat))
    mds_data <- cmdscale(dist_mat)
    
    # scale the plot data
    mds_data <- as.data.frame(scale(mds_data))
    
    colnames(mds_data) <- c("MDS 1", "MDS 2")
    mds_data$name <- rownames(mds_data)
    
    ggplot(data = mds_data) + 
      geom_point(aes(x = `MDS 1`, y = `MDS 2`), size = 3) +
      geom_text(aes(x = `MDS 1` + 0.2, y =`MDS 2`, 
                    label = name), size = 2) + 
      theme_bw()
    

    PCA Analysis

    data_mat <- fpkm[index, -ncol(fpkm)]
    pca_fit <- prcomp(t(data_mat))
    
    components <- summary(pca_fit )$importance[2,c(1,2)]
    plot_data <- as.data.frame(pca_fit$x)
    plot_data$name <- row.names(plot_data)
    
    ggplot(data = plot_data) + geom_point(aes(x = PC1, y = PC2), 
                                          size = 2) +
      ggrepel::geom_text_repel(aes(x = PC1, y = PC2, label = name)) +
      xlab(label = paste0("PC 1 (", 
                          round(components[1], 4)*100, "%)")) + 
      ylab(label = paste0("PC 2 (", 
                          round(components[2], 4)*100, "%)")) +
      theme_bw()
    

    差异表达分析

    用于差异表达的矩阵是不经过数据转换的,一般是原始矩阵或者是筛选过低表达基因的原始矩阵。

    ballgown_filtted <- subset(ballgown_obj, "judge_sum", genomesubset=TRUE)
    
    results_transcripts <- stattest(ballgown_filtted,
                                    feature = "transcript",
                                    covariate = "population",
                                    getFC = TRUE, 
                                    meas = "FPKM")
    # combine gene ids
    results_transcripts <- data.frame(
      geneNames = geneNames(ballgown_filtted),
      geneIDs = geneIDs(ballgown_filtted), 
      results_transcripts)
      
    attach(results_transcripts)
    results_transcripts$threshold <- ifelse(pval < 0.05 & abs(log2(fc)) > 5,ifelse(log2(fc) > 5, 'up', 'down'), 'no')
    detach()
    
    ggplot(results_transcripts, aes(log2(fc), -log10(pval))) +
      geom_point(aes(color = threshold)) +
      scale_color_manual(values=c("#303F9F", "#757575", "#FF5252"))+
      theme_bw()+
      xlab(label = expression(log[2] (fold-change))) + 
      ylab(label = expression(-log[10] (p-value)))
    

    Reference

    Ballgown:https://www.bioconductor.org/packages/release/bioc/vignettes/ballgown/inst/doc/ballgown.html

    相关文章

      网友评论

          本文标题:3. Ballgown 流程

          本文链接:https://www.haomeiwen.com/subject/eaxtkltx.html