RNA-seq分析流程

作者: chaos_z | 来源:发表于2021-06-23 09:47 被阅读0次

    1、质量控制:

    fastqc -o ./ -t 1 /public/home/zyhu/hfd/${i}_R1_001.fastq.gz /public/home/zyhu/hfd/${i}_R2_001.fastq.gz
    
    合并质控结果
    multiqc ./
    
    Trimmomatic修剪去掉前端低质量碱基
    java -jar \$EBROOTTRIMMOMATIC/trimmomatic-0.32.jar PE -phred33 ${i}_R1_001.fastq.gz ${i}_R2_001.fastq.gz \
    ${i}_R1_001_paired.fastq.gz ${i}_R1_001_unpaired.fastq.gz ${i}_R2_001_paired.fastq.gz ${i}_R2_001_unpaired.fastq.gz \
    LEADING:3 TRAILING:3 HEADCROP:10 SLIDINGWINDOW:4:15 MINLEN:36
    

    2、比对到参考基因组(hisat2)

    hisat2 -p 4 --dta -x /public/home/zyhu/Mouse/reference/GRCm39 -1 /public/home/zyhu/Mouse_RNA-seq/02.trim/${i}_R1_001_paired.fastq.gz -2 /public/home/zyhu/Mouse_RNA-seq/02.trim/${i}_R2_001_paired.fastq.gz -S ${i}.sam
    

    3、featureCounts计数(meta层面)

    featureCounts -T 4 -a /public/home/zyhu/Mouse/annotation/Mus_musculus.GRCm39.103.gtf -o /public/home/zyhu/Mouse_RNA-seq/06.featureCounts/meta-featureCounts/featureCounts.txt -p -B -C -t exon -g gene_id *.bam
    
    提取基因名和reads count
    cut -f 1,7-41 featureCounts.txt|grep -v '#' > A_B_C_M.matrix.txt
    

    4、差异分析

    rm(list=ls())
    options(stringsAsFactors = F)
    library('rio')
    library('ggplot2')
    library('DESeq2')
    library('pheatmap')
    library('apeglm')
    CountMatrix <- read.table("C:/Users/Cool-N/OneDrive/桌面/A_B_C_M.matrix.txt",header = T,row.names= 1)
    colnames(CountMatrix) <- c("A_10_S37","A_1_S30","A_2_S31","A_3_S32","A_4_S33","A_6_S34","A_7_S35","A_9_S36","B_1_S38","B_2_S39","B_3_S40","B_4_S41","B_5_S42","B_6_S43","B_7_S44","B_9_S45","C_2_S46","C_3_S47","C_4_S48","C_5_S49","C_6_S50","C_7_S51","C_8_S52","C_9_S53","M_2_S54","M_3_S55","M_4_S56","M_5_S57","M_6_S58","M_7_S59","M_8_S60","M_9_S61")
    countdata = CountMatrix[,c(1:8,17:24)]
    #保留非全0的基因
    countdata <- countdata[rowSums(countdata) > 0,]
    condition = factor(c(rep("A", 8), rep("C", 8)), levels = c("A","C"))
    coldata = data.frame(row.names = colnames(countdata), condition)
    dds = DESeqDataSetFromMatrix(countdata, colData = coldata, design = ~ condition)
    
    #PCA分析--其他包
    # library(ggord)
    # library(FactoMineR)
    # dat <- as.data.frame(t(countdata))
    # dat.pca <- PCA(dat, graph = FALSE)
    # ggord(dat.pca,group_list,coord_fix = FALSE,
    #      arrow = 0,vec_ext = 0,txt = NULL)
    # ggsave("PCA.pdf")
    
    
    #PCA--使用DESeq2的plotPCA包;样本数<30:rlog; >30:vst(归一化)
    rld<-rlog(dds,blind=FALSE)
    assay(rld) <- limma::removeBatchEffect(assay(rld), rld$condition)#去除批次效应,可选项
    pdf("PCA_plot.pdf")
    plotPCA(rld,intgroup = "condition")
    dev.off()
    
    
    #差异表达分析
    dds = DESeq(dds)
    #sizeFactors(dds)
    res <- results(dds)
    res <- res[order(res$padj),]
    #table(res$padj<0.01)
    #将DEG转换为数据框格式,并去掉含NA的行
    DEG <- as.data.frame(res)
    DEG <- na.omit(DEG)
    
    #################火山图##############
    library(ggrepel)
    #确定差异表达倍数,设置动态阈值,也可以设置静态阈值,例如log2FoldChange=1或2
    logFC_t <- with(DEG,mean(abs(log2FoldChange)) + 2*sd(abs(log2FoldChange)))
    #取前两位小数
    logFC_t <- round(logFC_t, 2)
    #确定上下调基因
    DEG$change = as.factor(ifelse(DEG$padj < 0.05 & abs(DEG$log2FoldChange) > logFC_t,
                                  ifelse(DEG$log2FoldChange > logFC_t ,'UP','DOWN'),'STABLE'))
    #table(DEG$change)
    DEG$symbol <- rownames(DEG)
    #设置阈值显示超过阈值的基因名
    DEG$label <- ifelse(DEG$padj < 0.00001 & abs(DEG$log2FoldChange) >= 3,DEG$symbol,"")
    #画火山图
    pdf("volcano_plot.pdf")
    ggplot(DEG, aes(x=log2FoldChange, y=-log10(padj),color=change)) + 
      geom_point(alpha=0.4, size=2) + 
      theme_classic() + 
      xlab("log2 fold change") +
      ylab("-log10 padj") +
      theme(plot.title = element_text(size=15,hjust = 0.5)) + 
      scale_colour_manual(values = c('blue','black','red')) +
      geom_hline(yintercept = -log10(0.05), lty = 2) +
      geom_vline(xintercept = c(-logFC_t, logFC_t), lty = 2) +
      geom_label_repel(data = DEG, aes(label = label),
                       size = 3,box.padding = unit(0.5, "lines"),
                       point.padding = unit(0.8, "lines"),
                       segment.color = "black",
                       show.legend = FALSE, max.overlaps = 10000)
    dev.off()
    
    #############    pheatmap热图    ############
    dat <- countdata
    FC <- DEG$log2FoldChange
    names(FC) <- rownames(DEG)
    #排序差异倍数,提取前100和后100的基因名
    DEG_200 <- c(names(head(sort(FC),100)),names(tail(sort(FC),100)))
    #将提取的基因归一化
    dat <- t(scale(t(dat[DEG_200,])))
    #添加注释条
    group <- data.frame(group=condition)
    rownames(group)=colnames(dat)
    pdf("pheatmap_plot.pdf")
    pheatmap(dat, cluster_cols = T,
             show_colnames =F,show_rownames = F,
             annotation_col=group,
             color = colorRampPalette(c("navy", "white", "firebrick3"))(50))
    dev.off()
    
    #大于2的赋值为2
    dat[dat > 2] <- 2
    #低于2的赋值为-2
    dat[dat < -2] <- -2
    pdf("pheatmap_alter_plot.pdf")
    pheatmap(dat, cluster_cols = T,
             show_colnames =F,show_rownames = F,
             annotation_col=group,
             color = colorRampPalette(c("navy", "white", "firebrick3"))(50))
    dev.off()
    
    ###########    KEGG富集分析     ##########
    library('dplyr')
    library(org.Mm.eg.db)#选取物种数据库
    library(clusterProfiler)
    #### 第一步,确定上下调基因,已经完成
    #### 第二步,从org.Mm.eg.db提取ENSG的ID和GI号对应关系
    # 如果ENSG的ID具有小数点,用下面两句代码舍掉小数点
    # library(stringr)
    # rownames(DEG) <- str_sub(rownames(DEG), start = 1, end = 15)
    
    #DEG添加一列ENSEMBL
    DEG$ENSEMBL <- rownames(DEG)
    # bitr in clusterProfiler,将ENSEMBL的ID转换为GI号保存为df
    df <- bitr(rownames(DEG),
               fromType = "ENSEMBL",
               toType = "ENTREZID",
               OrgDb = org.Mm.eg.db)
    
    #### 第三步,将GI号合并到DEG数据框内
    DEG <- inner_join(DEG, df, by="ENSEMBL")
    #### 第四步,提取上调和下调的GI集
    gene_up <- DEG[DEG$change == "UP", "ENTREZID"] 
    gene_down <- DEG[DEG$change == "DOWN", "ENTREZID"]
    gene_diff <- c(gene_up, gene_down)
    gene_all <- as.character(DEG[,"ENTREZID"])
    
    geneList <- DEG$log2FoldChange
    names(geneList) <- DEG$ENTREZID
    geneList <- sort(geneList, decreasing = T)
    geneList <- geneList[geneList != 0]
    
    ## KEGG pathway analysis
    kk.up <- enrichKEGG(gene          =  gene_up,
                        organism      =  "mmu",
                        universe      =  gene_all,
                        pvalueCutoff  =  0.8,
                        qvalueCutoff  =  0.8)
    kk.down <- enrichKEGG(gene          =  gene_down,
                          organism      =  "mmu",
                          universe      =  gene_all,
                          pvalueCutoff  =  0.8,
                          qvalueCutoff  =  0.8)
    
    kegg_down_dt <- as.data.frame(kk.down)
    kegg_up_dt <- as.data.frame(kk.up)
    down_kegg <- kegg_down_dt[kegg_down_dt$pvalue < 0.05,]
    down_kegg$group = -1
    up_kegg <- kegg_up_dt[kegg_up_dt$pvalue < 0.05,]
    up_kegg$group = 1
    
    dat <- rbind(up_kegg, down_kegg)
    dat$pvalue <- -log10(dat$pvalue)
    dat$pvalue <- dat$pvalue * dat$group
    
    dat <- dat[order(dat$pvalue, decreasing = F),]
    pdf("KEGG.pdf")
    ggplot(dat, aes(x = reorder(Description, order(pvalue, decreasing=F)), y = pvalue, fill = group)) + 
      geom_bar(stat = "identity") + 
      scale_fill_gradient(low = "blue", high = "red", guide = FALSE) + 
      scale_x_discrete(name = "Pathway names") +
      scale_y_continuous(name = "log10P-value") +
      coord_flip() + theme_bw() + theme(plot.title = element_text(hjust = 0.5)) +
      ggtitle("Pathway Enrichment")
    dev.off()
    
    ############   GO富集分析   #########
    ego_up <- enrichGO(gene          = gene_up,
                       OrgDb         = org.Mm.eg.db,
                       ont           = "ALL" ,
                       universe      = gene_all,
                       pAdjustMethod = "BH",
                       pvalueCutoff  = 0.99,
                       qvalueCutoff  = 0.99,
                       readable      = TRUE)
    head(ego_up[,1:6])
    ego_up_result <- ego_up@result[order(ego_up@result$pvalue),]
    ego_up_sig <- rbind(head(subset(ego_up_result, ONTOLOGY == "BP"), 10),
                        head(subset(ego_up_result, ONTOLOGY == "CC"), 10),
                        head(subset(ego_up_result, ONTOLOGY == "MF"), 10))
    ego_up_sig$pvalue <- -log10(ego_up_sig$pvalue)
    ego_up_sig$Description <- factor(ego_up_sig$Description, levels = as.character(rev(ego_up_sig$Description)))
    pdf("GO_up.pdf")
    ggplot(ego_up_sig, aes(x = Description, y = pvalue, fill = ONTOLOGY)) +
      geom_col() +
      coord_flip() +
      theme_classic() +
      facet_grid(ONTOLOGY~., scales = "free") +
      xlab("Up regulated")
    dev.off()
    
    ego_down <- enrichGO(gene          = gene_down,
                         OrgDb         = org.Mm.eg.db,
                         ont           = "ALL" ,
                         universe      = gene_all,
                         pAdjustMethod = "BH",
                         pvalueCutoff  = 0.99,
                         qvalueCutoff  = 0.99,
                         readable      = TRUE)
    head(ego_down[,1:6])
    ego_down_result <- ego_down@result[order(ego_down@result$pvalue),]
    ego_down_sig <- rbind(head(subset(ego_down_result, ONTOLOGY == "BP"), 10),
                          head(subset(ego_down_result, ONTOLOGY == "CC"), 10),
                          head(subset(ego_down_result, ONTOLOGY == "MF"), 10))
    ego_down_sig$pvalue <- -log10(ego_down_sig$pvalue)
    ego_down_sig$Description <- factor(ego_down_sig$Description, levels = as.character(rev(ego_down_sig$Description)))
    pdf("GO_down.pdf")
    ggplot(ego_down_sig, aes(x = Description, y = pvalue, fill = ONTOLOGY)) +
      geom_col() +
      coord_flip() +
      theme_classic() +
      facet_grid(ONTOLOGY~., scales = "free") +
      xlab("Down regulated")
    dev.off()
    

    相关文章

      网友评论

        本文标题:RNA-seq分析流程

        本文链接:https://www.haomeiwen.com/subject/xsmxeltx.html