RNA-seq后续count处理

作者: pudding815 | 来源:发表于2023-04-10 22:30 被阅读0次
    ---------------------------------加载R包-------------------------
    library(DESeq2)
    library(pheatmap)
    library(corrplot)
    library(EnhancedVolcano)
    library(ggplot2)
    library(ggsci)
    library(Vennerable)
    library(rio)
    library(matrixStats)
    library(BiocGenerics)
    library(biomaRt)
    library(limma)
    library(dplyr)
    library(tidyverse)
    library(tibble)
    
    ----------------------count ------------------------------------------------
    count <- read.table('C:/High-throughput sequencing/LH0-4-12h/6sample_count.txt',header = T,sep = '\t')
    count <- count[,-c(2:5)]
    meta <- count[,1:2]
    rownames(count) <- count$Geneid
    count <- count[,-c(1:2)]
    colnames(count) <- c("LH0h_1","LH0h_2","LH4h_1","LH4h_2","LH12h_1","LH12h_2")
    
    
    ---------------------gene information---------------------------------
    gene.info <- read.table('mouse.ref102.gene.type.gtf',header = F,sep = '\t')
    gene.info<-gene.info[,c(5:7)]
    colnames(gene.info) <- c('geneid','symbol','biotype')
    colnames(meta) <- c('geneid','length')
    meta <- meta %>% left_join(y = gene.info,by = 'geneid')
    
    -------------------------get TPM--------------------------------
    kb <- meta$Length / 1000
    fpkm <- count / kb
    tpm <- t(t(fpkm)/colSums(fpkm) * 1000000)
    tpm <- round(tpm,2) %>% as.data.frame()
    
    ----------------clean data-----------------------------
    pick.genes <- unique(c(rownames(tpm[matrixStats::rowMedians(as.matrix(tpm[,1:6]), cols = 1:2) > 0.5, ]),
                           rownames(tpm[matrixStats::rowMedians(as.matrix(tpm[,1:6]), cols = 3:4) > 0.5, ]),
                           rownames(tpm[matrixStats::rowMedians(as.matrix(tpm[,1:6]), cols = 5:6) > 0.5, ])
                           ))
    
    count.clean <- count[pick.genes,]
    fpkm.clean <- fpkm[pick.genes,]
    tpm.clean <- tpm[pick.genes,]
    count.clean <- rownames_to_column(count.clean,var = "row_names")
    colnames(count.clean) <- c("geneid","LH0h_1","LH0h_2","LH4h_1","LH4h_2","LH12h_1","LH12h_2")
    fpkm.clean <- rownames_to_column(fpkm.clean,var = "row_names")
    colnames(fpkm.clean) <- c("geneid","LH0h_1","LH0h_2","LH4h_1","LH4h_2","LH12h_1","LH12h_2")
    tpm.clean <- rownames_to_column(tpm.clean,var = "row_names")
    colnames(tpm.clean) <- c("geneid","LH0h_1","LH0h_2","LH4h_1","LH4h_2","LH12h_1","LH12h_2")
    
    count.clean <- count.clean %>% left_join(y = meta,by = 'geneid')
    fpkm.clean <- fpkm.clean %>% left_join(y = meta,by = 'geneid')
    tpm.clean <- tpm.clean %>% left_join(y = meta,by = 'geneid')
    
    count.clean.pcg <- count.clean[count.clean$biotype == "protein_coding",]
    fpkm.clean.pcg  <- fpkm.clean[fpkm.clean$biotype == "protein_coding",]
    tpm.clean.pcg   <- tpm.clean[tpm.clean$biotype == "protein_coding",]
    
    有重复基因,根据表达量删除低的
    which(count.clean.pcg$symbol=="St6galnac2")
    
    count.clean.pcg <- count.clean.pcg[-c(2719,6414,10370,14222),]
    fpkm.clean.pcg  <- fpkm.clean.pcg[-c(2719,6414,10370,14222),]
    tpm.clean.pcg   <-tpm.clean.pcg[-c(2719,6414,10370,14222),]
    
    rownames(count.clean.pcg) <- count.clean.pcg$symbol
    rownames(fpkm.clean.pcg)  <- fpkm.clean.pcg$symbol
    rownames(tpm.clean.pcg)   <- tpm.clean.pcg$symbol
    
    count.clean.pcg <- count.clean.pcg[,2:7]
    fpkm.clean.pcg  <- fpkm.clean.pcg[,2:7]
    tpm.clean.pcg   <- tpm.clean.pcg[,2:7]
    write.csv(count.clean.pcg,'count_clean_pcg.csv')
    write.csv(fpkm.clean.pcg,'fpkm_clean_pcg.csv')
    write.csv(tpm.clean.pcg,'tpm_clean_pcg.csv')
    

    差异分析

    ------------------------- DEG analysis-----------------------
    padj_cutoff <- 0.05
    group_list = c(rep('h0',2),rep("h4",2))
    
    colData = data.frame(row.names = colnames(count.clean.pcg[,1:4]),group_list = group_list)
    dds <- DESeqDataSetFromMatrix(countData = count.clean.pcg[,1:4],colData = colData,design = ~group_list)
    rld <- rlog(dds, blind=TRUE)
    dds <- DESeq(dds)
    plotDispEsts(dds)
    RES <- results(dds, contrast = c('group_list','h4','h0'),name = 'h4_vs_h0',alpha = 0.05) 前处后对
    res_tbl <- RES %>%data.frame() %>%rownames_to_column(var="geneid") %>%as_tibble() %>%arrange(padj)
    res_tbl<-res_tbl %>% left_join(y = gene.info,by = 'geneid')
    write.csv(res_tbl,"h4_vs_h0_DEseq2_res.csv")
    
    ---------------log2FC pvaule------
    sig_res <- dplyr::filter(res_tbl, padj < padj_cutoff) %>%
      dplyr::arrange(padj)
    write.csv(sig_res,"h4_vs_h0_sig_res0.05.csv")
    pvalue<-sig_res$pvalue<0.05
    up<-sig_res$log2FoldChange>2
    down<-sig_res$log2FoldChange<(-2)
    sig_res$change<-ifelse(pvalue&up,"up",ifelse(pvalue&down,"down","none"))
    write.csv(sig_res,"h4_vs_h0_DEG_p0.05_FC2.csv")
    

    功能分析

    -----------------------enrichment-method2: clusterprofiler---------------------------
    library(clusterProfiler)
    library(org.Mm.eg.db)
    ms.up <- factor(na.omit(sig_res[sig_res$change == 'up',]$geneid))
    ms.up <- bitr(ms.up,fromType = 'SYMBOL',toType = 'ENTREZID',OrgDb = org.Mm.eg.db)
    ms.down <- factor(na.omit(sig_res[sig_res$change == 'up',]$geneid ))
    ms.down <- bitr(ms.down,fromType = 'SYMBOL',toType = 'ENTREZID',OrgDb = org.Mm.eg.db)
    bg.genes <- bitr(sig_res$geneid ,fromType = 'SYMBOL',toType = 'ENTREZID',OrgDb = org.Mm.eg.db )
    
    
    ms.up.go = enrichGO(  gene = as.character(ms.up$ENTREZID),
                          universe = as.character(bg.genes$ENTREZID),
                          OrgDb = org.Mm.eg.db,
                          ont = 'BP',
                          pAdjustMethod = 'BH',
                          pvalueCutoff = 0.05,
                          qvalueCutoff = 0.1,
                          readable = TRUE)
    dotplot(ms.up.go,showCategory = 20)
    up.go_result_BP <- as.data.frame(ms.up.go)
    write.table(up.go_result_BP, "up.go_result_BP.xls", quote = F, sep = "\t", col.names = T, row.names = F)
    

    相关文章

      网友评论

        本文标题:RNA-seq后续count处理

        本文链接:https://www.haomeiwen.com/subject/anlwddtx.html