美文网首页差异分析scRNA-seq复现
复现1:AD的GWAS位点关联基因挖掘

复现1:AD的GWAS位点关联基因挖掘

作者: 小贝学生信 | 来源:发表于2021-05-10 17:03 被阅读0次
    • Title:Mapping Alzheimer's Disease Variants to Their Target Genes Using Computational Analysis of Chromatin Configuration
    • PMID:31984958
    • Data/Code availability:Yes
    • IF:1.163
    • 其它:paper来自JOVE的视频期刊,第一次了解,感觉挺有意思的
    • 分析思路:如下,思路较为清晰明了,简单易行。首先根据其它文章里的AD GWAS数据找到关联基因(外显子、启动子、hiC的enhancer),其中通过hiC的enhancer,寻找关联基因是本文的亮点所在。然后就是根据关联基因进行富集分析、不同AD分组差异表达分析、细胞类型差异表达分析。


      paper思路
    • 代码实操(R version 4.0.3 )

    0、安装包、下载数据

    (1) 相关R包

    BiocManager::install("GenomicRanges")
    BiocManager::install("biomaRt")
    BiocManager::install("WGCNA")
    install.packages("reshape")
    install.packages("ggplot2")
    install.packages("corrplot")
    install.packages("gProfileR")
    install.packages("tidyverse")
    install.packages("ggpubr")
    

    (2)所有数据均可下载

    1、制作GRanges对象(SNP、exon、promoter、HiC-enhancer)

    • GenomicRanges包的GRanges()函数创建
    library(GenomicRanges) 
    #SNP
    credSNP = read.delim("Supplementary_Table_8_Jansen.txt", header=T,encoding = "UTF-8") 
    #2357   19
    credSNP = credSNP[credSNP$Credible.Causal=="Yes",]
    #800    19
    credranges = GRanges(credSNP$Chr, IRanges(credSNP$bp, credSNP$bp), rsid=credSNP$SNP, P=credSNP$P) 
    
    #promoter and exonic region
    exon = read.table("Gencode19_exon.bed")
    exonranges = GRanges(exon[,1],IRanges(exon[,2],exon[,3]),gene=exon[,4])
    promoter = read.table("Gencode19_promoter.bed")
    promoterranges = GRanges(promoter[,1], IRanges(promoter[,2], promoter[,3]), gene=promoter[,4])
    
    #HiC region
    hic = read.table("Promoter-anchored_chromatin_loops.bed", skip=1)
    colnames(hic) = c("chr", "TSS_start", "TSS_end", "Enhancer_start", "Enhancer_end")
    hicranges = GRanges(hic$chr, IRanges(hic$TSS_start, hic$TSS_end), enhancer=hic$Enhancer_start)
    

    2、寻找SNP与其它三种feature的Overlap

    • GenomicRanges包的findOverlaps()函数找overlap
    • findOverlaps(A, B)。其中A对应结果的queryHits,B对应结果的subjectHits
    #(1)Overlap credible SNPs with exonic regions
    olap = findOverlaps(credranges, exonranges)
    credexon = credranges[queryHits(olap)]
    #add gene info
    mcols(credexon) = cbind(mcols(credexon), mcols(exonranges[subjectHits(olap)]))
    #GRanges object with 42 ranges and 3 metadata columns:
    #       seqnames    ranges strand |        rsid         P            gene
    #          <Rle> <IRanges>  <Rle> | <character> <numeric>     <character>
    #   [1]        1 161155392      * |   rs4575098  2.05e-10 ENSG00000158859
    #   [2]        1 161156033      * |  rs11585858  5.58e-10 ENSG00000158859
    #   [3]        1 207795320      * |   rs2296160  1.66e-17 ENSG00000203710
    
    
    #(2)Overlap credible SNPs with promoter regions
    olap = findOverlaps(credranges, promoterranges)
    credpromoter = credranges[queryHits(olap)]
    # add gene info
    mcols(credpromoter) = cbind(mcols(credpromoter), 
    mcols(promoterranges[subjectHits(olap)]))
    #GRanges object with 236 ranges and 3 metadata columns:
    #        seqnames    ranges strand |        rsid         P            gene
    #           <Rle> <IRanges>  <Rle> | <character> <numeric>     <character>
    #    [1]        2 234068476      * |  rs35349669  4.85e-09 ENSG00000168918
    #    [2]        2 234068476      * |  rs35349669  4.85e-09 ENSG00000168918
    #    [3]        2 234068705      * |  rs28669088  8.84e-09 ENSG00000168918
    
    #(3)Overlap credible SNPs with hic-enhancer
    ## 先找hic interaction里与promoter存在overlap的loop
    olap = findOverlaps(hicranges, promoterranges) 
    hicpromoter = hicranges[queryHits(olap)]
    mcols(hicpromoter) = cbind(mcols(hicpromoter), mcols(promoterranges[subjectHits(olap)]))
    hicenhancer = GRanges(seqnames(hicpromoter), IRanges(hicpromoter$enhancer, hicpromoter$enhancer+10000), gene=hicpromoter$gene)
    ## 再基于上一步结果寻找与含promoter的hic的另一端"enhancer"存在重叠的variant
    olap = findOverlaps(credranges, hicenhancer)
    credhic = credranges[queryHits(olap)]
    #add gene info
    mcols(credhic) = cbind(mcols(credhic), mcols(hicenhancer[subjectHits(olap)]))
    
    • Integrate the 3 types GWAS targeted gene
    ADgenes = Reduce(union, list(credhic$gene, credexon$gene, credpromoter$gene))
    ### to convert Ensembl Gene ID to HGNC symbol load("geneAnno.rda")
    load("geneAnno2.rda")
    ADhgnc = geneAnno1[match(ADgenes, geneAnno1$ensembl_gene_id), "hgnc_symbol"]
    ADhgnc = ADhgnc[ADhgnc!=""]
    save(ADgenes, ADhgnc, file="ADgenes.rda")
    write.table(ADhgnc, file="ADgenes.txt", row.names=F, col.names=F, quote=F, sep="\t")
    
    112 genes
    library(reshape2)
    library(ggplot2)
    library(GenomicRanges)
    library(biomaRt) 
    library(WGCNA)
    

    3、112个AD risk基因在Bulk RNA-seq的分组比较

    #(1)整理表达矩阵
    datExpr = read.csv("expression_matrix.csv", header = FALSE) 
    datExpr = datExpr[,-1]  #17604  492
    datMeta = read.csv("columns_metadata.csv") 
    datProbes = read.csv("rows_metadata.csv")
    datExpr = datExpr[datProbes$ensembl_gene_id!="",]   #17085  492
    datProbes = datProbes[datProbes$ensembl_gene_id!="",]
    
    #利用WGCGA包的collapseRows函数,针对重复基因的测序数据,取代表性的一个
    datExpr.cr = collapseRows(datExpr, rowGroup = datProbes$ensembl_gene_id, rowID= rownames(datExpr))
    class(datExpr.cr)
    datExpr = datExpr.cr$datETcollapsed
    gename = data.frame(datExpr.cr$group2row)
    rownames(datExpr) = gename$group
    #16279  492
    
    # (2)分组信息
    #Specify developmental stages
    datMeta$Unit = "Postnatal"
    idx = grep("pcw", datMeta$age) #postconceptional weeks
    datMeta$Unit[idx] = "Prenatal" #产前
    
    idx = grep("yrs", datMeta$age)
    datMeta$Unit[idx] = "Postnatal" #产后
    datMeta$Unit = factor(datMeta$Unit, levels=c("Prenatal", "Postnatal"))
    
    # (3)选择大脑皮层区域的测序样本
    datMeta$Region = "SubCTX"
    r = c("A1C", "STC", "ITC", "TCx", "OFC", "DFC", "VFC", "MFC", "M1C", "S1C", "IPC", "M1C-S1C", "PCx", "V1C", "Ocx")
    datMeta$Region[datMeta$structure_acronym %in% r] = "CTX"
    datExpr = datExpr[,which(datMeta$Region=="CTX")]
    #16279  345
    datMeta = datMeta[which(datMeta$Region=="CTX"),]
    #345  10
    
    # (4)观察112个risk gene在两组的平均表达差异
    load("ADgenes.rda")
    #计算两组的均值(postnatal cortex与prenatal cortex)
    exprdat = apply(datExpr[match(ADgenes, rownames(datExpr)),],2,mean,na.rm=T)
    dat = data.frame(Region=datMeta$Region, Unit=datMeta$Unit, Expr=exprdat)
    
    ggplot(dat,aes(x=Unit, y=Expr, fill=Unit, alpha=Unit)) + 
        geom_boxplot(outlier.size = NA) + 
        ggtitle("Brain Expression") + ylab("Normalized expression") + 
        xlab("") + scale_alpha_manual(values=c(0.2, 1)) + 
        theme_classic() + theme(legend.position="na")
    

    4、观察112个risk gene在不同细胞类型的表达差异

    #scRNA-seq 表达矩阵整理
    cellexp = read.table("DER-20_Single_cell_expression_processed_TPM_backup.tsv",header=T,fill=T) 
    cellexp[1121,1] = cellexp[1120,1] 
    cellexp = cellexp[-1120,] 
    rownames(cellexp) = cellexp[,1] 
    cellexp = cellexp[,-1] 
    datExpr = scale(cellexp,center=T, scale=F) 
    datExpr = datExpr[,789:ncol(datExpr)] #15086    3461
    
    #抽取112个gene的表达信息
    targetname = "AD"
    targetgene = ADhgnc
    exprdat = apply(datExpr[match(targetgene, rownames(datExpr)),],2,mean,na.rm=T)
    dat = data.frame(Group=targetname, cell=names(exprdat), Expr=exprdat)
    
    #整理细胞类型
    dat = dat[-grep("Ex|In",dat$celltype),] #不要神经元细胞
    dat$celltype = gsub("Dev","Fetal",dat$celltype) #Dev替换为Fetal
    dat$celltype = factor(dat$celltype, levels=c("Neurons","Astrocytes","Microglia","Endothelial", "Oligodendrocytes","OPC","Fetal"))
    
    ggplot(dat,aes(x=celltype, y=Expr, fill=celltype)) +
        ylab("Normalized expression") + xlab("") + 
        geom_violin() + 
        theme(axis.text.x=element_text(angle = 90, hjust=1)) + 
        theme(legend.position="none") +
        ggtitle(paste0("Cellular expression profiles of AD risk genes"))
    #AD risk genes are highly expressed in microglia
    
    • 如下图结果展示,有多种多样的基因富集思路可供选择。


    4、112个AD risk基因的富集分析(homer软件)

    • 4.1 安装、下载、分析
    #用conda安装
    conda install -c bioconda homer
    #下载背景data
    perl /home/shensuo/miniconda3/envs/tmp/share/homer/.//configureHomer.pl -install
    perl /home/shensuo/miniconda3/envs/tmp/share/homer/.//configureHomer.pl -install human-p 
    perl /home/shensuo/miniconda3/envs/tmp/share/homer/.//configureHomer.pl -install human-o
    #富集分析
    findGO.pl ~/work/ADgenes.txt human ./go/ -cpu 5
    
    富集分析结果
    • 4.2 可视化
    library(ggpubr)
    plot_barplot = function(dbname,name,color){
        input = read.delim(paste0(dbname,".txt"),header=T) 
        input = input[,c(-1,-10,-11)] 
        input = unique(input)
        input$FDR = p.adjust(exp(input$logP)) 
        input_sig = input[input$FDR < 0.1,] 
        input_sig$FDR = -log10(input_sig$FDR) 
        input_sig = input_sig[order(input_sig$FDR),]
        p = ggbarplot(input_sig, x = "Term", y = "FDR", fill = color, 
            color = "white", sort.val = "asc", 
            ylab = expression(-log[10](italic(FDR))), xlab = paste0(name," Terms"), 
            rotate = TRUE, label = paste0(input_sig$Target.Genes.in.Term,"/",input_sig$Genes.in.Term), 
            font.label = list(color = "white", size = 9), lab.vjust = 0.5, lab.hjust = 1)
        p = p+geom_hline(yintercept = -log10(0.05), linetype = 2, color = "lightgray")
        return(p)
    }
    #以GO结果为例
    plot_barplot("biological_process","GO Biological Process","#00AFBB") 
    
    GO enrichment

    以上就是本篇paper全部的分析流程,的确很简单,对于我来说的收获有

    • 1、GWAS的SNP与HiC的联合分析思路
    • 2、Homer软件进行基因富集分析的强大之处(进一步学习)
    • 3、WGCNA包里的collapseRows()函数处理重复基因测序的方法
    • 4、GenomicRanges包找重叠区域的方法
      .......

    相关文章

      网友评论

        本文标题:复现1:AD的GWAS位点关联基因挖掘

        本文链接:https://www.haomeiwen.com/subject/wfnbdltx.html