1.目的
将基因组的区间(比如sequencing reads, peaks)对应到对应的基因上面。
2.方法
library(rtracklayer)
library(GenomicFeatures)
########################################################################
# import the BED data
twist.bed = import.bed("~/Soft/bedfile/hg19-research-0317geneplus.merge.bed")
twist.bed
head(twist.bed)
#GRanges object with 6 ranges and 0 metadata columns:
# seqnames ranges strand
# <Rle> <IRanges> <Rle>
# [1] chr1 2487957-2488311 *
# [2] chr1 2489027-2489407 *
########################################################################
#Getting Gene Features
#load genomic information
library(BSgenome.Hsapiens.1000genomes.hs37d5)
genome <- BSgenome.Hsapiens.UCSC.hg19
#head(seqlengths(genome))
#Read GTF annotation information.
gtf_file = "CNVs/gencode.v27lift37.annotation.gtf"
txdb <- makeTxDbFromGFF(file=gtf_file, chrominfo=seqinfo(genome), organism="Homo sapiens")
#Also can load annotation information from knowngenes.
#library(TxDb.Hsapiens.UCSC.hg19.knownGene)
#txdb = TxDb.Hsapiens.UCSC.hg19.knownGene
#Get gene ranges.
genes = genes(txdb)
# GRanges object with 60424 ranges and 0 metadata columns:
# seqnames ranges strand
# <Rle> <IRanges> <Rle>
# ENSG00000000003.14_2 chrX 99882106-99894988 -
# ENSG00000000005.5_2 chrX 99839799-99854882 +
# ENSG00000000419.12_2 chr20 49551404-49575092 -
########################################################################
#find the overlap between beds and genes
hits = findOverlaps(twist.bed, genes)
merge.bed = twist.bed[queryHits(hits), ]
mcols(merge.bed) = mcols(genes[subjectHits(hits) ,])
merge.bed
# GRanges object with 6631 ranges and 1 metadata column:
# seqnames ranges strand | gene_id
# <Rle> <IRanges> <Rle> | <character>
# [1] chr1 2487957-2488311 * | ENSG00000157873
# [2] chr1 2487957-2488311 * | ENSG00000238164
# [3] chr1 2489027-2489407 * | ENSG00000157873
3.参考文献
- Assigning peaks to genes: https://research.stowers.org/cws/CompGenomics/Tutorial/peak_assignment.html
网友评论