美文网首页基因组坐标区间操作
【R - bioconducter练习】

【R - bioconducter练习】

作者: caokai001 | 来源:发表于2019-03-25 21:21 被阅读33次

    Before start, Make sure following packages are already installed:
    BSgenome, GenomicFeatures, GenomicRanges, BSgenome.Hsapiens.UCSC.hg19

    library(BSgenome)
    available.genomes()
    library(BSgenome.Hsapiens.UCSC.hg19)

    #####################################################

    1. explore sequence composition of fly genome

    #####################################################

    ### get sequence for chromosome 1
    Seq= Hsapiens [["chr1"]]
    Seq # shows some summaries
    
    ## get GC content
    bases=alphabetFrequency(Seq,baseOnly=TRUE)
    bases[1:4]
    ntotBases=sum(bases[1:4])
    baseFreq=bases[1:4]/ntotBases
    GCcontent=baseFreq["C"]+baseFreq["G"]
    
    ## look at CG dinucleotide content
    cg=matchPattern("CG", Seq)
    ncg=length(cg)
    
    ## compute the observed to expected ratio
    ncg/(baseFreq["C"]*baseFreq["G"]*ntotBases) ## this shows CG rarely stay together.
    
    ## compare to the observed to expected ratio of TG
    tg=matchPattern("TG", Seq)
    ntg=length(tg)
    ntg/(baseFreq["T"]*baseFreq["G"]*ntotBases) ## this shows TG presented more than expected.
    
    ######### look at GC content and CG dinucleotide distribution in 1000 bp windows in whole genome.
    ## Note I'm only doing it in chr1 now.
    ss=seq(1, length(Seq), by=1000)
    ss=ss[-length(ss)] ## remove the last one
    Seq.set=DNAStringSet(Seq, start=ss, end=ss+999)
    ff=alphabetFrequency(Seq.set, baseOnly=TRUE)
    pCG=(ff[,"C"]+ff[,"G"])/rowSums(ff)
    hist(pCG[pCG>0],100)
    
    ## CG occurance
    Seq.set=DNAStringSet(Seq, start=ss, end=ss+999)
    nCG=vcountPattern("CG", Seq.set)
    obsExp=nCG*1000/(ff[,"C"]*ff[,"G"])
    mean(obsExp,na.rm=TRUE)
    hist(obsExp,100) ## see a long tail, those are CpG islands
    

    #####################################################################

    2. Explore GC content at gene TSS (transcriptional starting site),

    #####################################################################

    ## and their overlaps with CpG island.
    ## first obtain genes from UCSC using GenomicFeatures functions
    library(GenomicFeatures)
    txdb=makeTxDbFromUCSC(genom="hg19",tablename="refGene") ## this is slow, take a few minutes.
    genes.hg19 = genes(txdb)
    
    ## get transcriptional start site.
    ## Note that txEnd is the start site for genes on "-" strand
    tss=start(genes.hg19)
    idx=which(strand(genes.hg19)=="-")
    tss[idx]=end(genes.hg19)[idx]
    
    ## create GRanges object for the TSS, remove random and hap chromosomes
    allchrs = as.character(seqnames(genes.hg19))
    idx=c(grep("random", allchrs),grep("hap", allchrs))
    ## create ranges with TSS +/- 500 bp
    TSS=GRanges(seqnames=Rle(allchrs[-idx]), ranges=IRanges(tss[-idx]-500,tss[-idx]+500))
    
    ## get GC contents of TSS for genes on chromosome 1
    idx.chr1=seqnames(TSS)=="chr1"
    Seq.set=DNAStringSet(Seq, start=start(TSS[idx.chr1]), end=end(TSS[idx.chr1]))
    ff=alphabetFrequency(Seq.set, baseOnly=TRUE)
    pCG.TSS=(ff[,"C"]+ff[,"G"])/rowSums(ff)
    hist(pCG.TSS,50)
    
    ## compare with genome wide distribution of GC content
    d1=density(pCG[pCG>0])
    d2=density(pCG.TSS)
    plot(d1, lwd=2)
    lines(d2, col="red",lwd=2)
    legend("topright", legend=c("Genome", "TSS"), lwd=2, col=c("black", "red"))
    
    image.png
    ## CG dinucleotide around TSS, compute obs/Exp.
    nCG=vcountPattern("CG", Seq.set)
    obsExp.TSS=nCG*1000/(ff[,"C"]*ff[,"G"])
    hist(obsExp.TSS, 50) ## bimodal distribution.
    
    ## compare with the genome wide one:
    d1=density(obsExp[!is.na(obsExp)])
    d2=density(obsExp.TSS)
    plot(d1, lwd=2, xlim=c(0,1.5), main="Observed to expected CG ratio")
    lines(d2, col="red",lwd=2)
    legend("topright", legend=c("Genome", "TSS"), lwd=2, col=c("black", "red"))
    
    image.png

    OPTIONAL(选做题)

    #####################################################################

    3. Overlaps of CpG island and TSS

    #####################################################################

    Read in CpG island file, create GRanges object and

    compute the percentage of TSS covered by CGI.

    This part is to be finished by students.

    相关文章

      网友评论

        本文标题:【R - bioconducter练习】

        本文链接:https://www.haomeiwen.com/subject/ywlivqtx.html