美文网首页R生物信息学R语言源码基因组坐标区间操作
fasta、fastq、vcf、bam、bed、gtf--多种生

fasta、fastq、vcf、bam、bed、gtf--多种生

作者: 小洁忘了怎么分身 | 来源:发表于2020-02-20 21:04 被阅读0次

    多种生信格式的R语言读取

    花花写于2020-02-20 其实我是现学现卖的,核心就是搜索, r read fasta​、r read fastq​ 巴拉巴拉。​

    导语

    今天介绍的内容是fasta、fastq、vcf、bam、bed、gtf六种数据读入R语言的方式。众所周知(假装是的),这些格式都是上游分析用到的,linux处理比较常见。但是linux学习难度大,很多人望而却步,(理由我就不继续编了,其实就是我要备课而已)。获取示例数据请在“生信星球”公众号后台回复“547”​

    1.fasta

    使用R包seqinr。

    #BiocManager::install('seqinr')
    library(seqinr)
    library(stringr)
    fasta <- read.fasta(file = "longreads.fa",
                        as.string = TRUE,
                        forceDNAtolower = FALSE)
    class(fasta)
    #> [1] "list"
    attributes(fasta)
    #> $names
    #> [1] "r1" "r2" "r3" "r4" "r5"
    r1 = fastafile[[1]]
    #> Error in eval(expr, envir, enclos): object 'fastafile' not found
    class(r1)
    #> Error in eval(expr, envir, enclos): object 'r1' not found
    str_length(r1)
    #> Error in stri_length(string): object 'r1' not found
    str_sub(r1,1,10)
    #> Error in stri_sub(string, from = start, to = end): object 'r1' not found
    

    2.fastq

    使用R包ShortRead,可以读取、查看、提取id、提取质量值,还可以导出。

    rm(list = ls())
    #BiocManager::install("ShortRead")
    suppressMessages(library(ShortRead))
    rfq <- readFastq("s_1_sequence.txt")
    rfq
    #> class: ShortReadQ
    #> length: 256 reads; width: 36 cycles
    sread(rfq)
    #>   A DNAStringSet instance of length 256
    #>       width seq
    #>   [1]    36 GGACTTTGTAGGATA...CTTTCCTTCTCCTGT
    #>   [2]    36 GATTTCTTACCTATT...TGAACAGCATCGGAC
    #>   [3]    36 GCGGTGGTCTATAGT...AATATCAATTTGGGT
    #>   [4]    36 GTTACCATGATGTTA...CATTTGGAGGTAAAA
    #>   [5]    36 GTATGTTTCTCCTGC...CCTTCTTGAAGGCTT
    #>   ...   ... ...
    #> [252]    36 GTTTAGATATGAGTC...TGTTCATGGTAGAGT
    #> [253]    36 GTTTTACAGACACCT...ACATCGTCAACGTTA
    #> [254]    36 GATGAACTAAGTCAA...CACTAACCTTGCGAG
    #> [255]    36 GTTTGGTTCGCTTTG...CTTCGGTTCCGACTA
    #> [256]    36 GCAATCTGCCGACCA...ATTCAATCATGACTT
    id(rfq)
    #>   A BStringSet instance of length 256
    #>       width seq
    #>   [1]    24 HWI-EAS88_1_1_1_1001_499
    #>   [2]    23 HWI-EAS88_1_1_1_898_392
    #>   [3]    23 HWI-EAS88_1_1_1_922_465
    #>   [4]    23 HWI-EAS88_1_1_1_895_493
    #>   [5]    23 HWI-EAS88_1_1_1_953_493
    #>   ...   ... ...
    #> [252]    23 HWI-EAS88_1_1_1_693_898
    #> [253]    23 HWI-EAS88_1_1_1_183_559
    #> [254]    23 HWI-EAS88_1_1_1_314_891
    #> [255]    23 HWI-EAS88_1_1_1_884_867
    #> [256]    23 HWI-EAS88_1_1_1_878_444
    quality(rfq)
    #> class: SFastqQuality
    #> quality:
    #>   A BStringSet instance of length 256
    #>       width seq
    #>   [1]    36 ]]]]]]]]]]]]Y]Y...]]]]]]VCHVMPLAS
    #>   [2]    36 ]]]]]]]]]]]]Y]]...]YPV]T][PZPICCK
    #>   [3]    36 ]]]]Y]]]]]V]T]]...]]]V]TMJEUXEFLA
    #>   [4]    36 ]]]]]]]]]]]]]]]...]T]]]]RJRZTQLOA
    #>   [5]    36 ]]]]]]]]]]]]]]]...]]]]]]]MJUJVLSS
    #>   ...   ... ...
    #> [252]    36 ]]]]]]]]]]]Y]]]...]]Y]VR]MJQNSAOC
    #> [253]    36 ]]]]]]]]]]]]]]]...]]Y]]]VTVVRVMSM
    #> [254]    36 ]]]]Y]Y]]]]]]]O...]]]YYVVTSZUOOHH
    #> [255]    36 ]]]]]]]]]]]]]]]...]]]]V]T[OVXEJSJ
    #> [256]    36 ]]]]]]]]]]]Y]]T...TRYVMEVVRSRHHNH
    
    file <- tempfile()
    writeFastq(rfq, file)
    readLines(file, 8)
    #> [1] "@HWI-EAS88_1_1_1_1001_499"           
    #> [2] "GGACTTTGTAGGATACCCTCGCTTTCCTTCTCCTGT"
    #> [3] "+"                                   
    #> [4] "]]]]]]]]]]]]Y]Y]]]]]]]]]]]]VCHVMPLAS"
    #> [5] "@HWI-EAS88_1_1_1_898_392"            
    #> [6] "GATTTCTTACCTATTAGTGGTTGAACAGCATCGGAC"
    #> [7] "+"                                   
    #> [8] "]]]]]]]]]]]]Y]]]]]]]]]YPV]T][PZPICCK"
    

    3.vcf

    记录变异,分meta、fix(主体)和gt(format和基因型)三部分

    #install.packages("vcfR")
    library(vcfR)
    vcf<-read.vcfR("pinf_sc50.vcf.gz")
    #> Scanning file to determine attributes.
    #> File attributes:
    #>   meta lines: 29
    #>   header_line: 30
    #>   variant count: 22031
    #>   column count: 27
    #> 
    Meta line 29 read in.
    #> All meta lines processed.
    #> gt matrix initialized.
    #> Character matrix gt created.
    #>   Character matrix gt rows: 22031
    #>   Character matrix gt cols: 27
    #>   skip: 0
    #>   nrows: 22031
    #>   row_num: 0
    #> 
    Processed variant 1000
    Processed variant 2000
    Processed variant 3000
    Processed variant 4000
    Processed variant 5000
    Processed variant 6000
    Processed variant 7000
    Processed variant 8000
    Processed variant 9000
    Processed variant 10000
    Processed variant 11000
    Processed variant 12000
    Processed variant 13000
    Processed variant 14000
    Processed variant 15000
    Processed variant 16000
    Processed variant 17000
    Processed variant 18000
    Processed variant 19000
    Processed variant 20000
    Processed variant 21000
    Processed variant 22000
    Processed variant: 22031
    #> All variants processed
    class(vcf@meta)
    #> [1] "character"
    class(vcf@fix)
    #> [1] "matrix"
    class(vcf@gt)
    #> [1] "matrix"
    meta = vcf@meta
    str_length(vcf@meta)
    #>  [1]   20   53   47  115  126   69   60  139
    #>  [9] 4402  126  115   94  124  141  108   84
    #> [17]  117  124  191  199  200   68   84  129
    #> [25]   84  130  117   45   95
    fix = vcf@fix
    fix[1:4,1:4]
    #>      CHROM              POS   ID REF 
    #> [1,] "Supercontig_1.50" "41"  NA "AT"
    #> [2,] "Supercontig_1.50" "136" NA "A" 
    #> [3,] "Supercontig_1.50" "254" NA "T" 
    #> [4,] "Supercontig_1.50" "275" NA "A"
    gt = vcf@gt
    colnames(gt)
    #>  [1] "FORMAT"        "BL2009P4_us23"
    #>  [3] "DDR7602"       "IN2009T1_us22"
    #>  [5] "LBUS5"         "NL07434"      
    #>  [7] "P10127"        "P10650"       
    #>  [9] "P11633"        "P12204"       
    #> [11] "P13527"        "P1362"        
    #> [13] "P13626"        "P17777us22"   
    #> [15] "P6096"         "P7722"        
    #> [17] "RS2009P1_us8"  "blue13"       
    #> [19] "t30-4"
    gt[1:4,1:4]
    #>      FORMAT          
    #> [1,] "GT:AD:DP:GQ:PL"
    #> [2,] "GT:AD:DP:GQ:PL"
    #> [3,] "GT:AD:DP:GQ:PL"
    #> [4,] "GT:AD:DP:GQ:PL"
    #>      BL2009P4_us23             
    #> [1,] "1|1:0,7:7:21:283,21,0"   
    #> [2,] "0|0:12,0:12:36:0,36,427" 
    #> [3,] "0|0:27,0:27:81:0,81,1117"
    #> [4,] "0|0:29,0:29:87:0,87,1243"
    #>      DDR7602                   
    #> [1,] "1|1:0,6:6:18:243,18,0"   
    #> [2,] "0|0:20,0:20:60:0,60,819" 
    #> [3,] "0|0:26,0:26:78:0,78,1077"
    #> [4,] "0|0:27,0:27:81:0,81,1158"
    #>      IN2009T1_us22             
    #> [1,] "1|1:0,8:8:24:324,24,0"   
    #> [2,] "0|0:16,0:16:48:0,48,650" 
    #> [3,] "0|0:23,0:23:69:0,69,946" 
    #> [4,] "0|0:32,0:32:96:0,96,1299"
    

    meta被做成了character,另外两个变成了矩阵,很简单了。

    4.bam

    使用R包Rsamtools,只用它来读取有点屈才了!

    library(Rsamtools)
    bamFile="RNA-seq.bam"
    #读取上一步比对生成的bam文件,成为R语言中的高级对象
    bam <- scanBam(bamFile)
    names(bam[[1]])
    #>  [1] "qname"  "flag"   "rname"  "strand"
    #>  [5] "pos"    "qwidth" "mapq"   "cigar" 
    #>  [9] "mrnm"   "mpos"   "isize"  "seq"   
    #> [13] "qual"
    bam=as.data.frame(do.call(cbind,lapply(bam[[1]],as.character)))
    bam[1:4,1:4]
    #>                qname flag rname strand
    #> 1  SRR957677.2906440   16 chr17      -
    #> 2 SRR957677.15075864  272 chr17      -
    #> 3  SRR957677.6649455    0 chr17      +
    #> 4  SRR957677.4552083   16 chr17      -
    

    5.bed

    使用R包genomation,读取结果是一个GRanges对象

    #BiocManager::install("genomation")
    library(genomation)
    bed = readBed("chr21.refseq.hg19.bed",track.line=FALSE,remove.unusual=FALSE)
    #> Parsed with column specification:
    #> cols(
    #>   X1 = col_character(),
    #>   X2 = col_double(),
    #>   X3 = col_double(),
    #>   X4 = col_character(),
    #>   X5 = col_double(),
    #>   X6 = col_character(),
    #>   X7 = col_double(),
    #>   X8 = col_double(),
    #>   X9 = col_double(),
    #>   X10 = col_double(),
    #>   X11 = col_number(),
    #>   X12 = col_character()
    #> )
    str(bed,max.level = 2)
    #> Formal class 'GRanges' [package "GenomicRanges"] with 7 slots
    #>   ..@ seqnames       :Formal class 'Rle' [package "S4Vectors"] with 4 slots
    #>   ..@ ranges         :Formal class 'IRanges' [package "IRanges"] with 6 slots
    #>   ..@ strand         :Formal class 'Rle' [package "S4Vectors"] with 4 slots
    #>   ..@ seqinfo        :Formal class 'Seqinfo' [package "GenomeInfoDb"] with 4 slots
    #>   ..@ elementMetadata:Formal class 'DataFrame' [package "S4Vectors"] with 6 slots
    #>   ..@ elementType    : chr "ANY"
    #>   ..@ metadata       : list()
    

    6.gtf

    本来我找到了一个叫refGenome的包,但是安装总是出错,我就另找办法了(不是说这个问题一定无法解决,只是不想继续浪费时间了,最快解决办法是换个电脑。)

    既然它是方方正正的表格,那可以用读取表格的命令来处理,比如data.table::fread

    gtf = data.table::fread("ANXA1.gencode.v7.gtf",data.table = F)
    colnames(gtf)
    #> [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8"
    #> [9] "V9"
    

    (没有列名,我看了一下示例文件里面也没有,哈哈。不过既然格式统一,那搜一下列名加上去也不是难事,但是,我不。)

    相关文章

      网友评论

        本文标题:fasta、fastq、vcf、bam、bed、gtf--多种生

        本文链接:https://www.haomeiwen.com/subject/bqyfqhtx.html