多种生信格式的R语言读取
花花写于2020-02-20 其实我是现学现卖的,核心就是搜索, r read fasta、r read fastq 巴拉巴拉。
导语
今天介绍的内容是fasta、fastq、vcf、bam、bed、gtf六种数据读入R语言的方式。众所周知(假装是的),这些格式都是上游分析用到的,linux处理比较常见。但是linux学习难度大,很多人望而却步,(理由我就不继续编了,其实就是我要备课而已)。获取示例数据请在“生信星球”公众号后台回复“547”
1.fasta
使用R包seqinr。
#BiocManager::install('seqinr')
library(seqinr)
library(stringr)
fasta <- read.fasta(file = "longreads.fa",
as.string = TRUE,
forceDNAtolower = FALSE)
class(fasta)
#> [1] "list"
attributes(fasta)
#> $names
#> [1] "r1" "r2" "r3" "r4" "r5"
r1 = fastafile[[1]]
#> Error in eval(expr, envir, enclos): object 'fastafile' not found
class(r1)
#> Error in eval(expr, envir, enclos): object 'r1' not found
str_length(r1)
#> Error in stri_length(string): object 'r1' not found
str_sub(r1,1,10)
#> Error in stri_sub(string, from = start, to = end): object 'r1' not found
2.fastq
使用R包ShortRead,可以读取、查看、提取id、提取质量值,还可以导出。
rm(list = ls())
#BiocManager::install("ShortRead")
suppressMessages(library(ShortRead))
rfq <- readFastq("s_1_sequence.txt")
rfq
#> class: ShortReadQ
#> length: 256 reads; width: 36 cycles
sread(rfq)
#> A DNAStringSet instance of length 256
#> width seq
#> [1] 36 GGACTTTGTAGGATA...CTTTCCTTCTCCTGT
#> [2] 36 GATTTCTTACCTATT...TGAACAGCATCGGAC
#> [3] 36 GCGGTGGTCTATAGT...AATATCAATTTGGGT
#> [4] 36 GTTACCATGATGTTA...CATTTGGAGGTAAAA
#> [5] 36 GTATGTTTCTCCTGC...CCTTCTTGAAGGCTT
#> ... ... ...
#> [252] 36 GTTTAGATATGAGTC...TGTTCATGGTAGAGT
#> [253] 36 GTTTTACAGACACCT...ACATCGTCAACGTTA
#> [254] 36 GATGAACTAAGTCAA...CACTAACCTTGCGAG
#> [255] 36 GTTTGGTTCGCTTTG...CTTCGGTTCCGACTA
#> [256] 36 GCAATCTGCCGACCA...ATTCAATCATGACTT
id(rfq)
#> A BStringSet instance of length 256
#> width seq
#> [1] 24 HWI-EAS88_1_1_1_1001_499
#> [2] 23 HWI-EAS88_1_1_1_898_392
#> [3] 23 HWI-EAS88_1_1_1_922_465
#> [4] 23 HWI-EAS88_1_1_1_895_493
#> [5] 23 HWI-EAS88_1_1_1_953_493
#> ... ... ...
#> [252] 23 HWI-EAS88_1_1_1_693_898
#> [253] 23 HWI-EAS88_1_1_1_183_559
#> [254] 23 HWI-EAS88_1_1_1_314_891
#> [255] 23 HWI-EAS88_1_1_1_884_867
#> [256] 23 HWI-EAS88_1_1_1_878_444
quality(rfq)
#> class: SFastqQuality
#> quality:
#> A BStringSet instance of length 256
#> width seq
#> [1] 36 ]]]]]]]]]]]]Y]Y...]]]]]]VCHVMPLAS
#> [2] 36 ]]]]]]]]]]]]Y]]...]YPV]T][PZPICCK
#> [3] 36 ]]]]Y]]]]]V]T]]...]]]V]TMJEUXEFLA
#> [4] 36 ]]]]]]]]]]]]]]]...]T]]]]RJRZTQLOA
#> [5] 36 ]]]]]]]]]]]]]]]...]]]]]]]MJUJVLSS
#> ... ... ...
#> [252] 36 ]]]]]]]]]]]Y]]]...]]Y]VR]MJQNSAOC
#> [253] 36 ]]]]]]]]]]]]]]]...]]Y]]]VTVVRVMSM
#> [254] 36 ]]]]Y]Y]]]]]]]O...]]]YYVVTSZUOOHH
#> [255] 36 ]]]]]]]]]]]]]]]...]]]]V]T[OVXEJSJ
#> [256] 36 ]]]]]]]]]]]Y]]T...TRYVMEVVRSRHHNH
file <- tempfile()
writeFastq(rfq, file)
readLines(file, 8)
#> [1] "@HWI-EAS88_1_1_1_1001_499"
#> [2] "GGACTTTGTAGGATACCCTCGCTTTCCTTCTCCTGT"
#> [3] "+"
#> [4] "]]]]]]]]]]]]Y]Y]]]]]]]]]]]]VCHVMPLAS"
#> [5] "@HWI-EAS88_1_1_1_898_392"
#> [6] "GATTTCTTACCTATTAGTGGTTGAACAGCATCGGAC"
#> [7] "+"
#> [8] "]]]]]]]]]]]]Y]]]]]]]]]YPV]T][PZPICCK"
3.vcf
记录变异,分meta、fix(主体)和gt(format和基因型)三部分
#install.packages("vcfR")
library(vcfR)
vcf<-read.vcfR("pinf_sc50.vcf.gz")
#> Scanning file to determine attributes.
#> File attributes:
#> meta lines: 29
#> header_line: 30
#> variant count: 22031
#> column count: 27
#>
Meta line 29 read in.
#> All meta lines processed.
#> gt matrix initialized.
#> Character matrix gt created.
#> Character matrix gt rows: 22031
#> Character matrix gt cols: 27
#> skip: 0
#> nrows: 22031
#> row_num: 0
#>
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant: 22031
#> All variants processed
class(vcf@meta)
#> [1] "character"
class(vcf@fix)
#> [1] "matrix"
class(vcf@gt)
#> [1] "matrix"
meta = vcf@meta
str_length(vcf@meta)
#> [1] 20 53 47 115 126 69 60 139
#> [9] 4402 126 115 94 124 141 108 84
#> [17] 117 124 191 199 200 68 84 129
#> [25] 84 130 117 45 95
fix = vcf@fix
fix[1:4,1:4]
#> CHROM POS ID REF
#> [1,] "Supercontig_1.50" "41" NA "AT"
#> [2,] "Supercontig_1.50" "136" NA "A"
#> [3,] "Supercontig_1.50" "254" NA "T"
#> [4,] "Supercontig_1.50" "275" NA "A"
gt = vcf@gt
colnames(gt)
#> [1] "FORMAT" "BL2009P4_us23"
#> [3] "DDR7602" "IN2009T1_us22"
#> [5] "LBUS5" "NL07434"
#> [7] "P10127" "P10650"
#> [9] "P11633" "P12204"
#> [11] "P13527" "P1362"
#> [13] "P13626" "P17777us22"
#> [15] "P6096" "P7722"
#> [17] "RS2009P1_us8" "blue13"
#> [19] "t30-4"
gt[1:4,1:4]
#> FORMAT
#> [1,] "GT:AD:DP:GQ:PL"
#> [2,] "GT:AD:DP:GQ:PL"
#> [3,] "GT:AD:DP:GQ:PL"
#> [4,] "GT:AD:DP:GQ:PL"
#> BL2009P4_us23
#> [1,] "1|1:0,7:7:21:283,21,0"
#> [2,] "0|0:12,0:12:36:0,36,427"
#> [3,] "0|0:27,0:27:81:0,81,1117"
#> [4,] "0|0:29,0:29:87:0,87,1243"
#> DDR7602
#> [1,] "1|1:0,6:6:18:243,18,0"
#> [2,] "0|0:20,0:20:60:0,60,819"
#> [3,] "0|0:26,0:26:78:0,78,1077"
#> [4,] "0|0:27,0:27:81:0,81,1158"
#> IN2009T1_us22
#> [1,] "1|1:0,8:8:24:324,24,0"
#> [2,] "0|0:16,0:16:48:0,48,650"
#> [3,] "0|0:23,0:23:69:0,69,946"
#> [4,] "0|0:32,0:32:96:0,96,1299"
meta被做成了character,另外两个变成了矩阵,很简单了。
4.bam
使用R包Rsamtools,只用它来读取有点屈才了!
library(Rsamtools)
bamFile="RNA-seq.bam"
#读取上一步比对生成的bam文件,成为R语言中的高级对象
bam <- scanBam(bamFile)
names(bam[[1]])
#> [1] "qname" "flag" "rname" "strand"
#> [5] "pos" "qwidth" "mapq" "cigar"
#> [9] "mrnm" "mpos" "isize" "seq"
#> [13] "qual"
bam=as.data.frame(do.call(cbind,lapply(bam[[1]],as.character)))
bam[1:4,1:4]
#> qname flag rname strand
#> 1 SRR957677.2906440 16 chr17 -
#> 2 SRR957677.15075864 272 chr17 -
#> 3 SRR957677.6649455 0 chr17 +
#> 4 SRR957677.4552083 16 chr17 -
5.bed
使用R包genomation,读取结果是一个GRanges对象
#BiocManager::install("genomation")
library(genomation)
bed = readBed("chr21.refseq.hg19.bed",track.line=FALSE,remove.unusual=FALSE)
#> Parsed with column specification:
#> cols(
#> X1 = col_character(),
#> X2 = col_double(),
#> X3 = col_double(),
#> X4 = col_character(),
#> X5 = col_double(),
#> X6 = col_character(),
#> X7 = col_double(),
#> X8 = col_double(),
#> X9 = col_double(),
#> X10 = col_double(),
#> X11 = col_number(),
#> X12 = col_character()
#> )
str(bed,max.level = 2)
#> Formal class 'GRanges' [package "GenomicRanges"] with 7 slots
#> ..@ seqnames :Formal class 'Rle' [package "S4Vectors"] with 4 slots
#> ..@ ranges :Formal class 'IRanges' [package "IRanges"] with 6 slots
#> ..@ strand :Formal class 'Rle' [package "S4Vectors"] with 4 slots
#> ..@ seqinfo :Formal class 'Seqinfo' [package "GenomeInfoDb"] with 4 slots
#> ..@ elementMetadata:Formal class 'DataFrame' [package "S4Vectors"] with 6 slots
#> ..@ elementType : chr "ANY"
#> ..@ metadata : list()
6.gtf
本来我找到了一个叫refGenome的包,但是安装总是出错,我就另找办法了(不是说这个问题一定无法解决,只是不想继续浪费时间了,最快解决办法是换个电脑。)
既然它是方方正正的表格,那可以用读取表格的命令来处理,比如data.table::fread
gtf = data.table::fread("ANXA1.gencode.v7.gtf",data.table = F)
colnames(gtf)
#> [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8"
#> [9] "V9"
(没有列名,我看了一下示例文件里面也没有,哈哈。不过既然格式统一,那搜一下列名加上去也不是难事,但是,我不。)
网友评论