官网文档:HTSeq: Analysing high-throughput sequencing data with Python
HTSeq用作测序数据和分析软件之间的“粘合剂”,主要有下述功能:
通过碱基质量了解数据
统计基因组覆盖度
GFF file文件读取注释信息
将RNA-Seq实验中的比对结果分配给外显子和基因。
安装:pip install HTSeq
服用方法
- 读取fastq,获取read信息
'''
import HTSeq
import itertools
fastq_file = HTSeq.FastqReader( "test.fastq.gz")
for read in itertools.islice( fastq_file, 5 ):
... print(read)
...
TCGCTAGCAAGACTTTTTCTTTTTCTAGGCACAGTAGGTATTAGATTAAATATGGTAATCACTCACTTCACTTCTGGAAGCAACAGCCCGGTGCAGTGGAGTCAGCCACATGTTGTCCTTGGCATTTACACGAGCTCCTGGAATCAAACA
CCCAGAGGCGGCAAGCTTCCTGTTCCATGGAGCCGGTACTCAGTCAAGGTCAGTATCTTCTGATCCTCCTTCTGCGGCTGCTGGGAGGTGGCAGGCCGGACAGCAGAAGGACAGGAGTGGCTCCATGGGGGCAGGCTTTCTGTGGCTGTG
GTTTGTGCCAGATGACGATTTGAATTAATAAATTCATTTGGTATAAACCGCGATTATTTTTGCATCAACTTACTACAGTCTATGGTCTTTTGTGTCTGGCATCGTTCACTTAGCGTGTTTTCATGGTTCATCCAGATTGTAACATGTTTC
TTCAGCCATTTCCCACTTCGCTTTAAGAGCATGCCCTGTTTAATGGGGATGGCTCTGCCGCTCCCGATGGTGTCAGCATGATTCTCCGGGGCTTTCCTCTCTTTGTCTGGGTCACTCCCTTTCTCAGATGTAAACAGGTTGGACCAGCGC
GTGATTTTCTCCCTTGAGGAGTTGAAGGAGATTGAGAAGGACTGTGCCGTCTACGTTGGGCGCATGGAGAGGGTGGCCCGCCACAGCTCTGTCAGCAAGGAGGAGAAGGTGCGTGGCACCACAGGTCTCGGGGGCTTTCTTCGCCCCTGC
read 变量仍然保留第十个read的信息
read
<SequenceWithQualities object 'A00312:90:HKVF5DSXX:3:1101:3441:1000 1:N:0:GCCTATCA'>
read.name
'A00312:90:HKVF5DSXX:3:1101:3441:1000 1:N:0:GCCTATCA'
read.seq
b'GTGATTTTCTCCCTTGAGGAGTTGAAGGAGATTGAGAAGGACTGTGCCGTCTACGTTGGGCGCATGGAGAGGGTGGCCCGCCACAGCTCTGTCAGCAAGGAGGAGAAGGTGCGTGGCACCACAGGTCTCGGGGGCTTTCTTCGCCCCTGC'
read.qual
array([37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
37, 37, 37, 37, 37, 37, 37, 37, 25, 37, 37, 37, 37, 37, 37, 37, 37,
37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 25, 37, 37,
37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 25, 37, 37, 37, 37, 37, 37,
37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
37, 37, 37, 37, 37, 37, 37, 25, 37, 37, 37, 37, 37, 37],
dtype=uint8)
'''
遍历fastq文件,统计所有read的质量信息
'''
创建长度序列读长的0向量
import numpy
len(read)
150
qualsum = numpy.zeros(len(read),numpy.int)
qualsum
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
遍历fastq对象,所有位置碱基质量相加
nreads = 0
for read in fastq_file:
... qualsum += read.qual
... nreads += 1qualsum / float(nreads)
array([36.56013134, 36.63208692, 36.6780693 , 36.69489843, 36.71856081,
36.72337391, 36.70846614, 36.71600597, 36.71639324, 36.73297184,
... ...
36.20018642, 36.19426629, 36.19388805, 36.20587639, 36.17329932,
36.15208402, 36.13248803, 36.13966627, 36.12272692, 36.12639415])
画图展示
from matplotlib import pyplot
image.png
pyplot.plot( qualsum / nreads )
pyplot.show()
'''
fastq更多质控,在命令行使用 htseq-qa 实现。htseq-qa还可以用sam文件生成比对到和未比对到参考基因组的质控信息(看sam分析模块)。htseq-qa参数的意义看这Quality Assessment with
'''
'''
image.png
抽取特定区域reads重新写入新的bam文件
'''
bam_reader = HTSeq.BAM_Reader( "deduped.bam" )
for a in itertools.islice( bam_reader, 5 ): # printing first 5 reads
... print(a)
...
<SAM_Alignment object: Paired-end read 'A00250:179:HN33WDSXX:3:1455:28682:21167' aligned to chr1:[10000,10143)/+>
<SAM_Alignment object: Paired-end read 'A00250:179:HN33WDSXX:3:2127:25491:32988' aligned to chr1:[10000,10147)/+>
<SAM_Alignment object: Paired-end read 'A00250:179:HN33WDSXX:3:2164:16577:11412' aligned to chr1:[10008,10110)/+>
<SAM_Alignment object: Paired-end read 'A00250:179:HN33WDSXX:3:2221:3152:24768' aligned to chr1:[10016,10110)/+>
<SAM_Alignment object: Paired-end read 'A00250:179:HN33WDSXX:3:1659:5077:20838' aligned to chr1:[10018,10146)/->
此处报错没测通
bam_writer = HTSeq.BAM_Writer.from_BAM_Reader( "region.bam", bam_reader ) #set-up BAM_Writer with same header as reader
for a in bam_reader.fetch( region = "1:249000000-249200000" ): #fetching reads in a region
... print("Writing Alignment", a, "to file", bam_writer.filename)
... bam_writer.write( a )
Writing Alignment <SAM_Alignment object: Read 'SRR001432.104735 USI-EAS21_0008_3445:8:3:934:653 length=25' aligned to 1:[249085369,249085394)/-> to file region.bam
Writing Alignment <SAM_Alignment object: Read 'SRR001432.280764 USI-EAS21_0008_3445:8:7:479:581 length=25' aligned to 1:[249105864,249105889)/-> to file region.bam
...
Writing Alignment <SAM_Alignment object: Read 'SRR001432.248967 USI-EAS21_0008_3445:8:6:862:756 length=25' aligned to 1:[249167916,249167941)/-> to file region.bam
bam_writer.close()
查看sam中的a对象
a
<SAM_Alignment object: Paired-end read 'A00250:179:HN33WDSXX:3:1659:5077:20838' aligned to chr1:[10018,10146)/->
a.read
<SequenceWithQualities object 'A00250:179:HN33WDSXX:3:1659:5077:20838'>
a.read.name
'A00250:179:HN33WDSXX:3:1659:5077:20838'
a.read.qualstr
b'FF,F,FFFFFFFFFFFFFFFFFFFFFF,:FFFFFF:FFFFFFFF:F:FFF:FFFFF,FF:F:,F:,FF:FF,FF,:::::,,F::F,F:,,F,,,,,F:,,,,F,,:,,F:F,F,F,,,,,,,,,:FF:,,,,F,,F,,F,,,,F::,:F'
a.read.qual
array([37, 37, 11, 37, 11, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 11, 25, 37, 37, 37, 37, 37,
... ...
37, 11, 11, 37, 11, 11, 11, 11, 37, 25, 25, 11, 25, 37],
dtype=uint8)
a.iv
<GenomicInterval object 'chr1', [10018,10146), strand '-'>
a.iv.chrom
'chr1'
a.iv.start
10018
a.iv.end
10146
a.iv.strand
'-'
'''
处理CNV时用到的GenomicArrays 和 GenomicInterval
1) genomicInterval用来存储染色体及位置信息等
'''
iv = HTSeq.GenomicInterval( "chr3", 123203, 127245, "+" )
print(iv)
chr3:[123203,127245)/+
iv
<GenomicInterval object 'chr3', [123203,127245), strand '+'>
iv.length
to file region.bam
...
Writing Alignment <SAM_Alignment object: Read 'SRR001432.248967 USI-EAS21_0008_3445:8:6:862:756 length=25' aligned to 1:[249167916,249167941)/-> to file region.bam
>>> bam_writer.close()
# 查看sam中的a对象
>>> a
<SAM_Alignment object: Paired-end read 'A00250:179:HN33WDSXX:3:1659:5077:20838' aligned to chr1:[10018,10146)/->
>>> a.read
<SequenceWithQualities object 'A00250:179:HN33WDSXX:3:1659:5077:20838'>
>>> a.read.name
'A00250:179:HN33WDSXX:3:1659:5077:20838'
>>> a.read.qualstr
b'FF,F,FFFFFFFFFFFFFFFFFFFFFF,:FFFFFF:FFFFFFFF:F:FFF:FFFFF,FF:F:,F:,FF:FF,FF,:::::,,F::F,F:,,F,,,,,F:,,,,F,,:,,F:F,F,F,,,,,,,,,:FF:,,,,F,,F,,F,,,,F::,:F'
>>> a.read.qual
array([37, 37, 11, 37, 11, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 11, 25, 37, 37, 37, 37, 37,
... ...
37, 11, 11, 37, 11, 11, 11, 11, 37, 25, 25, 11, 25, 37],
dtype=uint8)
>>> a.iv
<GenomicInterval object 'chr1', [10018,10146), strand '-'>
>>> a.iv.chrom
'chr1'
>>> a.iv.start
10018
>>> a.iv.end
10146
>>> a.iv.strand
'-'
'''
处理CNV时用到的GenomicArrays 和 GenomicInterval
1) genomicInterval用来存储染色体及位置信息等
'''
>>> iv = HTSeq.GenomicInterval( "chr3", 123203, 127245, "+" )
>>> print(iv)
chr3:[123203,127245)/+
>>> iv
<GenomicInterval object 'chr3', [123203,127245), strand '+'>
>>> iv.length
4042
看两个iv 之间的包含于被包含关系
iv2 = HTSeq.GenomicInterval( "chr3", 123100, 125045, "+" )
iv2.overlaps(iv)
True
iv2.contains(iv)
False
iv2.is_contained_in(iv)
False
'''
2) GenomicArray可以根据每条染色体信息,并用iv处理染色体特定区域信息
'''
构建ga
chromlens = { 'chr1': 3000, 'chr2': 2000, 'chr3': 1000 }
chromlens
{'chr1': 3000, 'chr2': 2000, 'chr3': 1000}
ga = HTSeq.GenomicArray( chromlens, stranded=False, typecode="i" )
ga
<HTSeq._HTSeq.GenomicArray object at 0x7fe058b384c0>
构建一个iv
iv = HTSeq.GenomicInterval( "chr1", 100, 120, "." )
设iv区域copy number 为5
ga[iv] = 5
构建另一个iv
iv = HTSeq.GenomicInterval( "chr1", 110, 135, "." )
此iv区域 copy number 增加3
ga[iv] += 3
iv = HTSeq.GenomicInterval( "chr1", 90, 140, "." )
查看特定位置拷贝数
list(ga[iv])
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0]
'''
储存每个碱基的copy number信息浪费资源,GenomicArray只储存copy number 一致的连续区域信息:
'''
for iv2,value in ga[iv].steps():
... print(iv2,value)
...
chr1:[90,100)/. 0
chr1:[100,110)/. 5
chr1:[110,120)/. 8
chr1:[120,135)/. 3
chr1:[135,140)/. 0
'''
计算覆盖度实例
'''
创建空的ga
cvg = HTSeq.GenomicArray( "auto", stranded=True, typecode="i" )
alignment_file = HTSeq.SAM_Reader("deduped.sam")
for alngt in alignment_file:
... if alngt.aligned:
... cvg[alngt.iv] +=1
...
pyplot.plot(list( cvg[ HTSeq.GenomicInterval( "chr1", 12000, 13000, "+" ) ] ) )
[<matplotlib.lines.Line2D object at 0x7f65d5666ef0>]
pyplot.show()
'''
![](https://img.haomeiwen.com/i18214976/9f4f29e866f900f4.png?
imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
GenomicArrayOfSets用来存储注释信息
'''
gas = HTSeq.GenomicArrayOfSets( "auto", stranded=False )
gas[ HTSeq.GenomicInterval( "chr1", 100, 250 ) ] += "A"
gas[ HTSeq.GenomicInterval( "chr1", 360, 640 ) ] += "A"
gas[ HTSeq.GenomicInterval( "chr1", 510, 950 ) ] += "B"
read_iv = HTSeq.GenomicInterval( "chr1", 450, 800 )
for iv, val in gas[ read_iv ].steps():
... print(iv, sorted(val))
list(ga[iv])
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0]
'''
储存每个碱基的copy number信息浪费资源,GenomicArray只储存copy number 一致的连续区域信息:
'''
>>> for iv2,value in ga[iv].steps():
... print(iv2,value)
...
chr1:[90,100)/. 0
chr1:[100,110)/. 5
chr1:[110,120)/. 8
chr1:[120,135)/. 3
chr1:[135,140)/. 0
'''
计算覆盖度实例
'''
# 创建空的ga
>>> cvg = HTSeq.GenomicArray( "auto", stranded=True, typecode="i" )
>>> alignment_file = HTSeq.SAM_Reader("deduped.sam")
>>> for alngt in alignment_file:
... if alngt.aligned:
... cvg[alngt.iv] +=1
...
>>> pyplot.plot(list( cvg[ HTSeq.GenomicInterval( "chr1", 12000, 13000, "+" ) ] ) )
[<matplotlib.lines.Line2D object at 0x7f65d5666ef0>]
>>> pyplot.show()
'''
![](https://img.haomeiwen.com/i18214976/9f4f29e866f900f4.png?
imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
GenomicArrayOfSets用来存储注释信息
'''
>>> gas = HTSeq.GenomicArrayOfSets( "auto", stranded=False )
>>> gas[ HTSeq.GenomicInterval( "chr1", 100, 250 ) ] += "A"
>>> gas[ HTSeq.GenomicInterval( "chr1", 360, 640 ) ] += "A"
>>> gas[ HTSeq.GenomicInterval( "chr1", 510, 950 ) ] += "B"
>>> read_iv = HTSeq.GenomicInterval( "chr1", 450, 800 )
>>> for iv, val in gas[ read_iv ].steps():
... print(iv, sorted(val))
...
chr1:[450,510)/. ['A']
chr1:[510,640)/. ['A', 'B']
chr1:[640,800)/. ['B']
'''
网友评论