测试数据下载
wget -c ftp://ftp.ensembl.org/pub/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh38.87.chr.gtf.gz
先验知识
代码解读
实现了哪些功能?
1)读取GTF文件
2)根据feature,将其分为3种数据类型进行保存,即gene、transcript、exon。
每一种数据类型,分别构建一种对象,每种对象所具有的属性定义如下,
- Object gene包含的属性有:所属的染色体,起始位置,终止位置,正负链信息,gene id(对应GTF文件中的
gene_id
) - Object transcript包含的属性有:所属的染色体,起始位置,终止位置,transcript id(对应GTF文件中的
transcript_id
),parent id(对应GTF文件中的gene_id
) - Object exon包含的属性有:所属的染色体,起始位置,终止位置,parent id(对应GTF文件中的
transcript_id
)
3)数据处理
- 统计每一条染色体上的gene总数
- 计算每一个gene的长度
- 统计每个gene的转录本数
- 记录每个exon的坐标
代码设计思路
将GTF文件的每一行看作一个对象来处理,构建不同类型的对象对信息进行存储,同时由于不同的对象之间存在信息交集,这就是类继承的应用之处。
Python代码
在正则表达式上,我进行了一定修改。
'''
Date: 2022.8.14
Description: Kind of a collection of tools to parse and analyze GTF
Main Function:
- count gene number of each chromosome
- count transcritp number of each gene
- calculate the total number of exon and intron of each transcript
- plot the result
'''
import sys
import re
args = sys.argv # Set argv to grep the ipnut. argv is a list, e.g. [, , ]
class Genome_info():
def __init__(self):
self.chr = ""
self.start = 0
self.end = 0
class Gene(Genome_info):
'''
Note: Object Gene is inherited from Genome_info
Gene has two attributes,
1) orientation
2) id (gene id)
'''
def __init__(self):
Genome_info.__init__(self)
self.orientation = ""
self.id = ""
class Transcript(Genome_info):
'''
Note: Object Transcript is inherited from Genome_info
Transcript has two attributes,
1) id (transcript id)
2) parent (gene id)
'''
def __init__(self):
Genome_info.__init__(self)
self.id = ""
self.parent = ""
class Exon(Genome_info):
'''
Note: Object Exon is inherited from Genome_info
Transcript has one attributes,
2) parent (?)
'''
def __init__(self):
Genome_info.__init__(self)
self.parent = ""
'''
Define a main function to handle gtf file
'''
def main(args):
list_chr = []
list_gene = {}
list_transcript = {}
list_exon = []
# l_n = 0
with open(args[1]) as fp_gtf:
for line in fp_gtf:
if line.startswith("#"):
continue
# print ("in %s" % l_n)
# l_n += 1
lines = line.strip("\n").split("\t")
chr = lines[0]
type = lines[2]
start = int(lines[3])
end = int(lines[4])
orientation = lines[6]
attr = lines[8]
if not re.search(r'protein_coding', attr): # 只针对protein_coding进行统计分析
continue
if not chr in list_chr :
list_chr.append(chr)
if type == "gene":
gene = Gene()
id = re.search(r'gene_id "([^;]+)";?', attr).group(1) # Original version, could be adapted to r'gene_id "(\S+)";'
# id = re.search(r'gene_id "(\S+)";', attr).group(1) # My adaptation
gene.chr = chr
gene.start = start
gene.end = end
gene.id = id
gene.orientation = orientation
list_gene[id] = gene # list_gene is a dict to depost gene id and its correspoding Gene Object
# print(id)
elif type == "transcript":
transcript = Transcript()
id = re.search(r'transcript_id "([^;]+)";?', attr).group(1)
# id = re.search(r'transcript_id "(\S+)";', attr).group(1) # My adaptation
parent = re.search(r'gene_id "([^;]+)";?', attr).group(1)
# id = re.search(r'gene_id "(\S+)";', attr).group(1) # My adaptation
if not parent in list_gene:
continue
transcript.chr = chr
transcript.start = start
transcript.end = end
transcript.id = id
transcript.parent = parent
list_transcript[id] = transcript
elif type == "exon":
exon = Exon()
parent = re.search(r'transcript_id "([^;]+)";?', attr).group(1)
# parent = re.search(r'transcript_id "(\S+)";', attr).group(1) # My adaptation
if not parent in list_transcript:
continue
exon.chr = chr
exon.start = start
exon.end = end
exon.parent = parent
list_exon.append(exon)
chr_gene(list_gene)
gene_len(list_gene)
gene_transcript(list_transcript)
transcript_exon(list_exon)
exon_pos(list_exon)
def chr_gene(list_gene):
print("-------------------------------- Count gene number of each chromosome --------------------------------")
count_gene = {}
for info in list_gene.values():
chr = info.chr
if chr in count_gene:
count_gene[info.chr] += 1
else:
count_gene[info.chr] = 1
with open("chr_gene.txt", 'w') as fp_out:
for chr, num in count_gene.items():
print("\t".join([chr, str(num)]) + "\n")
fp_out.write("\t".join([chr, str(num)]) + "\n")
def gene_len(list_gene):
print("-------------------------------- Calculate the length of each gene --------------------------------")
with open("gene_len.txt", 'w') as fp_out:
for gene_id, info in list_gene.items():
len = info.end - info.start + 1
fp_out.write("\t".join([gene_id, str(len)]) + "\n")
# print("\t".join([gene_id, str(len)]) + "\n")
def gene_transcript(list_transcript):
print("-------------------------------- Count transcript number of each gene --------------------------------")
count_transcript = {}
for info in list_transcript.values():
gene_id = info.parent
if gene_id in count_transcript:
count_transcript[gene_id] += 1
else:
count_transcript[gene_id] = 1
with open("gene_transcript.txt", 'w') as fp_out:
for gene_id, num in count_transcript.items():
# print("\t".join([gene_id, str(num)]) + "\n")
fp_out.write("\t".join([gene_id, str(num)]) + "\n")
def transcript_exon(list_exon):
print("-------------------------------- Count exon number of each transcript --------------------------------")
count_exon = {}
for exon in list_exon:
transcript_id = exon.parent
if transcript_id in count_exon:
count_exon[transcript_id] += 1
else:
count_exon[transcript_id] = 1
with open("transcript_exon.txt", 'w') as fp_out:
for transcript_id, num in count_exon.items():
# print("\t".join([transcript_id, str(num)]) + "\n")
fp_out.write("\t".join([transcript_id, str(num)]) + "\n")
def exon_pos(list_exon):
print("-------------------------------- Save the position info of each exon --------------------------------")
count_exon = {}
for exon in list_exon:
transcript_id = exon.parent
if transcript_id in count_exon:
count_exon[transcript_id] += ",%s-%s" % (str(exon.start), str(exon.end))
else:
count_exon[transcript_id] = "%s-%s" % (str(exon.start), str(exon.end))
with open("exon_pos.txt", 'w') as fp_out:
for transcript_id, pos in count_exon.items():
print("\t".join([transcript_id, pos]) + "\n")
fp_out.write("\t".join([transcript_id, pos]) + "\n")
def gene_exon_pos(list_gene, list_transcript, list_exon):
pass
if __name__ == "__main__":
main(args)
代码运行
python save.py input.gtf
小记
暑假在编写脚本的过程中,感觉自己Python其实进步了不少,但是还没能把数据结构、面向对象编程很好的整合起来。
因为本科阶段确实没有什么自己编写脚本去分析数据的需求,就算自己创造需求了,也没有实际应用到工作当中,因此刻意练习是很重要的。
有意思的故事是,去年有幸听到了坦哥面试学生问的问题,“解释一下类”,
要是当时的我,碰到了这个问题,那我是真答不上来。
参考资料
[1] http://www.biotrainee.com/thread-626-1-1.html【作者:dongye】
网友评论