题目来自生信技能树
任务:统计人类外显子的总长度。
外显子坐标文件(CCDS.current.txt)可从以下地址下载:
ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/current_human/CCDS.current.txt
import os
import re
from collections import OrderedDict
from operator import itemgetter
def read_ccds_exons(lines):
    """Yield ``(chromosome, start, end)`` for every exon in CCDS text lines.

    *lines* is any iterable of strings in the tab-separated layout of
    ``CCDS.current.txt``: the first column is the chromosome and the
    second-to-last column is the exon list, e.g. ``[100-199, 250-299]``.
    Header lines (starting with ``#``), blank lines, and records whose exon
    list is ``-`` are skipped.
    """
    for line in lines:
        if line.startswith("#"):
            continue
        fields = line.rstrip().split("\t")
        # A blank line splits into a single empty field; there is no exon
        # column to read, so skip it instead of raising IndexError.
        if len(fields) < 2:
            continue
        locations = fields[-2]
        if locations == "-":
            continue
        for exon in locations.strip("[]").split(", "):
            start_text, end_text = exon.split("-")
            yield fields[0], int(start_text), int(end_text)


def total_unique_exon_length(lines):
    """Return the summed length of all distinct exons in CCDS text lines.

    Two exons are the same when chromosome, start and end all match; each
    distinct exon is counted once. Overlapping but non-identical exons are
    each counted in full.
    """
    seen = set()
    total = 0
    for exon in read_ccds_exons(lines):
        if exon in seen:
            continue
        seen.add(exon)
        _, start, end = exon
        # CCDS coordinates are 0-based with BOTH ends inclusive (see the
        # NCBI CCDS README), so an exon spans end - start + 1 bases.
        total += end - start + 1
    return total


if __name__ == "__main__":
    # Raw string: "\p" is not a valid escape sequence in a normal literal.
    os.chdir(r"D:\python")
    with open("CCDS.current.txt", "rt", encoding="utf-8") as f:
        exonLength = total_unique_exon_length(f)
    print(exonLength)
419272
def iter_exon_intervals(lines):
    """Yield ``(chromosome, first, last)`` for every exon in CCDS text lines.

    *lines* is any iterable of strings in the tab-separated layout of
    ``CCDS.current.txt``: the first column is the chromosome and the
    second-to-last column is the exon list, e.g. ``[100-199, 250-299]``.
    Header lines (starting with ``#``), blank lines, and records whose exon
    list is ``-`` are skipped.
    """
    for raw in lines:
        if raw.startswith("#"):
            continue
        columns = raw.rstrip().split("\t")
        # Blank lines have no exon column; "-" marks a record without exons.
        if len(columns) < 2 or columns[-2] == "-":
            continue
        chrom = columns[0]
        for span in columns[-2].strip("[]").split(", "):
            first_text, last_text = span.split("-")
            yield chrom, int(first_text), int(last_text)


def count_unique_exon_bases(lines):
    """Return how many distinct genomic positions are covered by any exon.

    A base shared by several overlapping exons is counted once. The original
    script tested membership in a dict it never filled, so every base of
    every exon was counted. Here the exons of each chromosome are sorted and
    merged into disjoint runs, which gives the distinct-base count without
    storing one key per base.
    """
    spans_by_chrom = {}
    for chrom, first, last in iter_exon_intervals(lines):
        spans_by_chrom.setdefault(chrom, []).append((first, last))

    total = 0
    for spans in spans_by_chrom.values():
        spans.sort()
        run_first, run_last = spans[0]
        for first, last in spans[1:]:
            if first <= run_last:
                # Overlaps the current run: extend it if this exon ends later.
                run_last = max(run_last, last)
            else:
                # CCDS coordinates are 0-based with both ends inclusive
                # (see the NCBI CCDS README), hence the + 1.
                total += run_last - run_first + 1
                run_first, run_last = first, last
        total += run_last - run_first + 1
    return total


if __name__ == "__main__":
    with open("CCDS.current.txt", "rt", encoding="utf-8") as f:
        exonLength = count_unique_exon_bases(f)
    print(exonLength)
451017
网友评论