这两天在学习面向对象,跟着书本写了一个小例子。这种编程思想以前没用过,在慢慢熟悉其中的套路。事实上尽管学习生信有一年半了,仍然觉得自己编程能力较弱,原因是之前处理数据除了需要用到哈希我会用Perl写脚本以外,其它的场景我基本都用命令行可以解决,所以编程能力也还在哈希那个层次停留。这也是为什么最近在练习编程的原因。
下面这个小脚本的作用是:输入一段序列,可以查询它的长度、类型、各碱基的个数,查询是否包含某一段特定的序列,DNA序列还能查询GC含量以及求反向互补序列。
包含两个类的模块(用到了继承)
sequence.py
class Sequence():
def __init__(self, seq):
#属性1:序列本身
self.seq = seq
#属性2:序列类型
set_seq = set(seq)
set_dna = set("ATGCN")
set_rna = set("AUGCN")
set_pro = set("ACDEFGHIKLMNPQRSTVWY")
if set_seq <= set_dna:
self.seqtype = "DNA"
elif set_seq <= set_rna:
self.seqtype = "RNA"
elif set_seq <= set_pro:
self.seqtype = "protein"
else:
self.seqtype = "Unknown"
#属性3:序列长度
self.seqlen = len(seq)
#DNA, RNA, protein各种元素的含量
def seq_element(self):
element_counts = {}
for i in self.seq:
if i in element_counts.keys():
element_counts[i] += 1
else:
element_counts[i] = 1
num = 1
for key, value in element_counts.items():
this_key_percent = value / self.seqlen
print(str(num)+"\t" + key + "\t" + str(value) + "\t" + str(this_key_percent))
num +=1
#查询是否含有特定的子序列
#如果有,则返回具体的坐标,从1开始
def sub_seq(self, query_seq):
query_seq_len = len(query_seq)
times = self.seqlen+1-query_seq_len
hit_num = 0
for i in range(times):
part_of_self = self.seq[i:i+query_seq_len]
if part_of_self == query_seq:
hit_num += 1
if hit_num == 1:
print("the query seq exists in the ref seq, and the coordinates: \n")
print("Num.\tLeft\tRight")
print(str(hit_num) + "\t" + str(i+1) + "\t" + str(i+query_seq_len))
class DnaSequence(Sequence):
def __init__(self, seq):
super().__init__(seq)
#求DNA序列的反向互补序列
def rev_com(self):
if self.seqtype == "DNA":
seq_length = len(self.seq)
rc_self = ""
for i in range(1,seq_length+1):
tmp_base = self.seq[-i]
if tmp_base == "A":
rc_self += "T"
elif tmp_base == "T":
rc_self += "A"
elif tmp_base == "G":
rc_self += "C"
elif tmp_base == "C":
rc_self += "G"
else:
rc_self += "N"
return rc_self
else:
print("Please make sure your sequence is a DNA sequence!")
#求DNA序列的GC含量
def seq_gc(self):
if self.seqtype == "DNA":
seq_length = len(self.seq)
gc_counts = 0
for i in range(seq_length):
tmp_base = self.seq[i]
if tmp_base == "G" or tmp_base == "C":
gc_counts += 1
gc_percent = gc_counts / seq_length
return gc_percent
else:
print("Please make sure your sequence is a DNA sequence!")
测试脚本:test_sequence.py
from sequence import DnaSequence
#seq = input("Please input your sequence: ")
seq = "AGCTCGTACGTCGTACGTACGTACG"
my_seq = DnaSequence(seq)
print(my_seq.seq)
print(my_seq.seqlen)
print(my_seq.seqtype)
print(my_seq.seq_gc())
print(my_seq.rev_com())
my_seq.seq_element()
my_seq.sub_seq("CGT")
结果
print(my_seq.seq)
AGCTCGTACGTCGTACGTACGTACG
print(my_seq.seqlen)
25
print(my_seq.seqtype)
DNA
print(my_seq.seq_gc())
0.56
print(my_seq.rev_com())
CGTACGTACGTACGACGTACGAGCT
my_seq.seq_element()
1 A 5 0.2
2 G 7 0.28
3 C 7 0.28
4 T 6 0.24
my_seq.sub_seq("CGT")
the query seq exists in the ref seq, and the coordinates:
Num. Left Right
1 5 7
2 9 11
3 12 14
4 16 18
5 20 22
网友评论