细菌基因组草图组装完成后,需要过滤长度较短的contigs,脚本如下:
filter.py
#!/usr/bin/env python3
import sys
def usage():
print("\nThis script is written to filter short contigs in FASTA file.")
print("Useage:")
print("./filter.py <fasta> [length_threshold] > out.fa\n")
def printHelp():
if len(sys.argv) <= 1 or ("-h" in sys.argv) or ("--help" in sys.argv):
usage()
def checkInput(fasta):
for line in fasta:
if not line.startswith(">"):
print("Input file format error: input file must be in FASTA format.")
printHelp()
sys.exit(1)
else:
break
if __name__ == "__main__":
printHelp()
fh1 = open(sys.argv[1], "rt")
checkInput(fh1)
faDict = {}
fh1.seek(0)
for line in fh1:
if line.startswith(">"):
ID = line
faDict[ID] = []
else:
line = line.strip("\n")
faDict[ID].append(line)
len_threshold = sys.argv[2]
for key in faDict:
if len(faDict[key]) >= len_threshold:
print(key, end="")
print(fasta)
fh1.close()
使用方法
python filter.py /path/to/my/assembly.fasta 500 > /path/to/filtered/output.fasta
网友评论