使用bioconductor会很简单。
我们只需要:
mart_11 <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl")
t2g <- biomaRt::getBM(attributes = c("ensembl_transcript_id", "ensembl_gene_id", "external_gene_name"), mart = mart_11)
t2g <- dplyr::rename(t2g, target_id = ensembl_transcript_id, ens_gene = ensembl_gene_id, ext_gene = external_gene_name)
write.table(t2g,file = "homo_transcrip_to_genes.txt",sep="\t",col.names = F,row.names = F,quote=F)
之后就可以在salmon,kallisto等软件中使用啦。
但是,得到的文件可能并不是和我们在ensemble上下载的gtf相对应的,所以不能使用。这是我们使用python来处理。
def create_transcript_list(input, use_name = True, use_version = True):
r = {}
for line in input:
if len(line) == 0 or line[0] == '#':
continue
l = line.strip().split('\t')
if l[2] == 'transcript':
info = l[8]
d = {}
for x in info.split('; '):
x = x.strip()
p = x.find(' ')
if p == -1:
continue
k = x[:p]
p = x.find('"',p)
p2 = x.find('"',p+1)
v = x[p+1:p2]
d[k] = v
if 'transcript_id' not in d or 'gene_id' not in d:
continue
tid = d['transcript_id']
gid = d['gene_id']
if use_version:
if 'transcript_version' not in d or 'gene_version' not in d:
continue
tid += '.' + d['transcript_version']
gid += '.' + d['gene_version']
gname = None
if use_name:
if 'gene_name' not in d:
continue
gname = d['gene_name']
if tid in r:
continue
r[tid] = (gid, gname)
return r
def print_output(output, r, use_name = True):
for tid in r:
if use_name:
output.write("%s\t%s\t%s\n"%(tid, r[tid][0], r[tid][1]))
else:
output.write("%s\t%s\n"%(tid, r[tid][0]))
with open('./homo_ercc.gtf') as file:
r = create_transcript_list(file, use_name = True, use_version = True)
with open('human_transcript_to_gene.tsv', "w+") as output:
print_output(output, r, use_name = True)
print('Created human_transcript_to_gene.tsv file')
网友评论