本文基于人类 GRCh38 基因组(Ensembl)和 GENCODE 最新的 v42 版本注释,提供 gene_id 与 gene_symbol(gene_name)、transcript_id 与 gene_symbol 的对应关系。
# Reference: https://www.jianshu.com/p/9e62f9148932
# Extract the gene_id -> gene_name mapping from the GTF annotation into g2s_vm42_gencode.txt,
# and the transcript_id -> gene_name mapping into t2s_vm42_gencode.txt.
# Work inside the ID-conversion directory (path is relative to the current directory).
cd ref/hum_GRCH38/idTransition
# Download the GENCODE human release 42 annotation GTF (gzip-compressed).
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_42/gencode.v42.chr_patch_hapl_scaff.annotation.gtf.gz
# Decompress it; by default gunzip replaces the .gz file with the plain .gtf that the script reads.
gunzip gencode.v42.chr_patch_hapl_scaff.annotation.gtf.gz
# Open an editor to create the extraction script; the lines that follow appear to be its contents.
vim gtf_geneid2symbol_gencode.sh
#!/bin/bash
# Build ID lookup tables from a GENCODE GTF annotation, for downstream
# ID conversion.
#
# Input (current directory):
#   gencode.v42.chr_patch_hapl_scaff.annotation.gtf
# Outputs (current directory, tab-separated, no header line):
#   g2s_vm42_gencode.txt   gene_id        <TAB> gene_name
#   t2s_vm42_gencode.txt   transcript_id  <TAB> gene_name
#
# Each distinct pair is written once, in order of first appearance in the
# GTF. The annotation is read in a single pass and no scratch files are
# created, so nothing else in the working directory is touched.

gtf="gencode.v42.chr_patch_hapl_scaff.annotation.gtf"

#######################################
# Extract both ID tables from a GTF file in one pass.
# Arguments:
#   $1 - path to the GTF annotation to read
#   $2 - output path for the gene_id -> gene_name table
#   $3 - output path for the transcript_id -> gene_name table
# Outputs:
#   Overwrites $2 and $3; awk diagnostics go to stderr.
# Returns:
#   Exit status of awk (non-zero if the input or outputs cannot be opened).
#######################################
build_id_maps() {
  local gtf_file=$1
  local gene_out=$2
  local tx_out=$3

  awk -F '\t' -v gene_out="$gene_out" -v tx_out="$tx_out" '
    # Return the quoted value of attribute "name" from column 9, or "" when
    # the attribute is absent. The pattern is anchored to the start of the
    # column or to a preceding ";" so that a key cannot match inside a
    # longer attribute name.
    function attr(name,    re, val) {
      re = "(^|;)[ \t]*" name " \"[^\"]*\""
      if (!match($9, re)) {
        return ""
      }
      val = substr($9, RSTART, RLENGTH)
      sub(/^[^"]*"/, "", val)
      sub(/"$/, "", val)
      return val
    }

    BEGIN {
      # Create/truncate both outputs up front so they exist even when the
      # annotation holds no matching records.
      printf "" > gene_out
      printf "" > tx_out
    }

    # Header and comment lines carry no attributes.
    /^#/ { next }

    {
      symbol = attr("gene_name")

      gene = attr("gene_id")
      if (gene != "") {
        row = gene "\t" symbol
        # Hash-based de-duplication: unlike uniq, this also drops repeats
        # that are not on adjacent lines, and keeps first-seen order.
        if (!seen_gene[row]++) {
          print row > gene_out
        }
      }

      # Gene-level records have no transcript_id and are skipped here.
      tx = attr("transcript_id")
      if (tx != "") {
        row = tx "\t" symbol
        if (!seen_tx[row]++) {
          print row > tx_out
        }
      }
    }
  ' "$gtf_file"
}

#######################################
# Entry point: validate the input, then write both tables.
# Globals:
#   gtf (read)
# Returns:
#   0 on success, non-zero if the GTF is unreadable or extraction fails.
#######################################
main() {
  if [[ ! -r "$gtf" ]]; then
    # Fail before creating any output so a missing download is not mistaken
    # for an empty mapping.
    printf 'error: cannot read GTF file: %s\n' "$gtf" >&2
    return 1
  fi
  build_id_maps "$gtf" g2s_vm42_gencode.txt t2s_vm42_gencode.txt
}

main
# Run the extraction script from the directory holding the unpacked GTF;
# it writes g2s_vm42_gencode.txt and t2s_vm42_gencode.txt there.
bash gtf_geneid2symbol_gencode.sh
网友评论