美文网首页一起生信啦啦啦
从ensembl提取intron序列 更新中

从ensembl提取intron序列 更新中

作者: 苏牧传媒 | 来源:发表于2018-10-23 15:51 被阅读4次

    从gff3提取intron的坐标信息:

    # 提取mRNA和exon信息:

    awk '{if ($3 == "mRNA" ) print $0}' Homo_sapiens.GRCh38.94.gff3 > mRNA.gff3

    awk '{if ($3 == "exon" ) print $0}' Homo_sapiens.GRCh38.94.gff3 > exon.gff3

    # 分别提取正负链的信息到bed:

    awk 'BEGIN{FS="\t|:|;|=";OFS="\t"} {if($7=="+") print "chr"$1,$4,$5,$14,$16,$7,$11}' mRNA.gff3 | awk -F '\t' -v OFS='\t' 'gsub(/-[0-9]+/,"",$5)' > mRNA.+.bed

    awk 'BEGIN{FS="\t|:|;|=";OFS="\t"} {if($7=="-") print "chr"$1,$4,$5,$14,$16,$7,$11}' mRNA.gff3 | awk -F '\t' -v OFS='\t' 'gsub(/-[0-9]+/,"",$5)' > mRNA.-.bed

    awk 'BEGIN{FS="\t|:|;|=";OFS="\t"} {if($7=="+") print "chr"$1,$4,$5,$11,"rank="$23,$7}' exon.gff3 > exon.+.bed

    awk 'BEGIN{FS="\t|:|;|=";OFS="\t"} {if($7=="-") print "chr"$1,$4,$5,$11,"rank="$23,$7}' exon.gff3 > exon.-.bed

    # 取mRNA的exon补集:

    bedtools subtract -a mRNA.+.bed -b exon.+.bed > intron.+.bed

    bedtools subtract -a mRNA.-.bed -b exon.-.bed > intron.-.bed

    # 取uniq的内含子坐标:

    cat intron.+.bed intron.-.bed | sort -k1,1 -k2,2n | awk '{if($2!=start || $3!=end)print; start=$2;end=$3}'> intron.bed

    # 构建成dict文件:

    awk -v OFS='\t' '{if($6=="+") print $7,"intron",$1":"$2".."$3,$1,$2,$3,$6,".","intron:xxx",$4,$4","$7,$3-$2; else print $7,"intron",$1":complement("$2".."$3")",$1,$3,$2,$6,".","intron:xxx",$4,$4","$7,$3-$2}' intron.bed | awk -v OFS='\t' 'gsub(/chr/,"",$3)' > intron.dict

    相关文章

      网友评论

        本文标题:从ensembl提取intron序列 更新中

        本文链接:https://www.haomeiwen.com/subject/sqkazftx.html