GFF或GTF格式转bed

作者: JeremyL | 来源:发表于2022-09-07 14:02 被阅读0次

    # 1. gff2bed和gtf2bed

    首先,gff2bed 和 gtf2bed 都是 BEDOPS 工具集中的程序,所以使用之前需要先安装 BEDOPS。

    ## Linux平台安装BEDOPS

    $ git clone https://github.com/bedops/bedops.git
    $ cd bedops
    $ make
    $ make install
    

    make install 会把编译好的可执行文件放到源码目录下的 bin/ 中;再将它们复制到环境变量 PATH 包含的目录(如 /usr/local/bin,可能需要 sudo 权限):

    $ cp bin/* /usr/local/bin
    

    ## 使用

    示例所用的 GFF 文件和 GTF 文件均来自 GENCODE(v19 版注释):
    GFF文件:gencode.v19.annotation.gff3
    GTF文件:gencode.v19.annotation.gtf

    ### GFF格式

    gff2bed <gencode.v19.annotation.gff3 > test.bed
    convert2bed -i gff -o bed <gencode.v19.annotation.gff3 > test.bed
    
    • 文件内容查看
      gencode.v19.annotation.gff3
    ##gff-version 3
    #description: evidence-based annotation of the human genome (GRCh37), version 19 (Ensembl 74)
    #provider: GENCODE
    #contact: gencode@sanger.ac.uk
    #format: gff3
    #date: 2014-09-18
    ##sequence-region chr1 1 249250621
    chr1    HAVANA  gene    11869   14412   .       +       .       ID=ENSG00000223972.4;gene_id=ENSG00000223972.4;transcript_id=ENSG0000
    0223972.4;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript_type=pseudogene;transcript_status=KNOWN;transcript_name
    =DDX11L1;level=2;havana_gene=OTTHUMG00000000961.2
    chr1    HAVANA  transcript      11869   14409   .       +       .       ID=ENST00000456328.2;Parent=ENSG00000223972.4;gene_id=ENSG000
    00223972.4;transcript_id=ENST00000456328.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript_type=processed_transcr
    ipt;transcript_status=KNOWN;transcript_name=DDX11L1-002;level=2;havana_gene=OTTHUMG00000000961.2;havana_transcript=OTTHUMT00000362751
    .1;tag=basic
    chr1    HAVANA  exon    11869   12227   .       +       .       ID=exon:ENST00000456328.2:1;Parent=ENST00000456328.2;gene_id=ENSG0000
    0223972.4;transcript_id=ENST00000456328.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript_type=processed_transcri
    pt;transcript_status=KNOWN;transcript_name=DDX11L1-002;exon_number=1;exon_id=ENSE00002234944.1;level=2;havana_gene=OTTHUMG00000000961
    .2;havana_transcript=OTTHUMT00000362751.1;tag=basic
    chr1    HAVANA  exon    12613   12721   .       +       .       ID=exon:ENST00000456328.2:2;Parent=ENST00000456328.2;gene_id=ENSG0000
    0223972.4;transcript_id=ENST00000456328.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript_type=processed_transcri
    pt;transcript_status=KNOWN;transcript_name=DDX11L1-002;exon_number=2;exon_id=ENSE00003582793.1;level=2;havana_gene=OTTHUMG00000000961
    .2;havana_transcript=OTTHUMT00000362751.1;tag=basic
    

    test.bed(gff2bed 的输出已按坐标排序,且起始坐标由 GFF 的 1-based 转换为 BED 的 0-based,例如 11869 变为 11868)

    chr1    11868   12227   ENSG00000223972.4       .       +       HAVANA  exon    .       ID=exon:ENST00000456328.2:1;Parent=ENST000004
    56328.2;gene_id=ENSG00000223972.4;transcript_id=ENST00000456328.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript
    _type=processed_transcript;transcript_status=KNOWN;transcript_name=DDX11L1-002;exon_number=1;exon_id=ENSE00002234944.1;level=2;havana
    _gene=OTTHUMG00000000961.2;havana_transcript=OTTHUMT00000362751.1;tag=basic
    chr1    11868   14409   ENSG00000223972.4       .       +       HAVANA  transcript      .       ID=ENST00000456328.2;Parent=ENSG00000
    223972.4;gene_id=ENSG00000223972.4;transcript_id=ENST00000456328.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcrip
    t_type=processed_transcript;transcript_status=KNOWN;transcript_name=DDX11L1-002;level=2;havana_gene=OTTHUMG00000000961.2;havana_trans
    cript=OTTHUMT00000362751.1;tag=basic
    chr1    11868   14412   ENSG00000223972.4       .       +       HAVANA  gene    .       ID=ENSG00000223972.4;gene_id=ENSG00000223972.
    4;transcript_id=ENSG00000223972.4;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript_type=pseudogene;transcript_stat
    us=KNOWN;transcript_name=DDX11L1;level=2;havana_gene=OTTHUMG00000000961.2
    chr1    11871   12227   ENSG00000223972.4       .       +       ENSEMBL exon    .       ID=exon:ENST00000515242.2:1;Parent=ENST000005
    15242.2;gene_id=ENSG00000223972.4;transcript_id=ENST00000515242.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript
    _type=transcribed_unprocessed_pseudogene;transcript_status=KNOWN;transcript_name=DDX11L1-201;exon_number=1;exon_id=ENSE00002234632.1;
    level=3;havana_gene=OTTHUMG00000000961.2
    chr1    11871   14412   ENSG00000223972.4       .       +       ENSEMBL transcript      .       ID=ENST00000515242.2;Parent=ENSG00000
    223972.4;gene_id=ENSG00000223972.4;transcript_id=ENST00000515242.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcrip
    t_type=transcribed_unprocessed_pseudogene;transcript_status=KNOWN;transcript_name=DDX11L1-201;level=3;havana_gene=OTTHUMG00000000961.
    2
    

    ### GTF 格式

    gtf2bed <gencode.v19.annotation.gtf > test.bed
    convert2bed -i gtf -o bed <gencode.v19.annotation.gtf > test.bed
    

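    # 2. 自己写的shell命令

    注意:下面的命令直接输出 GTF/GFF 中的起始坐标(1-based),没有像 gff2bed/gtf2bed 那样减 1;如果需要标准 BED(起点 0-based、终点不变),请把 print 中的 $4 改为 $4-1。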

    ## GTF

    cat gencode.v19.annotation.gtf | awk -F '[\t *;]' '/^chr/{if($3=="transcript"){print $1,$4,$5,$10,$13,$22,$7,$3}}' OFS="\t" >test.bed
    
    cat gencode.v19.annotation.gtf |sed 's/;//' | awk -F '[\t *;]' '/^chr/{if($3=="transcript"){print $1,$4,$5,$10,$12,$21,$7,$3}}' OFS="\t" >test.bed
    

    ## GFF

    cat gencode.v19.annotation.gff3 | awk -F '[\t;]' '/^chr/{if($3=="exon"){print $1,$4,$5,$9,$11,$12,$15,$7,$3}}' OFS="\t" | sed -e 's/ID=//' -e 's/gene_id=//' -e 's/transcript_id=//' -e 's/gene_name=//' >test.bed
    

    相关文章

      网友评论

        本文标题:GFF或GTF格式转bed

        本文链接:https://www.haomeiwen.com/subject/zwjgnrtx.html