提取 genecode的gtf注释信息

作者: 落寞的橙子 | 来源:发表于2019-04-11 23:14 被阅读0次

    读入数据

    gtf <- rtracklayer::import('gencode.v22.annotation.gtf')#自行下载gtf注释文件
    gtf_df=as.data.frame(gtf) #转化为矩阵 这一步就可以随意操作了
    head(gtf_df)
      seqnames start   end width strand source       type score phase           gene_id
    1     chr1 11869 14409  2541      + HAVANA       gene    NA    NA ENSG00000223972.5
    2     chr1 11869 14409  2541      + HAVANA transcript    NA    NA ENSG00000223972.5
    3     chr1 11869 12227   359      + HAVANA       exon    NA    NA ENSG00000223972.5
    4     chr1 12613 12721   109      + HAVANA       exon    NA    NA ENSG00000223972.5
    5     chr1 13221 14409  1189      + HAVANA       exon    NA    NA ENSG00000223972.5
    6     chr1 12010 13670  1661      + HAVANA transcript    NA    NA ENSG00000223972.5
                               gene_type gene_status gene_name level          havana_gene     transcript_id
    1 transcribed_unprocessed_pseudogene       KNOWN   DDX11L1     2 OTTHUMG00000000961.2              <NA>
    2 transcribed_unprocessed_pseudogene       KNOWN   DDX11L1     2 OTTHUMG00000000961.2 ENST00000456328.2
    3 transcribed_unprocessed_pseudogene       KNOWN   DDX11L1     2 OTTHUMG00000000961.2 ENST00000456328.2
    4 transcribed_unprocessed_pseudogene       KNOWN   DDX11L1     2 OTTHUMG00000000961.2 ENST00000456328.2
    5 transcribed_unprocessed_pseudogene       KNOWN   DDX11L1     2 OTTHUMG00000000961.2 ENST00000456328.2
    6 transcribed_unprocessed_pseudogene       KNOWN   DDX11L1     2 OTTHUMG00000000961.2 ENST00000450305.2
                         transcript_type transcript_status transcript_name   tag transcript_support_level
    1                               <NA>              <NA>            <NA>  <NA>                     <NA>
    2               processed_transcript             KNOWN     DDX11L1-002 basic                        1
    3               processed_transcript             KNOWN     DDX11L1-002 basic                        1
    4               processed_transcript             KNOWN     DDX11L1-002 basic                        1
    5               processed_transcript             KNOWN     DDX11L1-002 basic                        1
    6 transcribed_unprocessed_pseudogene             KNOWN     DDX11L1-001 basic                       NA
         havana_transcript exon_number           exon_id         ont protein_id ccdsid
    1                 <NA>        <NA>              <NA>        <NA>       <NA>   <NA>
    2 OTTHUMT00000362751.1        <NA>              <NA>        <NA>       <NA>   <NA>
    3 OTTHUMT00000362751.1           1 ENSE00002234944.1        <NA>       <NA>   <NA>
    4 OTTHUMT00000362751.1           2 ENSE00003582793.1        <NA>       <NA>   <NA>
    5 OTTHUMT00000362751.1           3 ENSE00002312635.1        <NA>       <NA>   <NA>
    6 OTTHUMT00000002844.2        <NA>              <NA> PGO:0000019       <NA>   <NA>
    

    提取gene信息

    gene<-gtf_df[gtf_df$type=="gene",]
    head(gene)
       seqnames start   end width strand  source type score phase           gene_id
    1      chr1 11869 14409  2541      +  HAVANA gene    NA    NA ENSG00000223972.5
    13     chr1 14404 29570 15167      -  HAVANA gene    NA    NA ENSG00000227232.5
    26     chr1 17369 17436    68      - ENSEMBL gene    NA    NA ENSG00000278267.1
    29     chr1 29554 31109  1556      +  HAVANA gene    NA    NA ENSG00000243485.3
    37     chr1 30366 30503   138      + ENSEMBL gene    NA    NA ENSG00000274890.1
    40     chr1 34554 36081  1528      -  HAVANA gene    NA    NA ENSG00000237613.2
                                gene_type gene_status    gene_name level          havana_gene transcript_id
    1  transcribed_unprocessed_pseudogene       KNOWN      DDX11L1     2 OTTHUMG00000000961.2          <NA>
    13             unprocessed_pseudogene       KNOWN       WASH7P     2 OTTHUMG00000000958.1          <NA>
    26                              miRNA       KNOWN    MIR6859-3     3                 <NA>          <NA>
    29                            lincRNA       NOVEL RP11-34P13.3     2 OTTHUMG00000000959.2          <NA>
    37                              miRNA       KNOWN    MIR1302-9     3                 <NA>          <NA>
    40                            lincRNA       KNOWN      FAM138A     2 OTTHUMG00000000960.1          <NA>
       transcript_type transcript_status transcript_name        tag transcript_support_level havana_transcript
    1             <NA>              <NA>            <NA>       <NA>                     <NA>              <NA>
    13            <NA>              <NA>            <NA>       <NA>                     <NA>              <NA>
    26            <NA>              <NA>            <NA>       <NA>                     <NA>              <NA>
    29            <NA>              <NA>            <NA> ncRNA_host                     <NA>              <NA>
    37            <NA>              <NA>            <NA>       <NA>                     <NA>              <NA>
    40            <NA>              <NA>            <NA>       <NA>                     <NA>              <NA>
       exon_number exon_id  ont protein_id ccdsid
    1         <NA>    <NA> <NA>       <NA>   <NA>
    13        <NA>    <NA> <NA>       <NA>   <NA>
    26        <NA>    <NA> <NA>       <NA>   <NA>
    29        <NA>    <NA> <NA>       <NA>   <NA>
    37        <NA>    <NA> <NA>       <NA>   <NA>
    40        <NA>    <NA> <NA>       <NA>   <NA>
    

    获取想要的信息

    colnames(gene)
     [1] "seqnames"                 "start"                    "end"                     
     [4] "width"                    "strand"                   "source"                  
     [7] "type"                     "score"                    "phase"                   
    [10] "gene_id"                  "gene_type"                "gene_status"             
    [13] "gene_name"                "level"                    "havana_gene"             
    [16] "transcript_id"            "transcript_type"          "transcript_status"       
    [19] "transcript_name"          "tag"                      "transcript_support_level"
    [22] "havana_transcript"        "exon_number"              "exon_id"                 
    [25] "ont"                      "protein_id"               "ccdsid"   
    pick_info<-c("seqnames","start","end","width","strand","gene_id","gene_name")#提取自己想要的列
    ann<-gene[,pick_info]
    row.names(ann)<-as.character(ann$gene_id)
    head(ann)
                      seqnames start   end width strand           gene_id    gene_name
    ENSG00000223972.5     chr1 11869 14409  2541      + ENSG00000223972.5      DDX11L1
    ENSG00000227232.5     chr1 14404 29570 15167      - ENSG00000227232.5       WASH7P
    ENSG00000278267.1     chr1 17369 17436    68      - ENSG00000278267.1    MIR6859-3
    ENSG00000243485.3     chr1 29554 31109  1556      + ENSG00000243485.3 RP11-34P13.3
    ENSG00000274890.1     chr1 30366 30503   138      + ENSG00000274890.1    MIR1302-9
    ENSG00000237613.2     chr1 34554 36081  1528      - ENSG00000237613.2      FAM138A
    write.csv(ann,"gencode_v22_annotation_gene.csv") #输出结果
    

    写在最后的话

    很多大神用perl和python来提取,对于文本提取这两个语言有很大的优势,不过需要花时间取理解。平时常用R,同时实验也比较多,所以就用R来做,还没入坑的小朋友可以学python,会比较好很多。

    相关文章

      网友评论

        本文标题:提取 genecode的gtf注释信息

        本文链接:https://www.haomeiwen.com/subject/xhefwqtx.html