读入数据
gtf <- rtracklayer::import('gencode.v22.annotation.gtf')#自行下载gtf注释文件
gtf_df=as.data.frame(gtf) #转化为矩阵 这一步就可以随意操作了
head(gtf_df)
seqnames start end width strand source type score phase gene_id
1 chr1 11869 14409 2541 + HAVANA gene NA NA ENSG00000223972.5
2 chr1 11869 14409 2541 + HAVANA transcript NA NA ENSG00000223972.5
3 chr1 11869 12227 359 + HAVANA exon NA NA ENSG00000223972.5
4 chr1 12613 12721 109 + HAVANA exon NA NA ENSG00000223972.5
5 chr1 13221 14409 1189 + HAVANA exon NA NA ENSG00000223972.5
6 chr1 12010 13670 1661 + HAVANA transcript NA NA ENSG00000223972.5
gene_type gene_status gene_name level havana_gene transcript_id
1 transcribed_unprocessed_pseudogene KNOWN DDX11L1 2 OTTHUMG00000000961.2 <NA>
2 transcribed_unprocessed_pseudogene KNOWN DDX11L1 2 OTTHUMG00000000961.2 ENST00000456328.2
3 transcribed_unprocessed_pseudogene KNOWN DDX11L1 2 OTTHUMG00000000961.2 ENST00000456328.2
4 transcribed_unprocessed_pseudogene KNOWN DDX11L1 2 OTTHUMG00000000961.2 ENST00000456328.2
5 transcribed_unprocessed_pseudogene KNOWN DDX11L1 2 OTTHUMG00000000961.2 ENST00000456328.2
6 transcribed_unprocessed_pseudogene KNOWN DDX11L1 2 OTTHUMG00000000961.2 ENST00000450305.2
transcript_type transcript_status transcript_name tag transcript_support_level
1 <NA> <NA> <NA> <NA> <NA>
2 processed_transcript KNOWN DDX11L1-002 basic 1
3 processed_transcript KNOWN DDX11L1-002 basic 1
4 processed_transcript KNOWN DDX11L1-002 basic 1
5 processed_transcript KNOWN DDX11L1-002 basic 1
6 transcribed_unprocessed_pseudogene KNOWN DDX11L1-001 basic NA
havana_transcript exon_number exon_id ont protein_id ccdsid
1 <NA> <NA> <NA> <NA> <NA> <NA>
2 OTTHUMT00000362751.1 <NA> <NA> <NA> <NA> <NA>
3 OTTHUMT00000362751.1 1 ENSE00002234944.1 <NA> <NA> <NA>
4 OTTHUMT00000362751.1 2 ENSE00003582793.1 <NA> <NA> <NA>
5 OTTHUMT00000362751.1 3 ENSE00002312635.1 <NA> <NA> <NA>
6 OTTHUMT00000002844.2 <NA> <NA> PGO:0000019 <NA> <NA>
提取gene信息
gene<-gtf_df[gtf_df$type=="gene",]
head(gene)
seqnames start end width strand source type score phase gene_id
1 chr1 11869 14409 2541 + HAVANA gene NA NA ENSG00000223972.5
13 chr1 14404 29570 15167 - HAVANA gene NA NA ENSG00000227232.5
26 chr1 17369 17436 68 - ENSEMBL gene NA NA ENSG00000278267.1
29 chr1 29554 31109 1556 + HAVANA gene NA NA ENSG00000243485.3
37 chr1 30366 30503 138 + ENSEMBL gene NA NA ENSG00000274890.1
40 chr1 34554 36081 1528 - HAVANA gene NA NA ENSG00000237613.2
gene_type gene_status gene_name level havana_gene transcript_id
1 transcribed_unprocessed_pseudogene KNOWN DDX11L1 2 OTTHUMG00000000961.2 <NA>
13 unprocessed_pseudogene KNOWN WASH7P 2 OTTHUMG00000000958.1 <NA>
26 miRNA KNOWN MIR6859-3 3 <NA> <NA>
29 lincRNA NOVEL RP11-34P13.3 2 OTTHUMG00000000959.2 <NA>
37 miRNA KNOWN MIR1302-9 3 <NA> <NA>
40 lincRNA KNOWN FAM138A 2 OTTHUMG00000000960.1 <NA>
transcript_type transcript_status transcript_name tag transcript_support_level havana_transcript
1 <NA> <NA> <NA> <NA> <NA> <NA>
13 <NA> <NA> <NA> <NA> <NA> <NA>
26 <NA> <NA> <NA> <NA> <NA> <NA>
29 <NA> <NA> <NA> ncRNA_host <NA> <NA>
37 <NA> <NA> <NA> <NA> <NA> <NA>
40 <NA> <NA> <NA> <NA> <NA> <NA>
exon_number exon_id ont protein_id ccdsid
1 <NA> <NA> <NA> <NA> <NA>
13 <NA> <NA> <NA> <NA> <NA>
26 <NA> <NA> <NA> <NA> <NA>
29 <NA> <NA> <NA> <NA> <NA>
37 <NA> <NA> <NA> <NA> <NA>
40 <NA> <NA> <NA> <NA> <NA>
获取想要的信息
colnames(gene)
[1] "seqnames" "start" "end"
[4] "width" "strand" "source"
[7] "type" "score" "phase"
[10] "gene_id" "gene_type" "gene_status"
[13] "gene_name" "level" "havana_gene"
[16] "transcript_id" "transcript_type" "transcript_status"
[19] "transcript_name" "tag" "transcript_support_level"
[22] "havana_transcript" "exon_number" "exon_id"
[25] "ont" "protein_id" "ccdsid"
pick_info<-c("seqnames","start","end","width","strand","gene_id","gene_name")#提取自己想要的列
ann<-gene[,pick_info]
row.names(ann)<-as.character(ann$gene_id)
head(ann)
seqnames start end width strand gene_id gene_name
ENSG00000223972.5 chr1 11869 14409 2541 + ENSG00000223972.5 DDX11L1
ENSG00000227232.5 chr1 14404 29570 15167 - ENSG00000227232.5 WASH7P
ENSG00000278267.1 chr1 17369 17436 68 - ENSG00000278267.1 MIR6859-3
ENSG00000243485.3 chr1 29554 31109 1556 + ENSG00000243485.3 RP11-34P13.3
ENSG00000274890.1 chr1 30366 30503 138 + ENSG00000274890.1 MIR1302-9
ENSG00000237613.2 chr1 34554 36081 1528 - ENSG00000237613.2 FAM138A
write.csv(ann,"gencode_v22_annotation_gene.csv") #输出结果
写在最后的话
很多大神用perl和python来提取,对于文本提取这两个语言有很大的优势,不过需要花时间取理解。平时常用R,同时实验也比较多,所以就用R来做,还没入坑的小朋友可以学python,会比较好很多。
网友评论