less -SN /data/server/reference/gtf/gencode/gencode.vM12.annotation.gtf|awk -F '\t' '{print $3}'|sort -u|uniq
#总共这几类
CDS
exon
gene
Selenocysteine
start_codon
stop_codon
transcript
UTR
featureCounts设置了很多的参数,我的测序数据是RNA-seq,包含了mRNA=protein_coding RNA,LncRNA这些,所以,我想把它们区分开
nohup /public/vip/biosoft/subread-2.0.2-Linux-x86_64/bin/featureCounts -T 20 -p -t exon -g gene_id -a $gtf --extraAttributes gene_name,gene_type -o all.id.20210531.counts *.bam > re20210531.id.log 2>&1
特别感谢小郭郭,提示我加了个参数 --extraAttributes gene_name,gene_type
运行结束后,产生了下列的文件
ls -hl
#-rw-rw-r-- 1 vip vip 19K 5月 31 23:00 re20210531.id.log
#-rw-rw-r-- 1 vip vip 2.6K 5月 31 23:00 all.id.20210531.counts.summary
#-rw-rw-r-- 1 vip vip 24M 5月 31 23:00 all.id.20210531.counts
all.id.20210531.counts
统计下总共有多少类型的RNA
less -SN all.id.20210531.counts|awk -F '\t' '{print $8}'|sort -u|uniq
3prime_overlapping_ncRNA
antisense
bidirectional_promoter_lncRNA
gene_type
IG_C_gene
IG_C_pseudogene
IG_D_gene
IG_D_pseudogene
IG_J_gene
IG_LV_gene
IG_pseudogene
IG_V_gene
IG_V_pseudogene
lincRNA
macro_lncRNA
miRNA
misc_RNA
Mt_rRNA
Mt_tRNA
polymorphic_pseudogene
processed_pseudogene
processed_transcript
protein_coding
pseudogene
ribozyme
rRNA
scaRNA
scRNA
sense_intronic
sense_overlapping
snoRNA
snRNA
sRNA
TEC
transcribed_processed_pseudogene
transcribed_unitary_pseudogene
transcribed_unprocessed_pseudogene
TR_C_gene
TR_D_gene
TR_J_gene
TR_J_pseudogene
TR_V_gene
TR_V_pseudogene
unitary_pseudogene
unprocessed_pseudogene
统计下protein_coding和lincRNA
less -SN all.id.20210531.counts|wc
49587 1586793 25120021
less -SN all.id.20210531.counts|awk '/protein_coding/{print $0}' |wc
21973 703136 20579223
less -SN all.id.20210531.counts|awk '/lincRNA/{print $0}' |wc
4255 136160 909306
less -SN all.id.20210531.counts|awk '/miRNA/{print $0}' |wc
2202 70464 253808
对输出结果进行去除,使用Linux提取特定的列
sed -i '#Aligned.sortedByCoord.out.bam##g' all.id.20210531.counts
less -SN counts_featureeCounts0531.txt |cut -f 1,7-|less -SN
输出结果
将输出的表达矩阵按照不同类型提取出来,分别做下游分析
提取出编码蛋白的featureCounts
less -SN counts_featureeCounts0531.txt |cut -f 1,7-|awk '/protein_coding/{print $0}' |less -SN
less -SN counts_featureeCounts0531.txt |cut -f 1,7-|awk '/protein_coding/{print $0}'>protein_coding_Counts0531.txt
提取出mRNA-蛋白表达的矩阵出来
提取出lincRNA的featureCounts
less -SN counts_featureeCounts0531.txt |cut -f 1,7-|awk '/lincRNA/{print $0}'>lincRNA_Counts0531.txt
lincRNA
#删除第一行
sed -i '1d' filename
#范围删除,删除1-3行
sed -i '1,3d' filename
#删除第n行
sed -i 'nd' filename
#删除最后一行
sed -i '$d' filename
发现生成的文件没有了第一行,所以把第一行给添加进去
head -1 counts0531.txt
#复制内容
回帖到文件的第一行
sed -i "1iGeneid\tgene_name\tgene_type\t1-PC-LCJ9004\t2-CP-LCJ9001\t4-CP-LCJ9003\tA-CO-LCG0611\tA-CP-LCG0596\tA-M-LCG0591\tA-OL-LCG0606\tA-PC-LCG0601\t B-CO-LCG0612\tB-M-LCG0592\tB-OL-LCG0607\tB-PC-LCG0602\tC-CO-LCG0613\tC-M-LCG0593\tC-OL-LCG0608\tCP-3-LDA14877\tD-CO-LCG0614\tD-M-LCG0594\tD-OL-LCG0609\tD-PC-LCG0604\tE-CO-LCG0615\tE-M-LCG0595\tE-OL-LCG0610\tE-PC-LCG0605" protein_coding_Counts0531.txt
首行回来了
统计各个种类的RNA频率
less -SN counts0531.txt|cut -f 3|sort|uniq -c
2 3prime_overlapping_ncRNA
2419 antisense
61 bidirectional_promoter_lncRNA
1 gene_type
13 IG_C_gene
1 IG_C_pseudogene
19 IG_D_gene
3 IG_D_pseudogene
14 IG_J_gene
4 IG_LV_gene
2 IG_pseudogene
218 IG_V_gene
155 IG_V_pseudogene
4255 lincRNA
1 macro_lncRNA
2202 miRNA
564 misc_RNA
2 Mt_rRNA
22 Mt_tRNA
77 polymorphic_pseudogene
7289 processed_pseudogene
753 processed_transcript
21973 protein_coding
99 pseudogene
22 ribozyme
354 rRNA
51 scaRNA
1 scRNA
270 sense_intronic
25 sense_overlapping
1508 snoRNA
1383 snRNA
2 sRNA
2695 TEC
197 transcribed_processed_pseudogene
8 transcribed_unitary_pseudogene
191 transcribed_unprocessed_pseudogene
8 TR_C_gene
4 TR_D_gene
70 TR_J_gene
10 TR_J_pseudogene
144 TR_V_gene
34 TR_V_pseudogene
26 unitary_pseudogenqe
2434 unprocessed_pseudogene
网友评论