全长转录组已经学完并分析完了,前面码代码和理顺思路也不容易,分享一部分自己的流程给感兴趣的朋友。
- 每个转录本有多少个CDS区域
# Count, per transcript (GFF3 column 1), how many lines mention "CDS".
grep "CDS" sample0.nr.transdecoder.gff3 \
  | awk -F'\t' '{print $1}' \
  | sort \
  | uniq -c \
  > sample0.cds.id.number
- 提取所有有CDS的序列
# List every transcript ID (GFF3 column 1) that has at least one line
# mentioning "CDS". `sort -u` replaces the former `uniq -c | tr | cut -f8`
# parsing, which only worked while every count was a single digit, and the
# pipeline now redirects directly into the file (in bash `... | > file`
# discards the data and leaves the file empty).
grep "CDS" sample0.nr.transdecoder.gff3 | cut -f1 | sort -u > sample0.cds.id
- 找既能被nr数据库注释到又有CDS的序列
cd /mnt/c/Users/TE/Desktop/全长转录组/绍梅师姐/sample0/uncompare/数据库注释/nr-animal
# Keep the IDs present in BOTH lists (nr-annotated IDs and CDS-bearing IDs).
# Each list is de-duplicated first, so an ID repeated inside one file can no
# longer be mistaken for a shared ID, and no fixed-width parsing of
# `uniq -c` output is needed.
# NOTE(review): assumes nr.zhushi holds one transcript ID per line - confirm.
comm -12 <(sort -u nr.zhushi) <(sort -u ../../CDS/nr/sample0.cds.id) > nr-CDS.id
- 找出共有序列注释到nr数据库的具体信息
把nr-CDS.id 上传至服务器/public/jychu/sandai-goose/sample0-0/ccs/fl/flnc/clustered/uncompare/zhushi/nr
cd /public/jychu/sandai-goose/sample0-0/ccs/fl/flnc/clustered/uncompare/zhushi/nr
# Create the helper script non-interactively so the step is reproducible.
cat > nr-CDS.id.sh <<'EOF'
#!/usr/bin/env bash
# For every ID in nr-CDS.id (first whitespace-separated token of each line),
# print the lines of sample0.nr.final.zs that contain the ID as a whole word.
# Output is grouped in the order of the ID list. The result file is rewritten
# on each run, so re-running never duplicates records.
while read -r geneid _ || [ -n "$geneid" ]; do
  geneid=${geneid%$'\r'}            # tolerate CRLF line endings
  [ -n "$geneid" ] || continue      # a blank ID would match every line
  # -F: treat the ID literally (a "." must not act as a regex wildcard).
  grep -w -F -- "$geneid" sample0.nr.final.zs
done < nr-CDS.id > nr-CDS.protein
EOF
chmod a+x nr-CDS.id.sh
nohup ./nr-CDS.id.sh
# Extract the database IDs of sample 0 transcripts that both have a CDS and
# are annotated in nr; download the result for the Venn diagram.
# The ID is the part of column 2 before the first ":". De-duplication happens
# AFTER the suffix is stripped, so an ID is listed only once even when its
# records differ after the colon.
cut -f2 nr-CDS.protein | cut -d: -f1 | sort -u > sample0-nr-cds.id
sample1
cd /mnt/c/Users/TE/Desktop/全长转录组/绍梅师姐/sample1/uncompare/CDS/nr
# Count, per transcript (GFF3 column 1), how many lines mention "CDS".
grep "CDS" sample1.nr.transdecoder.gff3 | cut -f1 | sort | uniq -c > sample1.cds.id.number
# List every transcript ID that has at least one such line. `sort -u` avoids
# the fixed-width parsing of `uniq -c` output (which broke for counts >= 10),
# and the pipeline redirects directly into the file (in bash `... | > file`
# leaves the file empty).
grep "CDS" sample1.nr.transdecoder.gff3 | cut -f1 | sort -u > sample1.cds.id
- 找既能被nr数据库注释到又有CDS的序列
cd /mnt/c/Users/TE/Desktop/全长转录组/绍梅师姐/sample1/uncompare/数据库注释/nr
# Keep the IDs present in BOTH lists (nr-annotated IDs and CDS-bearing IDs).
# Each list is de-duplicated first, so an ID repeated inside one file can no
# longer be mistaken for a shared ID, and no fixed-width parsing of
# `uniq -c` output is needed.
# NOTE(review): assumes nr.zhushi holds one transcript ID per line - confirm.
comm -12 <(sort -u nr.zhushi) <(sort -u ../../CDS/nr/sample1.cds.id) > nr-CDS.id
- 找出共有序列注释到nr数据库的具体信息
把nr-CDS.id 上传至服务器/public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare/zhushi/nr
cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare/zhushi/nr
# Create the helper script non-interactively so the step is reproducible.
cat > nr-CDS.id.sh <<'EOF'
#!/usr/bin/env bash
# For every ID in nr-CDS.id (first whitespace-separated token of each line),
# print the lines of sample1.nr.final.zs that contain the ID as a whole word.
# Output is grouped in the order of the ID list. The result file is rewritten
# on each run, so re-running never duplicates records.
while read -r geneid _ || [ -n "$geneid" ]; do
  geneid=${geneid%$'\r'}            # tolerate CRLF line endings
  [ -n "$geneid" ] || continue      # a blank ID would match every line
  # -F: treat the ID literally (a "." must not act as a regex wildcard).
  grep -w -F -- "$geneid" sample1.nr.final.zs
done < nr-CDS.id > nr-CDS.protein
EOF
chmod a+x nr-CDS.id.sh
./nr-CDS.id.sh
# Extract the database IDs of sample 1 transcripts that both have a CDS and
# are annotated in nr; download the result for the Venn diagram.
# The ID is the part of column 2 before the first ":". De-duplication happens
# AFTER the suffix is stripped, so an ID is listed only once even when its
# records differ after the colon.
cut -f2 nr-CDS.protein | cut -d: -f1 | sort -u > sample1-nr-cds.id
sample2
cd /mnt/c/Users/TE/Desktop/全长转录组/绍梅师姐/sample2/uncompare/CDS/nr
# Count, per transcript (GFF3 column 1), how many lines mention "CDS".
grep "CDS" sample2.nr.transdecoder.gff3 | cut -f1 | sort | uniq -c > sample2.cds.id.number
# List every transcript ID that has at least one such line. `sort -u` avoids
# the fixed-width parsing of `uniq -c` output (which broke for counts >= 10),
# and the pipeline redirects directly into the file (in bash `... | > file`
# leaves the file empty).
grep "CDS" sample2.nr.transdecoder.gff3 | cut -f1 | sort -u > sample2.cds.id
- 找既能被nr数据库注释到又有CDS的序列
cd /mnt/c/Users/TE/Desktop/全长转录组/绍梅师姐/sample2/uncompare/数据库注释/nr
# Keep the IDs present in BOTH lists (nr-annotated IDs and CDS-bearing IDs).
# Each list is de-duplicated first, so an ID repeated inside one file can no
# longer be mistaken for a shared ID, and no fixed-width parsing of
# `uniq -c` output is needed.
# NOTE(review): assumes nr.zhushi holds one transcript ID per line - confirm.
comm -12 <(sort -u nr.zhushi) <(sort -u ../../CDS/nr/sample2.cds.id) > nr-CDS.id
- 找出共有序列注释到nr数据库的具体信息
把nr-CDS.id 上传至服务器/public/jychu/sandai-goose/sample2/ccs/fl/flnc/clustered/uncompare/zhushi/nr
cd /public/jychu/sandai-goose/sample2/ccs/fl/flnc/clustered/uncompare/zhushi/nr
# Create the helper script non-interactively so the step is reproducible.
cat > nr-CDS.id.sh <<'EOF'
#!/usr/bin/env bash
# For every ID in nr-CDS.id (first whitespace-separated token of each line),
# print the lines of sample2.nr.final.zs that contain the ID as a whole word.
# Output is grouped in the order of the ID list. The result file is rewritten
# on each run, so re-running never duplicates records.
while read -r geneid _ || [ -n "$geneid" ]; do
  geneid=${geneid%$'\r'}            # tolerate CRLF line endings
  [ -n "$geneid" ] || continue      # a blank ID would match every line
  # -F: treat the ID literally (a "." must not act as a regex wildcard).
  grep -w -F -- "$geneid" sample2.nr.final.zs
done < nr-CDS.id > nr-CDS.protein
EOF
chmod a+x nr-CDS.id.sh
./nr-CDS.id.sh
# Extract the database IDs of sample 2 transcripts that both have a CDS and
# are annotated in nr; download the result for the Venn diagram.
# The ID is the part of column 2 before the first ":". De-duplication happens
# AFTER the suffix is stripped, so an ID is listed only once even when its
# records differ after the colon.
cut -f2 nr-CDS.protein | cut -d: -f1 | sort -u > sample2-nr-cds.id
看三个样本特有和共有的数据库id,用韦恩图
分类为:
1.三个样本共有
2.样本0和样本1共有
3.样本0和样本2共有
4.样本1和样本2共有
5.样本0特有
6.样本1特有
7.样本2特有
- 新建文件夹
# Gather the three per-sample nr-CDS.protein tables in one working directory.
# -p keeps the step re-runnable when the directory already exists.
mkdir -p ~/sandai-final
cp /public/jychu/sandai-goose/sample0-0/ccs/fl/flnc/clustered/uncompare/zhushi/nr/nr-CDS.protein ~/sandai-final/sample0.protein
cp /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare/zhushi/nr/nr-CDS.protein ~/sandai-final/sample1.protein
cp /public/jychu/sandai-goose/sample2/ccs/fl/flnc/clustered/uncompare/zhushi/nr/nr-CDS.protein ~/sandai-final/sample2.protein
# The ID lists and helper scripts that follow refer to sample*.protein by
# relative path, so they must be created and run inside this directory.
cd ~/sandai-final
# Create the seven ID lists by hand, one ID per line.
# NOTE(review): the IDs are presumably pasted from the Venn diagram result - confirm.
vi sample012.protein.id #IDs shared by all three samples
vi sample01.protein.id #IDs shared by sample 0 and sample 1
vi sample02.protein.id #IDs shared by sample 0 and sample 2
vi sample12.protein.id #IDs shared by sample 1 and sample 2
vi sample0.protein.id #IDs unique to sample 0
vi sample1.protein.id #IDs unique to sample 1
vi sample2.protein.id #IDs unique to sample 2
# Extract the sample 0 records for the IDs shared by all three samples.
# The helper script is written non-interactively so the step is reproducible.
cat > sample012.protein.id.sample0.sh <<'EOF'
#!/usr/bin/env bash
# For every ID in sample012.protein.id (first token of each line), print the
# lines of sample0.protein containing that ID as a whole word. Records stay
# grouped in ID-list order. The output is rewritten on each run, so re-running
# never duplicates records.
while read -r geneid _ || [ -n "$geneid" ]; do
  geneid=${geneid%$'\r'}            # tolerate CRLF line endings
  [ -n "$geneid" ] || continue      # a blank ID would match every line
  # -F: treat the ID literally (a "." must not act as a regex wildcard).
  grep -w -F -- "$geneid" sample0.protein
done < sample012.protein.id > sample012.protein.id.sample0.protein
EOF
chmod a+x sample012.protein.id.sample0.sh
./sample012.protein.id.sample0.sh
# Extract the sample 1 records for the IDs shared by all three samples.
# The helper script is written non-interactively so the step is reproducible.
cat > sample012.protein.id.sample1.sh <<'EOF'
#!/usr/bin/env bash
# For every ID in sample012.protein.id (first token of each line), print the
# lines of sample1.protein containing that ID as a whole word. Records stay
# grouped in ID-list order. The output is rewritten on each run, so re-running
# never duplicates records.
while read -r geneid _ || [ -n "$geneid" ]; do
  geneid=${geneid%$'\r'}            # tolerate CRLF line endings
  [ -n "$geneid" ] || continue      # a blank ID would match every line
  # -F: treat the ID literally (a "." must not act as a regex wildcard).
  grep -w -F -- "$geneid" sample1.protein
done < sample012.protein.id > sample012.protein.id.sample1.protein
EOF
chmod a+x sample012.protein.id.sample1.sh
./sample012.protein.id.sample1.sh
# Extract the sample 2 records for the IDs shared by all three samples.
# The helper script is written non-interactively so the step is reproducible.
cat > sample012.protein.id.sample2.sh <<'EOF'
#!/usr/bin/env bash
# For every ID in sample012.protein.id (first token of each line), print the
# lines of sample2.protein containing that ID as a whole word. Records stay
# grouped in ID-list order. The output is rewritten on each run, so re-running
# never duplicates records.
while read -r geneid _ || [ -n "$geneid" ]; do
  geneid=${geneid%$'\r'}            # tolerate CRLF line endings
  [ -n "$geneid" ] || continue      # a blank ID would match every line
  # -F: treat the ID literally (a "." must not act as a regex wildcard).
  grep -w -F -- "$geneid" sample2.protein
done < sample012.protein.id > sample012.protein.id.sample2.protein
EOF
chmod a+x sample012.protein.id.sample2.sh
./sample012.protein.id.sample2.sh
# Extract the sample 0 records for the IDs shared by samples 0 and 1.
# The helper script is written non-interactively so the step is reproducible.
cat > sample01.protein.id.sample0.sh <<'EOF'
#!/usr/bin/env bash
# For every ID in sample01.protein.id (first token of each line), print the
# lines of sample0.protein containing that ID as a whole word. Records stay
# grouped in ID-list order. The output is rewritten on each run, so re-running
# never duplicates records.
while read -r geneid _ || [ -n "$geneid" ]; do
  geneid=${geneid%$'\r'}            # tolerate CRLF line endings
  [ -n "$geneid" ] || continue      # a blank ID would match every line
  # -F: treat the ID literally (a "." must not act as a regex wildcard).
  grep -w -F -- "$geneid" sample0.protein
done < sample01.protein.id > sample01.protein.id.sample0.protein
EOF
chmod a+x sample01.protein.id.sample0.sh
./sample01.protein.id.sample0.sh
# Extract the sample 1 records for the IDs shared by samples 0 and 1.
# The helper script is written non-interactively so the step is reproducible.
cat > sample01.protein.id.sample1.sh <<'EOF'
#!/usr/bin/env bash
# For every ID in sample01.protein.id (first token of each line), print the
# lines of sample1.protein containing that ID as a whole word. Records stay
# grouped in ID-list order. The output is rewritten on each run, so re-running
# never duplicates records.
while read -r geneid _ || [ -n "$geneid" ]; do
  geneid=${geneid%$'\r'}            # tolerate CRLF line endings
  [ -n "$geneid" ] || continue      # a blank ID would match every line
  # -F: treat the ID literally (a "." must not act as a regex wildcard).
  grep -w -F -- "$geneid" sample1.protein
done < sample01.protein.id > sample01.protein.id.sample1.protein
EOF
chmod a+x sample01.protein.id.sample1.sh
./sample01.protein.id.sample1.sh
# Extract the sample 0 records for the IDs shared by samples 0 and 2.
# The helper script is written non-interactively so the step is reproducible.
cat > sample02.protein.id.sample0.sh <<'EOF'
#!/usr/bin/env bash
# For every ID in sample02.protein.id (first token of each line), print the
# lines of sample0.protein containing that ID as a whole word. Records stay
# grouped in ID-list order. The output is rewritten on each run, so re-running
# never duplicates records.
while read -r geneid _ || [ -n "$geneid" ]; do
  geneid=${geneid%$'\r'}            # tolerate CRLF line endings
  [ -n "$geneid" ] || continue      # a blank ID would match every line
  # -F: treat the ID literally (a "." must not act as a regex wildcard).
  grep -w -F -- "$geneid" sample0.protein
done < sample02.protein.id > sample02.protein.id.sample0.protein
EOF
chmod a+x sample02.protein.id.sample0.sh
./sample02.protein.id.sample0.sh
# Extract the sample 2 records for the IDs shared by samples 0 and 2.
# The helper script is written non-interactively so the step is reproducible.
cat > sample02.protein.id.sample2.sh <<'EOF'
#!/usr/bin/env bash
# For every ID in sample02.protein.id (first token of each line), print the
# lines of sample2.protein containing that ID as a whole word. Records stay
# grouped in ID-list order. The output is rewritten on each run, so re-running
# never duplicates records.
while read -r geneid _ || [ -n "$geneid" ]; do
  geneid=${geneid%$'\r'}            # tolerate CRLF line endings
  [ -n "$geneid" ] || continue      # a blank ID would match every line
  # -F: treat the ID literally (a "." must not act as a regex wildcard).
  grep -w -F -- "$geneid" sample2.protein
done < sample02.protein.id > sample02.protein.id.sample2.protein
EOF
chmod a+x sample02.protein.id.sample2.sh
./sample02.protein.id.sample2.sh
# Extract the sample 1 records for the IDs shared by samples 1 and 2.
# The helper script is written non-interactively so the step is reproducible.
cat > sample12.protein.id.sample1.sh <<'EOF'
#!/usr/bin/env bash
# For every ID in sample12.protein.id (first token of each line), print the
# lines of sample1.protein containing that ID as a whole word. Records stay
# grouped in ID-list order. The output is rewritten on each run, so re-running
# never duplicates records.
while read -r geneid _ || [ -n "$geneid" ]; do
  geneid=${geneid%$'\r'}            # tolerate CRLF line endings
  [ -n "$geneid" ] || continue      # a blank ID would match every line
  # -F: treat the ID literally (a "." must not act as a regex wildcard).
  grep -w -F -- "$geneid" sample1.protein
done < sample12.protein.id > sample12.protein.id.sample1.protein
EOF
chmod a+x sample12.protein.id.sample1.sh
./sample12.protein.id.sample1.sh
# Extract the sample 2 records for the IDs shared by samples 1 and 2.
# The helper script is written non-interactively so the step is reproducible.
cat > sample12.protein.id.sample2.sh <<'EOF'
#!/usr/bin/env bash
# For every ID in sample12.protein.id (first token of each line), print the
# lines of sample2.protein containing that ID as a whole word. Records stay
# grouped in ID-list order. The output is rewritten on each run, so re-running
# never duplicates records.
while read -r geneid _ || [ -n "$geneid" ]; do
  geneid=${geneid%$'\r'}            # tolerate CRLF line endings
  [ -n "$geneid" ] || continue      # a blank ID would match every line
  # -F: treat the ID literally (a "." must not act as a regex wildcard).
  grep -w -F -- "$geneid" sample2.protein
done < sample12.protein.id > sample12.protein.id.sample2.protein
EOF
chmod a+x sample12.protein.id.sample2.sh
./sample12.protein.id.sample2.sh
# Extract the sample 0 records for the IDs unique to sample 0.
# The helper script is written non-interactively so the step is reproducible.
cat > sample0.protein.id.sh <<'EOF'
#!/usr/bin/env bash
# For every ID in sample0.protein.id (first token of each line), print the
# lines of sample0.protein containing that ID as a whole word. Records stay
# grouped in ID-list order. The output is rewritten on each run, so re-running
# never duplicates records.
while read -r geneid _ || [ -n "$geneid" ]; do
  geneid=${geneid%$'\r'}            # tolerate CRLF line endings
  [ -n "$geneid" ] || continue      # a blank ID would match every line
  # -F: treat the ID literally (a "." must not act as a regex wildcard).
  grep -w -F -- "$geneid" sample0.protein
done < sample0.protein.id > sample0.protein.id.protein
EOF
chmod a+x sample0.protein.id.sh
./sample0.protein.id.sh
# Extract the sample 1 records for the IDs unique to sample 1.
# The helper script is written non-interactively so the step is reproducible.
cat > sample1.protein.id.sh <<'EOF'
#!/usr/bin/env bash
# For every ID in sample1.protein.id (first token of each line), print the
# lines of sample1.protein containing that ID as a whole word. Records stay
# grouped in ID-list order. The output is rewritten on each run, so re-running
# never duplicates records.
while read -r geneid _ || [ -n "$geneid" ]; do
  geneid=${geneid%$'\r'}            # tolerate CRLF line endings
  [ -n "$geneid" ] || continue      # a blank ID would match every line
  # -F: treat the ID literally (a "." must not act as a regex wildcard).
  grep -w -F -- "$geneid" sample1.protein
done < sample1.protein.id > sample1.protein.id.protein
EOF
chmod a+x sample1.protein.id.sh
./sample1.protein.id.sh
# Extract the sample 2 records for the IDs unique to sample 2.
# The helper script is written non-interactively so the step is reproducible.
cat > sample2.protein.id.sh <<'EOF'
#!/usr/bin/env bash
# For every ID in sample2.protein.id (first token of each line), print the
# lines of sample2.protein containing that ID as a whole word. Records stay
# grouped in ID-list order. The output is rewritten on each run, so re-running
# never duplicates records.
while read -r geneid _ || [ -n "$geneid" ]; do
  geneid=${geneid%$'\r'}            # tolerate CRLF line endings
  [ -n "$geneid" ] || continue      # a blank ID would match every line
  # -F: treat the ID literally (a "." must not act as a regex wildcard).
  grep -w -F -- "$geneid" sample2.protein
done < sample2.protein.id > sample2.protein.id.protein
EOF
chmod a+x sample2.protein.id.sh
./sample2.protein.id.sh
把该文件夹下载至桌面
- 提取三个样本(012)共有的ID在每个样本中对应的转录本数目
cd /mnt/c/Users/TE/Desktop/全长转录组/sandai-final
# Protein information per ID: collapse adjacent duplicate values of column 2,
# turn "/" into "_", and keep colon-separated parts 2 and 3.
cut -f2 sample012.protein.id.sample0.protein | uniq | tr '/' '_' | cut -d: -f2,3 > sample012.protein
# Transcript count per ID in sample 0 (run length of identical column-2 values).
cut -f2 sample012.protein.id.sample0.protein | uniq -c | awk '{print $1}' > sample012.protein.sample0.number
# Transcript count per ID in sample 1.
cut -f2 sample012.protein.id.sample1.protein | uniq -c | awk '{print $1}' > sample012.protein.sample1.number
# Transcript count per ID in sample 2.
cut -f2 sample012.protein.id.sample2.protein | uniq -c | awk '{print $1}' > sample012.protein.sample2.number
- 提取01共有ID每个样本对应的转录本数目
cd /mnt/c/Users/TE/Desktop/全长转录组/sandai-final
# Protein information per ID: collapse adjacent duplicate values of column 2,
# turn "/" into "_", and keep colon-separated parts 2 and 3.
cut -f2 sample01.protein.id.sample0.protein | uniq | tr '/' '_' | cut -d: -f2,3 > sample01.protein
# Transcript count per ID in sample 0 (run length of identical column-2 values).
cut -f2 sample01.protein.id.sample0.protein | uniq -c | awk '{print $1}' > sample01.protein.sample0.number
# Transcript count per ID in sample 1.
cut -f2 sample01.protein.id.sample1.protein | uniq -c | awk '{print $1}' > sample01.protein.sample1.number
- 提取02共有ID每个样本对应的转录本数目
cd /mnt/c/Users/TE/Desktop/全长转录组/sandai-final
# Protein information per ID: collapse adjacent duplicate values of column 2,
# turn "/" into "_", and keep colon-separated parts 2 and 3.
cut -f2 sample02.protein.id.sample0.protein | uniq | tr '/' '_' | cut -d: -f2,3 > sample02.protein
# Transcript count per ID in sample 0 (run length of identical column-2 values).
cut -f2 sample02.protein.id.sample0.protein | uniq -c | awk '{print $1}' > sample02.protein.sample0.number
# Transcript count per ID in sample 2.
cut -f2 sample02.protein.id.sample2.protein | uniq -c | awk '{print $1}' > sample02.protein.sample2.number
- 提取12共有ID每个样本对应的转录本数目
cd /mnt/c/Users/TE/Desktop/全长转录组/sandai-final
# Protein information per ID: collapse adjacent duplicate values of column 2,
# turn "/" into "_", and keep colon-separated parts 2 and 3.
cut -f2 sample12.protein.id.sample1.protein | uniq | tr '/' '_' | cut -d: -f2,3 > sample12.protein
# Transcript count per ID in sample 1 (run length of identical column-2 values).
cut -f2 sample12.protein.id.sample1.protein | uniq -c | awk '{print $1}' > sample12.protein.sample1.number
# Transcript count per ID in sample 2.
cut -f2 sample12.protein.id.sample2.protein | uniq -c | awk '{print $1}' > sample12.protein.sample2.number
- 提取样本0、样本1、样本2各自特有的ID在对应样本中的转录本数目
cd /mnt/c/Users/TE/Desktop/全长转录组/sandai-final
# For each sample's unique IDs: protein information (colon-separated parts 2
# and 3 of column 2, with "/" turned into "_") and the transcript count per ID
# (run length of identical column-2 values). Outputs use the doubled names
# sample00/sample11/sample22.
# Sample 0
cut -f2 sample0.protein.id.protein | uniq | tr '/' '_' | cut -d: -f2,3 > sample00.protein
cut -f2 sample0.protein.id.protein | uniq -c | awk '{print $1}' > sample00.protein.number
# Sample 1
cut -f2 sample1.protein.id.protein | uniq | tr '/' '_' | cut -d: -f2,3 > sample11.protein
cut -f2 sample1.protein.id.protein | uniq -c | awk '{print $1}' > sample11.protein.number
# Sample 2
cut -f2 sample2.protein.id.protein | uniq | tr '/' '_' | cut -d: -f2,3 > sample22.protein
cut -f2 sample2.protein.id.protein | uniq -c | awk '{print $1}' > sample22.protein.number
网友评论