WGD
export PATH=/home/lx_sky6/software/miniconda3/envs/wgdi2/bin/:$PATH
cp /home/lx_sky6/yt/database/Vitis_vinifera_v2.1/Phytozome/Vvinifera_457_v2.1.gene.gff3 Vvin.gff3
cp /home/lx_sky6/yt/database/Vitis_vinifera_v2.1/Phytozome/Vvinifera_457_v2.1.cds.fa ./Vvin.cds.fasta
cp /home/lx_sky6/yt/database/Vitis_vinifera_v2.1/Phytozome/Vvinifera_457_v2.1.protein.fa ./Vvin.protein.fasta
cp /home/lx_sky6/yt/database/Vitis_vinifera_v2.1/Phytozome/Vvinifera_457_Genoscope.12X.fa .
修改gff,以及protein和cds的名字
sed -i 's/.v2.1//g' Vvin.gff3
chr1 phytozomev13 gene 14302888 14303488 . - . ID=VIT_201s0010g00010.v2.1;Name=VIT_201s0010g00010
修改为
chr1 phytozomev13 gene 14302888 14303488 . - . ID=VIT_201s0010g00010;Name=VIT_201s0010g00010
获得长度文件
python /home/lx_sky6/software/miniconda3/envs/jcvi/generate_conf.py -p Vvin Vvinifera_457_Genoscope.12X.fa Vvin.gff3
##生成Vvin.gff和Vvin.len文件 ,记得一定要删除Vvin.gff文件
chr1 23037639 1629
chr10 18140952 1037
chr11 19818926 1239
chr12 22702307 1542
chr13 24396255 1559
chr14 30274277 1948
chr15 20304914 1217
chr16 22053297 1356
chr17 17126926 1157
chr18 29360087 2116
chr19 24021853 1443
chr2 18779844 1169
chr3 19341862 1319
chr4 23867706 1580
chr5 25021643 1705
chr6 21508407 1484
chr7 21026613 1569
chr8 22385789 1688
chr9 23006712 1407
chrUn 43154196 2850
python /home/lx_sky6/yt/script/WGDI/test/01.getgff.py Vvin.gff3 old.gff
##生成old.gff文件
chr1 VIT_201s0010g00010.1 14302888 14303488 -
chr1 VIT_201s0010g00020.1 14354642 14356756 -
chr1 VIT_201s0010g00020.2 14355486 14356756 -
python /home/lx_sky6/yt/script/WGDI/test/02.gff_lens.py old.gff Vvin.gff Vvin.gff
##获得重新命名的Vvin.gff,前一个Vvin.gff为加在里面的前缀
chr1 Vvin.gffchr1g00001 12837 26777 + 1 VIT_201s0011g00010.1
chr1 Vvin.gffchr1g00002 33171 35791 + 2 VIT_201s0011g00030.1
##给蛋白和cds重新命名
python /home/lx_sky6/yt/script/WGDI/test/03.seq_newname.py Vvin.gff Vvin.protein.fasta Vvin.rename.protein.fasta
python /home/lx_sky6/yt/script/WGDI/test/03.seq_newname.py Vvin.gff Vvin.cds.fasta Vvin.rename.cds.fasta
针对自己用evm注释的物种文件
EVM生成的gff和cds等都要改名字
cp ../9-evm/5-RUN1/EVM.all.gff ./Cbre.gff3
cp ../9-evm/5-RUN1/Cbre_protein.fasta ./Cbre.protein.fasta
cp ../9-evm/5-RUN1/Cbre_cds.fasta ./Cbre.cds.fasta
sed -i 's/evm.TU/evm.model/g' Cbre.gff3
##是gff3和cds,pep的序列id一致
gff3:scaffold001 EVM mRNA 151969 152899 . - . ID=evm.model.scaffold001.1;Parent=evm.model.scaffold001.1;Name=EVM%20prediction%20scaffold001.1
cds:evm.model.scaffold001.1 gene=evm.TU.scaffold001.1
python /home/lx_sky6/software/miniconda3/envs/jcvi/generate_conf.py -p Cbre ../genome.Carex_breviculmis.fa Cbre.gff3
##一定删除生成的Cbre.gff文件
python /home/lx_sky6/yt/script/WGDI/test/01.getgff.py Cbre.gff3 old.gff
python /home/lx_sky6/yt/script/WGDI/test/02.gff_lens.py old.gff Cbre.gff Cbre.gff
python /home/lx_sky6/yt/script/WGDI/test/03.seq_newname.py Cbre.gff Cbre.cds.fasta Cbre.rename.cds.fasta
python /home/lx_sky6/yt/script/WGDI/test/03.seq_newname.py Cbre.gff Cbre.protein.fasta Cbre.rename.protein.fasta
其他物种以此处理
此刻我们就获得了需要分析的物种的改了名的蛋白和cds文件
比对
diamond makedb --in Cbre/Cbre.rename.protein.fasta --db Cbre
diamond blastp --db Cbre -q Cbre/Cbre.rename.protein.fasta --outfmt 6 --out Cbre.blastp.txt --threads 30
diamond makedb --in Vvin/Vvin.rename.protein.fasta --db Vvin
diamond blastp --db Vvin -q Vvin/Vvin.rename.protein.fasta --outfmt 6 --out Vvin.blastp.txt --threads 30
diamond makedb --in Acom.rename.protein.fasta --db Acom
diamond blastp --db Acom -q Acom.rename.protein.fasta --outfmt 6 --out Acom.blastp.txt --threads 30
生成配置文件并修改conf内容
wgdi -d help >> Cbre.conf
点图
wgdi -d Cbre.conf
##生成Cbre.step1.pdf
[dotplot]
blast = Cbre.blastp.txt
gff1 = Cbre.gff
gff2 = Cbre.gff
lens1 = Cbre.len
lens2 = Cbre.len
genome1_name = Carex_breviculmis
genome2_name = Carex_breviculmis
multiple = 2
score = 100
evalue = 1e-5
repeat_number = 20
position = order
blast_reverse = false
ancestor_left = none
ancestor_top = none
markersize = 0.5
figsize = 10,10
savefig = Cbre.step1.pdf
运行共线性
wgdi -icl Cbre.conf
##生成Cbre.icl.collinearity.txt
[collinearity]
gff1 = Cbre.gff
gff2 = Cbre.gff
lens1 = Cbre.len
lens2 = Cbre.len
blast = Cbre.blastp.txt
blast_reverse = false
multiple = 1
process = 8
evalue = 1e-5
score = 100
grading = 50,40,25
mg = 25,25
pvalue = 0.2
repeat_number = 10
positon = order
savefile = Cbre.icl.collinearity.txt
计算ks值
wgdi -ks Cbre.conf
##生成Cbre.ks
[ks]
cds_file = Cbre.rename.cds.fasta
pep_file = Cbre.rename.protein.fasta
align_software = muscle
pairs_file = Cbre.icl.collinearity.txt
ks_file = Cbre.ks
结果整合 (依赖Cbre.icl.collinearity.txt和Cbre.ks)
wgdi -bi Cbre.conf
##生成Cbre.block.csv
[blockinfo]
blast = Cbre.blastp.txt
gff1 = Cbre.gff
gff2 = Cbre.gff
lens1 = Cbre.len
lens2 = Cbre.len
collinearity = Cbre.icl.collinearity.txt
score = 100
evalue = 1e-5
repeat_number = 20
position = order
ks = Cbre.ks
ks_col = ks_YN00
savefile = Cbre.block.csv
提取
wgdi -c Cbre.conf
##生成Cbre.block.new.csv
[correspondence]
blockinfo = Cbre.block.csv
lens1 = Cbre.len
lens2 = Cbre.len
tandem = false
tandem_length = 200
pvalue = 0.2
lock_length = 5
multiple = 2
homo = -1,1
savefile = Cbre.block.new.csv
绘制dotplot 展示共线性块上的 Ks
wgdi -bk Cbre.conf
##生成Cbre.blocks.new.png
[blockks]
lens1 = Cbre.len
lens2 = Cbre.len
genome1_name = Carex_breviculmis
genome2_name = Carex_breviculmis
blockinfo = Cbre.block.new.csv
pvalue = 1
tandem = true
tandem_length = 200
markersize = 1
area = -1,3
block_length = 5
figsize = 8,8
savefig = Cbre.blocks.new.png
ks分布图
wgdi -kp Cbre.conf
##生成Cbre.kp.medain.ks
[kspeaks]
blockinfo = Cbre.block.new.csv
pvalue = 0.2
tandem = false
block_length = 5
ks_area = 0,10
multiple = 2
homo = 0.3,1
fontsize = 9
area = 0,3
figsize = 10,6.18
savefig = Cbre.kp.png
savefile = Cbre.kp.medain.ks
拟合参数将会用于后续的 ksfigure 模块。
wgdi -pf pf.conf
##生成Cbre.ks_peak_peaksfit.png以及拟合参数
R-square: 0.09702451916365609
The gaussian fitting curve parameters are :
3.534434892929756 | 0.33939949288718707 | 0.4269611969337495
[peaksfit]
blockinfo = Cbre.block.new.csv
mode = median
bins_number = 200
ks_area = 0,10
fontsize = 9
area = 0,3
figsize = 10,6.18
shadow = true
savefig = Cbre.ks_peak_peaksfit.png
单个物种的到此结束
下面是多个物种
Cbre.block.new.csv ##拟合参数 3.534434892929756 | 0.33939949288718707 | 0.4269611969337495
vvin.block.new.csv ##拟合参数 1.718535848451611 | 1.4664829594546644 | 0.2716067658348252
Acom.block.new.csv ##拟合参数 0.8408592432366855 | 1.630457537878788 | 0.544759663045775
我们新建一个 all_ks.csv文件, 该文件的第一行为标题行,第二行以后为数据行。一共有4+3n列,其中第一列是样本信息,第2-3列对应线条的属性,后面都是拟合参数
注意,在终端里面编辑时,需要记住linestyle后面的逗号数量是 3n个, 其中n是最大peak数,例如拟南芥有2个peak,那么就得有6个逗号。
假如,我们之前用wgdi拟合过其他物种的peak,也得到了一些拟合参数,那么添加到这个文件中,例如
保存到all_ks.csv
,color,linewidth,linestyle,,,,,,
Cbre_Cbre,red,1,--,3.534434892929756,0.33939949288718707,0.4269611969337495
Vvin_Vvin,orange,1,-,1.718535848451611,1.4664829594546644,0.2716067658348252
Acom_Acom,1,-,0.8408592432366855,1.630457537878788,0.544759663045775
拟合多物种ks图
wgdi -kf Cbre.conf
##生成ks分布图(正态)
[ksfigure]
ksfit = all_ks.csv
labelfontsize = 15
legendfontsize = 15
xlabel = none
ylabel = none
title = none
area = 0,4
figsize = 10,6.18
savefig = all_ks.png
到此结束
![](https://img.haomeiwen.com/i14744215/611b65d53b9135f4.png)
网友评论