WGDI

作者: 多啦A梦的时光机_648d | 来源:发表于2023-08-18 17:08 被阅读0次

WGD

export PATH=/home/lx_sky6/software/miniconda3/envs/wgdi2/bin/:$PATH
cp /home/lx_sky6/yt/database/Vitis_vinifera_v2.1/Phytozome/Vvinifera_457_v2.1.gene.gff3 Vvin.gff3
cp /home/lx_sky6/yt/database/Vitis_vinifera_v2.1/Phytozome/Vvinifera_457_v2.1.cds.fa ./Vvin.cds.fasta
cp /home/lx_sky6/yt/database/Vitis_vinifera_v2.1/Phytozome/Vvinifera_457_v2.1.protein.fa ./Vvin.protein.fasta
cp  /home/lx_sky6/yt/database/Vitis_vinifera_v2.1/Phytozome/Vvinifera_457_Genoscope.12X.fa .

修改gff,以及protein和cds的名字

sed -i 's/.v2.1//g' Vvin.gff3

chr1    phytozomev13    gene    14302888        14303488        .       -       .       ID=VIT_201s0010g00010.v2.1;Name=VIT_201s0010g00010
修改为
chr1    phytozomev13    gene    14302888        14303488        .       -       .       ID=VIT_201s0010g00010;Name=VIT_201s0010g00010

获得长度文件

python /home/lx_sky6/software/miniconda3/envs/jcvi/generate_conf.py -p Vvin Vvinifera_457_Genoscope.12X.fa  Vvin.gff3
##生成Vvin.gff和Vvin.len文件 ,记得一定要删除Vvin.gff文件

chr1    23037639        1629
chr10   18140952        1037
chr11   19818926        1239
chr12   22702307        1542
chr13   24396255        1559
chr14   30274277        1948
chr15   20304914        1217
chr16   22053297        1356
chr17   17126926        1157
chr18   29360087        2116
chr19   24021853        1443
chr2    18779844        1169
chr3    19341862        1319
chr4    23867706        1580
chr5    25021643        1705
chr6    21508407        1484
chr7    21026613        1569
chr8    22385789        1688
chr9    23006712        1407
chrUn   43154196        2850
python /home/lx_sky6/yt/script/WGDI/test/01.getgff.py Vvin.gff3 old.gff
##生成old.gff文件

chr1    VIT_201s0010g00010.1    14302888        14303488        -
chr1    VIT_201s0010g00020.1    14354642        14356756        -
chr1    VIT_201s0010g00020.2    14355486        14356756        -
python /home/lx_sky6/yt/script/WGDI/test/02.gff_lens.py old.gff  Vvin.gff Vvin.gff
##获得重新命名的Vvin.gff,前一个Vvin.gff为加在里面的前缀

chr1    Vvin.gffchr1g00001      12837   26777   +       1       VIT_201s0011g00010.1
chr1    Vvin.gffchr1g00002      33171   35791   +       2       VIT_201s0011g00030.1
##给蛋白和cds重新命名
python /home/lx_sky6/yt/script/WGDI/test/03.seq_newname.py Vvin.gff Vvin.protein.fasta Vvin.rename.protein.fasta
python /home/lx_sky6/yt/script/WGDI/test/03.seq_newname.py Vvin.gff Vvin.cds.fasta Vvin.rename.cds.fasta

针对自己用evm注释的物种文件

EVM生成的gff和cds等都要改名字

cp ../9-evm/5-RUN1/EVM.all.gff ./Cbre.gff3
cp ../9-evm/5-RUN1/Cbre_protein.fasta ./Cbre.protein.fasta
cp ../9-evm/5-RUN1/Cbre_cds.fasta ./Cbre.cds.fasta

sed -i 's/evm.TU/evm.model/g' Cbre.gff3
##是gff3和cds,pep的序列id一致
gff3:scaffold001    EVM mRNA    151969  152899  .   -   .   ID=evm.model.scaffold001.1;Parent=evm.model.scaffold001.1;Name=EVM%20prediction%20scaffold001.1
cds:evm.model.scaffold001.1 gene=evm.TU.scaffold001.1


python /home/lx_sky6/software/miniconda3/envs/jcvi/generate_conf.py -p Cbre ../genome.Carex_breviculmis.fa Cbre.gff3
##一定删除生成的Cbre.gff文件

python /home/lx_sky6/yt/script/WGDI/test/01.getgff.py Cbre.gff3 old.gff
python /home/lx_sky6/yt/script/WGDI/test/02.gff_lens.py old.gff  Cbre.gff Cbre.gff

python /home/lx_sky6/yt/script/WGDI/test/03.seq_newname.py Cbre.gff Cbre.cds.fasta Cbre.rename.cds.fasta
python /home/lx_sky6/yt/script/WGDI/test/03.seq_newname.py Cbre.gff Cbre.protein.fasta Cbre.rename.protein.fasta

其他物种以此处理

此刻我们就获得了需要分析的物种的改了名的蛋白和cds文件

比对

diamond makedb --in Cbre/Cbre.rename.protein.fasta --db Cbre
diamond blastp --db Cbre -q Cbre/Cbre.rename.protein.fasta --outfmt 6 --out Cbre.blastp.txt --threads 30

diamond makedb --in Vvin/Vvin.rename.protein.fasta --db Vvin
diamond blastp --db Vvin -q Vvin/Vvin.rename.protein.fasta --outfmt 6 --out Vvin.blastp.txt --threads 30

diamond makedb --in Acom.rename.protein.fasta --db Acom
diamond blastp --db Acom -q Acom.rename.protein.fasta --outfmt 6 --out Acom.blastp.txt --threads 30

生成配置文件并修改conf内容

wgdi -d help >> Cbre.conf

点图

wgdi -d Cbre.conf
##生成Cbre.step1.pdf
[dotplot]
blast = Cbre.blastp.txt
gff1 = Cbre.gff
gff2 =  Cbre.gff
lens1 = Cbre.len
lens2 = Cbre.len
genome1_name =  Carex_breviculmis
genome2_name =  Carex_breviculmis
multiple  = 2
score = 100
evalue = 1e-5
repeat_number = 20
position = order
blast_reverse = false
ancestor_left = none
ancestor_top = none
markersize = 0.5
figsize = 10,10
savefig = Cbre.step1.pdf

运行共线性

wgdi -icl Cbre.conf
##生成Cbre.icl.collinearity.txt
[collinearity]
gff1 = Cbre.gff
gff2 = Cbre.gff
lens1 = Cbre.len
lens2 = Cbre.len
blast = Cbre.blastp.txt
blast_reverse = false
multiple  = 1
process = 8
evalue = 1e-5
score = 100
grading = 50,40,25
mg = 25,25
pvalue = 0.2
repeat_number = 10
positon = order
savefile = Cbre.icl.collinearity.txt

计算ks值

wgdi -ks Cbre.conf
##生成Cbre.ks
[ks]
cds_file = Cbre.rename.cds.fasta
pep_file = Cbre.rename.protein.fasta
align_software = muscle
pairs_file = Cbre.icl.collinearity.txt
ks_file = Cbre.ks

结果整合 (依赖Cbre.icl.collinearity.txt和Cbre.ks)

wgdi -bi Cbre.conf
##生成Cbre.block.csv
[blockinfo]
blast = Cbre.blastp.txt
gff1 = Cbre.gff
gff2 = Cbre.gff
lens1 = Cbre.len
lens2 = Cbre.len
collinearity = Cbre.icl.collinearity.txt
score = 100
evalue = 1e-5
repeat_number = 20
position = order
ks = Cbre.ks
ks_col = ks_YN00
savefile = Cbre.block.csv

提取

wgdi -c Cbre.conf
##生成Cbre.block.new.csv
[correspondence]
blockinfo = Cbre.block.csv
lens1 = Cbre.len
lens2 = Cbre.len
tandem = false
tandem_length = 200
pvalue = 0.2
lock_length = 5
multiple  = 2
homo = -1,1
savefile = Cbre.block.new.csv

绘制dotplot 展示共线性块上的 Ks

wgdi -bk Cbre.conf
##生成Cbre.blocks.new.png
[blockks]
lens1 = Cbre.len
lens2 = Cbre.len
genome1_name = Carex_breviculmis
genome2_name = Carex_breviculmis
blockinfo = Cbre.block.new.csv
pvalue = 1
tandem = true
tandem_length = 200
markersize = 1
area = -1,3
block_length = 5
figsize = 8,8
savefig = Cbre.blocks.new.png

ks分布图

wgdi -kp Cbre.conf
##生成Cbre.kp.medain.ks
[kspeaks]
blockinfo = Cbre.block.new.csv
pvalue = 0.2
tandem = false
block_length = 5
ks_area = 0,10
multiple  = 2
homo = 0.3,1
fontsize = 9
area = 0,3
figsize = 10,6.18
savefig = Cbre.kp.png
savefile = Cbre.kp.medain.ks

拟合参数将会用于后续的 ksfigure 模块。

wgdi -pf pf.conf
##生成Cbre.ks_peak_peaksfit.png以及拟合参数
R-square: 0.09702451916365609
The gaussian fitting curve parameters are :
3.534434892929756  |  0.33939949288718707  |  0.4269611969337495

[peaksfit]
blockinfo = Cbre.block.new.csv
mode = median
bins_number = 200
ks_area = 0,10
fontsize = 9
area = 0,3
figsize = 10,6.18
shadow = true
savefig = Cbre.ks_peak_peaksfit.png

单个物种的到此结束

下面是多个物种

Cbre.block.new.csv ##拟合参数 3.534434892929756  |  0.33939949288718707  |  0.4269611969337495
vvin.block.new.csv ##拟合参数 1.718535848451611  |  1.4664829594546644  |  0.2716067658348252
Acom.block.new.csv ##拟合参数 0.8408592432366855  |  1.630457537878788  |  0.544759663045775

我们新建一个 all_ks.csv文件, 该文件的第一行为标题行,第二行以后为数据行。一共有4+3n列,其中第一列是样本信息,第2-3列对应线条的属性,后面都是拟合参数

注意,在终端里面编辑时,需要记住linestyle后面的逗号数量是 3n个, 其中n是最大peak数,例如拟南芥有2个peak,那么就得有6个逗号。

假如,我们之前用wgdi拟合过其他物种的peak,也得到了一些拟合参数,那么添加到这个文件中,例如

保存到all_ks.csv

,color,linewidth,linestyle,,,,,,
Cbre_Cbre,red,1,--,3.534434892929756,0.33939949288718707,0.4269611969337495
Vvin_Vvin,orange,1,-,1.718535848451611,1.4664829594546644,0.2716067658348252
Acom_Acom,1,-,0.8408592432366855,1.630457537878788,0.544759663045775

拟合多物种ks图

wgdi -kf Cbre.conf
##生成ks分布图(正态)
[ksfigure]
ksfit = all_ks.csv
labelfontsize = 15
legendfontsize = 15
xlabel = none
ylabel = none
title = none
area = 0,4
figsize = 10,6.18
savefig =  all_ks.png

到此结束

image.png

相关文章

网友评论

      本文标题:WGDI

      本文链接:https://www.haomeiwen.com/subject/qijdmdtx.html