0.运行Orthofinder
略
1.建树
进入到Single_Copy_Orthologue_Sequences文件夹中
ls *fa|while read file;do mafft --auto $file > $file.aln;done
ls *aln|while read file;do seqkit sort $file|seqkit seq -w 0 > $file.format;done
rm *.aln
find *format | xargs paste -d " " > ../all.single.copy.aln.fa && rm *format
这里就不提取4D位点了, 有的文章会提取有的文章则不提取
all.single.copy.aln.fa便是用来建树的文件
trimal -in all.single.copy.aln.fa -out all.single.copy.aln.trimal.fa
iqtree -s all.single.copy.aln.trimal.fa --runs 100
这个方法只能说命令行简单, 但是运行时间堪忧
因为我是slurm集群
for i in {00001..15175}
do
echo "mafft --auto OG00${i}.fa | seqkit sort - | seqkit seq -w 0 > OG00${i}.format" >> 1.sh
done
可以这样写,分1000行提交,运行速度更快
find *format | xargs paste -d " " > ../all.single.copy.aln.fa && rm *format
这里就不提取4D位点了, 有的文章会提取有的文章则不提取
all.single.copy.aln.fa便是用来建树的文件
trimal -in all.single.copy.aln.fa -out all.single.copy.aln.trimal.fa
可以用的核心是32,所以-T设置为32
iqtree -s all.single.copy.aln.trimal.fa --runs 100 -T 32
https://github.com/yongzhiyang2012/Two_iron_wood_genome_analysis/tree/master/evolution/02.phylogenetic
在网上找到了相关的脚本也是构建物种树的, 包括后续的估计物种分歧时间和基因家族的收缩与扩张(https://www.jianshu.com/p/deb27036d3c5)都是基于这个网站的脚本做的
如果你使用了这些脚本, 可以引用下面这篇文章
Yang Y, Ma T, Wang Z, Lu Z, Li Y, Fu C, Chen X, Zhao M, Olson MS, Liu J: Genomic effects of population collapse in a critically endangered ironwood tree Ostrya rehderiana. Nature Communications 9, 5449 (2018).
下面介绍上述网站提及脚本的用法:
- 运行Orthofinder
略
1.输入文件的准备
上一步运行orthofinder, 在输出的文件夹Orthogroups中的 Orthogroups.txt 与 Orthogroups_SingleCopyOrthologues.txt是我们要用到的.
Orthogroups.txt中, 在基因名前面加上物种名, 比如原来的基因名是 evm.model.Chr1.1, 需要改成sppA|evm.model.Chr1.1, 即 "物种名" + “|”
此外, 在运行这些脚本的目录下创建一个文件夹data, 里面需要将orthofinder用到的物种的cds/pep文件做相应处理, 在基因名前加上物种名, 最后将cds/pep文件命名为 sppA.cds.clean.fa/sppA.pep.clean.fa
Orthogroups.txt修改后
sppA.pep.clean.fa
data文件夹里面的内容
python3 1.AllClusters.txt.single.copygene.txt.py Orthogroups_SingleCopyOrthologues.txt Orthogroups.txt > scg.txt
perl 1.splitSeq.pl data scg.txt
perl 2.align.pl > 2.align.pl.sh
sh 2.align.pl.sh
perl 3.connect.pl
perl 4.4Dsites.pl 3.connect.pl.connect.cds.fa
perl fasta2phylip.pl 4.4Dsites.pl.connect4Dsites.fa > A-c.phy
iqtree -s A-c.phy --runs 100
#输出的A-c.phy是后续做基因家族收缩与扩张要用到的文件
#A-c.phy.treefile即是物种树
3.该网站提供的脚本内容如下:
https://github.com/yongzhiyang2012/Two_iron_wood_genome_analysis/tree/master/evolution/02.phylogenetic
如果你使用了这些脚本, 可以引用下面这篇文章
Yang Y, Ma T, Wang Z, Lu Z, Li Y, Fu C, Chen X, Zhao M, Olson MS, Liu J: Genomic effects of population collapse in a critically endangered ironwood tree Ostrya rehderiana. Nature Communications 9, 5449 (2018).
vim 1.splitSeq.pl data scg.txt
use strict;
use warnings;
use Bio::SeqIO;
## created by Yongzhi Yang. 2017/3/20 ##
my ($dir,$singlecoypegenelist)=@ARGV;
die "Usage:\nperl $0 dir single_coype_gene_list
dir: containing cds and pep seq with the name of XXX.cds.clean.fa and XXX.pep.clean.fa. XXX represent the species ID.
single_coype_gene_list: clusterid\tGene_num\tSpecies_num\tOrthologous_genes_split_comma
" if (! $singlecoypegenelist);
my %list;
my %seq;
my @seq=<$dir/*clean.fa>;
for my $seqfile (@seq){
$seqfile=~/\/(\w+)\.(\w+)\.clean.fa$/;
my $type=$2;
my $name=$1;
my $fa=Bio::SeqIO->new(-file=>"$seqfile",-format=>'fasta');
while (my $seq=$fa->next_seq) {
my $id=$seq->id;
my $seq=$seq->seq;
my $key;
if ($id=~/^$name\|/){
$key=$id;
}else{
$key="$name|$id";
}
$seq{$type}{$key}=$seq;
}
}
open (F,"$singlecoypegenelist")||die"$!";
while (<F>) {
chomp;
next if /^#/;
my @a=split(/\t/,$_);
my @b=split(/,/,$a[3]);
for (my $i=0;$i<@b;$i++){
$list{$a[0]}{$b[$i]}++;
}
}
close F;
`mkdir align` if (! -e "align");
for my $k (sort keys %list){
my $dir="align/$k";
`mkdir $dir` if (! -e "$dir");
for my $type ("cds","pep"){
open (O,">$dir/$type");
for my $k2 (sort keys %{$list{$k}}){
die "$k2\n" if ! exists $seq{$type}{$k2};
$k2=~/^(\S+)\|/ or die "$k2\n";
my $outid=$1;
print O ">$outid\n$seq{$type}{$k2}\n";
}
close O;
}
}
vim 2.align.pl
use strict;
use warnings;
## created by Yongzhi Yang. 2017/3/20 ##
my $muscle="~/muscle";
#可以更换为其他序列比对软件
my $pal2nal="~/pal2nal.pl";
my @in=<align/*/pep>;
for my $in (@in){
my $in2=$in;
$in2=~s/pep/cds/;
print "$muscle -in $in -out $in.best.fas ; $pal2nal $in.best.fas $in2 -output fasta > $in2.best.fas\n";
}
#如果更换了序列比对软件, 更改上述命令行即可
2.align.pl中提及的pal2nal.pl如下获取:
wget http://www.bork.embl.de/pal2nal/distribution/pal2nal.v14.tar.gz
tar -zxvf pal2nal.v14.tar.gz
vim 3.connect.pl
use strict;
use warnings;
use Bio::SeqIO;
## created by Yongzhi Yang. 2017/3/20 ##
my @in=<align/*/cds.best.fas>;
my %seq;
my %name;
my $alllen=0;
my $number;
for my $in (sort @in){
my $fa=Bio::SeqIO->new(-file=>"$in",-format=>"fasta");
my $len;
while (my $seq=$fa->next_seq) {
my $id=$seq->id;
my $seq=$seq->seq;
$len=length($seq);
###check
if ($len % 3 != 0){
die "wrong: $in\t$id\n";
}
my @seq=split(//,$seq);
for (my $i=0;$i<@seq;$i += 3){
my $word=$seq[$i].$seq[$i+1].$seq[$i+2];
die "wrong: $in\t$id\n" if ($word=~/-/ && $word=~/\w/);
}
###check done
my @id=split(/\|/,$id);
$seq{$id[0]} .= $seq;
$name{$id[0]} .= $id;
}
$alllen += $len;
$number++;
}
open (O,">$0.list");
print O "Length:\t$alllen\tNumber:\t$number\n";
for my $k1 (sort keys %name){
my $v=$name{$k1};
my @v=split(/$k1\|/,$v);
shift @v;
print O "$k1\t",scalar(@v),"\t",join(",",@v),"\n";
}
close O;
open (O,">$0.connect.cds.fa");
for my $k1 (sort keys %seq){
print O ">$k1\n$seq{$k1}\n";
}
close O;
vim 4.4Dsites.pl
use strict;
use warnings;
use Bio::SeqIO;
## created by Yongzhi Yang. 2017/3/20 ##
my %fourDegenerateSite=(
'TC'=>'Ser',
'CT'=>'Leu',
'CC'=>'Pho',
'CG'=>'Arg',
'AC'=>'Thr',
'GT'=>'Val',
'GC'=>'Phe',
'GG'=>'Gln',
);
my %seq;
my %count;
#my $in=shift or die "perl $0 \$inFasta output\n";
#my $out=shift or die "perl $0 \$inFasta output\n";
my $in="3.connect.pl.connect.cds.fa";
my $out="$0.connect4Dsites.fa";
my $fa=Bio::SeqIO->new(-file=>"$in",-format=>"fasta");
while (my $seq=$fa->next_seq) {
my $id=$seq->id;
my $seq=$seq->seq;
my @seq=split(//,$seq);
for (my $i=0;$i<@seq;$i=$i+3){
my $key=$seq[$i].$seq[$i+1];
my $value=$seq[$i+2];
if (exists $fourDegenerateSite{$key}){
$seq{$id}{$i}=$value;
print "$id\t$i\t$value\t",$seq[$i-2],$seq[$i-1],$seq[$i],$seq[$i+1],$seq[$i+2],"\n" if ($value eq '-');
$count{$i}++;
}
}
}
open (O,">$out");
open (O1,">$out.list");
my @k1=sort keys %seq;
for my $k1 (@k1){
print O ">$k1\n";
print O1 "$k1\n";
for my $k2 (sort{$a<=>$b} keys %{$seq{$k1}}){
my $count=$count{$k2};
if ($count==scalar(@k1)){
print O "$seq{$k1}{$k2}";
print O1 $k2+2," ";
#print "$k1\t$k2\t$seq{$k1}{$k2}\n" if $seq{$k1}{$k2} eq '-';
}
}
print O "\n";
print O1 "\n";
}
close O;
close O1;
cat fasta2phylip.pl
#!/usr/bin/perl
# Converts an aligned fasta (aa or dna) seq file to phylip format
# Copyright 2013, Naoki Takebayashi <ntakebayashi@alaska.edu>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Version: 20130612
my $usage = "Usage: $0 [-h] [-v] [-c numChar] [infile]\n" .
" -h: help\n" .
" -c: long seq names are shortened to 10 char, default: 7 chars from the\n".
" beggining is combined with the last 3 chars. You can change the\n".
" behavior by this option. E.g., -c 10 will shorten the long name\n" .
" by taking the first 10 characters of the name.\n".
" -v: verbose (print name conversion in STDERR)\n" .
" infile should be an aligned fasta, " .
"STDIN is used if no infile is given\n";
use IO::File;
use Getopt::Std;
getopts('hvc:') || die "$usage\n";
die "$usage\n" if (defined ($opt_h));
my $totNumChar = 10; # number of characters allowed for name in phylip
my $numFrontChar = 7; # When the name is too long, this amount of characters
# are used from the beginning of the name, and the rest
# are from the end of the name.
if (defined ($opt_c)) {
if ($opt_c <= $totNumChar && $opt_c >= 0) {
$numFrontChar = $opt_c;
} else {
die "Error: give an integer between 0 and $totNumChar (ends inclusive)".
" for -c.\n";
}
}
my $tmpFH = IO::File->new_tmpfile || die "Unable to make new temp file: $!";
my $firstLine = 1;
my $maxLen = 0;
my $numTaxa = 0;
my $name;
while(<>){
chomp;
s/^\s+//; s/\s$//;
next if (/^$/);
if (s/^>\s*//) {
if ($firstLine == 0) {
if ($seqLen != $maxLen && $maxLen != 0) {
warn "WARN: The $numTaxa-th species ($name) have ",
"different seq length\n";
warn "Previous Seq Len: $maxLen, This Seq Len: $seqLen\n";
}
print $tmpFH "\n"; # end of the previous sequence
} else {
$firstLine = 0;
}
$maxLen = $seqLen if ($seqLen > $maxLen); $seqLen = 0;
$numTaxa ++;
$name = $_;
if (CharLen($_) <=10) {
printf $tmpFH "%-10s", $_;
} else {
$shortName = TrimName($_);
print STDERR "$_ => $shortName\n" if (defined ($opt_v));
printf $tmpFH "%10s", $shortName;
}
} else {
$seqLen += CharLen ($_);
print $tmpFH $_;
}
}
print $tmpFH "\n";
### print out to the STDOUT
print "$numTaxa $maxLen\n";
seek ($tmpFH, 0, 0) || die "seek: $!";
my $line;
while (defined ($line = $tmpFH->getline())) {
chomp ($line);
print "$line";
$missingBases = $maxLen - (CharLen($line) - $totNumChar);
while ($missingBases > 0) {
print "-";
$missingBases--;
}
print "\n";
}
sub CharLen { # returns number of characters in a string
my $string = shift;
return scalar(split (//, $string));
}
sub TrimName { # trim a long name
my $name = shift;
my @charArray = split (//, $name);
if ($totNumChar == $numFrontChar) {
return join ('', splice (@charArray, 0, $numFrontChar));
} else {
return join ('', splice (@charArray, 0, $numFrontChar),
splice (@charArray, - ($totNumChar - $numFrontChar)));
}
}
网友评论