STRING 收录的全部物种列表可在此下载:https://cn.string-db.org/cgi/download?sessionId=bUHT3Irc05kI ,在页面中下载 [species.v*.txt] 文件(* 为 STRING 版本号)。
![](https://img.haomeiwen.com/i6552864/9ec852d13d7f88da.png)
NCBI taxonomy(taxid)数据下载地址:https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump_archive/
![](https://img.haomeiwen.com/i6552864/085361b2e0001f51.png)
下载最新的压缩包,并从中解压出 taxidlineage.dmp 文件。
将 species.v*.txt 和 taxidlineage.dmp 两个文件解压后,记下它们的绝对路径(分别对应脚本中的 path1 和 path2)。
下面是shell 脚本:
# Positional arguments:
#   $1  taxid of the species whose close relatives should be searched
#   $2  STRING version; defaults to 11.5. To find the right value, look up any
#       species on STRING and read the number in a download name such as
#       9606.protein.links.v11.5.txt.gz
#       (https://stringdb-static.org/download/protein.links.v11.5/9606.protein.links.v11.5.txt.gz).
# At the prompt 'whether download PPI related files ?(y/n/s)':
#   y  download the PPI sequences, aliases and links.detailed files
#   n  do not download
#   s  skip this taxid and search the next id

# Abort on any failing command and on any use of an unset variable.
set -eu

# NCBI lineage table (taxidlineage.dmp).
path2='/cluster/home/fygong/data/protein/taxidlineage.dmp'
# STRING species list (species.v*.txt).
path1='/cluster/home/fygong/data/protein/species.v11.0.txt'
#######################################
# Download and decompress the STRING protein sequences, aliases and
# detailed-links files for one taxon into the current directory.
# Arguments:
#   $1 - taxid used as the file-name prefix on the STRING download server
#   $2 - STRING version string, e.g. 11.5
# Outputs:
#   Progress and diagnostic messages on stdout; on success the three
#   decompressed files in the current directory.
# Returns:
#   0 in every handled case (missing remote file, failed download and failed
#   decompression are only reported through the printed messages).
#######################################
dppi() {
  local taxid=$1 version=$2
  local site='https://stringdb-static.org/download'
  local seq_gz="${taxid}.protein.sequences.v${version}.fa.gz"
  local alias_gz="${taxid}.protein.aliases.v${version}.txt.gz"
  local links_gz="${taxid}.protein.links.detailed.v${version}.txt.gz"
  local seq_url="${site}/protein.sequences.v${version}/${seq_gz}"
  local alias_url="${site}/protein.aliases.v${version}/${alias_gz}"
  local links_url="${site}/protein.links.detailed.v${version}/${links_gz}"
  local pid all_ok=1
  local -a pids=()

  echo 'Check URL:'"${seq_url}"
  echo 'Start downloding......'

  # Probe for the remote file through wget's exit status. Running the probe as
  # an `if` condition keeps a caller's `set -e` from aborting the script on a
  # missing file, and it does not depend on the locale of wget's messages.
  # NOTE(review): --no-check-certificate turns off TLS verification; it is kept
  # from the original, but consider dropping it.
  if wget --no-check-certificate --spider "${seq_url}" > /dev/null 2>&1; then
    # Fetch the three files in parallel and remember each job so that its
    # exit status can be checked individually.
    wget --no-check-certificate "${seq_url}" > /dev/null 2>&1 &
    pids+=("$!")
    wget --no-check-certificate "${alias_url}" > /dev/null 2>&1 &
    pids+=("$!")
    wget --no-check-certificate "${links_url}" > /dev/null 2>&1 &
    pids+=("$!")
    for pid in "${pids[@]}"; do
      wait "${pid}" || all_ok=0
    done

    if [[ ${all_ok} -eq 1 && -e ${links_gz} && -e ${alias_gz} && -e ${seq_gz} ]]; then
      echo 'Start unzip ......'
      # Decompress only the files fetched above; unrelated archives that
      # happen to sit in the working directory are left alone.
      if gunzip -- "${seq_gz}" "${alias_gz}" "${links_gz}"; then
        echo 'Download & unzip successed !'
      else
        echo 'unzip failed,please check !'
      fi
    else
      echo 'remote file exist,but not successed download,please check !'
    fi
  else
    echo 'string link file not exist,plese check !(visit:https://cn.string-db.org/cgi/download?sessionId=bUHT3Irc05kI & search taxid & check version)'
  fi
}
# ---------------------------------------------------------------------------
# Main flow.
#   1. Look the requested taxid ($1) up in the species list ($path1) and, if
#      it is there, offer to download its files.
#   2. Otherwise (or when the user answers "s") walk the taxid's lineage from
#      $path2 upwards and offer every related taxid that is in the species
#      list, closest level first.
# Exit status: 0 after a download was started through dppi or when the search
# ends without a pick, 1 when the user answers "n" during the search, 2 on a
# wrong argument count.
# ---------------------------------------------------------------------------
case $# in
  1) ver='11.5' ;;
  2) ver=$2 ;;
  *)
    echo "usage: ${0##*/} <taxid> [string_version]" >&2
    exit 2
    ;;
esac

# Scratch files live in a private directory that is removed on every exit
# path, so an interrupted run cannot leave stale state behind for the next.
workdir=$(mktemp -d)
trap 'rm -rf -- "$workdir"' EXIT
link_id="${workdir}/link_id"
current_set="${workdir}/current"
previous_set="${workdir}/previous"
fresh_set="${workdir}/fresh"

re1=$(awk -v spe="$1" 'BEGIN{FS="\t"}{if($1 == spe){print "物种存在,taixid="$1}}' "$path1")
if [[ $re1 != '' ]]; then
  echo "$re1"
  read -r -p "whether download PPI related files ?(y/n/s)" args1
  if [[ $args1 == *y* ]]; then
    dppi "$1" "$ver"
  elif [[ $args1 == *s* ]]; then
    # "s" means: ignore the exact hit and fall through to the relatives search.
    re1=''
  fi
fi

if [[ $re1 == '' ]]; then
  echo '搜索近缘物种.....'
  # One id per line: the ids from field 3 of the taxid's record, then the
  # taxid itself. Presumably field 3 is ordered root -> parent, so reading
  # the file bottom-up goes from the closest level outwards -- confirm
  # against the NCBI taxdump readme.
  awk -v spe="$1" 'BEGIN{FS="\t"}{if($1 == spe){n=split($3,a," ");for(i=1;i<=n;i++){print a[i]}print spe}}' "$path2" > "$link_id"
  all_num=$(awk 'END{print NR}' "$link_id")

  for ((row = all_num; row >= 1; row--)); do
    echo 'line='"$row"
    tarid=$(awk -v num="$row" 'NR==num{print $1}' "$link_id")
    echo 'tarid='"$tarid"
    sleep 1

    # Every record that mentions this id as a whole word (as its own taxid or
    # inside its lineage).
    grep -wF -- "$tarid" "$path2" | awk 'BEGIN{FS="\t"}{print $1}' > "$current_set"

    # Keep only ids that were not already offered at a closer level, and tag
    # each with 1/0 for "present in the species list". Comparing against the
    # complete previous level (not just its new ids) stops earlier candidates
    # from resurfacing; this assumes each level's matches include the previous
    # level's, which holds when field 3 is a full lineage -- confirm.
    # A single awk pass replaces one awk process per candidate.
    awk -v prev="$previous_set" -v known="$path1" '
      BEGIN {
        FS = "\t"
        while ((getline id < prev) > 0) seen[id]
        while ((getline rec < known) > 0) { split(rec, col, "\t"); listed[col[1]] }
      }
      !($1 in seen) { print $1 "\t" (($1 in listed) ? 1 : 0) }
    ' "$current_set" > "$fresh_set"

    # The candidate list is read on fd 3 so the interactive prompt below can
    # still read the user's answer from stdin.
    while IFS=$'\t' read -r i hit <&3; do
      if [[ $hit == 1 ]]; then
        re1="近缘物种存在,taixid=${i}"
        echo "$re1"
        read -r -p "whether download PPI related files ?(y/n/s)" args1
        if [[ $args1 == *y* ]]; then
          dppi "$i" "$ver"
          # A completed pick is a success, not an error.
          exit 0
        elif [[ $args1 == *s* ]]; then
          echo "$i"' passed !'
        elif [[ $args1 == *n* ]]; then
          exit 1
        fi
      else
        echo "$i" 'Passed ...'
      fi
    done 3< "$fresh_set"

    mv -- "$current_set" "$previous_set"
  done
fi
网友评论