通过NCBItaxid获取string有收录的近缘物种信息并下载

作者: 吃吃吃_就知道吃 | 来源:发表于2022-01-07 16:12 被阅读0次

通过NCBItaxid获取string有收录的近缘物种信息并下载
Call SNP用MUMmer
Java反射基础
通过URL获取页面信息
GtRNAdb:tRNA数据库简介
Request获取请求信息-常用API
获取IP地址信息
String的Json数据怎样取值
cytoscape绘制互作网络图
获取信息的方式

STRING 收录的所有物种列表的下载页面：https://cn.string-db.org/cgi/download?sessionId=bUHT3Irc05kI ，在该页面下载 species.v*.txt 文件（* 为 STRING 版本号）。

string.png
NCBI taxonomy（taxid）数据下载：https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump_archive/

image

下载最新的压缩包，并从中解压出 taxidlineage.dmp 文件。
准备好 species.v*.txt 和 taxidlineage.dmp 这两个文件（如为压缩文件需先解压），记下它们的绝对路径，并分别填入脚本中的 path1 和 path2。
下面是shell 脚本：

# $1  NCBI taxid of the species to look up; if it is not in STRING, closely related species are searched instead

# $2  STRING version, defaults to 11.5. To find the right value, search any species on STRING and look at the download link “X[.protein.links.v*.txt.gz](https://stringdb-static.org/download/protein.links.v11.5/9606.protein.links.v11.5.txt.gz)”; the number in place of * is the version

# At the prompt 'whether download PPI related files ?(y/n/s)': y = download the PPI sequences|aliases|links.detailed files, n = do not download, s = skip this taxid and search for the next one

# Tab-separated STRING species list (species.v*.txt); its first column is compared against taxids.
path1=/cluster/home/fygong/data/protein/species.v11.0.txt
# NCBI taxidlineage.dmp; read with tab as field separator: field 1 is the taxid,
# field 3 is the space-separated lineage of that taxid.
path2=/cluster/home/fygong/data/protein/taxidlineage.dmp

# Abort on any failing command (-e) and on expansion of an unset variable (-u).
set -eu
#######################################
# Download and unpack the STRING protein files of one species into the
# current directory: protein.sequences (.fa), protein.aliases (.txt) and
# protein.links.detailed (.txt).
#
# The remote sequence file is probed first with `wget --spider`; the exit
# status of that probe decides whether the download is attempted, so the
# check is independent of wget's message language and cannot abort a
# caller running under `set -e`.
#
# Arguments:
#   $1 - NCBI taxid of the species
#   $2 - STRING version (e.g. 11.5)
# Outputs:
#   Progress and result messages on stdout.
# Returns:
#   0 unless gunzip itself fails.
#######################################
function dppi()
{
  local taxid=$1
  local version=$2
  local base_url='https://stringdb-static.org/download'

  local seq_file="${taxid}.protein.sequences.v${version}.fa.gz"
  local alias_file="${taxid}.protein.aliases.v${version}.txt.gz"
  local links_file="${taxid}.protein.links.detailed.v${version}.txt.gz"

  local seq_url="${base_url}/protein.sequences.v${version}/${seq_file}"
  local alias_url="${base_url}/protein.aliases.v${version}/${alias_file}"
  local links_url="${base_url}/protein.links.detailed.v${version}/${links_file}"

  echo 'Check URL:'"${seq_url}"
  echo 'Start downloding......'

  # NOTE(review): --no-check-certificate disables TLS verification; it is kept
  # from the original script, confirm it is really needed.
  if wget --no-check-certificate --spider "${seq_url}" > /dev/null 2>&1; then
    local pids=()
    local pid
    local failed=0

    # Fetch the three files in parallel.
    wget --no-check-certificate "${seq_url}" > /dev/null 2>&1 &
    pids+=("$!")
    wget --no-check-certificate "${alias_url}" > /dev/null 2>&1 &
    pids+=("$!")
    wget --no-check-certificate "${links_url}" > /dev/null 2>&1 &
    pids+=("$!")

    # Collect each job's status; a failed wget can leave a partial file
    # behind, so file existence alone is not proof of success.
    for pid in "${pids[@]}"; do
      wait "${pid}" || failed=1
    done

    if [[ ${failed} -eq 0 && -e ${links_file} && -e ${alias_file} && -e ${seq_file} ]]; then
      echo 'Start unzip ......'
      # Unpack only the files fetched here, not every *.gz in the directory.
      gunzip -- "${links_file}" "${alias_file}" "${seq_file}"
      echo 'Download & unzip successed !'
    else
      echo 'remote file exist,but not successed download,please check !'
    fi
  else
    echo 'string link file not exist,plese check !(visit:https://cn.string-db.org/cgi/download?sessionId=bUHT3Irc05kI & search taxid & check version)'
  fi
}
# -----------------------------------------------------------------------------
# Main flow.
#
# Usage: <script> <ncbi_taxid> [string_version]
#
# 1. Look the taxid up in the STRING species list ($path1). If present, offer
#    to download its PPI files (y), do nothing (n), or skip it and search for
#    related species instead (s).
# 2. Otherwise (or after "s"), walk the taxid's lineage in $path2 from the
#    taxid itself up towards the root. At each level every taxid whose line
#    mentions that level, and that was not examined at an earlier level, is
#    looked up in the STRING species list, and the user is prompted for each
#    hit.
#
# Exit status: 0 on normal completion (including after the user answers y or
# n), 1 when an input file or the lineage is missing, 2 on a usage error.
# -----------------------------------------------------------------------------
if [[ $# -lt 1 || $# -gt 2 ]]; then
  echo "Usage: $0 <ncbi_taxid> [string_version]" >&2
  exit 2
fi
query_taxid=$1
ver=${2:-11.5}

for required in "$path1" "$path2"; do
  if [[ ! -r $required ]]; then
    echo "Cannot read input file: $required" >&2
    exit 1
  fi
done

# Single lookup: the result is both shown to the user and used as the flag.
re1=$(awk -v spe="$query_taxid" 'BEGIN{FS="\t"}{if($1 == spe){print "物种存在,taixid="$1}}' "$path1")

if [[ -n $re1 ]]; then
  printf '%s\n' "$re1"
  read -r -p "whether download PPI related files ?(y/n/s)" args1
  if [[ $args1 == *y* ]]; then
    dppi "$query_taxid" "$ver"
  elif [[ $args1 == *s* ]]; then
    # Fall through to the related-species search below.
    re1=''
  fi
fi

if [[ -z $re1 ]]; then
  echo '搜索近缘物种.....'

  # Scratch files live in a private directory that is removed on any exit,
  # so stale files from an earlier run can never influence this one.
  workdir=$(mktemp -d)
  trap 'rm -rf -- "$workdir"' EXIT
  lineage_file=$workdir/link_id
  level_ids=$workdir/level_ids
  new_ids=$workdir/new_ids
  seen_ids=$workdir/seen_ids
  : > "$seen_ids"

  # One taxid per line: the ancestors in file order, then the query taxid.
  awk -v spe="$query_taxid" 'BEGIN{FS="\t"}{if($1 == spe){n=split($3,a," ");for(i=1;i<=n;i++){print a[i]}print spe}}' "$path2" > "$lineage_file"

  lineage=()
  while IFS= read -r ancestor; do
    lineage+=("$ancestor")
  done < "$lineage_file"

  if [[ ${#lineage[@]} -eq 0 ]]; then
    echo "taxid $query_taxid not found in $path2" >&2
    exit 1
  fi

  # Walk from the last entry (the query taxid) back to the first one.
  for (( idx = ${#lineage[@]} - 1; idx >= 0; idx-- )); do
    tarid=${lineage[idx]}
    echo "line=$(( idx + 1 ))"
    echo "tarid=$tarid"
    sleep 1

    # Every taxid whose lineage line mentions $tarid as a whole word.
    grep -w -- "$tarid" "$path2" | awk 'BEGIN{FS="\t"}{print $1}' > "$level_ids"

    # Keep only taxids that no earlier level has examined. The -s guard is
    # needed because the NR==FNR idiom misbehaves on an empty first file.
    if [[ -s $seen_ids ]]; then
      awk 'NR==FNR{a[$1];next}!($1 in a){print $1}' "$seen_ids" "$level_ids" > "$new_ids"
    else
      cp -- "$level_ids" "$new_ids"
    fi
    cat -- "$new_ids" >> "$seen_ids"

    # The candidate list is read on fd 3 so the prompt can still use stdin.
    while IFS= read -r i <&3; do
      re1=$(awk -v spe="$i" 'BEGIN{FS="\t"}{if($1 == spe){print "近缘物种存在,taixid="$1}}' "$path1")
      if [[ -n $re1 ]]; then
        printf '%s\n' "$re1"
        read -r -p "whether download PPI related files ?(y/n/s)" args1
        if [[ $args1 == *y* ]]; then
          dppi "$i" "$ver"
          exit 0
        elif [[ $args1 == *s* ]]; then
          echo "$i passed !"
        elif [[ $args1 == *n* ]]; then
          exit 0
        fi
      else
        echo "$i Passed ..."
      fi
    done 3< "$new_ids"
  done
fi