美文网首页
代码积累

代码积累

作者: MLD_TRNA | 来源:发表于2021-04-18 14:07 被阅读0次

    计算机知识

    #赋予管理员权限 
    sudo chown -R $USER  XXX.fasta 
    #进入ROOT模式
    su
    #添加环境变量
    

    $ vim ~/.bashrc

    在打开文件的最后一行加入路径。
    export PATH=SPATH:/path/to/blast-2.2.26/bin
    export PATH=SPATH: /usr/local/bin/mcl
    export PATH-SPATH:/path/to/orthomcl/bin
    

    序列处理

    #提取大于多少小于多少的长度的序列  
    reformat.sh in=temp 1.fa out=2.fa minlength=300 
    reformat.sh in=temp 1.fa out=2.fa maxlength=600
    #提取xxx.fa文件的最大\小的序列输入到新文件里面
    cat xxx.fa | seqkit seq -M 600 >  xxxnew.fa
    cat xxx.fa | seqkit seq -m 300 >  xxxnew.fa
    #seqkit 从基因组根据ID提取序列
    seqkit grep -f list.fa test.fa > new.fa
    #提取基因id
    seqkit fx2tab Echi-Tcas_P450.fasta -i -n >2.fa
    seqkit grep -f gene test.fa |seqkit seq -i >gene.fa
    #提取序列id
    cat blastp.out |awk '$3>75' |cut -f1 |sort -u > blastp_result_id.list
    #合并id列表
    comm -12 blastp_result_id.list hmm_out_id.list 
     common.list
    cat 1.fa 2.fa 3.fa >final.fa
    #统计显示根据ID筛选出来的条目
    cat maker.proteins.uniprot.fasta | seqkit grep -r -p "Epicauta_chinensis_00006884-RA" | seqkit stat
    

    构建系统发育树

    #使用mafft比对 
    mafft input.fasta > output.fasta
    #使用trimal裁剪序列
    trimal -in t2.mafft.fa -out t2.trim.fa -automated1
    #使用IQtree建树(MFP为模型)
    iqtree -s Tene_db1.fasta -m MFP -bb 1000 -alrt 1000 -nt AUTO
    iqtree -s input.fas -mset LG -bb 1000 -nt AUTO
    

    本地比对

    #blastp运行
    makeblastdb -in ref.nbs.plant.fa -dbtype prot
    

    基因家族代码

    ##extract gene families from genome and proteins
    cd 15-gene_families
    #download reference protein sequences from NCBI, such GST famiy
    #blastp-like search against proteinsDB
    ##mmseqs easy-search Bmori_GST.fasta genome.fa blastp.out tmp -s 7.5 --alignment-mode 3 --num-iterations 4 -e 0.001 --format-mode 2
    
    #extrect query and target DB
    mmseqs createdb genome.fa genomeDB
    mmseqs createdb proteins.fa proteinsDB
    
    #erect index file for the targetDB
    mmseqs createindex genomeDB tmp
    mmseqs createindex proteinsDB tmp
    
    cd GST
    mmseqs createdb ../Bmori_GST.fasta queryDB
    cd GST/AA
    mmseqs search ../queryDB ../../proteinsDB result_AADB tmp -s 7.5 --alignment-mode 3 --num-iterations 4 -e 0.001       #(-e   10     1e-5) , --mi
    n-seq-id 0.5 
    mmseqs convertalis ../queryDB ../../proteinsDB result_AADB blastp.out --format-mode 2
    cat blastp.out | awk '{print $2}' | sort | uniq > blastp.AA.list
    cat ../../proteins.fa | seqkit grep -f blastp.AA.list | seqkit seq -w 0 > blastp.AA.fa
    cd-hit -i blastp.AA.fa -o blastp.AA.cdhit95.fa -c 0.95 -n 5 -M 12000 -d 0 -T 8  #0.99 for CSP and very conserved genes
    
    #check GSTs by HMMER3 search (cutoff E-value = 0.001) using the Pfam database to confirm conserved domains (online or local interproscan)
    HMMER-Pfam results were further checked by blastp in the nonredundant GenBank protein database.
    interproscan.sh -dp -b blastp.AA.interproscan -f TSV -iprlookup -t p -cpu 15 -i blastp.AA.fa -appl Pfam
    #HMMER-Pfam results were further checked by blastp in the nonredundant GenBank protein database.
    cat blastp.AA.interproscan.tsv | grep "Olfactory receptor" | awk '{print $1}' | sed "s/>//g" | sort | uniq > hmmer-pfam.list
    cat blastp.AA.fa | seqkit grep -f hmmer-pfam.list | seqkit seq -u -w 0 > hmmer-pfam.fa
    cd-hit -i hmmer-pfam.fa -o hmmer-pfam.cdhit99.fa -c 0.99 -n 5 -M 12000 -T 16
    #cat hmmer-pfam.results | grep "GST_" | awk '{print $1}' | sed "s/>//g" | sort | uniq > hmmer-pfam.list
    #cat blastp.AA.fa | seqkit grep -f hmmer-pfam.list | seqkit seq -u -w 0 > hmmer-pfam.fa
    #cd-hit -i hmmer-pfam.fa -o hmmer-pfam.cdhit99.fa -c 0.99 -n 5 -M 12000 -T 16
    #check sequences too short or too long, 40 remaining
    
    #tblatn-like search against genomeDB using mmseq2
    cd GST/genome
    mmseqs search ../queryDB ../../genomeDB result_genomeDB tmp -s 7.5 --alignment-mode 3 --num-iterations 4 -e 10
    mmseqs convertalis ../queryDB ../../genomeDB result_genomeDB tblastn.out --format-output 'query,target,qstart,qend,qlen,tstart,tend,tlen,evalue,
    alnlen,pident,qaln,taln'
    #mmseqs easy-search template.fasta genome.picauta_chinensis.fa tblastn_aln.cce tmp -s 7.5 --alignment-mode 3 --num-iterations 4 -e 0.001 --forma
    t-output 'query,target,evalue,pident,nident,qstart,qend,qlen,tstart,tend,tlen,alnlen,mismatch,bits,qcov,tcov,qframe,tframe,qaln,taln'
    
    #sort by target, qstart and length using excel or csvtk
    cat tblastn.out | csvtk sort -H -t -k2 -k6:n -k9:n | nl -b a -n rz | sed "s/^/>/g" > tblastn.temp1
    #reduce the multi-hits to the same gene region
    cat tblastn.temp1 | awk '{print $1,$14}' | sed "s/-//g" | sed "s/ /\n/g" > tblastn.temp2.fa
    cd-hit -i tblastn.temp2.fa -o tblastn.temp2.cdhit99.fa -c 0.99 -n 5 -M 12000 -T 8
    cat tblastn.temp2.cdhit99.fa | grep ">" | sed "s/>//g" | sort > tblastn.temp3.list
    for list in $(cat tblastn.temp3.list); do cat tblastn.temp1 | grep ">$list" >> tblastn.temp4;done #delete invalid matches
    #filter the identical regions matching GSTs prodicted from annotated proteins 
    mmseqs easy-search tblastn.temp2.cdhit99.fa ../AA/blastp.AA.cdhit95.fa tblastn.blastp.out tmp -s 7.5 --alignment-mode 3 -e 0.001 --format-output
     'query,target,qstart,qend,qlen,tstart,tend,tlen,evalue,alnlen,pident,qaln,taln' --min-seq-id 0.9   #0.99 for CSP
    cat tblastn.blastp.out | awk '{print $1}' | sort | uniq > matched.list
    for list in $(cat matched.list); do sed -i "s/>"$list"/>"$list"***/g" tblastn.temp4; done
    
    #manually merge exons
    cat ../genome.fa | seqkit grep -r -p "Sexi_chr31" | seqkit subseq -r 7359364:7359604 | seqkit seq -w 0 > test.fasta
    #accurately determine Reading frames and intron/exon boundaries
    #align candidates to annotated proteins or use tools GeneWise, exonerate or GenomeThreader
    gth -xmlout -o gth.xml -genomic test.fasta -protein ../Bmori_GST.fasta -cdna transcripts.fa
    cat gth.xml | grep "<predicted_protein_sequence>" | sed "s/<predicted_protein_sequence>//g" | sed "s/<\/predicted_protein_sequence>//g" >> test.
    aa.fasta
    #compare newly preidicted genes to AA
    mmseqs easy-search unmatched.fa ../AA/blastp.AA.cdhit95.fa unmatched.blastp.out tmp -s 7.5 --alignment-mode 3 -e 0.001 --format-output 'query,ta
    rget,qstart,qend,qlen,tstart,tend,tlen,evalue,alnlen,pident,qaln,taln'
    cd ..
    cat genome/unmatched.fa AA/blastp.AA.cdhit95.fa > GSTs.fa   #44 totally
    #cd-hit -i P450s.fa -o P450s.cdhit95.fa -c 0.95 -n 5 -M 12000 -T 8
    
    #check their positions on chromosomes
    cd tree
    #cat ../CCEs.fa | sed "s/ protein /_protein_/g" | sed "s/_protein_Name/ /g" > CCEs.fa
    linsi --thread 8 ../ABCs.fa > ABC.mafft.fas
    trimal -in ABC.mafft.fas -out ABC.trim.fas -automated1
    iqtree -s input.fas -mset LG -bb 1000 -nt AUTO
    
    mmseqs easy-search ../ABCs.fa ../../genome.fa blastp.out tmp -s 7.5 --alignment-mode 3 -e 0.001 --format-output 'query,target,qstart,qend,qlen,t
    start,tend,tlen,evalue,alnlen,pident,qaln,taln' --min-seq-id 0.9
    

    相关文章

      网友评论

          本文标题:代码积累

          本文链接:https://www.haomeiwen.com/subject/rtrlkltx.html