计算机知识
#赋予管理员权限
sudo chown -R $USER XXX.fasta
#进入ROOT模式
su
#添加环境变量
$ vim ~/.bashrc
在打开文件的最后一行加入路径。
export PATH=SPATH:/path/to/blast-2.2.26/bin
export PATH=SPATH: /usr/local/bin/mcl
export PATH-SPATH:/path/to/orthomcl/bin
序列处理
#提取大于多少小于多少的长度的序列
reformat.sh in=temp 1.fa out=2.fa minlength=300
reformat.sh in=temp 1.fa out=2.fa maxlength=600
#提取xxx.fa文件的最大\小的序列输入到新文件里面
cat xxx.fa | seqkit seq -M 600 > xxxnew.fa
cat xxx.fa | seqkit seq -m 300 > xxxnew.fa
#seqkit 从基因组根据ID提取序列
seqkit grep -f list.fa test.fa > new.fa
#提取基因id
seqkit fx2tab Echi-Tcas_P450.fasta -i -n >2.fa
seqkit grep -f gene test.fa |seqkit seq -i >gene.fa
#提取序列id
cat blastp.out |awk '$3>75' |cut -f1 |sort -u > blastp_result_id.list
#合并id列表
comm -12 blastp_result_id.list hmm_out_id.list
common.list
cat 1.fa 2.fa 3.fa >final.fa
#统计显示根据ID筛选出来的条目
cat maker.proteins.uniprot.fasta | seqkit grep -r -p "Epicauta_chinensis_00006884-RA" | seqkit stat
构建系统发育树
#使用mafft比对
mafft input.fasta > output.fasta
#使用trimal裁剪序列
trimal -in t2.mafft.fa -out t2.trim.fa -automated1
#使用IQtree建树(MFP为模型)
iqtree -s Tene_db1.fasta -m MFP -bb 1000 -alrt 1000 -nt AUTO
iqtree -s input.fas -mset LG -bb 1000 -nt AUTO
本地比对
#blastp运行
makeblastdb -in ref.nbs.plant.fa -dbtype prot
基因家族代码
##extract gene families from genome and proteins
cd 15-gene_families
#download reference protein sequences from NCBI, such GST famiy
#blastp-like search against proteinsDB
##mmseqs easy-search Bmori_GST.fasta genome.fa blastp.out tmp -s 7.5 --alignment-mode 3 --num-iterations 4 -e 0.001 --format-mode 2
#extrect query and target DB
mmseqs createdb genome.fa genomeDB
mmseqs createdb proteins.fa proteinsDB
#erect index file for the targetDB
mmseqs createindex genomeDB tmp
mmseqs createindex proteinsDB tmp
cd GST
mmseqs createdb ../Bmori_GST.fasta queryDB
cd GST/AA
mmseqs search ../queryDB ../../proteinsDB result_AADB tmp -s 7.5 --alignment-mode 3 --num-iterations 4 -e 0.001 #(-e 10 1e-5) , --mi
n-seq-id 0.5
mmseqs convertalis ../queryDB ../../proteinsDB result_AADB blastp.out --format-mode 2
cat blastp.out | awk '{print $2}' | sort | uniq > blastp.AA.list
cat ../../proteins.fa | seqkit grep -f blastp.AA.list | seqkit seq -w 0 > blastp.AA.fa
cd-hit -i blastp.AA.fa -o blastp.AA.cdhit95.fa -c 0.95 -n 5 -M 12000 -d 0 -T 8 #0.99 for CSP and very conserved genes
#check GSTs by HMMER3 search (cutoff E-value = 0.001) using the Pfam database to confirm conserved domains (online or local interproscan)
HMMER-Pfam results were further checked by blastp in the nonredundant GenBank protein database.
interproscan.sh -dp -b blastp.AA.interproscan -f TSV -iprlookup -t p -cpu 15 -i blastp.AA.fa -appl Pfam
#HMMER-Pfam results were further checked by blastp in the nonredundant GenBank protein database.
cat blastp.AA.interproscan.tsv | grep "Olfactory receptor" | awk '{print $1}' | sed "s/>//g" | sort | uniq > hmmer-pfam.list
cat blastp.AA.fa | seqkit grep -f hmmer-pfam.list | seqkit seq -u -w 0 > hmmer-pfam.fa
cd-hit -i hmmer-pfam.fa -o hmmer-pfam.cdhit99.fa -c 0.99 -n 5 -M 12000 -T 16
#cat hmmer-pfam.results | grep "GST_" | awk '{print $1}' | sed "s/>//g" | sort | uniq > hmmer-pfam.list
#cat blastp.AA.fa | seqkit grep -f hmmer-pfam.list | seqkit seq -u -w 0 > hmmer-pfam.fa
#cd-hit -i hmmer-pfam.fa -o hmmer-pfam.cdhit99.fa -c 0.99 -n 5 -M 12000 -T 16
#check sequences too short or too long, 40 remaining
#tblatn-like search against genomeDB using mmseq2
cd GST/genome
mmseqs search ../queryDB ../../genomeDB result_genomeDB tmp -s 7.5 --alignment-mode 3 --num-iterations 4 -e 10
mmseqs convertalis ../queryDB ../../genomeDB result_genomeDB tblastn.out --format-output 'query,target,qstart,qend,qlen,tstart,tend,tlen,evalue,
alnlen,pident,qaln,taln'
#mmseqs easy-search template.fasta genome.picauta_chinensis.fa tblastn_aln.cce tmp -s 7.5 --alignment-mode 3 --num-iterations 4 -e 0.001 --forma
t-output 'query,target,evalue,pident,nident,qstart,qend,qlen,tstart,tend,tlen,alnlen,mismatch,bits,qcov,tcov,qframe,tframe,qaln,taln'
#sort by target, qstart and length using excel or csvtk
cat tblastn.out | csvtk sort -H -t -k2 -k6:n -k9:n | nl -b a -n rz | sed "s/^/>/g" > tblastn.temp1
#reduce the multi-hits to the same gene region
cat tblastn.temp1 | awk '{print $1,$14}' | sed "s/-//g" | sed "s/ /\n/g" > tblastn.temp2.fa
cd-hit -i tblastn.temp2.fa -o tblastn.temp2.cdhit99.fa -c 0.99 -n 5 -M 12000 -T 8
cat tblastn.temp2.cdhit99.fa | grep ">" | sed "s/>//g" | sort > tblastn.temp3.list
for list in $(cat tblastn.temp3.list); do cat tblastn.temp1 | grep ">$list" >> tblastn.temp4;done #delete invalid matches
#filter the identical regions matching GSTs prodicted from annotated proteins
mmseqs easy-search tblastn.temp2.cdhit99.fa ../AA/blastp.AA.cdhit95.fa tblastn.blastp.out tmp -s 7.5 --alignment-mode 3 -e 0.001 --format-output
'query,target,qstart,qend,qlen,tstart,tend,tlen,evalue,alnlen,pident,qaln,taln' --min-seq-id 0.9 #0.99 for CSP
cat tblastn.blastp.out | awk '{print $1}' | sort | uniq > matched.list
for list in $(cat matched.list); do sed -i "s/>"$list"/>"$list"***/g" tblastn.temp4; done
#manually merge exons
cat ../genome.fa | seqkit grep -r -p "Sexi_chr31" | seqkit subseq -r 7359364:7359604 | seqkit seq -w 0 > test.fasta
#accurately determine Reading frames and intron/exon boundaries
#align candidates to annotated proteins or use tools GeneWise, exonerate or GenomeThreader
gth -xmlout -o gth.xml -genomic test.fasta -protein ../Bmori_GST.fasta -cdna transcripts.fa
cat gth.xml | grep "<predicted_protein_sequence>" | sed "s/<predicted_protein_sequence>//g" | sed "s/<\/predicted_protein_sequence>//g" >> test.
aa.fasta
#compare newly preidicted genes to AA
mmseqs easy-search unmatched.fa ../AA/blastp.AA.cdhit95.fa unmatched.blastp.out tmp -s 7.5 --alignment-mode 3 -e 0.001 --format-output 'query,ta
rget,qstart,qend,qlen,tstart,tend,tlen,evalue,alnlen,pident,qaln,taln'
cd ..
cat genome/unmatched.fa AA/blastp.AA.cdhit95.fa > GSTs.fa #44 totally
#cd-hit -i P450s.fa -o P450s.cdhit95.fa -c 0.95 -n 5 -M 12000 -T 8
#check their positions on chromosomes
cd tree
#cat ../CCEs.fa | sed "s/ protein /_protein_/g" | sed "s/_protein_Name/ /g" > CCEs.fa
linsi --thread 8 ../ABCs.fa > ABC.mafft.fas
trimal -in ABC.mafft.fas -out ABC.trim.fas -automated1
iqtree -s input.fas -mset LG -bb 1000 -nt AUTO
mmseqs easy-search ../ABCs.fa ../../genome.fa blastp.out tmp -s 7.5 --alignment-mode 3 -e 0.001 --format-output 'query,target,qstart,qend,qlen,t
start,tend,tlen,evalue,alnlen,pident,qaln,taln' --min-seq-id 0.9
网友评论