美文网首页转录组分析
全长转录组分析

全长转录组分析

作者: 宗肃書 | 来源:发表于2022-01-10 11:09 被阅读0次

    sample1

    • sample1基本测序信息统计
    1.统计reads数
    samtools view m64082_210415_203420.subreads.1--1.bam| wc -l
    16394742                 #sample1 reads number
    2.统计bases数
    samtools view m64082_210415_203420.subreads.1--1.bam| cut -f10 |wc -c
    51688834095
    51688834095-16394742=51672439353  #sample1 总bases数
    3.统计reads长度分布
    samtools view m64082_210415_203420.subreads.1--1.bam | cut -f10 |  awk '{print length($1);}'  | sort -n -r > sample1.subreads.length
    4.计算N50
    samtools fasta m64082_210415_203420.subreads.1--1.bam >m64082_210415_203420.subreads.1--1.fasta
    perl ~/soft/N50_N90.pl m64082_210415_203420.subreads.1--1.fasta >sample1.subreads.N50 &
    N50: 3465
    
    • 画频数分布直方图
    # Histogram of subread lengths for sample1 (200-bp bins).
    setwd("C:/Users/TE/Desktop/全长转录组/绍梅师姐/sample1/subreads")
    library(ggplot2)
    subread_len <- read.table(file = "sample1.subreads.length")
    subread_len$V1 <- as.integer(subread_len$V1)
    p <- ggplot(subread_len, aes(x = V1)) +
      geom_histogram(binwidth = 200, fill = "#6e6dcd", color = "#e9ecef", alpha = 0.9) +
      theme_bw() +
      scale_x_continuous(name = "Read Length", limits = c(0, 20000)) +
      scale_y_continuous(name = "Reads", limits = c(0, 1500000))
    ggsave("sample1.subreads.length.tiff", p, height = 4, width = 8)
    
    • 获取一致性序列CCS
    # Generate consensus (CCS) reads from the subread BAM.
    # Fix: output was named sample0.ccs.bam, but every downstream command
    # (samtools view / lima / isoseq3) reads sample1.ccs.bam.
    nohup /public/jychu/soft/smrtlink/smrtcmds/bin/ccs --min-passes 2 --min-length 50 --max-length 15000 --min-rq 0.9 --num-threads 20 m64082_210415_203420.subreads.1--1.bam sample1.ccs.bam &
    
    • 统计CCS信息
    1.统计reads数
    samtools view sample1.ccs.bam | wc -l
    628188    #reads number
    2.统计bases数
    samtools view sample1.ccs.bam | cut -f10 |wc -c
    2205667668
    2205667668-628188=2205039480     #all bases
    3.统计序列长度、FP、质量值
    samtools view sample1.ccs.bam | cut -f10 |  awk '{print length($1);}' | sort -n -r > sample1.ccs.length
    samtools view sample1.ccs.bam |cut -f16 | tr ":" "\t" | cut -f3 > sample1.Fullpasses
    samtools view sample1.ccs.bam |cut -f17 | tr ":" "\t" | cut -f3 > sample1.Quality
    cat sample1.Quality| awk '{sum+=$1} END {print "Avg =", sum/NR}'
    Avg = 0.995584
    cat sample1.Fullpasses| awk '{sum+=$1} END {print "Avg =", sum/NR}'
    Avg = 22.9421
    
    • 画频数分布直方图
    # Histogram of CCS read lengths for sample1 (200-bp bins).
    setwd("C:/Users/TE/Desktop/全长转录组/绍梅师姐/sample1/ccs")
    library(ggplot2)
    ccs_len <- read.table(file = "sample1.ccs.length")
    ccs_len$V1 <- as.integer(ccs_len$V1)
    p <- ggplot(ccs_len, aes(x = V1)) +
      geom_histogram(binwidth = 200, fill = "#6498C8", color = "#e9ecef", alpha = 0.9) +
      theme_bw() +
      scale_x_continuous(name = "Read Length", limits = c(0, 15000)) +
      scale_y_continuous(name = "Reads", limits = c(0, 50000))
    ggsave("sample1.ccs.length.tiff", p, height = 4, width = 8)
    # Histogram of per-read CCS quality values (x restricted to [0.9, 1]
    # because ccs was run with --min-rq 0.9).
    rm(list = ls())
    setwd("C:/Users/TE/Desktop/全长转录组/绍梅师姐/sample1/ccs")
    library(ggplot2)
    qual <- read.table(file = "sample1.Quality")
    p <- ggplot(qual, aes(x = V1)) +
      geom_histogram(binwidth = 0.0008, fill = "#87C966", color = "#e9ecef", alpha = 0.9) +
      theme_bw() +
      scale_x_continuous(name = "Quality", limits = c(0.9, 1)) +
      scale_y_continuous(name = "Reads", limits = c(0, 30000))
    ggsave("sample1.ccs.quality.tiff", p, height = 4, width = 8)
    # Histogram of full-pass counts per CCS read.
    rm(list = ls())
    setwd("C:/Users/TE/Desktop/全长转录组/绍梅师姐/sample1/ccs")
    library(ggplot2)
    passes <- read.table(file = "sample1.Fullpasses")
    passes$V1 <- as.integer(passes$V1)
    p <- ggplot(passes, aes(x = V1)) +
      geom_histogram(binwidth = 2, fill = "#F4AB58", color = "#e9ecef", alpha = 0.9) +
      theme_bw() +
      scale_x_continuous(name = "Number of Passes", limits = c(0, 200)) +
      scale_y_continuous(name = "Reads", limits = c(0, 60000))
    ggsave("sample1.ccs.Fullpasses.tiff", p, height = 4, width = 8)
    
    • CCS分类
    1.统计含有5p引物序列数
    vi 5p.fasta
    >primer_5p
    GCAATGAAGTCGCAGGGTTGGG
     /public/jychu/soft/smrtlink/smrtcmds/bin/lima  --peek-guess -j 20 sample1.ccs.bam 5p.fasta sample1.5p.bam
    samtools view sample1.5p.bam| wc -l
    563560       #Number of five-five prime reads
    rm -f sample1.5p.*
    2.统计含有3p引物序列数
    vi 3p.fasta
    >primer_3p
    AAGCAGTGGTATCAACGCAGAGTAC
    /public/jychu/soft/smrtlink/smrtcmds/bin/lima  --peek-guess -j 20 sample1.ccs.bam 3p.fasta sample1.3p.bam
    samtools view sample1.3p.bam| wc -l
    615510          #Number of three-three prime reads
    rm -f sample1.3p.*
    3.统计同时含有3p和5p的序列数
    vi IsoSeqPrimers.fasta
    >primer_5p
    GCAATGAAGTCGCAGGGTTGGG
    >primer_3p
    AAGCAGTGGTATCAACGCAGAGTAC
    /public/jychu/soft/smrtlink/smrtcmds/bin/lima --isoseq --dump-clips --peek-guess -j 20 sample1.ccs.bam IsoSeqPrimers.fasta sample1.fl.bam &
    samtools view sample1.fl.primer_5p--primer_3p.bam |  wc -l
    540612                     #Number of five-three prime reads
    mv  sample1.fl.* fl/
    
    • isoseq3 refine(去除polyA和嵌合序列)
    1.去除polyA和嵌合序列
    /public/jychu/soft/smrtlink/smrtcmds/bin/isoseq3 refine --require-polya --min-polya-length 20 -j 4 sample1.fl.primer_5p--primer_3p.bam IsoSeqPrimers.fasta sample1.flnc.bam
    mkdir flnc
    mv sample1.flnc.* flnc/
    2.统计全长非嵌合序列数
    samtools view sample1.flnc.bam |wc -l
    539221          #Number of full-length non-chimeric reads
    3.统计序列长度
    samtools view sample1.flnc.bam | cut -f10 |  awk '{print length($1);}' | sort -n -r > sample1.flnc.length
    cat sample1.flnc.length| awk '{sum+=$1} END {print "Avg =", sum/NR}'
    Avg = 3413.56       #Mean length of flnc
    4.统计reads长度密度分布
    把sample1.flnc.length下载到文件夹C:/Users/TE/Desktop/全长转录组/绍梅师姐/sample1/flnc  
    # Density plot of full-length non-chimeric (FLNC) read lengths.
    setwd("C:/Users/TE/Desktop/全长转录组/绍梅师姐/sample1/flnc")
    library(ggplot2)
    flnc_len <- read.table(file = "sample1.flnc.length")
    flnc_len$V1 <- as.integer(flnc_len$V1)
    p <- ggplot(flnc_len, aes(x = V1)) +
      # size sets the width of the density curve's outline
      geom_density(fill = "#fdd5d3", color = "#e06666", alpha = 0.9, size = 0.4, linetype = "solid") +
      theme_bw() +
      scale_x_continuous(name = "Sequence length(bp)", breaks = seq(0, 12000, 2000), limits = c(0, 12000)) +
      theme(plot.title = element_text(size = 15, face = "bold", vjust = 0.5, hjust = 0.5),
            panel.background = element_rect(fill = "transparent", colour = NA),
            panel.grid.minor = element_blank(),
            panel.grid.major = element_blank(),
            plot.background = element_rect(fill = "transparent", colour = NA))
    ggsave("sample1.flnc.length.tiff", p, height = 4, width = 6)
    
    • FLNC聚类
    1.序列聚类
    mkdir clustered
    # Cluster FLNC reads into consensus isoforms.
    # Fix: the original line was missing the space between the input BAM and the
    # output path ("sample1.flnc.bamclustered/..."), so isoseq3 would fail to
    # find the input file. isoseq3 cluster takes <flnc.bam> <out.bam>.
    /public/jychu/soft/smrtlink/smrtcmds/bin/isoseq3 cluster sample1.flnc.bam clustered/sample1.clustered.bam -j 20 --verbose --use-qvs
    cd clustered
    2.统计一致性转录本数目
    samtools view sample1.clustered.bam | wc -l
    46314                #Number of consensus isoforms(一致转录本)
    samtools view sample1.clustered.hq.bam | wc -l
    45663               #Number of high-quality isoforms
    samtools view sample1.clustered.lq.bam | wc -l
    651                   #Number of polished low-quality isoforms
    3.统计一致性转录本序列长度
    samtools view sample1.clustered.bam | cut -f10 |  awk '{print length($1);}' | sort -n -r > sample1.clustered.length
    4.统计一致性转录本reads长度密度分布   
    下载sample1.clustered.length到目录C:/Users/TE/Desktop/全长转录组/绍梅师姐/sample1/clustered
    统计一致性转录本reads长度密度分布   
    # Density plot of consensus-isoform (clustered) read lengths.
    setwd("C:/Users/TE/Desktop/全长转录组/绍梅师姐/sample1/clustered")
    library(ggplot2)
    clust_len <- read.table(file = "sample1.clustered.length")
    clust_len$V1 <- as.integer(clust_len$V1)
    p <- ggplot(clust_len, aes(x = V1)) +
      # size sets the width of the density curve's outline
      geom_density(fill = "#fdd5d3", color = "#e06666", alpha = 0.9, size = 0.4, linetype = "solid") +
      theme_bw() +
      scale_x_continuous(name = "Sequence length(bp)", breaks = seq(0, 12000, 2000), limits = c(0, 12000)) +
      theme(plot.title = element_text(size = 25, face = "bold", vjust = 0.5, hjust = 0.5),
            panel.background = element_rect(fill = "transparent", colour = NA),
            panel.grid.minor = element_blank(),
            panel.grid.major = element_blank(),
            plot.background = element_rect(fill = "transparent", colour = NA))
    ggsave("sample1.clustered.length.tiff", p, height = 4, width = 6)
    
    • 转录本去冗余(CD-hit使用 )
    cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered
    mkdir uncompare
    cp sample1.clustered.hq.fasta.gz uncompare/
    cd uncompare
    gzip -d sample1.clustered.hq.fasta.gz
    /public/jychu/soft/cd-hit-v4.8.1-2019-0228/cd-hit  -i sample1.clustered.hq.fasta -o sample1.fasta -c 0.99 -T 30 -d 40   -M 16000 -U 10 -p 1 -G 1 -s 0.99
    45013  finished      45013  clusters非冗余的高质量转录本
    
    • CDS预测(TransDecoder 软件)
    #预测转录本中长的开放阅读框, 默认是100个氨基酸,可以用-m修改
    cd  /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare
    mkdir CDS
    cp sample1.fasta CDS/
    cd CDS
    ~/soft/TransDecoder-v5.5.0/TransDecoder.LongOrfs -t sample1.fasta
    #使用DIAMOND对上一步输出的longest_orfs.pep在蛋白数据库中进行搜索,寻找同源证据支持
    1. 使用uniprot蛋白数据库-BLASTP比对
    cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare/CDS/sample1.fasta.transdecoder_dir
    ~/soft/diamond/diamond blastp -d ~/database/uniprot_sprot/uniprot_sprot -q  longest_orfs.pep --evalue 1e-5 --sensitive -p 40 --max-target-seqs 1 > blastp.outfmt6
    1.1 预测可能的编码区
    # Predict coding regions, keeping ORFs supported by the BLASTP hits.
    # Fix: the hits live in sample1.fasta.transdecoder_dir (created by
    # TransDecoder.LongOrfs -t sample1.fasta above), not sample0.*.
    ~/soft/TransDecoder-v5.5.0/TransDecoder.Predict -t sample1.fasta --retain_blastp_hits sample1.fasta.transdecoder_dir/blastp.outfmt6
    cat sample1.fasta.transdecoder.bed | cut -f1 | sort | uniq |wc -l
    37632                                 #有CDS结构的序列数
    # Count predicted CDS regions (one BED line per ORF).
    # Fix: the file produced above is sample1.fasta.transdecoder.bed, not sample0.*.
    cat sample1.fasta.transdecoder.bed | wc -l
    49907                                  #检测出CDS区
    修改文件名
    mv sample1.fasta.transdecoder.bed sample1.uniport.transdecoder.bed
    mv sample1.fasta.transdecoder.cds sample1.uniport.transdecoder.cds
    mv sample1.fasta.transdecoder.gff3 sample1.uniport.transdecoder.gff3
    mv sample1.fasta.transdecoder.pep sample1.uniport.transdecoder.pep
    把50个碱基一行的序列改为该序列的所有碱基为一行
    seqkit  seq  sample1.uniport.transdecoder.cds  -w  0  >  sample1.uniport.cds.fa
    下载 sample1.uniport.cds.fa sample1.uniport.transdecoder.gff3 到桌面目录
    ==============================================================
    1.2 使用自己提取出来的nr动物蛋白数据库
    1.2.1 blastp比对
    cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare/CDS/sample1.fasta.transdecoder_dir
    ~/soft/diamond/diamond blastp -p 40 -d ~/database/nr/nr.animal -q  longest_orfs.pep --evalue 1e-5  > nr.animal.blastp.outfmt6
    sort -k1,1 -k12,12nr nr.animal.blastp.outfmt6 |awk  '!a[$1]++' >nr.animal.blastp.outfmt6.txt
    把 nr ID转换成genesymbol  网站(https://biodbnet-abcc.ncifcrf.gov/db/db2db.php)
    cat nr.animal.blastp.outfmt6.txt |cut -f2 |sort|uniq >refid.txt
    下载到桌面在excel中打开,复制第一列到ID转换网站(用GeneBank Protein Accession转换)
    保存为genebank.txt  上传至服务器
    在nr-animal.fa中查找refid.txt对应的序列
    vi refid.sh
    # For each accession in refid.txt, append its matching FASTA header line
    # from the nr animal protein database (word-boundary match).
    while read id _; do
        grep -w "$id" ~/database/nr/nr.animal.fa >> sample1.nr.protein
    done < refid.txt
    chmod a+x refid.sh
    nohup ./refid.sh &                                            #8个小时
    cat sample1.nr.protein| sort |uniq |tr ">" "\t"|cut -f2 >sample1_nr.protein                      #提取第一个注释到的物种信息
    cat sample1_nr.protein|tr " " "\t"|awk '{$1=null;print $0}'|tr "\t" " "|sed  's/^[\t ]*//g'>nr.detil     #把id去掉,并去掉首行的空格
    sed -i "1d" genebank.txt        #去掉首行
    cat genebank.txt|cut -f1,2 >col2   #取前两列
    paste col2  nr.detil | tr "\t" ":" >sample1.nr.zhushi    #把文本转换成AAB96843:VCL:vinculin [Mus musculus] 的格式
    rm -f  col2 nr.detil  nr.animal.blastp.outfmt6  #去掉没用的文件
    cat nr.animal.blastp.outfmt6.txt | cut -f1,2 |tr "\t" " "|tr "." "\t"|cut -f1,2|tr "\t" "."|tr " " "\t"| sort -k2,2 >nr.animal.blastp.outfmt6.txt.2col   #提取nr比对结果文件的前两列,并把.去掉,然后按照第二列的字符排序
    cut -f2 nr.animal.blastp.outfmt6.txt.2col >col2      #提取按照字符排序的序列ID
    从sample1.nr.zhushi中遍历col2文件的每一行字符
    vi serach.sh
    # Look up each hit ID (first field of col2) in the annotation table.
    # Matches are appended in col2 order so the later paste stays row-aligned.
    while read id _; do
        grep -w "$id" sample1.nr.zhushi >> nr.animal.blastp.outfmt6.txt.2col.col2
    done < col2
    chmod a+x serach.sh
    ./serach.sh
    cat nr.animal.blastp.outfmt6.txt.2col|cut -f1 >nr.animal.blastp.outfmt6.txt.2col.col1      #取第一列转录本ID
    paste nr.animal.blastp.outfmt6.txt.2col.col1 nr.animal.blastp.outfmt6.txt.2col.col2 >  sample1.nr.final.zs     #合并转录本ID和其对应的nr数据库的注释信息
    1.2.3 预测可能的编码区
    cd ..
    ~/soft/TransDecoder-v5.5.0/TransDecoder.Predict  -t sample1.fasta --retain_blastp_hits sample1.fasta.transdecoder_dir/sample1.nr.final.zs 
    cat sample1.fasta.transdecoder.bed | cut -f1 | sort | uniq |wc -l
    37684                                  #有CDS结构的序列数
    cat sample1.fasta.transdecoder.bed | wc -l
    50206                                  #检测出50206个CDS区
    修改文件名
    mv sample1.fasta.transdecoder.bed sample1.nr.transdecoder.bed
    mv sample1.fasta.transdecoder.cds sample1.nr.transdecoder.cds
    mv sample1.fasta.transdecoder.gff3 sample1.nr.transdecoder.gff3
    mv sample1.fasta.transdecoder.pep sample1.nr.transdecoder.pep
    把50个碱基一行的序列改为该序列的所有碱基为一行
    seqkit  seq  sample1.nr.transdecoder.cds  -w  0  >  sample1.nr.cds.fa
    下载 sample1.nr.cds.fa sample1.nr.transdecoder.gff3 到桌面目录
    
    • SSR-简单重复序列预测(MISA)
    运行前将misa.ini与fasta文件放在一起,输入的序列存在fasta文件里面,然后运行下面的命令
    cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare
    mkdir SSR
    cp sample1.fasta SSR/
    cd SSR
    cp ~/soft/misa/misa.ini ./
    perl ~/soft/misa/misa.pl sample1.fasta
    less -S sample1.fasta.statistics
    Total number of identified SSRs:                 45532
    Number of SSRs present in compound formation:    4520   #复合卫星数目
    wc -l sample1.fasta.misa
    41013 sample1.fasta.misa                #该文件包含所有的SSR序列的信息,但是会把符合复合微卫星序列的序列信息放在一行,所以文件的行数比总微卫星数目少
    41013=45532-4520+1                #+1是因为第一行是行首标题
    下载sample1.fasta.misa  sample1.fasta.statistics到桌面
    计算SSR的密度分布(count/MB)
    首先统计sample1的序列总长度
    grep -v ">" sample1.fasta|wc -c
    153248411
    grep -v ">" sample1.fasta|wc -l
    45013
    总长度为153248411-45013=153203398
    总Mb为153203398/1000000=153.2
    ---------------------------------------------------------------------------------------------------------
    画密度分布图,新建文件ssrdensity.txt
    type    denity
    C    29.50391645
    P1    244.3146214
    P2    21.15535248
    P3    27.46083551
    P4    2.950391645
    P5    1.11618799
    P6    0.208877285
    # Bar chart of SSR density (count per Mb) per motif class, flipped bars.
    setwd("C:/Users/TE/Desktop/绍梅师姐/sample1/uncompare/SSR")
    library(ggplot2)
    ssr <- read.table(file = "ssrdensity.txt", header = TRUE, sep = "\t")
    # Keep bars in file order instead of alphabetical factor order.
    ssr$type <- factor(ssr$type, levels = ssr$type)
    CPCOLS <- c("#002fa7", "#00fa9a", "#008080", "#ba55d3", "#f0e68c", "#f08080", "#90ee90")
    # NOTE: the column really is spelled "denity" in ssrdensity.txt.
    p <- ggplot(ssr, aes(x = denity, y = type, fill = type)) +
      geom_bar(stat = "identity", width = 0.8) +
      coord_flip() +
      scale_fill_manual(values = CPCOLS) +
      theme_bw() +
      theme(axis.text = element_text(face = "bold", color = "gray50")) +
      labs(y = "Period number", x = "SSR count/bases(Mb)", title = "SSR Density", size = 20) +
      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), legend.position = "none") +
      theme(axis.text.x = element_text(size = 13, face = "bold", hjust = 0.5, vjust = 0.5)) +
      theme(plot.title = element_text(hjust = 0.5, size = 18, face = "bold")) +
      theme(axis.title.x = element_text(size = 15, color = "black", face = "bold", vjust = 0.5, hjust = 0.5)) +
      theme(axis.title.y = element_text(size = 15, color = "black", face = "bold", vjust = 0.5, hjust = 0.5)) +
      theme(axis.text.y = element_text(size = 13, face = "bold", hjust = 0.5, vjust = 0.5))
    ggsave("SSRDensity.tiff", p, width = 14, height = 10)
    
    • 转录本注释
    cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare
    mkdir zhushi
    cd zhushi
    mkdir nr
    mkdir uniport
    mkdir kog
    #用diamond BLASTX进行注释
    1. nr注释
    cd nr
    nohup  ~/soft/diamond/diamond blastx --db ~/database/nr/nr.animal -q  ../../sample1.fasta --sensitive -p 40 --evalue 1e-5 -o  sample1_nr_matches_fmt6.txt &
    cat sample1_nr_matches_fmt6.txt| cut -f1 | sort |uniq|wc -l
    37149   #比对上的转录本数目
    sort -k1,1 -k12,12nr sample1_nr_matches_fmt6.txt |awk  '!a[$1]++' >sample1.nr.fmt6   #提取每个转录本第12列的最大值的行
    rm -f sample1_nr_matches_fmt6.txt     #删除无用的文件
    把 nr ID转换成genesymbol  网站(https://biodbnet-abcc.ncifcrf.gov/db/db2db.php)
    cat sample1.nr.fmt6 |cut -f2 |sort|uniq >refid.txt
    下载到桌面在excel中打开,复制第一列到ID转换网站(分别用GeneBank Protein Accession\RefSeq Protein Accession转换)
    保存为genebank.txt  上传至服务器
    在nr-animal.fa中查找refid.txt对应的序列
    vi refid.sh
    # For each accession in refid.txt, append its matching FASTA header line
    # from the nr animal protein database (word-boundary match).
    while read id _; do
        grep -w "$id" ~/database/nr/nr.animal.fa >> sample1.nr.protein
    done < refid.txt
    chmod a+x refid.sh
    nohup ./refid.sh &                                                                                 #6小时
    cat sample1.nr.protein| sort |uniq |tr ">" "\t"|cut -f2 >sample1_nr.protein                      #提取第一个注释到的物种信息
    cat sample1_nr.protein|tr " " "\t"|awk '{$1=null;print $0}'|tr "\t" " "|sed  's/^[\t ]*//g'>nr.detil     #把id去掉,并去掉首行的空格
    sed -i "1d" genebank.txt        #去掉首行
    cat genebank.txt|cut -f1,2 >col2   #取前两列
    paste col2  nr.detil | tr "\t" ":" >sample1.nr.zhushi    #把文本转换成AAB96843:VCL:vinculin [Mus musculus] 的格式
    rm -f  col2 nr.detil   #去掉没用的文件
    cat sample1.nr.fmt6 | cut -f1,2 |tr "\t" " "|tr "." "\t"|cut -f1,2|tr "\t" "."|tr " " "\t"| sort -k2,2 >nr.animal.blastp.outfmt6.txt.2col   #提取nr比对结果文件的前两列,并把.去掉,然后按照第二列的字符排序
    cut -f2 nr.animal.blastp.outfmt6.txt.2col |tr "." "\t" |cut -f1>col2      #提取按照字符排序的序列ID
    从sample1.nr.zhushi中遍历col2文件的每一行字符
    vi serach.sh
    # Look up each hit ID (first field of col2) in the annotation table.
    # Matches are appended in col2 order so the later paste stays row-aligned.
    while read id _; do
        grep -w "$id" sample1.nr.zhushi >> nr.animal.blastp.outfmt6.txt.2col.col2
    done < col2
    chmod a+x serach.sh
    ./serach.sh
    cat nr.animal.blastp.outfmt6.txt.2col|cut -f1 >nr.animal.blastp.outfmt6.txt.2col.col1      #取第一列转录本ID
    paste nr.animal.blastp.outfmt6.txt.2col.col1 nr.animal.blastp.outfmt6.txt.2col.col2 >  sample1.nr.final.zs     #合并转录本ID和其对应的nr数据库的注释信息,并下载到桌面
    cat sample1.nr.final.zs|tr "[" "\t"|sed "s/]//g" |cut -f3|sort|uniq -c                  #查看比对到的物种的分类,复制到excel表里按数值降序
    2.Swiss-Prot
    # Fix: the annotation directory created above is "zhushi" (mkdir zhushi), not "zhuzhi".
    cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare/zhushi/uniport
    ~/soft/diamond/diamond blastx -d ~/database/uniprot_sprot/uniprot_sprot  -q  ../../sample1.fasta --sensitive -p 40 --evalue 1e-5 -o  sample1_uniprot_matches_fmt6.txt
    sort -k1,1 -k12,12nr sample1_uniprot_matches_fmt6.txt |awk  '!a[$1]++' >sample1.uniport.fmt6
    rm -f  sample1_uniprot_matches_fmt6.txt
    wc -l sample1.uniport.fmt6
    35577     #比对上的转录本数目
    3.KEGG   在线比对参考https://www.jianshu.com/p/48716fa7321b
    grep "K" query.ko.txt|cut -f1 >kegg.zhushi                   18953个注释到的转录本
    4. KOG数据库
    # Fix: the annotation directory created above is "zhushi" (mkdir zhushi), not "zhuzhi".
    cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare/zhushi/kog
    ~/soft/diamond/diamond blastx -d ~/database/KOG/kyva  -q  ../../sample1.fasta --sensitive  --evalue 1e-5 -o  sample1_kog_matches_fmt6.txt
    35203 queries aligned.    #比对上的转录本数目
    sort -k1,1 -k12,12nr sample1_kog_matches_fmt6.txt |awk  '!a[$1]++' >sample1_kog.fmt6       #下载到桌面
    rm -f  sample1_kog_matches_fmt6.txt
    cat sample1_kog.fmt6|cut -f1,2 >kog.id
    从kog.txt中遍历kog.id文件的第二列的每一行字符
    vi serachkog.sh
    # kog.id has two fields per line (transcript id, KOG hit id); look up the
    # second field in the KOG functional hierarchy table.
    while read transcript_id kog_id _; do
        grep -w "$kog_id" /public/jychu/database/KOG/kog.txt >> kog.id.group
    done < kog.id
    chmod a+x serachkog.sh
    ./serachkog.sh
    wc -l kog.id.group
    28977     #一共有28977个kogID可以注释到功能层级
    cat kog.id.group|tr "]" "\t"|cut -f1>sample1.kog.group    #提取功能描述的编号
    cat sample1.kog.group|tr "\n" ","|sed "s/,//g" >row1    #把所有字符放在一行
    wc -c row1      32764-1=32763个字符
    把每一个字符分成一行
    # Put every character of the single-line file row1 on its own line.
    # Fix: the original for-loop ran `cut -b $i` once per character, re-reading
    # the whole file each time (O(n^2)) with the length 32763 hard-coded.
    # A single fold pass is equivalent, instant, and works for any length.
    fold -w1 row1 > sample1.kog.group.1row
    cat sample1.kog.group.1row|sort|uniq -c    #统计每个字符(Group 分类)出现的次数
      1577 A
        711 B
        577 C
       1079 D
        559 E
        326 F
        706 G
        162 H
        863 I
       1111 J
       2079 K
        694 L
        175 M
        114 N
       2685 O
        493 P
        163 Q
       4991 R
       2172 S
       5311 T
       2431 U
        281 V
       1005 W
         27 X
        424 Y
       2047 Z
    --------------------------------------------------
    绘图
    # KOG functional classification bar chart of consensus transcripts.
    setwd("C:/Users/TE/Desktop/绍梅师姐/sample1/uncompare/数据库注释/kog")
    library(ggplot2)
    kog <- read.table("kog.picture.txt", header = TRUE, sep = "\t")
    CPCOLS <- c("#8DA1CB", "#FD8D62", "#66C3A5", "#db7093")
    # Keep function terms in file order rather than alphabetical factor order.
    kog$Term <- factor(kog$Term, levels = kog$Term)
    p <- ggplot(kog, aes(y = Term, x = number, fill = group)) +
      geom_bar(stat = "identity", width = 0.8) +
      coord_flip() +
      scale_fill_manual(values = CPCOLS) +
      theme_bw() +
      theme(axis.text = element_text(face = "bold", color = "gray50")) +
      labs(title = "KOG Function Classification of Consensus Sequence",
           y = "Function Class", x = "Frequency") +
      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
      theme(legend.position = "right") +
      theme(plot.title = element_text(hjust = 0.5, size = 17, face = "bold")) +
      theme(axis.title.x = element_text(size = 14, color = "black", face = "bold", vjust = 0.5, hjust = 0.5)) +
      theme(axis.title.y = element_text(size = 14, color = "black", face = "bold", vjust = 0.5, hjust = 0.5)) +
      guides(fill = guide_legend(title = NULL)) +
      theme(axis.text.x = element_text(angle = 288, size = 12, face = "bold", hjust = 0.02, vjust = 0.02))
    ggsave("KOGpicture.tiff", p, width = 12, height = 13)
    ========================================================
    进入桌面:
    cd nr
    cut -f1 sample1.nr.final.zs >nr.zhushi
    cut -f1 ../kog/sample1_kog.fmt6 >../kog/kog.zhushi
    cut -f1 ../uniport/sample1.uniport.fmt6 > ../uniport/uniport.zhushi
    cat nr.zhushi ../uniport/uniport.zhushi ../kog/kog.zhushi ../kegg/kegg.zhushi|sort|uniq|wc -l
    37215        #至少有一个数据库注释成功的转录本数目
    
    • lncRNA预测
    1.CPAT预测
    cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare
    mkdir lncRNA
    cd lncRNA
    cpat.py -x /public/jychu/soft/CPAT/goose/goose_Hexamer.tsv -d /public/jychu/soft/CPAT/goose/goose.logit.RData -g ../sample1.fasta -o sample1.goose.CAPT.out
    2. 统计nr数据库和uniport数据库没有注释到的转录本id
    grep ">" ../sample1.fasta|sed "s/>//g"|tr " " "\t"|cut -f1 >transcript.all    #提取所有的转录本id名称
    cut -f1 ../zhushi/uniport/sample1.uniport.fmt6 >uniport.trans    #提取uniport注释到的转录本id
    cat transcript.all uniport.trans|sort|uniq -u > uniport.untrans   #提取uniport未比对到的转录本id     9436个
    rm -f uniport.trans           #删除无用文件
    cut -f1 ../zhushi/nr/sample1.nr.fmt6 >nr.trans    #提取nr注释到的转录本id
    cat nr.trans transcript.all |sort|uniq -u > nr.untrans   #提取nr未比对到的转录本id    7864个
    rm -f nr.trans
    cd lncRNA
    awk  '$2>=200' sample1.goose.CAPT.out.ORF_prob.tsv |cut -f1|tr "_" "\t"|cut -f1|sort|uniq > capt.alltrans    #提取ORF长度大于200的所有转录本id
    awk '$10>=0.1 && $2>=200' sample1.goose.CAPT.out.ORF_prob.tsv|cut -f1|tr "_" "\t"|cut -f1|sort|uniq > goose.unlncRNA     #提取编码潜力大于0.1的转录本id
    cat goose.unlncRNA capt.alltrans|sort|uniq -u > capt.untrans      #提取编码潜力小于0.1,ORF长度大于200的转录本id    11281个
    rm -f  capt.alltrans  goose.unlncRNA
    下载 uniport.untrans  nr.untrans  capt.untrans 到桌面
    3.用CPC2软件预测编码潜能
    conda activate python2  #cpc2脚本只能在python2环境下使用
    python /public/jychu/soft/CPC2-beta/bin/CPC2.py -i ../sample1.fasta -o sample1.cpc2      #运行命令
    sample1.cpc2.txt             #输出文件
    grep "noncoding" sample1.cpc2.txt |cut -f1 >sample1.cpc2.lnc     #把预测为没有编码能力的转录本ID提取出来,下载到桌面
    wc -l sample1.cpc2.lnc
    12831 sample1.cpc2.lnc     #一共预测到12831个没有编码能力的转录本
    --------------------------------------------------------------------------------------------
    在https://bioinfogp.cnb.csic.es/tools/venny/index.html 网站绘制韦恩图
    把交集的序列保存为sample1.lnc.all,在服务器中新建sample1.lnc.all文件,并把序列粘贴上去(如果直接把文件上传上去,shell脚本识别不了该文件),下载到桌面上
    conda deactivate
    ---------------------------------------------------------------------------------------------
    4.构建鹅的lncRNA数据库索引,用blast进行比对
    cd /public/jychu/reference/index/hisat/goose
    makeblastdb -in lncRNA.name.fa -parse_seqids  -dbtype nucl -out lncrna.db                  #建立索引
    提取出所有的lncRNA序列
    vi lnc.all.sh
    # For each lncRNA transcript ID, extract its FASTA header plus the
    # following sequence line (-A1) from the de-duplicated transcript set.
    while read id _; do
        grep -w -A1 "$id" ../sample1.fasta >> sample1.lnc.all.fa  # download result to desktop
    done < sample1.lnc.all
    chmod a+x lnc.all.sh
    ./lnc.all.sh &
    blastn -query sample1.lnc.all.fa -out sample1.goose.blast -db /public/jychu/reference/index/hisat/goose/lncrna.db -outfmt 6 -evalue 1e-5 -num_threads 40   #lncran序列与鹅的已知lncrna数据库进行比对
    sort -k1,1 -k12,12nr sample1.goose.blast |awk  '!a[$1]++' >sample1.goose.lnc    #提取每个转录本第12列的最大值的行,下载到桌面上
    rm -f sample1.goose.blast    #删除无用的文件
    cut -f1 sample1.goose.lnc|sort|uniq|wc -l
    230   #一共有230个转录本比对到鹅的lncRNA上
    cut -f2 sample1.goose.lnc|sort|uniq|wc -l
    124  #一共比对上124个lncRNA
    seqkit  seq  sample1.goose.CAPT.out.ORF_seqs.fa  -w  0  >  sample1.goose.CAPT.ORF.fa   #把序列转换成一行
    sed -i "s/_/ /g" sample1.goose.CAPT.ORF.fa   #把行首标签替换字符,以便下面脚本中序列名称的匹配
    vi lnc.all.orf.sh                           #提取lncRNA预测的ORF序列
    # For each lncRNA transcript ID, extract its predicted ORF header plus the
    # following sequence line (-A1); -i because CPAT upper-cases sequence IDs.
    while read id _; do
        grep -i -w -A1 "$id" sample1.goose.CAPT.ORF.fa >> sample1.lncrna.ORF.fa  # download result to desktop
    done < sample1.lnc.all
    chmod a+x lnc.all.orf.sh
    ./lnc.all.orf.sh
    grep ">" sample1.lnc.all.fa|tr "=" "\t"|cut -f3 >sample1.lnc.all.length  #获取序列长度信息,并下载到桌面上
    画序列长度分布图
    # Histogram of predicted lncRNA sequence lengths (200-bp bins).
    setwd("C:/Users/TE/Desktop/绍梅师姐/sample1/uncompare/lncRNA")
    library(ggplot2)
    lnc_len <- read.table(file = "sample1.lnc.all.length")
    lnc_len$V1 <- as.integer(lnc_len$V1)
    p <- ggplot(lnc_len, aes(x = V1)) +
      geom_histogram(binwidth = 200, fill = "#6498C8", color = "#e9ecef", alpha = 0.9) +
      theme_bw() +
      scale_x_continuous(name = "Read Length", limits = c(0, 15000)) +
      scale_y_continuous(name = "count")
    ggsave("sample1.lnc.all.length.tiff", p, height = 4, width = 8)
    

    相关文章

      网友评论

        本文标题:全长转录组分析

        本文链接:https://www.haomeiwen.com/subject/eaersltx.html