day1、linux基础命令
man #查看帮助文档
ls #查看目录文件
cd #进某个目录
pwd #查看当前路径
ln #链接ln -s [目标文件] [指向的文件]
./ ../ -/ #路径
cp #复制文件
rm #删除文件
mkdir #穿件文件
touch #创建文件
wget #下载文件
curl #抓取文件
tar gunzip zip #解压缩
cat head tail less more vim nano emacs #查看与编辑文件
grep alias echo #
$PATH ./bashrc
wc sort cut uniq
输入输出 > < |
交集、并集
day2、基因组大小、杂合度等评估
1、数据和软件
数据:
ZHSY2/data/NGS/NHD2206_L1_1.fq
ZHSY2/data/NGS/NHD2206_L1_2.fq
软件:
jellyfish
kermgeneie
2、软件 jellyfish、kermgeneie用kmer法评估基因组大小
jellyfish count -m 19 -s 2G -t 16 - o mer_counts NHD2206_L1_1.fq NHD2206_L1_2.fq
jellyfish histo -o mer_counts.histo -t 30 mer_counts
R
> dataframe <- read.table("mer_counts.histo")
> plot(dataframe[1:200,],type="l")
> sum(as.numeric(dataframe[2:5135,1]*dataframe[2:5135,2]))/10
[1] 100110524
此基因组大小约为100M
kermgeneie
ls -1 NHD2206_L1_1.fq NHD2206_L1_2.fq > reads.list
kmergenie reads.list --diploid -k 121 -l 19 -s 6 -o 19mer_count
3、GenomeScope评估基因组大小和杂合度
Rscript genomescope.R 19mer_count.histo 19 150 out
day3、三代基因组组装
1 、数据和软件
数据:
data/TGS/Guy11_m160523.fastq.gz
data/TGS/Guy11_m160527.fastq.gz
软件:nextDenovo NextPolish
2、使用nextdenovo组装
ls Guy11_m160523.fastq.gz Guy11_m160527.fastq.gz > input.fofn
cp NextDenovo/doc/run.cfg ./
nextDenovo run.cfg
得到genome.fasta等数据
3、使用nextpolish组装
ls Guy11_m160523.fastq.gz Guy11_m160527.fastq.gz > sgs.fofn
genome=genome.fasta #nextdenovo组装的基因组作为输入
nextPolish run2.cfg
得到genome.nextpolish.fasta
day4、基因组评价
1、软件和数据
数据:
genome.fasta
genome.nextpolish.fasta
Schizosaccharomyces_pombe.ASM294v2.pep.all.fa
软件:busco
2、评估
busco -f -i genome.fasta -o genome -l ./funge_odb10 -m geno
busco -f -i genome.nextpolish.fasta -o genome_nextpolish -l ./funge_odb10 -m geno
busco -f -i Schizosaccharomyces_pombe.ASM294v2.pep.all.fa -o Schizosaccharomyces -l ./funge_odb10 -m prot
cp geno*/short* busco_summary/
cp Schi*/short* busco_summary/
generate_plot.py -wd busco_summary/
day5、基因组注释
1、数据和软件
数据:genome.nextpolish.fasta
软件:Augustus
2、注释
Augustus --protein=on --codingseq=on --species=magnaporthe_grisea genome.nextpolish.fasta > gene.gff3
得到文件 gene.gff3
day6、gitee、markdown与git的使用
1、gitee
2、markdown
# 一级标题
## 二级标题
##### 五级标题
- 列表第一项
- 列表第二项
1. 有序列表第一项
2. 有序列表第二项
[标题](链接地址)
![图片描述](图片链接地址)
*斜体*
**粗体**
> 引用段落
```
代码块
```
3、git
1、本地操作
#0、创建一个文件夹,作为工作目录
mkdir wd && cd wd
#1、初始化
git init wd
#2、添加、删除一个文件或修改一个文件
touch 1.hello
#3、将修改提交到缓存目录
git add 1.hello
#4、将修改提交到本地仓库
git commit add -m "修改说明"
2、git本地配置
git config --global user.name ["用户名"]
git config --global user.email ["用户邮箱"]
生成本地shh-key
ssh-keygen -t rsa -C "youremail@example.com"
服务器上添加shh key
3、远程操作
#1、下载远程仓库
git clone [远程仓库地址]
#将本地修改提交到远程仓库
git push [远程仓库/远程仓库别名]
#从远程仓库获取更新
git fetch [远程仓库/远程仓库别名]
git merge [远程仓库/远程仓库别名]
git pull [远程仓库/远程仓库别名]#fetch+merge,自动处理冲突,不建议这么做
day7、利用miR-PREFeR预测miRNA
1、fastqc
fastqc -o fastqc_out/ rawdata/SRR5448177.fastq.gz
2、去除接头
1、查找接头,并把匹配率
这里做了两个文件的
gzip -d -c SRR12420850.fastq.gz | find_3p_adapter.pl -m ugacagaagagagugagcac
#Sequence Frequency
#AGATCGGAAGAGCACACGTCTGAACTCCAGT 419 77.306 %
#TAGATCGGAAGAGCACACGTCTGAACTCCAG 31 5.720 %
#TTTAGATCGGAAGAGCACACGTCTGAACTCC 23 4.244 %
#AAGATCGGAAGAGCACACGTCTGAACTCCAG 18 3.321 %
gzip -d -c SRR12420851.fastq.gz | find_3p_adapter.pl -m ugacagaagagagugagcac
#Sequence Frequency
#AGATCGGAAGAGCACACGTCTGAACTCCAGT 306 74.272 %
#TAGATCGGAAGAGCACACGTCTGAACTCCAG 28 6.796 %
#CAGATCGGAAGAGCACACGTCTGAACTCCAG 17 4.126 %
#AAGATCGGAAGAGCACACGTCTGAACTCCAG 15 3.641 %
2、去除接头
cutadapt -a "AGATCGGAAG" -m 18 -M 35 --discard-untrimmed -o SRR12420850_trimmed.fastq.gz SRR12420850.fastq.gz
#=== Summary ===
#
#Total reads processed: 20,656,394
#Reads with adapters: 20,614,994 (99.8%)
#Reads that were too short: 3,602,574 (17.4%)
#Reads that were too long: 151,642 (0.7%)
#Reads written (passing filters): 16,902,178 (81.8%)
cutadapt -a "AGATCGGAAG" -m 18 -M 35 --discard-untrimmed -o SRR12420851_trimmed.fastq.gz SRR12420851.fastq.gz
#=== Summary ===
#
#Total reads processed: 21,441,218
#Reads with adapters: 21,401,577 (99.8%)
#Reads that were too short: 4,827,806 (22.5%)
#Reads that were too long: 97,414 (0.5%)
#Reads written (passing filters): 16,515,998 (77.0%)
3、检查长度分布
zcat SRR12420850_trimmed.fastq.gz |awk '{if(NR%4==2) print length($1)}' |sort|uniq -c
zcat SRR12420851_trimmed.fastq.gz |awk '{if(NR%4==2) print length($1)}' |sort|uniq -c
4、
1、建库
# bowtie-build data/rice_genome/IRGSP-1.0_genome.fa data/rice_genome/IRGSP-1.0_genome.fa
student@bioinfo-PowerEdge-R910:~/ZHSY2/lcp22/day7$ ls -x1 data/rice_genome/IRGSP-1.0_genome.*
data/rice_genome/IRGSP-1.0_genome.1.ebwt
data/rice_genome/IRGSP-1.0_genome.2.ebwt
data/rice_genome/IRGSP-1.0_genome.3.ebwt
data/rice_genome/IRGSP-1.0_genome.4.ebwt
data/rice_genome/IRGSP-1.0_genome.fa
data/rice_genome/IRGSP-1.0_genome.fa.fai
data/rice_genome/IRGSP-1.0_genome.fasta.gz
data/rice_genome/IRGSP-1.0_genome.rev.1.ebwt
data/rice_genome/IRGSP-1.0_genome.rev.2.ebwt
2、比对到库上
ShortStack --bowtie_cores 10 --readfile SRR12420850_trimmed.fastq.gz SRR12420851_trimmed.fastq.gz --genomefile data/rice_genome/IRGSP-1.0_genome.fa
grep -e "N15" -e "Y" Results.txt | cut -f2,9 |sed -e 's/^/>/' -e 's/\t/\n/' >miRNA.fa
day8、
1、这步不做
#java -Xmx100g –Xms50g -jar ~/script/srna-workbenchV4.4.Alpha.D/Workbench.jar -tool mircat2 -config efault_mircat2.json -f
#gzip –d SRR3955481_trimmed.fastq.gz
#fastq2fasta.pl –a SRR3955481_trimmed.fastq
#perl ~/script/mRNA_rm_white.pl FAN_r1.1.cds.fa FAN_r1.1.cds.v2.fa
#CleaveLand4.pl –t –e SRR3955481_trimmed.fa -u sRNA.fa -n transcriptome.fa –p 0.05 –o ./tplot
#samtools view -Sb SRR671953_trimmed.fa.processed.sam |samtools sort - SRR671953_sort
#samtools view -h SRR671953_sort.bam "Chr4" >SRR671953_chr4.sam
#python miR_PREFeR.py -L -k pipeline config.txt
#psRNAtarget(http://plantgrn.noble.org/psRNATarget/)
2、使用miR_PREFeR进行预测
##加载samtools 0.1.9, bwotie, python2.7
##注意版本号
##激活conda环境
conda activate miR_PREFeR
## change fastq to fasta ,将fastq序列转化为fasta,即fastq的1,3两行
gunzip -c SRR12420850_trimmed.fastq.gz | awk '{if(NR%4==1) {printf(">%s\n",substr($0,2));} else if(NR%4==2) print;}' > SRR12420850_trimmed.fa
# 重复read合并,并重命名reads
echo "rice50" > sample.names.txt
#只将fasta中的序列提出来,不包括>开头部分
cat SRR12420850_trimmed.fa|grep -v ">" > SRR12420850_trimmed_seq.fa
python scripts/process-reads-fasta.py sample.names.txt SRR12420850_trimmed_seq.fa
# 用bowite 定位
#python bowtie-align-reads.py -f -i [基因组文件] -p 1
python ./miR-PREFeR/scripts/bowtie-align-reads.py -f -i SRR12420850_trimmed.fa.processed
data/rice_genome/IRGSP-1.0_genome.fa -p 1 SRR12420850_trimmed.fa.processed
# 跑miR_PREFeR全部流程
#将config.example中的相关参数进行修改,包括输入输出数据,线程数等参数
python miR_PREFeR.py -L -k pipeline ./config.example
#如果程序跑太久,就只跑一条染色体
#将sam转化为bam,然后对bam文件进行排序,然后对排序后的文件进行index
samtools view -Sb SRR671953_trimmed.fa.processed.sam |samtools sort > SRR12420850_sort
samtools index SRR12420850_sort.bam
samtools view -h SRR12420850_sort.bam "chr04" > SRR12420850_chr4.sam
#对config.example进行参数修改,然后保存为config.txt,进行预测
python miR_PREFeR.py -L -k pipeline config.txt
3、在psRNATarget
http://plantgrn.noble.org/psRNATarget/
上传软件软件预测的结果到miR-PREFeR-result2/ZHSY2-test_miRNA.precursor.fa这个文件预测miRNA
day9、绘制网络图
下载数据,并用前1,2两列做出网络图,这里使用R软件进行绘图
library(threejs)
library(tidyr)
setwd("/home/uu/Desktop/")
links <- read.table("psRNATargetJob-1598685721663462.txt",stringsAsFactors = F)
gma_miRNA <- sort(links[,1])%>%unique()
rice_miRNA <- sort(links[,2])%>%unique()
notes <-c(gma_miRNA,rice_miRNA)
notes_color <- c(rep("red",length(gma_miRNA)),rep("blue",length(rice_miRNA)))
net <- graph_from_data_frame(d = links,directed = F,vertices = notes)
net %>%
set_vertex_attr("color", value =notes_color) %>%
set_vertex_attr("label", value=notes) %>%
set_vertex_attr("size",value = 1) %>%
#set_vertex_attr("frame.color",value="red")%>%
set_vertex_attr("label.family",value="serif") %>%#点标签字体
set_vertex_attr("label.dist",value=1) %>%#标签距离顶点的距离
set_vertex_attr("label.degree",value=0) %>%#顶点标签的位置
#set_vertex_attr("label.color",value="blue") %>% #边标签颜色
set_vertex_attr("label.cex",value="10") %>%#标签字体大小
set_vertex_attr("label.font",value="3") %>%
set_vertex_attr("label.x",value=NA) %>%
set_vertex_attr("label.y",value=NA) %>%
set_vertex_attr("main",value=FALSE) %>%
set_vertex_attr("xlab",value=FALSE) %>%
set_vertex_attr("ylab",value=FALSE) %>%
#set_vertex_attr("size2",value = NA) %>%
#set_vertex_attr("size2",value = NA) %>% #第二个性状点的大小
#set_edge_attr("color",value="green") %>%#设置边的颜色
#layout_nicely(dim=3) %>%
#layout_with_kk(dim=3) %>%
graphjs(bg="black")
点击查看3D图
day10
1、做基因组圆形图
##https://github.com/jokergoo/circlize/issues/50
#setwd("D:/programs/github_biobai/SARS-CoV-2_Bioinformatics/manuscript/plot/circlize/rice_circlize")
##添加水稻基因组信息
txdb <- makeTxDbFromGFF("./transcripts_exon.gtf",circ_seqs=character())
annoData <- genes(txdb)
annoData1 <- as.data.frame(annoData)
pdf(file="version1.pdf")
#circos.par(start.degree = 90)
#外圈基因组
circos.genomicInitialize(annoData1)
#已经注释基因
GENE <- read.csv("./transcripts_exon.bed", header = FALSE, sep = "\t")
GENE <- GENE[,c(1:3)]
colnames(GENE)=c("seqnames","start","end")
circos.genomicTrack(GENE ,ylim = c(0, 1),
panel.fun = function(region, value,...){
circos.genomicRect(region, value, col = "green3",border = NA, ...)
},track.height = 0.1, bg.col= NA)
miRNA <- read.csv("./miRNA.bed", header = FALSE, sep = "\t")
miRNA <- miRNA[,c(1:3)]
colnames(miRNA)=c("seqnames","start","end")
circos.genomicTrack(miRNA,ylim = c(0, 1),
panel.fun = function(region, value,...){
circos.genomicRect(region, value, col = "lightsalmon", border = NA, ...)
},track.height = 0.1, bg.col = NA ) #"#AE8933B3"
circos.clear()
dev.off()
image.png
网友评论