笔记

作者: 纵春水东流 | 来源:发表于2020-08-26 15:02 被阅读0次

    day1、linux基础命令


    man #查看帮助文档
    ls #查看目录文件
    cd #进某个目录
    pwd #查看当前路径
    ln #链接ln -s [目标文件] [指向的文件]
    ./ ../ -/ #路径 
    cp #复制文件
    rm #删除文件
    mkdir #穿件文件
    touch #创建文件
    wget #下载文件
    curl #抓取文件
    tar gunzip zip  #解压缩
    cat head tail less more vim nano emacs #查看与编辑文件
    grep alias echo  #
    $PATH ./bashrc
    wc sort cut uniq 
    输入输出 > < | 
    交集、并集
    

    day2、基因组大小、杂合度等评估


    1、数据和软件

    数据:
      ZHSY2/data/NGS/NHD2206_L1_1.fq
      ZHSY2/data/NGS/NHD2206_L1_2.fq
    软件:
      jellyfish
      kermgeneie
    

    2、软件 jellyfish、kermgeneie用kmer法评估基因组大小

    jellyfish count -m 19 -s 2G -t 16 - o mer_counts NHD2206_L1_1.fq NHD2206_L1_2.fq
    jellyfish histo -o mer_counts.histo -t 30 mer_counts
    R
    > dataframe <- read.table("mer_counts.histo")
    > plot(dataframe[1:200,],type="l")
    > sum(as.numeric(dataframe[2:5135,1]*dataframe[2:5135,2]))/10
    [1] 100110524
    此基因组大小约为100M
    
    kermgeneie
    ls -1 NHD2206_L1_1.fq NHD2206_L1_2.fq > reads.list
    kmergenie reads.list --diploid -k 121 -l 19 -s 6 -o 19mer_count
    

    3、GenomeScope评估基因组大小和杂合度

    Rscript  genomescope.R 19mer_count.histo 19 150 out
    

    day3、三代基因组组装


    1 、数据和软件

    数据:
    data/TGS/Guy11_m160523.fastq.gz
    data/TGS/Guy11_m160527.fastq.gz
    软件:nextDenovo NextPolish
    

    2、使用nextdenovo组装

    ls Guy11_m160523.fastq.gz Guy11_m160527.fastq.gz > input.fofn
    cp NextDenovo/doc/run.cfg ./
    nextDenovo run.cfg
    

    得到genome.fasta等数据
    3、使用nextpolish组装

    ls Guy11_m160523.fastq.gz Guy11_m160527.fastq.gz > sgs.fofn
    genome=genome.fasta #nextdenovo组装的基因组作为输入
    nextPolish run2.cfg
    

    得到genome.nextpolish.fasta

    day4、基因组评价


    1、软件和数据

    数据:
    genome.fasta
    genome.nextpolish.fasta
    Schizosaccharomyces_pombe.ASM294v2.pep.all.fa
    软件:busco
    

    2、评估

    busco -f -i genome.fasta -o genome -l ./funge_odb10 -m geno
    busco -f -i genome.nextpolish.fasta -o genome_nextpolish -l ./funge_odb10 -m geno
    busco -f -i Schizosaccharomyces_pombe.ASM294v2.pep.all.fa -o Schizosaccharomyces -l ./funge_odb10 -m prot
    cp  geno*/short* busco_summary/
    cp  Schi*/short* busco_summary/
    generate_plot.py -wd busco_summary/
    

    day5、基因组注释


    1、数据和软件

    数据:genome.nextpolish.fasta
    软件:Augustus
    

    2、注释

    Augustus --protein=on --codingseq=on --species=magnaporthe_grisea genome.nextpolish.fasta > gene.gff3
    

    得到文件 gene.gff3

    day6、gitee、markdown与git的使用


    1、gitee

    2、markdown

        # 一级标题
        ## 二级标题
        ##### 五级标题
    
        - 列表第一项
        - 列表第二项
    
        1. 有序列表第一项
        2. 有序列表第二项
    
        [标题](链接地址)
        ![图片描述](图片链接地址)
    
        *斜体*
        **粗体**
        > 引用段落
        ```
        代码块
        ```
    

    3、git
    1、本地操作

    #0、创建一个文件夹,作为工作目录
    mkdir wd && cd wd
    #1、初始化
    git init wd
    #2、添加、删除一个文件或修改一个文件
    touch 1.hello
    #3、将修改提交到缓存目录
    git add 1.hello
    #4、将修改提交到本地仓库
    git commit add -m "修改说明"
    

    2、git本地配置

    git config --global user.name ["用户名"]
    git config --global user.email ["用户邮箱"]
    生成本地shh-key
    ssh-keygen -t rsa -C "youremail@example.com"
    服务器上添加shh key
    

    3、远程操作

    #1、下载远程仓库
    git clone [远程仓库地址]
    #将本地修改提交到远程仓库
    git push [远程仓库/远程仓库别名]
    #从远程仓库获取更新
    git fetch  [远程仓库/远程仓库别名]
    git merge  [远程仓库/远程仓库别名]
    
    git pull [远程仓库/远程仓库别名]#fetch+merge,自动处理冲突,不建议这么做
    

    day7、利用miR-PREFeR预测miRNA

    1、fastqc

    fastqc -o fastqc_out/ rawdata/SRR5448177.fastq.gz
    

    2、去除接头
    1、查找接头,并把匹配率
    这里做了两个文件的

    gzip -d -c SRR12420850.fastq.gz | find_3p_adapter.pl -m ugacagaagagagugagcac 
    #Sequence   Frequency
    #AGATCGGAAGAGCACACGTCTGAACTCCAGT    419 77.306 %
    #TAGATCGGAAGAGCACACGTCTGAACTCCAG    31  5.720 %
    #TTTAGATCGGAAGAGCACACGTCTGAACTCC    23  4.244 %
    #AAGATCGGAAGAGCACACGTCTGAACTCCAG    18  3.321 %
    
    gzip -d -c SRR12420851.fastq.gz | find_3p_adapter.pl -m ugacagaagagagugagcac
    #Sequence   Frequency
    #AGATCGGAAGAGCACACGTCTGAACTCCAGT    306 74.272 %
    #TAGATCGGAAGAGCACACGTCTGAACTCCAG    28  6.796 %
    #CAGATCGGAAGAGCACACGTCTGAACTCCAG    17  4.126 %
    #AAGATCGGAAGAGCACACGTCTGAACTCCAG    15  3.641 %
    
    

    2、去除接头

     cutadapt -a "AGATCGGAAG" -m 18 -M 35 --discard-untrimmed -o SRR12420850_trimmed.fastq.gz SRR12420850.fastq.gz
    #=== Summary ===
    #
    #Total reads processed:              20,656,394
    #Reads with adapters:                20,614,994 (99.8%)
    #Reads that were too short:           3,602,574 (17.4%)
    #Reads that were too long:              151,642 (0.7%)
    #Reads written (passing filters):    16,902,178 (81.8%)
    
     cutadapt -a "AGATCGGAAG" -m 18 -M 35 --discard-untrimmed -o SRR12420851_trimmed.fastq.gz SRR12420851.fastq.gz
    #=== Summary ===
    #
    #Total reads processed:              21,441,218
    #Reads with adapters:                21,401,577 (99.8%)
    #Reads that were too short:           4,827,806 (22.5%)
    #Reads that were too long:               97,414 (0.5%)
    #Reads written (passing filters):    16,515,998 (77.0%)
    
    

    3、检查长度分布

    zcat SRR12420850_trimmed.fastq.gz  |awk '{if(NR%4==2) print length($1)}' |sort|uniq -c 
    zcat SRR12420851_trimmed.fastq.gz  |awk '{if(NR%4==2) print length($1)}' |sort|uniq -c
    

    4、
    1、建库

    # bowtie-build    data/rice_genome/IRGSP-1.0_genome.fa  data/rice_genome/IRGSP-1.0_genome.fa
    student@bioinfo-PowerEdge-R910:~/ZHSY2/lcp22/day7$ ls -x1 data/rice_genome/IRGSP-1.0_genome.*
    data/rice_genome/IRGSP-1.0_genome.1.ebwt
    data/rice_genome/IRGSP-1.0_genome.2.ebwt
    data/rice_genome/IRGSP-1.0_genome.3.ebwt
    data/rice_genome/IRGSP-1.0_genome.4.ebwt
    data/rice_genome/IRGSP-1.0_genome.fa
    data/rice_genome/IRGSP-1.0_genome.fa.fai
    data/rice_genome/IRGSP-1.0_genome.fasta.gz
    data/rice_genome/IRGSP-1.0_genome.rev.1.ebwt
    data/rice_genome/IRGSP-1.0_genome.rev.2.ebwt
    

    2、比对到库上

    ShortStack --bowtie_cores 10 --readfile SRR12420850_trimmed.fastq.gz SRR12420851_trimmed.fastq.gz --genomefile data/rice_genome/IRGSP-1.0_genome.fa
    grep -e "N15" -e "Y" Results.txt | cut -f2,9 |sed -e 's/^/>/' -e 's/\t/\n/' >miRNA.fa
    

    day8、


    1、这步不做

    #java -Xmx100g –Xms50g -jar ~/script/srna-workbenchV4.4.Alpha.D/Workbench.jar -tool mircat2 -config efault_mircat2.json -f
    #gzip –d SRR3955481_trimmed.fastq.gz
    #fastq2fasta.pl –a SRR3955481_trimmed.fastq
    #perl ~/script/mRNA_rm_white.pl    FAN_r1.1.cds.fa        FAN_r1.1.cds.v2.fa
    #CleaveLand4.pl –t –e  SRR3955481_trimmed.fa  -u sRNA.fa   -n transcriptome.fa    –p 0.05 –o ./tplot
    #samtools view -Sb SRR671953_trimmed.fa.processed.sam |samtools sort - SRR671953_sort
    #samtools view -h SRR671953_sort.bam "Chr4" >SRR671953_chr4.sam
    #python miR_PREFeR.py  -L -k pipeline config.txt 
    #psRNAtarget(http://plantgrn.noble.org/psRNATarget/)
    

    2、使用miR_PREFeR进行预测

    ##加载samtools 0.1.9, bwotie, python2.7
    ##注意版本号
    ##激活conda环境
    conda activate miR_PREFeR
    ## change fastq to fasta ,将fastq序列转化为fasta,即fastq的1,3两行
    gunzip -c SRR12420850_trimmed.fastq.gz | awk '{if(NR%4==1) {printf(">%s\n",substr($0,2));} else if(NR%4==2) print;}' > SRR12420850_trimmed.fa
    
    # 重复read合并,并重命名reads
    echo "rice50" >  sample.names.txt
    #只将fasta中的序列提出来,不包括>开头部分
    cat SRR12420850_trimmed.fa|grep -v ">" > SRR12420850_trimmed_seq.fa
    python scripts/process-reads-fasta.py sample.names.txt   SRR12420850_trimmed_seq.fa
    
    # 用bowite 定位
    #python bowtie-align-reads.py -f -i [基因组文件] -p 1 
    python ./miR-PREFeR/scripts/bowtie-align-reads.py -f -i SRR12420850_trimmed.fa.processed
    data/rice_genome/IRGSP-1.0_genome.fa -p 1  SRR12420850_trimmed.fa.processed
    # 跑miR_PREFeR全部流程
    #将config.example中的相关参数进行修改,包括输入输出数据,线程数等参数
    python miR_PREFeR.py  -L -k pipeline ./config.example 
    
    #如果程序跑太久,就只跑一条染色体
    #将sam转化为bam,然后对bam文件进行排序,然后对排序后的文件进行index
    samtools view -Sb SRR671953_trimmed.fa.processed.sam |samtools sort > SRR12420850_sort
    samtools index  SRR12420850_sort.bam
    samtools view -h  SRR12420850_sort.bam "chr04" >  SRR12420850_chr4.sam
    
    #对config.example进行参数修改,然后保存为config.txt,进行预测
    python miR_PREFeR.py  -L -k pipeline config.txt 
    

    3、在psRNATarget
    http://plantgrn.noble.org/psRNATarget/
    上传软件软件预测的结果到miR-PREFeR-result2/ZHSY2-test_miRNA.precursor.fa这个文件预测miRNA

    image.png

    day9、绘制网络图


    下载数据,并用前1,2两列做出网络图,这里使用R软件进行绘图

    library(threejs)
    library(tidyr)
    setwd("/home/uu/Desktop/")
    links <- read.table("psRNATargetJob-1598685721663462.txt",stringsAsFactors = F)
    gma_miRNA <- sort(links[,1])%>%unique()
    rice_miRNA <- sort(links[,2])%>%unique()
    notes <-c(gma_miRNA,rice_miRNA)
    notes_color <- c(rep("red",length(gma_miRNA)),rep("blue",length(rice_miRNA)))
    net <- graph_from_data_frame(d = links,directed = F,vertices = notes)
    net %>%
      set_vertex_attr("color", value =notes_color) %>% 
      set_vertex_attr("label", value=notes) %>% 
      set_vertex_attr("size",value = 1) %>% 
      #set_vertex_attr("frame.color",value="red")%>% 
      set_vertex_attr("label.family",value="serif") %>%#点标签字体
      set_vertex_attr("label.dist",value=1) %>%#标签距离顶点的距离
      set_vertex_attr("label.degree",value=0) %>%#顶点标签的位置
      #set_vertex_attr("label.color",value="blue") %>% #边标签颜色
      set_vertex_attr("label.cex",value="10") %>%#标签字体大小
      set_vertex_attr("label.font",value="3") %>%
      set_vertex_attr("label.x",value=NA) %>%
      set_vertex_attr("label.y",value=NA) %>%
      set_vertex_attr("main",value=FALSE) %>%
      set_vertex_attr("xlab",value=FALSE) %>%
      set_vertex_attr("ylab",value=FALSE) %>% 
      #set_vertex_attr("size2",value = NA) %>%
      #set_vertex_attr("size2",value = NA) %>% #第二个性状点的大小
      #set_edge_attr("color",value="green") %>%#设置边的颜色
      #layout_nicely(dim=3) %>%
      #layout_with_kk(dim=3) %>%
      graphjs(bg="black")
    

    点击查看3D图

    day10


    1、做基因组圆形图

    ##https://github.com/jokergoo/circlize/issues/50
    
    #setwd("D:/programs/github_biobai/SARS-CoV-2_Bioinformatics/manuscript/plot/circlize/rice_circlize")
    ##添加水稻基因组信息
    txdb <- makeTxDbFromGFF("./transcripts_exon.gtf",circ_seqs=character())
    annoData <- genes(txdb)
    annoData1 <- as.data.frame(annoData)
    
    pdf(file="version1.pdf")
    #circos.par(start.degree = 90)
    #外圈基因组
    circos.genomicInitialize(annoData1)
    
    
    #已经注释基因
    GENE <- read.csv("./transcripts_exon.bed", header = FALSE, sep = "\t")
    GENE <- GENE[,c(1:3)]
    colnames(GENE)=c("seqnames","start","end")
    
    circos.genomicTrack(GENE ,ylim = c(0, 1),
                        panel.fun = function(region, value,...){
                          circos.genomicRect(region, value, col = "green3",border = NA, ...)
                        },track.height = 0.1, bg.col= NA)
    
    miRNA <- read.csv("./miRNA.bed", header = FALSE, sep = "\t")
    miRNA <- miRNA[,c(1:3)]
    colnames(miRNA)=c("seqnames","start","end")
    
    circos.genomicTrack(miRNA,ylim = c(0, 1),
                        panel.fun = function(region, value,...){
                          circos.genomicRect(region, value, col = "lightsalmon", border = NA, ...)
                        },track.height = 0.1, bg.col = NA ) #"#AE8933B3"
    
    circos.clear()
    dev.off()
    
    
    
    image.png

    相关文章

      网友评论

          本文标题:笔记

          本文链接:https://www.haomeiwen.com/subject/pgexsktx.html