hic分析之分步工作原理

作者: Ray钱 | 来源:发表于2019-08-21 12:38 被阅读3次

    参考之前的文章 hic分析之 hic-pro 介绍与安装

    分步工作原理:

    先看test文件

    /PATH/TO/HiC-Pro_2.11.1/test-op/test_data
    .
    |-- dixon_2M
    |   |-- SRR400264_00_R1.fastq.gz
    |   `-- SRR400264_00_R2.fastq.gz
    `-- dixon_2M_2
        |-- SRR400264_01_R1.fastq.gz
        `-- SRR400264_01_R2.fastq.gz
    

    再看看opt文件

    tree
    .
    |-- bowtie_results
    |   |-- bwt2
    |   |   |-- dixon_2M
    |   |   |   |-- SRR400264_00_R1_hg19.bwt2merged.bam
    |   |   |   |-- SRR400264_00_R1_hg19.mapstat
    |   |   |   |-- SRR400264_00_R2_hg19.bwt2merged.bam
    |   |   |   |-- SRR400264_00_R2_hg19.mapstat
    |   |   |   |-- SRR400264_00_hg19.bwt2pairs.bam
    |   |   |   `-- SRR400264_00_hg19.bwt2pairs.pairstat
    |   |   `-- dixon_2M_2
    |   |       |-- SRR400264_01_R1_hg19.bwt2merged.bam
    |   |       |-- SRR400264_01_R1_hg19.mapstat
    |   |       |-- SRR400264_01_R2_hg19.bwt2merged.bam
    |   |       |-- SRR400264_01_R2_hg19.mapstat
    |   |       |-- SRR400264_01_hg19.bwt2pairs.bam
    |   |       |-- SRR400264_01_hg19.bwt2pairs.pairstat
    |   |-- bwt2_global
    |   |   |-- dixon_2M
    |   |   |   |-- SRR400264_00_R1_hg19.bwt2glob.bam
    |   |   |   |-- SRR400264_00_R1_hg19.bwt2glob.unmap.fastq
    |   |   |   |-- SRR400264_00_R2_hg19.bwt2glob.bam
    |   |   |   |-- SRR400264_00_R2_hg19.bwt2glob.unmap.fastq
    |   |   |-- dixon_2M_2
    |   |       |-- SRR400264_01_R1_hg19.bwt2glob.bam
    |   |       |-- SRR400264_01_R1_hg19.bwt2glob.unmap.fastq
    |   |       |-- SRR400264_01_R2_hg19.bwt2glob.bam
    |   |       |-- SRR400264_01_R2_hg19.bwt2glob.unmap.fastq
    |   |-- bwt2_local
    |       |-- dixon_2M
    |       |   |-- SRR400264_00_R1_hg19.bwt2glob.unmap_bwt2loc.bam
    |       |   |-- SRR400264_00_R1_hg19.bwt2glob.unmap_trimmed.fastq
    |       |   |-- SRR400264_00_R2_hg19.bwt2glob.unmap_bwt2loc.bam
    |       |   |-- SRR400264_00_R2_hg19.bwt2glob.unmap_trimmed.fastq
    |       |-- dixon_2M_2
    |           |-- SRR400264_01_R1_hg19.bwt2glob.unmap_bwt2loc.bam
    |           |-- SRR400264_01_R1_hg19.bwt2glob.unmap_trimmed.fastq
    |           |-- SRR400264_01_R2_hg19.bwt2glob.unmap_bwt2loc.bam
    |           |-- SRR400264_01_R2_hg19.bwt2glob.unmap_trimmed.fastq
    |-- config_test_latest.txt
    |-- hic_results
    |   |-- data
    |   |   |-- dixon_2M
    |   |   |   |-- SRR400264_00_hg19.bwt2pairs.DEPairs
    |   |   |   |-- SRR400264_00_hg19.bwt2pairs.DumpPairs
    |   |   |   |-- SRR400264_00_hg19.bwt2pairs.FiltPairs
    |   |   |   |-- SRR400264_00_hg19.bwt2pairs.REPairs
    |   |   |   |-- SRR400264_00_hg19.bwt2pairs.RSstat
    |   |   |   |-- SRR400264_00_hg19.bwt2pairs.SCPairs
    |   |   |   |-- SRR400264_00_hg19.bwt2pairs.SinglePairs
    |   |   |   |-- SRR400264_00_hg19.bwt2pairs.validPairs
    |   |   |   |-- SRR400264_00_hg19.bwt2pairs_interaction.bam
    |   |   |   `-- dixon_2M.allValidPairs
    |   |   `-- dixon_2M_2
    |   |       |-- SRR400264_01_hg19.bwt2pairs.DEPairs
    |   |       |-- SRR400264_01_hg19.bwt2pairs.DumpPairs
    |   |       |-- SRR400264_01_hg19.bwt2pairs.FiltPairs
    |   |       |-- SRR400264_01_hg19.bwt2pairs.REPairs
    |   |       |-- SRR400264_01_hg19.bwt2pairs.RSstat
    |   |       |-- SRR400264_01_hg19.bwt2pairs.SCPairs
    |   |       |-- SRR400264_01_hg19.bwt2pairs.SinglePairs
    |   |       |-- SRR400264_01_hg19.bwt2pairs.validPairs
    |   |       |-- SRR400264_01_hg19.bwt2pairs_interaction.bam
    |   |       \-- dixon_2M_2.allValidPairs
    |   |-- matrix
    |   |   |-- dixon_2M
    |   |   |   |-- iced
    |   |   |   |   |-- 500000
    |   |   |   |-- raw
    |   |   |       |-- 1000000
    |   |   |       |   |-- dixon_2M_1000000.matrix
    |   |   |       |   |-- dixon_2M_1000000_abs.bed
    |   |   |       |-- 500000
    |   |   |           |-- dixon_2M_500000.matrix
    |   |   |           |-- dixon_2M_500000_abs.bed
    |   |   |-- dixon_2M_2
    |   |       |-- raw
    |   |           |-- 1000000
    |   |           |   |-- dixon_2M_2_1000000.matrix
    |   |           |   |-- dixon_2M_2_1000000_abs.bed
    |   |           |-- 500000
    |   |               |-- dixon_2M_2_500000.matrix
    |   |               |-- dixon_2M_2_500000_abs.bed
    |   |-- pic
    |   |   |-- dixon_2M
    |   |   |   |-- plotHiCContactRanges_dixon_2M.pdf
    |   |   |   |-- plotHiCFragmentSize_dixon_2M.pdf
    |   |   |   |-- plotHiCFragment_dixon_2M.pdf
    |   |   |   |-- plotMappingPairing_dixon_2M.pdf
    |   |   |   |-- plotMapping_dixon_2M.pdf
    |   |   |-- dixon_2M_2
    |   |       |-- plotHiCContactRanges_dixon_2M_2.pdf
    |   |       |-- plotHiCFragmentSize_dixon_2M_2.pdf
    |   |       |-- plotHiCFragment_dixon_2M_2.pdf
    |   |       |-- plotMappingPairing_dixon_2M_2.pdf
    |   |       |-- plotMapping_dixon_2M_2.pdf
    |   |-- stats
    |       |-- dixon_2M
    |       |   |-- dixon_2M.mRSstat
    |       |   |-- dixon_2M.mpairstat
    |       |   |-- dixon_2M_R1.mmapstat
    |       |   |-- dixon_2M_R2.mmapstat
    |       |   |-- dixon_2M_allValidPairs.mergestat
    |       |-- dixon_2M_2
    |           |-- dixon_2M_2.mRSstat
    |           |-- dixon_2M_2.mpairstat
    |           |-- dixon_2M_2_R1.mmapstat
    |           |-- dixon_2M_2_R2.mmapstat
    |           |-- dixon_2M_2_allValidPairs.mergestat
    |-- logs
    |   |-- dixon_2M
    |   |   |-- SRR400264_00_R1_bowtie2.log
    |   |   |-- SRR400264_00_R1_hg19.bwt2glob.unmap_bowtie2.log
    |   |   |-- SRR400264_00_R1_hg19.bwt2glob.unmap_readsTrimming.log
    |   |   |-- SRR400264_00_R2_bowtie2.log
    |   |   |-- SRR400264_00_R2_hg19.bwt2glob.unmap_bowtie2.log
    |   |   |-- SRR400264_00_R2_hg19.bwt2glob.unmap_readsTrimming.log
    |   |   |-- build_raw_maps.log
    |   |   |-- ice_500000.log
    |   |   |-- make_Rplots.log
    |   |   |-- mapped_2hic_fragments.log
    |   |   |-- mapping_combine.log
    |   |   |-- mapping_stats.log
    |   |   |-- mapping_step1.log
    |   |   |-- mapping_step2.log
    |   |   |-- mergeSAM.log
    |   |   |-- merge_stats.log
    |   |   |-- merge_valid_interactions.log
    |   |   |-- plot_hic_contacts.Rout
    |   |   |-- plot_hic_fragment.Rout
    |   |   |-- plot_mapping_portion.Rout
    |   |   |-- plot_pairing_portion.Rout
    |   |-- dixon_2M_2
    |       |-- SRR400264_01_R1_bowtie2.log
    |       |-- SRR400264_01_R1_hg19.bwt2glob.unmap_bowtie2.log
    |       |-- SRR400264_01_R1_hg19.bwt2glob.unmap_readsTrimming.log
    |       |-- SRR400264_01_R2_bowtie2.log
    |       |-- SRR400264_01_R2_hg19.bwt2glob.unmap_bowtie2.log
    |       |-- SRR400264_01_R2_hg19.bwt2glob.unmap_readsTrimming.log
    |       |-- build_raw_maps.log
    |       |-- make_Rplots.log
    |       |-- mapped_2hic_fragments.log
    |       |-- mapping_combine.log
    |       |-- mapping_stats.log
    |       |-- mapping_step1.log
    |       |-- mapping_step2.log
    |       |-- mergeSAM.log
    |       |-- merge_stats.log
    |       |-- merge_valid_interactions.log
    |       |-- plot_hic_contacts.Rout
    |       |-- plot_hic_fragment.Rout
    |       |-- plot_mapping_portion.Rout
    |       |-- plot_pairing_portion.Rout
    |-- rawdata -> /research/labs/molebio/ordog/shared/HiC-Pro/HiC-Pro_2.11.1/test-op/test_data
    |-- tmp
    
    
    

    工作原理介绍:


    image.png

    1.Reads Mapping

    将fastq文件用bowtie2 mapping一次,看log:

    cat mapping_step1.log
    
    ###mapping fastq to reference genome , 将unmapping的read 另存为 unmap.fastq
    /PATH/TO/bowtie2/bowtie2-2.3.3.1/bowtie2 --very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder --un bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap.fastq --rg-id BMG --rg SM:SRR400264_00_R2 -p 20 -x /PATH/TO/HiC-Pro_2.11.1/Bowtie2Index/hg19 -U rawdata/dixon_2M/SRR400264_00_R2.fastq.gz 2>> logs/dixon_2M/SRR400264_00_R2_bowtie2.log| /PATH/TO/samtools/samtools-1.6/bin/samtools view -F 4 -bS - > bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.bam
    /PATH/TO/bowtie2/bowtie2-2.3.3.1/bowtie2 --very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder --un bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap.fastq --rg-id BMG --rg SM:SRR400264_00_R1 -p 20 -x /PATH/TO/HiC-Pro_2.11.1/Bowtie2Index/hg19 -U rawdata/dixon_2M/SRR400264_00_R1.fastq.gz 2>> logs/dixon_2M/SRR400264_00_R1_bowtie2.log| /PATH/TO/samtools/samtools-1.6/bin/samtools view -F 4 -bS - > bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.bam
    

    得到这样的文件:

    |   |-- bwt2_global
    |   |   |-- dixon_2M
    |   |   |   |-- SRR400264_00_R1_hg19.bwt2glob.bam
    |   |   |   |-- SRR400264_00_R1_hg19.bwt2glob.unmap.fastq
    |   |   |   |-- SRR400264_00_R2_hg19.bwt2glob.bam
    |   |   |   |-- SRR400264_00_R2_hg19.bwt2glob.unmap.fastq
    |   |   `-- dixon_2M_2
    |   |       |-- SRR400264_01_R1_hg19.bwt2glob.bam
    |   |       |-- SRR400264_01_R1_hg19.bwt2glob.unmap.fastq
    |   |       |-- SRR400264_01_R2_hg19.bwt2glob.bam
    |   |       |-- SRR400264_01_R2_hg19.bwt2glob.unmap.fastq
    
    

    这里有mapping的,和unmapping的。因为read是嵌合基因。mapping上的是背景基因组。unmapping的是需要下一步处理的。

    2.片段分配和过滤

    先看log

    cat mapping_step2.log
    ### 酶切
    /PATH/TO/HiC-Pro/HiC-Pro_2.11.1/scripts/cutsite_trimming --fastq bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap.fastq --cutsite AAGCTAGCTT --out bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap_trimmed.fastq > logs/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap_readsTrimming.log 2>&1
    /PATH/TO/HiC-Pro/HiC-Pro_2.11.1/scripts/cutsite_trimming --fastq bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap.fastq --cutsite AAGCTAGCTT --out bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap_trimmed.fastq > logs/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap_readsTrimming.log 2>&1
    ### 再用bowtie2 mapping一次
    /PATH/TO/bowtie2/bowtie2-2.3.3.1/bowtie2 --very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end --reorder --rg-id BML --rg SM:SRR400264_00_R1_hg19.bwt2glob.unmap -p 20 -x /PATH/TO/HiC-Pro/HiC-Pro_2.11.1/Bowtie2Index/hg19 -U bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap_trimmed.fastq 2>> logs/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap_bowtie2.log | /PATH/TO/samtools/samtools-1.6/bin/samtools view -bS - > bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap_bwt2loc.bam
    /PATH/TO/bowtie2/bowtie2-2.3.3.1/bowtie2 --very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end --reorder --rg-id BML --rg SM:SRR400264_00_R2_hg19.bwt2glob.unmap -p 20 -x /PATH/TO/HiC-Pro/HiC-Pro_2.11.1/Bowtie2Index/hg19 -U bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap_trimmed.fastq 2>> logs/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap_bowtie2.log | /PATH/TO/samtools/samtools-1.6/bin/samtools view -bS - > bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap_bwt2loc.bam
    

    这一步是将unmapping的结果,用酶切位点分割,然后将分割下来的结果,再mapping一次。得到以下结果

    |   |-- bwt2_local
    |       |-- dixon_2M
    |       |   |-- SRR400264_00_R1_hg19.bwt2glob.unmap_bwt2loc.bam
    |       |   |-- SRR400264_00_R1_hg19.bwt2glob.unmap_trimmed.fastq
    |       |   |-- SRR400264_00_R2_hg19.bwt2glob.unmap_bwt2loc.bam
    |       |   |-- SRR400264_00_R2_hg19.bwt2glob.unmap_trimmed.fastq
    |       |-- dixon_2M_2
    |           |-- SRR400264_01_R1_hg19.bwt2glob.unmap_bwt2loc.bam
    |           |-- SRR400264_01_R1_hg19.bwt2glob.unmap_trimmed.fastq
    |           |-- SRR400264_01_R2_hg19.bwt2glob.unmap_bwt2loc.bam
    |           |-- SRR400264_01_R2_hg19.bwt2glob.unmap_trimmed.fastq
    

    3、mapping_combine.log

    cat mapping_combine.log
    
    ### 将global的bam与local的bam用samtools merge
    /PATH/TO/samtools/samtools-1.6/bin/samtools merge -@ 20 -n -f bowtie_results/bwt2/dixon_2M/SRR400264_00_R1_hg19.bwt2merged.bam bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.bam bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap_bwt2loc.bam
    /PATH/TO/samtools/samtools-1.6/bin/samtools merge -@ 20 -n -f bowtie_results/bwt2/dixon_2M/SRR400264_00_R2_hg19.bwt2merged.bam bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.bam bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap_bwt2loc.bam
    
    ### sort
    /PATH/TO/samtools/samtools-1.6/bin/samtools sort -@ 20 -n -T tmp/SRR400264_00_R2_hg19 -o bowtie_results/bwt2/dixon_2M/SRR400264_00_R2_hg19.bwt2merged.sorted.bam bowtie_results/bwt2/dixon_2M/SRR400264_00_R2_hg19.bwt2merged.bam
    /PATH/TO/samtools/samtools-1.6/bin/samtools sort -@ 20 -n -T tmp/SRR400264_00_R1_hg19 -o bowtie_results/bwt2/dixon_2M/SRR400264_00_R1_hg19.bwt2merged.sorted.bam bowtie_results/bwt2/dixon_2M/SRR400264_00_R1_hg19.bwt2merged.bam
    
    [bam_sort_core] merging from 0 files and 20 in-memory blocks...
    [bam_sort_core] merging from 0 files and 20 in-memory blocks...
    
    ### 改名
    mv bowtie_results/bwt2/dixon_2M/SRR400264_00_R1_hg19.bwt2merged.sorted.bam bowtie_results/bwt2/dixon_2M/SRR400264_00_R1_hg19.bwt2merged.bam
    mv bowtie_results/bwt2/dixon_2M/SRR400264_00_R2_hg19.bwt2merged.sorted.bam bowtie_results/bwt2/dixon_2M/SRR400264_00_R2_hg19.bwt2merged.bam
    

    4、merge

    ### 将R1与R2 merge为一个文件。
    cat mergeSAM.log
    /PATH/TO/python/python-2.7.10/bin/python /PATH/TO/HiC-Pro/HiC-Pro_2.11.1/scripts/mergeSAM.py -q 0 -t -v -f bowtie_results/bwt2/dixon_2M/SRR400264_00_R1_hg19.bwt2merged.bam -r bowtie_results/bwt2/dixon_2M/SRR400264_00_R2_hg19.bwt2merged.bam -o bowtie_results/bwt2/dixon_2M/SRR400264_00_hg19.bwt2pairs.bam
    ## mergeBAM.py
    ## forward= bowtie_results/bwt2/dixon_2M/SRR400264_00_R1_hg19.bwt2merged.bam
    ## reverse= bowtie_results/bwt2/dixon_2M/SRR400264_00_R2_hg19.bwt2merged.bam
    ## output= bowtie_results/bwt2/dixon_2M/SRR400264_00_hg19.bwt2pairs.bam
    ## min mapq= 0
    ## report_single= False
    ## report_multi= False
    ## verbose= True
    ## Merging forward and reverse tags ...
    

    5、利用HiCPro的mapped_2hic_fragments.py程序将比对结果转化为Hi-C片段信息

    cat mapped_2hic_fragments.log
    /PATH/TO/python /PATH/TO/HiC-Pro/HiC-Pro_2.11.1/scripts/mapped_2hic_fragments.py -v -S -t 100 -m 100000 -s 100 -l 600 -a -f /PATH/TO/HiC-Pro_2.11.1/annotation/HindIII_resfrag_hg19.bed -r bowtie_results/bwt2/dixon_2M/SRR400264_00_hg19.bwt2pairs.bam -o hic_results/data/dixon_2M
    ## overlapMapped2HiCFragments.py
    ## mappedReadsFile= bowtie_results/bwt2/dixon_2M/SRR400264_00_hg19.bwt2pairs.bam
    ## fragmentFile= /PATH/TO/HiC-Pro/HiC-Pro_2.11.1/annotation/HindIII_resfrag_hg19.bed
    ## minInsertSize= 100
    ## maxInsertSize= 600
    ## minFragSize= 100
    ## maxFragSize= 100000
    ## allOuput= True
    ## SAM ouput= True
    ## verbose= True
    

    再对输出的valid pairs文件进行排序:

    LANG=en; sort -T tmp -k2,2V -k3,3n -k5,5V -k6,6n -o hic_results/data/sample/sample_sample_genome_ref.bwt2pairs.validPairs hic_results/data/sample/sample_sample_genome_ref.bwt2pairs.validPairs
    

    6 对所有的valid pairs进行合并

    LANG=en; sort -T tmp -S 50% -k2,2V -k3,3n -k5,5V -k6,6n -m hic_results/data/dixon_2M_2/SRR400264_01_hg19.bwt2pairs.validPairs | awk -F"\t" 'BEGIN{c1=0;c2=0;s1=0;s2=0}(c1!=$2 || c2!=$5 || s1!=$3 || s2!=$6){print;c1=$2;c2=$5;s1=$3;s2=$6}' > hic_results/data/dixon_2M_2/dixon_2M_2.allValidPairs
    

    这样就成了我们最终的文件,dixon_2M_2.allValidPairs。

    这个文件还不能可视化,如果要可视化,一般在jucebox里面查看。我们需要用配套的hicpro2juicebox.sh来转化:

    ## e.g.1
    HICPRO_PATH/bin/utils/hicpro2juicebox.sh -i hicpro_res/hic_results/data/dixon_2M/dixon_2M_allValidPairs -g hg19 -j /usr/local/juicebox/juicebox_clt_1.4.jar
    

    很多人问,这个-j的 jar文件在哪里?
    事实上用这个就可以。

    /PATH/TO/juicer/juicer/UGER/scripts/juicer_tools.jar
    

    下次,我们再讨论juicer与juicebox的用法。

    相关文章

      网友评论

        本文标题:hic分析之分步工作原理

        本文链接:https://www.haomeiwen.com/subject/lvebsctx.html