0.做三维基因组的时候, 需要将TAD分为 A-TAD和Other TAD(B-TAD)
具体衡量指标是如果某个TAD的body,有超过70%的区域都是A区室, 则该TAD就是A-TAD
1.输入文件有两个
第一个是TAD的bed文件, 文件是三列
染色体名, 起始bp 和 终止bp
Chr1 1 2000000
Chr1 2000001 2200000
第二个是AB区室的文件,文件是四列
Chr1 1 2000000 A
Chr1 2000001 2200000 B
(稍微处理一下, 如果PC1 值为正, 就写成A)
bedtools intersect -a Chr1.TAD.bed -b Chr1.compartment.txt -wo | sort -Vk 1 > jiaoji.bed
grep 'A' jiaoji.bed > jiaoji.A.bed
awk 'BEGIN{ FS="\t";OFS="\t" }{print $1"-"$2"-"$3,$8+1}' jiaoji.A.bed | tr "\t" "," | awk 'BEGIN{ FS=",";OFS="," }{a[$1]+=$2}{b[$1]+=$3}{c[$1]+=$4}{d[$1]+=$5}{e[$1]+=$6}{f[$1]+=$7}{g[$1]+=$8}{h[$1]+=$9}{m[$1]+=$10}{j[$1]+=$11}{k[$1]+=$12}{l[$1]+=$13}END{for(i in a){print i,a[i],b[i],c[i],d[i],e[i],f[i],g[i],h[i],m[i],j[i],k[i],l[i]}}' | sort -Vk 1 | awk -F "," '{print $1"\t"$2}' | sed 's/-/\t/g' | awk '{print $1"\t"$2"\t"$3"\t"$4/($3-$2+1)}' | awk '{if($4>=0.7)print $1"\t"$2"\t"$3"\t"$4"\t""A"; else print $1"\t"$2"\t"$3"\t"$4"\t""B"}' > Chr1.TADtoCom.partA.tmp
grep 'B' jiaoji.bed > jiaoji.B.bed
awk 'BEGIN{ FS="\t";OFS="\t" }{print $1"-"$2"-"$3,$8+1}' jiaoji.B.bed | tr "\t" "," | awk 'BEGIN{ FS=",";OFS="," }{a[$1]+=$2}{b[$1]+=$3}{c[$1]+=$4}{d[$1]+=$5}{e[$1]+=$6}{f[$1]+=$7}{g[$1]+=$8}{h[$1]+=$9}{m[$1]+=$10}{j[$1]+=$11}{k[$1]+=$12}{l[$1]+=$13}END{for(i in a){print i,a[i],b[i],c[i],d[i],e[i],f[i],g[i],h[i],m[i],j[i],k[i],l[i]}}' | sort -Vk 1 | awk -F "," '{print $1"\t"$2}' | sed 's/-/\t/g' | awk '{print $1"\t"$2"\t"$3"\t"$4/($3-$2+1)}' | awk '{if($4<=0.3)print $1"\t"$2"\t"$3"\t"$4"\t""A"; else print $1"\t"$2"\t"$3"\t"$4"\t""B"}' > Chr1.TADtoCom.partB.tmp
cat Chr1.TADtoCom.partA.tmp Chr1.TADtoCom.partB.tmp | cut -f 1,2,3,5 | sort -Vk 1 | uniq > Chr1.TADtoCom.bed
#这就是最终的结果
rm *tmp*
分割线-----------------------------------------------------
- 还需要统计loop的两端落在了A区室还是B区室
loop的文件格式一般包含这6列
chr1 start1 end1 chr2 start2 end2 (value......)
把前三列提取为一个文件: loop.left.anchor.bed
四五六列提取为另一个文件: loop.right.anchor.bed
第三个是AB区室的文件,文件是四列
Chr1 1 2000000 A
Chr1 2000001 2200000 B
(稍微处理一下, 如果PC1 值为正, 就写成A)
bedtools intersect -a loop.left.anchor.bed -b Chr1.compartment.bed -wa -wb -f 0.9 | awk '{print $1"\t"$2"\t"$3"\t"$7}' > loop.left.anchor.compart.tmp1
bedtools intersect -a loop.right.anchor.bed -b Chr1.compartment.bed -wa -wb -f 0.9 | awk '{print $1"\t"$2"\t"$3"\t"$7}' > loop.right.anchor.compart.tmp1
paste -d "\t" loop.left.anchor.compart.tmp1 loop.right.anchor.compart.tmp1 | awk '{print $1"\t"$2"\t"$3"\t"$5"\t"$6"\t"$7"\t"$4"-"$8}' > Chr1.loop.anchor.compart.bed
2.
最后一列就写了这个loop的左右锚点分布在A/B区室
网友评论