先下载h38的参考基因组文件
wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz
#解压 gzip -d hg38.fa.gz 也一样
gunzip hg38.fa.gz
#gzip -9v need_gzip_file
一般情况下只要执行以下命令即可:
#构建各种索引文件
/data/BGICGA/tools/samtools dict hg38.fa > hg38.dict
/data/BGICGA/tools/samtools faidx hg38.fa
/data/BGICGA/tools/bwa index -a bwtsw hg38.fa
但是,我这里要提取各条染色体排序。
/data/BGICGA/tools/samtools faidx hg38.fa
awk '{print $1}' hg38.fa.fai > all_chrname
# cat samtools_faidx.sh
#提取每条chr的fa序列
#!/usr/bin/bash
cat all_chrname | while read line
do
samtools faidx hg38.fa $line > ./fafile/${line}.fa
done
echo "done."
sh samtools_faidx.sh
#!/usr/bin/env perl
#对chrY进行处理;
open (FA, "chrY.fa") or die "Error: $!\n";
open (MSK, ">", "chrY_mask.fa") or die "Error: $!\n";
open (ALLN, ">", "chrY_all_N.fa") or die "Error: $!\n";
local $/ = ">";
my @chrY = <FA>;
close FA;
shift @chrY; # shift the leading empty record.
my $seq = $chrY[0];
my $hd = $1 if ($seq =~ s/^(\S+)[^\n]*\n//);
$seq =~ s/\s+//g;
# according to the http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/human/
# mask the Pseudoautosomal region.
# Pseudoautosomal regions
# Name Chr Start Stop
# PAR#1 X 10001 2781479
# PAR#2 X 155701383 156030895
# PAR#1 Y 10001 2781479 2781479 - 10001 = 2771478
# PAR#2 Y 56887903 57217415 57217415 - 56887903 = 329512
substr ($seq, 10000, 2771479, ("N" x 2771479));
substr ($seq, 56887902, 329513, ("N" x 329513));
my $all_N = "N" x (length($seq));
$seq =~ s/([A-z]{60})/\1\n/g;
$all_N =~ s/(N{60})/\1\n/g;
print MSK ">$hd\n$seq\n";
close MSK;
print ALLN ">$hd\n$all_N\n";
close ALLN;
#对所有fa文件合并
cat chr[1-9].fa chr1[0-9].fa chr2[0-2].fa chrX.fa chrY_mask.fa chrM.fa chrUn_*.fa chr*_alt.fa chr*_random.fa > ../hg38_chM_male_mask.fa
cat chr[1-9].fa chr1[0-9].fa chr2[0-2].fa chrX.fa chrY_all_N.fa chrM.fa chrUn_*.fa chr*_alt.fa chr*_random.fa > ../hg38_chM_female.fa
#构建索引
#cat work.sh
#!/bin/sh
/data/BGICGA/tools/samtools dict hg38_chM_female.fa > hg38_chM_female.dict
/data/BGICGA/tools/samtools faidx hg38_chM_female.fa
/data/BGICGA/tools/bwa index -a bwtsw hg38_chM_female.fa
/data/BGICGA/tools/samtools dict hg38_chM_male_mask.fa > hg38_chM_male_mask.dict
/data/BGICGA/tools/samtools faidx hg38_chM_male_mask.fa
/data/BGICGA/tools/bwa index -a bwtsw hg38_chM_male_mask.fa
sh work.sh
好了,构建完毕。
索引
网友评论