美文网首页
htslib 的使用( C++ 处理BAM文件)

htslib 的使用( C++ 处理BAM文件)

作者: 茄子_0937 | 来源:发表于2021-11-09 18:05 被阅读0次

    背景

    在处理NGS 数据时,尤其是突变的检测。我们需要对bam 文件进行读写。 目前有相对方便的库 pysam 和 htsjdk ,这两个库提供的api 可以方便python 和java 读写,操作BAM。

    随着测序深度的提高,尤其是UMI技术的普及,上万层的测序数据下,我们需要提高分析软件的效率,C++是比较好的选择。

    C++ 中可以读写BAM 的库有 htslib 、bamtools、Seqan3 等。Seqan3的文档信息比较全,但是我使用文档中的测试案例发现,速度很慢(也许是我打开方式有问题?);bamtools 和 htslib 的效率挺好的,但是都缺乏详细的文档,用起来真的头大。

    本文,粗略的学习了htslib sam.h sam.c 部分的代码,以及参考了陈师傅的gencore 代码,总结了一些htslib的使用方法。(pileup 部分好难,略过~)

    C++ 代码

    • 获得Cigar

      /*
      @discussion In the CIGAR array, each element is a 32-bit integer. The
       lower 4 bits gives a CIGAR operation and the higher 28 bits keep the
       length of a CIGAR.
      */
      
      string getCigar(const bam1_t *b) {
          uint32_t *data = (uint32_t *)bam_get_cigar(b);
          int cigarNum = b->core.n_cigar;
          stringstream ss;
          for(int i=0; i<cigarNum; i++) {
              uint32_t val = data[i];
              char op = bam_cigar_opchr(val);
              uint32_t len = bam_cigar_oplen(val);
              ss << len << op;
          }
          return ss.str();
      }
      
    • 获得序列和质量值

      //Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G, 8 for T and 15 for N.
      char fourbits2base(uint8_t val) {
          switch(val) {
              case 1:
                  return 'A';
              case 2:
                  return 'C';
              case 4:
                  return 'G';
              case 8:
                  return 'T';
              case 15:
                  return 'N';
              default:
                  cerr << "ERROR: Wrong base with value "<< (int)val << endl ;
                  return 'N';
          }
      }
      
      // get seq 序列的信息记录在8bit 的数据结构中,前4bit 是前面的碱基,后4bit 是后面的碱基
      string getSeq(const bam1_t *b) {
          uint8_t *data = bam_get_seq(b);
          int len = b->core.l_qseq;
          string s(len, '\0');
          for(int i=0; i<len; i++) {
              char base;
              if(i%2 == 1)
                  base = fourbits2base(data[i/2] & 0xF); 
              else
                  base = fourbits2base((data[i/2]>>4) & 0xF);
              s[i] = base;
          }
          return s;
      }
      
      // get seq quality
      string getQual(const bam1_t *b) {
          uint8_t *data = bam_get_qual(b);
          int len = b->core.l_qseq;
          string s(len, '\0');
          for(int i=0; i<len; i++) {
              s[i] = (char)(data[i] + 33); // 转换成打印的ascci
          }
          return s;
      }
      
    • 读取完整BAM

      int main(int argc,char *argv[])
      {
          if(argc < 2) {
              cerr << "need bam path";
              return -1;
          }
          samFile *bam_in = sam_open(argv[1],"r"); // open bam file
          bam_hdr_t *bam_header = sam_hdr_read(bam_in); // read header
          bam1_t *aln = NULL;
          aln = bam_init1(); //initialize an alignment
          
      //return >= 0 on successfully reading a new record, -1 on end of stream, < -1 on error
          while (sam_read1(bam_in,bam_header,aln) >= 0) 
          {
              int pos = aln->core.pos ;
              string chr = "*";
              if (aln->core.tid != -1) // 存在无法比对到基因组的reads
                  chr = bam_header->target_name[aln->core.tid]; // config name(chromosome)            
              string queryname = bam_get_qname(aln);
              string cigar = getCigar(aln);
              string seq = getSeq(aln);
              string qual = getQual(aln);
      
              cout << "QueryName: " << queryname << '\n' 
                  << "Positon: " << chr << '\t' <<  pos <<'\n'
                  << "Cigar: " << cigar << '\n'
                  << "Seq: " << seq << '\n'
                  << "Qual: " << qual << '\n';
              
          }
      
          bam_destroy1(aln); // 回收资源
          bam_hdr_destroy(bam_header);
          sam_close(bam_in);
          cout << "Finshed!\n";
          return 0;
      }
      
    • 读取选定区域的BAM

      int main(int argc,char *argv[])
      {
          if(argc < 3) {
              cerr << "need bam path; chrom: start pos-end pos";
              return -1;
          }
          samFile *bam_in = sam_open(argv[1],"r"); // open bam file
          hts_idx_t *bam_index = sam_index_load(bam_in,argv[1]); //load index 
          bam_hdr_t *bam_header = sam_hdr_read(bam_in); // read header
          bam1_t *aln = NULL;
          aln = bam_init1(); //initialize an alignment
      /*
      Regions are parsed by hts_parse_reg(), and take one of the following forms:
      region          | Outputs
      --------------- | -------------
      REF             | All reads with RNAME REF
      REF:            | All reads with RNAME REF
      REF:START       | Reads with RNAME REF overlapping START to end of REF
      REF:-END        | Reads with RNAME REF overlapping start of REF to END
      REF:START-END   | Reads with RNAME REF overlapping START to END
      .               | All reads from the start of the file
      *               | Unmapped reads at the end of the file (RNAME '*' in SAM)
      */
              string regin = argv[2]; 
             
          hts_itr_t *iter = sam_itr_querys(bam_index,bam_header,regin);
          if(!iter) cerr << "invalid regin\n";
         // while (sam_read1(bam_in,bam_header,aln) >= 0)
          while (sam_itr_next(bam_in, iter, aln) >= 0)
          {
              int pos = aln->core.pos ;
              string chr = "*";
              if (aln->core.tid != -1)
                  chr = bam_header->target_name[aln->core.tid]; // config name(chromosome)            
              string queryname = bam_get_qname(aln);
              string cigar = getCigar(aln);
              string seq = getSeq(aln);
              string qual = getQual(aln);
              string md = getAux(aln,"MD");
      
              //int64_t newPos = pos - 5;
              cout << "QueryName: " << queryname << '\n' 
                  << "Positon: " <<aln->core.tid << '\t' << chr << '\t' <<  pos <<'\n'
                  << "Cigar: " << cigar << '\n'
                  << md << '\n'
                  << "Seq: " << seq << '\n'
                  << "Qual: " << qual << '\n';
              
          }
          sam_itr_destroy(iter) ;// free iter
          bam_destroy1(aln);
          bam_hdr_destroy(bam_header);
          sam_close(bam_in);
          cout << "Finshed!\n";
          return 0;
      }
      
      
    • 辅助字段的读取

      string getAux(const bam1_t *b,const char tag[2])
      {
          kstring_t res = KS_INITIALIZE;    // 需要初始化
          if(bam_aux_get_str(b,tag,&res) == 1) //kstring的string buffer 没有\0终止
          {
              int len = ks_len(&res);
              char *ks_s = ks_str(&res);
              string s(len, '\0');
              for (int i = 0;i<len;i++ ){
                  s[i] = ks_s[i];
              }
              ks_free(&res); // 释放资源
              return s;
          }    
          else 
          {
              cerr << "no tag :" << tag << '\n';
              ks_free(&res);
              return "";
          }
          
      }
      
      int main()
      {
      。。。
          string md = getAux(aln,"MD");
      。。。
      }
      

    相关文章

      网友评论

          本文标题:htslib 的使用( C++ 处理BAM文件)

          本文链接:https://www.haomeiwen.com/subject/pmrizltx.html