美文网首页
序列比对(十三)——后验解码

序列比对(十三)——后验解码

作者: 生信了 | 来源:发表于2019-11-14 10:50 被阅读0次

    原创: hxj7

    本文介绍了如何利用后验概率进行解码,可称为后验解码。

    前文《序列比对(12)计算后验概率》介绍了如何计算某一位置可能状态的后验概率。那么可以据此找到某一位置最有可能的状态。即


    image

    从上面的公式可以看出,有两种方法求解某一位置的最可能状态。如果是依据公式(1),先计算出后验概率,然后找到其中最大后验概率对应的状态;如果是依据公式(3),无需计算后验概率,比较简单。

    本文所用代码采用了公式(1)的方法,虽然稍嫌麻烦,但是可以在原先的代码基础上稍加增改即可。运行效果如下:


    image

    具体代码如下:

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include <math.h>
    //#define MIN_LOG_VALUE -15
    //#define SAFE_EXP(x) ((x) < MIN_LOG_VALUE ? 0 : exp(x))
    
    typedef char State;
    typedef char Result;
    State state[] = {'F', 'L'};   // 所有的可能状态
    Result result[] = {'1', '2', '3', '4', '5', '6'};   // 所有的可能符号
    double init[] = {0.9, 0.1};    // 初始状态的概率向量
    double emission[][6] = {   // 发射矩阵:行对应着状态,列对应着符号
      1.0/6, 1.0/6, 1.0/6, 1.0/6, 1.0/6, 1.0/6,
      0.1, 0.1, 0.1, 0.1, 0.1, 0.5
    };
    double trans[][2] = {   // 转移矩阵:行和列都是状态
      0.95, 0.05,
      0.1, 0.9
    };
    const int nstate = 2;
    const int nresult = 6;
    
    double** fscore;  // 前向算法的得分矩阵
    double** bscore;  // 后向算法的得分矩阵
    double* scale;   // 缩放因子向量
    double logScaleSum;
    
    int random(double* prob, const int n);
    void randSeq(State* st, Result* res, const int n);
    int getResultIndex(Result r);
    void printState(State* st, const int n);
    void printResult(Result* res, const int n);
    double forward(Result* res, const int n);
    double backward(Result* res, const int n);
    double** getPostProb(const int n);
    void postDecode(double** prob, const int n);
    
    int main(void) {
      int i;
      int n = 5;
      State* rst;   // 一串随机状态序列
      Result* rres;  // 一串随机符号序列
      double** postProb;
      if ((rst = (State*) malloc(sizeof(State) * n)) == NULL || \
          (rres = (Result*) malloc(sizeof(Result) * n)) == NULL || \
          (scale = (double*) malloc(sizeof(double) * n)) == NULL || \
          (fscore = (double**) malloc(sizeof(double*) * nstate)) == NULL || \
          (bscore = (double**) malloc(sizeof(double*) * nstate)) == NULL) {
        fputs("Error: out of space!\n", stderr);
        exit(1);
      }
      for (i = 0; i < nstate; i++) {
        if ((fscore[i] = (double*) malloc(sizeof(double) * n)) == NULL || \
            (bscore[i] = (double*) malloc(sizeof(double) * n)) == NULL) {
          fputs("Error: out of space!\n", stderr);
          exit(1);
        }
      }
      randSeq(rst, rres, n);
      printState(rst, n);
      printResult(rres, n);
      forward(rres, n);
      backward(rres, n);
      postProb = getPostProb(n);
      postDecode(postProb, n);
      free(rst);
      free(rres);
      free(scale);
      free(fscore);
      free(bscore);
      for (i = 0; i < nstate; i++)
        free(postProb[i]);
      free(postProb);
    }
    
    // 根据一个概率向量从0到n-1随机抽取一个数
    int random(double* prob, const int n) {
      int i;
      double p = rand() / 1.0 / (RAND_MAX + 1);
      for (i = 0; i < n - 1; i++) {
        if (p <= prob[i])
          break;
        p -= prob[i];
      }
      return i;
    }
    
    // 根据转移矩阵和发射矩阵生成一串随机状态和符号
    void randSeq(State* st, Result* res, const int n) {
      int i, ls, lr;
      srand((unsigned int) time(NULL));
      ls = random(init, nstate);
      lr = random(emission[ls], nresult);
      st[0] = state[ls];
      res[0] = result[lr];
      for (i = 1; i < n; i++) {
        ls = random(trans[ls], nstate);
        lr = random(emission[ls], nresult);
        st[i] = state[ls];
        res[i] = result[lr];
      }
    }
    
    int getResultIndex(Result r) {
      return r - result[0];
    }
    
    // 前向算法计算P(x)
    double forward(Result* res, const int n) {
      int i, l, k, idx;
      double logpx;
      // 缩放因子向量初始化
      for (i = 0; i < n; i++)
        scale[i] = 0;
      // 计算第0列分值
      idx = getResultIndex(res[0]);
      for (l = 0; l < nstate; l++) {
        fscore[l][0] = emission[l][idx] * init[l];
        scale[0] += fscore[l][0];
      }
      for (l = 0; l < nstate; l++)
        fscore[l][0] /= scale[0];
      // 计算从第1列开始的各列分值
      for (i = 1; i < n; i++) {
        idx = getResultIndex(res[i]);
        for (l = 0; l < nstate; l++) {
          fscore[l][i] = 0;
          for (k = 0; k < nstate; k++) {
            fscore[l][i] += fscore[k][i - 1] * trans[k][l];
          }
          fscore[l][i] *= emission[l][idx];
          scale[i] += fscore[l][i];
        }
        for (l = 0; l < nstate; l++)
          fscore[l][i] /= scale[i];
      }
      // P(x) = product(scale)
      // P(x)就是缩放因子向量所有元素的乘积
      logpx = 0;
      for (i = 0; i < n; i++)
        logpx += log(scale[i]);
      printf("forward: logP(x) = %f\n", logpx);
      logScaleSum = logpx;
    /*
      for (l = 0; l < nstate; l++) {
        for (i = 0; i < n; i++)
          printf("%f ", fscore[l][i]);
        printf("\n");
      }
    */
      return exp(logpx);
    }
    
    // 后向算法计算P(x)
    // backward算法中使用的缩放因子和forward中的一样
    double backward(Result* res, const int n) {
      int i, l, k, idx;
      double tx, logpx;
      // 计算最后一列分值
      for (l = 0; l < nstate; l++)
        bscore[l][n - 1] = 1 / scale[n - 1];
      // 计算从第n - 2列开始的各列分值
      for (i = n - 2; i >= 0; i--) {
        idx = getResultIndex(res[i + 1]);
        for (k = 0; k < nstate; k++) {
          bscore[k][i] = 0;
          for (l = 0; l < nstate; l++) {
            bscore[k][i] += bscore[l][i + 1] * trans[k][l] * emission[l][idx];
          }
        }
        for (l = 0; l < nstate; l++)
          bscore[l][i] /= scale[i];
      }
      // 计算P(x)
      tx = 0;
      idx = getResultIndex(res[0]);
      for (l = 0; l < nstate; l++)
        tx += init[l] * emission[l][idx] * bscore[l][0];
      logpx = log(tx) + logScaleSum;
      printf("backward: logP(x) = %f\n", logpx);
    /*
      for (l = 0; l < nstate; l++) {
        for (i = 0; i < n; i++)
          printf("%f ", bscore[l][i]);
        printf("\n");
      }
    */
      return exp(logpx);  
    }
    
    // 计算后验概率
    double** getPostProb(const int n) {
      int i, k;
      double** postProb;
      //double logdiff;
      if ((postProb = (double**) malloc(sizeof(double*) * nstate)) == NULL) {
        fputs("Error: out of space!\n", stderr);
        exit(1);  
      }
      for (k = 0; k < nstate; k++) {
        if ((postProb[k] = (double*) malloc(sizeof(double) * n)) == NULL) {
          fputs("Error: out of space!\n", stderr);
          exit(1);
        }
      }
      // 计算后验概率
      for (i = 0; i < n; i++) {
        for (k = 0; k < nstate; k++) {
          postProb[k][i] = scale[i] * fscore[k][i] * bscore[k][i];
        }
      }
      printf("\n");
      printf("Posterior Probabilities:\n");
      for (k = 0; k < nstate; k++) {
        for (i = 0; i < n; i++)
          printf("%f ", postProb[k][i]);
        printf("\n");
      }
      return postProb;
    }
    
    void postDecode(double** prob, const int n) {
      int i, k;
      double maxCol;
      int idx;
      State* st;
      if ((st = (State*) malloc(sizeof(State) * n)) == NULL) {
        fputs("Error: out of space!\n", stderr);
        exit(1);    
      }
      for (i = 0; i < n; i++) {
        idx = 0;
        maxCol = prob[0][i];
        for (k = 1; k < nstate; k++)
          if (prob[k][i] > maxCol) {
            maxCol = prob[k][i];
            idx = k;
          }
        st[i] = state[idx];
      }
      printf("\n");
      printf("Posterior Decode:\n");
      printState(st, n);
      free(st);
    }
    
    void printState(State* st, const int n) {
      int i;
      for (i = 0; i < n; i++)
        printf("%c", st[i]);
      printf("\n");
    }
    
    void printResult(Result* res, const int n) {
      int i;
      for (i = 0; i < n; i++)
        printf("%c", res[i]);
      printf("\n");  
    }
    

    (公众号:生信了)

    相关文章

      网友评论

          本文标题:序列比对(十三)——后验解码

          本文链接:https://www.haomeiwen.com/subject/dwigbctx.html