美文网首页
c语言实现AC-BM算法

c语言实现AC-BM算法

作者: 一路向后 | 来源:发表于2021-04-09 22:45 被阅读0次

1.源码实现

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <malloc.h>

#define KEYSIZE     256
#define PATTERN_LEN 1024

struct Node {
    int label;                      /*标识: -2.根节点, -1.中间节点, n.第n个字串尾节点*/
    int depth;                      /*节点深度*/
    unsigned char ch;               /*节点对应字符*/
    int gs;                         /*好后缀位移*/
    int tgs;                        /*临时好后缀位移*/
    int bc;                         /*坏节点位移*/
    unsigned char och;              /*其中一个子索引字符*/
    struct Node *next[KEYSIZE];     /*儿子节点个数*/
    int nc;                         /*子节点个数*/
    struct Node *parent;            /*父节点*/
};

struct ACTree {
    struct Node *root;              /*树根节点*/
    unsigned char **key;            /*关键字数组*/
    int *len;                       /*每个关键字长度*/
    short hs[KEYSIZE];              /*是否存在某字符*/
    int bc[KEYSIZE];                /*坏字符的shift*/
    int maxdep;                     /*最大字串深度*/
    int minsize;                    /*最短字串长度*/
    int count;                      /*包含字串个数*/
};

struct ACMatch {
    int i;                          /*关键字在关键字数组中的索引*/
    unsigned long offset;           /*在待匹配文本中的偏移值*/
};

void ACTreeClean(struct Node *root);

int ACTreeBuild(struct ACTree *ptree, unsigned char **patterns, int *patterns_len, int npattern)
{
    struct Node *root;
    struct Node *parent;
    unsigned char ch ;
    int max_pattern_len = 0;
    int min_pattern_len = PATTERN_LEN;
    int i;

    if(NULL == ptree || NULL == patterns || npattern < 0)
    {
        return -1;
    }
              
    root = (struct Node *)malloc(sizeof(struct Node));
    if(NULL == root)
    {
        return -1;
    }

    memset(root, 0, sizeof(struct Node));
    root->label = -2;       /*树根标志*/
    root->depth = 0;        /*树根深度*/
    root->ch = ' ';         /*根字符为空*/

    /*对输入的字串循环处理添加进ACTree*/
    for(i=0; i<npattern; i++)
    {
        int pat_len;
        int ch_i;

        pat_len = strlen(patterns[i]);

        if(pat_len == 0)
        {
            continue;
        }
        else
        {
            if(pat_len > PATTERN_LEN)
            {
                pat_len = PATTERN_LEN;
            }

            if(pat_len > max_pattern_len)
            {
                max_pattern_len = pat_len;
            }

            if(pat_len < min_pattern_len)
            {
                min_pattern_len = pat_len;
            }

            parent = root;

            for(ch_i=pat_len-1; ch_i>=0; ch_i--)
            {
                ch = patterns[i][ch_i];

                /*对应的字符索引为NULL*/
                if(parent->next[ch] == NULL)
                {
                    break;
                }

                parent = parent->next[ch];
            }

            if(ch_i < pat_len)
            {
                /*在父节点下添加新节点*/
                for(; ch_i>=0; ch_i--)
                {
                    struct Node *node = NULL;
                    ch = patterns[i][ch_i];

                    node = (struct Node *)malloc(sizeof(struct Node));
                    if(node == NULL)
                    {
                        if(ptree->root != NULL)
                        {
                            ACTreeClean(ptree->root);
                            free(ptree->root);
                            ptree->root = NULL;
                        }

                        return -1;
                    }

                    memset(node, 0x00, sizeof(struct Node));

                    node->depth = pat_len - ch_i;
                    node->ch = ch;
                    node->label = -1;

                    /*将新节点添加到父节点的对应字符索引指针*/
                    parent->next[ch] = node;
                    parent->nc++;
                    /*parent->ch = ch;*/
                    node->parent = parent;

                    parent = node;
                }
            }
        }

        parent->label = i;
    }

    ptree->key = patterns;
    ptree->len = patterns_len;
    ptree->count = npattern;
    ptree->root = root;
    ptree->maxdep = max_pattern_len;
    ptree->minsize = min_pattern_len;

    return 0;
}

int ACPrintTree(struct Node *root)
{
    int i;

    if(root == NULL)
    {
        return 0;
    }

    printf("---------------------------------------------------------\n");

    printf("ch:%2c   GSShift:%2d   label:%2d   depth:%2d   childs:", root->ch, root->gs, root->label, root->depth);

    for(i=0; i<KEYSIZE; i++)
    {
        if(root->next[i] != NULL)
        {
            printf("%c ", (root->next[i])->ch);
        }
    }

    printf("\n");

    /*递归打印本节点的所有后缀节点信息*/
    for(i=0; i<KEYSIZE; i++)
    {
        if(root->next[i] != NULL)
        {
            ACPrintTree(root->next[i]);
        }
    }

    return 0;
}

int ACTreeComputeBCShifts(struct ACTree *ptree)
{
    unsigned char ch;
    int i, j = 0;

    for(i=0; i<KEYSIZE; i++)
    {
        ptree->bc[i] = ptree->minsize;
    }

    for(i=0; i<ptree->minsize; i++)
    {
        for(j=0; j<ptree->count; j++)
        {
            ch = ptree->key[j][i];
            ptree->bc[ch] = ptree->minsize - 1 - i;
        }
    }

    return 0;
}

int ACTreeSetGSShift(struct ACTree *ptree, unsigned char *pat, int depth)
{
    struct Node *node;
    int *gs = (int *)malloc(depth*sizeof(int));
    int *gl = (int *)malloc(depth*sizeof(int));
    int n = depth;
    int m = n / 2;
    int u = 0;
    int i;
    int j;
    int k = 0;

    if(ptree == NULL || ptree->root == NULL)
    {
        return -1;
    }

    node = ptree->root;

    for(i=0; i<n; i++)
    {
        gs[i] = -1;
        gl[i] = 0;
    }

    for(i=0; i<m; i++)
    {
        for(j=0; j<n-1-i; j++)
        {
            u = 1;

            for(k=0; k<=i; k++)
            {
                if(pat[j+k] != pat[n-1-i+k])
                {
                    u = 0;
                    break;
                }
            }

            if(u)
            {
                gs[n-1-i] = j;
                gl[n-1-i] = i+1;
            }
        }
    }

    for(i=n-2; i>=0; i--)
    {
        if(gs[i+1] < i && gl[i] < gl[i+1])
        {
            gs[i] = gs[i+1];
            gl[i] = gl[i+1];
        }
    }

    for(i=0; i<n; i++)
    {
        if(gs[i] == -1)
        {
            gs[i] = n;
        }
        else
        {
            gs[i] = n - gs[i];
        }
    }

    for(i=n-1; i>=0; i--)
    {
        node = node->next[pat[i]];
        node->tgs = gs[i];

        if(node->gs != -1)
        {
            node->gs = node->tgs < node->gs ? node->tgs : node->gs;
        }
        else
        {
            node->gs = node->tgs;
        }
    }

    free(gs);
    free(gl);

    gs = NULL;
    gl = NULL;

    return 0;
}

int ACTreeComputeGSShifts(struct ACTree *ptree)
{
    int pat_i = 0, pat_j = 0;
          
    for(pat_i=0 ; pat_i<ptree->count; pat_i++)
    {
        unsigned char *ppat_i = ptree->key[pat_i];
        int patlen_i = strlen(ptree->key[pat_i]);

        ACTreeSetGSShift(ptree, ppat_i, patlen_i);
    }

    return 0;
}


int ACTreeNInitGSShifts(struct Node *root, int shift)
{
    int i;

    if(root->label != -2)
    {
        root->gs = shift;
    }
             
    for(i=0; i<KEYSIZE; i++)
    {
        if(NULL != root->next[i])
        {
            ACTreeNInitGSShifts(root->next[i], shift);
        }
    }

    return 0;
}

int ACTreeInitGSShifts(struct ACTree *ptree)
{
    ACTreeNInitGSShifts(ptree->root, ptree->minsize);

    return 0;
}

int ACTreeComputeShifts(struct ACTree *ptree)
{
    ACTreeComputeBCShifts(ptree);
    /*printf ("\n函数 ACtree_compute_shifts----------------------") ; // 测试使用
    printf ("\n----调用 ACtree_compute_BCshifts----------------\n") ; // 测试使用
    ACtree_print_tree(ptree)  ; // 测试使用*/

    ACTreeInitGSShifts(ptree);
    /*printf ("\n函数 ACtree_compute_shifts----------------------") ; // 测试使用 
    printf ("\n----调用 ACtree_init_GSshifts-------------------\n") ; // 测试使用
    ACtree_print_tree(ptree) ;  // 测试使用*/

    ACTreeComputeGSShifts(ptree);
    /*printf ("\n函数 ACtree_compute_shifts----------------------") ; // 测试使用
    printf ("\n----调用 ACtree_compute_GSshifts----------------\n") ; // 测试使用
    ACtree_print_tree(ptree) ;  // 测试使用*/

    return 0;
}

struct ACTree *ACTreeCreate(unsigned char **patterns, int *patterns_len, int npattern)
{
    struct ACTree *ptree = NULL;
     
    ptree = (struct ACTree *)malloc(sizeof(struct ACTree));
    if(NULL == ptree)
    {
        return NULL;
    }

    memset(ptree, 0, sizeof(struct ACTree));

    ACTreeBuild(ptree, patterns, patterns_len, npattern);

    ACTreeComputeShifts(ptree);

    return ptree;
}

void ACTreeClean(struct Node *root)
{
    int i;

    for(i=0; i<KEYSIZE; i++)
    {
        if(root->next[i] != NULL) 
        {
            ACTreeClean(root->next[i]);
            free(root->next[i]);
            root->next[i] = NULL;
        }
    }

    return;
}

void ACBMClean(struct ACTree *ptree)
{
    if(ptree == NULL)
    {
        return;
    }

    if(ptree->root != NULL) 
    {
        ACTreeClean(ptree->root);
        free(ptree->root);
        ptree->root = NULL;
    }

    free(ptree);

    return;
}

int ACBMSearch(struct ACTree *ptree, unsigned char *text, int text_len, struct ACMatch *items, int nmax)
{
    int nmatched = 0;
    int base_index = 0;
    int cur_index = 0;
    int pat_index = 0;
    int real_shift = 0;
    int gs_shift = 0;
    int bc_shift = 0;
    int u = 0;

    struct Node *node = NULL;
        
    if(text_len < ptree->minsize)
    {
        return -1;
    }

    base_index = ptree->minsize - 1;
    pat_index = 0;

    while(base_index < text_len)
    {
        cur_index = base_index - pat_index;

        node = ptree->root;

        /*printf("while begin, ch=%c, node=%p, cur_index=%d\n", text[cur_index], node->next[text[cur_index]], cur_index);*/

        while(cur_index >= 0 && node->next[text[cur_index]] != NULL) 
        {
            node = node->next[text[cur_index]];

            if(node->label >= 0)
            {
                /*匹配到一个关键字, 保存到items中*/
                items[nmatched].i = node->label;
                items[nmatched].offset = cur_index;

                printf("key=%s, offset=%d\n", ptree->key[node->label], cur_index);

                nmatched++;

                if(nmatched == nmax)
                {
                    return 0;
                }

                base_index++;
                pat_index = 0;

                continue;
            }

            /*printf("pat_index++\n");*/
            pat_index++;
            cur_index--;
        }

        if(node->nc > 0)
        {
            /*跳字符,由GSshift和BCshift决定跳字符的位数*/
            gs_shift = node->gs; /* - pat_index - 1;*/

            if(cur_index < text_len)
            {
                /*printf("text[cur_index]=%c, pat_index=%d, bc=%d\n", text[cur_index], pat_index, ptree->bc[text[cur_index]]);*/
                bc_shift = ptree->bc[text[cur_index]] - pat_index;
            }
            else
            {
                bc_shift = 1;
            }

            /*printf("cur_index=%d, base_index=%d, pat_index=%d, gs_shift=%d, bc_shift=%d\n", cur_index, base_index, pat_index, gs_shift, bc_shift);*/

            /*取大者跳转*/
            real_shift = gs_shift > bc_shift ? gs_shift : bc_shift;

            /*printf("real_shift=%d\n", real_shift);*/

            base_index += real_shift;
            pat_index = 0;

            u++;
        }
        else
        {
            base_index++;
            pat_index = 0;
        }

        /*printf("base=%d, len=%d\n", base_index, text_len);*/
    }

    return nmatched;
}

int main()
{
    FILE *fp;
    unsigned char *keywords[] = {"your", "five", "ukey"};
    struct ACTree *ptree = NULL;
    struct ACMatch items[256];
    char str[4096] = "abcacabbcabccabc";
    int a[3] = {4, 4, 4};
    int n = strlen(str);
    int u;
    int k = 0;

    ptree = ACTreeCreate(keywords, a, 3);

    ACTreeComputeShifts(ptree);

    fp = fopen("1.txt", "r");

    memset(str, 0x00, sizeof(str));

    while(fgets(str, 2048, fp) != NULL)
    {
        n = strlen(str);

        k++;

        /*if(k==61326)*/
        {
            ACBMSearch(ptree, str, n, items, 256);
        }

        memset(str, 0x00, sizeof(str));
    }

    fclose(fp);

    ACBMClean(ptree);

    return 0;
}

2.编译源码

$ gcc -o example examle.c -std=c89

4.运行及其结果

$ ./example
key=five, offset=216
key=ukey, offset=325
key=five, offset=303
key=ukey, offset=739
key=ukey, offset=826
key=five, offset=114
key=your, offset=839
key=five, offset=308
key=your, offset=378
key=your, offset=744

相关文章

网友评论

      本文标题:c语言实现AC-BM算法

      本文链接:https://www.haomeiwen.com/subject/xitgkltx.html