1.源码实现
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <malloc.h>
#define KEYSIZE 256
#define PATTERN_LEN 1024
struct Node {
int label; /*标识: -2.根节点, -1.中间节点, n.第n个字串尾节点*/
int depth; /*节点深度*/
unsigned char ch; /*节点对应字符*/
int gs; /*好后缀位移*/
int tgs; /*临时好后缀位移*/
int bc; /*坏节点位移*/
unsigned char och; /*其中一个子索引字符*/
struct Node *next[KEYSIZE]; /*儿子节点个数*/
int nc; /*子节点个数*/
struct Node *parent; /*父节点*/
};
struct ACTree {
struct Node *root; /*树根节点*/
unsigned char **key; /*关键字数组*/
int *len; /*每个关键字长度*/
short hs[KEYSIZE]; /*是否存在某字符*/
int bc[KEYSIZE]; /*坏字符的shift*/
int maxdep; /*最大字串深度*/
int minsize; /*最短字串长度*/
int count; /*包含字串个数*/
};
struct ACMatch {
int i; /*关键字在关键字数组中的索引*/
unsigned long offset; /*在待匹配文本中的偏移值*/
};
void ACTreeClean(struct Node *root);
int ACTreeBuild(struct ACTree *ptree, unsigned char **patterns, int *patterns_len, int npattern)
{
struct Node *root;
struct Node *parent;
unsigned char ch ;
int max_pattern_len = 0;
int min_pattern_len = PATTERN_LEN;
int i;
if(NULL == ptree || NULL == patterns || npattern < 0)
{
return -1;
}
root = (struct Node *)malloc(sizeof(struct Node));
if(NULL == root)
{
return -1;
}
memset(root, 0, sizeof(struct Node));
root->label = -2; /*树根标志*/
root->depth = 0; /*树根深度*/
root->ch = ' '; /*根字符为空*/
/*对输入的字串循环处理添加进ACTree*/
for(i=0; i<npattern; i++)
{
int pat_len;
int ch_i;
pat_len = strlen(patterns[i]);
if(pat_len == 0)
{
continue;
}
else
{
if(pat_len > PATTERN_LEN)
{
pat_len = PATTERN_LEN;
}
if(pat_len > max_pattern_len)
{
max_pattern_len = pat_len;
}
if(pat_len < min_pattern_len)
{
min_pattern_len = pat_len;
}
parent = root;
for(ch_i=pat_len-1; ch_i>=0; ch_i--)
{
ch = patterns[i][ch_i];
/*对应的字符索引为NULL*/
if(parent->next[ch] == NULL)
{
break;
}
parent = parent->next[ch];
}
if(ch_i < pat_len)
{
/*在父节点下添加新节点*/
for(; ch_i>=0; ch_i--)
{
struct Node *node = NULL;
ch = patterns[i][ch_i];
node = (struct Node *)malloc(sizeof(struct Node));
if(node == NULL)
{
if(ptree->root != NULL)
{
ACTreeClean(ptree->root);
free(ptree->root);
ptree->root = NULL;
}
return -1;
}
memset(node, 0x00, sizeof(struct Node));
node->depth = pat_len - ch_i;
node->ch = ch;
node->label = -1;
/*将新节点添加到父节点的对应字符索引指针*/
parent->next[ch] = node;
parent->nc++;
/*parent->ch = ch;*/
node->parent = parent;
parent = node;
}
}
}
parent->label = i;
}
ptree->key = patterns;
ptree->len = patterns_len;
ptree->count = npattern;
ptree->root = root;
ptree->maxdep = max_pattern_len;
ptree->minsize = min_pattern_len;
return 0;
}
int ACPrintTree(struct Node *root)
{
int i;
if(root == NULL)
{
return 0;
}
printf("---------------------------------------------------------\n");
printf("ch:%2c GSShift:%2d label:%2d depth:%2d childs:", root->ch, root->gs, root->label, root->depth);
for(i=0; i<KEYSIZE; i++)
{
if(root->next[i] != NULL)
{
printf("%c ", (root->next[i])->ch);
}
}
printf("\n");
/*递归打印本节点的所有后缀节点信息*/
for(i=0; i<KEYSIZE; i++)
{
if(root->next[i] != NULL)
{
ACPrintTree(root->next[i]);
}
}
return 0;
}
int ACTreeComputeBCShifts(struct ACTree *ptree)
{
unsigned char ch;
int i, j = 0;
for(i=0; i<KEYSIZE; i++)
{
ptree->bc[i] = ptree->minsize;
}
for(i=0; i<ptree->minsize; i++)
{
for(j=0; j<ptree->count; j++)
{
ch = ptree->key[j][i];
ptree->bc[ch] = ptree->minsize - 1 - i;
}
}
return 0;
}
int ACTreeSetGSShift(struct ACTree *ptree, unsigned char *pat, int depth)
{
struct Node *node;
int *gs = (int *)malloc(depth*sizeof(int));
int *gl = (int *)malloc(depth*sizeof(int));
int n = depth;
int m = n / 2;
int u = 0;
int i;
int j;
int k = 0;
if(ptree == NULL || ptree->root == NULL)
{
return -1;
}
node = ptree->root;
for(i=0; i<n; i++)
{
gs[i] = -1;
gl[i] = 0;
}
for(i=0; i<m; i++)
{
for(j=0; j<n-1-i; j++)
{
u = 1;
for(k=0; k<=i; k++)
{
if(pat[j+k] != pat[n-1-i+k])
{
u = 0;
break;
}
}
if(u)
{
gs[n-1-i] = j;
gl[n-1-i] = i+1;
}
}
}
for(i=n-2; i>=0; i--)
{
if(gs[i+1] < i && gl[i] < gl[i+1])
{
gs[i] = gs[i+1];
gl[i] = gl[i+1];
}
}
for(i=0; i<n; i++)
{
if(gs[i] == -1)
{
gs[i] = n;
}
else
{
gs[i] = n - gs[i];
}
}
for(i=n-1; i>=0; i--)
{
node = node->next[pat[i]];
node->tgs = gs[i];
if(node->gs != -1)
{
node->gs = node->tgs < node->gs ? node->tgs : node->gs;
}
else
{
node->gs = node->tgs;
}
}
free(gs);
free(gl);
gs = NULL;
gl = NULL;
return 0;
}
int ACTreeComputeGSShifts(struct ACTree *ptree)
{
int pat_i = 0, pat_j = 0;
for(pat_i=0 ; pat_i<ptree->count; pat_i++)
{
unsigned char *ppat_i = ptree->key[pat_i];
int patlen_i = strlen(ptree->key[pat_i]);
ACTreeSetGSShift(ptree, ppat_i, patlen_i);
}
return 0;
}
int ACTreeNInitGSShifts(struct Node *root, int shift)
{
int i;
if(root->label != -2)
{
root->gs = shift;
}
for(i=0; i<KEYSIZE; i++)
{
if(NULL != root->next[i])
{
ACTreeNInitGSShifts(root->next[i], shift);
}
}
return 0;
}
int ACTreeInitGSShifts(struct ACTree *ptree)
{
ACTreeNInitGSShifts(ptree->root, ptree->minsize);
return 0;
}
int ACTreeComputeShifts(struct ACTree *ptree)
{
ACTreeComputeBCShifts(ptree);
/*printf ("\n函数 ACtree_compute_shifts----------------------") ; // 测试使用
printf ("\n----调用 ACtree_compute_BCshifts----------------\n") ; // 测试使用
ACtree_print_tree(ptree) ; // 测试使用*/
ACTreeInitGSShifts(ptree);
/*printf ("\n函数 ACtree_compute_shifts----------------------") ; // 测试使用
printf ("\n----调用 ACtree_init_GSshifts-------------------\n") ; // 测试使用
ACtree_print_tree(ptree) ; // 测试使用*/
ACTreeComputeGSShifts(ptree);
/*printf ("\n函数 ACtree_compute_shifts----------------------") ; // 测试使用
printf ("\n----调用 ACtree_compute_GSshifts----------------\n") ; // 测试使用
ACtree_print_tree(ptree) ; // 测试使用*/
return 0;
}
struct ACTree *ACTreeCreate(unsigned char **patterns, int *patterns_len, int npattern)
{
struct ACTree *ptree = NULL;
ptree = (struct ACTree *)malloc(sizeof(struct ACTree));
if(NULL == ptree)
{
return NULL;
}
memset(ptree, 0, sizeof(struct ACTree));
ACTreeBuild(ptree, patterns, patterns_len, npattern);
ACTreeComputeShifts(ptree);
return ptree;
}
void ACTreeClean(struct Node *root)
{
int i;
for(i=0; i<KEYSIZE; i++)
{
if(root->next[i] != NULL)
{
ACTreeClean(root->next[i]);
free(root->next[i]);
root->next[i] = NULL;
}
}
return;
}
void ACBMClean(struct ACTree *ptree)
{
if(ptree == NULL)
{
return;
}
if(ptree->root != NULL)
{
ACTreeClean(ptree->root);
free(ptree->root);
ptree->root = NULL;
}
free(ptree);
return;
}
int ACBMSearch(struct ACTree *ptree, unsigned char *text, int text_len, struct ACMatch *items, int nmax)
{
int nmatched = 0;
int base_index = 0;
int cur_index = 0;
int pat_index = 0;
int real_shift = 0;
int gs_shift = 0;
int bc_shift = 0;
int u = 0;
struct Node *node = NULL;
if(text_len < ptree->minsize)
{
return -1;
}
base_index = ptree->minsize - 1;
pat_index = 0;
while(base_index < text_len)
{
cur_index = base_index - pat_index;
node = ptree->root;
/*printf("while begin, ch=%c, node=%p, cur_index=%d\n", text[cur_index], node->next[text[cur_index]], cur_index);*/
while(cur_index >= 0 && node->next[text[cur_index]] != NULL)
{
node = node->next[text[cur_index]];
if(node->label >= 0)
{
/*匹配到一个关键字, 保存到items中*/
items[nmatched].i = node->label;
items[nmatched].offset = cur_index;
printf("key=%s, offset=%d\n", ptree->key[node->label], cur_index);
nmatched++;
if(nmatched == nmax)
{
return 0;
}
base_index++;
pat_index = 0;
continue;
}
/*printf("pat_index++\n");*/
pat_index++;
cur_index--;
}
if(node->nc > 0)
{
/*跳字符,由GSshift和BCshift决定跳字符的位数*/
gs_shift = node->gs; /* - pat_index - 1;*/
if(cur_index < text_len)
{
/*printf("text[cur_index]=%c, pat_index=%d, bc=%d\n", text[cur_index], pat_index, ptree->bc[text[cur_index]]);*/
bc_shift = ptree->bc[text[cur_index]] - pat_index;
}
else
{
bc_shift = 1;
}
/*printf("cur_index=%d, base_index=%d, pat_index=%d, gs_shift=%d, bc_shift=%d\n", cur_index, base_index, pat_index, gs_shift, bc_shift);*/
/*取大者跳转*/
real_shift = gs_shift > bc_shift ? gs_shift : bc_shift;
/*printf("real_shift=%d\n", real_shift);*/
base_index += real_shift;
pat_index = 0;
u++;
}
else
{
base_index++;
pat_index = 0;
}
/*printf("base=%d, len=%d\n", base_index, text_len);*/
}
return nmatched;
}
int main()
{
FILE *fp;
unsigned char *keywords[] = {"your", "five", "ukey"};
struct ACTree *ptree = NULL;
struct ACMatch items[256];
char str[4096] = "abcacabbcabccabc";
int a[3] = {4, 4, 4};
int n = strlen(str);
int u;
int k = 0;
ptree = ACTreeCreate(keywords, a, 3);
ACTreeComputeShifts(ptree);
fp = fopen("1.txt", "r");
memset(str, 0x00, sizeof(str));
while(fgets(str, 2048, fp) != NULL)
{
n = strlen(str);
k++;
/*if(k==61326)*/
{
ACBMSearch(ptree, str, n, items, 256);
}
memset(str, 0x00, sizeof(str));
}
fclose(fp);
ACBMClean(ptree);
return 0;
}
2.编译源码
$ gcc -o example examle.c -std=c89
4.运行及其结果
$ ./example
key=five, offset=216
key=ukey, offset=325
key=five, offset=303
key=ukey, offset=739
key=ukey, offset=826
key=five, offset=114
key=your, offset=839
key=five, offset=308
key=your, offset=378
key=your, offset=744
网友评论