Problem Specs
![](https://img.haomeiwen.com/i2297735/86dde825d8cc863a.png)
The key idea is to use the bits of the primitive type int to mark
the matches of words----for example on matching words at words[0] and words[1] (the
first and the second words in the input words list) I set the value of the int
variable to 0x00000003, or more intuitively, 0000 0000 0000 0000 0000 0000 0000 0011.
Baring this idea in mind I first wrote this code:
int** computeWTB(char **words, int n) {
int **WTB = (int**)malloc(R * sizeof(*WTB));
int l = strlen(words[0]);
for (int i = 0; i < R; i++) {
WTB[i] = (int*)malloc(l * sizeof(int));
for (int j = 0; j < l; j++) { WTB[i][j] = 0; }
}
printa2(WTB, R, 3);
for (int i = 0; i < n; i++) {
for (int j = 0; j < l; j++) {
char c = words[i][j] - 'a';
WTB[c][j] |= (1 << i);
}
}
return WTB;
}
int* findSubstring(char * s, char ** words, int wordsSize, int* returnSize){
int n = wordsSize;
int **WTB = computeWTB(words, n);
int wl = strlen(words[0]);
int sl = strlen(s);
int wc = (1 << n) - 1;
int tl = wl * n;
int *ids = (int*)malloc((sl - tl) * sizeof(*ids));
int *ptr = ids;
int sum = 0;
int id = 0;
for (int i = 0; i < sl; i++) {
int ci = (i - id) % wl;
int cc = WTB[s[i] - 'a'][ci];
if (cc == 0) {//no word has an s[i] at ci
i = id++;
sum = 0;
wc = (1 << n) - 1;
} else {
wc &= cc;
if (wc == 0) {//combination of characters not found in word
i = id++;
sum = 0;
wc = (1 << n) - 1;
} else {
if (ci == wl - 1) {
if (wc == 0 || wc == (wc & sum)) {//word not in the list or repeated
i = id++;
sum = 0;
wc = (1 << n) - 1;
} else {//found
if (i - id + 1 == tl) {//all matched
*(ptr++) = id;
sum = 0;
wc = (1 << n) - 1;
i = id++;
} else {
int f = 1;
while (f <= wc) {
if (f & wc && !(f & sum)) {
sum |= f;
break;
}
f <<= 1;
}
}
}
wc = (1 << n) - 1;
}
}
}
}
if (returnSize != NULL) { *returnSize = ptr - ids; }
return ids;
}
Using an int incurred a limit on the number of words, i.e. wordSize, rendering this impl wrong;
however, the fact that it works on problems of smaller scale suggests that the idea's correct;
it just wants a type that can represent arbitrarily large integers.
Privation of built-in data structures(specifically, hash map in our case) that are taken for
granted in more advanced languages like cpp and java greately limits C's expressiveness and
impls in C usually come with very long code(see FOOTNOTE 1). It's even without class,
which is, after decades of prevalence of OOP paradigm, to a programming language what a
soul is to a human. Writing a ADT in C(a struct and a bunch of functions associated with it)
took quite a lot of getting used to.
[FOOTNOTE 1] Actually I've seen solutions in C that's remarkably
elegant&compact, with no more than 20 lines of code; however, those solutions heavily
rely on methematical theory, more specifically, the theory of numbers, to hash C strings into
integers, which is, quite embarrassedly, beyond the scope of my current concern; anyway, I'll
return to hash-based solution latter.
Listed below is the code that got ac-ed:
/**
* Abstract: The problem's difficult, the code's much more difficult to understand.
* With the same idea I first wrote the code(see the end of this article) based
* on the primitive type int provided by C and got runtime exceptions duo to the
* input words size being too large(larger than can be marked with an int's bits).
* Sticking to the idea I created the BInt struct and wrote the various functions
* to support arithmetic ops on this type.
*
* Traces might be found of R. Sedgewick's notion of using [c][i] into some array
* to express the idea of "encountering character c at position I".
*
* A vague idea has been spinning in my mind that a DFA might help avoid backing
* up the text ptr & therefore greately speeds up the algorithm but I can't see where
* it fits in. DFAs are computed using fixed pattern strings(?are thay?) while in
* our case the pattern consists in the words list whose order varies.
*
* Without further ado, let's move on into the algorithm itself:
* 1. First, a mechanism other than character-wise-cmp for word matching is devised,
* which goes as follows:
* Encode the word list into a table WTB(words table), whose entry WTB[c][i], being of
* type BInt wraps a trunk of memory that is to be interpreted this way:
* 1) The n'th bit being 1 means the n'th word in the word list has an c on its i'th
* position, the meaning of 0 should be clear and inferable from that of 1;
* 2) If AND of WTB[c1][0], WTB[c2][1], ..., WTB[cl][l] yields an result whose i'th
* bit is 1, this means in the word list, the i'th word is c1c2...cl;
* 3) The width of the an entry within WTB equals the size of the word list;
* 4) The size of the table is R by wl(R being the capacity of character set, 26 in
* our case, and wl being word length).
* We understand this definition by examples; for word list [ab, ac, bc], we have word
* size = 3, R = 3(the character set has three different characters: a, b and c),
* wl = 2, and the WTB is:
* 0 1
* a 011 000
* b 100 001
* c 000 110
* TO BE CONTINUED...
* To further speed up the algorithm, when compute the WTB, we can store in an array
* where further on to the end of the word list the next occurence of the same word is:
* words = [ab, bc, ab, cd, bc]=====> next = [2, 4, -1, -1, -1], that is, a[i] is the
* location of the next occurence of words[i]. And we use this array a to avoid looping
* through the sum bits to find the next polisition to match the found word. Still
* further improvement is possible to eliminate even the loop for the first location.
* But a hash map's needed. The idea is to move work done within the loop to the
* preprocessing of the word list.
*/
#define R 26
typedef unsigned char byte;
struct BInt {
byte *bytes;
size_t size;
};
struct BInt* CreateBInt(int n) {
struct BInt* bint = (struct BInt*)malloc(sizeof(*bint));
int r = n % 8;
bint->size = (n >> 3) + (r ? 1 : 0);
bint->bytes = (byte*)malloc(bint->size * sizeof(byte*));
for (int i = 0; i < bint->size; i++) { bint->bytes[i] = 0; }
return bint;
}
struct BInt* CreateFBInt(int n) {
struct BInt *bi = CreateBInt(n);
for (int i = 0; i < bi->size; i++) { bi->bytes[i] = 255; }
return bi;
}
void SetBIntn(struct BInt *bint, int n) {
int byten = n >> 3;
int offset = n % 8;
byte bt = 1 << offset;
byte *tbyte = bint->bytes + byten;
*tbyte |= bt;
}
void SetBInt(struct BInt *bint) { for (int i = 0; i < bint->size; i++) { bint->bytes[i] = 255; } }
void ClearBInt(struct BInt *bint) { for (int i = 0; i < bint->size; i++) { bint->bytes[i] = 0; } }
struct BInt* BIntByAndBInts(const struct BInt *bint1, const struct BInt *bint2) {
const struct BInt *sbi = bint1->size < bint2->size ? bint1 : bint2;
const struct BInt *lbi = sbi == bint1 ? bint2 : bint1;
struct BInt *bint = CreateBInt(lbi->size * 8);
for (int i = 0; i < lbi->size; i++) { bint->bytes[i] |= lbi->bytes[i]; }
for (int i = 0; i < sbi->size; i++) { bint->bytes[i] &= sbi->bytes[i]; }
return bint;
}
struct BInt* AndBInts(struct BInt *dest, const struct BInt *bint1, const struct BInt *bint2) {
const struct BInt *sbi = bint1->size < bint2->size ? bint1 : bint2;
const struct BInt *lbi = sbi == bint1 ? bint2 : bint1;
ClearBInt(dest);
for (int i = 0; i < lbi->size; i++) { dest->bytes[i] |= lbi->bytes[i]; }
for (int i = 0; i < sbi->size; i++) { dest->bytes[i] &= sbi->bytes[i]; }
return dest;
}
void BIntAnd(struct BInt *dest, const struct BInt *src) {
size_t size = dest->size < src->size ? dest->size : src->size;
for (int i = 0; i < size; i++) { dest->bytes[i] &= src->bytes[i]; }
}
struct BInt* BIntByOrBInts(const struct BInt *bint1, const struct BInt *bint2) {
const struct BInt *sbi = bint1->size < bint2->size ? bint1 : bint2;
const struct BInt *lbi = sbi == bint1 ? bint2 : bint1;
struct BInt *bint = CreateBInt(lbi->size << 3);
for (int i = 0; i < lbi->size; i++) { bint->bytes[i] |= lbi->bytes[i]; }
for (int i = 0; i < sbi->size; i++) { bint->bytes[i] |= sbi->bytes[i]; }
return bint;
}
void BIntOr(struct BInt *dest, const struct BInt *src) {
size_t size = dest->size < src->size ? dest->size : src->size;
for (int i = 0; i < size; i++) { dest->bytes[i] |= src->bytes[i]; }
}
void RShiftBIntOneBit(struct BInt *bint) {
for (int i = 0; i < bint->size; i++) {
int b = bint->bytes[i];
if (b != 0) {
if (b == 128) {
ClearBInt(bint);
SetBIntn(bint, (i + 1) << 3);
} else {
bint->bytes[i] <<= 1;
}
break;
}
}
}
struct BInt* BIntByRShiftOne(int bn, int n) {
struct BInt *bint = CreateBInt(n);
SetBIntn(bint, bn);
return bint;
}
int CompareBInts(const struct BInt *bint1, const struct BInt *bint2) {
for (int i = bint1->size; i >= 0; i--) {
int r = bint1->bytes[i] - bint2->bytes[i];
if (r != 0) { return r; }
}
return 0;
}
int IsBIntsZero(const struct BInt *bint) {
for (int i = 0; i < bint->size; i++) { if (bint->bytes[i] != 0) { return 0; } }
return 1;
}
int TestBIntAt(const struct BInt *bint, int n) {
int one = 0;
int byten = n >> 3;
int offset = n % 8;
byte bt = 1 << offset;
byte *tbyte = bint->bytes + byten;
return *tbyte & bt;
}
struct BInt*** computeWTB(char **words, int n) {
struct BInt ***WTB = (struct BInt ***)malloc(R * sizeof(*WTB));
int l = strlen(words[0]);
for (int i = 0; i < R; i++) {
WTB[i] = (struct BInt **)malloc(l * sizeof(struct BInt*));
for (int j = 0; j < l; j++) { WTB[i][j] = CreateBInt(n); }
}
for (int i = 0; i < n; i++) {
for (int j = 0; j < l; j++) {
char c = words[i][j] - 'a';
struct BInt* bi = WTB[c][j];
BIntOr(bi, BIntByRShiftOne(i, n));
}
}
return WTB;
}
int MostSignificentBitOfBInt(struct BInt *bint) {
int bitn = (bint->size << 3) - 1;
int cs = 128;
for (int i = bint->size - 1; i >= 0; i++) {
if (bint->bytes[i] != 0) {
int bi = 7;
while (!(cs & bint->bytes[i])) {
cs >>= 1;
bi--;
}
return i * 8 + bi;
}
}
return -1;
}
int LeastSignificentBitOfBInt(struct BInt *bint) {
int bitn = 0;
int cs = 1;
for (int i = 0; i < bint->size; i++) {
if (bint->bytes[i] != 0) {
while(!(cs & bint->bytes[i])) {
cs <<= 1;
bitn++;
}
return (i << 3) + bitn;
}
}
return (bint->size - 1) << 3;
}
void PrintBInt(const struct BInt *bint) { for (size_t i = 0; i < bint->size; i++) { printf("BInt(%lu): %X\n", i, (bint->bytes)[i]); } }
void ClearLocations(int *a, int n) { for (int i = 0; i < n; i++) { a[i] = -1; } }
int* findSubstring(char * s, char ** words, int wordsSize, int* returnSize){
int n = wordsSize;
int sl = strlen(s);//sl, string length
int wl = strlen(words[0]);//wl, word length
int tl = wl * n;//tl, total length of words
int *ids = (int*)malloc((sl - tl + 1) * sizeof(*ids));
int *ptr = ids, id = 0;
struct BInt ***WTB = computeWTB(words, n);//WTB, word table
struct BInt* wc = CreateFBInt(n);//wc, word code
struct BInt* sum = CreateBInt(n);//sum of wc that's matched
struct BInt *and = CreateBInt(n);
int *loc = (int*)malloc(n * sizeof(*loc));
for (int i = 0; i < n; i++) { loc[i] = -1; }
for (int i = 0; i < sl; i++) {
int ci = (i - id) % wl;
struct BInt* cc = WTB[s[i] - 'a'][ci];//cc, character code
if (IsBIntsZero(cc)) {//no word has an s[i] at ci
i = id++;
ClearBInt(sum);
SetBInt(wc);
} else {
BIntAnd(wc, cc);
if (IsBIntsZero(wc)) {//combination of characters not found in word
i = id++;
ClearBInt(sum);
SetBInt(wc);
ClearLocations(loc, n);
} else {
if (ci + 1 == wl) {
if (!CompareBInts(wc, AndBInts(and, wc, sum))) {//too many presences
i = id++;
ClearBInt(sum);
SetBInt(wc);
ClearLocations(loc, n);
} else {//found
if (i + 1 == tl + id) {//all matched
*(ptr++) = id;
ClearBInt(sum);
SetBInt(wc);
ClearLocations(loc, n);
i = id++;
} else {
int lsb = LeastSignificentBitOfBInt(wc);
//bi, bit index into sum to be set
int bi = loc[lsb] == -1 ? 0 : loc[lsb];
while (bi < n) {
if (TestBIntAt(wc, bi) && !TestBIntAt(sum, bi)) {
SetBIntn(sum, bi);
loc[lsb] = bi;
break;
}
bi++;
}
}
}
SetBInt(wc);
}
}
}
}
if (returnSize != NULL) { *returnSize = ptr - ids; }
return ids;
}
网友评论