1/* kwset.c - search for any of a set of keywords. 2 Copyright 1989, 1998, 2000, 2005 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 2, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 17 02110-1301, USA. */ 18 19/* Written August 1989 by Mike Haertel. 20 The author may be reached (Email) at the address mike@ai.mit.edu, 21 or (US mail) as Mike Haertel c/o Free Software Foundation. */ 22 23/* The algorithm implemented by these routines bears a startling resemblence 24 to one discovered by Beate Commentz-Walter, although it is not identical. 25 See "A String Matching Algorithm Fast on the Average," Technical Report, 26 IBM-Germany, Scientific Center Heidelberg, Tiergartenstrasse 15, D-6900 27 Heidelberg, Germany. See also Aho, A.V., and M. Corasick, "Efficient 28 String Matching: An Aid to Bibliographic Search," CACM June 1975, 29 Vol. 18, No. 6, which describes the failure function used below. */ 30 31#ifdef HAVE_CONFIG_H 32# include <config.h> 33#endif 34#include <sys/types.h> 35#include "system.h" 36#include "kwset.h" 37#include "obstack.h" 38 39#ifdef GREP 40extern char *xmalloc(); 41# undef malloc 42# define malloc xmalloc 43#endif 44 45#define NCHAR (UCHAR_MAX + 1) 46#define obstack_chunk_alloc malloc 47#define obstack_chunk_free free 48 49#define U(c) ((unsigned char) (c)) 50 51/* Balanced tree of edges and labels leaving a given trie node. */ 52struct tree 53{ 54 struct tree *llink; /* Left link; MUST be first field. */ 55 struct tree *rlink; /* Right link (to larger labels). */ 56 struct trie *trie; /* Trie node pointed to by this edge. */ 57 unsigned char label; /* Label on this edge. */ 58 char balance; /* Difference in depths of subtrees. */ 59}; 60 61/* Node of a trie representing a set of reversed keywords. */ 62struct trie 63{ 64 unsigned int accepting; /* Word index of accepted word, or zero. */ 65 struct tree *links; /* Tree of edges leaving this node. */ 66 struct trie *parent; /* Parent of this node. */ 67 struct trie *next; /* List of all trie nodes in level order. */ 68 struct trie *fail; /* Aho-Corasick failure function. */ 69 int depth; /* Depth of this node from the root. */ 70 int shift; /* Shift function for search failures. */ 71 int maxshift; /* Max shift of self and descendents. */ 72}; 73 74/* Structure returned opaquely to the caller, containing everything. */ 75struct kwset 76{ 77 struct obstack obstack; /* Obstack for node allocation. */ 78 int words; /* Number of words in the trie. */ 79 struct trie *trie; /* The trie itself. */ 80 int mind; /* Minimum depth of an accepting node. */ 81 int maxd; /* Maximum depth of any node. */ 82 unsigned char delta[NCHAR]; /* Delta table for rapid search. */ 83 struct trie *next[NCHAR]; /* Table of children of the root. */ 84 char *target; /* Target string if there's only one. */ 85 int mind2; /* Used in Boyer-Moore search for one string. */ 86 char const *trans; /* Character translation table. */ 87}; 88 89/* Allocate and initialize a keyword set object, returning an opaque 90 pointer to it. Return NULL if memory is not available. */ 91kwset_t 92kwsalloc (char const *trans) 93{ 94 struct kwset *kwset; 95 96 kwset = (struct kwset *) malloc(sizeof (struct kwset)); 97 if (!kwset) 98 return NULL; 99 100 obstack_init(&kwset->obstack); 101 kwset->words = 0; 102 kwset->trie 103 = (struct trie *) obstack_alloc(&kwset->obstack, sizeof (struct trie)); 104 if (!kwset->trie) 105 { 106 kwsfree((kwset_t) kwset); 107 return NULL; 108 } 109 kwset->trie->accepting = 0; 110 kwset->trie->links = NULL; 111 kwset->trie->parent = NULL; 112 kwset->trie->next = NULL; 113 kwset->trie->fail = NULL; 114 kwset->trie->depth = 0; 115 kwset->trie->shift = 0; 116 kwset->mind = INT_MAX; 117 kwset->maxd = -1; 118 kwset->target = NULL; 119 kwset->trans = trans; 120 121 return (kwset_t) kwset; 122} 123 124/* This upper bound is valid for CHAR_BIT >= 4 and 125 exact for CHAR_BIT in { 4..11, 13, 15, 17, 19 }. */ 126#define DEPTH_SIZE (CHAR_BIT + CHAR_BIT/2) 127 128/* Add the given string to the contents of the keyword set. Return NULL 129 for success, an error message otherwise. */ 130const char * 131kwsincr (kwset_t kws, char const *text, size_t len) 132{ 133 struct kwset *kwset; 134 register struct trie *trie; 135 register unsigned char label; 136 register struct tree *link; 137 register int depth; 138 struct tree *links[DEPTH_SIZE]; 139 enum { L, R } dirs[DEPTH_SIZE]; 140 struct tree *t, *r, *l, *rl, *lr; 141 142 kwset = (struct kwset *) kws; 143 trie = kwset->trie; 144 text += len; 145 146 /* Descend the trie (built of reversed keywords) character-by-character, 147 installing new nodes when necessary. */ 148 while (len--) 149 { 150 label = kwset->trans ? kwset->trans[U(*--text)] : *--text; 151 152 /* Descend the tree of outgoing links for this trie node, 153 looking for the current character and keeping track 154 of the path followed. */ 155 link = trie->links; 156 links[0] = (struct tree *) &trie->links; 157 dirs[0] = L; 158 depth = 1; 159 160 while (link && label != link->label) 161 { 162 links[depth] = link; 163 if (label < link->label) 164 dirs[depth++] = L, link = link->llink; 165 else 166 dirs[depth++] = R, link = link->rlink; 167 } 168 169 /* The current character doesn't have an outgoing link at 170 this trie node, so build a new trie node and install 171 a link in the current trie node's tree. */ 172 if (!link) 173 { 174 link = (struct tree *) obstack_alloc(&kwset->obstack, 175 sizeof (struct tree)); 176 if (!link) 177 return _("memory exhausted"); 178 link->llink = NULL; 179 link->rlink = NULL; 180 link->trie = (struct trie *) obstack_alloc(&kwset->obstack, 181 sizeof (struct trie)); 182 if (!link->trie) 183 { 184 obstack_free(&kwset->obstack, link); 185 return _("memory exhausted"); 186 } 187 link->trie->accepting = 0; 188 link->trie->links = NULL; 189 link->trie->parent = trie; 190 link->trie->next = NULL; 191 link->trie->fail = NULL; 192 link->trie->depth = trie->depth + 1; 193 link->trie->shift = 0; 194 link->label = label; 195 link->balance = 0; 196 197 /* Install the new tree node in its parent. */ 198 if (dirs[--depth] == L) 199 links[depth]->llink = link; 200 else 201 links[depth]->rlink = link; 202 203 /* Back up the tree fixing the balance flags. */ 204 while (depth && !links[depth]->balance) 205 { 206 if (dirs[depth] == L) 207 --links[depth]->balance; 208 else 209 ++links[depth]->balance; 210 --depth; 211 } 212 213 /* Rebalance the tree by pointer rotations if necessary. */ 214 if (depth && ((dirs[depth] == L && --links[depth]->balance) 215 || (dirs[depth] == R && ++links[depth]->balance))) 216 { 217 switch (links[depth]->balance) 218 { 219 case (char) -2: 220 switch (dirs[depth + 1]) 221 { 222 case L: 223 r = links[depth], t = r->llink, rl = t->rlink; 224 t->rlink = r, r->llink = rl; 225 t->balance = r->balance = 0; 226 break; 227 case R: 228 r = links[depth], l = r->llink, t = l->rlink; 229 rl = t->rlink, lr = t->llink; 230 t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl; 231 l->balance = t->balance != 1 ? 0 : -1; 232 r->balance = t->balance != (char) -1 ? 0 : 1; 233 t->balance = 0; 234 break; 235 default: 236 abort (); 237 } 238 break; 239 case 2: 240 switch (dirs[depth + 1]) 241 { 242 case R: 243 l = links[depth], t = l->rlink, lr = t->llink; 244 t->llink = l, l->rlink = lr; 245 t->balance = l->balance = 0; 246 break; 247 case L: 248 l = links[depth], r = l->rlink, t = r->llink; 249 lr = t->llink, rl = t->rlink; 250 t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl; 251 l->balance = t->balance != 1 ? 0 : -1; 252 r->balance = t->balance != (char) -1 ? 0 : 1; 253 t->balance = 0; 254 break; 255 default: 256 abort (); 257 } 258 break; 259 default: 260 abort (); 261 } 262 263 if (dirs[depth - 1] == L) 264 links[depth - 1]->llink = t; 265 else 266 links[depth - 1]->rlink = t; 267 } 268 } 269 270 trie = link->trie; 271 } 272 273 /* Mark the node we finally reached as accepting, encoding the 274 index number of this word in the keyword set so far. */ 275 if (!trie->accepting) 276 trie->accepting = 1 + 2 * kwset->words; 277 ++kwset->words; 278 279 /* Keep track of the longest and shortest string of the keyword set. */ 280 if (trie->depth < kwset->mind) 281 kwset->mind = trie->depth; 282 if (trie->depth > kwset->maxd) 283 kwset->maxd = trie->depth; 284 285 return NULL; 286} 287 288/* Enqueue the trie nodes referenced from the given tree in the 289 given queue. */ 290static void 291enqueue (struct tree *tree, struct trie **last) 292{ 293 if (!tree) 294 return; 295 enqueue(tree->llink, last); 296 enqueue(tree->rlink, last); 297 (*last) = (*last)->next = tree->trie; 298} 299 300/* Compute the Aho-Corasick failure function for the trie nodes referenced 301 from the given tree, given the failure function for their parent as 302 well as a last resort failure node. */ 303static void 304treefails (register struct tree const *tree, struct trie const *fail, 305 struct trie *recourse) 306{ 307 register struct tree *link; 308 309 if (!tree) 310 return; 311 312 treefails(tree->llink, fail, recourse); 313 treefails(tree->rlink, fail, recourse); 314 315 /* Find, in the chain of fails going back to the root, the first 316 node that has a descendent on the current label. */ 317 while (fail) 318 { 319 link = fail->links; 320 while (link && tree->label != link->label) 321 if (tree->label < link->label) 322 link = link->llink; 323 else 324 link = link->rlink; 325 if (link) 326 { 327 tree->trie->fail = link->trie; 328 return; 329 } 330 fail = fail->fail; 331 } 332 333 tree->trie->fail = recourse; 334} 335 336/* Set delta entries for the links of the given tree such that 337 the preexisting delta value is larger than the current depth. */ 338static void 339treedelta (register struct tree const *tree, 340 register unsigned int depth, 341 unsigned char delta[]) 342{ 343 if (!tree) 344 return; 345 treedelta(tree->llink, depth, delta); 346 treedelta(tree->rlink, depth, delta); 347 if (depth < delta[tree->label]) 348 delta[tree->label] = depth; 349} 350 351/* Return true if A has every label in B. */ 352static int 353hasevery (register struct tree const *a, register struct tree const *b) 354{ 355 if (!b) 356 return 1; 357 if (!hasevery(a, b->llink)) 358 return 0; 359 if (!hasevery(a, b->rlink)) 360 return 0; 361 while (a && b->label != a->label) 362 if (b->label < a->label) 363 a = a->llink; 364 else 365 a = a->rlink; 366 return !!a; 367} 368 369/* Compute a vector, indexed by character code, of the trie nodes 370 referenced from the given tree. */ 371static void 372treenext (struct tree const *tree, struct trie *next[]) 373{ 374 if (!tree) 375 return; 376 treenext(tree->llink, next); 377 treenext(tree->rlink, next); 378 next[tree->label] = tree->trie; 379} 380 381/* Compute the shift for each trie node, as well as the delta 382 table and next cache for the given keyword set. */ 383const char * 384kwsprep (kwset_t kws) 385{ 386 register struct kwset *kwset; 387 register int i; 388 register struct trie *curr; 389 register char const *trans; 390 unsigned char delta[NCHAR]; 391 392 kwset = (struct kwset *) kws; 393 394 /* Initial values for the delta table; will be changed later. The 395 delta entry for a given character is the smallest depth of any 396 node at which an outgoing edge is labeled by that character. */ 397 memset(delta, kwset->mind < UCHAR_MAX ? kwset->mind : UCHAR_MAX, NCHAR); 398 399 /* Check if we can use the simple boyer-moore algorithm, instead 400 of the hairy commentz-walter algorithm. */ 401 if (kwset->words == 1 && kwset->trans == NULL) 402 { 403 char c; 404 405 /* Looking for just one string. Extract it from the trie. */ 406 kwset->target = obstack_alloc(&kwset->obstack, kwset->mind); 407 if (!kwset->target) 408 return _("memory exhausted"); 409 for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i) 410 { 411 kwset->target[i] = curr->links->label; 412 curr = curr->links->trie; 413 } 414 /* Build the Boyer Moore delta. Boy that's easy compared to CW. */ 415 for (i = 0; i < kwset->mind; ++i) 416 delta[U(kwset->target[i])] = kwset->mind - (i + 1); 417 /* Find the minimal delta2 shift that we might make after 418 a backwards match has failed. */ 419 c = kwset->target[kwset->mind - 1]; 420 for (i = kwset->mind - 2; i >= 0; --i) 421 if (kwset->target[i] == c) 422 break; 423 kwset->mind2 = kwset->mind - (i + 1); 424 } 425 else 426 { 427 register struct trie *fail; 428 struct trie *last, *next[NCHAR]; 429 430 /* Traverse the nodes of the trie in level order, simultaneously 431 computing the delta table, failure function, and shift function. */ 432 for (curr = last = kwset->trie; curr; curr = curr->next) 433 { 434 /* Enqueue the immediate descendents in the level order queue. */ 435 enqueue(curr->links, &last); 436 437 curr->shift = kwset->mind; 438 curr->maxshift = kwset->mind; 439 440 /* Update the delta table for the descendents of this node. */ 441 treedelta(curr->links, curr->depth, delta); 442 443 /* Compute the failure function for the decendents of this node. */ 444 treefails(curr->links, curr->fail, kwset->trie); 445 446 /* Update the shifts at each node in the current node's chain 447 of fails back to the root. */ 448 for (fail = curr->fail; fail; fail = fail->fail) 449 { 450 /* If the current node has some outgoing edge that the fail 451 doesn't, then the shift at the fail should be no larger 452 than the difference of their depths. */ 453 if (!hasevery(fail->links, curr->links)) 454 if (curr->depth - fail->depth < fail->shift) 455 fail->shift = curr->depth - fail->depth; 456 457 /* If the current node is accepting then the shift at the 458 fail and its descendents should be no larger than the 459 difference of their depths. */ 460 if (curr->accepting && fail->maxshift > curr->depth - fail->depth) 461 fail->maxshift = curr->depth - fail->depth; 462 } 463 } 464 465 /* Traverse the trie in level order again, fixing up all nodes whose 466 shift exceeds their inherited maxshift. */ 467 for (curr = kwset->trie->next; curr; curr = curr->next) 468 { 469 if (curr->maxshift > curr->parent->maxshift) 470 curr->maxshift = curr->parent->maxshift; 471 if (curr->shift > curr->maxshift) 472 curr->shift = curr->maxshift; 473 } 474 475 /* Create a vector, indexed by character code, of the outgoing links 476 from the root node. */ 477 for (i = 0; i < NCHAR; ++i) 478 next[i] = NULL; 479 treenext(kwset->trie->links, next); 480 481 if ((trans = kwset->trans) != NULL) 482 for (i = 0; i < NCHAR; ++i) 483 kwset->next[i] = next[U(trans[i])]; 484 else 485 memcpy(kwset->next, next, NCHAR * sizeof(struct trie *)); 486 } 487 488 /* Fix things up for any translation table. */ 489 if ((trans = kwset->trans) != NULL) 490 for (i = 0; i < NCHAR; ++i) 491 kwset->delta[i] = delta[U(trans[i])]; 492 else 493 memcpy(kwset->delta, delta, NCHAR); 494 495 return NULL; 496} 497 498/* Fast boyer-moore search. */ 499static size_t 500bmexec (kwset_t kws, char const *text, size_t size) 501{ 502 struct kwset const *kwset; 503 register unsigned char const *d1; 504 register char const *ep, *sp, *tp; 505 register int d, gc, i, len, md2; 506 507 kwset = (struct kwset const *) kws; 508 len = kwset->mind; 509 510 if (len == 0) 511 return 0; 512 if (len > size) 513 return -1; 514 if (len == 1) 515 { 516 tp = memchr (text, kwset->target[0], size); 517 return tp ? tp - text : -1; 518 } 519 520 d1 = kwset->delta; 521 sp = kwset->target + len; 522 gc = U(sp[-2]); 523 md2 = kwset->mind2; 524 tp = text + len; 525 526 /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */ 527 if (size > 12 * len) 528 /* 11 is not a bug, the initial offset happens only once. */ 529 for (ep = text + size - 11 * len;;) 530 { 531 while (tp <= ep) 532 { 533 d = d1[U(tp[-1])], tp += d; 534 d = d1[U(tp[-1])], tp += d; 535 if (d == 0) 536 goto found; 537 d = d1[U(tp[-1])], tp += d; 538 d = d1[U(tp[-1])], tp += d; 539 d = d1[U(tp[-1])], tp += d; 540 if (d == 0) 541 goto found; 542 d = d1[U(tp[-1])], tp += d; 543 d = d1[U(tp[-1])], tp += d; 544 d = d1[U(tp[-1])], tp += d; 545 if (d == 0) 546 goto found; 547 d = d1[U(tp[-1])], tp += d; 548 d = d1[U(tp[-1])], tp += d; 549 } 550 break; 551 found: 552 if (U(tp[-2]) == gc) 553 { 554 for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i) 555 ; 556 if (i > len) 557 return tp - len - text; 558 } 559 tp += md2; 560 } 561 562 /* Now we have only a few characters left to search. We 563 carefully avoid ever producing an out-of-bounds pointer. */ 564 ep = text + size; 565 d = d1[U(tp[-1])]; 566 while (d <= ep - tp) 567 { 568 d = d1[U((tp += d)[-1])]; 569 if (d != 0) 570 continue; 571 if (U(tp[-2]) == gc) 572 { 573 for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i) 574 ; 575 if (i > len) 576 return tp - len - text; 577 } 578 d = md2; 579 } 580 581 return -1; 582} 583 584/* Hairy multiple string search. */ 585static size_t 586cwexec (kwset_t kws, char const *text, size_t len, struct kwsmatch *kwsmatch) 587{ 588 struct kwset const *kwset; 589 struct trie * const *next; 590 struct trie const *trie; 591 struct trie const *accept; 592 char const *beg, *lim, *mch, *lmch; 593 register unsigned char c; 594 register unsigned char const *delta; 595 register int d; 596 register char const *end, *qlim; 597 register struct tree const *tree; 598 register char const *trans; 599 600#ifdef lint 601 accept = NULL; 602#endif 603 604 /* Initialize register copies and look for easy ways out. */ 605 kwset = (struct kwset *) kws; 606 if (len < kwset->mind) 607 return -1; 608 next = kwset->next; 609 delta = kwset->delta; 610 trans = kwset->trans; 611 lim = text + len; 612 end = text; 613 if ((d = kwset->mind) != 0) 614 mch = NULL; 615 else 616 { 617 mch = text, accept = kwset->trie; 618 goto match; 619 } 620 621 if (len >= 4 * kwset->mind) 622 qlim = lim - 4 * kwset->mind; 623 else 624 qlim = NULL; 625 626 while (lim - end >= d) 627 { 628 if (qlim && end <= qlim) 629 { 630 end += d - 1; 631 while ((d = delta[c = *end]) && end < qlim) 632 { 633 end += d; 634 end += delta[U(*end)]; 635 end += delta[U(*end)]; 636 } 637 ++end; 638 } 639 else 640 d = delta[c = (end += d)[-1]]; 641 if (d) 642 continue; 643 beg = end - 1; 644 trie = next[c]; 645 if (trie->accepting) 646 { 647 mch = beg; 648 accept = trie; 649 } 650 d = trie->shift; 651 while (beg > text) 652 { 653 c = trans ? trans[U(*--beg)] : *--beg; 654 tree = trie->links; 655 while (tree && c != tree->label) 656 if (c < tree->label) 657 tree = tree->llink; 658 else 659 tree = tree->rlink; 660 if (tree) 661 { 662 trie = tree->trie; 663 if (trie->accepting) 664 { 665 mch = beg; 666 accept = trie; 667 } 668 } 669 else 670 break; 671 d = trie->shift; 672 } 673 if (mch) 674 goto match; 675 } 676 return -1; 677 678 match: 679 /* Given a known match, find the longest possible match anchored 680 at or before its starting point. This is nearly a verbatim 681 copy of the preceding main search loops. */ 682 if (lim - mch > kwset->maxd) 683 lim = mch + kwset->maxd; 684 lmch = 0; 685 d = 1; 686 while (lim - end >= d) 687 { 688 if ((d = delta[c = (end += d)[-1]]) != 0) 689 continue; 690 beg = end - 1; 691 if (!(trie = next[c])) 692 { 693 d = 1; 694 continue; 695 } 696 if (trie->accepting && beg <= mch) 697 { 698 lmch = beg; 699 accept = trie; 700 } 701 d = trie->shift; 702 while (beg > text) 703 { 704 c = trans ? trans[U(*--beg)] : *--beg; 705 tree = trie->links; 706 while (tree && c != tree->label) 707 if (c < tree->label) 708 tree = tree->llink; 709 else 710 tree = tree->rlink; 711 if (tree) 712 { 713 trie = tree->trie; 714 if (trie->accepting && beg <= mch) 715 { 716 lmch = beg; 717 accept = trie; 718 } 719 } 720 else 721 break; 722 d = trie->shift; 723 } 724 if (lmch) 725 { 726 mch = lmch; 727 goto match; 728 } 729 if (!d) 730 d = 1; 731 } 732 733 if (kwsmatch) 734 { 735 kwsmatch->index = accept->accepting / 2; 736 kwsmatch->offset[0] = mch - text; 737 kwsmatch->size[0] = accept->depth; 738 } 739 return mch - text; 740} 741 742/* Search through the given text for a match of any member of the 743 given keyword set. Return a pointer to the first character of 744 the matching substring, or NULL if no match is found. If FOUNDLEN 745 is non-NULL store in the referenced location the length of the 746 matching substring. Similarly, if FOUNDIDX is non-NULL, store 747 in the referenced location the index number of the particular 748 keyword matched. */ 749size_t 750kwsexec (kwset_t kws, char const *text, size_t size, 751 struct kwsmatch *kwsmatch) 752{ 753 struct kwset const *kwset = (struct kwset *) kws; 754 if (kwset->words == 1 && kwset->trans == NULL) 755 { 756 size_t ret = bmexec (kws, text, size); 757 if (kwsmatch != NULL && ret != (size_t) -1) 758 { 759 kwsmatch->index = 0; 760 kwsmatch->offset[0] = ret; 761 kwsmatch->size[0] = kwset->mind; 762 } 763 return ret; 764 } 765 else 766 return cwexec(kws, text, size, kwsmatch); 767} 768 769/* Free the components of the given keyword set. */ 770void 771kwsfree (kwset_t kws) 772{ 773 struct kwset *kwset; 774 775 kwset = (struct kwset *) kws; 776 obstack_free(&kwset->obstack, NULL); 777 free(kws); 778}