1/* 2 * This file has been copied from commit e7ac713d^ in the GNU grep git 3 * repository. A few small changes have been made to adapt the code to 4 * Git. 5 */ 6 7/* kwset.c - search for any of a set of keywords. 8 Copyright 1989, 1998, 2000, 2005 Free Software Foundation, Inc. 9 10 This program is free software; you can redistribute it and/or modify 11 it under the terms of the GNU General Public License as published by 12 the Free Software Foundation; either version 2, or (at your option) 13 any later version. 14 15 This program is distributed in the hope that it will be useful, 16 but WITHOUT ANY WARRANTY; without even the implied warranty of 17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 GNU General Public License for more details. 19 20 You should have received a copy of the GNU General Public License 21 along with this program; if not, see <http://www.gnu.org/licenses/>. */ 22 23/* Written August 1989 by Mike Haertel. 24 The author may be reached (Email) at the address mike@ai.mit.edu, 25 or (US mail) as Mike Haertel c/o Free Software Foundation. */ 26 27/* The algorithm implemented by these routines bears a startling resemblance 28 to one discovered by Beate Commentz-Walter, although it is not identical. 29 See "A String Matching Algorithm Fast on the Average," Technical Report, 30 IBM-Germany, Scientific Center Heidelberg, Tiergartenstrasse 15, D-6900 31 Heidelberg, Germany. See also Aho, A.V., and M. Corasick, "Efficient 32 String Matching: An Aid to Bibliographic Search," CACM June 1975, 33 Vol. 18, No. 6, which describes the failure function used below. */ 34 35#include"cache.h" 36 37#include"kwset.h" 38#include"compat/obstack.h" 39 40#define NCHAR (UCHAR_MAX + 1) 41/* adapter for `xmalloc()`, which takes `size_t`, not `long` */ 42static void*obstack_chunk_alloc(long size) 43{ 44if(size <0) 45BUG("Cannot allocate a negative amount:%ld", size); 46returnxmalloc(size); 47} 48#define obstack_chunk_free free 49 50#define U(c) ((unsigned char) (c)) 51 52/* Balanced tree of edges and labels leaving a given trie node. */ 53struct tree 54{ 55struct tree *llink;/* Left link; MUST be first field. */ 56struct tree *rlink;/* Right link (to larger labels). */ 57struct trie *trie;/* Trie node pointed to by this edge. */ 58unsigned char label;/* Label on this edge. */ 59char balance;/* Difference in depths of subtrees. */ 60}; 61 62/* Node of a trie representing a set of reversed keywords. */ 63struct trie 64{ 65unsigned int accepting;/* Word index of accepted word, or zero. */ 66struct tree *links;/* Tree of edges leaving this node. */ 67struct trie *parent;/* Parent of this node. */ 68struct trie *next;/* List of all trie nodes in level order. */ 69struct trie *fail;/* Aho-Corasick failure function. */ 70int depth;/* Depth of this node from the root. */ 71int shift;/* Shift function for search failures. */ 72int maxshift;/* Max shift of self and descendants. */ 73}; 74 75/* Structure returned opaquely to the caller, containing everything. */ 76struct kwset 77{ 78struct obstack obstack;/* Obstack for node allocation. */ 79int words;/* Number of words in the trie. */ 80struct trie *trie;/* The trie itself. */ 81int mind;/* Minimum depth of an accepting node. */ 82int maxd;/* Maximum depth of any node. */ 83unsigned char delta[NCHAR];/* Delta table for rapid search. */ 84struct trie *next[NCHAR];/* Table of children of the root. */ 85char*target;/* Target string if there's only one. */ 86int mind2;/* Used in Boyer-Moore search for one string. */ 87unsigned char const*trans;/* Character translation table. */ 88}; 89 90/* Allocate and initialize a keyword set object, returning an opaque 91 pointer to it. Return NULL if memory is not available. */ 92kwset_t 93kwsalloc(unsigned char const*trans) 94{ 95struct kwset *kwset; 96 97 kwset = (struct kwset *)xmalloc(sizeof(struct kwset)); 98 99obstack_init(&kwset->obstack); 100 kwset->words =0; 101 kwset->trie 102= (struct trie *)obstack_alloc(&kwset->obstack,sizeof(struct trie)); 103if(!kwset->trie) 104{ 105kwsfree((kwset_t) kwset); 106return NULL; 107} 108 kwset->trie->accepting =0; 109 kwset->trie->links = NULL; 110 kwset->trie->parent = NULL; 111 kwset->trie->next = NULL; 112 kwset->trie->fail = NULL; 113 kwset->trie->depth =0; 114 kwset->trie->shift =0; 115 kwset->mind = INT_MAX; 116 kwset->maxd = -1; 117 kwset->target = NULL; 118 kwset->trans = trans; 119 120return(kwset_t) kwset; 121} 122 123/* This upper bound is valid for CHAR_BIT >= 4 and 124 exact for CHAR_BIT in { 4..11, 13, 15, 17, 19 }. */ 125#define DEPTH_SIZE (CHAR_BIT + CHAR_BIT/2) 126 127/* Add the given string to the contents of the keyword set. Return NULL 128 for success, an error message otherwise. */ 129const char* 130kwsincr(kwset_t kws,char const*text,size_t len) 131{ 132struct kwset *kwset; 133registerstruct trie *trie; 134registerunsigned char label; 135registerstruct tree *link; 136registerint depth; 137struct tree *links[DEPTH_SIZE]; 138enum{ L, R } dirs[DEPTH_SIZE]; 139struct tree *t, *r, *l, *rl, *lr; 140 141 kwset = (struct kwset *) kws; 142 trie = kwset->trie; 143 text += len; 144 145/* Descend the trie (built of reversed keywords) character-by-character, 146 installing new nodes when necessary. */ 147while(len--) 148{ 149 label = kwset->trans ? kwset->trans[U(*--text)] : *--text; 150 151/* Descend the tree of outgoing links for this trie node, 152 looking for the current character and keeping track 153 of the path followed. */ 154 link = trie->links; 155 links[0] = (struct tree *) &trie->links; 156 dirs[0] = L; 157 depth =1; 158 159while(link && label != link->label) 160{ 161 links[depth] = link; 162if(label < link->label) 163 dirs[depth++] = L, link = link->llink; 164else 165 dirs[depth++] = R, link = link->rlink; 166} 167 168/* The current character doesn't have an outgoing link at 169 this trie node, so build a new trie node and install 170 a link in the current trie node's tree. */ 171if(!link) 172{ 173 link = (struct tree *)obstack_alloc(&kwset->obstack, 174sizeof(struct tree)); 175if(!link) 176return"memory exhausted"; 177 link->llink = NULL; 178 link->rlink = NULL; 179 link->trie = (struct trie *)obstack_alloc(&kwset->obstack, 180sizeof(struct trie)); 181if(!link->trie) 182{ 183obstack_free(&kwset->obstack, link); 184return"memory exhausted"; 185} 186 link->trie->accepting =0; 187 link->trie->links = NULL; 188 link->trie->parent = trie; 189 link->trie->next = NULL; 190 link->trie->fail = NULL; 191 link->trie->depth = trie->depth +1; 192 link->trie->shift =0; 193 link->label = label; 194 link->balance =0; 195 196/* Install the new tree node in its parent. */ 197if(dirs[--depth] == L) 198 links[depth]->llink = link; 199else 200 links[depth]->rlink = link; 201 202/* Back up the tree fixing the balance flags. */ 203while(depth && !links[depth]->balance) 204{ 205if(dirs[depth] == L) 206--links[depth]->balance; 207else 208++links[depth]->balance; 209--depth; 210} 211 212/* Rebalance the tree by pointer rotations if necessary. */ 213if(depth && ((dirs[depth] == L && --links[depth]->balance) 214|| (dirs[depth] == R && ++links[depth]->balance))) 215{ 216switch(links[depth]->balance) 217{ 218case(char) -2: 219switch(dirs[depth +1]) 220{ 221case L: 222 r = links[depth], t = r->llink, rl = t->rlink; 223 t->rlink = r, r->llink = rl; 224 t->balance = r->balance =0; 225break; 226case R: 227 r = links[depth], l = r->llink, t = l->rlink; 228 rl = t->rlink, lr = t->llink; 229 t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl; 230 l->balance = t->balance !=1?0: -1; 231 r->balance = t->balance != (char) -1?0:1; 232 t->balance =0; 233break; 234default: 235abort(); 236} 237break; 238case2: 239switch(dirs[depth +1]) 240{ 241case R: 242 l = links[depth], t = l->rlink, lr = t->llink; 243 t->llink = l, l->rlink = lr; 244 t->balance = l->balance =0; 245break; 246case L: 247 l = links[depth], r = l->rlink, t = r->llink; 248 lr = t->llink, rl = t->rlink; 249 t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl; 250 l->balance = t->balance !=1?0: -1; 251 r->balance = t->balance != (char) -1?0:1; 252 t->balance =0; 253break; 254default: 255abort(); 256} 257break; 258default: 259abort(); 260} 261 262if(dirs[depth -1] == L) 263 links[depth -1]->llink = t; 264else 265 links[depth -1]->rlink = t; 266} 267} 268 269 trie = link->trie; 270} 271 272/* Mark the node we finally reached as accepting, encoding the 273 index number of this word in the keyword set so far. */ 274if(!trie->accepting) 275 trie->accepting =1+2* kwset->words; 276++kwset->words; 277 278/* Keep track of the longest and shortest string of the keyword set. */ 279if(trie->depth < kwset->mind) 280 kwset->mind = trie->depth; 281if(trie->depth > kwset->maxd) 282 kwset->maxd = trie->depth; 283 284return NULL; 285} 286 287/* Enqueue the trie nodes referenced from the given tree in the 288 given queue. */ 289static void 290enqueue(struct tree *tree,struct trie **last) 291{ 292if(!tree) 293return; 294enqueue(tree->llink, last); 295enqueue(tree->rlink, last); 296(*last) = (*last)->next = tree->trie; 297} 298 299/* Compute the Aho-Corasick failure function for the trie nodes referenced 300 from the given tree, given the failure function for their parent as 301 well as a last resort failure node. */ 302static void 303treefails(registerstruct tree const*tree,struct trie const*fail, 304struct trie *recourse) 305{ 306registerstruct tree *link; 307 308if(!tree) 309return; 310 311treefails(tree->llink, fail, recourse); 312treefails(tree->rlink, fail, recourse); 313 314/* Find, in the chain of fails going back to the root, the first 315 node that has a descendant on the current label. */ 316while(fail) 317{ 318 link = fail->links; 319while(link && tree->label != link->label) 320if(tree->label < link->label) 321 link = link->llink; 322else 323 link = link->rlink; 324if(link) 325{ 326 tree->trie->fail = link->trie; 327return; 328} 329 fail = fail->fail; 330} 331 332 tree->trie->fail = recourse; 333} 334 335/* Set delta entries for the links of the given tree such that 336 the preexisting delta value is larger than the current depth. */ 337static void 338treedelta(registerstruct tree const*tree, 339registerunsigned int depth, 340unsigned char delta[]) 341{ 342if(!tree) 343return; 344treedelta(tree->llink, depth, delta); 345treedelta(tree->rlink, depth, delta); 346if(depth < delta[tree->label]) 347 delta[tree->label] = depth; 348} 349 350/* Return true if A has every label in B. */ 351static int 352hasevery(registerstruct tree const*a,registerstruct tree const*b) 353{ 354if(!b) 355return1; 356if(!hasevery(a, b->llink)) 357return0; 358if(!hasevery(a, b->rlink)) 359return0; 360while(a && b->label != a->label) 361if(b->label < a->label) 362 a = a->llink; 363else 364 a = a->rlink; 365return!!a; 366} 367 368/* Compute a vector, indexed by character code, of the trie nodes 369 referenced from the given tree. */ 370static void 371treenext(struct tree const*tree,struct trie *next[]) 372{ 373if(!tree) 374return; 375treenext(tree->llink, next); 376treenext(tree->rlink, next); 377 next[tree->label] = tree->trie; 378} 379 380/* Compute the shift for each trie node, as well as the delta 381 table and next cache for the given keyword set. */ 382const char* 383kwsprep(kwset_t kws) 384{ 385registerstruct kwset *kwset; 386registerint i; 387registerstruct trie *curr; 388registerunsigned char const*trans; 389unsigned char delta[NCHAR]; 390 391 kwset = (struct kwset *) kws; 392 393/* Initial values for the delta table; will be changed later. The 394 delta entry for a given character is the smallest depth of any 395 node at which an outgoing edge is labeled by that character. */ 396memset(delta, kwset->mind < UCHAR_MAX ? kwset->mind : UCHAR_MAX, NCHAR); 397 398/* Check if we can use the simple boyer-moore algorithm, instead 399 of the hairy commentz-walter algorithm. */ 400if(kwset->words ==1&& kwset->trans == NULL) 401{ 402char c; 403 404/* Looking for just one string. Extract it from the trie. */ 405 kwset->target =obstack_alloc(&kwset->obstack, kwset->mind); 406if(!kwset->target) 407return"memory exhausted"; 408for(i = kwset->mind -1, curr = kwset->trie; i >=0; --i) 409{ 410 kwset->target[i] = curr->links->label; 411 curr = curr->links->trie; 412} 413/* Build the Boyer Moore delta. Boy that's easy compared to CW. */ 414for(i =0; i < kwset->mind; ++i) 415 delta[U(kwset->target[i])] = kwset->mind - (i +1); 416/* Find the minimal delta2 shift that we might make after 417 a backwards match has failed. */ 418 c = kwset->target[kwset->mind -1]; 419for(i = kwset->mind -2; i >=0; --i) 420if(kwset->target[i] == c) 421break; 422 kwset->mind2 = kwset->mind - (i +1); 423} 424else 425{ 426registerstruct trie *fail; 427struct trie *last, *next[NCHAR]; 428 429/* Traverse the nodes of the trie in level order, simultaneously 430 computing the delta table, failure function, and shift function. */ 431for(curr = last = kwset->trie; curr; curr = curr->next) 432{ 433/* Enqueue the immediate descendants in the level order queue. */ 434enqueue(curr->links, &last); 435 436 curr->shift = kwset->mind; 437 curr->maxshift = kwset->mind; 438 439/* Update the delta table for the descendants of this node. */ 440treedelta(curr->links, curr->depth, delta); 441 442/* Compute the failure function for the descendants of this node. */ 443treefails(curr->links, curr->fail, kwset->trie); 444 445/* Update the shifts at each node in the current node's chain 446 of fails back to the root. */ 447for(fail = curr->fail; fail; fail = fail->fail) 448{ 449/* If the current node has some outgoing edge that the fail 450 doesn't, then the shift at the fail should be no larger 451 than the difference of their depths. */ 452if(!hasevery(fail->links, curr->links)) 453if(curr->depth - fail->depth < fail->shift) 454 fail->shift = curr->depth - fail->depth; 455 456/* If the current node is accepting then the shift at the 457 fail and its descendants should be no larger than the 458 difference of their depths. */ 459if(curr->accepting && fail->maxshift > curr->depth - fail->depth) 460 fail->maxshift = curr->depth - fail->depth; 461} 462} 463 464/* Traverse the trie in level order again, fixing up all nodes whose 465 shift exceeds their inherited maxshift. */ 466for(curr = kwset->trie->next; curr; curr = curr->next) 467{ 468if(curr->maxshift > curr->parent->maxshift) 469 curr->maxshift = curr->parent->maxshift; 470if(curr->shift > curr->maxshift) 471 curr->shift = curr->maxshift; 472} 473 474/* Create a vector, indexed by character code, of the outgoing links 475 from the root node. */ 476for(i =0; i < NCHAR; ++i) 477 next[i] = NULL; 478treenext(kwset->trie->links, next); 479 480if((trans = kwset->trans) != NULL) 481for(i =0; i < NCHAR; ++i) 482 kwset->next[i] = next[U(trans[i])]; 483else 484COPY_ARRAY(kwset->next, next, NCHAR); 485} 486 487/* Fix things up for any translation table. */ 488if((trans = kwset->trans) != NULL) 489for(i =0; i < NCHAR; ++i) 490 kwset->delta[i] = delta[U(trans[i])]; 491else 492memcpy(kwset->delta, delta, NCHAR); 493 494return NULL; 495} 496 497/* Fast boyer-moore search. */ 498static size_t 499bmexec(kwset_t kws,char const*text,size_t size) 500{ 501struct kwset const*kwset; 502registerunsigned char const*d1; 503registerchar const*ep, *sp, *tp; 504registerint d, gc, i, len, md2; 505 506 kwset = (struct kwset const*) kws; 507 len = kwset->mind; 508 509if(len ==0) 510return0; 511if(len > size) 512return-1; 513if(len ==1) 514{ 515 tp =memchr(text, kwset->target[0], size); 516return tp ? tp - text : -1; 517} 518 519 d1 = kwset->delta; 520 sp = kwset->target + len; 521 gc =U(sp[-2]); 522 md2 = kwset->mind2; 523 tp = text + len; 524 525/* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */ 526if(size >12* len) 527/* 11 is not a bug, the initial offset happens only once. */ 528for(ep = text + size -11* len;;) 529{ 530while(tp <= ep) 531{ 532 d = d1[U(tp[-1])], tp += d; 533 d = d1[U(tp[-1])], tp += d; 534if(d ==0) 535goto found; 536 d = d1[U(tp[-1])], tp += d; 537 d = d1[U(tp[-1])], tp += d; 538 d = d1[U(tp[-1])], tp += d; 539if(d ==0) 540goto found; 541 d = d1[U(tp[-1])], tp += d; 542 d = d1[U(tp[-1])], tp += d; 543 d = d1[U(tp[-1])], tp += d; 544if(d ==0) 545goto found; 546 d = d1[U(tp[-1])], tp += d; 547 d = d1[U(tp[-1])], tp += d; 548} 549break; 550 found: 551if(U(tp[-2]) == gc) 552{ 553for(i =3; i <= len &&U(tp[-i]) ==U(sp[-i]); ++i) 554; 555if(i > len) 556return tp - len - text; 557} 558 tp += md2; 559} 560 561/* Now we have only a few characters left to search. We 562 carefully avoid ever producing an out-of-bounds pointer. */ 563 ep = text + size; 564 d = d1[U(tp[-1])]; 565while(d <= ep - tp) 566{ 567 d = d1[U((tp += d)[-1])]; 568if(d !=0) 569continue; 570if(U(tp[-2]) == gc) 571{ 572for(i =3; i <= len &&U(tp[-i]) ==U(sp[-i]); ++i) 573; 574if(i > len) 575return tp - len - text; 576} 577 d = md2; 578} 579 580return-1; 581} 582 583/* Hairy multiple string search. */ 584static size_t 585cwexec(kwset_t kws,char const*text,size_t len,struct kwsmatch *kwsmatch) 586{ 587struct kwset const*kwset; 588struct trie *const*next; 589struct trie const*trie; 590struct trie const*accept; 591char const*beg, *lim, *mch, *lmch; 592registerunsigned char c; 593registerunsigned char const*delta; 594registerint d; 595registerchar const*end, *qlim; 596registerstruct tree const*tree; 597registerunsigned char const*trans; 598 599 accept = NULL; 600 601/* Initialize register copies and look for easy ways out. */ 602 kwset = (struct kwset *) kws; 603if(len < kwset->mind) 604return-1; 605 next = kwset->next; 606 delta = kwset->delta; 607 trans = kwset->trans; 608 lim = text + len; 609 end = text; 610if((d = kwset->mind) !=0) 611 mch = NULL; 612else 613{ 614 mch = text, accept = kwset->trie; 615goto match; 616} 617 618if(len >=4* kwset->mind) 619 qlim = lim -4* kwset->mind; 620else 621 qlim = NULL; 622 623while(lim - end >= d) 624{ 625if(qlim && end <= qlim) 626{ 627 end += d -1; 628while((d = delta[c = *end]) && end < qlim) 629{ 630 end += d; 631 end += delta[U(*end)]; 632 end += delta[U(*end)]; 633} 634++end; 635} 636else 637 d = delta[c = (end += d)[-1]]; 638if(d) 639continue; 640 beg = end -1; 641 trie = next[c]; 642if(trie->accepting) 643{ 644 mch = beg; 645 accept = trie; 646} 647 d = trie->shift; 648while(beg > text) 649{ 650 c = trans ? trans[U(*--beg)] : *--beg; 651 tree = trie->links; 652while(tree && c != tree->label) 653if(c < tree->label) 654 tree = tree->llink; 655else 656 tree = tree->rlink; 657if(tree) 658{ 659 trie = tree->trie; 660if(trie->accepting) 661{ 662 mch = beg; 663 accept = trie; 664} 665} 666else 667break; 668 d = trie->shift; 669} 670if(mch) 671goto match; 672} 673return-1; 674 675 match: 676/* Given a known match, find the longest possible match anchored 677 at or before its starting point. This is nearly a verbatim 678 copy of the preceding main search loops. */ 679if(lim - mch > kwset->maxd) 680 lim = mch + kwset->maxd; 681 lmch = NULL; 682 d =1; 683while(lim - end >= d) 684{ 685if((d = delta[c = (end += d)[-1]]) !=0) 686continue; 687 beg = end -1; 688if(!(trie = next[c])) 689{ 690 d =1; 691continue; 692} 693if(trie->accepting && beg <= mch) 694{ 695 lmch = beg; 696 accept = trie; 697} 698 d = trie->shift; 699while(beg > text) 700{ 701 c = trans ? trans[U(*--beg)] : *--beg; 702 tree = trie->links; 703while(tree && c != tree->label) 704if(c < tree->label) 705 tree = tree->llink; 706else 707 tree = tree->rlink; 708if(tree) 709{ 710 trie = tree->trie; 711if(trie->accepting && beg <= mch) 712{ 713 lmch = beg; 714 accept = trie; 715} 716} 717else 718break; 719 d = trie->shift; 720} 721if(lmch) 722{ 723 mch = lmch; 724goto match; 725} 726if(!d) 727 d =1; 728} 729 730if(kwsmatch) 731{ 732 kwsmatch->index = accept->accepting /2; 733 kwsmatch->offset[0] = mch - text; 734 kwsmatch->size[0] = accept->depth; 735} 736return mch - text; 737} 738 739/* Search through the given text for a match of any member of the 740 given keyword set. Return a pointer to the first character of 741 the matching substring, or NULL if no match is found. If FOUNDLEN 742 is non-NULL store in the referenced location the length of the 743 matching substring. Similarly, if FOUNDIDX is non-NULL, store 744 in the referenced location the index number of the particular 745 keyword matched. */ 746size_t 747kwsexec(kwset_t kws,char const*text,size_t size, 748struct kwsmatch *kwsmatch) 749{ 750struct kwset const*kwset = (struct kwset *) kws; 751if(kwset->words ==1&& kwset->trans == NULL) 752{ 753size_t ret =bmexec(kws, text, size); 754if(kwsmatch != NULL && ret != (size_t) -1) 755{ 756 kwsmatch->index =0; 757 kwsmatch->offset[0] = ret; 758 kwsmatch->size[0] = kwset->mind; 759} 760return ret; 761} 762else 763returncwexec(kws, text, size, kwsmatch); 764} 765 766/* Free the components of the given keyword set. */ 767void 768kwsfree(kwset_t kws) 769{ 770struct kwset *kwset; 771 772 kwset = (struct kwset *) kws; 773obstack_free(&kwset->obstack, NULL); 774free(kws); 775}