grep.con commit tutorial: misc updates. (c1d179f)
   1#include "cache.h"
   2#include "grep.h"
   3
   4void append_grep_pattern(struct grep_opt *opt, const char *pat,
   5                         const char *origin, int no, enum grep_pat_token t)
   6{
   7        struct grep_pat *p = xcalloc(1, sizeof(*p));
   8        p->pattern = pat;
   9        p->origin = origin;
  10        p->no = no;
  11        p->token = t;
  12        *opt->pattern_tail = p;
  13        opt->pattern_tail = &p->next;
  14        p->next = NULL;
  15}
  16
  17static void compile_regexp(struct grep_pat *p, struct grep_opt *opt)
  18{
  19        int err = regcomp(&p->regexp, p->pattern, opt->regflags);
  20        if (err) {
  21                char errbuf[1024];
  22                char where[1024];
  23                if (p->no)
  24                        sprintf(where, "In '%s' at %d, ",
  25                                p->origin, p->no);
  26                else if (p->origin)
  27                        sprintf(where, "%s, ", p->origin);
  28                else
  29                        where[0] = 0;
  30                regerror(err, &p->regexp, errbuf, 1024);
  31                regfree(&p->regexp);
  32                die("%s'%s': %s", where, p->pattern, errbuf);
  33        }
  34}
  35
  36static struct grep_expr *compile_pattern_or(struct grep_pat **);
  37static struct grep_expr *compile_pattern_atom(struct grep_pat **list)
  38{
  39        struct grep_pat *p;
  40        struct grep_expr *x;
  41
  42        p = *list;
  43        switch (p->token) {
  44        case GREP_PATTERN: /* atom */
  45        case GREP_PATTERN_HEAD:
  46        case GREP_PATTERN_BODY:
  47                x = xcalloc(1, sizeof (struct grep_expr));
  48                x->node = GREP_NODE_ATOM;
  49                x->u.atom = p;
  50                *list = p->next;
  51                return x;
  52        case GREP_OPEN_PAREN:
  53                *list = p->next;
  54                x = compile_pattern_or(list);
  55                if (!x)
  56                        return NULL;
  57                if (!*list || (*list)->token != GREP_CLOSE_PAREN)
  58                        die("unmatched parenthesis");
  59                *list = (*list)->next;
  60                return x;
  61        default:
  62                return NULL;
  63        }
  64}
  65
  66static struct grep_expr *compile_pattern_not(struct grep_pat **list)
  67{
  68        struct grep_pat *p;
  69        struct grep_expr *x;
  70
  71        p = *list;
  72        switch (p->token) {
  73        case GREP_NOT:
  74                if (!p->next)
  75                        die("--not not followed by pattern expression");
  76                *list = p->next;
  77                x = xcalloc(1, sizeof (struct grep_expr));
  78                x->node = GREP_NODE_NOT;
  79                x->u.unary = compile_pattern_not(list);
  80                if (!x->u.unary)
  81                        die("--not followed by non pattern expression");
  82                return x;
  83        default:
  84                return compile_pattern_atom(list);
  85        }
  86}
  87
  88static struct grep_expr *compile_pattern_and(struct grep_pat **list)
  89{
  90        struct grep_pat *p;
  91        struct grep_expr *x, *y, *z;
  92
  93        x = compile_pattern_not(list);
  94        p = *list;
  95        if (p && p->token == GREP_AND) {
  96                if (!p->next)
  97                        die("--and not followed by pattern expression");
  98                *list = p->next;
  99                y = compile_pattern_and(list);
 100                if (!y)
 101                        die("--and not followed by pattern expression");
 102                z = xcalloc(1, sizeof (struct grep_expr));
 103                z->node = GREP_NODE_AND;
 104                z->u.binary.left = x;
 105                z->u.binary.right = y;
 106                return z;
 107        }
 108        return x;
 109}
 110
 111static struct grep_expr *compile_pattern_or(struct grep_pat **list)
 112{
 113        struct grep_pat *p;
 114        struct grep_expr *x, *y, *z;
 115
 116        x = compile_pattern_and(list);
 117        p = *list;
 118        if (x && p && p->token != GREP_CLOSE_PAREN) {
 119                y = compile_pattern_or(list);
 120                if (!y)
 121                        die("not a pattern expression %s", p->pattern);
 122                z = xcalloc(1, sizeof (struct grep_expr));
 123                z->node = GREP_NODE_OR;
 124                z->u.binary.left = x;
 125                z->u.binary.right = y;
 126                return z;
 127        }
 128        return x;
 129}
 130
 131static struct grep_expr *compile_pattern_expr(struct grep_pat **list)
 132{
 133        return compile_pattern_or(list);
 134}
 135
 136void compile_grep_patterns(struct grep_opt *opt)
 137{
 138        struct grep_pat *p;
 139
 140        if (opt->all_match)
 141                opt->extended = 1;
 142
 143        for (p = opt->pattern_list; p; p = p->next) {
 144                switch (p->token) {
 145                case GREP_PATTERN: /* atom */
 146                case GREP_PATTERN_HEAD:
 147                case GREP_PATTERN_BODY:
 148                        if (!opt->fixed)
 149                                compile_regexp(p, opt);
 150                        break;
 151                default:
 152                        opt->extended = 1;
 153                        break;
 154                }
 155        }
 156
 157        if (!opt->extended)
 158                return;
 159
 160        /* Then bundle them up in an expression.
 161         * A classic recursive descent parser would do.
 162         */
 163        p = opt->pattern_list;
 164        opt->pattern_expression = compile_pattern_expr(&p);
 165        if (p)
 166                die("incomplete pattern expression: %s", p->pattern);
 167}
 168
 169static void free_pattern_expr(struct grep_expr *x)
 170{
 171        switch (x->node) {
 172        case GREP_NODE_ATOM:
 173                break;
 174        case GREP_NODE_NOT:
 175                free_pattern_expr(x->u.unary);
 176                break;
 177        case GREP_NODE_AND:
 178        case GREP_NODE_OR:
 179                free_pattern_expr(x->u.binary.left);
 180                free_pattern_expr(x->u.binary.right);
 181                break;
 182        }
 183        free(x);
 184}
 185
 186void free_grep_patterns(struct grep_opt *opt)
 187{
 188        struct grep_pat *p, *n;
 189
 190        for (p = opt->pattern_list; p; p = n) {
 191                n = p->next;
 192                switch (p->token) {
 193                case GREP_PATTERN: /* atom */
 194                case GREP_PATTERN_HEAD:
 195                case GREP_PATTERN_BODY:
 196                        regfree(&p->regexp);
 197                        break;
 198                default:
 199                        break;
 200                }
 201                free(p);
 202        }
 203
 204        if (!opt->extended)
 205                return;
 206        free_pattern_expr(opt->pattern_expression);
 207}
 208
 209static char *end_of_line(char *cp, unsigned long *left)
 210{
 211        unsigned long l = *left;
 212        while (l && *cp != '\n') {
 213                l--;
 214                cp++;
 215        }
 216        *left = l;
 217        return cp;
 218}
 219
 220static int word_char(char ch)
 221{
 222        return isalnum(ch) || ch == '_';
 223}
 224
 225static void show_line(struct grep_opt *opt, const char *bol, const char *eol,
 226                      const char *name, unsigned lno, char sign)
 227{
 228        if (opt->pathname)
 229                printf("%s%c", name, sign);
 230        if (opt->linenum)
 231                printf("%d%c", lno, sign);
 232        printf("%.*s\n", (int)(eol-bol), bol);
 233}
 234
 235/*
 236 * NEEDSWORK: share code with diff.c
 237 */
 238#define FIRST_FEW_BYTES 8000
 239static int buffer_is_binary(const char *ptr, unsigned long size)
 240{
 241        if (FIRST_FEW_BYTES < size)
 242                size = FIRST_FEW_BYTES;
 243        return !!memchr(ptr, 0, size);
 244}
 245
 246static int fixmatch(const char *pattern, char *line, regmatch_t *match)
 247{
 248        char *hit = strstr(line, pattern);
 249        if (!hit) {
 250                match->rm_so = match->rm_eo = -1;
 251                return REG_NOMATCH;
 252        }
 253        else {
 254                match->rm_so = hit - line;
 255                match->rm_eo = match->rm_so + strlen(pattern);
 256                return 0;
 257        }
 258}
 259
 260static int match_one_pattern(struct grep_opt *opt, struct grep_pat *p, char *bol, char *eol, enum grep_context ctx)
 261{
 262        int hit = 0;
 263        int at_true_bol = 1;
 264        regmatch_t pmatch[10];
 265
 266        if ((p->token != GREP_PATTERN) &&
 267            ((p->token == GREP_PATTERN_HEAD) != (ctx == GREP_CONTEXT_HEAD)))
 268                return 0;
 269
 270 again:
 271        if (!opt->fixed) {
 272                regex_t *exp = &p->regexp;
 273                hit = !regexec(exp, bol, ARRAY_SIZE(pmatch),
 274                               pmatch, 0);
 275        }
 276        else {
 277                hit = !fixmatch(p->pattern, bol, pmatch);
 278        }
 279
 280        if (hit && opt->word_regexp) {
 281                if ((pmatch[0].rm_so < 0) ||
 282                    (eol - bol) <= pmatch[0].rm_so ||
 283                    (pmatch[0].rm_eo < 0) ||
 284                    (eol - bol) < pmatch[0].rm_eo)
 285                        die("regexp returned nonsense");
 286
 287                /* Match beginning must be either beginning of the
 288                 * line, or at word boundary (i.e. the last char must
 289                 * not be a word char).  Similarly, match end must be
 290                 * either end of the line, or at word boundary
 291                 * (i.e. the next char must not be a word char).
 292                 */
 293                if ( ((pmatch[0].rm_so == 0 && at_true_bol) ||
 294                      !word_char(bol[pmatch[0].rm_so-1])) &&
 295                     ((pmatch[0].rm_eo == (eol-bol)) ||
 296                      !word_char(bol[pmatch[0].rm_eo])) )
 297                        ;
 298                else
 299                        hit = 0;
 300
 301                if (!hit && pmatch[0].rm_so + bol + 1 < eol) {
 302                        /* There could be more than one match on the
 303                         * line, and the first match might not be
 304                         * strict word match.  But later ones could be!
 305                         */
 306                        bol = pmatch[0].rm_so + bol + 1;
 307                        at_true_bol = 0;
 308                        goto again;
 309                }
 310        }
 311        return hit;
 312}
 313
 314static int match_expr_eval(struct grep_opt *o,
 315                           struct grep_expr *x,
 316                           char *bol, char *eol,
 317                           enum grep_context ctx,
 318                           int collect_hits)
 319{
 320        int h = 0;
 321
 322        switch (x->node) {
 323        case GREP_NODE_ATOM:
 324                h = match_one_pattern(o, x->u.atom, bol, eol, ctx);
 325                break;
 326        case GREP_NODE_NOT:
 327                h = !match_expr_eval(o, x->u.unary, bol, eol, ctx, 0);
 328                break;
 329        case GREP_NODE_AND:
 330                if (!collect_hits)
 331                        return (match_expr_eval(o, x->u.binary.left,
 332                                                bol, eol, ctx, 0) &&
 333                                match_expr_eval(o, x->u.binary.right,
 334                                                bol, eol, ctx, 0));
 335                h = match_expr_eval(o, x->u.binary.left, bol, eol, ctx, 0);
 336                h &= match_expr_eval(o, x->u.binary.right, bol, eol, ctx, 0);
 337                break;
 338        case GREP_NODE_OR:
 339                if (!collect_hits)
 340                        return (match_expr_eval(o, x->u.binary.left,
 341                                                bol, eol, ctx, 0) ||
 342                                match_expr_eval(o, x->u.binary.right,
 343                                                bol, eol, ctx, 0));
 344                h = match_expr_eval(o, x->u.binary.left, bol, eol, ctx, 0);
 345                x->u.binary.left->hit |= h;
 346                h |= match_expr_eval(o, x->u.binary.right, bol, eol, ctx, 1);
 347                break;
 348        default:
 349                die("Unexpected node type (internal error) %d\n", x->node);
 350        }
 351        if (collect_hits)
 352                x->hit |= h;
 353        return h;
 354}
 355
 356static int match_expr(struct grep_opt *opt, char *bol, char *eol,
 357                      enum grep_context ctx, int collect_hits)
 358{
 359        struct grep_expr *x = opt->pattern_expression;
 360        return match_expr_eval(opt, x, bol, eol, ctx, collect_hits);
 361}
 362
 363static int match_line(struct grep_opt *opt, char *bol, char *eol,
 364                      enum grep_context ctx, int collect_hits)
 365{
 366        struct grep_pat *p;
 367        if (opt->extended)
 368                return match_expr(opt, bol, eol, ctx, collect_hits);
 369
 370        /* we do not call with collect_hits without being extended */
 371        for (p = opt->pattern_list; p; p = p->next) {
 372                if (match_one_pattern(opt, p, bol, eol, ctx))
 373                        return 1;
 374        }
 375        return 0;
 376}
 377
 378static int grep_buffer_1(struct grep_opt *opt, const char *name,
 379                         char *buf, unsigned long size, int collect_hits)
 380{
 381        char *bol = buf;
 382        unsigned long left = size;
 383        unsigned lno = 1;
 384        struct pre_context_line {
 385                char *bol;
 386                char *eol;
 387        } *prev = NULL, *pcl;
 388        unsigned last_hit = 0;
 389        unsigned last_shown = 0;
 390        int binary_match_only = 0;
 391        const char *hunk_mark = "";
 392        unsigned count = 0;
 393        enum grep_context ctx = GREP_CONTEXT_HEAD;
 394
 395        if (buffer_is_binary(buf, size)) {
 396                switch (opt->binary) {
 397                case GREP_BINARY_DEFAULT:
 398                        binary_match_only = 1;
 399                        break;
 400                case GREP_BINARY_NOMATCH:
 401                        return 0; /* Assume unmatch */
 402                        break;
 403                default:
 404                        break;
 405                }
 406        }
 407
 408        if (opt->pre_context)
 409                prev = xcalloc(opt->pre_context, sizeof(*prev));
 410        if (opt->pre_context || opt->post_context)
 411                hunk_mark = "--\n";
 412
 413        while (left) {
 414                char *eol, ch;
 415                int hit;
 416
 417                eol = end_of_line(bol, &left);
 418                ch = *eol;
 419                *eol = 0;
 420
 421                if ((ctx == GREP_CONTEXT_HEAD) && (eol == bol))
 422                        ctx = GREP_CONTEXT_BODY;
 423
 424                hit = match_line(opt, bol, eol, ctx, collect_hits);
 425                *eol = ch;
 426
 427                if (collect_hits)
 428                        goto next_line;
 429
 430                /* "grep -v -e foo -e bla" should list lines
 431                 * that do not have either, so inversion should
 432                 * be done outside.
 433                 */
 434                if (opt->invert)
 435                        hit = !hit;
 436                if (opt->unmatch_name_only) {
 437                        if (hit)
 438                                return 0;
 439                        goto next_line;
 440                }
 441                if (hit) {
 442                        count++;
 443                        if (opt->status_only)
 444                                return 1;
 445                        if (binary_match_only) {
 446                                printf("Binary file %s matches\n", name);
 447                                return 1;
 448                        }
 449                        if (opt->name_only) {
 450                                printf("%s\n", name);
 451                                return 1;
 452                        }
 453                        /* Hit at this line.  If we haven't shown the
 454                         * pre-context lines, we would need to show them.
 455                         * When asked to do "count", this still show
 456                         * the context which is nonsense, but the user
 457                         * deserves to get that ;-).
 458                         */
 459                        if (opt->pre_context) {
 460                                unsigned from;
 461                                if (opt->pre_context < lno)
 462                                        from = lno - opt->pre_context;
 463                                else
 464                                        from = 1;
 465                                if (from <= last_shown)
 466                                        from = last_shown + 1;
 467                                if (last_shown && from != last_shown + 1)
 468                                        printf(hunk_mark);
 469                                while (from < lno) {
 470                                        pcl = &prev[lno-from-1];
 471                                        show_line(opt, pcl->bol, pcl->eol,
 472                                                  name, from, '-');
 473                                        from++;
 474                                }
 475                                last_shown = lno-1;
 476                        }
 477                        if (last_shown && lno != last_shown + 1)
 478                                printf(hunk_mark);
 479                        if (!opt->count)
 480                                show_line(opt, bol, eol, name, lno, ':');
 481                        last_shown = last_hit = lno;
 482                }
 483                else if (last_hit &&
 484                         lno <= last_hit + opt->post_context) {
 485                        /* If the last hit is within the post context,
 486                         * we need to show this line.
 487                         */
 488                        if (last_shown && lno != last_shown + 1)
 489                                printf(hunk_mark);
 490                        show_line(opt, bol, eol, name, lno, '-');
 491                        last_shown = lno;
 492                }
 493                if (opt->pre_context) {
 494                        memmove(prev+1, prev,
 495                                (opt->pre_context-1) * sizeof(*prev));
 496                        prev->bol = bol;
 497                        prev->eol = eol;
 498                }
 499
 500        next_line:
 501                bol = eol + 1;
 502                if (!left)
 503                        break;
 504                left--;
 505                lno++;
 506        }
 507
 508        free(prev);
 509        if (collect_hits)
 510                return 0;
 511
 512        if (opt->status_only)
 513                return 0;
 514        if (opt->unmatch_name_only) {
 515                /* We did not see any hit, so we want to show this */
 516                printf("%s\n", name);
 517                return 1;
 518        }
 519
 520        /* NEEDSWORK:
 521         * The real "grep -c foo *.c" gives many "bar.c:0" lines,
 522         * which feels mostly useless but sometimes useful.  Maybe
 523         * make it another option?  For now suppress them.
 524         */
 525        if (opt->count && count)
 526                printf("%s:%u\n", name, count);
 527        return !!last_hit;
 528}
 529
 530static void clr_hit_marker(struct grep_expr *x)
 531{
 532        /* All-hit markers are meaningful only at the very top level
 533         * OR node.
 534         */
 535        while (1) {
 536                x->hit = 0;
 537                if (x->node != GREP_NODE_OR)
 538                        return;
 539                x->u.binary.left->hit = 0;
 540                x = x->u.binary.right;
 541        }
 542}
 543
 544static int chk_hit_marker(struct grep_expr *x)
 545{
 546        /* Top level nodes have hit markers.  See if they all are hits */
 547        while (1) {
 548                if (x->node != GREP_NODE_OR)
 549                        return x->hit;
 550                if (!x->u.binary.left->hit)
 551                        return 0;
 552                x = x->u.binary.right;
 553        }
 554}
 555
 556int grep_buffer(struct grep_opt *opt, const char *name, char *buf, unsigned long size)
 557{
 558        /*
 559         * we do not have to do the two-pass grep when we do not check
 560         * buffer-wide "all-match".
 561         */
 562        if (!opt->all_match)
 563                return grep_buffer_1(opt, name, buf, size, 0);
 564
 565        /* Otherwise the toplevel "or" terms hit a bit differently.
 566         * We first clear hit markers from them.
 567         */
 568        clr_hit_marker(opt->pattern_expression);
 569        grep_buffer_1(opt, name, buf, size, 1);
 570
 571        if (!chk_hit_marker(opt->pattern_expression))
 572                return 0;
 573
 574        return grep_buffer_1(opt, name, buf, size, 0);
 575}