grep.con commit cvsserver: Avoid miscounting bytes in Perl v5.8.x (4f88d3e)
   1#include "cache.h"
   2#include <regex.h>
   3#include "grep.h"
   4
   5void append_grep_pattern(struct grep_opt *opt, const char *pat,
   6                         const char *origin, int no, enum grep_pat_token t)
   7{
   8        struct grep_pat *p = xcalloc(1, sizeof(*p));
   9        p->pattern = pat;
  10        p->origin = origin;
  11        p->no = no;
  12        p->token = t;
  13        *opt->pattern_tail = p;
  14        opt->pattern_tail = &p->next;
  15        p->next = NULL;
  16}
  17
  18static void compile_regexp(struct grep_pat *p, struct grep_opt *opt)
  19{
  20        int err = regcomp(&p->regexp, p->pattern, opt->regflags);
  21        if (err) {
  22                char errbuf[1024];
  23                char where[1024];
  24                if (p->no)
  25                        sprintf(where, "In '%s' at %d, ",
  26                                p->origin, p->no);
  27                else if (p->origin)
  28                        sprintf(where, "%s, ", p->origin);
  29                else
  30                        where[0] = 0;
  31                regerror(err, &p->regexp, errbuf, 1024);
  32                regfree(&p->regexp);
  33                die("%s'%s': %s", where, p->pattern, errbuf);
  34        }
  35}
  36
  37static struct grep_expr *compile_pattern_or(struct grep_pat **);
  38static struct grep_expr *compile_pattern_atom(struct grep_pat **list)
  39{
  40        struct grep_pat *p;
  41        struct grep_expr *x;
  42
  43        p = *list;
  44        switch (p->token) {
  45        case GREP_PATTERN: /* atom */
  46        case GREP_PATTERN_HEAD:
  47        case GREP_PATTERN_BODY:
  48                x = xcalloc(1, sizeof (struct grep_expr));
  49                x->node = GREP_NODE_ATOM;
  50                x->u.atom = p;
  51                *list = p->next;
  52                return x;
  53        case GREP_OPEN_PAREN:
  54                *list = p->next;
  55                x = compile_pattern_or(list);
  56                if (!x)
  57                        return NULL;
  58                if (!*list || (*list)->token != GREP_CLOSE_PAREN)
  59                        die("unmatched parenthesis");
  60                *list = (*list)->next;
  61                return x;
  62        default:
  63                return NULL;
  64        }
  65}
  66
  67static struct grep_expr *compile_pattern_not(struct grep_pat **list)
  68{
  69        struct grep_pat *p;
  70        struct grep_expr *x;
  71
  72        p = *list;
  73        switch (p->token) {
  74        case GREP_NOT:
  75                if (!p->next)
  76                        die("--not not followed by pattern expression");
  77                *list = p->next;
  78                x = xcalloc(1, sizeof (struct grep_expr));
  79                x->node = GREP_NODE_NOT;
  80                x->u.unary = compile_pattern_not(list);
  81                if (!x->u.unary)
  82                        die("--not followed by non pattern expression");
  83                return x;
  84        default:
  85                return compile_pattern_atom(list);
  86        }
  87}
  88
  89static struct grep_expr *compile_pattern_and(struct grep_pat **list)
  90{
  91        struct grep_pat *p;
  92        struct grep_expr *x, *y, *z;
  93
  94        x = compile_pattern_not(list);
  95        p = *list;
  96        if (p && p->token == GREP_AND) {
  97                if (!p->next)
  98                        die("--and not followed by pattern expression");
  99                *list = p->next;
 100                y = compile_pattern_and(list);
 101                if (!y)
 102                        die("--and not followed by pattern expression");
 103                z = xcalloc(1, sizeof (struct grep_expr));
 104                z->node = GREP_NODE_AND;
 105                z->u.binary.left = x;
 106                z->u.binary.right = y;
 107                return z;
 108        }
 109        return x;
 110}
 111
 112static struct grep_expr *compile_pattern_or(struct grep_pat **list)
 113{
 114        struct grep_pat *p;
 115        struct grep_expr *x, *y, *z;
 116
 117        x = compile_pattern_and(list);
 118        p = *list;
 119        if (x && p && p->token != GREP_CLOSE_PAREN) {
 120                y = compile_pattern_or(list);
 121                if (!y)
 122                        die("not a pattern expression %s", p->pattern);
 123                z = xcalloc(1, sizeof (struct grep_expr));
 124                z->node = GREP_NODE_OR;
 125                z->u.binary.left = x;
 126                z->u.binary.right = y;
 127                return z;
 128        }
 129        return x;
 130}
 131
 132static struct grep_expr *compile_pattern_expr(struct grep_pat **list)
 133{
 134        return compile_pattern_or(list);
 135}
 136
 137void compile_grep_patterns(struct grep_opt *opt)
 138{
 139        struct grep_pat *p;
 140
 141        if (opt->all_match)
 142                opt->extended = 1;
 143
 144        for (p = opt->pattern_list; p; p = p->next) {
 145                switch (p->token) {
 146                case GREP_PATTERN: /* atom */
 147                case GREP_PATTERN_HEAD:
 148                case GREP_PATTERN_BODY:
 149                        if (!opt->fixed)
 150                                compile_regexp(p, opt);
 151                        break;
 152                default:
 153                        opt->extended = 1;
 154                        break;
 155                }
 156        }
 157
 158        if (!opt->extended)
 159                return;
 160
 161        /* Then bundle them up in an expression.
 162         * A classic recursive descent parser would do.
 163         */
 164        p = opt->pattern_list;
 165        opt->pattern_expression = compile_pattern_expr(&p);
 166        if (p)
 167                die("incomplete pattern expression: %s", p->pattern);
 168}
 169
 170static void free_pattern_expr(struct grep_expr *x)
 171{
 172        switch (x->node) {
 173        case GREP_NODE_ATOM:
 174                break;
 175        case GREP_NODE_NOT:
 176                free_pattern_expr(x->u.unary);
 177                break;
 178        case GREP_NODE_AND:
 179        case GREP_NODE_OR:
 180                free_pattern_expr(x->u.binary.left);
 181                free_pattern_expr(x->u.binary.right);
 182                break;
 183        }
 184        free(x);
 185}
 186
 187void free_grep_patterns(struct grep_opt *opt)
 188{
 189        struct grep_pat *p, *n;
 190
 191        for (p = opt->pattern_list; p; p = n) {
 192                n = p->next;
 193                switch (p->token) {
 194                case GREP_PATTERN: /* atom */
 195                case GREP_PATTERN_HEAD:
 196                case GREP_PATTERN_BODY:
 197                        regfree(&p->regexp);
 198                        break;
 199                default:
 200                        break;
 201                }
 202                free(p);
 203        }
 204
 205        if (!opt->extended)
 206                return;
 207        free_pattern_expr(opt->pattern_expression);
 208}
 209
 210static char *end_of_line(char *cp, unsigned long *left)
 211{
 212        unsigned long l = *left;
 213        while (l && *cp != '\n') {
 214                l--;
 215                cp++;
 216        }
 217        *left = l;
 218        return cp;
 219}
 220
 221static int word_char(char ch)
 222{
 223        return isalnum(ch) || ch == '_';
 224}
 225
 226static void show_line(struct grep_opt *opt, const char *bol, const char *eol,
 227                      const char *name, unsigned lno, char sign)
 228{
 229        if (opt->pathname)
 230                printf("%s%c", name, sign);
 231        if (opt->linenum)
 232                printf("%d%c", lno, sign);
 233        printf("%.*s\n", (int)(eol-bol), bol);
 234}
 235
 236/*
 237 * NEEDSWORK: share code with diff.c
 238 */
 239#define FIRST_FEW_BYTES 8000
 240static int buffer_is_binary(const char *ptr, unsigned long size)
 241{
 242        if (FIRST_FEW_BYTES < size)
 243                size = FIRST_FEW_BYTES;
 244        return !!memchr(ptr, 0, size);
 245}
 246
 247static int fixmatch(const char *pattern, char *line, regmatch_t *match)
 248{
 249        char *hit = strstr(line, pattern);
 250        if (!hit) {
 251                match->rm_so = match->rm_eo = -1;
 252                return REG_NOMATCH;
 253        }
 254        else {
 255                match->rm_so = hit - line;
 256                match->rm_eo = match->rm_so + strlen(pattern);
 257                return 0;
 258        }
 259}
 260
 261static int match_one_pattern(struct grep_opt *opt, struct grep_pat *p, char *bol, char *eol, enum grep_context ctx)
 262{
 263        int hit = 0;
 264        int at_true_bol = 1;
 265        regmatch_t pmatch[10];
 266
 267        if ((p->token != GREP_PATTERN) &&
 268            ((p->token == GREP_PATTERN_HEAD) != (ctx == GREP_CONTEXT_HEAD)))
 269                return 0;
 270
 271 again:
 272        if (!opt->fixed) {
 273                regex_t *exp = &p->regexp;
 274                hit = !regexec(exp, bol, ARRAY_SIZE(pmatch),
 275                               pmatch, 0);
 276        }
 277        else {
 278                hit = !fixmatch(p->pattern, bol, pmatch);
 279        }
 280
 281        if (hit && opt->word_regexp) {
 282                if ((pmatch[0].rm_so < 0) ||
 283                    (eol - bol) <= pmatch[0].rm_so ||
 284                    (pmatch[0].rm_eo < 0) ||
 285                    (eol - bol) < pmatch[0].rm_eo)
 286                        die("regexp returned nonsense");
 287
 288                /* Match beginning must be either beginning of the
 289                 * line, or at word boundary (i.e. the last char must
 290                 * not be a word char).  Similarly, match end must be
 291                 * either end of the line, or at word boundary
 292                 * (i.e. the next char must not be a word char).
 293                 */
 294                if ( ((pmatch[0].rm_so == 0 && at_true_bol) ||
 295                      !word_char(bol[pmatch[0].rm_so-1])) &&
 296                     ((pmatch[0].rm_eo == (eol-bol)) ||
 297                      !word_char(bol[pmatch[0].rm_eo])) )
 298                        ;
 299                else
 300                        hit = 0;
 301
 302                if (!hit && pmatch[0].rm_so + bol + 1 < eol) {
 303                        /* There could be more than one match on the
 304                         * line, and the first match might not be
 305                         * strict word match.  But later ones could be!
 306                         */
 307                        bol = pmatch[0].rm_so + bol + 1;
 308                        at_true_bol = 0;
 309                        goto again;
 310                }
 311        }
 312        return hit;
 313}
 314
 315static int match_expr_eval(struct grep_opt *o,
 316                           struct grep_expr *x,
 317                           char *bol, char *eol,
 318                           enum grep_context ctx,
 319                           int collect_hits)
 320{
 321        int h = 0;
 322
 323        switch (x->node) {
 324        case GREP_NODE_ATOM:
 325                h = match_one_pattern(o, x->u.atom, bol, eol, ctx);
 326                break;
 327        case GREP_NODE_NOT:
 328                h = !match_expr_eval(o, x->u.unary, bol, eol, ctx, 0);
 329                break;
 330        case GREP_NODE_AND:
 331                if (!collect_hits)
 332                        return (match_expr_eval(o, x->u.binary.left,
 333                                                bol, eol, ctx, 0) &&
 334                                match_expr_eval(o, x->u.binary.right,
 335                                                bol, eol, ctx, 0));
 336                h = match_expr_eval(o, x->u.binary.left, bol, eol, ctx, 0);
 337                h &= match_expr_eval(o, x->u.binary.right, bol, eol, ctx, 0);
 338                break;
 339        case GREP_NODE_OR:
 340                if (!collect_hits)
 341                        return (match_expr_eval(o, x->u.binary.left,
 342                                                bol, eol, ctx, 0) ||
 343                                match_expr_eval(o, x->u.binary.right,
 344                                                bol, eol, ctx, 0));
 345                h = match_expr_eval(o, x->u.binary.left, bol, eol, ctx, 0);
 346                x->u.binary.left->hit |= h;
 347                h |= match_expr_eval(o, x->u.binary.right, bol, eol, ctx, 1);
 348                break;
 349        default:
 350                die("Unexpected node type (internal error) %d\n", x->node);
 351        }
 352        if (collect_hits)
 353                x->hit |= h;
 354        return h;
 355}
 356
 357static int match_expr(struct grep_opt *opt, char *bol, char *eol,
 358                      enum grep_context ctx, int collect_hits)
 359{
 360        struct grep_expr *x = opt->pattern_expression;
 361        return match_expr_eval(opt, x, bol, eol, ctx, collect_hits);
 362}
 363
 364static int match_line(struct grep_opt *opt, char *bol, char *eol,
 365                      enum grep_context ctx, int collect_hits)
 366{
 367        struct grep_pat *p;
 368        if (opt->extended)
 369                return match_expr(opt, bol, eol, ctx, collect_hits);
 370
 371        /* we do not call with collect_hits without being extended */
 372        for (p = opt->pattern_list; p; p = p->next) {
 373                if (match_one_pattern(opt, p, bol, eol, ctx))
 374                        return 1;
 375        }
 376        return 0;
 377}
 378
 379static int grep_buffer_1(struct grep_opt *opt, const char *name,
 380                         char *buf, unsigned long size, int collect_hits)
 381{
 382        char *bol = buf;
 383        unsigned long left = size;
 384        unsigned lno = 1;
 385        struct pre_context_line {
 386                char *bol;
 387                char *eol;
 388        } *prev = NULL, *pcl;
 389        unsigned last_hit = 0;
 390        unsigned last_shown = 0;
 391        int binary_match_only = 0;
 392        const char *hunk_mark = "";
 393        unsigned count = 0;
 394        enum grep_context ctx = GREP_CONTEXT_HEAD;
 395
 396        if (buffer_is_binary(buf, size)) {
 397                switch (opt->binary) {
 398                case GREP_BINARY_DEFAULT:
 399                        binary_match_only = 1;
 400                        break;
 401                case GREP_BINARY_NOMATCH:
 402                        return 0; /* Assume unmatch */
 403                        break;
 404                default:
 405                        break;
 406                }
 407        }
 408
 409        if (opt->pre_context)
 410                prev = xcalloc(opt->pre_context, sizeof(*prev));
 411        if (opt->pre_context || opt->post_context)
 412                hunk_mark = "--\n";
 413
 414        while (left) {
 415                char *eol, ch;
 416                int hit;
 417
 418                eol = end_of_line(bol, &left);
 419                ch = *eol;
 420                *eol = 0;
 421
 422                if ((ctx == GREP_CONTEXT_HEAD) && (eol == bol))
 423                        ctx = GREP_CONTEXT_BODY;
 424
 425                hit = match_line(opt, bol, eol, ctx, collect_hits);
 426                *eol = ch;
 427
 428                if (collect_hits)
 429                        goto next_line;
 430
 431                /* "grep -v -e foo -e bla" should list lines
 432                 * that do not have either, so inversion should
 433                 * be done outside.
 434                 */
 435                if (opt->invert)
 436                        hit = !hit;
 437                if (opt->unmatch_name_only) {
 438                        if (hit)
 439                                return 0;
 440                        goto next_line;
 441                }
 442                if (hit) {
 443                        count++;
 444                        if (opt->status_only)
 445                                return 1;
 446                        if (binary_match_only) {
 447                                printf("Binary file %s matches\n", name);
 448                                return 1;
 449                        }
 450                        if (opt->name_only) {
 451                                printf("%s\n", name);
 452                                return 1;
 453                        }
 454                        /* Hit at this line.  If we haven't shown the
 455                         * pre-context lines, we would need to show them.
 456                         * When asked to do "count", this still show
 457                         * the context which is nonsense, but the user
 458                         * deserves to get that ;-).
 459                         */
 460                        if (opt->pre_context) {
 461                                unsigned from;
 462                                if (opt->pre_context < lno)
 463                                        from = lno - opt->pre_context;
 464                                else
 465                                        from = 1;
 466                                if (from <= last_shown)
 467                                        from = last_shown + 1;
 468                                if (last_shown && from != last_shown + 1)
 469                                        printf(hunk_mark);
 470                                while (from < lno) {
 471                                        pcl = &prev[lno-from-1];
 472                                        show_line(opt, pcl->bol, pcl->eol,
 473                                                  name, from, '-');
 474                                        from++;
 475                                }
 476                                last_shown = lno-1;
 477                        }
 478                        if (last_shown && lno != last_shown + 1)
 479                                printf(hunk_mark);
 480                        if (!opt->count)
 481                                show_line(opt, bol, eol, name, lno, ':');
 482                        last_shown = last_hit = lno;
 483                }
 484                else if (last_hit &&
 485                         lno <= last_hit + opt->post_context) {
 486                        /* If the last hit is within the post context,
 487                         * we need to show this line.
 488                         */
 489                        if (last_shown && lno != last_shown + 1)
 490                                printf(hunk_mark);
 491                        show_line(opt, bol, eol, name, lno, '-');
 492                        last_shown = lno;
 493                }
 494                if (opt->pre_context) {
 495                        memmove(prev+1, prev,
 496                                (opt->pre_context-1) * sizeof(*prev));
 497                        prev->bol = bol;
 498                        prev->eol = eol;
 499                }
 500
 501        next_line:
 502                bol = eol + 1;
 503                if (!left)
 504                        break;
 505                left--;
 506                lno++;
 507        }
 508
 509        free(prev);
 510        if (collect_hits)
 511                return 0;
 512
 513        if (opt->status_only)
 514                return 0;
 515        if (opt->unmatch_name_only) {
 516                /* We did not see any hit, so we want to show this */
 517                printf("%s\n", name);
 518                return 1;
 519        }
 520
 521        /* NEEDSWORK:
 522         * The real "grep -c foo *.c" gives many "bar.c:0" lines,
 523         * which feels mostly useless but sometimes useful.  Maybe
 524         * make it another option?  For now suppress them.
 525         */
 526        if (opt->count && count)
 527                printf("%s:%u\n", name, count);
 528        return !!last_hit;
 529}
 530
 531static void clr_hit_marker(struct grep_expr *x)
 532{
 533        /* All-hit markers are meaningful only at the very top level
 534         * OR node.
 535         */
 536        while (1) {
 537                x->hit = 0;
 538                if (x->node != GREP_NODE_OR)
 539                        return;
 540                x->u.binary.left->hit = 0;
 541                x = x->u.binary.right;
 542        }
 543}
 544
 545static int chk_hit_marker(struct grep_expr *x)
 546{
 547        /* Top level nodes have hit markers.  See if they all are hits */
 548        while (1) {
 549                if (x->node != GREP_NODE_OR)
 550                        return x->hit;
 551                if (!x->u.binary.left->hit)
 552                        return 0;
 553                x = x->u.binary.right;
 554        }
 555}
 556
 557int grep_buffer(struct grep_opt *opt, const char *name, char *buf, unsigned long size)
 558{
 559        /*
 560         * we do not have to do the two-pass grep when we do not check
 561         * buffer-wide "all-match".
 562         */
 563        if (!opt->all_match)
 564                return grep_buffer_1(opt, name, buf, size, 0);
 565
 566        /* Otherwise the toplevel "or" terms hit a bit differently.
 567         * We first clear hit markers from them.
 568         */
 569        clr_hit_marker(opt->pattern_expression);
 570        grep_buffer_1(opt, name, buf, size, 1);
 571
 572        if (!chk_hit_marker(opt->pattern_expression))
 573                return 0;
 574
 575        return grep_buffer_1(opt, name, buf, size, 0);
 576}