grep.con commit core.whitespace: add test for diff whitespace error highlighting (49e703a)
   1#include "cache.h"
   2#include "grep.h"
   3#include "xdiff-interface.h"
   4
   5void append_grep_pattern(struct grep_opt *opt, const char *pat,
   6                         const char *origin, int no, enum grep_pat_token t)
   7{
   8        struct grep_pat *p = xcalloc(1, sizeof(*p));
   9        p->pattern = pat;
  10        p->origin = origin;
  11        p->no = no;
  12        p->token = t;
  13        *opt->pattern_tail = p;
  14        opt->pattern_tail = &p->next;
  15        p->next = NULL;
  16}
  17
  18static void compile_regexp(struct grep_pat *p, struct grep_opt *opt)
  19{
  20        int err = regcomp(&p->regexp, p->pattern, opt->regflags);
  21        if (err) {
  22                char errbuf[1024];
  23                char where[1024];
  24                if (p->no)
  25                        sprintf(where, "In '%s' at %d, ",
  26                                p->origin, p->no);
  27                else if (p->origin)
  28                        sprintf(where, "%s, ", p->origin);
  29                else
  30                        where[0] = 0;
  31                regerror(err, &p->regexp, errbuf, 1024);
  32                regfree(&p->regexp);
  33                die("%s'%s': %s", where, p->pattern, errbuf);
  34        }
  35}
  36
  37static struct grep_expr *compile_pattern_or(struct grep_pat **);
  38static struct grep_expr *compile_pattern_atom(struct grep_pat **list)
  39{
  40        struct grep_pat *p;
  41        struct grep_expr *x;
  42
  43        p = *list;
  44        switch (p->token) {
  45        case GREP_PATTERN: /* atom */
  46        case GREP_PATTERN_HEAD:
  47        case GREP_PATTERN_BODY:
  48                x = xcalloc(1, sizeof (struct grep_expr));
  49                x->node = GREP_NODE_ATOM;
  50                x->u.atom = p;
  51                *list = p->next;
  52                return x;
  53        case GREP_OPEN_PAREN:
  54                *list = p->next;
  55                x = compile_pattern_or(list);
  56                if (!x)
  57                        return NULL;
  58                if (!*list || (*list)->token != GREP_CLOSE_PAREN)
  59                        die("unmatched parenthesis");
  60                *list = (*list)->next;
  61                return x;
  62        default:
  63                return NULL;
  64        }
  65}
  66
  67static struct grep_expr *compile_pattern_not(struct grep_pat **list)
  68{
  69        struct grep_pat *p;
  70        struct grep_expr *x;
  71
  72        p = *list;
  73        switch (p->token) {
  74        case GREP_NOT:
  75                if (!p->next)
  76                        die("--not not followed by pattern expression");
  77                *list = p->next;
  78                x = xcalloc(1, sizeof (struct grep_expr));
  79                x->node = GREP_NODE_NOT;
  80                x->u.unary = compile_pattern_not(list);
  81                if (!x->u.unary)
  82                        die("--not followed by non pattern expression");
  83                return x;
  84        default:
  85                return compile_pattern_atom(list);
  86        }
  87}
  88
  89static struct grep_expr *compile_pattern_and(struct grep_pat **list)
  90{
  91        struct grep_pat *p;
  92        struct grep_expr *x, *y, *z;
  93
  94        x = compile_pattern_not(list);
  95        p = *list;
  96        if (p && p->token == GREP_AND) {
  97                if (!p->next)
  98                        die("--and not followed by pattern expression");
  99                *list = p->next;
 100                y = compile_pattern_and(list);
 101                if (!y)
 102                        die("--and not followed by pattern expression");
 103                z = xcalloc(1, sizeof (struct grep_expr));
 104                z->node = GREP_NODE_AND;
 105                z->u.binary.left = x;
 106                z->u.binary.right = y;
 107                return z;
 108        }
 109        return x;
 110}
 111
 112static struct grep_expr *compile_pattern_or(struct grep_pat **list)
 113{
 114        struct grep_pat *p;
 115        struct grep_expr *x, *y, *z;
 116
 117        x = compile_pattern_and(list);
 118        p = *list;
 119        if (x && p && p->token != GREP_CLOSE_PAREN) {
 120                y = compile_pattern_or(list);
 121                if (!y)
 122                        die("not a pattern expression %s", p->pattern);
 123                z = xcalloc(1, sizeof (struct grep_expr));
 124                z->node = GREP_NODE_OR;
 125                z->u.binary.left = x;
 126                z->u.binary.right = y;
 127                return z;
 128        }
 129        return x;
 130}
 131
 132static struct grep_expr *compile_pattern_expr(struct grep_pat **list)
 133{
 134        return compile_pattern_or(list);
 135}
 136
 137void compile_grep_patterns(struct grep_opt *opt)
 138{
 139        struct grep_pat *p;
 140
 141        if (opt->all_match)
 142                opt->extended = 1;
 143
 144        for (p = opt->pattern_list; p; p = p->next) {
 145                switch (p->token) {
 146                case GREP_PATTERN: /* atom */
 147                case GREP_PATTERN_HEAD:
 148                case GREP_PATTERN_BODY:
 149                        if (!opt->fixed)
 150                                compile_regexp(p, opt);
 151                        break;
 152                default:
 153                        opt->extended = 1;
 154                        break;
 155                }
 156        }
 157
 158        if (!opt->extended)
 159                return;
 160
 161        /* Then bundle them up in an expression.
 162         * A classic recursive descent parser would do.
 163         */
 164        p = opt->pattern_list;
 165        opt->pattern_expression = compile_pattern_expr(&p);
 166        if (p)
 167                die("incomplete pattern expression: %s", p->pattern);
 168}
 169
 170static void free_pattern_expr(struct grep_expr *x)
 171{
 172        switch (x->node) {
 173        case GREP_NODE_ATOM:
 174                break;
 175        case GREP_NODE_NOT:
 176                free_pattern_expr(x->u.unary);
 177                break;
 178        case GREP_NODE_AND:
 179        case GREP_NODE_OR:
 180                free_pattern_expr(x->u.binary.left);
 181                free_pattern_expr(x->u.binary.right);
 182                break;
 183        }
 184        free(x);
 185}
 186
 187void free_grep_patterns(struct grep_opt *opt)
 188{
 189        struct grep_pat *p, *n;
 190
 191        for (p = opt->pattern_list; p; p = n) {
 192                n = p->next;
 193                switch (p->token) {
 194                case GREP_PATTERN: /* atom */
 195                case GREP_PATTERN_HEAD:
 196                case GREP_PATTERN_BODY:
 197                        regfree(&p->regexp);
 198                        break;
 199                default:
 200                        break;
 201                }
 202                free(p);
 203        }
 204
 205        if (!opt->extended)
 206                return;
 207        free_pattern_expr(opt->pattern_expression);
 208}
 209
 210static char *end_of_line(char *cp, unsigned long *left)
 211{
 212        unsigned long l = *left;
 213        while (l && *cp != '\n') {
 214                l--;
 215                cp++;
 216        }
 217        *left = l;
 218        return cp;
 219}
 220
 221static int word_char(char ch)
 222{
 223        return isalnum(ch) || ch == '_';
 224}
 225
 226static void show_line(struct grep_opt *opt, const char *bol, const char *eol,
 227                      const char *name, unsigned lno, char sign)
 228{
 229        if (opt->pathname)
 230                printf("%s%c", name, sign);
 231        if (opt->linenum)
 232                printf("%d%c", lno, sign);
 233        printf("%.*s\n", (int)(eol-bol), bol);
 234}
 235
 236static int fixmatch(const char *pattern, char *line, regmatch_t *match)
 237{
 238        char *hit = strstr(line, pattern);
 239        if (!hit) {
 240                match->rm_so = match->rm_eo = -1;
 241                return REG_NOMATCH;
 242        }
 243        else {
 244                match->rm_so = hit - line;
 245                match->rm_eo = match->rm_so + strlen(pattern);
 246                return 0;
 247        }
 248}
 249
 250static int match_one_pattern(struct grep_opt *opt, struct grep_pat *p, char *bol, char *eol, enum grep_context ctx)
 251{
 252        int hit = 0;
 253        int at_true_bol = 1;
 254        regmatch_t pmatch[10];
 255
 256        if ((p->token != GREP_PATTERN) &&
 257            ((p->token == GREP_PATTERN_HEAD) != (ctx == GREP_CONTEXT_HEAD)))
 258                return 0;
 259
 260 again:
 261        if (!opt->fixed) {
 262                regex_t *exp = &p->regexp;
 263                hit = !regexec(exp, bol, ARRAY_SIZE(pmatch),
 264                               pmatch, 0);
 265        }
 266        else {
 267                hit = !fixmatch(p->pattern, bol, pmatch);
 268        }
 269
 270        if (hit && opt->word_regexp) {
 271                if ((pmatch[0].rm_so < 0) ||
 272                    (eol - bol) <= pmatch[0].rm_so ||
 273                    (pmatch[0].rm_eo < 0) ||
 274                    (eol - bol) < pmatch[0].rm_eo)
 275                        die("regexp returned nonsense");
 276
 277                /* Match beginning must be either beginning of the
 278                 * line, or at word boundary (i.e. the last char must
 279                 * not be a word char).  Similarly, match end must be
 280                 * either end of the line, or at word boundary
 281                 * (i.e. the next char must not be a word char).
 282                 */
 283                if ( ((pmatch[0].rm_so == 0 && at_true_bol) ||
 284                      !word_char(bol[pmatch[0].rm_so-1])) &&
 285                     ((pmatch[0].rm_eo == (eol-bol)) ||
 286                      !word_char(bol[pmatch[0].rm_eo])) )
 287                        ;
 288                else
 289                        hit = 0;
 290
 291                if (!hit && pmatch[0].rm_so + bol + 1 < eol) {
 292                        /* There could be more than one match on the
 293                         * line, and the first match might not be
 294                         * strict word match.  But later ones could be!
 295                         */
 296                        bol = pmatch[0].rm_so + bol + 1;
 297                        at_true_bol = 0;
 298                        goto again;
 299                }
 300        }
 301        return hit;
 302}
 303
 304static int match_expr_eval(struct grep_opt *o,
 305                           struct grep_expr *x,
 306                           char *bol, char *eol,
 307                           enum grep_context ctx,
 308                           int collect_hits)
 309{
 310        int h = 0;
 311
 312        switch (x->node) {
 313        case GREP_NODE_ATOM:
 314                h = match_one_pattern(o, x->u.atom, bol, eol, ctx);
 315                break;
 316        case GREP_NODE_NOT:
 317                h = !match_expr_eval(o, x->u.unary, bol, eol, ctx, 0);
 318                break;
 319        case GREP_NODE_AND:
 320                if (!collect_hits)
 321                        return (match_expr_eval(o, x->u.binary.left,
 322                                                bol, eol, ctx, 0) &&
 323                                match_expr_eval(o, x->u.binary.right,
 324                                                bol, eol, ctx, 0));
 325                h = match_expr_eval(o, x->u.binary.left, bol, eol, ctx, 0);
 326                h &= match_expr_eval(o, x->u.binary.right, bol, eol, ctx, 0);
 327                break;
 328        case GREP_NODE_OR:
 329                if (!collect_hits)
 330                        return (match_expr_eval(o, x->u.binary.left,
 331                                                bol, eol, ctx, 0) ||
 332                                match_expr_eval(o, x->u.binary.right,
 333                                                bol, eol, ctx, 0));
 334                h = match_expr_eval(o, x->u.binary.left, bol, eol, ctx, 0);
 335                x->u.binary.left->hit |= h;
 336                h |= match_expr_eval(o, x->u.binary.right, bol, eol, ctx, 1);
 337                break;
 338        default:
 339                die("Unexpected node type (internal error) %d\n", x->node);
 340        }
 341        if (collect_hits)
 342                x->hit |= h;
 343        return h;
 344}
 345
 346static int match_expr(struct grep_opt *opt, char *bol, char *eol,
 347                      enum grep_context ctx, int collect_hits)
 348{
 349        struct grep_expr *x = opt->pattern_expression;
 350        return match_expr_eval(opt, x, bol, eol, ctx, collect_hits);
 351}
 352
 353static int match_line(struct grep_opt *opt, char *bol, char *eol,
 354                      enum grep_context ctx, int collect_hits)
 355{
 356        struct grep_pat *p;
 357        if (opt->extended)
 358                return match_expr(opt, bol, eol, ctx, collect_hits);
 359
 360        /* we do not call with collect_hits without being extended */
 361        for (p = opt->pattern_list; p; p = p->next) {
 362                if (match_one_pattern(opt, p, bol, eol, ctx))
 363                        return 1;
 364        }
 365        return 0;
 366}
 367
 368static int grep_buffer_1(struct grep_opt *opt, const char *name,
 369                         char *buf, unsigned long size, int collect_hits)
 370{
 371        char *bol = buf;
 372        unsigned long left = size;
 373        unsigned lno = 1;
 374        struct pre_context_line {
 375                char *bol;
 376                char *eol;
 377        } *prev = NULL, *pcl;
 378        unsigned last_hit = 0;
 379        unsigned last_shown = 0;
 380        int binary_match_only = 0;
 381        const char *hunk_mark = "";
 382        unsigned count = 0;
 383        enum grep_context ctx = GREP_CONTEXT_HEAD;
 384
 385        if (buffer_is_binary(buf, size)) {
 386                switch (opt->binary) {
 387                case GREP_BINARY_DEFAULT:
 388                        binary_match_only = 1;
 389                        break;
 390                case GREP_BINARY_NOMATCH:
 391                        return 0; /* Assume unmatch */
 392                        break;
 393                default:
 394                        break;
 395                }
 396        }
 397
 398        if (opt->pre_context)
 399                prev = xcalloc(opt->pre_context, sizeof(*prev));
 400        if (opt->pre_context || opt->post_context)
 401                hunk_mark = "--\n";
 402
 403        while (left) {
 404                char *eol, ch;
 405                int hit;
 406
 407                eol = end_of_line(bol, &left);
 408                ch = *eol;
 409                *eol = 0;
 410
 411                if ((ctx == GREP_CONTEXT_HEAD) && (eol == bol))
 412                        ctx = GREP_CONTEXT_BODY;
 413
 414                hit = match_line(opt, bol, eol, ctx, collect_hits);
 415                *eol = ch;
 416
 417                if (collect_hits)
 418                        goto next_line;
 419
 420                /* "grep -v -e foo -e bla" should list lines
 421                 * that do not have either, so inversion should
 422                 * be done outside.
 423                 */
 424                if (opt->invert)
 425                        hit = !hit;
 426                if (opt->unmatch_name_only) {
 427                        if (hit)
 428                                return 0;
 429                        goto next_line;
 430                }
 431                if (hit) {
 432                        count++;
 433                        if (opt->status_only)
 434                                return 1;
 435                        if (binary_match_only) {
 436                                printf("Binary file %s matches\n", name);
 437                                return 1;
 438                        }
 439                        if (opt->name_only) {
 440                                printf("%s\n", name);
 441                                return 1;
 442                        }
 443                        /* Hit at this line.  If we haven't shown the
 444                         * pre-context lines, we would need to show them.
 445                         * When asked to do "count", this still show
 446                         * the context which is nonsense, but the user
 447                         * deserves to get that ;-).
 448                         */
 449                        if (opt->pre_context) {
 450                                unsigned from;
 451                                if (opt->pre_context < lno)
 452                                        from = lno - opt->pre_context;
 453                                else
 454                                        from = 1;
 455                                if (from <= last_shown)
 456                                        from = last_shown + 1;
 457                                if (last_shown && from != last_shown + 1)
 458                                        printf(hunk_mark);
 459                                while (from < lno) {
 460                                        pcl = &prev[lno-from-1];
 461                                        show_line(opt, pcl->bol, pcl->eol,
 462                                                  name, from, '-');
 463                                        from++;
 464                                }
 465                                last_shown = lno-1;
 466                        }
 467                        if (last_shown && lno != last_shown + 1)
 468                                printf(hunk_mark);
 469                        if (!opt->count)
 470                                show_line(opt, bol, eol, name, lno, ':');
 471                        last_shown = last_hit = lno;
 472                }
 473                else if (last_hit &&
 474                         lno <= last_hit + opt->post_context) {
 475                        /* If the last hit is within the post context,
 476                         * we need to show this line.
 477                         */
 478                        if (last_shown && lno != last_shown + 1)
 479                                printf(hunk_mark);
 480                        show_line(opt, bol, eol, name, lno, '-');
 481                        last_shown = lno;
 482                }
 483                if (opt->pre_context) {
 484                        memmove(prev+1, prev,
 485                                (opt->pre_context-1) * sizeof(*prev));
 486                        prev->bol = bol;
 487                        prev->eol = eol;
 488                }
 489
 490        next_line:
 491                bol = eol + 1;
 492                if (!left)
 493                        break;
 494                left--;
 495                lno++;
 496        }
 497
 498        free(prev);
 499        if (collect_hits)
 500                return 0;
 501
 502        if (opt->status_only)
 503                return 0;
 504        if (opt->unmatch_name_only) {
 505                /* We did not see any hit, so we want to show this */
 506                printf("%s\n", name);
 507                return 1;
 508        }
 509
 510        /* NEEDSWORK:
 511         * The real "grep -c foo *.c" gives many "bar.c:0" lines,
 512         * which feels mostly useless but sometimes useful.  Maybe
 513         * make it another option?  For now suppress them.
 514         */
 515        if (opt->count && count)
 516                printf("%s:%u\n", name, count);
 517        return !!last_hit;
 518}
 519
 520static void clr_hit_marker(struct grep_expr *x)
 521{
 522        /* All-hit markers are meaningful only at the very top level
 523         * OR node.
 524         */
 525        while (1) {
 526                x->hit = 0;
 527                if (x->node != GREP_NODE_OR)
 528                        return;
 529                x->u.binary.left->hit = 0;
 530                x = x->u.binary.right;
 531        }
 532}
 533
 534static int chk_hit_marker(struct grep_expr *x)
 535{
 536        /* Top level nodes have hit markers.  See if they all are hits */
 537        while (1) {
 538                if (x->node != GREP_NODE_OR)
 539                        return x->hit;
 540                if (!x->u.binary.left->hit)
 541                        return 0;
 542                x = x->u.binary.right;
 543        }
 544}
 545
 546int grep_buffer(struct grep_opt *opt, const char *name, char *buf, unsigned long size)
 547{
 548        /*
 549         * we do not have to do the two-pass grep when we do not check
 550         * buffer-wide "all-match".
 551         */
 552        if (!opt->all_match)
 553                return grep_buffer_1(opt, name, buf, size, 0);
 554
 555        /* Otherwise the toplevel "or" terms hit a bit differently.
 556         * We first clear hit markers from them.
 557         */
 558        clr_hit_marker(opt->pattern_expression);
 559        grep_buffer_1(opt, name, buf, size, 1);
 560
 561        if (!chk_hit_marker(opt->pattern_expression))
 562                return 0;
 563
 564        return grep_buffer_1(opt, name, buf, size, 0);
 565}