builtin-grep.con commit Merge branch 'js/fmt-patch' into next (c2b9e69)
   1/*
   2 * Builtin "git grep"
   3 *
   4 * Copyright (c) 2006 Junio C Hamano
   5 */
   6#include "cache.h"
   7#include "blob.h"
   8#include "tree.h"
   9#include "commit.h"
  10#include "tag.h"
  11#include "tree-walk.h"
  12#include "builtin.h"
  13#include <regex.h>
  14#include <fnmatch.h>
  15
  16/*
  17 * git grep pathspecs are somewhat different from diff-tree pathspecs;
  18 * pathname wildcards are allowed.
  19 */
  20static int pathspec_matches(const char **paths, const char *name)
  21{
  22        int namelen, i;
  23        if (!paths || !*paths)
  24                return 1;
  25        namelen = strlen(name);
  26        for (i = 0; paths[i]; i++) {
  27                const char *match = paths[i];
  28                int matchlen = strlen(match);
  29                const char *cp, *meta;
  30
  31                if ((matchlen <= namelen) &&
  32                    !strncmp(name, match, matchlen) &&
  33                    (match[matchlen-1] == '/' ||
  34                     name[matchlen] == '\0' || name[matchlen] == '/'))
  35                        return 1;
  36                if (!fnmatch(match, name, 0))
  37                        return 1;
  38                if (name[namelen-1] != '/')
  39                        continue;
  40
  41                /* We are being asked if the directory ("name") is worth
  42                 * descending into.
  43                 *
  44                 * Find the longest leading directory name that does
  45                 * not have metacharacter in the pathspec; the name
  46                 * we are looking at must overlap with that directory.
  47                 */
  48                for (cp = match, meta = NULL; cp - match < matchlen; cp++) {
  49                        char ch = *cp;
  50                        if (ch == '*' || ch == '[' || ch == '?') {
  51                                meta = cp;
  52                                break;
  53                        }
  54                }
  55                if (!meta)
  56                        meta = cp; /* fully literal */
  57
  58                if (namelen <= meta - match) {
  59                        /* Looking at "Documentation/" and
  60                         * the pattern says "Documentation/howto/", or
  61                         * "Documentation/diff*.txt".  The name we
  62                         * have should match prefix.
  63                         */
  64                        if (!memcmp(match, name, namelen))
  65                                return 1;
  66                        continue;
  67                }
  68
  69                if (meta - match < namelen) {
  70                        /* Looking at "Documentation/howto/" and
  71                         * the pattern says "Documentation/h*";
  72                         * match up to "Do.../h"; this avoids descending
  73                         * into "Documentation/technical/".
  74                         */
  75                        if (!memcmp(match, name, meta - match))
  76                                return 1;
  77                        continue;
  78                }
  79        }
  80        return 0;
  81}
  82
  83struct grep_pat {
  84        struct grep_pat *next;
  85        const char *pattern;
  86        regex_t regexp;
  87};
  88
  89struct grep_opt {
  90        struct grep_pat *pattern_list;
  91        struct grep_pat **pattern_tail;
  92        regex_t regexp;
  93        unsigned linenum:1;
  94        unsigned invert:1;
  95        unsigned name_only:1;
  96        unsigned unmatch_name_only:1;
  97        unsigned count:1;
  98        unsigned word_regexp:1;
  99#define GREP_BINARY_DEFAULT     0
 100#define GREP_BINARY_NOMATCH     1
 101#define GREP_BINARY_TEXT        2
 102        unsigned binary:2;
 103        int regflags;
 104        unsigned pre_context;
 105        unsigned post_context;
 106};
 107
 108static void add_pattern(struct grep_opt *opt, const char *pat)
 109{
 110        struct grep_pat *p = xcalloc(1, sizeof(*p));
 111        p->pattern = pat;
 112        *opt->pattern_tail = p;
 113        opt->pattern_tail = &p->next;
 114        p->next = NULL;
 115}
 116
 117static void compile_patterns(struct grep_opt *opt)
 118{
 119        struct grep_pat *p;
 120        for (p = opt->pattern_list; p; p = p->next) {
 121                int err = regcomp(&p->regexp, p->pattern, opt->regflags);
 122                if (err) {
 123                        char errbuf[1024];
 124                        regerror(err, &p->regexp, errbuf, 1024);
 125                        regfree(&p->regexp);
 126                        die("'%s': %s", p->pattern, errbuf);
 127                }
 128        }
 129}
 130
 131static char *end_of_line(char *cp, unsigned long *left)
 132{
 133        unsigned long l = *left;
 134        while (l && *cp != '\n') {
 135                l--;
 136                cp++;
 137        }
 138        *left = l;
 139        return cp;
 140}
 141
 142static int word_char(char ch)
 143{
 144        return isalnum(ch) || ch == '_';
 145}
 146
 147static void show_line(struct grep_opt *opt, const char *bol, const char *eol,
 148                      const char *name, unsigned lno, char sign)
 149{
 150        printf("%s%c", name, sign);
 151        if (opt->linenum)
 152                printf("%d%c", lno, sign);
 153        printf("%.*s\n", (int)(eol-bol), bol);
 154}
 155
 156/*
 157 * NEEDSWORK: share code with diff.c
 158 */
 159#define FIRST_FEW_BYTES 8000
 160static int buffer_is_binary(const char *ptr, unsigned long size)
 161{
 162        if (FIRST_FEW_BYTES < size)
 163                size = FIRST_FEW_BYTES;
 164        if (memchr(ptr, 0, size))
 165                return 1;
 166        return 0;
 167}
 168
 169static int grep_buffer(struct grep_opt *opt, const char *name,
 170                       char *buf, unsigned long size)
 171{
 172        char *bol = buf;
 173        unsigned long left = size;
 174        unsigned lno = 1;
 175        struct pre_context_line {
 176                char *bol;
 177                char *eol;
 178        } *prev = NULL, *pcl;
 179        unsigned last_hit = 0;
 180        unsigned last_shown = 0;
 181        int binary_match_only = 0;
 182        const char *hunk_mark = "";
 183        unsigned count = 0;
 184
 185        if (buffer_is_binary(buf, size)) {
 186                switch (opt->binary) {
 187                case GREP_BINARY_DEFAULT:
 188                        binary_match_only = 1;
 189                        break;
 190                case GREP_BINARY_NOMATCH:
 191                        return 0; /* Assume unmatch */
 192                        break;
 193                default:
 194                        break;
 195                }
 196        }
 197
 198        if (opt->pre_context)
 199                prev = xcalloc(opt->pre_context, sizeof(*prev));
 200        if (opt->pre_context || opt->post_context)
 201                hunk_mark = "--\n";
 202
 203        while (left) {
 204                regmatch_t pmatch[10];
 205                char *eol, ch;
 206                int hit = 0;
 207                struct grep_pat *p;
 208
 209                eol = end_of_line(bol, &left);
 210                ch = *eol;
 211                *eol = 0;
 212
 213                for (p = opt->pattern_list; p; p = p->next) {
 214                        regex_t *exp = &p->regexp;
 215                        hit = !regexec(exp, bol, ARRAY_SIZE(pmatch),
 216                                       pmatch, 0);
 217
 218                        if (hit && opt->word_regexp) {
 219                                /* Match beginning must be either
 220                                 * beginning of the line, or at word
 221                                 * boundary (i.e. the last char must
 222                                 * not be alnum or underscore).
 223                                 */
 224                                if ((pmatch[0].rm_so < 0) ||
 225                                    (eol - bol) <= pmatch[0].rm_so ||
 226                                    (pmatch[0].rm_eo < 0) ||
 227                                    (eol - bol) < pmatch[0].rm_eo)
 228                                        die("regexp returned nonsense");
 229                                if (pmatch[0].rm_so != 0 &&
 230                                    word_char(bol[pmatch[0].rm_so-1]))
 231                                        continue; /* not a word boundary */
 232                                if ((eol-bol) < pmatch[0].rm_eo &&
 233                                    word_char(bol[pmatch[0].rm_eo]))
 234                                        continue; /* not a word boundary */
 235                        }
 236                        if (hit)
 237                                break;
 238                }
 239                /* "grep -v -e foo -e bla" should list lines
 240                 * that do not have either, so inversion should
 241                 * be done outside.
 242                 */
 243                if (opt->invert)
 244                        hit = !hit;
 245                if (opt->unmatch_name_only) {
 246                        if (hit)
 247                                return 0;
 248                        goto next_line;
 249                }
 250                if (hit) {
 251                        count++;
 252                        if (binary_match_only) {
 253                                printf("Binary file %s matches\n", name);
 254                                return 1;
 255                        }
 256                        if (opt->name_only) {
 257                                printf("%s\n", name);
 258                                return 1;
 259                        }
 260                        /* Hit at this line.  If we haven't shown the
 261                         * pre-context lines, we would need to show them.
 262                         * When asked to do "count", this still show
 263                         * the context which is nonsense, but the user
 264                         * deserves to get that ;-).
 265                         */
 266                        if (opt->pre_context) {
 267                                unsigned from;
 268                                if (opt->pre_context < lno)
 269                                        from = lno - opt->pre_context;
 270                                else
 271                                        from = 1;
 272                                if (from <= last_shown)
 273                                        from = last_shown + 1;
 274                                if (last_shown && from != last_shown + 1)
 275                                        printf(hunk_mark);
 276                                while (from < lno) {
 277                                        pcl = &prev[lno-from-1];
 278                                        show_line(opt, pcl->bol, pcl->eol,
 279                                                  name, from, '-');
 280                                        from++;
 281                                }
 282                                last_shown = lno-1;
 283                        }
 284                        if (last_shown && lno != last_shown + 1)
 285                                printf(hunk_mark);
 286                        if (!opt->count)
 287                                show_line(opt, bol, eol, name, lno, ':');
 288                        last_shown = last_hit = lno;
 289                }
 290                else if (last_hit &&
 291                         lno <= last_hit + opt->post_context) {
 292                        /* If the last hit is within the post context,
 293                         * we need to show this line.
 294                         */
 295                        if (last_shown && lno != last_shown + 1)
 296                                printf(hunk_mark);
 297                        show_line(opt, bol, eol, name, lno, '-');
 298                        last_shown = lno;
 299                }
 300                if (opt->pre_context) {
 301                        memmove(prev+1, prev,
 302                                (opt->pre_context-1) * sizeof(*prev));
 303                        prev->bol = bol;
 304                        prev->eol = eol;
 305                }
 306
 307        next_line:
 308                *eol = ch;
 309                bol = eol + 1;
 310                if (!left)
 311                        break;
 312                left--;
 313                lno++;
 314        }
 315
 316        if (opt->unmatch_name_only) {
 317                /* We did not see any hit, so we want to show this */
 318                printf("%s\n", name);
 319                return 1;
 320        }
 321
 322        /* NEEDSWORK:
 323         * The real "grep -c foo *.c" gives many "bar.c:0" lines,
 324         * which feels mostly useless but sometimes useful.  Maybe
 325         * make it another option?  For now suppress them.
 326         */
 327        if (opt->count && count)
 328                printf("%s:%u\n", name, count);
 329        return !!last_hit;
 330}
 331
 332static int grep_sha1(struct grep_opt *opt, const unsigned char *sha1, const char *name)
 333{
 334        unsigned long size;
 335        char *data;
 336        char type[20];
 337        int hit;
 338        data = read_sha1_file(sha1, type, &size);
 339        if (!data) {
 340                error("'%s': unable to read %s", name, sha1_to_hex(sha1));
 341                return 0;
 342        }
 343        hit = grep_buffer(opt, name, data, size);
 344        free(data);
 345        return hit;
 346}
 347
 348static int grep_file(struct grep_opt *opt, const char *filename)
 349{
 350        struct stat st;
 351        int i;
 352        char *data;
 353        if (lstat(filename, &st) < 0) {
 354        err_ret:
 355                if (errno != ENOENT)
 356                        error("'%s': %s", filename, strerror(errno));
 357                return 0;
 358        }
 359        if (!st.st_size)
 360                return 0; /* empty file -- no grep hit */
 361        if (!S_ISREG(st.st_mode))
 362                return 0;
 363        i = open(filename, O_RDONLY);
 364        if (i < 0)
 365                goto err_ret;
 366        data = xmalloc(st.st_size + 1);
 367        if (st.st_size != xread(i, data, st.st_size)) {
 368                error("'%s': short read %s", filename, strerror(errno));
 369                close(i);
 370                free(data);
 371                return 0;
 372        }
 373        close(i);
 374        i = grep_buffer(opt, filename, data, st.st_size);
 375        free(data);
 376        return i;
 377}
 378
 379static int grep_cache(struct grep_opt *opt, const char **paths, int cached)
 380{
 381        int hit = 0;
 382        int nr;
 383        read_cache();
 384
 385        for (nr = 0; nr < active_nr; nr++) {
 386                struct cache_entry *ce = active_cache[nr];
 387                if (ce_stage(ce) || !S_ISREG(ntohl(ce->ce_mode)))
 388                        continue;
 389                if (!pathspec_matches(paths, ce->name))
 390                        continue;
 391                if (cached)
 392                        hit |= grep_sha1(opt, ce->sha1, ce->name);
 393                else
 394                        hit |= grep_file(opt, ce->name);
 395        }
 396        return hit;
 397}
 398
 399static int grep_tree(struct grep_opt *opt, const char **paths,
 400                     struct tree_desc *tree,
 401                     const char *tree_name, const char *base)
 402{
 403        unsigned mode;
 404        int len;
 405        int hit = 0;
 406        const char *path;
 407        const unsigned char *sha1;
 408        char *down;
 409        char *path_buf = xmalloc(PATH_MAX + strlen(tree_name) + 100);
 410
 411        if (tree_name[0]) {
 412                int offset = sprintf(path_buf, "%s:", tree_name);
 413                down = path_buf + offset;
 414                strcat(down, base);
 415        }
 416        else {
 417                down = path_buf;
 418                strcpy(down, base);
 419        }
 420        len = strlen(path_buf);
 421
 422        while (tree->size) {
 423                int pathlen;
 424                sha1 = tree_entry_extract(tree, &path, &mode);
 425                pathlen = strlen(path);
 426                strcpy(path_buf + len, path);
 427
 428                if (S_ISDIR(mode))
 429                        /* Match "abc/" against pathspec to
 430                         * decide if we want to descend into "abc"
 431                         * directory.
 432                         */
 433                        strcpy(path_buf + len + pathlen, "/");
 434
 435                if (!pathspec_matches(paths, down))
 436                        ;
 437                else if (S_ISREG(mode))
 438                        hit |= grep_sha1(opt, sha1, path_buf);
 439                else if (S_ISDIR(mode)) {
 440                        char type[20];
 441                        struct tree_desc sub;
 442                        void *data;
 443                        data = read_sha1_file(sha1, type, &sub.size);
 444                        if (!data)
 445                                die("unable to read tree (%s)",
 446                                    sha1_to_hex(sha1));
 447                        sub.buf = data;
 448                        hit |= grep_tree(opt, paths, &sub, tree_name, down);
 449                        free(data);
 450                }
 451                update_tree_entry(tree);
 452        }
 453        return hit;
 454}
 455
 456static int grep_object(struct grep_opt *opt, const char **paths,
 457                       struct object *obj, const char *name)
 458{
 459        if (!strcmp(obj->type, blob_type))
 460                return grep_sha1(opt, obj->sha1, name);
 461        if (!strcmp(obj->type, commit_type) ||
 462            !strcmp(obj->type, tree_type)) {
 463                struct tree_desc tree;
 464                void *data;
 465                int hit;
 466                data = read_object_with_reference(obj->sha1, tree_type,
 467                                                  &tree.size, NULL);
 468                if (!data)
 469                        die("unable to read tree (%s)", sha1_to_hex(obj->sha1));
 470                tree.buf = data;
 471                hit = grep_tree(opt, paths, &tree, name, "");
 472                free(data);
 473                return hit;
 474        }
 475        die("unable to grep from object of type %s", obj->type);
 476}
 477
 478static const char builtin_grep_usage[] =
 479"git-grep <option>* <rev>* [-e] <pattern> [<path>...]";
 480
 481int cmd_grep(int argc, const char **argv, char **envp)
 482{
 483        int hit = 0;
 484        int no_more_flags = 0;
 485        int seen_noncommit = 0;
 486        int cached = 0;
 487        struct grep_opt opt;
 488        struct object_list *list, **tail, *object_list = NULL;
 489        const char *prefix = setup_git_directory();
 490        const char **paths = NULL;
 491
 492        memset(&opt, 0, sizeof(opt));
 493        opt.pattern_tail = &opt.pattern_list;
 494        opt.regflags = REG_NEWLINE;
 495
 496        /*
 497         * No point using rev_info, really.
 498         */
 499        while (1 < argc) {
 500                const char *arg = argv[1];
 501                argc--; argv++;
 502                if (!strcmp("--cached", arg)) {
 503                        cached = 1;
 504                        continue;
 505                }
 506                if (!strcmp("-a", arg) ||
 507                    !strcmp("--text", arg)) {
 508                        opt.binary = GREP_BINARY_TEXT;
 509                        continue;
 510                }
 511                if (!strcmp("-i", arg) ||
 512                    !strcmp("--ignore-case", arg)) {
 513                        opt.regflags |= REG_ICASE;
 514                        continue;
 515                }
 516                if (!strcmp("-I", arg)) {
 517                        opt.binary = GREP_BINARY_NOMATCH;
 518                        continue;
 519                }
 520                if (!strcmp("-v", arg) ||
 521                    !strcmp("--invert-match", arg)) {
 522                        opt.invert = 1;
 523                        continue;
 524                }
 525                if (!strcmp("-E", arg) ||
 526                    !strcmp("--extended-regexp", arg)) {
 527                        opt.regflags |= REG_EXTENDED;
 528                        continue;
 529                }
 530                if (!strcmp("-G", arg) ||
 531                    !strcmp("--basic-regexp", arg)) {
 532                        opt.regflags &= ~REG_EXTENDED;
 533                        continue;
 534                }
 535                if (!strcmp("-n", arg)) {
 536                        opt.linenum = 1;
 537                        continue;
 538                }
 539                if (!strcmp("-H", arg)) {
 540                        /* We always show the pathname, so this
 541                         * is a noop.
 542                         */
 543                        continue;
 544                }
 545                if (!strcmp("-l", arg) ||
 546                    !strcmp("--files-with-matches", arg)) {
 547                        opt.name_only = 1;
 548                        continue;
 549                }
 550                if (!strcmp("-L", arg) ||
 551                    !strcmp("--files-without-match", arg)) {
 552                        opt.unmatch_name_only = 1;
 553                        continue;
 554                }
 555                if (!strcmp("-c", arg) ||
 556                    !strcmp("--count", arg)) {
 557                        opt.count = 1;
 558                        continue;
 559                }
 560                if (!strcmp("-w", arg) ||
 561                    !strcmp("--word-regexp", arg)) {
 562                        opt.word_regexp = 1;
 563                        continue;
 564                }
 565                if (!strncmp("-A", arg, 2) ||
 566                    !strncmp("-B", arg, 2) ||
 567                    !strncmp("-C", arg, 2) ||
 568                    (arg[0] == '-' && '1' <= arg[1] && arg[1] <= '9')) {
 569                        unsigned num;
 570                        const char *scan;
 571                        switch (arg[1]) {
 572                        case 'A': case 'B': case 'C':
 573                                if (!arg[2]) {
 574                                        if (argc <= 1)
 575                                                usage(builtin_grep_usage);
 576                                        scan = *++argv;
 577                                        argc--;
 578                                }
 579                                else
 580                                        scan = arg + 2;
 581                                break;
 582                        default:
 583                                scan = arg + 1;
 584                                break;
 585                        }
 586                        if (sscanf(scan, "%u", &num) != 1)
 587                                usage(builtin_grep_usage);
 588                        switch (arg[1]) {
 589                        case 'A':
 590                                opt.post_context = num;
 591                                break;
 592                        default:
 593                        case 'C':
 594                                opt.post_context = num;
 595                        case 'B':
 596                                opt.pre_context = num;
 597                                break;
 598                        }
 599                        continue;
 600                }
 601                if (!strcmp("-e", arg)) {
 602                        if (1 < argc) {
 603                                add_pattern(&opt, argv[1]);
 604                                argv++;
 605                                argc--;
 606                                continue;
 607                        }
 608                        usage(builtin_grep_usage);
 609                }
 610                if (!strcmp("--", arg)) {
 611                        no_more_flags = 1;
 612                        continue;
 613                }
 614                /* Either unrecognized option or a single pattern */
 615                if (!no_more_flags && *arg == '-')
 616                        usage(builtin_grep_usage);
 617                if (!opt.pattern_list) {
 618                        add_pattern(&opt, arg);
 619                        break;
 620                }
 621                else {
 622                        /* We are looking at the first path or rev;
 623                         * it is found at argv[0] after leaving the
 624                         * loop.
 625                         */
 626                        argc++; argv--;
 627                        break;
 628                }
 629        }
 630        if (!opt.pattern_list)
 631                die("no pattern given.");
 632        compile_patterns(&opt);
 633        tail = &object_list;
 634        while (1 < argc) {
 635                struct object *object;
 636                struct object_list *elem;
 637                const char *arg = argv[1];
 638                unsigned char sha1[20];
 639                if (get_sha1(arg, sha1) < 0)
 640                        break;
 641                object = parse_object(sha1);
 642                if (!object)
 643                        die("bad object %s", arg);
 644                elem = object_list_insert(object, tail);
 645                elem->name = arg;
 646                tail = &elem->next;
 647                argc--; argv++;
 648        }
 649        if (1 < argc)
 650                paths = get_pathspec(prefix, argv + 1);
 651        else if (prefix) {
 652                paths = xcalloc(2, sizeof(const char *));
 653                paths[0] = prefix;
 654                paths[1] = NULL;
 655        }
 656
 657        if (!object_list)
 658                return !grep_cache(&opt, paths, cached);
 659        /*
 660         * Do not walk "grep -e foo master next pu -- Documentation/"
 661         * but do walk "grep -e foo master..next -- Documentation/".
 662         * Ranged request mixed with a blob or tree object, like
 663         * "grep -e foo v1.0.0:Documentation/ master..next"
 664         * so detect that and complain.
 665         */
 666        for (list = object_list; list; list = list->next) {
 667                struct object *real_obj;
 668                real_obj = deref_tag(list->item, NULL, 0);
 669                if (strcmp(real_obj->type, commit_type))
 670                        seen_noncommit = 1;
 671        }
 672        if (cached)
 673                die("both --cached and revisions given.");
 674
 675        for (list = object_list; list; list = list->next) {
 676                struct object *real_obj;
 677                real_obj = deref_tag(list->item, NULL, 0);
 678                if (grep_object(&opt, paths, real_obj, list->name))
 679                        hit = 1;
 680        }
 681        return !hit;
 682}