builtin-grep.con commit Merge branch 'jc/xsha1' into next (7dd0d0b)
   1/*
   2 * Builtin "git grep"
   3 *
   4 * Copyright (c) 2006 Junio C Hamano
   5 */
   6#include "cache.h"
   7#include "blob.h"
   8#include "tree.h"
   9#include "commit.h"
  10#include "tag.h"
  11#include "tree-walk.h"
  12#include "builtin.h"
  13#include <regex.h>
  14#include <fnmatch.h>
  15
  16/*
  17 * git grep pathspecs are somewhat different from diff-tree pathspecs;
  18 * pathname wildcards are allowed.
  19 */
  20static int pathspec_matches(const char **paths, const char *name)
  21{
  22        int namelen, i;
  23        if (!paths || !*paths)
  24                return 1;
  25        namelen = strlen(name);
  26        for (i = 0; paths[i]; i++) {
  27                const char *match = paths[i];
  28                int matchlen = strlen(match);
  29                const char *cp, *meta;
  30
  31                if ((matchlen <= namelen) &&
  32                    !strncmp(name, match, matchlen) &&
  33                    (match[matchlen-1] == '/' ||
  34                     name[matchlen] == '\0' || name[matchlen] == '/'))
  35                        return 1;
  36                if (!fnmatch(match, name, 0))
  37                        return 1;
  38                if (name[namelen-1] != '/')
  39                        continue;
  40
  41                /* We are being asked if the directory ("name") is worth
  42                 * descending into.
  43                 *
  44                 * Find the longest leading directory name that does
  45                 * not have metacharacter in the pathspec; the name
  46                 * we are looking at must overlap with that directory.
  47                 */
  48                for (cp = match, meta = NULL; cp - match < matchlen; cp++) {
  49                        char ch = *cp;
  50                        if (ch == '*' || ch == '[' || ch == '?') {
  51                                meta = cp;
  52                                break;
  53                        }
  54                }
  55                if (!meta)
  56                        meta = cp; /* fully literal */
  57
  58                if (namelen <= meta - match) {
  59                        /* Looking at "Documentation/" and
  60                         * the pattern says "Documentation/howto/", or
  61                         * "Documentation/diff*.txt".  The name we
  62                         * have should match prefix.
  63                         */
  64                        if (!memcmp(match, name, namelen))
  65                                return 1;
  66                        continue;
  67                }
  68
  69                if (meta - match < namelen) {
  70                        /* Looking at "Documentation/howto/" and
  71                         * the pattern says "Documentation/h*";
  72                         * match up to "Do.../h"; this avoids descending
  73                         * into "Documentation/technical/".
  74                         */
  75                        if (!memcmp(match, name, meta - match))
  76                                return 1;
  77                        continue;
  78                }
  79        }
  80        return 0;
  81}
  82
  83struct grep_pat {
  84        struct grep_pat *next;
  85        const char *origin;
  86        int no;
  87        const char *pattern;
  88        regex_t regexp;
  89};
  90
  91struct grep_opt {
  92        struct grep_pat *pattern_list;
  93        struct grep_pat **pattern_tail;
  94        regex_t regexp;
  95        unsigned linenum:1;
  96        unsigned invert:1;
  97        unsigned name_only:1;
  98        unsigned unmatch_name_only:1;
  99        unsigned count:1;
 100        unsigned word_regexp:1;
 101#define GREP_BINARY_DEFAULT     0
 102#define GREP_BINARY_NOMATCH     1
 103#define GREP_BINARY_TEXT        2
 104        unsigned binary:2;
 105        int regflags;
 106        unsigned pre_context;
 107        unsigned post_context;
 108};
 109
 110static void add_pattern(struct grep_opt *opt, const char *pat,
 111                        const char *origin, int no)
 112{
 113        struct grep_pat *p = xcalloc(1, sizeof(*p));
 114        p->pattern = pat;
 115        p->origin = origin;
 116        p->no = no;
 117        *opt->pattern_tail = p;
 118        opt->pattern_tail = &p->next;
 119        p->next = NULL;
 120}
 121
 122static void compile_patterns(struct grep_opt *opt)
 123{
 124        struct grep_pat *p;
 125        for (p = opt->pattern_list; p; p = p->next) {
 126                int err = regcomp(&p->regexp, p->pattern, opt->regflags);
 127                if (err) {
 128                        char errbuf[1024];
 129                        char where[1024];
 130                        if (p->no)
 131                                sprintf(where, "In '%s' at %d, ",
 132                                        p->origin, p->no);
 133                        else if (p->origin)
 134                                sprintf(where, "%s, ", p->origin);
 135                        else
 136                                where[0] = 0;
 137                        regerror(err, &p->regexp, errbuf, 1024);
 138                        regfree(&p->regexp);
 139                        die("%s'%s': %s", where, p->pattern, errbuf);
 140                }
 141        }
 142}
 143
 144static char *end_of_line(char *cp, unsigned long *left)
 145{
 146        unsigned long l = *left;
 147        while (l && *cp != '\n') {
 148                l--;
 149                cp++;
 150        }
 151        *left = l;
 152        return cp;
 153}
 154
 155static int word_char(char ch)
 156{
 157        return isalnum(ch) || ch == '_';
 158}
 159
 160static void show_line(struct grep_opt *opt, const char *bol, const char *eol,
 161                      const char *name, unsigned lno, char sign)
 162{
 163        printf("%s%c", name, sign);
 164        if (opt->linenum)
 165                printf("%d%c", lno, sign);
 166        printf("%.*s\n", (int)(eol-bol), bol);
 167}
 168
 169/*
 170 * NEEDSWORK: share code with diff.c
 171 */
 172#define FIRST_FEW_BYTES 8000
 173static int buffer_is_binary(const char *ptr, unsigned long size)
 174{
 175        if (FIRST_FEW_BYTES < size)
 176                size = FIRST_FEW_BYTES;
 177        if (memchr(ptr, 0, size))
 178                return 1;
 179        return 0;
 180}
 181
 182static int grep_buffer(struct grep_opt *opt, const char *name,
 183                       char *buf, unsigned long size)
 184{
 185        char *bol = buf;
 186        unsigned long left = size;
 187        unsigned lno = 1;
 188        struct pre_context_line {
 189                char *bol;
 190                char *eol;
 191        } *prev = NULL, *pcl;
 192        unsigned last_hit = 0;
 193        unsigned last_shown = 0;
 194        int binary_match_only = 0;
 195        const char *hunk_mark = "";
 196        unsigned count = 0;
 197
 198        if (buffer_is_binary(buf, size)) {
 199                switch (opt->binary) {
 200                case GREP_BINARY_DEFAULT:
 201                        binary_match_only = 1;
 202                        break;
 203                case GREP_BINARY_NOMATCH:
 204                        return 0; /* Assume unmatch */
 205                        break;
 206                default:
 207                        break;
 208                }
 209        }
 210
 211        if (opt->pre_context)
 212                prev = xcalloc(opt->pre_context, sizeof(*prev));
 213        if (opt->pre_context || opt->post_context)
 214                hunk_mark = "--\n";
 215
 216        while (left) {
 217                regmatch_t pmatch[10];
 218                char *eol, ch;
 219                int hit = 0;
 220                struct grep_pat *p;
 221
 222                eol = end_of_line(bol, &left);
 223                ch = *eol;
 224                *eol = 0;
 225
 226                for (p = opt->pattern_list; p; p = p->next) {
 227                        regex_t *exp = &p->regexp;
 228                        hit = !regexec(exp, bol, ARRAY_SIZE(pmatch),
 229                                       pmatch, 0);
 230
 231                        if (hit && opt->word_regexp) {
 232                                /* Match beginning must be either
 233                                 * beginning of the line, or at word
 234                                 * boundary (i.e. the last char must
 235                                 * not be alnum or underscore).
 236                                 */
 237                                if ((pmatch[0].rm_so < 0) ||
 238                                    (eol - bol) <= pmatch[0].rm_so ||
 239                                    (pmatch[0].rm_eo < 0) ||
 240                                    (eol - bol) < pmatch[0].rm_eo)
 241                                        die("regexp returned nonsense");
 242                                if (pmatch[0].rm_so != 0 &&
 243                                    word_char(bol[pmatch[0].rm_so-1]))
 244                                        continue; /* not a word boundary */
 245                                if ((eol-bol) < pmatch[0].rm_eo &&
 246                                    word_char(bol[pmatch[0].rm_eo]))
 247                                        continue; /* not a word boundary */
 248                        }
 249                        if (hit)
 250                                break;
 251                }
 252                /* "grep -v -e foo -e bla" should list lines
 253                 * that do not have either, so inversion should
 254                 * be done outside.
 255                 */
 256                if (opt->invert)
 257                        hit = !hit;
 258                if (opt->unmatch_name_only) {
 259                        if (hit)
 260                                return 0;
 261                        goto next_line;
 262                }
 263                if (hit) {
 264                        count++;
 265                        if (binary_match_only) {
 266                                printf("Binary file %s matches\n", name);
 267                                return 1;
 268                        }
 269                        if (opt->name_only) {
 270                                printf("%s\n", name);
 271                                return 1;
 272                        }
 273                        /* Hit at this line.  If we haven't shown the
 274                         * pre-context lines, we would need to show them.
 275                         * When asked to do "count", this still show
 276                         * the context which is nonsense, but the user
 277                         * deserves to get that ;-).
 278                         */
 279                        if (opt->pre_context) {
 280                                unsigned from;
 281                                if (opt->pre_context < lno)
 282                                        from = lno - opt->pre_context;
 283                                else
 284                                        from = 1;
 285                                if (from <= last_shown)
 286                                        from = last_shown + 1;
 287                                if (last_shown && from != last_shown + 1)
 288                                        printf(hunk_mark);
 289                                while (from < lno) {
 290                                        pcl = &prev[lno-from-1];
 291                                        show_line(opt, pcl->bol, pcl->eol,
 292                                                  name, from, '-');
 293                                        from++;
 294                                }
 295                                last_shown = lno-1;
 296                        }
 297                        if (last_shown && lno != last_shown + 1)
 298                                printf(hunk_mark);
 299                        if (!opt->count)
 300                                show_line(opt, bol, eol, name, lno, ':');
 301                        last_shown = last_hit = lno;
 302                }
 303                else if (last_hit &&
 304                         lno <= last_hit + opt->post_context) {
 305                        /* If the last hit is within the post context,
 306                         * we need to show this line.
 307                         */
 308                        if (last_shown && lno != last_shown + 1)
 309                                printf(hunk_mark);
 310                        show_line(opt, bol, eol, name, lno, '-');
 311                        last_shown = lno;
 312                }
 313                if (opt->pre_context) {
 314                        memmove(prev+1, prev,
 315                                (opt->pre_context-1) * sizeof(*prev));
 316                        prev->bol = bol;
 317                        prev->eol = eol;
 318                }
 319
 320        next_line:
 321                *eol = ch;
 322                bol = eol + 1;
 323                if (!left)
 324                        break;
 325                left--;
 326                lno++;
 327        }
 328
 329        if (opt->unmatch_name_only) {
 330                /* We did not see any hit, so we want to show this */
 331                printf("%s\n", name);
 332                return 1;
 333        }
 334
 335        /* NEEDSWORK:
 336         * The real "grep -c foo *.c" gives many "bar.c:0" lines,
 337         * which feels mostly useless but sometimes useful.  Maybe
 338         * make it another option?  For now suppress them.
 339         */
 340        if (opt->count && count)
 341                printf("%s:%u\n", name, count);
 342        return !!last_hit;
 343}
 344
 345static int grep_sha1(struct grep_opt *opt, const unsigned char *sha1, const char *name)
 346{
 347        unsigned long size;
 348        char *data;
 349        char type[20];
 350        int hit;
 351        data = read_sha1_file(sha1, type, &size);
 352        if (!data) {
 353                error("'%s': unable to read %s", name, sha1_to_hex(sha1));
 354                return 0;
 355        }
 356        hit = grep_buffer(opt, name, data, size);
 357        free(data);
 358        return hit;
 359}
 360
 361static int grep_file(struct grep_opt *opt, const char *filename)
 362{
 363        struct stat st;
 364        int i;
 365        char *data;
 366        if (lstat(filename, &st) < 0) {
 367        err_ret:
 368                if (errno != ENOENT)
 369                        error("'%s': %s", filename, strerror(errno));
 370                return 0;
 371        }
 372        if (!st.st_size)
 373                return 0; /* empty file -- no grep hit */
 374        if (!S_ISREG(st.st_mode))
 375                return 0;
 376        i = open(filename, O_RDONLY);
 377        if (i < 0)
 378                goto err_ret;
 379        data = xmalloc(st.st_size + 1);
 380        if (st.st_size != xread(i, data, st.st_size)) {
 381                error("'%s': short read %s", filename, strerror(errno));
 382                close(i);
 383                free(data);
 384                return 0;
 385        }
 386        close(i);
 387        i = grep_buffer(opt, filename, data, st.st_size);
 388        free(data);
 389        return i;
 390}
 391
 392static int grep_cache(struct grep_opt *opt, const char **paths, int cached)
 393{
 394        int hit = 0;
 395        int nr;
 396        read_cache();
 397
 398        for (nr = 0; nr < active_nr; nr++) {
 399                struct cache_entry *ce = active_cache[nr];
 400                if (ce_stage(ce) || !S_ISREG(ntohl(ce->ce_mode)))
 401                        continue;
 402                if (!pathspec_matches(paths, ce->name))
 403                        continue;
 404                if (cached)
 405                        hit |= grep_sha1(opt, ce->sha1, ce->name);
 406                else
 407                        hit |= grep_file(opt, ce->name);
 408        }
 409        return hit;
 410}
 411
 412static int grep_tree(struct grep_opt *opt, const char **paths,
 413                     struct tree_desc *tree,
 414                     const char *tree_name, const char *base)
 415{
 416        unsigned mode;
 417        int len;
 418        int hit = 0;
 419        const char *path;
 420        const unsigned char *sha1;
 421        char *down;
 422        char *path_buf = xmalloc(PATH_MAX + strlen(tree_name) + 100);
 423
 424        if (tree_name[0]) {
 425                int offset = sprintf(path_buf, "%s:", tree_name);
 426                down = path_buf + offset;
 427                strcat(down, base);
 428        }
 429        else {
 430                down = path_buf;
 431                strcpy(down, base);
 432        }
 433        len = strlen(path_buf);
 434
 435        while (tree->size) {
 436                int pathlen;
 437                sha1 = tree_entry_extract(tree, &path, &mode);
 438                pathlen = strlen(path);
 439                strcpy(path_buf + len, path);
 440
 441                if (S_ISDIR(mode))
 442                        /* Match "abc/" against pathspec to
 443                         * decide if we want to descend into "abc"
 444                         * directory.
 445                         */
 446                        strcpy(path_buf + len + pathlen, "/");
 447
 448                if (!pathspec_matches(paths, down))
 449                        ;
 450                else if (S_ISREG(mode))
 451                        hit |= grep_sha1(opt, sha1, path_buf);
 452                else if (S_ISDIR(mode)) {
 453                        char type[20];
 454                        struct tree_desc sub;
 455                        void *data;
 456                        data = read_sha1_file(sha1, type, &sub.size);
 457                        if (!data)
 458                                die("unable to read tree (%s)",
 459                                    sha1_to_hex(sha1));
 460                        sub.buf = data;
 461                        hit |= grep_tree(opt, paths, &sub, tree_name, down);
 462                        free(data);
 463                }
 464                update_tree_entry(tree);
 465        }
 466        return hit;
 467}
 468
 469static int grep_object(struct grep_opt *opt, const char **paths,
 470                       struct object *obj, const char *name)
 471{
 472        if (!strcmp(obj->type, blob_type))
 473                return grep_sha1(opt, obj->sha1, name);
 474        if (!strcmp(obj->type, commit_type) ||
 475            !strcmp(obj->type, tree_type)) {
 476                struct tree_desc tree;
 477                void *data;
 478                int hit;
 479                data = read_object_with_reference(obj->sha1, tree_type,
 480                                                  &tree.size, NULL);
 481                if (!data)
 482                        die("unable to read tree (%s)", sha1_to_hex(obj->sha1));
 483                tree.buf = data;
 484                hit = grep_tree(opt, paths, &tree, name, "");
 485                free(data);
 486                return hit;
 487        }
 488        die("unable to grep from object of type %s", obj->type);
 489}
 490
 491static const char builtin_grep_usage[] =
 492"git-grep <option>* <rev>* [-e] <pattern> [<path>...]";
 493
 494int cmd_grep(int argc, const char **argv, char **envp)
 495{
 496        int hit = 0;
 497        int no_more_flags = 0;
 498        int cached = 0;
 499        struct grep_opt opt;
 500        struct object_list *list, **tail, *object_list = NULL;
 501        const char *prefix = setup_git_directory();
 502        const char **paths = NULL;
 503
 504        memset(&opt, 0, sizeof(opt));
 505        opt.pattern_tail = &opt.pattern_list;
 506        opt.regflags = REG_NEWLINE;
 507
 508        /*
 509         * No point using rev_info, really.
 510         */
 511        while (1 < argc) {
 512                const char *arg = argv[1];
 513                argc--; argv++;
 514                if (!strcmp("--cached", arg)) {
 515                        cached = 1;
 516                        continue;
 517                }
 518                if (!strcmp("-a", arg) ||
 519                    !strcmp("--text", arg)) {
 520                        opt.binary = GREP_BINARY_TEXT;
 521                        continue;
 522                }
 523                if (!strcmp("-i", arg) ||
 524                    !strcmp("--ignore-case", arg)) {
 525                        opt.regflags |= REG_ICASE;
 526                        continue;
 527                }
 528                if (!strcmp("-I", arg)) {
 529                        opt.binary = GREP_BINARY_NOMATCH;
 530                        continue;
 531                }
 532                if (!strcmp("-v", arg) ||
 533                    !strcmp("--invert-match", arg)) {
 534                        opt.invert = 1;
 535                        continue;
 536                }
 537                if (!strcmp("-E", arg) ||
 538                    !strcmp("--extended-regexp", arg)) {
 539                        opt.regflags |= REG_EXTENDED;
 540                        continue;
 541                }
 542                if (!strcmp("-G", arg) ||
 543                    !strcmp("--basic-regexp", arg)) {
 544                        opt.regflags &= ~REG_EXTENDED;
 545                        continue;
 546                }
 547                if (!strcmp("-n", arg)) {
 548                        opt.linenum = 1;
 549                        continue;
 550                }
 551                if (!strcmp("-H", arg)) {
 552                        /* We always show the pathname, so this
 553                         * is a noop.
 554                         */
 555                        continue;
 556                }
 557                if (!strcmp("-l", arg) ||
 558                    !strcmp("--files-with-matches", arg)) {
 559                        opt.name_only = 1;
 560                        continue;
 561                }
 562                if (!strcmp("-L", arg) ||
 563                    !strcmp("--files-without-match", arg)) {
 564                        opt.unmatch_name_only = 1;
 565                        continue;
 566                }
 567                if (!strcmp("-c", arg) ||
 568                    !strcmp("--count", arg)) {
 569                        opt.count = 1;
 570                        continue;
 571                }
 572                if (!strcmp("-w", arg) ||
 573                    !strcmp("--word-regexp", arg)) {
 574                        opt.word_regexp = 1;
 575                        continue;
 576                }
 577                if (!strncmp("-A", arg, 2) ||
 578                    !strncmp("-B", arg, 2) ||
 579                    !strncmp("-C", arg, 2) ||
 580                    (arg[0] == '-' && '1' <= arg[1] && arg[1] <= '9')) {
 581                        unsigned num;
 582                        const char *scan;
 583                        switch (arg[1]) {
 584                        case 'A': case 'B': case 'C':
 585                                if (!arg[2]) {
 586                                        if (argc <= 1)
 587                                                usage(builtin_grep_usage);
 588                                        scan = *++argv;
 589                                        argc--;
 590                                }
 591                                else
 592                                        scan = arg + 2;
 593                                break;
 594                        default:
 595                                scan = arg + 1;
 596                                break;
 597                        }
 598                        if (sscanf(scan, "%u", &num) != 1)
 599                                usage(builtin_grep_usage);
 600                        switch (arg[1]) {
 601                        case 'A':
 602                                opt.post_context = num;
 603                                break;
 604                        default:
 605                        case 'C':
 606                                opt.post_context = num;
 607                        case 'B':
 608                                opt.pre_context = num;
 609                                break;
 610                        }
 611                        continue;
 612                }
 613                if (!strcmp("-f", arg)) {
 614                        FILE *patterns;
 615                        int lno = 0;
 616                        char buf[1024];
 617                        if (argc <= 1)
 618                                usage(builtin_grep_usage);
 619                        patterns = fopen(argv[1], "r");
 620                        if (!patterns)
 621                                die("'%s': %s", strerror(errno));
 622                        while (fgets(buf, sizeof(buf), patterns)) {
 623                                int len = strlen(buf);
 624                                if (buf[len-1] == '\n')
 625                                        buf[len-1] = 0;
 626                                /* ignore empty line like grep does */
 627                                if (!buf[0])
 628                                        continue;
 629                                add_pattern(&opt, strdup(buf), argv[1], ++lno);
 630                        }
 631                        fclose(patterns);
 632                        argv++;
 633                        argc--;
 634                        continue;
 635                }
 636                if (!strcmp("-e", arg)) {
 637                        if (1 < argc) {
 638                                add_pattern(&opt, argv[1], "-e option", 0);
 639                                argv++;
 640                                argc--;
 641                                continue;
 642                        }
 643                        usage(builtin_grep_usage);
 644                }
 645                if (!strcmp("--", arg)) {
 646                        no_more_flags = 1;
 647                        continue;
 648                }
 649                /* Either unrecognized option or a single pattern */
 650                if (!no_more_flags && *arg == '-')
 651                        usage(builtin_grep_usage);
 652                if (!opt.pattern_list) {
 653                        add_pattern(&opt, arg, "command line", 0);
 654                        break;
 655                }
 656                else {
 657                        /* We are looking at the first path or rev;
 658                         * it is found at argv[0] after leaving the
 659                         * loop.
 660                         */
 661                        argc++; argv--;
 662                        break;
 663                }
 664        }
 665        if (!opt.pattern_list)
 666                die("no pattern given.");
 667        compile_patterns(&opt);
 668        tail = &object_list;
 669        while (1 < argc) {
 670                struct object *object;
 671                struct object_list *elem;
 672                const char *arg = argv[1];
 673                unsigned char sha1[20];
 674                if (get_sha1(arg, sha1) < 0)
 675                        break;
 676                object = parse_object(sha1);
 677                if (!object)
 678                        die("bad object %s", arg);
 679                elem = object_list_insert(object, tail);
 680                elem->name = arg;
 681                tail = &elem->next;
 682                argc--; argv++;
 683        }
 684        if (1 < argc)
 685                paths = get_pathspec(prefix, argv + 1);
 686        else if (prefix) {
 687                paths = xcalloc(2, sizeof(const char *));
 688                paths[0] = prefix;
 689                paths[1] = NULL;
 690        }
 691
 692        if (!object_list)
 693                return !grep_cache(&opt, paths, cached);
 694
 695        if (cached)
 696                die("both --cached and trees are given.");
 697
 698        for (list = object_list; list; list = list->next) {
 699                struct object *real_obj;
 700                real_obj = deref_tag(list->item, NULL, 0);
 701                if (grep_object(&opt, paths, real_obj, list->name))
 702                        hit = 1;
 703        }
 704        return !hit;
 705}