convert.con commit Merge branch 'fg/autocrlf' (d249515)
   1#include "cache.h"
   2#include "attr.h"
   3#include "run-command.h"
   4
   5/*
   6 * convert.c - convert a file when checking it out and checking it in.
   7 *
   8 * This should use the pathname to decide on whether it wants to do some
   9 * more interesting conversions (automatic gzip/unzip, general format
  10 * conversions etc etc), but by default it just does automatic CRLF<->LF
  11 * translation when the "auto_crlf" option is set.
  12 */
  13
/*
 * Per-path line-ending conversion modes.  git_path_check_crlf() maps the
 * value of the "crlf" attribute to one of these.
 */
/* attribute unset or unrecognized: decide by looking at the content */
#define CRLF_GUESS      (-1)
/* attribute set to false: crlf_to_git()/crlf_to_worktree() never convert */
#define CRLF_BINARY     0
/* attribute set to true: convert without the GUESS-only safety checks */
#define CRLF_TEXT       1
/* attribute is "input": CRLF->LF towards git, nothing on checkout */
#define CRLF_INPUT      2
  18
  19struct text_stat {
  20        /* NUL, CR, LF and CRLF counts */
  21        unsigned nul, cr, lf, crlf;
  22
  23        /* These are just approximations! */
  24        unsigned printable, nonprintable;
  25};
  26
  27static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats)
  28{
  29        unsigned long i;
  30
  31        memset(stats, 0, sizeof(*stats));
  32
  33        for (i = 0; i < size; i++) {
  34                unsigned char c = buf[i];
  35                if (c == '\r') {
  36                        stats->cr++;
  37                        if (i+1 < size && buf[i+1] == '\n')
  38                                stats->crlf++;
  39                        continue;
  40                }
  41                if (c == '\n') {
  42                        stats->lf++;
  43                        continue;
  44                }
  45                if (c == 127)
  46                        /* DEL */
  47                        stats->nonprintable++;
  48                else if (c < 32) {
  49                        switch (c) {
  50                                /* BS, HT, ESC and FF */
  51                        case '\b': case '\t': case '\033': case '\014':
  52                                stats->printable++;
  53                                break;
  54                        case 0:
  55                                stats->nul++;
  56                                /* fall through */
  57                        default:
  58                                stats->nonprintable++;
  59                        }
  60                }
  61                else
  62                        stats->printable++;
  63        }
  64
  65        /* If file ends with EOF then don't count this EOF as non-printable. */
  66        if (size >= 1 && buf[size-1] == '\032')
  67                stats->nonprintable--;
  68}
  69
  70/*
  71 * The same heuristics as diff.c::mmfile_is_binary()
  72 */
  73static int is_binary(unsigned long size, struct text_stat *stats)
  74{
  75
  76        if (stats->nul)
  77                return 1;
  78        if ((stats->printable >> 7) < stats->nonprintable)
  79                return 1;
  80        /*
  81         * Other heuristics? Average line length might be relevant,
  82         * as might LF vs CR vs CRLF counts..
  83         *
  84         * NOTE! It might be normal to have a low ratio of CRLF to LF
  85         * (somebody starts with a LF-only file and edits it with an editor
  86         * that adds CRLF only to lines that are added..). But do  we
  87         * want to support CR-only? Probably not.
  88         */
  89        return 0;
  90}
  91
  92static void check_safe_crlf(const char *path, int action,
  93                            struct text_stat *stats, enum safe_crlf checksafe)
  94{
  95        if (!checksafe)
  96                return;
  97
  98        if (action == CRLF_INPUT || auto_crlf <= 0) {
  99                /*
 100                 * CRLFs would not be restored by checkout:
 101                 * check if we'd remove CRLFs
 102                 */
 103                if (stats->crlf) {
 104                        if (checksafe == SAFE_CRLF_WARN)
 105                                warning("CRLF will be replaced by LF in %s.", path);
 106                        else /* i.e. SAFE_CRLF_FAIL */
 107                                die("CRLF would be replaced by LF in %s.", path);
 108                }
 109        } else if (auto_crlf > 0) {
 110                /*
 111                 * CRLFs would be added by checkout:
 112                 * check if we have "naked" LFs
 113                 */
 114                if (stats->lf != stats->crlf) {
 115                        if (checksafe == SAFE_CRLF_WARN)
 116                                warning("LF will be replaced by CRLF in %s", path);
 117                        else /* i.e. SAFE_CRLF_FAIL */
 118                                die("LF would be replaced by CRLF in %s", path);
 119                }
 120        }
 121}
 122
 123static int has_cr_in_index(const char *path)
 124{
 125        int pos, len;
 126        unsigned long sz;
 127        enum object_type type;
 128        void *data;
 129        int has_cr;
 130        struct index_state *istate = &the_index;
 131
 132        len = strlen(path);
 133        pos = index_name_pos(istate, path, len);
 134        if (pos < 0) {
 135                /*
 136                 * We might be in the middle of a merge, in which
 137                 * case we would read stage #2 (ours).
 138                 */
 139                int i;
 140                for (i = -pos - 1;
 141                     (pos < 0 && i < istate->cache_nr &&
 142                      !strcmp(istate->cache[i]->name, path));
 143                     i++)
 144                        if (ce_stage(istate->cache[i]) == 2)
 145                                pos = i;
 146        }
 147        if (pos < 0)
 148                return 0;
 149        data = read_sha1_file(istate->cache[pos]->sha1, &type, &sz);
 150        if (!data || type != OBJ_BLOB) {
 151                free(data);
 152                return 0;
 153        }
 154
 155        has_cr = memchr(data, '\r', sz) != NULL;
 156        free(data);
 157        return has_cr;
 158}
 159
 160static int crlf_to_git(const char *path, const char *src, size_t len,
 161                       struct strbuf *buf, int action, enum safe_crlf checksafe)
 162{
 163        struct text_stat stats;
 164        char *dst;
 165
 166        if ((action == CRLF_BINARY) || !auto_crlf || !len)
 167                return 0;
 168
 169        gather_stats(src, len, &stats);
 170
 171        if (action == CRLF_GUESS) {
 172                /*
 173                 * We're currently not going to even try to convert stuff
 174                 * that has bare CR characters. Does anybody do that crazy
 175                 * stuff?
 176                 */
 177                if (stats.cr != stats.crlf)
 178                        return 0;
 179
 180                /*
 181                 * And add some heuristics for binary vs text, of course...
 182                 */
 183                if (is_binary(len, &stats))
 184                        return 0;
 185
 186                /*
 187                 * If the file in the index has any CR in it, do not convert.
 188                 * This is the new safer autocrlf handling.
 189                 */
 190                if (has_cr_in_index(path))
 191                        return 0;
 192        }
 193
 194        check_safe_crlf(path, action, &stats, checksafe);
 195
 196        /* Optimization: No CR? Nothing to convert, regardless. */
 197        if (!stats.cr)
 198                return 0;
 199
 200        /* only grow if not in place */
 201        if (strbuf_avail(buf) + buf->len < len)
 202                strbuf_grow(buf, len - buf->len);
 203        dst = buf->buf;
 204        if (action == CRLF_GUESS) {
 205                /*
 206                 * If we guessed, we already know we rejected a file with
 207                 * lone CR, and we can strip a CR without looking at what
 208                 * follow it.
 209                 */
 210                do {
 211                        unsigned char c = *src++;
 212                        if (c != '\r')
 213                                *dst++ = c;
 214                } while (--len);
 215        } else {
 216                do {
 217                        unsigned char c = *src++;
 218                        if (! (c == '\r' && (1 < len && *src == '\n')))
 219                                *dst++ = c;
 220                } while (--len);
 221        }
 222        strbuf_setlen(buf, dst - buf->buf);
 223        return 1;
 224}
 225
 226static int crlf_to_worktree(const char *path, const char *src, size_t len,
 227                            struct strbuf *buf, int action)
 228{
 229        char *to_free = NULL;
 230        struct text_stat stats;
 231
 232        if ((action == CRLF_BINARY) || (action == CRLF_INPUT) ||
 233            auto_crlf <= 0)
 234                return 0;
 235
 236        if (!len)
 237                return 0;
 238
 239        gather_stats(src, len, &stats);
 240
 241        /* No LF? Nothing to convert, regardless. */
 242        if (!stats.lf)
 243                return 0;
 244
 245        /* Was it already in CRLF format? */
 246        if (stats.lf == stats.crlf)
 247                return 0;
 248
 249        if (action == CRLF_GUESS) {
 250                /* If we have any CR or CRLF line endings, we do not touch it */
 251                /* This is the new safer autocrlf-handling */
 252                if (stats.cr > 0 || stats.crlf > 0)
 253                        return 0;
 254
 255                /* If we have any bare CR characters, we're not going to touch it */
 256                if (stats.cr != stats.crlf)
 257                        return 0;
 258
 259                if (is_binary(len, &stats))
 260                        return 0;
 261        }
 262
 263        /* are we "faking" in place editing ? */
 264        if (src == buf->buf)
 265                to_free = strbuf_detach(buf, NULL);
 266
 267        strbuf_grow(buf, len + stats.lf - stats.crlf);
 268        for (;;) {
 269                const char *nl = memchr(src, '\n', len);
 270                if (!nl)
 271                        break;
 272                if (nl > src && nl[-1] == '\r') {
 273                        strbuf_add(buf, src, nl + 1 - src);
 274                } else {
 275                        strbuf_add(buf, src, nl - src);
 276                        strbuf_addstr(buf, "\r\n");
 277                }
 278                len -= nl + 1 - src;
 279                src  = nl + 1;
 280        }
 281        strbuf_add(buf, src, len);
 282
 283        free(to_free);
 284        return 1;
 285}
 286
/*
 * Arguments handed from apply_filter() to filter_buffer() through
 * the async.data pointer.
 */
struct filter_params {
        /* buffer written to the filter command's standard input */
        const char *src;
        /* number of bytes of src to write */
        unsigned long size;
        /* command line of the external filter */
        const char *cmd;
};
 292
 293static int filter_buffer(int in, int out, void *data)
 294{
 295        /*
 296         * Spawn cmd and feed the buffer contents through its stdin.
 297         */
 298        struct child_process child_process;
 299        struct filter_params *params = (struct filter_params *)data;
 300        int write_err, status;
 301        const char *argv[] = { NULL, NULL };
 302
 303        argv[0] = params->cmd;
 304
 305        memset(&child_process, 0, sizeof(child_process));
 306        child_process.argv = argv;
 307        child_process.use_shell = 1;
 308        child_process.in = -1;
 309        child_process.out = out;
 310
 311        if (start_command(&child_process))
 312                return error("cannot fork to run external filter %s", params->cmd);
 313
 314        write_err = (write_in_full(child_process.in, params->src, params->size) < 0);
 315        if (close(child_process.in))
 316                write_err = 1;
 317        if (write_err)
 318                error("cannot feed the input to external filter %s", params->cmd);
 319
 320        status = finish_command(&child_process);
 321        if (status)
 322                error("external filter %s failed %d", params->cmd, status);
 323        return (write_err || status);
 324}
 325
 326static int apply_filter(const char *path, const char *src, size_t len,
 327                        struct strbuf *dst, const char *cmd)
 328{
 329        /*
 330         * Create a pipeline to have the command filter the buffer's
 331         * contents.
 332         *
 333         * (child --> cmd) --> us
 334         */
 335        int ret = 1;
 336        struct strbuf nbuf = STRBUF_INIT;
 337        struct async async;
 338        struct filter_params params;
 339
 340        if (!cmd)
 341                return 0;
 342
 343        memset(&async, 0, sizeof(async));
 344        async.proc = filter_buffer;
 345        async.data = &params;
 346        async.out = -1;
 347        params.src = src;
 348        params.size = len;
 349        params.cmd = cmd;
 350
 351        fflush(NULL);
 352        if (start_async(&async))
 353                return 0;       /* error was already reported */
 354
 355        if (strbuf_read(&nbuf, async.out, len) < 0) {
 356                error("read from external filter %s failed", cmd);
 357                ret = 0;
 358        }
 359        if (close(async.out)) {
 360                error("read from external filter %s failed", cmd);
 361                ret = 0;
 362        }
 363        if (finish_async(&async)) {
 364                error("external filter %s failed", cmd);
 365                ret = 0;
 366        }
 367
 368        if (ret) {
 369                strbuf_swap(dst, &nbuf);
 370        }
 371        strbuf_release(&nbuf);
 372        return ret;
 373}
 374
 375static struct convert_driver {
 376        const char *name;
 377        struct convert_driver *next;
 378        const char *smudge;
 379        const char *clean;
 380} *user_convert, **user_convert_tail;
 381
 382static int read_convert_config(const char *var, const char *value, void *cb)
 383{
 384        const char *ep, *name;
 385        int namelen;
 386        struct convert_driver *drv;
 387
 388        /*
 389         * External conversion drivers are configured using
 390         * "filter.<name>.variable".
 391         */
 392        if (prefixcmp(var, "filter.") || (ep = strrchr(var, '.')) == var + 6)
 393                return 0;
 394        name = var + 7;
 395        namelen = ep - name;
 396        for (drv = user_convert; drv; drv = drv->next)
 397                if (!strncmp(drv->name, name, namelen) && !drv->name[namelen])
 398                        break;
 399        if (!drv) {
 400                drv = xcalloc(1, sizeof(struct convert_driver));
 401                drv->name = xmemdupz(name, namelen);
 402                *user_convert_tail = drv;
 403                user_convert_tail = &(drv->next);
 404        }
 405
 406        ep++;
 407
 408        /*
 409         * filter.<name>.smudge and filter.<name>.clean specifies
 410         * the command line:
 411         *
 412         *      command-line
 413         *
 414         * The command-line will not be interpolated in any way.
 415         */
 416
 417        if (!strcmp("smudge", ep))
 418                return git_config_string(&drv->smudge, var, value);
 419
 420        if (!strcmp("clean", ep))
 421                return git_config_string(&drv->clean, var, value);
 422
 423        return 0;
 424}
 425
 426static void setup_convert_check(struct git_attr_check *check)
 427{
 428        static struct git_attr *attr_crlf;
 429        static struct git_attr *attr_ident;
 430        static struct git_attr *attr_filter;
 431
 432        if (!attr_crlf) {
 433                attr_crlf = git_attr("crlf");
 434                attr_ident = git_attr("ident");
 435                attr_filter = git_attr("filter");
 436                user_convert_tail = &user_convert;
 437                git_config(read_convert_config, NULL);
 438        }
 439        check[0].attr = attr_crlf;
 440        check[1].attr = attr_ident;
 441        check[2].attr = attr_filter;
 442}
 443
 444static int count_ident(const char *cp, unsigned long size)
 445{
 446        /*
 447         * "$Id: 0000000000000000000000000000000000000000 $" <=> "$Id$"
 448         */
 449        int cnt = 0;
 450        char ch;
 451
 452        while (size) {
 453                ch = *cp++;
 454                size--;
 455                if (ch != '$')
 456                        continue;
 457                if (size < 3)
 458                        break;
 459                if (memcmp("Id", cp, 2))
 460                        continue;
 461                ch = cp[2];
 462                cp += 3;
 463                size -= 3;
 464                if (ch == '$')
 465                        cnt++; /* $Id$ */
 466                if (ch != ':')
 467                        continue;
 468
 469                /*
 470                 * "$Id: ... "; scan up to the closing dollar sign and discard.
 471                 */
 472                while (size) {
 473                        ch = *cp++;
 474                        size--;
 475                        if (ch == '$') {
 476                                cnt++;
 477                                break;
 478                        }
 479                        if (ch == '\n')
 480                                break;
 481                }
 482        }
 483        return cnt;
 484}
 485
 486static int ident_to_git(const char *path, const char *src, size_t len,
 487                        struct strbuf *buf, int ident)
 488{
 489        char *dst, *dollar;
 490
 491        if (!ident || !count_ident(src, len))
 492                return 0;
 493
 494        /* only grow if not in place */
 495        if (strbuf_avail(buf) + buf->len < len)
 496                strbuf_grow(buf, len - buf->len);
 497        dst = buf->buf;
 498        for (;;) {
 499                dollar = memchr(src, '$', len);
 500                if (!dollar)
 501                        break;
 502                memcpy(dst, src, dollar + 1 - src);
 503                dst += dollar + 1 - src;
 504                len -= dollar + 1 - src;
 505                src  = dollar + 1;
 506
 507                if (len > 3 && !memcmp(src, "Id:", 3)) {
 508                        dollar = memchr(src + 3, '$', len - 3);
 509                        if (!dollar)
 510                                break;
 511                        if (memchr(src + 3, '\n', dollar - src - 3)) {
 512                                /* Line break before the next dollar. */
 513                                continue;
 514                        }
 515
 516                        memcpy(dst, "Id$", 3);
 517                        dst += 3;
 518                        len -= dollar + 1 - src;
 519                        src  = dollar + 1;
 520                }
 521        }
 522        memcpy(dst, src, len);
 523        strbuf_setlen(buf, dst + len - buf->buf);
 524        return 1;
 525}
 526
 527static int ident_to_worktree(const char *path, const char *src, size_t len,
 528                             struct strbuf *buf, int ident)
 529{
 530        unsigned char sha1[20];
 531        char *to_free = NULL, *dollar, *spc;
 532        int cnt;
 533
 534        if (!ident)
 535                return 0;
 536
 537        cnt = count_ident(src, len);
 538        if (!cnt)
 539                return 0;
 540
 541        /* are we "faking" in place editing ? */
 542        if (src == buf->buf)
 543                to_free = strbuf_detach(buf, NULL);
 544        hash_sha1_file(src, len, "blob", sha1);
 545
 546        strbuf_grow(buf, len + cnt * 43);
 547        for (;;) {
 548                /* step 1: run to the next '$' */
 549                dollar = memchr(src, '$', len);
 550                if (!dollar)
 551                        break;
 552                strbuf_add(buf, src, dollar + 1 - src);
 553                len -= dollar + 1 - src;
 554                src  = dollar + 1;
 555
 556                /* step 2: does it looks like a bit like Id:xxx$ or Id$ ? */
 557                if (len < 3 || memcmp("Id", src, 2))
 558                        continue;
 559
 560                /* step 3: skip over Id$ or Id:xxxxx$ */
 561                if (src[2] == '$') {
 562                        src += 3;
 563                        len -= 3;
 564                } else if (src[2] == ':') {
 565                        /*
 566                         * It's possible that an expanded Id has crept its way into the
 567                         * repository, we cope with that by stripping the expansion out.
 568                         * This is probably not a good idea, since it will cause changes
 569                         * on checkout, which won't go away by stash, but let's keep it
 570                         * for git-style ids.
 571                         */
 572                        dollar = memchr(src + 3, '$', len - 3);
 573                        if (!dollar) {
 574                                /* incomplete keyword, no more '$', so just quit the loop */
 575                                break;
 576                        }
 577
 578                        if (memchr(src + 3, '\n', dollar - src - 3)) {
 579                                /* Line break before the next dollar. */
 580                                continue;
 581                        }
 582
 583                        spc = memchr(src + 4, ' ', dollar - src - 4);
 584                        if (spc && spc < dollar-1) {
 585                                /* There are spaces in unexpected places.
 586                                 * This is probably an id from some other
 587                                 * versioning system. Keep it for now.
 588                                 */
 589                                continue;
 590                        }
 591
 592                        len -= dollar + 1 - src;
 593                        src  = dollar + 1;
 594                } else {
 595                        /* it wasn't a "Id$" or "Id:xxxx$" */
 596                        continue;
 597                }
 598
 599                /* step 4: substitute */
 600                strbuf_addstr(buf, "Id: ");
 601                strbuf_add(buf, sha1_to_hex(sha1), 40);
 602                strbuf_addstr(buf, " $");
 603        }
 604        strbuf_add(buf, src, len);
 605
 606        free(to_free);
 607        return 1;
 608}
 609
 610static int git_path_check_crlf(const char *path, struct git_attr_check *check)
 611{
 612        const char *value = check->value;
 613
 614        if (ATTR_TRUE(value))
 615                return CRLF_TEXT;
 616        else if (ATTR_FALSE(value))
 617                return CRLF_BINARY;
 618        else if (ATTR_UNSET(value))
 619                ;
 620        else if (!strcmp(value, "input"))
 621                return CRLF_INPUT;
 622        return CRLF_GUESS;
 623}
 624
 625static struct convert_driver *git_path_check_convert(const char *path,
 626                                             struct git_attr_check *check)
 627{
 628        const char *value = check->value;
 629        struct convert_driver *drv;
 630
 631        if (ATTR_TRUE(value) || ATTR_FALSE(value) || ATTR_UNSET(value))
 632                return NULL;
 633        for (drv = user_convert; drv; drv = drv->next)
 634                if (!strcmp(value, drv->name))
 635                        return drv;
 636        return NULL;
 637}
 638
 639static int git_path_check_ident(const char *path, struct git_attr_check *check)
 640{
 641        const char *value = check->value;
 642
 643        return !!ATTR_TRUE(value);
 644}
 645
 646int convert_to_git(const char *path, const char *src, size_t len,
 647                   struct strbuf *dst, enum safe_crlf checksafe)
 648{
 649        struct git_attr_check check[3];
 650        int crlf = CRLF_GUESS;
 651        int ident = 0, ret = 0;
 652        const char *filter = NULL;
 653
 654        setup_convert_check(check);
 655        if (!git_checkattr(path, ARRAY_SIZE(check), check)) {
 656                struct convert_driver *drv;
 657                crlf = git_path_check_crlf(path, check + 0);
 658                ident = git_path_check_ident(path, check + 1);
 659                drv = git_path_check_convert(path, check + 2);
 660                if (drv && drv->clean)
 661                        filter = drv->clean;
 662        }
 663
 664        ret |= apply_filter(path, src, len, dst, filter);
 665        if (ret) {
 666                src = dst->buf;
 667                len = dst->len;
 668        }
 669        ret |= crlf_to_git(path, src, len, dst, crlf, checksafe);
 670        if (ret) {
 671                src = dst->buf;
 672                len = dst->len;
 673        }
 674        return ret | ident_to_git(path, src, len, dst, ident);
 675}
 676
 677int convert_to_working_tree(const char *path, const char *src, size_t len, struct strbuf *dst)
 678{
 679        struct git_attr_check check[3];
 680        int crlf = CRLF_GUESS;
 681        int ident = 0, ret = 0;
 682        const char *filter = NULL;
 683
 684        setup_convert_check(check);
 685        if (!git_checkattr(path, ARRAY_SIZE(check), check)) {
 686                struct convert_driver *drv;
 687                crlf = git_path_check_crlf(path, check + 0);
 688                ident = git_path_check_ident(path, check + 1);
 689                drv = git_path_check_convert(path, check + 2);
 690                if (drv && drv->smudge)
 691                        filter = drv->smudge;
 692        }
 693
 694        ret |= ident_to_worktree(path, src, len, dst, ident);
 695        if (ret) {
 696                src = dst->buf;
 697                len = dst->len;
 698        }
 699        ret |= crlf_to_worktree(path, src, len, dst, crlf);
 700        if (ret) {
 701                src = dst->buf;
 702                len = dst->len;
 703        }
 704        return ret | apply_filter(path, src, len, dst, filter);
 705}