a54c5fc4a24d64ae3639d27d6abdf545d1ffacc4
   1#include "cache.h"
   2#include "attr.h"
   3#include "run-command.h"
   4
   5/*
   6 * convert.c - convert a file when checking it out and checking it in.
   7 *
   8 * This should use the pathname to decide on whether it wants to do some
   9 * more interesting conversions (automatic gzip/unzip, general format
  10 * conversions etc etc), but by default it just does automatic CRLF<->LF
  11 * translation when the "auto_crlf" option is set.
  12 */
  13
  14#define CRLF_GUESS      (-1)
  15#define CRLF_BINARY     0
  16#define CRLF_TEXT       1
  17#define CRLF_INPUT      2
  18
  19struct text_stat {
  20        /* NUL, CR, LF and CRLF counts */
  21        unsigned nul, cr, lf, crlf;
  22
  23        /* These are just approximations! */
  24        unsigned printable, nonprintable;
  25};
  26
  27static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats)
  28{
  29        unsigned long i;
  30
  31        memset(stats, 0, sizeof(*stats));
  32
  33        for (i = 0; i < size; i++) {
  34                unsigned char c = buf[i];
  35                if (c == '\r') {
  36                        stats->cr++;
  37                        if (i+1 < size && buf[i+1] == '\n')
  38                                stats->crlf++;
  39                        continue;
  40                }
  41                if (c == '\n') {
  42                        stats->lf++;
  43                        continue;
  44                }
  45                if (c == 127)
  46                        /* DEL */
  47                        stats->nonprintable++;
  48                else if (c < 32) {
  49                        switch (c) {
  50                                /* BS, HT, ESC and FF */
  51                        case '\b': case '\t': case '\033': case '\014':
  52                                stats->printable++;
  53                                break;
  54                        case 0:
  55                                stats->nul++;
  56                                /* fall through */
  57                        default:
  58                                stats->nonprintable++;
  59                        }
  60                }
  61                else
  62                        stats->printable++;
  63        }
  64
  65        /* If file ends with EOF then don't count this EOF as non-printable. */
  66        if (size >= 1 && buf[size-1] == '\032')
  67                stats->nonprintable--;
  68}
  69
  70/*
  71 * The same heuristics as diff.c::mmfile_is_binary()
  72 */
  73static int is_binary(unsigned long size, struct text_stat *stats)
  74{
  75
  76        if (stats->nul)
  77                return 1;
  78        if ((stats->printable >> 7) < stats->nonprintable)
  79                return 1;
  80        /*
  81         * Other heuristics? Average line length might be relevant,
  82         * as might LF vs CR vs CRLF counts..
  83         *
  84         * NOTE! It might be normal to have a low ratio of CRLF to LF
  85         * (somebody starts with a LF-only file and edits it with an editor
  86         * that adds CRLF only to lines that are added..). But do  we
  87         * want to support CR-only? Probably not.
  88         */
  89        return 0;
  90}
  91
  92static void check_safe_crlf(const char *path, int action,
  93                            struct text_stat *stats, enum safe_crlf checksafe)
  94{
  95        if (!checksafe)
  96                return;
  97
  98        if (action == CRLF_INPUT || auto_crlf <= 0) {
  99                /*
 100                 * CRLFs would not be restored by checkout:
 101                 * check if we'd remove CRLFs
 102                 */
 103                if (stats->crlf) {
 104                        if (checksafe == SAFE_CRLF_WARN)
 105                                warning("CRLF will be replaced by LF in %s.", path);
 106                        else /* i.e. SAFE_CRLF_FAIL */
 107                                die("CRLF would be replaced by LF in %s.", path);
 108                }
 109        } else if (auto_crlf > 0) {
 110                /*
 111                 * CRLFs would be added by checkout:
 112                 * check if we have "naked" LFs
 113                 */
 114                if (stats->lf != stats->crlf) {
 115                        if (checksafe == SAFE_CRLF_WARN)
 116                                warning("LF will be replaced by CRLF in %s", path);
 117                        else /* i.e. SAFE_CRLF_FAIL */
 118                                die("LF would be replaced by CRLF in %s", path);
 119                }
 120        }
 121}
 122
 123static int has_cr_in_index(const char *path)
 124{
 125        int pos, len;
 126        unsigned long sz;
 127        enum object_type type;
 128        void *data;
 129        int has_cr;
 130        struct index_state *istate = &the_index;
 131
 132        len = strlen(path);
 133        pos = index_name_pos(istate, path, len);
 134        if (pos < 0) {
 135                /*
 136                 * We might be in the middle of a merge, in which
 137                 * case we would read stage #2 (ours).
 138                 */
 139                int i;
 140                for (i = -pos - 1;
 141                     (pos < 0 && i < istate->cache_nr &&
 142                      !strcmp(istate->cache[i]->name, path));
 143                     i++)
 144                        if (ce_stage(istate->cache[i]) == 2)
 145                                pos = i;
 146        }
 147        if (pos < 0)
 148                return 0;
 149        data = read_sha1_file(istate->cache[pos]->sha1, &type, &sz);
 150        if (!data || type != OBJ_BLOB) {
 151                free(data);
 152                return 0;
 153        }
 154
 155        has_cr = memchr(data, '\r', sz) != NULL;
 156        free(data);
 157        return has_cr;
 158}
 159
 160static int crlf_to_git(const char *path, const char *src, size_t len,
 161                       struct strbuf *buf, int action, enum safe_crlf checksafe)
 162{
 163        struct text_stat stats;
 164        char *dst;
 165
 166        if ((action == CRLF_BINARY) || !auto_crlf || !len)
 167                return 0;
 168
 169        gather_stats(src, len, &stats);
 170
 171        if (action == CRLF_GUESS) {
 172                /*
 173                 * We're currently not going to even try to convert stuff
 174                 * that has bare CR characters. Does anybody do that crazy
 175                 * stuff?
 176                 */
 177                if (stats.cr != stats.crlf)
 178                        return 0;
 179
 180                /*
 181                 * And add some heuristics for binary vs text, of course...
 182                 */
 183                if (is_binary(len, &stats))
 184                        return 0;
 185
 186                /*
 187                 * If the file in the index has any CR in it, do not convert.
 188                 * This is the new safer autocrlf handling.
 189                 */
 190                if (has_cr_in_index(path))
 191                        return 0;
 192        }
 193
 194        check_safe_crlf(path, action, &stats, checksafe);
 195
 196        /* Optimization: No CR? Nothing to convert, regardless. */
 197        if (!stats.cr)
 198                return 0;
 199
 200        /* only grow if not in place */
 201        if (strbuf_avail(buf) + buf->len < len)
 202                strbuf_grow(buf, len - buf->len);
 203        dst = buf->buf;
 204        if (action == CRLF_GUESS) {
 205                /*
 206                 * If we guessed, we already know we rejected a file with
 207                 * lone CR, and we can strip a CR without looking at what
 208                 * follow it.
 209                 */
 210                do {
 211                        unsigned char c = *src++;
 212                        if (c != '\r')
 213                                *dst++ = c;
 214                } while (--len);
 215        } else {
 216                do {
 217                        unsigned char c = *src++;
 218                        if (! (c == '\r' && (1 < len && *src == '\n')))
 219                                *dst++ = c;
 220                } while (--len);
 221        }
 222        strbuf_setlen(buf, dst - buf->buf);
 223        return 1;
 224}
 225
 226static int crlf_to_worktree(const char *path, const char *src, size_t len,
 227                            struct strbuf *buf, int action)
 228{
 229        char *to_free = NULL;
 230        struct text_stat stats;
 231
 232        if ((action == CRLF_BINARY) || (action == CRLF_INPUT) ||
 233            auto_crlf <= 0)
 234                return 0;
 235
 236        if (!len)
 237                return 0;
 238
 239        gather_stats(src, len, &stats);
 240
 241        /* No LF? Nothing to convert, regardless. */
 242        if (!stats.lf)
 243                return 0;
 244
 245        /* Was it already in CRLF format? */
 246        if (stats.lf == stats.crlf)
 247                return 0;
 248
 249        if (action == CRLF_GUESS) {
 250                /* If we have any CR or CRLF line endings, we do not touch it */
 251                /* This is the new safer autocrlf-handling */
 252                if (stats.cr > 0 || stats.crlf > 0)
 253                        return 0;
 254
 255                /* If we have any bare CR characters, we're not going to touch it */
 256                if (stats.cr != stats.crlf)
 257                        return 0;
 258
 259                if (is_binary(len, &stats))
 260                        return 0;
 261        }
 262
 263        /* are we "faking" in place editing ? */
 264        if (src == buf->buf)
 265                to_free = strbuf_detach(buf, NULL);
 266
 267        strbuf_grow(buf, len + stats.lf - stats.crlf);
 268        for (;;) {
 269                const char *nl = memchr(src, '\n', len);
 270                if (!nl)
 271                        break;
 272                if (nl > src && nl[-1] == '\r') {
 273                        strbuf_add(buf, src, nl + 1 - src);
 274                } else {
 275                        strbuf_add(buf, src, nl - src);
 276                        strbuf_addstr(buf, "\r\n");
 277                }
 278                len -= nl + 1 - src;
 279                src  = nl + 1;
 280        }
 281        strbuf_add(buf, src, len);
 282
 283        free(to_free);
 284        return 1;
 285}
 286
 287struct filter_params {
 288        const char *src;
 289        unsigned long size;
 290        const char *cmd;
 291};
 292
 293static int filter_buffer(int fd, void *data)
 294{
 295        /*
 296         * Spawn cmd and feed the buffer contents through its stdin.
 297         */
 298        struct child_process child_process;
 299        struct filter_params *params = (struct filter_params *)data;
 300        int write_err, status;
 301        const char *argv[] = { params->cmd, NULL };
 302
 303        memset(&child_process, 0, sizeof(child_process));
 304        child_process.argv = argv;
 305        child_process.use_shell = 1;
 306        child_process.in = -1;
 307        child_process.out = fd;
 308
 309        if (start_command(&child_process))
 310                return error("cannot fork to run external filter %s", params->cmd);
 311
 312        write_err = (write_in_full(child_process.in, params->src, params->size) < 0);
 313        if (close(child_process.in))
 314                write_err = 1;
 315        if (write_err)
 316                error("cannot feed the input to external filter %s", params->cmd);
 317
 318        status = finish_command(&child_process);
 319        if (status)
 320                error("external filter %s failed %d", params->cmd, status);
 321        return (write_err || status);
 322}
 323
 324static int apply_filter(const char *path, const char *src, size_t len,
 325                        struct strbuf *dst, const char *cmd)
 326{
 327        /*
 328         * Create a pipeline to have the command filter the buffer's
 329         * contents.
 330         *
 331         * (child --> cmd) --> us
 332         */
 333        int ret = 1;
 334        struct strbuf nbuf = STRBUF_INIT;
 335        struct async async;
 336        struct filter_params params;
 337
 338        if (!cmd)
 339                return 0;
 340
 341        memset(&async, 0, sizeof(async));
 342        async.proc = filter_buffer;
 343        async.data = &params;
 344        params.src = src;
 345        params.size = len;
 346        params.cmd = cmd;
 347
 348        fflush(NULL);
 349        if (start_async(&async))
 350                return 0;       /* error was already reported */
 351
 352        if (strbuf_read(&nbuf, async.out, len) < 0) {
 353                error("read from external filter %s failed", cmd);
 354                ret = 0;
 355        }
 356        if (close(async.out)) {
 357                error("read from external filter %s failed", cmd);
 358                ret = 0;
 359        }
 360        if (finish_async(&async)) {
 361                error("external filter %s failed", cmd);
 362                ret = 0;
 363        }
 364
 365        if (ret) {
 366                strbuf_swap(dst, &nbuf);
 367        }
 368        strbuf_release(&nbuf);
 369        return ret;
 370}
 371
 372static struct convert_driver {
 373        const char *name;
 374        struct convert_driver *next;
 375        const char *smudge;
 376        const char *clean;
 377} *user_convert, **user_convert_tail;
 378
 379static int read_convert_config(const char *var, const char *value, void *cb)
 380{
 381        const char *ep, *name;
 382        int namelen;
 383        struct convert_driver *drv;
 384
 385        /*
 386         * External conversion drivers are configured using
 387         * "filter.<name>.variable".
 388         */
 389        if (prefixcmp(var, "filter.") || (ep = strrchr(var, '.')) == var + 6)
 390                return 0;
 391        name = var + 7;
 392        namelen = ep - name;
 393        for (drv = user_convert; drv; drv = drv->next)
 394                if (!strncmp(drv->name, name, namelen) && !drv->name[namelen])
 395                        break;
 396        if (!drv) {
 397                drv = xcalloc(1, sizeof(struct convert_driver));
 398                drv->name = xmemdupz(name, namelen);
 399                *user_convert_tail = drv;
 400                user_convert_tail = &(drv->next);
 401        }
 402
 403        ep++;
 404
 405        /*
 406         * filter.<name>.smudge and filter.<name>.clean specifies
 407         * the command line:
 408         *
 409         *      command-line
 410         *
 411         * The command-line will not be interpolated in any way.
 412         */
 413
 414        if (!strcmp("smudge", ep))
 415                return git_config_string(&drv->smudge, var, value);
 416
 417        if (!strcmp("clean", ep))
 418                return git_config_string(&drv->clean, var, value);
 419
 420        return 0;
 421}
 422
 423static void setup_convert_check(struct git_attr_check *check)
 424{
 425        static struct git_attr *attr_crlf;
 426        static struct git_attr *attr_ident;
 427        static struct git_attr *attr_filter;
 428
 429        if (!attr_crlf) {
 430                attr_crlf = git_attr("crlf");
 431                attr_ident = git_attr("ident");
 432                attr_filter = git_attr("filter");
 433                user_convert_tail = &user_convert;
 434                git_config(read_convert_config, NULL);
 435        }
 436        check[0].attr = attr_crlf;
 437        check[1].attr = attr_ident;
 438        check[2].attr = attr_filter;
 439}
 440
 441static int count_ident(const char *cp, unsigned long size)
 442{
 443        /*
 444         * "$Id: 0000000000000000000000000000000000000000 $" <=> "$Id$"
 445         */
 446        int cnt = 0;
 447        char ch;
 448
 449        while (size) {
 450                ch = *cp++;
 451                size--;
 452                if (ch != '$')
 453                        continue;
 454                if (size < 3)
 455                        break;
 456                if (memcmp("Id", cp, 2))
 457                        continue;
 458                ch = cp[2];
 459                cp += 3;
 460                size -= 3;
 461                if (ch == '$')
 462                        cnt++; /* $Id$ */
 463                if (ch != ':')
 464                        continue;
 465
 466                /*
 467                 * "$Id: ... "; scan up to the closing dollar sign and discard.
 468                 */
 469                while (size) {
 470                        ch = *cp++;
 471                        size--;
 472                        if (ch == '$') {
 473                                cnt++;
 474                                break;
 475                        }
 476                }
 477        }
 478        return cnt;
 479}
 480
 481static int ident_to_git(const char *path, const char *src, size_t len,
 482                        struct strbuf *buf, int ident)
 483{
 484        char *dst, *dollar;
 485
 486        if (!ident || !count_ident(src, len))
 487                return 0;
 488
 489        /* only grow if not in place */
 490        if (strbuf_avail(buf) + buf->len < len)
 491                strbuf_grow(buf, len - buf->len);
 492        dst = buf->buf;
 493        for (;;) {
 494                dollar = memchr(src, '$', len);
 495                if (!dollar)
 496                        break;
 497                memcpy(dst, src, dollar + 1 - src);
 498                dst += dollar + 1 - src;
 499                len -= dollar + 1 - src;
 500                src  = dollar + 1;
 501
 502                if (len > 3 && !memcmp(src, "Id:", 3)) {
 503                        dollar = memchr(src + 3, '$', len - 3);
 504                        if (!dollar)
 505                                break;
 506                        memcpy(dst, "Id$", 3);
 507                        dst += 3;
 508                        len -= dollar + 1 - src;
 509                        src  = dollar + 1;
 510                }
 511        }
 512        memcpy(dst, src, len);
 513        strbuf_setlen(buf, dst + len - buf->buf);
 514        return 1;
 515}
 516
 517static int ident_to_worktree(const char *path, const char *src, size_t len,
 518                             struct strbuf *buf, int ident)
 519{
 520        unsigned char sha1[20];
 521        char *to_free = NULL, *dollar;
 522        int cnt;
 523
 524        if (!ident)
 525                return 0;
 526
 527        cnt = count_ident(src, len);
 528        if (!cnt)
 529                return 0;
 530
 531        /* are we "faking" in place editing ? */
 532        if (src == buf->buf)
 533                to_free = strbuf_detach(buf, NULL);
 534        hash_sha1_file(src, len, "blob", sha1);
 535
 536        strbuf_grow(buf, len + cnt * 43);
 537        for (;;) {
 538                /* step 1: run to the next '$' */
 539                dollar = memchr(src, '$', len);
 540                if (!dollar)
 541                        break;
 542                strbuf_add(buf, src, dollar + 1 - src);
 543                len -= dollar + 1 - src;
 544                src  = dollar + 1;
 545
 546                /* step 2: does it looks like a bit like Id:xxx$ or Id$ ? */
 547                if (len < 3 || memcmp("Id", src, 2))
 548                        continue;
 549
 550                /* step 3: skip over Id$ or Id:xxxxx$ */
 551                if (src[2] == '$') {
 552                        src += 3;
 553                        len -= 3;
 554                } else if (src[2] == ':') {
 555                        /*
 556                         * It's possible that an expanded Id has crept its way into the
 557                         * repository, we cope with that by stripping the expansion out
 558                         */
 559                        dollar = memchr(src + 3, '$', len - 3);
 560                        if (!dollar) {
 561                                /* incomplete keyword, no more '$', so just quit the loop */
 562                                break;
 563                        }
 564
 565                        len -= dollar + 1 - src;
 566                        src  = dollar + 1;
 567                } else {
 568                        /* it wasn't a "Id$" or "Id:xxxx$" */
 569                        continue;
 570                }
 571
 572                /* step 4: substitute */
 573                strbuf_addstr(buf, "Id: ");
 574                strbuf_add(buf, sha1_to_hex(sha1), 40);
 575                strbuf_addstr(buf, " $");
 576        }
 577        strbuf_add(buf, src, len);
 578
 579        free(to_free);
 580        return 1;
 581}
 582
 583static int git_path_check_crlf(const char *path, struct git_attr_check *check)
 584{
 585        const char *value = check->value;
 586
 587        if (ATTR_TRUE(value))
 588                return CRLF_TEXT;
 589        else if (ATTR_FALSE(value))
 590                return CRLF_BINARY;
 591        else if (ATTR_UNSET(value))
 592                ;
 593        else if (!strcmp(value, "input"))
 594                return CRLF_INPUT;
 595        return CRLF_GUESS;
 596}
 597
 598static struct convert_driver *git_path_check_convert(const char *path,
 599                                             struct git_attr_check *check)
 600{
 601        const char *value = check->value;
 602        struct convert_driver *drv;
 603
 604        if (ATTR_TRUE(value) || ATTR_FALSE(value) || ATTR_UNSET(value))
 605                return NULL;
 606        for (drv = user_convert; drv; drv = drv->next)
 607                if (!strcmp(value, drv->name))
 608                        return drv;
 609        return NULL;
 610}
 611
 612static int git_path_check_ident(const char *path, struct git_attr_check *check)
 613{
 614        const char *value = check->value;
 615
 616        return !!ATTR_TRUE(value);
 617}
 618
 619int convert_to_git(const char *path, const char *src, size_t len,
 620                   struct strbuf *dst, enum safe_crlf checksafe)
 621{
 622        struct git_attr_check check[3];
 623        int crlf = CRLF_GUESS;
 624        int ident = 0, ret = 0;
 625        const char *filter = NULL;
 626
 627        setup_convert_check(check);
 628        if (!git_checkattr(path, ARRAY_SIZE(check), check)) {
 629                struct convert_driver *drv;
 630                crlf = git_path_check_crlf(path, check + 0);
 631                ident = git_path_check_ident(path, check + 1);
 632                drv = git_path_check_convert(path, check + 2);
 633                if (drv && drv->clean)
 634                        filter = drv->clean;
 635        }
 636
 637        ret |= apply_filter(path, src, len, dst, filter);
 638        if (ret) {
 639                src = dst->buf;
 640                len = dst->len;
 641        }
 642        ret |= crlf_to_git(path, src, len, dst, crlf, checksafe);
 643        if (ret) {
 644                src = dst->buf;
 645                len = dst->len;
 646        }
 647        return ret | ident_to_git(path, src, len, dst, ident);
 648}
 649
 650int convert_to_working_tree(const char *path, const char *src, size_t len, struct strbuf *dst)
 651{
 652        struct git_attr_check check[3];
 653        int crlf = CRLF_GUESS;
 654        int ident = 0, ret = 0;
 655        const char *filter = NULL;
 656
 657        setup_convert_check(check);
 658        if (!git_checkattr(path, ARRAY_SIZE(check), check)) {
 659                struct convert_driver *drv;
 660                crlf = git_path_check_crlf(path, check + 0);
 661                ident = git_path_check_ident(path, check + 1);
 662                drv = git_path_check_convert(path, check + 2);
 663                if (drv && drv->smudge)
 664                        filter = drv->smudge;
 665        }
 666
 667        ret |= ident_to_worktree(path, src, len, dst, ident);
 668        if (ret) {
 669                src = dst->buf;
 670                len = dst->len;
 671        }
 672        ret |= crlf_to_worktree(path, src, len, dst, crlf);
 673        if (ret) {
 674                src = dst->buf;
 675                len = dst->len;
 676        }
 677        return ret | apply_filter(path, src, len, dst, filter);
 678}