convert.con commit Add "core.eol" config variable (942e774)
   1#include "cache.h"
   2#include "attr.h"
   3#include "run-command.h"
   4
   5/*
   6 * convert.c - convert a file when checking it out and checking it in.
   7 *
   8 * This should use the pathname to decide on whether it wants to do some
   9 * more interesting conversions (automatic gzip/unzip, general format
  10 * conversions etc etc), but by default it just does automatic CRLF<->LF
  11 * translation when the "text" attribute or "auto_crlf" option is set.
  12 */
  13
  14enum action {
  15        CRLF_GUESS = -1,
  16        CRLF_BINARY = 0,
  17        CRLF_TEXT,
  18        CRLF_INPUT,
  19        CRLF_CRLF,
  20        CRLF_AUTO,
  21};
  22
  23struct text_stat {
  24        /* NUL, CR, LF and CRLF counts */
  25        unsigned nul, cr, lf, crlf;
  26
  27        /* These are just approximations! */
  28        unsigned printable, nonprintable;
  29};
  30
  31static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats)
  32{
  33        unsigned long i;
  34
  35        memset(stats, 0, sizeof(*stats));
  36
  37        for (i = 0; i < size; i++) {
  38                unsigned char c = buf[i];
  39                if (c == '\r') {
  40                        stats->cr++;
  41                        if (i+1 < size && buf[i+1] == '\n')
  42                                stats->crlf++;
  43                        continue;
  44                }
  45                if (c == '\n') {
  46                        stats->lf++;
  47                        continue;
  48                }
  49                if (c == 127)
  50                        /* DEL */
  51                        stats->nonprintable++;
  52                else if (c < 32) {
  53                        switch (c) {
  54                                /* BS, HT, ESC and FF */
  55                        case '\b': case '\t': case '\033': case '\014':
  56                                stats->printable++;
  57                                break;
  58                        case 0:
  59                                stats->nul++;
  60                                /* fall through */
  61                        default:
  62                                stats->nonprintable++;
  63                        }
  64                }
  65                else
  66                        stats->printable++;
  67        }
  68
  69        /* If file ends with EOF then don't count this EOF as non-printable. */
  70        if (size >= 1 && buf[size-1] == '\032')
  71                stats->nonprintable--;
  72}
  73
  74/*
  75 * The same heuristics as diff.c::mmfile_is_binary()
  76 */
  77static int is_binary(unsigned long size, struct text_stat *stats)
  78{
  79
  80        if (stats->nul)
  81                return 1;
  82        if ((stats->printable >> 7) < stats->nonprintable)
  83                return 1;
  84        /*
  85         * Other heuristics? Average line length might be relevant,
  86         * as might LF vs CR vs CRLF counts..
  87         *
  88         * NOTE! It might be normal to have a low ratio of CRLF to LF
  89         * (somebody starts with a LF-only file and edits it with an editor
  90         * that adds CRLF only to lines that are added..). But do  we
  91         * want to support CR-only? Probably not.
  92         */
  93        return 0;
  94}
  95
  96static enum eol determine_output_conversion(enum action action) {
  97        switch (action) {
  98        case CRLF_BINARY:
  99                return EOL_UNSET;
 100        case CRLF_CRLF:
 101                return EOL_CRLF;
 102        case CRLF_INPUT:
 103                return EOL_LF;
 104        case CRLF_GUESS:
 105                if (!auto_crlf)
 106                        return EOL_UNSET;
 107                /* fall through */
 108        case CRLF_TEXT:
 109        case CRLF_AUTO:
 110                if (auto_crlf == AUTO_CRLF_TRUE)
 111                        return EOL_CRLF;
 112                else if (auto_crlf == AUTO_CRLF_INPUT)
 113                        return EOL_LF;
 114                else if (eol == EOL_UNSET)
 115                        return EOL_NATIVE;
 116        }
 117        return eol;
 118}
 119
 120static void check_safe_crlf(const char *path, enum action action,
 121                            struct text_stat *stats, enum safe_crlf checksafe)
 122{
 123        if (!checksafe)
 124                return;
 125
 126        if (determine_output_conversion(action) == EOL_LF) {
 127                /*
 128                 * CRLFs would not be restored by checkout:
 129                 * check if we'd remove CRLFs
 130                 */
 131                if (stats->crlf) {
 132                        if (checksafe == SAFE_CRLF_WARN)
 133                                warning("CRLF will be replaced by LF in %s.\nThe file will have its original line endings in your working directory.", path);
 134                        else /* i.e. SAFE_CRLF_FAIL */
 135                                die("CRLF would be replaced by LF in %s.", path);
 136                }
 137        } else if (determine_output_conversion(action) == EOL_CRLF) {
 138                /*
 139                 * CRLFs would be added by checkout:
 140                 * check if we have "naked" LFs
 141                 */
 142                if (stats->lf != stats->crlf) {
 143                        if (checksafe == SAFE_CRLF_WARN)
 144                                warning("LF will be replaced by CRLF in %s.\nThe file will have its original line endings in your working directory.", path);
 145                        else /* i.e. SAFE_CRLF_FAIL */
 146                                die("LF would be replaced by CRLF in %s", path);
 147                }
 148        }
 149}
 150
 151static int has_cr_in_index(const char *path)
 152{
 153        int pos, len;
 154        unsigned long sz;
 155        enum object_type type;
 156        void *data;
 157        int has_cr;
 158        struct index_state *istate = &the_index;
 159
 160        len = strlen(path);
 161        pos = index_name_pos(istate, path, len);
 162        if (pos < 0) {
 163                /*
 164                 * We might be in the middle of a merge, in which
 165                 * case we would read stage #2 (ours).
 166                 */
 167                int i;
 168                for (i = -pos - 1;
 169                     (pos < 0 && i < istate->cache_nr &&
 170                      !strcmp(istate->cache[i]->name, path));
 171                     i++)
 172                        if (ce_stage(istate->cache[i]) == 2)
 173                                pos = i;
 174        }
 175        if (pos < 0)
 176                return 0;
 177        data = read_sha1_file(istate->cache[pos]->sha1, &type, &sz);
 178        if (!data || type != OBJ_BLOB) {
 179                free(data);
 180                return 0;
 181        }
 182
 183        has_cr = memchr(data, '\r', sz) != NULL;
 184        free(data);
 185        return has_cr;
 186}
 187
 188static int crlf_to_git(const char *path, const char *src, size_t len,
 189                       struct strbuf *buf, enum action action, enum safe_crlf checksafe)
 190{
 191        struct text_stat stats;
 192        char *dst;
 193
 194        if (action == CRLF_BINARY ||
 195            (action == CRLF_GUESS && auto_crlf == AUTO_CRLF_FALSE) || !len)
 196                return 0;
 197
 198        gather_stats(src, len, &stats);
 199
 200        if (action == CRLF_AUTO || action == CRLF_GUESS) {
 201                /*
 202                 * We're currently not going to even try to convert stuff
 203                 * that has bare CR characters. Does anybody do that crazy
 204                 * stuff?
 205                 */
 206                if (stats.cr != stats.crlf)
 207                        return 0;
 208
 209                /*
 210                 * And add some heuristics for binary vs text, of course...
 211                 */
 212                if (is_binary(len, &stats))
 213                        return 0;
 214
 215                if (action == CRLF_GUESS) {
 216                        /*
 217                         * If the file in the index has any CR in it, do not convert.
 218                         * This is the new safer autocrlf handling.
 219                         */
 220                        if (has_cr_in_index(path))
 221                                return 0;
 222                }
 223        }
 224
 225        check_safe_crlf(path, action, &stats, checksafe);
 226
 227        /* Optimization: No CR? Nothing to convert, regardless. */
 228        if (!stats.cr)
 229                return 0;
 230
 231        /* only grow if not in place */
 232        if (strbuf_avail(buf) + buf->len < len)
 233                strbuf_grow(buf, len - buf->len);
 234        dst = buf->buf;
 235        if (action == CRLF_AUTO || action == CRLF_GUESS) {
 236                /*
 237                 * If we guessed, we already know we rejected a file with
 238                 * lone CR, and we can strip a CR without looking at what
 239                 * follow it.
 240                 */
 241                do {
 242                        unsigned char c = *src++;
 243                        if (c != '\r')
 244                                *dst++ = c;
 245                } while (--len);
 246        } else {
 247                do {
 248                        unsigned char c = *src++;
 249                        if (! (c == '\r' && (1 < len && *src == '\n')))
 250                                *dst++ = c;
 251                } while (--len);
 252        }
 253        strbuf_setlen(buf, dst - buf->buf);
 254        return 1;
 255}
 256
 257static int crlf_to_worktree(const char *path, const char *src, size_t len,
 258                            struct strbuf *buf, enum action action)
 259{
 260        char *to_free = NULL;
 261        struct text_stat stats;
 262
 263        if (!len || determine_output_conversion(action) != EOL_CRLF)
 264                return 0;
 265
 266        gather_stats(src, len, &stats);
 267
 268        /* No LF? Nothing to convert, regardless. */
 269        if (!stats.lf)
 270                return 0;
 271
 272        /* Was it already in CRLF format? */
 273        if (stats.lf == stats.crlf)
 274                return 0;
 275
 276        if (action == CRLF_AUTO || action == CRLF_GUESS) {
 277                if (action == CRLF_GUESS) {
 278                        /* If we have any CR or CRLF line endings, we do not touch it */
 279                        /* This is the new safer autocrlf-handling */
 280                        if (stats.cr > 0 || stats.crlf > 0)
 281                                return 0;
 282                }
 283
 284                /* If we have any bare CR characters, we're not going to touch it */
 285                if (stats.cr != stats.crlf)
 286                        return 0;
 287
 288                if (is_binary(len, &stats))
 289                        return 0;
 290        }
 291
 292        /* are we "faking" in place editing ? */
 293        if (src == buf->buf)
 294                to_free = strbuf_detach(buf, NULL);
 295
 296        strbuf_grow(buf, len + stats.lf - stats.crlf);
 297        for (;;) {
 298                const char *nl = memchr(src, '\n', len);
 299                if (!nl)
 300                        break;
 301                if (nl > src && nl[-1] == '\r') {
 302                        strbuf_add(buf, src, nl + 1 - src);
 303                } else {
 304                        strbuf_add(buf, src, nl - src);
 305                        strbuf_addstr(buf, "\r\n");
 306                }
 307                len -= nl + 1 - src;
 308                src  = nl + 1;
 309        }
 310        strbuf_add(buf, src, len);
 311
 312        free(to_free);
 313        return 1;
 314}
 315
 316struct filter_params {
 317        const char *src;
 318        unsigned long size;
 319        const char *cmd;
 320};
 321
 322static int filter_buffer(int fd, void *data)
 323{
 324        /*
 325         * Spawn cmd and feed the buffer contents through its stdin.
 326         */
 327        struct child_process child_process;
 328        struct filter_params *params = (struct filter_params *)data;
 329        int write_err, status;
 330        const char *argv[] = { params->cmd, NULL };
 331
 332        memset(&child_process, 0, sizeof(child_process));
 333        child_process.argv = argv;
 334        child_process.use_shell = 1;
 335        child_process.in = -1;
 336        child_process.out = fd;
 337
 338        if (start_command(&child_process))
 339                return error("cannot fork to run external filter %s", params->cmd);
 340
 341        write_err = (write_in_full(child_process.in, params->src, params->size) < 0);
 342        if (close(child_process.in))
 343                write_err = 1;
 344        if (write_err)
 345                error("cannot feed the input to external filter %s", params->cmd);
 346
 347        status = finish_command(&child_process);
 348        if (status)
 349                error("external filter %s failed %d", params->cmd, status);
 350        return (write_err || status);
 351}
 352
 353static int apply_filter(const char *path, const char *src, size_t len,
 354                        struct strbuf *dst, const char *cmd)
 355{
 356        /*
 357         * Create a pipeline to have the command filter the buffer's
 358         * contents.
 359         *
 360         * (child --> cmd) --> us
 361         */
 362        int ret = 1;
 363        struct strbuf nbuf = STRBUF_INIT;
 364        struct async async;
 365        struct filter_params params;
 366
 367        if (!cmd)
 368                return 0;
 369
 370        memset(&async, 0, sizeof(async));
 371        async.proc = filter_buffer;
 372        async.data = &params;
 373        params.src = src;
 374        params.size = len;
 375        params.cmd = cmd;
 376
 377        fflush(NULL);
 378        if (start_async(&async))
 379                return 0;       /* error was already reported */
 380
 381        if (strbuf_read(&nbuf, async.out, len) < 0) {
 382                error("read from external filter %s failed", cmd);
 383                ret = 0;
 384        }
 385        if (close(async.out)) {
 386                error("read from external filter %s failed", cmd);
 387                ret = 0;
 388        }
 389        if (finish_async(&async)) {
 390                error("external filter %s failed", cmd);
 391                ret = 0;
 392        }
 393
 394        if (ret) {
 395                strbuf_swap(dst, &nbuf);
 396        }
 397        strbuf_release(&nbuf);
 398        return ret;
 399}
 400
 401static struct convert_driver {
 402        const char *name;
 403        struct convert_driver *next;
 404        const char *smudge;
 405        const char *clean;
 406} *user_convert, **user_convert_tail;
 407
 408static int read_convert_config(const char *var, const char *value, void *cb)
 409{
 410        const char *ep, *name;
 411        int namelen;
 412        struct convert_driver *drv;
 413
 414        /*
 415         * External conversion drivers are configured using
 416         * "filter.<name>.variable".
 417         */
 418        if (prefixcmp(var, "filter.") || (ep = strrchr(var, '.')) == var + 6)
 419                return 0;
 420        name = var + 7;
 421        namelen = ep - name;
 422        for (drv = user_convert; drv; drv = drv->next)
 423                if (!strncmp(drv->name, name, namelen) && !drv->name[namelen])
 424                        break;
 425        if (!drv) {
 426                drv = xcalloc(1, sizeof(struct convert_driver));
 427                drv->name = xmemdupz(name, namelen);
 428                *user_convert_tail = drv;
 429                user_convert_tail = &(drv->next);
 430        }
 431
 432        ep++;
 433
 434        /*
 435         * filter.<name>.smudge and filter.<name>.clean specifies
 436         * the command line:
 437         *
 438         *      command-line
 439         *
 440         * The command-line will not be interpolated in any way.
 441         */
 442
 443        if (!strcmp("smudge", ep))
 444                return git_config_string(&drv->smudge, var, value);
 445
 446        if (!strcmp("clean", ep))
 447                return git_config_string(&drv->clean, var, value);
 448
 449        return 0;
 450}
 451
 452static void setup_convert_check(struct git_attr_check *check)
 453{
 454        static struct git_attr *attr_text;
 455        static struct git_attr *attr_crlf;
 456        static struct git_attr *attr_eol;
 457        static struct git_attr *attr_ident;
 458        static struct git_attr *attr_filter;
 459
 460        if (!attr_text) {
 461                attr_text = git_attr("text");
 462                attr_crlf = git_attr("crlf");
 463                attr_eol = git_attr("eol");
 464                attr_ident = git_attr("ident");
 465                attr_filter = git_attr("filter");
 466                user_convert_tail = &user_convert;
 467                git_config(read_convert_config, NULL);
 468        }
 469        check[0].attr = attr_crlf;
 470        check[1].attr = attr_ident;
 471        check[2].attr = attr_filter;
 472        check[3].attr = attr_eol;
 473        check[4].attr = attr_text;
 474}
 475
 476static int count_ident(const char *cp, unsigned long size)
 477{
 478        /*
 479         * "$Id: 0000000000000000000000000000000000000000 $" <=> "$Id$"
 480         */
 481        int cnt = 0;
 482        char ch;
 483
 484        while (size) {
 485                ch = *cp++;
 486                size--;
 487                if (ch != '$')
 488                        continue;
 489                if (size < 3)
 490                        break;
 491                if (memcmp("Id", cp, 2))
 492                        continue;
 493                ch = cp[2];
 494                cp += 3;
 495                size -= 3;
 496                if (ch == '$')
 497                        cnt++; /* $Id$ */
 498                if (ch != ':')
 499                        continue;
 500
 501                /*
 502                 * "$Id: ... "; scan up to the closing dollar sign and discard.
 503                 */
 504                while (size) {
 505                        ch = *cp++;
 506                        size--;
 507                        if (ch == '$') {
 508                                cnt++;
 509                                break;
 510                        }
 511                }
 512        }
 513        return cnt;
 514}
 515
 516static int ident_to_git(const char *path, const char *src, size_t len,
 517                        struct strbuf *buf, int ident)
 518{
 519        char *dst, *dollar;
 520
 521        if (!ident || !count_ident(src, len))
 522                return 0;
 523
 524        /* only grow if not in place */
 525        if (strbuf_avail(buf) + buf->len < len)
 526                strbuf_grow(buf, len - buf->len);
 527        dst = buf->buf;
 528        for (;;) {
 529                dollar = memchr(src, '$', len);
 530                if (!dollar)
 531                        break;
 532                memcpy(dst, src, dollar + 1 - src);
 533                dst += dollar + 1 - src;
 534                len -= dollar + 1 - src;
 535                src  = dollar + 1;
 536
 537                if (len > 3 && !memcmp(src, "Id:", 3)) {
 538                        dollar = memchr(src + 3, '$', len - 3);
 539                        if (!dollar)
 540                                break;
 541                        memcpy(dst, "Id$", 3);
 542                        dst += 3;
 543                        len -= dollar + 1 - src;
 544                        src  = dollar + 1;
 545                }
 546        }
 547        memcpy(dst, src, len);
 548        strbuf_setlen(buf, dst + len - buf->buf);
 549        return 1;
 550}
 551
 552static int ident_to_worktree(const char *path, const char *src, size_t len,
 553                             struct strbuf *buf, int ident)
 554{
 555        unsigned char sha1[20];
 556        char *to_free = NULL, *dollar;
 557        int cnt;
 558
 559        if (!ident)
 560                return 0;
 561
 562        cnt = count_ident(src, len);
 563        if (!cnt)
 564                return 0;
 565
 566        /* are we "faking" in place editing ? */
 567        if (src == buf->buf)
 568                to_free = strbuf_detach(buf, NULL);
 569        hash_sha1_file(src, len, "blob", sha1);
 570
 571        strbuf_grow(buf, len + cnt * 43);
 572        for (;;) {
 573                /* step 1: run to the next '$' */
 574                dollar = memchr(src, '$', len);
 575                if (!dollar)
 576                        break;
 577                strbuf_add(buf, src, dollar + 1 - src);
 578                len -= dollar + 1 - src;
 579                src  = dollar + 1;
 580
 581                /* step 2: does it looks like a bit like Id:xxx$ or Id$ ? */
 582                if (len < 3 || memcmp("Id", src, 2))
 583                        continue;
 584
 585                /* step 3: skip over Id$ or Id:xxxxx$ */
 586                if (src[2] == '$') {
 587                        src += 3;
 588                        len -= 3;
 589                } else if (src[2] == ':') {
 590                        /*
 591                         * It's possible that an expanded Id has crept its way into the
 592                         * repository, we cope with that by stripping the expansion out
 593                         */
 594                        dollar = memchr(src + 3, '$', len - 3);
 595                        if (!dollar) {
 596                                /* incomplete keyword, no more '$', so just quit the loop */
 597                                break;
 598                        }
 599
 600                        len -= dollar + 1 - src;
 601                        src  = dollar + 1;
 602                } else {
 603                        /* it wasn't a "Id$" or "Id:xxxx$" */
 604                        continue;
 605                }
 606
 607                /* step 4: substitute */
 608                strbuf_addstr(buf, "Id: ");
 609                strbuf_add(buf, sha1_to_hex(sha1), 40);
 610                strbuf_addstr(buf, " $");
 611        }
 612        strbuf_add(buf, src, len);
 613
 614        free(to_free);
 615        return 1;
 616}
 617
 618static int git_path_check_crlf(const char *path, struct git_attr_check *check)
 619{
 620        const char *value = check->value;
 621
 622        if (ATTR_TRUE(value))
 623                return CRLF_TEXT;
 624        else if (ATTR_FALSE(value))
 625                return CRLF_BINARY;
 626        else if (ATTR_UNSET(value))
 627                ;
 628        else if (!strcmp(value, "input"))
 629                return CRLF_INPUT;
 630        else if (!strcmp(value, "auto"))
 631                return CRLF_AUTO;
 632        return CRLF_GUESS;
 633}
 634
 635static int git_path_check_eol(const char *path, struct git_attr_check *check)
 636{
 637        const char *value = check->value;
 638
 639        if (ATTR_UNSET(value))
 640                ;
 641        else if (!strcmp(value, "lf"))
 642                return EOL_LF;
 643        else if (!strcmp(value, "crlf"))
 644                return EOL_CRLF;
 645        return EOL_UNSET;
 646}
 647
 648static struct convert_driver *git_path_check_convert(const char *path,
 649                                             struct git_attr_check *check)
 650{
 651        const char *value = check->value;
 652        struct convert_driver *drv;
 653
 654        if (ATTR_TRUE(value) || ATTR_FALSE(value) || ATTR_UNSET(value))
 655                return NULL;
 656        for (drv = user_convert; drv; drv = drv->next)
 657                if (!strcmp(value, drv->name))
 658                        return drv;
 659        return NULL;
 660}
 661
 662static int git_path_check_ident(const char *path, struct git_attr_check *check)
 663{
 664        const char *value = check->value;
 665
 666        return !!ATTR_TRUE(value);
 667}
 668
 669enum action determine_action(enum action text_attr, enum eol eol_attr) {
 670        if (text_attr == CRLF_BINARY)
 671                return CRLF_BINARY;
 672        if (eol_attr == EOL_LF)
 673                return CRLF_INPUT;
 674        if (eol_attr == EOL_CRLF)
 675                return CRLF_CRLF;
 676        return text_attr;
 677}
 678
 679int convert_to_git(const char *path, const char *src, size_t len,
 680                   struct strbuf *dst, enum safe_crlf checksafe)
 681{
 682        struct git_attr_check check[5];
 683        enum action action = CRLF_GUESS;
 684        enum eol eol_attr = EOL_UNSET;
 685        int ident = 0, ret = 0;
 686        const char *filter = NULL;
 687
 688        setup_convert_check(check);
 689        if (!git_checkattr(path, ARRAY_SIZE(check), check)) {
 690                struct convert_driver *drv;
 691                action = git_path_check_crlf(path, check + 4);
 692                if (action == CRLF_GUESS)
 693                        action = git_path_check_crlf(path, check + 0);
 694                ident = git_path_check_ident(path, check + 1);
 695                drv = git_path_check_convert(path, check + 2);
 696                eol_attr = git_path_check_eol(path, check + 3);
 697                if (drv && drv->clean)
 698                        filter = drv->clean;
 699        }
 700
 701        ret |= apply_filter(path, src, len, dst, filter);
 702        if (ret) {
 703                src = dst->buf;
 704                len = dst->len;
 705        }
 706        action = determine_action(action, eol_attr);
 707        ret |= crlf_to_git(path, src, len, dst, action, checksafe);
 708        if (ret) {
 709                src = dst->buf;
 710                len = dst->len;
 711        }
 712        return ret | ident_to_git(path, src, len, dst, ident);
 713}
 714
 715int convert_to_working_tree(const char *path, const char *src, size_t len, struct strbuf *dst)
 716{
 717        struct git_attr_check check[5];
 718        enum action action = CRLF_GUESS;
 719        enum eol eol_attr = EOL_UNSET;
 720        int ident = 0, ret = 0;
 721        const char *filter = NULL;
 722
 723        setup_convert_check(check);
 724        if (!git_checkattr(path, ARRAY_SIZE(check), check)) {
 725                struct convert_driver *drv;
 726                action = git_path_check_crlf(path, check + 4);
 727                if (action == CRLF_GUESS)
 728                        action = git_path_check_crlf(path, check + 0);
 729                ident = git_path_check_ident(path, check + 1);
 730                drv = git_path_check_convert(path, check + 2);
 731                eol_attr = git_path_check_eol(path, check + 3);
 732                if (drv && drv->smudge)
 733                        filter = drv->smudge;
 734        }
 735
 736        ret |= ident_to_worktree(path, src, len, dst, ident);
 737        if (ret) {
 738                src = dst->buf;
 739                len = dst->len;
 740        }
 741        action = determine_action(action, eol_attr);
 742        ret |= crlf_to_worktree(path, src, len, dst, action);
 743        if (ret) {
 744                src = dst->buf;
 745                len = dst->len;
 746        }
 747        return ret | apply_filter(path, src, len, dst, filter);
 748}