convert.c on commit diff: change semantics of "ignore whitespace" options (f245194)
   1#include "cache.h"
   2#include "attr.h"
   3#include "run-command.h"
   4
   5/*
   6 * convert.c - convert a file when checking it out and checking it in.
   7 *
   8 * This should use the pathname to decide on whether it wants to do some
   9 * more interesting conversions (automatic gzip/unzip, general format
  10 * conversions etc etc), but by default it just does automatic CRLF<->LF
  11 * translation when the "auto_crlf" option is set.
  12 */
  13
  14#define CRLF_GUESS      (-1)
  15#define CRLF_BINARY     0
  16#define CRLF_TEXT       1
  17#define CRLF_INPUT      2
  18
  19struct text_stat {
  20        /* NUL, CR, LF and CRLF counts */
  21        unsigned nul, cr, lf, crlf;
  22
  23        /* These are just approximations! */
  24        unsigned printable, nonprintable;
  25};
  26
  27static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats)
  28{
  29        unsigned long i;
  30
  31        memset(stats, 0, sizeof(*stats));
  32
  33        for (i = 0; i < size; i++) {
  34                unsigned char c = buf[i];
  35                if (c == '\r') {
  36                        stats->cr++;
  37                        if (i+1 < size && buf[i+1] == '\n')
  38                                stats->crlf++;
  39                        continue;
  40                }
  41                if (c == '\n') {
  42                        stats->lf++;
  43                        continue;
  44                }
  45                if (c == 127)
  46                        /* DEL */
  47                        stats->nonprintable++;
  48                else if (c < 32) {
  49                        switch (c) {
  50                                /* BS, HT, ESC and FF */
  51                        case '\b': case '\t': case '\033': case '\014':
  52                                stats->printable++;
  53                                break;
  54                        case 0:
  55                                stats->nul++;
  56                                /* fall through */
  57                        default:
  58                                stats->nonprintable++;
  59                        }
  60                }
  61                else
  62                        stats->printable++;
  63        }
  64
  65        /* If file ends with EOF then don't count this EOF as non-printable. */
  66        if (size >= 1 && buf[size-1] == '\032')
  67                stats->nonprintable--;
  68}
  69
  70/*
  71 * The same heuristics as diff.c::mmfile_is_binary()
  72 */
  73static int is_binary(unsigned long size, struct text_stat *stats)
  74{
  75
  76        if (stats->nul)
  77                return 1;
  78        if ((stats->printable >> 7) < stats->nonprintable)
  79                return 1;
  80        /*
  81         * Other heuristics? Average line length might be relevant,
  82         * as might LF vs CR vs CRLF counts..
  83         *
  84         * NOTE! It might be normal to have a low ratio of CRLF to LF
  85         * (somebody starts with a LF-only file and edits it with an editor
  86         * that adds CRLF only to lines that are added..). But do  we
  87         * want to support CR-only? Probably not.
  88         */
  89        return 0;
  90}
  91
  92static void check_safe_crlf(const char *path, int action,
  93                            struct text_stat *stats, enum safe_crlf checksafe)
  94{
  95        if (!checksafe)
  96                return;
  97
  98        if (action == CRLF_INPUT || auto_crlf <= 0) {
  99                /*
 100                 * CRLFs would not be restored by checkout:
 101                 * check if we'd remove CRLFs
 102                 */
 103                if (stats->crlf) {
 104                        if (checksafe == SAFE_CRLF_WARN)
 105                                warning("CRLF will be replaced by LF in %s.", path);
 106                        else /* i.e. SAFE_CRLF_FAIL */
 107                                die("CRLF would be replaced by LF in %s.", path);
 108                }
 109        } else if (auto_crlf > 0) {
 110                /*
 111                 * CRLFs would be added by checkout:
 112                 * check if we have "naked" LFs
 113                 */
 114                if (stats->lf != stats->crlf) {
 115                        if (checksafe == SAFE_CRLF_WARN)
 116                                warning("LF will be replaced by CRLF in %s", path);
 117                        else /* i.e. SAFE_CRLF_FAIL */
 118                                die("LF would be replaced by CRLF in %s", path);
 119                }
 120        }
 121}
 122
 123static int crlf_to_git(const char *path, const char *src, size_t len,
 124                       struct strbuf *buf, int action, enum safe_crlf checksafe)
 125{
 126        struct text_stat stats;
 127        char *dst;
 128
 129        if ((action == CRLF_BINARY) || !auto_crlf || !len)
 130                return 0;
 131
 132        gather_stats(src, len, &stats);
 133
 134        if (action == CRLF_GUESS) {
 135                /*
 136                 * We're currently not going to even try to convert stuff
 137                 * that has bare CR characters. Does anybody do that crazy
 138                 * stuff?
 139                 */
 140                if (stats.cr != stats.crlf)
 141                        return 0;
 142
 143                /*
 144                 * And add some heuristics for binary vs text, of course...
 145                 */
 146                if (is_binary(len, &stats))
 147                        return 0;
 148        }
 149
 150        check_safe_crlf(path, action, &stats, checksafe);
 151
 152        /* Optimization: No CR? Nothing to convert, regardless. */
 153        if (!stats.cr)
 154                return 0;
 155
 156        /* only grow if not in place */
 157        if (strbuf_avail(buf) + buf->len < len)
 158                strbuf_grow(buf, len - buf->len);
 159        dst = buf->buf;
 160        if (action == CRLF_GUESS) {
 161                /*
 162                 * If we guessed, we already know we rejected a file with
 163                 * lone CR, and we can strip a CR without looking at what
 164                 * follow it.
 165                 */
 166                do {
 167                        unsigned char c = *src++;
 168                        if (c != '\r')
 169                                *dst++ = c;
 170                } while (--len);
 171        } else {
 172                do {
 173                        unsigned char c = *src++;
 174                        if (! (c == '\r' && (1 < len && *src == '\n')))
 175                                *dst++ = c;
 176                } while (--len);
 177        }
 178        strbuf_setlen(buf, dst - buf->buf);
 179        return 1;
 180}
 181
 182static int crlf_to_worktree(const char *path, const char *src, size_t len,
 183                            struct strbuf *buf, int action)
 184{
 185        char *to_free = NULL;
 186        struct text_stat stats;
 187
 188        if ((action == CRLF_BINARY) || (action == CRLF_INPUT) ||
 189            auto_crlf <= 0)
 190                return 0;
 191
 192        if (!len)
 193                return 0;
 194
 195        gather_stats(src, len, &stats);
 196
 197        /* No LF? Nothing to convert, regardless. */
 198        if (!stats.lf)
 199                return 0;
 200
 201        /* Was it already in CRLF format? */
 202        if (stats.lf == stats.crlf)
 203                return 0;
 204
 205        if (action == CRLF_GUESS) {
 206                /* If we have any bare CR characters, we're not going to touch it */
 207                if (stats.cr != stats.crlf)
 208                        return 0;
 209
 210                if (is_binary(len, &stats))
 211                        return 0;
 212        }
 213
 214        /* are we "faking" in place editing ? */
 215        if (src == buf->buf)
 216                to_free = strbuf_detach(buf, NULL);
 217
 218        strbuf_grow(buf, len + stats.lf - stats.crlf);
 219        for (;;) {
 220                const char *nl = memchr(src, '\n', len);
 221                if (!nl)
 222                        break;
 223                if (nl > src && nl[-1] == '\r') {
 224                        strbuf_add(buf, src, nl + 1 - src);
 225                } else {
 226                        strbuf_add(buf, src, nl - src);
 227                        strbuf_addstr(buf, "\r\n");
 228                }
 229                len -= nl + 1 - src;
 230                src  = nl + 1;
 231        }
 232        strbuf_add(buf, src, len);
 233
 234        free(to_free);
 235        return 1;
 236}
 237
 238struct filter_params {
 239        const char *src;
 240        unsigned long size;
 241        const char *cmd;
 242};
 243
 244static int filter_buffer(int fd, void *data)
 245{
 246        /*
 247         * Spawn cmd and feed the buffer contents through its stdin.
 248         */
 249        struct child_process child_process;
 250        struct filter_params *params = (struct filter_params *)data;
 251        int write_err, status;
 252        const char *argv[] = { "sh", "-c", params->cmd, NULL };
 253
 254        memset(&child_process, 0, sizeof(child_process));
 255        child_process.argv = argv;
 256        child_process.in = -1;
 257        child_process.out = fd;
 258
 259        if (start_command(&child_process))
 260                return error("cannot fork to run external filter %s", params->cmd);
 261
 262        write_err = (write_in_full(child_process.in, params->src, params->size) < 0);
 263        if (close(child_process.in))
 264                write_err = 1;
 265        if (write_err)
 266                error("cannot feed the input to external filter %s", params->cmd);
 267
 268        status = finish_command(&child_process);
 269        if (status)
 270                error("external filter %s failed %d", params->cmd, -status);
 271        return (write_err || status);
 272}
 273
 274static int apply_filter(const char *path, const char *src, size_t len,
 275                        struct strbuf *dst, const char *cmd)
 276{
 277        /*
 278         * Create a pipeline to have the command filter the buffer's
 279         * contents.
 280         *
 281         * (child --> cmd) --> us
 282         */
 283        int ret = 1;
 284        struct strbuf nbuf = STRBUF_INIT;
 285        struct async async;
 286        struct filter_params params;
 287
 288        if (!cmd)
 289                return 0;
 290
 291        memset(&async, 0, sizeof(async));
 292        async.proc = filter_buffer;
 293        async.data = &params;
 294        params.src = src;
 295        params.size = len;
 296        params.cmd = cmd;
 297
 298        fflush(NULL);
 299        if (start_async(&async))
 300                return 0;       /* error was already reported */
 301
 302        if (strbuf_read(&nbuf, async.out, len) < 0) {
 303                error("read from external filter %s failed", cmd);
 304                ret = 0;
 305        }
 306        if (close(async.out)) {
 307                error("read from external filter %s failed", cmd);
 308                ret = 0;
 309        }
 310        if (finish_async(&async)) {
 311                error("external filter %s failed", cmd);
 312                ret = 0;
 313        }
 314
 315        if (ret) {
 316                strbuf_swap(dst, &nbuf);
 317        }
 318        strbuf_release(&nbuf);
 319        return ret;
 320}
 321
 322static struct convert_driver {
 323        const char *name;
 324        struct convert_driver *next;
 325        const char *smudge;
 326        const char *clean;
 327} *user_convert, **user_convert_tail;
 328
 329static int read_convert_config(const char *var, const char *value, void *cb)
 330{
 331        const char *ep, *name;
 332        int namelen;
 333        struct convert_driver *drv;
 334
 335        /*
 336         * External conversion drivers are configured using
 337         * "filter.<name>.variable".
 338         */
 339        if (prefixcmp(var, "filter.") || (ep = strrchr(var, '.')) == var + 6)
 340                return 0;
 341        name = var + 7;
 342        namelen = ep - name;
 343        for (drv = user_convert; drv; drv = drv->next)
 344                if (!strncmp(drv->name, name, namelen) && !drv->name[namelen])
 345                        break;
 346        if (!drv) {
 347                drv = xcalloc(1, sizeof(struct convert_driver));
 348                drv->name = xmemdupz(name, namelen);
 349                *user_convert_tail = drv;
 350                user_convert_tail = &(drv->next);
 351        }
 352
 353        ep++;
 354
 355        /*
 356         * filter.<name>.smudge and filter.<name>.clean specifies
 357         * the command line:
 358         *
 359         *      command-line
 360         *
 361         * The command-line will not be interpolated in any way.
 362         */
 363
 364        if (!strcmp("smudge", ep))
 365                return git_config_string(&drv->smudge, var, value);
 366
 367        if (!strcmp("clean", ep))
 368                return git_config_string(&drv->clean, var, value);
 369
 370        return 0;
 371}
 372
 373static void setup_convert_check(struct git_attr_check *check)
 374{
 375        static struct git_attr *attr_crlf;
 376        static struct git_attr *attr_ident;
 377        static struct git_attr *attr_filter;
 378
 379        if (!attr_crlf) {
 380                attr_crlf = git_attr("crlf", 4);
 381                attr_ident = git_attr("ident", 5);
 382                attr_filter = git_attr("filter", 6);
 383                user_convert_tail = &user_convert;
 384                git_config(read_convert_config, NULL);
 385        }
 386        check[0].attr = attr_crlf;
 387        check[1].attr = attr_ident;
 388        check[2].attr = attr_filter;
 389}
 390
 391static int count_ident(const char *cp, unsigned long size)
 392{
 393        /*
 394         * "$Id: 0000000000000000000000000000000000000000 $" <=> "$Id$"
 395         */
 396        int cnt = 0;
 397        char ch;
 398
 399        while (size) {
 400                ch = *cp++;
 401                size--;
 402                if (ch != '$')
 403                        continue;
 404                if (size < 3)
 405                        break;
 406                if (memcmp("Id", cp, 2))
 407                        continue;
 408                ch = cp[2];
 409                cp += 3;
 410                size -= 3;
 411                if (ch == '$')
 412                        cnt++; /* $Id$ */
 413                if (ch != ':')
 414                        continue;
 415
 416                /*
 417                 * "$Id: ... "; scan up to the closing dollar sign and discard.
 418                 */
 419                while (size) {
 420                        ch = *cp++;
 421                        size--;
 422                        if (ch == '$') {
 423                                cnt++;
 424                                break;
 425                        }
 426                }
 427        }
 428        return cnt;
 429}
 430
 431static int ident_to_git(const char *path, const char *src, size_t len,
 432                        struct strbuf *buf, int ident)
 433{
 434        char *dst, *dollar;
 435
 436        if (!ident || !count_ident(src, len))
 437                return 0;
 438
 439        /* only grow if not in place */
 440        if (strbuf_avail(buf) + buf->len < len)
 441                strbuf_grow(buf, len - buf->len);
 442        dst = buf->buf;
 443        for (;;) {
 444                dollar = memchr(src, '$', len);
 445                if (!dollar)
 446                        break;
 447                memcpy(dst, src, dollar + 1 - src);
 448                dst += dollar + 1 - src;
 449                len -= dollar + 1 - src;
 450                src  = dollar + 1;
 451
 452                if (len > 3 && !memcmp(src, "Id:", 3)) {
 453                        dollar = memchr(src + 3, '$', len - 3);
 454                        if (!dollar)
 455                                break;
 456                        memcpy(dst, "Id$", 3);
 457                        dst += 3;
 458                        len -= dollar + 1 - src;
 459                        src  = dollar + 1;
 460                }
 461        }
 462        memcpy(dst, src, len);
 463        strbuf_setlen(buf, dst + len - buf->buf);
 464        return 1;
 465}
 466
 467static int ident_to_worktree(const char *path, const char *src, size_t len,
 468                             struct strbuf *buf, int ident)
 469{
 470        unsigned char sha1[20];
 471        char *to_free = NULL, *dollar;
 472        int cnt;
 473
 474        if (!ident)
 475                return 0;
 476
 477        cnt = count_ident(src, len);
 478        if (!cnt)
 479                return 0;
 480
 481        /* are we "faking" in place editing ? */
 482        if (src == buf->buf)
 483                to_free = strbuf_detach(buf, NULL);
 484        hash_sha1_file(src, len, "blob", sha1);
 485
 486        strbuf_grow(buf, len + cnt * 43);
 487        for (;;) {
 488                /* step 1: run to the next '$' */
 489                dollar = memchr(src, '$', len);
 490                if (!dollar)
 491                        break;
 492                strbuf_add(buf, src, dollar + 1 - src);
 493                len -= dollar + 1 - src;
 494                src  = dollar + 1;
 495
 496                /* step 2: does it looks like a bit like Id:xxx$ or Id$ ? */
 497                if (len < 3 || memcmp("Id", src, 2))
 498                        continue;
 499
 500                /* step 3: skip over Id$ or Id:xxxxx$ */
 501                if (src[2] == '$') {
 502                        src += 3;
 503                        len -= 3;
 504                } else if (src[2] == ':') {
 505                        /*
 506                         * It's possible that an expanded Id has crept its way into the
 507                         * repository, we cope with that by stripping the expansion out
 508                         */
 509                        dollar = memchr(src + 3, '$', len - 3);
 510                        if (!dollar) {
 511                                /* incomplete keyword, no more '$', so just quit the loop */
 512                                break;
 513                        }
 514
 515                        len -= dollar + 1 - src;
 516                        src  = dollar + 1;
 517                } else {
 518                        /* it wasn't a "Id$" or "Id:xxxx$" */
 519                        continue;
 520                }
 521
 522                /* step 4: substitute */
 523                strbuf_addstr(buf, "Id: ");
 524                strbuf_add(buf, sha1_to_hex(sha1), 40);
 525                strbuf_addstr(buf, " $");
 526        }
 527        strbuf_add(buf, src, len);
 528
 529        free(to_free);
 530        return 1;
 531}
 532
 533static int git_path_check_crlf(const char *path, struct git_attr_check *check)
 534{
 535        const char *value = check->value;
 536
 537        if (ATTR_TRUE(value))
 538                return CRLF_TEXT;
 539        else if (ATTR_FALSE(value))
 540                return CRLF_BINARY;
 541        else if (ATTR_UNSET(value))
 542                ;
 543        else if (!strcmp(value, "input"))
 544                return CRLF_INPUT;
 545        return CRLF_GUESS;
 546}
 547
 548static struct convert_driver *git_path_check_convert(const char *path,
 549                                             struct git_attr_check *check)
 550{
 551        const char *value = check->value;
 552        struct convert_driver *drv;
 553
 554        if (ATTR_TRUE(value) || ATTR_FALSE(value) || ATTR_UNSET(value))
 555                return NULL;
 556        for (drv = user_convert; drv; drv = drv->next)
 557                if (!strcmp(value, drv->name))
 558                        return drv;
 559        return NULL;
 560}
 561
 562static int git_path_check_ident(const char *path, struct git_attr_check *check)
 563{
 564        const char *value = check->value;
 565
 566        return !!ATTR_TRUE(value);
 567}
 568
 569int convert_to_git(const char *path, const char *src, size_t len,
 570                   struct strbuf *dst, enum safe_crlf checksafe)
 571{
 572        struct git_attr_check check[3];
 573        int crlf = CRLF_GUESS;
 574        int ident = 0, ret = 0;
 575        const char *filter = NULL;
 576
 577        setup_convert_check(check);
 578        if (!git_checkattr(path, ARRAY_SIZE(check), check)) {
 579                struct convert_driver *drv;
 580                crlf = git_path_check_crlf(path, check + 0);
 581                ident = git_path_check_ident(path, check + 1);
 582                drv = git_path_check_convert(path, check + 2);
 583                if (drv && drv->clean)
 584                        filter = drv->clean;
 585        }
 586
 587        ret |= apply_filter(path, src, len, dst, filter);
 588        if (ret) {
 589                src = dst->buf;
 590                len = dst->len;
 591        }
 592        ret |= crlf_to_git(path, src, len, dst, crlf, checksafe);
 593        if (ret) {
 594                src = dst->buf;
 595                len = dst->len;
 596        }
 597        return ret | ident_to_git(path, src, len, dst, ident);
 598}
 599
 600int convert_to_working_tree(const char *path, const char *src, size_t len, struct strbuf *dst)
 601{
 602        struct git_attr_check check[3];
 603        int crlf = CRLF_GUESS;
 604        int ident = 0, ret = 0;
 605        const char *filter = NULL;
 606
 607        setup_convert_check(check);
 608        if (!git_checkattr(path, ARRAY_SIZE(check), check)) {
 609                struct convert_driver *drv;
 610                crlf = git_path_check_crlf(path, check + 0);
 611                ident = git_path_check_ident(path, check + 1);
 612                drv = git_path_check_convert(path, check + 2);
 613                if (drv && drv->smudge)
 614                        filter = drv->smudge;
 615        }
 616
 617        ret |= ident_to_worktree(path, src, len, dst, ident);
 618        if (ret) {
 619                src = dst->buf;
 620                len = dst->len;
 621        }
 622        ret |= crlf_to_worktree(path, src, len, dst, crlf);
 623        if (ret) {
 624                src = dst->buf;
 625                len = dst->len;
 626        }
 627        return ret | apply_filter(path, src, len, dst, filter);
 628}