/* convert.c, as of commit 53030f8 ("revision --simplify-merges: do not leave commits unprocessed") */
   1#include "cache.h"
   2#include "attr.h"
   3#include "run-command.h"
   4
/*
 * convert.c - convert a file when checking it out and checking it in.
 *
 * This should use the pathname to decide on whether it wants to do some
 * more interesting conversions (automatic gzip/unzip, general format
 * conversions etc etc), but by default it just does automatic CRLF<->LF
 * translation when the "auto_crlf" option is set.
 */
  13
  14#define CRLF_GUESS      (-1)
  15#define CRLF_BINARY     0
  16#define CRLF_TEXT       1
  17#define CRLF_INPUT      2
  18
  19struct text_stat {
  20        /* NUL, CR, LF and CRLF counts */
  21        unsigned nul, cr, lf, crlf;
  22
  23        /* These are just approximations! */
  24        unsigned printable, nonprintable;
  25};
  26
  27static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats)
  28{
  29        unsigned long i;
  30
  31        memset(stats, 0, sizeof(*stats));
  32
  33        for (i = 0; i < size; i++) {
  34                unsigned char c = buf[i];
  35                if (c == '\r') {
  36                        stats->cr++;
  37                        if (i+1 < size && buf[i+1] == '\n')
  38                                stats->crlf++;
  39                        continue;
  40                }
  41                if (c == '\n') {
  42                        stats->lf++;
  43                        continue;
  44                }
  45                if (c == 127)
  46                        /* DEL */
  47                        stats->nonprintable++;
  48                else if (c < 32) {
  49                        switch (c) {
  50                                /* BS, HT, ESC and FF */
  51                        case '\b': case '\t': case '\033': case '\014':
  52                                stats->printable++;
  53                                break;
  54                        case 0:
  55                                stats->nul++;
  56                                /* fall through */
  57                        default:
  58                                stats->nonprintable++;
  59                        }
  60                }
  61                else
  62                        stats->printable++;
  63        }
  64
  65        /* If file ends with EOF then don't count this EOF as non-printable. */
  66        if (size >= 1 && buf[size-1] == '\032')
  67                stats->nonprintable--;
  68}
  69
  70/*
  71 * The same heuristics as diff.c::mmfile_is_binary()
  72 */
  73static int is_binary(unsigned long size, struct text_stat *stats)
  74{
  75
  76        if (stats->nul)
  77                return 1;
  78        if ((stats->printable >> 7) < stats->nonprintable)
  79                return 1;
  80        /*
  81         * Other heuristics? Average line length might be relevant,
  82         * as might LF vs CR vs CRLF counts..
  83         *
  84         * NOTE! It might be normal to have a low ratio of CRLF to LF
  85         * (somebody starts with a LF-only file and edits it with an editor
  86         * that adds CRLF only to lines that are added..). But do  we
  87         * want to support CR-only? Probably not.
  88         */
  89        return 0;
  90}
  91
  92static void check_safe_crlf(const char *path, int action,
  93                            struct text_stat *stats, enum safe_crlf checksafe)
  94{
  95        if (!checksafe)
  96                return;
  97
  98        if (action == CRLF_INPUT || auto_crlf <= 0) {
  99                /*
 100                 * CRLFs would not be restored by checkout:
 101                 * check if we'd remove CRLFs
 102                 */
 103                if (stats->crlf) {
 104                        if (checksafe == SAFE_CRLF_WARN)
 105                                warning("CRLF will be replaced by LF in %s.", path);
 106                        else /* i.e. SAFE_CRLF_FAIL */
 107                                die("CRLF would be replaced by LF in %s.", path);
 108                }
 109        } else if (auto_crlf > 0) {
 110                /*
 111                 * CRLFs would be added by checkout:
 112                 * check if we have "naked" LFs
 113                 */
 114                if (stats->lf != stats->crlf) {
 115                        if (checksafe == SAFE_CRLF_WARN)
 116                                warning("LF will be replaced by CRLF in %s", path);
 117                        else /* i.e. SAFE_CRLF_FAIL */
 118                                die("LF would be replaced by CRLF in %s", path);
 119                }
 120        }
 121}
 122
 123static int crlf_to_git(const char *path, const char *src, size_t len,
 124                       struct strbuf *buf, int action, enum safe_crlf checksafe)
 125{
 126        struct text_stat stats;
 127        char *dst;
 128
 129        if ((action == CRLF_BINARY) || !auto_crlf || !len)
 130                return 0;
 131
 132        gather_stats(src, len, &stats);
 133
 134        if (action == CRLF_GUESS) {
 135                /*
 136                 * We're currently not going to even try to convert stuff
 137                 * that has bare CR characters. Does anybody do that crazy
 138                 * stuff?
 139                 */
 140                if (stats.cr != stats.crlf)
 141                        return 0;
 142
 143                /*
 144                 * And add some heuristics for binary vs text, of course...
 145                 */
 146                if (is_binary(len, &stats))
 147                        return 0;
 148        }
 149
 150        check_safe_crlf(path, action, &stats, checksafe);
 151
 152        /* Optimization: No CR? Nothing to convert, regardless. */
 153        if (!stats.cr)
 154                return 0;
 155
 156        /* only grow if not in place */
 157        if (strbuf_avail(buf) + buf->len < len)
 158                strbuf_grow(buf, len - buf->len);
 159        dst = buf->buf;
 160        if (action == CRLF_GUESS) {
 161                /*
 162                 * If we guessed, we already know we rejected a file with
 163                 * lone CR, and we can strip a CR without looking at what
 164                 * follow it.
 165                 */
 166                do {
 167                        unsigned char c = *src++;
 168                        if (c != '\r')
 169                                *dst++ = c;
 170                } while (--len);
 171        } else {
 172                do {
 173                        unsigned char c = *src++;
 174                        if (! (c == '\r' && (1 < len && *src == '\n')))
 175                                *dst++ = c;
 176                } while (--len);
 177        }
 178        strbuf_setlen(buf, dst - buf->buf);
 179        return 1;
 180}
 181
 182static int crlf_to_worktree(const char *path, const char *src, size_t len,
 183                            struct strbuf *buf, int action)
 184{
 185        char *to_free = NULL;
 186        struct text_stat stats;
 187
 188        if ((action == CRLF_BINARY) || (action == CRLF_INPUT) ||
 189            auto_crlf <= 0)
 190                return 0;
 191
 192        if (!len)
 193                return 0;
 194
 195        gather_stats(src, len, &stats);
 196
 197        /* No LF? Nothing to convert, regardless. */
 198        if (!stats.lf)
 199                return 0;
 200
 201        /* Was it already in CRLF format? */
 202        if (stats.lf == stats.crlf)
 203                return 0;
 204
 205        if (action == CRLF_GUESS) {
 206                /* If we have any bare CR characters, we're not going to touch it */
 207                if (stats.cr != stats.crlf)
 208                        return 0;
 209
 210                if (is_binary(len, &stats))
 211                        return 0;
 212        }
 213
 214        /* are we "faking" in place editing ? */
 215        if (src == buf->buf)
 216                to_free = strbuf_detach(buf, NULL);
 217
 218        strbuf_grow(buf, len + stats.lf - stats.crlf);
 219        for (;;) {
 220                const char *nl = memchr(src, '\n', len);
 221                if (!nl)
 222                        break;
 223                if (nl > src && nl[-1] == '\r') {
 224                        strbuf_add(buf, src, nl + 1 - src);
 225                } else {
 226                        strbuf_add(buf, src, nl - src);
 227                        strbuf_addstr(buf, "\r\n");
 228                }
 229                len -= nl + 1 - src;
 230                src  = nl + 1;
 231        }
 232        strbuf_add(buf, src, len);
 233
 234        free(to_free);
 235        return 1;
 236}
 237
 238struct filter_params {
 239        const char *src;
 240        unsigned long size;
 241        const char *cmd;
 242};
 243
 244static int filter_buffer(int fd, void *data)
 245{
 246        /*
 247         * Spawn cmd and feed the buffer contents through its stdin.
 248         */
 249        struct child_process child_process;
 250        struct filter_params *params = (struct filter_params *)data;
 251        int write_err, status;
 252        const char *argv[] = { "sh", "-c", params->cmd, NULL };
 253
 254        memset(&child_process, 0, sizeof(child_process));
 255        child_process.argv = argv;
 256        child_process.in = -1;
 257        child_process.out = fd;
 258
 259        if (start_command(&child_process))
 260                return error("cannot fork to run external filter %s", params->cmd);
 261
 262        write_err = (write_in_full(child_process.in, params->src, params->size) < 0);
 263        if (close(child_process.in))
 264                write_err = 1;
 265        if (write_err)
 266                error("cannot feed the input to external filter %s", params->cmd);
 267
 268        status = finish_command(&child_process);
 269        if (status)
 270                error("external filter %s failed %d", params->cmd, -status);
 271        return (write_err || status);
 272}
 273
 274static int apply_filter(const char *path, const char *src, size_t len,
 275                        struct strbuf *dst, const char *cmd)
 276{
 277        /*
 278         * Create a pipeline to have the command filter the buffer's
 279         * contents.
 280         *
 281         * (child --> cmd) --> us
 282         */
 283        int ret = 1;
 284        struct strbuf nbuf;
 285        struct async async;
 286        struct filter_params params;
 287
 288        if (!cmd)
 289                return 0;
 290
 291        memset(&async, 0, sizeof(async));
 292        async.proc = filter_buffer;
 293        async.data = &params;
 294        params.src = src;
 295        params.size = len;
 296        params.cmd = cmd;
 297
 298        fflush(NULL);
 299        if (start_async(&async))
 300                return 0;       /* error was already reported */
 301
 302        strbuf_init(&nbuf, 0);
 303        if (strbuf_read(&nbuf, async.out, len) < 0) {
 304                error("read from external filter %s failed", cmd);
 305                ret = 0;
 306        }
 307        if (close(async.out)) {
 308                error("read from external filter %s failed", cmd);
 309                ret = 0;
 310        }
 311        if (finish_async(&async)) {
 312                error("external filter %s failed", cmd);
 313                ret = 0;
 314        }
 315
 316        if (ret) {
 317                strbuf_swap(dst, &nbuf);
 318        }
 319        strbuf_release(&nbuf);
 320        return ret;
 321}
 322
 323static struct convert_driver {
 324        const char *name;
 325        struct convert_driver *next;
 326        const char *smudge;
 327        const char *clean;
 328} *user_convert, **user_convert_tail;
 329
 330static int read_convert_config(const char *var, const char *value, void *cb)
 331{
 332        const char *ep, *name;
 333        int namelen;
 334        struct convert_driver *drv;
 335
 336        /*
 337         * External conversion drivers are configured using
 338         * "filter.<name>.variable".
 339         */
 340        if (prefixcmp(var, "filter.") || (ep = strrchr(var, '.')) == var + 6)
 341                return 0;
 342        name = var + 7;
 343        namelen = ep - name;
 344        for (drv = user_convert; drv; drv = drv->next)
 345                if (!strncmp(drv->name, name, namelen) && !drv->name[namelen])
 346                        break;
 347        if (!drv) {
 348                drv = xcalloc(1, sizeof(struct convert_driver));
 349                drv->name = xmemdupz(name, namelen);
 350                *user_convert_tail = drv;
 351                user_convert_tail = &(drv->next);
 352        }
 353
 354        ep++;
 355
 356        /*
 357         * filter.<name>.smudge and filter.<name>.clean specifies
 358         * the command line:
 359         *
 360         *      command-line
 361         *
 362         * The command-line will not be interpolated in any way.
 363         */
 364
 365        if (!strcmp("smudge", ep))
 366                return git_config_string(&drv->smudge, var, value);
 367
 368        if (!strcmp("clean", ep))
 369                return git_config_string(&drv->clean, var, value);
 370
 371        return 0;
 372}
 373
 374static void setup_convert_check(struct git_attr_check *check)
 375{
 376        static struct git_attr *attr_crlf;
 377        static struct git_attr *attr_ident;
 378        static struct git_attr *attr_filter;
 379
 380        if (!attr_crlf) {
 381                attr_crlf = git_attr("crlf", 4);
 382                attr_ident = git_attr("ident", 5);
 383                attr_filter = git_attr("filter", 6);
 384                user_convert_tail = &user_convert;
 385                git_config(read_convert_config, NULL);
 386        }
 387        check[0].attr = attr_crlf;
 388        check[1].attr = attr_ident;
 389        check[2].attr = attr_filter;
 390}
 391
 392static int count_ident(const char *cp, unsigned long size)
 393{
 394        /*
 395         * "$Id: 0000000000000000000000000000000000000000 $" <=> "$Id$"
 396         */
 397        int cnt = 0;
 398        char ch;
 399
 400        while (size) {
 401                ch = *cp++;
 402                size--;
 403                if (ch != '$')
 404                        continue;
 405                if (size < 3)
 406                        break;
 407                if (memcmp("Id", cp, 2))
 408                        continue;
 409                ch = cp[2];
 410                cp += 3;
 411                size -= 3;
 412                if (ch == '$')
 413                        cnt++; /* $Id$ */
 414                if (ch != ':')
 415                        continue;
 416
 417                /*
 418                 * "$Id: ... "; scan up to the closing dollar sign and discard.
 419                 */
 420                while (size) {
 421                        ch = *cp++;
 422                        size--;
 423                        if (ch == '$') {
 424                                cnt++;
 425                                break;
 426                        }
 427                }
 428        }
 429        return cnt;
 430}
 431
 432static int ident_to_git(const char *path, const char *src, size_t len,
 433                        struct strbuf *buf, int ident)
 434{
 435        char *dst, *dollar;
 436
 437        if (!ident || !count_ident(src, len))
 438                return 0;
 439
 440        /* only grow if not in place */
 441        if (strbuf_avail(buf) + buf->len < len)
 442                strbuf_grow(buf, len - buf->len);
 443        dst = buf->buf;
 444        for (;;) {
 445                dollar = memchr(src, '$', len);
 446                if (!dollar)
 447                        break;
 448                memcpy(dst, src, dollar + 1 - src);
 449                dst += dollar + 1 - src;
 450                len -= dollar + 1 - src;
 451                src  = dollar + 1;
 452
 453                if (len > 3 && !memcmp(src, "Id:", 3)) {
 454                        dollar = memchr(src + 3, '$', len - 3);
 455                        if (!dollar)
 456                                break;
 457                        memcpy(dst, "Id$", 3);
 458                        dst += 3;
 459                        len -= dollar + 1 - src;
 460                        src  = dollar + 1;
 461                }
 462        }
 463        memcpy(dst, src, len);
 464        strbuf_setlen(buf, dst + len - buf->buf);
 465        return 1;
 466}
 467
 468static int ident_to_worktree(const char *path, const char *src, size_t len,
 469                             struct strbuf *buf, int ident)
 470{
 471        unsigned char sha1[20];
 472        char *to_free = NULL, *dollar;
 473        int cnt;
 474
 475        if (!ident)
 476                return 0;
 477
 478        cnt = count_ident(src, len);
 479        if (!cnt)
 480                return 0;
 481
 482        /* are we "faking" in place editing ? */
 483        if (src == buf->buf)
 484                to_free = strbuf_detach(buf, NULL);
 485        hash_sha1_file(src, len, "blob", sha1);
 486
 487        strbuf_grow(buf, len + cnt * 43);
 488        for (;;) {
 489                /* step 1: run to the next '$' */
 490                dollar = memchr(src, '$', len);
 491                if (!dollar)
 492                        break;
 493                strbuf_add(buf, src, dollar + 1 - src);
 494                len -= dollar + 1 - src;
 495                src  = dollar + 1;
 496
 497                /* step 2: does it looks like a bit like Id:xxx$ or Id$ ? */
 498                if (len < 3 || memcmp("Id", src, 2))
 499                        continue;
 500
 501                /* step 3: skip over Id$ or Id:xxxxx$ */
 502                if (src[2] == '$') {
 503                        src += 3;
 504                        len -= 3;
 505                } else if (src[2] == ':') {
 506                        /*
 507                         * It's possible that an expanded Id has crept its way into the
 508                         * repository, we cope with that by stripping the expansion out
 509                         */
 510                        dollar = memchr(src + 3, '$', len - 3);
 511                        if (!dollar) {
 512                                /* incomplete keyword, no more '$', so just quit the loop */
 513                                break;
 514                        }
 515
 516                        len -= dollar + 1 - src;
 517                        src  = dollar + 1;
 518                } else {
 519                        /* it wasn't a "Id$" or "Id:xxxx$" */
 520                        continue;
 521                }
 522
 523                /* step 4: substitute */
 524                strbuf_addstr(buf, "Id: ");
 525                strbuf_add(buf, sha1_to_hex(sha1), 40);
 526                strbuf_addstr(buf, " $");
 527        }
 528        strbuf_add(buf, src, len);
 529
 530        free(to_free);
 531        return 1;
 532}
 533
 534static int git_path_check_crlf(const char *path, struct git_attr_check *check)
 535{
 536        const char *value = check->value;
 537
 538        if (ATTR_TRUE(value))
 539                return CRLF_TEXT;
 540        else if (ATTR_FALSE(value))
 541                return CRLF_BINARY;
 542        else if (ATTR_UNSET(value))
 543                ;
 544        else if (!strcmp(value, "input"))
 545                return CRLF_INPUT;
 546        return CRLF_GUESS;
 547}
 548
 549static struct convert_driver *git_path_check_convert(const char *path,
 550                                             struct git_attr_check *check)
 551{
 552        const char *value = check->value;
 553        struct convert_driver *drv;
 554
 555        if (ATTR_TRUE(value) || ATTR_FALSE(value) || ATTR_UNSET(value))
 556                return NULL;
 557        for (drv = user_convert; drv; drv = drv->next)
 558                if (!strcmp(value, drv->name))
 559                        return drv;
 560        return NULL;
 561}
 562
 563static int git_path_check_ident(const char *path, struct git_attr_check *check)
 564{
 565        const char *value = check->value;
 566
 567        return !!ATTR_TRUE(value);
 568}
 569
 570int convert_to_git(const char *path, const char *src, size_t len,
 571                   struct strbuf *dst, enum safe_crlf checksafe)
 572{
 573        struct git_attr_check check[3];
 574        int crlf = CRLF_GUESS;
 575        int ident = 0, ret = 0;
 576        const char *filter = NULL;
 577
 578        setup_convert_check(check);
 579        if (!git_checkattr(path, ARRAY_SIZE(check), check)) {
 580                struct convert_driver *drv;
 581                crlf = git_path_check_crlf(path, check + 0);
 582                ident = git_path_check_ident(path, check + 1);
 583                drv = git_path_check_convert(path, check + 2);
 584                if (drv && drv->clean)
 585                        filter = drv->clean;
 586        }
 587
 588        ret |= apply_filter(path, src, len, dst, filter);
 589        if (ret) {
 590                src = dst->buf;
 591                len = dst->len;
 592        }
 593        ret |= crlf_to_git(path, src, len, dst, crlf, checksafe);
 594        if (ret) {
 595                src = dst->buf;
 596                len = dst->len;
 597        }
 598        return ret | ident_to_git(path, src, len, dst, ident);
 599}
 600
 601int convert_to_working_tree(const char *path, const char *src, size_t len, struct strbuf *dst)
 602{
 603        struct git_attr_check check[3];
 604        int crlf = CRLF_GUESS;
 605        int ident = 0, ret = 0;
 606        const char *filter = NULL;
 607
 608        setup_convert_check(check);
 609        if (!git_checkattr(path, ARRAY_SIZE(check), check)) {
 610                struct convert_driver *drv;
 611                crlf = git_path_check_crlf(path, check + 0);
 612                ident = git_path_check_ident(path, check + 1);
 613                drv = git_path_check_convert(path, check + 2);
 614                if (drv && drv->smudge)
 615                        filter = drv->smudge;
 616        }
 617
 618        ret |= ident_to_worktree(path, src, len, dst, ident);
 619        if (ret) {
 620                src = dst->buf;
 621                len = dst->len;
 622        }
 623        ret |= crlf_to_worktree(path, src, len, dst, crlf);
 624        if (ret) {
 625                src = dst->buf;
 626                len = dst->len;
 627        }
 628        return ret | apply_filter(path, src, len, dst, filter);
 629}