convert.c on commit instaweb: add minification awareness (09b89d1)
   1#include "cache.h"
   2#include "attr.h"
   3#include "run-command.h"
   4
   5/*
   6 * convert.c - convert a file when checking it out and checking it in.
   7 *
   8 * This should use the pathname to decide on whether it wants to do some
   9 * more interesting conversions (automatic gzip/unzip, general format
  10 * conversions etc etc), but by default it just does automatic CRLF<->LF
  11 * translation when the "auto_crlf" option is set.
  12 */
  13
  14#define CRLF_GUESS      (-1)
  15#define CRLF_BINARY     0
  16#define CRLF_TEXT       1
  17#define CRLF_INPUT      2
  18
  19struct text_stat {
  20        /* NUL, CR, LF and CRLF counts */
  21        unsigned nul, cr, lf, crlf;
  22
  23        /* These are just approximations! */
  24        unsigned printable, nonprintable;
  25};
  26
  27static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats)
  28{
  29        unsigned long i;
  30
  31        memset(stats, 0, sizeof(*stats));
  32
  33        for (i = 0; i < size; i++) {
  34                unsigned char c = buf[i];
  35                if (c == '\r') {
  36                        stats->cr++;
  37                        if (i+1 < size && buf[i+1] == '\n')
  38                                stats->crlf++;
  39                        continue;
  40                }
  41                if (c == '\n') {
  42                        stats->lf++;
  43                        continue;
  44                }
  45                if (c == 127)
  46                        /* DEL */
  47                        stats->nonprintable++;
  48                else if (c < 32) {
  49                        switch (c) {
  50                                /* BS, HT, ESC and FF */
  51                        case '\b': case '\t': case '\033': case '\014':
  52                                stats->printable++;
  53                                break;
  54                        case 0:
  55                                stats->nul++;
  56                                /* fall through */
  57                        default:
  58                                stats->nonprintable++;
  59                        }
  60                }
  61                else
  62                        stats->printable++;
  63        }
  64
  65        /* If file ends with EOF then don't count this EOF as non-printable. */
  66        if (size >= 1 && buf[size-1] == '\032')
  67                stats->nonprintable--;
  68}
  69
  70/*
  71 * The same heuristics as diff.c::mmfile_is_binary()
  72 */
  73static int is_binary(unsigned long size, struct text_stat *stats)
  74{
  75
  76        if (stats->nul)
  77                return 1;
  78        if ((stats->printable >> 7) < stats->nonprintable)
  79                return 1;
  80        /*
  81         * Other heuristics? Average line length might be relevant,
  82         * as might LF vs CR vs CRLF counts..
  83         *
  84         * NOTE! It might be normal to have a low ratio of CRLF to LF
  85         * (somebody starts with a LF-only file and edits it with an editor
  86         * that adds CRLF only to lines that are added..). But do  we
  87         * want to support CR-only? Probably not.
  88         */
  89        return 0;
  90}
  91
  92static void check_safe_crlf(const char *path, int action,
  93                            struct text_stat *stats, enum safe_crlf checksafe)
  94{
  95        if (!checksafe)
  96                return;
  97
  98        if (action == CRLF_INPUT || auto_crlf <= 0) {
  99                /*
 100                 * CRLFs would not be restored by checkout:
 101                 * check if we'd remove CRLFs
 102                 */
 103                if (stats->crlf) {
 104                        if (checksafe == SAFE_CRLF_WARN)
 105                                warning("CRLF will be replaced by LF in %s.", path);
 106                        else /* i.e. SAFE_CRLF_FAIL */
 107                                die("CRLF would be replaced by LF in %s.", path);
 108                }
 109        } else if (auto_crlf > 0) {
 110                /*
 111                 * CRLFs would be added by checkout:
 112                 * check if we have "naked" LFs
 113                 */
 114                if (stats->lf != stats->crlf) {
 115                        if (checksafe == SAFE_CRLF_WARN)
 116                                warning("LF will be replaced by CRLF in %s", path);
 117                        else /* i.e. SAFE_CRLF_FAIL */
 118                                die("LF would be replaced by CRLF in %s", path);
 119                }
 120        }
 121}
 122
 123static int crlf_to_git(const char *path, const char *src, size_t len,
 124                       struct strbuf *buf, int action, enum safe_crlf checksafe)
 125{
 126        struct text_stat stats;
 127        char *dst;
 128
 129        if ((action == CRLF_BINARY) || !auto_crlf || !len)
 130                return 0;
 131
 132        gather_stats(src, len, &stats);
 133
 134        if (action == CRLF_GUESS) {
 135                /*
 136                 * We're currently not going to even try to convert stuff
 137                 * that has bare CR characters. Does anybody do that crazy
 138                 * stuff?
 139                 */
 140                if (stats.cr != stats.crlf)
 141                        return 0;
 142
 143                /*
 144                 * And add some heuristics for binary vs text, of course...
 145                 */
 146                if (is_binary(len, &stats))
 147                        return 0;
 148        }
 149
 150        check_safe_crlf(path, action, &stats, checksafe);
 151
 152        /* Optimization: No CR? Nothing to convert, regardless. */
 153        if (!stats.cr)
 154                return 0;
 155
 156        /* only grow if not in place */
 157        if (strbuf_avail(buf) + buf->len < len)
 158                strbuf_grow(buf, len - buf->len);
 159        dst = buf->buf;
 160        if (action == CRLF_GUESS) {
 161                /*
 162                 * If we guessed, we already know we rejected a file with
 163                 * lone CR, and we can strip a CR without looking at what
 164                 * follow it.
 165                 */
 166                do {
 167                        unsigned char c = *src++;
 168                        if (c != '\r')
 169                                *dst++ = c;
 170                } while (--len);
 171        } else {
 172                do {
 173                        unsigned char c = *src++;
 174                        if (! (c == '\r' && (1 < len && *src == '\n')))
 175                                *dst++ = c;
 176                } while (--len);
 177        }
 178        strbuf_setlen(buf, dst - buf->buf);
 179        return 1;
 180}
 181
 182static int crlf_to_worktree(const char *path, const char *src, size_t len,
 183                            struct strbuf *buf, int action)
 184{
 185        char *to_free = NULL;
 186        struct text_stat stats;
 187
 188        if ((action == CRLF_BINARY) || (action == CRLF_INPUT) ||
 189            auto_crlf <= 0)
 190                return 0;
 191
 192        if (!len)
 193                return 0;
 194
 195        gather_stats(src, len, &stats);
 196
 197        /* No LF? Nothing to convert, regardless. */
 198        if (!stats.lf)
 199                return 0;
 200
 201        /* Was it already in CRLF format? */
 202        if (stats.lf == stats.crlf)
 203                return 0;
 204
 205        if (action == CRLF_GUESS) {
 206                /* If we have any bare CR characters, we're not going to touch it */
 207                if (stats.cr != stats.crlf)
 208                        return 0;
 209
 210                if (is_binary(len, &stats))
 211                        return 0;
 212        }
 213
 214        /* are we "faking" in place editing ? */
 215        if (src == buf->buf)
 216                to_free = strbuf_detach(buf, NULL);
 217
 218        strbuf_grow(buf, len + stats.lf - stats.crlf);
 219        for (;;) {
 220                const char *nl = memchr(src, '\n', len);
 221                if (!nl)
 222                        break;
 223                if (nl > src && nl[-1] == '\r') {
 224                        strbuf_add(buf, src, nl + 1 - src);
 225                } else {
 226                        strbuf_add(buf, src, nl - src);
 227                        strbuf_addstr(buf, "\r\n");
 228                }
 229                len -= nl + 1 - src;
 230                src  = nl + 1;
 231        }
 232        strbuf_add(buf, src, len);
 233
 234        free(to_free);
 235        return 1;
 236}
 237
 238struct filter_params {
 239        const char *src;
 240        unsigned long size;
 241        const char *cmd;
 242};
 243
 244static int filter_buffer(int in, int out, void *data)
 245{
 246        /*
 247         * Spawn cmd and feed the buffer contents through its stdin.
 248         */
 249        struct child_process child_process;
 250        struct filter_params *params = (struct filter_params *)data;
 251        int write_err, status;
 252        const char *argv[] = { params->cmd, NULL };
 253
 254        memset(&child_process, 0, sizeof(child_process));
 255        child_process.argv = argv;
 256        child_process.use_shell = 1;
 257        child_process.in = -1;
 258        child_process.out = out;
 259
 260        if (start_command(&child_process))
 261                return error("cannot fork to run external filter %s", params->cmd);
 262
 263        write_err = (write_in_full(child_process.in, params->src, params->size) < 0);
 264        if (close(child_process.in))
 265                write_err = 1;
 266        if (write_err)
 267                error("cannot feed the input to external filter %s", params->cmd);
 268
 269        status = finish_command(&child_process);
 270        if (status)
 271                error("external filter %s failed %d", params->cmd, status);
 272        return (write_err || status);
 273}
 274
 275static int apply_filter(const char *path, const char *src, size_t len,
 276                        struct strbuf *dst, const char *cmd)
 277{
 278        /*
 279         * Create a pipeline to have the command filter the buffer's
 280         * contents.
 281         *
 282         * (child --> cmd) --> us
 283         */
 284        int ret = 1;
 285        struct strbuf nbuf = STRBUF_INIT;
 286        struct async async;
 287        struct filter_params params;
 288
 289        if (!cmd)
 290                return 0;
 291
 292        memset(&async, 0, sizeof(async));
 293        async.proc = filter_buffer;
 294        async.data = &params;
 295        async.out = -1;
 296        params.src = src;
 297        params.size = len;
 298        params.cmd = cmd;
 299
 300        fflush(NULL);
 301        if (start_async(&async))
 302                return 0;       /* error was already reported */
 303
 304        if (strbuf_read(&nbuf, async.out, len) < 0) {
 305                error("read from external filter %s failed", cmd);
 306                ret = 0;
 307        }
 308        if (close(async.out)) {
 309                error("read from external filter %s failed", cmd);
 310                ret = 0;
 311        }
 312        if (finish_async(&async)) {
 313                error("external filter %s failed", cmd);
 314                ret = 0;
 315        }
 316
 317        if (ret) {
 318                strbuf_swap(dst, &nbuf);
 319        }
 320        strbuf_release(&nbuf);
 321        return ret;
 322}
 323
 324static struct convert_driver {
 325        const char *name;
 326        struct convert_driver *next;
 327        const char *smudge;
 328        const char *clean;
 329} *user_convert, **user_convert_tail;
 330
 331static int read_convert_config(const char *var, const char *value, void *cb)
 332{
 333        const char *ep, *name;
 334        int namelen;
 335        struct convert_driver *drv;
 336
 337        /*
 338         * External conversion drivers are configured using
 339         * "filter.<name>.variable".
 340         */
 341        if (prefixcmp(var, "filter.") || (ep = strrchr(var, '.')) == var + 6)
 342                return 0;
 343        name = var + 7;
 344        namelen = ep - name;
 345        for (drv = user_convert; drv; drv = drv->next)
 346                if (!strncmp(drv->name, name, namelen) && !drv->name[namelen])
 347                        break;
 348        if (!drv) {
 349                drv = xcalloc(1, sizeof(struct convert_driver));
 350                drv->name = xmemdupz(name, namelen);
 351                *user_convert_tail = drv;
 352                user_convert_tail = &(drv->next);
 353        }
 354
 355        ep++;
 356
 357        /*
 358         * filter.<name>.smudge and filter.<name>.clean specifies
 359         * the command line:
 360         *
 361         *      command-line
 362         *
 363         * The command-line will not be interpolated in any way.
 364         */
 365
 366        if (!strcmp("smudge", ep))
 367                return git_config_string(&drv->smudge, var, value);
 368
 369        if (!strcmp("clean", ep))
 370                return git_config_string(&drv->clean, var, value);
 371
 372        return 0;
 373}
 374
 375static void setup_convert_check(struct git_attr_check *check)
 376{
 377        static struct git_attr *attr_crlf;
 378        static struct git_attr *attr_ident;
 379        static struct git_attr *attr_filter;
 380
 381        if (!attr_crlf) {
 382                attr_crlf = git_attr("crlf");
 383                attr_ident = git_attr("ident");
 384                attr_filter = git_attr("filter");
 385                user_convert_tail = &user_convert;
 386                git_config(read_convert_config, NULL);
 387        }
 388        check[0].attr = attr_crlf;
 389        check[1].attr = attr_ident;
 390        check[2].attr = attr_filter;
 391}
 392
 393static int count_ident(const char *cp, unsigned long size)
 394{
 395        /*
 396         * "$Id: 0000000000000000000000000000000000000000 $" <=> "$Id$"
 397         */
 398        int cnt = 0;
 399        char ch;
 400
 401        while (size) {
 402                ch = *cp++;
 403                size--;
 404                if (ch != '$')
 405                        continue;
 406                if (size < 3)
 407                        break;
 408                if (memcmp("Id", cp, 2))
 409                        continue;
 410                ch = cp[2];
 411                cp += 3;
 412                size -= 3;
 413                if (ch == '$')
 414                        cnt++; /* $Id$ */
 415                if (ch != ':')
 416                        continue;
 417
 418                /*
 419                 * "$Id: ... "; scan up to the closing dollar sign and discard.
 420                 */
 421                while (size) {
 422                        ch = *cp++;
 423                        size--;
 424                        if (ch == '$') {
 425                                cnt++;
 426                                break;
 427                        }
 428                }
 429        }
 430        return cnt;
 431}
 432
 433static int ident_to_git(const char *path, const char *src, size_t len,
 434                        struct strbuf *buf, int ident)
 435{
 436        char *dst, *dollar;
 437
 438        if (!ident || !count_ident(src, len))
 439                return 0;
 440
 441        /* only grow if not in place */
 442        if (strbuf_avail(buf) + buf->len < len)
 443                strbuf_grow(buf, len - buf->len);
 444        dst = buf->buf;
 445        for (;;) {
 446                dollar = memchr(src, '$', len);
 447                if (!dollar)
 448                        break;
 449                memcpy(dst, src, dollar + 1 - src);
 450                dst += dollar + 1 - src;
 451                len -= dollar + 1 - src;
 452                src  = dollar + 1;
 453
 454                if (len > 3 && !memcmp(src, "Id:", 3)) {
 455                        dollar = memchr(src + 3, '$', len - 3);
 456                        if (!dollar)
 457                                break;
 458                        memcpy(dst, "Id$", 3);
 459                        dst += 3;
 460                        len -= dollar + 1 - src;
 461                        src  = dollar + 1;
 462                }
 463        }
 464        memcpy(dst, src, len);
 465        strbuf_setlen(buf, dst + len - buf->buf);
 466        return 1;
 467}
 468
 469static int ident_to_worktree(const char *path, const char *src, size_t len,
 470                             struct strbuf *buf, int ident)
 471{
 472        unsigned char sha1[20];
 473        char *to_free = NULL, *dollar;
 474        int cnt;
 475
 476        if (!ident)
 477                return 0;
 478
 479        cnt = count_ident(src, len);
 480        if (!cnt)
 481                return 0;
 482
 483        /* are we "faking" in place editing ? */
 484        if (src == buf->buf)
 485                to_free = strbuf_detach(buf, NULL);
 486        hash_sha1_file(src, len, "blob", sha1);
 487
 488        strbuf_grow(buf, len + cnt * 43);
 489        for (;;) {
 490                /* step 1: run to the next '$' */
 491                dollar = memchr(src, '$', len);
 492                if (!dollar)
 493                        break;
 494                strbuf_add(buf, src, dollar + 1 - src);
 495                len -= dollar + 1 - src;
 496                src  = dollar + 1;
 497
 498                /* step 2: does it looks like a bit like Id:xxx$ or Id$ ? */
 499                if (len < 3 || memcmp("Id", src, 2))
 500                        continue;
 501
 502                /* step 3: skip over Id$ or Id:xxxxx$ */
 503                if (src[2] == '$') {
 504                        src += 3;
 505                        len -= 3;
 506                } else if (src[2] == ':') {
 507                        /*
 508                         * It's possible that an expanded Id has crept its way into the
 509                         * repository, we cope with that by stripping the expansion out
 510                         */
 511                        dollar = memchr(src + 3, '$', len - 3);
 512                        if (!dollar) {
 513                                /* incomplete keyword, no more '$', so just quit the loop */
 514                                break;
 515                        }
 516
 517                        len -= dollar + 1 - src;
 518                        src  = dollar + 1;
 519                } else {
 520                        /* it wasn't a "Id$" or "Id:xxxx$" */
 521                        continue;
 522                }
 523
 524                /* step 4: substitute */
 525                strbuf_addstr(buf, "Id: ");
 526                strbuf_add(buf, sha1_to_hex(sha1), 40);
 527                strbuf_addstr(buf, " $");
 528        }
 529        strbuf_add(buf, src, len);
 530
 531        free(to_free);
 532        return 1;
 533}
 534
 535static int git_path_check_crlf(const char *path, struct git_attr_check *check)
 536{
 537        const char *value = check->value;
 538
 539        if (ATTR_TRUE(value))
 540                return CRLF_TEXT;
 541        else if (ATTR_FALSE(value))
 542                return CRLF_BINARY;
 543        else if (ATTR_UNSET(value))
 544                ;
 545        else if (!strcmp(value, "input"))
 546                return CRLF_INPUT;
 547        return CRLF_GUESS;
 548}
 549
 550static struct convert_driver *git_path_check_convert(const char *path,
 551                                             struct git_attr_check *check)
 552{
 553        const char *value = check->value;
 554        struct convert_driver *drv;
 555
 556        if (ATTR_TRUE(value) || ATTR_FALSE(value) || ATTR_UNSET(value))
 557                return NULL;
 558        for (drv = user_convert; drv; drv = drv->next)
 559                if (!strcmp(value, drv->name))
 560                        return drv;
 561        return NULL;
 562}
 563
 564static int git_path_check_ident(const char *path, struct git_attr_check *check)
 565{
 566        const char *value = check->value;
 567
 568        return !!ATTR_TRUE(value);
 569}
 570
 571int convert_to_git(const char *path, const char *src, size_t len,
 572                   struct strbuf *dst, enum safe_crlf checksafe)
 573{
 574        struct git_attr_check check[3];
 575        int crlf = CRLF_GUESS;
 576        int ident = 0, ret = 0;
 577        const char *filter = NULL;
 578
 579        setup_convert_check(check);
 580        if (!git_checkattr(path, ARRAY_SIZE(check), check)) {
 581                struct convert_driver *drv;
 582                crlf = git_path_check_crlf(path, check + 0);
 583                ident = git_path_check_ident(path, check + 1);
 584                drv = git_path_check_convert(path, check + 2);
 585                if (drv && drv->clean)
 586                        filter = drv->clean;
 587        }
 588
 589        ret |= apply_filter(path, src, len, dst, filter);
 590        if (ret) {
 591                src = dst->buf;
 592                len = dst->len;
 593        }
 594        ret |= crlf_to_git(path, src, len, dst, crlf, checksafe);
 595        if (ret) {
 596                src = dst->buf;
 597                len = dst->len;
 598        }
 599        return ret | ident_to_git(path, src, len, dst, ident);
 600}
 601
 602int convert_to_working_tree(const char *path, const char *src, size_t len, struct strbuf *dst)
 603{
 604        struct git_attr_check check[3];
 605        int crlf = CRLF_GUESS;
 606        int ident = 0, ret = 0;
 607        const char *filter = NULL;
 608
 609        setup_convert_check(check);
 610        if (!git_checkattr(path, ARRAY_SIZE(check), check)) {
 611                struct convert_driver *drv;
 612                crlf = git_path_check_crlf(path, check + 0);
 613                ident = git_path_check_ident(path, check + 1);
 614                drv = git_path_check_convert(path, check + 2);
 615                if (drv && drv->smudge)
 616                        filter = drv->smudge;
 617        }
 618
 619        ret |= ident_to_worktree(path, src, len, dst, ident);
 620        if (ret) {
 621                src = dst->buf;
 622                len = dst->len;
 623        }
 624        ret |= crlf_to_worktree(path, src, len, dst, crlf);
 625        if (ret) {
 626                src = dst->buf;
 627                len = dst->len;
 628        }
 629        return ret | apply_filter(path, src, len, dst, filter);
 630}