convert.c on commit cherry-pick: Suggest a better method to retain authorship (f52463a)
   1#include "cache.h"
   2/*
   3 * convert.c - convert a file when checking it out and checking it in.
   4 *
   5 * This should use the pathname to decide on whether it wants to do some
   6 * more interesting conversions (automatic gzip/unzip, general format
   7 * conversions etc etc), but by default it just does automatic CRLF<->LF
   8 * translation when the "auto_crlf" option is set.
   9 */
  10
  11struct text_stat {
  12        /* CR, LF and CRLF counts */
  13        unsigned cr, lf, crlf;
  14
  15        /* These are just approximations! */
  16        unsigned printable, nonprintable;
  17};
  18
  19static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats)
  20{
  21        unsigned long i;
  22
  23        memset(stats, 0, sizeof(*stats));
  24
  25        for (i = 0; i < size; i++) {
  26                unsigned char c = buf[i];
  27                if (c == '\r') {
  28                        stats->cr++;
  29                        if (i+1 < size && buf[i+1] == '\n')
  30                                stats->crlf++;
  31                        continue;
  32                }
  33                if (c == '\n') {
  34                        stats->lf++;
  35                        continue;
  36                }
  37                if (c == 127)
  38                        /* DEL */
  39                        stats->nonprintable++;
  40                else if (c < 32) {
  41                        switch (c) {
  42                                /* BS, HT, ESC and FF */
  43                        case '\b': case '\t': case '\033': case '\014':
  44                                stats->printable++;
  45                                break;
  46                        default:
  47                                stats->nonprintable++;
  48                        }
  49                }
  50                else
  51                        stats->printable++;
  52        }
  53}
  54
  55/*
  56 * The same heuristics as diff.c::mmfile_is_binary()
  57 */
  58static int is_binary(unsigned long size, struct text_stat *stats)
  59{
  60
  61        if ((stats->printable >> 7) < stats->nonprintable)
  62                return 1;
  63        /*
  64         * Other heuristics? Average line length might be relevant,
  65         * as might LF vs CR vs CRLF counts..
  66         *
  67         * NOTE! It might be normal to have a low ratio of CRLF to LF
  68         * (somebody starts with a LF-only file and edits it with an editor
  69         * that adds CRLF only to lines that are added..). But do  we
  70         * want to support CR-only? Probably not.
  71         */
  72        return 0;
  73}
  74
  75int convert_to_git(const char *path, char **bufp, unsigned long *sizep)
  76{
  77        char *buffer, *nbuf;
  78        unsigned long size, nsize;
  79        struct text_stat stats;
  80
  81        /*
  82         * FIXME! Other pluggable conversions should go here,
  83         * based on filename patterns. Right now we just do the
  84         * stupid auto-CRLF one.
  85         */
  86        if (!auto_crlf)
  87                return 0;
  88
  89        size = *sizep;
  90        if (!size)
  91                return 0;
  92        buffer = *bufp;
  93
  94        gather_stats(buffer, size, &stats);
  95
  96        /* No CR? Nothing to convert, regardless. */
  97        if (!stats.cr)
  98                return 0;
  99
 100        /*
 101         * We're currently not going to even try to convert stuff
 102         * that has bare CR characters. Does anybody do that crazy
 103         * stuff?
 104         */
 105        if (stats.cr != stats.crlf)
 106                return 0;
 107
 108        /*
 109         * And add some heuristics for binary vs text, of course...
 110         */
 111        if (is_binary(size, &stats))
 112                return 0;
 113
 114        /*
 115         * Ok, allocate a new buffer, fill it in, and return true
 116         * to let the caller know that we switched buffers on it.
 117         */
 118        nsize = size - stats.crlf;
 119        nbuf = xmalloc(nsize);
 120        *bufp = nbuf;
 121        *sizep = nsize;
 122        do {
 123                unsigned char c = *buffer++;
 124                if (c != '\r')
 125                        *nbuf++ = c;
 126        } while (--size);
 127
 128        return 1;
 129}
 130
 131int convert_to_working_tree(const char *path, char **bufp, unsigned long *sizep)
 132{
 133        char *buffer, *nbuf;
 134        unsigned long size, nsize;
 135        struct text_stat stats;
 136        unsigned char last;
 137
 138        /*
 139         * FIXME! Other pluggable conversions should go here,
 140         * based on filename patterns. Right now we just do the
 141         * stupid auto-CRLF one.
 142         */
 143        if (auto_crlf <= 0)
 144                return 0;
 145
 146        size = *sizep;
 147        if (!size)
 148                return 0;
 149        buffer = *bufp;
 150
 151        gather_stats(buffer, size, &stats);
 152
 153        /* No LF? Nothing to convert, regardless. */
 154        if (!stats.lf)
 155                return 0;
 156
 157        /* Was it already in CRLF format? */
 158        if (stats.lf == stats.crlf)
 159                return 0;
 160
 161        /* If we have any bare CR characters, we're not going to touch it */
 162        if (stats.cr != stats.crlf)
 163                return 0;
 164
 165        if (is_binary(size, &stats))
 166                return 0;
 167
 168        /*
 169         * Ok, allocate a new buffer, fill it in, and return true
 170         * to let the caller know that we switched buffers on it.
 171         */
 172        nsize = size + stats.lf - stats.crlf;
 173        nbuf = xmalloc(nsize);
 174        *bufp = nbuf;
 175        *sizep = nsize;
 176        last = 0;
 177        do {
 178                unsigned char c = *buffer++;
 179                if (c == '\n' && last != '\r')
 180                        *nbuf++ = '\r';
 181                *nbuf++ = c;
 182                last = c;
 183        } while (--size);
 184
 185        return 1;
 186}