da64253a16e51881decd3e31d5763892f4847e5f
   1#include "cache.h"
   2#include "attr.h"
   3
   4/*
   5 * convert.c - convert a file when checking it out and checking it in.
   6 *
   7 * This should use the pathname to decide on whether it wants to do some
   8 * more interesting conversions (automatic gzip/unzip, general format
   9 * conversions etc etc), but by default it just does automatic CRLF<->LF
  10 * translation when the "auto_crlf" option is set.
  11 */
  12
  13#define CRLF_GUESS      (-1)
  14#define CRLF_BINARY     0
  15#define CRLF_TEXT       1
  16#define CRLF_INPUT      2
  17
  18struct text_stat {
  19        /* CR, LF and CRLF counts */
  20        unsigned cr, lf, crlf;
  21
  22        /* These are just approximations! */
  23        unsigned printable, nonprintable;
  24};
  25
  26static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats)
  27{
  28        unsigned long i;
  29
  30        memset(stats, 0, sizeof(*stats));
  31
  32        for (i = 0; i < size; i++) {
  33                unsigned char c = buf[i];
  34                if (c == '\r') {
  35                        stats->cr++;
  36                        if (i+1 < size && buf[i+1] == '\n')
  37                                stats->crlf++;
  38                        continue;
  39                }
  40                if (c == '\n') {
  41                        stats->lf++;
  42                        continue;
  43                }
  44                if (c == 127)
  45                        /* DEL */
  46                        stats->nonprintable++;
  47                else if (c < 32) {
  48                        switch (c) {
  49                                /* BS, HT, ESC and FF */
  50                        case '\b': case '\t': case '\033': case '\014':
  51                                stats->printable++;
  52                                break;
  53                        default:
  54                                stats->nonprintable++;
  55                        }
  56                }
  57                else
  58                        stats->printable++;
  59        }
  60}
  61
  62/*
  63 * The same heuristics as diff.c::mmfile_is_binary()
  64 */
  65static int is_binary(unsigned long size, struct text_stat *stats)
  66{
  67
  68        if ((stats->printable >> 7) < stats->nonprintable)
  69                return 1;
  70        /*
  71         * Other heuristics? Average line length might be relevant,
  72         * as might LF vs CR vs CRLF counts..
  73         *
  74         * NOTE! It might be normal to have a low ratio of CRLF to LF
  75         * (somebody starts with a LF-only file and edits it with an editor
  76         * that adds CRLF only to lines that are added..). But do  we
  77         * want to support CR-only? Probably not.
  78         */
  79        return 0;
  80}
  81
  82static int crlf_to_git(const char *path, char **bufp, unsigned long *sizep, int action)
  83{
  84        char *buffer, *nbuf;
  85        unsigned long size, nsize;
  86        struct text_stat stats;
  87
  88        if ((action == CRLF_BINARY) || (action == CRLF_GUESS && !auto_crlf))
  89                return 0;
  90
  91        size = *sizep;
  92        if (!size)
  93                return 0;
  94        buffer = *bufp;
  95
  96        gather_stats(buffer, size, &stats);
  97
  98        /* No CR? Nothing to convert, regardless. */
  99        if (!stats.cr)
 100                return 0;
 101
 102        if (action == CRLF_GUESS) {
 103                /*
 104                 * We're currently not going to even try to convert stuff
 105                 * that has bare CR characters. Does anybody do that crazy
 106                 * stuff?
 107                 */
 108                if (stats.cr != stats.crlf)
 109                        return 0;
 110
 111                /*
 112                 * And add some heuristics for binary vs text, of course...
 113                 */
 114                if (is_binary(size, &stats))
 115                        return 0;
 116        }
 117
 118        /*
 119         * Ok, allocate a new buffer, fill it in, and return true
 120         * to let the caller know that we switched buffers on it.
 121         */
 122        nsize = size - stats.crlf;
 123        nbuf = xmalloc(nsize);
 124        *bufp = nbuf;
 125        *sizep = nsize;
 126
 127        if (action == CRLF_GUESS) {
 128                /*
 129                 * If we guessed, we already know we rejected a file with
 130                 * lone CR, and we can strip a CR without looking at what
 131                 * follow it.
 132                 */
 133                do {
 134                        unsigned char c = *buffer++;
 135                        if (c != '\r')
 136                                *nbuf++ = c;
 137                } while (--size);
 138        } else {
 139                do {
 140                        unsigned char c = *buffer++;
 141                        if (! (c == '\r' && (1 < size && *buffer == '\n')))
 142                                *nbuf++ = c;
 143                } while (--size);
 144        }
 145
 146        return 1;
 147}
 148
 149static int crlf_to_worktree(const char *path, char **bufp, unsigned long *sizep, int action)
 150{
 151        char *buffer, *nbuf;
 152        unsigned long size, nsize;
 153        struct text_stat stats;
 154        unsigned char last;
 155
 156        if ((action == CRLF_BINARY) || (action == CRLF_INPUT) ||
 157            (action == CRLF_GUESS && auto_crlf <= 0))
 158                return 0;
 159
 160        size = *sizep;
 161        if (!size)
 162                return 0;
 163        buffer = *bufp;
 164
 165        gather_stats(buffer, size, &stats);
 166
 167        /* No LF? Nothing to convert, regardless. */
 168        if (!stats.lf)
 169                return 0;
 170
 171        /* Was it already in CRLF format? */
 172        if (stats.lf == stats.crlf)
 173                return 0;
 174
 175        if (action == CRLF_GUESS) {
 176                /* If we have any bare CR characters, we're not going to touch it */
 177                if (stats.cr != stats.crlf)
 178                        return 0;
 179
 180                if (is_binary(size, &stats))
 181                        return 0;
 182        }
 183
 184        /*
 185         * Ok, allocate a new buffer, fill it in, and return true
 186         * to let the caller know that we switched buffers on it.
 187         */
 188        nsize = size + stats.lf - stats.crlf;
 189        nbuf = xmalloc(nsize);
 190        *bufp = nbuf;
 191        *sizep = nsize;
 192        last = 0;
 193        do {
 194                unsigned char c = *buffer++;
 195                if (c == '\n' && last != '\r')
 196                        *nbuf++ = '\r';
 197                *nbuf++ = c;
 198                last = c;
 199        } while (--size);
 200
 201        return 1;
 202}
 203
 204static void setup_crlf_check(struct git_attr_check *check)
 205{
 206        static struct git_attr *attr_crlf;
 207
 208        if (!attr_crlf)
 209                attr_crlf = git_attr("crlf", 4);
 210        check->attr = attr_crlf;
 211}
 212
 213static int git_path_check_crlf(const char *path)
 214{
 215        struct git_attr_check attr_crlf_check;
 216
 217        setup_crlf_check(&attr_crlf_check);
 218
 219        if (!git_checkattr(path, 1, &attr_crlf_check)) {
 220                const char *value = attr_crlf_check.value;
 221                if (ATTR_TRUE(value))
 222                        return CRLF_TEXT;
 223                else if (ATTR_FALSE(value))
 224                        return CRLF_BINARY;
 225                else if (ATTR_UNSET(value))
 226                        ;
 227                else if (!strcmp(value, "input"))
 228                        return CRLF_INPUT;
 229                /* fallthru */
 230        }
 231        return CRLF_GUESS;
 232}
 233
 234int convert_to_git(const char *path, char **bufp, unsigned long *sizep)
 235{
 236        return crlf_to_git(path, bufp, sizep, git_path_check_crlf(path));
 237}
 238
 239int convert_to_working_tree(const char *path, char **bufp, unsigned long *sizep)
 240{
 241        return crlf_to_worktree(path, bufp, sizep, git_path_check_crlf(path));
 242}