742b895cfa110e112673bf6aefd822d6adf66382
   1#include "cache.h"
   2#include "attr.h"
   3
   4/*
   5 * convert.c - convert a file when checking it out and checking it in.
   6 *
   7 * This should use the pathname to decide on whether it wants to do some
   8 * more interesting conversions (automatic gzip/unzip, general format
   9 * conversions etc etc), but by default it just does automatic CRLF<->LF
  10 * translation when the "auto_crlf" option is set.
  11 */
  12
  13#define CRLF_GUESS      (-1)
  14#define CRLF_BINARY     0
  15#define CRLF_TEXT       1
  16#define CRLF_INPUT      2
  17
  18struct text_stat {
  19        /* CR, LF and CRLF counts */
  20        unsigned cr, lf, crlf;
  21
  22        /* These are just approximations! */
  23        unsigned printable, nonprintable;
  24};
  25
  26static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats)
  27{
  28        unsigned long i;
  29
  30        memset(stats, 0, sizeof(*stats));
  31
  32        for (i = 0; i < size; i++) {
  33                unsigned char c = buf[i];
  34                if (c == '\r') {
  35                        stats->cr++;
  36                        if (i+1 < size && buf[i+1] == '\n')
  37                                stats->crlf++;
  38                        continue;
  39                }
  40                if (c == '\n') {
  41                        stats->lf++;
  42                        continue;
  43                }
  44                if (c == 127)
  45                        /* DEL */
  46                        stats->nonprintable++;
  47                else if (c < 32) {
  48                        switch (c) {
  49                                /* BS, HT, ESC and FF */
  50                        case '\b': case '\t': case '\033': case '\014':
  51                                stats->printable++;
  52                                break;
  53                        default:
  54                                stats->nonprintable++;
  55                        }
  56                }
  57                else
  58                        stats->printable++;
  59        }
  60}
  61
  62/*
  63 * The same heuristics as diff.c::mmfile_is_binary()
  64 */
  65static int is_binary(unsigned long size, struct text_stat *stats)
  66{
  67
  68        if ((stats->printable >> 7) < stats->nonprintable)
  69                return 1;
  70        /*
  71         * Other heuristics? Average line length might be relevant,
  72         * as might LF vs CR vs CRLF counts..
  73         *
  74         * NOTE! It might be normal to have a low ratio of CRLF to LF
  75         * (somebody starts with a LF-only file and edits it with an editor
  76         * that adds CRLF only to lines that are added..). But do  we
  77         * want to support CR-only? Probably not.
  78         */
  79        return 0;
  80}
  81
  82static char *crlf_to_git(const char *path, const char *src, unsigned long *sizep, int action)
  83{
  84        char *buffer, *dst;
  85        unsigned long size, nsize;
  86        struct text_stat stats;
  87
  88        if ((action == CRLF_BINARY) || (action == CRLF_GUESS && !auto_crlf))
  89                return NULL;
  90
  91        size = *sizep;
  92        if (!size)
  93                return NULL;
  94
  95        gather_stats(src, size, &stats);
  96
  97        /* No CR? Nothing to convert, regardless. */
  98        if (!stats.cr)
  99                return NULL;
 100
 101        if (action == CRLF_GUESS) {
 102                /*
 103                 * We're currently not going to even try to convert stuff
 104                 * that has bare CR characters. Does anybody do that crazy
 105                 * stuff?
 106                 */
 107                if (stats.cr != stats.crlf)
 108                        return NULL;
 109
 110                /*
 111                 * And add some heuristics for binary vs text, of course...
 112                 */
 113                if (is_binary(size, &stats))
 114                        return NULL;
 115        }
 116
 117        /*
 118         * Ok, allocate a new buffer, fill it in, and return true
 119         * to let the caller know that we switched buffers on it.
 120         */
 121        nsize = size - stats.crlf;
 122        buffer = xmalloc(nsize);
 123        *sizep = nsize;
 124
 125        dst = buffer;
 126        if (action == CRLF_GUESS) {
 127                /*
 128                 * If we guessed, we already know we rejected a file with
 129                 * lone CR, and we can strip a CR without looking at what
 130                 * follow it.
 131                 */
 132                do {
 133                        unsigned char c = *src++;
 134                        if (c != '\r')
 135                                *dst++ = c;
 136                } while (--size);
 137        } else {
 138                do {
 139                        unsigned char c = *src++;
 140                        if (! (c == '\r' && (1 < size && *buffer == '\n')))
 141                                *dst++ = c;
 142                } while (--size);
 143        }
 144
 145        return buffer;
 146}
 147
 148static char *crlf_to_worktree(const char *path, const char *src, unsigned long *sizep, int action)
 149{
 150        char *buffer, *dst;
 151        unsigned long size, nsize;
 152        struct text_stat stats;
 153        unsigned char last;
 154
 155        if ((action == CRLF_BINARY) || (action == CRLF_INPUT) ||
 156            (action == CRLF_GUESS && auto_crlf <= 0))
 157                return NULL;
 158
 159        size = *sizep;
 160        if (!size)
 161                return NULL;
 162
 163        gather_stats(src, size, &stats);
 164
 165        /* No LF? Nothing to convert, regardless. */
 166        if (!stats.lf)
 167                return NULL;
 168
 169        /* Was it already in CRLF format? */
 170        if (stats.lf == stats.crlf)
 171                return NULL;
 172
 173        if (action == CRLF_GUESS) {
 174                /* If we have any bare CR characters, we're not going to touch it */
 175                if (stats.cr != stats.crlf)
 176                        return NULL;
 177
 178                if (is_binary(size, &stats))
 179                        return NULL;
 180        }
 181
 182        /*
 183         * Ok, allocate a new buffer, fill it in, and return true
 184         * to let the caller know that we switched buffers on it.
 185         */
 186        nsize = size + stats.lf - stats.crlf;
 187        buffer = xmalloc(nsize);
 188        *sizep = nsize;
 189        last = 0;
 190
 191        dst = buffer;
 192        do {
 193                unsigned char c = *src++;
 194                if (c == '\n' && last != '\r')
 195                        *dst++ = '\r';
 196                *dst++ = c;
 197                last = c;
 198        } while (--size);
 199
 200        return buffer;
 201}
 202
 203static void setup_crlf_check(struct git_attr_check *check)
 204{
 205        static struct git_attr *attr_crlf;
 206
 207        if (!attr_crlf)
 208                attr_crlf = git_attr("crlf", 4);
 209        check->attr = attr_crlf;
 210}
 211
 212static int git_path_check_crlf(const char *path)
 213{
 214        struct git_attr_check attr_crlf_check;
 215
 216        setup_crlf_check(&attr_crlf_check);
 217
 218        if (!git_checkattr(path, 1, &attr_crlf_check)) {
 219                const char *value = attr_crlf_check.value;
 220                if (ATTR_TRUE(value))
 221                        return CRLF_TEXT;
 222                else if (ATTR_FALSE(value))
 223                        return CRLF_BINARY;
 224                else if (ATTR_UNSET(value))
 225                        ;
 226                else if (!strcmp(value, "input"))
 227                        return CRLF_INPUT;
 228                /* fallthru */
 229        }
 230        return CRLF_GUESS;
 231}
 232
 233char *convert_to_git(const char *path, const char *src, unsigned long *sizep)
 234{
 235        return crlf_to_git(path, src, sizep, git_path_check_crlf(path));
 236}
 237
 238char *convert_to_working_tree(const char *path, const char *src, unsigned long *sizep)
 239{
 240        return crlf_to_worktree(path, src, sizep, git_path_check_crlf(path));
 241}