1#include "cache.h"
2#include "attr.h"
3
/*
 * convert.c - convert a file when checking it out and checking it in.
 *
 * This should use the pathname to decide on whether it wants to do some
 * more interesting conversions (automatic gzip/unzip, general format
 * conversions etc etc).  For now the pathname is only used to look up
 * the "crlf" attribute, which selects between no conversion, forced
 * CRLF<->LF translation, and the default: automatic CRLF<->LF
 * translation when the "auto_crlf" option is set.
 */
12
/*
 * Byte-class tallies collected by a single pass over a buffer.
 *
 * The counters are unsigned long, the same type this file uses for
 * buffer sizes, so a counter cannot wrap on a buffer whose size fits
 * in that type.  That matters because the converters compute the size
 * of their output buffer from lf and crlf.
 */
struct text_stat {
	/* CR, LF and CRLF counts */
	unsigned long cr;
	unsigned long lf;
	unsigned long crlf;

	/* These are just approximations! */
	unsigned long printable;
	unsigned long nonprintable;
};
20
21static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats)
22{
23 unsigned long i;
24
25 memset(stats, 0, sizeof(*stats));
26
27 for (i = 0; i < size; i++) {
28 unsigned char c = buf[i];
29 if (c == '\r') {
30 stats->cr++;
31 if (i+1 < size && buf[i+1] == '\n')
32 stats->crlf++;
33 continue;
34 }
35 if (c == '\n') {
36 stats->lf++;
37 continue;
38 }
39 if (c == 127)
40 /* DEL */
41 stats->nonprintable++;
42 else if (c < 32) {
43 switch (c) {
44 /* BS, HT, ESC and FF */
45 case '\b': case '\t': case '\033': case '\014':
46 stats->printable++;
47 break;
48 default:
49 stats->nonprintable++;
50 }
51 }
52 else
53 stats->printable++;
54 }
55}
56
57/*
58 * The same heuristics as diff.c::mmfile_is_binary()
59 */
60static int is_binary(unsigned long size, struct text_stat *stats)
61{
62
63 if ((stats->printable >> 7) < stats->nonprintable)
64 return 1;
65 /*
66 * Other heuristics? Average line length might be relevant,
67 * as might LF vs CR vs CRLF counts..
68 *
69 * NOTE! It might be normal to have a low ratio of CRLF to LF
70 * (somebody starts with a LF-only file and edits it with an editor
71 * that adds CRLF only to lines that are added..). But do we
72 * want to support CR-only? Probably not.
73 */
74 return 0;
75}
76
77static int crlf_to_git(const char *path, char **bufp, unsigned long *sizep, int guess)
78{
79 char *buffer, *nbuf;
80 unsigned long size, nsize;
81 struct text_stat stats;
82
83 if (guess && !auto_crlf)
84 return 0;
85
86 size = *sizep;
87 if (!size)
88 return 0;
89 buffer = *bufp;
90
91 gather_stats(buffer, size, &stats);
92
93 /* No CR? Nothing to convert, regardless. */
94 if (!stats.cr)
95 return 0;
96
97 if (guess) {
98 /*
99 * We're currently not going to even try to convert stuff
100 * that has bare CR characters. Does anybody do that crazy
101 * stuff?
102 */
103 if (stats.cr != stats.crlf)
104 return 0;
105
106 /*
107 * And add some heuristics for binary vs text, of course...
108 */
109 if (is_binary(size, &stats))
110 return 0;
111 }
112
113 /*
114 * Ok, allocate a new buffer, fill it in, and return true
115 * to let the caller know that we switched buffers on it.
116 */
117 nsize = size - stats.crlf;
118 nbuf = xmalloc(nsize);
119 *bufp = nbuf;
120 *sizep = nsize;
121
122 if (guess) {
123 do {
124 unsigned char c = *buffer++;
125 if (c != '\r')
126 *nbuf++ = c;
127 } while (--size);
128 } else {
129 do {
130 unsigned char c = *buffer++;
131 if (! (c == '\r' && (1 < size && *buffer == '\n')))
132 *nbuf++ = c;
133 } while (--size);
134 }
135
136 return 1;
137}
138
/*
 * CRLF -> LF conversion in guess mode: crlf_to_git() is called with
 * guess set to 1.  Returns whatever crlf_to_git() returns.
 */
static int autocrlf_to_git(const char *path, char **bufp, unsigned long *sizep)
{
	const int guess = 1;

	return crlf_to_git(path, bufp, sizep, guess);
}
143
/*
 * CRLF -> LF conversion with guessing turned off: crlf_to_git() is
 * called with guess set to 0.  Returns whatever crlf_to_git() returns.
 */
static int forcecrlf_to_git(const char *path, char **bufp, unsigned long *sizep)
{
	const int guess = 0;

	return crlf_to_git(path, bufp, sizep, guess);
}
148
149static int crlf_to_working_tree(const char *path, char **bufp, unsigned long *sizep, int guess)
150{
151 char *buffer, *nbuf;
152 unsigned long size, nsize;
153 struct text_stat stats;
154 unsigned char last;
155
156 if (guess && auto_crlf <= 0)
157 return 0;
158
159 size = *sizep;
160 if (!size)
161 return 0;
162 buffer = *bufp;
163
164 gather_stats(buffer, size, &stats);
165
166 /* No LF? Nothing to convert, regardless. */
167 if (!stats.lf)
168 return 0;
169
170 /* Was it already in CRLF format? */
171 if (stats.lf == stats.crlf)
172 return 0;
173
174 if (guess) {
175 /* If we have any bare CR characters, we're not going to touch it */
176 if (stats.cr != stats.crlf)
177 return 0;
178
179 if (is_binary(size, &stats))
180 return 0;
181 }
182
183 /*
184 * Ok, allocate a new buffer, fill it in, and return true
185 * to let the caller know that we switched buffers on it.
186 */
187 nsize = size + stats.lf - stats.crlf;
188 nbuf = xmalloc(nsize);
189 *bufp = nbuf;
190 *sizep = nsize;
191 last = 0;
192 do {
193 unsigned char c = *buffer++;
194 if (c == '\n' && last != '\r')
195 *nbuf++ = '\r';
196 *nbuf++ = c;
197 last = c;
198 } while (--size);
199
200 return 1;
201}
202
/*
 * LF -> CRLF conversion in guess mode: crlf_to_working_tree() is called
 * with guess set to 1.  Returns whatever crlf_to_working_tree() returns.
 */
static int autocrlf_to_working_tree(const char *path, char **bufp, unsigned long *sizep)
{
	const int guess = 1;

	return crlf_to_working_tree(path, bufp, sizep, guess);
}
207
/*
 * LF -> CRLF conversion with guessing turned off: crlf_to_working_tree()
 * is called with guess set to 0.  Returns whatever
 * crlf_to_working_tree() returns.
 */
static int forcecrlf_to_working_tree(const char *path, char **bufp, unsigned long *sizep)
{
	const int guess = 0;

	return crlf_to_working_tree(path, bufp, sizep, guess);
}
212
213static void setup_crlf_check(struct git_attr_check *check)
214{
215 static struct git_attr *attr_crlf;
216
217 if (!attr_crlf)
218 attr_crlf = git_attr("crlf", 4);
219 check->attr = attr_crlf;
220}
221
222static int git_path_check_crlf(const char *path)
223{
224 struct git_attr_check attr_crlf_check;
225
226 setup_crlf_check(&attr_crlf_check);
227
228 if (git_checkattr(path, 1, &attr_crlf_check))
229 return -1;
230 return attr_crlf_check.isset;
231}
232
/*
 * Convert content on its way into git, as selected by the "crlf"
 * attribute of path:
 *   0         - no conversion, returns 0;
 *   1         - forced CRLF -> LF conversion;
 *   otherwise - CRLF -> LF conversion in guess mode.
 *
 * Returns the converter's result: 1 when *bufp and *sizep were
 * replaced with a new buffer and length, 0 when left untouched.
 */
int convert_to_git(const char *path, char **bufp, unsigned long *sizep)
{
	int crlf = git_path_check_crlf(path);

	if (crlf == 0)
		return 0;
	if (crlf == 1)
		return forcecrlf_to_git(path, bufp, sizep);
	return autocrlf_to_git(path, bufp, sizep);
}
244
/*
 * Convert content on its way to the working tree, as selected by the
 * "crlf" attribute of path:
 *   0         - no conversion, returns 0;
 *   1         - forced LF -> CRLF conversion;
 *   otherwise - LF -> CRLF conversion in guess mode.
 *
 * Returns the converter's result: 1 when *bufp and *sizep were
 * replaced with a new buffer and length, 0 when left untouched.
 */
int convert_to_working_tree(const char *path, char **bufp, unsigned long *sizep)
{
	int crlf = git_path_check_crlf(path);

	if (crlf == 0)
		return 0;
	if (crlf == 1)
		return forcecrlf_to_working_tree(path, bufp, sizep);
	return autocrlf_to_working_tree(path, bufp, sizep);
}