92fcc677e454eb0156f2752c723e6e208a6e8be4
1/*
2 * Copyright (C) 2005 Junio C Hamano
3 */
4#include <sys/types.h>
5#include <sys/wait.h>
6#include <signal.h>
7#include <limits.h>
8#include "cache.h"
9#include "diff.h"
10#include "delta.h"
11
12static const char *diff_opts = "-pu";
13static unsigned char null_sha1[20] = { 0, };
14
15static const char *external_diff(void)
16{
17 static const char *external_diff_cmd = NULL;
18 static int done_preparing = 0;
19
20 if (done_preparing)
21 return external_diff_cmd;
22
23 /*
24 * Default values above are meant to match the
25 * Linux kernel development style. Examples of
26 * alternative styles you can specify via environment
27 * variables are:
28 *
29 * GIT_DIFF_OPTS="-c";
30 */
31 if (gitenv("GIT_EXTERNAL_DIFF"))
32 external_diff_cmd = gitenv("GIT_EXTERNAL_DIFF");
33
34 /* In case external diff fails... */
35 diff_opts = gitenv("GIT_DIFF_OPTS") ? : diff_opts;
36
37 done_preparing = 1;
38 return external_diff_cmd;
39}
40
41/* Help to copy the thing properly quoted for the shell safety.
42 * any single quote is replaced with '\'', and the caller is
43 * expected to enclose the result within a single quote pair.
44 *
45 * E.g.
46 * original sq_expand result
47 * name ==> name ==> 'name'
48 * a b ==> a b ==> 'a b'
49 * a'b ==> a'\''b ==> 'a'\''b'
50 */
51static char *sq_expand(const char *src)
52{
53 static char *buf = NULL;
54 int cnt, c;
55 const char *cp;
56 char *bp;
57
58 /* count bytes needed to store the quoted string. */
59 for (cnt = 1, cp = src; *cp; cnt++, cp++)
60 if (*cp == '\'')
61 cnt += 3;
62
63 buf = xmalloc(cnt);
64 bp = buf;
65 while ((c = *src++)) {
66 if (c != '\'')
67 *bp++ = c;
68 else {
69 bp = strcpy(bp, "'\\''");
70 bp += 4;
71 }
72 }
73 *bp = 0;
74 return buf;
75}
76
77static struct diff_tempfile {
78 const char *name;
79 char hex[41];
80 char mode[10];
81 char tmp_path[50];
82} diff_temp[2];
83
84struct diff_spec {
85 unsigned char blob_sha1[20];
86 unsigned short mode; /* file mode */
87 unsigned sha1_valid : 1; /* if true, use blob_sha1 and trust mode;
88 * if false, use the name and read from
89 * the filesystem.
90 */
91 unsigned file_valid : 1; /* if false the file does not exist */
92};
93
94static void builtin_diff(const char *name_a,
95 const char *name_b,
96 struct diff_tempfile *temp)
97{
98 int i, next_at, cmd_size;
99 const char *diff_cmd = "diff -L'%s%s' -L'%s%s'";
100 const char *diff_arg = "'%s' '%s'||:"; /* "||:" is to return 0 */
101 const char *input_name_sq[2];
102 const char *path0[2];
103 const char *path1[2];
104 const char *name_sq[2];
105 char *cmd;
106
107 name_sq[0] = sq_expand(name_a);
108 name_sq[1] = sq_expand(name_b);
109
110 /* diff_cmd and diff_arg have 6 %s in total which makes
111 * the sum of these strings 12 bytes larger than required.
112 * we use 2 spaces around diff-opts, and we need to count
113 * terminating NUL, so we subtract 9 here.
114 */
115 cmd_size = (strlen(diff_cmd) + strlen(diff_opts) +
116 strlen(diff_arg) - 9);
117 for (i = 0; i < 2; i++) {
118 input_name_sq[i] = sq_expand(temp[i].name);
119 if (!strcmp(temp[i].name, "/dev/null")) {
120 path0[i] = "/dev/null";
121 path1[i] = "";
122 } else {
123 path0[i] = i ? "b/" : "a/";
124 path1[i] = name_sq[i];
125 }
126 cmd_size += (strlen(path0[i]) + strlen(path1[i]) +
127 strlen(input_name_sq[i]));
128 }
129
130 cmd = xmalloc(cmd_size);
131
132 next_at = 0;
133 next_at += snprintf(cmd+next_at, cmd_size-next_at,
134 diff_cmd,
135 path0[0], path1[0], path0[1], path1[1]);
136 next_at += snprintf(cmd+next_at, cmd_size-next_at,
137 " %s ", diff_opts);
138 next_at += snprintf(cmd+next_at, cmd_size-next_at,
139 diff_arg, input_name_sq[0], input_name_sq[1]);
140
141 printf("diff --git a/%s b/%s\n", name_a, name_b);
142 if (!path1[0][0])
143 printf("new file mode %s\n", temp[1].mode);
144 else if (!path1[1][0])
145 printf("deleted file mode %s\n", temp[0].mode);
146 else {
147 if (strcmp(temp[0].mode, temp[1].mode)) {
148 printf("old mode %s\n", temp[0].mode);
149 printf("new mode %s\n", temp[1].mode);
150 }
151 if (strcmp(name_a, name_b)) {
152 printf("rename old %s\n", name_a);
153 printf("rename new %s\n", name_b);
154 }
155 if (strncmp(temp[0].mode, temp[1].mode, 3))
156 /* we do not run diff between different kind
157 * of objects.
158 */
159 exit(0);
160 }
161 fflush(NULL);
162 execlp("/bin/sh","sh", "-c", cmd, NULL);
163}
164
165/*
166 * Given a name and sha1 pair, if the dircache tells us the file in
167 * the work tree has that object contents, return true, so that
168 * prepare_temp_file() does not have to inflate and extract.
169 */
170static int work_tree_matches(const char *name, const unsigned char *sha1)
171{
172 struct cache_entry *ce;
173 struct stat st;
174 int pos, len;
175
176 /* We do not read the cache ourselves here, because the
177 * benchmark with my previous version that always reads cache
178 * shows that it makes things worse for diff-tree comparing
179 * two linux-2.6 kernel trees in an already checked out work
180 * tree. This is because most diff-tree comparisons deal with
181 * only a small number of files, while reading the cache is
182 * expensive for a large project, and its cost outweighs the
183 * savings we get by not inflating the object to a temporary
184 * file. Practically, this code only helps when we are used
185 * by diff-cache --cached, which does read the cache before
186 * calling us.
187 */
188 if (!active_cache)
189 return 0;
190
191 len = strlen(name);
192 pos = cache_name_pos(name, len);
193 if (pos < 0)
194 return 0;
195 ce = active_cache[pos];
196 if ((lstat(name, &st) < 0) ||
197 !S_ISREG(st.st_mode) ||
198 ce_match_stat(ce, &st) ||
199 memcmp(sha1, ce->sha1, 20))
200 return 0;
201 return 1;
202}
203
204static void prep_temp_blob(struct diff_tempfile *temp,
205 void *blob,
206 unsigned long size,
207 unsigned char *sha1,
208 int mode)
209{
210 int fd;
211
212 strcpy(temp->tmp_path, ".diff_XXXXXX");
213 fd = mkstemp(temp->tmp_path);
214 if (fd < 0)
215 die("unable to create temp-file");
216 if (write(fd, blob, size) != size)
217 die("unable to write temp-file");
218 close(fd);
219 temp->name = temp->tmp_path;
220 strcpy(temp->hex, sha1_to_hex(sha1));
221 temp->hex[40] = 0;
222 sprintf(temp->mode, "%06o", mode);
223}
224
225static void prepare_temp_file(const char *name,
226 struct diff_tempfile *temp,
227 struct diff_spec *one)
228{
229 if (!one->file_valid) {
230 not_a_valid_file:
231 /* A '-' entry produces this for file-2, and
232 * a '+' entry produces this for file-1.
233 */
234 temp->name = "/dev/null";
235 strcpy(temp->hex, ".");
236 strcpy(temp->mode, ".");
237 return;
238 }
239
240 if (!one->sha1_valid ||
241 work_tree_matches(name, one->blob_sha1)) {
242 struct stat st;
243 temp->name = name;
244 if (lstat(temp->name, &st) < 0) {
245 if (errno == ENOENT)
246 goto not_a_valid_file;
247 die("stat(%s): %s", temp->name, strerror(errno));
248 }
249 if (S_ISLNK(st.st_mode)) {
250 int ret;
251 char *buf, buf_[1024];
252 buf = ((sizeof(buf_) < st.st_size) ?
253 xmalloc(st.st_size) : buf_);
254 ret = readlink(name, buf, st.st_size);
255 if (ret < 0)
256 die("readlink(%s)", name);
257 prep_temp_blob(temp, buf, st.st_size,
258 (one->sha1_valid ?
259 one->blob_sha1 : null_sha1),
260 (one->sha1_valid ?
261 one->mode : S_IFLNK));
262 }
263 else {
264 if (!one->sha1_valid)
265 strcpy(temp->hex, sha1_to_hex(null_sha1));
266 else
267 strcpy(temp->hex, sha1_to_hex(one->blob_sha1));
268 sprintf(temp->mode, "%06o",
269 S_IFREG |ce_permissions(st.st_mode));
270 }
271 return;
272 }
273 else {
274 void *blob;
275 char type[20];
276 unsigned long size;
277
278 blob = read_sha1_file(one->blob_sha1, type, &size);
279 if (!blob || strcmp(type, "blob"))
280 die("unable to read blob object for %s (%s)",
281 name, sha1_to_hex(one->blob_sha1));
282 prep_temp_blob(temp, blob, size, one->blob_sha1, one->mode);
283 free(blob);
284 }
285}
286
287static void remove_tempfile(void)
288{
289 int i;
290
291 for (i = 0; i < 2; i++)
292 if (diff_temp[i].name == diff_temp[i].tmp_path) {
293 unlink(diff_temp[i].name);
294 diff_temp[i].name = NULL;
295 }
296}
297
298static void remove_tempfile_on_signal(int signo)
299{
300 remove_tempfile();
301}
302
303static int detect_rename;
304static int reverse_diff;
305static const char **pathspec;
306static int speccnt;
307static int diff_rename_minimum_score;
308
309static int matches_pathspec(const char *name)
310{
311 int i;
312 int namelen;
313
314 if (speccnt == 0)
315 return 1;
316
317 namelen = strlen(name);
318 for (i = 0; i < speccnt; i++) {
319 int speclen = strlen(pathspec[i]);
320 if (! strncmp(pathspec[i], name, speclen) &&
321 speclen <= namelen &&
322 (name[speclen] == 0 || name[speclen] == '/'))
323 return 1;
324 }
325 return 0;
326}
327
328/* An external diff command takes:
329 *
330 * diff-cmd name infile1 infile1-sha1 infile1-mode \
331 * infile2 infile2-sha1 infile2-mode [ rename-to ]
332 *
333 */
334static void run_external_diff(const char *name,
335 const char *other,
336 struct diff_spec *one,
337 struct diff_spec *two)
338{
339 struct diff_tempfile *temp = diff_temp;
340 pid_t pid;
341 int status;
342 static int atexit_asked = 0;
343
344 if (reverse_diff) {
345 struct diff_spec *tmp_spec;
346 tmp_spec = one; one = two; two = tmp_spec;
347 if (other) {
348 const char *tmp;
349 tmp = name; name = other; other = tmp;
350 }
351 }
352
353 if (!matches_pathspec(name) && (!other || !matches_pathspec(other)))
354 return;
355
356 if (one && two) {
357 prepare_temp_file(name, &temp[0], one);
358 prepare_temp_file(other ? : name, &temp[1], two);
359 if (! atexit_asked &&
360 (temp[0].name == temp[0].tmp_path ||
361 temp[1].name == temp[1].tmp_path)) {
362 atexit_asked = 1;
363 atexit(remove_tempfile);
364 }
365 signal(SIGINT, remove_tempfile_on_signal);
366 }
367
368 fflush(NULL);
369 pid = fork();
370 if (pid < 0)
371 die("unable to fork");
372 if (!pid) {
373 const char *pgm = external_diff();
374 if (pgm) {
375 if (one && two) {
376 const char *exec_arg[9];
377 const char **arg = &exec_arg[0];
378 *arg++ = pgm;
379 *arg++ = name;
380 *arg++ = temp[0].name;
381 *arg++ = temp[0].hex;
382 *arg++ = temp[0].mode;
383 *arg++ = temp[1].name;
384 *arg++ = temp[1].hex;
385 *arg++ = temp[1].mode;
386 if (other)
387 *arg++ = other;
388 *arg = 0;
389 execvp(pgm, (char *const*) exec_arg);
390 }
391 else
392 execlp(pgm, pgm, name, NULL);
393 }
394 /*
395 * otherwise we use the built-in one.
396 */
397 if (one && two)
398 builtin_diff(name, other ? : name, temp);
399 else
400 printf("* Unmerged path %s\n", name);
401 exit(0);
402 }
403 if (waitpid(pid, &status, 0) < 0 ||
404 !WIFEXITED(status) || WEXITSTATUS(status)) {
405 /* Earlier we did not check the exit status because
406 * diff exits non-zero if files are different, and
407 * we are not interested in knowing that. It was a
408 * mistake which made it harder to quit a diff-*
409 * session that uses the git-apply-patch-script as
410 * the GIT_EXTERNAL_DIFF. A custom GIT_EXTERNAL_DIFF
411 * should also exit non-zero only when it wants to
412 * abort the entire diff-* session.
413 */
414 remove_tempfile();
415 fprintf(stderr, "external diff died, stopping at %s.\n", name);
416 exit(1);
417 }
418 remove_tempfile();
419}
420
421/*
422 * We do not detect circular renames. Just hold created and deleted
423 * entries and later attempt to match them up. If they do not match,
424 * then spit them out as deletes or creates as original.
425 */
426
427static struct diff_spec_hold {
428 struct diff_spec_hold *next;
429 struct diff_spec it;
430 unsigned long size;
431 int flags;
432#define MATCHED 1
433#define SHOULD_FREE 2
434#define SHOULD_MUNMAP 4
435 void *data;
436 char path[1];
437} *createdfile, *deletedfile;
438
439static void hold_diff(const char *name,
440 struct diff_spec *one,
441 struct diff_spec *two)
442{
443 struct diff_spec_hold **list, *elem;
444
445 if (one->file_valid && two->file_valid)
446 die("internal error");
447
448 if (!detect_rename) {
449 run_external_diff(name, NULL, one, two);
450 return;
451 }
452 elem = xmalloc(sizeof(*elem) + strlen(name));
453 strcpy(elem->path, name);
454 elem->size = 0;
455 elem->data = NULL;
456 elem->flags = 0;
457 if (one->file_valid) {
458 list = &deletedfile;
459 elem->it = *one;
460 }
461 else {
462 list = &createdfile;
463 elem->it = *two;
464 }
465 elem->next = *list;
466 *list = elem;
467}
468
469static int populate_data(struct diff_spec_hold *s)
470{
471 char type[20];
472
473 if (s->data)
474 return 0;
475 if (s->it.sha1_valid) {
476 s->data = read_sha1_file(s->it.blob_sha1, type, &s->size);
477 s->flags |= SHOULD_FREE;
478 }
479 else {
480 struct stat st;
481 int fd;
482 fd = open(s->path, O_RDONLY);
483 if (fd < 0)
484 return -1;
485 if (fstat(fd, &st)) {
486 close(fd);
487 return -1;
488 }
489 s->size = st.st_size;
490 s->data = mmap(NULL, s->size, PROT_READ, MAP_PRIVATE, fd, 0);
491 close(fd);
492 if (!s->size)
493 s->data = "";
494 else
495 s->flags |= SHOULD_MUNMAP;
496 }
497 return 0;
498}
499
500static void free_data(struct diff_spec_hold *s)
501{
502 if (s->flags & SHOULD_FREE)
503 free(s->data);
504 else if (s->flags & SHOULD_MUNMAP)
505 munmap(s->data, s->size);
506 s->flags &= ~(SHOULD_FREE|SHOULD_MUNMAP);
507 s->data = 0;
508}
509
510static void flush_remaining_diff(struct diff_spec_hold *elem,
511 int on_created_list)
512{
513 static struct diff_spec null_file_spec;
514
515 null_file_spec.file_valid = 0;
516 for ( ; elem ; elem = elem->next) {
517 free_data(elem);
518 if (elem->flags & MATCHED)
519 continue;
520 if (on_created_list)
521 run_external_diff(elem->path, NULL,
522 &null_file_spec, &elem->it);
523 else
524 run_external_diff(elem->path, NULL,
525 &elem->it, &null_file_spec);
526 }
527}
528
529static int is_exact_match(struct diff_spec_hold *src,
530 struct diff_spec_hold *dst)
531{
532 if (src->it.sha1_valid && dst->it.sha1_valid &&
533 !memcmp(src->it.blob_sha1, dst->it.blob_sha1, 20))
534 return 1;
535 if (populate_data(src) || populate_data(dst))
536 /* this is an error but will be caught downstream */
537 return 0;
538 if (src->size == dst->size &&
539 !memcmp(src->data, dst->data, src->size))
540 return 1;
541 return 0;
542}
543
544#define MINIMUM_SCORE 5000
545int estimate_similarity(struct diff_spec_hold *src, struct diff_spec_hold *dst)
546{
547 /* src points at a deleted file and dst points at a created
548 * file. They may be quite similar, in which case we want to
549 * say src is renamed to dst.
550 *
551 * Compare them and return how similar they are, representing
552 * the score as an integer between 0 and 10000. 10000 is
553 * reserved for the case where they match exactly.
554 */
555 void *delta;
556 unsigned long delta_size;
557
558 delta_size = ((src->size < dst->size) ?
559 (dst->size - src->size) : (src->size - dst->size));
560
561 /* We would not consider rename followed by more than
562 * 20% edits; that is, delta_size must be smaller than
563 * (src->size + dst->size)/2 * 0.2, which means...
564 */
565 if ((src->size + dst->size) < delta_size * 10)
566 return 0;
567
568 delta = diff_delta(src->data, src->size,
569 dst->data, dst->size,
570 &delta_size);
571 free(delta);
572
573 /* This "delta" is really xdiff with adler32 and all the
574 * overheads but it is a quick and dirty approximation.
575 *
576 * Now we will give some score to it. Let's say 20% edit gets
577 * 5000 points and 0% edit gets 9000 points. That is, every
578 * 1/20000 edit gets 1 point penalty. The amount of penalty is:
579 *
580 * (delta_size * 2 / (src->size + dst->size)) * 20000
581 *
582 */
583 return 9000 - (40000 * delta_size / (src->size+dst->size));
584}
585
586struct diff_score {
587 struct diff_spec_hold *src;
588 struct diff_spec_hold *dst;
589 int score;
590};
591
592static int score_compare(const void *a_, const void *b_)
593{
594 const struct diff_score *a = a_, *b = b_;
595 return b->score - a->score;
596}
597
598static void flush_rename_pair(struct diff_spec_hold *src,
599 struct diff_spec_hold *dst)
600{
601 src->flags |= MATCHED;
602 dst->flags |= MATCHED;
603 free_data(src);
604 free_data(dst);
605 run_external_diff(src->path, dst->path,
606 &src->it, &dst->it);
607}
608
609static void free_held_diff(struct diff_spec_hold *list)
610{
611 struct diff_spec_hold *h;
612 for (h = list; list; list = h) {
613 h = list->next;
614 free_data(list);
615 free(list);
616 }
617}
618
619void diff_flush(void)
620{
621 int num_create, num_delete, c, d;
622 struct diff_spec_hold *elem, *src, *dst;
623 struct diff_score *mx;
624
625 /* We really want to cull the candidates list early
626 * with cheap tests in order to avoid doing deltas.
627 *
628 * With the current callers, we should not have already
629 * matched entries at this point, but it is nonetheless
630 * checked for sanity.
631 */
632 for (dst = createdfile; dst; dst = dst->next) {
633 if (dst->flags & MATCHED)
634 continue;
635 for (src = deletedfile; src; src = src->next) {
636 if (src->flags & MATCHED)
637 continue;
638 if (! is_exact_match(src, dst))
639 continue;
640 flush_rename_pair(src, dst);
641 break;
642 }
643 }
644
645 /* Count surviving candidates */
646 for (num_create = 0, elem = createdfile; elem; elem = elem->next)
647 if (!(elem->flags & MATCHED))
648 num_create++;
649
650 for (num_delete = 0, elem = deletedfile; elem; elem = elem->next)
651 if (!(elem->flags & MATCHED))
652 num_delete++;
653
654 if (num_create == 0 || num_delete == 0)
655 goto exit_path;
656
657 mx = xmalloc(sizeof(*mx) * num_create * num_delete);
658 for (c = 0, dst = createdfile; dst; dst = dst->next) {
659 int base = c * num_delete;
660 if (dst->flags & MATCHED)
661 continue;
662 for (d = 0, src = deletedfile; src; src = src->next) {
663 struct diff_score *m = &mx[base+d];
664 if (src->flags & MATCHED)
665 continue;
666 m->src = src;
667 m->dst = dst;
668 m->score = estimate_similarity(src, dst);
669 d++;
670 }
671 c++;
672 }
673 qsort(mx, num_create*num_delete, sizeof(*mx), score_compare);
674
675#if 0
676 for (c = 0; c < num_create * num_delete; c++) {
677 src = mx[c].src;
678 dst = mx[c].dst;
679 if ((src->flags & MATCHED) || (dst->flags & MATCHED))
680 continue;
681 fprintf(stderr,
682 "**score ** %d %s %s\n",
683 mx[c].score, src->path, dst->path);
684 }
685#endif
686
687 for (c = 0; c < num_create * num_delete; c++) {
688 src = mx[c].src;
689 dst = mx[c].dst;
690 if ((src->flags & MATCHED) || (dst->flags & MATCHED))
691 continue;
692 if (mx[c].score < diff_rename_minimum_score)
693 break;
694 flush_rename_pair(src, dst);
695 }
696 free(mx);
697
698 exit_path:
699 flush_remaining_diff(createdfile, 1);
700 flush_remaining_diff(deletedfile, 0);
701 free_held_diff(createdfile);
702 free_held_diff(deletedfile);
703 createdfile = deletedfile = NULL;
704}
705
706void diff_setup(int detect_rename_, int minimum_score_, int reverse_diff_,
707 const char **pathspec_, int speccnt_)
708{
709 free_held_diff(createdfile);
710 free_held_diff(deletedfile);
711 createdfile = deletedfile = NULL;
712
713 detect_rename = detect_rename_;
714 reverse_diff = reverse_diff_;
715 pathspec = pathspec_;
716 speccnt = speccnt_;
717 diff_rename_minimum_score = minimum_score_ ? : MINIMUM_SCORE;
718}
719
720void diff_addremove(int addremove, unsigned mode,
721 const unsigned char *sha1,
722 const char *base, const char *path)
723{
724 char concatpath[PATH_MAX];
725 struct diff_spec spec[2], *one, *two;
726
727 memcpy(spec[0].blob_sha1, sha1, 20);
728 spec[0].mode = mode;
729 spec[0].sha1_valid = !!memcmp(sha1, null_sha1, 20);
730 spec[0].file_valid = 1;
731 spec[1].file_valid = 0;
732
733 if (addremove == '+') {
734 one = spec + 1; two = spec;
735 } else {
736 one = spec; two = one + 1;
737 }
738
739 if (path) {
740 strcpy(concatpath, base);
741 strcat(concatpath, path);
742 }
743 hold_diff(path ? concatpath : base, one, two);
744}
745
746void diff_change(unsigned old_mode, unsigned new_mode,
747 const unsigned char *old_sha1,
748 const unsigned char *new_sha1,
749 const char *base, const char *path) {
750 char concatpath[PATH_MAX];
751 struct diff_spec spec[2];
752
753 if (path) {
754 strcpy(concatpath, base);
755 strcat(concatpath, path);
756 }
757
758 memcpy(spec[0].blob_sha1, old_sha1, 20);
759 spec[0].mode = old_mode;
760 memcpy(spec[1].blob_sha1, new_sha1, 20);
761 spec[1].mode = new_mode;
762 spec[0].sha1_valid = !!memcmp(old_sha1, null_sha1, 20);
763 spec[1].sha1_valid = !!memcmp(new_sha1, null_sha1, 20);
764 spec[1].file_valid = spec[0].file_valid = 1;
765
766 /* We do not look at changed files as candidate for
767 * rename detection ever.
768 */
769 run_external_diff(path ? concatpath : base, NULL, &spec[0], &spec[1]);
770}
771
772void diff_unmerge(const char *path)
773{
774 run_external_diff(path, NULL, NULL, NULL);
775}