1/* 2 * Copyright (C) 2005 Junio C Hamano 3 */ 4#include "cache.h" 5#include "diff.h" 6#include "diffcore.h" 7#include "delta.h" 8 9struct diff_rename_pool { 10 struct diff_filespec **s; 11 int nr, alloc; 12}; 13 14static void diff_rename_pool_clear(struct diff_rename_pool *pool) 15{ 16 pool->s = NULL; pool->nr = pool->alloc = 0; 17} 18 19static void diff_rename_pool_add(struct diff_rename_pool *pool, 20 struct diff_filespec *s) 21{ 22 if (S_ISDIR(s->mode)) 23 return; /* rename/copy patch for tree does not make sense. */ 24 25 if (pool->alloc <= pool->nr) { 26 pool->alloc = alloc_nr(pool->alloc); 27 pool->s = xrealloc(pool->s, 28 sizeof(*(pool->s)) * pool->alloc); 29 } 30 pool->s[pool->nr] = s; 31 pool->nr++; 32} 33 34static int is_exact_match(struct diff_filespec *src, struct diff_filespec *dst) 35{ 36 if (src->sha1_valid && dst->sha1_valid && 37 !memcmp(src->sha1, dst->sha1, 20)) 38 return 1; 39 if (diff_populate_filespec(src) || diff_populate_filespec(dst)) 40 /* this is an error but will be caught downstream */ 41 return 0; 42 if (src->size == dst->size && 43 !memcmp(src->data, dst->data, src->size)) 44 return 1; 45 return 0; 46} 47 48struct diff_score { 49 struct diff_filespec *src; 50 struct diff_filespec *dst; 51 int score; 52 int rank; 53}; 54 55static int estimate_similarity(struct diff_filespec *src, 56 struct diff_filespec *dst, 57 int minimum_score) 58{ 59 /* src points at a file that existed in the original tree (or 60 * optionally a file in the destination tree) and dst points 61 * at a newly created file. They may be quite similar, in which 62 * case we want to say src is renamed to dst or src is copied into 63 * dst, and then some edit has been applied to dst. 64 * 65 * Compare them and return how similar they are, representing 66 * the score as an integer between 0 and 10000, except 67 * where they match exactly it is considered better than anything 68 * else. 69 */ 70 void *delta; 71 unsigned long delta_size; 72 int score; 73 74 delta_size = ((src->size < dst->size) ? 75 (dst->size - src->size) : (src->size - dst->size)); 76 77 /* We would not consider rename followed by more than 78 * minimum_score/MAX_SCORE edits; that is, delta_size must be smaller 79 * than (src->size + dst->size)/2 * minimum_score/MAX_SCORE, 80 * which means... 81 */ 82 83 if ((src->size+dst->size)*minimum_score < delta_size*MAX_SCORE*2) 84 return 0; 85 86 delta = diff_delta(src->data, src->size, 87 dst->data, dst->size, 88 &delta_size); 89 free(delta); 90 91 /* This "delta" is really xdiff with adler32 and all the 92 * overheads but it is a quick and dirty approximation. 93 * 94 * Now we will give some score to it. 100% edit gets 95 * 0 points and 0% edit gets MAX_SCORE points. That is, every 96 * 1/MAX_SCORE edit gets 1 point penalty. The amount of penalty is: 97 * 98 * (delta_size * 2 / (src->size + dst->size)) * MAX_SCORE 99 * 100 */ 101 score = MAX_SCORE-(MAX_SCORE*2*delta_size/(src->size+dst->size)); 102 if (score < 0) return 0; 103 if (MAX_SCORE < score) return MAX_SCORE; 104 return score; 105} 106 107static void record_rename_pair(struct diff_queue_struct *outq, 108 struct diff_filespec *src, 109 struct diff_filespec *dst, 110 int rank, 111 int score) 112{ 113 /* The rank is used to sort the final output, because there 114 * are certain dependencies. 115 * 116 * - rank #0 depends on deleted ones. 117 * - rank #1 depends on kept files before they are modified. 118 * - rank #2 depends on kept files after they are modified; 119 * currently not used. 120 * 121 * Therefore, the final output order should be: 122 * 123 * 1. rank #0 rename/copy diffs. 124 * 2. deletions in the original. 125 * 3. rank #1 rename/copy diffs. 126 * 4. additions and modifications in the original. 127 * 5. rank #2 rename/copy diffs; currently not used. 128 * 129 * To achieve this sort order, we give xform_work the number 130 * above. 131 */ 132 struct diff_file_pair *dp = diff_queue(outq, src, dst); 133 dp->xfrm_work = (rank * 2 + 1) | (score<<RENAME_SCORE_SHIFT); 134 dst->xfrm_flags |= RENAME_DST_MATCHED; 135} 136 137#if 0 138static void debug_filespec(struct diff_filespec *s, int x, const char *one) 139{ 140 fprintf(stderr, "queue[%d] %s (%s) %s %06o %s\n", 141 x, one, 142 s->path, 143 s->file_valid ? "valid" : "invalid", 144 s->mode, 145 s->sha1_valid ? sha1_to_hex(s->sha1) : ""); 146 fprintf(stderr, "queue[%d] %s size %lu flags %d\n", 147 x, one, 148 s->size, s->xfrm_flags); 149} 150 151static void debug_filepair(const struct diff_file_pair *p, int i) 152{ 153 debug_filespec(p->one, i, "one"); 154 debug_filespec(p->two, i, "two"); 155 fprintf(stderr, "pair flags %d, orig order %d, score %d\n", 156 (p->xfrm_work & ((1<<RENAME_SCORE_SHIFT) - 1)), 157 p->orig_order, 158 (p->xfrm_work >> RENAME_SCORE_SHIFT)); 159} 160 161static void debug_queue(const char *msg, struct diff_queue_struct *q) 162{ 163 int i; 164 if (msg) 165 fprintf(stderr, "%s\n", msg); 166 fprintf(stderr, "q->nr = %d\n", q->nr); 167 for (i = 0; i < q->nr; i++) { 168 struct diff_file_pair *p = q->queue[i]; 169 debug_filepair(p, i); 170 } 171} 172#else 173#define debug_queue(a,b) do { ; /*nothing*/ } while(0) 174#endif 175 176/* 177 * We sort the outstanding diff entries according to the rank (see 178 * comment at the beginning of record_rename_pair) and tiebreak with 179 * the order in the original input. 180 */ 181static int rank_compare(const void *a_, const void *b_) 182{ 183 const struct diff_file_pair *a = *(const struct diff_file_pair **)a_; 184 const struct diff_file_pair *b = *(const struct diff_file_pair **)b_; 185 int a_rank = a->xfrm_work & ((1<<RENAME_SCORE_SHIFT) - 1); 186 int b_rank = b->xfrm_work & ((1<<RENAME_SCORE_SHIFT) - 1); 187 188 if (a_rank != b_rank) 189 return a_rank - b_rank; 190 return a->orig_order - b->orig_order; 191} 192 193/* 194 * We sort the rename similarity matrix with the score, in descending 195 * order (more similar first). 196 */ 197static int score_compare(const void *a_, const void *b_) 198{ 199 const struct diff_score *a = a_, *b = b_; 200 return b->score - a->score; 201} 202 203static int needs_to_stay(struct diff_queue_struct *q, int i, 204 struct diff_filespec *it) 205{ 206 /* If it will be used in later entry (either stay or used 207 * as the source of rename/copy), we need to copy, not rename. 208 */ 209 while (i < q->nr) { 210 struct diff_file_pair *p = q->queue[i++]; 211 if (!p->two->file_valid) 212 continue; /* removed is fine */ 213 if (strcmp(p->one->path, it->path)) 214 continue; /* not relevant */ 215 216 /* p has its src set to *it and it is not a delete; 217 * it will be used for in-place change or rename/copy, 218 * so we cannot rename it out. 219 */ 220 return 1; 221 } 222 return 0; 223} 224 225void diff_detect_rename(struct diff_queue_struct *q, 226 int detect_rename, 227 int minimum_score) 228{ 229 struct diff_queue_struct outq; 230 struct diff_rename_pool created, deleted, stay; 231 struct diff_rename_pool *(srcs[2]); 232 struct diff_score *mx; 233 int h, i, j; 234 int num_create, num_src, dst_cnt, src_cnt; 235 236 outq.queue = NULL; 237 outq.nr = outq.alloc = 0; 238 239 diff_rename_pool_clear(&created); 240 diff_rename_pool_clear(&deleted); 241 diff_rename_pool_clear(&stay); 242 243 srcs[0] = &deleted; 244 srcs[1] = &stay; 245 246 /* NEEDSWORK: 247 * (1) make sure we properly ignore but pass trees. 248 * 249 * (2) make sure we do right thing on the same path deleted 250 * and created in the same patch. 251 */ 252 253 for (i = 0; i < q->nr; i++) { 254 struct diff_file_pair *p = q->queue[i]; 255 if (!p->one->file_valid) 256 if (!p->two->file_valid) 257 continue; /* ignore nonsense */ 258 else 259 diff_rename_pool_add(&created, p->two); 260 else if (!p->two->file_valid) 261 diff_rename_pool_add(&deleted, p->one); 262 else if (1 < detect_rename) /* find copy, too */ 263 diff_rename_pool_add(&stay, p->one); 264 } 265 if (created.nr == 0) 266 goto cleanup; /* nothing to do */ 267 268 /* We really want to cull the candidates list early 269 * with cheap tests in order to avoid doing deltas. 270 * 271 * With the current callers, we should not have already 272 * matched entries at this point, but it is nonetheless 273 * checked for sanity. 274 */ 275 for (i = 0; i < created.nr; i++) { 276 if (created.s[i]->xfrm_flags & RENAME_DST_MATCHED) 277 continue; /* we have matched exactly already */ 278 for (h = 0; h < sizeof(srcs)/sizeof(srcs[0]); h++) { 279 struct diff_rename_pool *p = srcs[h]; 280 for (j = 0; j < p->nr; j++) { 281 if (!is_exact_match(p->s[j], created.s[i])) 282 continue; 283 record_rename_pair(&outq, 284 p->s[j], created.s[i], h, 285 MAX_SCORE); 286 break; /* we are done with this entry */ 287 } 288 } 289 } 290 debug_queue("done detecting exact", &outq); 291 292 /* Have we run out the created file pool? If so we can avoid 293 * doing the delta matrix altogether. 294 */ 295 if (outq.nr == created.nr) 296 goto flush_rest; 297 298 num_create = (created.nr - outq.nr); 299 num_src = deleted.nr + stay.nr; 300 mx = xmalloc(sizeof(*mx) * num_create * num_src); 301 for (dst_cnt = i = 0; i < created.nr; i++) { 302 int base = dst_cnt * num_src; 303 if (created.s[i]->xfrm_flags & RENAME_DST_MATCHED) 304 continue; /* dealt with exact match already. */ 305 for (src_cnt = h = 0; h < sizeof(srcs)/sizeof(srcs[0]); h++) { 306 struct diff_rename_pool *p = srcs[h]; 307 for (j = 0; j < p->nr; j++, src_cnt++) { 308 struct diff_score *m = &mx[base + src_cnt]; 309 m->src = p->s[j]; 310 m->dst = created.s[i]; 311 m->score = estimate_similarity(m->src, m->dst, 312 minimum_score); 313 m->rank = h; 314 } 315 } 316 dst_cnt++; 317 } 318 /* cost matrix sorted by most to least similar pair */ 319 qsort(mx, num_create * num_src, sizeof(*mx), score_compare); 320 for (i = 0; i < num_create * num_src; i++) { 321 if (mx[i].dst->xfrm_flags & RENAME_DST_MATCHED) 322 continue; /* alreayd done, either exact or fuzzy. */ 323 if (mx[i].score < minimum_score) 324 continue; 325 record_rename_pair(&outq, 326 mx[i].src, mx[i].dst, mx[i].rank, 327 mx[i].score); 328 } 329 free(mx); 330 debug_queue("done detecting fuzzy", &outq); 331 332 flush_rest: 333 /* At this point, we have found some renames and copies and they 334 * are kept in outq. The original list is still in *q. 335 * 336 * Scan the original list and move them into the outq; we will sort 337 * outq and swap it into the queue supplied to pass that to 338 * downstream, so we assign the sort keys in this loop. 339 * 340 * See comments at the top of record_rename_pair for numbers used 341 * to assign xfrm_work. 342 * 343 * Note that we have not annotated the diff_file_pair with any comment 344 * so there is nothing other than p to free. 345 */ 346 for (i = 0; i < q->nr; i++) { 347 struct diff_file_pair *dp, *p = q->queue[i]; 348 if (!p->one->file_valid) { 349 if (p->two->file_valid) { 350 /* creation */ 351 dp = diff_queue(&outq, p->one, p->two); 352 dp->xfrm_work = 4; 353 } 354 /* otherwise it is a nonsense; just ignore it */ 355 } 356 else if (!p->two->file_valid) { 357 /* deletion */ 358 dp = diff_queue(&outq, p->one, p->two); 359 dp->xfrm_work = 2; 360 } 361 else { 362 /* modification, or stay as is */ 363 dp = diff_queue(&outq, p->one, p->two); 364 dp->xfrm_work = 4; 365 } 366 free(p); 367 } 368 debug_queue("done copying original", &outq); 369 370 /* Sort outq */ 371 qsort(outq.queue, outq.nr, sizeof(outq.queue[0]), rank_compare); 372 373 debug_queue("done sorting", &outq); 374 375 free(q->queue); 376 q->nr = q->alloc = 0; 377 q->queue = NULL; 378 379 /* Copy it out to q, removing duplicates. */ 380 for (i = 0; i < outq.nr; i++) { 381 struct diff_file_pair *p = outq.queue[i]; 382 if (!p->one->file_valid) { 383 /* created */ 384 if (p->two->xfrm_flags & RENAME_DST_MATCHED) 385 ; /* rename/copy created it already */ 386 else 387 diff_queue(q, p->one, p->two); 388 } 389 else if (!p->two->file_valid) { 390 /* deleted */ 391 if (p->one->xfrm_flags & RENAME_SRC_GONE) 392 ; /* rename/copy deleted it already */ 393 else 394 diff_queue(q, p->one, p->two); 395 } 396 else if (strcmp(p->one->path, p->two->path)) { 397 /* rename or copy */ 398 struct diff_file_pair *dp = 399 diff_queue(q, p->one, p->two); 400 int msglen = (strlen(p->one->path) + 401 strlen(p->two->path) + 100); 402 int score = (p->xfrm_work >> RENAME_SCORE_SHIFT); 403 dp->xfrm_msg = xmalloc(msglen); 404 405 /* if we have a later entry that is a rename/copy 406 * that depends on p->one, then we copy here. 407 * otherwise we rename it. 408 */ 409 if (needs_to_stay(&outq, i+1, p->one)) { 410 /* copy it */ 411 sprintf(dp->xfrm_msg, 412 "similarity index %d%%\n" 413 "copy from %s\n" 414 "copy to %s\n", 415 (int)(0.5 + score * 100 / MAX_SCORE), 416 p->one->path, p->two->path); 417 } 418 else { 419 /* rename it, and mark it as gone. */ 420 p->one->xfrm_flags |= RENAME_SRC_GONE; 421 sprintf(dp->xfrm_msg, 422 "similarity index %d%%\n" 423 "rename old %s\n" 424 "rename new %s\n", 425 (int)(0.5 + score * 100 / MAX_SCORE), 426 p->one->path, p->two->path); 427 } 428 } 429 else 430 /* otherwise it is a modified (or stayed) entry */ 431 diff_queue(q, p->one, p->two); 432 free(p); 433 } 434 435 free(outq.queue); 436 debug_queue("done collapsing", q); 437 438 cleanup: 439 free(created.s); 440 free(deleted.s); 441 free(stay.s); 442 return; 443}