1/* 2 * Copyright (C) 2005 Junio C Hamano 3 */ 4#include "cache.h" 5#include "diff.h" 6#include "diffcore.h" 7#include "object-store.h" 8#include "hashmap.h" 9#include "progress.h" 10 11/* Table of rename/copy destinations */ 12 13static struct diff_rename_dst { 14 struct diff_filespec *two; 15 struct diff_filepair *pair; 16} *rename_dst; 17static int rename_dst_nr, rename_dst_alloc; 18 19static int find_rename_dst(struct diff_filespec *two) 20{ 21 int first, last; 22 23 first = 0; 24 last = rename_dst_nr; 25 while (last > first) { 26 int next = (last + first) >> 1; 27 struct diff_rename_dst *dst = &(rename_dst[next]); 28 int cmp = strcmp(two->path, dst->two->path); 29 if (!cmp) 30 return next; 31 if (cmp < 0) { 32 last = next; 33 continue; 34 } 35 first = next+1; 36 } 37 return -first - 1; 38} 39 40static struct diff_rename_dst *locate_rename_dst(struct diff_filespec *two) 41{ 42 int ofs = find_rename_dst(two); 43 return ofs < 0 ? NULL : &rename_dst[ofs]; 44} 45 46/* 47 * Returns 0 on success, -1 if we found a duplicate. 48 */ 49static int add_rename_dst(struct diff_filespec *two) 50{ 51 int first = find_rename_dst(two); 52 53 if (first >= 0) 54 return -1; 55 first = -first - 1; 56 57 /* insert to make it at "first" */ 58 ALLOC_GROW(rename_dst, rename_dst_nr + 1, rename_dst_alloc); 59 rename_dst_nr++; 60 if (first < rename_dst_nr) 61 MOVE_ARRAY(rename_dst + first + 1, rename_dst + first, 62 rename_dst_nr - first - 1); 63 rename_dst[first].two = alloc_filespec(two->path); 64 fill_filespec(rename_dst[first].two, &two->oid, two->oid_valid, 65 two->mode); 66 rename_dst[first].pair = NULL; 67 return 0; 68} 69 70/* Table of rename/copy src files */ 71static struct diff_rename_src { 72 struct diff_filepair *p; 73 unsigned short score; /* to remember the break score */ 74} *rename_src; 75static int rename_src_nr, rename_src_alloc; 76 77static struct diff_rename_src *register_rename_src(struct diff_filepair *p) 78{ 79 int first, last; 80 struct diff_filespec *one = p->one; 81 unsigned short score = p->score; 82 83 first = 0; 84 last = rename_src_nr; 85 while (last > first) { 86 int next = (last + first) >> 1; 87 struct diff_rename_src *src = &(rename_src[next]); 88 int cmp = strcmp(one->path, src->p->one->path); 89 if (!cmp) 90 return src; 91 if (cmp < 0) { 92 last = next; 93 continue; 94 } 95 first = next+1; 96 } 97 98 /* insert to make it at "first" */ 99 ALLOC_GROW(rename_src, rename_src_nr + 1, rename_src_alloc); 100 rename_src_nr++; 101 if (first < rename_src_nr) 102 MOVE_ARRAY(rename_src + first + 1, rename_src + first, 103 rename_src_nr - first - 1); 104 rename_src[first].p = p; 105 rename_src[first].score = score; 106 return &(rename_src[first]); 107} 108 109static int basename_same(struct diff_filespec *src, struct diff_filespec *dst) 110{ 111 int src_len = strlen(src->path), dst_len = strlen(dst->path); 112 while (src_len && dst_len) { 113 char c1 = src->path[--src_len]; 114 char c2 = dst->path[--dst_len]; 115 if (c1 != c2) 116 return 0; 117 if (c1 == '/') 118 return 1; 119 } 120 return (!src_len || src->path[src_len - 1] == '/') && 121 (!dst_len || dst->path[dst_len - 1] == '/'); 122} 123 124struct diff_score { 125 int src; /* index in rename_src */ 126 int dst; /* index in rename_dst */ 127 unsigned short score; 128 short name_score; 129}; 130 131static int estimate_similarity(struct repository *r, 132 struct diff_filespec *src, 133 struct diff_filespec *dst, 134 int minimum_score) 135{ 136 /* src points at a file that existed in the original tree (or 137 * optionally a file in the destination tree) and dst points 138 * at a newly created file. They may be quite similar, in which 139 * case we want to say src is renamed to dst or src is copied into 140 * dst, and then some edit has been applied to dst. 141 * 142 * Compare them and return how similar they are, representing 143 * the score as an integer between 0 and MAX_SCORE. 144 * 145 * When there is an exact match, it is considered a better 146 * match than anything else; the destination does not even 147 * call into this function in that case. 148 */ 149 unsigned long max_size, delta_size, base_size, src_copied, literal_added; 150 int score; 151 152 /* We deal only with regular files. Symlink renames are handled 153 * only when they are exact matches --- in other words, no edits 154 * after renaming. 155 */ 156 if (!S_ISREG(src->mode) || !S_ISREG(dst->mode)) 157 return 0; 158 159 /* 160 * Need to check that source and destination sizes are 161 * filled in before comparing them. 162 * 163 * If we already have "cnt_data" filled in, we know it's 164 * all good (avoid checking the size for zero, as that 165 * is a possible size - we really should have a flag to 166 * say whether the size is valid or not!) 167 */ 168 if (!src->cnt_data && 169 diff_populate_filespec(r, src, CHECK_SIZE_ONLY)) 170 return 0; 171 if (!dst->cnt_data && 172 diff_populate_filespec(r, dst, CHECK_SIZE_ONLY)) 173 return 0; 174 175 max_size = ((src->size > dst->size) ? src->size : dst->size); 176 base_size = ((src->size < dst->size) ? src->size : dst->size); 177 delta_size = max_size - base_size; 178 179 /* We would not consider edits that change the file size so 180 * drastically. delta_size must be smaller than 181 * (MAX_SCORE-minimum_score)/MAX_SCORE * min(src->size, dst->size). 182 * 183 * Note that base_size == 0 case is handled here already 184 * and the final score computation below would not have a 185 * divide-by-zero issue. 186 */ 187 if (max_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE) 188 return 0; 189 190 if (!src->cnt_data && diff_populate_filespec(r, src, 0)) 191 return 0; 192 if (!dst->cnt_data && diff_populate_filespec(r, dst, 0)) 193 return 0; 194 195 if (diffcore_count_changes(r, src, dst, 196 &src->cnt_data, &dst->cnt_data, 197 &src_copied, &literal_added)) 198 return 0; 199 200 /* How similar are they? 201 * what percentage of material in dst are from source? 202 */ 203 if (!dst->size) 204 score = 0; /* should not happen */ 205 else 206 score = (int)(src_copied * MAX_SCORE / max_size); 207 return score; 208} 209 210static void record_rename_pair(int dst_index, int src_index, int score) 211{ 212 struct diff_filespec *src, *dst; 213 struct diff_filepair *dp; 214 215 if (rename_dst[dst_index].pair) 216 die("internal error: dst already matched."); 217 218 src = rename_src[src_index].p->one; 219 src->rename_used++; 220 src->count++; 221 222 dst = rename_dst[dst_index].two; 223 dst->count++; 224 225 dp = diff_queue(NULL, src, dst); 226 dp->renamed_pair = 1; 227 if (!strcmp(src->path, dst->path)) 228 dp->score = rename_src[src_index].score; 229 else 230 dp->score = score; 231 rename_dst[dst_index].pair = dp; 232} 233 234/* 235 * We sort the rename similarity matrix with the score, in descending 236 * order (the most similar first). 237 */ 238static int score_compare(const void *a_, const void *b_) 239{ 240 const struct diff_score *a = a_, *b = b_; 241 242 /* sink the unused ones to the bottom */ 243 if (a->dst < 0) 244 return (0 <= b->dst); 245 else if (b->dst < 0) 246 return -1; 247 248 if (a->score == b->score) 249 return b->name_score - a->name_score; 250 251 return b->score - a->score; 252} 253 254struct file_similarity { 255 struct hashmap_entry entry; 256 int index; 257 struct diff_filespec *filespec; 258}; 259 260static unsigned int hash_filespec(struct repository *r, 261 struct diff_filespec *filespec) 262{ 263 if (!filespec->oid_valid) { 264 if (diff_populate_filespec(r, filespec, 0)) 265 return 0; 266 hash_object_file(filespec->data, filespec->size, "blob", 267 &filespec->oid); 268 } 269 return sha1hash(filespec->oid.hash); 270} 271 272static int find_identical_files(struct hashmap *srcs, 273 int dst_index, 274 struct diff_options *options) 275{ 276 int renames = 0; 277 278 struct diff_filespec *target = rename_dst[dst_index].two; 279 struct file_similarity *p, *best = NULL; 280 int i = 100, best_score = -1; 281 282 /* 283 * Find the best source match for specified destination. 284 */ 285 p = hashmap_get_from_hash(srcs, 286 hash_filespec(options->repo, target), 287 NULL); 288 for (; p; p = hashmap_get_next(srcs, p)) { 289 int score; 290 struct diff_filespec *source = p->filespec; 291 292 /* False hash collision? */ 293 if (!oideq(&source->oid, &target->oid)) 294 continue; 295 /* Non-regular files? If so, the modes must match! */ 296 if (!S_ISREG(source->mode) || !S_ISREG(target->mode)) { 297 if (source->mode != target->mode) 298 continue; 299 } 300 /* Give higher scores to sources that haven't been used already */ 301 score = !source->rename_used; 302 if (source->rename_used && options->detect_rename != DIFF_DETECT_COPY) 303 continue; 304 score += basename_same(source, target); 305 if (score > best_score) { 306 best = p; 307 best_score = score; 308 if (score == 2) 309 break; 310 } 311 312 /* Too many identical alternatives? Pick one */ 313 if (!--i) 314 break; 315 } 316 if (best) { 317 record_rename_pair(dst_index, best->index, MAX_SCORE); 318 renames++; 319 } 320 return renames; 321} 322 323static void insert_file_table(struct repository *r, 324 struct hashmap *table, int index, 325 struct diff_filespec *filespec) 326{ 327 struct file_similarity *entry = xmalloc(sizeof(*entry)); 328 329 entry->index = index; 330 entry->filespec = filespec; 331 332 hashmap_entry_init(entry, hash_filespec(r, filespec)); 333 hashmap_add(table, entry); 334} 335 336/* 337 * Find exact renames first. 338 * 339 * The first round matches up the up-to-date entries, 340 * and then during the second round we try to match 341 * cache-dirty entries as well. 342 */ 343static int find_exact_renames(struct diff_options *options) 344{ 345 int i, renames = 0; 346 struct hashmap file_table; 347 348 /* Add all sources to the hash table in reverse order, because 349 * later on they will be retrieved in LIFO order. 350 */ 351 hashmap_init(&file_table, NULL, NULL, rename_src_nr); 352 for (i = rename_src_nr-1; i >= 0; i--) 353 insert_file_table(options->repo, 354 &file_table, i, 355 rename_src[i].p->one); 356 357 /* Walk the destinations and find best source match */ 358 for (i = 0; i < rename_dst_nr; i++) 359 renames += find_identical_files(&file_table, i, options); 360 361 /* Free the hash data structure and entries */ 362 hashmap_free(&file_table, 1); 363 364 return renames; 365} 366 367#define NUM_CANDIDATE_PER_DST 4 368static void record_if_better(struct diff_score m[], struct diff_score *o) 369{ 370 int i, worst; 371 372 /* find the worst one */ 373 worst = 0; 374 for (i = 1; i < NUM_CANDIDATE_PER_DST; i++) 375 if (score_compare(&m[i], &m[worst]) > 0) 376 worst = i; 377 378 /* is it better than the worst one? */ 379 if (score_compare(&m[worst], o) > 0) 380 m[worst] = *o; 381} 382 383/* 384 * Returns: 385 * 0 if we are under the limit; 386 * 1 if we need to disable inexact rename detection; 387 * 2 if we would be under the limit if we were given -C instead of -C -C. 388 */ 389static int too_many_rename_candidates(int num_create, 390 struct diff_options *options) 391{ 392 int rename_limit = options->rename_limit; 393 int num_src = rename_src_nr; 394 int i; 395 396 options->needed_rename_limit = 0; 397 398 /* 399 * This basically does a test for the rename matrix not 400 * growing larger than a "rename_limit" square matrix, ie: 401 * 402 * num_create * num_src > rename_limit * rename_limit 403 */ 404 if (rename_limit <= 0) 405 rename_limit = 32767; 406 if ((num_create <= rename_limit || num_src <= rename_limit) && 407 ((uint64_t)num_create * (uint64_t)num_src 408 <= (uint64_t)rename_limit * (uint64_t)rename_limit)) 409 return 0; 410 411 options->needed_rename_limit = 412 num_src > num_create ? num_src : num_create; 413 414 /* Are we running under -C -C? */ 415 if (!options->flags.find_copies_harder) 416 return 1; 417 418 /* Would we bust the limit if we were running under -C? */ 419 for (num_src = i = 0; i < rename_src_nr; i++) { 420 if (diff_unmodified_pair(rename_src[i].p)) 421 continue; 422 num_src++; 423 } 424 if ((num_create <= rename_limit || num_src <= rename_limit) && 425 ((uint64_t)num_create * (uint64_t)num_src 426 <= (uint64_t)rename_limit * (uint64_t)rename_limit)) 427 return 2; 428 return 1; 429} 430 431static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, int copies) 432{ 433 int count = 0, i; 434 435 for (i = 0; i < dst_cnt * NUM_CANDIDATE_PER_DST; i++) { 436 struct diff_rename_dst *dst; 437 438 if ((mx[i].dst < 0) || 439 (mx[i].score < minimum_score)) 440 break; /* there is no more usable pair. */ 441 dst = &rename_dst[mx[i].dst]; 442 if (dst->pair) 443 continue; /* already done, either exact or fuzzy. */ 444 if (!copies && rename_src[mx[i].src].p->one->rename_used) 445 continue; 446 record_rename_pair(mx[i].dst, mx[i].src, mx[i].score); 447 count++; 448 } 449 return count; 450} 451 452void diffcore_rename(struct diff_options *options) 453{ 454 int detect_rename = options->detect_rename; 455 int minimum_score = options->rename_score; 456 struct diff_queue_struct *q = &diff_queued_diff; 457 struct diff_queue_struct outq; 458 struct diff_score *mx; 459 int i, j, rename_count, skip_unmodified = 0; 460 int num_create, dst_cnt; 461 struct progress *progress = NULL; 462 463 if (!minimum_score) 464 minimum_score = DEFAULT_RENAME_SCORE; 465 466 for (i = 0; i < q->nr; i++) { 467 struct diff_filepair *p = q->queue[i]; 468 if (!DIFF_FILE_VALID(p->one)) { 469 if (!DIFF_FILE_VALID(p->two)) 470 continue; /* unmerged */ 471 else if (options->single_follow && 472 strcmp(options->single_follow, p->two->path)) 473 continue; /* not interested */ 474 else if (!options->flags.rename_empty && 475 is_empty_blob_oid(&p->two->oid)) 476 continue; 477 else if (add_rename_dst(p->two) < 0) { 478 warning("skipping rename detection, detected" 479 " duplicate destination '%s'", 480 p->two->path); 481 goto cleanup; 482 } 483 } 484 else if (!options->flags.rename_empty && 485 is_empty_blob_oid(&p->one->oid)) 486 continue; 487 else if (!DIFF_PAIR_UNMERGED(p) && !DIFF_FILE_VALID(p->two)) { 488 /* 489 * If the source is a broken "delete", and 490 * they did not really want to get broken, 491 * that means the source actually stays. 492 * So we increment the "rename_used" score 493 * by one, to indicate ourselves as a user 494 */ 495 if (p->broken_pair && !p->score) 496 p->one->rename_used++; 497 register_rename_src(p); 498 } 499 else if (detect_rename == DIFF_DETECT_COPY) { 500 /* 501 * Increment the "rename_used" score by 502 * one, to indicate ourselves as a user. 503 */ 504 p->one->rename_used++; 505 register_rename_src(p); 506 } 507 } 508 if (rename_dst_nr == 0 || rename_src_nr == 0) 509 goto cleanup; /* nothing to do */ 510 511 /* 512 * We really want to cull the candidates list early 513 * with cheap tests in order to avoid doing deltas. 514 */ 515 rename_count = find_exact_renames(options); 516 517 /* Did we only want exact renames? */ 518 if (minimum_score == MAX_SCORE) 519 goto cleanup; 520 521 /* 522 * Calculate how many renames are left (but all the source 523 * files still remain as options for rename/copies!) 524 */ 525 num_create = (rename_dst_nr - rename_count); 526 527 /* All done? */ 528 if (!num_create) 529 goto cleanup; 530 531 switch (too_many_rename_candidates(num_create, options)) { 532 case 1: 533 goto cleanup; 534 case 2: 535 options->degraded_cc_to_c = 1; 536 skip_unmodified = 1; 537 break; 538 default: 539 break; 540 } 541 542 if (options->show_rename_progress) { 543 progress = start_delayed_progress( 544 _("Performing inexact rename detection"), 545 (uint64_t)rename_dst_nr * (uint64_t)rename_src_nr); 546 } 547 548 mx = xcalloc(st_mult(NUM_CANDIDATE_PER_DST, num_create), sizeof(*mx)); 549 for (dst_cnt = i = 0; i < rename_dst_nr; i++) { 550 struct diff_filespec *two = rename_dst[i].two; 551 struct diff_score *m; 552 553 if (rename_dst[i].pair) 554 continue; /* dealt with exact match already. */ 555 556 m = &mx[dst_cnt * NUM_CANDIDATE_PER_DST]; 557 for (j = 0; j < NUM_CANDIDATE_PER_DST; j++) 558 m[j].dst = -1; 559 560 for (j = 0; j < rename_src_nr; j++) { 561 struct diff_filespec *one = rename_src[j].p->one; 562 struct diff_score this_src; 563 564 if (skip_unmodified && 565 diff_unmodified_pair(rename_src[j].p)) 566 continue; 567 568 this_src.score = estimate_similarity(options->repo, 569 one, two, 570 minimum_score); 571 this_src.name_score = basename_same(one, two); 572 this_src.dst = i; 573 this_src.src = j; 574 record_if_better(m, &this_src); 575 /* 576 * Once we run estimate_similarity, 577 * We do not need the text anymore. 578 */ 579 diff_free_filespec_blob(one); 580 diff_free_filespec_blob(two); 581 } 582 dst_cnt++; 583 display_progress(progress, (uint64_t)(i+1)*(uint64_t)rename_src_nr); 584 } 585 stop_progress(&progress); 586 587 /* cost matrix sorted by most to least similar pair */ 588 QSORT(mx, dst_cnt * NUM_CANDIDATE_PER_DST, score_compare); 589 590 rename_count += find_renames(mx, dst_cnt, minimum_score, 0); 591 if (detect_rename == DIFF_DETECT_COPY) 592 rename_count += find_renames(mx, dst_cnt, minimum_score, 1); 593 free(mx); 594 595 cleanup: 596 /* At this point, we have found some renames and copies and they 597 * are recorded in rename_dst. The original list is still in *q. 598 */ 599 DIFF_QUEUE_CLEAR(&outq); 600 for (i = 0; i < q->nr; i++) { 601 struct diff_filepair *p = q->queue[i]; 602 struct diff_filepair *pair_to_free = NULL; 603 604 if (DIFF_PAIR_UNMERGED(p)) { 605 diff_q(&outq, p); 606 } 607 else if (!DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) { 608 /* 609 * Creation 610 * 611 * We would output this create record if it has 612 * not been turned into a rename/copy already. 613 */ 614 struct diff_rename_dst *dst = locate_rename_dst(p->two); 615 if (dst && dst->pair) { 616 diff_q(&outq, dst->pair); 617 pair_to_free = p; 618 } 619 else 620 /* no matching rename/copy source, so 621 * record this as a creation. 622 */ 623 diff_q(&outq, p); 624 } 625 else if (DIFF_FILE_VALID(p->one) && !DIFF_FILE_VALID(p->two)) { 626 /* 627 * Deletion 628 * 629 * We would output this delete record if: 630 * 631 * (1) this is a broken delete and the counterpart 632 * broken create remains in the output; or 633 * (2) this is not a broken delete, and rename_dst 634 * does not have a rename/copy to move p->one->path 635 * out of existence. 636 * 637 * Otherwise, the counterpart broken create 638 * has been turned into a rename-edit; or 639 * delete did not have a matching create to 640 * begin with. 641 */ 642 if (DIFF_PAIR_BROKEN(p)) { 643 /* broken delete */ 644 struct diff_rename_dst *dst = locate_rename_dst(p->one); 645 if (dst && dst->pair) 646 /* counterpart is now rename/copy */ 647 pair_to_free = p; 648 } 649 else { 650 if (p->one->rename_used) 651 /* this path remains */ 652 pair_to_free = p; 653 } 654 655 if (pair_to_free) 656 ; 657 else 658 diff_q(&outq, p); 659 } 660 else if (!diff_unmodified_pair(p)) 661 /* all the usual ones need to be kept */ 662 diff_q(&outq, p); 663 else 664 /* no need to keep unmodified pairs */ 665 pair_to_free = p; 666 667 if (pair_to_free) 668 diff_free_filepair(pair_to_free); 669 } 670 diff_debug_queue("done copying original", &outq); 671 672 free(q->queue); 673 *q = outq; 674 diff_debug_queue("done collapsing", q); 675 676 for (i = 0; i < rename_dst_nr; i++) 677 free_filespec(rename_dst[i].two); 678 679 FREE_AND_NULL(rename_dst); 680 rename_dst_nr = rename_dst_alloc = 0; 681 FREE_AND_NULL(rename_src); 682 rename_src_nr = rename_src_alloc = 0; 683 return; 684}