1/* 2 * Copyright (C) 2005 Junio C Hamano 3 */ 4#include "cache.h" 5#include "diff.h" 6#include "diffcore.h" 7#include "delta.h" 8#include "count-delta.h" 9 10/* Table of rename/copy destinations */ 11 12static struct diff_rename_dst { 13 struct diff_filespec *two; 14 struct diff_filepair *pair; 15} *rename_dst; 16static int rename_dst_nr, rename_dst_alloc; 17 18static struct diff_rename_dst *locate_rename_dst(struct diff_filespec *two, 19 int insert_ok) 20{ 21 int first, last; 22 23 first = 0; 24 last = rename_dst_nr; 25 while (last > first) { 26 int next = (last + first) >> 1; 27 struct diff_rename_dst *dst = &(rename_dst[next]); 28 int cmp = strcmp(two->path, dst->two->path); 29 if (!cmp) 30 return dst; 31 if (cmp < 0) { 32 last = next; 33 continue; 34 } 35 first = next+1; 36 } 37 /* not found */ 38 if (!insert_ok) 39 return NULL; 40 /* insert to make it at "first" */ 41 if (rename_dst_alloc <= rename_dst_nr) { 42 rename_dst_alloc = alloc_nr(rename_dst_alloc); 43 rename_dst = xrealloc(rename_dst, 44 rename_dst_alloc * sizeof(*rename_dst)); 45 } 46 rename_dst_nr++; 47 if (first < rename_dst_nr) 48 memmove(rename_dst + first + 1, rename_dst + first, 49 (rename_dst_nr - first - 1) * sizeof(*rename_dst)); 50 rename_dst[first].two = two; 51 rename_dst[first].pair = NULL; 52 return &(rename_dst[first]); 53} 54 55static struct diff_rename_src { 56 struct diff_filespec *one; 57 unsigned src_used : 1; 58} *rename_src; 59static int rename_src_nr, rename_src_alloc; 60 61static struct diff_rename_src *locate_rename_src(struct diff_filespec *one, 62 int insert_ok) 63{ 64 int first, last; 65 66 first = 0; 67 last = rename_src_nr; 68 while (last > first) { 69 int next = (last + first) >> 1; 70 struct diff_rename_src *src = &(rename_src[next]); 71 int cmp = strcmp(one->path, src->one->path); 72 if (!cmp) 73 return src; 74 if (cmp < 0) { 75 last = next; 76 continue; 77 } 78 first = next+1; 79 } 80 /* not found */ 81 if (!insert_ok) 82 return NULL; 83 /* insert to make it at "first" */ 84 if (rename_src_alloc <= rename_src_nr) { 85 rename_src_alloc = alloc_nr(rename_src_alloc); 86 rename_src = xrealloc(rename_src, 87 rename_src_alloc * sizeof(*rename_src)); 88 } 89 rename_src_nr++; 90 if (first < rename_src_nr) 91 memmove(rename_src + first + 1, rename_src + first, 92 (rename_src_nr - first - 1) * sizeof(*rename_src)); 93 rename_src[first].one = one; 94 rename_src[first].src_used = 0; 95 return &(rename_src[first]); 96} 97 98static int is_exact_match(struct diff_filespec *src, struct diff_filespec *dst) 99{ 100 if (src->sha1_valid && dst->sha1_valid && 101 !memcmp(src->sha1, dst->sha1, 20)) 102 return 1; 103 if (diff_populate_filespec(src) || diff_populate_filespec(dst)) 104 /* this is an error but will be caught downstream */ 105 return 0; 106 if (src->size == dst->size && 107 !memcmp(src->data, dst->data, src->size)) 108 return 1; 109 return 0; 110} 111 112struct diff_score { 113 int src; /* index in rename_src */ 114 int dst; /* index in rename_dst */ 115 int score; 116 int rank; 117}; 118 119static int estimate_similarity(struct diff_filespec *src, 120 struct diff_filespec *dst, 121 int minimum_score) 122{ 123 /* src points at a file that existed in the original tree (or 124 * optionally a file in the destination tree) and dst points 125 * at a newly created file. They may be quite similar, in which 126 * case we want to say src is renamed to dst or src is copied into 127 * dst, and then some edit has been applied to dst. 128 * 129 * Compare them and return how similar they are, representing 130 * the score as an integer between 0 and 10000, except 131 * where they match exactly it is considered better than anything 132 * else. 133 */ 134 void *delta; 135 unsigned long delta_size, base_size; 136 int score; 137 138 /* We deal only with regular files. Symlink renames are handled 139 * only when they are exact matches --- in other words, no edits 140 * after renaming. 141 */ 142 if (!S_ISREG(src->mode) || !S_ISREG(dst->mode)) 143 return 0; 144 145 delta_size = ((src->size < dst->size) ? 146 (dst->size - src->size) : (src->size - dst->size)); 147 base_size = ((src->size < dst->size) ? src->size : dst->size); 148 149 /* We would not consider edits that change the file size so 150 * drastically. delta_size must be smaller than 151 * (MAX_SCORE-minimum_score)/MAX_SCORE * min(src->size, dst->size). 152 * Note that base_size == 0 case is handled here already 153 * and the final score computation below would not have a 154 * divide-by-zero issue. 155 */ 156 if (base_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE) 157 return 0; 158 159 delta = diff_delta(src->data, src->size, 160 dst->data, dst->size, 161 &delta_size); 162 163 /* A delta that has a lot of literal additions would have 164 * big delta_size no matter what else it does. 165 */ 166 if (base_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE) 167 return 0; 168 169 /* Estimate the edit size by interpreting delta. */ 170 delta_size = count_delta(delta, delta_size); 171 free(delta); 172 if (delta_size == UINT_MAX) 173 return 0; 174 175 /* 176 * Now we will give some score to it. 100% edit gets 0 points 177 * and 0% edit gets MAX_SCORE points. 178 */ 179 score = MAX_SCORE - (MAX_SCORE * delta_size / base_size); 180 if (score < 0) return 0; 181 if (MAX_SCORE < score) return MAX_SCORE; 182 return score; 183} 184 185static void record_rename_pair(struct diff_queue_struct *renq, 186 int dst_index, int src_index, int score) 187{ 188 struct diff_filespec *one, *two, *src, *dst; 189 struct diff_filepair *dp; 190 191 if (rename_dst[dst_index].pair) 192 die("internal error: dst already matched."); 193 194 src = rename_src[src_index].one; 195 one = alloc_filespec(src->path); 196 fill_filespec(one, src->sha1, src->mode); 197 198 dst = rename_dst[dst_index].two; 199 two = alloc_filespec(dst->path); 200 fill_filespec(two, dst->sha1, dst->mode); 201 202 dp = diff_queue(renq, one, two); 203 dp->score = score; 204 205 rename_src[src_index].src_used = 1; 206 rename_dst[dst_index].pair = dp; 207} 208 209/* 210 * We sort the rename similarity matrix with the score, in descending 211 * order (more similar first). 212 */ 213static int score_compare(const void *a_, const void *b_) 214{ 215 const struct diff_score *a = a_, *b = b_; 216 return b->score - a->score; 217} 218 219int diff_scoreopt_parse(const char *opt) 220{ 221 int diglen, num, scale, i; 222 if (opt[0] != '-' || (opt[1] != 'M' && opt[1] != 'C')) 223 return -1; /* that is not a -M nor -C option */ 224 diglen = strspn(opt+2, "0123456789"); 225 if (diglen == 0 || strlen(opt+2) != diglen) 226 return 0; /* use default */ 227 sscanf(opt+2, "%d", &num); 228 for (i = 0, scale = 1; i < diglen; i++) 229 scale *= 10; 230 231 /* user says num divided by scale and we say internally that 232 * is MAX_SCORE * num / scale. 233 */ 234 return MAX_SCORE * num / scale; 235} 236 237void diffcore_rename(int detect_rename, int minimum_score) 238{ 239 struct diff_queue_struct *q = &diff_queued_diff; 240 struct diff_queue_struct renq, outq; 241 struct diff_score *mx; 242 int i, j; 243 int num_create, num_src, dst_cnt; 244 245 if (!minimum_score) 246 minimum_score = DEFAULT_MINIMUM_SCORE; 247 renq.queue = NULL; 248 renq.nr = renq.alloc = 0; 249 250 for (i = 0; i < q->nr; i++) { 251 struct diff_filepair *p = q->queue[i]; 252 if (!DIFF_FILE_VALID(p->one)) 253 if (!DIFF_FILE_VALID(p->two)) 254 continue; /* unmerged */ 255 else 256 locate_rename_dst(p->two, 1); 257 else if (!DIFF_FILE_VALID(p->two)) 258 locate_rename_src(p->one, 1); 259 else if (1 < detect_rename) /* find copy, too */ 260 locate_rename_src(p->one, 1); 261 } 262 if (rename_dst_nr == 0) 263 goto cleanup; /* nothing to do */ 264 265 /* We really want to cull the candidates list early 266 * with cheap tests in order to avoid doing deltas. 267 */ 268 for (i = 0; i < rename_dst_nr; i++) { 269 struct diff_filespec *two = rename_dst[i].two; 270 for (j = 0; j < rename_src_nr; j++) { 271 struct diff_filespec *one = rename_src[j].one; 272 if (!is_exact_match(one, two)) 273 continue; 274 record_rename_pair(&renq, i, j, MAX_SCORE); 275 break; /* we are done with this entry */ 276 } 277 } 278 diff_debug_queue("done detecting exact", &renq); 279 280 /* Have we run out the created file pool? If so we can avoid 281 * doing the delta matrix altogether. 282 */ 283 if (renq.nr == rename_dst_nr) 284 goto flush_rest; 285 286 num_create = (rename_dst_nr - renq.nr); 287 num_src = rename_src_nr; 288 mx = xmalloc(sizeof(*mx) * num_create * num_src); 289 for (dst_cnt = i = 0; i < rename_dst_nr; i++) { 290 int base = dst_cnt * num_src; 291 struct diff_filespec *two = rename_dst[i].two; 292 if (rename_dst[i].pair) 293 continue; /* dealt with exact match already. */ 294 for (j = 0; j < rename_src_nr; j++) { 295 struct diff_filespec *one = rename_src[j].one; 296 struct diff_score *m = &mx[base+j]; 297 m->src = j; 298 m->dst = i; 299 m->score = estimate_similarity(one, two, 300 minimum_score); 301 } 302 dst_cnt++; 303 } 304 /* cost matrix sorted by most to least similar pair */ 305 qsort(mx, num_create * num_src, sizeof(*mx), score_compare); 306 for (i = 0; i < num_create * num_src; i++) { 307 struct diff_rename_dst *dst = &rename_dst[mx[i].dst]; 308 if (dst->pair) 309 continue; /* already done, either exact or fuzzy. */ 310 if (mx[i].score < minimum_score) 311 break; /* there is not any more diffs applicable. */ 312 record_rename_pair(&renq, mx[i].dst, mx[i].src, mx[i].score); 313 } 314 free(mx); 315 diff_debug_queue("done detecting fuzzy", &renq); 316 317 flush_rest: 318 /* At this point, we have found some renames and copies and they 319 * are kept in renq. The original list is still in *q. 320 * 321 * Scan the original list and move them into the outq; we will sort 322 * outq and swap it into the queue supplied to pass that to 323 * downstream, so we assign the sort keys in this loop. 324 * 325 * See comments at the top of record_rename_pair for numbers used 326 * to assign rename_rank. 327 */ 328 outq.queue = NULL; 329 outq.nr = outq.alloc = 0; 330 for (i = 0; i < q->nr; i++) { 331 struct diff_filepair *p = q->queue[i]; 332 struct diff_rename_src *src = locate_rename_src(p->one, 0); 333 struct diff_rename_dst *dst = locate_rename_dst(p->two, 0); 334 struct diff_filepair *pair_to_free = NULL; 335 336 if (dst) { 337 /* creation */ 338 if (dst->pair) { 339 /* renq has rename/copy already to produce 340 * this file, so we do not emit the creation 341 * record in the output. 342 */ 343 diff_q(&outq, dst->pair); 344 pair_to_free = p; 345 } 346 else 347 /* no matching rename/copy source, so record 348 * this as a creation. 349 */ 350 diff_q(&outq, p); 351 } 352 else if (!diff_unmodified_pair(p)) 353 /* all the other cases need to be recorded as is */ 354 diff_q(&outq, p); 355 else { 356 /* unmodified pair needs to be recorded only if 357 * it is used as the source of rename/copy 358 */ 359 if (src && src->src_used) 360 diff_q(&outq, p); 361 else 362 pair_to_free = p; 363 } 364 if (pair_to_free) 365 diff_free_filepair(pair_to_free); 366 } 367 diff_debug_queue("done copying original", &outq); 368 369 free(renq.queue); 370 free(q->queue); 371 *q = outq; 372 diff_debug_queue("done collapsing", q); 373 374 cleanup: 375 free(rename_dst); 376 rename_dst = NULL; 377 rename_dst_nr = rename_dst_alloc = 0; 378 free(rename_src); 379 rename_src = NULL; 380 rename_src_nr = rename_src_alloc = 0; 381 return; 382}