1/* 2 * Copyright (C) 2005 Junio C Hamano 3 */ 4#include"cache.h" 5#include"diff.h" 6#include"diffcore.h" 7#include"delta.h" 8 9struct diff_rename_pool { 10struct diff_filespec **s; 11int nr, alloc; 12}; 13 14static voiddiff_rename_pool_clear(struct diff_rename_pool *pool) 15{ 16 pool->s = NULL; pool->nr = pool->alloc =0; 17} 18 19static voiddiff_rename_pool_add(struct diff_rename_pool *pool, 20struct diff_filespec *s) 21{ 22if(S_ISDIR(s->mode)) 23return;/* rename/copy patch for tree does not make sense. */ 24 25if(pool->alloc <= pool->nr) { 26 pool->alloc =alloc_nr(pool->alloc); 27 pool->s =xrealloc(pool->s, 28sizeof(*(pool->s)) * pool->alloc); 29} 30 pool->s[pool->nr] = s; 31 pool->nr++; 32} 33 34static intis_exact_match(struct diff_filespec *src,struct diff_filespec *dst) 35{ 36if(src->sha1_valid && dst->sha1_valid && 37!memcmp(src->sha1, dst->sha1,20)) 38return1; 39if(diff_populate_filespec(src) ||diff_populate_filespec(dst)) 40/* this is an error but will be caught downstream */ 41return0; 42if(src->size == dst->size && 43!memcmp(src->data, dst->data, src->size)) 44return1; 45return0; 46} 47 48struct diff_score { 49struct diff_filespec *src; 50struct diff_filespec *dst; 51int score; 52int rank; 53}; 54 55static intestimate_similarity(struct diff_filespec *src, 56struct diff_filespec *dst, 57int minimum_score) 58{ 59/* src points at a file that existed in the original tree (or 60 * optionally a file in the destination tree) and dst points 61 * at a newly created file. They may be quite similar, in which 62 * case we want to say src is renamed to dst or src is copied into 63 * dst, and then some edit has been applied to dst. 64 * 65 * Compare them and return how similar they are, representing 66 * the score as an integer between 0 and 10000, except 67 * where they match exactly it is considered better than anything 68 * else. 69 */ 70void*delta; 71unsigned long delta_size; 72int score; 73 74 delta_size = ((src->size < dst->size) ? 75(dst->size - src->size) : (src->size - dst->size)); 76 77/* We would not consider rename followed by more than 78 * minimum_score/MAX_SCORE edits; that is, delta_size must be smaller 79 * than (src->size + dst->size)/2 * minimum_score/MAX_SCORE, 80 * which means... 81 */ 82 83if((src->size+dst->size)*minimum_score < delta_size*MAX_SCORE*2) 84return0; 85 86 delta =diff_delta(src->data, src->size, 87 dst->data, dst->size, 88&delta_size); 89free(delta); 90 91/* This "delta" is really xdiff with adler32 and all the 92 * overheads but it is a quick and dirty approximation. 93 * 94 * Now we will give some score to it. 100% edit gets 95 * 0 points and 0% edit gets MAX_SCORE points. That is, every 96 * 1/MAX_SCORE edit gets 1 point penalty. The amount of penalty is: 97 * 98 * (delta_size * 2 / (src->size + dst->size)) * MAX_SCORE 99 * 100 */ 101 score = MAX_SCORE-(MAX_SCORE*2*delta_size/(src->size+dst->size)); 102if(score <0)return0; 103if(MAX_SCORE < score)return MAX_SCORE; 104return score; 105} 106 107static voidrecord_rename_pair(struct diff_queue_struct *outq, 108struct diff_filespec *src, 109struct diff_filespec *dst, 110int rank, 111int score) 112{ 113/* The rank is used to sort the final output, because there 114 * are certain dependencies. 115 * 116 * - rank #0 depends on deleted ones. 117 * - rank #1 depends on kept files before they are modified. 118 * - rank #2 depends on kept files after they are modified; 119 * currently not used. 120 * 121 * Therefore, the final output order should be: 122 * 123 * 1. rank #0 rename/copy diffs. 124 * 2. deletions in the original. 125 * 3. rank #1 rename/copy diffs. 126 * 4. additions and modifications in the original. 127 * 5. rank #2 rename/copy diffs; currently not used. 128 * 129 * To achieve this sort order, we give xform_work the number 130 * above. 131 */ 132struct diff_filepair *dp =diff_queue(outq, src, dst); 133 dp->xfrm_work = (rank *2+1) | (score<<RENAME_SCORE_SHIFT); 134 dst->xfrm_flags |= RENAME_DST_MATCHED; 135} 136 137#if 0 138static voiddebug_filespec(struct diff_filespec *s,int x,const char*one) 139{ 140fprintf(stderr,"queue[%d]%s(%s)%s %06o%s\n", 141 x, one, 142 s->path, 143 s->file_valid ?"valid":"invalid", 144 s->mode, 145 s->sha1_valid ?sha1_to_hex(s->sha1) :""); 146fprintf(stderr,"queue[%d]%ssize%lu flags%d\n", 147 x, one, 148 s->size, s->xfrm_flags); 149} 150 151static voiddebug_filepair(const struct diff_filepair *p,int i) 152{ 153debug_filespec(p->one, i,"one"); 154debug_filespec(p->two, i,"two"); 155fprintf(stderr,"pair flags%d, orig order%d, score%d\n", 156(p->xfrm_work & ((1<<RENAME_SCORE_SHIFT) -1)), 157 p->orig_order, 158(p->xfrm_work >> RENAME_SCORE_SHIFT)); 159} 160 161static voiddebug_queue(const char*msg,struct diff_queue_struct *q) 162{ 163int i; 164if(msg) 165fprintf(stderr,"%s\n", msg); 166fprintf(stderr,"q->nr =%d\n", q->nr); 167for(i =0; i < q->nr; i++) { 168struct diff_filepair *p = q->queue[i]; 169debug_filepair(p, i); 170} 171} 172#else 173#define debug_queue(a,b) do { ;/*nothing*/ } while(0) 174#endif 175 176/* 177 * We sort the outstanding diff entries according to the rank (see 178 * comment at the beginning of record_rename_pair) and tiebreak with 179 * the order in the original input. 180 */ 181static intrank_compare(const void*a_,const void*b_) 182{ 183const struct diff_filepair *a = *(const struct diff_filepair **)a_; 184const struct diff_filepair *b = *(const struct diff_filepair **)b_; 185int a_rank = a->xfrm_work & ((1<<RENAME_SCORE_SHIFT) -1); 186int b_rank = b->xfrm_work & ((1<<RENAME_SCORE_SHIFT) -1); 187 188if(a_rank != b_rank) 189return a_rank - b_rank; 190return a->orig_order - b->orig_order; 191} 192 193/* 194 * We sort the rename similarity matrix with the score, in descending 195 * order (more similar first). 196 */ 197static intscore_compare(const void*a_,const void*b_) 198{ 199const struct diff_score *a = a_, *b = b_; 200return b->score - a->score; 201} 202 203static intneeds_to_stay(struct diff_queue_struct *q,int i, 204struct diff_filespec *it) 205{ 206/* If it will be used in later entry (either stay or used 207 * as the source of rename/copy), we need to copy, not rename. 208 */ 209while(i < q->nr) { 210struct diff_filepair *p = q->queue[i++]; 211if(!p->two->file_valid) 212continue;/* removed is fine */ 213if(strcmp(p->one->path, it->path)) 214continue;/* not relevant */ 215 216/* p has its src set to *it and it is not a delete; 217 * it will be used for in-place change or rename/copy, 218 * so we cannot rename it out. 219 */ 220return1; 221} 222return0; 223} 224 225voiddiff_detect_rename(struct diff_queue_struct *q, 226int detect_rename, 227int minimum_score) 228{ 229struct diff_queue_struct outq; 230struct diff_rename_pool created, deleted, stay; 231struct diff_rename_pool *(srcs[2]); 232struct diff_score *mx; 233int h, i, j; 234int num_create, num_src, dst_cnt, src_cnt; 235 236 outq.queue = NULL; 237 outq.nr = outq.alloc =0; 238 239diff_rename_pool_clear(&created); 240diff_rename_pool_clear(&deleted); 241diff_rename_pool_clear(&stay); 242 243 srcs[0] = &deleted; 244 srcs[1] = &stay; 245 246for(i =0; i < q->nr; i++) { 247struct diff_filepair *p = q->queue[i]; 248if(!p->one->file_valid) 249if(!p->two->file_valid) 250continue;/* ignore nonsense */ 251else 252diff_rename_pool_add(&created, p->two); 253else if(!p->two->file_valid) 254diff_rename_pool_add(&deleted, p->one); 255else if(1< detect_rename)/* find copy, too */ 256diff_rename_pool_add(&stay, p->one); 257} 258if(created.nr ==0) 259goto cleanup;/* nothing to do */ 260 261/* We really want to cull the candidates list early 262 * with cheap tests in order to avoid doing deltas. 263 * 264 * With the current callers, we should not have already 265 * matched entries at this point, but it is nonetheless 266 * checked for sanity. 267 */ 268for(i =0; i < created.nr; i++) { 269if(created.s[i]->xfrm_flags & RENAME_DST_MATCHED) 270continue;/* we have matched exactly already */ 271for(h =0; h <sizeof(srcs)/sizeof(srcs[0]); h++) { 272struct diff_rename_pool *p = srcs[h]; 273for(j =0; j < p->nr; j++) { 274if(!is_exact_match(p->s[j], created.s[i])) 275continue; 276record_rename_pair(&outq, 277 p->s[j], created.s[i], h, 278 MAX_SCORE); 279break;/* we are done with this entry */ 280} 281} 282} 283debug_queue("done detecting exact", &outq); 284 285/* Have we run out the created file pool? If so we can avoid 286 * doing the delta matrix altogether. 287 */ 288if(outq.nr == created.nr) 289goto flush_rest; 290 291 num_create = (created.nr - outq.nr); 292 num_src = deleted.nr + stay.nr; 293 mx =xmalloc(sizeof(*mx) * num_create * num_src); 294for(dst_cnt = i =0; i < created.nr; i++) { 295int base = dst_cnt * num_src; 296if(created.s[i]->xfrm_flags & RENAME_DST_MATCHED) 297continue;/* dealt with exact match already. */ 298for(src_cnt = h =0; h <sizeof(srcs)/sizeof(srcs[0]); h++) { 299struct diff_rename_pool *p = srcs[h]; 300for(j =0; j < p->nr; j++, src_cnt++) { 301struct diff_score *m = &mx[base + src_cnt]; 302 m->src = p->s[j]; 303 m->dst = created.s[i]; 304 m->score =estimate_similarity(m->src, m->dst, 305 minimum_score); 306 m->rank = h; 307} 308} 309 dst_cnt++; 310} 311/* cost matrix sorted by most to least similar pair */ 312qsort(mx, num_create * num_src,sizeof(*mx), score_compare); 313for(i =0; i < num_create * num_src; i++) { 314if(mx[i].dst->xfrm_flags & RENAME_DST_MATCHED) 315continue;/* alreayd done, either exact or fuzzy. */ 316if(mx[i].score < minimum_score) 317continue; 318record_rename_pair(&outq, 319 mx[i].src, mx[i].dst, mx[i].rank, 320 mx[i].score); 321} 322free(mx); 323debug_queue("done detecting fuzzy", &outq); 324 325 flush_rest: 326/* At this point, we have found some renames and copies and they 327 * are kept in outq. The original list is still in *q. 328 * 329 * Scan the original list and move them into the outq; we will sort 330 * outq and swap it into the queue supplied to pass that to 331 * downstream, so we assign the sort keys in this loop. 332 * 333 * See comments at the top of record_rename_pair for numbers used 334 * to assign xfrm_work. 335 * 336 * Note that we have not annotated the diff_filepair with any comment 337 * so there is nothing other than p to free. 338 */ 339for(i =0; i < q->nr; i++) { 340struct diff_filepair *dp, *p = q->queue[i]; 341if(!p->one->file_valid) { 342if(p->two->file_valid) { 343/* creation */ 344 dp =diff_queue(&outq, p->one, p->two); 345 dp->xfrm_work =4; 346} 347/* otherwise it is a nonsense; just ignore it */ 348} 349else if(!p->two->file_valid) { 350/* deletion */ 351 dp =diff_queue(&outq, p->one, p->two); 352 dp->xfrm_work =2; 353} 354else{ 355/* modification, or stay as is */ 356 dp =diff_queue(&outq, p->one, p->two); 357 dp->xfrm_work =4; 358} 359free(p); 360} 361debug_queue("done copying original", &outq); 362 363/* Sort outq */ 364qsort(outq.queue, outq.nr,sizeof(outq.queue[0]), rank_compare); 365 366debug_queue("done sorting", &outq); 367 368free(q->queue); 369 q->nr = q->alloc =0; 370 q->queue = NULL; 371 372/* Copy it out to q, removing duplicates. */ 373for(i =0; i < outq.nr; i++) { 374struct diff_filepair *p = outq.queue[i]; 375if(!p->one->file_valid) { 376/* created */ 377if(p->two->xfrm_flags & RENAME_DST_MATCHED) 378;/* rename/copy created it already */ 379else 380diff_queue(q, p->one, p->two); 381} 382else if(!p->two->file_valid) { 383/* deleted */ 384if(p->one->xfrm_flags & RENAME_SRC_GONE) 385;/* rename/copy deleted it already */ 386else 387diff_queue(q, p->one, p->two); 388} 389else if(strcmp(p->one->path, p->two->path)) { 390/* rename or copy */ 391struct diff_filepair *dp = 392diff_queue(q, p->one, p->two); 393int msglen = (strlen(p->one->path) + 394strlen(p->two->path) +100); 395int score = (p->xfrm_work >> RENAME_SCORE_SHIFT); 396 dp->xfrm_msg =xmalloc(msglen); 397 398/* if we have a later entry that is a rename/copy 399 * that depends on p->one, then we copy here. 400 * otherwise we rename it. 401 */ 402if(needs_to_stay(&outq, i+1, p->one)) { 403/* copy it */ 404sprintf(dp->xfrm_msg, 405"similarity index%d%%\n" 406"copy from%s\n" 407"copy to%s\n", 408(int)(0.5+ score *100/ MAX_SCORE), 409 p->one->path, p->two->path); 410} 411else{ 412/* rename it, and mark it as gone. */ 413 p->one->xfrm_flags |= RENAME_SRC_GONE; 414sprintf(dp->xfrm_msg, 415"similarity index%d%%\n" 416"rename old%s\n" 417"rename new%s\n", 418(int)(0.5+ score *100/ MAX_SCORE), 419 p->one->path, p->two->path); 420} 421} 422else 423/* otherwise it is a modified (or stayed) entry */ 424diff_queue(q, p->one, p->two); 425free(p); 426} 427 428free(outq.queue); 429debug_queue("done collapsing", q); 430 431 cleanup: 432free(created.s); 433free(deleted.s); 434free(stay.s); 435return; 436}