1/*2* Copyright (C) 2005 Junio C Hamano3*/4#include "cache.h"5#include "diff.h"6#include "diffcore.h"7#include "delta.h"8#include "count-delta.h"910/* Table of rename/copy destinations */1112static struct diff_rename_dst {13struct diff_filespec *two;14struct diff_filepair *pair;15} *rename_dst;16static int rename_dst_nr, rename_dst_alloc;1718static struct diff_rename_dst *locate_rename_dst(struct diff_filespec *two,19int insert_ok)20{21int first, last;2223first = 0;24last = rename_dst_nr;25while (last > first) {26int next = (last + first) >> 1;27struct diff_rename_dst *dst = &(rename_dst[next]);28int cmp = strcmp(two->path, dst->two->path);29if (!cmp)30return dst;31if (cmp < 0) {32last = next;33continue;34}35first = next+1;36}37/* not found */38if (!insert_ok)39return NULL;40/* insert to make it at "first" */41if (rename_dst_alloc <= rename_dst_nr) {42rename_dst_alloc = alloc_nr(rename_dst_alloc);43rename_dst = xrealloc(rename_dst,44rename_dst_alloc * sizeof(*rename_dst));45}46rename_dst_nr++;47if (first < rename_dst_nr)48memmove(rename_dst + first + 1, rename_dst + first,49(rename_dst_nr - first - 1) * sizeof(*rename_dst));50rename_dst[first].two = two;51rename_dst[first].pair = NULL;52return &(rename_dst[first]);53}5455/* Table of rename/copy src files */56static struct diff_rename_src {57struct diff_filespec *one;58unsigned src_stays : 1;59} *rename_src;60static int rename_src_nr, rename_src_alloc;6162static struct diff_rename_src *register_rename_src(struct diff_filespec *one,63int src_stays)64{65int first, last;6667first = 0;68last = rename_src_nr;69while (last > first) {70int next = (last + first) >> 1;71struct diff_rename_src *src = &(rename_src[next]);72int cmp = strcmp(one->path, src->one->path);73if (!cmp)74return src;75if (cmp < 0) {76last = next;77continue;78}79first = next+1;80}8182/* insert to make it at "first" */83if (rename_src_alloc <= rename_src_nr) {84rename_src_alloc = alloc_nr(rename_src_alloc);85rename_src = xrealloc(rename_src,86rename_src_alloc * sizeof(*rename_src));87}88rename_src_nr++;89if (first < rename_src_nr)90memmove(rename_src + first + 1, rename_src + first,91(rename_src_nr - first - 1) * sizeof(*rename_src));92rename_src[first].one = one;93rename_src[first].src_stays = src_stays;94return &(rename_src[first]);95}9697static int is_exact_match(struct diff_filespec *src, struct diff_filespec *dst)98{99if (src->sha1_valid && dst->sha1_valid &&100!memcmp(src->sha1, dst->sha1, 20))101return 1;102if (diff_populate_filespec(src) || diff_populate_filespec(dst))103/* this is an error but will be caught downstream */104return 0;105if (src->size == dst->size &&106!memcmp(src->data, dst->data, src->size))107return 1;108return 0;109}110111struct diff_score {112int src; /* index in rename_src */113int dst; /* index in rename_dst */114int score;115};116117static int estimate_similarity(struct diff_filespec *src,118struct diff_filespec *dst,119int minimum_score)120{121/* src points at a file that existed in the original tree (or122* optionally a file in the destination tree) and dst points123* at a newly created file. They may be quite similar, in which124* case we want to say src is renamed to dst or src is copied into125* dst, and then some edit has been applied to dst.126*127* Compare them and return how similar they are, representing128* the score as an integer between 0 and 10000, except129* where they match exactly it is considered better than anything130* else.131*/132void *delta;133unsigned long delta_size, base_size;134int score;135136/* We deal only with regular files. Symlink renames are handled137* only when they are exact matches --- in other words, no edits138* after renaming.139*/140if (!S_ISREG(src->mode) || !S_ISREG(dst->mode))141return 0;142143delta_size = ((src->size < dst->size) ?144(dst->size - src->size) : (src->size - dst->size));145base_size = ((src->size < dst->size) ? src->size : dst->size);146147/* We would not consider edits that change the file size so148* drastically. delta_size must be smaller than149* (MAX_SCORE-minimum_score)/MAX_SCORE * min(src->size, dst->size).150* Note that base_size == 0 case is handled here already151* and the final score computation below would not have a152* divide-by-zero issue.153*/154if (base_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)155return 0;156157delta = diff_delta(src->data, src->size,158dst->data, dst->size,159&delta_size);160161/* A delta that has a lot of literal additions would have162* big delta_size no matter what else it does.163*/164if (base_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)165return 0;166167/* Estimate the edit size by interpreting delta. */168delta_size = count_delta(delta, delta_size);169free(delta);170if (delta_size == UINT_MAX)171return 0;172173/*174* Now we will give some score to it. 100% edit gets 0 points175* and 0% edit gets MAX_SCORE points.176*/177score = MAX_SCORE - (MAX_SCORE * delta_size / base_size);178if (score < 0) return 0;179if (MAX_SCORE < score) return MAX_SCORE;180return score;181}182183static void record_rename_pair(struct diff_queue_struct *renq,184int dst_index, int src_index, int score)185{186struct diff_filespec *one, *two, *src, *dst;187struct diff_filepair *dp;188189if (rename_dst[dst_index].pair)190die("internal error: dst already matched.");191192src = rename_src[src_index].one;193one = alloc_filespec(src->path);194fill_filespec(one, src->sha1, src->mode);195196dst = rename_dst[dst_index].two;197two = alloc_filespec(dst->path);198fill_filespec(two, dst->sha1, dst->mode);199200dp = diff_queue(renq, one, two);201dp->score = score ? : 1; /* make sure it is at least 1 */202dp->source_stays = rename_src[src_index].src_stays;203rename_dst[dst_index].pair = dp;204}205206/*207* We sort the rename similarity matrix with the score, in descending208* order (the most similar first).209*/210static int score_compare(const void *a_, const void *b_)211{212const struct diff_score *a = a_, *b = b_;213return b->score - a->score;214}215216int diff_scoreopt_parse(const char *opt)217{218int diglen, num, scale, i;219if (opt[0] != '-' || (opt[1] != 'M' && opt[1] != 'C'))220return -1; /* that is not a -M nor -C option */221diglen = strspn(opt+2, "0123456789");222if (diglen == 0 || strlen(opt+2) != diglen)223return 0; /* use default */224sscanf(opt+2, "%d", &num);225for (i = 0, scale = 1; i < diglen; i++)226scale *= 10;227228/* user says num divided by scale and we say internally that229* is MAX_SCORE * num / scale.230*/231return MAX_SCORE * num / scale;232}233234void diffcore_rename(int detect_rename, int minimum_score)235{236struct diff_queue_struct *q = &diff_queued_diff;237struct diff_queue_struct renq, outq;238struct diff_score *mx;239int i, j;240int num_create, num_src, dst_cnt;241242if (!minimum_score)243minimum_score = DEFAULT_MINIMUM_SCORE;244renq.queue = NULL;245renq.nr = renq.alloc = 0;246247for (i = 0; i < q->nr; i++) {248struct diff_filepair *p = q->queue[i];249if (!DIFF_FILE_VALID(p->one))250if (!DIFF_FILE_VALID(p->two))251continue; /* unmerged */252else253locate_rename_dst(p->two, 1);254else if (!DIFF_FILE_VALID(p->two))255register_rename_src(p->one, 0);256else if (detect_rename == DIFF_DETECT_COPY)257register_rename_src(p->one, 1);258}259if (rename_dst_nr == 0)260goto cleanup; /* nothing to do */261262/* We really want to cull the candidates list early263* with cheap tests in order to avoid doing deltas.264*/265for (i = 0; i < rename_dst_nr; i++) {266struct diff_filespec *two = rename_dst[i].two;267for (j = 0; j < rename_src_nr; j++) {268struct diff_filespec *one = rename_src[j].one;269if (!is_exact_match(one, two))270continue;271record_rename_pair(&renq, i, j, MAX_SCORE);272break; /* we are done with this entry */273}274}275diff_debug_queue("done detecting exact", &renq);276277/* Have we run out the created file pool? If so we can avoid278* doing the delta matrix altogether.279*/280if (renq.nr == rename_dst_nr)281goto cleanup;282283num_create = (rename_dst_nr - renq.nr);284num_src = rename_src_nr;285mx = xmalloc(sizeof(*mx) * num_create * num_src);286for (dst_cnt = i = 0; i < rename_dst_nr; i++) {287int base = dst_cnt * num_src;288struct diff_filespec *two = rename_dst[i].two;289if (rename_dst[i].pair)290continue; /* dealt with exact match already. */291for (j = 0; j < rename_src_nr; j++) {292struct diff_filespec *one = rename_src[j].one;293struct diff_score *m = &mx[base+j];294m->src = j;295m->dst = i;296m->score = estimate_similarity(one, two,297minimum_score);298}299dst_cnt++;300}301/* cost matrix sorted by most to least similar pair */302qsort(mx, num_create * num_src, sizeof(*mx), score_compare);303for (i = 0; i < num_create * num_src; i++) {304struct diff_rename_dst *dst = &rename_dst[mx[i].dst];305if (dst->pair)306continue; /* already done, either exact or fuzzy. */307if (mx[i].score < minimum_score)308break; /* there is no more usable pair. */309record_rename_pair(&renq, mx[i].dst, mx[i].src, mx[i].score);310}311free(mx);312diff_debug_queue("done detecting fuzzy", &renq);313314cleanup:315/* At this point, we have found some renames and copies and they316* are kept in renq. The original list is still in *q.317*/318outq.queue = NULL;319outq.nr = outq.alloc = 0;320for (i = 0; i < q->nr; i++) {321struct diff_filepair *p = q->queue[i];322struct diff_rename_dst *dst = locate_rename_dst(p->two, 0);323struct diff_filepair *pair_to_free = NULL;324325if (dst) {326/* creation */327if (dst->pair) {328/* renq has rename/copy to produce329* this file already, so we do not330* emit the creation record in the331* output.332*/333diff_q(&outq, dst->pair);334pair_to_free = p;335}336else337/* no matching rename/copy source, so record338* this as a creation.339*/340diff_q(&outq, p);341}342else if (!diff_unmodified_pair(p))343/* all the usual ones need to be kept */344diff_q(&outq, p);345else346/* no need to keep unmodified pairs */347pair_to_free = p;348349if (pair_to_free)350diff_free_filepair(pair_to_free);351}352diff_debug_queue("done copying original", &outq);353354free(renq.queue);355free(q->queue);356*q = outq;357diff_debug_queue("done collapsing", q);358359free(rename_dst);360rename_dst = NULL;361rename_dst_nr = rename_dst_alloc = 0;362free(rename_src);363rename_src = NULL;364rename_src_nr = rename_src_alloc = 0;365return;366}