1/*2* Copyright (C) 2005 Junio C Hamano3*/4#include "cache.h"5#include "diff.h"6#include "diffcore.h"7#include "delta.h"8#include "count-delta.h"910/* Table of rename/copy destinations */1112static struct diff_rename_dst {13struct diff_filespec *two;14struct diff_filepair *pair;15} *rename_dst;16static int rename_dst_nr, rename_dst_alloc;1718static struct diff_rename_dst *locate_rename_dst(struct diff_filespec *two,19int insert_ok)20{21int first, last;2223first = 0;24last = rename_dst_nr;25while (last > first) {26int next = (last + first) >> 1;27struct diff_rename_dst *dst = &(rename_dst[next]);28int cmp = strcmp(two->path, dst->two->path);29if (!cmp)30return dst;31if (cmp < 0) {32last = next;33continue;34}35first = next+1;36}37/* not found */38if (!insert_ok)39return NULL;40/* insert to make it at "first" */41if (rename_dst_alloc <= rename_dst_nr) {42rename_dst_alloc = alloc_nr(rename_dst_alloc);43rename_dst = xrealloc(rename_dst,44rename_dst_alloc * sizeof(*rename_dst));45}46rename_dst_nr++;47if (first < rename_dst_nr)48memmove(rename_dst + first + 1, rename_dst + first,49(rename_dst_nr - first - 1) * sizeof(*rename_dst));50rename_dst[first].two = alloc_filespec(two->path);51fill_filespec(rename_dst[first].two, two->sha1, two->mode);52rename_dst[first].pair = NULL;53return &(rename_dst[first]);54}5556/* Table of rename/copy src files */57static struct diff_rename_src {58struct diff_filespec *one;59unsigned src_path_left : 1;60} *rename_src;61static int rename_src_nr, rename_src_alloc;6263static struct diff_rename_src *register_rename_src(struct diff_filespec *one,64int src_path_left)65{66int first, last;6768first = 0;69last = rename_src_nr;70while (last > first) {71int next = (last + first) >> 1;72struct diff_rename_src *src = &(rename_src[next]);73int cmp = strcmp(one->path, src->one->path);74if (!cmp)75return src;76if (cmp < 0) {77last = next;78continue;79}80first = next+1;81}8283/* insert to make it at "first" */84if (rename_src_alloc <= rename_src_nr) {85rename_src_alloc = alloc_nr(rename_src_alloc);86rename_src = xrealloc(rename_src,87rename_src_alloc * sizeof(*rename_src));88}89rename_src_nr++;90if (first < rename_src_nr)91memmove(rename_src + first + 1, rename_src + first,92(rename_src_nr - first - 1) * sizeof(*rename_src));93rename_src[first].one = one;94rename_src[first].src_path_left = src_path_left;95return &(rename_src[first]);96}9798static int is_exact_match(struct diff_filespec *src, struct diff_filespec *dst)99{100if (src->sha1_valid && dst->sha1_valid &&101!memcmp(src->sha1, dst->sha1, 20))102return 1;103if (diff_populate_filespec(src, 1) || diff_populate_filespec(dst, 1))104return 0;105if (src->size != dst->size)106return 0;107if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0))108return 0;109if (src->size == dst->size &&110!memcmp(src->data, dst->data, src->size))111return 1;112return 0;113}114115struct diff_score {116int src; /* index in rename_src */117int dst; /* index in rename_dst */118int score;119};120121static int estimate_similarity(struct diff_filespec *src,122struct diff_filespec *dst,123int minimum_score)124{125/* src points at a file that existed in the original tree (or126* optionally a file in the destination tree) and dst points127* at a newly created file. They may be quite similar, in which128* case we want to say src is renamed to dst or src is copied into129* dst, and then some edit has been applied to dst.130*131* Compare them and return how similar they are, representing132* the score as an integer between 0 and MAX_SCORE.133*134* When there is an exact match, it is considered a better135* match than anything else; the destination does not even136* call into this function in that case.137*/138void *delta;139unsigned long delta_size, base_size, src_copied, literal_added;140unsigned long delta_limit;141int score;142143/* We deal only with regular files. Symlink renames are handled144* only when they are exact matches --- in other words, no edits145* after renaming.146*/147if (!S_ISREG(src->mode) || !S_ISREG(dst->mode))148return 0;149150delta_size = ((src->size < dst->size) ?151(dst->size - src->size) : (src->size - dst->size));152base_size = ((src->size < dst->size) ? src->size : dst->size);153154/* We would not consider edits that change the file size so155* drastically. delta_size must be smaller than156* (MAX_SCORE-minimum_score)/MAX_SCORE * min(src->size, dst->size).157*158* Note that base_size == 0 case is handled here already159* and the final score computation below would not have a160* divide-by-zero issue.161*/162if (base_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)163return 0;164165if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0))166return 0; /* error but caught downstream */167168delta_limit = base_size * (MAX_SCORE-minimum_score) / MAX_SCORE;169delta = diff_delta(src->data, src->size,170dst->data, dst->size,171&delta_size, delta_limit);172if (!delta)173/* If delta_limit is exceeded, we have too much differences */174return 0;175176/* A delta that has a lot of literal additions would have177* big delta_size no matter what else it does.178*/179if (base_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)180return 0;181182/* Estimate the edit size by interpreting delta. */183if (count_delta(delta, delta_size, &src_copied, &literal_added)) {184free(delta);185return 0;186}187free(delta);188189/* Extent of damage */190if (src->size + literal_added < src_copied)191delta_size = 0;192else193delta_size = (src->size - src_copied) + literal_added;194195/*196* Now we will give some score to it. 100% edit gets 0 points197* and 0% edit gets MAX_SCORE points.198*/199score = MAX_SCORE - (MAX_SCORE * delta_size / base_size);200if (score < 0) return 0;201if (MAX_SCORE < score) return MAX_SCORE;202return score;203}204205static void record_rename_pair(int dst_index, int src_index, int score)206{207struct diff_filespec *one, *two, *src, *dst;208struct diff_filepair *dp;209210if (rename_dst[dst_index].pair)211die("internal error: dst already matched.");212213src = rename_src[src_index].one;214one = alloc_filespec(src->path);215fill_filespec(one, src->sha1, src->mode);216217dst = rename_dst[dst_index].two;218two = alloc_filespec(dst->path);219fill_filespec(two, dst->sha1, dst->mode);220221dp = diff_queue(NULL, one, two);222dp->score = score;223dp->source_stays = rename_src[src_index].src_path_left;224rename_dst[dst_index].pair = dp;225}226227/*228* We sort the rename similarity matrix with the score, in descending229* order (the most similar first).230*/231static int score_compare(const void *a_, const void *b_)232{233const struct diff_score *a = a_, *b = b_;234return b->score - a->score;235}236237static int compute_stays(struct diff_queue_struct *q,238struct diff_filespec *one)239{240int i;241for (i = 0; i < q->nr; i++) {242struct diff_filepair *p = q->queue[i];243if (strcmp(one->path, p->two->path))244continue;245if (DIFF_PAIR_RENAME(p)) {246return 0; /* something else is renamed into this */247}248}249return 1;250}251252void diffcore_rename(struct diff_options *options)253{254int detect_rename = options->detect_rename;255int minimum_score = options->rename_score;256int rename_limit = options->rename_limit;257struct diff_queue_struct *q = &diff_queued_diff;258struct diff_queue_struct outq;259struct diff_score *mx;260int i, j, rename_count;261int num_create, num_src, dst_cnt;262263if (!minimum_score)264minimum_score = DEFAULT_RENAME_SCORE;265rename_count = 0;266267for (i = 0; i < q->nr; i++) {268struct diff_filepair *p = q->queue[i];269if (!DIFF_FILE_VALID(p->one))270if (!DIFF_FILE_VALID(p->two))271continue; /* unmerged */272else273locate_rename_dst(p->two, 1);274else if (!DIFF_FILE_VALID(p->two)) {275/* If the source is a broken "delete", and276* they did not really want to get broken,277* that means the source actually stays.278*/279int stays = (p->broken_pair && !p->score);280register_rename_src(p->one, stays);281}282else if (detect_rename == DIFF_DETECT_COPY)283register_rename_src(p->one, 1);284}285if (rename_dst_nr == 0 ||286(0 < rename_limit && rename_limit < rename_dst_nr))287goto cleanup; /* nothing to do */288289/* We really want to cull the candidates list early290* with cheap tests in order to avoid doing deltas.291*/292for (i = 0; i < rename_dst_nr; i++) {293struct diff_filespec *two = rename_dst[i].two;294for (j = 0; j < rename_src_nr; j++) {295struct diff_filespec *one = rename_src[j].one;296if (!is_exact_match(one, two))297continue;298record_rename_pair(i, j, MAX_SCORE);299rename_count++;300break; /* we are done with this entry */301}302}303304/* Have we run out the created file pool? If so we can avoid305* doing the delta matrix altogether.306*/307if (rename_count == rename_dst_nr)308goto cleanup;309310num_create = (rename_dst_nr - rename_count);311num_src = rename_src_nr;312mx = xmalloc(sizeof(*mx) * num_create * num_src);313for (dst_cnt = i = 0; i < rename_dst_nr; i++) {314int base = dst_cnt * num_src;315struct diff_filespec *two = rename_dst[i].two;316if (rename_dst[i].pair)317continue; /* dealt with exact match already. */318for (j = 0; j < rename_src_nr; j++) {319struct diff_filespec *one = rename_src[j].one;320struct diff_score *m = &mx[base+j];321m->src = j;322m->dst = i;323m->score = estimate_similarity(one, two,324minimum_score);325}326dst_cnt++;327}328/* cost matrix sorted by most to least similar pair */329qsort(mx, num_create * num_src, sizeof(*mx), score_compare);330for (i = 0; i < num_create * num_src; i++) {331struct diff_rename_dst *dst = &rename_dst[mx[i].dst];332if (dst->pair)333continue; /* already done, either exact or fuzzy. */334if (mx[i].score < minimum_score)335break; /* there is no more usable pair. */336record_rename_pair(mx[i].dst, mx[i].src, mx[i].score);337rename_count++;338}339free(mx);340341cleanup:342/* At this point, we have found some renames and copies and they343* are recorded in rename_dst. The original list is still in *q.344*/345outq.queue = NULL;346outq.nr = outq.alloc = 0;347for (i = 0; i < q->nr; i++) {348struct diff_filepair *p = q->queue[i];349struct diff_filepair *pair_to_free = NULL;350351if (!DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) {352/*353* Creation354*355* We would output this create record if it has356* not been turned into a rename/copy already.357*/358struct diff_rename_dst *dst =359locate_rename_dst(p->two, 0);360if (dst && dst->pair) {361diff_q(&outq, dst->pair);362pair_to_free = p;363}364else365/* no matching rename/copy source, so366* record this as a creation.367*/368diff_q(&outq, p);369}370else if (DIFF_FILE_VALID(p->one) && !DIFF_FILE_VALID(p->two)) {371/*372* Deletion373*374* We would output this delete record if:375*376* (1) this is a broken delete and the counterpart377* broken create remains in the output; or378* (2) this is not a broken delete, and rename_dst379* does not have a rename/copy to move p->one->path380* out of existence.381*382* Otherwise, the counterpart broken create383* has been turned into a rename-edit; or384* delete did not have a matching create to385* begin with.386*/387if (DIFF_PAIR_BROKEN(p)) {388/* broken delete */389struct diff_rename_dst *dst =390locate_rename_dst(p->one, 0);391if (dst && dst->pair)392/* counterpart is now rename/copy */393pair_to_free = p;394}395else {396for (j = 0; j < rename_dst_nr; j++) {397if (!rename_dst[j].pair)398continue;399if (strcmp(rename_dst[j].pair->400one->path,401p->one->path))402continue;403break;404}405if (j < rename_dst_nr)406/* this path remains */407pair_to_free = p;408}409410if (pair_to_free)411;412else413diff_q(&outq, p);414}415else if (!diff_unmodified_pair(p))416/* all the usual ones need to be kept */417diff_q(&outq, p);418else419/* no need to keep unmodified pairs */420pair_to_free = p;421422if (pair_to_free)423diff_free_filepair(pair_to_free);424}425diff_debug_queue("done copying original", &outq);426427free(q->queue);428*q = outq;429diff_debug_queue("done collapsing", q);430431/* We need to see which rename source really stays here;432* earlier we only checked if the path is left in the result,433* but even if a path remains in the result, if that is coming434* from copying something else on top of it, then the original435* source is lost and does not stay.436*/437for (i = 0; i < q->nr; i++) {438struct diff_filepair *p = q->queue[i];439if (DIFF_PAIR_RENAME(p) && p->source_stays) {440/* If one appears as the target of a rename-copy,441* then mark p->source_stays = 0; otherwise442* leave it as is.443*/444p->source_stays = compute_stays(q, p->one);445}446}447448for (i = 0; i < rename_dst_nr; i++) {449diff_free_filespec_data(rename_dst[i].two);450free(rename_dst[i].two);451}452453free(rename_dst);454rename_dst = NULL;455rename_dst_nr = rename_dst_alloc = 0;456free(rename_src);457rename_src = NULL;458rename_src_nr = rename_src_alloc = 0;459return;460}