1/*2* Copyright (C) 2005 Junio C Hamano3*/4#include "cache.h"5#include "diff.h"6#include "diffcore.h"7#include "delta.h"8#include "count-delta.h"910/* Table of rename/copy destinations */1112static struct diff_rename_dst {13struct diff_filespec *two;14struct diff_filepair *pair;15} *rename_dst;16static int rename_dst_nr, rename_dst_alloc;1718static struct diff_rename_dst *locate_rename_dst(struct diff_filespec *two,19int insert_ok)20{21int first, last;2223first = 0;24last = rename_dst_nr;25while (last > first) {26int next = (last + first) >> 1;27struct diff_rename_dst *dst = &(rename_dst[next]);28int cmp = strcmp(two->path, dst->two->path);29if (!cmp)30return dst;31if (cmp < 0) {32last = next;33continue;34}35first = next+1;36}37/* not found */38if (!insert_ok)39return NULL;40/* insert to make it at "first" */41if (rename_dst_alloc <= rename_dst_nr) {42rename_dst_alloc = alloc_nr(rename_dst_alloc);43rename_dst = xrealloc(rename_dst,44rename_dst_alloc * sizeof(*rename_dst));45}46rename_dst_nr++;47if (first < rename_dst_nr)48memmove(rename_dst + first + 1, rename_dst + first,49(rename_dst_nr - first - 1) * sizeof(*rename_dst));50rename_dst[first].two = alloc_filespec(two->path);51fill_filespec(rename_dst[first].two, two->sha1, two->mode);52rename_dst[first].pair = NULL;53return &(rename_dst[first]);54}5556/* Table of rename/copy src files */57static struct diff_rename_src {58struct diff_filespec *one;59unsigned src_path_left : 1;60} *rename_src;61static int rename_src_nr, rename_src_alloc;6263static struct diff_rename_src *register_rename_src(struct diff_filespec *one,64int src_path_left)65{66int first, last;6768first = 0;69last = rename_src_nr;70while (last > first) {71int next = (last + first) >> 1;72struct diff_rename_src *src = &(rename_src[next]);73int cmp = strcmp(one->path, src->one->path);74if (!cmp)75return src;76if (cmp < 0) {77last = next;78continue;79}80first = next+1;81}8283/* insert to make it at "first" */84if (rename_src_alloc <= rename_src_nr) {85rename_src_alloc = alloc_nr(rename_src_alloc);86rename_src = xrealloc(rename_src,87rename_src_alloc * sizeof(*rename_src));88}89rename_src_nr++;90if (first < rename_src_nr)91memmove(rename_src + first + 1, rename_src + first,92(rename_src_nr - first - 1) * sizeof(*rename_src));93rename_src[first].one = one;94rename_src[first].src_path_left = src_path_left;95return &(rename_src[first]);96}9798static int is_exact_match(struct diff_filespec *src, struct diff_filespec *dst)99{100if (src->sha1_valid && dst->sha1_valid &&101!memcmp(src->sha1, dst->sha1, 20))102return 1;103if (diff_populate_filespec(src, 1) || diff_populate_filespec(dst, 1))104return 0;105if (src->size != dst->size)106return 0;107if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0))108return 0;109if (src->size == dst->size &&110!memcmp(src->data, dst->data, src->size))111return 1;112return 0;113}114115struct diff_score {116int src; /* index in rename_src */117int dst; /* index in rename_dst */118int score;119};120121static int estimate_similarity(struct diff_filespec *src,122struct diff_filespec *dst,123int minimum_score)124{125/* src points at a file that existed in the original tree (or126* optionally a file in the destination tree) and dst points127* at a newly created file. They may be quite similar, in which128* case we want to say src is renamed to dst or src is copied into129* dst, and then some edit has been applied to dst.130*131* Compare them and return how similar they are, representing132* the score as an integer between 0 and MAX_SCORE.133*134* When there is an exact match, it is considered a better135* match than anything else; the destination does not even136* call into this function in that case.137*/138void *delta;139unsigned long delta_size, base_size, src_copied, literal_added;140unsigned long delta_limit;141int score;142143/* We deal only with regular files. Symlink renames are handled144* only when they are exact matches --- in other words, no edits145* after renaming.146*/147if (!S_ISREG(src->mode) || !S_ISREG(dst->mode))148return 0;149150delta_size = ((src->size < dst->size) ?151(dst->size - src->size) : (src->size - dst->size));152base_size = ((src->size < dst->size) ? src->size : dst->size);153154/* We would not consider edits that change the file size so155* drastically. delta_size must be smaller than156* (MAX_SCORE-minimum_score)/MAX_SCORE * min(src->size, dst->size).157*158* Note that base_size == 0 case is handled here already159* and the final score computation below would not have a160* divide-by-zero issue.161*/162if (base_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)163return 0;164165if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0))166return 0; /* error but caught downstream */167168delta_limit = base_size * (MAX_SCORE-minimum_score) / MAX_SCORE;169delta = diff_delta(src->data, src->size,170dst->data, dst->size,171&delta_size, delta_limit);172if (!delta)173/* If delta_limit is exceeded, we have too much differences */174return 0;175176/* A delta that has a lot of literal additions would have177* big delta_size no matter what else it does.178*/179if (base_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)180return 0;181182/* Estimate the edit size by interpreting delta. */183if (count_delta(delta, delta_size, &src_copied, &literal_added)) {184free(delta);185return 0;186}187free(delta);188189/* Extent of damage */190if (src->size + literal_added < src_copied)191delta_size = 0;192else193delta_size = (src->size - src_copied) + literal_added;194195/*196* Now we will give some score to it. 100% edit gets 0 points197* and 0% edit gets MAX_SCORE points.198*/199score = MAX_SCORE - (MAX_SCORE * delta_size / base_size);200if (score < 0) return 0;201if (MAX_SCORE < score) return MAX_SCORE;202return score;203}204205static void record_rename_pair(int dst_index, int src_index, int score)206{207struct diff_filespec *one, *two, *src, *dst;208struct diff_filepair *dp;209210if (rename_dst[dst_index].pair)211die("internal error: dst already matched.");212213src = rename_src[src_index].one;214one = alloc_filespec(src->path);215fill_filespec(one, src->sha1, src->mode);216217dst = rename_dst[dst_index].two;218two = alloc_filespec(dst->path);219fill_filespec(two, dst->sha1, dst->mode);220221dp = diff_queue(NULL, one, two);222dp->score = score;223dp->source_stays = rename_src[src_index].src_path_left;224rename_dst[dst_index].pair = dp;225}226227/*228* We sort the rename similarity matrix with the score, in descending229* order (the most similar first).230*/231static int score_compare(const void *a_, const void *b_)232{233const struct diff_score *a = a_, *b = b_;234return b->score - a->score;235}236237static int compute_stays(struct diff_queue_struct *q,238struct diff_filespec *one)239{240int i;241for (i = 0; i < q->nr; i++) {242struct diff_filepair *p = q->queue[i];243if (strcmp(one->path, p->two->path))244continue;245if (DIFF_PAIR_RENAME(p)) {246return 0; /* something else is renamed into this */247}248}249return 1;250}251252void diffcore_rename(struct diff_options *options)253{254int detect_rename = options->detect_rename;255int minimum_score = options->rename_score;256int rename_limit = options->rename_limit;257struct diff_queue_struct *q = &diff_queued_diff;258struct diff_queue_struct outq;259struct diff_score *mx;260int i, j, rename_count;261int num_create, num_src, dst_cnt;262263if (!minimum_score)264minimum_score = DEFAULT_RENAME_SCORE;265rename_count = 0;266267for (i = 0; i < q->nr; i++) {268struct diff_filepair *p = q->queue[i];269if (!DIFF_FILE_VALID(p->one))270if (!DIFF_FILE_VALID(p->two))271continue; /* unmerged */272else273locate_rename_dst(p->two, 1);274else if (!DIFF_FILE_VALID(p->two)) {275/* If the source is a broken "delete", and276* they did not really want to get broken,277* that means the source actually stays.278*/279int stays = (p->broken_pair && !p->score);280register_rename_src(p->one, stays);281}282else if (detect_rename == DIFF_DETECT_COPY)283register_rename_src(p->one, 1);284}285if (rename_dst_nr == 0 ||286(0 < rename_limit && rename_limit < rename_dst_nr))287goto cleanup; /* nothing to do */288289/* We really want to cull the candidates list early290* with cheap tests in order to avoid doing deltas.291*/292for (i = 0; i < rename_dst_nr; i++) {293struct diff_filespec *two = rename_dst[i].two;294for (j = 0; j < rename_src_nr; j++) {295struct diff_filespec *one = rename_src[j].one;296if (!is_exact_match(one, two))297continue;298record_rename_pair(i, j, MAX_SCORE);299rename_count++;300break; /* we are done with this entry */301}302}303304/* Have we run out the created file pool? If so we can avoid305* doing the delta matrix altogether.306*/307if (rename_count == rename_dst_nr)308goto cleanup;309310if (minimum_score == MAX_SCORE)311goto cleanup;312313num_create = (rename_dst_nr - rename_count);314num_src = rename_src_nr;315mx = xmalloc(sizeof(*mx) * num_create * num_src);316for (dst_cnt = i = 0; i < rename_dst_nr; i++) {317int base = dst_cnt * num_src;318struct diff_filespec *two = rename_dst[i].two;319if (rename_dst[i].pair)320continue; /* dealt with exact match already. */321for (j = 0; j < rename_src_nr; j++) {322struct diff_filespec *one = rename_src[j].one;323struct diff_score *m = &mx[base+j];324m->src = j;325m->dst = i;326m->score = estimate_similarity(one, two,327minimum_score);328}329dst_cnt++;330}331/* cost matrix sorted by most to least similar pair */332qsort(mx, num_create * num_src, sizeof(*mx), score_compare);333for (i = 0; i < num_create * num_src; i++) {334struct diff_rename_dst *dst = &rename_dst[mx[i].dst];335if (dst->pair)336continue; /* already done, either exact or fuzzy. */337if (mx[i].score < minimum_score)338break; /* there is no more usable pair. */339record_rename_pair(mx[i].dst, mx[i].src, mx[i].score);340rename_count++;341}342free(mx);343344cleanup:345/* At this point, we have found some renames and copies and they346* are recorded in rename_dst. The original list is still in *q.347*/348outq.queue = NULL;349outq.nr = outq.alloc = 0;350for (i = 0; i < q->nr; i++) {351struct diff_filepair *p = q->queue[i];352struct diff_filepair *pair_to_free = NULL;353354if (!DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) {355/*356* Creation357*358* We would output this create record if it has359* not been turned into a rename/copy already.360*/361struct diff_rename_dst *dst =362locate_rename_dst(p->two, 0);363if (dst && dst->pair) {364diff_q(&outq, dst->pair);365pair_to_free = p;366}367else368/* no matching rename/copy source, so369* record this as a creation.370*/371diff_q(&outq, p);372}373else if (DIFF_FILE_VALID(p->one) && !DIFF_FILE_VALID(p->two)) {374/*375* Deletion376*377* We would output this delete record if:378*379* (1) this is a broken delete and the counterpart380* broken create remains in the output; or381* (2) this is not a broken delete, and rename_dst382* does not have a rename/copy to move p->one->path383* out of existence.384*385* Otherwise, the counterpart broken create386* has been turned into a rename-edit; or387* delete did not have a matching create to388* begin with.389*/390if (DIFF_PAIR_BROKEN(p)) {391/* broken delete */392struct diff_rename_dst *dst =393locate_rename_dst(p->one, 0);394if (dst && dst->pair)395/* counterpart is now rename/copy */396pair_to_free = p;397}398else {399for (j = 0; j < rename_dst_nr; j++) {400if (!rename_dst[j].pair)401continue;402if (strcmp(rename_dst[j].pair->403one->path,404p->one->path))405continue;406break;407}408if (j < rename_dst_nr)409/* this path remains */410pair_to_free = p;411}412413if (pair_to_free)414;415else416diff_q(&outq, p);417}418else if (!diff_unmodified_pair(p))419/* all the usual ones need to be kept */420diff_q(&outq, p);421else422/* no need to keep unmodified pairs */423pair_to_free = p;424425if (pair_to_free)426diff_free_filepair(pair_to_free);427}428diff_debug_queue("done copying original", &outq);429430free(q->queue);431*q = outq;432diff_debug_queue("done collapsing", q);433434/* We need to see which rename source really stays here;435* earlier we only checked if the path is left in the result,436* but even if a path remains in the result, if that is coming437* from copying something else on top of it, then the original438* source is lost and does not stay.439*/440for (i = 0; i < q->nr; i++) {441struct diff_filepair *p = q->queue[i];442if (DIFF_PAIR_RENAME(p) && p->source_stays) {443/* If one appears as the target of a rename-copy,444* then mark p->source_stays = 0; otherwise445* leave it as is.446*/447p->source_stays = compute_stays(q, p->one);448}449}450451for (i = 0; i < rename_dst_nr; i++) {452diff_free_filespec_data(rename_dst[i].two);453free(rename_dst[i].two);454}455456free(rename_dst);457rename_dst = NULL;458rename_dst_nr = rename_dst_alloc = 0;459free(rename_src);460rename_src = NULL;461rename_src_nr = rename_src_alloc = 0;462return;463}