diff-delta.con commit filter-branch: add a test for the commit removal example (f6b78c6)
   1/*
   2 * diff-delta.c: generate a delta between two buffers
   3 *
   4 * This code was greatly inspired by parts of LibXDiff from Davide Libenzi
   5 * http://www.xmailserver.org/xdiff-lib.html
   6 *
   7 * Rewritten for GIT by Nicolas Pitre <nico@cam.org>, (C) 2005-2007
   8 *
   9 * This code is free software; you can redistribute it and/or modify
  10 * it under the terms of the GNU General Public License version 2 as
  11 * published by the Free Software Foundation.
  12 */
  13
  14#include "git-compat-util.h"
  15#include "delta.h"
  16
  17/* maximum hash entry list for the same hash bucket */
  18#define HASH_LIMIT 64
  19
  20#define RABIN_SHIFT 23
  21#define RABIN_WINDOW 16
  22
  23static const unsigned int T[256] = {
  24        0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
  25        0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
  26        0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
  27        0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
  28        0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
  29        0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
  30        0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
  31        0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
  32        0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
  33        0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
  34        0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
  35        0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
  36        0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
  37        0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
  38        0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
  39        0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
  40        0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
  41        0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
  42        0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
  43        0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
  44        0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
  45        0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
  46        0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
  47        0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
  48        0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
  49        0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
  50        0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
  51        0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
  52        0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
  53        0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
  54        0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
  55        0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
  56        0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
  57        0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
  58        0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
  59        0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
  60        0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
  61        0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
  62        0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
  63        0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
  64        0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
  65        0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
  66        0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
  67};
  68
  69static const unsigned int U[256] = {
  70        0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
  71        0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
  72        0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
  73        0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
  74        0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
  75        0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
  76        0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
  77        0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
  78        0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
  79        0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
  80        0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
  81        0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
  82        0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
  83        0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
  84        0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
  85        0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
  86        0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
  87        0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
  88        0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
  89        0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
  90        0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
  91        0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
  92        0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
  93        0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
  94        0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
  95        0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
  96        0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
  97        0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
  98        0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
  99        0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
 100        0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
 101        0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
 102        0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
 103        0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
 104        0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
 105        0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
 106        0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
 107        0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
 108        0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
 109        0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
 110        0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
 111        0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
 112        0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
 113};
 114
 115struct index_entry {
 116        const unsigned char *ptr;
 117        unsigned int val;
 118        struct index_entry *next;
 119};
 120
 121struct delta_index {
 122        const void *src_buf;
 123        unsigned long src_size;
 124        unsigned int hash_mask;
 125        struct index_entry *hash[FLEX_ARRAY];
 126};
 127
 128struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 129{
 130        unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
 131        const unsigned char *data, *buffer = buf;
 132        struct delta_index *index;
 133        struct index_entry *entry, **hash;
 134        void *mem;
 135        unsigned long memsize;
 136
 137        if (!buf || !bufsize)
 138                return NULL;
 139
 140        /* Determine index hash size.  Note that indexing skips the
 141           first byte to allow for optimizing the Rabin's polynomial
 142           initialization in create_delta(). */
 143        entries = (bufsize - 1)  / RABIN_WINDOW;
 144        hsize = entries / 4;
 145        for (i = 4; (1u << i) < hsize && i < 31; i++);
 146        hsize = 1 << i;
 147        hmask = hsize - 1;
 148
 149        /* allocate lookup index */
 150        memsize = sizeof(*index) +
 151                  sizeof(*hash) * hsize +
 152                  sizeof(*entry) * entries;
 153        mem = malloc(memsize);
 154        if (!mem)
 155                return NULL;
 156        index = mem;
 157        mem = index + 1;
 158        hash = mem;
 159        mem = hash + hsize;
 160        entry = mem;
 161
 162        index->src_buf = buf;
 163        index->src_size = bufsize;
 164        index->hash_mask = hmask;
 165        memset(hash, 0, hsize * sizeof(*hash));
 166
 167        /* allocate an array to count hash entries */
 168        hash_count = calloc(hsize, sizeof(*hash_count));
 169        if (!hash_count) {
 170                free(index);
 171                return NULL;
 172        }
 173
 174        /* then populate the index */
 175        prev_val = ~0;
 176        for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
 177             data >= buffer;
 178             data -= RABIN_WINDOW) {
 179                unsigned int val = 0;
 180                for (i = 1; i <= RABIN_WINDOW; i++)
 181                        val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
 182                if (val == prev_val) {
 183                        /* keep the lowest of consecutive identical blocks */
 184                        entry[-1].ptr = data + RABIN_WINDOW;
 185                } else {
 186                        prev_val = val;
 187                        i = val & hmask;
 188                        entry->ptr = data + RABIN_WINDOW;
 189                        entry->val = val;
 190                        entry->next = hash[i];
 191                        hash[i] = entry++;
 192                        hash_count[i]++;
 193                }
 194        }
 195
 196        /*
 197         * Determine a limit on the number of entries in the same hash
 198         * bucket.  This guards us against pathological data sets causing
 199         * really bad hash distribution with most entries in the same hash
 200         * bucket that would bring us to O(m*n) computing costs (m and n
 201         * corresponding to reference and target buffer sizes).
 202         *
 203         * Make sure none of the hash buckets has more entries than
 204         * we're willing to test.  Otherwise we cull the entry list
 205         * uniformly to still preserve a good repartition across
 206         * the reference buffer.
 207         */
 208        for (i = 0; i < hsize; i++) {
 209                if (hash_count[i] < HASH_LIMIT)
 210                        continue;
 211                entry = hash[i];
 212                do {
 213                        struct index_entry *keep = entry;
 214                        int skip = hash_count[i] / HASH_LIMIT / 2;
 215                        do {
 216                                entry = entry->next;
 217                        } while(--skip && entry);
 218                        keep->next = entry;
 219                } while(entry);
 220        }
 221        free(hash_count);
 222
 223        return index;
 224}
 225
 226void free_delta_index(struct delta_index *index)
 227{
 228        free(index);
 229}
 230
 231/*
 232 * The maximum size for any opcode sequence, including the initial header
 233 * plus Rabin window plus biggest copy.
 234 */
 235#define MAX_OP_SIZE     (5 + 5 + 1 + RABIN_WINDOW + 7)
 236
 237void *
 238create_delta(const struct delta_index *index,
 239             const void *trg_buf, unsigned long trg_size,
 240             unsigned long *delta_size, unsigned long max_size)
 241{
 242        unsigned int i, outpos, outsize, moff, msize, val;
 243        int inscnt;
 244        const unsigned char *ref_data, *ref_top, *data, *top;
 245        unsigned char *out;
 246
 247        if (!trg_buf || !trg_size)
 248                return NULL;
 249
 250        outpos = 0;
 251        outsize = 8192;
 252        if (max_size && outsize >= max_size)
 253                outsize = max_size + MAX_OP_SIZE + 1;
 254        out = malloc(outsize);
 255        if (!out)
 256                return NULL;
 257
 258        /* store reference buffer size */
 259        i = index->src_size;
 260        while (i >= 0x80) {
 261                out[outpos++] = i | 0x80;
 262                i >>= 7;
 263        }
 264        out[outpos++] = i;
 265
 266        /* store target buffer size */
 267        i = trg_size;
 268        while (i >= 0x80) {
 269                out[outpos++] = i | 0x80;
 270                i >>= 7;
 271        }
 272        out[outpos++] = i;
 273
 274        ref_data = index->src_buf;
 275        ref_top = ref_data + index->src_size;
 276        data = trg_buf;
 277        top = (const unsigned char *) trg_buf + trg_size;
 278
 279        outpos++;
 280        val = 0;
 281        for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
 282                out[outpos++] = *data;
 283                val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 284        }
 285        inscnt = i;
 286
 287        moff = 0;
 288        msize = 0;
 289        while (data < top) {
 290                if (msize < 4096) {
 291                        struct index_entry *entry;
 292                        val ^= U[data[-RABIN_WINDOW]];
 293                        val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 294                        i = val & index->hash_mask;
 295                        for (entry = index->hash[i]; entry; entry = entry->next) {
 296                                const unsigned char *ref = entry->ptr;
 297                                const unsigned char *src = data;
 298                                unsigned int ref_size = ref_top - ref;
 299                                if (entry->val != val)
 300                                        continue;
 301                                if (ref_size > top - src)
 302                                        ref_size = top - src;
 303                                if (ref_size <= msize)
 304                                        break;
 305                                while (ref_size-- && *src++ == *ref)
 306                                        ref++;
 307                                if (msize < ref - entry->ptr) {
 308                                        /* this is our best match so far */
 309                                        msize = ref - entry->ptr;
 310                                        moff = entry->ptr - ref_data;
 311                                        if (msize >= 4096) /* good enough */
 312                                                break;
 313                                }
 314                        }
 315                }
 316
 317                if (msize < 4) {
 318                        if (!inscnt)
 319                                outpos++;
 320                        out[outpos++] = *data++;
 321                        inscnt++;
 322                        if (inscnt == 0x7f) {
 323                                out[outpos - inscnt - 1] = inscnt;
 324                                inscnt = 0;
 325                        }
 326                        msize = 0;
 327                } else {
 328                        unsigned int left;
 329                        unsigned char *op;
 330
 331                        if (inscnt) {
 332                                while (moff && ref_data[moff-1] == data[-1]) {
 333                                        /* we can match one byte back */
 334                                        msize++;
 335                                        moff--;
 336                                        data--;
 337                                        outpos--;
 338                                        if (--inscnt)
 339                                                continue;
 340                                        outpos--;  /* remove count slot */
 341                                        inscnt--;  /* make it -1 */
 342                                        break;
 343                                }
 344                                out[outpos - inscnt - 1] = inscnt;
 345                                inscnt = 0;
 346                        }
 347
 348                        /* A copy op is currently limited to 64KB (pack v2) */
 349                        left = (msize < 0x10000) ? 0 : (msize - 0x10000);
 350                        msize -= left;
 351
 352                        op = out + outpos++;
 353                        i = 0x80;
 354
 355                        if (moff & 0x000000ff)
 356                                out[outpos++] = moff >> 0,  i |= 0x01;
 357                        if (moff & 0x0000ff00)
 358                                out[outpos++] = moff >> 8,  i |= 0x02;
 359                        if (moff & 0x00ff0000)
 360                                out[outpos++] = moff >> 16, i |= 0x04;
 361                        if (moff & 0xff000000)
 362                                out[outpos++] = moff >> 24, i |= 0x08;
 363
 364                        if (msize & 0x00ff)
 365                                out[outpos++] = msize >> 0, i |= 0x10;
 366                        if (msize & 0xff00)
 367                                out[outpos++] = msize >> 8, i |= 0x20;
 368
 369                        *op = i;
 370
 371                        data += msize;
 372                        moff += msize;
 373                        msize = left;
 374
 375                        if (msize < 4096) {
 376                                int j;
 377                                val = 0;
 378                                for (j = -RABIN_WINDOW; j < 0; j++)
 379                                        val = ((val << 8) | data[j])
 380                                              ^ T[val >> RABIN_SHIFT];
 381                        }
 382                }
 383
 384                if (outpos >= outsize - MAX_OP_SIZE) {
 385                        void *tmp = out;
 386                        outsize = outsize * 3 / 2;
 387                        if (max_size && outsize >= max_size)
 388                                outsize = max_size + MAX_OP_SIZE + 1;
 389                        if (max_size && outpos > max_size)
 390                                break;
 391                        out = realloc(out, outsize);
 392                        if (!out) {
 393                                free(tmp);
 394                                return NULL;
 395                        }
 396                }
 397        }
 398
 399        if (inscnt)
 400                out[outpos - inscnt - 1] = inscnt;
 401
 402        if (max_size && outpos > max_size) {
 403                free(out);
 404                return NULL;
 405        }
 406
 407        *delta_size = outpos;
 408        return out;
 409}