diff-delta.con commit Merge branch 'master' of git://repo.or.cz/git/fastimport (f45fa2a)
   1/*
   2 * diff-delta.c: generate a delta between two buffers
   3 *
   4 *  Many parts of this file have been lifted from LibXDiff version 0.10.
   5 *  http://www.xmailserver.org/xdiff-lib.html
   6 *
   7 *  LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
   8 *  Copyright (C) 2003  Davide Libenzi
   9 *
  10 *  Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
  11 *
  12 *  This file is free software; you can redistribute it and/or
  13 *  modify it under the terms of the GNU Lesser General Public
  14 *  License as published by the Free Software Foundation; either
  15 *  version 2.1 of the License, or (at your option) any later version.
  16 *
  17 *  Use of this within git automatically means that the LGPL
  18 *  licensing gets turned into GPLv2 within this project.
  19 */
  20
  21#include "git-compat-util.h"
  22#include "delta.h"
  23
  24/* maximum hash entry list for the same hash bucket */
  25#define HASH_LIMIT 64
  26
  27#define RABIN_SHIFT 23
  28#define RABIN_WINDOW 16
  29
  30static const unsigned int T[256] = {
  31        0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
  32        0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
  33        0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
  34        0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
  35        0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
  36        0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
  37        0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
  38        0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
  39        0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
  40        0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
  41        0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
  42        0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
  43        0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
  44        0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
  45        0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
  46        0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
  47        0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
  48        0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
  49        0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
  50        0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
  51        0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
  52        0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
  53        0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
  54        0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
  55        0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
  56        0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
  57        0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
  58        0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
  59        0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
  60        0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
  61        0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
  62        0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
  63        0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
  64        0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
  65        0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
  66        0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
  67        0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
  68        0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
  69        0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
  70        0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
  71        0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
  72        0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
  73        0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
  74};
  75
  76static const unsigned int U[256] = {
  77        0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
  78        0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
  79        0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
  80        0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
  81        0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
  82        0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
  83        0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
  84        0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
  85        0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
  86        0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
  87        0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
  88        0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
  89        0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
  90        0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
  91        0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
  92        0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
  93        0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
  94        0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
  95        0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
  96        0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
  97        0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
  98        0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
  99        0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
 100        0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
 101        0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
 102        0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
 103        0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
 104        0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
 105        0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
 106        0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
 107        0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
 108        0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
 109        0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
 110        0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
 111        0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
 112        0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
 113        0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
 114        0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
 115        0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
 116        0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
 117        0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
 118        0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
 119        0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
 120};
 121
 122struct index_entry {
 123        const unsigned char *ptr;
 124        unsigned int val;
 125        struct index_entry *next;
 126};
 127
 128struct delta_index {
 129        const void *src_buf;
 130        unsigned long src_size;
 131        unsigned int hash_mask;
 132        struct index_entry *hash[FLEX_ARRAY];
 133};
 134
 135struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 136{
 137        unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
 138        const unsigned char *data, *buffer = buf;
 139        struct delta_index *index;
 140        struct index_entry *entry, **hash;
 141        void *mem;
 142        unsigned long memsize;
 143
 144        if (!buf || !bufsize)
 145                return NULL;
 146
 147        /* Determine index hash size.  Note that indexing skips the
 148           first byte to allow for optimizing the Rabin's polynomial
 149           initialization in create_delta(). */
 150        entries = (bufsize - 1)  / RABIN_WINDOW;
 151        hsize = entries / 4;
 152        for (i = 4; (1u << i) < hsize && i < 31; i++);
 153        hsize = 1 << i;
 154        hmask = hsize - 1;
 155
 156        /* allocate lookup index */
 157        memsize = sizeof(*index) +
 158                  sizeof(*hash) * hsize +
 159                  sizeof(*entry) * entries;
 160        mem = malloc(memsize);
 161        if (!mem)
 162                return NULL;
 163        index = mem;
 164        mem = index + 1;
 165        hash = mem;
 166        mem = hash + hsize;
 167        entry = mem;
 168
 169        index->src_buf = buf;
 170        index->src_size = bufsize;
 171        index->hash_mask = hmask;
 172        memset(hash, 0, hsize * sizeof(*hash));
 173
 174        /* allocate an array to count hash entries */
 175        hash_count = calloc(hsize, sizeof(*hash_count));
 176        if (!hash_count) {
 177                free(index);
 178                return NULL;
 179        }
 180
 181        /* then populate the index */
 182        prev_val = ~0;
 183        for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
 184             data >= buffer;
 185             data -= RABIN_WINDOW) {
 186                unsigned int val = 0;
 187                for (i = 1; i <= RABIN_WINDOW; i++)
 188                        val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
 189                if (val == prev_val) {
 190                        /* keep the lowest of consecutive identical blocks */
 191                        entry[-1].ptr = data + RABIN_WINDOW;
 192                } else {
 193                        prev_val = val;
 194                        i = val & hmask;
 195                        entry->ptr = data + RABIN_WINDOW;
 196                        entry->val = val;
 197                        entry->next = hash[i];
 198                        hash[i] = entry++;
 199                        hash_count[i]++;
 200                }
 201        }
 202
 203        /*
 204         * Determine a limit on the number of entries in the same hash
 205         * bucket.  This guards us against pathological data sets causing
 206         * really bad hash distribution with most entries in the same hash
 207         * bucket that would bring us to O(m*n) computing costs (m and n
 208         * corresponding to reference and target buffer sizes).
 209         *
 210         * Make sure none of the hash buckets has more entries than
 211         * we're willing to test.  Otherwise we cull the entry list
 212         * uniformly to still preserve a good repartition across
 213         * the reference buffer.
 214         */
 215        for (i = 0; i < hsize; i++) {
 216                if (hash_count[i] < HASH_LIMIT)
 217                        continue;
 218                entry = hash[i];
 219                do {
 220                        struct index_entry *keep = entry;
 221                        int skip = hash_count[i] / HASH_LIMIT / 2;
 222                        do {
 223                                entry = entry->next;
 224                        } while(--skip && entry);
 225                        keep->next = entry;
 226                } while(entry);
 227        }
 228        free(hash_count);
 229
 230        return index;
 231}
 232
 233void free_delta_index(struct delta_index *index)
 234{
 235        free(index);
 236}
 237
 238/*
 239 * The maximum size for any opcode sequence, including the initial header
 240 * plus Rabin window plus biggest copy.
 241 */
 242#define MAX_OP_SIZE     (5 + 5 + 1 + RABIN_WINDOW + 7)
 243
 244void *
 245create_delta(const struct delta_index *index,
 246             const void *trg_buf, unsigned long trg_size,
 247             unsigned long *delta_size, unsigned long max_size)
 248{
 249        unsigned int i, outpos, outsize, val;
 250        int inscnt;
 251        const unsigned char *ref_data, *ref_top, *data, *top;
 252        unsigned char *out;
 253
 254        if (!trg_buf || !trg_size)
 255                return NULL;
 256
 257        outpos = 0;
 258        outsize = 8192;
 259        if (max_size && outsize >= max_size)
 260                outsize = max_size + MAX_OP_SIZE + 1;
 261        out = malloc(outsize);
 262        if (!out)
 263                return NULL;
 264
 265        /* store reference buffer size */
 266        i = index->src_size;
 267        while (i >= 0x80) {
 268                out[outpos++] = i | 0x80;
 269                i >>= 7;
 270        }
 271        out[outpos++] = i;
 272
 273        /* store target buffer size */
 274        i = trg_size;
 275        while (i >= 0x80) {
 276                out[outpos++] = i | 0x80;
 277                i >>= 7;
 278        }
 279        out[outpos++] = i;
 280
 281        ref_data = index->src_buf;
 282        ref_top = ref_data + index->src_size;
 283        data = trg_buf;
 284        top = (const unsigned char *) trg_buf + trg_size;
 285
 286        outpos++;
 287        val = 0;
 288        for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
 289                out[outpos++] = *data;
 290                val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 291        }
 292        inscnt = i;
 293
 294        while (data < top) {
 295                unsigned int moff = 0, msize = 0;
 296                struct index_entry *entry;
 297                val ^= U[data[-RABIN_WINDOW]];
 298                val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 299                i = val & index->hash_mask;
 300                for (entry = index->hash[i]; entry; entry = entry->next) {
 301                        const unsigned char *ref = entry->ptr;
 302                        const unsigned char *src = data;
 303                        unsigned int ref_size = ref_top - ref;
 304                        if (entry->val != val)
 305                                continue;
 306                        if (ref_size > top - src)
 307                                ref_size = top - src;
 308                        if (ref_size > 0x10000)
 309                                ref_size = 0x10000;
 310                        if (ref_size <= msize)
 311                                break;
 312                        while (ref_size-- && *src++ == *ref)
 313                                ref++;
 314                        if (msize < ref - entry->ptr) {
 315                                /* this is our best match so far */
 316                                msize = ref - entry->ptr;
 317                                moff = entry->ptr - ref_data;
 318                        }
 319                }
 320
 321                if (msize < 4) {
 322                        if (!inscnt)
 323                                outpos++;
 324                        out[outpos++] = *data++;
 325                        inscnt++;
 326                        if (inscnt == 0x7f) {
 327                                out[outpos - inscnt - 1] = inscnt;
 328                                inscnt = 0;
 329                        }
 330                } else {
 331                        unsigned char *op;
 332
 333                        if (msize >= RABIN_WINDOW) {
 334                                const unsigned char *sk;
 335                                sk = data + msize - RABIN_WINDOW;
 336                                val = 0;
 337                                for (i = 0; i < RABIN_WINDOW; i++)
 338                                        val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
 339                        } else {
 340                                const unsigned char *sk = data + 1;
 341                                for (i = 1; i < msize; i++) {
 342                                        val ^= U[sk[-RABIN_WINDOW]];
 343                                        val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
 344                                }
 345                        }
 346
 347                        if (inscnt) {
 348                                while (moff && ref_data[moff-1] == data[-1]) {
 349                                        if (msize == 0x10000)
 350                                                break;
 351                                        /* we can match one byte back */
 352                                        msize++;
 353                                        moff--;
 354                                        data--;
 355                                        outpos--;
 356                                        if (--inscnt)
 357                                                continue;
 358                                        outpos--;  /* remove count slot */
 359                                        inscnt--;  /* make it -1 */
 360                                        break;
 361                                }
 362                                out[outpos - inscnt - 1] = inscnt;
 363                                inscnt = 0;
 364                        }
 365
 366                        data += msize;
 367                        op = out + outpos++;
 368                        i = 0x80;
 369
 370                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; }
 371                        moff >>= 8;
 372                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; }
 373                        moff >>= 8;
 374                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; }
 375                        moff >>= 8;
 376                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; }
 377
 378                        if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; }
 379                        msize >>= 8;
 380                        if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; }
 381
 382                        *op = i;
 383                }
 384
 385                if (outpos >= outsize - MAX_OP_SIZE) {
 386                        void *tmp = out;
 387                        outsize = outsize * 3 / 2;
 388                        if (max_size && outsize >= max_size)
 389                                outsize = max_size + MAX_OP_SIZE + 1;
 390                        if (max_size && outpos > max_size)
 391                                break;
 392                        out = xrealloc(out, outsize);
 393                        if (!out) {
 394                                free(tmp);
 395                                return NULL;
 396                        }
 397                }
 398        }
 399
 400        if (inscnt)
 401                out[outpos - inscnt - 1] = inscnt;
 402
 403        if (max_size && outpos > max_size) {
 404                free(out);
 405                return NULL;
 406        }
 407
 408        *delta_size = outpos;
 409        return out;
 410}