1/*
   2 * diff-delta.c: generate a delta between two buffers
   3 *
   4 * This code was greatly inspired by parts of LibXDiff from Davide Libenzi
   5 * http://www.xmailserver.org/xdiff-lib.html
   6 *
   7 * Rewritten for GIT by Nicolas Pitre <nico@cam.org>, (C) 2005-2007
   8 *
   9 * This code is free software; you can redistribute it and/or modify
  10 * it under the terms of the GNU General Public License version 2 as
  11 * published by the Free Software Foundation.
  12 */
  13#include "git-compat-util.h"
  15#include "delta.h"
  16/* maximum hash entry list for the same hash bucket */
  18#define HASH_LIMIT 64
  19#define RABIN_SHIFT 23
  21#define RABIN_WINDOW 16
  22static const unsigned int T[256] = {
  24        0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
  25        0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
  26        0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
  27        0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
  28        0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
  29        0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
  30        0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
  31        0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
  32        0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
  33        0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
  34        0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
  35        0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
  36        0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
  37        0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
  38        0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
  39        0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
  40        0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
  41        0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
  42        0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
  43        0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
  44        0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
  45        0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
  46        0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
  47        0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
  48        0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
  49        0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
  50        0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
  51        0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
  52        0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
  53        0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
  54        0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
  55        0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
  56        0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
  57        0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
  58        0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
  59        0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
  60        0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
  61        0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
  62        0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
  63        0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
  64        0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
  65        0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
  66        0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
  67};
  68static const unsigned int U[256] = {
  70        0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
  71        0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
  72        0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
  73        0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
  74        0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
  75        0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
  76        0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
  77        0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
  78        0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
  79        0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
  80        0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
  81        0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
  82        0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
  83        0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
  84        0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
  85        0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
  86        0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
  87        0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
  88        0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
  89        0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
  90        0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
  91        0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
  92        0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
  93        0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
  94        0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
  95        0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
  96        0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
  97        0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
  98        0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
  99        0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
 100        0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
 101        0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
 102        0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
 103        0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
 104        0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
 105        0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
 106        0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
 107        0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
 108        0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
 109        0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
 110        0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
 111        0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
 112        0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
 113};
 114struct index_entry {
 116        const unsigned char *ptr;
 117        unsigned int val;
 118        struct index_entry *next;
 119};
 120struct delta_index {
 122        unsigned long memsize;
 123        const void *src_buf;
 124        unsigned long src_size;
 125        unsigned int hash_mask;
 126        struct index_entry *hash[FLEX_ARRAY];
 127};
 128struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 130{
 131        unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
 132        const unsigned char *data, *buffer = buf;
 133        struct delta_index *index;
 134        struct index_entry *entry, **hash;
 135        void *mem;
 136        unsigned long memsize;
 137        if (!buf || !bufsize)
 139                return NULL;
 140        /* Determine index hash size.  Note that indexing skips the
 142           first byte to allow for optimizing the Rabin's polynomial
 143           initialization in create_delta(). */
 144        entries = (bufsize - 1)  / RABIN_WINDOW;
 145        hsize = entries / 4;
 146        for (i = 4; (1u << i) < hsize && i < 31; i++);
 147        hsize = 1 << i;
 148        hmask = hsize - 1;
 149        /* allocate lookup index */
 151        memsize = sizeof(*index) +
 152                  sizeof(*hash) * hsize +
 153                  sizeof(*entry) * entries;
 154        mem = malloc(memsize);
 155        if (!mem)
 156                return NULL;
 157        index = mem;
 158        mem = index + 1;
 159        hash = mem;
 160        mem = hash + hsize;
 161        entry = mem;
 162        index->memsize = memsize;
 164        index->src_buf = buf;
 165        index->src_size = bufsize;
 166        index->hash_mask = hmask;
 167        memset(hash, 0, hsize * sizeof(*hash));
 168        /* allocate an array to count hash entries */
 170        hash_count = calloc(hsize, sizeof(*hash_count));
 171        if (!hash_count) {
 172                free(index);
 173                return NULL;
 174        }
 175        /* then populate the index */
 177        prev_val = ~0;
 178        for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
 179             data >= buffer;
 180             data -= RABIN_WINDOW) {
 181                unsigned int val = 0;
 182                for (i = 1; i <= RABIN_WINDOW; i++)
 183                        val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
 184                if (val == prev_val) {
 185                        /* keep the lowest of consecutive identical blocks */
 186                        entry[-1].ptr = data + RABIN_WINDOW;
 187                } else {
 188                        prev_val = val;
 189                        i = val & hmask;
 190                        entry->ptr = data + RABIN_WINDOW;
 191                        entry->val = val;
 192                        entry->next = hash[i];
 193                        hash[i] = entry++;
 194                        hash_count[i]++;
 195                }
 196        }
 197        /*
 199         * Determine a limit on the number of entries in the same hash
 200         * bucket.  This guards us against pathological data sets causing
 201         * really bad hash distribution with most entries in the same hash
 202         * bucket that would bring us to O(m*n) computing costs (m and n
 203         * corresponding to reference and target buffer sizes).
 204         *
 205         * Make sure none of the hash buckets has more entries than
 206         * we're willing to test.  Otherwise we cull the entry list
 207         * uniformly to still preserve a good repartition across
 208         * the reference buffer.
 209         */
 210        for (i = 0; i < hsize; i++) {
 211                if (hash_count[i] < HASH_LIMIT)
 212                        continue;
 213                entry = hash[i];
 214                do {
 215                        struct index_entry *keep = entry;
 216                        int skip = hash_count[i] / HASH_LIMIT / 2;
 217                        do {
 218                                entry = entry->next;
 219                        } while(--skip && entry);
 220                        keep->next = entry;
 221                } while(entry);
 222        }
 223        free(hash_count);
 224        return index;
 226}
 227void free_delta_index(struct delta_index *index)
 229{
 230        free(index);
 231}
 232unsigned long sizeof_delta_index(struct delta_index *index)
 234{
 235        if (index)
 236                return index->memsize;
 237        else
 238                return 0;
 239}
 240/*
 242 * The maximum size for any opcode sequence, including the initial header
 243 * plus Rabin window plus biggest copy.
 244 */
 245#define MAX_OP_SIZE     (5 + 5 + 1 + RABIN_WINDOW + 7)
 246void *
 248create_delta(const struct delta_index *index,
 249             const void *trg_buf, unsigned long trg_size,
 250             unsigned long *delta_size, unsigned long max_size)
 251{
 252        unsigned int i, outpos, outsize, moff, msize, val;
 253        int inscnt;
 254        const unsigned char *ref_data, *ref_top, *data, *top;
 255        unsigned char *out;
 256        if (!trg_buf || !trg_size)
 258                return NULL;
 259        outpos = 0;
 261        outsize = 8192;
 262        if (max_size && outsize >= max_size)
 263                outsize = max_size + MAX_OP_SIZE + 1;
 264        out = malloc(outsize);
 265        if (!out)
 266                return NULL;
 267        /* store reference buffer size */
 269        i = index->src_size;
 270        while (i >= 0x80) {
 271                out[outpos++] = i | 0x80;
 272                i >>= 7;
 273        }
 274        out[outpos++] = i;
 275        /* store target buffer size */
 277        i = trg_size;
 278        while (i >= 0x80) {
 279                out[outpos++] = i | 0x80;
 280                i >>= 7;
 281        }
 282        out[outpos++] = i;
 283        ref_data = index->src_buf;
 285        ref_top = ref_data + index->src_size;
 286        data = trg_buf;
 287        top = (const unsigned char *) trg_buf + trg_size;
 288        outpos++;
 290        val = 0;
 291        for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
 292                out[outpos++] = *data;
 293                val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 294        }
 295        inscnt = i;
 296        moff = 0;
 298        msize = 0;
 299        while (data < top) {
 300                if (msize < 4096) {
 301                        struct index_entry *entry;
 302                        val ^= U[data[-RABIN_WINDOW]];
 303                        val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 304                        i = val & index->hash_mask;
 305                        for (entry = index->hash[i]; entry; entry = entry->next) {
 306                                const unsigned char *ref = entry->ptr;
 307                                const unsigned char *src = data;
 308                                unsigned int ref_size = ref_top - ref;
 309                                if (entry->val != val)
 310                                        continue;
 311                                if (ref_size > top - src)
 312                                        ref_size = top - src;
 313                                if (ref_size <= msize)
 314                                        break;
 315                                while (ref_size-- && *src++ == *ref)
 316                                        ref++;
 317                                if (msize < ref - entry->ptr) {
 318                                        /* this is our best match so far */
 319                                        msize = ref - entry->ptr;
 320                                        moff = entry->ptr - ref_data;
 321                                        if (msize >= 4096) /* good enough */
 322                                                break;
 323                                }
 324                        }
 325                }
 326                if (msize < 4) {
 328                        if (!inscnt)
 329                                outpos++;
 330                        out[outpos++] = *data++;
 331                        inscnt++;
 332                        if (inscnt == 0x7f) {
 333                                out[outpos - inscnt - 1] = inscnt;
 334                                inscnt = 0;
 335                        }
 336                        msize = 0;
 337                } else {
 338                        unsigned int left;
 339                        unsigned char *op;
 340                        if (inscnt) {
 342                                while (moff && ref_data[moff-1] == data[-1]) {
 343                                        /* we can match one byte back */
 344                                        msize++;
 345                                        moff--;
 346                                        data--;
 347                                        outpos--;
 348                                        if (--inscnt)
 349                                                continue;
 350                                        outpos--;  /* remove count slot */
 351                                        inscnt--;  /* make it -1 */
 352                                        break;
 353                                }
 354                                out[outpos - inscnt - 1] = inscnt;
 355                                inscnt = 0;
 356                        }
 357                        /* A copy op is currently limited to 64KB (pack v2) */
 359                        left = (msize < 0x10000) ? 0 : (msize - 0x10000);
 360                        msize -= left;
 361                        op = out + outpos++;
 363                        i = 0x80;
 364                        if (moff & 0x000000ff)
 366                                out[outpos++] = moff >> 0,  i |= 0x01;
 367                        if (moff & 0x0000ff00)
 368                                out[outpos++] = moff >> 8,  i |= 0x02;
 369                        if (moff & 0x00ff0000)
 370                                out[outpos++] = moff >> 16, i |= 0x04;
 371                        if (moff & 0xff000000)
 372                                out[outpos++] = moff >> 24, i |= 0x08;
 373                        if (msize & 0x00ff)
 375                                out[outpos++] = msize >> 0, i |= 0x10;
 376                        if (msize & 0xff00)
 377                                out[outpos++] = msize >> 8, i |= 0x20;
 378                        *op = i;
 380                        data += msize;
 382                        moff += msize;
 383                        msize = left;
 384                        if (msize < 4096) {
 386                                int j;
 387                                val = 0;
 388                                for (j = -RABIN_WINDOW; j < 0; j++)
 389                                        val = ((val << 8) | data[j])
 390                                              ^ T[val >> RABIN_SHIFT];
 391                        }
 392                }
 393                if (outpos >= outsize - MAX_OP_SIZE) {
 395                        void *tmp = out;
 396                        outsize = outsize * 3 / 2;
 397                        if (max_size && outsize >= max_size)
 398                                outsize = max_size + MAX_OP_SIZE + 1;
 399                        if (max_size && outpos > max_size)
 400                                break;
 401                        out = realloc(out, outsize);
 402                        if (!out) {
 403                                free(tmp);
 404                                return NULL;
 405                        }
 406                }
 407        }
 408        if (inscnt)
 410                out[outpos - inscnt - 1] = inscnt;
 411        if (max_size && outpos > max_size) {
 413                free(out);
 414                return NULL;
 415        }
 416        *delta_size = outpos;
 418        return out;
 419}