3a737da68c8cadb900ffe5b44973d9193fbca83f
   1/*
   2 * diff-delta.c: generate a delta between two buffers
   3 *
   4 *  Many parts of this file have been lifted from LibXDiff version 0.10.
   5 *  http://www.xmailserver.org/xdiff-lib.html
   6 *
   7 *  LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
   8 *  Copyright (C) 2003  Davide Libenzi
   9 *
  10 *  Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
  11 *
  12 *  This file is free software; you can redistribute it and/or
  13 *  modify it under the terms of the GNU Lesser General Public
  14 *  License as published by the Free Software Foundation; either
  15 *  version 2.1 of the License, or (at your option) any later version.
  16 *
  17 *  Use of this within git automatically means that the LGPL
  18 *  licensing gets turned into GPLv2 within this project.
  19 */
  20
  21#include "git-compat-util.h"
  22#include "delta.h"
  23
  24/* maximum hash entry list for the same hash bucket */
  25#define HASH_LIMIT 64
  26
  27#define RABIN_SHIFT 23
  28#define RABIN_WINDOW 16
  29
  30static const unsigned int T[256] = {
  31        0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
  32        0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
  33        0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
  34        0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
  35        0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
  36        0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
  37        0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
  38        0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
  39        0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
  40        0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
  41        0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
  42        0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
  43        0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
  44        0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
  45        0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
  46        0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
  47        0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
  48        0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
  49        0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
  50        0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
  51        0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
  52        0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
  53        0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
  54        0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
  55        0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
  56        0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
  57        0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
  58        0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
  59        0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
  60        0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
  61        0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
  62        0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
  63        0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
  64        0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
  65        0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
  66        0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
  67        0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
  68        0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
  69        0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
  70        0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
  71        0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
  72        0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
  73        0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
  74};
  75
  76static const unsigned int U[256] = {
  77        0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
  78        0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
  79        0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
  80        0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
  81        0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
  82        0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
  83        0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
  84        0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
  85        0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
  86        0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
  87        0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
  88        0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
  89        0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
  90        0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
  91        0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
  92        0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
  93        0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
  94        0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
  95        0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
  96        0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
  97        0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
  98        0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
  99        0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
 100        0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
 101        0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
 102        0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
 103        0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
 104        0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
 105        0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
 106        0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
 107        0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
 108        0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
 109        0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
 110        0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
 111        0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
 112        0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
 113        0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
 114        0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
 115        0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
 116        0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
 117        0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
 118        0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
 119        0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
 120};
 121
 122struct index_entry {
 123        const unsigned char *ptr;
 124        unsigned int val;
 125        struct index_entry *next;
 126};
 127
 128struct delta_index {
 129        const void *src_buf;
 130        unsigned long src_size;
 131        unsigned int hash_mask;
 132        struct index_entry *hash[FLEX_ARRAY];
 133};
 134
 135struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 136{
 137        unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
 138        const unsigned char *data, *buffer = buf;
 139        struct delta_index *index;
 140        struct index_entry *entry, **hash;
 141        void *mem;
 142        unsigned long memsize;
 143
 144        if (!buf || !bufsize)
 145                return NULL;
 146
 147        /* Determine index hash size.  Note that indexing skips the
 148           first byte to allow for optimizing the Rabin's polynomial
 149           initialization in create_delta(). */
 150        entries = (bufsize - 1)  / RABIN_WINDOW;
 151        hsize = entries / 4;
 152        for (i = 4; (1u << i) < hsize && i < 31; i++);
 153        hsize = 1 << i;
 154        hmask = hsize - 1;
 155
 156        /* allocate lookup index */
 157        memsize = sizeof(*index) +
 158                  sizeof(*hash) * hsize +
 159                  sizeof(*entry) * entries;
 160        mem = malloc(memsize);
 161        if (!mem)
 162                return NULL;
 163        index = mem;
 164        mem = index + 1;
 165        hash = mem;
 166        mem = hash + hsize;
 167        entry = mem;
 168
 169        index->src_buf = buf;
 170        index->src_size = bufsize;
 171        index->hash_mask = hmask;
 172        memset(hash, 0, hsize * sizeof(*hash));
 173
 174        /* allocate an array to count hash entries */
 175        hash_count = calloc(hsize, sizeof(*hash_count));
 176        if (!hash_count) {
 177                free(index);
 178                return NULL;
 179        }
 180
 181        /* then populate the index */
 182        prev_val = ~0;
 183        for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
 184             data >= buffer;
 185             data -= RABIN_WINDOW) {
 186                unsigned int val = 0;
 187                for (i = 1; i <= RABIN_WINDOW; i++)
 188                        val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
 189                if (val == prev_val) {
 190                        /* keep the lowest of consecutive identical blocks */
 191                        entry[-1].ptr = data + RABIN_WINDOW;
 192                } else {
 193                        prev_val = val;
 194                        i = val & hmask;
 195                        entry->ptr = data + RABIN_WINDOW;
 196                        entry->val = val;
 197                        entry->next = hash[i];
 198                        hash[i] = entry++;
 199                        hash_count[i]++;
 200                }
 201        }
 202
 203        /*
 204         * Determine a limit on the number of entries in the same hash
 205         * bucket.  This guards us against pathological data sets causing
 206         * really bad hash distribution with most entries in the same hash
 207         * bucket that would bring us to O(m*n) computing costs (m and n
 208         * corresponding to reference and target buffer sizes).
 209         *
 210         * Make sure none of the hash buckets has more entries than
 211         * we're willing to test.  Otherwise we cull the entry list
 212         * uniformly to still preserve a good repartition across
 213         * the reference buffer.
 214         */
 215        for (i = 0; i < hsize; i++) {
 216                if (hash_count[i] < HASH_LIMIT)
 217                        continue;
 218                entry = hash[i];
 219                do {
 220                        struct index_entry *keep = entry;
 221                        int skip = hash_count[i] / HASH_LIMIT / 2;
 222                        do {
 223                                entry = entry->next;
 224                        } while(--skip && entry);
 225                        keep->next = entry;
 226                } while(entry);
 227        }
 228        free(hash_count);
 229
 230        return index;
 231}
 232
 233void free_delta_index(struct delta_index *index)
 234{
 235        free(index);
 236}
 237
 238/*
 239 * The maximum size for any opcode sequence, including the initial header
 240 * plus Rabin window plus biggest copy.
 241 */
 242#define MAX_OP_SIZE     (5 + 5 + 1 + RABIN_WINDOW + 7)
 243
 244void *
 245create_delta(const struct delta_index *index,
 246             const void *trg_buf, unsigned long trg_size,
 247             unsigned long *delta_size, unsigned long max_size)
 248{
 249        unsigned int i, outpos, outsize, moff, msize, val;
 250        int inscnt;
 251        const unsigned char *ref_data, *ref_top, *data, *top;
 252        unsigned char *out;
 253
 254        if (!trg_buf || !trg_size)
 255                return NULL;
 256
 257        outpos = 0;
 258        outsize = 8192;
 259        if (max_size && outsize >= max_size)
 260                outsize = max_size + MAX_OP_SIZE + 1;
 261        out = malloc(outsize);
 262        if (!out)
 263                return NULL;
 264
 265        /* store reference buffer size */
 266        i = index->src_size;
 267        while (i >= 0x80) {
 268                out[outpos++] = i | 0x80;
 269                i >>= 7;
 270        }
 271        out[outpos++] = i;
 272
 273        /* store target buffer size */
 274        i = trg_size;
 275        while (i >= 0x80) {
 276                out[outpos++] = i | 0x80;
 277                i >>= 7;
 278        }
 279        out[outpos++] = i;
 280
 281        ref_data = index->src_buf;
 282        ref_top = ref_data + index->src_size;
 283        data = trg_buf;
 284        top = (const unsigned char *) trg_buf + trg_size;
 285
 286        outpos++;
 287        val = 0;
 288        for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
 289                out[outpos++] = *data;
 290                val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 291        }
 292        inscnt = i;
 293
 294        moff = 0;
 295        msize = 0;
 296        while (data < top) {
 297                if (msize < 4096) {
 298                        struct index_entry *entry;
 299                        val ^= U[data[-RABIN_WINDOW]];
 300                        val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 301                        i = val & index->hash_mask;
 302                        for (entry = index->hash[i]; entry; entry = entry->next) {
 303                                const unsigned char *ref = entry->ptr;
 304                                const unsigned char *src = data;
 305                                unsigned int ref_size = ref_top - ref;
 306                                if (entry->val != val)
 307                                        continue;
 308                                if (ref_size > top - src)
 309                                        ref_size = top - src;
 310                                if (ref_size <= msize)
 311                                        break;
 312                                while (ref_size-- && *src++ == *ref)
 313                                        ref++;
 314                                if (msize < ref - entry->ptr) {
 315                                        /* this is our best match so far */
 316                                        msize = ref - entry->ptr;
 317                                        moff = entry->ptr - ref_data;
 318                                        if (msize >= 4096) /* good enough */
 319                                                break;
 320                                }
 321                        }
 322                }
 323
 324                if (msize < 4) {
 325                        if (!inscnt)
 326                                outpos++;
 327                        out[outpos++] = *data++;
 328                        inscnt++;
 329                        if (inscnt == 0x7f) {
 330                                out[outpos - inscnt - 1] = inscnt;
 331                                inscnt = 0;
 332                        }
 333                        msize = 0;
 334                } else {
 335                        unsigned int left;
 336                        unsigned char *op;
 337
 338                        if (inscnt) {
 339                                while (moff && ref_data[moff-1] == data[-1]) {
 340                                        /* we can match one byte back */
 341                                        msize++;
 342                                        moff--;
 343                                        data--;
 344                                        outpos--;
 345                                        if (--inscnt)
 346                                                continue;
 347                                        outpos--;  /* remove count slot */
 348                                        inscnt--;  /* make it -1 */
 349                                        break;
 350                                }
 351                                out[outpos - inscnt - 1] = inscnt;
 352                                inscnt = 0;
 353                        }
 354
 355                        /* A copy op is currently limited to 64KB (pack v2) */
 356                        left = (msize < 0x10000) ? 0 : (msize - 0x10000);
 357                        msize -= left;
 358
 359                        op = out + outpos++;
 360                        i = 0x80;
 361
 362                        if (moff & 0x000000ff)
 363                                out[outpos++] = moff >> 0,  i |= 0x01;
 364                        if (moff & 0x0000ff00)
 365                                out[outpos++] = moff >> 8,  i |= 0x02;
 366                        if (moff & 0x00ff0000)
 367                                out[outpos++] = moff >> 16, i |= 0x04;
 368                        if (moff & 0xff000000)
 369                                out[outpos++] = moff >> 24, i |= 0x08;
 370
 371                        if (msize & 0x00ff)
 372                                out[outpos++] = msize >> 0, i |= 0x10;
 373                        if (msize & 0xff00)
 374                                out[outpos++] = msize >> 8, i |= 0x20;
 375
 376                        *op = i;
 377
 378                        data += msize;
 379                        moff += msize;
 380                        msize = left;
 381
 382                        if (msize < 4096) {
 383                                int j;
 384                                val = 0;
 385                                for (j = -RABIN_WINDOW; j < 0; j++)
 386                                        val = ((val << 8) | data[j])
 387                                              ^ T[val >> RABIN_SHIFT];
 388                        }
 389                }
 390
 391                if (outpos >= outsize - MAX_OP_SIZE) {
 392                        void *tmp = out;
 393                        outsize = outsize * 3 / 2;
 394                        if (max_size && outsize >= max_size)
 395                                outsize = max_size + MAX_OP_SIZE + 1;
 396                        if (max_size && outpos > max_size)
 397                                break;
 398                        out = xrealloc(out, outsize);
 399                        if (!out) {
 400                                free(tmp);
 401                                return NULL;
 402                        }
 403                }
 404        }
 405
 406        if (inscnt)
 407                out[outpos - inscnt - 1] = inscnt;
 408
 409        if (max_size && outpos > max_size) {
 410                free(out);
 411                return NULL;
 412        }
 413
 414        *delta_size = outpos;
 415        return out;
 416}