diff-delta.con commit core-tutorial: minor cleanup (a85feca)
   1/*
   2 * diff-delta.c: generate a delta between two buffers
   3 *
   4 * This code was greatly inspired by parts of LibXDiff from Davide Libenzi
   5 * http://www.xmailserver.org/xdiff-lib.html
   6 *
   7 * Rewritten for GIT by Nicolas Pitre <nico@cam.org>, (C) 2005-2007
   8 *
   9 * This code is free software; you can redistribute it and/or modify
  10 * it under the terms of the GNU General Public License version 2 as
  11 * published by the Free Software Foundation.
  12 */
  13
  14#include "git-compat-util.h"
  15#include "delta.h"
  16
  17/* maximum hash entry list for the same hash bucket */
  18#define HASH_LIMIT 64
  19
  20#define RABIN_SHIFT 23
  21#define RABIN_WINDOW 16
  22
  23static const unsigned int T[256] = {
  24        0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
  25        0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
  26        0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
  27        0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
  28        0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
  29        0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
  30        0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
  31        0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
  32        0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
  33        0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
  34        0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
  35        0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
  36        0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
  37        0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
  38        0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
  39        0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
  40        0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
  41        0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
  42        0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
  43        0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
  44        0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
  45        0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
  46        0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
  47        0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
  48        0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
  49        0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
  50        0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
  51        0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
  52        0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
  53        0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
  54        0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
  55        0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
  56        0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
  57        0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
  58        0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
  59        0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
  60        0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
  61        0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
  62        0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
  63        0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
  64        0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
  65        0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
  66        0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
  67};
  68
  69static const unsigned int U[256] = {
  70        0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
  71        0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
  72        0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
  73        0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
  74        0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
  75        0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
  76        0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
  77        0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
  78        0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
  79        0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
  80        0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
  81        0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
  82        0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
  83        0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
  84        0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
  85        0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
  86        0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
  87        0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
  88        0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
  89        0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
  90        0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
  91        0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
  92        0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
  93        0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
  94        0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
  95        0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
  96        0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
  97        0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
  98        0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
  99        0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
 100        0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
 101        0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
 102        0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
 103        0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
 104        0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
 105        0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
 106        0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
 107        0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
 108        0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
 109        0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
 110        0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
 111        0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
 112        0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
 113};
 114
 115struct index_entry {
 116        const unsigned char *ptr;
 117        unsigned int val;
 118        struct index_entry *next;
 119};
 120
 121struct delta_index {
 122        unsigned long memsize;
 123        const void *src_buf;
 124        unsigned long src_size;
 125        unsigned int hash_mask;
 126        struct index_entry *hash[FLEX_ARRAY];
 127};
 128
 129struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 130{
 131        unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
 132        const unsigned char *data, *buffer = buf;
 133        struct delta_index *index;
 134        struct index_entry *entry, **hash;
 135        void *mem;
 136        unsigned long memsize;
 137
 138        if (!buf || !bufsize)
 139                return NULL;
 140
 141        /* Determine index hash size.  Note that indexing skips the
 142           first byte to allow for optimizing the Rabin's polynomial
 143           initialization in create_delta(). */
 144        entries = (bufsize - 1)  / RABIN_WINDOW;
 145        hsize = entries / 4;
 146        for (i = 4; (1u << i) < hsize && i < 31; i++);
 147        hsize = 1 << i;
 148        hmask = hsize - 1;
 149
 150        /* allocate lookup index */
 151        memsize = sizeof(*index) +
 152                  sizeof(*hash) * hsize +
 153                  sizeof(*entry) * entries;
 154        mem = malloc(memsize);
 155        if (!mem)
 156                return NULL;
 157        index = mem;
 158        mem = index + 1;
 159        hash = mem;
 160        mem = hash + hsize;
 161        entry = mem;
 162
 163        index->memsize = memsize;
 164        index->src_buf = buf;
 165        index->src_size = bufsize;
 166        index->hash_mask = hmask;
 167        memset(hash, 0, hsize * sizeof(*hash));
 168
 169        /* allocate an array to count hash entries */
 170        hash_count = calloc(hsize, sizeof(*hash_count));
 171        if (!hash_count) {
 172                free(index);
 173                return NULL;
 174        }
 175
 176        /* then populate the index */
 177        prev_val = ~0;
 178        for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
 179             data >= buffer;
 180             data -= RABIN_WINDOW) {
 181                unsigned int val = 0;
 182                for (i = 1; i <= RABIN_WINDOW; i++)
 183                        val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
 184                if (val == prev_val) {
 185                        /* keep the lowest of consecutive identical blocks */
 186                        entry[-1].ptr = data + RABIN_WINDOW;
 187                } else {
 188                        prev_val = val;
 189                        i = val & hmask;
 190                        entry->ptr = data + RABIN_WINDOW;
 191                        entry->val = val;
 192                        entry->next = hash[i];
 193                        hash[i] = entry++;
 194                        hash_count[i]++;
 195                }
 196        }
 197
 198        /*
 199         * Determine a limit on the number of entries in the same hash
 200         * bucket.  This guards us against pathological data sets causing
 201         * really bad hash distribution with most entries in the same hash
 202         * bucket that would bring us to O(m*n) computing costs (m and n
 203         * corresponding to reference and target buffer sizes).
 204         *
 205         * Make sure none of the hash buckets has more entries than
 206         * we're willing to test.  Otherwise we cull the entry list
 207         * uniformly to still preserve a good repartition across
 208         * the reference buffer.
 209         */
 210        for (i = 0; i < hsize; i++) {
 211                if (hash_count[i] < HASH_LIMIT)
 212                        continue;
 213                entry = hash[i];
 214                do {
 215                        struct index_entry *keep = entry;
 216                        int skip = hash_count[i] / HASH_LIMIT;
 217                        do {
 218                                entry = entry->next;
 219                        } while(--skip && entry);
 220                        keep->next = entry;
 221                } while(entry);
 222        }
 223        free(hash_count);
 224
 225        return index;
 226}
 227
 228void free_delta_index(struct delta_index *index)
 229{
 230        free(index);
 231}
 232
 233unsigned long sizeof_delta_index(struct delta_index *index)
 234{
 235        if (index)
 236                return index->memsize;
 237        else
 238                return 0;
 239}
 240
 241/*
 242 * The maximum size for any opcode sequence, including the initial header
 243 * plus Rabin window plus biggest copy.
 244 */
 245#define MAX_OP_SIZE     (5 + 5 + 1 + RABIN_WINDOW + 7)
 246
 247void *
 248create_delta(const struct delta_index *index,
 249             const void *trg_buf, unsigned long trg_size,
 250             unsigned long *delta_size, unsigned long max_size)
 251{
 252        unsigned int i, outpos, outsize, moff, msize, val;
 253        int inscnt;
 254        const unsigned char *ref_data, *ref_top, *data, *top;
 255        unsigned char *out;
 256
 257        if (!trg_buf || !trg_size)
 258                return NULL;
 259
 260        outpos = 0;
 261        outsize = 8192;
 262        if (max_size && outsize >= max_size)
 263                outsize = max_size + MAX_OP_SIZE + 1;
 264        out = malloc(outsize);
 265        if (!out)
 266                return NULL;
 267
 268        /* store reference buffer size */
 269        i = index->src_size;
 270        while (i >= 0x80) {
 271                out[outpos++] = i | 0x80;
 272                i >>= 7;
 273        }
 274        out[outpos++] = i;
 275
 276        /* store target buffer size */
 277        i = trg_size;
 278        while (i >= 0x80) {
 279                out[outpos++] = i | 0x80;
 280                i >>= 7;
 281        }
 282        out[outpos++] = i;
 283
 284        ref_data = index->src_buf;
 285        ref_top = ref_data + index->src_size;
 286        data = trg_buf;
 287        top = (const unsigned char *) trg_buf + trg_size;
 288
 289        outpos++;
 290        val = 0;
 291        for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
 292                out[outpos++] = *data;
 293                val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 294        }
 295        inscnt = i;
 296
 297        moff = 0;
 298        msize = 0;
 299        while (data < top) {
 300                if (msize < 4096) {
 301                        struct index_entry *entry;
 302                        val ^= U[data[-RABIN_WINDOW]];
 303                        val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 304                        i = val & index->hash_mask;
 305                        for (entry = index->hash[i]; entry; entry = entry->next) {
 306                                const unsigned char *ref = entry->ptr;
 307                                const unsigned char *src = data;
 308                                unsigned int ref_size = ref_top - ref;
 309                                if (entry->val != val)
 310                                        continue;
 311                                if (ref_size > top - src)
 312                                        ref_size = top - src;
 313                                if (ref_size <= msize)
 314                                        break;
 315                                while (ref_size-- && *src++ == *ref)
 316                                        ref++;
 317                                if (msize < ref - entry->ptr) {
 318                                        /* this is our best match so far */
 319                                        msize = ref - entry->ptr;
 320                                        moff = entry->ptr - ref_data;
 321                                        if (msize >= 4096) /* good enough */
 322                                                break;
 323                                }
 324                        }
 325                }
 326
 327                if (msize < 4) {
 328                        if (!inscnt)
 329                                outpos++;
 330                        out[outpos++] = *data++;
 331                        inscnt++;
 332                        if (inscnt == 0x7f) {
 333                                out[outpos - inscnt - 1] = inscnt;
 334                                inscnt = 0;
 335                        }
 336                        msize = 0;
 337                } else {
 338                        unsigned int left;
 339                        unsigned char *op;
 340
 341                        if (inscnt) {
 342                                while (moff && ref_data[moff-1] == data[-1]) {
 343                                        /* we can match one byte back */
 344                                        msize++;
 345                                        moff--;
 346                                        data--;
 347                                        outpos--;
 348                                        if (--inscnt)
 349                                                continue;
 350                                        outpos--;  /* remove count slot */
 351                                        inscnt--;  /* make it -1 */
 352                                        break;
 353                                }
 354                                out[outpos - inscnt - 1] = inscnt;
 355                                inscnt = 0;
 356                        }
 357
 358                        /* A copy op is currently limited to 64KB (pack v2) */
 359                        left = (msize < 0x10000) ? 0 : (msize - 0x10000);
 360                        msize -= left;
 361
 362                        op = out + outpos++;
 363                        i = 0x80;
 364
 365                        if (moff & 0x000000ff)
 366                                out[outpos++] = moff >> 0,  i |= 0x01;
 367                        if (moff & 0x0000ff00)
 368                                out[outpos++] = moff >> 8,  i |= 0x02;
 369                        if (moff & 0x00ff0000)
 370                                out[outpos++] = moff >> 16, i |= 0x04;
 371                        if (moff & 0xff000000)
 372                                out[outpos++] = moff >> 24, i |= 0x08;
 373
 374                        if (msize & 0x00ff)
 375                                out[outpos++] = msize >> 0, i |= 0x10;
 376                        if (msize & 0xff00)
 377                                out[outpos++] = msize >> 8, i |= 0x20;
 378
 379                        *op = i;
 380
 381                        data += msize;
 382                        moff += msize;
 383                        msize = left;
 384
 385                        if (msize < 4096) {
 386                                int j;
 387                                val = 0;
 388                                for (j = -RABIN_WINDOW; j < 0; j++)
 389                                        val = ((val << 8) | data[j])
 390                                              ^ T[val >> RABIN_SHIFT];
 391                        }
 392                }
 393
 394                if (outpos >= outsize - MAX_OP_SIZE) {
 395                        void *tmp = out;
 396                        outsize = outsize * 3 / 2;
 397                        if (max_size && outsize >= max_size)
 398                                outsize = max_size + MAX_OP_SIZE + 1;
 399                        if (max_size && outpos > max_size)
 400                                break;
 401                        out = realloc(out, outsize);
 402                        if (!out) {
 403                                free(tmp);
 404                                return NULL;
 405                        }
 406                }
 407        }
 408
 409        if (inscnt)
 410                out[outpos - inscnt - 1] = inscnt;
 411
 412        if (max_size && outpos > max_size) {
 413                free(out);
 414                return NULL;
 415        }
 416
 417        *delta_size = outpos;
 418        return out;
 419}