diff-delta.con commit Merge branch 'jc/dirwalk-n-cache-tree' into jc/cache-tree (7d55561)
   1/*
   2 * diff-delta.c: generate a delta between two buffers
   3 *
   4 *  Many parts of this file have been lifted from LibXDiff version 0.10.
   5 *  http://www.xmailserver.org/xdiff-lib.html
   6 *
   7 *  LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
   8 *  Copyright (C) 2003  Davide Libenzi
   9 *
  10 *  Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
  11 *
  12 *  This file is free software; you can redistribute it and/or
  13 *  modify it under the terms of the GNU Lesser General Public
  14 *  License as published by the Free Software Foundation; either
  15 *  version 2.1 of the License, or (at your option) any later version.
  16 *
  17 *  Use of this within git automatically means that the LGPL
  18 *  licensing gets turned into GPLv2 within this project.
  19 */
  20
  21#include <stdlib.h>
  22#include <string.h>
  23#include "delta.h"
  24
  25
  26/* maximum hash entry list for the same hash bucket */
  27#define HASH_LIMIT 64
  28
  29#define RABIN_SHIFT 23
  30#define RABIN_WINDOW 16
  31
  32static const unsigned int T[256] = {
  33        0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
  34        0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
  35        0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
  36        0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
  37        0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
  38        0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
  39        0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
  40        0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
  41        0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
  42        0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
  43        0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
  44        0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
  45        0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
  46        0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
  47        0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
  48        0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
  49        0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
  50        0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
  51        0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
  52        0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
  53        0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
  54        0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
  55        0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
  56        0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
  57        0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
  58        0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
  59        0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
  60        0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
  61        0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
  62        0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
  63        0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
  64        0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
  65        0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
  66        0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
  67        0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
  68        0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
  69        0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
  70        0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
  71        0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
  72        0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
  73        0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
  74        0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
  75        0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
  76};
  77
  78static const unsigned int U[256] = {
  79        0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
  80        0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
  81        0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
  82        0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
  83        0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
  84        0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
  85        0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
  86        0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
  87        0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
  88        0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
  89        0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
  90        0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
  91        0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
  92        0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
  93        0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
  94        0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
  95        0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
  96        0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
  97        0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
  98        0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
  99        0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
 100        0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
 101        0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
 102        0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
 103        0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
 104        0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
 105        0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
 106        0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
 107        0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
 108        0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
 109        0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
 110        0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
 111        0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
 112        0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
 113        0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
 114        0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
 115        0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
 116        0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
 117        0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
 118        0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
 119        0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
 120        0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
 121        0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
 122};
 123
 124struct index_entry {
 125        const unsigned char *ptr;
 126        unsigned int val;
 127        struct index_entry *next;
 128};
 129
 130struct delta_index {
 131        const void *src_buf;
 132        unsigned long src_size;
 133        unsigned int hash_mask;
 134        struct index_entry *hash[0];
 135};
 136
 137struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 138{
 139        unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
 140        const unsigned char *data, *buffer = buf;
 141        struct delta_index *index;
 142        struct index_entry *entry, **hash;
 143        void *mem;
 144        unsigned long memsize;
 145
 146        if (!buf || !bufsize)
 147                return NULL;
 148
 149        /* Determine index hash size.  Note that indexing skips the
 150           first byte to allow for optimizing the rabin polynomial
 151           initialization in create_delta(). */
 152        entries = (bufsize - 1)  / RABIN_WINDOW;
 153        hsize = entries / 4;
 154        for (i = 4; (1 << i) < hsize && i < 31; i++);
 155        hsize = 1 << i;
 156        hmask = hsize - 1;
 157
 158        /* allocate lookup index */
 159        memsize = sizeof(*index) +
 160                  sizeof(*hash) * hsize +
 161                  sizeof(*entry) * entries;
 162        mem = malloc(memsize);
 163        if (!mem)
 164                return NULL;
 165        index = mem;
 166        mem = index + 1;
 167        hash = mem;
 168        mem = hash + hsize;
 169        entry = mem;
 170
 171        index->src_buf = buf;
 172        index->src_size = bufsize;
 173        index->hash_mask = hmask;
 174        memset(hash, 0, hsize * sizeof(*hash));
 175
 176        /* allocate an array to count hash entries */
 177        hash_count = calloc(hsize, sizeof(*hash_count));
 178        if (!hash_count) {
 179                free(index);
 180                return NULL;
 181        }
 182
 183        /* then populate the index */
 184        prev_val = ~0;
 185        for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
 186             data >= buffer;
 187             data -= RABIN_WINDOW) {
 188                unsigned int val = 0;
 189                for (i = 1; i <= RABIN_WINDOW; i++)
 190                        val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
 191                if (val == prev_val) {
 192                        /* keep the lowest of consecutive identical blocks */
 193                        entry[-1].ptr = data + RABIN_WINDOW;
 194                } else {
 195                        prev_val = val;
 196                        i = val & hmask;
 197                        entry->ptr = data + RABIN_WINDOW;
 198                        entry->val = val;
 199                        entry->next = hash[i];
 200                        hash[i] = entry++;
 201                        hash_count[i]++;
 202                }
 203        }
 204
 205        /*
 206         * Determine a limit on the number of entries in the same hash
 207         * bucket.  This guard us against patological data sets causing
 208         * really bad hash distribution with most entries in the same hash
 209         * bucket that would bring us to O(m*n) computing costs (m and n
 210         * corresponding to reference and target buffer sizes).
 211         *
 212         * Make sure none of the hash buckets has more entries than
 213         * we're willing to test.  Otherwise we cull the entry list
 214         * uniformly to still preserve a good repartition across
 215         * the reference buffer.
 216         */
 217        for (i = 0; i < hsize; i++) {
 218                if (hash_count[i] < HASH_LIMIT)
 219                        continue;
 220                entry = hash[i];
 221                do {
 222                        struct index_entry *keep = entry;
 223                        int skip = hash_count[i] / HASH_LIMIT / 2;
 224                        do {
 225                                entry = entry->next;
 226                        } while(--skip && entry);
 227                        keep->next = entry;
 228                } while(entry);
 229        }
 230        free(hash_count);
 231
 232        return index;
 233}
 234
 235void free_delta_index(struct delta_index *index)
 236{
 237        free(index);
 238}
 239
 240/*
 241 * The maximum size for any opcode sequence, including the initial header
 242 * plus rabin window plus biggest copy.
 243 */
 244#define MAX_OP_SIZE     (5 + 5 + 1 + RABIN_WINDOW + 7)
 245
 246void *
 247create_delta(const struct delta_index *index,
 248             const void *trg_buf, unsigned long trg_size,
 249             unsigned long *delta_size, unsigned long max_size)
 250{
 251        unsigned int i, outpos, outsize, val;
 252        int inscnt;
 253        const unsigned char *ref_data, *ref_top, *data, *top;
 254        unsigned char *out;
 255
 256        if (!trg_buf || !trg_size)
 257                return NULL;
 258
 259        outpos = 0;
 260        outsize = 8192;
 261        if (max_size && outsize >= max_size)
 262                outsize = max_size + MAX_OP_SIZE + 1;
 263        out = malloc(outsize);
 264        if (!out)
 265                return NULL;
 266
 267        /* store reference buffer size */
 268        i = index->src_size;
 269        while (i >= 0x80) {
 270                out[outpos++] = i | 0x80;
 271                i >>= 7;
 272        }
 273        out[outpos++] = i;
 274
 275        /* store target buffer size */
 276        i = trg_size;
 277        while (i >= 0x80) {
 278                out[outpos++] = i | 0x80;
 279                i >>= 7;
 280        }
 281        out[outpos++] = i;
 282
 283        ref_data = index->src_buf;
 284        ref_top = ref_data + index->src_size;
 285        data = trg_buf;
 286        top = trg_buf + trg_size;
 287
 288        outpos++;
 289        val = 0;
 290        for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
 291                out[outpos++] = *data;
 292                val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 293        }
 294        inscnt = i;
 295
 296        while (data < top) {
 297                unsigned int moff = 0, msize = 0;
 298                struct index_entry *entry;
 299                val ^= U[data[-RABIN_WINDOW]];
 300                val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 301                i = val & index->hash_mask;
 302                for (entry = index->hash[i]; entry; entry = entry->next) {
 303                        const unsigned char *ref = entry->ptr;
 304                        const unsigned char *src = data;
 305                        unsigned int ref_size = ref_top - ref;
 306                        if (entry->val != val)
 307                                continue;
 308                        if (ref_size > top - src)
 309                                ref_size = top - src;
 310                        if (ref_size > 0x10000)
 311                                ref_size = 0x10000;
 312                        if (ref_size <= msize)
 313                                break;
 314                        while (ref_size-- && *src++ == *ref)
 315                                ref++;
 316                        if (msize < ref - entry->ptr) {
 317                                /* this is our best match so far */
 318                                msize = ref - entry->ptr;
 319                                moff = entry->ptr - ref_data;
 320                        }
 321                }
 322
 323                if (msize < 4) {
 324                        if (!inscnt)
 325                                outpos++;
 326                        out[outpos++] = *data++;
 327                        inscnt++;
 328                        if (inscnt == 0x7f) {
 329                                out[outpos - inscnt - 1] = inscnt;
 330                                inscnt = 0;
 331                        }
 332                } else {
 333                        unsigned char *op;
 334
 335                        if (msize >= RABIN_WINDOW) {
 336                                const unsigned char *sk;
 337                                sk = data + msize - RABIN_WINDOW;
 338                                val = 0;
 339                                for (i = 0; i < RABIN_WINDOW; i++)
 340                                        val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
 341                        } else {
 342                                const unsigned char *sk = data + 1;
 343                                for (i = 1; i < msize; i++) {
 344                                        val ^= U[sk[-RABIN_WINDOW]];
 345                                        val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
 346                                }
 347                        }
 348
 349                        if (inscnt) {
 350                                while (moff && ref_data[moff-1] == data[-1]) {
 351                                        if (msize == 0x10000)
 352                                                break;
 353                                        /* we can match one byte back */
 354                                        msize++;
 355                                        moff--;
 356                                        data--;
 357                                        outpos--;
 358                                        if (--inscnt)
 359                                                continue;
 360                                        outpos--;  /* remove count slot */
 361                                        inscnt--;  /* make it -1 */
 362                                        break;
 363                                }
 364                                out[outpos - inscnt - 1] = inscnt;
 365                                inscnt = 0;
 366                        }
 367
 368                        data += msize;
 369                        op = out + outpos++;
 370                        i = 0x80;
 371
 372                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; }
 373                        moff >>= 8;
 374                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; }
 375                        moff >>= 8;
 376                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; }
 377                        moff >>= 8;
 378                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; }
 379
 380                        if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; }
 381                        msize >>= 8;
 382                        if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; }
 383
 384                        *op = i;
 385                }
 386
 387                if (outpos >= outsize - MAX_OP_SIZE) {
 388                        void *tmp = out;
 389                        outsize = outsize * 3 / 2;
 390                        if (max_size && outsize >= max_size)
 391                                outsize = max_size + MAX_OP_SIZE + 1;
 392                        if (max_size && outpos > max_size)
 393                                break;
 394                        out = realloc(out, outsize);
 395                        if (!out) {
 396                                free(tmp);
 397                                return NULL;
 398                        }
 399                }
 400        }
 401
 402        if (inscnt)
 403                out[outpos - inscnt - 1] = inscnt;
 404
 405        if (max_size && outpos > max_size) {
 406                free(out);
 407                return NULL;
 408        }
 409
 410        *delta_size = outpos;
 411        return out;
 412}