diff-delta.con commit Merge branch 'jc/grep' into next (07c747e)
   1/*
   2 * diff-delta.c: generate a delta between two buffers
   3 *
   4 *  Many parts of this file have been lifted from LibXDiff version 0.10.
   5 *  http://www.xmailserver.org/xdiff-lib.html
   6 *
   7 *  LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
   8 *  Copyright (C) 2003  Davide Libenzi
   9 *
  10 *  Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
  11 *
  12 *  This file is free software; you can redistribute it and/or
  13 *  modify it under the terms of the GNU Lesser General Public
  14 *  License as published by the Free Software Foundation; either
  15 *  version 2.1 of the License, or (at your option) any later version.
  16 *
  17 *  Use of this within git automatically means that the LGPL
  18 *  licensing gets turned into GPLv2 within this project.
  19 */
  20
  21#include <stdlib.h>
  22#include <string.h>
  23#include "delta.h"
  24
  25
  26/* maximum hash entry list for the same hash bucket */
  27#define HASH_LIMIT 64
  28
  29#define RABIN_SHIFT 23
  30#define RABIN_WINDOW 16
  31
  32static const unsigned int T[256] = {
  33        0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
  34        0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
  35        0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
  36        0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
  37        0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
  38        0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
  39        0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
  40        0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
  41        0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
  42        0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
  43        0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
  44        0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
  45        0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
  46        0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
  47        0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
  48        0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
  49        0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
  50        0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
  51        0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
  52        0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
  53        0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
  54        0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
  55        0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
  56        0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
  57        0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
  58        0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
  59        0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
  60        0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
  61        0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
  62        0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
  63        0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
  64        0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
  65        0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
  66        0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
  67        0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
  68        0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
  69        0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
  70        0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
  71        0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
  72        0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
  73        0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
  74        0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
  75        0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
  76};
  77
  78static const unsigned int U[256] = {
  79        0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
  80        0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
  81        0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
  82        0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
  83        0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
  84        0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
  85        0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
  86        0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
  87        0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
  88        0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
  89        0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
  90        0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
  91        0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
  92        0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
  93        0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
  94        0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
  95        0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
  96        0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
  97        0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
  98        0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
  99        0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
 100        0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
 101        0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
 102        0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
 103        0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
 104        0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
 105        0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
 106        0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
 107        0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
 108        0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
 109        0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
 110        0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
 111        0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
 112        0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
 113        0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
 114        0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
 115        0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
 116        0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
 117        0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
 118        0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
 119        0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
 120        0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
 121        0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
 122};
 123
 124struct index_entry {
 125        const unsigned char *ptr;
 126        unsigned int val;
 127        struct index_entry *next;
 128};
 129
 130struct delta_index {
 131        const void *src_buf;
 132        unsigned long src_size;
 133        unsigned int hash_mask;
 134        struct index_entry *hash[0];
 135};
 136
 137struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 138{
 139        unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
 140        const unsigned char *data, *buffer = buf;
 141        struct delta_index *index;
 142        struct index_entry *entry, **hash;
 143        void *mem;
 144        unsigned long memsize;
 145
 146        if (!buf || !bufsize)
 147                return NULL;
 148
 149        /* Determine index hash size.  Note that indexing skips the
 150           first byte to allow for optimizing the rabin polynomial
 151           initialization in create_delta(). */
 152        entries = (bufsize - 1)  / RABIN_WINDOW;
 153        hsize = entries / 4;
 154        for (i = 4; (1 << i) < hsize && i < 31; i++);
 155        hsize = 1 << i;
 156        hmask = hsize - 1;
 157
 158        /* allocate lookup index */
 159        memsize = sizeof(*index) +
 160                  sizeof(*hash) * hsize +
 161                  sizeof(*entry) * entries;
 162        mem = malloc(memsize);
 163        if (!mem)
 164                return NULL;
 165        index = mem;
 166        mem = index + 1;
 167        hash = mem;
 168        mem = hash + hsize;
 169        entry = mem;
 170
 171        index->src_buf = buf;
 172        index->src_size = bufsize;
 173        index->hash_mask = hmask;
 174        memset(hash, 0, hsize * sizeof(*hash));
 175
 176        /* allocate an array to count hash entries */
 177        hash_count = calloc(hsize, sizeof(*hash_count));
 178        if (!hash_count) {
 179                free(index);
 180                return NULL;
 181        }
 182
 183        /* then populate the index */
 184        prev_val = ~0;
 185        for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
 186             data >= buffer;
 187             data -= RABIN_WINDOW) {
 188                unsigned int val = 0;
 189                for (i = 1; i <= RABIN_WINDOW; i++)
 190                        val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
 191                if (val == prev_val) {
 192                        /* keep the lowest of consecutive identical blocks */
 193                        entry[-1].ptr = data + RABIN_WINDOW;
 194                } else {
 195                        prev_val = val;
 196                        i = val & hmask;
 197                        entry->ptr = data + RABIN_WINDOW;
 198                        entry->val = val;
 199                        entry->next = hash[i];
 200                        hash[i] = entry++;
 201                        hash_count[i]++;
 202                        entries--;
 203                }
 204        }
 205
 206        /*
 207         * Determine a limit on the number of entries in the same hash
 208         * bucket.  This guard us against patological data sets causing
 209         * really bad hash distribution with most entries in the same hash
 210         * bucket that would bring us to O(m*n) computing costs (m and n
 211         * corresponding to reference and target buffer sizes).
 212         *
 213         * Make sure none of the hash buckets has more entries than
 214         * we're willing to test.  Otherwise we cull the entry list
 215         * uniformly to still preserve a good repartition across
 216         * the reference buffer.
 217         */
 218        for (i = 0; i < hsize; i++) {
 219                if (hash_count[i] < HASH_LIMIT)
 220                        continue;
 221                entry = hash[i];
 222                do {
 223                        struct index_entry *keep = entry;
 224                        int skip = hash_count[i] / HASH_LIMIT / 2;
 225                        do {
 226                                entry = entry->next;
 227                        } while(--skip && entry);
 228                        keep->next = entry;
 229                } while(entry);
 230        }
 231        free(hash_count);
 232
 233        /* If we didn't use all hash entries, free the unused memory. */
 234        if (entries)
 235                index = realloc(index, memsize - entries * sizeof(*entry));
 236
 237        return index;
 238}
 239
 240void free_delta_index(struct delta_index *index)
 241{
 242        free(index);
 243}
 244
 245/*
 246 * The maximum size for any opcode sequence, including the initial header
 247 * plus rabin window plus biggest copy.
 248 */
 249#define MAX_OP_SIZE     (5 + 5 + 1 + RABIN_WINDOW + 7)
 250
 251void *
 252create_delta(const struct delta_index *index,
 253             const void *trg_buf, unsigned long trg_size,
 254             unsigned long *delta_size, unsigned long max_size)
 255{
 256        unsigned int i, outpos, outsize, val;
 257        int inscnt;
 258        const unsigned char *ref_data, *ref_top, *data, *top;
 259        unsigned char *out;
 260
 261        if (!trg_buf || !trg_size)
 262                return NULL;
 263
 264        outpos = 0;
 265        outsize = 8192;
 266        if (max_size && outsize >= max_size)
 267                outsize = max_size + MAX_OP_SIZE + 1;
 268        out = malloc(outsize);
 269        if (!out)
 270                return NULL;
 271
 272        /* store reference buffer size */
 273        i = index->src_size;
 274        while (i >= 0x80) {
 275                out[outpos++] = i | 0x80;
 276                i >>= 7;
 277        }
 278        out[outpos++] = i;
 279
 280        /* store target buffer size */
 281        i = trg_size;
 282        while (i >= 0x80) {
 283                out[outpos++] = i | 0x80;
 284                i >>= 7;
 285        }
 286        out[outpos++] = i;
 287
 288        ref_data = index->src_buf;
 289        ref_top = ref_data + index->src_size;
 290        data = trg_buf;
 291        top = trg_buf + trg_size;
 292
 293        outpos++;
 294        val = 0;
 295        for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
 296                out[outpos++] = *data;
 297                val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 298        }
 299        inscnt = i;
 300
 301        while (data < top) {
 302                unsigned int moff = 0, msize = 0;
 303                struct index_entry *entry;
 304                val ^= U[data[-RABIN_WINDOW]];
 305                val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 306                i = val & index->hash_mask;
 307                for (entry = index->hash[i]; entry; entry = entry->next) {
 308                        const unsigned char *ref = entry->ptr;
 309                        const unsigned char *src = data;
 310                        unsigned int ref_size = ref_top - ref;
 311                        if (entry->val != val)
 312                                continue;
 313                        if (ref_size > top - src)
 314                                ref_size = top - src;
 315                        if (ref_size > 0x10000)
 316                                ref_size = 0x10000;
 317                        if (ref_size <= msize)
 318                                break;
 319                        while (ref_size-- && *src++ == *ref)
 320                                ref++;
 321                        if (msize < ref - entry->ptr) {
 322                                /* this is our best match so far */
 323                                msize = ref - entry->ptr;
 324                                moff = entry->ptr - ref_data;
 325                        }
 326                }
 327
 328                if (msize < 4) {
 329                        if (!inscnt)
 330                                outpos++;
 331                        out[outpos++] = *data++;
 332                        inscnt++;
 333                        if (inscnt == 0x7f) {
 334                                out[outpos - inscnt - 1] = inscnt;
 335                                inscnt = 0;
 336                        }
 337                } else {
 338                        unsigned char *op;
 339
 340                        if (msize >= RABIN_WINDOW) {
 341                                const unsigned char *sk;
 342                                sk = data + msize - RABIN_WINDOW;
 343                                val = 0;
 344                                for (i = 0; i < RABIN_WINDOW; i++)
 345                                        val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
 346                        } else {
 347                                const unsigned char *sk = data + 1;
 348                                for (i = 1; i < msize; i++) {
 349                                        val ^= U[sk[-RABIN_WINDOW]];
 350                                        val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
 351                                }
 352                        }
 353
 354                        if (inscnt) {
 355                                while (moff && ref_data[moff-1] == data[-1]) {
 356                                        if (msize == 0x10000)
 357                                                break;
 358                                        /* we can match one byte back */
 359                                        msize++;
 360                                        moff--;
 361                                        data--;
 362                                        outpos--;
 363                                        if (--inscnt)
 364                                                continue;
 365                                        outpos--;  /* remove count slot */
 366                                        inscnt--;  /* make it -1 */
 367                                        break;
 368                                }
 369                                out[outpos - inscnt - 1] = inscnt;
 370                                inscnt = 0;
 371                        }
 372
 373                        data += msize;
 374                        op = out + outpos++;
 375                        i = 0x80;
 376
 377                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; }
 378                        moff >>= 8;
 379                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; }
 380                        moff >>= 8;
 381                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; }
 382                        moff >>= 8;
 383                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; }
 384
 385                        if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; }
 386                        msize >>= 8;
 387                        if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; }
 388
 389                        *op = i;
 390                }
 391
 392                if (outpos >= outsize - MAX_OP_SIZE) {
 393                        void *tmp = out;
 394                        outsize = outsize * 3 / 2;
 395                        if (max_size && outsize >= max_size)
 396                                outsize = max_size + MAX_OP_SIZE + 1;
 397                        if (max_size && outpos > max_size)
 398                                break;
 399                        out = realloc(out, outsize);
 400                        if (!out) {
 401                                free(tmp);
 402                                return NULL;
 403                        }
 404                }
 405        }
 406
 407        if (inscnt)
 408                out[outpos - inscnt - 1] = inscnt;
 409
 410        if (max_size && outpos > max_size) {
 411                free(out);
 412                return NULL;
 413        }
 414
 415        *delta_size = outpos;
 416        return out;
 417}