diff-delta.con commit Merge branch 'jc/grep' into next (5d02840)
   1/*
   2 * diff-delta.c: generate a delta between two buffers
   3 *
   4 *  Many parts of this file have been lifted from LibXDiff version 0.10.
   5 *  http://www.xmailserver.org/xdiff-lib.html
   6 *
   7 *  LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
   8 *  Copyright (C) 2003  Davide Libenzi
   9 *
  10 *  Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
  11 *
  12 *  This file is free software; you can redistribute it and/or
  13 *  modify it under the terms of the GNU Lesser General Public
  14 *  License as published by the Free Software Foundation; either
  15 *  version 2.1 of the License, or (at your option) any later version.
  16 *
  17 *  Use of this within git automatically means that the LGPL
  18 *  licensing gets turned into GPLv2 within this project.
  19 */
  20
  21#include <stdlib.h>
  22#include <string.h>
  23#include "delta.h"
  24
  25
  26/* maximum hash entry list for the same hash bucket */
  27#define HASH_LIMIT 64
  28
  29#define RABIN_SHIFT 23
  30#define RABIN_WINDOW 16
  31
  32static const unsigned int T[256] = {
  33        0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
  34        0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
  35        0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
  36        0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
  37        0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
  38        0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
  39        0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
  40        0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
  41        0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
  42        0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
  43        0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
  44        0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
  45        0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
  46        0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
  47        0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
  48        0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
  49        0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
  50        0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
  51        0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
  52        0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
  53        0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
  54        0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
  55        0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
  56        0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
  57        0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
  58        0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
  59        0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
  60        0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
  61        0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
  62        0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
  63        0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
  64        0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
  65        0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
  66        0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
  67        0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
  68        0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
  69        0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
  70        0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
  71        0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
  72        0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
  73        0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
  74        0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
  75        0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
  76};
  77
  78static const unsigned int U[256] = {
  79        0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
  80        0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
  81        0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
  82        0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
  83        0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
  84        0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
  85        0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
  86        0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
  87        0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
  88        0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
  89        0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
  90        0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
  91        0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
  92        0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
  93        0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
  94        0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
  95        0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
  96        0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
  97        0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
  98        0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
  99        0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
 100        0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
 101        0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
 102        0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
 103        0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
 104        0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
 105        0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
 106        0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
 107        0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
 108        0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
 109        0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
 110        0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
 111        0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
 112        0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
 113        0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
 114        0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
 115        0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
 116        0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
 117        0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
 118        0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
 119        0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
 120        0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
 121        0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
 122};
 123
 124struct index_entry {
 125        const unsigned char *ptr;
 126        unsigned int val;
 127        struct index_entry *next;
 128};
 129
 130struct delta_index {
 131        const void *src_buf;
 132        unsigned long src_size;
 133        unsigned int hash_mask;
 134        struct index_entry *hash[0];
 135};
 136
 137struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 138{
 139        unsigned int i, hsize, hmask, entries, *hash_count;
 140        const unsigned char *data, *buffer = buf;
 141        struct delta_index *index;
 142        struct index_entry *entry, **hash;
 143        void *mem;
 144
 145        if (!buf || !bufsize)
 146                return NULL;
 147
 148        /* Determine index hash size.  Note that indexing skips the
 149           first byte to allow for optimizing the rabin polynomial
 150           initialization in create_delta(). */
 151        entries = (bufsize - 1)  / RABIN_WINDOW;
 152        hsize = entries / 4;
 153        for (i = 4; (1 << i) < hsize && i < 31; i++);
 154        hsize = 1 << i;
 155        hmask = hsize - 1;
 156
 157        /* allocate lookup index */
 158        mem = malloc(sizeof(*index) +
 159                     sizeof(*hash) * hsize +
 160                     sizeof(*entry) * entries);
 161        if (!mem)
 162                return NULL;
 163        index = mem;
 164        mem = index + 1;
 165        hash = mem;
 166        mem = hash + hsize;
 167        entry = mem;
 168
 169        index->src_buf = buf;
 170        index->src_size = bufsize;
 171        index->hash_mask = hmask;
 172        memset(hash, 0, hsize * sizeof(*hash));
 173
 174        /* allocate an array to count hash entries */
 175        hash_count = calloc(hsize, sizeof(*hash_count));
 176        if (!hash_count) {
 177                free(index);
 178                return NULL;
 179        }
 180
 181        /* then populate the index */
 182        data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
 183        while (data >= buffer) {
 184                unsigned int val = 0;
 185                for (i = 1; i <= RABIN_WINDOW; i++)
 186                        val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
 187                i = val & hmask;
 188                entry->ptr = data + RABIN_WINDOW;
 189                entry->val = val;
 190                entry->next = hash[i];
 191                hash[i] = entry++;
 192                hash_count[i]++;
 193                data -= RABIN_WINDOW;
 194        }
 195
 196        /*
 197         * Determine a limit on the number of entries in the same hash
 198         * bucket.  This guard us against patological data sets causing
 199         * really bad hash distribution with most entries in the same hash
 200         * bucket that would bring us to O(m*n) computing costs (m and n
 201         * corresponding to reference and target buffer sizes).
 202         *
 203         * Make sure none of the hash buckets has more entries than
 204         * we're willing to test.  Otherwise we cull the entry list
 205         * uniformly to still preserve a good repartition across
 206         * the reference buffer.
 207         */
 208        for (i = 0; i < hsize; i++) {
 209                if (hash_count[i] < HASH_LIMIT)
 210                        continue;
 211                entry = hash[i];
 212                do {
 213                        struct index_entry *keep = entry;
 214                        int skip = hash_count[i] / HASH_LIMIT / 2;
 215                        do {
 216                                entry = entry->next;
 217                        } while(--skip && entry);
 218                        keep->next = entry;
 219                } while(entry);
 220        }
 221        free(hash_count);
 222
 223        return index;
 224}
 225
 226void free_delta_index(struct delta_index *index)
 227{
 228        free(index);
 229}
 230
 231/*
 232 * The maximum size for any opcode sequence, including the initial header
 233 * plus rabin window plus biggest copy.
 234 */
 235#define MAX_OP_SIZE     (5 + 5 + 1 + RABIN_WINDOW + 7)
 236
 237void *
 238create_delta(const struct delta_index *index,
 239             const void *trg_buf, unsigned long trg_size,
 240             unsigned long *delta_size, unsigned long max_size)
 241{
 242        unsigned int i, outpos, outsize, hash_mask, val;
 243        int inscnt;
 244        const unsigned char *ref_data, *ref_top, *data, *top;
 245        unsigned char *out;
 246
 247        if (!trg_buf || !trg_size)
 248                return NULL;
 249
 250        outpos = 0;
 251        outsize = 8192;
 252        if (max_size && outsize >= max_size)
 253                outsize = max_size + MAX_OP_SIZE + 1;
 254        out = malloc(outsize);
 255        if (!out)
 256                return NULL;
 257
 258        /* store reference buffer size */
 259        i = index->src_size;
 260        while (i >= 0x80) {
 261                out[outpos++] = i | 0x80;
 262                i >>= 7;
 263        }
 264        out[outpos++] = i;
 265
 266        /* store target buffer size */
 267        i = trg_size;
 268        while (i >= 0x80) {
 269                out[outpos++] = i | 0x80;
 270                i >>= 7;
 271        }
 272        out[outpos++] = i;
 273
 274        ref_data = index->src_buf;
 275        ref_top = ref_data + index->src_size;
 276        data = trg_buf;
 277        top = trg_buf + trg_size;
 278        hash_mask = index->hash_mask;
 279
 280        outpos++;
 281        val = 0;
 282        for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
 283                out[outpos++] = *data;
 284                val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 285        }
 286        inscnt = i;
 287
 288        while (data < top) {
 289                unsigned int moff = 0, msize = 0;
 290                struct index_entry *entry;
 291                val ^= U[data[-RABIN_WINDOW]];
 292                val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 293                i = val & hash_mask;
 294                for (entry = index->hash[i]; entry; entry = entry->next) {
 295                        const unsigned char *ref = entry->ptr;
 296                        const unsigned char *src = data;
 297                        unsigned int ref_size = ref_top - ref;
 298                        if (entry->val != val)
 299                                continue;
 300                        if (ref_size > top - src)
 301                                ref_size = top - src;
 302                        if (ref_size > 0x10000)
 303                                ref_size = 0x10000;
 304                        if (ref_size <= msize)
 305                                break;
 306                        while (ref_size-- && *src++ == *ref)
 307                                ref++;
 308                        if (msize < ref - entry->ptr) {
 309                                /* this is our best match so far */
 310                                msize = ref - entry->ptr;
 311                                moff = entry->ptr - ref_data;
 312                        }
 313                }
 314
 315                if (msize < 4) {
 316                        if (!inscnt)
 317                                outpos++;
 318                        out[outpos++] = *data++;
 319                        inscnt++;
 320                        if (inscnt == 0x7f) {
 321                                out[outpos - inscnt - 1] = inscnt;
 322                                inscnt = 0;
 323                        }
 324                } else {
 325                        unsigned char *op;
 326
 327                        if (msize >= RABIN_WINDOW) {
 328                                const unsigned char *sk;
 329                                sk = data + msize - RABIN_WINDOW;
 330                                val = 0;
 331                                for (i = 0; i < RABIN_WINDOW; i++)
 332                                        val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
 333                        } else {
 334                                const unsigned char *sk = data + 1;
 335                                for (i = 1; i < msize; i++) {
 336                                        val ^= U[sk[-RABIN_WINDOW]];
 337                                        val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
 338                                }
 339                        }
 340
 341                        if (inscnt) {
 342                                while (moff && ref_data[moff-1] == data[-1]) {
 343                                        if (msize == 0x10000)
 344                                                break;
 345                                        /* we can match one byte back */
 346                                        msize++;
 347                                        moff--;
 348                                        data--;
 349                                        outpos--;
 350                                        if (--inscnt)
 351                                                continue;
 352                                        outpos--;  /* remove count slot */
 353                                        inscnt--;  /* make it -1 */
 354                                        break;
 355                                }
 356                                out[outpos - inscnt - 1] = inscnt;
 357                                inscnt = 0;
 358                        }
 359
 360                        data += msize;
 361                        op = out + outpos++;
 362                        i = 0x80;
 363
 364                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; }
 365                        moff >>= 8;
 366                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; }
 367                        moff >>= 8;
 368                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; }
 369                        moff >>= 8;
 370                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; }
 371
 372                        if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; }
 373                        msize >>= 8;
 374                        if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; }
 375
 376                        *op = i;
 377                }
 378
 379                if (outpos >= outsize - MAX_OP_SIZE) {
 380                        void *tmp = out;
 381                        outsize = outsize * 3 / 2;
 382                        if (max_size && outsize >= max_size)
 383                                outsize = max_size + MAX_OP_SIZE + 1;
 384                        if (max_size && outpos > max_size)
 385                                break;
 386                        out = realloc(out, outsize);
 387                        if (!out) {
 388                                free(tmp);
 389                                return NULL;
 390                        }
 391                }
 392        }
 393
 394        if (inscnt)
 395                out[outpos - inscnt - 1] = inscnt;
 396
 397        if (max_size && outpos > max_size) {
 398                free(out);
 399                return NULL;
 400        }
 401
 402        *delta_size = outpos;
 403        return out;
 404}