diff-delta.con commit Merge branch 'th/diff-extra' (f3c5b39)
   1/*
   2 * diff-delta.c: generate a delta between two buffers
   3 *
   4 *  Many parts of this file have been lifted from LibXDiff version 0.10.
   5 *  http://www.xmailserver.org/xdiff-lib.html
   6 *
   7 *  LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
   8 *  Copyright (C) 2003  Davide Libenzi
   9 *
  10 *  Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
  11 *
  12 *  This file is free software; you can redistribute it and/or
  13 *  modify it under the terms of the GNU Lesser General Public
  14 *  License as published by the Free Software Foundation; either
  15 *  version 2.1 of the License, or (at your option) any later version.
  16 *
  17 *  Use of this within git automatically means that the LGPL
  18 *  licensing gets turned into GPLv2 within this project.
  19 */
  20
  21#include <stdlib.h>
  22#include <string.h>
  23#include "delta.h"
  24
  25#include "git-compat-util.h"
  26
  27/* maximum hash entry list for the same hash bucket */
  28#define HASH_LIMIT 64
  29
  30#define RABIN_SHIFT 23
  31#define RABIN_WINDOW 16
  32
  33static const unsigned int T[256] = {
  34        0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
  35        0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
  36        0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
  37        0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
  38        0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
  39        0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
  40        0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
  41        0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
  42        0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
  43        0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
  44        0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
  45        0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
  46        0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
  47        0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
  48        0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
  49        0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
  50        0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
  51        0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
  52        0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
  53        0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
  54        0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
  55        0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
  56        0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
  57        0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
  58        0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
  59        0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
  60        0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
  61        0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
  62        0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
  63        0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
  64        0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
  65        0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
  66        0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
  67        0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
  68        0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
  69        0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
  70        0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
  71        0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
  72        0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
  73        0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
  74        0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
  75        0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
  76        0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
  77};
  78
  79static const unsigned int U[256] = {
  80        0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
  81        0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
  82        0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
  83        0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
  84        0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
  85        0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
  86        0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
  87        0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
  88        0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
  89        0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
  90        0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
  91        0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
  92        0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
  93        0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
  94        0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
  95        0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
  96        0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
  97        0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
  98        0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
  99        0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
 100        0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
 101        0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
 102        0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
 103        0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
 104        0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
 105        0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
 106        0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
 107        0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
 108        0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
 109        0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
 110        0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
 111        0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
 112        0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
 113        0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
 114        0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
 115        0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
 116        0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
 117        0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
 118        0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
 119        0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
 120        0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
 121        0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
 122        0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
 123};
 124
 125struct index_entry {
 126        const unsigned char *ptr;
 127        unsigned int val;
 128        struct index_entry *next;
 129};
 130
 131struct delta_index {
 132        const void *src_buf;
 133        unsigned long src_size;
 134        unsigned int hash_mask;
 135        struct index_entry *hash[FLEX_ARRAY];
 136};
 137
 138struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 139{
 140        unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
 141        const unsigned char *data, *buffer = buf;
 142        struct delta_index *index;
 143        struct index_entry *entry, **hash;
 144        void *mem;
 145        unsigned long memsize;
 146
 147        if (!buf || !bufsize)
 148                return NULL;
 149
 150        /* Determine index hash size.  Note that indexing skips the
 151           first byte to allow for optimizing the Rabin's polynomial
 152           initialization in create_delta(). */
 153        entries = (bufsize - 1)  / RABIN_WINDOW;
 154        hsize = entries / 4;
 155        for (i = 4; (1 << i) < hsize && i < 31; i++);
 156        hsize = 1 << i;
 157        hmask = hsize - 1;
 158
 159        /* allocate lookup index */
 160        memsize = sizeof(*index) +
 161                  sizeof(*hash) * hsize +
 162                  sizeof(*entry) * entries;
 163        mem = malloc(memsize);
 164        if (!mem)
 165                return NULL;
 166        index = mem;
 167        mem = index + 1;
 168        hash = mem;
 169        mem = hash + hsize;
 170        entry = mem;
 171
 172        index->src_buf = buf;
 173        index->src_size = bufsize;
 174        index->hash_mask = hmask;
 175        memset(hash, 0, hsize * sizeof(*hash));
 176
 177        /* allocate an array to count hash entries */
 178        hash_count = calloc(hsize, sizeof(*hash_count));
 179        if (!hash_count) {
 180                free(index);
 181                return NULL;
 182        }
 183
 184        /* then populate the index */
 185        prev_val = ~0;
 186        for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
 187             data >= buffer;
 188             data -= RABIN_WINDOW) {
 189                unsigned int val = 0;
 190                for (i = 1; i <= RABIN_WINDOW; i++)
 191                        val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
 192                if (val == prev_val) {
 193                        /* keep the lowest of consecutive identical blocks */
 194                        entry[-1].ptr = data + RABIN_WINDOW;
 195                } else {
 196                        prev_val = val;
 197                        i = val & hmask;
 198                        entry->ptr = data + RABIN_WINDOW;
 199                        entry->val = val;
 200                        entry->next = hash[i];
 201                        hash[i] = entry++;
 202                        hash_count[i]++;
 203                }
 204        }
 205
 206        /*
 207         * Determine a limit on the number of entries in the same hash
 208         * bucket.  This guards us against pathological data sets causing
 209         * really bad hash distribution with most entries in the same hash
 210         * bucket that would bring us to O(m*n) computing costs (m and n
 211         * corresponding to reference and target buffer sizes).
 212         *
 213         * Make sure none of the hash buckets has more entries than
 214         * we're willing to test.  Otherwise we cull the entry list
 215         * uniformly to still preserve a good repartition across
 216         * the reference buffer.
 217         */
 218        for (i = 0; i < hsize; i++) {
 219                if (hash_count[i] < HASH_LIMIT)
 220                        continue;
 221                entry = hash[i];
 222                do {
 223                        struct index_entry *keep = entry;
 224                        int skip = hash_count[i] / HASH_LIMIT / 2;
 225                        do {
 226                                entry = entry->next;
 227                        } while(--skip && entry);
 228                        keep->next = entry;
 229                } while(entry);
 230        }
 231        free(hash_count);
 232
 233        return index;
 234}
 235
 236void free_delta_index(struct delta_index *index)
 237{
 238        free(index);
 239}
 240
 241/*
 242 * The maximum size for any opcode sequence, including the initial header
 243 * plus Rabin window plus biggest copy.
 244 */
 245#define MAX_OP_SIZE     (5 + 5 + 1 + RABIN_WINDOW + 7)
 246
 247void *
 248create_delta(const struct delta_index *index,
 249             const void *trg_buf, unsigned long trg_size,
 250             unsigned long *delta_size, unsigned long max_size)
 251{
 252        unsigned int i, outpos, outsize, val;
 253        int inscnt;
 254        const unsigned char *ref_data, *ref_top, *data, *top;
 255        unsigned char *out;
 256
 257        if (!trg_buf || !trg_size)
 258                return NULL;
 259
 260        outpos = 0;
 261        outsize = 8192;
 262        if (max_size && outsize >= max_size)
 263                outsize = max_size + MAX_OP_SIZE + 1;
 264        out = malloc(outsize);
 265        if (!out)
 266                return NULL;
 267
 268        /* store reference buffer size */
 269        i = index->src_size;
 270        while (i >= 0x80) {
 271                out[outpos++] = i | 0x80;
 272                i >>= 7;
 273        }
 274        out[outpos++] = i;
 275
 276        /* store target buffer size */
 277        i = trg_size;
 278        while (i >= 0x80) {
 279                out[outpos++] = i | 0x80;
 280                i >>= 7;
 281        }
 282        out[outpos++] = i;
 283
 284        ref_data = index->src_buf;
 285        ref_top = ref_data + index->src_size;
 286        data = trg_buf;
 287        top = (const unsigned char *) trg_buf + trg_size;
 288
 289        outpos++;
 290        val = 0;
 291        for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
 292                out[outpos++] = *data;
 293                val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 294        }
 295        inscnt = i;
 296
 297        while (data < top) {
 298                unsigned int moff = 0, msize = 0;
 299                struct index_entry *entry;
 300                val ^= U[data[-RABIN_WINDOW]];
 301                val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 302                i = val & index->hash_mask;
 303                for (entry = index->hash[i]; entry; entry = entry->next) {
 304                        const unsigned char *ref = entry->ptr;
 305                        const unsigned char *src = data;
 306                        unsigned int ref_size = ref_top - ref;
 307                        if (entry->val != val)
 308                                continue;
 309                        if (ref_size > top - src)
 310                                ref_size = top - src;
 311                        if (ref_size > 0x10000)
 312                                ref_size = 0x10000;
 313                        if (ref_size <= msize)
 314                                break;
 315                        while (ref_size-- && *src++ == *ref)
 316                                ref++;
 317                        if (msize < ref - entry->ptr) {
 318                                /* this is our best match so far */
 319                                msize = ref - entry->ptr;
 320                                moff = entry->ptr - ref_data;
 321                        }
 322                }
 323
 324                if (msize < 4) {
 325                        if (!inscnt)
 326                                outpos++;
 327                        out[outpos++] = *data++;
 328                        inscnt++;
 329                        if (inscnt == 0x7f) {
 330                                out[outpos - inscnt - 1] = inscnt;
 331                                inscnt = 0;
 332                        }
 333                } else {
 334                        unsigned char *op;
 335
 336                        if (msize >= RABIN_WINDOW) {
 337                                const unsigned char *sk;
 338                                sk = data + msize - RABIN_WINDOW;
 339                                val = 0;
 340                                for (i = 0; i < RABIN_WINDOW; i++)
 341                                        val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
 342                        } else {
 343                                const unsigned char *sk = data + 1;
 344                                for (i = 1; i < msize; i++) {
 345                                        val ^= U[sk[-RABIN_WINDOW]];
 346                                        val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
 347                                }
 348                        }
 349
 350                        if (inscnt) {
 351                                while (moff && ref_data[moff-1] == data[-1]) {
 352                                        if (msize == 0x10000)
 353                                                break;
 354                                        /* we can match one byte back */
 355                                        msize++;
 356                                        moff--;
 357                                        data--;
 358                                        outpos--;
 359                                        if (--inscnt)
 360                                                continue;
 361                                        outpos--;  /* remove count slot */
 362                                        inscnt--;  /* make it -1 */
 363                                        break;
 364                                }
 365                                out[outpos - inscnt - 1] = inscnt;
 366                                inscnt = 0;
 367                        }
 368
 369                        data += msize;
 370                        op = out + outpos++;
 371                        i = 0x80;
 372
 373                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; }
 374                        moff >>= 8;
 375                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; }
 376                        moff >>= 8;
 377                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; }
 378                        moff >>= 8;
 379                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; }
 380
 381                        if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; }
 382                        msize >>= 8;
 383                        if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; }
 384
 385                        *op = i;
 386                }
 387
 388                if (outpos >= outsize - MAX_OP_SIZE) {
 389                        void *tmp = out;
 390                        outsize = outsize * 3 / 2;
 391                        if (max_size && outsize >= max_size)
 392                                outsize = max_size + MAX_OP_SIZE + 1;
 393                        if (max_size && outpos > max_size)
 394                                break;
 395                        out = realloc(out, outsize);
 396                        if (!out) {
 397                                free(tmp);
 398                                return NULL;
 399                        }
 400                }
 401        }
 402
 403        if (inscnt)
 404                out[outpos - inscnt - 1] = inscnt;
 405
 406        if (max_size && outpos > max_size) {
 407                free(out);
 408                return NULL;
 409        }
 410
 411        *delta_size = outpos;
 412        return out;
 413}