diff-delta.con commit Merge branch 'jc/gitpm' (69de8cc)
   1/*
   2 * diff-delta.c: generate a delta between two buffers
   3 *
   4 *  Many parts of this file have been lifted from LibXDiff version 0.10.
   5 *  http://www.xmailserver.org/xdiff-lib.html
   6 *
   7 *  LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
   8 *  Copyright (C) 2003  Davide Libenzi
   9 *
  10 *  Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
  11 *
  12 *  This file is free software; you can redistribute it and/or
  13 *  modify it under the terms of the GNU Lesser General Public
  14 *  License as published by the Free Software Foundation; either
  15 *  version 2.1 of the License, or (at your option) any later version.
  16 *
  17 *  Use of this within git automatically means that the LGPL
  18 *  licensing gets turned into GPLv2 within this project.
  19 */
  20
  21#include <stdlib.h>
  22#include <string.h>
  23#include "delta.h"
  24
  25#include "git-compat-util.h"
  26
  27/* maximum hash entry list for the same hash bucket */
  28#define HASH_LIMIT 64
  29
  30#define RABIN_SHIFT 23
  31#define RABIN_WINDOW 16
  32
  33static const unsigned int T[256] = {
  34        0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
  35        0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
  36        0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
  37        0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
  38        0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
  39        0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
  40        0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
  41        0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
  42        0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
  43        0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
  44        0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
  45        0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
  46        0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
  47        0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
  48        0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
  49        0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
  50        0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
  51        0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
  52        0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
  53        0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
  54        0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
  55        0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
  56        0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
  57        0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
  58        0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
  59        0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
  60        0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
  61        0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
  62        0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
  63        0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
  64        0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
  65        0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
  66        0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
  67        0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
  68        0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
  69        0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
  70        0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
  71        0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
  72        0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
  73        0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
  74        0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
  75        0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
  76        0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
  77};
  78
  79static const unsigned int U[256] = {
  80        0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
  81        0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
  82        0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
  83        0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
  84        0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
  85        0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
  86        0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
  87        0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
  88        0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
  89        0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
  90        0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
  91        0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
  92        0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
  93        0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
  94        0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
  95        0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
  96        0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
  97        0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
  98        0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
  99        0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
 100        0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
 101        0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
 102        0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
 103        0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
 104        0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
 105        0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
 106        0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
 107        0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
 108        0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
 109        0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
 110        0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
 111        0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
 112        0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
 113        0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
 114        0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
 115        0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
 116        0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
 117        0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
 118        0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
 119        0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
 120        0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
 121        0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
 122        0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
 123};
 124
 125struct index_entry {
 126        const unsigned char *ptr;
 127        unsigned int val;
 128        struct index_entry *next;
 129};
 130
 131struct delta_index {
 132        const void *src_buf;
 133        unsigned long src_size;
 134        unsigned int hash_mask;
 135        struct index_entry *hash[FLEX_ARRAY];
 136};
 137
 138struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 139{
 140        unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
 141        const unsigned char *data, *buffer = buf;
 142        struct delta_index *index;
 143        struct index_entry *entry, **hash;
 144        void *mem;
 145        unsigned long memsize;
 146
 147        if (!buf || !bufsize)
 148                return NULL;
 149
 150        /* Determine index hash size.  Note that indexing skips the
 151           first byte to allow for optimizing the Rabin's polynomial
 152           initialization in create_delta(). */
 153        entries = (bufsize - 1)  / RABIN_WINDOW;
 154        hsize = entries / 4;
 155        for (i = 4; (1u << i) < hsize && i < 31; i++);
 156        hsize = 1 << i;
 157        hmask = hsize - 1;
 158
 159        /* allocate lookup index */
 160        memsize = sizeof(*index) +
 161                  sizeof(*hash) * hsize +
 162                  sizeof(*entry) * entries;
 163        mem = malloc(memsize);
 164        if (!mem)
 165                return NULL;
 166        index = mem;
 167        mem = index + 1;
 168        hash = mem;
 169        mem = hash + hsize;
 170        entry = mem;
 171
 172        index->src_buf = buf;
 173        index->src_size = bufsize;
 174        index->hash_mask = hmask;
 175        memset(hash, 0, hsize * sizeof(*hash));
 176
 177        /* allocate an array to count hash entries */
 178        hash_count = calloc(hsize, sizeof(*hash_count));
 179        if (!hash_count) {
 180                free(index);
 181                return NULL;
 182        }
 183
 184        /* then populate the index */
 185        prev_val = ~0;
 186        for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
 187             data >= buffer;
 188             data -= RABIN_WINDOW) {
 189                unsigned int val = 0;
 190                for (i = 1; i <= RABIN_WINDOW; i++)
 191                        val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
 192                if (val == prev_val) {
 193                        /* keep the lowest of consecutive identical blocks */
 194                        entry[-1].ptr = data + RABIN_WINDOW;
 195                } else {
 196                        prev_val = val;
 197                        i = val & hmask;
 198                        entry->ptr = data + RABIN_WINDOW;
 199                        entry->val = val;
 200                        entry->next = hash[i];
 201                        hash[i] = entry++;
 202                        hash_count[i]++;
 203                }
 204        }
 205
 206        /*
 207         * Determine a limit on the number of entries in the same hash
 208         * bucket.  This guards us against pathological data sets causing
 209         * really bad hash distribution with most entries in the same hash
 210         * bucket that would bring us to O(m*n) computing costs (m and n
 211         * corresponding to reference and target buffer sizes).
 212         *
 213         * Make sure none of the hash buckets has more entries than
 214         * we're willing to test.  Otherwise we cull the entry list
 215         * uniformly to still preserve a good repartition across
 216         * the reference buffer.
 217         */
 218        for (i = 0; i < hsize; i++) {
 219                if (hash_count[i] < HASH_LIMIT)
 220                        continue;
 221                entry = hash[i];
 222                do {
 223                        struct index_entry *keep = entry;
 224                        int skip = hash_count[i] / HASH_LIMIT / 2;
 225                        do {
 226                                entry = entry->next;
 227                        } while(--skip && entry);
 228                        keep->next = entry;
 229                } while(entry);
 230        }
 231        free(hash_count);
 232
 233        return index;
 234}
 235
 236void free_delta_index(struct delta_index *index)
 237{
 238        free(index);
 239}
 240
 241/*
 242 * The maximum size for any opcode sequence, including the initial header
 243 * plus Rabin window plus biggest copy.
 244 */
 245#define MAX_OP_SIZE     (5 + 5 + 1 + RABIN_WINDOW + 7)
 246
 247void *
 248create_delta(const struct delta_index *index,
 249             const void *trg_buf, unsigned long trg_size,
 250             unsigned long *delta_size, unsigned long max_size)
 251{
 252        unsigned int i, outpos, outsize, val;
 253        int inscnt;
 254        const unsigned char *ref_data, *ref_top, *data, *top;
 255        unsigned char *out;
 256
 257        if (!trg_buf || !trg_size)
 258                return NULL;
 259
 260        outpos = 0;
 261        outsize = 8192;
 262        if (max_size && outsize >= max_size)
 263                outsize = max_size + MAX_OP_SIZE + 1;
 264        out = malloc(outsize);
 265        if (!out)
 266                return NULL;
 267
 268        /* store reference buffer size */
 269        i = index->src_size;
 270        while (i >= 0x80) {
 271                out[outpos++] = i | 0x80;
 272                i >>= 7;
 273        }
 274        out[outpos++] = i;
 275
 276        /* store target buffer size */
 277        i = trg_size;
 278        while (i >= 0x80) {
 279                out[outpos++] = i | 0x80;
 280                i >>= 7;
 281        }
 282        out[outpos++] = i;
 283
 284        ref_data = index->src_buf;
 285        ref_top = ref_data + index->src_size;
 286        data = trg_buf;
 287        top = (const unsigned char *) trg_buf + trg_size;
 288
 289        outpos++;
 290        val = 0;
 291        for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
 292                out[outpos++] = *data;
 293                val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 294        }
 295        inscnt = i;
 296
 297        while (data < top) {
 298                unsigned int moff = 0, msize = 0;
 299                struct index_entry *entry;
 300                val ^= U[data[-RABIN_WINDOW]];
 301                val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 302                i = val & index->hash_mask;
 303                for (entry = index->hash[i]; entry; entry = entry->next) {
 304                        const unsigned char *ref = entry->ptr;
 305                        const unsigned char *src = data;
 306                        unsigned int ref_size = ref_top - ref;
 307                        if (entry->val != val)
 308                                continue;
 309                        if (ref_size > top - src)
 310                                ref_size = top - src;
 311                        if (ref_size > 0xffffff)
 312                                ref_size = 0xffffff;
 313                        if (ref_size <= msize)
 314                                break;
 315                        while (ref_size-- && *src++ == *ref)
 316                                ref++;
 317                        if (msize < ref - entry->ptr) {
 318                                /* this is our best match so far */
 319                                msize = ref - entry->ptr;
 320                                moff = entry->ptr - ref_data;
 321                                if (msize >= 0x10000)
 322                                        break;  /* this is good enough */
 323                        }
 324                }
 325
 326                if (msize < 4) {
 327                        if (!inscnt)
 328                                outpos++;
 329                        out[outpos++] = *data++;
 330                        inscnt++;
 331                        if (inscnt == 0x7f) {
 332                                out[outpos - inscnt - 1] = inscnt;
 333                                inscnt = 0;
 334                        }
 335                } else {
 336                        unsigned char *op;
 337
 338                        if (msize >= RABIN_WINDOW) {
 339                                const unsigned char *sk;
 340                                sk = data + msize - RABIN_WINDOW;
 341                                val = 0;
 342                                for (i = 0; i < RABIN_WINDOW; i++)
 343                                        val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
 344                        } else {
 345                                const unsigned char *sk = data + 1;
 346                                for (i = 1; i < msize; i++) {
 347                                        val ^= U[sk[-RABIN_WINDOW]];
 348                                        val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
 349                                }
 350                        }
 351
 352                        if (inscnt) {
 353                                while (moff && ref_data[moff-1] == data[-1]) {
 354                                        if (msize == 0x10000)
 355                                                break;
 356                                        /* we can match one byte back */
 357                                        msize++;
 358                                        moff--;
 359                                        data--;
 360                                        outpos--;
 361                                        if (--inscnt)
 362                                                continue;
 363                                        outpos--;  /* remove count slot */
 364                                        inscnt--;  /* make it -1 */
 365                                        break;
 366                                }
 367                                out[outpos - inscnt - 1] = inscnt;
 368                                inscnt = 0;
 369                        }
 370
 371                        data += msize;
 372                        op = out + outpos++;
 373                        i = 0x80;
 374
 375                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; }
 376                        moff >>= 8;
 377                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; }
 378                        moff >>= 8;
 379                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; }
 380                        moff >>= 8;
 381                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; }
 382
 383                        if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; }
 384                        msize >>= 8;
 385                        if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; }
 386                        msize >>= 8;
 387                        if (msize & 0xff) { out[outpos++] = msize; i |= 0x40; }
 388
 389                        *op = i;
 390                }
 391
 392                if (outpos >= outsize - MAX_OP_SIZE) {
 393                        void *tmp = out;
 394                        outsize = outsize * 3 / 2;
 395                        if (max_size && outsize >= max_size)
 396                                outsize = max_size + MAX_OP_SIZE + 1;
 397                        if (max_size && outpos > max_size)
 398                                break;
 399                        out = xrealloc(out, outsize);
 400                        if (!out) {
 401                                free(tmp);
 402                                return NULL;
 403                        }
 404                }
 405        }
 406
 407        if (inscnt)
 408                out[outpos - inscnt - 1] = inscnt;
 409
 410        if (max_size && outpos > max_size) {
 411                free(out);
 412                return NULL;
 413        }
 414
 415        *delta_size = outpos;
 416        return out;
 417}