diff-delta.con commit diff-delta: allow reusing of the reference buffer index (38fd072)
   1/*
   2 * diff-delta.c: generate a delta between two buffers
   3 *
   4 *  Many parts of this file have been lifted from LibXDiff version 0.10.
   5 *  http://www.xmailserver.org/xdiff-lib.html
   6 *
   7 *  LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
   8 *  Copyright (C) 2003  Davide Libenzi
   9 *
  10 *  Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
  11 *
  12 *  This file is free software; you can redistribute it and/or
  13 *  modify it under the terms of the GNU Lesser General Public
  14 *  License as published by the Free Software Foundation; either
  15 *  version 2.1 of the License, or (at your option) any later version.
  16 *
  17 *  Use of this within git automatically means that the LGPL
  18 *  licensing gets turned into GPLv2 within this project.
  19 */
  20
  21#include <stdlib.h>
  22#include <string.h>
  23#include "delta.h"
  24
  25
  26struct index {
  27        const unsigned char *ptr;
  28        struct index *next;
  29};
  30
  31static struct index ** delta_index(const unsigned char *buf,
  32                                   unsigned long bufsize,
  33                                   unsigned long trg_bufsize)
  34{
  35        unsigned long hsize;
  36        unsigned int i, hshift, hlimit, *hash_count;
  37        const unsigned char *data;
  38        struct index *entry, **hash;
  39        void *mem;
  40
  41        /* determine index hash size */
  42        hsize = bufsize / 4;
  43        for (i = 8; (1 << i) < hsize && i < 24; i += 2);
  44        hsize = 1 << i;
  45        hshift = (i - 8) / 2;
  46
  47        /*
  48         * Allocate lookup index.  Note the first hash pointer
  49         * is used to store the hash shift value.
  50         */
  51        mem = malloc((1 + hsize) * sizeof(*hash) + bufsize * sizeof(*entry));
  52        if (!mem)
  53                return NULL;
  54        hash = mem;
  55        *hash++ = (void *)hshift;
  56        entry = mem + (1 + hsize) * sizeof(*hash);
  57        memset(hash, 0, hsize * sizeof(*hash));
  58
  59        /* allocate an array to count hash entries */
  60        hash_count = calloc(hsize, sizeof(*hash_count));
  61        if (!hash_count) {
  62                free(hash);
  63                return NULL;
  64        }
  65
  66        /* then populate the index */
  67        data = buf + bufsize - 2;
  68        while (data > buf) {
  69                entry->ptr = --data;
  70                i = data[0] ^ ((data[1] ^ (data[2] << hshift)) << hshift);
  71                entry->next = hash[i];
  72                hash[i] = entry++;
  73                hash_count[i]++;
  74        }
  75
  76        /*
  77         * Determine a limit on the number of entries in the same hash
  78         * bucket.  This guard us against patological data sets causing
  79         * really bad hash distribution with most entries in the same hash
  80         * bucket that would bring us to O(m*n) computing costs (m and n
  81         * corresponding to reference and target buffer sizes).
  82         *
  83         * The more the target buffer is large, the more it is important to
  84         * have small entry lists for each hash buckets.  With such a limit
  85         * the cost is bounded to something more like O(m+n).
  86         */
  87        hlimit = (1 << 26) / trg_bufsize;
  88        if (hlimit < 16)
  89                hlimit = 16;
  90
  91        /*
  92         * Now make sure none of the hash buckets has more entries than
  93         * we're willing to test.  Otherwise we short-circuit the entry
  94         * list uniformly to still preserve a good repartition across
  95         * the reference buffer.
  96         */
  97        for (i = 0; i < hsize; i++) {
  98                if (hash_count[i] < hlimit)
  99                        continue;
 100                entry = hash[i];
 101                do {
 102                        struct index *keep = entry;
 103                        int skip = hash_count[i] / hlimit / 2;
 104                        do {
 105                                entry = entry->next;
 106                        } while(--skip && entry);
 107                        keep->next = entry;
 108                } while(entry);
 109        }
 110        free(hash_count);
 111
 112        return hash-1;
 113}
 114
 115/* provide the size of the copy opcode given the block offset and size */
 116#define COPYOP_SIZE(o, s) \
 117    (!!(o & 0xff) + !!(o & 0xff00) + !!(o & 0xff0000) + !!(o & 0xff000000) + \
 118     !!(s & 0xff) + !!(s & 0xff00) + 1)
 119
 120/* the maximum size for any opcode */
 121#define MAX_OP_SIZE COPYOP_SIZE(0xffffffff, 0xffffffff)
 122
 123void *diff_delta(void *from_buf, unsigned long from_size,
 124                 void *to_buf, unsigned long to_size,
 125                 unsigned long *delta_size,
 126                 unsigned long max_size,
 127                 void **from_index)
 128{
 129        unsigned int i, outpos, outsize, inscnt, hash_shift;
 130        const unsigned char *ref_data, *ref_top, *data, *top;
 131        unsigned char *out;
 132        struct index *entry, **hash;
 133
 134        if (!from_size || !to_size)
 135                return NULL;
 136        if (from_index && *from_index) {
 137                hash = *from_index;
 138        } else {
 139                hash = delta_index(from_buf, from_size, to_size);
 140                if (!hash)
 141                        return NULL;
 142                if (from_index)
 143                        *from_index = hash;
 144        }
 145        hash_shift = (unsigned int)(*hash++);
 146
 147        outpos = 0;
 148        outsize = 8192;
 149        if (max_size && outsize >= max_size)
 150                outsize = max_size + MAX_OP_SIZE + 1;
 151        out = malloc(outsize);
 152        if (!out) {
 153                if (!from_index)
 154                        free(hash-1);
 155                return NULL;
 156        }
 157
 158        ref_data = from_buf;
 159        ref_top = from_buf + from_size;
 160        data = to_buf;
 161        top = to_buf + to_size;
 162
 163        /* store reference buffer size */
 164        out[outpos++] = from_size;
 165        from_size >>= 7;
 166        while (from_size) {
 167                out[outpos - 1] |= 0x80;
 168                out[outpos++] = from_size;
 169                from_size >>= 7;
 170        }
 171
 172        /* store target buffer size */
 173        out[outpos++] = to_size;
 174        to_size >>= 7;
 175        while (to_size) {
 176                out[outpos - 1] |= 0x80;
 177                out[outpos++] = to_size;
 178                to_size >>= 7;
 179        }
 180
 181        inscnt = 0;
 182
 183        while (data < top) {
 184                unsigned int moff = 0, msize = 0;
 185                if (data + 3 <= top) {
 186                        i = data[0] ^ ((data[1] ^ (data[2] << hash_shift)) << hash_shift);
 187                        for (entry = hash[i]; entry; entry = entry->next) {
 188                                const unsigned char *ref = entry->ptr;
 189                                const unsigned char *src = data;
 190                                unsigned int ref_size = ref_top - ref;
 191                                if (ref_size > top - src)
 192                                        ref_size = top - src;
 193                                if (ref_size > 0x10000)
 194                                        ref_size = 0x10000;
 195                                if (ref_size <= msize)
 196                                        break;
 197                                if (*ref != *src)
 198                                        continue;
 199                                while (ref_size-- && *++src == *++ref);
 200                                if (msize < ref - entry->ptr) {
 201                                        /* this is our best match so far */
 202                                        msize = ref - entry->ptr;
 203                                        moff = entry->ptr - ref_data;
 204                                }
 205                        }
 206                }
 207
 208                if (!msize || msize < COPYOP_SIZE(moff, msize)) {
 209                        if (!inscnt)
 210                                outpos++;
 211                        out[outpos++] = *data++;
 212                        inscnt++;
 213                        if (inscnt == 0x7f) {
 214                                out[outpos - inscnt - 1] = inscnt;
 215                                inscnt = 0;
 216                        }
 217                } else {
 218                        unsigned char *op;
 219
 220                        if (inscnt) {
 221                                out[outpos - inscnt - 1] = inscnt;
 222                                inscnt = 0;
 223                        }
 224
 225                        data += msize;
 226                        op = out + outpos++;
 227                        i = 0x80;
 228
 229                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; }
 230                        moff >>= 8;
 231                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; }
 232                        moff >>= 8;
 233                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; }
 234                        moff >>= 8;
 235                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; }
 236
 237                        if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; }
 238                        msize >>= 8;
 239                        if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; }
 240
 241                        *op = i;
 242                }
 243
 244                if (outpos >= outsize - MAX_OP_SIZE) {
 245                        void *tmp = out;
 246                        outsize = outsize * 3 / 2;
 247                        if (max_size && outsize >= max_size)
 248                                outsize = max_size + MAX_OP_SIZE + 1;
 249                        if (max_size && outpos > max_size)
 250                                out = NULL;
 251                        else
 252                                out = realloc(out, outsize);
 253                        if (!out) {
 254                                free(tmp);
 255                                if (!from_index)
 256                                        free(hash-1);
 257                                return NULL;
 258                        }
 259                }
 260        }
 261
 262        if (inscnt)
 263                out[outpos - inscnt - 1] = inscnt;
 264
 265        if (!from_index)
 266                free(hash-1);
 267        *delta_size = outpos;
 268        return out;
 269}