diff-delta.con commit Merge branch 'master' into next (061ad5f)
   1/*
   2 * diff-delta.c: generate a delta between two buffers
   3 *
   4 *  Many parts of this file have been lifted from LibXDiff version 0.10.
   5 *  http://www.xmailserver.org/xdiff-lib.html
   6 *
   7 *  LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
   8 *  Copyright (C) 2003  Davide Libenzi
   9 *
  10 *  Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
  11 *
  12 *  This file is free software; you can redistribute it and/or
  13 *  modify it under the terms of the GNU Lesser General Public
  14 *  License as published by the Free Software Foundation; either
  15 *  version 2.1 of the License, or (at your option) any later version.
  16 *
  17 *  Use of this within git automatically means that the LGPL
  18 *  licensing gets turned into GPLv2 within this project.
  19 */
  20
  21#include <stdlib.h>
  22#include <string.h>
  23#include "delta.h"
  24
  25
  26struct index {
  27        const unsigned char *ptr;
  28        struct index *next;
  29};
  30
  31static struct index ** delta_index(const unsigned char *buf,
  32                                   unsigned long bufsize,
  33                                   unsigned long trg_bufsize)
  34{
  35        unsigned long hsize;
  36        unsigned int i, hshift, hlimit, *hash_count;
  37        const unsigned char *data;
  38        struct index *entry, **hash;
  39        void *mem;
  40
  41        /* determine index hash size */
  42        hsize = bufsize / 4;
  43        for (i = 8; (1 << i) < hsize && i < 24; i += 2);
  44        hsize = 1 << i;
  45        hshift = (i - 8) / 2;
  46
  47        /*
  48         * Allocate lookup index.  Note the first hash pointer
  49         * is used to store the hash shift value.
  50         */
  51        mem = malloc((1 + hsize) * sizeof(*hash) + bufsize * sizeof(*entry));
  52        if (!mem)
  53                return NULL;
  54        hash = mem;
  55        *hash++ = (void *)hshift;
  56        entry = mem + (1 + hsize) * sizeof(*hash);
  57        memset(hash, 0, hsize * sizeof(*hash));
  58
  59        /* allocate an array to count hash entries */
  60        hash_count = calloc(hsize, sizeof(*hash_count));
  61        if (!hash_count) {
  62                free(hash);
  63                return NULL;
  64        }
  65
  66        /* then populate the index */
  67        data = buf + bufsize - 2;
  68        while (data > buf) {
  69                entry->ptr = --data;
  70                i = data[0] ^ ((data[1] ^ (data[2] << hshift)) << hshift);
  71                entry->next = hash[i];
  72                hash[i] = entry++;
  73                hash_count[i]++;
  74        }
  75
  76        /*
  77         * Determine a limit on the number of entries in the same hash
  78         * bucket.  This guard us against patological data sets causing
  79         * really bad hash distribution with most entries in the same hash
  80         * bucket that would bring us to O(m*n) computing costs (m and n
  81         * corresponding to reference and target buffer sizes).
  82         *
  83         * The more the target buffer is large, the more it is important to
  84         * have small entry lists for each hash buckets.  With such a limit
  85         * the cost is bounded to something more like O(m+n).
  86         */
  87        hlimit = (1 << 26) / trg_bufsize;
  88        if (hlimit < 16)
  89                hlimit = 16;
  90
  91        /*
  92         * Now make sure none of the hash buckets has more entries than
  93         * we're willing to test.  Otherwise we cull the entry list to
  94         * limit identical three byte prefixes to still preserve a good
  95         * repartition across the reference buffer.
  96         */
  97        for (i = 0; i < hsize; i++) {
  98                struct index **list, *bucket, *remaining;
  99                int cnt;
 100                if (hash_count[i] < hlimit)
 101                        continue;
 102
 103                bucket = NULL;
 104                list = &bucket;
 105                remaining = hash[i];
 106                cnt = 0;
 107                while (cnt < hlimit && remaining) {
 108                        struct index *this = remaining, *that;
 109                        remaining = remaining->next;
 110                        for (that = bucket; that; that = that->next) {
 111                                if (!memcmp(that->ptr, this->ptr, 3))
 112                                        break;
 113                        }
 114                        if (that)
 115                                continue; /* discard */
 116                        cnt++;
 117                        *list = this;
 118                        list = &(this->next);
 119                        this->next = NULL;
 120                }
 121                hash[i] = bucket;
 122        }
 123        free(hash_count);
 124
 125        return hash-1;
 126}
 127
 128/* provide the size of the copy opcode given the block offset and size */
 129#define COPYOP_SIZE(o, s) \
 130    (!!(o & 0xff) + !!(o & 0xff00) + !!(o & 0xff0000) + !!(o & 0xff000000) + \
 131     !!(s & 0xff) + !!(s & 0xff00) + 1)
 132
 133/* the maximum size for any opcode */
 134#define MAX_OP_SIZE COPYOP_SIZE(0xffffffff, 0xffffffff)
 135
 136void *diff_delta(void *from_buf, unsigned long from_size,
 137                 void *to_buf, unsigned long to_size,
 138                 unsigned long *delta_size,
 139                 unsigned long max_size,
 140                 void **from_index)
 141{
 142        unsigned int i, outpos, outsize, inscnt, hash_shift;
 143        const unsigned char *ref_data, *ref_top, *data, *top;
 144        unsigned char *out;
 145        struct index *entry, **hash;
 146
 147        if (!from_size || !to_size)
 148                return NULL;
 149        if (from_index && *from_index) {
 150                hash = *from_index;
 151        } else {
 152                hash = delta_index(from_buf, from_size, to_size);
 153                if (!hash)
 154                        return NULL;
 155                if (from_index)
 156                        *from_index = hash;
 157        }
 158        hash_shift = (unsigned int)(*hash++);
 159
 160        outpos = 0;
 161        outsize = 8192;
 162        if (max_size && outsize >= max_size)
 163                outsize = max_size + MAX_OP_SIZE + 1;
 164        out = malloc(outsize);
 165        if (!out) {
 166                if (!from_index)
 167                        free(hash-1);
 168                return NULL;
 169        }
 170
 171        ref_data = from_buf;
 172        ref_top = from_buf + from_size;
 173        data = to_buf;
 174        top = to_buf + to_size;
 175
 176        /* store reference buffer size */
 177        out[outpos++] = from_size;
 178        from_size >>= 7;
 179        while (from_size) {
 180                out[outpos - 1] |= 0x80;
 181                out[outpos++] = from_size;
 182                from_size >>= 7;
 183        }
 184
 185        /* store target buffer size */
 186        out[outpos++] = to_size;
 187        to_size >>= 7;
 188        while (to_size) {
 189                out[outpos - 1] |= 0x80;
 190                out[outpos++] = to_size;
 191                to_size >>= 7;
 192        }
 193
 194        inscnt = 0;
 195
 196        while (data < top) {
 197                unsigned int moff = 0, msize = 0;
 198                if (data + 3 <= top) {
 199                        i = data[0] ^ ((data[1] ^ (data[2] << hash_shift)) << hash_shift);
 200                        for (entry = hash[i]; entry; entry = entry->next) {
 201                                const unsigned char *ref = entry->ptr;
 202                                const unsigned char *src = data;
 203                                unsigned int ref_size = ref_top - ref;
 204                                if (ref_size > top - src)
 205                                        ref_size = top - src;
 206                                if (ref_size > 0x10000)
 207                                        ref_size = 0x10000;
 208                                if (ref_size <= msize)
 209                                        break;
 210                                if (*ref != *src)
 211                                        continue;
 212                                while (ref_size-- && *++src == *++ref);
 213                                if (msize < ref - entry->ptr) {
 214                                        /* this is our best match so far */
 215                                        msize = ref - entry->ptr;
 216                                        moff = entry->ptr - ref_data;
 217                                }
 218                        }
 219                }
 220
 221                if (!msize || msize < COPYOP_SIZE(moff, msize)) {
 222                        if (!inscnt)
 223                                outpos++;
 224                        out[outpos++] = *data++;
 225                        inscnt++;
 226                        if (inscnt == 0x7f) {
 227                                out[outpos - inscnt - 1] = inscnt;
 228                                inscnt = 0;
 229                        }
 230                } else {
 231                        unsigned char *op;
 232
 233                        if (inscnt) {
 234                                out[outpos - inscnt - 1] = inscnt;
 235                                inscnt = 0;
 236                        }
 237
 238                        data += msize;
 239                        op = out + outpos++;
 240                        i = 0x80;
 241
 242                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; }
 243                        moff >>= 8;
 244                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; }
 245                        moff >>= 8;
 246                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; }
 247                        moff >>= 8;
 248                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; }
 249
 250                        if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; }
 251                        msize >>= 8;
 252                        if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; }
 253
 254                        *op = i;
 255                }
 256
 257                if (outpos >= outsize - MAX_OP_SIZE) {
 258                        void *tmp = out;
 259                        outsize = outsize * 3 / 2;
 260                        if (max_size && outsize >= max_size)
 261                                outsize = max_size + MAX_OP_SIZE + 1;
 262                        if (max_size && outpos > max_size)
 263                                out = NULL;
 264                        else
 265                                out = realloc(out, outsize);
 266                        if (!out) {
 267                                free(tmp);
 268                                if (!from_index)
 269                                        free(hash-1);
 270                                return NULL;
 271                        }
 272                }
 273        }
 274
 275        if (inscnt)
 276                out[outpos - inscnt - 1] = inscnt;
 277
 278        if (!from_index)
 279                free(hash-1);
 280        *delta_size = outpos;
 281        return out;
 282}