0730b24df8fe679d16e1cd03d066b4d7f33e73cd
   1/*
   2 * diff-delta.c: generate a delta between two buffers
   3 *
   4 *  Many parts of this file have been lifted from LibXDiff version 0.10.
   5 *  http://www.xmailserver.org/xdiff-lib.html
   6 *
   7 *  LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
   8 *  Copyright (C) 2003  Davide Libenzi
   9 *
  10 *  Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
  11 *
  12 *  This file is free software; you can redistribute it and/or
  13 *  modify it under the terms of the GNU Lesser General Public
  14 *  License as published by the Free Software Foundation; either
  15 *  version 2.1 of the License, or (at your option) any later version.
  16 *
  17 *  Use of this within git automatically means that the LGPL
  18 *  licensing gets turned into GPLv2 within this project.
  19 */
  20
  21#include <stdlib.h>
  22#include <string.h>
  23#include "delta.h"
  24
  25
  26struct index {
  27        const unsigned char *ptr;
  28        struct index *next;
  29};
  30
  31static struct index ** delta_index(const unsigned char *buf,
  32                                   unsigned long bufsize,
  33                                   unsigned long trg_bufsize,
  34                                   unsigned int *hash_shift)
  35{
  36        unsigned long hsize;
  37        unsigned int i, hshift, hlimit, *hash_count;
  38        const unsigned char *data;
  39        struct index *entry, **hash;
  40        void *mem;
  41
  42        /* determine index hash size */
  43        hsize = bufsize / 4;
  44        for (i = 8; (1 << i) < hsize && i < 24; i += 2);
  45        hsize = 1 << i;
  46        hshift = (i - 8) / 2;
  47        *hash_shift = hshift;
  48
  49        /* allocate lookup index */
  50        mem = malloc(hsize * sizeof(*hash) + bufsize * sizeof(*entry));
  51        if (!mem)
  52                return NULL;
  53        hash = mem;
  54        entry = mem + hsize * sizeof(*hash);
  55        memset(hash, 0, hsize * sizeof(*hash));
  56
  57        /* allocate an array to count hash entries */
  58        hash_count = calloc(hsize, sizeof(*hash_count));
  59        if (!hash_count) {
  60                free(hash);
  61                return NULL;
  62        }
  63
  64        /* then populate the index */
  65        data = buf + bufsize - 2;
  66        while (data > buf) {
  67                entry->ptr = --data;
  68                i = data[0] ^ ((data[1] ^ (data[2] << hshift)) << hshift);
  69                entry->next = hash[i];
  70                hash[i] = entry++;
  71                hash_count[i]++;
  72        }
  73
  74        /*
  75         * Determine a limit on the number of entries in the same hash
  76         * bucket.  This guard us against patological data sets causing
  77         * really bad hash distribution with most entries in the same hash
  78         * bucket that would bring us to O(m*n) computing costs (m and n
  79         * corresponding to reference and target buffer sizes).
  80         *
  81         * The more the target buffer is large, the more it is important to
  82         * have small entry lists for each hash buckets.  With such a limit
  83         * the cost is bounded to something more like O(m+n).
  84         */
  85        hlimit = (1 << 26) / trg_bufsize;
  86        if (hlimit < 16)
  87                hlimit = 16;
  88
  89        /*
  90         * Now make sure none of the hash buckets has more entries than
  91         * we're willing to test.  Otherwise we short-circuit the entry
  92         * list uniformly to still preserve a good repartition across
  93         * the reference buffer.
  94         */
  95        for (i = 0; i < hsize; i++) {
  96                if (hash_count[i] < hlimit)
  97                        continue;
  98                entry = hash[i];
  99                do {
 100                        struct index *keep = entry;
 101                        int skip = hash_count[i] / hlimit / 2;
 102                        do {
 103                                entry = entry->next;
 104                        } while(--skip && entry);
 105                        keep->next = entry;
 106                } while(entry);
 107        }
 108        free(hash_count);
 109
 110        return hash;
 111}
 112
 113/* provide the size of the copy opcode given the block offset and size */
 114#define COPYOP_SIZE(o, s) \
 115    (!!(o & 0xff) + !!(o & 0xff00) + !!(o & 0xff0000) + !!(o & 0xff000000) + \
 116     !!(s & 0xff) + !!(s & 0xff00) + 1)
 117
 118/* the maximum size for any opcode */
 119#define MAX_OP_SIZE COPYOP_SIZE(0xffffffff, 0xffffffff)
 120
 121void *diff_delta(void *from_buf, unsigned long from_size,
 122                 void *to_buf, unsigned long to_size,
 123                 unsigned long *delta_size,
 124                 unsigned long max_size)
 125{
 126        unsigned int i, outpos, outsize, inscnt, hash_shift;
 127        const unsigned char *ref_data, *ref_top, *data, *top;
 128        unsigned char *out;
 129        struct index *entry, **hash;
 130
 131        if (!from_size || !to_size)
 132                return NULL;
 133        hash = delta_index(from_buf, from_size, to_size, &hash_shift);
 134        if (!hash)
 135                return NULL;
 136
 137        outpos = 0;
 138        outsize = 8192;
 139        if (max_size && outsize >= max_size)
 140                outsize = max_size + MAX_OP_SIZE + 1;
 141        out = malloc(outsize);
 142        if (!out) {
 143                free(hash);
 144                return NULL;
 145        }
 146
 147        ref_data = from_buf;
 148        ref_top = from_buf + from_size;
 149        data = to_buf;
 150        top = to_buf + to_size;
 151
 152        /* store reference buffer size */
 153        out[outpos++] = from_size;
 154        from_size >>= 7;
 155        while (from_size) {
 156                out[outpos - 1] |= 0x80;
 157                out[outpos++] = from_size;
 158                from_size >>= 7;
 159        }
 160
 161        /* store target buffer size */
 162        out[outpos++] = to_size;
 163        to_size >>= 7;
 164        while (to_size) {
 165                out[outpos - 1] |= 0x80;
 166                out[outpos++] = to_size;
 167                to_size >>= 7;
 168        }
 169
 170        inscnt = 0;
 171
 172        while (data < top) {
 173                unsigned int moff = 0, msize = 0;
 174                if (data + 3 <= top) {
 175                        i = data[0] ^ ((data[1] ^ (data[2] << hash_shift)) << hash_shift);
 176                        for (entry = hash[i]; entry; entry = entry->next) {
 177                                const unsigned char *ref = entry->ptr;
 178                                const unsigned char *src = data;
 179                                unsigned int ref_size = ref_top - ref;
 180                                if (ref_size > top - src)
 181                                        ref_size = top - src;
 182                                if (ref_size > 0x10000)
 183                                        ref_size = 0x10000;
 184                                if (ref_size <= msize)
 185                                        break;
 186                                if (*ref != *src)
 187                                        continue;
 188                                while (ref_size-- && *++src == *++ref);
 189                                if (msize < ref - entry->ptr) {
 190                                        /* this is our best match so far */
 191                                        msize = ref - entry->ptr;
 192                                        moff = entry->ptr - ref_data;
 193                                }
 194                        }
 195                }
 196
 197                if (!msize || msize < COPYOP_SIZE(moff, msize)) {
 198                        if (!inscnt)
 199                                outpos++;
 200                        out[outpos++] = *data++;
 201                        inscnt++;
 202                        if (inscnt == 0x7f) {
 203                                out[outpos - inscnt - 1] = inscnt;
 204                                inscnt = 0;
 205                        }
 206                } else {
 207                        unsigned char *op;
 208
 209                        if (inscnt) {
 210                                out[outpos - inscnt - 1] = inscnt;
 211                                inscnt = 0;
 212                        }
 213
 214                        data += msize;
 215                        op = out + outpos++;
 216                        i = 0x80;
 217
 218                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; }
 219                        moff >>= 8;
 220                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; }
 221                        moff >>= 8;
 222                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; }
 223                        moff >>= 8;
 224                        if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; }
 225
 226                        if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; }
 227                        msize >>= 8;
 228                        if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; }
 229
 230                        *op = i;
 231                }
 232
 233                if (outpos >= outsize - MAX_OP_SIZE) {
 234                        void *tmp = out;
 235                        outsize = outsize * 3 / 2;
 236                        if (max_size && outsize >= max_size)
 237                                outsize = max_size + MAX_OP_SIZE + 1;
 238                        if (max_size && outpos > max_size)
 239                                out = NULL;
 240                        else
 241                                out = realloc(out, outsize);
 242                        if (!out) {
 243                                free(tmp);
 244                                free(hash);
 245                                return NULL;
 246                        }
 247                }
 248        }
 249
 250        if (inscnt)
 251                out[outpos - inscnt - 1] = inscnt;
 252
 253        free(hash);
 254        *delta_size = outpos;
 255        return out;
 256}