1/*
2 * diff-delta.c: generate a delta between two buffers
3 *
4 * Many parts of this file have been lifted from LibXDiff version 0.10.
5 * http://www.xmailserver.org/xdiff-lib.html
6 *
7 * LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
8 * Copyright (C) 2003 Davide Libenzi
9 *
10 * Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
11 *
12 * This file is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
16 *
17 * Use of this within git automatically means that the LGPL
18 * licensing gets turned into GPLv2 within this project.
19 */
20
21#include <stdlib.h>
22#include <string.h>
23#include "delta.h"
24
25#include "git-compat-util.h"
26
27/* maximum hash entry list for the same hash bucket */
28#define HASH_LIMIT 64
29
30#define RABIN_SHIFT 23
31#define RABIN_WINDOW 16
32
33static const unsigned int T[256] = {
34 0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
35 0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
36 0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
37 0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
38 0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
39 0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
40 0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
41 0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
42 0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
43 0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
44 0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
45 0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
46 0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
47 0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
48 0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
49 0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
50 0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
51 0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
52 0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
53 0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
54 0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
55 0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
56 0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
57 0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
58 0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
59 0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
60 0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
61 0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
62 0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
63 0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
64 0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
65 0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
66 0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
67 0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
68 0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
69 0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
70 0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
71 0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
72 0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
73 0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
74 0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
75 0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
76 0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
77};
78
79static const unsigned int U[256] = {
80 0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
81 0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
82 0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
83 0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
84 0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
85 0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
86 0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
87 0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
88 0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
89 0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
90 0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
91 0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
92 0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
93 0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
94 0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
95 0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
96 0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
97 0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
98 0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
99 0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
100 0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
101 0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
102 0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
103 0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
104 0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
105 0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
106 0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
107 0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
108 0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
109 0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
110 0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
111 0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
112 0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
113 0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
114 0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
115 0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
116 0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
117 0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
118 0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
119 0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
120 0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
121 0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
122 0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
123};
124
125struct index_entry {
126 const unsigned char *ptr;
127 unsigned int val;
128 struct index_entry *next;
129};
130
131struct delta_index {
132 const void *src_buf;
133 unsigned long src_size;
134 unsigned int hash_mask;
135 struct index_entry *hash[FLEX_ARRAY];
136};
137
138struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
139{
140 unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
141 const unsigned char *data, *buffer = buf;
142 struct delta_index *index;
143 struct index_entry *entry, **hash;
144 void *mem;
145 unsigned long memsize;
146
147 if (!buf || !bufsize)
148 return NULL;
149
150 /* Determine index hash size. Note that indexing skips the
151 first byte to allow for optimizing the Rabin's polynomial
152 initialization in create_delta(). */
153 entries = (bufsize - 1) / RABIN_WINDOW;
154 hsize = entries / 4;
155 for (i = 4; (1u << i) < hsize && i < 31; i++);
156 hsize = 1 << i;
157 hmask = hsize - 1;
158
159 /* allocate lookup index */
160 memsize = sizeof(*index) +
161 sizeof(*hash) * hsize +
162 sizeof(*entry) * entries;
163 mem = malloc(memsize);
164 if (!mem)
165 return NULL;
166 index = mem;
167 mem = index + 1;
168 hash = mem;
169 mem = hash + hsize;
170 entry = mem;
171
172 index->src_buf = buf;
173 index->src_size = bufsize;
174 index->hash_mask = hmask;
175 memset(hash, 0, hsize * sizeof(*hash));
176
177 /* allocate an array to count hash entries */
178 hash_count = calloc(hsize, sizeof(*hash_count));
179 if (!hash_count) {
180 free(index);
181 return NULL;
182 }
183
184 /* then populate the index */
185 prev_val = ~0;
186 for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
187 data >= buffer;
188 data -= RABIN_WINDOW) {
189 unsigned int val = 0;
190 for (i = 1; i <= RABIN_WINDOW; i++)
191 val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
192 if (val == prev_val) {
193 /* keep the lowest of consecutive identical blocks */
194 entry[-1].ptr = data + RABIN_WINDOW;
195 } else {
196 prev_val = val;
197 i = val & hmask;
198 entry->ptr = data + RABIN_WINDOW;
199 entry->val = val;
200 entry->next = hash[i];
201 hash[i] = entry++;
202 hash_count[i]++;
203 }
204 }
205
206 /*
207 * Determine a limit on the number of entries in the same hash
208 * bucket. This guards us against pathological data sets causing
209 * really bad hash distribution with most entries in the same hash
210 * bucket that would bring us to O(m*n) computing costs (m and n
211 * corresponding to reference and target buffer sizes).
212 *
213 * Make sure none of the hash buckets has more entries than
214 * we're willing to test. Otherwise we cull the entry list
215 * uniformly to still preserve a good repartition across
216 * the reference buffer.
217 */
218 for (i = 0; i < hsize; i++) {
219 if (hash_count[i] < HASH_LIMIT)
220 continue;
221 entry = hash[i];
222 do {
223 struct index_entry *keep = entry;
224 int skip = hash_count[i] / HASH_LIMIT / 2;
225 do {
226 entry = entry->next;
227 } while(--skip && entry);
228 keep->next = entry;
229 } while(entry);
230 }
231 free(hash_count);
232
233 return index;
234}
235
236void free_delta_index(struct delta_index *index)
237{
238 free(index);
239}
240
241/*
242 * The maximum size for any opcode sequence, including the initial header
243 * plus Rabin window plus biggest copy.
244 */
245#define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
246
247void *
248create_delta(const struct delta_index *index,
249 const void *trg_buf, unsigned long trg_size,
250 unsigned long *delta_size, unsigned long max_size)
251{
252 unsigned int i, outpos, outsize, val;
253 int inscnt;
254 const unsigned char *ref_data, *ref_top, *data, *top;
255 unsigned char *out;
256
257 if (!trg_buf || !trg_size)
258 return NULL;
259
260 outpos = 0;
261 outsize = 8192;
262 if (max_size && outsize >= max_size)
263 outsize = max_size + MAX_OP_SIZE + 1;
264 out = malloc(outsize);
265 if (!out)
266 return NULL;
267
268 /* store reference buffer size */
269 i = index->src_size;
270 while (i >= 0x80) {
271 out[outpos++] = i | 0x80;
272 i >>= 7;
273 }
274 out[outpos++] = i;
275
276 /* store target buffer size */
277 i = trg_size;
278 while (i >= 0x80) {
279 out[outpos++] = i | 0x80;
280 i >>= 7;
281 }
282 out[outpos++] = i;
283
284 ref_data = index->src_buf;
285 ref_top = ref_data + index->src_size;
286 data = trg_buf;
287 top = (const unsigned char *) trg_buf + trg_size;
288
289 outpos++;
290 val = 0;
291 for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
292 out[outpos++] = *data;
293 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
294 }
295 inscnt = i;
296
297 while (data < top) {
298 unsigned int moff = 0, msize = 0;
299 struct index_entry *entry;
300 val ^= U[data[-RABIN_WINDOW]];
301 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
302 i = val & index->hash_mask;
303 for (entry = index->hash[i]; entry; entry = entry->next) {
304 const unsigned char *ref = entry->ptr;
305 const unsigned char *src = data;
306 unsigned int ref_size = ref_top - ref;
307 if (entry->val != val)
308 continue;
309 if (ref_size > top - src)
310 ref_size = top - src;
311 if (ref_size > 0xffffff)
312 ref_size = 0xffffff;
313 if (ref_size <= msize)
314 break;
315 while (ref_size-- && *src++ == *ref)
316 ref++;
317 if (msize < ref - entry->ptr) {
318 /* this is our best match so far */
319 msize = ref - entry->ptr;
320 moff = entry->ptr - ref_data;
321 if (msize >= 0x10000)
322 break; /* this is good enough */
323 }
324 }
325
326 if (msize < 4) {
327 if (!inscnt)
328 outpos++;
329 out[outpos++] = *data++;
330 inscnt++;
331 if (inscnt == 0x7f) {
332 out[outpos - inscnt - 1] = inscnt;
333 inscnt = 0;
334 }
335 } else {
336 unsigned char *op;
337
338 if (msize >= RABIN_WINDOW) {
339 const unsigned char *sk;
340 sk = data + msize - RABIN_WINDOW;
341 val = 0;
342 for (i = 0; i < RABIN_WINDOW; i++)
343 val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
344 } else {
345 const unsigned char *sk = data + 1;
346 for (i = 1; i < msize; i++) {
347 val ^= U[sk[-RABIN_WINDOW]];
348 val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
349 }
350 }
351
352 if (inscnt) {
353 while (moff && ref_data[moff-1] == data[-1]) {
354 if (msize == 0x10000)
355 break;
356 /* we can match one byte back */
357 msize++;
358 moff--;
359 data--;
360 outpos--;
361 if (--inscnt)
362 continue;
363 outpos--; /* remove count slot */
364 inscnt--; /* make it -1 */
365 break;
366 }
367 out[outpos - inscnt - 1] = inscnt;
368 inscnt = 0;
369 }
370
371 data += msize;
372 op = out + outpos++;
373 i = 0x80;
374
375 if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; }
376 moff >>= 8;
377 if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; }
378 moff >>= 8;
379 if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; }
380 moff >>= 8;
381 if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; }
382
383 if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; }
384 msize >>= 8;
385 if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; }
386 msize >>= 8;
387 if (msize & 0xff) { out[outpos++] = msize; i |= 0x40; }
388
389 *op = i;
390 }
391
392 if (outpos >= outsize - MAX_OP_SIZE) {
393 void *tmp = out;
394 outsize = outsize * 3 / 2;
395 if (max_size && outsize >= max_size)
396 outsize = max_size + MAX_OP_SIZE + 1;
397 if (max_size && outpos > max_size)
398 break;
399 out = xrealloc(out, outsize);
400 if (!out) {
401 free(tmp);
402 return NULL;
403 }
404 }
405 }
406
407 if (inscnt)
408 out[outpos - inscnt - 1] = inscnt;
409
410 if (max_size && outpos > max_size) {
411 free(out);
412 return NULL;
413 }
414
415 *delta_size = outpos;
416 return out;
417}