1/*
2 * diff-delta.c: generate a delta between two buffers
3 *
4 * This code was greatly inspired by parts of LibXDiff from Davide Libenzi
5 * http://www.xmailserver.org/xdiff-lib.html
6 *
7 * Rewritten for GIT by Nicolas Pitre <nico@cam.org>, (C) 2005-2007
8 *
9 * This code is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 */
13
14#include "git-compat-util.h"
15#include "delta.h"
16
17/* maximum hash entry list for the same hash bucket */
18#define HASH_LIMIT 64
19
20#define RABIN_SHIFT 23
21#define RABIN_WINDOW 16
22
23static const unsigned int T[256] = {
24 0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
25 0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
26 0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
27 0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
28 0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
29 0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
30 0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
31 0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
32 0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
33 0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
34 0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
35 0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
36 0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
37 0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
38 0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
39 0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
40 0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
41 0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
42 0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
43 0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
44 0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
45 0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
46 0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
47 0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
48 0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
49 0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
50 0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
51 0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
52 0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
53 0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
54 0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
55 0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
56 0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
57 0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
58 0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
59 0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
60 0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
61 0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
62 0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
63 0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
64 0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
65 0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
66 0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
67};
68
69static const unsigned int U[256] = {
70 0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
71 0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
72 0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
73 0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
74 0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
75 0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
76 0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
77 0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
78 0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
79 0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
80 0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
81 0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
82 0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
83 0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
84 0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
85 0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
86 0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
87 0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
88 0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
89 0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
90 0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
91 0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
92 0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
93 0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
94 0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
95 0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
96 0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
97 0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
98 0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
99 0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
100 0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
101 0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
102 0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
103 0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
104 0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
105 0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
106 0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
107 0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
108 0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
109 0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
110 0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
111 0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
112 0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
113};
114
115struct index_entry {
116 const unsigned char *ptr;
117 unsigned int val;
118 struct index_entry *next;
119};
120
121struct delta_index {
122 const void *src_buf;
123 unsigned long src_size;
124 unsigned int hash_mask;
125 struct index_entry *hash[FLEX_ARRAY];
126};
127
128struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
129{
130 unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
131 const unsigned char *data, *buffer = buf;
132 struct delta_index *index;
133 struct index_entry *entry, **hash;
134 void *mem;
135 unsigned long memsize;
136
137 if (!buf || !bufsize)
138 return NULL;
139
140 /* Determine index hash size. Note that indexing skips the
141 first byte to allow for optimizing the Rabin's polynomial
142 initialization in create_delta(). */
143 entries = (bufsize - 1) / RABIN_WINDOW;
144 hsize = entries / 4;
145 for (i = 4; (1u << i) < hsize && i < 31; i++);
146 hsize = 1 << i;
147 hmask = hsize - 1;
148
149 /* allocate lookup index */
150 memsize = sizeof(*index) +
151 sizeof(*hash) * hsize +
152 sizeof(*entry) * entries;
153 mem = malloc(memsize);
154 if (!mem)
155 return NULL;
156 index = mem;
157 mem = index + 1;
158 hash = mem;
159 mem = hash + hsize;
160 entry = mem;
161
162 index->src_buf = buf;
163 index->src_size = bufsize;
164 index->hash_mask = hmask;
165 memset(hash, 0, hsize * sizeof(*hash));
166
167 /* allocate an array to count hash entries */
168 hash_count = calloc(hsize, sizeof(*hash_count));
169 if (!hash_count) {
170 free(index);
171 return NULL;
172 }
173
174 /* then populate the index */
175 prev_val = ~0;
176 for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
177 data >= buffer;
178 data -= RABIN_WINDOW) {
179 unsigned int val = 0;
180 for (i = 1; i <= RABIN_WINDOW; i++)
181 val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
182 if (val == prev_val) {
183 /* keep the lowest of consecutive identical blocks */
184 entry[-1].ptr = data + RABIN_WINDOW;
185 } else {
186 prev_val = val;
187 i = val & hmask;
188 entry->ptr = data + RABIN_WINDOW;
189 entry->val = val;
190 entry->next = hash[i];
191 hash[i] = entry++;
192 hash_count[i]++;
193 }
194 }
195
196 /*
197 * Determine a limit on the number of entries in the same hash
198 * bucket. This guards us against pathological data sets causing
199 * really bad hash distribution with most entries in the same hash
200 * bucket that would bring us to O(m*n) computing costs (m and n
201 * corresponding to reference and target buffer sizes).
202 *
203 * Make sure none of the hash buckets has more entries than
204 * we're willing to test. Otherwise we cull the entry list
205 * uniformly to still preserve a good repartition across
206 * the reference buffer.
207 */
208 for (i = 0; i < hsize; i++) {
209 if (hash_count[i] < HASH_LIMIT)
210 continue;
211 entry = hash[i];
212 do {
213 struct index_entry *keep = entry;
214 int skip = hash_count[i] / HASH_LIMIT / 2;
215 do {
216 entry = entry->next;
217 } while(--skip && entry);
218 keep->next = entry;
219 } while(entry);
220 }
221 free(hash_count);
222
223 return index;
224}
225
226void free_delta_index(struct delta_index *index)
227{
228 free(index);
229}
230
231/*
232 * The maximum size for any opcode sequence, including the initial header
233 * plus Rabin window plus biggest copy.
234 */
235#define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
236
237void *
238create_delta(const struct delta_index *index,
239 const void *trg_buf, unsigned long trg_size,
240 unsigned long *delta_size, unsigned long max_size)
241{
242 unsigned int i, outpos, outsize, moff, msize, val;
243 int inscnt;
244 const unsigned char *ref_data, *ref_top, *data, *top;
245 unsigned char *out;
246
247 if (!trg_buf || !trg_size)
248 return NULL;
249
250 outpos = 0;
251 outsize = 8192;
252 if (max_size && outsize >= max_size)
253 outsize = max_size + MAX_OP_SIZE + 1;
254 out = malloc(outsize);
255 if (!out)
256 return NULL;
257
258 /* store reference buffer size */
259 i = index->src_size;
260 while (i >= 0x80) {
261 out[outpos++] = i | 0x80;
262 i >>= 7;
263 }
264 out[outpos++] = i;
265
266 /* store target buffer size */
267 i = trg_size;
268 while (i >= 0x80) {
269 out[outpos++] = i | 0x80;
270 i >>= 7;
271 }
272 out[outpos++] = i;
273
274 ref_data = index->src_buf;
275 ref_top = ref_data + index->src_size;
276 data = trg_buf;
277 top = (const unsigned char *) trg_buf + trg_size;
278
279 outpos++;
280 val = 0;
281 for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
282 out[outpos++] = *data;
283 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
284 }
285 inscnt = i;
286
287 moff = 0;
288 msize = 0;
289 while (data < top) {
290 if (msize < 4096) {
291 struct index_entry *entry;
292 val ^= U[data[-RABIN_WINDOW]];
293 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
294 i = val & index->hash_mask;
295 for (entry = index->hash[i]; entry; entry = entry->next) {
296 const unsigned char *ref = entry->ptr;
297 const unsigned char *src = data;
298 unsigned int ref_size = ref_top - ref;
299 if (entry->val != val)
300 continue;
301 if (ref_size > top - src)
302 ref_size = top - src;
303 if (ref_size <= msize)
304 break;
305 while (ref_size-- && *src++ == *ref)
306 ref++;
307 if (msize < ref - entry->ptr) {
308 /* this is our best match so far */
309 msize = ref - entry->ptr;
310 moff = entry->ptr - ref_data;
311 if (msize >= 4096) /* good enough */
312 break;
313 }
314 }
315 }
316
317 if (msize < 4) {
318 if (!inscnt)
319 outpos++;
320 out[outpos++] = *data++;
321 inscnt++;
322 if (inscnt == 0x7f) {
323 out[outpos - inscnt - 1] = inscnt;
324 inscnt = 0;
325 }
326 msize = 0;
327 } else {
328 unsigned int left;
329 unsigned char *op;
330
331 if (inscnt) {
332 while (moff && ref_data[moff-1] == data[-1]) {
333 /* we can match one byte back */
334 msize++;
335 moff--;
336 data--;
337 outpos--;
338 if (--inscnt)
339 continue;
340 outpos--; /* remove count slot */
341 inscnt--; /* make it -1 */
342 break;
343 }
344 out[outpos - inscnt - 1] = inscnt;
345 inscnt = 0;
346 }
347
348 /* A copy op is currently limited to 64KB (pack v2) */
349 left = (msize < 0x10000) ? 0 : (msize - 0x10000);
350 msize -= left;
351
352 op = out + outpos++;
353 i = 0x80;
354
355 if (moff & 0x000000ff)
356 out[outpos++] = moff >> 0, i |= 0x01;
357 if (moff & 0x0000ff00)
358 out[outpos++] = moff >> 8, i |= 0x02;
359 if (moff & 0x00ff0000)
360 out[outpos++] = moff >> 16, i |= 0x04;
361 if (moff & 0xff000000)
362 out[outpos++] = moff >> 24, i |= 0x08;
363
364 if (msize & 0x00ff)
365 out[outpos++] = msize >> 0, i |= 0x10;
366 if (msize & 0xff00)
367 out[outpos++] = msize >> 8, i |= 0x20;
368
369 *op = i;
370
371 data += msize;
372 moff += msize;
373 msize = left;
374
375 if (msize < 4096) {
376 int j;
377 val = 0;
378 for (j = -RABIN_WINDOW; j < 0; j++)
379 val = ((val << 8) | data[j])
380 ^ T[val >> RABIN_SHIFT];
381 }
382 }
383
384 if (outpos >= outsize - MAX_OP_SIZE) {
385 void *tmp = out;
386 outsize = outsize * 3 / 2;
387 if (max_size && outsize >= max_size)
388 outsize = max_size + MAX_OP_SIZE + 1;
389 if (max_size && outpos > max_size)
390 break;
391 out = realloc(out, outsize);
392 if (!out) {
393 free(tmp);
394 return NULL;
395 }
396 }
397 }
398
399 if (inscnt)
400 out[outpos - inscnt - 1] = inscnt;
401
402 if (max_size && outpos > max_size) {
403 free(out);
404 return NULL;
405 }
406
407 *delta_size = outpos;
408 return out;
409}