7f8a60de18584fa18dba71a454a6f2300674f47f
1/*
2 * diff-delta.c: generate a delta between two buffers
3 *
4 * This code was greatly inspired by parts of LibXDiff from Davide Libenzi
5 * http://www.xmailserver.org/xdiff-lib.html
6 *
7 * Rewritten for GIT by Nicolas Pitre <nico@cam.org>, (C) 2005-2007
8 *
9 * This code is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 */
13
14#include "git-compat-util.h"
15#include "delta.h"
16
17/* maximum hash entry list for the same hash bucket */
18#define HASH_LIMIT 64
19
20#define RABIN_SHIFT 23
21#define RABIN_WINDOW 16
22
23static const unsigned int T[256] = {
24 0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
25 0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
26 0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
27 0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
28 0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
29 0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
30 0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
31 0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
32 0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
33 0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
34 0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
35 0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
36 0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
37 0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
38 0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
39 0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
40 0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
41 0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
42 0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
43 0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
44 0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
45 0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
46 0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
47 0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
48 0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
49 0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
50 0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
51 0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
52 0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
53 0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
54 0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
55 0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
56 0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
57 0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
58 0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
59 0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
60 0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
61 0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
62 0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
63 0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
64 0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
65 0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
66 0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
67};
68
69static const unsigned int U[256] = {
70 0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
71 0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
72 0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
73 0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
74 0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
75 0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
76 0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
77 0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
78 0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
79 0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
80 0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
81 0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
82 0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
83 0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
84 0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
85 0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
86 0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
87 0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
88 0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
89 0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
90 0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
91 0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
92 0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
93 0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
94 0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
95 0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
96 0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
97 0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
98 0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
99 0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
100 0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
101 0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
102 0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
103 0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
104 0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
105 0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
106 0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
107 0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
108 0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
109 0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
110 0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
111 0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
112 0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
113};
114
115struct index_entry {
116 const unsigned char *ptr;
117 unsigned int val;
118};
119
120struct unpacked_index_entry {
121 struct index_entry entry;
122 struct unpacked_index_entry *next;
123};
124
125struct delta_index {
126 unsigned long memsize;
127 const void *src_buf;
128 unsigned long src_size;
129 unsigned int hash_mask;
130 struct index_entry *hash[FLEX_ARRAY];
131};
132
133struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
134{
135 unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
136 const unsigned char *data, *buffer = buf;
137 struct delta_index *index;
138 struct unpacked_index_entry *entry, **hash;
139 struct index_entry *packed_entry, **packed_hash;
140 void *mem;
141 unsigned long memsize;
142
143 if (!buf || !bufsize)
144 return NULL;
145
146 /* Determine index hash size. Note that indexing skips the
147 first byte to allow for optimizing the Rabin's polynomial
148 initialization in create_delta(). */
149 entries = (bufsize - 1) / RABIN_WINDOW;
150 hsize = entries / 4;
151 for (i = 4; (1u << i) < hsize && i < 31; i++);
152 hsize = 1 << i;
153 hmask = hsize - 1;
154
155 /* allocate lookup index */
156 memsize = sizeof(*hash) * hsize +
157 sizeof(*entry) * entries;
158 mem = malloc(memsize);
159 if (!mem)
160 return NULL;
161 hash = mem;
162 mem = hash + hsize;
163 entry = mem;
164
165 memset(hash, 0, hsize * sizeof(*hash));
166
167 /* allocate an array to count hash entries */
168 hash_count = calloc(hsize, sizeof(*hash_count));
169 if (!hash_count) {
170 free(hash);
171 return NULL;
172 }
173
174 /* then populate the index */
175 prev_val = ~0;
176 for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
177 data >= buffer;
178 data -= RABIN_WINDOW) {
179 unsigned int val = 0;
180 for (i = 1; i <= RABIN_WINDOW; i++)
181 val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
182 if (val == prev_val) {
183 /* keep the lowest of consecutive identical blocks */
184 entry[-1].entry.ptr = data + RABIN_WINDOW;
185 --entries;
186 } else {
187 prev_val = val;
188 i = val & hmask;
189 entry->entry.ptr = data + RABIN_WINDOW;
190 entry->entry.val = val;
191 entry->next = hash[i];
192 hash[i] = entry++;
193 hash_count[i]++;
194 }
195 }
196
197 /*
198 * Determine a limit on the number of entries in the same hash
199 * bucket. This guards us against pathological data sets causing
200 * really bad hash distribution with most entries in the same hash
201 * bucket that would bring us to O(m*n) computing costs (m and n
202 * corresponding to reference and target buffer sizes).
203 *
204 * Make sure none of the hash buckets has more entries than
205 * we're willing to test. Otherwise we cull the entry list
206 * uniformly to still preserve a good repartition across
207 * the reference buffer.
208 */
209 for (i = 0; i < hsize; i++) {
210 if (hash_count[i] < HASH_LIMIT)
211 continue;
212 entry = hash[i];
213 do {
214 struct unpacked_index_entry *keep = entry;
215 int skip = hash_count[i] / HASH_LIMIT;
216 do {
217 --entries;
218 entry = entry->next;
219 } while(--skip && entry);
220 ++entries;
221 keep->next = entry;
222 } while(entry);
223 }
224 free(hash_count);
225
226 /* Now create the packed index in array form rather than
227 * linked lists */
228
229 memsize = sizeof(*index)
230 + sizeof(*packed_hash) * (hsize+1)
231 + sizeof(*packed_entry) * entries;
232
233 mem = malloc(memsize);
234
235 if (!mem) {
236 free(hash);
237 return NULL;
238 }
239
240 index = mem;
241 index->memsize = memsize;
242 index->src_buf = buf;
243 index->src_size = bufsize;
244 index->hash_mask = hmask;
245
246 mem = index + 1;
247 packed_hash = mem;
248 mem = packed_hash + (hsize+1);
249 packed_entry = mem;
250
251 /* Coalesce all entries belonging to one linked list into
252 * consecutive array entries */
253
254 for (i = 0; i < hsize; i++) {
255 packed_hash[i] = packed_entry;
256 for (entry = hash[i]; entry; entry = entry->next)
257 *packed_entry++ = entry->entry;
258 }
259
260 /* Sentinel value to indicate the length of the last hash
261 * bucket */
262
263 packed_hash[hsize] = packed_entry;
264 assert(packed_entry - (struct index_entry *)mem == entries);
265 free(hash);
266
267 return index;
268}
269
270void free_delta_index(struct delta_index *index)
271{
272 free(index);
273}
274
275unsigned long sizeof_delta_index(struct delta_index *index)
276{
277 if (index)
278 return index->memsize;
279 else
280 return 0;
281}
282
283/*
284 * The maximum size for any opcode sequence, including the initial header
285 * plus Rabin window plus biggest copy.
286 */
287#define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
288
289void *
290create_delta(const struct delta_index *index,
291 const void *trg_buf, unsigned long trg_size,
292 unsigned long *delta_size, unsigned long max_size)
293{
294 unsigned int i, outpos, outsize, moff, msize, val;
295 int inscnt;
296 const unsigned char *ref_data, *ref_top, *data, *top;
297 unsigned char *out;
298
299 if (!trg_buf || !trg_size)
300 return NULL;
301
302 outpos = 0;
303 outsize = 8192;
304 if (max_size && outsize >= max_size)
305 outsize = max_size + MAX_OP_SIZE + 1;
306 out = malloc(outsize);
307 if (!out)
308 return NULL;
309
310 /* store reference buffer size */
311 i = index->src_size;
312 while (i >= 0x80) {
313 out[outpos++] = i | 0x80;
314 i >>= 7;
315 }
316 out[outpos++] = i;
317
318 /* store target buffer size */
319 i = trg_size;
320 while (i >= 0x80) {
321 out[outpos++] = i | 0x80;
322 i >>= 7;
323 }
324 out[outpos++] = i;
325
326 ref_data = index->src_buf;
327 ref_top = ref_data + index->src_size;
328 data = trg_buf;
329 top = (const unsigned char *) trg_buf + trg_size;
330
331 outpos++;
332 val = 0;
333 for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
334 out[outpos++] = *data;
335 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
336 }
337 inscnt = i;
338
339 moff = 0;
340 msize = 0;
341 while (data < top) {
342 if (msize < 4096) {
343 struct index_entry *entry;
344 val ^= U[data[-RABIN_WINDOW]];
345 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
346 i = val & index->hash_mask;
347 for (entry = index->hash[i]; entry < index->hash[i+1]; entry++) {
348 const unsigned char *ref = entry->ptr;
349 const unsigned char *src = data;
350 unsigned int ref_size = ref_top - ref;
351 if (entry->val != val)
352 continue;
353 if (ref_size > top - src)
354 ref_size = top - src;
355 if (ref_size <= msize)
356 break;
357 while (ref_size-- && *src++ == *ref)
358 ref++;
359 if (msize < ref - entry->ptr) {
360 /* this is our best match so far */
361 msize = ref - entry->ptr;
362 moff = entry->ptr - ref_data;
363 if (msize >= 4096) /* good enough */
364 break;
365 }
366 }
367 }
368
369 if (msize < 4) {
370 if (!inscnt)
371 outpos++;
372 out[outpos++] = *data++;
373 inscnt++;
374 if (inscnt == 0x7f) {
375 out[outpos - inscnt - 1] = inscnt;
376 inscnt = 0;
377 }
378 msize = 0;
379 } else {
380 unsigned int left;
381 unsigned char *op;
382
383 if (inscnt) {
384 while (moff && ref_data[moff-1] == data[-1]) {
385 /* we can match one byte back */
386 msize++;
387 moff--;
388 data--;
389 outpos--;
390 if (--inscnt)
391 continue;
392 outpos--; /* remove count slot */
393 inscnt--; /* make it -1 */
394 break;
395 }
396 out[outpos - inscnt - 1] = inscnt;
397 inscnt = 0;
398 }
399
400 /* A copy op is currently limited to 64KB (pack v2) */
401 left = (msize < 0x10000) ? 0 : (msize - 0x10000);
402 msize -= left;
403
404 op = out + outpos++;
405 i = 0x80;
406
407 if (moff & 0x000000ff)
408 out[outpos++] = moff >> 0, i |= 0x01;
409 if (moff & 0x0000ff00)
410 out[outpos++] = moff >> 8, i |= 0x02;
411 if (moff & 0x00ff0000)
412 out[outpos++] = moff >> 16, i |= 0x04;
413 if (moff & 0xff000000)
414 out[outpos++] = moff >> 24, i |= 0x08;
415
416 if (msize & 0x00ff)
417 out[outpos++] = msize >> 0, i |= 0x10;
418 if (msize & 0xff00)
419 out[outpos++] = msize >> 8, i |= 0x20;
420
421 *op = i;
422
423 data += msize;
424 moff += msize;
425 msize = left;
426
427 if (msize < 4096) {
428 int j;
429 val = 0;
430 for (j = -RABIN_WINDOW; j < 0; j++)
431 val = ((val << 8) | data[j])
432 ^ T[val >> RABIN_SHIFT];
433 }
434 }
435
436 if (outpos >= outsize - MAX_OP_SIZE) {
437 void *tmp = out;
438 outsize = outsize * 3 / 2;
439 if (max_size && outsize >= max_size)
440 outsize = max_size + MAX_OP_SIZE + 1;
441 if (max_size && outpos > max_size)
442 break;
443 out = realloc(out, outsize);
444 if (!out) {
445 free(tmp);
446 return NULL;
447 }
448 }
449 }
450
451 if (inscnt)
452 out[outpos - inscnt - 1] = inscnt;
453
454 if (max_size && outpos > max_size) {
455 free(out);
456 return NULL;
457 }
458
459 *delta_size = outpos;
460 return out;
461}