1/*
2* LibXDiff by Davide Libenzi ( File Differential Library )
3* Copyright (C) 2003 Davide Libenzi
4*
5* This library is free software; you can redistribute it and/or
6* modify it under the terms of the GNU Lesser General Public
7* License as published by the Free Software Foundation; either
8* version 2.1 of the License, or (at your option) any later version.
9*
10* This library is distributed in the hope that it will be useful,
11* but WITHOUT ANY WARRANTY; without even the implied warranty of
12* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13* Lesser General Public License for more details.
14*
15* You should have received a copy of the GNU Lesser General Public
16* License along with this library; if not, write to the Free Software
17* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18*
19* Davide Libenzi <davidel@xmailserver.org>
20*
21*/
2223
#include "xinclude.h"
2425
26
27
#define XDL_KPDIS_RUN 4
28#define XDL_MAX_EQLIMIT 1024
2930
31
32
typedef struct s_xdlclass {
33struct s_xdlclass *next;
34unsigned long ha;
35char const *line;
36long size;
37long idx;
38} xdlclass_t;
3940
typedef struct s_xdlclassifier {
41unsigned int hbits;
42long hsize;
43xdlclass_t **rchash;
44chastore_t ncha;
45long count;
46long flags;
47} xdlclassifier_t;
4849
50
51
52
static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags);
53static void xdl_free_classifier(xdlclassifier_t *cf);
54static int xdl_classify_record(xdlclassifier_t *cf, xrecord_t **rhash, unsigned int hbits,
55xrecord_t *rec);
56static int xdl_prepare_ctx(mmfile_t *mf, long narec, xpparam_t const *xpp,
57xdlclassifier_t *cf, xdfile_t *xdf);
58static void xdl_free_ctx(xdfile_t *xdf);
59static int xdl_clean_mmatch(char const *dis, long i, long s, long e);
60static int xdl_cleanup_records(xdfile_t *xdf1, xdfile_t *xdf2);
61static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2);
62static int xdl_optimize_ctxs(xdfile_t *xdf1, xdfile_t *xdf2);
6364
65
66
67
static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags) {
68long i;
6970
cf->flags = flags;
7172
cf->hbits = xdl_hashbits((unsigned int) size);
73cf->hsize = 1 << cf->hbits;
7475
if (xdl_cha_init(&cf->ncha, sizeof(xdlclass_t), size / 4 + 1) < 0) {
7677
return -1;
78}
79if (!(cf->rchash = (xdlclass_t **) xdl_malloc(cf->hsize * sizeof(xdlclass_t *)))) {
8081
xdl_cha_free(&cf->ncha);
82return -1;
83}
84for (i = 0; i < cf->hsize; i++)
85cf->rchash[i] = NULL;
8687
cf->count = 0;
8889
return 0;
90}
9192
93
static void xdl_free_classifier(xdlclassifier_t *cf) {
9495
xdl_free(cf->rchash);
96xdl_cha_free(&cf->ncha);
97}
9899
100
static int xdl_classify_record(xdlclassifier_t *cf, xrecord_t **rhash, unsigned int hbits,
101xrecord_t *rec) {
102long hi;
103char const *line;
104xdlclass_t *rcrec;
105106
line = rec->ptr;
107hi = (long) XDL_HASHLONG(rec->ha, cf->hbits);
108for (rcrec = cf->rchash[hi]; rcrec; rcrec = rcrec->next)
109if (rcrec->ha == rec->ha &&
110xdl_recmatch(rcrec->line, rcrec->size,
111rec->ptr, rec->size, cf->flags))
112break;
113114
if (!rcrec) {
115if (!(rcrec = xdl_cha_alloc(&cf->ncha))) {
116117
return -1;
118}
119rcrec->idx = cf->count++;
120rcrec->line = line;
121rcrec->size = rec->size;
122rcrec->ha = rec->ha;
123rcrec->next = cf->rchash[hi];
124cf->rchash[hi] = rcrec;
125}
126127
rec->ha = (unsigned long) rcrec->idx;
128129
hi = (long) XDL_HASHLONG(rec->ha, hbits);
130rec->next = rhash[hi];
131rhash[hi] = rec;
132133
return 0;
134}
135136
137
static int xdl_prepare_ctx(mmfile_t *mf, long narec, xpparam_t const *xpp,
138xdlclassifier_t *cf, xdfile_t *xdf) {
139unsigned int hbits;
140long i, nrec, hsize, bsize;
141unsigned long hav;
142char const *blk, *cur, *top, *prev;
143xrecord_t *crec;
144xrecord_t **recs, **rrecs;
145xrecord_t **rhash;
146unsigned long *ha;
147char *rchg;
148long *rindex;
149150
if (xdl_cha_init(&xdf->rcha, sizeof(xrecord_t), narec / 4 + 1) < 0) {
151152
return -1;
153}
154if (!(recs = (xrecord_t **) xdl_malloc(narec * sizeof(xrecord_t *)))) {
155156
xdl_cha_free(&xdf->rcha);
157return -1;
158}
159160
hbits = xdl_hashbits((unsigned int) narec);
161hsize = 1 << hbits;
162if (!(rhash = (xrecord_t **) xdl_malloc(hsize * sizeof(xrecord_t *)))) {
163164
xdl_free(recs);
165xdl_cha_free(&xdf->rcha);
166return -1;
167}
168for (i = 0; i < hsize; i++)
169rhash[i] = NULL;
170171
nrec = 0;
172if ((cur = blk = xdl_mmfile_first(mf, &bsize)) != NULL) {
173for (top = blk + bsize;;) {
174if (cur >= top) {
175if (!(cur = blk = xdl_mmfile_next(mf, &bsize)))
176break;
177top = blk + bsize;
178}
179prev = cur;
180hav = xdl_hash_record(&cur, top, xpp->flags);
181if (nrec >= narec) {
182narec *= 2;
183if (!(rrecs = (xrecord_t **) xdl_realloc(recs, narec * sizeof(xrecord_t *)))) {
184185
xdl_free(rhash);
186xdl_free(recs);
187xdl_cha_free(&xdf->rcha);
188return -1;
189}
190recs = rrecs;
191}
192if (!(crec = xdl_cha_alloc(&xdf->rcha))) {
193194
xdl_free(rhash);
195xdl_free(recs);
196xdl_cha_free(&xdf->rcha);
197return -1;
198}
199crec->ptr = prev;
200crec->size = (long) (cur - prev);
201crec->ha = hav;
202recs[nrec++] = crec;
203204
if (xdl_classify_record(cf, rhash, hbits, crec) < 0) {
205206
xdl_free(rhash);
207xdl_free(recs);
208xdl_cha_free(&xdf->rcha);
209return -1;
210}
211}
212}
213214
if (!(rchg = (char *) xdl_malloc((nrec + 2) * sizeof(char)))) {
215216
xdl_free(rhash);
217xdl_free(recs);
218xdl_cha_free(&xdf->rcha);
219return -1;
220}
221memset(rchg, 0, (nrec + 2) * sizeof(char));
222223
if (!(rindex = (long *) xdl_malloc((nrec + 1) * sizeof(long)))) {
224225
xdl_free(rchg);
226xdl_free(rhash);
227xdl_free(recs);
228xdl_cha_free(&xdf->rcha);
229return -1;
230}
231if (!(ha = (unsigned long *) xdl_malloc((nrec + 1) * sizeof(unsigned long)))) {
232233
xdl_free(rindex);
234xdl_free(rchg);
235xdl_free(rhash);
236xdl_free(recs);
237xdl_cha_free(&xdf->rcha);
238return -1;
239}
240241
xdf->nrec = nrec;
242xdf->recs = recs;
243xdf->hbits = hbits;
244xdf->rhash = rhash;
245xdf->rchg = rchg + 1;
246xdf->rindex = rindex;
247xdf->nreff = 0;
248xdf->ha = ha;
249xdf->dstart = 0;
250xdf->dend = nrec - 1;
251252
return 0;
253}
254255
256
static void xdl_free_ctx(xdfile_t *xdf) {
257258
xdl_free(xdf->rhash);
259xdl_free(xdf->rindex);
260xdl_free(xdf->rchg - 1);
261xdl_free(xdf->ha);
262xdl_free(xdf->recs);
263xdl_cha_free(&xdf->rcha);
264}
265266
267
int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
268xdfenv_t *xe) {
269long enl1, enl2;
270xdlclassifier_t cf;
271272
enl1 = xdl_guess_lines(mf1) + 1;
273enl2 = xdl_guess_lines(mf2) + 1;
274275
if (xdl_init_classifier(&cf, enl1 + enl2 + 1, xpp->flags) < 0) {
276277
return -1;
278}
279280
if (xdl_prepare_ctx(mf1, enl1, xpp, &cf, &xe->xdf1) < 0) {
281282
xdl_free_classifier(&cf);
283return -1;
284}
285if (xdl_prepare_ctx(mf2, enl2, xpp, &cf, &xe->xdf2) < 0) {
286287
xdl_free_ctx(&xe->xdf1);
288xdl_free_classifier(&cf);
289return -1;
290}
291292
xdl_free_classifier(&cf);
293294
if (xdl_optimize_ctxs(&xe->xdf1, &xe->xdf2) < 0) {
295296
xdl_free_ctx(&xe->xdf2);
297xdl_free_ctx(&xe->xdf1);
298return -1;
299}
300301
return 0;
302}
303304
305
void xdl_free_env(xdfenv_t *xe) {
306307
xdl_free_ctx(&xe->xdf2);
308xdl_free_ctx(&xe->xdf1);
309}
310311
312
static int xdl_clean_mmatch(char const *dis, long i, long s, long e) {
313long r, rdis0, rpdis0, rdis1, rpdis1;
314315
/*
316* Scans the lines before 'i' to find a run of lines that either
317* have no match (dis[j] == 0) or have multiple matches (dis[j] > 1).
318* Note that we always call this function with dis[i] > 1, so the
319* current line (i) is already a multimatch line.
320*/
321for (r = 1, rdis0 = 0, rpdis0 = 1; (i - r) >= s; r++) {
322if (!dis[i - r])
323rdis0++;
324else if (dis[i - r] == 2)
325rpdis0++;
326else
327break;
328}
329/*
330* If the run before the line 'i' found only multimatch lines, we
331* return 0 and hence we don't make the current line (i) discarded.
332* We want to discard multimatch lines only when they appear in the
333* middle of runs with nomatch lines (dis[j] == 0).
334*/
335if (rdis0 == 0)
336return 0;
337for (r = 1, rdis1 = 0, rpdis1 = 1; (i + r) <= e; r++) {
338if (!dis[i + r])
339rdis1++;
340else if (dis[i + r] == 2)
341rpdis1++;
342else
343break;
344}
345/*
346* If the run after the line 'i' found only multimatch lines, we
347* return 0 and hence we don't make the current line (i) discarded.
348*/
349if (rdis1 == 0)
350return 0;
351rdis1 += rdis0;
352rpdis1 += rpdis0;
353354
return rpdis1 * XDL_KPDIS_RUN < (rpdis1 + rdis1);
355}
356357
358
/*
359* Try to reduce the problem complexity, discard records that have no
360* matches on the other file. Also, lines that have multiple matches
361* might be potentially discarded if they happear in a run of discardable.
362*/
363static int xdl_cleanup_records(xdfile_t *xdf1, xdfile_t *xdf2) {
364long i, nm, rhi, nreff, mlim;
365unsigned long hav;
366xrecord_t **recs;
367xrecord_t *rec;
368char *dis, *dis1, *dis2;
369370
if (!(dis = (char *) xdl_malloc(xdf1->nrec + xdf2->nrec + 2))) {
371372
return -1;
373}
374memset(dis, 0, xdf1->nrec + xdf2->nrec + 2);
375dis1 = dis;
376dis2 = dis1 + xdf1->nrec + 1;
377378
if ((mlim = xdl_bogosqrt(xdf1->nrec)) > XDL_MAX_EQLIMIT)
379mlim = XDL_MAX_EQLIMIT;
380for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) {
381hav = (*recs)->ha;
382rhi = (long) XDL_HASHLONG(hav, xdf2->hbits);
383for (nm = 0, rec = xdf2->rhash[rhi]; rec; rec = rec->next)
384if (rec->ha == hav && ++nm == mlim)
385break;
386dis1[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1;
387}
388389
if ((mlim = xdl_bogosqrt(xdf2->nrec)) > XDL_MAX_EQLIMIT)
390mlim = XDL_MAX_EQLIMIT;
391for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) {
392hav = (*recs)->ha;
393rhi = (long) XDL_HASHLONG(hav, xdf1->hbits);
394for (nm = 0, rec = xdf1->rhash[rhi]; rec; rec = rec->next)
395if (rec->ha == hav && ++nm == mlim)
396break;
397dis2[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1;
398}
399400
for (nreff = 0, i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart];
401i <= xdf1->dend; i++, recs++) {
402if (dis1[i] == 1 ||
403(dis1[i] == 2 && !xdl_clean_mmatch(dis1, i, xdf1->dstart, xdf1->dend))) {
404xdf1->rindex[nreff] = i;
405xdf1->ha[nreff] = (*recs)->ha;
406nreff++;
407} else
408xdf1->rchg[i] = 1;
409}
410xdf1->nreff = nreff;
411412
for (nreff = 0, i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart];
413i <= xdf2->dend; i++, recs++) {
414if (dis2[i] == 1 ||
415(dis2[i] == 2 && !xdl_clean_mmatch(dis2, i, xdf2->dstart, xdf2->dend))) {
416xdf2->rindex[nreff] = i;
417xdf2->ha[nreff] = (*recs)->ha;
418nreff++;
419} else
420xdf2->rchg[i] = 1;
421}
422xdf2->nreff = nreff;
423424
xdl_free(dis);
425426
return 0;
427}
428429
430
/*
431* Early trim initial and terminal matching records.
432*/
433static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2) {
434long i, lim;
435xrecord_t **recs1, **recs2;
436437
recs1 = xdf1->recs;
438recs2 = xdf2->recs;
439for (i = 0, lim = XDL_MIN(xdf1->nrec, xdf2->nrec); i < lim;
440i++, recs1++, recs2++)
441if ((*recs1)->ha != (*recs2)->ha)
442break;
443444
xdf1->dstart = xdf2->dstart = i;
445446
recs1 = xdf1->recs + xdf1->nrec - 1;
447recs2 = xdf2->recs + xdf2->nrec - 1;
448for (lim -= i, i = 0; i < lim; i++, recs1--, recs2--)
449if ((*recs1)->ha != (*recs2)->ha)
450break;
451452
xdf1->dend = xdf1->nrec - i - 1;
453xdf2->dend = xdf2->nrec - i - 1;
454455
return 0;
456}
457458
459
static int xdl_optimize_ctxs(xdfile_t *xdf1, xdfile_t *xdf2) {
460461
if (xdl_trim_ends(xdf1, xdf2) < 0 ||
462xdl_cleanup_records(xdf1, xdf2) < 0) {
463464
return -1;
465}
466467
return 0;
468}