#include "xinclude.h"
-
#define XDL_KPDIS_RUN 4
#define XDL_MAX_EQLIMIT 1024
-
+#define XDL_SIMSCAN_WINDOW 100
typedef struct s_xdlclass {
xdlclass_t **rchash;
chastore_t ncha;
long count;
+ long flags;
} xdlclassifier_t;
-static int xdl_init_classifier(xdlclassifier_t *cf, long size);
+static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags);
static void xdl_free_classifier(xdlclassifier_t *cf);
static int xdl_classify_record(xdlclassifier_t *cf, xrecord_t **rhash, unsigned int hbits,
xrecord_t *rec);
-static int xdl_init_classifier(xdlclassifier_t *cf, long size) {
+static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags) {
long i;
+ cf->flags = flags;
+
cf->hbits = xdl_hashbits((unsigned int) size);
cf->hsize = 1 << cf->hbits;
line = rec->ptr;
hi = (long) XDL_HASHLONG(rec->ha, cf->hbits);
for (rcrec = cf->rchash[hi]; rcrec; rcrec = rcrec->next)
- if (rcrec->ha == rec->ha && rcrec->size == rec->size &&
- !memcmp(line, rcrec->line, rec->size))
+ if (rcrec->ha == rec->ha &&
+ xdl_recmatch(rcrec->line, rcrec->size,
+ rec->ptr, rec->size, cf->flags))
break;
if (!rcrec) {
top = blk + bsize;
}
prev = cur;
- hav = xdl_hash_record(&cur, top);
+ hav = xdl_hash_record(&cur, top, xpp->flags);
if (nrec >= narec) {
narec *= 2;
if (!(rrecs = (xrecord_t **) xdl_realloc(recs, narec * sizeof(xrecord_t *)))) {
enl1 = xdl_guess_lines(mf1) + 1;
enl2 = xdl_guess_lines(mf2) + 1;
- if (xdl_init_classifier(&cf, enl1 + enl2 + 1) < 0) {
+ if (xdl_init_classifier(&cf, enl1 + enl2 + 1, xpp->flags) < 0) {
return -1;
}
static int xdl_clean_mmatch(char const *dis, long i, long s, long e) {
long r, rdis0, rpdis0, rdis1, rpdis1;
+ /*
+ * Limits the window the is examined during the similar-lines
+ * scan. The loops below stops when dis[i - r] == 1 (line that
+ * has no match), but there are corner cases where the loop
+ * proceed all the way to the extremities by causing huge
+ * performance penalties in case of big files.
+ */
+ if (i - s > XDL_SIMSCAN_WINDOW)
+ s = i - XDL_SIMSCAN_WINDOW;
+ if (e - i > XDL_SIMSCAN_WINDOW)
+ e = i + XDL_SIMSCAN_WINDOW;
+
/*
* Scans the lines before 'i' to find a run of lines that either
* have no match (dis[j] == 0) or have multiple matches (dis[j] > 1).
return 0;
}
-