Merge branch 'rc/histogram-diff' into HEAD
author Junio C Hamano <gitster@pobox.com>
Thu, 18 Aug 2011 00:17:16 +0000 (17:17 -0700)
committer Junio C Hamano <gitster@pobox.com>
Thu, 18 Aug 2011 00:17:16 +0000 (17:17 -0700)
* rc/histogram-diff:
xdiff/xhistogram: drop need for additional variable
xdiff/xhistogram: rely on xdl_trim_ends()
xdiff/xhistogram: rework handling of recursed results
xdiff: do away with xdl_mmfile_next()
Make test number unique
xdiff/xprepare: use a smaller sample size for histogram diff
xdiff/xprepare: skip classification
teach --histogram to diff
t4033-diff-patience: factor out tests
xdiff/xpatience: factor out fall-back-diff function
xdiff/xprepare: refactor abort cleanups
xdiff/xprepare: use memset()

Conflicts:
xdiff/xprepare.c

1  2 
Makefile
diff.c
xdiff/xprepare.c
diff --combined Makefile
index 8dd782fd88d3f1ec3353445ce57cc03c01cf3653,f50d3c777027c1a2318f8ecc042d6ab97541b436..89cc6245a721a1cb807e197fb3346cec02367a51
+++ b/Makefile
@@@ -30,15 -30,15 +30,15 @@@ all:
  # Define LIBPCREDIR=/foo/bar if your libpcre header and library files are in
  # /foo/bar/include and /foo/bar/lib directories.
  #
 -# Define NO_CURL if you do not have libcurl installed.  git-http-pull and
 +# Define NO_CURL if you do not have libcurl installed.  git-http-fetch and
  # git-http-push are not built, and you cannot use http:// and https://
 -# transports.
 +# transports (neither smart nor dumb).
  #
  # Define CURLDIR=/foo/bar if your curl header and library files are in
  # /foo/bar/include and /foo/bar/lib directories.
  #
  # Define NO_EXPAT if you do not have expat installed.  git-http-push is
 -# not built, and you cannot push using http:// and https:// transports.
 +# not built, and you cannot push using http:// and https:// transports (dumb).
  #
  # Define EXPATDIR=/foo/bar if your expat header and library files are in
  # /foo/bar/include and /foo/bar/lib directories.
  #
  # Define NEEDS_SSL_WITH_CRYPTO if you need -lssl when using -lcrypto (Darwin).
  #
 +# Define NEEDS_SSL_WITH_CURL if you need -lssl with -lcurl (Minix).
 +#
 +# Define NEEDS_IDN_WITH_CURL if you need -lidn when using -lcurl (Minix).
 +#
  # Define NEEDS_LIBICONV if linking with libc is not enough (Darwin).
  #
  # Define NEEDS_SOCKET if linking with libc is not enough (SunOS,
  # that tells runtime paths to dynamic libraries;
  # "-Wl,-rpath=/path/lib" is used instead.
  #
 +# Define NO_NORETURN if using buggy versions of gcc 4.6+ and profile feedback,
 +# as the compiler can crash (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=49299)
 +#
  # Define USE_NSEC below if you want git to care about sub-second file mtimes
  # and ctimes. Note that you need recent glibc (at least 2.2.4) for this, and
  # it will BREAK YOUR LOCAL DIFFS! show-diff and anything using it will likely
@@@ -563,7 -556,6 +563,7 @@@ LIB_H += sha1-lookup.
  LIB_H += sideband.h
  LIB_H += sigchain.h
  LIB_H += strbuf.h
 +LIB_H += streaming.h
  LIB_H += string-list.h
  LIB_H += submodule.h
  LIB_H += tag.h
@@@ -670,7 -662,6 +670,7 @@@ LIB_OBJS += shallow.
  LIB_OBJS += sideband.o
  LIB_OBJS += sigchain.o
  LIB_OBJS += strbuf.o
 +LIB_OBJS += streaming.o
  LIB_OBJS += string-list.o
  LIB_OBJS += submodule.o
  LIB_OBJS += symlinks.o
@@@ -1133,6 -1124,8 +1133,6 @@@ endi
        X = .exe
  endif
  ifeq ($(uname_S),Interix)
 -      NO_SYS_POLL_H = YesPlease
 -      NO_INTTYPES_H = YesPlease
        NO_INITGROUPS = YesPlease
        NO_IPV6 = YesPlease
        NO_MEMMEM = YesPlease
        ifeq ($(uname_R),3.5)
                NO_INET_NTOP = YesPlease
                NO_INET_PTON = YesPlease
 +              NO_SOCKADDR_STORAGE = YesPlease
 +              NO_FNMATCH_CASEFOLD = YesPlease
        endif
        ifeq ($(uname_R),5.2)
                NO_INET_NTOP = YesPlease
                NO_INET_PTON = YesPlease
 +              NO_SOCKADDR_STORAGE = YesPlease
 +              NO_FNMATCH_CASEFOLD = YesPlease
        endif
  endif
 +ifeq ($(uname_S),Minix)
 +      NO_IPV6 = YesPlease
 +      NO_ST_BLOCKS_IN_STRUCT_STAT = YesPlease
 +      NO_NSEC = YesPlease
 +      NEEDS_LIBGEN =
 +      NEEDS_CRYPTO_WITH_SSL = YesPlease
 +      NEEDS_IDN_WITH_CURL = YesPlease
 +      NEEDS_SSL_WITH_CURL = YesPlease
 +      NEEDS_RESOLV =
 +      NO_HSTRERROR = YesPlease
 +      NO_MMAP = YesPlease
 +      NO_CURL =
 +      NO_EXPAT =
 +endif
  ifneq (,$(findstring MINGW,$(uname_S)))
        pathsep = ;
        NO_PREAD = YesPlease
        else
                CURL_LIBCURL = -lcurl
        endif
 +      ifdef NEEDS_SSL_WITH_CURL
 +              CURL_LIBCURL += -lssl
 +              ifdef NEEDS_CRYPTO_WITH_SSL
 +                      CURL_LIBCURL += -lcrypto
 +              endif
 +      endif
 +      ifdef NEEDS_IDN_WITH_CURL
 +              CURL_LIBCURL += -lidn
 +      endif
 +
        REMOTE_CURL_PRIMARY = git-remote-http$X
        REMOTE_CURL_ALIASES = git-remote-https$X git-remote-ftp$X git-remote-ftps$X
        REMOTE_CURL_NAMES = $(REMOTE_CURL_PRIMARY) $(REMOTE_CURL_ALIASES)
@@@ -1357,7 -1322,7 +1357,7 @@@ ifndef NO_OPENSS
                OPENSSL_LINK =
        endif
        ifdef NEEDS_CRYPTO_WITH_SSL
 -              OPENSSL_LINK += -lcrypto
 +              OPENSSL_LIBSSL += -lcrypto
        endif
  else
        BASIC_CFLAGS += -DNO_OPENSSL
@@@ -1409,9 -1374,6 +1409,9 @@@ endi
  ifdef USE_ST_TIMESPEC
        BASIC_CFLAGS += -DUSE_ST_TIMESPEC
  endif
 +ifdef NO_NORETURN
 +      BASIC_CFLAGS += -DNO_NORETURN
 +endif
  ifdef NO_NSEC
        BASIC_CFLAGS += -DNO_NSEC
  endif
@@@ -1744,7 -1706,7 +1744,7 @@@ git.sp git.s git.o: EXTRA_CPPFLAGS = -D
        '-DGIT_MAN_PATH="$(mandir_SQ)"' \
        '-DGIT_INFO_PATH="$(infodir_SQ)"'
  
 -git$X: git.o $(BUILTIN_OBJS) $(GITLIBS)
 +git$X: git.o GIT-LDFLAGS $(BUILTIN_OBJS) $(GITLIBS)
        $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ git.o \
                $(BUILTIN_OBJS) $(ALL_LDFLAGS) $(LIBS)
  
@@@ -1876,7 -1838,7 +1876,7 @@@ ifndef NO_CUR
        GIT_OBJS += http.o http-walker.o remote-curl.o
  endif
  XDIFF_OBJS = xdiff/xdiffi.o xdiff/xprepare.o xdiff/xutils.o xdiff/xemit.o \
-       xdiff/xmerge.o xdiff/xpatience.o
+       xdiff/xmerge.o xdiff/xpatience.o xdiff/xhistogram.o
  VCSSVN_OBJS = vcs-svn/string_pool.o vcs-svn/line_buffer.o \
        vcs-svn/repo_tree.o vcs-svn/fast_export.o vcs-svn/svndump.o
  VCSSVN_TEST_OBJS = test-obj-pool.o test-string-pool.o \
@@@ -2042,17 -2004,17 +2042,17 @@@ compat/nedmalloc/nedmalloc.sp compat/ne
        -DNDEBUG -DOVERRIDE_STRDUP -DREPLACE_SYSTEM_ALLOCATOR
  endif
  
 -git-%$X: %.o $(GITLIBS)
 +git-%$X: %.o GIT-LDFLAGS $(GITLIBS)
        $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
  
 -git-imap-send$X: imap-send.o $(GITLIBS)
 +git-imap-send$X: imap-send.o GIT-LDFLAGS $(GITLIBS)
        $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
                $(LIBS) $(OPENSSL_LINK) $(OPENSSL_LIBSSL) $(LIB_4_CRYPTO)
  
 -git-http-fetch$X: revision.o http.o http-walker.o http-fetch.o $(GITLIBS)
 +git-http-fetch$X: revision.o http.o http-walker.o http-fetch.o GIT-LDFLAGS $(GITLIBS)
        $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
                $(LIBS) $(CURL_LIBCURL)
 -git-http-push$X: revision.o http.o http-push.o $(GITLIBS)
 +git-http-push$X: revision.o http.o http-push.o GIT-LDFLAGS $(GITLIBS)
        $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
                $(LIBS) $(CURL_LIBCURL) $(EXPAT_LIBEXPAT)
  
@@@ -2062,7 -2024,7 +2062,7 @@@ $(REMOTE_CURL_ALIASES): $(REMOTE_CURL_P
        ln -s $< $@ 2>/dev/null || \
        cp $< $@
  
 -$(REMOTE_CURL_PRIMARY): remote-curl.o http.o http-walker.o $(GITLIBS)
 +$(REMOTE_CURL_PRIMARY): remote-curl.o http.o http-walker.o GIT-LDFLAGS $(GITLIBS)
        $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
                $(LIBS) $(CURL_LIBCURL) $(EXPAT_LIBEXPAT)
  
@@@ -2132,15 -2094,6 +2132,15 @@@ GIT-CFLAGS: FORC
                echo "$$FLAGS" >GIT-CFLAGS; \
              fi
  
 +TRACK_LDFLAGS = $(subst ','\'',$(ALL_LDFLAGS))
 +
 +GIT-LDFLAGS: FORCE
 +      @FLAGS='$(TRACK_LDFLAGS)'; \
 +          if test x"$$FLAGS" != x"`cat GIT-LDFLAGS 2>/dev/null`" ; then \
 +              echo 1>&2 "    * new link flags"; \
 +              echo "$$FLAGS" >GIT-LDFLAGS; \
 +            fi
 +
  # We need to apply sq twice, once to protect from the shell
  # that runs GIT-BUILD-OPTIONS, and then again to protect it
  # and the first level quoting from the shell that runs "echo".
@@@ -2212,7 -2165,7 +2212,7 @@@ test-svn-fe$X: vcs-svn/lib.
  
  .PRECIOUS: $(TEST_OBJS)
  
 -test-%$X: test-%.o $(GITLIBS)
 +test-%$X: test-%.o GIT-LDFLAGS $(GITLIBS)
        $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(filter %.a,$^) $(LIBS)
  
  check-sha1:: test-sha1$X
@@@ -2422,7 -2375,7 +2422,7 @@@ ifndef NO_TCLT
        $(MAKE) -C gitk-git clean
        $(MAKE) -C git-gui clean
  endif
 -      $(RM) GIT-VERSION-FILE GIT-CFLAGS GIT-GUI-VARS GIT-BUILD-OPTIONS
 +      $(RM) GIT-VERSION-FILE GIT-CFLAGS GIT-LDFLAGS GIT-GUI-VARS GIT-BUILD-OPTIONS
  
  .PHONY: all install clean strip
  .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
@@@ -2533,19 -2486,3 +2533,19 @@@ cover_db: coverage-repor
  
  cover_db_html: cover_db
        cover -report html -outputdir cover_db_html cover_db
 +
 +### profile feedback build
 +#
 +.PHONY: profile-all profile-clean
 +
 +PROFILE_GEN_CFLAGS := $(CFLAGS) -fprofile-generate -DNO_NORETURN=1
 +PROFILE_USE_CFLAGS := $(CFLAGS) -fprofile-use -fprofile-correction -DNO_NORETURN=1
 +
 +profile-clean:
 +      $(RM) $(addsuffix *.gcda,$(object_dirs))
 +      $(RM) $(addsuffix *.gcno,$(object_dirs))
 +
 +profile-all: profile-clean
 +      $(MAKE) CFLAGS="$(PROFILE_GEN_CFLAGS)" all
 +      $(MAKE) CFLAGS="$(PROFILE_GEN_CFLAGS)" -j1 test
 +      $(MAKE) CFLAGS="$(PROFILE_USE_CFLAGS)" all
diff --combined diff.c
index 93ef9a265ca6b52b644468979dba732c70b5097a,1b940ee72e8925ba97d3c162bf5c9da946c6bd0c..d3d8daec77142bc6a67aacdc2e62ee522ddf12db
--- 1/diff.c
--- 2/diff.c
+++ b/diff.c
@@@ -1316,10 -1316,9 +1316,10 @@@ static void show_stats(struct diffstat_
        int i, len, add, del, adds = 0, dels = 0;
        uintmax_t max_change = 0, max_len = 0;
        int total_files = data->nr;
 -      int width, name_width;
 +      int width, name_width, count;
        const char *reset, *add_c, *del_c;
        const char *line_prefix = "";
 +      int extra_shown = 0;
        struct strbuf *msg = NULL;
  
        if (data->nr == 0)
  
        width = options->stat_width ? options->stat_width : 80;
        name_width = options->stat_name_width ? options->stat_name_width : 50;
 +      count = options->stat_count ? options->stat_count : data->nr;
  
        /* Sanity: give at least 5 columns to the graph,
         * but leave at least 10 columns for the name.
        add_c = diff_get_color_opt(options, DIFF_FILE_NEW);
        del_c = diff_get_color_opt(options, DIFF_FILE_OLD);
  
 -      for (i = 0; i < data->nr; i++) {
 +      for (i = 0; (i < count) && (i < data->nr); i++) {
                struct diffstat_file *file = data->files[i];
                uintmax_t change = file->added + file->deleted;
 +              if (!data->files[i]->is_renamed &&
 +                       (change == 0)) {
 +                      count++; /* not shown == room for one more */
 +                      continue;
 +              }
                fill_print_name(file);
                len = strlen(file->print_name);
                if (max_len < len)
                if (max_change < change)
                        max_change = change;
        }
 +      count = i; /* min(count, data->nr) */
  
        /* Compute the width of the graph part;
         * 10 is for one blank at the beginning of the line plus
        else
                width = max_change;
  
 -      for (i = 0; i < data->nr; i++) {
 +      for (i = 0; i < count; i++) {
                const char *prefix = "";
                char *name = data->files[i]->print_name;
                uintmax_t added = data->files[i]->added;
                uintmax_t deleted = data->files[i]->deleted;
                int name_len;
  
 +              if (!data->files[i]->is_renamed &&
 +                       (added + deleted == 0)) {
 +                      total_files--;
 +                      continue;
 +              }
                /*
                 * "scale" the filename
                 */
                        fprintf(options->file, "  Unmerged\n");
                        continue;
                }
 -              else if (!data->files[i]->is_renamed &&
 -                       (added + deleted == 0)) {
 -                      total_files--;
 -                      continue;
 -              }
  
                /*
                 * scale the add/delete
                show_graph(options->file, '-', del, del_c, reset);
                fprintf(options->file, "\n");
        }
 +      for (i = count; i < data->nr; i++) {
 +              uintmax_t added = data->files[i]->added;
 +              uintmax_t deleted = data->files[i]->deleted;
 +              if (!data->files[i]->is_renamed &&
 +                       (added + deleted == 0)) {
 +                      total_files--;
 +                      continue;
 +              }
 +              adds += added;
 +              dels += deleted;
 +              if (!extra_shown)
 +                      fprintf(options->file, "%s ...\n", line_prefix);
 +              extra_shown = 1;
 +      }
        fprintf(options->file, "%s", line_prefix);
        fprintf(options->file,
               " %d files changed, %d insertions(+), %d deletions(-)\n",
@@@ -1861,20 -1839,20 +1861,20 @@@ static unsigned char *deflate_it(char *
  {
        int bound;
        unsigned char *deflated;
 -      z_stream stream;
 +      git_zstream stream;
  
        memset(&stream, 0, sizeof(stream));
 -      deflateInit(&stream, zlib_compression_level);
 -      bound = deflateBound(&stream, size);
 +      git_deflate_init(&stream, zlib_compression_level);
 +      bound = git_deflate_bound(&stream, size);
        deflated = xmalloc(bound);
        stream.next_out = deflated;
        stream.avail_out = bound;
  
        stream.next_in = (unsigned char *)data;
        stream.avail_in = size;
 -      while (deflate(&stream, Z_FINISH) == Z_OK)
 +      while (git_deflate(&stream, Z_FINISH) == Z_OK)
                ; /* nothing */
 -      deflateEnd(&stream);
 +      git_deflate_end(&stream);
        *result_size = stream.total_out;
        return deflated;
  }
@@@ -2006,7 -1984,19 +2006,7 @@@ struct userdiff_driver *get_textconv(st
                return NULL;
  
        diff_filespec_load_driver(one);
 -      if (!one->driver->textconv)
 -              return NULL;
 -
 -      if (one->driver->textconv_want_cache && !one->driver->textconv_cache) {
 -              struct notes_cache *c = xmalloc(sizeof(*c));
 -              struct strbuf name = STRBUF_INIT;
 -
 -              strbuf_addf(&name, "textconv/%s", one->driver->name);
 -              notes_cache_init(c, name.buf, one->driver->textconv);
 -              one->driver->textconv_cache = c;
 -      }
 -
 -      return one->driver;
 +      return userdiff_get_textconv(one->driver);
  }
  
  static void builtin_diff(const char *name_a,
@@@ -3230,7 -3220,6 +3230,7 @@@ static int stat_opt(struct diff_option
        char *end;
        int width = options->stat_width;
        int name_width = options->stat_name_width;
 +      int count = options->stat_count;
        int argcount = 1;
  
        arg += strlen("--stat");
                                name_width = strtoul(av[1], &end, 10);
                                argcount = 2;
                        }
 +              } else if (!prefixcmp(arg, "-count")) {
 +                      arg += strlen("-count");
 +                      if (*arg == '=')
 +                              count = strtoul(arg + 1, &end, 10);
 +                      else if (!*arg && !av[1])
 +                              die("Option '--stat-count' requires a value");
 +                      else if (!*arg) {
 +                              count = strtoul(av[1], &end, 10);
 +                              argcount = 2;
 +                      }
                }
                break;
        case '=':
                width = strtoul(arg+1, &end, 10);
                if (*end == ',')
                        name_width = strtoul(end+1, &end, 10);
 +              if (*end == ',')
 +                      count = strtoul(end+1, &end, 10);
        }
  
        /* Important! This checks all the error cases! */
        options->output_format |= DIFF_FORMAT_DIFFSTAT;
        options->stat_name_width = name_width;
        options->stat_width = width;
 +      options->stat_count = count;
        return argcount;
  }
  
@@@ -3349,7 -3325,7 +3349,7 @@@ int diff_opt_parse(struct diff_options 
        else if (!strcmp(arg, "-s"))
                options->output_format |= DIFF_FORMAT_NO_OUTPUT;
        else if (!prefixcmp(arg, "--stat"))
 -              /* --stat, --stat-width, or --stat-name-width */
 +              /* --stat, --stat-width, --stat-name-width, or --stat-count */
                return stat_opt(options, av);
  
        /* renames options */
                DIFF_XDL_SET(options, IGNORE_WHITESPACE_AT_EOL);
        else if (!strcmp(arg, "--patience"))
                DIFF_XDL_SET(options, PATIENCE_DIFF);
+       else if (!strcmp(arg, "--histogram"))
+               DIFF_XDL_SET(options, HISTOGRAM_DIFF);
  
        /* flags options */
        else if (!strcmp(arg, "--binary")) {
diff --combined xdiff/xprepare.c
index 05a8f01f38a8391b74db0511e9ab0226d9448086,620fc9a657e2246d3a382c916c2cdd4f820c0c44..eba31ffaeebb4fe15f45ce854aa2ed3f63489b8c
@@@ -26,6 -26,8 +26,8 @@@
  #define XDL_KPDIS_RUN 4
  #define XDL_MAX_EQLIMIT 1024
  #define XDL_SIMSCAN_WINDOW 100
+ #define XDL_GUESS_NLINES1 256
+ #define XDL_GUESS_NLINES2 20
  
  
  typedef struct s_xdlclass {
@@@ -34,7 -36,6 +36,7 @@@
        char const *line;
        long size;
        long idx;
 +      long len1, len2;
  } xdlclass_t;
  
  typedef struct s_xdlclassifier {
@@@ -42,8 -43,6 +44,8 @@@
        long hsize;
        xdlclass_t **rchash;
        chastore_t ncha;
 +      xdlclass_t **rcrecs;
 +      long alloc;
        long count;
        long flags;
  } xdlclassifier_t;
  
  static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags);
  static void xdl_free_classifier(xdlclassifier_t *cf);
 -static int xdl_classify_record(xdlclassifier_t *cf, xrecord_t **rhash, unsigned int hbits,
 -                             xrecord_t *rec);
 -static int xdl_prepare_ctx(mmfile_t *mf, long narec, xpparam_t const *xpp,
 +static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t **rhash,
 +                             unsigned int hbits, xrecord_t *rec);
 +static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_t const *xpp,
                           xdlclassifier_t *cf, xdfile_t *xdf);
  static void xdl_free_ctx(xdfile_t *xdf);
  static int xdl_clean_mmatch(char const *dis, long i, long s, long e);
 -static int xdl_cleanup_records(xdfile_t *xdf1, xdfile_t *xdf2);
 +static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2);
  static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2);
 -static int xdl_optimize_ctxs(xdfile_t *xdf1, xdfile_t *xdf2);
 +static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2);
  
  
  
  
  static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags) {
-       long i;
        cf->flags = flags;
  
        cf->hbits = xdl_hashbits((unsigned int) size);
                xdl_cha_free(&cf->ncha);
                return -1;
        }
-       for (i = 0; i < cf->hsize; i++)
-               cf->rchash[i] = NULL;
+       memset(cf->rchash, 0, cf->hsize * sizeof(xdlclass_t *));
  
 +      cf->alloc = size;
 +      if (!(cf->rcrecs = (xdlclass_t **) xdl_malloc(cf->alloc * sizeof(xdlclass_t *)))) {
 +
 +              xdl_free(cf->rchash);
 +              xdl_cha_free(&cf->ncha);
 +              return -1;
 +      }
 +
        cf->count = 0;
  
        return 0;
  
  static void xdl_free_classifier(xdlclassifier_t *cf) {
  
 +      xdl_free(cf->rcrecs);
        xdl_free(cf->rchash);
        xdl_cha_free(&cf->ncha);
  }
  
  
 -static int xdl_classify_record(xdlclassifier_t *cf, xrecord_t **rhash, unsigned int hbits,
 -                             xrecord_t *rec) {
 +static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t **rhash,
 +                             unsigned int hbits, xrecord_t *rec) {
        long hi;
        char const *line;
        xdlclass_t *rcrec;
 +      xdlclass_t **rcrecs;
  
        line = rec->ptr;
        hi = (long) XDL_HASHLONG(rec->ha, cf->hbits);
                        return -1;
                }
                rcrec->idx = cf->count++;
 +              if (cf->count > cf->alloc) {
 +                      cf->alloc *= 2;
 +                      if (!(rcrecs = (xdlclass_t **) xdl_realloc(cf->rcrecs, cf->alloc * sizeof(xdlclass_t *)))) {
 +
 +                              return -1;
 +                      }
 +                      cf->rcrecs = rcrecs;
 +              }
 +              cf->rcrecs[rcrec->idx] = rcrec;
                rcrec->line = line;
                rcrec->size = rec->size;
                rcrec->ha = rec->ha;
 +              rcrec->len1 = rcrec->len2 = 0;
                rcrec->next = cf->rchash[hi];
                cf->rchash[hi] = rcrec;
        }
  
 +      (pass == 1) ? rcrec->len1++ : rcrec->len2++;
 +
        rec->ha = (unsigned long) rcrec->idx;
  
        hi = (long) XDL_HASHLONG(rec->ha, hbits);
  }
  
  
 -static int xdl_prepare_ctx(mmfile_t *mf, long narec, xpparam_t const *xpp,
 +static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_t const *xpp,
                           xdlclassifier_t *cf, xdfile_t *xdf) {
        unsigned int hbits;
-       long i, nrec, hsize, bsize;
+       long nrec, hsize, bsize;
        unsigned long hav;
        char const *blk, *cur, *top, *prev;
        xrecord_t *crec;
        char *rchg;
        long *rindex;
  
-       if (xdl_cha_init(&xdf->rcha, sizeof(xrecord_t), narec / 4 + 1) < 0) {
-               return -1;
-       }
-       if (!(recs = (xrecord_t **) xdl_malloc(narec * sizeof(xrecord_t *)))) {
-               xdl_cha_free(&xdf->rcha);
-               return -1;
-       }
-       hbits = xdl_hashbits((unsigned int) narec);
-       hsize = 1 << hbits;
-       if (!(rhash = (xrecord_t **) xdl_malloc(hsize * sizeof(xrecord_t *)))) {
-               xdl_free(recs);
-               xdl_cha_free(&xdf->rcha);
-               return -1;
+       ha = NULL;
+       rindex = NULL;
+       rchg = NULL;
+       rhash = NULL;
+       recs = NULL;
+       if (xdl_cha_init(&xdf->rcha, sizeof(xrecord_t), narec / 4 + 1) < 0)
+               goto abort;
+       if (!(recs = (xrecord_t **) xdl_malloc(narec * sizeof(xrecord_t *))))
+               goto abort;
+       if (xpp->flags & XDF_HISTOGRAM_DIFF)
+               hbits = hsize = 0;
+       else {
+               hbits = xdl_hashbits((unsigned int) narec);
+               hsize = 1 << hbits;
+               if (!(rhash = (xrecord_t **) xdl_malloc(hsize * sizeof(xrecord_t *))))
+                       goto abort;
+               memset(rhash, 0, hsize * sizeof(xrecord_t *));
        }
-       for (i = 0; i < hsize; i++)
-               rhash[i] = NULL;
  
        nrec = 0;
        if ((cur = blk = xdl_mmfile_first(mf, &bsize)) != NULL) {
-               for (top = blk + bsize;;) {
-                       if (cur >= top) {
-                               if (!(cur = blk = xdl_mmfile_next(mf, &bsize)))
-                                       break;
-                               top = blk + bsize;
-                       }
+               for (top = blk + bsize; cur < top; ) {
                        prev = cur;
                        hav = xdl_hash_record(&cur, top, xpp->flags);
                        if (nrec >= narec) {
                                narec *= 2;
-                               if (!(rrecs = (xrecord_t **) xdl_realloc(recs, narec * sizeof(xrecord_t *)))) {
-                                       xdl_free(rhash);
-                                       xdl_free(recs);
-                                       xdl_cha_free(&xdf->rcha);
-                                       return -1;
-                               }
+                               if (!(rrecs = (xrecord_t **) xdl_realloc(recs, narec * sizeof(xrecord_t *))))
+                                       goto abort;
                                recs = rrecs;
                        }
-                       if (!(crec = xdl_cha_alloc(&xdf->rcha))) {
-                               xdl_free(rhash);
-                               xdl_free(recs);
-                               xdl_cha_free(&xdf->rcha);
-                               return -1;
-                       }
+                       if (!(crec = xdl_cha_alloc(&xdf->rcha)))
+                               goto abort;
                        crec->ptr = prev;
                        crec->size = (long) (cur - prev);
                        crec->ha = hav;
                        recs[nrec++] = crec;
  
-                       if (xdl_classify_record(pass, cf, rhash, hbits, crec) < 0) {
-                               xdl_free(rhash);
-                               xdl_free(recs);
-                               xdl_cha_free(&xdf->rcha);
-                               return -1;
-                       }
+                       if (!(xpp->flags & XDF_HISTOGRAM_DIFF) &&
 -                              xdl_classify_record(cf, rhash, hbits, crec) < 0)
++                              xdl_classify_record(pass, cf, rhash, hbits, crec) < 0)
+                               goto abort;
                }
        }
  
-       if (!(rchg = (char *) xdl_malloc((nrec + 2) * sizeof(char)))) {
-               xdl_free(rhash);
-               xdl_free(recs);
-               xdl_cha_free(&xdf->rcha);
-               return -1;
-       }
+       if (!(rchg = (char *) xdl_malloc((nrec + 2) * sizeof(char))))
+               goto abort;
        memset(rchg, 0, (nrec + 2) * sizeof(char));
  
-       if (!(rindex = (long *) xdl_malloc((nrec + 1) * sizeof(long)))) {
-               xdl_free(rchg);
-               xdl_free(rhash);
-               xdl_free(recs);
-               xdl_cha_free(&xdf->rcha);
-               return -1;
-       }
-       if (!(ha = (unsigned long *) xdl_malloc((nrec + 1) * sizeof(unsigned long)))) {
-               xdl_free(rindex);
-               xdl_free(rchg);
-               xdl_free(rhash);
-               xdl_free(recs);
-               xdl_cha_free(&xdf->rcha);
-               return -1;
-       }
+       if (!(rindex = (long *) xdl_malloc((nrec + 1) * sizeof(long))))
+               goto abort;
+       if (!(ha = (unsigned long *) xdl_malloc((nrec + 1) * sizeof(unsigned long))))
+               goto abort;
  
        xdf->nrec = nrec;
        xdf->recs = recs;
        xdf->dend = nrec - 1;
  
        return 0;
+ abort:
+       xdl_free(ha);
+       xdl_free(rindex);
+       xdl_free(rchg);
+       xdl_free(rhash);
+       xdl_free(recs);
+       xdl_cha_free(&xdf->rcha);
+       return -1;
  }
  
  
@@@ -290,39 -236,51 +261,51 @@@ static void xdl_free_ctx(xdfile_t *xdf
  
  int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
                    xdfenv_t *xe) {
-       long enl1, enl2;
+       long enl1, enl2, sample;
        xdlclassifier_t cf;
  
-       enl1 = xdl_guess_lines(mf1) + 1;
-       enl2 = xdl_guess_lines(mf2) + 1;
+       /*
+        * For histogram diff, we can afford a smaller sample size and
+        * thus a poorer estimate of the number of lines, as the hash
+        * table (rhash) won't be filled up/grown. The number of lines
+        * (nrecs) will be updated correctly anyway by
+        * xdl_prepare_ctx().
+        */
+       sample = xpp->flags & XDF_HISTOGRAM_DIFF ? XDL_GUESS_NLINES2 : XDL_GUESS_NLINES1;
+       enl1 = xdl_guess_lines(mf1, sample) + 1;
+       enl2 = xdl_guess_lines(mf2, sample) + 1;
  
-       if (xdl_init_classifier(&cf, enl1 + enl2 + 1, xpp->flags) < 0) {
+       if (!(xpp->flags & XDF_HISTOGRAM_DIFF) &&
+               xdl_init_classifier(&cf, enl1 + enl2 + 1, xpp->flags) < 0) {
  
                return -1;
        }
  
 -      if (xdl_prepare_ctx(mf1, enl1, xpp, &cf, &xe->xdf1) < 0) {
 +      if (xdl_prepare_ctx(1, mf1, enl1, xpp, &cf, &xe->xdf1) < 0) {
  
                xdl_free_classifier(&cf);
                return -1;
        }
 -      if (xdl_prepare_ctx(mf2, enl2, xpp, &cf, &xe->xdf2) < 0) {
 +      if (xdl_prepare_ctx(2, mf2, enl2, xpp, &cf, &xe->xdf2) < 0) {
  
                xdl_free_ctx(&xe->xdf1);
                xdl_free_classifier(&cf);
                return -1;
        }
  
 -      if (!(xpp->flags & XDF_HISTOGRAM_DIFF))
 -              xdl_free_classifier(&cf);
 -
        if (!(xpp->flags & XDF_PATIENCE_DIFF) &&
 -                      xdl_optimize_ctxs(&xe->xdf1, &xe->xdf2) < 0) {
+                       !(xpp->flags & XDF_HISTOGRAM_DIFF) &&
 +                      xdl_optimize_ctxs(&cf, &xe->xdf1, &xe->xdf2) < 0) {
  
                xdl_free_ctx(&xe->xdf2);
                xdl_free_ctx(&xe->xdf1);
                return -1;
        }
  
-       xdl_free_classifier(&cf);
++      if (!(xpp->flags & XDF_HISTOGRAM_DIFF))
++              xdl_free_classifier(&cf);
 +
        return 0;
  }
  
@@@ -397,10 -355,11 +380,10 @@@ static int xdl_clean_mmatch(char const 
   * matches on the other file. Also, lines that have multiple matches
   * might be potentially discarded if they happear in a run of discardable.
   */
 -static int xdl_cleanup_records(xdfile_t *xdf1, xdfile_t *xdf2) {
 -      long i, nm, rhi, nreff, mlim;
 -      unsigned long hav;
 +static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) {
 +      long i, nm, nreff;
        xrecord_t **recs;
 -      xrecord_t *rec;
 +      xdlclass_t *rcrec;
        char *dis, *dis1, *dis2;
  
        if (!(dis = (char *) xdl_malloc(xdf1->nrec + xdf2->nrec + 2))) {
        dis1 = dis;
        dis2 = dis1 + xdf1->nrec + 1;
  
 -      if ((mlim = xdl_bogosqrt(xdf1->nrec)) > XDL_MAX_EQLIMIT)
 -              mlim = XDL_MAX_EQLIMIT;
        for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) {
 -              hav = (*recs)->ha;
 -              rhi = (long) XDL_HASHLONG(hav, xdf2->hbits);
 -              for (nm = 0, rec = xdf2->rhash[rhi]; rec; rec = rec->next)
 -                      if (rec->ha == hav && ++nm == mlim)
 -                              break;
 -              dis1[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1;
 +              rcrec = cf->rcrecs[(*recs)->ha];
 +              nm = rcrec ? rcrec->len2 : 0;
 +              dis1[i] = (nm == 0) ? 0: 1;
        }
  
 -      if ((mlim = xdl_bogosqrt(xdf2->nrec)) > XDL_MAX_EQLIMIT)
 -              mlim = XDL_MAX_EQLIMIT;
        for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) {
 -              hav = (*recs)->ha;
 -              rhi = (long) XDL_HASHLONG(hav, xdf1->hbits);
 -              for (nm = 0, rec = xdf1->rhash[rhi]; rec; rec = rec->next)
 -                      if (rec->ha == hav && ++nm == mlim)
 -                              break;
 -              dis2[i] = (nm == 0) ? 0: (nm >= mlim) ? 2: 1;
 +              rcrec = cf->rcrecs[(*recs)->ha];
 +              nm = rcrec ? rcrec->len1 : 0;
 +              dis2[i] = (nm == 0) ? 0: 1;
        }
  
        for (nreff = 0, i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart];
@@@ -482,10 -451,10 +465,10 @@@ static int xdl_trim_ends(xdfile_t *xdf1
  }
  
  
 -static int xdl_optimize_ctxs(xdfile_t *xdf1, xdfile_t *xdf2) {
 +static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) {
  
        if (xdl_trim_ends(xdf1, xdf2) < 0 ||
 -          xdl_cleanup_records(xdf1, xdf2) < 0) {
 +          xdl_cleanup_records(cf, xdf1, xdf2) < 0) {
  
                return -1;
        }