xdiff: drop XDL_FAST_HASH
authorJeff King <peff@peff.net>
Thu, 1 Dec 2016 04:52:43 +0000 (23:52 -0500)
committerJunio C Hamano <gitster@pobox.com>
Tue, 6 Dec 2016 21:27:11 +0000 (13:27 -0800)
The xdiff code hashes every line of both sides of a diff,
and then compares those hashes to find duplicates. The
overall performance depends both on how fast we can compute
the hashes, but also on how many hash collisions we see.

The idea of XDL_FAST_HASH is to speed up the hash
computation. But the generated hashes have worse collision
behavior. This means that in some cases it speeds diffs up
(running "git log -p" on git.git improves by ~8% with it),
but in others it can slow things down. One pathological case
saw over a 100x slowdown[1].

There may be a better hash function that covers both
properties, but in the meantime we are better off with the
original hash. It's slightly slower in the common case, but
it has fewer surprising pathological cases.

[1] http://public-inbox.org/git/20141222041944.GA441@peff.net/

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Makefile
config.mak.uname
xdiff/xutils.c
index f53fcc90d71b7e0ba95b94e9baf67d559085863c..f61076997aa499ec58cf34734045e0b1c0c9fe10 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -338,11 +338,6 @@ all::
 #
 # Define NATIVE_CRLF if your platform uses CRLF for line endings.
 #
-# Define XDL_FAST_HASH to use an alternative line-hashing method in
-# the diff algorithm.  It gives a nice speedup if your processor has
-# fast unaligned word loads.  Does NOT work on big-endian systems!
-# Enabled by default on x86_64.
-#
 # Define GIT_USER_AGENT if you want to change how git identifies itself during
 # network interactions.  The default is "git/$(GIT_VERSION)".
 #
@@ -1485,10 +1480,6 @@ ifndef NO_MSGFMT_EXTENDED_OPTIONS
        MSGFMT += --check --statistics
 endif
 
-ifneq (,$(XDL_FAST_HASH))
-       BASIC_CFLAGS += -DXDL_FAST_HASH
-endif
-
 ifdef GMTIME_UNRELIABLE_ERRORS
        COMPAT_OBJS += compat/gmtime.o
        BASIC_CFLAGS += -DGMTIME_UNRELIABLE_ERRORS
index b232908f8c8c2eae84bd6ef8ab2a96ac45bf94a3..447f36ac2e31dd4d11e90f326b114a78fdba8df0 100644 (file)
@@ -17,9 +17,6 @@ endif
 # because maintaining the nesting to match is a pain.  If
 # we had "elif" things would have been much nicer...
 
-ifeq ($(uname_M),x86_64)
-       XDL_FAST_HASH = YesPlease
-endif
 ifeq ($(uname_S),OSF1)
        # Need this for u_short definitions et al
        BASIC_CFLAGS += -D_OSF_SOURCE
index 027192a1c7f12214c0ff6787296769ce708ba407..04d7b32e4e4a75b9d5762f3d3a2a2f9c236075a0 100644 (file)
@@ -264,110 +264,6 @@ static unsigned long xdl_hash_record_with_whitespace(char const **data,
        return ha;
 }
 
-#ifdef XDL_FAST_HASH
-
-#define REPEAT_BYTE(x)  ((~0ul / 0xff) * (x))
-
-#define ONEBYTES       REPEAT_BYTE(0x01)
-#define NEWLINEBYTES   REPEAT_BYTE(0x0a)
-#define HIGHBITS       REPEAT_BYTE(0x80)
-
-/* Return the high bit set in the first byte that is a zero */
-static inline unsigned long has_zero(unsigned long a)
-{
-       return ((a - ONEBYTES) & ~a) & HIGHBITS;
-}
-
-static inline long count_masked_bytes(unsigned long mask)
-{
-       if (sizeof(long) == 8) {
-               /*
-                * Jan Achrenius on G+: microoptimized version of
-                * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
-                * that works for the bytemasks without having to
-                * mask them first.
-                */
-               /*
-                * return mask * 0x0001020304050608 >> 56;
-                *
-                * Doing it like this avoids warnings on 32-bit machines.
-                */
-               long a = (REPEAT_BYTE(0x01) / 0xff + 1);
-               return mask * a >> (sizeof(long) * 7);
-       } else {
-               /* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
-               /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
-               long a = (0x0ff0001 + mask) >> 23;
-               /* Fix the 1 for 00 case */
-               return a & mask;
-       }
-}
-
-unsigned long xdl_hash_record(char const **data, char const *top, long flags)
-{
-       unsigned long hash = 5381;
-       unsigned long a = 0, mask = 0;
-       char const *ptr = *data;
-       char const *end = top - sizeof(unsigned long) + 1;
-
-       if (flags & XDF_WHITESPACE_FLAGS)
-               return xdl_hash_record_with_whitespace(data, top, flags);
-
-       ptr -= sizeof(unsigned long);
-       do {
-               hash += hash << 5;
-               hash ^= a;
-               ptr += sizeof(unsigned long);
-               if (ptr >= end)
-                       break;
-               a = *(unsigned long *)ptr;
-               /* Do we have any '\n' bytes in this word? */
-               mask = has_zero(a ^ NEWLINEBYTES);
-       } while (!mask);
-
-       if (ptr >= end) {
-               /*
-                * There is only a partial word left at the end of the
-                * buffer. Because we may work with a memory mapping,
-                * we have to grab the rest byte by byte instead of
-                * blindly reading it.
-                *
-                * To avoid problems with masking in a signed value,
-                * we use an unsigned char here.
-                */
-               const char *p;
-               for (p = top - 1; p >= ptr; p--)
-                       a = (a << 8) + *((const unsigned char *)p);
-               mask = has_zero(a ^ NEWLINEBYTES);
-               if (!mask)
-                       /*
-                        * No '\n' found in the partial word.  Make a
-                        * mask that matches what we read.
-                        */
-                       mask = 1UL << (8 * (top - ptr) + 7);
-       }
-
-       /* The mask *below* the first high bit set */
-       mask = (mask - 1) & ~mask;
-       mask >>= 7;
-       hash += hash << 5;
-       hash ^= a & mask;
-
-       /* Advance past the last (possibly partial) word */
-       ptr += count_masked_bytes(mask);
-
-       if (ptr < top) {
-               assert(*ptr == '\n');
-               ptr++;
-       }
-
-       *data = ptr;
-
-       return hash;
-}
-
-#else /* XDL_FAST_HASH */
-
 unsigned long xdl_hash_record(char const **data, char const *top, long flags) {
        unsigned long ha = 5381;
        char const *ptr = *data;
@@ -384,8 +280,6 @@ unsigned long xdl_hash_record(char const **data, char const *top, long flags) {
        return ha;
 }
 
-#endif /* XDL_FAST_HASH */
-
 unsigned int xdl_hashbits(unsigned int size) {
        unsigned int val = 1, bits = 0;