Merge branch 'jh/add-index-entry-optim'
authorJunio C Hamano <gitster@pobox.com>
Wed, 26 Apr 2017 06:39:07 +0000 (15:39 +0900)
committerJunio C Hamano <gitster@pobox.com>
Wed, 26 Apr 2017 06:39:07 +0000 (15:39 +0900)
"git checkout" that handles a lot of paths has been optimized by
reducing the number of unnecessary checks of paths in the
has_dir_name() function.

* jh/add-index-entry-optim:
read-cache: speed up has_dir_name (part 2)
read-cache: speed up has_dir_name (part 1)
read-cache: speed up add_index_entry during checkout
p0006-read-tree-checkout: perf test to time read-tree
read-cache: add strcmp_offset function

1  2 
Makefile
cache.h
read-cache.c
t/helper/.gitignore
diff --combined Makefile
index eb1a1a7cffd5efc06ada671ca482eb638cd25d61,0a4fb9de8cb1771a48150cc4b41a5f31244367cd..c739023e4e5683f8588abf57a066df707b81ebfe
+++ b/Makefile
@@@ -626,12 -626,10 +626,12 @@@ TEST_PROGRAMS_NEED_X += test-line-buffe
  TEST_PROGRAMS_NEED_X += test-match-trees
  TEST_PROGRAMS_NEED_X += test-mergesort
  TEST_PROGRAMS_NEED_X += test-mktemp
 +TEST_PROGRAMS_NEED_X += test-online-cpus
  TEST_PROGRAMS_NEED_X += test-parse-options
  TEST_PROGRAMS_NEED_X += test-path-utils
  TEST_PROGRAMS_NEED_X += test-prio-queue
  TEST_PROGRAMS_NEED_X += test-read-cache
 +TEST_PROGRAMS_NEED_X += test-ref-store
  TEST_PROGRAMS_NEED_X += test-regex
  TEST_PROGRAMS_NEED_X += test-revision-walking
  TEST_PROGRAMS_NEED_X += test-run-command
@@@ -639,6 -637,7 +639,7 @@@ TEST_PROGRAMS_NEED_X += test-scrap-cach
  TEST_PROGRAMS_NEED_X += test-sha1
  TEST_PROGRAMS_NEED_X += test-sha1-array
  TEST_PROGRAMS_NEED_X += test-sigchain
+ TEST_PROGRAMS_NEED_X += test-strcmp-offset
  TEST_PROGRAMS_NEED_X += test-string-list
  TEST_PROGRAMS_NEED_X += test-submodule-config
  TEST_PROGRAMS_NEED_X += test-subprocess
@@@ -1853,7 -1852,6 +1854,7 @@@ perl/perl.mak: perl/PM.stam
  
  perl/PM.stamp: FORCE
        @$(FIND) perl -type f -name '*.pm' | sort >$@+ && \
 +      $(PERL_PATH) -V >>$@+ && \
        { cmp $@+ $@ >/dev/null 2>/dev/null || mv $@+ $@; } && \
        $(RM) $@+
  
diff --combined cache.h
index 322ae582590e40a9691ff390ba6e4d1589c8b550,2aea7caf448a006e34308a6cbcdb4bb7131b8a15..e1f0e182ad011d5a3b6912f48b597dcc6d1070b0
+++ b/cache.h
@@@ -66,12 -66,8 +66,12 @@@ unsigned long git_deflate_bound(git_zst
  #define GIT_SHA1_RAWSZ 20
  #define GIT_SHA1_HEXSZ (2 * GIT_SHA1_RAWSZ)
  
 +/* The length in byte and in hex digits of the largest possible hash value. */
 +#define GIT_MAX_RAWSZ GIT_SHA1_RAWSZ
 +#define GIT_MAX_HEXSZ GIT_SHA1_HEXSZ
 +
  struct object_id {
 -      unsigned char hash[GIT_SHA1_RAWSZ];
 +      unsigned char hash[GIT_MAX_RAWSZ];
  };
  
  #if defined(DT_UNKNOWN) && !defined(NO_D_TYPE_IN_DIRENT)
@@@ -599,6 -595,7 +599,7 @@@ extern int write_locked_index(struct in
  extern int discard_index(struct index_state *);
  extern int unmerged_index(const struct index_state *);
  extern int verify_path(const char *path);
+ extern int strcmp_offset(const char *s1, const char *s2, size_t *first_change);
  extern int index_dir_exists(struct index_state *istate, const char *name, int namelen);
  extern void adjust_dirname_case(struct index_state *istate, char *name);
  extern struct cache_entry *index_file_exists(struct index_state *istate, const char *name, int namelen, int igncase);
@@@ -710,8 -707,6 +711,8 @@@ extern void update_index_if_able(struc
  extern int hold_locked_index(struct lock_file *, int);
  extern void set_alternate_index_output(const char *);
  
 +extern int verify_index_checksum;
 +
  /* Environment bits from configuration mechanism */
  extern int trust_executable_bit;
  extern int trust_ctime;
@@@ -984,7 -979,7 +985,7 @@@ extern char *sha1_pack_index_name(cons
  extern const char *find_unique_abbrev(const unsigned char *sha1, int len);
  extern int find_unique_abbrev_r(char *hex, const unsigned char *sha1, int len);
  
 -extern const unsigned char null_sha1[GIT_SHA1_RAWSZ];
 +extern const unsigned char null_sha1[GIT_MAX_RAWSZ];
  extern const struct object_id null_oid;
  
  static inline int hashcmp(const unsigned char *sha1, const unsigned char *sha2)
@@@ -1166,7 -1161,7 +1167,7 @@@ typedef int create_file_fn(const char *
  int raceproof_create_file(const char *path, create_file_fn fn, void *cb);
  
  int mkdir_in_gitdir(const char *path);
 -extern char *expand_user_path(const char *path);
 +extern char *expand_user_path(const char *path, int real_home);
  const char *enter_repo(const char *path, int strict);
  static inline int is_absolute_path(const char *path)
  {
@@@ -1366,7 -1361,7 +1367,7 @@@ extern int get_sha1_with_context(const 
  
  extern int get_oid(const char *str, struct object_id *oid);
  
 -typedef int each_abbrev_fn(const unsigned char *sha1, void *);
 +typedef int each_abbrev_fn(const struct object_id *oid, void *);
  extern int for_each_abbrev(const char *prefix, each_abbrev_fn, void *);
  
  extern int set_disambiguate_hint_config(const char *var, const char *value);
@@@ -1681,12 -1676,9 +1682,12 @@@ extern struct packed_git *find_sha1_pac
  extern void pack_report(void);
  
  /*
 - * Create a temporary file rooted in the object database directory.
 + * Create a temporary file rooted in the object database directory, or
 + * die on failure. The filename is taken from "pattern", which should have the
 + * usual "XXXXXX" trailer, and the resulting filename is written into the
 + * "template" buffer. Returns the open descriptor.
   */
 -extern int odb_mkstemp(char *template, size_t limit, const char *pattern);
 +extern int odb_mkstemp(struct strbuf *template, const char *pattern);
  
  /*
   * Generate the filename to be used for a pack file with checksum "sha1" and
@@@ -1891,11 -1883,6 +1892,11 @@@ enum config_origin_type 
        CONFIG_ORIGIN_CMDLINE
  };
  
 +struct config_options {
 +      unsigned int respect_includes : 1;
 +      const char *git_dir;
 +};
 +
  typedef int (*config_fn_t)(const char *, const char *, void *);
  extern int git_default_config(const char *, const char *, void *);
  extern int git_config_from_file(config_fn_t fn, const char *, void *);
@@@ -1909,13 -1896,12 +1910,13 @@@ extern void read_early_config(config_fn
  extern void git_config(config_fn_t fn, void *);
  extern int git_config_with_options(config_fn_t fn, void *,
                                   struct git_config_source *config_source,
 -                                 int respect_includes);
 +                                 const struct config_options *opts);
  extern int git_parse_ulong(const char *, unsigned long *);
  extern int git_parse_maybe_bool(const char *);
  extern int git_config_int(const char *, const char *);
  extern int64_t git_config_int64(const char *, const char *);
  extern unsigned long git_config_ulong(const char *, const char *);
 +extern ssize_t git_config_ssize_t(const char *, const char *);
  extern int git_config_bool_or_int(const char *, const char *, int *);
  extern int git_config_bool(const char *, const char *);
  extern int git_config_maybe_bool(const char *, const char *);
@@@ -1962,7 -1948,6 +1963,7 @@@ struct config_include_data 
        int depth;
        config_fn_t fn;
        void *data;
 +      const struct config_options *opts;
  };
  #define CONFIG_INCLUDE_INIT { 0 }
  extern int git_config_include(const char *name, const char *value, void *data);
diff --combined read-cache.c
index 008b335844c48fad9408452c11bd921d7b9b0b44,5d47ba8bdd5947b536582a4fcc8963644206fcd4..d8c657d6a7faa717be8459cf88de83acc51ddde1
@@@ -887,9 -887,32 +887,32 @@@ static int has_file_name(struct index_s
        return retval;
  }
  
+ /*
+  * Like strcmp(), but also return the offset of the first change.
+  * If strings are equal, return the length.
+  */
+ int strcmp_offset(const char *s1, const char *s2, size_t *first_change)
+ {
+       size_t k;
+       if (!first_change)
+               return strcmp(s1, s2);
+       for (k = 0; s1[k] == s2[k]; k++)
+               if (s1[k] == '\0')
+                       break;
+       *first_change = k;
+       return (unsigned char)s1[k] - (unsigned char)s2[k];
+ }
  /*
   * Do we have another file with a pathname that is a proper
   * subset of the name we're trying to add?
+  *
+  * That is, is there another file in the index with a path
+  * that matches a sub-directory in the given entry?
   */
  static int has_dir_name(struct index_state *istate,
                        const struct cache_entry *ce, int pos, int ok_to_replace)
        int stage = ce_stage(ce);
        const char *name = ce->name;
        const char *slash = name + ce_namelen(ce);
+       size_t len_eq_last;
+       int cmp_last = 0;
+       /*
+        * We are frequently called during an iteration on a sorted
+        * list of pathnames and while building a new index.  Therefore,
+        * there is a high probability that this entry will eventually
+        * be appended to the index, rather than inserted in the middle.
+        * If we can confirm that, we can avoid binary searches on the
+        * components of the pathname.
+        *
+        * Compare the entry's full path with the last path in the index.
+        */
+       if (istate->cache_nr > 0) {
+               cmp_last = strcmp_offset(name,
+                       istate->cache[istate->cache_nr - 1]->name,
+                       &len_eq_last);
+               if (cmp_last > 0) {
+                       if (len_eq_last == 0) {
+                               /*
+                                * The entry sorts AFTER the last one in the
+                                * index and their paths have no common prefix,
+                                * so there cannot be a F/D conflict.
+                                */
+                               return retval;
+                       } else {
+                               /*
+                                * The entry sorts AFTER the last one in the
+                                * index, but has a common prefix.  Fall through
+                                * to the loop below to disect the entry's path
+                                * and see where the difference is.
+                                */
+                       }
+               } else if (cmp_last == 0) {
+                       /*
+                        * The entry exactly matches the last one in the
+                        * index, but because of multiple stage and CE_REMOVE
+                        * items, we fall through and let the regular search
+                        * code handle it.
+                        */
+               }
+       }
  
        for (;;) {
-               int len;
+               size_t len;
  
                for (;;) {
                        if (*--slash == '/')
                }
                len = slash - name;
  
+               if (cmp_last > 0) {
+                       /*
+                        * (len + 1) is a directory boundary (including
+                        * the trailing slash).  And since the loop is
+                        * decrementing "slash", the first iteration is
+                        * the longest directory prefix; subsequent
+                        * iterations consider parent directories.
+                        */
+                       if (len + 1 <= len_eq_last) {
+                               /*
+                                * The directory prefix (including the trailing
+                                * slash) also appears as a prefix in the last
+                                * entry, so the remainder cannot collide (because
+                                * strcmp said the whole path was greater).
+                                *
+                                * EQ: last: xxx/A
+                                *     this: xxx/B
+                                *
+                                * LT: last: xxx/file_A
+                                *     this: xxx/file_B
+                                */
+                               return retval;
+                       }
+                       if (len > len_eq_last) {
+                               /*
+                                * This part of the directory prefix (excluding
+                                * the trailing slash) is longer than the known
+                                * equal portions, so this sub-directory cannot
+                                * collide with a file.
+                                *
+                                * GT: last: xxxA
+                                *     this: xxxB/file
+                                */
+                               return retval;
+                       }
+                       if (istate->cache_nr > 0 &&
+                               ce_namelen(istate->cache[istate->cache_nr - 1]) > len) {
+                               /*
+                                * The directory prefix lines up with part of
+                                * a longer file or directory name, but sorts
+                                * after it, so this sub-directory cannot
+                                * collide with a file.
+                                *
+                                * last: xxx/yy-file (because '-' sorts before '/')
+                                * this: xxx/yy/abc
+                                */
+                               return retval;
+                       }
+                       /*
+                        * This is a possible collision. Fall through and
+                        * let the regular search code handle it.
+                        *
+                        * last: xxx
+                        * this: xxx/file
+                        */
+               }
                pos = index_name_stage_pos(istate, name, len, stage);
                if (pos >= 0) {
                        /*
@@@ -1001,7 -1127,16 +1127,16 @@@ static int add_index_entry_with_check(s
  
        if (!(option & ADD_CACHE_KEEP_CACHE_TREE))
                cache_tree_invalidate_path(istate, ce->name);
-       pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce));
+       /*
+        * If this entry's path sorts after the last entry in the index,
+        * we can avoid searching for it.
+        */
+       if (istate->cache_nr > 0 &&
+               strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0)
+               pos = -istate->cache_nr - 1;
+       else
+               pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce));
  
        /* existing match? Just replace it. */
        if (pos >= 0) {
@@@ -1371,9 -1506,6 +1506,9 @@@ struct ondisk_cache_entry_extended 
                            ondisk_cache_entry_extended_size(ce_namelen(ce)) : \
                            ondisk_cache_entry_size(ce_namelen(ce)))
  
 +/* Allow fsck to force verification of the index checksum. */
 +int verify_index_checksum;
 +
  static int verify_hdr(struct cache_header *hdr, unsigned long size)
  {
        git_SHA_CTX c;
        hdr_version = ntohl(hdr->hdr_version);
        if (hdr_version < INDEX_FORMAT_LB || INDEX_FORMAT_UB < hdr_version)
                return error("bad index version %d", hdr_version);
 +
 +      if (!verify_index_checksum)
 +              return 0;
 +
        git_SHA1_Init(&c);
        git_SHA1_Update(&c, hdr, size - 20);
        git_SHA1_Final(sha1, &c);
diff --combined t/helper/.gitignore
index acd5db180f31f5b81877c75f8a695c3e4437e34d,e7fe7ff5982dc4b478fc1359034933089eca004c..721650256e63eb51c66d5985c04bc3095f183c1c
  /test-match-trees
  /test-mergesort
  /test-mktemp
 +/test-online-cpus
  /test-parse-options
  /test-path-utils
  /test-prio-queue
  /test-read-cache
 +/test-ref-store
  /test-regex
  /test-revision-walking
  /test-run-command
  /test-sha1
  /test-sha1-array
  /test-sigchain
+ /test-strcmp-offset
  /test-string-list
  /test-submodule-config
  /test-subprocess