Merge branch 'nd/pathspec-wildcard'
authorJunio C Hamano <gitster@pobox.com>
Sun, 6 Jan 2013 07:40:15 +0000 (23:40 -0800)
committerJunio C Hamano <gitster@pobox.com>
Sun, 6 Jan 2013 07:40:15 +0000 (23:40 -0800)
Optimize matching paths with common forms of pathspecs that contain
wildcard characters.

* nd/pathspec-wildcard:
tree_entry_interesting: do basedir compare on wildcard patterns when possible
pathspec: apply "*.c" optimization from exclude
pathspec: do exact comparison on the leading non-wildcard part
pathspec: save the non-wildcard length part

builtin/ls-files.c
builtin/ls-tree.c
cache.h
dir.c
dir.h
tree-walk.c
index b5434af0c87741f3160388cf86904c6e3f903527..4a9ee690c7dcbaaf90f9511489b8f3b7b17b3c87 100644 (file)
@@ -337,7 +337,7 @@ void overlay_tree_on_cache(const char *tree_name, const char *prefix)
                matchbuf[0] = prefix;
                matchbuf[1] = NULL;
                init_pathspec(&pathspec, matchbuf);
-               pathspec.items[0].use_wildcard = 0;
+               pathspec.items[0].nowildcard_len = pathspec.items[0].len;
        } else
                init_pathspec(&pathspec, NULL);
        if (read_tree(tree, 1, &pathspec))
index 235c17cc015acfb73358bc5ee5bde712fa2b0fa9..fb76e38d849fc6f7b9cc1a534f62efb605a3655c 100644 (file)
@@ -168,7 +168,7 @@ int cmd_ls_tree(int argc, const char **argv, const char *prefix)
 
        init_pathspec(&pathspec, get_pathspec(prefix, argv + 1));
        for (i = 0; i < pathspec.nr; i++)
-               pathspec.items[i].use_wildcard = 0;
+               pathspec.items[i].nowildcard_len = pathspec.items[i].len;
        pathspec.has_wildcard = 0;
        tree = parse_tree_indirect(sha1);
        if (!tree)
diff --git a/cache.h b/cache.h
index 2b192d24ac63181b0bba08805c09f004af8ef1dc..843689b68fcc96c149bbe7332c5945eb7165f16d 100644 (file)
--- a/cache.h
+++ b/cache.h
@@ -473,6 +473,8 @@ extern int index_name_is_other(const struct index_state *, const char *, int);
 extern int ie_match_stat(const struct index_state *, struct cache_entry *, struct stat *, unsigned int);
 extern int ie_modified(const struct index_state *, struct cache_entry *, struct stat *, unsigned int);
 
+#define PATHSPEC_ONESTAR 1     /* the pathspec pattern sastisfies GFNM_ONESTAR */
+
 struct pathspec {
        const char **raw; /* get_pathspec() result, not freed by free_pathspec() */
        int nr;
@@ -482,7 +484,8 @@ struct pathspec {
        struct pathspec_item {
                const char *match;
                int len;
-               unsigned int use_wildcard:1;
+               int nowildcard_len;
+               int flags;
        } *items;
 };
 
diff --git a/dir.c b/dir.c
index 5a83aa7897f270279c403778f43aea6db1efc5af..9afd388604c05b5c7f7c34cd19f1e6ff2a14fdce 100644 (file)
--- a/dir.c
+++ b/dir.c
@@ -34,6 +34,28 @@ int fnmatch_icase(const char *pattern, const char *string, int flags)
        return fnmatch(pattern, string, flags | (ignore_case ? FNM_CASEFOLD : 0));
 }
 
+inline int git_fnmatch(const char *pattern, const char *string,
+                      int flags, int prefix)
+{
+       int fnm_flags = 0;
+       if (flags & GFNM_PATHNAME)
+               fnm_flags |= FNM_PATHNAME;
+       if (prefix > 0) {
+               if (strncmp(pattern, string, prefix))
+                       return FNM_NOMATCH;
+               pattern += prefix;
+               string += prefix;
+       }
+       if (flags & GFNM_ONESTAR) {
+               int pattern_len = strlen(++pattern);
+               int string_len = strlen(string);
+               return string_len < pattern_len ||
+                      strcmp(pattern,
+                             string + string_len - pattern_len);
+       }
+       return fnmatch(pattern, string, fnm_flags);
+}
+
 static size_t common_prefix_len(const char **pathspec)
 {
        const char *n, *first;
@@ -230,7 +252,10 @@ static int match_pathspec_item(const struct pathspec_item *item, int prefix,
                        return MATCHED_RECURSIVELY;
        }
 
-       if (item->use_wildcard && !fnmatch(match, name, 0))
+       if (item->nowildcard_len < item->len &&
+           !git_fnmatch(match, name,
+                        item->flags & PATHSPEC_ONESTAR ? GFNM_ONESTAR : 0,
+                        item->nowildcard_len - prefix))
                return MATCHED_FNMATCH;
 
        return 0;
@@ -1429,9 +1454,14 @@ int init_pathspec(struct pathspec *pathspec, const char **paths)
 
                item->match = path;
                item->len = strlen(path);
-               item->use_wildcard = !no_wildcard(path);
-               if (item->use_wildcard)
+               item->nowildcard_len = simple_length(path);
+               item->flags = 0;
+               if (item->nowildcard_len < item->len) {
                        pathspec->has_wildcard = 1;
+                       if (path[item->nowildcard_len] == '*' &&
+                           no_wildcard(path + item->nowildcard_len + 1))
+                               item->flags |= PATHSPEC_ONESTAR;
+               }
        }
 
        qsort(pathspec->items, pathspec->nr,
diff --git a/dir.h b/dir.h
index f5c89e3b80143f2508ac5b69d432aa82a4254751..ab5af42b2eedcf7045abd0b6029e84ba804f6057 100644 (file)
--- a/dir.h
+++ b/dir.h
@@ -139,4 +139,13 @@ extern int strcmp_icase(const char *a, const char *b);
 extern int strncmp_icase(const char *a, const char *b, size_t count);
 extern int fnmatch_icase(const char *pattern, const char *string, int flags);
 
+/*
+ * The prefix part of pattern must not contains wildcards.
+ */
+#define GFNM_PATHNAME 1                /* similar to FNM_PATHNAME */
+#define GFNM_ONESTAR  2                /* there is only _one_ wildcard, a star */
+
+extern int git_fnmatch(const char *pattern, const char *string,
+                      int flags, int prefix);
+
 #endif
index 3f54c02d7624c26632e3a0f81b8bb970c6ac307f..6e30ef9d048c62c11a92aa5b0ee6df2d227776e6 100644 (file)
@@ -572,6 +572,54 @@ static int match_dir_prefix(const char *base,
        return 0;
 }
 
+/*
+ * Perform matching on the leading non-wildcard part of
+ * pathspec. item->nowildcard_len must be greater than zero. Return
+ * non-zero if base is matched.
+ */
+static int match_wildcard_base(const struct pathspec_item *item,
+                              const char *base, int baselen,
+                              int *matched)
+{
+       const char *match = item->match;
+       /* the wildcard part is not considered in this function */
+       int matchlen = item->nowildcard_len;
+
+       if (baselen) {
+               int dirlen;
+               /*
+                * Return early if base is longer than the
+                * non-wildcard part but it does not match.
+                */
+               if (baselen >= matchlen) {
+                       *matched = matchlen;
+                       return !strncmp(base, match, matchlen);
+               }
+
+               dirlen = matchlen;
+               while (dirlen && match[dirlen - 1] != '/')
+                       dirlen--;
+
+               /*
+                * Return early if base is shorter than the
+                * non-wildcard part but it does not match. Note that
+                * base ends with '/' so we are sure it really matches
+                * directory
+                */
+               if (strncmp(base, match, baselen))
+                       return 0;
+               *matched = baselen;
+       } else
+               *matched = 0;
+       /*
+        * we could have checked entry against the non-wildcard part
+        * that is not in base and does similar never_interesting
+        * optimization as in match_entry. For now just be happy with
+        * base comparison.
+        */
+       return entry_interesting;
+}
+
 /*
  * Is a tree entry interesting given the pathspec we have?
  *
@@ -602,7 +650,7 @@ enum interesting tree_entry_interesting(const struct name_entry *entry,
                const struct pathspec_item *item = ps->items+i;
                const char *match = item->match;
                const char *base_str = base->buf + base_offset;
-               int matchlen = item->len;
+               int matchlen = item->len, matched = 0;
 
                if (baselen >= matchlen) {
                        /* If it doesn't match, move along... */
@@ -626,8 +674,10 @@ enum interesting tree_entry_interesting(const struct name_entry *entry,
                                        &never_interesting))
                                return entry_interesting;
 
-                       if (item->use_wildcard) {
-                               if (!fnmatch(match + baselen, entry->path, 0))
+                       if (item->nowildcard_len < item->len) {
+                               if (!git_fnmatch(match + baselen, entry->path,
+                                                item->flags & PATHSPEC_ONESTAR ? GFNM_ONESTAR : 0,
+                                                item->nowildcard_len - baselen))
                                        return entry_interesting;
 
                                /*
@@ -642,17 +692,34 @@ enum interesting tree_entry_interesting(const struct name_entry *entry,
                }
 
 match_wildcards:
-               if (!item->use_wildcard)
+               if (item->nowildcard_len == item->len)
                        continue;
 
+               if (item->nowildcard_len &&
+                   !match_wildcard_base(item, base_str, baselen, &matched))
+                       return entry_not_interesting;
+
                /*
                 * Concatenate base and entry->path into one and do
                 * fnmatch() on it.
+                *
+                * While we could avoid concatenation in certain cases
+                * [1], which saves a memcpy and potentially a
+                * realloc, it turns out not worth it. Measurement on
+                * linux-2.6 does not show any clear improvements,
+                * partly because of the nowildcard_len optimization
+                * in git_fnmatch(). Avoid micro-optimizations here.
+                *
+                * [1] if match_wildcard_base() says the base
+                * directory is already matched, we only need to match
+                * the rest, which is shorter so _in theory_ faster.
                 */
 
                strbuf_add(base, entry->path, pathlen);
 
-               if (!fnmatch(match, base->buf + base_offset, 0)) {
+               if (!git_fnmatch(match, base->buf + base_offset,
+                                item->flags & PATHSPEC_ONESTAR ? GFNM_ONESTAR : 0,
+                                item->nowildcard_len)) {
                        strbuf_setlen(base, base_offset + baselen);
                        return entry_interesting;
                }