Merge branch 'ds/push-sparse-tree-walk'
authorJunio C Hamano <gitster@pobox.com>
Thu, 7 Feb 2019 06:05:24 +0000 (22:05 -0800)
committerJunio C Hamano <gitster@pobox.com>
Thu, 7 Feb 2019 06:05:25 +0000 (22:05 -0800)
"git pack-objects" learned another algorithm to compute the set of
objects to send, that trades the resulting packfile off to save
traversal cost to favor small pushes.

* ds/push-sparse-tree-walk:
pack-objects: create GIT_TEST_PACK_SPARSE
pack-objects: create pack.useSparse setting
revision: implement sparse algorithm
list-objects: consume sparse tree walk
revision: add mark_tree_uninteresting_sparse

12 files changed:
Documentation/config/pack.txt
Documentation/git-pack-objects.txt
bisect.c
builtin/pack-objects.c
builtin/rev-list.c
http-push.c
list-objects.c
list-objects.h
revision.c
revision.h
t/README
t/t5322-pack-objects-sparse.sh [new file with mode: 0755]
index edac75c83f471bee7e9f560b1ec2defc13e0482e..425c73aa521a67f87941bd8fc11c69b52432cb00 100644 (file)
@@ -105,6 +105,15 @@ pack.useBitmaps::
        true. You should not generally need to turn this off unless
        you are debugging pack bitmaps.
 
+pack.useSparse::
+       When true, git will default to using the '--sparse' option in
+       'git pack-objects' when the '--revs' option is present. This
+       algorithm only walks trees that appear in paths that introduce new
+       objects. This can have significant performance benefits when
+       computing a pack to send a small change. However, it is possible
+       that extra objects are added to the pack-file if the included
+       commits contain certain types of direct renames.
+
 pack.writeBitmaps (deprecated)::
        This is a deprecated synonym for `repack.writeBitmaps`.
 
index 40c825c38197f4e335ebfb162415cdcc52bbdf1e..e45f3e680d3632c8122db01b2a51cc3971c27922 100644 (file)
@@ -14,7 +14,7 @@ SYNOPSIS
        [--local] [--incremental] [--window=<n>] [--depth=<n>]
        [--revs [--unpacked | --all]] [--keep-pack=<pack-name>]
        [--stdout [--filter=<filter-spec>] | base-name]
-       [--shallow] [--keep-true-parents] < object-list
+       [--shallow] [--keep-true-parents] [--sparse] < object-list
 
 
 DESCRIPTION
@@ -196,6 +196,15 @@ depth is 4095.
        Add --no-reuse-object if you want to force a uniform compression
        level on all data no matter the source.
 
+--sparse::
+       Use the "sparse" algorithm to determine which objects to include in
+       the pack, when combined with the "--revs" option. This algorithm
+       only walks trees that appear in paths that introduce new objects.
+       This can have significant performance benefits when computing
+       a pack to send a small change. However, it is possible that extra
+       objects are added to the pack-file if the included commits contain
+       certain types of direct renames.
+
 --thin::
        Create a "thin" pack by omitting the common objects between a
        sender and a receiver in order to reduce network transfer. This
index 6bf521138a590496a7b3fa7c2515ac53f815a1e3..3af955c4bc4e1c7cd155a5f2e460cee38af732cd 100644 (file)
--- a/bisect.c
+++ b/bisect.c
@@ -658,7 +658,7 @@ static void bisect_common(struct rev_info *revs)
        if (prepare_revision_walk(revs))
                die("revision walk setup failed");
        if (revs->tree_objects)
-               mark_edges_uninteresting(revs, NULL);
+               mark_edges_uninteresting(revs, NULL, 0);
 }
 
 static void exit_if_skipped_commits(struct commit_list *tried,
index ffef8dcf2f279cfe31363fe5051ff2bac3c5a77f..bd67491c1633602a7fab3c48e86bde4421f92c4e 100644 (file)
@@ -84,6 +84,7 @@ static unsigned long pack_size_limit;
 static int depth = 50;
 static int delta_search_threads;
 static int pack_to_stdout;
+static int sparse;
 static int thin;
 static int num_preferred_base;
 static struct progress *progress_state;
@@ -2703,6 +2704,10 @@ static int git_pack_config(const char *k, const char *v, void *cb)
                use_bitmap_index_default = git_config_bool(k, v);
                return 0;
        }
+       if (!strcmp(k, "pack.usesparse")) {
+               sparse = git_config_bool(k, v);
+               return 0;
+       }
        if (!strcmp(k, "pack.threads")) {
                delta_search_threads = git_config_int(k, v);
                if (delta_search_threads < 0)
@@ -3130,7 +3135,7 @@ static void get_object_list(int ac, const char **av)
 
        if (prepare_revision_walk(&revs))
                die(_("revision walk setup failed"));
-       mark_edges_uninteresting(&revs, show_edge);
+       mark_edges_uninteresting(&revs, show_edge, sparse);
 
        if (!fn_show_object)
                fn_show_object = show_object;
@@ -3287,6 +3292,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
                { OPTION_CALLBACK, 0, "unpack-unreachable", NULL, N_("time"),
                  N_("unpack unreachable objects newer than <time>"),
                  PARSE_OPT_OPTARG, option_parse_unpack_unreachable },
+               OPT_BOOL(0, "sparse", &sparse,
+                        N_("use the sparse reachability algorithm")),
                OPT_BOOL(0, "thin", &thin,
                         N_("create thin packs")),
                OPT_BOOL(0, "shallow", &shallow,
@@ -3319,6 +3326,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 
        read_replace_refs = 0;
 
+       sparse = git_env_bool("GIT_TEST_PACK_SPARSE", 0);
        reset_pack_idx_option(&pack_idx_opts);
        git_config(git_pack_config, NULL);
 
index 14ef659c12a49e25bfc8b8143a7fb5b4138d1787..5b5b6dbb1c9b6f9893f3d0420c99655eb19501a8 100644 (file)
@@ -546,7 +546,7 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix)
        if (prepare_revision_walk(&revs))
                die("revision walk setup failed");
        if (revs.tree_objects)
-               mark_edges_uninteresting(&revs, show_edge);
+               mark_edges_uninteresting(&revs, show_edge, 0);
 
        if (bisect_list) {
                int reaches, all;
index bb802d80ee08945b0008d8f6862e29c783410b35..77e2e22852d769a04dc0275498964eff86e362d8 100644 (file)
@@ -1933,7 +1933,7 @@ int cmd_main(int argc, const char **argv)
                pushing = 0;
                if (prepare_revision_walk(&revs))
                        die("revision walk setup failed");
-               mark_edges_uninteresting(&revs, NULL);
+               mark_edges_uninteresting(&revs, NULL, 0);
                objects_to_send = get_delta(&revs, ref_lock);
                finish_all_active_slots();
 
index a2296a8e7b42a3d5d044c648d353ac94f9b4ed81..dc77361e11d02a4eaa994f4351310a0d54d33ef6 100644 (file)
@@ -226,25 +226,73 @@ static void mark_edge_parents_uninteresting(struct commit *commit,
        }
 }
 
-void mark_edges_uninteresting(struct rev_info *revs, show_edge_fn show_edge)
+static void add_edge_parents(struct commit *commit,
+                            struct rev_info *revs,
+                            show_edge_fn show_edge,
+                            struct oidset *set)
+{
+       struct commit_list *parents;
+
+       for (parents = commit->parents; parents; parents = parents->next) {
+               struct commit *parent = parents->item;
+               struct tree *tree = get_commit_tree(parent);
+
+               if (!tree)
+                       continue;
+
+               oidset_insert(set, &tree->object.oid);
+
+               if (!(parent->object.flags & UNINTERESTING))
+                       continue;
+               tree->object.flags |= UNINTERESTING;
+
+               if (revs->edge_hint && !(parent->object.flags & SHOWN)) {
+                       parent->object.flags |= SHOWN;
+                       show_edge(parent);
+               }
+       }
+}
+
+void mark_edges_uninteresting(struct rev_info *revs,
+                             show_edge_fn show_edge,
+                             int sparse)
 {
        struct commit_list *list;
        int i;
 
-       for (list = revs->commits; list; list = list->next) {
-               struct commit *commit = list->item;
+       if (sparse) {
+               struct oidset set;
+               oidset_init(&set, 16);
 
-               if (commit->object.flags & UNINTERESTING) {
-                       mark_tree_uninteresting(revs->repo,
-                                               get_commit_tree(commit));
-                       if (revs->edge_hint_aggressive && !(commit->object.flags & SHOWN)) {
-                               commit->object.flags |= SHOWN;
-                               show_edge(commit);
+               for (list = revs->commits; list; list = list->next) {
+                       struct commit *commit = list->item;
+                       struct tree *tree = get_commit_tree(commit);
+
+                       if (commit->object.flags & UNINTERESTING)
+                               tree->object.flags |= UNINTERESTING;
+
+                       oidset_insert(&set, &tree->object.oid);
+                       add_edge_parents(commit, revs, show_edge, &set);
+               }
+
+               mark_trees_uninteresting_sparse(revs->repo, &set);
+               oidset_clear(&set);
+       } else {
+               for (list = revs->commits; list; list = list->next) {
+                       struct commit *commit = list->item;
+                       if (commit->object.flags & UNINTERESTING) {
+                               mark_tree_uninteresting(revs->repo,
+                                                       get_commit_tree(commit));
+                               if (revs->edge_hint_aggressive && !(commit->object.flags & SHOWN)) {
+                                       commit->object.flags |= SHOWN;
+                                       show_edge(commit);
+                               }
+                               continue;
                        }
-                       continue;
+                       mark_edge_parents_uninteresting(commit, revs, show_edge);
                }
-               mark_edge_parents_uninteresting(commit, revs, show_edge);
        }
+
        if (revs->edge_hint_aggressive) {
                for (i = 0; i < revs->cmdline.nr; i++) {
                        struct object *obj = revs->cmdline.rev[i].item;
index ad407629269a7e7c77953390beccf036fd2452f6..a952680e46671db2543bc4abff78a2d898cd1408 100644 (file)
@@ -10,7 +10,9 @@ typedef void (*show_object_fn)(struct object *, const char *, void *);
 void traverse_commit_list(struct rev_info *, show_commit_fn, show_object_fn, void *);
 
 typedef void (*show_edge_fn)(struct commit *);
-void mark_edges_uninteresting(struct rev_info *, show_edge_fn);
+void mark_edges_uninteresting(struct rev_info *revs,
+                             show_edge_fn show_edge,
+                             int sparse);
 
 struct oidset;
 struct list_objects_filter_options;
index 5c7d3c75d7d8697dbf4ab41441a9e22b7f0a536d..162d511d4686bc43f2032a57fa95e3601eb3d6b2 100644 (file)
@@ -27,6 +27,7 @@
 #include "commit-reach.h"
 #include "commit-graph.h"
 #include "prio-queue.h"
+#include "hashmap.h"
 
 volatile show_early_output_fn_t show_early_output;
 
@@ -99,6 +100,148 @@ void mark_tree_uninteresting(struct repository *r, struct tree *tree)
        mark_tree_contents_uninteresting(r, tree);
 }
 
+struct path_and_oids_entry {
+       struct hashmap_entry ent;
+       char *path;
+       struct oidset trees;
+};
+
+static int path_and_oids_cmp(const void *hashmap_cmp_fn_data,
+                            const struct path_and_oids_entry *e1,
+                            const struct path_and_oids_entry *e2,
+                            const void *keydata)
+{
+       return strcmp(e1->path, e2->path);
+}
+
+static void paths_and_oids_init(struct hashmap *map)
+{
+       hashmap_init(map, (hashmap_cmp_fn) path_and_oids_cmp, NULL, 0);
+}
+
+static void paths_and_oids_clear(struct hashmap *map)
+{
+       struct hashmap_iter iter;
+       struct path_and_oids_entry *entry;
+       hashmap_iter_init(map, &iter);
+
+       while ((entry = (struct path_and_oids_entry *)hashmap_iter_next(&iter))) {
+               oidset_clear(&entry->trees);
+               free(entry->path);
+       }
+
+       hashmap_free(map, 1);
+}
+
+static void paths_and_oids_insert(struct hashmap *map,
+                                 const char *path,
+                                 const struct object_id *oid)
+{
+       int hash = strhash(path);
+       struct path_and_oids_entry key;
+       struct path_and_oids_entry *entry;
+
+       hashmap_entry_init(&key, hash);
+
+       /* use a shallow copy for the lookup */
+       key.path = (char *)path;
+       oidset_init(&key.trees, 0);
+
+       if (!(entry = (struct path_and_oids_entry *)hashmap_get(map, &key, NULL))) {
+               entry = xcalloc(1, sizeof(struct path_and_oids_entry));
+               hashmap_entry_init(entry, hash);
+               entry->path = xstrdup(key.path);
+               oidset_init(&entry->trees, 16);
+               hashmap_put(map, entry);
+       }
+
+       oidset_insert(&entry->trees, oid);
+}
+
+static void add_children_by_path(struct repository *r,
+                                struct tree *tree,
+                                struct hashmap *map)
+{
+       struct tree_desc desc;
+       struct name_entry entry;
+
+       if (!tree)
+               return;
+
+       if (parse_tree_gently(tree, 1) < 0)
+               return;
+
+       init_tree_desc(&desc, tree->buffer, tree->size);
+       while (tree_entry(&desc, &entry)) {
+               switch (object_type(entry.mode)) {
+               case OBJ_TREE:
+                       paths_and_oids_insert(map, entry.path, &entry.oid);
+
+                       if (tree->object.flags & UNINTERESTING) {
+                               struct tree *child = lookup_tree(r, &entry.oid);
+                               if (child)
+                                       child->object.flags |= UNINTERESTING;
+                       }
+                       break;
+               case OBJ_BLOB:
+                       if (tree->object.flags & UNINTERESTING) {
+                               struct blob *child = lookup_blob(r, &entry.oid);
+                               if (child)
+                                       child->object.flags |= UNINTERESTING;
+                       }
+                       break;
+               default:
+                       /* Subproject commit - not in this repository */
+                       break;
+               }
+       }
+
+       free_tree_buffer(tree);
+}
+
+void mark_trees_uninteresting_sparse(struct repository *r,
+                                    struct oidset *trees)
+{
+       unsigned has_interesting = 0, has_uninteresting = 0;
+       struct hashmap map;
+       struct hashmap_iter map_iter;
+       struct path_and_oids_entry *entry;
+       struct object_id *oid;
+       struct oidset_iter iter;
+
+       oidset_iter_init(trees, &iter);
+       while ((!has_interesting || !has_uninteresting) &&
+              (oid = oidset_iter_next(&iter))) {
+               struct tree *tree = lookup_tree(r, oid);
+
+               if (!tree)
+                       continue;
+
+               if (tree->object.flags & UNINTERESTING)
+                       has_uninteresting = 1;
+               else
+                       has_interesting = 1;
+       }
+
+       /* Do not walk unless we have both types of trees. */
+       if (!has_uninteresting || !has_interesting)
+               return;
+
+       paths_and_oids_init(&map);
+
+       oidset_iter_init(trees, &iter);
+       while ((oid = oidset_iter_next(&iter))) {
+               struct tree *tree = lookup_tree(r, oid);
+               add_children_by_path(r, tree, &map);
+       }
+
+       hashmap_iter_init(&map, &map_iter);
+       while ((entry = hashmap_iter_next(&map_iter)))
+               mark_trees_uninteresting_sparse(r, &entry->trees);
+
+       paths_and_oids_clear(&map);
+}
+
 struct commit_stack {
        struct commit **items;
        size_t nr, alloc;
index 52e5a88ff5725862dced5c72fbc2aa6435b94e3b..d32d62abc6a7e65ba04a9f25c5024d6f5bbb5f49 100644 (file)
@@ -67,6 +67,7 @@ struct rev_cmdline_info {
 #define REVISION_WALK_NO_WALK_SORTED 1
 #define REVISION_WALK_NO_WALK_UNSORTED 2
 
+struct oidset;
 struct topo_walk_info;
 
 struct rev_info {
@@ -327,6 +328,7 @@ void put_revision_mark(const struct rev_info *revs,
 
 void mark_parents_uninteresting(struct commit *commit);
 void mark_tree_uninteresting(struct repository *r, struct tree *tree);
+void mark_trees_uninteresting_sparse(struct repository *r, struct oidset *trees);
 
 void show_object_with_name(FILE *, struct object *, const char *);
 
index 25864ec88384850342f3e6122f82470424e49953..6140b8c45bc7890c9166d44b852782364fa675ca 100644 (file)
--- a/t/README
+++ b/t/README
@@ -358,6 +358,10 @@ GIT_TEST_INDEX_VERSION=<n> exercises the index read/write code path
 for the index version specified.  Can be set to any valid version
 (currently 2, 3, or 4).
 
+GIT_TEST_PACK_SPARSE=<boolean> if enabled will default the pack-objects
+builtin to use the sparse object walk. This can still be overridden by
+the --no-sparse command-line argument.
+
 GIT_TEST_PRELOAD_INDEX=<boolean> exercises the preload-index code path
 by overriding the minimum number of cache entries required per thread.
 
diff --git a/t/t5322-pack-objects-sparse.sh b/t/t5322-pack-objects-sparse.sh
new file mode 100755 (executable)
index 0000000..7124b55
--- /dev/null
@@ -0,0 +1,136 @@
+#!/bin/sh
+
+test_description='pack-objects object selection using sparse algorithm'
+. ./test-lib.sh
+
+test_expect_success 'setup repo' '
+       test_commit initial &&
+       for i in $(test_seq 1 3)
+       do
+               mkdir f$i &&
+               for j in $(test_seq 1 3)
+               do
+                       mkdir f$i/f$j &&
+                       echo $j >f$i/f$j/data.txt
+               done
+       done &&
+       git add . &&
+       git commit -m "Initialized trees" &&
+       for i in $(test_seq 1 3)
+       do
+               git checkout -b topic$i master &&
+               echo change-$i >f$i/f$i/data.txt &&
+               git commit -a -m "Changed f$i/f$i/data.txt"
+       done &&
+       cat >packinput.txt <<-EOF &&
+       topic1
+       ^topic2
+       ^topic3
+       EOF
+       git rev-parse                   \
+               topic1                  \
+               topic1^{tree}           \
+               topic1:f1               \
+               topic1:f1/f1            \
+               topic1:f1/f1/data.txt | sort >expect_objects.txt
+'
+
+test_expect_success 'non-sparse pack-objects' '
+       git pack-objects --stdout --revs --no-sparse <packinput.txt >nonsparse.pack &&
+       git index-pack -o nonsparse.idx nonsparse.pack &&
+       git show-index <nonsparse.idx | awk "{print \$2}" >nonsparse_objects.txt &&
+       test_cmp expect_objects.txt nonsparse_objects.txt
+'
+
+test_expect_success 'sparse pack-objects' '
+       git pack-objects --stdout --revs --sparse <packinput.txt >sparse.pack &&
+       git index-pack -o sparse.idx sparse.pack &&
+       git show-index <sparse.idx | awk "{print \$2}" >sparse_objects.txt &&
+       test_cmp expect_objects.txt sparse_objects.txt
+'
+
+test_expect_success 'duplicate a folder from f3 and commit to topic1' '
+       git checkout topic1 &&
+       echo change-3 >f3/f3/data.txt &&
+       git commit -a -m "Changed f3/f3/data.txt" &&
+       git rev-parse                   \
+               topic1~1                \
+               topic1~1^{tree}         \
+               topic1^{tree}           \
+               topic1                  \
+               topic1:f1               \
+               topic1:f1/f1            \
+               topic1:f1/f1/data.txt | sort >required_objects.txt
+'
+
+test_expect_success 'non-sparse pack-objects' '
+       git pack-objects --stdout --revs --no-sparse <packinput.txt >nonsparse.pack &&
+       git index-pack -o nonsparse.idx nonsparse.pack &&
+       git show-index <nonsparse.idx | awk "{print \$2}" >nonsparse_objects.txt &&
+       comm -1 -2 required_objects.txt nonsparse_objects.txt >nonsparse_required_objects.txt &&
+       test_cmp required_objects.txt nonsparse_required_objects.txt
+'
+
+test_expect_success 'sparse pack-objects' '
+       git pack-objects --stdout --revs --sparse <packinput.txt >sparse.pack &&
+       git index-pack -o sparse.idx sparse.pack &&
+       git show-index <sparse.idx | awk "{print \$2}" >sparse_objects.txt &&
+       comm -1 -2 required_objects.txt sparse_objects.txt >sparse_required_objects.txt &&
+       test_cmp required_objects.txt sparse_required_objects.txt
+'
+
+# Demonstrate that the algorithms differ when we copy a tree wholesale
+# from one folder to another.
+
+test_expect_success 'duplicate a folder from f1 into f3' '
+       mkdir f3/f4 &&
+       cp -r f1/f1/* f3/f4 &&
+       git add f3/f4 &&
+       git commit -m "Copied f1/f1 to f3/f4" &&
+       cat >packinput.txt <<-EOF &&
+       topic1
+       ^topic1~1
+       EOF
+       git rev-parse           \
+               topic1          \
+               topic1^{tree}   \
+               topic1:f3 | sort >required_objects.txt
+'
+
+test_expect_success 'non-sparse pack-objects' '
+       git pack-objects --stdout --revs --no-sparse <packinput.txt >nonsparse.pack &&
+       git index-pack -o nonsparse.idx nonsparse.pack &&
+       git show-index <nonsparse.idx | awk "{print \$2}" >nonsparse_objects.txt &&
+       comm -1 -2 required_objects.txt nonsparse_objects.txt >nonsparse_required_objects.txt &&
+       test_cmp required_objects.txt nonsparse_required_objects.txt
+'
+
+test_expect_success 'sparse pack-objects' '
+       git rev-parse                   \
+               topic1                  \
+               topic1^{tree}           \
+               topic1:f3               \
+               topic1:f3/f4            \
+               topic1:f3/f4/data.txt | sort >expect_sparse_objects.txt &&
+       git pack-objects --stdout --revs --sparse <packinput.txt >sparse.pack &&
+       git index-pack -o sparse.idx sparse.pack &&
+       git show-index <sparse.idx | awk "{print \$2}" >sparse_objects.txt &&
+       test_cmp expect_sparse_objects.txt sparse_objects.txt
+'
+
+test_expect_success 'pack.useSparse enables algorithm' '
+       git config pack.useSparse true &&
+       git pack-objects --stdout --revs <packinput.txt >sparse.pack &&
+       git index-pack -o sparse.idx sparse.pack &&
+       git show-index <sparse.idx | awk "{print \$2}" >sparse_objects.txt &&
+       test_cmp expect_sparse_objects.txt sparse_objects.txt
+'
+
+test_expect_success 'pack.useSparse overridden' '
+       git pack-objects --stdout --revs --no-sparse <packinput.txt >sparse.pack &&
+       git index-pack -o sparse.idx sparse.pack &&
+       git show-index <sparse.idx | awk "{print \$2}" >sparse_objects.txt &&
+       test_cmp required_objects.txt sparse_objects.txt
+'
+
+test_done