commit-graph: merge commit-graph chains

author Derrick Stolee <dstolee@microsoft.com>
Tue, 18 Jun 2019 18:14:29 +0000 (11:14 -0700)

committer Junio C Hamano <gitster@pobox.com>
Thu, 20 Jun 2019 03:46:26 +0000 (20:46 -0700)
author: Derrick Stolee <dstolee@microsoft.com>
Tue, 18 Jun 2019 18:14:29 +0000 (11:14 -0700)
committer: Junio C Hamano <gitster@pobox.com>
Thu, 20 Jun 2019 03:46:26 +0000 (20:46 -0700)
diff --git a/Documentation/technical/commit-graph.txt b/Documentation/technical/commit-graph.txt

index 1dca3bd8fe90a189e52008212d8aa8d9bd094eff..d9c6253b0a4368b2e391157c853dc8c690d07196 100644 (file)
--- a/Documentation/technical/commit-graph.txt
+++ b/Documentation/technical/commit-graph.txt
@@ -186,6 +186,86 @@ positions to refer to their parents, which may be in `graph-{hash1}.graph` or
  its containment in the intervals [0, X0), [X0, X0 + X1), [X0 + X1, X0 + X1 +
  X2).
  
+Each commit-graph file (except the base, `graph-{hash0}.graph`) contains data
+specifying the hashes of all files in the lower layers. In the above example,
+`graph-{hash1}.graph` contains `{hash0}` while `graph-{hash2}.graph` contains
+`{hash0}` and `{hash1}`.
+
+## Merging commit-graph files
+
+If we only added a new commit-graph file on every write, we would run into a
+linear search problem through many commit-graph files.  Instead, we use a merge
+strategy to decide when the stack should collapse some number of levels.
+
+The diagram below shows such a collapse. As a set of new commits is added, it
+is determined by the merge strategy that the files should collapse to
+`graph-{hash1}`. Thus, the new commits, the commits in `graph-{hash2}` and
+the commits in `graph-{hash1}` should be combined into a new `graph-{hash3}`
+file.
+
+                           +---------------------+
+                           |                     |
+                           |    (new commits)    |
+                           |                     |
+                           +---------------------+
+                           |                     |
+ +-----------------------+  +---------------------+
+ |  graph-{hash2}        |->|                     |
+ +-----------------------+  +---------------------+
+         |                 |                     |
+ +-----------------------+  +---------------------+
+ |                       |  |                     |
+ |  graph-{hash1}        |->|                     |
+ |                       |  |                     |
+ +-----------------------+  +---------------------+
+         |                  tmp_graphXXX
+ +-----------------------+
+ |                       |
+ |                       |
+ |                       |
+ |  graph-{hash0}        |
+ |                       |
+ |                       |
+ |                       |
+ +-----------------------+
+
+During this process, the commits to write are combined, sorted and we write the
+contents to a temporary file, all while holding a `commit-graph-chain.lock`
+lock-file.  When the file is flushed, we rename it to `graph-{hash3}`
+according to the computed `{hash3}`. Finally, we write the new chain data to
+`commit-graph-chain.lock`:
+
+```
+       {hash3}
+       {hash0}
+```
+
+We then close the lock-file.
+
+## Merge Strategy
+
+When writing a set of commits that do not exist in the commit-graph stack of
+height N, we default to creating a new file at level N + 1. We then decide to
+merge with the Nth level if one of two conditions holds:
+
+  1. The expected file size for level N + 1 is at least half the file size for
+     level N.
+
+  2. Level N + 1 contains more than 64,000 commits.
+
+This decision cascades down the levels: when we merge a level we create a new
+set of commits that then compares to the next level.
+
+The first condition bounds the number of levels to be logarithmic in the total
+number of commits.  The second condition bounds the total number of commits in
+a `graph-{hashN}` file and not in the `commit-graph` file, preventing
+significant performance issues when the stack merges and another process only
+partially reads the previous stack.
+
+The merge strategy values (2 for the size multiple, 64,000 for the maximum
+number of commits) could be extracted into config settings for full
+flexibility.
+
  Related Links
  -------------
  [0] https://bugs.chromium.org/p/git/issues/detail?id=8
diff --git a/commit-graph.c b/commit-graph.c

index 1224309e5feb48a6cef4b86c41f5b4fecfcce604..fb3100921cdcd4675231b81f71f6d1db209096d7 100644 (file)
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -1298,36 +1298,6 @@ static int write_graph_chunk_base(struct hashfile *f,
         return 0;
  }
  
-static void init_commit_graph_chain(struct write_commit_graph_context *ctx)
-{
-       struct commit_graph *g = ctx->r->objects->commit_graph;
-       uint32_t i;
-
-       ctx->new_base_graph = g;
-       ctx->base_graph_name = xstrdup(g->filename);
-       ctx->new_num_commits_in_base = g->num_commits + g->num_commits_in_base;
-
-       ctx->num_commit_graphs_after = ctx->num_commit_graphs_before + 1;
-
-       ALLOC_ARRAY(ctx->commit_graph_filenames_after, ctx->num_commit_graphs_after);
-       ALLOC_ARRAY(ctx->commit_graph_hash_after, ctx->num_commit_graphs_after);
-
-       for (i = 0; i < ctx->num_commit_graphs_before - 1; i++)
-               ctx->commit_graph_filenames_after[i] = xstrdup(ctx->commit_graph_filenames_before[i]);
-
-       if (ctx->num_commit_graphs_before)
-               ctx->commit_graph_filenames_after[ctx->num_commit_graphs_before - 1] =
-                       get_split_graph_filename(ctx->obj_dir, oid_to_hex(&g->oid));
-
-       i = ctx->num_commit_graphs_before - 1;
-
-       while (g) {
-               ctx->commit_graph_hash_after[i] = xstrdup(oid_to_hex(&g->oid));
-               i--;
-               g = g->base_graph;
-       }
-}
-
  static int write_commit_graph_file(struct write_commit_graph_context *ctx)
  {
         uint32_t i;
@@ -1509,6 +1479,145 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
         return 0;
  }
  
+/*
+ * Tunables for the split commit-graph merge strategy: the commit-count
+ * threshold and the size multiple compared against each existing layer.
+ */
+static int split_strategy_max_commits = 64000;
+static float split_strategy_size_mult = 2.0f;
+
+/*
+ * Decide how many layers the chain has after this write, record the layer
+ * that is left as the new base (NULL when every layer is absorbed), and
+ * allocate and seed the "after" filename and hash arrays.
+ */
+static void split_graph_merge_strategy(struct write_commit_graph_context *ctx)
+{
+       struct commit_graph *tip = ctx->r->objects->commit_graph;
+       struct commit_graph *layer;
+       uint32_t pending = ctx->commits.nr;
+       uint32_t pos;
+
+       ctx->num_commit_graphs_after = ctx->num_commit_graphs_before + 1;
+
+       /*
+        * Absorb layers from the tip downward. Stop at the first layer that
+        * is larger than the size multiple of the pending commits while the
+        * pending count is still within the commit limit.
+        */
+       for (layer = tip; layer; layer = layer->base_graph) {
+               if (layer->num_commits > split_strategy_size_mult * pending &&
+                   pending <= split_strategy_max_commits)
+                       break;
+
+               pending += layer->num_commits;
+               ctx->num_commit_graphs_after--;
+       }
+
+       ctx->new_base_graph = layer;
+
+       ALLOC_ARRAY(ctx->commit_graph_filenames_after, ctx->num_commit_graphs_after);
+       ALLOC_ARRAY(ctx->commit_graph_hash_after, ctx->num_commit_graphs_after);
+
+       /* Copy filenames for every slot present in both old and new chain. */
+       for (pos = 0; pos < ctx->num_commit_graphs_before; pos++) {
+               if (pos >= ctx->num_commit_graphs_after)
+                       break;
+               ctx->commit_graph_filenames_after[pos] =
+                       xstrdup(ctx->commit_graph_filenames_before[pos]);
+       }
+
+       /* Walk tip-to-base, filling only hash slots that exist afterwards. */
+       pos = ctx->num_commit_graphs_before - 1;
+       for (layer = tip; layer; layer = layer->base_graph, pos--) {
+               if (pos < ctx->num_commit_graphs_after)
+                       ctx->commit_graph_hash_after[pos] =
+                               xstrdup(oid_to_hex(&layer->oid));
+       }
+}
+
+/*
+ * Append the commits recorded in layer `g` to ctx->commits, skipping any
+ * commit whose lookup in the repository fails.
+ */
+static void merge_commit_graph(struct write_commit_graph_context *ctx,
+                              struct commit_graph *g)
+{
+       const uint32_t base = g->num_commits_in_base;
+       uint32_t lex;
+
+       ALLOC_GROW(ctx->commits.list, ctx->commits.nr + g->num_commits, ctx->commits.alloc);
+
+       for (lex = 0; lex < g->num_commits; lex++) {
+               struct object_id oid;
+               struct commit *c;
+
+               display_progress(ctx->progress, lex + 1);
+
+               load_oid_from_graph(g, lex + base, &oid);
+
+               /* commits that can no longer be looked up are dropped */
+               c = lookup_commit_reference_gently(ctx->r, &oid, 1);
+               if (!c)
+                       continue;
+
+               ctx->commits.list[ctx->commits.nr++] = c;
+       }
+}
+
+/* QSORT comparator: orders two `struct commit *` entries by object id. */
+static int commit_compare(const void *_a, const void *_b)
+{
+       const struct commit *lhs = *(const struct commit **)_a;
+       const struct commit *rhs = *(const struct commit **)_b;
+
+       return oidcmp(&lhs->object.oid, &rhs->object.oid);
+}
+
+/*
+ * Sort ctx->commits by object id, die on any duplicate id, and accumulate
+ * into ctx->num_extra_edges the number of parents beyond the first two of
+ * each commit.
+ */
+static void sort_and_scan_merged_commits(struct write_commit_graph_context *ctx)
+{
+       uint32_t i;
+
+       if (ctx->report_progress)
+               ctx->progress = start_delayed_progress(
+                                       _("Scanning merged commits"),
+                                       ctx->commits.nr);
+
+       QSORT(ctx->commits.list, ctx->commits.nr, commit_compare);
+
+       ctx->num_extra_edges = 0;
+       for (i = 0; i < ctx->commits.nr; i++) {
+               struct commit_list *parent;
+               uint32_t num_parents = 0;
+
+               /*
+                * Report a 1-based count so the counter can reach the total
+                * passed to start_delayed_progress() above.
+                */
+               display_progress(ctx->progress, i + 1);
+
+               /* after sorting, equal ids can only be adjacent */
+               if (i && oideq(&ctx->commits.list[i - 1]->object.oid,
+                              &ctx->commits.list[i]->object.oid))
+                       die(_("unexpected duplicate commit id %s"),
+                           oid_to_hex(&ctx->commits.list[i]->object.oid));
+
+               for (parent = ctx->commits.list[i]->parents; parent; parent = parent->next)
+                       num_parents++;
+
+               if (num_parents > 2)
+                       ctx->num_extra_edges += num_parents - 2;
+       }
+
+       stop_progress(&ctx->progress);
+}
+
+/*
+ * Starting at the tip layer, merge layers into ctx->commits while the
+ * current level is still >= ctx->num_commit_graphs_after. Record the first
+ * layer left unmerged (if any) as the new base, then sort and scan the
+ * combined commit list.
+ */
+static void merge_commit_graphs(struct write_commit_graph_context *ctx)
+{
+       struct commit_graph *layer = ctx->r->objects->commit_graph;
+       uint32_t level = ctx->num_commit_graphs_before;
+       struct strbuf title = STRBUF_INIT;
+
+       for (; layer && level >= ctx->num_commit_graphs_after;
+            layer = layer->base_graph) {
+               level--;
+
+               if (ctx->report_progress) {
+                       strbuf_addstr(&title, _("Merging commit-graph"));
+                       ctx->progress = start_delayed_progress(title.buf, 0);
+               }
+
+               merge_commit_graph(ctx, layer);
+               stop_progress(&ctx->progress);
+               strbuf_release(&title);
+       }
+
+       if (layer) {
+               ctx->new_base_graph = layer;
+               ctx->new_num_commits_in_base =
+                       layer->num_commits + layer->num_commits_in_base;
+       }
+
+       if (ctx->new_base_graph)
+               ctx->base_graph_name = xstrdup(ctx->new_base_graph->filename);
+
+       sort_and_scan_merged_commits(ctx);
+}
+
  int write_commit_graph(const char *obj_dir,
                        struct string_list *pack_indexes,
                        struct string_list *commit_hex,
@@ -1554,6 +1663,9 @@ int write_commit_graph(const char *obj_dir,
         ctx->approx_nr_objects = approximate_object_count();
         ctx->oids.alloc = ctx->approx_nr_objects / 32;
  
+       if (ctx->split && ctx->oids.alloc > split_strategy_max_commits)
+               ctx->oids.alloc = split_strategy_max_commits;
+
         if (ctx->append) {
                 prepare_commit_graph_one(ctx->r, ctx->obj_dir);
                 if (ctx->r->objects->commit_graph)
@@ -1607,9 +1719,11 @@ int write_commit_graph(const char *obj_dir,
         if (!ctx->commits.nr)
                 goto cleanup;
  
-       if (ctx->split)
-               init_commit_graph_chain(ctx);
-       else
+       if (ctx->split) {
+               split_graph_merge_strategy(ctx);
+
+               merge_commit_graphs(ctx);
+       } else
                 ctx->num_commit_graphs_after = 1;
  
         compute_generation_numbers(ctx);
diff --git a/t/t5324-split-commit-graph.sh b/t/t5324-split-commit-graph.sh

index ccd24bd22bf665b9c13e5872d2512ef52bb3bcf9..5cb5663a30967d2ae2d003b3d30fa8a6c27188c4 100755 (executable)
--- a/t/t5324-split-commit-graph.sh
+++ b/t/t5324-split-commit-graph.sh
@@ -119,4 +119,17 @@ test_expect_success 'add one commit, write a tip graph' '
  
  graph_git_behavior 'three-layer commit-graph: commit 11 vs 6' commits/11 commits/6
  
+test_expect_success 'add one commit, write a merged graph' '
+       test_commit 12 &&
+       git branch commits/12 &&
+       git commit-graph write --reachable --split &&
+       test_path_is_file $graphdir/commit-graph-chain &&
+       test_line_count = 2 $graphdir/commit-graph-chain &&
+       ls $graphdir/graph-*.graph >graph-files &&
+       test_line_count = 4 graph-files &&
+       verify_chain_files_exist $graphdir
+'
+
+graph_git_behavior 'merged commit-graph: commit 12 vs 6' commits/12 commits/6
+
  test_done
author	Derrick Stolee <dstolee@microsoft.com>
author	Tue, 18 Jun 2019 18:14:29 +0000 (11:14 -0700)
committer	Junio C Hamano <gitster@pobox.com>
committer	Thu, 20 Jun 2019 03:46:26 +0000 (20:46 -0700)
Documentation/technical/commit-graph.txt		patch \| blob \| history
commit-graph.c		patch \| blob \| history
t/t5324-split-commit-graph.sh		patch \| blob \| history