Merge branch 'jt/batch-fetch-blobs-in-diff'
authorJunio C Hamano <gitster@pobox.com>
Thu, 25 Apr 2019 07:41:19 +0000 (16:41 +0900)
committerJunio C Hamano <gitster@pobox.com>
Thu, 25 Apr 2019 07:41:19 +0000 (16:41 +0900)
While running "git diff" in a lazy clone, we can upfront know which
missing blobs we will need, instead of waiting for the on-demand
machinery to discover them one by one. Aim to achieve better
performance by batching the request for these promised blobs.

* jt/batch-fetch-blobs-in-diff:
diff: batch fetching of missing blobs
sha1-file: support OBJECT_INFO_FOR_PREFETCH

diff.c
object-store.h
sha1-file.c
t/t4067-diff-partial-clone.sh [new file with mode: 0755]
unpack-trees.c
diff --git a/diff.c b/diff.c
index e30a645d9e01ecab1c12d8d14def7de5f6b8d2ed..4d3cf83a27e5785f5fd50dd2ce6155c94eb50840 100644 (file)
--- a/diff.c
+++ b/diff.c
@@ -25,6 +25,7 @@
 #include "packfile.h"
 #include "parse-options.h"
 #include "help.h"
+#include "fetch-object.h"
 
 #ifdef NO_FAST_WORKING_DIRECTORY
 #define FAST_WORKING_DIRECTORY 0
@@ -6477,8 +6478,41 @@ void diffcore_fix_diff_index(void)
        QSORT(q->queue, q->nr, diffnamecmp);
 }
 
+static void add_if_missing(struct repository *r,
+                          struct oid_array *to_fetch,
+                          const struct diff_filespec *filespec)
+{
+       if (filespec && filespec->oid_valid &&
+           oid_object_info_extended(r, &filespec->oid, NULL,
+                                    OBJECT_INFO_FOR_PREFETCH))
+               oid_array_append(to_fetch, &filespec->oid);
+}
+
 void diffcore_std(struct diff_options *options)
 {
+       if (options->repo == the_repository &&
+           repository_format_partial_clone) {
+               /*
+                * Prefetch the diff pairs that are about to be flushed.
+                */
+               int i;
+               struct diff_queue_struct *q = &diff_queued_diff;
+               struct oid_array to_fetch = OID_ARRAY_INIT;
+
+               for (i = 0; i < q->nr; i++) {
+                       struct diff_filepair *p = q->queue[i];
+                       add_if_missing(options->repo, &to_fetch, p->one);
+                       add_if_missing(options->repo, &to_fetch, p->two);
+               }
+               if (to_fetch.nr)
+                       /*
+                        * NEEDSWORK: Consider deduplicating the OIDs sent.
+                        */
+                       fetch_objects(repository_format_partial_clone,
+                                     to_fetch.oid, to_fetch.nr);
+               oid_array_clear(&to_fetch);
+       }
+
        /* NOTE please keep the following in sync with diff_tree_combined() */
        if (options->skip_stat_unmatch)
                diffcore_skip_stat_unmatch(options);
index 56f8aea1cc8df60f3664c50d81a17277a55a7458..b086f5ecdb82a105b9b3381de9893cb72bc5871c 100644 (file)
@@ -280,6 +280,12 @@ struct object_info {
 #define OBJECT_INFO_QUICK 8
 /* Do not check loose object */
 #define OBJECT_INFO_IGNORE_LOOSE 16
+/*
+ * Do not attempt to fetch the object if missing (even if fetch_is_missing is
+ * nonzero). This is meant for bulk prefetching of missing blobs in a partial
+ * clone. Implies OBJECT_INFO_QUICK.
+ */
+#define OBJECT_INFO_FOR_PREFETCH (32 + OBJECT_INFO_QUICK)
 
 int oid_object_info_extended(struct repository *r,
                             const struct object_id *,
index bcd9470bce089012ab942973cd841262d35749eb..ed5c50dac427f2f3aa2b9b7b203bf2355049b163 100644 (file)
@@ -1378,7 +1378,8 @@ int oid_object_info_extended(struct repository *r, const struct object_id *oid,
 
                /* Check if it is a missing object */
                if (fetch_if_missing && repository_format_partial_clone &&
-                   !already_retried && r == the_repository) {
+                   !already_retried && r == the_repository &&
+                   !(flags & OBJECT_INFO_FOR_PREFETCH)) {
                        /*
                         * TODO Investigate having fetch_object() return
                         * TODO error/success and stopping the music here.
diff --git a/t/t4067-diff-partial-clone.sh b/t/t4067-diff-partial-clone.sh
new file mode 100755 (executable)
index 0000000..90c8fb2
--- /dev/null
@@ -0,0 +1,103 @@
+#!/bin/sh
+
+test_description='behavior of diff when reading objects in a partial clone'
+
+. ./test-lib.sh
+
+test_expect_success 'git show batches blobs' '
+       test_when_finished "rm -rf server client trace" &&
+
+       test_create_repo server &&
+       echo a >server/a &&
+       echo b >server/b &&
+       git -C server add a b &&
+       git -C server commit -m x &&
+
+       test_config -C server uploadpack.allowfilter 1 &&
+       test_config -C server uploadpack.allowanysha1inwant 1 &&
+       git clone --bare --filter=blob:limit=0 "file://$(pwd)/server" client &&
+
+       # Ensure that there is exactly 1 negotiation by checking that there is
+       # only 1 "done" line sent. ("done" marks the end of negotiation.)
+       GIT_TRACE_PACKET="$(pwd)/trace" git -C client show HEAD &&
+       grep "git> done" trace >done_lines &&
+       test_line_count = 1 done_lines
+'
+
+test_expect_success 'diff batches blobs' '
+       test_when_finished "rm -rf server client trace" &&
+
+       test_create_repo server &&
+       echo a >server/a &&
+       echo b >server/b &&
+       git -C server add a b &&
+       git -C server commit -m x &&
+       echo c >server/c &&
+       echo d >server/d &&
+       git -C server add c d &&
+       git -C server commit -m x &&
+
+       test_config -C server uploadpack.allowfilter 1 &&
+       test_config -C server uploadpack.allowanysha1inwant 1 &&
+       git clone --bare --filter=blob:limit=0 "file://$(pwd)/server" client &&
+
+       # Ensure that there is exactly 1 negotiation by checking that there is
+       # only 1 "done" line sent. ("done" marks the end of negotiation.)
+       GIT_TRACE_PACKET="$(pwd)/trace" git -C client diff HEAD^ HEAD &&
+       grep "git> done" trace >done_lines &&
+       test_line_count = 1 done_lines
+'
+
+test_expect_success 'diff skips same-OID blobs' '
+       test_when_finished "rm -rf server client trace" &&
+
+       test_create_repo server &&
+       echo a >server/a &&
+       echo b >server/b &&
+       git -C server add a b &&
+       git -C server commit -m x &&
+       echo another-a >server/a &&
+       git -C server add a &&
+       git -C server commit -m x &&
+
+       test_config -C server uploadpack.allowfilter 1 &&
+       test_config -C server uploadpack.allowanysha1inwant 1 &&
+       git clone --bare --filter=blob:limit=0 "file://$(pwd)/server" client &&
+
+       echo a | git hash-object --stdin >hash-old-a &&
+       echo another-a | git hash-object --stdin >hash-new-a &&
+       echo b | git hash-object --stdin >hash-b &&
+
+       # Ensure that only a and another-a are fetched.
+       GIT_TRACE_PACKET="$(pwd)/trace" git -C client diff HEAD^ HEAD &&
+       grep "want $(cat hash-old-a)" trace &&
+       grep "want $(cat hash-new-a)" trace &&
+       ! grep "want $(cat hash-b)" trace
+'
+
+test_expect_success 'diff with rename detection batches blobs' '
+       test_when_finished "rm -rf server client trace" &&
+
+       test_create_repo server &&
+       echo a >server/a &&
+       printf "b\nb\nb\nb\nb\n" >server/b &&
+       git -C server add a b &&
+       git -C server commit -m x &&
+       rm server/b &&
+       printf "b\nb\nb\nb\nbX\n" >server/c &&
+       git -C server add c &&
+       git -C server commit -a -m x &&
+
+       test_config -C server uploadpack.allowfilter 1 &&
+       test_config -C server uploadpack.allowanysha1inwant 1 &&
+       git clone --bare --filter=blob:limit=0 "file://$(pwd)/server" client &&
+
+       # Ensure that there is exactly 1 negotiation by checking that there is
+       # only 1 "done" line sent. ("done" marks the end of negotiation.)
+       GIT_TRACE_PACKET="$(pwd)/trace" git -C client diff -M HEAD^ HEAD >out &&
+       grep "similarity index" out &&
+       grep "git> done" trace >done_lines &&
+       test_line_count = 1 done_lines
+'
+
+test_done
index afa4a5cea815a810334136fecf40c0bb39be716d..50189909b86d6ab48a0aa15893f5b66c2e9ce613 100644 (file)
@@ -406,20 +406,21 @@ static int check_updates(struct unpack_trees_options *o)
                 * below.
                 */
                struct oid_array to_fetch = OID_ARRAY_INIT;
-               int fetch_if_missing_store = fetch_if_missing;
-               fetch_if_missing = 0;
                for (i = 0; i < index->cache_nr; i++) {
                        struct cache_entry *ce = index->cache[i];
-                       if ((ce->ce_flags & CE_UPDATE) &&
-                           !S_ISGITLINK(ce->ce_mode)) {
-                               if (!has_object_file(&ce->oid))
-                                       oid_array_append(&to_fetch, &ce->oid);
-                       }
+
+                       if (!(ce->ce_flags & CE_UPDATE) ||
+                           S_ISGITLINK(ce->ce_mode))
+                               continue;
+                       if (!oid_object_info_extended(the_repository, &ce->oid,
+                                                     NULL,
+                                                     OBJECT_INFO_FOR_PREFETCH))
+                               continue;
+                       oid_array_append(&to_fetch, &ce->oid);
                }
                if (to_fetch.nr)
                        fetch_objects(repository_format_partial_clone,
                                      to_fetch.oid, to_fetch.nr);
-               fetch_if_missing = fetch_if_missing_store;
                oid_array_clear(&to_fetch);
        }
        for (i = 0; i < index->cache_nr; i++) {