Merge branch 'jk/pack-delta-reuse-with-bitmap'
authorJunio C Hamano <gitster@pobox.com>
Mon, 17 Sep 2018 20:53:53 +0000 (13:53 -0700)
committerJunio C Hamano <gitster@pobox.com>
Mon, 17 Sep 2018 20:53:53 +0000 (13:53 -0700)
When creating a thin pack, which allows objects to be made into a
delta against another object that is not in the resulting pack but
is known to be present on the receiving end, the code learned to
take advantage of the reachability bitmap; this allows the server
to send a delta against a base beyond the "boundary" commit.

* jk/pack-delta-reuse-with-bitmap:
pack-objects: reuse on-disk deltas for thin "have" objects
pack-bitmap: save "have" bitmap from walk
t/perf: add perf tests for fetches from a bitmapped server
t/perf: add infrastructure for measuring sizes
t/perf: factor out percent calculations
t/perf: factor boilerplate out of test_perf

builtin/pack-objects.c
pack-bitmap.c
pack-bitmap.h
pack-objects.c
pack-objects.h
t/perf/README
t/perf/aggregate.perl
t/perf/p5311-pack-bitmaps-fetch.sh [new file with mode: 0755]
t/perf/perf-lib.sh
index caa4cd0211c40e4ac0acb4178a4df260db651b88..f434e1fb0c1ff29e801a2a3b3fc2e67d0f86d217 100644 (file)
@@ -41,6 +41,7 @@
 #define DELTA_CHILD(obj) oe_delta_child(&to_pack, obj)
 #define DELTA_SIBLING(obj) oe_delta_sibling(&to_pack, obj)
 #define SET_DELTA(obj, val) oe_set_delta(&to_pack, obj, val)
+#define SET_DELTA_EXT(obj, oid) oe_set_delta_ext(&to_pack, obj, oid)
 #define SET_DELTA_SIZE(obj, val) oe_set_delta_size(&to_pack, obj, val)
 #define SET_DELTA_CHILD(obj, val) oe_set_delta_child(&to_pack, obj, val)
 #define SET_DELTA_SIBLING(obj, val) oe_set_delta_sibling(&to_pack, obj, val)
@@ -60,6 +61,7 @@ static struct packing_data to_pack;
 
 static struct pack_idx_entry **written_list;
 static uint32_t nr_result, nr_written, nr_seen;
+static struct bitmap_index *bitmap_git;
 
 static int non_empty;
 static int reuse_delta = 1, reuse_object = 1;
@@ -80,6 +82,7 @@ static unsigned long pack_size_limit;
 static int depth = 50;
 static int delta_search_threads;
 static int pack_to_stdout;
+static int thin;
 static int num_preferred_base;
 static struct progress *progress_state;
 
@@ -1538,11 +1541,15 @@ static void check_object(struct object_entry *entry)
                        break;
                }
 
-               if (base_ref && (base_entry = packlist_find(&to_pack, base_ref, NULL))) {
+               if (base_ref && (
+                   (base_entry = packlist_find(&to_pack, base_ref, NULL)) ||
+                   (thin &&
+                    bitmap_has_sha1_in_uninteresting(bitmap_git, base_ref)))) {
                        /*
                         * If base_ref was set above that means we wish to
-                        * reuse delta data, and we even found that base
-                        * in the list of objects we want to pack. Goodie!
+                        * reuse delta data, and either we found that object in
+                        * the list of objects we want to pack, or it's one we
+                        * know the receiver has.
                         *
                         * Depth value does not matter - find_deltas() will
                         * never consider reused delta as the base object to
@@ -1551,10 +1558,16 @@ static void check_object(struct object_entry *entry)
                         */
                        oe_set_type(entry, entry->in_pack_type);
                        SET_SIZE(entry, in_pack_size); /* delta size */
-                       SET_DELTA(entry, base_entry);
                        SET_DELTA_SIZE(entry, in_pack_size);
-                       entry->delta_sibling_idx = base_entry->delta_child_idx;
-                       SET_DELTA_CHILD(base_entry, entry);
+
+                       if (base_entry) {
+                               SET_DELTA(entry, base_entry);
+                               entry->delta_sibling_idx = base_entry->delta_child_idx;
+                               SET_DELTA_CHILD(base_entry, entry);
+                       } else {
+                               SET_DELTA_EXT(entry, base_ref);
+                       }
+
                        unuse_pack(&w_curs);
                        return;
                }
@@ -2979,7 +2992,6 @@ static int pack_options_allow_reuse(void)
 
 static int get_object_list_from_bitmap(struct rev_info *revs)
 {
-       struct bitmap_index *bitmap_git;
        if (!(bitmap_git = prepare_bitmap_walk(revs)))
                return -1;
 
@@ -2995,7 +3007,6 @@ static int get_object_list_from_bitmap(struct rev_info *revs)
        }
 
        traverse_bitmap_commit_list(bitmap_git, &add_object_entry_from_bitmap);
-       free_bitmap_index(bitmap_git);
        return 0;
 }
 
@@ -3143,7 +3154,6 @@ static int option_parse_unpack_unreachable(const struct option *opt,
 int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 {
        int use_internal_rev_list = 0;
-       int thin = 0;
        int shallow = 0;
        int all_progress_implied = 0;
        struct argv_array rp = ARGV_ARRAY_INIT;
index 4e50ab391fa0df9fc2d28a5bfdc31996635699b6..c7d593c91c7a76497407a703d9f0c50925e2475a 100644 (file)
@@ -86,6 +86,9 @@ struct bitmap_index {
        /* Bitmap result of the last performed walk */
        struct bitmap *result;
 
+       /* "have" bitmap from the last performed walk */
+       struct bitmap *haves;
+
        /* Version of the bitmap index */
        unsigned int version;
 
@@ -759,8 +762,8 @@ struct bitmap_index *prepare_bitmap_walk(struct rev_info *revs)
                bitmap_and_not(wants_bitmap, haves_bitmap);
 
        bitmap_git->result = wants_bitmap;
+       bitmap_git->haves = haves_bitmap;
 
-       bitmap_free(haves_bitmap);
        return bitmap_git;
 
 cleanup:
@@ -1114,5 +1117,25 @@ void free_bitmap_index(struct bitmap_index *b)
        free(b->ext_index.objects);
        free(b->ext_index.hashes);
        bitmap_free(b->result);
+       bitmap_free(b->haves);
        free(b);
 }
+
+int bitmap_has_sha1_in_uninteresting(struct bitmap_index *bitmap_git,
+                                    const unsigned char *sha1)
+{
+       int pos;
+
+       if (!bitmap_git)
+               return 0; /* no bitmap loaded */
+       if (!bitmap_git->result)
+               BUG("failed to perform bitmap walk before querying");
+       if (!bitmap_git->haves)
+               return 0; /* walk had no "haves" */
+
+       pos = bitmap_position_packfile(bitmap_git, sha1);
+       if (pos < 0)
+               return 0;
+
+       return bitmap_get(bitmap_git->haves, pos);
+}
index 8a04741e1253b0ba801445a76ceb8e4937121f73..c633bf523898db8d261f5c3f8098c3e843ca9c21 100644 (file)
@@ -53,6 +53,13 @@ int rebuild_existing_bitmaps(struct bitmap_index *, struct packing_data *mapping
                             khash_sha1 *reused_bitmaps, int show_progress);
 void free_bitmap_index(struct bitmap_index *);
 
+/*
+ * After a traversal has been performed on the bitmap_index, this can be
+ * queried to see if a particular object was reachable from any of the
+ * objects flagged as UNINTERESTING.
+ */
+int bitmap_has_sha1_in_uninteresting(struct bitmap_index *, const unsigned char *sha1);
+
 void bitmap_writer_show_progress(int show);
 void bitmap_writer_set_checksum(unsigned char *sha1);
 void bitmap_writer_build_type_index(struct packing_data *to_pack,
index d04cfa8e9f173b3e3b31e7b634c13adf735cb479..34628587725ae879729c09d1847bad2269b7b97f 100644 (file)
@@ -181,3 +181,22 @@ struct object_entry *packlist_alloc(struct packing_data *pdata,
 
        return new_entry;
 }
+
+void oe_set_delta_ext(struct packing_data *pdata,
+                     struct object_entry *delta,
+                     const unsigned char *sha1)
+{
+       struct object_entry *base;
+
+       ALLOC_GROW(pdata->ext_bases, pdata->nr_ext + 1, pdata->alloc_ext);
+       base = &pdata->ext_bases[pdata->nr_ext++];
+       memset(base, 0, sizeof(*base));
+       hashcpy(base->idx.oid.hash, sha1);
+
+       /* These flags mark that we are not part of the actual pack output. */
+       base->preferred_base = 1;
+       base->filled = 1;
+
+       delta->ext_base = 1;
+       delta->delta_idx = base - pdata->ext_bases + 1;
+}
index 62806ccc39ea31b425089f4f38121d81a02fe5dd..b0c1f137c675519cd527be2bc4a058e71024562c 100644 (file)
@@ -112,6 +112,7 @@ struct object_entry {
        unsigned filled:1; /* assigned write-order */
        unsigned dfs_state:OE_DFS_STATE_BITS;
        unsigned depth:OE_DEPTH_BITS;
+       unsigned ext_base:1; /* delta_idx points outside packlist */
 
        /*
         * pahole results on 64-bit linux (gcc and clang)
@@ -147,6 +148,14 @@ struct packing_data {
        pthread_mutex_t lock;
 #endif
 
+       /*
+        * This list contains entries for bases which we know the other side
+        * has (e.g., via reachability bitmaps), but which aren't in our
+        * "objects" list.
+        */
+       struct object_entry *ext_bases;
+       uint32_t nr_ext, alloc_ext;
+
        uintmax_t oe_size_limit;
        uintmax_t oe_delta_size_limit;
 };
@@ -249,9 +258,12 @@ static inline struct object_entry *oe_delta(
                const struct packing_data *pack,
                const struct object_entry *e)
 {
-       if (e->delta_idx)
+       if (!e->delta_idx)
+               return NULL;
+       if (e->ext_base)
+               return &pack->ext_bases[e->delta_idx - 1];
+       else
                return &pack->objects[e->delta_idx - 1];
-       return NULL;
 }
 
 static inline void oe_set_delta(struct packing_data *pack,
@@ -264,6 +276,10 @@ static inline void oe_set_delta(struct packing_data *pack,
                e->delta_idx = 0;
 }
 
+void oe_set_delta_ext(struct packing_data *pack,
+                     struct object_entry *e,
+                     const unsigned char *sha1);
+
 static inline struct object_entry *oe_delta_child(
                const struct packing_data *pack,
                const struct object_entry *e)
index 21321a0f361203aacc78516ae25f21f35107da02..be12090c3853b8d64bb903aa92b74001d0fdd19f 100644 (file)
@@ -168,3 +168,28 @@ that
   While we have tried to make sure that it can cope with embedded
   whitespace and other special characters, it will not work with
   multi-line data.
+
+Rather than tracking the performance by run-time as `test_perf` does, you
+may also track output size by using `test_size`. The stdout of the
+function should be a single numeric value, which will be captured and
+shown in the aggregated output. For example:
+
+       test_perf 'time foo' '
+               ./foo >foo.out
+       '
+
+       test_size 'output size'
+               wc -c <foo.out
+       '
+
+might produce output like:
+
+       Test                origin           HEAD
+       -------------------------------------------------------------
+       1234.1 time foo     0.37(0.79+0.02)  0.26(0.51+0.02) -29.7%
+       1234.2 output size             4.3M             3.6M -14.7%
+
+The item being measured (and its units) is up to the test; the context
+and the test title should make it clear to the user whether bigger or
+smaller numbers are better. Unlike test_perf, the test code will only be
+run once, since output sizes tend to be more deterministic than timings.
index bc865160e7e3370f9462beda9d8b3866e3c2111b..494907a892bba90b677448a152fd55c010b2ebda 100755 (executable)
@@ -13,27 +13,42 @@ sub get_times {
        my $line = <$fh>;
        return undef if not defined $line;
        close $fh or die "cannot close $name: $!";
-       $line =~ /^(?:(\d+):)?(\d+):(\d+(?:\.\d+)?) (\d+(?:\.\d+)?) (\d+(?:\.\d+)?)$/
-               or die "bad input line: $line";
-       my $rt = ((defined $1 ? $1 : 0.0)*60+$2)*60+$3;
-       return ($rt, $4, $5);
+       # times
+       if ($line =~ /^(?:(\d+):)?(\d+):(\d+(?:\.\d+)?) (\d+(?:\.\d+)?) (\d+(?:\.\d+)?)$/) {
+               my $rt = ((defined $1 ? $1 : 0.0)*60+$2)*60+$3;
+               return ($rt, $4, $5);
+       # size
+       } elsif ($line =~ /^\d+$/) {
+               return $&;
+       } else {
+               die "bad input line: $line";
+       }
+}
+
+sub relative_change {
+       my ($r, $firstr) = @_;
+       if ($firstr > 0) {
+               return sprintf "%+.1f%%", 100.0*($r-$firstr)/$firstr;
+       } elsif ($r == 0) {
+               return "=";
+       } else {
+               return "+inf";
+       }
 }
 
 sub format_times {
        my ($r, $u, $s, $firstr) = @_;
+       # no value means we did not finish the test
        if (!defined $r) {
                return "<missing>";
        }
-       my $out = sprintf "%.2f(%.2f+%.2f)", $r, $u, $s;
-       if (defined $firstr) {
-               if ($firstr > 0) {
-                       $out .= sprintf " %+.1f%%", 100.0*($r-$firstr)/$firstr;
-               } elsif ($r == 0) {
-                       $out .= " =";
-               } else {
-                       $out .= " +inf";
-               }
+       # a single value means we have a size, not times
+       if (!defined $u) {
+               return format_size($r, $firstr);
        }
+       # otherwise, we have real/user/system times
+       my $out = sprintf "%.2f(%.2f+%.2f)", $r, $u, $s;
+       $out .= ' ' . relative_change($r, $firstr) if defined $firstr;
        return $out;
 }
 
@@ -51,6 +66,25 @@ sub usage {
        exit(1);
 }
 
+sub human_size {
+       my $n = shift;
+       my @units = ('', qw(K M G));
+       while ($n > 900 && @units > 1) {
+               $n /= 1000;
+               shift @units;
+       }
+       return $n unless length $units[0];
+       return sprintf '%.1f%s', $n, $units[0];
+}
+
+sub format_size {
+       my ($size, $first) = @_;
+       # match the width of a time: 0.00(0.00+0.00)
+       my $out = sprintf '%15s', human_size($size);
+       $out .= ' ' . relative_change($size, $first) if defined $first;
+       return $out;
+}
+
 my (@dirs, %dirnames, %dirabbrevs, %prefixes, @tests,
     $codespeed, $sortby, $subsection, $reponame);
 
@@ -181,7 +215,14 @@ sub print_default_results {
                my $firstr;
                for my $i (0..$#dirs) {
                        my $d = $dirs[$i];
-                       $times{$prefixes{$d}.$t} = [get_times("$resultsdir/$prefixes{$d}$t.times")];
+                       my $base = "$resultsdir/$prefixes{$d}$t";
+                       $times{$prefixes{$d}.$t} = [];
+                       foreach my $type (qw(times size)) {
+                               if (-e "$base.$type") {
+                                       $times{$prefixes{$d}.$t} = [get_times("$base.$type")];
+                                       last;
+                               }
+                       }
                        my ($r,$u,$s) = @{$times{$prefixes{$d}.$t}};
                        my $w = length format_times($r,$u,$s,$firstr);
                        $colwidth[$i] = $w if $w > $colwidth[$i];
diff --git a/t/perf/p5311-pack-bitmaps-fetch.sh b/t/perf/p5311-pack-bitmaps-fetch.sh
new file mode 100755 (executable)
index 0000000..b045759
--- /dev/null
@@ -0,0 +1,45 @@
+#!/bin/sh
+
+test_description='performance of fetches from bitmapped packs'
+. ./perf-lib.sh
+
+test_perf_default_repo
+
+test_expect_success 'create bitmapped server repo' '
+       git config pack.writebitmaps true &&
+       git config pack.writebitmaphashcache true &&
+       git repack -ad
+'
+
+# simulate a fetch from a repository that last fetched N days ago, for
+# various values of N. We do so by following the first-parent chain,
+# and assume the first entry in the chain that is N days older than the current
+# HEAD is where the HEAD would have been then.
+for days in 1 2 4 8 16 32 64 128; do
+       title=$(printf '%10s' "($days days)")
+       test_expect_success "setup revs from $days days ago" '
+               now=$(git log -1 --format=%ct HEAD) &&
+               then=$(($now - ($days * 86400))) &&
+               tip=$(git rev-list -1 --first-parent --until=$then HEAD) &&
+               {
+                       echo HEAD &&
+                       echo ^$tip
+               } >revs
+       '
+
+       test_perf "server $title" '
+               git pack-objects --stdout --revs \
+                                --thin --delta-base-offset \
+                                <revs >tmp.pack
+       '
+
+       test_size "size   $title" '
+               wc -c <tmp.pack
+       '
+
+       test_perf "client $title" '
+               git index-pack --stdin --fix-thin <tmp.pack
+       '
+done
+
+test_done
index e4c343a6b795b6f990fa278788ad65a0a5cdc4e6..11d1922cf58edb78b8e311e3fa041ae54b625e74 100644 (file)
@@ -179,8 +179,8 @@ exit $ret' >&3 2>&4
        return "$eval_ret"
 }
 
-
-test_perf () {
+test_wrapper_ () {
+       test_wrapper_func_=$1; shift
        test_start_
        test "$#" = 3 && { test_prereq=$1; shift; } || test_prereq=
        test "$#" = 2 ||
@@ -191,35 +191,57 @@ test_perf () {
                base=$(basename "$0" .sh)
                echo "$test_count" >>"$perf_results_dir"/$base.subtests
                echo "$1" >"$perf_results_dir"/$base.$test_count.descr
-               if test -z "$verbose"; then
-                       printf "%s" "perf $test_count - $1:"
-               else
-                       echo "perf $test_count - $1:"
-               fi
-               for i in $(test_seq 1 $GIT_PERF_REPEAT_COUNT); do
-                       say >&3 "running: $2"
-                       if test_run_perf_ "$2"
-                       then
-                               if test -z "$verbose"; then
-                                       printf " %s" "$i"
-                               else
-                                       echo "* timing run $i/$GIT_PERF_REPEAT_COUNT:"
-                               fi
+               base="$perf_results_dir"/"$perf_results_prefix$(basename "$0" .sh)"."$test_count"
+               "$test_wrapper_func_" "$@"
+       fi
+
+       test_finish_
+}
+
+test_perf_ () {
+       if test -z "$verbose"; then
+               printf "%s" "perf $test_count - $1:"
+       else
+               echo "perf $test_count - $1:"
+       fi
+       for i in $(test_seq 1 $GIT_PERF_REPEAT_COUNT); do
+               say >&3 "running: $2"
+               if test_run_perf_ "$2"
+               then
+                       if test -z "$verbose"; then
+                               printf " %s" "$i"
                        else
-                               test -z "$verbose" && echo
-                               test_failure_ "$@"
-                               break
+                               echo "* timing run $i/$GIT_PERF_REPEAT_COUNT:"
                        fi
-               done
-               if test -z "$verbose"; then
-                       echo " ok"
                else
-                       test_ok_ "$1"
+                       test -z "$verbose" && echo
+                       test_failure_ "$@"
+                       break
                fi
-               base="$perf_results_dir"/"$perf_results_prefix$(basename "$0" .sh)"."$test_count"
-               "$TEST_DIRECTORY"/perf/min_time.perl test_time.* >"$base".times
+       done
+       if test -z "$verbose"; then
+               echo " ok"
+       else
+               test_ok_ "$1"
        fi
-       test_finish_
+       "$TEST_DIRECTORY"/perf/min_time.perl test_time.* >"$base".times
+}
+
+test_perf () {
+       test_wrapper_ test_perf_ "$@"
+}
+
+test_size_ () {
+       say >&3 "running: $2"
+       if test_eval_ "$2" 3>"$base".size; then
+               test_ok_ "$1"
+       else
+               test_failure_ "$@"
+       fi
+}
+
+test_size () {
+       test_wrapper_ test_size_ "$@"
 }
 
 # We extend test_done to print timings at the end (./run disables this