multi-pack-index: prepare 'repack' subcommand
author Derrick Stolee <dstolee@microsoft.com>
Mon, 10 Jun 2019 23:35:26 +0000 (16:35 -0700)
committer Junio C Hamano <gitster@pobox.com>
Tue, 11 Jun 2019 17:34:40 +0000 (10:34 -0700)
The multi-pack-index is useful in environments that have many
pack-files and cannot repack the object store into a single
pack-file. However, it is likely that many of these
pack-files are rather small, and could be repacked into a slightly
larger pack-file without too much effort. It may also be important
to ensure the object store is highly available and the repack
operation does not interrupt concurrent git commands.

Introduce a 'repack' subcommand to 'git multi-pack-index' that
takes a '--batch-size' option. The subcommand will inspect the
multi-pack-index for referenced pack-files whose size is smaller
than the batch size, until collecting a list of pack-files whose
sizes sum to larger than the batch size. Then, a new pack-file
will be created containing the objects from those pack-files that
are referenced by the multi-pack-index. The resulting pack is
likely to actually be smaller than the batch size due to
compression and the fact that there may be objects in the pack-
files that have duplicate copies in other pack-files.

The current change introduces the command-line arguments, and we
add a test that ensures we parse these options properly. Since
the test specifies a small batch size, it guarantees that future
implementations do not change the list of pack-files.

In addition, we hard-code the modified times of the packs in
the pack directory to ensure the list of packs sorted by modified
time matches the order if sorted by size (ascending). This will
be important in a future test.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Documentation/git-multi-pack-index.txt
builtin/multi-pack-index.c
midx.c
midx.h
t/t5319-multi-pack-index.sh
index 6186c4c9369a9c5c57096f19fa9e428d662d5a5e..233b2b786271cc695268d2f7c0139d02228bd3c2 100644 (file)
@@ -36,6 +36,23 @@ expire::
        have no objects referenced by the MIDX. Rewrite the MIDX file
        afterward to remove all references to these pack-files.
 
+repack::
+       Create a new pack-file containing objects in small pack-files
+       referenced by the multi-pack-index. If the size given by the
+       `--batch-size=<size>` argument is zero, then create a pack
+       containing all objects referenced by the multi-pack-index. For
+       a non-zero batch size, select the pack-files by examining packs
+       from oldest-to-newest, computing the "expected size" by counting
+       the number of objects in the pack referenced by the
+       multi-pack-index, then dividing by the total number of objects in
+       the pack and multiplying by the pack size. We select packs with
+       expected size below the batch size until the set of packs has
+       total expected size at least the batch size. If the total size
+       does not reach the batch size, then do nothing. If a new pack-
+       file is created, rewrite the multi-pack-index to reference the
+       new pack-file. A later run of 'git multi-pack-index expire' will
+       delete the pack-files that were part of this batch.
+
 
 EXAMPLES
 --------
index ad10d4051214a78a086faaf6310b765f8f371f0c..b1ea1a6aa17724915a529a882831640aec7ea7b8 100644 (file)
@@ -6,12 +6,13 @@
 #include "trace2.h"
 
 static char const * const builtin_multi_pack_index_usage[] = {
-       N_("git multi-pack-index [--object-dir=<dir>] (write|verify|expire)"),
+       N_("git multi-pack-index [--object-dir=<dir>] (write|verify|expire|repack --batch-size=<size>)"),
        NULL
 };
 
 static struct opts_multi_pack_index {
        const char *object_dir;
+       unsigned long batch_size;
 } opts;
 
 int cmd_multi_pack_index(int argc, const char **argv,
@@ -20,6 +21,8 @@ int cmd_multi_pack_index(int argc, const char **argv,
        static struct option builtin_multi_pack_index_options[] = {
                OPT_FILENAME(0, "object-dir", &opts.object_dir,
                  N_("object directory containing set of packfile and pack-index pairs")),
+               OPT_MAGNITUDE(0, "batch-size", &opts.batch_size,
+                 N_("during repack, collect pack-files of smaller size into a batch that is larger than this size")),
                OPT_END(),
        };
 
@@ -43,6 +46,11 @@ int cmd_multi_pack_index(int argc, const char **argv,
 
        trace2_cmd_mode(argv[0]);
 
+       if (!strcmp(argv[0], "repack"))
+               return midx_repack(the_repository, opts.object_dir, (size_t)opts.batch_size);
+       if (opts.batch_size)
+               die(_("--batch-size option is only for 'repack' subcommand"));
+
        if (!strcmp(argv[0], "write"))
                return write_midx_file(opts.object_dir);
        if (!strcmp(argv[0], "verify"))
@@ -50,5 +58,5 @@ int cmd_multi_pack_index(int argc, const char **argv,
        if (!strcmp(argv[0], "expire"))
                return expire_midx_packs(the_repository, opts.object_dir);
 
-       die(_("unrecognized verb: %s"), argv[0]);
+       die(_("unrecognized subcommand: %s"), argv[0]);
 }
diff --git a/midx.c b/midx.c
index 9b0b4c152031007059092a2a6e23bf23cb09127a..fbed8a8adb38f63252bbb03c9b2611c4a87214e5 100644 (file)
--- a/midx.c
+++ b/midx.c
@@ -1226,3 +1226,8 @@ int expire_midx_packs(struct repository *r, const char *object_dir)
        string_list_clear(&packs_to_drop, 0);
        return result;
 }
+
+int midx_repack(struct repository *r, const char *object_dir, size_t batch_size)
+{
+       return 0;
+}
diff --git a/midx.h b/midx.h
index 505f1431b7e309371aec7cdd7f1d6b56013b33f7..f0ae656b5d767644d60ef7b101350ca29cde7585 100644 (file)
--- a/midx.h
+++ b/midx.h
@@ -51,6 +51,7 @@ int write_midx_file(const char *object_dir);
 void clear_midx_file(struct repository *r);
 int verify_midx_file(struct repository *r, const char *object_dir);
 int expire_midx_packs(struct repository *r, const char *object_dir);
+int midx_repack(struct repository *r, const char *object_dir, size_t batch_size);
 
 void close_midx(struct multi_pack_index *m);
 
index 12570fe7ace9eebfb30ec43ed66d35c7ce0904d7..133d5b7068e8e8af79b2b6c4d3c50b98fcf426d9 100755 (executable)
@@ -398,7 +398,8 @@ test_expect_success 'setup expire tests' '
                git pack-objects --revs .git/objects/pack/pack-E <<-EOF &&
                refs/heads/E
                EOF
-               git multi-pack-index write
+               git multi-pack-index write &&
+               cp -r .git/objects/pack .git/objects/pack-backup
        )
 '
 
@@ -432,4 +433,21 @@ test_expect_success 'expire removes unreferenced packs' '
        )
 '
 
+test_expect_success 'repack with minimum size does not alter existing packs' '
+       (
+               cd dup &&
+               rm -rf .git/objects/pack &&
+               mv .git/objects/pack-backup .git/objects/pack &&
+               touch -m -t 201901010000 .git/objects/pack/pack-D* &&
+               touch -m -t 201901010001 .git/objects/pack/pack-C* &&
+               touch -m -t 201901010002 .git/objects/pack/pack-B* &&
+               touch -m -t 201901010003 .git/objects/pack/pack-A* &&
+               ls .git/objects/pack >expect &&
+               MINSIZE=$(ls -l .git/objects/pack/*pack | awk "{print \$5;}" | sort -n | head -n 1) &&
+               git multi-pack-index repack --batch-size=$MINSIZE &&
+               ls .git/objects/pack >actual &&
+               test_cmp expect actual
+       )
+'
+
 test_done