midx: write object offsets
authorDerrick Stolee <stolee@gmail.com>
Thu, 12 Jul 2018 19:39:32 +0000 (15:39 -0400)
committerJunio C Hamano <gitster@pobox.com>
Fri, 20 Jul 2018 18:27:28 +0000 (11:27 -0700)
The final pair of chunks for the multi-pack-index file stores the object
offsets. We default to using 32-bit offsets as in the pack-index version
1 format, but if there exists an offset that does not fit in 32 bits, we
use a trick similar to the pack-index version 2 format by storing all
offsets of at least 2^31 in a 64-bit table; we use the 32-bit table to
point into that 64-bit table as necessary.

We only store these 64-bit offsets if necessary, so create a test that
manipulates a version 2 pack-index to fake a large offset. This allows
us to test that the large offset table is created, but the data does not
match the actual packfile offsets. The multi-pack-index offset does match
the (corrupted) pack-index offset, so a future feature will compare these
offsets during a 'verify' step.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Documentation/technical/pack-format.txt
midx.c
midx.h
t/helper/test-read-midx.c
t/t5319-multi-pack-index.sh
index 3215f7bfcdbda934b78255893474884b421a913f..cab5bdd2ff0f887cb991e2dc9ba3cccec34f8a0a 100644 (file)
@@ -311,7 +311,20 @@ CHUNK DATA:
            The OIDs for all objects in the MIDX are stored in lexicographic
            order in this chunk.
 
            The OIDs for all objects in the MIDX are stored in lexicographic
            order in this chunk.
 
-       (This section intentionally left incomplete.)
+       Object Offsets (ID: {'O', 'O', 'F', 'F'})
+           Stores two 4-byte values for every object.
+           1: The pack-int-id for the pack storing this object.
+           2: The offset within the pack.
+               If all offsets are less than 2^32, then the large offset chunk
+               will not exist and offsets are stored as in IDX v1.
+               If there is at least one offset value larger than 2^32-1, then
+               the large offset chunk must exist, and every offset of at least
+               2^31 is stored in it instead. If the large offset chunk exists
+               and the 31st bit is on, then removing that bit reveals the row
+               in the large offsets containing the 8-byte offset of this object.
+
+       [Optional] Object Large Offsets (ID: {'L', 'O', 'F', 'F'})
+           8-byte offsets into large packfiles.
 
 TRAILER:
 
 
 TRAILER:
 
diff --git a/midx.c b/midx.c
index 7a954eb0cd317aef19b154b8648b7f49245ec9a5..e83110ae92b4768802b6b70cb6b220ce91fc74b9 100644 (file)
--- a/midx.c
+++ b/midx.c
 #define MIDX_HASH_LEN 20
 #define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + MIDX_HASH_LEN)
 
 #define MIDX_HASH_LEN 20
 #define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + MIDX_HASH_LEN)
 
-#define MIDX_MAX_CHUNKS 3
+#define MIDX_MAX_CHUNKS 5
 #define MIDX_CHUNK_ALIGNMENT 4
 #define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */
 #define MIDX_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
 #define MIDX_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
 #define MIDX_CHUNK_ALIGNMENT 4
 #define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */
 #define MIDX_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
 #define MIDX_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
+#define MIDX_CHUNKID_OBJECTOFFSETS 0x4f4f4646 /* "OOFF" */
+#define MIDX_CHUNKID_LARGEOFFSETS 0x4c4f4646 /* "LOFF" */
 #define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t))
 #define MIDX_CHUNK_FANOUT_SIZE (sizeof(uint32_t) * 256)
 #define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t))
 #define MIDX_CHUNK_FANOUT_SIZE (sizeof(uint32_t) * 256)
+#define MIDX_CHUNK_OFFSET_WIDTH (2 * sizeof(uint32_t))
+#define MIDX_CHUNK_LARGE_OFFSET_WIDTH (sizeof(uint64_t))
+#define MIDX_LARGE_OFFSET_NEEDED 0x80000000
 
 static char *get_midx_filename(const char *object_dir)
 {
 
 static char *get_midx_filename(const char *object_dir)
 {
@@ -112,6 +117,14 @@ struct multi_pack_index *load_multi_pack_index(const char *object_dir)
                                m->chunk_oid_lookup = m->data + chunk_offset;
                                break;
 
                                m->chunk_oid_lookup = m->data + chunk_offset;
                                break;
 
+                       case MIDX_CHUNKID_OBJECTOFFSETS:
+                               m->chunk_object_offsets = m->data + chunk_offset;
+                               break;
+
+                       case MIDX_CHUNKID_LARGEOFFSETS:
+                               m->chunk_large_offsets = m->data + chunk_offset;
+                               break;
+
                        case 0:
                                die(_("terminating multi-pack-index chunk id appears earlier than expected"));
                                break;
                        case 0:
                                die(_("terminating multi-pack-index chunk id appears earlier than expected"));
                                break;
@@ -131,6 +144,8 @@ struct multi_pack_index *load_multi_pack_index(const char *object_dir)
                die(_("multi-pack-index missing required OID fanout chunk"));
        if (!m->chunk_oid_lookup)
                die(_("multi-pack-index missing required OID lookup chunk"));
                die(_("multi-pack-index missing required OID fanout chunk"));
        if (!m->chunk_oid_lookup)
                die(_("multi-pack-index missing required OID lookup chunk"));
+       if (!m->chunk_object_offsets)
+               die(_("multi-pack-index missing required object offsets chunk"));
 
        m->num_objects = ntohl(m->chunk_oid_fanout[255]);
 
 
        m->num_objects = ntohl(m->chunk_oid_fanout[255]);
 
@@ -454,6 +469,56 @@ static size_t write_midx_oid_lookup(struct hashfile *f, unsigned char hash_len,
        return written;
 }
 
        return written;
 }
 
+static size_t write_midx_object_offsets(struct hashfile *f, int large_offset_needed,
+                                       struct pack_midx_entry *objects, uint32_t nr_objects)
+{ /* Writes, for each of the nr_objects entries in objects, a 4-byte pack_int_id followed by a 4-byte offset field; returns the number of bytes accounted for. */
+       struct pack_midx_entry *list = objects; /* cursor advanced once per iteration */
+       uint32_t i, nr_large_offset = 0; /* nr_large_offset: running row number handed out for redirected offsets */
+       size_t written = 0;
+
+       for (i = 0; i < nr_objects; i++) {
+               struct pack_midx_entry *obj = list++;
+
+               hashwrite_be32(f, obj->pack_int_id);
+
+               if (large_offset_needed && obj->offset >> 31) /* large offsets in use and this offset has bit 31 or higher set */
+                       hashwrite_be32(f, MIDX_LARGE_OFFSET_NEEDED | nr_large_offset++); /* flag bit 0x80000000 plus the row number, which is then incremented */
+               else if (!large_offset_needed && obj->offset >> 32) /* offset exceeds 32 bits although the caller said no large offsets are needed */
+                       BUG("object %s requires a large offset (%"PRIx64") but the MIDX is not writing large offsets!",
+                           oid_to_hex(&obj->oid),
+                           obj->offset);
+               else
+                       hashwrite_be32(f, (uint32_t)obj->offset); /* offset is stored directly in the 32-bit field */
+
+               written += MIDX_CHUNK_OFFSET_WIDTH; /* two uint32_t values per object */
+       }
+
+       return written;
+}
+
+static size_t write_midx_large_offsets(struct hashfile *f, uint32_t nr_large_offset,
+                                      struct pack_midx_entry *objects, uint32_t nr_objects)
+{ /* Writes nr_large_offset 8-byte offsets, taken in array order from the entries of objects whose offset has bit 31 or higher set; returns the number of bytes accounted for. */
+       struct pack_midx_entry *list = objects; /* cursor advanced once per iteration */
+       size_t written = 0;
+
+       while (nr_large_offset) { /* NOTE(review): bounded only by nr_large_offset; nr_objects is never consulted, so the count must match the entries - confirm against caller */
+               struct pack_midx_entry *obj = list++;
+               uint64_t offset = obj->offset;
+
+               if (!(offset >> 31)) /* offset is below 2^31: nothing to write for this entry */
+                       continue;
+
+               hashwrite_be32(f, offset >> 32); /* upper 32 bits first */
+               hashwrite_be32(f, offset & 0xffffffffUL); /* then the lower 32 bits */
+               written += 2 * sizeof(uint32_t); /* 8 bytes, the same size as MIDX_CHUNK_LARGE_OFFSET_WIDTH */
+
+               nr_large_offset--; /* one fewer large offset left to emit */
+       }
+
+       return written;
+}
+
 int write_midx_file(const char *object_dir)
 {
        unsigned char cur_chunk, num_chunks = 0;
 int write_midx_file(const char *object_dir)
 {
        unsigned char cur_chunk, num_chunks = 0;
@@ -466,8 +531,9 @@ int write_midx_file(const char *object_dir)
        uint64_t written = 0;
        uint32_t chunk_ids[MIDX_MAX_CHUNKS + 1];
        uint64_t chunk_offsets[MIDX_MAX_CHUNKS + 1];
        uint64_t written = 0;
        uint32_t chunk_ids[MIDX_MAX_CHUNKS + 1];
        uint64_t chunk_offsets[MIDX_MAX_CHUNKS + 1];
-       uint32_t nr_entries;
+       uint32_t nr_entries, num_large_offsets = 0;
        struct pack_midx_entry *entries = NULL;
        struct pack_midx_entry *entries = NULL;
+       int large_offsets_needed = 0;
 
        midx_name = get_midx_filename(object_dir);
        if (safe_create_leading_directories(midx_name)) {
 
        midx_name = get_midx_filename(object_dir);
        if (safe_create_leading_directories(midx_name)) {
@@ -494,13 +560,19 @@ int write_midx_file(const char *object_dir)
        sort_packs_by_name(packs.names, packs.nr, pack_perm);
 
        entries = get_sorted_entries(packs.list, pack_perm, packs.nr, &nr_entries);
        sort_packs_by_name(packs.names, packs.nr, pack_perm);
 
        entries = get_sorted_entries(packs.list, pack_perm, packs.nr, &nr_entries);
+       for (i = 0; i < nr_entries; i++) {
+               if (entries[i].offset > 0x7fffffff)
+                       num_large_offsets++;
+               if (entries[i].offset > 0xffffffff)
+                       large_offsets_needed = 1;
+       }
 
        hold_lock_file_for_update(&lk, midx_name, LOCK_DIE_ON_ERROR);
        f = hashfd(lk.tempfile->fd, lk.tempfile->filename.buf);
        FREE_AND_NULL(midx_name);
 
        cur_chunk = 0;
 
        hold_lock_file_for_update(&lk, midx_name, LOCK_DIE_ON_ERROR);
        f = hashfd(lk.tempfile->fd, lk.tempfile->filename.buf);
        FREE_AND_NULL(midx_name);
 
        cur_chunk = 0;
-       num_chunks = 3;
+       num_chunks = large_offsets_needed ? 5 : 4;
 
        written = write_midx_header(f, num_chunks, packs.nr);
 
 
        written = write_midx_header(f, num_chunks, packs.nr);
 
@@ -516,9 +588,21 @@ int write_midx_file(const char *object_dir)
        chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + MIDX_CHUNK_FANOUT_SIZE;
 
        cur_chunk++;
        chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + MIDX_CHUNK_FANOUT_SIZE;
 
        cur_chunk++;
-       chunk_ids[cur_chunk] = 0;
+       chunk_ids[cur_chunk] = MIDX_CHUNKID_OBJECTOFFSETS;
        chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + nr_entries * MIDX_HASH_LEN;
 
        chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + nr_entries * MIDX_HASH_LEN;
 
+       cur_chunk++;
+       chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + nr_entries * MIDX_CHUNK_OFFSET_WIDTH;
+       if (large_offsets_needed) {
+               chunk_ids[cur_chunk] = MIDX_CHUNKID_LARGEOFFSETS;
+
+               cur_chunk++;
+               chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] +
+                                          num_large_offsets * MIDX_CHUNK_LARGE_OFFSET_WIDTH;
+       }
+
+       chunk_ids[cur_chunk] = 0;
+
        for (i = 0; i <= num_chunks; i++) {
                if (i && chunk_offsets[i] < chunk_offsets[i - 1])
                        BUG("incorrect chunk offsets: %"PRIu64" before %"PRIu64,
        for (i = 0; i <= num_chunks; i++) {
                if (i && chunk_offsets[i] < chunk_offsets[i - 1])
                        BUG("incorrect chunk offsets: %"PRIu64" before %"PRIu64,
@@ -556,6 +640,14 @@ int write_midx_file(const char *object_dir)
                                written += write_midx_oid_lookup(f, MIDX_HASH_LEN, entries, nr_entries);
                                break;
 
                                written += write_midx_oid_lookup(f, MIDX_HASH_LEN, entries, nr_entries);
                                break;
 
+                       case MIDX_CHUNKID_OBJECTOFFSETS:
+                               written += write_midx_object_offsets(f, large_offsets_needed, entries, nr_entries);
+                               break;
+
+                       case MIDX_CHUNKID_LARGEOFFSETS:
+                               written += write_midx_large_offsets(f, num_large_offsets, entries, nr_entries);
+                               break;
+
                        default:
                                BUG("trying to write unknown chunk id %"PRIx32,
                                    chunk_ids[i]);
                        default:
                                BUG("trying to write unknown chunk id %"PRIx32,
                                    chunk_ids[i]);
diff --git a/midx.h b/midx.h
index 8572cf0f4b3540871482993d3affd1a9def48c88..e15966272ff4a5aab1edfa5e4751beb5ca47654b 100644 (file)
--- a/midx.h
+++ b/midx.h
@@ -17,6 +17,8 @@ struct multi_pack_index {
        const unsigned char *chunk_pack_names;
        const uint32_t *chunk_oid_fanout;
        const unsigned char *chunk_oid_lookup;
        const unsigned char *chunk_pack_names;
        const uint32_t *chunk_oid_fanout;
        const unsigned char *chunk_oid_lookup;
+       const unsigned char *chunk_object_offsets;
+       const unsigned char *chunk_large_offsets;
 
        const char **pack_names;
        char object_dir[FLEX_ARRAY];
 
        const char **pack_names;
        char object_dir[FLEX_ARRAY];
index f7c17b0940d34264b5c1f7e09a7d86e240390604..8e19972e8939d176999904edaeba13c1d8c5f3a8 100644 (file)
@@ -26,6 +26,10 @@ static int read_midx_file(const char *object_dir)
                printf(" oid-fanout");
        if (m->chunk_oid_lookup)
                printf(" oid-lookup");
                printf(" oid-fanout");
        if (m->chunk_oid_lookup)
                printf(" oid-lookup");
+       if (m->chunk_object_offsets)
+               printf(" object-offsets");
+       if (m->chunk_large_offsets)
+               printf(" large-offsets");
 
        printf("\nnum_objects: %d\n", m->num_objects);
 
 
        printf("\nnum_objects: %d\n", m->num_objects);
 
index 95e731ae52f125dc0e2720588588fd52d70b4ae1..4a4fa26f7a5225752d63d699e9a01d1e5c6e02d6 100755 (executable)
@@ -6,27 +6,30 @@ test_description='multi-pack-indexes'
 midx_read_expect () {
        NUM_PACKS=$1
        NUM_OBJECTS=$2
 midx_read_expect () {
        NUM_PACKS=$1
        NUM_OBJECTS=$2
+       NUM_CHUNKS=$3
+       OBJECT_DIR=$4
+       EXTRA_CHUNKS="$5"
        {
                cat <<-EOF &&
        {
                cat <<-EOF &&
-               header: 4d494458 1 3 $NUM_PACKS
-               chunks: pack-names oid-fanout oid-lookup
+               header: 4d494458 1 $NUM_CHUNKS $NUM_PACKS
+               chunks: pack-names oid-fanout oid-lookup object-offsets$EXTRA_CHUNKS
                num_objects: $NUM_OBJECTS
                packs:
                EOF
                if test $NUM_PACKS -ge 1
                then
                num_objects: $NUM_OBJECTS
                packs:
                EOF
                if test $NUM_PACKS -ge 1
                then
-                       ls pack/ | grep idx | sort
+                       ls $OBJECT_DIR/pack/ | grep idx | sort
                fi &&
                fi &&
-               printf "object-dir: .\n"
+               printf "object-dir: $OBJECT_DIR\n"
        } >expect &&
        } >expect &&
-       test-tool read-midx . >actual &&
+       test-tool read-midx $OBJECT_DIR >actual &&
        test_cmp expect actual
 }
 
 test_expect_success 'write midx with no packs' '
        test_when_finished rm -f pack/multi-pack-index &&
        git multi-pack-index --object-dir=. write &&
        test_cmp expect actual
 }
 
 test_expect_success 'write midx with no packs' '
        test_when_finished rm -f pack/multi-pack-index &&
        git multi-pack-index --object-dir=. write &&
-       midx_read_expect 0 0
+       midx_read_expect 0 0 4 .
 '
 
 generate_objects () {
 '
 
 generate_objects () {
@@ -76,13 +79,13 @@ test_expect_success 'write midx with one v1 pack' '
        pack=$(git pack-objects --index-version=1 pack/test <obj-list) &&
        test_when_finished rm pack/test-$pack.pack pack/test-$pack.idx pack/multi-pack-index &&
        git multi-pack-index --object-dir=. write &&
        pack=$(git pack-objects --index-version=1 pack/test <obj-list) &&
        test_when_finished rm pack/test-$pack.pack pack/test-$pack.idx pack/multi-pack-index &&
        git multi-pack-index --object-dir=. write &&
-       midx_read_expect 1 18
+       midx_read_expect 1 18 4 .
 '
 
 test_expect_success 'write midx with one v2 pack' '
        git pack-objects --index-version=2,0x40 pack/test <obj-list &&
        git multi-pack-index --object-dir=. write &&
 '
 
 test_expect_success 'write midx with one v2 pack' '
        git pack-objects --index-version=2,0x40 pack/test <obj-list &&
        git multi-pack-index --object-dir=. write &&
-       midx_read_expect 1 18
+       midx_read_expect 1 18 4 .
 '
 
 test_expect_success 'add more objects' '
 '
 
 test_expect_success 'add more objects' '
@@ -96,7 +99,7 @@ test_expect_success 'add more objects' '
 test_expect_success 'write midx with two packs' '
        git pack-objects --index-version=1 pack/test-2 <obj-list &&
        git multi-pack-index --object-dir=. write &&
 test_expect_success 'write midx with two packs' '
        git pack-objects --index-version=1 pack/test-2 <obj-list &&
        git multi-pack-index --object-dir=. write &&
-       midx_read_expect 2 34
+       midx_read_expect 2 34 4 .
 '
 
 test_expect_success 'add more packs' '
 '
 
 test_expect_success 'add more packs' '
@@ -110,7 +113,33 @@ test_expect_success 'add more packs' '
 
 test_expect_success 'write midx with twelve packs' '
        git multi-pack-index --object-dir=. write &&
 
 test_expect_success 'write midx with twelve packs' '
        git multi-pack-index --object-dir=. write &&
-       midx_read_expect 12 74
+       midx_read_expect 12 74 4 .
+'
+
+# usage: corrupt_data <file> <pos> [<data>]
+corrupt_data () {
+       file=$1
+       pos=$2
+       data="${3:-\0}" # when <data> is omitted or empty, fall back to the literal string \0
+       printf "$data" | dd of="$file" bs=1 seek="$pos" conv=notrunc # $data is used as the printf format, so its escapes are expanded; the output is written at offset $pos, and notrunc keeps the rest of the file
+}
+
+# Force 64-bit offsets by manipulating the idx file.
+# This makes the IDX file _incorrect_ so be careful to clean up after!
+test_expect_success 'force some 64-bit offsets with pack-objects' '
+       mkdir objects64 &&
+       mkdir objects64/pack &&
+       for i in $(test_seq 1 11)
+       do
+               generate_objects 11
+       done &&
+       commit_and_list_objects &&
+       pack64=$(git pack-objects --index-version=2,0x40 objects64/pack/test-64 <obj-list) &&
+       idx64=objects64/pack/test-64-$pack64.idx &&
+       chmod u+w $idx64 &&
+       corrupt_data $idx64 2999 "\02" &&
+       midx64=$(git multi-pack-index --object-dir=objects64 write) &&
+       midx_read_expect 1 63 5 objects64 " large-offsets"
 '
 
 test_done
 '
 
 test_done