pack-objects: equal objects in size should delta against newer objects
[gitweb.git] / builtin-pack-objects.c
index f8ebad0b2f2006d619dc06814acb6024ee674eec..869ca1ab269859c6e23676f329e8c59e4f6aeb20 100644 (file)
@@ -44,6 +44,7 @@ struct object_entry {
                                 * be used as the base objectto delta huge
                                 * objects against.
                                 */
+       uint32_t crc32;         /* crc of raw pack data for this object */
 };
 
 /*
@@ -164,16 +165,37 @@ static int cmp_offset(const void *a_, const void *b_)
 static void prepare_pack_revindex(struct pack_revindex *rix)
 {
        struct packed_git *p = rix->p;
-       int num_ent = num_packed_objects(p);
+       int num_ent = p->num_objects;
        int i;
-       void *index = p->index_base + 256;
+       const char *index = p->index_data;
 
        rix->revindex = xmalloc(sizeof(*rix->revindex) * (num_ent + 1));
-       for (i = 0; i < num_ent; i++) {
-               unsigned int hl = *((unsigned int *)((char *) index + 24*i));
-               rix->revindex[i].offset = ntohl(hl);
-               rix->revindex[i].nr = i;
+       index += 4 * 256;
+
+       if (p->index_version > 1) {
+               const uint32_t *off_32 =
+                       (uint32_t *)(index + 8 + p->num_objects * (20 + 4));
+               const uint32_t *off_64 = off_32 + p->num_objects;
+               for (i = 0; i < num_ent; i++) {
+                       uint32_t off = ntohl(*off_32++);
+                       if (!(off & 0x80000000)) {
+                               rix->revindex[i].offset = off;
+                       } else {
+                               rix->revindex[i].offset =
+                                       ((uint64_t)ntohl(*off_64++)) << 32;
+                               rix->revindex[i].offset |=
+                                       ntohl(*off_64++);
+                       }
+                       rix->revindex[i].nr = i;
+               }
+       } else {
+               for (i = 0; i < num_ent; i++) {
+                       uint32_t hl = *((uint32_t *)(index + 24 * i));
+                       rix->revindex[i].offset = ntohl(hl);
+                       rix->revindex[i].nr = i;
+               }
        }
+
        /* This knows the pack format -- the 20-byte trailer
         * follows immediately after the last object data.
         */
@@ -197,7 +219,7 @@ static struct revindex_entry * find_packed_object(struct packed_git *p,
                prepare_pack_revindex(rix);
        revindex = rix->revindex;
        lo = 0;
-       hi = num_packed_objects(p) + 1;
+       hi = p->num_objects + 1;
        do {
                int mi = (lo + hi) / 2;
                if (revindex[mi].offset == ofs) {
@@ -211,17 +233,11 @@ static struct revindex_entry * find_packed_object(struct packed_git *p,
        die("internal error: pack revindex corrupt");
 }
 
-static off_t find_packed_object_size(struct packed_git *p, off_t ofs)
+static const unsigned char *find_packed_object_name(struct packed_git *p,
+                                                   off_t ofs)
 {
        struct revindex_entry *entry = find_packed_object(p, ofs);
-       return entry[1].offset - ofs;
-}
-
-static unsigned char *find_packed_object_name(struct packed_git *p,
-                                             off_t ofs)
-{
-       struct revindex_entry *entry = find_packed_object(p, ofs);
-       return (unsigned char *)(p->index_base + 256) + 24 * entry->nr + 4;
+       return nth_packed_object_sha1(p, entry->nr);
 }
 
 static void *delta_against(void *buf, unsigned long size, struct object_entry *entry)
@@ -299,6 +315,28 @@ static int check_pack_inflate(struct packed_git *p,
                stream.total_in == len) ? 0 : -1;
 }
 
+static int check_pack_crc(struct packed_git *p, struct pack_window **w_curs,
+                         off_t offset, off_t len, unsigned int nr)
+{      /* CRC-check len bytes of pack p at offset against index entry nr */
+       const uint32_t *index_crc;
+       uint32_t data_crc = crc32(0, Z_NULL, 0);        /* zlib's initial CRC value */
+
+       do {    /* the data is obtained from use_pack() one chunk at a time */
+               unsigned int avail;
+               void *data = use_pack(p, w_curs, offset, &avail);
+               if (avail > len)        /* never checksum past the requested length */
+                       avail = len;
+               data_crc = crc32(data_crc, data, avail);
+               offset += avail;
+               len -= avail;
+       } while (len);
+
+       index_crc = p->index_data;
+       index_crc += 2 + 256 + p->num_objects * (20/4) + nr;    /* in 32-bit words: 2 header, 256 fan-out, 20/4 per SHA-1 */
+
+       return data_crc != ntohl(*index_crc);   /* non-zero means the CRCs differ */
+}
+
 static void copy_pack_data(struct sha1file *f,
                struct packed_git *p,
                struct pack_window **w_curs,
@@ -368,7 +406,7 @@ static int revalidate_loose_object(struct object_entry *entry,
        return check_loose_inflate(map, mapsize, size);
 }
 
-static off_t write_object(struct sha1file *f,
+static unsigned long write_object(struct sha1file *f,
                                  struct object_entry *entry)
 {
        unsigned long size;
@@ -380,6 +418,9 @@ static off_t write_object(struct sha1file *f,
        enum object_type obj_type;
        int to_reuse = 0;
 
+       if (!pack_to_stdout)
+               crc32_begin(f);
+
        obj_type = entry->type;
        if (! entry->in_pack)
                to_reuse = 0;   /* can't reuse what we don't have */
@@ -460,6 +501,7 @@ static off_t write_object(struct sha1file *f,
        else {
                struct packed_git *p = entry->in_pack;
                struct pack_window *w_curs = NULL;
+               struct revindex_entry *revidx;
                off_t offset;
 
                if (entry->delta) {
@@ -482,12 +524,17 @@ static off_t write_object(struct sha1file *f,
                        hdrlen += 20;
                }
 
-               offset = entry->in_pack_offset + entry->in_pack_header_size;
-               datalen = find_packed_object_size(p, entry->in_pack_offset)
-                               - entry->in_pack_header_size;
-               if (!pack_to_stdout && check_pack_inflate(p, &w_curs,
-                               offset, datalen, entry->size))
-                       die("corrupt delta in pack %s", sha1_to_hex(entry->sha1));
+               offset = entry->in_pack_offset;
+               revidx = find_packed_object(p, offset);
+               datalen = revidx[1].offset - offset;
+               if (!pack_to_stdout && p->index_version > 1 &&
+                   check_pack_crc(p, &w_curs, offset, datalen, revidx->nr))
+                       die("bad packed object CRC for %s", sha1_to_hex(entry->sha1));
+               offset += entry->in_pack_header_size;
+               datalen -= entry->in_pack_header_size;
+               if (!pack_to_stdout && p->index_version == 1 &&
+                   check_pack_inflate(p, &w_curs, offset, datalen, entry->size))
+                       die("corrupt packed object for %s", sha1_to_hex(entry->sha1));
                copy_pack_data(f, p, &w_curs, offset, datalen);
                unuse_pack(&w_curs);
                reused++;
@@ -495,6 +542,8 @@ static off_t write_object(struct sha1file *f,
        if (entry->delta)
                written_delta++;
        written++;
+       if (!pack_to_stdout)
+               entry->crc32 = crc32_end(f);
        return hdrlen + datalen;
 }
 
@@ -502,23 +551,30 @@ static off_t write_one(struct sha1file *f,
                               struct object_entry *e,
                               off_t offset)
 {
+       unsigned long size;
+
+       /* offset is non zero if object is written already. */
        if (e->offset || e->preferred_base)
-               /* offset starts from header size and cannot be zero
-                * if it is written already.
-                */
                return offset;
-       /* if we are deltified, write out its base object first. */
+
+       /* if we are deltified, write out base object first. */
        if (e->delta)
                offset = write_one(f, e->delta, offset);
+
        e->offset = offset;
-       return offset + write_object(f, e);
+       size = write_object(f, e);
+
+       /* make sure off_t is sufficiently large not to wrap */
+       if (offset > offset + size)
+               die("pack too large for current definition of off_t");
+       return offset + size;
 }
 
-static void write_pack_file(void)
+static off_t write_pack_file(void)
 {
        uint32_t i;
        struct sha1file *f;
-       off_t offset;
+       off_t offset, last_obj_offset = 0;
        struct pack_header hdr;
        unsigned last_percent = 999;
        int do_progress = progress;
@@ -541,6 +597,7 @@ static void write_pack_file(void)
        if (!nr_result)
                goto done;
        for (i = 0; i < nr_objects; i++) {
+               last_obj_offset = offset;
                offset = write_one(f, objects + i, offset);
                if (do_progress) {
                        unsigned percent = written * 100 / nr_result;
@@ -558,9 +615,14 @@ static void write_pack_file(void)
        if (written != nr_result)
                die("wrote %u objects while expecting %u", written, nr_result);
        sha1close(f, pack_file_sha1, 1);
+
+       return last_obj_offset;
 }
 
-static void write_index_file(void)
+static uint32_t index_default_version = 1;
+static uint32_t index_off32_limit = 0x7fffffff;
+
+static void write_index_file(off_t last_obj_offset)
 {
        uint32_t i;
        struct sha1file *f = sha1create("%s-%s.%s", base_name,
@@ -568,6 +630,18 @@ static void write_index_file(void)
        struct object_entry **list = sorted_by_sha;
        struct object_entry **last = list + nr_result;
        uint32_t array[256];
+       uint32_t index_version;
+
+       /* if last object's offset is >= 2^31 we should use index V2 */
+       index_version = (last_obj_offset >> 31) ? 2 : index_default_version;
+
+       /* index versions 2 and above need a header */
+       if (index_version >= 2) {
+               struct pack_idx_header hdr;
+               hdr.idx_signature = htonl(PACK_IDX_SIGNATURE);
+               hdr.idx_version = htonl(index_version);
+               sha1write(f, &hdr, sizeof(hdr));
+       }
 
        /*
         * Write the first-level table (the list is sorted,
@@ -593,10 +667,49 @@ static void write_index_file(void)
        list = sorted_by_sha;
        for (i = 0; i < nr_result; i++) {
                struct object_entry *entry = *list++;
-               uint32_t offset = htonl(entry->offset);
-               sha1write(f, &offset, 4);
+               if (index_version < 2) {
+                       uint32_t offset = htonl(entry->offset);
+                       sha1write(f, &offset, 4);
+               }
                sha1write(f, entry->sha1, 20);
        }
+
+       if (index_version >= 2) {
+               unsigned int nr_large_offset = 0;       /* offsets above index_off32_limit */
+
+               /* write the crc32 table; sorted_by_sha holds nr_result entries */
+               list = sorted_by_sha;
+               for (i = 0; i < nr_result; i++) {
+                       struct object_entry *entry = *list++;
+                       uint32_t crc32_val = htonl(entry->crc32);
+                       sha1write(f, &crc32_val, 4);
+               }
+
+               /* write the 32-bit offset table; a large offset becomes MSB | large-table index */
+               list = sorted_by_sha;
+               for (i = 0; i < nr_result; i++) {
+                       struct object_entry *entry = *list++;
+                       uint32_t offset = (entry->offset <= index_off32_limit) ?
+                               entry->offset : (0x80000000 | nr_large_offset++);
+                       offset = htonl(offset);
+                       sha1write(f, &offset, 4);
+               }
+
+               /* write the large offset table: high then low 32-bit word, same order as above */
+               list = sorted_by_sha;
+               while (nr_large_offset) {
+                       struct object_entry *entry = *list++;
+                       uint64_t offset = entry->offset;
+                       if (offset > index_off32_limit) {
+                               uint32_t split[2];
+                               split[0] = htonl(offset >> 32);
+                               split[1] = htonl(offset & 0xffffffff);
+                               sha1write(f, split, 8);
+                               nr_large_offset--;
+                       }
+               }
+       }
+
        sha1write(f, pack_file_sha1, 20);
        sha1close(f, NULL, 1);
 }
@@ -668,12 +781,19 @@ static unsigned name_hash(const char *name)
 
 static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclude)
 {
-       uint32_t idx = nr_objects;
        struct object_entry *entry;
-       struct packed_git *p;
+       struct packed_git *p, *found_pack = NULL;
        off_t found_offset = 0;
-       struct packed_git *found_pack = NULL;
-       int ix, status = 0;
+       int ix;
+
+       ix = nr_objects ? locate_object_entry_hash(sha1) : -1;
+       if (ix >= 0) {
+               if (exclude) {
+                       entry = objects + object_ix[ix] - 1;
+                       entry->preferred_base = 1;
+               }
+               return 0;
+       }
 
        if (!exclude) {
                for (p = packed_git; p; p = p->next) {
@@ -690,43 +810,34 @@ static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclud
                        }
                }
        }
-       if ((entry = locate_object_entry(sha1)) != NULL)
-               goto already_added;
 
-       if (idx >= nr_alloc) {
-               nr_alloc = (idx + 1024) * 3 / 2;
+       if (nr_objects >= nr_alloc) {
+               nr_alloc = (nr_alloc  + 1024) * 3 / 2;
                objects = xrealloc(objects, nr_alloc * sizeof(*entry));
        }
-       entry = objects + idx;
-       nr_objects = idx + 1;
+
+       entry = objects + nr_objects++;
        memset(entry, 0, sizeof(*entry));
        hashcpy(entry->sha1, sha1);
        entry->hash = hash;
+       if (exclude)
+               entry->preferred_base = 1;
+       if (found_pack) {
+               entry->in_pack = found_pack;
+               entry->in_pack_offset = found_offset;
+       }
 
        if (object_ix_hashsz * 3 <= nr_objects * 4)
                rehash_objects();
-       else {
-               ix = locate_object_entry_hash(entry->sha1);
-               if (0 <= ix)
-                       die("internal error in object hashing.");
-               object_ix[-1 - ix] = idx + 1;
-       }
-       status = 1;
+       else
+               object_ix[-1 - ix] = nr_objects;
 
- already_added:
        if (progress_update) {
                fprintf(stderr, "Counting objects...%u\r", nr_objects);
                progress_update = 0;
        }
-       if (exclude)
-               entry->preferred_base = 1;
-       else {
-               if (found_pack) {
-                       entry->in_pack = found_pack;
-                       entry->in_pack_offset = found_offset;
-               }
-       }
-       return status;
+
+       return 1;
 }
 
 struct pbase_tree_cache {
@@ -848,22 +959,21 @@ static void add_pbase_object(struct tree_desc *tree,
                             const char *fullname)
 {
        struct name_entry entry;
+       int cmp;
 
        while (tree_entry(tree,&entry)) {
-               unsigned long size;
-               enum object_type type;
-
-               if (entry.pathlen != cmplen ||
-                   memcmp(entry.path, name, cmplen) ||
-                   !has_sha1_file(entry.sha1) ||
-                   (type = sha1_object_info(entry.sha1, &size)) < 0)
+               cmp = tree_entry_len(entry.path, entry.sha1) != cmplen ? 1 :
+                     memcmp(name, entry.path, cmplen);
+               if (cmp > 0)
                        continue;
+               if (cmp < 0)
+                       return;
                if (name[cmplen] != '/') {
                        unsigned hash = name_hash(fullname);
                        add_object_entry(entry.sha1, hash, 1);
                        return;
                }
-               if (type == OBJ_TREE) {
+               if (S_ISDIR(entry.mode)) {
                        struct tree_desc sub;
                        struct pbase_tree_cache *tree;
                        const char *down = name+cmplen+1;
@@ -872,8 +982,7 @@ static void add_pbase_object(struct tree_desc *tree,
                        tree = pbase_tree_get(entry.sha1);
                        if (!tree)
                                return;
-                       sub.buf = tree->tree_data;
-                       sub.size = tree->tree_size;
+                       init_tree_desc(&sub, tree->tree_data, tree->tree_size);
 
                        add_pbase_object(&sub, down, downlen, fullname);
                        pbase_tree_put(tree);
@@ -924,20 +1033,19 @@ static int check_pbase_path(unsigned hash)
 static void add_preferred_base_object(const char *name, unsigned hash)
 {
        struct pbase_tree *it;
-       int cmplen = name_cmp_len(name);
+       int cmplen;
 
-       if (check_pbase_path(hash))
+       if (!num_preferred_base || check_pbase_path(hash))
                return;
 
+       cmplen = name_cmp_len(name);
        for (it = pbase_tree; it; it = it->next) {
                if (cmplen == 0) {
-                       hash = name_hash("");
-                       add_object_entry(it->pcache.sha1, hash, 1);
+                       add_object_entry(it->pcache.sha1, 0, 1);
                }
                else {
                        struct tree_desc tree;
-                       tree.buf = it->pcache.tree_data;
-                       tree.size = it->pcache.tree_size;
+                       init_tree_desc(&tree, it->pcache.tree_data, it->pcache.tree_size);
                        add_pbase_object(&tree, name, cmplen, name);
                }
        }
@@ -996,7 +1104,8 @@ static void check_object(struct object_entry *entry)
                 * delta.
                 */
                if (!no_reuse_delta) {
-                       unsigned char c, *base_name;
+                       unsigned char c;
+                       const unsigned char *base_name;
                        off_t ofs;
                        unsigned long used_0;
                        /* there is at least 20 bytes left in the pack */
@@ -1014,7 +1123,7 @@ static void check_object(struct object_entry *entry)
                                ofs = c & 127;
                                while (c & 128) {
                                        ofs += 1;
-                                       if (!ofs || ofs & ~(~0UL >> 7))
+                                       if (!ofs || MSB(ofs, 7))
                                                die("delta base offset overflow in pack for %s",
                                                    sha1_to_hex(entry->sha1));
                                        c = buf[used_0++];
@@ -1167,7 +1276,7 @@ static int type_size_sort(const struct object_entry *a, const struct object_entr
                return -1;
        if (a->size > b->size)
                return 1;
-       return a < b ? -1 : (a > b);
+       return a > b ? -1 : (a < b);  /* newest last */
 }
 
 struct unpacked {
@@ -1477,9 +1586,7 @@ static void read_object_list_from_stdin(void)
 
 static void show_commit(struct commit *commit)
 {
-       unsigned hash = name_hash("");
-       add_preferred_base_object("", hash);
-       add_object_entry(commit->object.sha1, hash, 0);
+       add_object_entry(commit->object.sha1, 0, 0);
 }
 
 static void show_object(struct object_array_entry *p)
@@ -1627,6 +1734,17 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
                        rp_av[1] = "--objects-edge";
                        continue;
                }
+               if (!prefixcmp(arg, "--index-version=")) {
+                       char *c;
+                       index_default_version = strtoul(arg + 16, &c, 10);
+                       if (index_default_version > 2)
+                               die("bad %s", arg);
+                       if (*c == ',')
+                               index_off32_limit = strtoul(c+1, &c, 0);
+                       if (*c || index_off32_limit & 0x80000000)
+                               die("bad %s", arg);
+                       continue;
+               }
                usage(pack_usage);
        }
 
@@ -1685,6 +1803,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
        if (reuse_cached_pack(object_list_sha1))
                ;
        else {
+               off_t last_obj_offset;
                if (nr_result)
                        prepare_pack(window, depth);
                if (progress == 1 && pack_to_stdout) {
@@ -1694,9 +1813,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
                        signal(SIGALRM, SIG_IGN );
                        progress_update = 0;
                }
-               write_pack_file();
+               last_obj_offset = write_pack_file();
                if (!pack_to_stdout) {
-                       write_index_file();
+                       write_index_file(last_obj_offset);
                        puts(sha1_to_hex(object_list_sha1));
                }
        }