+struct pack_midx_entry {
+ struct object_id oid;
+ uint32_t pack_int_id;
+ time_t pack_mtime;
+ uint64_t offset;
+};
+
+static int midx_oid_compare(const void *_a, const void *_b)
+{
+ const struct pack_midx_entry *a = (const struct pack_midx_entry *)_a;
+ const struct pack_midx_entry *b = (const struct pack_midx_entry *)_b;
+ int cmp = oidcmp(&a->oid, &b->oid);
+
+ if (cmp)
+ return cmp;
+
+ if (a->pack_mtime > b->pack_mtime)
+ return -1;
+ else if (a->pack_mtime < b->pack_mtime)
+ return 1;
+
+ return a->pack_int_id - b->pack_int_id;
+}
+
+static void fill_pack_entry(uint32_t pack_int_id,
+ struct packed_git *p,
+ uint32_t cur_object,
+ struct pack_midx_entry *entry)
+{
+ if (!nth_packed_object_oid(&entry->oid, p, cur_object))
+ die(_("failed to locate object %d in packfile"), cur_object);
+
+ entry->pack_int_id = pack_int_id;
+ entry->pack_mtime = p->mtime;
+
+ entry->offset = nth_packed_object_offset(p, cur_object);
+}
+
+/*
+ * It is possible to artificially get into a state where there are many
+ * duplicate copies of objects. That can create high memory pressure if
+ * we are to create a list of all objects before de-duplication. To reduce
+ * this memory pressure without a significant performance drop, automatically
+ * group objects by the first byte of their object id. Use the IDX fanout
+ * tables to group the data, copy to a local array, then sort.
+ *
+ * Copy only the de-duplicated entries (selected by most-recent modified time
+ * of a packfile containing the object).
+ */
+static struct pack_midx_entry *get_sorted_entries(struct packed_git **p,
+ uint32_t *perm,
+ uint32_t nr_packs,
+ uint32_t *nr_objects)
+{
+ uint32_t cur_fanout, cur_pack, cur_object;
+ uint32_t alloc_fanout, alloc_objects, total_objects = 0;
+ struct pack_midx_entry *entries_by_fanout = NULL;
+ struct pack_midx_entry *deduplicated_entries = NULL;
+
+ for (cur_pack = 0; cur_pack < nr_packs; cur_pack++)
+ total_objects += p[cur_pack]->num_objects;
+
+ /*
+ * As we de-duplicate by fanout value, we expect the fanout
+ * slices to be evenly distributed, with some noise. Hence,
+ * allocate slightly more than one 256th.
+ */
+ alloc_objects = alloc_fanout = total_objects > 3200 ? total_objects / 200 : 16;
+
+ ALLOC_ARRAY(entries_by_fanout, alloc_fanout);
+ ALLOC_ARRAY(deduplicated_entries, alloc_objects);
+ *nr_objects = 0;
+
+ for (cur_fanout = 0; cur_fanout < 256; cur_fanout++) {
+ uint32_t nr_fanout = 0;
+
+ for (cur_pack = 0; cur_pack < nr_packs; cur_pack++) {
+ uint32_t start = 0, end;
+
+ if (cur_fanout)
+ start = get_pack_fanout(p[cur_pack], cur_fanout - 1);
+ end = get_pack_fanout(p[cur_pack], cur_fanout);
+
+ for (cur_object = start; cur_object < end; cur_object++) {
+ ALLOC_GROW(entries_by_fanout, nr_fanout + 1, alloc_fanout);
+ fill_pack_entry(perm[cur_pack], p[cur_pack], cur_object, &entries_by_fanout[nr_fanout]);
+ nr_fanout++;
+ }
+ }
+
+ QSORT(entries_by_fanout, nr_fanout, midx_oid_compare);
+
+ /*
+ * The batch is now sorted by OID and then mtime (descending).
+ * Take only the first duplicate.
+ */
+ for (cur_object = 0; cur_object < nr_fanout; cur_object++) {
+ if (cur_object && !oidcmp(&entries_by_fanout[cur_object - 1].oid,
+ &entries_by_fanout[cur_object].oid))
+ continue;
+
+ ALLOC_GROW(deduplicated_entries, *nr_objects + 1, alloc_objects);
+ memcpy(&deduplicated_entries[*nr_objects],
+ &entries_by_fanout[cur_object],
+ sizeof(struct pack_midx_entry));
+ (*nr_objects)++;
+ }
+ }
+
+ free(entries_by_fanout);
+ return deduplicated_entries;
+}
+