3f113e1bebbf74e639290d525c34b208e1bfc57f
   1#include "cache.h"
   2#include "csum-file.h"
   3#include "dir.h"
   4#include "lockfile.h"
   5#include "packfile.h"
   6#include "object-store.h"
   7#include "packfile.h"
   8#include "midx.h"
   9
  10#define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
  11#define MIDX_VERSION 1
  12#define MIDX_BYTE_FILE_VERSION 4
  13#define MIDX_BYTE_HASH_VERSION 5
  14#define MIDX_BYTE_NUM_CHUNKS 6
  15#define MIDX_BYTE_NUM_PACKS 8
  16#define MIDX_HASH_VERSION 1
  17#define MIDX_HEADER_SIZE 12
  18#define MIDX_HASH_LEN 20
  19#define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + MIDX_HASH_LEN)
  20
  21#define MIDX_MAX_CHUNKS 2
  22#define MIDX_CHUNK_ALIGNMENT 4
  23#define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */
  24#define MIDX_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
  25#define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t))
  26
  27static char *get_midx_filename(const char *object_dir)
  28{
  29        return xstrfmt("%s/pack/multi-pack-index", object_dir);
  30}
  31
  32struct multi_pack_index *load_multi_pack_index(const char *object_dir)
  33{
  34        struct multi_pack_index *m = NULL;
  35        int fd;
  36        struct stat st;
  37        size_t midx_size;
  38        void *midx_map = NULL;
  39        uint32_t hash_version;
  40        char *midx_name = get_midx_filename(object_dir);
  41        uint32_t i;
  42        const char *cur_pack_name;
  43
  44        fd = git_open(midx_name);
  45
  46        if (fd < 0)
  47                goto cleanup_fail;
  48        if (fstat(fd, &st)) {
  49                error_errno(_("failed to read %s"), midx_name);
  50                goto cleanup_fail;
  51        }
  52
  53        midx_size = xsize_t(st.st_size);
  54
  55        if (midx_size < MIDX_MIN_SIZE) {
  56                error(_("multi-pack-index file %s is too small"), midx_name);
  57                goto cleanup_fail;
  58        }
  59
  60        FREE_AND_NULL(midx_name);
  61
  62        midx_map = xmmap(NULL, midx_size, PROT_READ, MAP_PRIVATE, fd, 0);
  63
  64        FLEX_ALLOC_MEM(m, object_dir, object_dir, strlen(object_dir));
  65        m->fd = fd;
  66        m->data = midx_map;
  67        m->data_len = midx_size;
  68
  69        m->signature = get_be32(m->data);
  70        if (m->signature != MIDX_SIGNATURE) {
  71                error(_("multi-pack-index signature 0x%08x does not match signature 0x%08x"),
  72                      m->signature, MIDX_SIGNATURE);
  73                goto cleanup_fail;
  74        }
  75
  76        m->version = m->data[MIDX_BYTE_FILE_VERSION];
  77        if (m->version != MIDX_VERSION) {
  78                error(_("multi-pack-index version %d not recognized"),
  79                      m->version);
  80                goto cleanup_fail;
  81        }
  82
  83        hash_version = m->data[MIDX_BYTE_HASH_VERSION];
  84        if (hash_version != MIDX_HASH_VERSION) {
  85                error(_("hash version %u does not match"), hash_version);
  86                goto cleanup_fail;
  87        }
  88        m->hash_len = MIDX_HASH_LEN;
  89
  90        m->num_chunks = m->data[MIDX_BYTE_NUM_CHUNKS];
  91
  92        m->num_packs = get_be32(m->data + MIDX_BYTE_NUM_PACKS);
  93
  94        for (i = 0; i < m->num_chunks; i++) {
  95                uint32_t chunk_id = get_be32(m->data + MIDX_HEADER_SIZE +
  96                                             MIDX_CHUNKLOOKUP_WIDTH * i);
  97                uint64_t chunk_offset = get_be64(m->data + MIDX_HEADER_SIZE + 4 +
  98                                                 MIDX_CHUNKLOOKUP_WIDTH * i);
  99
 100                switch (chunk_id) {
 101                        case MIDX_CHUNKID_PACKNAMES:
 102                                m->chunk_pack_names = m->data + chunk_offset;
 103                                break;
 104
 105                        case MIDX_CHUNKID_OIDLOOKUP:
 106                                m->chunk_oid_lookup = m->data + chunk_offset;
 107                                break;
 108
 109                        case 0:
 110                                die(_("terminating multi-pack-index chunk id appears earlier than expected"));
 111                                break;
 112
 113                        default:
 114                                /*
 115                                 * Do nothing on unrecognized chunks, allowing future
 116                                 * extensions to add optional chunks.
 117                                 */
 118                                break;
 119                }
 120        }
 121
 122        if (!m->chunk_pack_names)
 123                die(_("multi-pack-index missing required pack-name chunk"));
 124        if (!m->chunk_oid_lookup)
 125                die(_("multi-pack-index missing required OID lookup chunk"));
 126
 127        m->pack_names = xcalloc(m->num_packs, sizeof(*m->pack_names));
 128
 129        cur_pack_name = (const char *)m->chunk_pack_names;
 130        for (i = 0; i < m->num_packs; i++) {
 131                m->pack_names[i] = cur_pack_name;
 132
 133                cur_pack_name += strlen(cur_pack_name) + 1;
 134
 135                if (i && strcmp(m->pack_names[i], m->pack_names[i - 1]) <= 0) {
 136                        error(_("multi-pack-index pack names out of order: '%s' before '%s'"),
 137                              m->pack_names[i - 1],
 138                              m->pack_names[i]);
 139                        goto cleanup_fail;
 140                }
 141        }
 142
 143        return m;
 144
 145cleanup_fail:
 146        free(m);
 147        free(midx_name);
 148        if (midx_map)
 149                munmap(midx_map, midx_size);
 150        if (0 <= fd)
 151                close(fd);
 152        return NULL;
 153}
 154
 155static size_t write_midx_header(struct hashfile *f,
 156                                unsigned char num_chunks,
 157                                uint32_t num_packs)
 158{
 159        unsigned char byte_values[4];
 160
 161        hashwrite_be32(f, MIDX_SIGNATURE);
 162        byte_values[0] = MIDX_VERSION;
 163        byte_values[1] = MIDX_HASH_VERSION;
 164        byte_values[2] = num_chunks;
 165        byte_values[3] = 0; /* unused */
 166        hashwrite(f, byte_values, sizeof(byte_values));
 167        hashwrite_be32(f, num_packs);
 168
 169        return MIDX_HEADER_SIZE;
 170}
 171
 172struct pack_list {
 173        struct packed_git **list;
 174        char **names;
 175        uint32_t nr;
 176        uint32_t alloc_list;
 177        uint32_t alloc_names;
 178        size_t pack_name_concat_len;
 179};
 180
 181static void add_pack_to_midx(const char *full_path, size_t full_path_len,
 182                             const char *file_name, void *data)
 183{
 184        struct pack_list *packs = (struct pack_list *)data;
 185
 186        if (ends_with(file_name, ".idx")) {
 187                ALLOC_GROW(packs->list, packs->nr + 1, packs->alloc_list);
 188                ALLOC_GROW(packs->names, packs->nr + 1, packs->alloc_names);
 189
 190                packs->list[packs->nr] = add_packed_git(full_path,
 191                                                        full_path_len,
 192                                                        0);
 193
 194                if (!packs->list[packs->nr]) {
 195                        warning(_("failed to add packfile '%s'"),
 196                                full_path);
 197                        return;
 198                }
 199
 200                if (open_pack_index(packs->list[packs->nr])) {
 201                        warning(_("failed to open pack-index '%s'"),
 202                                full_path);
 203                        close_pack(packs->list[packs->nr]);
 204                        FREE_AND_NULL(packs->list[packs->nr]);
 205                        return;
 206                }
 207
 208                packs->names[packs->nr] = xstrdup(file_name);
 209                packs->pack_name_concat_len += strlen(file_name) + 1;
 210                packs->nr++;
 211        }
 212}
 213
 214struct pack_pair {
 215        uint32_t pack_int_id;
 216        char *pack_name;
 217};
 218
 219static int pack_pair_compare(const void *_a, const void *_b)
 220{
 221        struct pack_pair *a = (struct pack_pair *)_a;
 222        struct pack_pair *b = (struct pack_pair *)_b;
 223        return strcmp(a->pack_name, b->pack_name);
 224}
 225
 226static void sort_packs_by_name(char **pack_names, uint32_t nr_packs, uint32_t *perm)
 227{
 228        uint32_t i;
 229        struct pack_pair *pairs;
 230
 231        ALLOC_ARRAY(pairs, nr_packs);
 232
 233        for (i = 0; i < nr_packs; i++) {
 234                pairs[i].pack_int_id = i;
 235                pairs[i].pack_name = pack_names[i];
 236        }
 237
 238        QSORT(pairs, nr_packs, pack_pair_compare);
 239
 240        for (i = 0; i < nr_packs; i++) {
 241                pack_names[i] = pairs[i].pack_name;
 242                perm[pairs[i].pack_int_id] = i;
 243        }
 244
 245        free(pairs);
 246}
 247
 248struct pack_midx_entry {
 249        struct object_id oid;
 250        uint32_t pack_int_id;
 251        time_t pack_mtime;
 252        uint64_t offset;
 253};
 254
 255static int midx_oid_compare(const void *_a, const void *_b)
 256{
 257        const struct pack_midx_entry *a = (const struct pack_midx_entry *)_a;
 258        const struct pack_midx_entry *b = (const struct pack_midx_entry *)_b;
 259        int cmp = oidcmp(&a->oid, &b->oid);
 260
 261        if (cmp)
 262                return cmp;
 263
 264        if (a->pack_mtime > b->pack_mtime)
 265                return -1;
 266        else if (a->pack_mtime < b->pack_mtime)
 267                return 1;
 268
 269        return a->pack_int_id - b->pack_int_id;
 270}
 271
 272static void fill_pack_entry(uint32_t pack_int_id,
 273                            struct packed_git *p,
 274                            uint32_t cur_object,
 275                            struct pack_midx_entry *entry)
 276{
 277        if (!nth_packed_object_oid(&entry->oid, p, cur_object))
 278                die(_("failed to locate object %d in packfile"), cur_object);
 279
 280        entry->pack_int_id = pack_int_id;
 281        entry->pack_mtime = p->mtime;
 282
 283        entry->offset = nth_packed_object_offset(p, cur_object);
 284}
 285
 286/*
 287 * It is possible to artificially get into a state where there are many
 288 * duplicate copies of objects. That can create high memory pressure if
 289 * we are to create a list of all objects before de-duplication. To reduce
 290 * this memory pressure without a significant performance drop, automatically
 291 * group objects by the first byte of their object id. Use the IDX fanout
 292 * tables to group the data, copy to a local array, then sort.
 293 *
 294 * Copy only the de-duplicated entries (selected by most-recent modified time
 295 * of a packfile containing the object).
 296 */
 297static struct pack_midx_entry *get_sorted_entries(struct packed_git **p,
 298                                                  uint32_t *perm,
 299                                                  uint32_t nr_packs,
 300                                                  uint32_t *nr_objects)
 301{
 302        uint32_t cur_fanout, cur_pack, cur_object;
 303        uint32_t alloc_fanout, alloc_objects, total_objects = 0;
 304        struct pack_midx_entry *entries_by_fanout = NULL;
 305        struct pack_midx_entry *deduplicated_entries = NULL;
 306
 307        for (cur_pack = 0; cur_pack < nr_packs; cur_pack++)
 308                total_objects += p[cur_pack]->num_objects;
 309
 310        /*
 311         * As we de-duplicate by fanout value, we expect the fanout
 312         * slices to be evenly distributed, with some noise. Hence,
 313         * allocate slightly more than one 256th.
 314         */
 315        alloc_objects = alloc_fanout = total_objects > 3200 ? total_objects / 200 : 16;
 316
 317        ALLOC_ARRAY(entries_by_fanout, alloc_fanout);
 318        ALLOC_ARRAY(deduplicated_entries, alloc_objects);
 319        *nr_objects = 0;
 320
 321        for (cur_fanout = 0; cur_fanout < 256; cur_fanout++) {
 322                uint32_t nr_fanout = 0;
 323
 324                for (cur_pack = 0; cur_pack < nr_packs; cur_pack++) {
 325                        uint32_t start = 0, end;
 326
 327                        if (cur_fanout)
 328                                start = get_pack_fanout(p[cur_pack], cur_fanout - 1);
 329                        end = get_pack_fanout(p[cur_pack], cur_fanout);
 330
 331                        for (cur_object = start; cur_object < end; cur_object++) {
 332                                ALLOC_GROW(entries_by_fanout, nr_fanout + 1, alloc_fanout);
 333                                fill_pack_entry(perm[cur_pack], p[cur_pack], cur_object, &entries_by_fanout[nr_fanout]);
 334                                nr_fanout++;
 335                        }
 336                }
 337
 338                QSORT(entries_by_fanout, nr_fanout, midx_oid_compare);
 339
 340                /*
 341                 * The batch is now sorted by OID and then mtime (descending).
 342                 * Take only the first duplicate.
 343                 */
 344                for (cur_object = 0; cur_object < nr_fanout; cur_object++) {
 345                        if (cur_object && !oidcmp(&entries_by_fanout[cur_object - 1].oid,
 346                                                  &entries_by_fanout[cur_object].oid))
 347                                continue;
 348
 349                        ALLOC_GROW(deduplicated_entries, *nr_objects + 1, alloc_objects);
 350                        memcpy(&deduplicated_entries[*nr_objects],
 351                               &entries_by_fanout[cur_object],
 352                               sizeof(struct pack_midx_entry));
 353                        (*nr_objects)++;
 354                }
 355        }
 356
 357        free(entries_by_fanout);
 358        return deduplicated_entries;
 359}
 360
 361static size_t write_midx_pack_names(struct hashfile *f,
 362                                    char **pack_names,
 363                                    uint32_t num_packs)
 364{
 365        uint32_t i;
 366        unsigned char padding[MIDX_CHUNK_ALIGNMENT];
 367        size_t written = 0;
 368
 369        for (i = 0; i < num_packs; i++) {
 370                size_t writelen = strlen(pack_names[i]) + 1;
 371
 372                if (i && strcmp(pack_names[i], pack_names[i - 1]) <= 0)
 373                        BUG("incorrect pack-file order: %s before %s",
 374                            pack_names[i - 1],
 375                            pack_names[i]);
 376
 377                hashwrite(f, pack_names[i], writelen);
 378                written += writelen;
 379        }
 380
 381        /* add padding to be aligned */
 382        i = MIDX_CHUNK_ALIGNMENT - (written % MIDX_CHUNK_ALIGNMENT);
 383        if (i < MIDX_CHUNK_ALIGNMENT) {
 384                memset(padding, 0, sizeof(padding));
 385                hashwrite(f, padding, i);
 386                written += i;
 387        }
 388
 389        return written;
 390}
 391
 392static size_t write_midx_oid_lookup(struct hashfile *f, unsigned char hash_len,
 393                                    struct pack_midx_entry *objects,
 394                                    uint32_t nr_objects)
 395{
 396        struct pack_midx_entry *list = objects;
 397        uint32_t i;
 398        size_t written = 0;
 399
 400        for (i = 0; i < nr_objects; i++) {
 401                struct pack_midx_entry *obj = list++;
 402
 403                if (i < nr_objects - 1) {
 404                        struct pack_midx_entry *next = list;
 405                        if (oidcmp(&obj->oid, &next->oid) >= 0)
 406                                BUG("OIDs not in order: %s >= %s",
 407                                    oid_to_hex(&obj->oid),
 408                                    oid_to_hex(&next->oid));
 409                }
 410
 411                hashwrite(f, obj->oid.hash, (int)hash_len);
 412                written += hash_len;
 413        }
 414
 415        return written;
 416}
 417
 418int write_midx_file(const char *object_dir)
 419{
 420        unsigned char cur_chunk, num_chunks = 0;
 421        char *midx_name;
 422        uint32_t i;
 423        struct hashfile *f = NULL;
 424        struct lock_file lk;
 425        struct pack_list packs;
 426        uint32_t *pack_perm = NULL;
 427        uint64_t written = 0;
 428        uint32_t chunk_ids[MIDX_MAX_CHUNKS + 1];
 429        uint64_t chunk_offsets[MIDX_MAX_CHUNKS + 1];
 430        uint32_t nr_entries;
 431        struct pack_midx_entry *entries = NULL;
 432
 433        midx_name = get_midx_filename(object_dir);
 434        if (safe_create_leading_directories(midx_name)) {
 435                UNLEAK(midx_name);
 436                die_errno(_("unable to create leading directories of %s"),
 437                          midx_name);
 438        }
 439
 440        packs.nr = 0;
 441        packs.alloc_list = 16;
 442        packs.alloc_names = 16;
 443        packs.list = NULL;
 444        packs.pack_name_concat_len = 0;
 445        ALLOC_ARRAY(packs.list, packs.alloc_list);
 446        ALLOC_ARRAY(packs.names, packs.alloc_names);
 447
 448        for_each_file_in_pack_dir(object_dir, add_pack_to_midx, &packs);
 449
 450        if (packs.pack_name_concat_len % MIDX_CHUNK_ALIGNMENT)
 451                packs.pack_name_concat_len += MIDX_CHUNK_ALIGNMENT -
 452                                              (packs.pack_name_concat_len % MIDX_CHUNK_ALIGNMENT);
 453
 454        ALLOC_ARRAY(pack_perm, packs.nr);
 455        sort_packs_by_name(packs.names, packs.nr, pack_perm);
 456
 457        entries = get_sorted_entries(packs.list, pack_perm, packs.nr, &nr_entries);
 458
 459        hold_lock_file_for_update(&lk, midx_name, LOCK_DIE_ON_ERROR);
 460        f = hashfd(lk.tempfile->fd, lk.tempfile->filename.buf);
 461        FREE_AND_NULL(midx_name);
 462
 463        cur_chunk = 0;
 464        num_chunks = 2;
 465
 466        written = write_midx_header(f, num_chunks, packs.nr);
 467
 468        chunk_ids[cur_chunk] = MIDX_CHUNKID_PACKNAMES;
 469        chunk_offsets[cur_chunk] = written + (num_chunks + 1) * MIDX_CHUNKLOOKUP_WIDTH;
 470
 471        cur_chunk++;
 472        chunk_ids[cur_chunk] = MIDX_CHUNKID_OIDLOOKUP;
 473        chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + packs.pack_name_concat_len;
 474
 475        cur_chunk++;
 476        chunk_ids[cur_chunk] = 0;
 477        chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + nr_entries * MIDX_HASH_LEN;
 478
 479        for (i = 0; i <= num_chunks; i++) {
 480                if (i && chunk_offsets[i] < chunk_offsets[i - 1])
 481                        BUG("incorrect chunk offsets: %"PRIu64" before %"PRIu64,
 482                            chunk_offsets[i - 1],
 483                            chunk_offsets[i]);
 484
 485                if (chunk_offsets[i] % MIDX_CHUNK_ALIGNMENT)
 486                        BUG("chunk offset %"PRIu64" is not properly aligned",
 487                            chunk_offsets[i]);
 488
 489                hashwrite_be32(f, chunk_ids[i]);
 490                hashwrite_be32(f, chunk_offsets[i] >> 32);
 491                hashwrite_be32(f, chunk_offsets[i]);
 492
 493                written += MIDX_CHUNKLOOKUP_WIDTH;
 494        }
 495
 496        for (i = 0; i < num_chunks; i++) {
 497                if (written != chunk_offsets[i])
 498                        BUG("incorrect chunk offset (%"PRIu64" != %"PRIu64") for chunk id %"PRIx32,
 499                            chunk_offsets[i],
 500                            written,
 501                            chunk_ids[i]);
 502
 503                switch (chunk_ids[i]) {
 504                        case MIDX_CHUNKID_PACKNAMES:
 505                                written += write_midx_pack_names(f, packs.names, packs.nr);
 506                                break;
 507
 508                        case MIDX_CHUNKID_OIDLOOKUP:
 509                                written += write_midx_oid_lookup(f, MIDX_HASH_LEN, entries, nr_entries);
 510                                break;
 511
 512                        default:
 513                                BUG("trying to write unknown chunk id %"PRIx32,
 514                                    chunk_ids[i]);
 515                }
 516        }
 517
 518        if (written != chunk_offsets[num_chunks])
 519                BUG("incorrect final offset %"PRIu64" != %"PRIu64,
 520                    written,
 521                    chunk_offsets[num_chunks]);
 522
 523        finalize_hashfile(f, NULL, CSUM_FSYNC | CSUM_HASH_IN_STREAM);
 524        commit_lock_file(&lk);
 525
 526        for (i = 0; i < packs.nr; i++) {
 527                if (packs.list[i]) {
 528                        close_pack(packs.list[i]);
 529                        free(packs.list[i]);
 530                }
 531                free(packs.names[i]);
 532        }
 533
 534        free(packs.list);
 535        free(packs.names);
 536        free(entries);
 537        return 0;
 538}