index-pack.con commit teach git-index-pack about deltas with offset to base (53dda6f)
   1#include "cache.h"
   2#include "delta.h"
   3#include "pack.h"
   4#include "csum-file.h"
   5#include "blob.h"
   6#include "commit.h"
   7#include "tag.h"
   8#include "tree.h"
   9
  10static const char index_pack_usage[] =
  11"git-index-pack [-o index-file] pack-file";
  12
  13struct object_entry
  14{
  15        unsigned long offset;
  16        enum object_type type;
  17        enum object_type real_type;
  18        unsigned char sha1[20];
  19};
  20
  21union delta_base {
  22        unsigned char sha1[20];
  23        unsigned long offset;
  24};
  25
  26struct delta_entry
  27{
  28        struct object_entry *obj;
  29        union delta_base base;
  30};
  31
  32static const char *pack_name;
  33static unsigned char *pack_base;
  34static unsigned long pack_size;
  35static struct object_entry *objects;
  36static struct delta_entry *deltas;
  37static int nr_objects;
  38static int nr_deltas;
  39
  40static void open_pack_file(void)
  41{
  42        int fd;
  43        struct stat st;
  44
  45        fd = open(pack_name, O_RDONLY);
  46        if (fd < 0)
  47                die("cannot open packfile '%s': %s", pack_name,
  48                    strerror(errno));
  49        if (fstat(fd, &st)) {
  50                int err = errno;
  51                close(fd);
  52                die("cannot fstat packfile '%s': %s", pack_name,
  53                    strerror(err));
  54        }
  55        pack_size = st.st_size;
  56        pack_base = mmap(NULL, pack_size, PROT_READ, MAP_PRIVATE, fd, 0);
  57        if (pack_base == MAP_FAILED) {
  58                int err = errno;
  59                close(fd);
  60                die("cannot mmap packfile '%s': %s", pack_name,
  61                    strerror(err));
  62        }
  63        close(fd);
  64}
  65
  66static void parse_pack_header(void)
  67{
  68        const struct pack_header *hdr;
  69        unsigned char sha1[20];
  70        SHA_CTX ctx;
  71
  72        /* Ensure there are enough bytes for the header and final SHA1 */
  73        if (pack_size < sizeof(struct pack_header) + 20)
  74                die("packfile '%s' is too small", pack_name);
  75
  76        /* Header consistency check */
  77        hdr = (void *)pack_base;
  78        if (hdr->hdr_signature != htonl(PACK_SIGNATURE))
  79                die("packfile '%s' signature mismatch", pack_name);
  80        if (!pack_version_ok(hdr->hdr_version))
  81                die("packfile '%s' version %d unsupported",
  82                    pack_name, ntohl(hdr->hdr_version));
  83
  84        nr_objects = ntohl(hdr->hdr_entries);
  85
  86        /* Check packfile integrity */
  87        SHA1_Init(&ctx);
  88        SHA1_Update(&ctx, pack_base, pack_size - 20);
  89        SHA1_Final(sha1, &ctx);
  90        if (hashcmp(sha1, pack_base + pack_size - 20))
  91                die("packfile '%s' SHA1 mismatch", pack_name);
  92}
  93
  94static void bad_object(unsigned long offset, const char *format,
  95                       ...) NORETURN __attribute__((format (printf, 2, 3)));
  96
  97static void bad_object(unsigned long offset, const char *format, ...)
  98{
  99        va_list params;
 100        char buf[1024];
 101
 102        va_start(params, format);
 103        vsnprintf(buf, sizeof(buf), format, params);
 104        va_end(params);
 105        die("packfile '%s': bad object at offset %lu: %s",
 106            pack_name, offset, buf);
 107}
 108
 109static void *unpack_entry_data(unsigned long offset,
 110                               unsigned long *current_pos, unsigned long size)
 111{
 112        unsigned long pack_limit = pack_size - 20;
 113        unsigned long pos = *current_pos;
 114        z_stream stream;
 115        void *buf = xmalloc(size);
 116
 117        memset(&stream, 0, sizeof(stream));
 118        stream.next_out = buf;
 119        stream.avail_out = size;
 120        stream.next_in = pack_base + pos;
 121        stream.avail_in = pack_limit - pos;
 122        inflateInit(&stream);
 123
 124        for (;;) {
 125                int ret = inflate(&stream, 0);
 126                if (ret == Z_STREAM_END)
 127                        break;
 128                if (ret != Z_OK)
 129                        bad_object(offset, "inflate returned %d", ret);
 130        }
 131        inflateEnd(&stream);
 132        if (stream.total_out != size)
 133                bad_object(offset, "size mismatch (expected %lu, got %lu)",
 134                           size, stream.total_out);
 135        *current_pos = pack_limit - stream.avail_in;
 136        return buf;
 137}
 138
 139static void *unpack_raw_entry(unsigned long offset,
 140                              enum object_type *obj_type,
 141                              unsigned long *obj_size,
 142                              union delta_base *delta_base,
 143                              unsigned long *next_obj_offset)
 144{
 145        unsigned long pack_limit = pack_size - 20;
 146        unsigned long pos = offset;
 147        unsigned char c;
 148        unsigned long size, base_offset;
 149        unsigned shift;
 150        enum object_type type;
 151        void *data;
 152
 153        c = pack_base[pos++];
 154        type = (c >> 4) & 7;
 155        size = (c & 15);
 156        shift = 4;
 157        while (c & 0x80) {
 158                if (pos >= pack_limit)
 159                        bad_object(offset, "object extends past end of pack");
 160                c = pack_base[pos++];
 161                size += (c & 0x7fUL) << shift;
 162                shift += 7;
 163        }
 164
 165        switch (type) {
 166        case OBJ_REF_DELTA:
 167                if (pos + 20 >= pack_limit)
 168                        bad_object(offset, "object extends past end of pack");
 169                hashcpy(delta_base->sha1, pack_base + pos);
 170                pos += 20;
 171                break;
 172        case OBJ_OFS_DELTA:
 173                memset(delta_base, 0, sizeof(*delta_base));
 174                c = pack_base[pos++];
 175                base_offset = c & 127;
 176                while (c & 128) {
 177                        base_offset += 1;
 178                        if (!base_offset || base_offset & ~(~0UL >> 7))
 179                                bad_object(offset, "offset value overflow for delta base object");
 180                        if (pos >= pack_limit)
 181                                bad_object(offset, "object extends past end of pack");
 182                        c = pack_base[pos++];
 183                        base_offset = (base_offset << 7) + (c & 127);
 184                }
 185                delta_base->offset = offset - base_offset;
 186                if (delta_base->offset >= offset)
 187                        bad_object(offset, "delta base offset is out of bound");
 188                break;
 189        case OBJ_COMMIT:
 190        case OBJ_TREE:
 191        case OBJ_BLOB:
 192        case OBJ_TAG:
 193                break;
 194        default:
 195                bad_object(offset, "bad object type %d", type);
 196        }
 197
 198        data = unpack_entry_data(offset, &pos, size);
 199        *obj_type = type;
 200        *obj_size = size;
 201        *next_obj_offset = pos;
 202        return data;
 203}
 204
 205static int find_delta(const union delta_base *base)
 206{
 207        int first = 0, last = nr_deltas;
 208
 209        while (first < last) {
 210                int next = (first + last) / 2;
 211                struct delta_entry *delta = &deltas[next];
 212                int cmp;
 213
 214                cmp = memcmp(base, &delta->base, sizeof(*base));
 215                if (!cmp)
 216                        return next;
 217                if (cmp < 0) {
 218                        last = next;
 219                        continue;
 220                }
 221                first = next+1;
 222        }
 223        return -first-1;
 224}
 225
 226static int find_delta_childs(const union delta_base *base,
 227                             int *first_index, int *last_index)
 228{
 229        int first = find_delta(base);
 230        int last = first;
 231        int end = nr_deltas - 1;
 232
 233        if (first < 0)
 234                return -1;
 235        while (first > 0 && !memcmp(&deltas[first - 1].base, base, sizeof(*base)))
 236                --first;
 237        while (last < end && !memcmp(&deltas[last + 1].base, base, sizeof(*base)))
 238                ++last;
 239        *first_index = first;
 240        *last_index = last;
 241        return 0;
 242}
 243
 244static void sha1_object(const void *data, unsigned long size,
 245                        enum object_type type, unsigned char *sha1)
 246{
 247        SHA_CTX ctx;
 248        char header[50];
 249        int header_size;
 250        const char *type_str;
 251
 252        switch (type) {
 253        case OBJ_COMMIT: type_str = commit_type; break;
 254        case OBJ_TREE:   type_str = tree_type; break;
 255        case OBJ_BLOB:   type_str = blob_type; break;
 256        case OBJ_TAG:    type_str = tag_type; break;
 257        default:
 258                die("bad type %d", type);
 259        }
 260
 261        header_size = sprintf(header, "%s %lu", type_str, size) + 1;
 262
 263        SHA1_Init(&ctx);
 264        SHA1_Update(&ctx, header, header_size);
 265        SHA1_Update(&ctx, data, size);
 266        SHA1_Final(sha1, &ctx);
 267}
 268
 269static void resolve_delta(struct delta_entry *delta, void *base_data,
 270                          unsigned long base_size, enum object_type type)
 271{
 272        struct object_entry *obj = delta->obj;
 273        void *delta_data;
 274        unsigned long delta_size;
 275        void *result;
 276        unsigned long result_size;
 277        enum object_type delta_type;
 278        union delta_base delta_base;
 279        unsigned long next_obj_offset;
 280        int j, first, last;
 281
 282        obj->real_type = type;
 283        delta_data = unpack_raw_entry(obj->offset, &delta_type,
 284                                      &delta_size, &delta_base,
 285                                      &next_obj_offset);
 286        result = patch_delta(base_data, base_size, delta_data, delta_size,
 287                             &result_size);
 288        free(delta_data);
 289        if (!result)
 290                bad_object(obj->offset, "failed to apply delta");
 291        sha1_object(result, result_size, type, obj->sha1);
 292
 293        hashcpy(delta_base.sha1, obj->sha1);
 294        if (!find_delta_childs(&delta_base, &first, &last)) {
 295                for (j = first; j <= last; j++)
 296                        if (deltas[j].obj->type == OBJ_REF_DELTA)
 297                                resolve_delta(&deltas[j], result, result_size, type);
 298        }
 299
 300        memset(&delta_base, 0, sizeof(delta_base));
 301        delta_base.offset = obj->offset;
 302        if (!find_delta_childs(&delta_base, &first, &last)) {
 303                for (j = first; j <= last; j++)
 304                        if (deltas[j].obj->type == OBJ_OFS_DELTA)
 305                                resolve_delta(&deltas[j], result, result_size, type);
 306        }
 307
 308        free(result);
 309}
 310
 311static int compare_delta_entry(const void *a, const void *b)
 312{
 313        const struct delta_entry *delta_a = a;
 314        const struct delta_entry *delta_b = b;
 315        return memcmp(&delta_a->base, &delta_b->base, sizeof(union delta_base));
 316}
 317
 318static void parse_pack_objects(void)
 319{
 320        int i;
 321        unsigned long offset = sizeof(struct pack_header);
 322        struct delta_entry *delta = deltas;
 323        void *data;
 324        unsigned long data_size;
 325
 326        /*
 327         * First pass:
 328         * - find locations of all objects;
 329         * - calculate SHA1 of all non-delta objects;
 330         * - remember base SHA1 for all deltas.
 331         */
 332        for (i = 0; i < nr_objects; i++) {
 333                struct object_entry *obj = &objects[i];
 334                obj->offset = offset;
 335                data = unpack_raw_entry(offset, &obj->type, &data_size,
 336                                        &delta->base, &offset);
 337                obj->real_type = obj->type;
 338                if (obj->type == OBJ_REF_DELTA || obj->type == OBJ_OFS_DELTA) {
 339                        nr_deltas++;
 340                        delta->obj = obj;
 341                        delta++;
 342                } else
 343                        sha1_object(data, data_size, obj->type, obj->sha1);
 344                free(data);
 345        }
 346        if (offset != pack_size - 20)
 347                die("packfile '%s' has junk at the end", pack_name);
 348
 349        /* Sort deltas by base SHA1/offset for fast searching */
 350        qsort(deltas, nr_deltas, sizeof(struct delta_entry),
 351              compare_delta_entry);
 352
 353        /*
 354         * Second pass:
 355         * - for all non-delta objects, look if it is used as a base for
 356         *   deltas;
 357         * - if used as a base, uncompress the object and apply all deltas,
 358         *   recursively checking if the resulting object is used as a base
 359         *   for some more deltas.
 360         */
 361        for (i = 0; i < nr_objects; i++) {
 362                struct object_entry *obj = &objects[i];
 363                union delta_base base;
 364                int j, ref, ref_first, ref_last, ofs, ofs_first, ofs_last;
 365
 366                if (obj->type == OBJ_REF_DELTA || obj->type == OBJ_OFS_DELTA)
 367                        continue;
 368                hashcpy(base.sha1, obj->sha1);
 369                ref = !find_delta_childs(&base, &ref_first, &ref_last);
 370                memset(&base, 0, sizeof(base));
 371                base.offset = obj->offset;
 372                ofs = !find_delta_childs(&base, &ofs_first, &ofs_last);
 373                if (!ref && !ofs)
 374                        continue;
 375                data = unpack_raw_entry(obj->offset, &obj->type, &data_size,
 376                                        &base, &offset);
 377                if (ref)
 378                        for (j = ref_first; j <= ref_last; j++)
 379                                if (deltas[j].obj->type == OBJ_REF_DELTA)
 380                                        resolve_delta(&deltas[j], data,
 381                                                      data_size, obj->type);
 382                if (ofs)
 383                        for (j = ofs_first; j <= ofs_last; j++)
 384                                if (deltas[j].obj->type == OBJ_OFS_DELTA)
 385                                        resolve_delta(&deltas[j], data,
 386                                                      data_size, obj->type);
 387                free(data);
 388        }
 389
 390        /* Check for unresolved deltas */
 391        for (i = 0; i < nr_deltas; i++) {
 392                if (deltas[i].obj->real_type == OBJ_REF_DELTA ||
 393                    deltas[i].obj->real_type == OBJ_OFS_DELTA)
 394                        die("packfile '%s' has unresolved deltas",  pack_name);
 395        }
 396}
 397
 398static int sha1_compare(const void *_a, const void *_b)
 399{
 400        struct object_entry *a = *(struct object_entry **)_a;
 401        struct object_entry *b = *(struct object_entry **)_b;
 402        return hashcmp(a->sha1, b->sha1);
 403}
 404
 405static void write_index_file(const char *index_name, unsigned char *sha1)
 406{
 407        struct sha1file *f;
 408        struct object_entry **sorted_by_sha, **list, **last;
 409        unsigned int array[256];
 410        int i;
 411        SHA_CTX ctx;
 412
 413        if (nr_objects) {
 414                sorted_by_sha =
 415                        xcalloc(nr_objects, sizeof(struct object_entry *));
 416                list = sorted_by_sha;
 417                last = sorted_by_sha + nr_objects;
 418                for (i = 0; i < nr_objects; ++i)
 419                        sorted_by_sha[i] = &objects[i];
 420                qsort(sorted_by_sha, nr_objects, sizeof(sorted_by_sha[0]),
 421                      sha1_compare);
 422
 423        }
 424        else
 425                sorted_by_sha = list = last = NULL;
 426
 427        unlink(index_name);
 428        f = sha1create("%s", index_name);
 429
 430        /*
 431         * Write the first-level table (the list is sorted,
 432         * but we use a 256-entry lookup to be able to avoid
 433         * having to do eight extra binary search iterations).
 434         */
 435        for (i = 0; i < 256; i++) {
 436                struct object_entry **next = list;
 437                while (next < last) {
 438                        struct object_entry *obj = *next;
 439                        if (obj->sha1[0] != i)
 440                                break;
 441                        next++;
 442                }
 443                array[i] = htonl(next - sorted_by_sha);
 444                list = next;
 445        }
 446        sha1write(f, array, 256 * sizeof(int));
 447
 448        /* recompute the SHA1 hash of sorted object names.
 449         * currently pack-objects does not do this, but that
 450         * can be fixed.
 451         */
 452        SHA1_Init(&ctx);
 453        /*
 454         * Write the actual SHA1 entries..
 455         */
 456        list = sorted_by_sha;
 457        for (i = 0; i < nr_objects; i++) {
 458                struct object_entry *obj = *list++;
 459                unsigned int offset = htonl(obj->offset);
 460                sha1write(f, &offset, 4);
 461                sha1write(f, obj->sha1, 20);
 462                SHA1_Update(&ctx, obj->sha1, 20);
 463        }
 464        sha1write(f, pack_base + pack_size - 20, 20);
 465        sha1close(f, NULL, 1);
 466        free(sorted_by_sha);
 467        SHA1_Final(sha1, &ctx);
 468}
 469
 470int main(int argc, char **argv)
 471{
 472        int i;
 473        char *index_name = NULL;
 474        char *index_name_buf = NULL;
 475        unsigned char sha1[20];
 476
 477        for (i = 1; i < argc; i++) {
 478                const char *arg = argv[i];
 479
 480                if (*arg == '-') {
 481                        if (!strcmp(arg, "-o")) {
 482                                if (index_name || (i+1) >= argc)
 483                                        usage(index_pack_usage);
 484                                index_name = argv[++i];
 485                        } else
 486                                usage(index_pack_usage);
 487                        continue;
 488                }
 489
 490                if (pack_name)
 491                        usage(index_pack_usage);
 492                pack_name = arg;
 493        }
 494
 495        if (!pack_name)
 496                usage(index_pack_usage);
 497        if (!index_name) {
 498                int len = strlen(pack_name);
 499                if (!has_extension(pack_name, ".pack"))
 500                        die("packfile name '%s' does not end with '.pack'",
 501                            pack_name);
 502                index_name_buf = xmalloc(len);
 503                memcpy(index_name_buf, pack_name, len - 5);
 504                strcpy(index_name_buf + len - 5, ".idx");
 505                index_name = index_name_buf;
 506        }
 507
 508        open_pack_file();
 509        parse_pack_header();
 510        objects = xcalloc(nr_objects, sizeof(struct object_entry));
 511        deltas = xcalloc(nr_objects, sizeof(struct delta_entry));
 512        parse_pack_objects();
 513        free(deltas);
 514        write_index_file(index_name, sha1);
 515        free(objects);
 516        free(index_name_buf);
 517
 518        printf("%s\n", sha1_to_hex(sha1));
 519
 520        return 0;
 521}