pack-objects.con commit git-pack-objects: do the delta search in reverse size order (521a4f4)
   1#include "cache.h"
   2#include "object.h"
   3#include "delta.h"
   4
   5static const char pack_usage[] = "git-pack-objects [--window=N] [--depth=N] base-name < object-list";
   6
   7enum object_type {
   8        OBJ_NONE,
   9        OBJ_COMMIT,
  10        OBJ_TREE,
  11        OBJ_BLOB,
  12        OBJ_DELTA       // NOTE! This is _not_ the same as a "delta" object in the filesystem
  13};
  14
  15struct object_entry {
  16        unsigned char sha1[20];
  17        unsigned long size;
  18        unsigned long offset;
  19        unsigned int depth;
  20        unsigned int flags;
  21        enum object_type type;
  22        unsigned long delta_size;
  23        struct object_entry *delta;
  24};
  25
  26static struct object_entry **sorted_by_sha, **sorted_by_type;
  27static struct object_entry *objects = NULL;
  28static int nr_objects = 0, nr_alloc = 0;
  29static const char *base_name;
  30
  31struct myfile {
  32        int fd;
  33        unsigned long chars;
  34        unsigned char buffer[8192];
  35};
  36
  37static FILE *create_file(const char *suffix)
  38{
  39        static char filename[PATH_MAX];
  40        unsigned len;
  41
  42        len = snprintf(filename, PATH_MAX, "%s.%s", base_name, suffix);
  43        if (len >= PATH_MAX)
  44                die("you wascally wabbit, you");
  45        return fopen(filename, "w");
  46}
  47
  48static unsigned long fwrite_compressed(void *in, unsigned long size, FILE *f)
  49{
  50        z_stream stream;
  51        unsigned long maxsize;
  52        void *out;
  53
  54        memset(&stream, 0, sizeof(stream));
  55        deflateInit(&stream, Z_DEFAULT_COMPRESSION);
  56        maxsize = deflateBound(&stream, size);
  57        out = xmalloc(maxsize);
  58
  59        /* Compress it */
  60        stream.next_in = in;
  61        stream.avail_in = size;
  62
  63        stream.next_out = out;
  64        stream.avail_out = maxsize;
  65
  66        while (deflate(&stream, Z_FINISH) == Z_OK)
  67                /* nothing */;
  68        deflateEnd(&stream);
  69
  70        size = stream.total_out;
  71        fwrite(out, size, 1, f);
  72        free(out);
  73        return size;
  74}
  75
  76static void *delta_against(void *buf, unsigned long size, struct object_entry *entry)
  77{
  78        unsigned long othersize, delta_size;
  79        char type[10];
  80        void *otherbuf = read_sha1_file(entry->delta->sha1, type, &othersize);
  81        void *delta_buf;
  82
  83        if (!otherbuf)
  84                die("unable to read %s", sha1_to_hex(entry->delta->sha1));
  85        delta_buf = diff_delta(otherbuf, othersize,
  86                               buf, size, &delta_size, ~0UL);
  87        if (!delta_buf || delta_size != entry->delta_size)
  88                die("delta size changed");
  89        free(buf);
  90        free(otherbuf);
  91        return delta_buf;
  92}
  93
  94static unsigned long write_object(FILE *f, struct object_entry *entry)
  95{
  96        unsigned long size;
  97        char type[10];
  98        void *buf = read_sha1_file(entry->sha1, type, &size);
  99        char header[25];
 100        unsigned hdrlen, datalen;
 101
 102        if (!buf)
 103                die("unable to read %s", sha1_to_hex(entry->sha1));
 104        if (size != entry->size)
 105                die("object %s size inconsistency (%lu vs %lu)", sha1_to_hex(entry->sha1), size, entry->size);
 106
 107        /*
 108         * The object header is a byte of 'type' followed by four bytes of
 109         * length, except for deltas that has the 20 bytes of delta sha
 110         * instead.
 111         */
 112        header[0] = ".CTB"[entry->type];
 113        hdrlen = 5;
 114        if (entry->delta) {
 115                header[0] = 'D';
 116                memcpy(header+5, entry->delta, 20);
 117                buf = delta_against(buf, size, entry);
 118                size = entry->delta_size;
 119                hdrlen = 25;
 120        }
 121        datalen = htonl(size);
 122        memcpy(header+1, &datalen, 4);
 123        fwrite(header, hdrlen, 1, f);
 124        datalen = fwrite_compressed(buf, size, f);
 125        free(buf);
 126        return hdrlen + datalen;
 127}
 128
 129static void write_pack_file(void)
 130{
 131        int i;
 132        FILE *f = create_file("pack");
 133        unsigned long offset = 0;
 134        unsigned long mb;
 135
 136        for (i = 0; i < nr_objects; i++) {
 137                struct object_entry *entry = objects + i;
 138                entry->offset = offset;
 139                offset += write_object(f, entry);
 140        }
 141        fclose(f);
 142        mb = offset >> 20;
 143        offset &= 0xfffff;
 144}
 145
 146static void write_index_file(void)
 147{
 148        int i;
 149        FILE *f = create_file("idx");
 150        struct object_entry **list = sorted_by_sha;
 151        struct object_entry **last = list + nr_objects;
 152        unsigned int array[256];
 153
 154        /*
 155         * Write the first-level table (the list is sorted,
 156         * but we use a 256-entry lookup to be able to avoid
 157         * having to do eight extra binary search iterations)
 158         */
 159        for (i = 0; i < 256; i++) {
 160                struct object_entry **next = list;
 161                while (next < last) {
 162                        struct object_entry *entry = *next;
 163                        if (entry->sha1[0] != i)
 164                                break;
 165                        next++;
 166                }
 167                array[i] = htonl(next - sorted_by_sha);
 168                list = next;
 169        }
 170        fwrite(array, 256, sizeof(int), f);
 171
 172        /*
 173         * Write the actual SHA1 entries..
 174         */
 175        list = sorted_by_sha;
 176        for (i = 0; i < nr_objects; i++) {
 177                struct object_entry *entry = *list++;
 178                unsigned int offset = htonl(entry->offset);
 179                fwrite(&offset, 4, 1, f);
 180                fwrite(entry->sha1, 20, 1, f);
 181        }
 182        fclose(f);
 183}
 184
 185static void add_object_entry(unsigned char *sha1)
 186{
 187        unsigned int idx = nr_objects;
 188        struct object_entry *entry;
 189
 190        if (idx >= nr_alloc) {
 191                unsigned int needed = (idx + 1024) * 3 / 2;
 192                objects = xrealloc(objects, needed * sizeof(*entry));
 193                nr_alloc = needed;
 194        }
 195        entry = objects + idx;
 196        memset(entry, 0, sizeof(*entry));
 197        memcpy(entry->sha1, sha1, 20);
 198        nr_objects = idx+1;
 199}
 200
 201static void check_object(struct object_entry *entry)
 202{
 203        char buffer[128];
 204        char type[10];
 205        unsigned long mapsize;
 206        z_stream stream;
 207        void *map;
 208
 209        map = map_sha1_file(entry->sha1, &mapsize);
 210        if (!map)
 211                die("unable to map %s", sha1_to_hex(entry->sha1));
 212        if (unpack_sha1_header(&stream, map, mapsize, buffer, sizeof(buffer)) < 0)
 213                die("unable to unpack %s header", sha1_to_hex(entry->sha1));
 214        munmap(map, mapsize);
 215        if (parse_sha1_header(buffer, type, &entry->size) < 0)
 216                die("unable to parse %s header", sha1_to_hex(entry->sha1));
 217        if (!strcmp(type, "commit")) {
 218                entry->type = OBJ_COMMIT;
 219        } else if (!strcmp(type, "tree")) {
 220                entry->type = OBJ_TREE;
 221        } else if (!strcmp(type, "blob")) {
 222                entry->type = OBJ_BLOB;
 223        } else
 224                die("unable to pack object %s of type %s", sha1_to_hex(entry->sha1), type);
 225}
 226
 227static void get_object_details(void)
 228{
 229        int i;
 230        struct object_entry *entry = objects;
 231
 232        for (i = 0; i < nr_objects; i++)
 233                check_object(entry++);
 234}
 235
 236typedef int (*entry_sort_t)(const struct object_entry *, const struct object_entry *);
 237
 238static entry_sort_t current_sort;
 239
 240static int sort_comparator(const void *_a, const void *_b)
 241{
 242        struct object_entry *a = *(struct object_entry **)_a;
 243        struct object_entry *b = *(struct object_entry **)_b;
 244        return current_sort(a,b);
 245}
 246
 247static struct object_entry **create_sorted_list(entry_sort_t sort)
 248{
 249        struct object_entry **list = xmalloc(nr_objects * sizeof(struct object_entry *));
 250        int i;
 251
 252        for (i = 0; i < nr_objects; i++)
 253                list[i] = objects + i;
 254        current_sort = sort;
 255        qsort(list, nr_objects, sizeof(struct object_entry *), sort_comparator);
 256        return list;
 257}
 258
 259static int sha1_sort(const struct object_entry *a, const struct object_entry *b)
 260{
 261        return memcmp(a->sha1, b->sha1, 20);
 262}
 263
 264static int type_size_sort(const struct object_entry *a, const struct object_entry *b)
 265{
 266        if (a->type < b->type)
 267                return -1;
 268        if (a->type > b->type)
 269                return 1;
 270        if (a->size < b->size)
 271                return -1;
 272        if (a->size > b->size)
 273                return 1;
 274        return a < b ? -1 : (a > b);
 275}
 276
 277struct unpacked {
 278        struct object_entry *entry;
 279        void *data;
 280};
 281
 282/*
 283 * We search for deltas _backwards_ in a list sorted by type and
 284 * by size, so that we see progressively smaller and smaller files.
 285 * That's because we prefer deltas to be from the bigger file
 286 * to the smaller - deletes are potentially cheaper, but perhaps
 287 * more importantly, the bigger file is likely the more recent
 288 * one.
 289 */
 290static int try_delta(struct unpacked *cur, struct unpacked *old, unsigned max_depth)
 291{
 292        struct object_entry *cur_entry = cur->entry;
 293        struct object_entry *old_entry = old->entry;
 294        unsigned long size, oldsize, delta_size, sizediff;
 295        long max_size;
 296        void *delta_buf;
 297
 298        /* Don't bother doing diffs between different types */
 299        if (cur_entry->type != old_entry->type)
 300                return -1;
 301
 302        size = cur_entry->size;
 303        if (size < 50)
 304                return -1;
 305        oldsize = old_entry->size;
 306        sizediff = oldsize > size ? oldsize - size : size - oldsize;
 307        if (sizediff > size / 8)
 308                return -1;
 309        if (old_entry->depth >= max_depth)
 310                return 0;
 311
 312        /*
 313         * NOTE!
 314         *
 315         * We always delta from the bigger to the smaller, since that's
 316         * more space-efficient (deletes don't have to say _what_ they
 317         * delete).
 318         */
 319        max_size = size / 2 - 20;
 320        if (cur_entry->delta)
 321                max_size = cur_entry->delta_size-1;
 322        delta_buf = diff_delta(old->data, oldsize,
 323                               cur->data, size, &delta_size, max_size);
 324        if (!delta_buf)
 325                return 0;
 326        cur_entry->delta = old_entry;
 327        cur_entry->delta_size = delta_size;
 328        cur_entry->depth = old_entry->depth + 1;
 329        free(delta_buf);
 330        return 0;
 331}
 332
 333static void find_deltas(struct object_entry **list, int window, int depth)
 334{
 335        int i, idx;
 336        unsigned int array_size = window * sizeof(struct unpacked);
 337        struct unpacked *array = xmalloc(array_size);
 338
 339        memset(array, 0, array_size);
 340        i = nr_objects;
 341        idx = 0;
 342        while (--i >= 0) {
 343                struct object_entry *entry = list[i];
 344                struct unpacked *n = array + idx;
 345                unsigned long size;
 346                char type[10];
 347                int j;
 348
 349                free(n->data);
 350                n->entry = entry;
 351                n->data = read_sha1_file(entry->sha1, type, &size);
 352                if (size != entry->size)
 353                        die("object %s inconsistent object length (%lu vs %lu)", sha1_to_hex(entry->sha1), size, entry->size);
 354                j = window;
 355                while (--j > 0) {
 356                        unsigned int other_idx = idx + j;
 357                        struct unpacked *m;
 358                        if (other_idx >= window)
 359                                other_idx -= window;
 360                        m = array + other_idx;
 361                        if (!m->entry)
 362                                break;
 363                        if (try_delta(n, m, depth) < 0)
 364                                break;
 365                }
 366                idx++;
 367                if (idx >= window)
 368                        idx = 0;
 369        }
 370}
 371
 372int main(int argc, char **argv)
 373{
 374        char line[128];
 375        int window = 10, depth = 10;
 376        int i;
 377
 378        for (i = 1; i < argc; i++) {
 379                const char *arg = argv[i];
 380
 381                if (*arg == '-') {
 382                        if (!strncmp("--window=", arg, 9)) {
 383                                char *end;
 384                                window = strtoul(arg+9, &end, 0);
 385                                if (!arg[9] || *end)
 386                                        usage(pack_usage);
 387                                continue;
 388                        }
 389                        if (!strncmp("--depth=", arg, 8)) {
 390                                char *end;
 391                                depth = strtoul(arg+8, &end, 0);
 392                                if (!arg[8] || *end)
 393                                        usage(pack_usage);
 394                                continue;
 395                        }
 396                        usage(pack_usage);
 397                }
 398                if (base_name)
 399                        usage(pack_usage);
 400                base_name = arg;
 401        }
 402
 403        if (!base_name)
 404                usage(pack_usage);
 405
 406        while (fgets(line, sizeof(line), stdin) != NULL) {
 407                unsigned char sha1[20];
 408                if (get_sha1_hex(line, sha1))
 409                        die("expected sha1, got garbage");
 410                add_object_entry(sha1);
 411        }
 412        get_object_details();
 413
 414        printf("Packing %d objects\n", nr_objects);
 415
 416        sorted_by_sha = create_sorted_list(sha1_sort);
 417        sorted_by_type = create_sorted_list(type_size_sort);
 418        if (window && depth)
 419                find_deltas(sorted_by_type, window+1, depth);
 420
 421        write_pack_file();
 422        write_index_file();
 423        return 0;
 424}