feee5600b9c362ad1436db64e3624f646f064fc3
   1#include <ctype.h>
   2#include "cache.h"
   3#include "object.h"
   4#include "delta.h"
   5#include "pack.h"
   6#include "csum-file.h"
   7
   8static const char pack_usage[] = "git-pack-objects [--window=N] [--depth=N] {--stdout | base-name} < object-list";
   9
  10struct object_entry {
  11        unsigned char sha1[20];
  12        unsigned long size;
  13        unsigned long offset;
  14        unsigned int depth;
  15        unsigned int hash;
  16        enum object_type type;
  17        unsigned long delta_size;
  18        struct object_entry *delta;
  19};
  20
  21static struct object_entry **sorted_by_sha, **sorted_by_type;
  22static struct object_entry *objects = NULL;
  23static int nr_objects = 0, nr_alloc = 0;
  24static const char *base_name;
  25static unsigned char pack_file_sha1[20];
  26
  27static void *delta_against(void *buf, unsigned long size, struct object_entry *entry)
  28{
  29        unsigned long othersize, delta_size;
  30        char type[10];
  31        void *otherbuf = read_sha1_file(entry->delta->sha1, type, &othersize);
  32        void *delta_buf;
  33
  34        if (!otherbuf)
  35                die("unable to read %s", sha1_to_hex(entry->delta->sha1));
  36        delta_buf = diff_delta(otherbuf, othersize,
  37                               buf, size, &delta_size, ~0UL);
  38        if (!delta_buf || delta_size != entry->delta_size)
  39                die("delta size changed");
  40        free(buf);
  41        free(otherbuf);
  42        return delta_buf;
  43}
  44
  45/*
  46 * The per-object header is a pretty dense thing, which is
  47 *  - first byte: low four bits are "size", then three bits of "type",
  48 *    and the high bit is "size continues".
  49 *  - each byte afterwards: low seven bits are size continuation,
  50 *    with the high bit being "size continues"
  51 */
  52static int encode_header(enum object_type type, unsigned long size, unsigned char *hdr)
  53{
  54        int n = 1, i;
  55        unsigned char c;
  56
  57        if (type < OBJ_COMMIT || type > OBJ_DELTA)
  58                die("bad type %d", type);
  59
  60        /*
  61         * Shift the size up by 7 bits at a time,
  62         * until you get bits in the "high four".
  63         * That will be our beginning. We'll have
  64         * four size bits in 28..31, then groups
  65         * of seven in 21..27, 14..20, 7..13 and
  66         * finally 0..6.
  67         */
  68        if (size) {
  69                n = 5;
  70                while (!(size & 0xfe000000)) {
  71                        size <<= 7;
  72                        n--;
  73                }
  74        }
  75        c = (type << 4) | (size >> 28);
  76        for (i = 1; i < n; i++) {
  77                *hdr++ = c | 0x80;
  78                c = (size >> 21) & 0x7f;
  79                size <<= 7;
  80        }
  81        *hdr = c;
  82        return n;
  83}
  84
  85static unsigned long write_object(struct sha1file *f, struct object_entry *entry)
  86{
  87        unsigned long size;
  88        char type[10];
  89        void *buf = read_sha1_file(entry->sha1, type, &size);
  90        unsigned char header[10];
  91        unsigned hdrlen, datalen;
  92        enum object_type obj_type;
  93
  94        if (!buf)
  95                die("unable to read %s", sha1_to_hex(entry->sha1));
  96        if (size != entry->size)
  97                die("object %s size inconsistency (%lu vs %lu)", sha1_to_hex(entry->sha1), size, entry->size);
  98
  99        /*
 100         * The object header is a byte of 'type' followed by zero or
 101         * more bytes of length.  For deltas, the 20 bytes of delta sha1
 102         * follows that.
 103         */
 104        obj_type = entry->type;
 105        if (entry->delta) {
 106                buf = delta_against(buf, size, entry);
 107                size = entry->delta_size;
 108                obj_type = OBJ_DELTA;
 109        }
 110        hdrlen = encode_header(obj_type, size, header);
 111        sha1write(f, header, hdrlen);
 112        if (entry->delta) {
 113                sha1write(f, entry->delta, 20);
 114                hdrlen += 20;
 115        }
 116        datalen = sha1write_compressed(f, buf, size);
 117        free(buf);
 118        return hdrlen + datalen;
 119}
 120
 121static unsigned long write_one(struct sha1file *f,
 122                               struct object_entry *e,
 123                               unsigned long offset)
 124{
 125        if (e->offset)
 126                /* offset starts from header size and cannot be zero
 127                 * if it is written already.
 128                 */
 129                return offset;
 130        e->offset = offset;
 131        offset += write_object(f, e);
 132        /* if we are delitified, write out its base object. */
 133        if (e->delta)
 134                offset = write_one(f, e->delta, offset);
 135        return offset;
 136}
 137
 138static void write_pack_file(void)
 139{
 140        int i;
 141        struct sha1file *f;
 142        unsigned long offset;
 143        unsigned long mb;
 144        struct pack_header hdr;
 145
 146        if (!base_name)
 147                f = sha1fd(1, "<stdout>");
 148        else
 149                f = sha1create("%s.%s", base_name, "pack");
 150        hdr.hdr_signature = htonl(PACK_SIGNATURE);
 151        hdr.hdr_version = htonl(1);
 152        hdr.hdr_entries = htonl(nr_objects);
 153        sha1write(f, &hdr, sizeof(hdr));
 154        offset = sizeof(hdr);
 155        for (i = 0; i < nr_objects; i++)
 156                offset = write_one(f, objects + i, offset);
 157
 158        sha1close(f, pack_file_sha1, 1);
 159        mb = offset >> 20;
 160        offset &= 0xfffff;
 161}
 162
 163static void write_index_file(void)
 164{
 165        int i;
 166        struct sha1file *f = sha1create("%s.%s", base_name, "idx");
 167        struct object_entry **list = sorted_by_sha;
 168        struct object_entry **last = list + nr_objects;
 169        unsigned int array[256];
 170
 171        /*
 172         * Write the first-level table (the list is sorted,
 173         * but we use a 256-entry lookup to be able to avoid
 174         * having to do eight extra binary search iterations).
 175         */
 176        for (i = 0; i < 256; i++) {
 177                struct object_entry **next = list;
 178                while (next < last) {
 179                        struct object_entry *entry = *next;
 180                        if (entry->sha1[0] != i)
 181                                break;
 182                        next++;
 183                }
 184                array[i] = htonl(next - sorted_by_sha);
 185                list = next;
 186        }
 187        sha1write(f, array, 256 * sizeof(int));
 188
 189        /*
 190         * Write the actual SHA1 entries..
 191         */
 192        list = sorted_by_sha;
 193        for (i = 0; i < nr_objects; i++) {
 194                struct object_entry *entry = *list++;
 195                unsigned int offset = htonl(entry->offset);
 196                sha1write(f, &offset, 4);
 197                sha1write(f, entry->sha1, 20);
 198        }
 199        sha1write(f, pack_file_sha1, 20);
 200        sha1close(f, NULL, 1);
 201}
 202
 203static void add_object_entry(unsigned char *sha1, unsigned int hash)
 204{
 205        unsigned int idx = nr_objects;
 206        struct object_entry *entry;
 207
 208        if (idx >= nr_alloc) {
 209                unsigned int needed = (idx + 1024) * 3 / 2;
 210                objects = xrealloc(objects, needed * sizeof(*entry));
 211                nr_alloc = needed;
 212        }
 213        entry = objects + idx;
 214        memset(entry, 0, sizeof(*entry));
 215        memcpy(entry->sha1, sha1, 20);
 216        entry->hash = hash;
 217        nr_objects = idx+1;
 218}
 219
 220static void check_object(struct object_entry *entry)
 221{
 222        char type[20];
 223
 224        if (!sha1_object_info(entry->sha1, type, &entry->size)) {
 225                if (!strcmp(type, "commit")) {
 226                        entry->type = OBJ_COMMIT;
 227                } else if (!strcmp(type, "tree")) {
 228                        entry->type = OBJ_TREE;
 229                } else if (!strcmp(type, "blob")) {
 230                        entry->type = OBJ_BLOB;
 231                } else if (!strcmp(type, "tag")) {
 232                        entry->type = OBJ_TAG;
 233                } else
 234                        die("unable to pack object %s of type %s",
 235                            sha1_to_hex(entry->sha1), type);
 236        }
 237        else
 238                die("unable to get type of object %s",
 239                    sha1_to_hex(entry->sha1));
 240}
 241
 242static void get_object_details(void)
 243{
 244        int i;
 245        struct object_entry *entry = objects;
 246
 247        for (i = 0; i < nr_objects; i++)
 248                check_object(entry++);
 249}
 250
 251typedef int (*entry_sort_t)(const struct object_entry *, const struct object_entry *);
 252
 253static entry_sort_t current_sort;
 254
 255static int sort_comparator(const void *_a, const void *_b)
 256{
 257        struct object_entry *a = *(struct object_entry **)_a;
 258        struct object_entry *b = *(struct object_entry **)_b;
 259        return current_sort(a,b);
 260}
 261
 262static struct object_entry **create_sorted_list(entry_sort_t sort)
 263{
 264        struct object_entry **list = xmalloc(nr_objects * sizeof(struct object_entry *));
 265        int i;
 266
 267        for (i = 0; i < nr_objects; i++)
 268                list[i] = objects + i;
 269        current_sort = sort;
 270        qsort(list, nr_objects, sizeof(struct object_entry *), sort_comparator);
 271        return list;
 272}
 273
 274static int sha1_sort(const struct object_entry *a, const struct object_entry *b)
 275{
 276        return memcmp(a->sha1, b->sha1, 20);
 277}
 278
 279static int type_size_sort(const struct object_entry *a, const struct object_entry *b)
 280{
 281        if (a->type < b->type)
 282                return -1;
 283        if (a->type > b->type)
 284                return 1;
 285        if (a->hash < b->hash)
 286                return -1;
 287        if (a->hash > b->hash)
 288                return 1;
 289        if (a->size < b->size)
 290                return -1;
 291        if (a->size > b->size)
 292                return 1;
 293        return a < b ? -1 : (a > b);
 294}
 295
 296struct unpacked {
 297        struct object_entry *entry;
 298        void *data;
 299};
 300
 301/*
 302 * We search for deltas _backwards_ in a list sorted by type and
 303 * by size, so that we see progressively smaller and smaller files.
 304 * That's because we prefer deltas to be from the bigger file
 305 * to the smaller - deletes are potentially cheaper, but perhaps
 306 * more importantly, the bigger file is likely the more recent
 307 * one.
 308 */
 309static int try_delta(struct unpacked *cur, struct unpacked *old, unsigned max_depth)
 310{
 311        struct object_entry *cur_entry = cur->entry;
 312        struct object_entry *old_entry = old->entry;
 313        unsigned long size, oldsize, delta_size, sizediff;
 314        long max_size;
 315        void *delta_buf;
 316
 317        /* Don't bother doing diffs between different types */
 318        if (cur_entry->type != old_entry->type)
 319                return -1;
 320
 321        size = cur_entry->size;
 322        if (size < 50)
 323                return -1;
 324        oldsize = old_entry->size;
 325        sizediff = oldsize > size ? oldsize - size : size - oldsize;
 326        if (sizediff > size / 8)
 327                return -1;
 328        if (old_entry->depth >= max_depth)
 329                return 0;
 330
 331        /*
 332         * NOTE!
 333         *
 334         * We always delta from the bigger to the smaller, since that's
 335         * more space-efficient (deletes don't have to say _what_ they
 336         * delete).
 337         */
 338        max_size = size / 2 - 20;
 339        if (cur_entry->delta)
 340                max_size = cur_entry->delta_size-1;
 341        if (sizediff >= max_size)
 342                return -1;
 343        delta_buf = diff_delta(old->data, oldsize,
 344                               cur->data, size, &delta_size, max_size);
 345        if (!delta_buf)
 346                return 0;
 347        cur_entry->delta = old_entry;
 348        cur_entry->delta_size = delta_size;
 349        cur_entry->depth = old_entry->depth + 1;
 350        free(delta_buf);
 351        return 0;
 352}
 353
 354static void find_deltas(struct object_entry **list, int window, int depth)
 355{
 356        int i, idx;
 357        unsigned int array_size = window * sizeof(struct unpacked);
 358        struct unpacked *array = xmalloc(array_size);
 359
 360        memset(array, 0, array_size);
 361        i = nr_objects;
 362        idx = 0;
 363        while (--i >= 0) {
 364                struct object_entry *entry = list[i];
 365                struct unpacked *n = array + idx;
 366                unsigned long size;
 367                char type[10];
 368                int j;
 369
 370                free(n->data);
 371                n->entry = entry;
 372                n->data = read_sha1_file(entry->sha1, type, &size);
 373                if (size != entry->size)
 374                        die("object %s inconsistent object length (%lu vs %lu)", sha1_to_hex(entry->sha1), size, entry->size);
 375                j = window;
 376                while (--j > 0) {
 377                        unsigned int other_idx = idx + j;
 378                        struct unpacked *m;
 379                        if (other_idx >= window)
 380                                other_idx -= window;
 381                        m = array + other_idx;
 382                        if (!m->entry)
 383                                break;
 384                        if (try_delta(n, m, depth) < 0)
 385                                break;
 386                }
 387                idx++;
 388                if (idx >= window)
 389                        idx = 0;
 390        }
 391}
 392
 393int main(int argc, char **argv)
 394{
 395        char line[PATH_MAX + 20];
 396        int window = 10, depth = 10, pack_to_stdout = 0;
 397        int i;
 398
 399        for (i = 1; i < argc; i++) {
 400                const char *arg = argv[i];
 401
 402                if (*arg == '-') {
 403                        if (!strncmp("--window=", arg, 9)) {
 404                                char *end;
 405                                window = strtoul(arg+9, &end, 0);
 406                                if (!arg[9] || *end)
 407                                        usage(pack_usage);
 408                                continue;
 409                        }
 410                        if (!strncmp("--depth=", arg, 8)) {
 411                                char *end;
 412                                depth = strtoul(arg+8, &end, 0);
 413                                if (!arg[8] || *end)
 414                                        usage(pack_usage);
 415                                continue;
 416                        }
 417                        if (!strcmp("--stdout", arg)) {
 418                                pack_to_stdout = 1;
 419                                continue;
 420                        }
 421                        usage(pack_usage);
 422                }
 423                if (base_name)
 424                        usage(pack_usage);
 425                base_name = arg;
 426        }
 427
 428        if (pack_to_stdout != !base_name)
 429                usage(pack_usage);
 430
 431        while (fgets(line, sizeof(line), stdin) != NULL) {
 432                unsigned int hash;
 433                char *p;
 434                unsigned char sha1[20];
 435
 436                if (get_sha1_hex(line, sha1))
 437                        die("expected sha1, got garbage");
 438                hash = 0;
 439                p = line+40;
 440                while (*p) {
 441                        unsigned char c = *p++;
 442                        if (isspace(c))
 443                                continue;
 444                        hash = hash * 11 + c;
 445                }
 446                add_object_entry(sha1, hash);
 447        }
 448        get_object_details();
 449
 450        fprintf(stderr, "Packing %d objects\n", nr_objects);
 451
 452        sorted_by_sha = create_sorted_list(sha1_sort);
 453        sorted_by_type = create_sorted_list(type_size_sort);
 454        if (window && depth)
 455                find_deltas(sorted_by_type, window+1, depth);
 456
 457        write_pack_file();
 458        if (!pack_to_stdout)
 459                write_index_file();
 460        return 0;
 461}