pack-objects.con commit Use a *real* built-in diff generator (3443546)
   1#include "cache.h"
   2#include "object.h"
   3#include "delta.h"
   4#include "pack.h"
   5#include "csum-file.h"
   6#include "diff.h"
   7#include <sys/time.h>
   8#include <signal.h>
   9
  10static const char pack_usage[] = "git-pack-objects [-q] [--no-reuse-delta] [--non-empty] [--local] [--incremental] [--window=N] [--depth=N] {--stdout | base-name} < object-list";
  11
  12struct object_entry {
  13        unsigned char sha1[20];
  14        unsigned long size;     /* uncompressed size */
  15        unsigned long offset;   /* offset into the final pack file;
  16                                 * nonzero if already written.
  17                                 */
  18        unsigned int depth;     /* delta depth */
  19        unsigned int delta_limit;       /* base adjustment for in-pack delta */
  20        unsigned int hash;      /* name hint hash */
  21        enum object_type type;
  22        enum object_type in_pack_type;  /* could be delta */
  23        unsigned long delta_size;       /* delta data size (uncompressed) */
  24        struct object_entry *delta;     /* delta base object */
  25        struct packed_git *in_pack;     /* already in pack */
  26        unsigned int in_pack_offset;
  27        struct object_entry *delta_child; /* delitified objects who bases me */
  28        struct object_entry *delta_sibling; /* other deltified objects who
  29                                             * uses the same base as me
  30                                             */
  31        int preferred_base;     /* we do not pack this, but is encouraged to
  32                                 * be used as the base objectto delta huge
  33                                 * objects against.
  34                                 */
  35};
  36
  37/*
  38 * Objects we are going to pack are colected in objects array (dynamically
  39 * expanded).  nr_objects & nr_alloc controls this array.  They are stored
  40 * in the order we see -- typically rev-list --objects order that gives us
  41 * nice "minimum seek" order.
  42 *
  43 * sorted-by-sha ans sorted-by-type are arrays of pointers that point at
  44 * elements in the objects array.  The former is used to build the pack
  45 * index (lists object names in the ascending order to help offset lookup),
  46 * and the latter is used to group similar things together by try_delta()
  47 * heuristics.
  48 */
  49
  50static unsigned char object_list_sha1[20];
  51static int non_empty = 0;
  52static int no_reuse_delta = 0;
  53static int local = 0;
  54static int incremental = 0;
  55static struct object_entry **sorted_by_sha, **sorted_by_type;
  56static struct object_entry *objects = NULL;
  57static int nr_objects = 0, nr_alloc = 0, nr_result = 0;
  58static const char *base_name;
  59static unsigned char pack_file_sha1[20];
  60static int progress = 1;
  61static volatile int progress_update = 0;
  62
  63/*
  64 * The object names in objects array are hashed with this hashtable,
  65 * to help looking up the entry by object name.  Binary search from
  66 * sorted_by_sha is also possible but this was easier to code and faster.
  67 * This hashtable is built after all the objects are seen.
  68 */
  69static int *object_ix = NULL;
  70static int object_ix_hashsz = 0;
  71
  72/*
  73 * Pack index for existing packs give us easy access to the offsets into
  74 * corresponding pack file where each object's data starts, but the entries
  75 * do not store the size of the compressed representation (uncompressed
  76 * size is easily available by examining the pack entry header).  We build
  77 * a hashtable of existing packs (pack_revindex), and keep reverse index
  78 * here -- pack index file is sorted by object name mapping to offset; this
  79 * pack_revindex[].revindex array is an ordered list of offsets, so if you
  80 * know the offset of an object, next offset is where its packed
  81 * representation ends.
  82 */
  83struct pack_revindex {
  84        struct packed_git *p;
  85        unsigned long *revindex;
  86} *pack_revindex = NULL;
  87static int pack_revindex_hashsz = 0;
  88
  89/*
  90 * stats
  91 */
  92static int written = 0;
  93static int written_delta = 0;
  94static int reused = 0;
  95static int reused_delta = 0;
  96
  97static int pack_revindex_ix(struct packed_git *p)
  98{
  99        unsigned long ui = (unsigned long)p;
 100        int i;
 101
 102        ui = ui ^ (ui >> 16); /* defeat structure alignment */
 103        i = (int)(ui % pack_revindex_hashsz);
 104        while (pack_revindex[i].p) {
 105                if (pack_revindex[i].p == p)
 106                        return i;
 107                if (++i == pack_revindex_hashsz)
 108                        i = 0;
 109        }
 110        return -1 - i;
 111}
 112
 113static void prepare_pack_ix(void)
 114{
 115        int num;
 116        struct packed_git *p;
 117        for (num = 0, p = packed_git; p; p = p->next)
 118                num++;
 119        if (!num)
 120                return;
 121        pack_revindex_hashsz = num * 11;
 122        pack_revindex = xcalloc(sizeof(*pack_revindex), pack_revindex_hashsz);
 123        for (p = packed_git; p; p = p->next) {
 124                num = pack_revindex_ix(p);
 125                num = - 1 - num;
 126                pack_revindex[num].p = p;
 127        }
 128        /* revindex elements are lazily initialized */
 129}
 130
 131static int cmp_offset(const void *a_, const void *b_)
 132{
 133        unsigned long a = *(unsigned long *) a_;
 134        unsigned long b = *(unsigned long *) b_;
 135        if (a < b)
 136                return -1;
 137        else if (a == b)
 138                return 0;
 139        else
 140                return 1;
 141}
 142
 143/*
 144 * Ordered list of offsets of objects in the pack.
 145 */
 146static void prepare_pack_revindex(struct pack_revindex *rix)
 147{
 148        struct packed_git *p = rix->p;
 149        int num_ent = num_packed_objects(p);
 150        int i;
 151        void *index = p->index_base + 256;
 152
 153        rix->revindex = xmalloc(sizeof(unsigned long) * (num_ent + 1));
 154        for (i = 0; i < num_ent; i++) {
 155                long hl = *((long *)(index + 24 * i));
 156                rix->revindex[i] = ntohl(hl);
 157        }
 158        /* This knows the pack format -- the 20-byte trailer
 159         * follows immediately after the last object data.
 160         */
 161        rix->revindex[num_ent] = p->pack_size - 20;
 162        qsort(rix->revindex, num_ent, sizeof(unsigned long), cmp_offset);
 163}
 164
 165static unsigned long find_packed_object_size(struct packed_git *p,
 166                                             unsigned long ofs)
 167{
 168        int num;
 169        int lo, hi;
 170        struct pack_revindex *rix;
 171        unsigned long *revindex;
 172        num = pack_revindex_ix(p);
 173        if (num < 0)
 174                die("internal error: pack revindex uninitialized");
 175        rix = &pack_revindex[num];
 176        if (!rix->revindex)
 177                prepare_pack_revindex(rix);
 178        revindex = rix->revindex;
 179        lo = 0;
 180        hi = num_packed_objects(p) + 1;
 181        do {
 182                int mi = (lo + hi) / 2;
 183                if (revindex[mi] == ofs) {
 184                        return revindex[mi+1] - ofs;
 185                }
 186                else if (ofs < revindex[mi])
 187                        hi = mi;
 188                else
 189                        lo = mi + 1;
 190        } while (lo < hi);
 191        die("internal error: pack revindex corrupt");
 192}
 193
 194static void *delta_against(void *buf, unsigned long size, struct object_entry *entry)
 195{
 196        unsigned long othersize, delta_size;
 197        char type[10];
 198        void *otherbuf = read_sha1_file(entry->delta->sha1, type, &othersize);
 199        void *delta_buf;
 200
 201        if (!otherbuf)
 202                die("unable to read %s", sha1_to_hex(entry->delta->sha1));
 203        delta_buf = diff_delta(otherbuf, othersize,
 204                               buf, size, &delta_size, 0);
 205        if (!delta_buf || delta_size != entry->delta_size)
 206                die("delta size changed");
 207        free(buf);
 208        free(otherbuf);
 209        return delta_buf;
 210}
 211
 212/*
 213 * The per-object header is a pretty dense thing, which is
 214 *  - first byte: low four bits are "size", then three bits of "type",
 215 *    and the high bit is "size continues".
 216 *  - each byte afterwards: low seven bits are size continuation,
 217 *    with the high bit being "size continues"
 218 */
 219static int encode_header(enum object_type type, unsigned long size, unsigned char *hdr)
 220{
 221        int n = 1;
 222        unsigned char c;
 223
 224        if (type < OBJ_COMMIT || type > OBJ_DELTA)
 225                die("bad type %d", type);
 226
 227        c = (type << 4) | (size & 15);
 228        size >>= 4;
 229        while (size) {
 230                *hdr++ = c | 0x80;
 231                c = size & 0x7f;
 232                size >>= 7;
 233                n++;
 234        }
 235        *hdr = c;
 236        return n;
 237}
 238
 239static unsigned long write_object(struct sha1file *f,
 240                                  struct object_entry *entry)
 241{
 242        unsigned long size;
 243        char type[10];
 244        void *buf;
 245        unsigned char header[10];
 246        unsigned hdrlen, datalen;
 247        enum object_type obj_type;
 248        int to_reuse = 0;
 249
 250        if (entry->preferred_base)
 251                return 0;
 252
 253        obj_type = entry->type;
 254        if (! entry->in_pack)
 255                to_reuse = 0;   /* can't reuse what we don't have */
 256        else if (obj_type == OBJ_DELTA)
 257                to_reuse = 1;   /* check_object() decided it for us */
 258        else if (obj_type != entry->in_pack_type)
 259                to_reuse = 0;   /* pack has delta which is unusable */
 260        else if (entry->delta)
 261                to_reuse = 0;   /* we want to pack afresh */
 262        else
 263                to_reuse = 1;   /* we have it in-pack undeltified,
 264                                 * and we do not need to deltify it.
 265                                 */
 266
 267        if (! to_reuse) {
 268                buf = read_sha1_file(entry->sha1, type, &size);
 269                if (!buf)
 270                        die("unable to read %s", sha1_to_hex(entry->sha1));
 271                if (size != entry->size)
 272                        die("object %s size inconsistency (%lu vs %lu)",
 273                            sha1_to_hex(entry->sha1), size, entry->size);
 274                if (entry->delta) {
 275                        buf = delta_against(buf, size, entry);
 276                        size = entry->delta_size;
 277                        obj_type = OBJ_DELTA;
 278                }
 279                /*
 280                 * The object header is a byte of 'type' followed by zero or
 281                 * more bytes of length.  For deltas, the 20 bytes of delta
 282                 * sha1 follows that.
 283                 */
 284                hdrlen = encode_header(obj_type, size, header);
 285                sha1write(f, header, hdrlen);
 286
 287                if (entry->delta) {
 288                        sha1write(f, entry->delta, 20);
 289                        hdrlen += 20;
 290                }
 291                datalen = sha1write_compressed(f, buf, size);
 292                free(buf);
 293        }
 294        else {
 295                struct packed_git *p = entry->in_pack;
 296                use_packed_git(p);
 297
 298                datalen = find_packed_object_size(p, entry->in_pack_offset);
 299                buf = p->pack_base + entry->in_pack_offset;
 300                sha1write(f, buf, datalen);
 301                unuse_packed_git(p);
 302                hdrlen = 0; /* not really */
 303                if (obj_type == OBJ_DELTA)
 304                        reused_delta++;
 305                reused++;
 306        }
 307        if (obj_type == OBJ_DELTA)
 308                written_delta++;
 309        written++;
 310        return hdrlen + datalen;
 311}
 312
 313static unsigned long write_one(struct sha1file *f,
 314                               struct object_entry *e,
 315                               unsigned long offset)
 316{
 317        if (e->offset)
 318                /* offset starts from header size and cannot be zero
 319                 * if it is written already.
 320                 */
 321                return offset;
 322        e->offset = offset;
 323        offset += write_object(f, e);
 324        /* if we are deltified, write out its base object. */
 325        if (e->delta)
 326                offset = write_one(f, e->delta, offset);
 327        return offset;
 328}
 329
 330static void write_pack_file(void)
 331{
 332        int i;
 333        struct sha1file *f;
 334        unsigned long offset;
 335        struct pack_header hdr;
 336        unsigned last_percent = 999;
 337        int do_progress = 0;
 338
 339        if (!base_name)
 340                f = sha1fd(1, "<stdout>");
 341        else {
 342                f = sha1create("%s-%s.%s", base_name,
 343                               sha1_to_hex(object_list_sha1), "pack");
 344                do_progress = progress;
 345        }
 346        if (do_progress)
 347                fprintf(stderr, "Writing %d objects.\n", nr_result);
 348
 349        hdr.hdr_signature = htonl(PACK_SIGNATURE);
 350        hdr.hdr_version = htonl(PACK_VERSION);
 351        hdr.hdr_entries = htonl(nr_result);
 352        sha1write(f, &hdr, sizeof(hdr));
 353        offset = sizeof(hdr);
 354        if (!nr_result)
 355                goto done;
 356        for (i = 0; i < nr_objects; i++) {
 357                offset = write_one(f, objects + i, offset);
 358                if (do_progress) {
 359                        unsigned percent = written * 100 / nr_result;
 360                        if (progress_update || percent != last_percent) {
 361                                fprintf(stderr, "%4u%% (%u/%u) done\r",
 362                                        percent, written, nr_result);
 363                                progress_update = 0;
 364                                last_percent = percent;
 365                        }
 366                }
 367        }
 368        if (do_progress)
 369                fputc('\n', stderr);
 370 done:
 371        sha1close(f, pack_file_sha1, 1);
 372}
 373
 374static void write_index_file(void)
 375{
 376        int i;
 377        struct sha1file *f = sha1create("%s-%s.%s", base_name,
 378                                        sha1_to_hex(object_list_sha1), "idx");
 379        struct object_entry **list = sorted_by_sha;
 380        struct object_entry **last = list + nr_result;
 381        unsigned int array[256];
 382
 383        /*
 384         * Write the first-level table (the list is sorted,
 385         * but we use a 256-entry lookup to be able to avoid
 386         * having to do eight extra binary search iterations).
 387         */
 388        for (i = 0; i < 256; i++) {
 389                struct object_entry **next = list;
 390                while (next < last) {
 391                        struct object_entry *entry = *next;
 392                        if (entry->sha1[0] != i)
 393                                break;
 394                        next++;
 395                }
 396                array[i] = htonl(next - sorted_by_sha);
 397                list = next;
 398        }
 399        sha1write(f, array, 256 * sizeof(int));
 400
 401        /*
 402         * Write the actual SHA1 entries..
 403         */
 404        list = sorted_by_sha;
 405        for (i = 0; i < nr_result; i++) {
 406                struct object_entry *entry = *list++;
 407                unsigned int offset = htonl(entry->offset);
 408                sha1write(f, &offset, 4);
 409                sha1write(f, entry->sha1, 20);
 410        }
 411        sha1write(f, pack_file_sha1, 20);
 412        sha1close(f, NULL, 1);
 413}
 414
 415static int locate_object_entry_hash(const unsigned char *sha1)
 416{
 417        int i;
 418        unsigned int ui;
 419        memcpy(&ui, sha1, sizeof(unsigned int));
 420        i = ui % object_ix_hashsz;
 421        while (0 < object_ix[i]) {
 422                if (!memcmp(sha1, objects[object_ix[i]-1].sha1, 20))
 423                        return i;
 424                if (++i == object_ix_hashsz)
 425                        i = 0;
 426        }
 427        return -1 - i;
 428}
 429
 430static struct object_entry *locate_object_entry(const unsigned char *sha1)
 431{
 432        int i;
 433
 434        if (!object_ix_hashsz)
 435                return NULL;
 436
 437        i = locate_object_entry_hash(sha1);
 438        if (0 <= i)
 439                return &objects[object_ix[i]-1];
 440        return NULL;
 441}
 442
 443static void rehash_objects(void)
 444{
 445        int i;
 446        struct object_entry *oe;
 447
 448        object_ix_hashsz = nr_objects * 3;
 449        if (object_ix_hashsz < 1024)
 450                object_ix_hashsz = 1024;
 451        object_ix = xrealloc(object_ix, sizeof(int) * object_ix_hashsz);
 452        object_ix = memset(object_ix, 0, sizeof(int) * object_ix_hashsz);
 453        for (i = 0, oe = objects; i < nr_objects; i++, oe++) {
 454                int ix = locate_object_entry_hash(oe->sha1);
 455                if (0 <= ix)
 456                        continue;
 457                ix = -1 - ix;
 458                object_ix[ix] = i + 1;
 459        }
 460}
 461
 462struct name_path {
 463        struct name_path *up;
 464        const char *elem;
 465        int len;
 466};
 467
 468#define DIRBITS 12
 469
 470static unsigned name_hash(struct name_path *path, const char *name)
 471{
 472        struct name_path *p = path;
 473        const char *n = name + strlen(name);
 474        unsigned hash = 0, name_hash = 0, name_done = 0;
 475
 476        if (n != name && n[-1] == '\n')
 477                n--;
 478        while (name <= --n) {
 479                unsigned char c = *n;
 480                if (c == '/' && !name_done) {
 481                        name_hash = hash;
 482                        name_done = 1;
 483                        hash = 0;
 484                }
 485                hash = hash * 11 + c;
 486        }
 487        if (!name_done) {
 488                name_hash = hash;
 489                hash = 0;
 490        }
 491        for (p = path; p; p = p->up) {
 492                hash = hash * 11 + '/';
 493                n = p->elem + p->len;
 494                while (p->elem <= --n) {
 495                        unsigned char c = *n;
 496                        hash = hash * 11 + c;
 497                }
 498        }
 499        /*
 500         * Make sure "Makefile" and "t/Makefile" are hashed separately
 501         * but close enough.
 502         */
 503        hash = (name_hash<<DIRBITS) | (hash & ((1U<<DIRBITS )-1));
 504
 505        if (0) { /* debug */
 506                n = name + strlen(name);
 507                if (n != name && n[-1] == '\n')
 508                        n--;
 509                while (name <= --n)
 510                        fputc(*n, stderr);
 511                for (p = path; p; p = p->up) {
 512                        fputc('/', stderr);
 513                        n = p->elem + p->len;
 514                        while (p->elem <= --n)
 515                                fputc(*n, stderr);
 516                }
 517                fprintf(stderr, "\t%08x\n", hash);
 518        }
 519        return hash;
 520}
 521
 522static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclude)
 523{
 524        unsigned int idx = nr_objects;
 525        struct object_entry *entry;
 526        struct packed_git *p;
 527        unsigned int found_offset = 0;
 528        struct packed_git *found_pack = NULL;
 529        int ix, status = 0;
 530
 531        if (!exclude) {
 532                for (p = packed_git; p; p = p->next) {
 533                        struct pack_entry e;
 534                        if (find_pack_entry_one(sha1, &e, p)) {
 535                                if (incremental)
 536                                        return 0;
 537                                if (local && !p->pack_local)
 538                                        return 0;
 539                                if (!found_pack) {
 540                                        found_offset = e.offset;
 541                                        found_pack = e.p;
 542                                }
 543                        }
 544                }
 545        }
 546        if ((entry = locate_object_entry(sha1)) != NULL)
 547                goto already_added;
 548
 549        if (idx >= nr_alloc) {
 550                unsigned int needed = (idx + 1024) * 3 / 2;
 551                objects = xrealloc(objects, needed * sizeof(*entry));
 552                nr_alloc = needed;
 553        }
 554        entry = objects + idx;
 555        nr_objects = idx + 1;
 556        memset(entry, 0, sizeof(*entry));
 557        memcpy(entry->sha1, sha1, 20);
 558        entry->hash = hash;
 559
 560        if (object_ix_hashsz * 3 <= nr_objects * 4)
 561                rehash_objects();
 562        else {
 563                ix = locate_object_entry_hash(entry->sha1);
 564                if (0 <= ix)
 565                        die("internal error in object hashing.");
 566                object_ix[-1 - ix] = idx + 1;
 567        }
 568        status = 1;
 569
 570 already_added:
 571        if (progress_update) {
 572                fprintf(stderr, "Counting objects...%d\r", nr_objects);
 573                progress_update = 0;
 574        }
 575        if (exclude)
 576                entry->preferred_base = 1;
 577        else {
 578                if (found_pack) {
 579                        entry->in_pack = found_pack;
 580                        entry->in_pack_offset = found_offset;
 581                }
 582        }
 583        return status;
 584}
 585
 586static void add_pbase_tree(struct tree_desc *tree, struct name_path *up)
 587{
 588        while (tree->size) {
 589                const unsigned char *sha1;
 590                const char *name;
 591                unsigned mode, hash;
 592                unsigned long size;
 593                char type[20];
 594
 595                sha1 = tree_entry_extract(tree, &name, &mode);
 596                update_tree_entry(tree);
 597                if (!has_sha1_file(sha1))
 598                        continue;
 599                if (sha1_object_info(sha1, type, &size))
 600                        continue;
 601
 602                hash = name_hash(up, name);
 603                if (!add_object_entry(sha1, hash, 1))
 604                        continue;
 605
 606                if (!strcmp(type, "tree")) {
 607                        struct tree_desc sub;
 608                        void *elem;
 609                        struct name_path me;
 610
 611                        elem = read_sha1_file(sha1, type, &sub.size);
 612                        sub.buf = elem;
 613                        if (sub.buf) {
 614                                me.up = up;
 615                                me.elem = name;
 616                                me.len = strlen(name);
 617                                add_pbase_tree(&sub, &me);
 618                                free(elem);
 619                        }
 620                }
 621        }
 622}
 623
 624static void add_preferred_base(unsigned char *sha1)
 625{
 626        struct tree_desc tree;
 627        void *elem;
 628
 629        elem = read_object_with_reference(sha1, "tree", &tree.size, NULL);
 630        tree.buf = elem;
 631        if (!tree.buf)
 632                return;
 633        if (add_object_entry(sha1, name_hash(NULL, ""), 1))
 634                add_pbase_tree(&tree, NULL);
 635        free(elem);
 636}
 637
 638static void check_object(struct object_entry *entry)
 639{
 640        char type[20];
 641
 642        if (entry->in_pack && !entry->preferred_base) {
 643                unsigned char base[20];
 644                unsigned long size;
 645                struct object_entry *base_entry;
 646
 647                /* We want in_pack_type even if we do not reuse delta.
 648                 * There is no point not reusing non-delta representations.
 649                 */
 650                check_reuse_pack_delta(entry->in_pack,
 651                                       entry->in_pack_offset,
 652                                       base, &size,
 653                                       &entry->in_pack_type);
 654
 655                /* Check if it is delta, and the base is also an object
 656                 * we are going to pack.  If so we will reuse the existing
 657                 * delta.
 658                 */
 659                if (!no_reuse_delta &&
 660                    entry->in_pack_type == OBJ_DELTA &&
 661                    (base_entry = locate_object_entry(base)) &&
 662                    (!base_entry->preferred_base)) {
 663
 664                        /* Depth value does not matter - find_deltas()
 665                         * will never consider reused delta as the
 666                         * base object to deltify other objects
 667                         * against, in order to avoid circular deltas.
 668                         */
 669
 670                        /* uncompressed size of the delta data */
 671                        entry->size = entry->delta_size = size;
 672                        entry->delta = base_entry;
 673                        entry->type = OBJ_DELTA;
 674
 675                        entry->delta_sibling = base_entry->delta_child;
 676                        base_entry->delta_child = entry;
 677
 678                        return;
 679                }
 680                /* Otherwise we would do the usual */
 681        }
 682
 683        if (sha1_object_info(entry->sha1, type, &entry->size))
 684                die("unable to get type of object %s",
 685                    sha1_to_hex(entry->sha1));
 686
 687        if (!strcmp(type, "commit")) {
 688                entry->type = OBJ_COMMIT;
 689        } else if (!strcmp(type, "tree")) {
 690                entry->type = OBJ_TREE;
 691        } else if (!strcmp(type, "blob")) {
 692                entry->type = OBJ_BLOB;
 693        } else if (!strcmp(type, "tag")) {
 694                entry->type = OBJ_TAG;
 695        } else
 696                die("unable to pack object %s of type %s",
 697                    sha1_to_hex(entry->sha1), type);
 698}
 699
 700static unsigned int check_delta_limit(struct object_entry *me, unsigned int n)
 701{
 702        struct object_entry *child = me->delta_child;
 703        unsigned int m = n;
 704        while (child) {
 705                unsigned int c = check_delta_limit(child, n + 1);
 706                if (m < c)
 707                        m = c;
 708                child = child->delta_sibling;
 709        }
 710        return m;
 711}
 712
 713static void get_object_details(void)
 714{
 715        int i;
 716        struct object_entry *entry;
 717
 718        prepare_pack_ix();
 719        for (i = 0, entry = objects; i < nr_objects; i++, entry++)
 720                check_object(entry);
 721
 722        if (nr_objects == nr_result) {
 723                /*
 724                 * Depth of objects that depend on the entry -- this
 725                 * is subtracted from depth-max to break too deep
 726                 * delta chain because of delta data reusing.
 727                 * However, we loosen this restriction when we know we
 728                 * are creating a thin pack -- it will have to be
 729                 * expanded on the other end anyway, so do not
 730                 * artificially cut the delta chain and let it go as
 731                 * deep as it wants.
 732                 */
 733                for (i = 0, entry = objects; i < nr_objects; i++, entry++)
 734                        if (!entry->delta && entry->delta_child)
 735                                entry->delta_limit =
 736                                        check_delta_limit(entry, 1);
 737        }
 738}
 739
 740typedef int (*entry_sort_t)(const struct object_entry *, const struct object_entry *);
 741
 742static entry_sort_t current_sort;
 743
 744static int sort_comparator(const void *_a, const void *_b)
 745{
 746        struct object_entry *a = *(struct object_entry **)_a;
 747        struct object_entry *b = *(struct object_entry **)_b;
 748        return current_sort(a,b);
 749}
 750
 751static struct object_entry **create_sorted_list(entry_sort_t sort)
 752{
 753        struct object_entry **list = xmalloc(nr_objects * sizeof(struct object_entry *));
 754        int i;
 755
 756        for (i = 0; i < nr_objects; i++)
 757                list[i] = objects + i;
 758        current_sort = sort;
 759        qsort(list, nr_objects, sizeof(struct object_entry *), sort_comparator);
 760        return list;
 761}
 762
 763static int sha1_sort(const struct object_entry *a, const struct object_entry *b)
 764{
 765        return memcmp(a->sha1, b->sha1, 20);
 766}
 767
 768static struct object_entry **create_final_object_list(void)
 769{
 770        struct object_entry **list;
 771        int i, j;
 772
 773        for (i = nr_result = 0; i < nr_objects; i++)
 774                if (!objects[i].preferred_base)
 775                        nr_result++;
 776        list = xmalloc(nr_result * sizeof(struct object_entry *));
 777        for (i = j = 0; i < nr_objects; i++) {
 778                if (!objects[i].preferred_base)
 779                        list[j++] = objects + i;
 780        }
 781        current_sort = sha1_sort;
 782        qsort(list, nr_result, sizeof(struct object_entry *), sort_comparator);
 783        return list;
 784}
 785
 786static int type_size_sort(const struct object_entry *a, const struct object_entry *b)
 787{
 788        if (a->type < b->type)
 789                return -1;
 790        if (a->type > b->type)
 791                return 1;
 792        if (a->hash < b->hash)
 793                return -1;
 794        if (a->hash > b->hash)
 795                return 1;
 796        if (a->preferred_base < b->preferred_base)
 797                return -1;
 798        if (a->preferred_base > b->preferred_base)
 799                return 1;
 800        if (a->size < b->size)
 801                return -1;
 802        if (a->size > b->size)
 803                return 1;
 804        return a < b ? -1 : (a > b);
 805}
 806
 807struct unpacked {
 808        struct object_entry *entry;
 809        void *data;
 810};
 811
 812/*
 813 * We search for deltas _backwards_ in a list sorted by type and
 814 * by size, so that we see progressively smaller and smaller files.
 815 * That's because we prefer deltas to be from the bigger file
 816 * to the smaller - deletes are potentially cheaper, but perhaps
 817 * more importantly, the bigger file is likely the more recent
 818 * one.
 819 */
 820static int try_delta(struct unpacked *cur, struct unpacked *old, unsigned max_depth)
 821{
 822        struct object_entry *cur_entry = cur->entry;
 823        struct object_entry *old_entry = old->entry;
 824        unsigned long size, oldsize, delta_size, sizediff;
 825        long max_size;
 826        void *delta_buf;
 827
 828        /* Don't bother doing diffs between different types */
 829        if (cur_entry->type != old_entry->type)
 830                return -1;
 831
 832        /* We do not compute delta to *create* objects we are not
 833         * going to pack.
 834         */
 835        if (cur_entry->preferred_base)
 836                return -1;
 837
 838        /* If the current object is at pack edge, take the depth the
 839         * objects that depend on the current object into account --
 840         * otherwise they would become too deep.
 841         */
 842        if (cur_entry->delta_child) {
 843                if (max_depth <= cur_entry->delta_limit)
 844                        return 0;
 845                max_depth -= cur_entry->delta_limit;
 846        }
 847
 848        size = cur_entry->size;
 849        oldsize = old_entry->size;
 850        sizediff = oldsize > size ? oldsize - size : size - oldsize;
 851
 852        if (size < 50)
 853                return -1;
 854        if (old_entry->depth >= max_depth)
 855                return 0;
 856
 857        /*
 858         * NOTE!
 859         *
 860         * We always delta from the bigger to the smaller, since that's
 861         * more space-efficient (deletes don't have to say _what_ they
 862         * delete).
 863         */
 864        max_size = size / 2 - 20;
 865        if (cur_entry->delta)
 866                max_size = cur_entry->delta_size-1;
 867        if (sizediff >= max_size)
 868                return -1;
 869        delta_buf = diff_delta(old->data, oldsize,
 870                               cur->data, size, &delta_size, max_size);
 871        if (!delta_buf)
 872                return 0;
 873        cur_entry->delta = old_entry;
 874        cur_entry->delta_size = delta_size;
 875        cur_entry->depth = old_entry->depth + 1;
 876        free(delta_buf);
 877        return 0;
 878}
 879
 880static void progress_interval(int signum)
 881{
 882        signal(SIGALRM, progress_interval);
 883        progress_update = 1;
 884}
 885
 886static void find_deltas(struct object_entry **list, int window, int depth)
 887{
 888        int i, idx;
 889        unsigned int array_size = window * sizeof(struct unpacked);
 890        struct unpacked *array = xmalloc(array_size);
 891        unsigned processed = 0;
 892        unsigned last_percent = 999;
 893
 894        memset(array, 0, array_size);
 895        i = nr_objects;
 896        idx = 0;
 897        if (progress)
 898                fprintf(stderr, "Deltifying %d objects.\n", nr_result);
 899
 900        while (--i >= 0) {
 901                struct object_entry *entry = list[i];
 902                struct unpacked *n = array + idx;
 903                unsigned long size;
 904                char type[10];
 905                int j;
 906
 907                if (!entry->preferred_base)
 908                        processed++;
 909
 910                if (progress) {
 911                        unsigned percent = processed * 100 / nr_result;
 912                        if (percent != last_percent || progress_update) {
 913                                fprintf(stderr, "%4u%% (%u/%u) done\r",
 914                                        percent, processed, nr_result);
 915                                progress_update = 0;
 916                                last_percent = percent;
 917                        }
 918                }
 919
 920                if (entry->delta)
 921                        /* This happens if we decided to reuse existing
 922                         * delta from a pack.  "!no_reuse_delta &&" is implied.
 923                         */
 924                        continue;
 925
 926                free(n->data);
 927                n->entry = entry;
 928                n->data = read_sha1_file(entry->sha1, type, &size);
 929                if (size != entry->size)
 930                        die("object %s inconsistent object length (%lu vs %lu)", sha1_to_hex(entry->sha1), size, entry->size);
 931
 932                j = window;
 933                while (--j > 0) {
 934                        unsigned int other_idx = idx + j;
 935                        struct unpacked *m;
 936                        if (other_idx >= window)
 937                                other_idx -= window;
 938                        m = array + other_idx;
 939                        if (!m->entry)
 940                                break;
 941                        if (try_delta(n, m, depth) < 0)
 942                                break;
 943                }
 944#if 0
 945                /* if we made n a delta, and if n is already at max
 946                 * depth, leaving it in the window is pointless.  we
 947                 * should evict it first.
 948                 * ... in theory only; somehow this makes things worse.
 949                 */
 950                if (entry->delta && depth <= entry->depth)
 951                        continue;
 952#endif
 953                idx++;
 954                if (idx >= window)
 955                        idx = 0;
 956        }
 957
 958        if (progress)
 959                fputc('\n', stderr);
 960
 961        for (i = 0; i < window; ++i)
 962                free(array[i].data);
 963        free(array);
 964}
 965
 966static void prepare_pack(int window, int depth)
 967{
 968        get_object_details();
 969        sorted_by_type = create_sorted_list(type_size_sort);
 970        if (window && depth)
 971                find_deltas(sorted_by_type, window+1, depth);
 972}
 973
 974static int reuse_cached_pack(unsigned char *sha1, int pack_to_stdout)
 975{
 976        static const char cache[] = "pack-cache/pack-%s.%s";
 977        char *cached_pack, *cached_idx;
 978        int ifd, ofd, ifd_ix = -1;
 979
 980        cached_pack = git_path(cache, sha1_to_hex(sha1), "pack");
 981        ifd = open(cached_pack, O_RDONLY);
 982        if (ifd < 0)
 983                return 0;
 984
 985        if (!pack_to_stdout) {
 986                cached_idx = git_path(cache, sha1_to_hex(sha1), "idx");
 987                ifd_ix = open(cached_idx, O_RDONLY);
 988                if (ifd_ix < 0) {
 989                        close(ifd);
 990                        return 0;
 991                }
 992        }
 993
 994        if (progress)
 995                fprintf(stderr, "Reusing %d objects pack %s\n", nr_objects,
 996                        sha1_to_hex(sha1));
 997
 998        if (pack_to_stdout) {
 999                if (copy_fd(ifd, 1))
1000                        exit(1);
1001                close(ifd);
1002        }
1003        else {
1004                char name[PATH_MAX];
1005                snprintf(name, sizeof(name),
1006                         "%s-%s.%s", base_name, sha1_to_hex(sha1), "pack");
1007                ofd = open(name, O_CREAT | O_EXCL | O_WRONLY, 0666);
1008                if (ofd < 0)
1009                        die("unable to open %s (%s)", name, strerror(errno));
1010                if (copy_fd(ifd, ofd))
1011                        exit(1);
1012                close(ifd);
1013
1014                snprintf(name, sizeof(name),
1015                         "%s-%s.%s", base_name, sha1_to_hex(sha1), "idx");
1016                ofd = open(name, O_CREAT | O_EXCL | O_WRONLY, 0666);
1017                if (ofd < 0)
1018                        die("unable to open %s (%s)", name, strerror(errno));
1019                if (copy_fd(ifd_ix, ofd))
1020                        exit(1);
1021                close(ifd_ix);
1022                puts(sha1_to_hex(sha1));
1023        }
1024
1025        return 1;
1026}
1027
1028int main(int argc, char **argv)
1029{
1030        SHA_CTX ctx;
1031        char line[PATH_MAX + 20];
1032        int window = 10, depth = 10, pack_to_stdout = 0;
1033        struct object_entry **list;
1034        int i;
1035
1036        setup_git_directory();
1037
1038        for (i = 1; i < argc; i++) {
1039                const char *arg = argv[i];
1040
1041                if (*arg == '-') {
1042                        if (!strcmp("--non-empty", arg)) {
1043                                non_empty = 1;
1044                                continue;
1045                        }
1046                        if (!strcmp("--local", arg)) {
1047                                local = 1;
1048                                continue;
1049                        }
1050                        if (!strcmp("--incremental", arg)) {
1051                                incremental = 1;
1052                                continue;
1053                        }
1054                        if (!strncmp("--window=", arg, 9)) {
1055                                char *end;
1056                                window = strtoul(arg+9, &end, 0);
1057                                if (!arg[9] || *end)
1058                                        usage(pack_usage);
1059                                continue;
1060                        }
1061                        if (!strncmp("--depth=", arg, 8)) {
1062                                char *end;
1063                                depth = strtoul(arg+8, &end, 0);
1064                                if (!arg[8] || *end)
1065                                        usage(pack_usage);
1066                                continue;
1067                        }
1068                        if (!strcmp("-q", arg)) {
1069                                progress = 0;
1070                                continue;
1071                        }
1072                        if (!strcmp("--no-reuse-delta", arg)) {
1073                                no_reuse_delta = 1;
1074                                continue;
1075                        }
1076                        if (!strcmp("--stdout", arg)) {
1077                                pack_to_stdout = 1;
1078                                continue;
1079                        }
1080                        usage(pack_usage);
1081                }
1082                if (base_name)
1083                        usage(pack_usage);
1084                base_name = arg;
1085        }
1086
1087        if (pack_to_stdout != !base_name)
1088                usage(pack_usage);
1089
1090        prepare_packed_git();
1091
1092        if (progress) {
1093                struct itimerval v;
1094                v.it_interval.tv_sec = 1;
1095                v.it_interval.tv_usec = 0;
1096                v.it_value = v.it_interval;
1097                signal(SIGALRM, progress_interval);
1098                setitimer(ITIMER_REAL, &v, NULL);
1099                fprintf(stderr, "Generating pack...\n");
1100        }
1101
1102        while (fgets(line, sizeof(line), stdin) != NULL) {
1103                unsigned char sha1[20];
1104
1105                if (line[0] == '-') {
1106                        if (get_sha1_hex(line+1, sha1))
1107                                die("expected edge sha1, got garbage:\n %s",
1108                                    line+1);
1109                        add_preferred_base(sha1);
1110                        continue;
1111                }
1112                if (get_sha1_hex(line, sha1))
1113                        die("expected sha1, got garbage:\n %s", line);
1114                add_object_entry(sha1, name_hash(NULL, line+41), 0);
1115        }
1116        if (progress)
1117                fprintf(stderr, "Done counting %d objects.\n", nr_objects);
1118        sorted_by_sha = create_final_object_list();
1119        if (non_empty && !nr_result)
1120                return 0;
1121
1122        SHA1_Init(&ctx);
1123        list = sorted_by_sha;
1124        for (i = 0; i < nr_result; i++) {
1125                struct object_entry *entry = *list++;
1126                SHA1_Update(&ctx, entry->sha1, 20);
1127        }
1128        SHA1_Final(object_list_sha1, &ctx);
1129        if (progress && (nr_objects != nr_result))
1130                fprintf(stderr, "Result has %d objects.\n", nr_result);
1131
1132        if (reuse_cached_pack(object_list_sha1, pack_to_stdout))
1133                ;
1134        else {
1135                if (nr_result)
1136                        prepare_pack(window, depth);
1137                if (progress && pack_to_stdout) {
1138                        /* the other end usually displays progress itself */
1139                        struct itimerval v = {{0,},};
1140                        setitimer(ITIMER_REAL, &v, NULL);
1141                        signal(SIGALRM, SIG_IGN );
1142                        progress_update = 0;
1143                }
1144                write_pack_file();
1145                if (!pack_to_stdout) {
1146                        write_index_file();
1147                        puts(sha1_to_hex(object_list_sha1));
1148                }
1149        }
1150        if (progress)
1151                fprintf(stderr, "Total %d, written %d (delta %d), reused %d (delta %d)\n",
1152                        nr_result, written, written_delta, reused, reused_delta);
1153        return 0;
1154}