builtin/pack-redundant.c on commit builtin/fetch: remove unique promisor remote limitation (5e46139)
   1/*
   2*
   3* Copyright 2005, Lukas Sandstrom <lukass@etek.chalmers.se>
   4*
   5* This file is licensed under the GPL v2.
   6*
   7*/
   8
   9#include "builtin.h"
  10#include "repository.h"
  11#include "packfile.h"
  12#include "object-store.h"
  13
  14#define BLKSIZE 512
  15
  16static const char pack_redundant_usage[] =
  17"git pack-redundant [--verbose] [--alt-odb] (--all | <filename.pack>...)";
  18
  19static int load_all_packs, verbose, alt_odb;
  20
  21struct llist_item {
  22        struct llist_item *next;
  23        const struct object_id *oid;
  24};
  25static struct llist {
  26        struct llist_item *front;
  27        struct llist_item *back;
  28        size_t size;
  29} *all_objects; /* all objects which must be present in local packfiles */
  30
  31static struct pack_list {
  32        struct pack_list *next;
  33        struct packed_git *pack;
  34        struct llist *unique_objects;
  35        struct llist *remaining_objects;
  36        size_t all_objects_size;
  37} *local_packs = NULL, *altodb_packs = NULL;
  38
  39static struct llist_item *free_nodes;
  40
  41static inline void llist_item_put(struct llist_item *item)
  42{
  43        item->next = free_nodes;
  44        free_nodes = item;
  45}
  46
  47static inline struct llist_item *llist_item_get(void)
  48{
  49        struct llist_item *new_item;
  50        if ( free_nodes ) {
  51                new_item = free_nodes;
  52                free_nodes = free_nodes->next;
  53        } else {
  54                int i = 1;
  55                ALLOC_ARRAY(new_item, BLKSIZE);
  56                for (; i < BLKSIZE; i++)
  57                        llist_item_put(&new_item[i]);
  58        }
  59        return new_item;
  60}
  61
  62static inline void llist_init(struct llist **list)
  63{
  64        *list = xmalloc(sizeof(struct llist));
  65        (*list)->front = (*list)->back = NULL;
  66        (*list)->size = 0;
  67}
  68
  69static struct llist * llist_copy(struct llist *list)
  70{
  71        struct llist *ret;
  72        struct llist_item *new_item, *old_item, *prev;
  73
  74        llist_init(&ret);
  75
  76        if ((ret->size = list->size) == 0)
  77                return ret;
  78
  79        new_item = ret->front = llist_item_get();
  80        new_item->oid = list->front->oid;
  81
  82        old_item = list->front->next;
  83        while (old_item) {
  84                prev = new_item;
  85                new_item = llist_item_get();
  86                prev->next = new_item;
  87                new_item->oid = old_item->oid;
  88                old_item = old_item->next;
  89        }
  90        new_item->next = NULL;
  91        ret->back = new_item;
  92
  93        return ret;
  94}
  95
  96static inline struct llist_item *llist_insert(struct llist *list,
  97                                              struct llist_item *after,
  98                                              const struct object_id *oid)
  99{
 100        struct llist_item *new_item = llist_item_get();
 101        new_item->oid = oid;
 102        new_item->next = NULL;
 103
 104        if (after != NULL) {
 105                new_item->next = after->next;
 106                after->next = new_item;
 107                if (after == list->back)
 108                        list->back = new_item;
 109        } else {/* insert in front */
 110                if (list->size == 0)
 111                        list->back = new_item;
 112                else
 113                        new_item->next = list->front;
 114                list->front = new_item;
 115        }
 116        list->size++;
 117        return new_item;
 118}
 119
 120static inline struct llist_item *llist_insert_back(struct llist *list,
 121                                                   const struct object_id *oid)
 122{
 123        return llist_insert(list, list->back, oid);
 124}
 125
 126static inline struct llist_item *llist_insert_sorted_unique(struct llist *list,
 127                        const struct object_id *oid, struct llist_item *hint)
 128{
 129        struct llist_item *prev = NULL, *l;
 130
 131        l = (hint == NULL) ? list->front : hint;
 132        while (l) {
 133                int cmp = oidcmp(l->oid, oid);
 134                if (cmp > 0) { /* we insert before this entry */
 135                        return llist_insert(list, prev, oid);
 136                }
 137                if (!cmp) { /* already exists */
 138                        return l;
 139                }
 140                prev = l;
 141                l = l->next;
 142        }
 143        /* insert at the end */
 144        return llist_insert_back(list, oid);
 145}
 146
 147/* returns a pointer to an item in front of sha1 */
 148static inline struct llist_item * llist_sorted_remove(struct llist *list, const struct object_id *oid, struct llist_item *hint)
 149{
 150        struct llist_item *prev, *l;
 151
 152redo_from_start:
 153        l = (hint == NULL) ? list->front : hint;
 154        prev = NULL;
 155        while (l) {
 156                const int cmp = oidcmp(l->oid, oid);
 157                if (cmp > 0) /* not in list, since sorted */
 158                        return prev;
 159                if (!cmp) { /* found */
 160                        if (prev == NULL) {
 161                                if (hint != NULL && hint != list->front) {
 162                                        /* we don't know the previous element */
 163                                        hint = NULL;
 164                                        goto redo_from_start;
 165                                }
 166                                list->front = l->next;
 167                        } else
 168                                prev->next = l->next;
 169                        if (l == list->back)
 170                                list->back = prev;
 171                        llist_item_put(l);
 172                        list->size--;
 173                        return prev;
 174                }
 175                prev = l;
 176                l = l->next;
 177        }
 178        return prev;
 179}
 180
 181/* computes A\B */
 182static void llist_sorted_difference_inplace(struct llist *A,
 183                                     struct llist *B)
 184{
 185        struct llist_item *hint, *b;
 186
 187        hint = NULL;
 188        b = B->front;
 189
 190        while (b) {
 191                hint = llist_sorted_remove(A, b->oid, hint);
 192                b = b->next;
 193        }
 194}
 195
 196static inline struct pack_list * pack_list_insert(struct pack_list **pl,
 197                                           struct pack_list *entry)
 198{
 199        struct pack_list *p = xmalloc(sizeof(struct pack_list));
 200        memcpy(p, entry, sizeof(struct pack_list));
 201        p->next = *pl;
 202        *pl = p;
 203        return p;
 204}
 205
 206static inline size_t pack_list_size(struct pack_list *pl)
 207{
 208        size_t ret = 0;
 209        while (pl) {
 210                ret++;
 211                pl = pl->next;
 212        }
 213        return ret;
 214}
 215
 216static struct pack_list * pack_list_difference(const struct pack_list *A,
 217                                               const struct pack_list *B)
 218{
 219        struct pack_list *ret;
 220        const struct pack_list *pl;
 221
 222        if (A == NULL)
 223                return NULL;
 224
 225        pl = B;
 226        while (pl != NULL) {
 227                if (A->pack == pl->pack)
 228                        return pack_list_difference(A->next, B);
 229                pl = pl->next;
 230        }
 231        ret = xmalloc(sizeof(struct pack_list));
 232        memcpy(ret, A, sizeof(struct pack_list));
 233        ret->next = pack_list_difference(A->next, B);
 234        return ret;
 235}
 236
 237static void cmp_two_packs(struct pack_list *p1, struct pack_list *p2)
 238{
 239        unsigned long p1_off = 0, p2_off = 0, p1_step, p2_step;
 240        const unsigned char *p1_base, *p2_base;
 241        struct llist_item *p1_hint = NULL, *p2_hint = NULL;
 242        const unsigned int hashsz = the_hash_algo->rawsz;
 243
 244        if (!p1->unique_objects)
 245                p1->unique_objects = llist_copy(p1->remaining_objects);
 246        if (!p2->unique_objects)
 247                p2->unique_objects = llist_copy(p2->remaining_objects);
 248
 249        p1_base = p1->pack->index_data;
 250        p2_base = p2->pack->index_data;
 251        p1_base += 256 * 4 + ((p1->pack->index_version < 2) ? 4 : 8);
 252        p2_base += 256 * 4 + ((p2->pack->index_version < 2) ? 4 : 8);
 253        p1_step = hashsz + ((p1->pack->index_version < 2) ? 4 : 0);
 254        p2_step = hashsz + ((p2->pack->index_version < 2) ? 4 : 0);
 255
 256        while (p1_off < p1->pack->num_objects * p1_step &&
 257               p2_off < p2->pack->num_objects * p2_step)
 258        {
 259                const int cmp = hashcmp(p1_base + p1_off, p2_base + p2_off);
 260                /* cmp ~ p1 - p2 */
 261                if (cmp == 0) {
 262                        p1_hint = llist_sorted_remove(p1->unique_objects,
 263                                        (const struct object_id *)(p1_base + p1_off),
 264                                        p1_hint);
 265                        p2_hint = llist_sorted_remove(p2->unique_objects,
 266                                        (const struct object_id *)(p1_base + p1_off),
 267                                        p2_hint);
 268                        p1_off += p1_step;
 269                        p2_off += p2_step;
 270                        continue;
 271                }
 272                if (cmp < 0) { /* p1 has the object, p2 doesn't */
 273                        p1_off += p1_step;
 274                } else { /* p2 has the object, p1 doesn't */
 275                        p2_off += p2_step;
 276                }
 277        }
 278}
 279
 280static size_t sizeof_union(struct packed_git *p1, struct packed_git *p2)
 281{
 282        size_t ret = 0;
 283        unsigned long p1_off = 0, p2_off = 0, p1_step, p2_step;
 284        const unsigned char *p1_base, *p2_base;
 285        const unsigned int hashsz = the_hash_algo->rawsz;
 286
 287        p1_base = p1->index_data;
 288        p2_base = p2->index_data;
 289        p1_base += 256 * 4 + ((p1->index_version < 2) ? 4 : 8);
 290        p2_base += 256 * 4 + ((p2->index_version < 2) ? 4 : 8);
 291        p1_step = hashsz + ((p1->index_version < 2) ? 4 : 0);
 292        p2_step = hashsz + ((p2->index_version < 2) ? 4 : 0);
 293
 294        while (p1_off < p1->num_objects * p1_step &&
 295               p2_off < p2->num_objects * p2_step)
 296        {
 297                int cmp = hashcmp(p1_base + p1_off, p2_base + p2_off);
 298                /* cmp ~ p1 - p2 */
 299                if (cmp == 0) {
 300                        ret++;
 301                        p1_off += p1_step;
 302                        p2_off += p2_step;
 303                        continue;
 304                }
 305                if (cmp < 0) { /* p1 has the object, p2 doesn't */
 306                        p1_off += p1_step;
 307                } else { /* p2 has the object, p1 doesn't */
 308                        p2_off += p2_step;
 309                }
 310        }
 311        return ret;
 312}
 313
 314/* another O(n^2) function ... */
 315static size_t get_pack_redundancy(struct pack_list *pl)
 316{
 317        struct pack_list *subset;
 318        size_t ret = 0;
 319
 320        if (pl == NULL)
 321                return 0;
 322
 323        while ((subset = pl->next)) {
 324                while (subset) {
 325                        ret += sizeof_union(pl->pack, subset->pack);
 326                        subset = subset->next;
 327                }
 328                pl = pl->next;
 329        }
 330        return ret;
 331}
 332
 333static inline off_t pack_set_bytecount(struct pack_list *pl)
 334{
 335        off_t ret = 0;
 336        while (pl) {
 337                ret += pl->pack->pack_size;
 338                ret += pl->pack->index_size;
 339                pl = pl->next;
 340        }
 341        return ret;
 342}
 343
 344static int cmp_remaining_objects(const void *a, const void *b)
 345{
 346        struct pack_list *pl_a = *((struct pack_list **)a);
 347        struct pack_list *pl_b = *((struct pack_list **)b);
 348
 349        if (pl_a->remaining_objects->size == pl_b->remaining_objects->size) {
 350                /* have the same remaining_objects, big pack first */
 351                if (pl_a->all_objects_size == pl_b->all_objects_size)
 352                        return 0;
 353                else if (pl_a->all_objects_size < pl_b->all_objects_size)
 354                        return 1;
 355                else
 356                        return -1;
 357        } else if (pl_a->remaining_objects->size < pl_b->remaining_objects->size) {
 358                /* sort by remaining objects, more objects first */
 359                return 1;
 360        } else {
 361                return -1;
 362        }
 363}
 364
 365/* Sort pack_list, greater size of remaining_objects first */
 366static void sort_pack_list(struct pack_list **pl)
 367{
 368        struct pack_list **ary, *p;
 369        int i;
 370        size_t n = pack_list_size(*pl);
 371
 372        if (n < 2)
 373                return;
 374
 375        /* prepare an array of packed_list for easier sorting */
 376        ary = xcalloc(n, sizeof(struct pack_list *));
 377        for (n = 0, p = *pl; p; p = p->next)
 378                ary[n++] = p;
 379
 380        QSORT(ary, n, cmp_remaining_objects);
 381
 382        /* link them back again */
 383        for (i = 0; i < n - 1; i++)
 384                ary[i]->next = ary[i + 1];
 385        ary[n - 1]->next = NULL;
 386        *pl = ary[0];
 387
 388        free(ary);
 389}
 390
 391
 392static void minimize(struct pack_list **min)
 393{
 394        struct pack_list *pl, *unique = NULL, *non_unique = NULL;
 395        struct llist *missing, *unique_pack_objects;
 396
 397        pl = local_packs;
 398        while (pl) {
 399                if (pl->unique_objects->size)
 400                        pack_list_insert(&unique, pl);
 401                else
 402                        pack_list_insert(&non_unique, pl);
 403                pl = pl->next;
 404        }
 405        /* find out which objects are missing from the set of unique packs */
 406        missing = llist_copy(all_objects);
 407        pl = unique;
 408        while (pl) {
 409                llist_sorted_difference_inplace(missing, pl->remaining_objects);
 410                pl = pl->next;
 411        }
 412
 413        *min = unique;
 414
 415        /* return if there are no objects missing from the unique set */
 416        if (missing->size == 0) {
 417                free(missing);
 418                return;
 419        }
 420
 421        unique_pack_objects = llist_copy(all_objects);
 422        llist_sorted_difference_inplace(unique_pack_objects, missing);
 423
 424        /* remove unique pack objects from the non_unique packs */
 425        pl = non_unique;
 426        while (pl) {
 427                llist_sorted_difference_inplace(pl->remaining_objects, unique_pack_objects);
 428                pl = pl->next;
 429        }
 430
 431        while (non_unique) {
 432                /* sort the non_unique packs, greater size of remaining_objects first */
 433                sort_pack_list(&non_unique);
 434                if (non_unique->remaining_objects->size == 0)
 435                        break;
 436
 437                pack_list_insert(min, non_unique);
 438
 439                for (pl = non_unique->next; pl && pl->remaining_objects->size > 0;  pl = pl->next)
 440                        llist_sorted_difference_inplace(pl->remaining_objects, non_unique->remaining_objects);
 441
 442                non_unique = non_unique->next;
 443        }
 444}
 445
 446static void load_all_objects(void)
 447{
 448        struct pack_list *pl = local_packs;
 449        struct llist_item *hint, *l;
 450
 451        llist_init(&all_objects);
 452
 453        while (pl) {
 454                hint = NULL;
 455                l = pl->remaining_objects->front;
 456                while (l) {
 457                        hint = llist_insert_sorted_unique(all_objects,
 458                                                          l->oid, hint);
 459                        l = l->next;
 460                }
 461                pl = pl->next;
 462        }
 463        /* remove objects present in remote packs */
 464        pl = altodb_packs;
 465        while (pl) {
 466                llist_sorted_difference_inplace(all_objects, pl->remaining_objects);
 467                pl = pl->next;
 468        }
 469}
 470
 471/* this scales like O(n^2) */
 472static void cmp_local_packs(void)
 473{
 474        struct pack_list *subset, *pl = local_packs;
 475
 476        while ((subset = pl)) {
 477                while ((subset = subset->next))
 478                        cmp_two_packs(pl, subset);
 479                pl = pl->next;
 480        }
 481}
 482
 483static void scan_alt_odb_packs(void)
 484{
 485        struct pack_list *local, *alt;
 486
 487        alt = altodb_packs;
 488        while (alt) {
 489                local = local_packs;
 490                while (local) {
 491                        llist_sorted_difference_inplace(local->remaining_objects,
 492                                                        alt->remaining_objects);
 493                        local = local->next;
 494                }
 495                alt = alt->next;
 496        }
 497}
 498
 499static struct pack_list * add_pack(struct packed_git *p)
 500{
 501        struct pack_list l;
 502        unsigned long off = 0, step;
 503        const unsigned char *base;
 504
 505        if (!p->pack_local && !(alt_odb || verbose))
 506                return NULL;
 507
 508        l.pack = p;
 509        llist_init(&l.remaining_objects);
 510
 511        if (open_pack_index(p))
 512                return NULL;
 513
 514        base = p->index_data;
 515        base += 256 * 4 + ((p->index_version < 2) ? 4 : 8);
 516        step = the_hash_algo->rawsz + ((p->index_version < 2) ? 4 : 0);
 517        while (off < p->num_objects * step) {
 518                llist_insert_back(l.remaining_objects, (const struct object_id *)(base + off));
 519                off += step;
 520        }
 521        l.all_objects_size = l.remaining_objects->size;
 522        l.unique_objects = NULL;
 523        if (p->pack_local)
 524                return pack_list_insert(&local_packs, &l);
 525        else
 526                return pack_list_insert(&altodb_packs, &l);
 527}
 528
 529static struct pack_list * add_pack_file(const char *filename)
 530{
 531        struct packed_git *p = get_all_packs(the_repository);
 532
 533        if (strlen(filename) < 40)
 534                die("Bad pack filename: %s", filename);
 535
 536        while (p) {
 537                if (strstr(p->pack_name, filename))
 538                        return add_pack(p);
 539                p = p->next;
 540        }
 541        die("Filename %s not found in packed_git", filename);
 542}
 543
 544static void load_all(void)
 545{
 546        struct packed_git *p = get_all_packs(the_repository);
 547
 548        while (p) {
 549                add_pack(p);
 550                p = p->next;
 551        }
 552}
 553
 554int cmd_pack_redundant(int argc, const char **argv, const char *prefix)
 555{
 556        int i;
 557        struct pack_list *min = NULL, *red, *pl;
 558        struct llist *ignore;
 559        struct object_id *oid;
 560        char buf[GIT_MAX_HEXSZ + 2]; /* hex hash + \n + \0 */
 561
 562        if (argc == 2 && !strcmp(argv[1], "-h"))
 563                usage(pack_redundant_usage);
 564
 565        for (i = 1; i < argc; i++) {
 566                const char *arg = argv[i];
 567                if (!strcmp(arg, "--")) {
 568                        i++;
 569                        break;
 570                }
 571                if (!strcmp(arg, "--all")) {
 572                        load_all_packs = 1;
 573                        continue;
 574                }
 575                if (!strcmp(arg, "--verbose")) {
 576                        verbose = 1;
 577                        continue;
 578                }
 579                if (!strcmp(arg, "--alt-odb")) {
 580                        alt_odb = 1;
 581                        continue;
 582                }
 583                if (*arg == '-')
 584                        usage(pack_redundant_usage);
 585                else
 586                        break;
 587        }
 588
 589        if (load_all_packs)
 590                load_all();
 591        else
 592                while (*(argv + i) != NULL)
 593                        add_pack_file(*(argv + i++));
 594
 595        if (local_packs == NULL)
 596                die("Zero packs found!");
 597
 598        load_all_objects();
 599
 600        if (alt_odb)
 601                scan_alt_odb_packs();
 602
 603        /* ignore objects given on stdin */
 604        llist_init(&ignore);
 605        if (!isatty(0)) {
 606                while (fgets(buf, sizeof(buf), stdin)) {
 607                        oid = xmalloc(sizeof(*oid));
 608                        if (get_oid_hex(buf, oid))
 609                                die("Bad object ID on stdin: %s", buf);
 610                        llist_insert_sorted_unique(ignore, oid, NULL);
 611                }
 612        }
 613        llist_sorted_difference_inplace(all_objects, ignore);
 614        pl = local_packs;
 615        while (pl) {
 616                llist_sorted_difference_inplace(pl->remaining_objects, ignore);
 617                pl = pl->next;
 618        }
 619
 620        cmp_local_packs();
 621
 622        minimize(&min);
 623
 624        if (verbose) {
 625                fprintf(stderr, "There are %lu packs available in alt-odbs.\n",
 626                        (unsigned long)pack_list_size(altodb_packs));
 627                fprintf(stderr, "The smallest (bytewise) set of packs is:\n");
 628                pl = min;
 629                while (pl) {
 630                        fprintf(stderr, "\t%s\n", pl->pack->pack_name);
 631                        pl = pl->next;
 632                }
 633                fprintf(stderr, "containing %lu duplicate objects "
 634                                "with a total size of %lukb.\n",
 635                        (unsigned long)get_pack_redundancy(min),
 636                        (unsigned long)pack_set_bytecount(min)/1024);
 637                fprintf(stderr, "A total of %lu unique objects were considered.\n",
 638                        (unsigned long)all_objects->size);
 639                fprintf(stderr, "Redundant packs (with indexes):\n");
 640        }
 641        pl = red = pack_list_difference(local_packs, min);
 642        while (pl) {
 643                printf("%s\n%s\n",
 644                       sha1_pack_index_name(pl->pack->hash),
 645                       pl->pack->pack_name);
 646                pl = pl->next;
 647        }
 648        if (verbose)
 649                fprintf(stderr, "%luMB of redundant packs in total.\n",
 650                        (unsigned long)pack_set_bytecount(red)/(1024*1024));
 651
 652        return 0;
 653}