pack-redundant.con commit Teach 'diff' about 'diff' attribute. (8c70124)
   1/*
   2*
   3* Copyright 2005, Lukas Sandstrom <lukass@etek.chalmers.se>
   4*
   5* This file is licensed under the GPL v2.
   6*
   7*/
   8
   9#include "cache.h"
  10
  11#define BLKSIZE 512
  12
  13static const char pack_redundant_usage[] =
  14"git-pack-redundant [ --verbose ] [ --alt-odb ] < --all | <.pack filename> ...>";
  15
  16static int load_all_packs, verbose, alt_odb;
  17
  18struct llist_item {
  19        struct llist_item *next;
  20        const unsigned char *sha1;
  21};
  22static struct llist {
  23        struct llist_item *front;
  24        struct llist_item *back;
  25        size_t size;
  26} *all_objects; /* all objects which must be present in local packfiles */
  27
  28static struct pack_list {
  29        struct pack_list *next;
  30        struct packed_git *pack;
  31        struct llist *unique_objects;
  32        struct llist *all_objects;
  33} *local_packs = NULL, *altodb_packs = NULL;
  34
  35struct pll {
  36        struct pll *next;
  37        struct pack_list *pl;
  38};
  39
  40static struct llist_item *free_nodes;
  41
  42static inline void llist_item_put(struct llist_item *item)
  43{
  44        item->next = free_nodes;
  45        free_nodes = item;
  46}
  47
  48static inline struct llist_item *llist_item_get(void)
  49{
  50        struct llist_item *new;
  51        if ( free_nodes ) {
  52                new = free_nodes;
  53                free_nodes = free_nodes->next;
  54        } else {
  55                int i = 1;
  56                new = xmalloc(sizeof(struct llist_item) * BLKSIZE);
  57                for(;i < BLKSIZE; i++) {
  58                        llist_item_put(&new[i]);
  59                }
  60        }
  61        return new;
  62}
  63
  64static void llist_free(struct llist *list)
  65{
  66        while((list->back = list->front)) {
  67                list->front = list->front->next;
  68                llist_item_put(list->back);
  69        }
  70        free(list);
  71}
  72
  73static inline void llist_init(struct llist **list)
  74{
  75        *list = xmalloc(sizeof(struct llist));
  76        (*list)->front = (*list)->back = NULL;
  77        (*list)->size = 0;
  78}
  79
  80static struct llist * llist_copy(struct llist *list)
  81{
  82        struct llist *ret;
  83        struct llist_item *new, *old, *prev;
  84        
  85        llist_init(&ret);
  86
  87        if ((ret->size = list->size) == 0)
  88                return ret;
  89
  90        new = ret->front = llist_item_get();
  91        new->sha1 = list->front->sha1;
  92
  93        old = list->front->next;
  94        while (old) {
  95                prev = new;
  96                new = llist_item_get();
  97                prev->next = new;
  98                new->sha1 = old->sha1;
  99                old = old->next;
 100        }
 101        new->next = NULL;
 102        ret->back = new;
 103        
 104        return ret;
 105}
 106
 107static inline struct llist_item *llist_insert(struct llist *list,
 108                                              struct llist_item *after,
 109                                               const unsigned char *sha1)
 110{
 111        struct llist_item *new = llist_item_get();
 112        new->sha1 = sha1;
 113        new->next = NULL;
 114
 115        if (after != NULL) {
 116                new->next = after->next;
 117                after->next = new;
 118                if (after == list->back)
 119                        list->back = new;
 120        } else {/* insert in front */
 121                if (list->size == 0)
 122                        list->back = new;
 123                else
 124                        new->next = list->front;
 125                list->front = new;
 126        }
 127        list->size++;
 128        return new;
 129}
 130
 131static inline struct llist_item *llist_insert_back(struct llist *list,
 132                                                   const unsigned char *sha1)
 133{
 134        return llist_insert(list, list->back, sha1);
 135}
 136
 137static inline struct llist_item *llist_insert_sorted_unique(struct llist *list,
 138                        const unsigned char *sha1, struct llist_item *hint)
 139{
 140        struct llist_item *prev = NULL, *l;
 141
 142        l = (hint == NULL) ? list->front : hint;
 143        while (l) {
 144                int cmp = hashcmp(l->sha1, sha1);
 145                if (cmp > 0) { /* we insert before this entry */
 146                        return llist_insert(list, prev, sha1);
 147                }
 148                if(!cmp) { /* already exists */
 149                        return l;
 150                }
 151                prev = l;
 152                l = l->next;
 153        }
 154        /* insert at the end */
 155        return llist_insert_back(list, sha1);
 156}
 157
 158/* returns a pointer to an item in front of sha1 */
 159static inline struct llist_item * llist_sorted_remove(struct llist *list, const unsigned char *sha1, struct llist_item *hint)
 160{
 161        struct llist_item *prev, *l;
 162
 163redo_from_start:
 164        l = (hint == NULL) ? list->front : hint;
 165        prev = NULL;
 166        while (l) {
 167                int cmp = hashcmp(l->sha1, sha1);
 168                if (cmp > 0) /* not in list, since sorted */
 169                        return prev;
 170                if(!cmp) { /* found */
 171                        if (prev == NULL) {
 172                                if (hint != NULL && hint != list->front) {
 173                                        /* we don't know the previous element */
 174                                        hint = NULL;
 175                                        goto redo_from_start;
 176                                }
 177                                list->front = l->next;
 178                        } else
 179                                prev->next = l->next;
 180                        if (l == list->back)
 181                                list->back = prev;
 182                        llist_item_put(l);
 183                        list->size--;
 184                        return prev;
 185                }
 186                prev = l;
 187                l = l->next;
 188        }
 189        return prev;
 190}
 191
 192/* computes A\B */
 193static void llist_sorted_difference_inplace(struct llist *A,
 194                                     struct llist *B)
 195{
 196        struct llist_item *hint, *b;
 197
 198        hint = NULL;
 199        b = B->front;
 200
 201        while (b) {
 202                hint = llist_sorted_remove(A, b->sha1, hint);
 203                b = b->next;
 204        }
 205}
 206
 207static inline struct pack_list * pack_list_insert(struct pack_list **pl,
 208                                           struct pack_list *entry)
 209{
 210        struct pack_list *p = xmalloc(sizeof(struct pack_list));
 211        memcpy(p, entry, sizeof(struct pack_list));
 212        p->next = *pl;
 213        *pl = p;
 214        return p;
 215}
 216
 217static inline size_t pack_list_size(struct pack_list *pl)
 218{
 219        size_t ret = 0;
 220        while(pl) {
 221                ret++;
 222                pl = pl->next;
 223        }
 224        return ret;
 225}
 226
 227static struct pack_list * pack_list_difference(const struct pack_list *A,
 228                                               const struct pack_list *B)
 229{
 230        struct pack_list *ret;
 231        const struct pack_list *pl;
 232
 233        if (A == NULL)
 234                return NULL;
 235
 236        pl = B;
 237        while (pl != NULL) {
 238                if (A->pack == pl->pack)
 239                        return pack_list_difference(A->next, B);
 240                pl = pl->next;
 241        }
 242        ret = xmalloc(sizeof(struct pack_list));
 243        memcpy(ret, A, sizeof(struct pack_list));
 244        ret->next = pack_list_difference(A->next, B);
 245        return ret;
 246}
 247
 248static void cmp_two_packs(struct pack_list *p1, struct pack_list *p2)
 249{
 250        int p1_off, p2_off;
 251        const unsigned char *p1_base, *p2_base;
 252        struct llist_item *p1_hint = NULL, *p2_hint = NULL;
 253
 254        p1_off = p2_off = 256 * 4 + 4;
 255        p1_base = p1->pack->index_data;
 256        p2_base = p2->pack->index_data;
 257
 258        while (p1_off <= p1->pack->index_size - 3 * 20 &&
 259               p2_off <= p2->pack->index_size - 3 * 20)
 260        {
 261                int cmp = hashcmp(p1_base + p1_off, p2_base + p2_off);
 262                /* cmp ~ p1 - p2 */
 263                if (cmp == 0) {
 264                        p1_hint = llist_sorted_remove(p1->unique_objects,
 265                                        p1_base + p1_off, p1_hint);
 266                        p2_hint = llist_sorted_remove(p2->unique_objects,
 267                                        p1_base + p1_off, p2_hint);
 268                        p1_off+=24;
 269                        p2_off+=24;
 270                        continue;
 271                }
 272                if (cmp < 0) { /* p1 has the object, p2 doesn't */
 273                        p1_off+=24;
 274                } else { /* p2 has the object, p1 doesn't */
 275                        p2_off+=24;
 276                }
 277        }
 278}
 279
 280static void pll_free(struct pll *l)
 281{
 282        struct pll *old;
 283        struct pack_list *opl;
 284
 285        while (l) {
 286                old = l;
 287                while (l->pl) {
 288                        opl = l->pl;
 289                        l->pl = opl->next;
 290                        free(opl);
 291                }
 292                l = l->next;
 293                free(old);
 294        }
 295}
 296
 297/* all the permutations have to be free()d at the same time,
 298 * since they refer to each other
 299 */
 300static struct pll * get_permutations(struct pack_list *list, int n)
 301{
 302        struct pll *subset, *ret = NULL, *new_pll = NULL, *pll;
 303
 304        if (list == NULL || pack_list_size(list) < n || n == 0)
 305                return NULL;
 306
 307        if (n == 1) {
 308                while (list) {
 309                        new_pll = xmalloc(sizeof(pll));
 310                        new_pll->pl = NULL;
 311                        pack_list_insert(&new_pll->pl, list);
 312                        new_pll->next = ret;
 313                        ret = new_pll;
 314                        list = list->next;
 315                }
 316                return ret;
 317        }
 318
 319        while (list->next) {
 320                subset = get_permutations(list->next, n - 1);
 321                while (subset) {
 322                        new_pll = xmalloc(sizeof(pll));
 323                        new_pll->pl = subset->pl;
 324                        pack_list_insert(&new_pll->pl, list);
 325                        new_pll->next = ret;
 326                        ret = new_pll;
 327                        subset = subset->next;
 328                }
 329                list = list->next;
 330        }
 331        return ret;
 332}
 333
 334static int is_superset(struct pack_list *pl, struct llist *list)
 335{
 336        struct llist *diff;
 337
 338        diff = llist_copy(list);
 339
 340        while (pl) {
 341                llist_sorted_difference_inplace(diff, pl->all_objects);
 342                if (diff->size == 0) { /* we're done */
 343                        llist_free(diff);
 344                        return 1;
 345                }
 346                pl = pl->next;
 347        }
 348        llist_free(diff);
 349        return 0;
 350}
 351
 352static size_t sizeof_union(struct packed_git *p1, struct packed_git *p2)
 353{
 354        size_t ret = 0;
 355        int p1_off, p2_off;
 356        const unsigned char *p1_base, *p2_base;
 357
 358        p1_off = p2_off = 256 * 4 + 4;
 359        p1_base = p1->index_data;
 360        p2_base = p2->index_data;
 361
 362        while (p1_off <= p1->index_size - 3 * 20 &&
 363               p2_off <= p2->index_size - 3 * 20)
 364        {
 365                int cmp = hashcmp(p1_base + p1_off, p2_base + p2_off);
 366                /* cmp ~ p1 - p2 */
 367                if (cmp == 0) {
 368                        ret++;
 369                        p1_off+=24;
 370                        p2_off+=24;
 371                        continue;
 372                }
 373                if (cmp < 0) { /* p1 has the object, p2 doesn't */
 374                        p1_off+=24;
 375                } else { /* p2 has the object, p1 doesn't */
 376                        p2_off+=24;
 377                }
 378        }
 379        return ret;
 380}
 381
 382/* another O(n^2) function ... */
 383static size_t get_pack_redundancy(struct pack_list *pl)
 384{
 385        struct pack_list *subset;
 386        size_t ret = 0;
 387
 388        if (pl == NULL)
 389                return 0;
 390
 391        while ((subset = pl->next)) {
 392                while(subset) {
 393                        ret += sizeof_union(pl->pack, subset->pack);
 394                        subset = subset->next;
 395                }
 396                pl = pl->next;
 397        }
 398        return ret;
 399}
 400
 401static inline off_t pack_set_bytecount(struct pack_list *pl)
 402{
 403        off_t ret = 0;
 404        while (pl) {
 405                ret += pl->pack->pack_size;
 406                ret += pl->pack->index_size;
 407                pl = pl->next;
 408        }
 409        return ret;
 410}
 411
 412static void minimize(struct pack_list **min)
 413{
 414        struct pack_list *pl, *unique = NULL,
 415                *non_unique = NULL, *min_perm = NULL;
 416        struct pll *perm, *perm_all, *perm_ok = NULL, *new_perm;
 417        struct llist *missing;
 418        off_t min_perm_size = 0, perm_size;
 419        int n;
 420
 421        pl = local_packs;
 422        while (pl) {
 423                if(pl->unique_objects->size)
 424                        pack_list_insert(&unique, pl);
 425                else
 426                        pack_list_insert(&non_unique, pl);
 427                pl = pl->next;
 428        }
 429        /* find out which objects are missing from the set of unique packs */
 430        missing = llist_copy(all_objects);
 431        pl = unique;
 432        while (pl) {
 433                llist_sorted_difference_inplace(missing, pl->all_objects);
 434                pl = pl->next;
 435        }
 436
 437        /* return if there are no objects missing from the unique set */
 438        if (missing->size == 0) {
 439                *min = unique;
 440                return;
 441        }
 442
 443        /* find the permutations which contain all missing objects */
 444        for (n = 1; n <= pack_list_size(non_unique) && !perm_ok; n++) {
 445                perm_all = perm = get_permutations(non_unique, n);
 446                while (perm) {
 447                        if (is_superset(perm->pl, missing)) {
 448                                new_perm = xmalloc(sizeof(struct pll));
 449                                memcpy(new_perm, perm, sizeof(struct pll));
 450                                new_perm->next = perm_ok;
 451                                perm_ok = new_perm;
 452                        }
 453                        perm = perm->next;
 454                }
 455                if (perm_ok)
 456                        break;
 457                pll_free(perm_all);
 458        }
 459        if (perm_ok == NULL)
 460                die("Internal error: No complete sets found!\n");
 461
 462        /* find the permutation with the smallest size */
 463        perm = perm_ok;
 464        while (perm) {
 465                perm_size = pack_set_bytecount(perm->pl);
 466                if (!min_perm_size || min_perm_size > perm_size) {
 467                        min_perm_size = perm_size;
 468                        min_perm = perm->pl;
 469                }
 470                perm = perm->next;
 471        }
 472        *min = min_perm;
 473        /* add the unique packs to the list */
 474        pl = unique;
 475        while(pl) {
 476                pack_list_insert(min, pl);
 477                pl = pl->next;
 478        }
 479}
 480
 481static void load_all_objects(void)
 482{
 483        struct pack_list *pl = local_packs;
 484        struct llist_item *hint, *l;
 485
 486        llist_init(&all_objects);
 487
 488        while (pl) {
 489                hint = NULL;
 490                l = pl->all_objects->front;
 491                while (l) {
 492                        hint = llist_insert_sorted_unique(all_objects,
 493                                                          l->sha1, hint);
 494                        l = l->next;
 495                }
 496                pl = pl->next;
 497        }
 498        /* remove objects present in remote packs */
 499        pl = altodb_packs;
 500        while (pl) {
 501                llist_sorted_difference_inplace(all_objects, pl->all_objects);
 502                pl = pl->next;
 503        }
 504}
 505
 506/* this scales like O(n^2) */
 507static void cmp_local_packs(void)
 508{
 509        struct pack_list *subset, *pl = local_packs;
 510
 511        while ((subset = pl)) {
 512                while((subset = subset->next))
 513                        cmp_two_packs(pl, subset);
 514                pl = pl->next;
 515        }
 516}
 517
 518static void scan_alt_odb_packs(void)
 519{
 520        struct pack_list *local, *alt;
 521
 522        alt = altodb_packs;
 523        while (alt) {
 524                local = local_packs;
 525                while (local) {
 526                        llist_sorted_difference_inplace(local->unique_objects,
 527                                                        alt->all_objects);
 528                        local = local->next;
 529                }
 530                llist_sorted_difference_inplace(all_objects, alt->all_objects);
 531                alt = alt->next;
 532        }
 533}
 534
 535static struct pack_list * add_pack(struct packed_git *p)
 536{
 537        struct pack_list l;
 538        size_t off;
 539        const unsigned char *base;
 540
 541        if (!p->pack_local && !(alt_odb || verbose))
 542                return NULL;
 543
 544        l.pack = p;
 545        llist_init(&l.all_objects);
 546
 547        off = 256 * 4 + 4;
 548        base = p->index_data;
 549        while (off <= p->index_size - 3 * 20) {
 550                llist_insert_back(l.all_objects, base + off);
 551                off += 24;
 552        }
 553        /* this list will be pruned in cmp_two_packs later */
 554        l.unique_objects = llist_copy(l.all_objects);
 555        if (p->pack_local)
 556                return pack_list_insert(&local_packs, &l);
 557        else
 558                return pack_list_insert(&altodb_packs, &l);
 559}
 560
 561static struct pack_list * add_pack_file(char *filename)
 562{
 563        struct packed_git *p = packed_git;
 564
 565        if (strlen(filename) < 40)
 566                die("Bad pack filename: %s\n", filename);
 567
 568        while (p) {
 569                if (strstr(p->pack_name, filename))
 570                        return add_pack(p);
 571                p = p->next;
 572        }
 573        die("Filename %s not found in packed_git\n", filename);
 574}
 575
 576static void load_all(void)
 577{
 578        struct packed_git *p = packed_git;
 579
 580        while (p) {
 581                add_pack(p);
 582                p = p->next;
 583        }
 584}
 585
 586int main(int argc, char **argv)
 587{
 588        int i;
 589        struct pack_list *min, *red, *pl;
 590        struct llist *ignore;
 591        unsigned char *sha1;
 592        char buf[42]; /* 40 byte sha1 + \n + \0 */
 593
 594        setup_git_directory();
 595
 596        for (i = 1; i < argc; i++) {
 597                const char *arg = argv[i];
 598                if(!strcmp(arg, "--")) {
 599                        i++;
 600                        break;
 601                }
 602                if(!strcmp(arg, "--all")) {
 603                        load_all_packs = 1;
 604                        continue;
 605                }
 606                if(!strcmp(arg, "--verbose")) {
 607                        verbose = 1;
 608                        continue;
 609                }
 610                if(!strcmp(arg, "--alt-odb")) {
 611                        alt_odb = 1;
 612                        continue;
 613                }
 614                if(*arg == '-')
 615                        usage(pack_redundant_usage);
 616                else
 617                        break;
 618        }
 619
 620        prepare_packed_git();
 621
 622        if (load_all_packs)
 623                load_all();
 624        else
 625                while (*(argv + i) != NULL)
 626                        add_pack_file(*(argv + i++));
 627
 628        if (local_packs == NULL)
 629                die("Zero packs found!\n");
 630
 631        load_all_objects();
 632
 633        cmp_local_packs();
 634        if (alt_odb)
 635                scan_alt_odb_packs();
 636
 637        /* ignore objects given on stdin */
 638        llist_init(&ignore);
 639        if (!isatty(0)) {
 640                while (fgets(buf, sizeof(buf), stdin)) {
 641                        sha1 = xmalloc(20);
 642                        if (get_sha1_hex(buf, sha1))
 643                                die("Bad sha1 on stdin: %s", buf);
 644                        llist_insert_sorted_unique(ignore, sha1, NULL);
 645                }
 646        }
 647        llist_sorted_difference_inplace(all_objects, ignore);
 648        pl = local_packs;
 649        while (pl) {
 650                llist_sorted_difference_inplace(pl->unique_objects, ignore);
 651                pl = pl->next;
 652        }
 653
 654        minimize(&min);
 655
 656        if (verbose) {
 657                fprintf(stderr, "There are %lu packs available in alt-odbs.\n",
 658                        (unsigned long)pack_list_size(altodb_packs));
 659                fprintf(stderr, "The smallest (bytewise) set of packs is:\n");
 660                pl = min;
 661                while (pl) {
 662                        fprintf(stderr, "\t%s\n", pl->pack->pack_name);
 663                        pl = pl->next;
 664                }
 665                fprintf(stderr, "containing %lu duplicate objects "
 666                                "with a total size of %lukb.\n",
 667                        (unsigned long)get_pack_redundancy(min),
 668                        (unsigned long)pack_set_bytecount(min)/1024);
 669                fprintf(stderr, "A total of %lu unique objects were considered.\n",
 670                        (unsigned long)all_objects->size);
 671                fprintf(stderr, "Redundant packs (with indexes):\n");
 672        }
 673        pl = red = pack_list_difference(local_packs, min);
 674        while (pl) {
 675                printf("%s\n%s\n",
 676                       sha1_pack_index_name(pl->pack->sha1),
 677                       pl->pack->pack_name);
 678                pl = pl->next;
 679        }
 680        if (verbose)
 681                fprintf(stderr, "%luMB of redundant packs in total.\n",
 682                        (unsigned long)pack_set_bytecount(red)/(1024*1024));
 683
 684        return 0;
 685}