2267478c3616af9efb5476503f51f2fb904525cb
   1/*
   2*
   3* Copyright 2005, Lukas Sandstrom <lukass@etek.chalmers.se>
   4*
   5* This file is licensed under the GPL v2.
   6*
   7*/
   8
   9#include "cache.h"
  10
  11static const char pack_intersect_usage[] =
  12"git-pack-intersect [ -v ]  < -a | <.pack filename> ...>";
  13
  14int all = 0, verbose = 0;
  15
  16struct llist_item {
  17        struct llist_item *next;
  18        char *sha1;
  19};
  20struct llist {
  21        struct llist_item *front;
  22        struct llist_item *back;
  23        size_t size;
  24} *all_objects;
  25
  26struct pack_list {
  27        struct pack_list *next;
  28        struct packed_git *pack;
  29        struct llist *unique_objects;
  30        struct llist *all_objects;
  31} *pack_list;
  32
  33struct pll {
  34        struct pll *next;
  35        struct pack_list *pl;
  36};
  37
  38inline void llist_free(struct llist *list)
  39{
  40        while((list->back = list->front)) {
  41                list->front = list->front->next;
  42                free(list->back);
  43        }
  44        free(list);
  45}
  46
  47inline void llist_init(struct llist **list)
  48{
  49        *list = xmalloc(sizeof(struct llist));
  50        (*list)->front = (*list)->back = NULL;
  51        (*list)->size = 0;
  52}
  53
  54struct llist * llist_copy(struct llist *list)
  55{
  56        struct llist *ret;
  57        struct llist_item *new, *old, *prev;
  58        
  59        llist_init(&ret);
  60
  61        if ((ret->size = list->size) == 0)
  62                return ret;
  63
  64        new = ret->front = xmalloc(sizeof(struct llist_item));
  65        new->sha1 = list->front->sha1;
  66
  67        old = list->front->next;
  68        while (old) {
  69                prev = new;
  70                new = xmalloc(sizeof(struct llist_item));
  71                prev->next = new;
  72                new->sha1 = old->sha1;
  73                old = old->next;
  74        }
  75        new->next = NULL;
  76        ret->back = new;
  77        
  78        return ret;
  79}
  80
  81inline struct llist_item * llist_insert(struct llist *list,
  82                                        struct llist_item *after, char *sha1)
  83{
  84        struct llist_item *new = xmalloc(sizeof(struct llist_item));
  85        new->sha1 = sha1;
  86        new->next = NULL;
  87
  88        if (after != NULL) {
  89                new->next = after->next;
  90                after->next = new;
  91                if (after == list->back)
  92                        list->back = new;
  93        } else {/* insert in front */
  94                if (list->size == 0)
  95                        list->back = new;
  96                else
  97                        new->next = list->front;
  98                list->front = new;
  99        }
 100        list->size++;
 101        return new;
 102}
 103
 104inline struct llist_item * llist_insert_back(struct llist *list, char *sha1)
 105{
 106        return llist_insert(list, list->back, sha1);
 107}
 108
 109inline struct llist_item * llist_insert_sorted_unique(struct llist *list,
 110                                        char *sha1, struct llist_item *hint)
 111{
 112        struct llist_item *prev = NULL, *l;
 113
 114        l = (hint == NULL) ? list->front : hint;
 115        while (l) {
 116                int cmp = memcmp(l->sha1, sha1, 20);
 117                if (cmp > 0) { /* we insert before this entry */
 118                        return llist_insert(list, prev, sha1);
 119                }
 120                if(!cmp) { /* already exists */
 121                        return l;
 122                }
 123                prev = l;
 124                l = l->next;
 125        }
 126        /* insert at the end */
 127        return llist_insert_back(list, sha1);
 128}
 129
 130/* computes A\B */
 131struct llist * llist_sorted_difference(struct llist_item *A,
 132                                       struct llist_item *B)
 133{
 134        struct llist *ret;
 135        llist_init(&ret);
 136
 137        while (A != NULL && B != NULL) {
 138                int cmp = memcmp(A->sha1, B->sha1, 20);
 139                if (!cmp) {
 140                        A = A->next;
 141                        B = B->next;
 142                        continue;
 143                }
 144                if(cmp > 0) { /* we'll never find this B */
 145                        B = B->next;
 146                        continue;
 147                }
 148                /* A has the object, B doesn't */
 149                llist_insert_back(ret, A->sha1);
 150                A = A->next;
 151        }
 152        while (A != NULL) {
 153                llist_insert_back(ret, A->sha1);
 154                A = A->next;
 155        }
 156        return ret;
 157}
 158
 159/* returns a pointer to an item in front of sha1 */
 160inline struct llist_item * llist_sorted_remove(struct llist *list, char *sha1,
 161                                               struct llist_item *hint)
 162{
 163        struct llist_item *prev, *l;
 164
 165redo_from_start:
 166        l = (hint == NULL) ? list->front : hint;
 167        prev = NULL;
 168        while (l) {
 169                int cmp = memcmp(l->sha1, sha1, 20);
 170                if (cmp > 0) /* not in list, since sorted */
 171                        return prev;
 172                if(!cmp) { /* found */
 173                        if (prev == NULL) {
 174                                if (hint != NULL && hint != list->front) {
 175                                        /* we don't know the previous element */
 176                                        hint = NULL;
 177                                        goto redo_from_start;
 178                                }
 179                                list->front = l->next;
 180                        } else
 181                                prev->next = l->next;
 182                        if (l == list->back)
 183                                list->back = prev;
 184                        free(l);
 185                        list->size--;
 186                        return prev;
 187                }
 188                prev = l;
 189                l = l->next;
 190        }
 191        return prev;
 192}
 193
 194inline struct pack_list * pack_list_insert(struct pack_list **pl,
 195                                           struct pack_list *entry)
 196{
 197        struct pack_list *p = xmalloc(sizeof(struct pack_list));
 198        memcpy(p, entry, sizeof(struct pack_list));
 199        p->next = *pl;
 200        *pl = p;
 201        return p;
 202}
 203
 204struct pack_list * pack_list_difference(struct pack_list *A,
 205                                        struct pack_list *B)
 206{
 207        struct pack_list *ret, *pl;
 208
 209        if (A == NULL)
 210                return NULL;
 211
 212        pl = B;
 213        while (pl != NULL) {
 214                if (A->pack == pl->pack)
 215                        return pack_list_difference(A->next, B);
 216                pl = pl->next;
 217        }
 218        ret = xmalloc(sizeof(struct pack_list));
 219        memcpy(ret, A, sizeof(struct pack_list));
 220        ret->next = pack_list_difference(A->next, B);
 221        return ret;
 222}
 223
 224void cmp_two_packs(struct pack_list *p1, struct pack_list *p2)
 225{
 226        int p1_off, p2_off;
 227        void *p1_base, *p2_base;
 228        struct llist_item *p1_hint = NULL, *p2_hint = NULL;
 229        
 230        p1_off = p2_off = 256 * 4 + 4;
 231        p1_base = (void *)p1->pack->index_base;
 232        p2_base = (void *)p2->pack->index_base;
 233
 234        while (p1_off <= p1->pack->index_size - 3 * 20 &&
 235               p2_off <= p2->pack->index_size - 3 * 20)
 236        {
 237                int cmp = memcmp(p1_base + p1_off, p2_base + p2_off, 20);
 238                /* cmp ~ p1 - p2 */
 239                if (cmp == 0) {
 240                        p1_hint = llist_sorted_remove(p1->unique_objects,
 241                                        p1_base + p1_off, p1_hint);
 242                        p2_hint = llist_sorted_remove(p2->unique_objects,
 243                                        p1_base + p1_off, p2_hint);
 244                        p1_off+=24;
 245                        p2_off+=24;
 246                        continue;
 247                }
 248                if (cmp < 0) { /* p1 has the object, p2 doesn't */
 249                        p1_off+=24;
 250                } else { /* p2 has the object, p1 doesn't */
 251                        p2_off+=24;
 252                }
 253        }
 254}
 255
 256/* all the permutations have to be free()d at the same time,
 257 * since they refer to each other
 258 */
 259struct pll * get_all_permutations(struct pack_list *list)
 260{
 261        struct pll *subset, *pll, *new_pll = NULL; /*silence warning*/
 262
 263        if (list == NULL)
 264                return NULL;
 265
 266        if (list->next == NULL) {
 267                new_pll = xmalloc(sizeof(struct pll));
 268                new_pll->next = NULL;
 269                new_pll->pl = list;
 270                return new_pll;
 271        }
 272
 273        pll = subset = get_all_permutations(list->next);
 274        while (pll) {
 275                new_pll = xmalloc(sizeof(struct pll));
 276                new_pll->next = pll->next;
 277                pll->next = new_pll;
 278
 279                new_pll->pl = xmalloc(sizeof(struct pack_list));
 280                memcpy(new_pll->pl, list, sizeof(struct pack_list));
 281                new_pll->pl->next = pll->pl;
 282
 283                pll = new_pll->next;
 284        }
 285        /* add ourself to the end */
 286        new_pll->next = xmalloc(sizeof(struct pll));
 287        new_pll->next->pl = xmalloc(sizeof(struct pack_list));
 288        new_pll->next->next = NULL;
 289        memcpy(new_pll->next->pl, list, sizeof(struct pack_list));
 290        new_pll->next->pl->next = NULL;
 291
 292        return subset;
 293}
 294
 295int is_superset(struct pack_list *pl, struct llist *list)
 296{
 297        struct llist *diff, *old;
 298
 299        diff = llist_copy(list);
 300
 301        while (pl) {
 302                old = diff;
 303                diff = llist_sorted_difference(diff->front,
 304                                               pl->all_objects->front);
 305                llist_free(old);
 306                if (diff->size == 0) { /* we're done */
 307                        llist_free(diff);
 308                        return 1;
 309                }
 310                pl = pl->next;
 311        }
 312        llist_free(diff);
 313        return 0;
 314}
 315
 316size_t sizeof_union(struct packed_git *p1, struct packed_git *p2)
 317{
 318        size_t ret = 0;
 319        int p1_off, p2_off;
 320        void *p1_base, *p2_base;
 321
 322        p1_off = p2_off = 256 * 4 + 4;
 323        p1_base = (void *)p1->index_base;
 324        p2_base = (void *)p2->index_base;
 325
 326        while (p1_off <= p1->index_size - 3 * 20 &&
 327               p2_off <= p2->index_size - 3 * 20)
 328        {
 329                int cmp = memcmp(p1_base + p1_off, p2_base + p2_off, 20);
 330                /* cmp ~ p1 - p2 */
 331                if (cmp == 0) {
 332                        ret++;
 333                        p1_off+=24;
 334                        p2_off+=24;
 335                        continue;
 336                }
 337                if (cmp < 0) { /* p1 has the object, p2 doesn't */
 338                        p1_off+=24;
 339                } else { /* p2 has the object, p1 doesn't */
 340                        p2_off+=24;
 341                }
 342        }
 343        return ret;
 344}
 345
 346/* another O(n^2) function ... */
 347size_t get_pack_redundancy(struct pack_list *pl)
 348{
 349        struct pack_list *subset;
 350        size_t ret = 0;
 351        while ((subset = pl->next)) {
 352                while(subset) {
 353                        ret += sizeof_union(pl->pack, subset->pack);
 354                        subset = subset->next;
 355                }
 356                pl = pl->next;
 357        }
 358        return ret;
 359}
 360
 361inline size_t pack_set_bytecount(struct pack_list *pl)
 362{
 363        size_t ret = 0;
 364        while (pl) {
 365                ret += pl->pack->pack_size;
 366                ret += pl->pack->index_size;
 367                pl = pl->next;
 368        }
 369        return ret;
 370}
 371
 372void minimize(struct pack_list **min)
 373{
 374        struct pack_list *pl, *unique = NULL,
 375                *non_unique = NULL, *min_perm = NULL;
 376        struct pll *perm, *perm_all, *perm_ok = NULL, *new_perm;
 377        struct llist *missing, *old;
 378        size_t min_perm_size = (size_t)-1, perm_size;
 379
 380        pl = pack_list;
 381        while (pl) {
 382                if(pl->unique_objects->size)
 383                        pack_list_insert(&unique, pl);
 384                else
 385                        pack_list_insert(&non_unique, pl);
 386                pl = pl->next;
 387        }
 388        /* find out which objects are missing from the set of unique packs */
 389        missing = llist_copy(all_objects);
 390        pl = unique;
 391        while (pl) {
 392                old = missing;
 393                missing = llist_sorted_difference(missing->front,
 394                                                  pl->all_objects->front);
 395                llist_free(old);
 396                pl = pl->next;
 397        }
 398
 399        if (missing->size == 0) {
 400                *min = unique;
 401                return;
 402        }
 403
 404        /* find the permutations which contain all missing objects */
 405        perm_all = perm = get_all_permutations(non_unique);
 406        while (perm) {
 407                if (is_superset(perm->pl, missing)) {
 408                        new_perm = xmalloc(sizeof(struct pll));
 409                        new_perm->pl = perm->pl;
 410                        new_perm->next = perm_ok;
 411                        perm_ok = new_perm;
 412                }
 413                perm = perm->next;
 414        }
 415        
 416        if (perm_ok == NULL)
 417                die("Internal error: No complete sets found!\n");
 418
 419        /* find the permutation with the smallest size */
 420        perm = perm_ok;
 421        while (perm) {
 422                perm_size = pack_set_bytecount(perm->pl);
 423                if (min_perm_size > perm_size) {
 424                        min_perm_size = perm_size;
 425                        min_perm = perm->pl;
 426                }
 427                perm = perm->next;
 428        }
 429        *min = min_perm;
 430        /* add the unique packs to the list */
 431        pl = unique;
 432        while(pl) {
 433                pack_list_insert(min, pl);
 434                pl = pl->next;
 435        }
 436}
 437
 438void load_all_objects()
 439{
 440        struct pack_list *pl = pack_list;
 441        struct llist_item *hint, *l;
 442        int i;
 443
 444        llist_init(&all_objects);
 445
 446        while (pl) {
 447                i = 0;
 448                hint = NULL;
 449                l = pl->all_objects->front;
 450                while (l) {
 451                        hint = llist_insert_sorted_unique(all_objects,
 452                                                          l->sha1, hint);
 453                        l = l->next;
 454                }
 455                pl = pl->next;
 456        }
 457}
 458
 459/* this scales like O(n^2) */
 460void cmp_packs()
 461{
 462        struct pack_list *subset, *curr = pack_list;
 463
 464        while ((subset = curr)) {
 465                while((subset = subset->next))
 466                        cmp_two_packs(curr, subset);
 467                curr = curr->next;
 468        }
 469}
 470
 471struct pack_list * add_pack(struct packed_git *p)
 472{
 473        struct pack_list l;
 474        size_t off;
 475        void *base;
 476
 477        l.pack = p;
 478        llist_init(&l.all_objects);
 479
 480        off = 256 * 4 + 4;
 481        base = (void *)p->index_base;
 482        while (off <= p->index_size - 3 * 20) {
 483                llist_insert_back(l.all_objects, base + off);
 484                off+=24;
 485        }
 486        /* this list will be pruned in cmp_two_packs later */
 487        l.unique_objects = llist_copy(l.all_objects);
 488        return pack_list_insert(&pack_list, &l);
 489}
 490
 491struct pack_list * add_pack_file(char *filename)
 492{
 493        struct packed_git *p = packed_git;
 494
 495        if (strlen(filename) < 40)
 496                die("Bad pack filename: %s\n", filename);
 497
 498        while (p) {
 499                if (strstr(p->pack_name, filename))
 500                        /* this will silently ignore packs in alt-odb */
 501                        return add_pack(p);
 502                p = p->next;
 503        }
 504        die("Filename %s not found in packed_git\n", filename);
 505}
 506
 507void load_all()
 508{
 509        struct packed_git *p = packed_git;
 510
 511        while (p) {
 512                if (p->pack_local) /* ignore alt-odb for now */
 513                        add_pack(p);
 514                p = p->next;
 515        }
 516}
 517
 518int main(int argc, char **argv)
 519{
 520        int i;
 521        struct pack_list *min, *red, *pl;
 522
 523        for (i = 1; i < argc; i++) {
 524                const char *arg = argv[i];
 525                if(!strcmp(arg, "--"))
 526                        break;
 527                if(!strcmp(arg, "-a")) {
 528                        all = 1;
 529                        continue;
 530                }
 531                if(!strcmp(arg, "-v")) {
 532                        verbose = 1;
 533                        continue;
 534                }
 535                if(*arg == '-')
 536                        usage(pack_intersect_usage);
 537                else
 538                        break;
 539        }
 540
 541        prepare_packed_git();
 542
 543        if(all)
 544                load_all();
 545        else
 546                while (*(argv + i) != NULL)
 547                        add_pack_file(*(argv + i++));
 548
 549        if (pack_list == NULL)
 550                die("Zero packs found!\n");
 551
 552        cmp_packs();
 553
 554        load_all_objects();
 555
 556        minimize(&min);
 557        if (verbose) {
 558                fprintf(stderr, "The smallest (bytewise) set of packs is:\n");
 559                pl = min;
 560                while (pl) {
 561                        fprintf(stderr, "\t%s\n", pl->pack->pack_name);
 562                        pl = pl->next;
 563                }
 564                fprintf(stderr, "containing %ld duplicate objects "
 565                                "with a total size of %ldkb.\n",
 566                        get_pack_redundancy(min), pack_set_bytecount(min)/1024);
 567                fprintf(stderr, "Redundant packs (with indexes):\n");
 568        }
 569        pl = red = pack_list_difference(pack_list, min);
 570        while (pl) {
 571                printf("%s\n%s\n",
 572                       sha1_pack_index_name(pl->pack->sha1), pl->pack->pack_name);
 573                pl = pl->next;
 574        }
 575
 576        return 0;
 577}