Added automatic index generation to fast-import.
[gitweb.git] / fast-import.c
index 416ba5c7c6bacfbb62236326ffe02769cdee0199..0d9511849993f01e5886ba2bd5a529a66e00281f 100644 (file)
@@ -8,11 +8,75 @@
 
 static int max_depth = 10;
 static unsigned long object_count;
+static unsigned long duplicate_count;
+static unsigned long packoff;
+static unsigned long overflow_count;
 static int packfd;
 static int current_depth;
 static void *lastdat;
 static unsigned long lastdatlen;
 static unsigned char lastsha1[20];
+static unsigned char packsha1[20];
+
+struct object_entry /* one object recorded for the pack index */
+{
+       struct object_entry *next; /* next entry in the same hash bucket chain */
+       unsigned long offset; /* pack offset of the object; 0 until it has been written */
+       unsigned char sha1[20]; /* object ID; compared on lookup and when sorting the index */
+};
+
+struct overflow_object_entry /* entry allocated individually once the preallocated pool is full */
+{
+       struct overflow_object_entry *next; /* next node on the overflow list */
+       struct object_entry oe; /* embedded entry; its address is what callers receive */
+};
+
+/*
+ * Object table state, private to this file.
+ *
+ * pool_start..pool_end bounds the preallocated entry pool and pool_next
+ * is the next unused slot in it.  overflow heads the list of entries
+ * allocated one at a time after the pool filled up.  table holds the
+ * hash bucket chains, indexed by the first two bytes of an object ID.
+ */
+static struct object_entry *pool_start;
+static struct object_entry *pool_next;
+static struct object_entry *pool_end;
+static struct overflow_object_entry *overflow;
+static struct object_entry *table[1 << 16];
+
+/*
+ * Obtain storage for a new entry and copy sha1 into it.
+ *
+ * The next free pool slot is used while the pool has room; after that
+ * each entry is allocated on its own, linked onto the overflow list and
+ * counted in overflow_count.  Only the sha1 field is initialised here;
+ * next and offset are left for the caller to set.
+ */
+static struct object_entry* new_object(unsigned char *sha1)
+{
+       struct overflow_object_entry *spill;
+
+       if (pool_next != pool_end) {
+               struct object_entry *slot = pool_next;
+
+               pool_next++;
+               memcpy(slot->sha1, sha1, sizeof(slot->sha1));
+               return slot;
+       }
+
+       /* Pool exhausted: fall back to a separately allocated node. */
+       spill = xmalloc(sizeof(*spill));
+       memcpy(spill->oe.sha1, sha1, sizeof(spill->oe.sha1));
+       spill->next = overflow;
+       overflow = spill;
+       overflow_count++;
+       return &spill->oe;
+}
+
+/*
+ * Return the table entry for sha1, creating it when it is not present.
+ *
+ * The bucket is selected by the first two bytes of the ID.  An existing
+ * entry is returned untouched.  A new entry is appended to the end of
+ * its bucket chain with offset set to 0, which lets the caller tell a
+ * fresh entry from one that already has a pack offset.
+ */
+static struct object_entry* insert_object(unsigned char *sha1)
+{
+       unsigned int bucket = sha1[0] << 8 | sha1[1];
+       struct object_entry **link = &table[bucket];
+       struct object_entry *fresh;
+
+       /* Walk the chain by link so the tail can be patched in place. */
+       for (; *link; link = &(*link)->next) {
+               if (!memcmp(sha1, (*link)->sha1, sizeof((*link)->sha1)))
+                       return *link;
+       }
+
+       fresh = new_object(sha1);
+       fresh->next = 0;
+       fresh->offset = 0;
+       *link = fresh;
+       return fresh;
+}
 
 static ssize_t yread(int fd, void *buffer, size_t length)
 {
@@ -66,7 +130,7 @@ static unsigned long encode_header(enum object_type type, unsigned long size, un
        return n;
 }
 
-static void write_blob (void *dat, unsigned long datlen)
+static void write_blob(void *dat, unsigned long datlen)
 {
        z_stream s;
        void *out, *delta;
@@ -92,6 +156,7 @@ static void write_blob (void *dat, unsigned long datlen)
                        die("Can't write object header: %s", strerror(errno));
                if (ywrite(packfd, lastsha1, sizeof(lastsha1)) != sizeof(lastsha1))
                        die("Can't write object base: %s", strerror(errno));
+               packoff += hdrlen + sizeof(lastsha1);
        } else {
                current_depth = 0;
                s.next_in = dat;
@@ -99,6 +164,7 @@ static void write_blob (void *dat, unsigned long datlen)
                hdrlen = encode_header(OBJ_BLOB, datlen, hdr);
                if (ywrite(packfd, hdr, hdrlen) != hdrlen)
                        die("Can't write object header: %s", strerror(errno));
+               packoff += hdrlen;
        }
 
        s.avail_out = deflateBound(&s, s.avail_in);
@@ -109,13 +175,14 @@ static void write_blob (void *dat, unsigned long datlen)
 
        if (ywrite(packfd, out, s.total_out) != s.total_out)
                die("Failed writing compressed data %s", strerror(errno));
+       packoff += s.total_out;
 
        free(out);
        if (delta)
                free(delta);
 }
 
-static void init_pack_header ()
+static void init_pack_header()
 {
        const char* magic = "PACK";
        unsigned long version = 2;
@@ -129,13 +196,13 @@ static void init_pack_header ()
                die("Can't write pack version: %s", strerror(errno));
        if (ywrite(packfd, &zero, 4) != 4)
                die("Can't write 0 object count: %s", strerror(errno));
+       packoff = 4 * 3;
 }
 
-static void fixup_header_footer ()
+static void fixup_header_footer()
 {
        SHA_CTX c;
        char hdr[8];
-       unsigned char sha1[20];
        unsigned long cnt;
        char *buf;
        size_t n;
@@ -148,7 +215,6 @@ static void fixup_header_footer ()
                die("Failed reading header: %s", strerror(errno));
        SHA1_Update(&c, hdr, 8);
 
-fprintf(stderr, "%lu objects\n", object_count);
        cnt = htonl(object_count);
        SHA1_Update(&c, &cnt, 4);
        if (ywrite(packfd, &cnt, 4) != 4)
@@ -163,16 +229,81 @@ fprintf(stderr, "%lu objects\n", object_count);
        }
        free(buf);
 
-       SHA1_Final(sha1, &c);
-       if (ywrite(packfd, sha1, sizeof(sha1)) != sizeof(sha1))
+       SHA1_Final(packsha1, &c);
+       if (ywrite(packfd, packsha1, sizeof(packsha1)) != sizeof(packsha1))
                die("Failed writing pack checksum: %s", strerror(errno));
 }
 
-int main (int argc, const char **argv)
+/*
+ * qsort() comparator for an array of object_entry pointers: orders the
+ * entries by the bytes of their sha1.
+ */
+static int oecmp (const void *_a, const void *_b)
 {
-       packfd = open(argv[1], O_RDWR|O_CREAT|O_TRUNC, 0666);
+       struct object_entry *const *lhs = _a;
+       struct object_entry *const *rhs = _b;
+
+       return memcmp((*lhs)->sha1, (*rhs)->sha1, sizeof((*lhs)->sha1));
+}
+
+/*
+ * Write the index for the objects recorded in the pool and on the
+ * overflow list to idx_name.
+ *
+ * Written in order: the 256-entry fan-out array, one record per object
+ * (4-byte offset followed by the sha1) sorted by sha1, and finally
+ * packsha1.  Fan-out counts and offsets are passed through htonl()
+ * before being written.
+ */
+static void write_index(const char *idx_name)
+{
+       struct object_entry **sorted, **fill, **pos, **end;
+       struct object_entry *pooled;
+       struct overflow_object_entry *spilled;
+       struct sha1file *out;
+       unsigned int fanout[256];
+       int first;
+
+       /* Gather every entry (pool first, then overflow) and sort by ID. */
+       sorted = xmalloc(object_count * sizeof(*sorted));
+       fill = sorted;
+       for (pooled = pool_start; pooled != pool_next; pooled++)
+               *fill++ = pooled;
+       for (spilled = overflow; spilled; spilled = spilled->next)
+               *fill++ = &spilled->oe;
+       end = sorted + object_count;
+       qsort(sorted, object_count, sizeof(*sorted), oecmp);
+
+       /* fanout[b] = number of entries whose first ID byte is <= b. */
+       pos = sorted;
+       for (first = 0; first < 256; first++) {
+               while (pos < end && (*pos)->sha1[0] == first)
+                       pos++;
+               fanout[first] = htonl(pos - sorted);
+       }
+
+       out = sha1create("%s", idx_name);
+       sha1write(out, fanout, sizeof(fanout));
+       for (pos = sorted; pos != end; pos++) {
+               unsigned int offset = htonl((*pos)->offset);
+
+               sha1write(out, &offset, 4);
+               sha1write(out, (*pos)->sha1, sizeof((*pos)->sha1));
+       }
+       sha1write(out, packsha1, sizeof(packsha1));
+       sha1close(out, NULL, 1);
+       free(sorted);
+}
+
+int main(int argc, const char **argv)
+{
+       const char *base_name = argv[1];
+       int est_obj_cnt = atoi(argv[2]);
+       char *pack_name;
+       char *idx_name;
+
+       pack_name = xmalloc(strlen(base_name) + 6);
+       sprintf(pack_name, "%s.pack", base_name);
+       idx_name = xmalloc(strlen(base_name) + 5);
+       sprintf(idx_name, "%s.idx", base_name);
+
+       packfd = open(pack_name, O_RDWR|O_CREAT|O_TRUNC, 0666);
        if (packfd < 0)
-               die("Can't create pack file %s: %s", argv[1], strerror(errno));
+               die("Can't create pack file %s: %s", pack_name, strerror(errno));
+
+       pool_start = xmalloc(est_obj_cnt * sizeof(struct object_entry));
+       pool_next = pool_start;
+       pool_end = pool_start + est_obj_cnt;
 
        init_pack_header();
        for (;;) {
@@ -182,8 +313,10 @@ int main (int argc, const char **argv)
                char hdr[128];
                unsigned char sha1[20];
                SHA_CTX c;
+               struct object_entry *e;
 
                if (yread(0, &datlen, 4) != 4)
+
                        break;
 
                dat = xmalloc(datlen);
@@ -196,19 +329,30 @@ int main (int argc, const char **argv)
                SHA1_Update(&c, dat, datlen);
                SHA1_Final(sha1, &c);
 
-               write_blob(dat, datlen);
-               object_count++;
-               printf("%s\n", sha1_to_hex(sha1));
-               fflush(stdout);
+               e = insert_object(sha1);
+               if (!e->offset) {
+                       e->offset = packoff;
+                       write_blob(dat, datlen);
+                       object_count++;
+                       printf("%s\n", sha1_to_hex(sha1));
+                       fflush(stdout);
 
-               if (lastdat)
-                       free(lastdat);
-               lastdat = dat;
-               lastdatlen = datlen;
-               memcpy(lastsha1, sha1, sizeof(sha1));
+                       if (lastdat)
+                               free(lastdat);
+                       lastdat = dat;
+                       lastdatlen = datlen;
+                       memcpy(lastsha1, sha1, sizeof(sha1));
+               } else {
+                       duplicate_count++;
+                       free(dat);
+               }
        }
        fixup_header_footer();
        close(packfd);
+       write_index(idx_name);
+
+       fprintf(stderr, "%lu objects, %lu duplicates, %lu pool overflow\n",
+               object_count, duplicate_count, overflow_count);
 
        return 0;
 }