improve diff-delta with sparse and/or repetitive data

author Nicolas Pitre <nico@cam.org>
Wed, 3 May 2006 03:31:00 +0000 (23:31 -0400)

committer Junio C Hamano <junkio@cox.net>
Wed, 3 May 2006 04:32:39 +0000 (21:32 -0700)
author: Nicolas Pitre <nico@cam.org>
Wed, 3 May 2006 03:31:00 +0000 (23:31 -0400)
committer: Junio C Hamano <junkio@cox.net>
Wed, 3 May 2006 04:32:39 +0000 (21:32 -0700)
diff --git a/diff-delta.c b/diff-delta.c

index 45df786b0a099e03bcca57f40047be04635e130d..c61887518874d550f66d2d52bd7559d023fcb176 100644 (file)
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -136,11 +136,12 @@ struct delta_index {
  
  struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
  {
  
  struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
  {
-       unsigned int i, hsize, hmask, entries, *hash_count;
+       unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
         const unsigned char *data, *buffer = buf;
         struct delta_index *index;
         struct index_entry *entry, **hash;
         void *mem;
         const unsigned char *data, *buffer = buf;
         struct delta_index *index;
         struct index_entry *entry, **hash;
         void *mem;
+       unsigned long memsize;
  
         if (!buf || !bufsize)
                 return NULL;
  
         if (!buf || !bufsize)
                 return NULL;
@@ -155,9 +156,10 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
         hmask = hsize - 1;
  
         /* allocate lookup index */
         hmask = hsize - 1;
  
         /* allocate lookup index */
-       mem = malloc(sizeof(*index) +
-                    sizeof(*hash) * hsize +
-                    sizeof(*entry) * entries);
+       memsize = sizeof(*index) +
+                 sizeof(*hash) * hsize +
+                 sizeof(*entry) * entries;
+       mem = malloc(memsize);
         if (!mem)
                 return NULL;
         index = mem;
         if (!mem)
                 return NULL;
         index = mem;
@@ -179,18 +181,26 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
         }
  
         /* then populate the index */
         }
  
         /* then populate the index */
-       data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
-       while (data >= buffer) {
+       prev_val = ~0;
+       for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
+            data >= buffer;
+            data -= RABIN_WINDOW) {
                 unsigned int val = 0;
                 for (i = 1; i <= RABIN_WINDOW; i++)
                         val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
                 unsigned int val = 0;
                 for (i = 1; i <= RABIN_WINDOW; i++)
                         val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
-               i = val & hmask;
-               entry->ptr = data + RABIN_WINDOW;
-               entry->val = val;
-               entry->next = hash[i];
-               hash[i] = entry++;
-               hash_count[i]++;
-               data -= RABIN_WINDOW;
+               if (val == prev_val) {
+                       /* keep the lowest of consecutive identical blocks */
+                       entry[-1].ptr = data + RABIN_WINDOW;
+               } else {
+                       prev_val = val;
+                       i = val & hmask;
+                       entry->ptr = data + RABIN_WINDOW;
+                       entry->val = val;
+                       entry->next = hash[i];
+                       hash[i] = entry++;
+                       hash_count[i]++;
+                       entries--;
+               }
         }
  
         /*
         }
  
         /*
@@ -220,6 +230,10 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
         }
         free(hash_count);
  
         }
         free(hash_count);
  
+       /* If we didn't use all hash entries, free the unused memory. */
+       if (entries)
+               index = realloc(index, memsize - entries * sizeof(*entry));
+
         return index;
  }
  
         return index;
  }
author	Nicolas Pitre <nico@cam.org>
author	Wed, 3 May 2006 03:31:00 +0000 (23:31 -0400)
committer	Junio C Hamano <junkio@cox.net>
committer	Wed, 3 May 2006 04:32:39 +0000 (21:32 -0700)