hashmap: add string interning API
authorKarsten Blees <karsten.blees@gmail.com>
Wed, 2 Jul 2014 22:22:54 +0000 (00:22 +0200)
committerJunio C Hamano <gitster@pobox.com>
Mon, 7 Jul 2014 20:56:38 +0000 (13:56 -0700)
Interning short strings with high probability of duplicates can reduce the
memory footprint and speed up comparisons.

Add strintern() and memintern() APIs that use a hashmap to manage the pool
of unique, interned strings.

Note: strintern(getenv()) could be used to sanitize git's use of getenv(),
in case we ever encounter a platform where a call to getenv() invalidates
previous getenv() results (which is allowed by POSIX).

Signed-off-by: Karsten Blees <blees@dcon.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Documentation/technical/api-hashmap.txt
hashmap.c
hashmap.h
t/t0011-hashmap.sh
test-hashmap.c
index 8ed92a9f850357c0915d8215964af141bbf46fbf..ad7a5bddd24d91ceda78d430fd86a204b21fd005 100644 (file)
@@ -193,6 +193,21 @@ more entries.
 `hashmap_iter_first` is a combination of both (i.e. initializes the iterator
 and returns the first entry, if any).
 
+`const char *strintern(const char *string)`::
+`const void *memintern(const void *data, size_t len)`::
+
+       Returns the unique, interned version of the specified string or data,
+       similar to the `String.intern` API in Java and .NET, respectively.
+       Interned strings remain valid for the entire lifetime of the process.
++
+Can be used as `[x]strdup()` or `xmemdupz` replacement, except that interned
+strings / data must not be modified or freed.
++
+Interned strings are best used for short strings with high probability of
+duplicates.
++
+Uses a hashmap to store the pool of interned strings.
+
 Usage example
 -------------
 
index d1b8056d8d53c35d62599999026766a4342eb764..f693839cb4786fd5be57d878b58499273ec17f81 100644 (file)
--- a/hashmap.c
+++ b/hashmap.c
@@ -226,3 +226,41 @@ void *hashmap_iter_next(struct hashmap_iter *iter)
                current = iter->map->table[iter->tablepos++];
        }
 }
+
+struct pool_entry {
+       struct hashmap_entry ent;
+       size_t len;
+       unsigned char data[FLEX_ARRAY];
+};
+
+static int pool_entry_cmp(const struct pool_entry *e1,
+                         const struct pool_entry *e2,
+                         const unsigned char *keydata)
+{
+       return e1->data != keydata &&
+              (e1->len != e2->len || memcmp(e1->data, keydata, e1->len));
+}
+
+const void *memintern(const void *data, size_t len)
+{
+       static struct hashmap map;
+       struct pool_entry key, *e;
+
+       /* initialize string pool hashmap */
+       if (!map.tablesize)
+               hashmap_init(&map, (hashmap_cmp_fn) pool_entry_cmp, 0);
+
+       /* lookup interned string in pool */
+       hashmap_entry_init(&key, memhash(data, len));
+       key.len = len;
+       e = hashmap_get(&map, &key, data);
+       if (!e) {
+               /* not found: create it */
+               e = xmallocz(sizeof(struct pool_entry) + len);
+               hashmap_entry_init(e, key.ent.hash);
+               e->len = len;
+               memcpy(e->data, data, len);
+               hashmap_add(&map, e);
+       }
+       return e->data;
+}
index a8b9e3dc8c9a49869191e33cfa9e974b6f11e3d1..ab7958ae333bcc635ba2ac8e40ee8aa6d8814ab4 100644 (file)
--- a/hashmap.h
+++ b/hashmap.h
@@ -87,4 +87,12 @@ static inline void *hashmap_iter_first(struct hashmap *map,
        return hashmap_iter_next(iter);
 }
 
+/* string interning */
+
+extern const void *memintern(const void *data, size_t len);
+static inline const char *strintern(const char *string)
+{
+       return memintern(string, strlen(string));
+}
+
 #endif
index 391e2b64927d7d095df2569a4e56ea075a391d78..f97c80556fac55ae1bc11d24712be21a9dac0dcf 100755 (executable)
@@ -237,4 +237,17 @@ test_expect_success 'grow / shrink' '
 
 '
 
+test_expect_success 'string interning' '
+
+test_hashmap "intern value1
+intern Value1
+intern value2
+intern value2
+" "value1
+Value1
+value2
+value2"
+
+'
+
 test_done
index 3c9f67bcb275a0378da95bdf020dfac89ebdecb0..07aa7ecdeede64477b424620425826b380bf895d 100644 (file)
@@ -234,6 +234,20 @@ int main(int argc, char *argv[])
                        /* print table sizes */
                        printf("%u %u\n", map.tablesize, map.size);
 
+               } else if (!strcmp("intern", cmd) && l1) {
+
+                       /* test that strintern works */
+                       const char *i1 = strintern(p1);
+                       const char *i2 = strintern(p1);
+                       if (strcmp(i1, p1))
+                               printf("strintern(%s) returns %s\n", p1, i1);
+                       else if (i1 == p1)
+                               printf("strintern(%s) returns input pointer\n", p1);
+                       else if (i1 != i2)
+                               printf("strintern(%s) != strintern(%s)", i1, i2);
+                       else
+                               printf("%s\n", i1);
+
                } else if (!strcmp("perfhashmap", cmd) && l1 && l2) {
 
                        perf_hashmap(atoi(p1), atoi(p2));