Merge branch 'kb/hashmap-updates'

author Junio C Hamano <gitster@pobox.com>
Mon, 21 Jul 2014 18:18:44 +0000 (11:18 -0700)

committer Junio C Hamano <gitster@pobox.com>
Mon, 21 Jul 2014 18:18:44 +0000 (11:18 -0700)
author: Junio C Hamano <gitster@pobox.com>
Mon, 21 Jul 2014 18:18:44 +0000 (11:18 -0700)
committer: Junio C Hamano <gitster@pobox.com>
Mon, 21 Jul 2014 18:18:44 +0000 (11:18 -0700)
diff --git a/Documentation/technical/api-hashmap.txt b/Documentation/technical/api-hashmap.txt

index b977ae8bbb1f46ebb40d1d315e79262fe344f1c2..ad7a5bddd24d91ceda78d430fd86a204b21fd005 100644 (file)
--- a/Documentation/technical/api-hashmap.txt
+++ b/Documentation/technical/api-hashmap.txt
@@ -8,11 +8,19 @@ Data Structures
  
  `struct hashmap`::
  
-       The hash table structure.
+       The hash table structure. Members can be used as follows, but should
+       not be modified directly:
  +
-The `size` member keeps track of the total number of entries. The `cmpfn`
-member is a function used to compare two entries for equality. The `table` and
-`tablesize` members store the hash table and its size, respectively.
+The `size` member keeps track of the total number of entries (0 means the
+hashmap is empty).
++
+`tablesize` is the allocated size of the hash table. A non-0 value indicates
+that the hashmap is initialized. It may also be useful for statistical purposes
+(i.e. `size / tablesize` is the current load factor).
++
+`cmpfn` stores the comparison function specified in `hashmap_init()`. In
+advanced scenarios, it may be useful to change this, e.g. to switch between
+case-sensitive and case-insensitive lookup.
  
  `struct hashmap_entry`::
  
@@ -58,6 +66,15 @@ Functions
  +
  `strihash` and `memihash` are case insensitive versions.
  
+`unsigned int sha1hash(const unsigned char *sha1)`::
+
+       Converts a cryptographic hash (e.g. SHA-1) into an int-sized hash code
+       for use in hash tables. Cryptographic hashes are supposed to have
+       uniform distribution, so in contrast to `memhash()`, this just copies
+       the first `sizeof(int)` bytes without shuffling any bits. Note that
+       the results will be different on big-endian and little-endian
+       platforms, so they should not be stored or transferred over the net.
+
  `void hashmap_init(struct hashmap *map, hashmap_cmp_fn equals_function, size_t initial_size)`::
  
         Initializes a hashmap structure.
@@ -101,6 +118,20 @@ hashmap_entry) that has at least been initialized with the proper hash code
  If an entry with matching hash code is found, `key` and `keydata` are passed
  to `hashmap_cmp_fn` to decide whether the entry matches the key.
  
+`void *hashmap_get_from_hash(const struct hashmap *map, unsigned int hash, const void *keydata)`::
+
+       Returns the hashmap entry for the specified hash code and key data,
+       or NULL if not found.
++
+`map` is the hashmap structure.
++
+`hash` is the hash code of the entry to look up.
++
+If an entry with matching hash code is found, `keydata` is passed to
+`hashmap_cmp_fn` to decide whether the entry matches the key. The
+`entry_or_key` parameter points to a bogus hashmap_entry structure that
+should not be used in the comparison.
+
  `void *hashmap_get_next(const struct hashmap *map, const void *entry)`::
  
         Returns the next equal hashmap entry, or NULL if not found. This can be
@@ -162,6 +193,21 @@ more entries.
  `hashmap_iter_first` is a combination of both (i.e. initializes the iterator
  and returns the first entry, if any).
  
+`const char *strintern(const char *string)`::
+`const void *memintern(const void *data, size_t len)`::
+
+       Returns the unique, interned version of the specified string or data,
+       similar to the `String.intern` API in Java and .NET, respectively.
+       Interned strings remain valid for the entire lifetime of the process.
++
+Can be used as `[x]strdup()` or `xmemdupz` replacement, except that interned
+strings / data must not be modified or freed.
++
+Interned strings are best used for short strings with high probability of
+duplicates.
++
+Uses a hashmap to store the pool of interned strings.
+
  Usage example
  -------------
  
diff --git a/builtin/describe.c b/builtin/describe.c

index 24d740c8b1705f9c8993bd808c9a618acecc300c..ee6a3b998f2c22b1d131a180e77079117e598136 100644 (file)
--- a/builtin/describe.c
+++ b/builtin/describe.c
@@ -56,18 +56,9 @@ static int commit_name_cmp(const struct commit_name *cn1,
         return hashcmp(cn1->peeled, peeled ? peeled : cn2->peeled);
  }
  
-static inline unsigned int hash_sha1(const unsigned char *sha1)
-{
-       unsigned int hash;
-       memcpy(&hash, sha1, sizeof(hash));
-       return hash;
-}
-
  static inline struct commit_name *find_commit_name(const unsigned char *peeled)
  {
-       struct commit_name key;
-       hashmap_entry_init(&key, hash_sha1(peeled));
-       return hashmap_get(&names, &key, peeled);
+       return hashmap_get_from_hash(&names, sha1hash(peeled), peeled);
  }
  
  static int replace_name(struct commit_name *e,
@@ -114,7 +105,7 @@ static void add_to_known_names(const char *path,
                 if (!e) {
                         e = xmalloc(sizeof(struct commit_name));
                         hashcpy(e->peeled, peeled);
-                       hashmap_entry_init(e, hash_sha1(peeled));
+                       hashmap_entry_init(e, sha1hash(peeled));
                         hashmap_add(&names, e);
                         e->path = NULL;
                 }
diff --git a/decorate.c b/decorate.c

index 7cb5d29a89a43ecf7e433db513b66184df3accb8..b2aac90c26296eacdd6c2cdb68a15f3744c595e9 100644 (file)
--- a/decorate.c
+++ b/decorate.c
@@ -8,10 +8,7 @@
  
  static unsigned int hash_obj(const struct object *obj, unsigned int n)
  {
-       unsigned int hash;
-
-       memcpy(&hash, obj->sha1, sizeof(unsigned int));
-       return hash % n;
+       return sha1hash(obj->sha1) % n;
  }
  
  static void *insert_decoration(struct decoration *n, const struct object *base, void *decoration)
diff --git a/diffcore-rename.c b/diffcore-rename.c

index 749a35d2c2ab3271c6503a19271a5be6a6e97b4d..2e44a3745939bb75841730ba0cff78ea872df8d9 100644 (file)
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -242,14 +242,12 @@ struct file_similarity {
  
  static unsigned int hash_filespec(struct diff_filespec *filespec)
  {
-       unsigned int hash;
         if (!filespec->sha1_valid) {
                 if (diff_populate_filespec(filespec, 0))
                         return 0;
                 hash_sha1_file(filespec->data, filespec->size, "blob", filespec->sha1);
         }
-       memcpy(&hash, filespec->sha1, sizeof(hash));
-       return hash;
+       return sha1hash(filespec->sha1);
  }
  
  static int find_identical_files(struct hashmap *srcs,
@@ -259,15 +257,14 @@ static int find_identical_files(struct hashmap *srcs,
         int renames = 0;
  
         struct diff_filespec *target = rename_dst[dst_index].two;
-       struct file_similarity *p, *best, dst;
+       struct file_similarity *p, *best = NULL;
         int i = 100, best_score = -1;
  
         /*
          * Find the best source match for specified destination.
          */
-       best = NULL;
-       hashmap_entry_init(&dst, hash_filespec(target));
-       for (p = hashmap_get(srcs, &dst, NULL); p; p = hashmap_get_next(srcs, p)) {
+       p = hashmap_get_from_hash(srcs, hash_filespec(target), NULL);
+       for (; p; p = hashmap_get_next(srcs, p)) {
                 int score;
                 struct diff_filespec *source = p->filespec;
  
diff --git a/hashmap.c b/hashmap.c

index d1b8056d8d53c35d62599999026766a4342eb764..f693839cb4786fd5be57d878b58499273ec17f81 100644 (file)
--- a/hashmap.c
+++ b/hashmap.c
@@ -226,3 +226,41 @@ void *hashmap_iter_next(struct hashmap_iter *iter)
                 current = iter->map->table[iter->tablepos++];
         }
  }
+
+struct pool_entry {
+       struct hashmap_entry ent;
+       size_t len;
+       unsigned char data[FLEX_ARRAY];
+};
+
+static int pool_entry_cmp(const struct pool_entry *e1,
+                         const struct pool_entry *e2,
+                         const unsigned char *keydata)
+{
+       return e1->data != keydata &&
+              (e1->len != e2->len || memcmp(e1->data, keydata, e1->len));
+}
+
+const void *memintern(const void *data, size_t len)
+{
+       static struct hashmap map;
+       struct pool_entry key, *e;
+
+       /* initialize string pool hashmap */
+       if (!map.tablesize)
+               hashmap_init(&map, (hashmap_cmp_fn) pool_entry_cmp, 0);
+
+       /* lookup interned string in pool */
+       hashmap_entry_init(&key, memhash(data, len));
+       key.len = len;
+       e = hashmap_get(&map, &key, data);
+       if (!e) {
+               /* not found: create it */
+               e = xmallocz(sizeof(struct pool_entry) + len);
+               hashmap_entry_init(e, key.ent.hash);
+               e->len = len;
+               memcpy(e->data, data, len);
+               hashmap_add(&map, e);
+       }
+       return e->data;
+}
diff --git a/hashmap.h b/hashmap.h

index a816ad47b14d2d377ba0e03a3f401ed36a56efa8..ab7958ae333bcc635ba2ac8e40ee8aa6d8814ab4 100644 (file)
--- a/hashmap.h
+++ b/hashmap.h
@@ -13,6 +13,17 @@ extern unsigned int strihash(const char *buf);
  extern unsigned int memhash(const void *buf, size_t len);
  extern unsigned int memihash(const void *buf, size_t len);
  
+static inline unsigned int sha1hash(const unsigned char *sha1)
+{
+       /*
+        * Equivalent to 'return *(unsigned int *)sha1;', but safe on
+        * platforms that don't support unaligned reads.
+        */
+       unsigned int hash;
+       memcpy(&hash, sha1, sizeof(hash));
+       return hash;
+}
+
  /* data structures */
  
  struct hashmap_entry {
@@ -57,6 +68,14 @@ extern void *hashmap_put(struct hashmap *map, void *entry);
  extern void *hashmap_remove(struct hashmap *map, const void *key,
                 const void *keydata);
  
+static inline void *hashmap_get_from_hash(const struct hashmap *map,
+               unsigned int hash, const void *keydata)
+{
+       struct hashmap_entry key;
+       hashmap_entry_init(&key, hash);
+       return hashmap_get(map, &key, keydata);
+}
+
  /* hashmap_iter functions */
  
  extern void hashmap_iter_init(struct hashmap *map, struct hashmap_iter *iter);
@@ -68,4 +87,12 @@ static inline void *hashmap_iter_first(struct hashmap *map,
         return hashmap_iter_next(iter);
  }
  
+/* string interning */
+
+extern const void *memintern(const void *data, size_t len);
+static inline const char *strintern(const char *string)
+{
+       return memintern(string, strlen(string));
+}
+
  #endif
diff --git a/khash.h b/khash.h

index 57ff6038c5be0fb5aa28f1b297bf6972bebab350..06c79065490f152ceed2732ba1b93c309eb4d574 100644 (file)
--- a/khash.h
+++ b/khash.h
@@ -320,19 +320,12 @@ static const double __ac_HASH_UPPER = 0.77;
                 code;                                                                                           \
         } }
  
-static inline khint_t __kh_oid_hash(const unsigned char *oid)
-{
-       khint_t hash;
-       memcpy(&hash, oid, sizeof(hash));
-       return hash;
-}
-
  #define __kh_oid_cmp(a, b) (hashcmp(a, b) == 0)
  
-KHASH_INIT(sha1, const unsigned char *, void *, 1, __kh_oid_hash, __kh_oid_cmp)
+KHASH_INIT(sha1, const unsigned char *, void *, 1, sha1hash, __kh_oid_cmp)
  typedef kh_sha1_t khash_sha1;
  
-KHASH_INIT(sha1_pos, const unsigned char *, int, 1, __kh_oid_hash, __kh_oid_cmp)
+KHASH_INIT(sha1_pos, const unsigned char *, int, 1, sha1hash, __kh_oid_cmp)
  typedef kh_sha1_pos_t khash_sha1_pos;
  
  #endif /* __AC_KHASH_H */
diff --git a/name-hash.c b/name-hash.c

index 49fd508317ebbde5a76f6af64318920ae237c8fd..702cd0518fca67a84417de8268ed70dfa29392e4 100644 (file)
--- a/name-hash.c
+++ b/name-hash.c
@@ -213,12 +213,11 @@ struct cache_entry *index_dir_exists(struct index_state *istate, const char *nam
  struct cache_entry *index_file_exists(struct index_state *istate, const char *name, int namelen, int icase)
  {
         struct cache_entry *ce;
-       struct hashmap_entry key;
  
         lazy_init_name_hash(istate);
  
-       hashmap_entry_init(&key, memihash(name, namelen));
-       ce = hashmap_get(&istate->name_hash, &key, NULL);
+       ce = hashmap_get_from_hash(&istate->name_hash,
+                                  memihash(name, namelen), NULL);
         while (ce) {
                 if (same_name(ce, name, namelen, icase))
                         return ce;
diff --git a/object.c b/object.c

index 9c31e9a5e0e0d913044501089e09037e47894f03..91c7c867b9943f6e1b5b7fef560d50450c8adc5a 100644 (file)
--- a/object.c
+++ b/object.c
@@ -50,18 +50,7 @@ int type_from_string(const char *str)
   */
  static unsigned int hash_obj(const unsigned char *sha1, unsigned int n)
  {
-       unsigned int hash;
-
-       /*
-        * Since the sha1 is essentially random, we just take the
-        * required number of bits directly from the first
-        * sizeof(unsigned int) bytes of sha1.  First we have to copy
-        * the bytes into a properly aligned integer.  If we cared
-        * about getting consistent results across architectures, we
-        * would have to call ntohl() here, too.
-        */
-       memcpy(&hash, sha1, sizeof(unsigned int));
-       return hash & (n - 1);
+       return sha1hash(sha1) & (n - 1);
  }
  
  /*
diff --git a/pack-objects.c b/pack-objects.c

index 4f36c3204544c40ada8e6ce584deb6063b055518..9992f3ecf249c32e5be31dc047bad46a4c4ac367 100644 (file)
--- a/pack-objects.c
+++ b/pack-objects.c
@@ -7,10 +7,9 @@ static uint32_t locate_object_entry_hash(struct packing_data *pdata,
                                          const unsigned char *sha1,
                                          int *found)
  {
-       uint32_t i, hash, mask = (pdata->index_size - 1);
+       uint32_t i, mask = (pdata->index_size - 1);
  
-       memcpy(&hash, sha1, sizeof(uint32_t));
-       i = hash & mask;
+       i = sha1hash(sha1) & mask;
  
         while (pdata->index[i] > 0) {
                 uint32_t pos = pdata->index[i] - 1;
diff --git a/t/t0011-hashmap.sh b/t/t0011-hashmap.sh

index 391e2b64927d7d095df2569a4e56ea075a391d78..f97c80556fac55ae1bc11d24712be21a9dac0dcf 100755 (executable)
--- a/t/t0011-hashmap.sh
+++ b/t/t0011-hashmap.sh
@@ -237,4 +237,17 @@ test_expect_success 'grow / shrink' '
  
  '
  
+test_expect_success 'string interning' '
+
+test_hashmap "intern value1
+intern Value1
+intern value2
+intern value2
+" "value1
+Value1
+value2
+value2"
+
+'
+
  test_done
diff --git a/test-hashmap.c b/test-hashmap.c

index f5183fb9e82575c86355b690178a8c3d96175eb4..07aa7ecdeede64477b424620425826b380bf895d 100644 (file)
--- a/test-hashmap.c
+++ b/test-hashmap.c
@@ -115,9 +115,8 @@ static void perf_hashmap(unsigned int method, unsigned int rounds)
  
                 for (j = 0; j < rounds; j++) {
                         for (i = 0; i < TEST_SIZE; i++) {
-                               struct hashmap_entry key;
-                               hashmap_entry_init(&key, hashes[i]);
-                               hashmap_get(&map, &key, entries[i]->key);
+                               hashmap_get_from_hash(&map, hashes[i],
+                                                     entries[i]->key);
                         }
                 }
  
@@ -199,12 +198,8 @@ int main(int argc, char *argv[])
  
                 } else if (!strcmp("get", cmd) && l1) {
  
-                       /* setup static key */
-                       struct hashmap_entry key;
-                       hashmap_entry_init(&key, hash);
-
                         /* lookup entry in hashmap */
-                       entry = hashmap_get(&map, &key, p1);
+                       entry = hashmap_get_from_hash(&map, hash, p1);
  
                         /* print result */
                         if (!entry)
@@ -239,6 +234,20 @@ int main(int argc, char *argv[])
                         /* print table sizes */
                         printf("%u %u\n", map.tablesize, map.size);
  
+               } else if (!strcmp("intern", cmd) && l1) {
+
+                       /* test that strintern works */
+                       const char *i1 = strintern(p1);
+                       const char *i2 = strintern(p1);
+                       if (strcmp(i1, p1))
+                               printf("strintern(%s) returns %s\n", p1, i1);
+                       else if (i1 == p1)
+                               printf("strintern(%s) returns input pointer\n", p1);
+                       else if (i1 != i2)
+                               printf("strintern(%s) != strintern(%s)", i1, i2);
+                       else
+                               printf("%s\n", i1);
+
                 } else if (!strcmp("perfhashmap", cmd) && l1 && l2) {
  
                         perf_hashmap(atoi(p1), atoi(p2));
author	Junio C Hamano <gitster@pobox.com>
author	Mon, 21 Jul 2014 18:18:44 +0000 (11:18 -0700)
committer	Junio C Hamano <gitster@pobox.com>
committer	Mon, 21 Jul 2014 18:18:44 +0000 (11:18 -0700)
Documentation/technical/api-hashmap.txt		patch \| blob \| history
builtin/describe.c		patch \| blob \| history
decorate.c		patch \| blob \| history
diffcore-rename.c		patch \| blob \| history
hashmap.c		patch \| blob \| history
hashmap.h		patch \| blob \| history
khash.h		patch \| blob \| history
name-hash.c		patch \| blob \| history
object.c		patch \| blob \| history
pack-objects.c		patch \| blob \| history
t/t0011-hashmap.sh		patch \| blob \| history
test-hashmap.c		patch \| blob \| history