Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
hashmap: add string interning API
Interning short strings with high probability of duplicates can reduce the
memory footprint and speed up comparisons.

Add strintern() and memintern() APIs that use a hashmap to manage the pool
of unique, interned strings.

Note: strintern(getenv()) could be used to sanitize git's use of getenv(),
in case we ever encounter a platform where a call to getenv() invalidates
previous getenv() results (which is allowed by POSIX).

Signed-off-by: Karsten Blees <blees@dcon.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
  • Loading branch information
Karsten Blees authored and Junio C Hamano committed Jul 7, 2014
1 parent ab73a9d commit 7b64d42
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 0 deletions.
15 changes: 15 additions & 0 deletions Documentation/technical/api-hashmap.txt
Expand Up @@ -193,6 +193,21 @@ more entries.
`hashmap_iter_first` is a combination of both (i.e. initializes the iterator
and returns the first entry, if any).

`const char *strintern(const char *string)`::
`const void *memintern(const void *data, size_t len)`::

Returns the unique, interned version of the specified string or data,
similar to the `String.intern` API in Java and .NET, respectively.
Interned strings remain valid for the entire lifetime of the process.
+
Can be used as `[x]strdup()` or `xmemdupz` replacement, except that interned
strings / data must not be modified or freed.
+
Interned strings are best used for short strings with high probability of
duplicates.
+
Uses a hashmap to store the pool of interned strings.

Usage example
-------------

Expand Down
38 changes: 38 additions & 0 deletions hashmap.c
Expand Up @@ -226,3 +226,41 @@ void *hashmap_iter_next(struct hashmap_iter *iter)
current = iter->map->table[iter->tablepos++];
}
}

struct pool_entry {
struct hashmap_entry ent;
size_t len;
unsigned char data[FLEX_ARRAY];
};

static int pool_entry_cmp(const struct pool_entry *e1,
const struct pool_entry *e2,
const unsigned char *keydata)
{
return e1->data != keydata &&
(e1->len != e2->len || memcmp(e1->data, keydata, e1->len));
}

const void *memintern(const void *data, size_t len)
{
static struct hashmap map;
struct pool_entry key, *e;

/* initialize string pool hashmap */
if (!map.tablesize)
hashmap_init(&map, (hashmap_cmp_fn) pool_entry_cmp, 0);

/* lookup interned string in pool */
hashmap_entry_init(&key, memhash(data, len));
key.len = len;
e = hashmap_get(&map, &key, data);
if (!e) {
/* not found: create it */
e = xmallocz(sizeof(struct pool_entry) + len);
hashmap_entry_init(e, key.ent.hash);
e->len = len;
memcpy(e->data, data, len);
hashmap_add(&map, e);
}
return e->data;
}
8 changes: 8 additions & 0 deletions hashmap.h
Expand Up @@ -87,4 +87,12 @@ static inline void *hashmap_iter_first(struct hashmap *map,
return hashmap_iter_next(iter);
}

/* string interning */

extern const void *memintern(const void *data, size_t len);
static inline const char *strintern(const char *string)
{
return memintern(string, strlen(string));
}

#endif
13 changes: 13 additions & 0 deletions t/t0011-hashmap.sh
Expand Up @@ -237,4 +237,17 @@ test_expect_success 'grow / shrink' '
'

test_expect_success 'string interning' '
test_hashmap "intern value1
intern Value1
intern value2
intern value2
" "value1
Value1
value2
value2"
'

test_done
14 changes: 14 additions & 0 deletions test-hashmap.c
Expand Up @@ -234,6 +234,20 @@ int main(int argc, char *argv[])
/* print table sizes */
printf("%u %u\n", map.tablesize, map.size);

} else if (!strcmp("intern", cmd) && l1) {

/* test that strintern works */
const char *i1 = strintern(p1);
const char *i2 = strintern(p1);
if (strcmp(i1, p1))
printf("strintern(%s) returns %s\n", p1, i1);
else if (i1 == p1)
printf("strintern(%s) returns input pointer\n", p1);
else if (i1 != i2)
printf("strintern(%s) != strintern(%s)", i1, i2);
else
printf("%s\n", i1);

} else if (!strcmp("perfhashmap", cmd) && l1 && l2) {

perf_hashmap(atoi(p1), atoi(p2));
Expand Down

0 comments on commit 7b64d42

Please sign in to comment.