Skip to content

Commit

Permalink
hashmap: add string interning API
Browse files Browse the repository at this point in the history
Interning short strings with high probability of duplicates can reduce the
memory footprint and speed up comparisons.

Add strintern() and memintern() APIs that use a hashmap to manage the pool
of unique, interned strings.

Note: strintern(getenv()) could be used to sanitize git's use of getenv(),
in case we ever encounter a platform where a call to getenv() invalidates
previous getenv() results (which is allowed by POSIX).

Signed-off-by: Karsten Blees <blees@dcon.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
  • Loading branch information
Karsten Blees authored and Junio C Hamano committed Jul 7, 2014
1 parent ab73a9d commit 7b64d42
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 0 deletions.
15 changes: 15 additions & 0 deletions Documentation/technical/api-hashmap.txt
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,21 @@ more entries.
`hashmap_iter_first` is a combination of both (i.e. initializes the iterator
and returns the first entry, if any).

`const char *strintern(const char *string)`::
`const void *memintern(const void *data, size_t len)`::

Returns the unique, interned version of the specified string or data,
similar to the `String.intern` API in Java and .NET.
Interned strings remain valid for the entire lifetime of the process.
+
Can be used as a replacement for `[x]strdup()` or `xmemdupz()`, except that
interned strings / data must not be modified or freed.
+
Interned strings are best used for short strings with high probability of
duplicates.
+
Uses a hashmap to store the pool of interned strings.

Usage example
-------------

Expand Down
38 changes: 38 additions & 0 deletions hashmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -226,3 +226,41 @@ void *hashmap_iter_next(struct hashmap_iter *iter)
current = iter->map->table[iter->tablepos++];
}
}

/*
 * One element of the pool of interned data: the hashmap entry, the length
 * of the payload, and the payload bytes in a trailing flexible array.
 */
struct pool_entry {
/* hashmap bookkeeping; memintern() reads ent.hash when creating an entry */
struct hashmap_entry ent;
/* number of payload bytes held in data[] */
size_t len;
/* the interned bytes; storage is allocated together with the entry */
unsigned char data[FLEX_ARRAY];
};

/*
 * Comparison callback for the intern pool.
 *
 * Returns 0 when e1 holds the same bytes as keydata, 1 otherwise. Only the
 * length is taken from e2; the bytes to compare against come from keydata.
 *
 * NOTE(review): presumably the hashmap passes the stored entry as e1 and the
 * lookup key as e2 -- confirm against the hashmap_cmp_fn contract.
 */
static int pool_entry_cmp(const struct pool_entry *e1,
			  const struct pool_entry *e2,
			  const unsigned char *keydata)
{
	/* same address: identical by definition, no need to look at bytes */
	if (e1->data == keydata)
		return 0;

	/* different lengths can never match */
	if (e1->len != e2->len)
		return 1;

	return memcmp(e1->data, keydata, e1->len) != 0;
}

const void *memintern(const void *data, size_t len)
{
static struct hashmap map;
struct pool_entry key, *e;

/* initialize string pool hashmap */
if (!map.tablesize)
hashmap_init(&map, (hashmap_cmp_fn) pool_entry_cmp, 0);

/* lookup interned string in pool */
hashmap_entry_init(&key, memhash(data, len));
key.len = len;
e = hashmap_get(&map, &key, data);
if (!e) {
/* not found: create it */
e = xmallocz(sizeof(struct pool_entry) + len);
hashmap_entry_init(e, key.ent.hash);
e->len = len;
memcpy(e->data, data, len);
hashmap_add(&map, e);
}
return e->data;
}
8 changes: 8 additions & 0 deletions hashmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,4 +87,12 @@ static inline void *hashmap_iter_first(struct hashmap *map,
return hashmap_iter_next(iter);
}

/* string interning */

extern const void *memintern(const void *data, size_t len);
/*
 * Intern a NUL-terminated string: forwards the string and its strlen() to
 * memintern() and returns the result as a const char pointer.
 */
static inline const char *strintern(const char *string)
{
	size_t length = strlen(string);

	return memintern(string, length);
}

#endif
13 changes: 13 additions & 0 deletions t/t0011-hashmap.sh
Original file line number Diff line number Diff line change
Expand Up @@ -237,4 +237,17 @@ test_expect_success 'grow / shrink' '
'

# Feed four "intern" commands to test_hashmap and expect each argument to be
# printed back unchanged: "value1" and "Value1" differ only in case, and
# "value2" is interned twice in a row.
test_expect_success 'string interning' '
test_hashmap "intern value1
intern Value1
intern value2
intern value2
" "value1
Value1
value2
value2"
'

test_done
14 changes: 14 additions & 0 deletions test-hashmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,20 @@ int main(int argc, char *argv[])
/* print table sizes */
printf("%u %u\n", map.tablesize, map.size);

} else if (!strcmp("intern", cmd) && l1) {

	/*
	 * "intern <string>": intern the argument twice. On success the
	 * interned string is printed on a line of its own; otherwise a
	 * diagnostic says whether the contents differ from the input, the
	 * input pointer itself was returned, or the two calls disagreed.
	 */
	const char *i1 = strintern(p1);
	const char *i2 = strintern(p1);
	if (strcmp(i1, p1))
		printf("strintern(%s) returns %s\n", p1, i1);
	else if (i1 == p1)
		printf("strintern(%s) returns input pointer\n", p1);
	else if (i1 != i2)
		/* newline added so this message ends its line like the others */
		printf("strintern(%s) != strintern(%s)\n", i1, i2);
	else
		printf("%s\n", i1);

} else if (!strcmp("perfhashmap", cmd) && l1 && l2) {

perf_hashmap(atoi(p1), atoi(p2));
Expand Down

0 comments on commit 7b64d42

Please sign in to comment.