Skip to content

Commit

Permalink
unicode: cache the normalization tables in struct unicode_map
Browse files Browse the repository at this point in the history
Instead of repeatedly looking up the version, add pointers to the
NFD and NFD+CF tables to struct unicode_map, and pass a
unicode_map plus index to the functions using the normalization
tables.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
  • Loading branch information
Christoph Hellwig authored and Gabriel Krisman Bertazi committed Oct 11, 2021
1 parent fbc59d6 commit 6ca99ce
Show file tree
Hide file tree
Showing 5 changed files with 97 additions and 94 deletions.
37 changes: 19 additions & 18 deletions fs/unicode/utf8-core.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,13 @@
#include <linux/slab.h>
#include <linux/parser.h>
#include <linux/errno.h>
#include <linux/unicode.h>
#include <linux/stringhash.h>

#include "utf8n.h"

int utf8_validate(const struct unicode_map *um, const struct qstr *str)
{
const struct utf8data *data = utf8nfdi(um->version);

if (utf8nlen(data, str->name, str->len) < 0)
if (utf8nlen(um, UTF8_NFDI, str->name, str->len) < 0)
return -1;
return 0;
}
Expand All @@ -23,14 +20,13 @@ EXPORT_SYMBOL(utf8_validate);
int utf8_strncmp(const struct unicode_map *um,
const struct qstr *s1, const struct qstr *s2)
{
const struct utf8data *data = utf8nfdi(um->version);
struct utf8cursor cur1, cur2;
int c1, c2;

if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
if (utf8ncursor(&cur1, um, UTF8_NFDI, s1->name, s1->len) < 0)
return -EINVAL;

if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
if (utf8ncursor(&cur2, um, UTF8_NFDI, s2->name, s2->len) < 0)
return -EINVAL;

do {
Expand All @@ -50,14 +46,13 @@ EXPORT_SYMBOL(utf8_strncmp);
int utf8_strncasecmp(const struct unicode_map *um,
const struct qstr *s1, const struct qstr *s2)
{
const struct utf8data *data = utf8nfdicf(um->version);
struct utf8cursor cur1, cur2;
int c1, c2;

if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
if (utf8ncursor(&cur1, um, UTF8_NFDICF, s1->name, s1->len) < 0)
return -EINVAL;

if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
if (utf8ncursor(&cur2, um, UTF8_NFDICF, s2->name, s2->len) < 0)
return -EINVAL;

do {
Expand All @@ -81,12 +76,11 @@ int utf8_strncasecmp_folded(const struct unicode_map *um,
const struct qstr *cf,
const struct qstr *s1)
{
const struct utf8data *data = utf8nfdicf(um->version);
struct utf8cursor cur1;
int c1, c2;
int i = 0;

if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
if (utf8ncursor(&cur1, um, UTF8_NFDICF, s1->name, s1->len) < 0)
return -EINVAL;

do {
Expand All @@ -105,11 +99,10 @@ EXPORT_SYMBOL(utf8_strncasecmp_folded);
int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
unsigned char *dest, size_t dlen)
{
const struct utf8data *data = utf8nfdicf(um->version);
struct utf8cursor cur;
size_t nlen = 0;

if (utf8ncursor(&cur, data, str->name, str->len) < 0)
if (utf8ncursor(&cur, um, UTF8_NFDICF, str->name, str->len) < 0)
return -EINVAL;

for (nlen = 0; nlen < dlen; nlen++) {
Expand All @@ -128,12 +121,11 @@ EXPORT_SYMBOL(utf8_casefold);
int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
struct qstr *str)
{
const struct utf8data *data = utf8nfdicf(um->version);
struct utf8cursor cur;
int c;
unsigned long hash = init_name_hash(salt);

if (utf8ncursor(&cur, data, str->name, str->len) < 0)
if (utf8ncursor(&cur, um, UTF8_NFDICF, str->name, str->len) < 0)
return -EINVAL;

while ((c = utf8byte(&cur))) {
Expand All @@ -149,11 +141,10 @@ EXPORT_SYMBOL(utf8_casefold_hash);
int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
unsigned char *dest, size_t dlen)
{
const struct utf8data *data = utf8nfdi(um->version);
struct utf8cursor cur;
ssize_t nlen = 0;

if (utf8ncursor(&cur, data, str->name, str->len) < 0)
if (utf8ncursor(&cur, um, UTF8_NFDI, str->name, str->len) < 0)
return -EINVAL;

for (nlen = 0; nlen < dlen; nlen++) {
Expand All @@ -180,7 +171,17 @@ struct unicode_map *utf8_load(unsigned int version)
if (!um)
return ERR_PTR(-ENOMEM);
um->version = version;
um->ntab[UTF8_NFDI] = utf8nfdi(version);
if (!um->ntab[UTF8_NFDI])
goto out_free_um;
um->ntab[UTF8_NFDICF] = utf8nfdicf(version);
if (!um->ntab[UTF8_NFDICF])
goto out_free_um;
return um;

out_free_um:
kfree(um);
return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL(utf8_load);

Expand Down
45 changes: 20 additions & 25 deletions fs/unicode/utf8-norm.c
Original file line number Diff line number Diff line change
Expand Up @@ -309,21 +309,19 @@ utf8hangul(const char *str, unsigned char *hangul)
* is well-formed and corresponds to a known unicode code point. The
* shorthand for this will be "is valid UTF-8 unicode".
*/
static utf8leaf_t *utf8nlookup(const struct utf8data *data,
unsigned char *hangul, const char *s, size_t len)
static utf8leaf_t *utf8nlookup(const struct unicode_map *um,
enum utf8_normalization n, unsigned char *hangul, const char *s,
size_t len)
{
utf8trie_t *trie = NULL;
utf8trie_t *trie = utf8data + um->ntab[n]->offset;
int offlen;
int offset;
int mask;
int node;

if (!data)
return NULL;
if (len == 0)
return NULL;

trie = utf8data + data->offset;
node = 1;
while (node) {
offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
Expand Down Expand Up @@ -385,29 +383,28 @@ static utf8leaf_t *utf8nlookup(const struct utf8data *data,
*
* Forwards to utf8nlookup().
*/
static utf8leaf_t *utf8lookup(const struct utf8data *data,
unsigned char *hangul, const char *s)
static utf8leaf_t *utf8lookup(const struct unicode_map *um,
enum utf8_normalization n, unsigned char *hangul, const char *s)
{
return utf8nlookup(data, hangul, s, (size_t)-1);
return utf8nlookup(um, n, hangul, s, (size_t)-1);
}

/*
* Length of the normalization of s, touching at most len bytes.
* Return -1 if s is not valid UTF-8 unicode.
*/
ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
ssize_t utf8nlen(const struct unicode_map *um, enum utf8_normalization n,
const char *s, size_t len)
{
utf8leaf_t *leaf;
size_t ret = 0;
unsigned char hangul[UTF8HANGULLEAF];

if (!data)
return -1;
while (len && *s) {
leaf = utf8nlookup(data, hangul, s, len);
leaf = utf8nlookup(um, n, hangul, s, len);
if (!leaf)
return -1;
if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
if (utf8agetab[LEAF_GEN(leaf)] > um->ntab[n]->maxage)
ret += utf8clen(s);
else if (LEAF_CCC(leaf) == DECOMPOSE)
ret += strlen(LEAF_STR(leaf));
Expand All @@ -430,14 +427,13 @@ EXPORT_SYMBOL(utf8nlen);
*
* Returns -1 on error, 0 on success.
*/
int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
const char *s, size_t len)
int utf8ncursor(struct utf8cursor *u8c, const struct unicode_map *um,
enum utf8_normalization n, const char *s, size_t len)
{
if (!data)
return -1;
if (!s)
return -1;
u8c->data = data;
u8c->um = um;
u8c->n = n;
u8c->s = s;
u8c->p = NULL;
u8c->ss = NULL;
Expand Down Expand Up @@ -512,9 +508,9 @@ int utf8byte(struct utf8cursor *u8c)

/* Look up the data for the current character. */
if (u8c->p) {
leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s);
} else {
leaf = utf8nlookup(u8c->data, u8c->hangul,
leaf = utf8nlookup(u8c->um, u8c->n, u8c->hangul,
u8c->s, u8c->len);
}

Expand All @@ -524,7 +520,8 @@ int utf8byte(struct utf8cursor *u8c)

ccc = LEAF_CCC(leaf);
/* Characters that are too new have CCC 0. */
if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
if (utf8agetab[LEAF_GEN(leaf)] >
u8c->um->ntab[u8c->n]->maxage) {
ccc = STOPPER;
} else if (ccc == DECOMPOSE) {
u8c->len -= utf8clen(u8c->s);
Expand All @@ -538,7 +535,7 @@ int utf8byte(struct utf8cursor *u8c)
goto ccc_mismatch;
}

leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
leaf = utf8lookup(u8c->um, u8c->n, u8c->hangul, u8c->s);
if (!leaf)
return -1;
ccc = LEAF_CCC(leaf);
Expand Down Expand Up @@ -611,7 +608,6 @@ const struct utf8data *utf8nfdi(unsigned int maxage)
return NULL;
return &utf8nfdidata[i];
}
EXPORT_SYMBOL(utf8nfdi);

const struct utf8data *utf8nfdicf(unsigned int maxage)
{
Expand All @@ -623,4 +619,3 @@ const struct utf8data *utf8nfdicf(unsigned int maxage)
return NULL;
return &utf8nfdicfdata[i];
}
EXPORT_SYMBOL(utf8nfdicf);
Loading

0 comments on commit 6ca99ce

Please sign in to comment.