Skip to content

Commit

Permalink
Update.
Browse files Browse the repository at this point in the history
2004-11-08  Ulrich Drepper  <drepper@redhat.com>

	* posix/regcomp.c (utf8_sb_map): Define.
	(free_dfa_content): Don't free dfa->sb_char if it's a pointer to
	utf8_sb_map.
	(init_dfa): Use utf8_sb_map instead of initializing memory when the
	encoding is UTF-8.

	* posix/regcomp.c (init_dfa): Get the codeset name outside glibc as
	well.  Check if it is spelled UTF8 as well as UTF-8, and check
	case-insensitively.  Set dfa->map_notascii manually when outside
	glibc.
	* posix/regex_internal.c (build_wcs_upper_buffer) [!_LIBC]: Enable
	optimizations based on map_notascii.
	* posix/regex_internal.h [HAVE_LANGINFO_H || HAVE_LANGINFO_CODESET
	|| _LIBC]: Include langinfo.h.

	* posix/regex_internal.h (struct re_backref_cache_entry): Add "more"
	field.
	* posix/regexec.c (check_dst_limits): Hoist computation of the source
	and destination bkref_idx out of the loop.  Pass it to
	check_dst_limits_calc_pos.
	(check_dst_limits_calc_pos_1): New function, containing the recursive
	loop of check_dst_limits_calc_pos; uses the "more" field of
	struct re_backref_cache_entry to control the loop.
	(check_dst_limits_calc_pos): Store into "boundaries" the position
	relative to lim's start and end positions.  Do not accept eclosures,
	accept bkref_idx instead.  Call check_dst_limits_calc_pos_1 to do the
	work.
	(sift_states_bkref): Use the "more" field of
	struct re_backref_cache_entry to control the loop.  A big "if" was
	turned into a continue and the function was reindented.
	(get_subexp): Use the "more" field of struct re_backref_cache_entry
	to control the loop.
	(match_ctx_add_entry): Initialize the bkref_ents' "more" field.
	(search_cur_bkref_entry): Return -1 if out of bounds.

	* posix/regexec.c (empty_set): Remove.
	(sift_states_backward): Remove cur_src variable.  Move inner loop
	to build_sifted_states.
	(build_sifted_states): Extract from sift_states_backward.  Do not
	use empty_set.
	(update_cur_sifted_state): Do not use empty_set.  Special case
	dest_nodes->nelem == 0.
  • Loading branch information
Ulrich Drepper committed Nov 8, 2004
1 parent d2c38eb commit e40a38b
Show file tree
Hide file tree
Showing 5 changed files with 362 additions and 220 deletions.
45 changes: 45 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,5 +1,50 @@
2004-11-08 Ulrich Drepper <drepper@redhat.com>

* posix/regcomp.c (utf8_sb_map): Define.
(free_dfa_content): Don't free dfa->sb_char if it's a pointer to
utf8_sb_map.
(init_dfa): Use utf8_sb_map instead of initializing memory when the
encoding is UTF-8.

2004-11-03 Paolo Bonzini <bonzini@gnu.org>

* posix/regcomp.c (init_dfa): Get the codeset name outside glibc as
well. Check if it is spelled UTF8 as well as UTF-8, and check
case-insensitively. Set dfa->map_notascii manually when outside
glibc.
* posix/regex_internal.c (build_wcs_upper_buffer) [!_LIBC]: Enable
optimizations based on map_notascii.
* posix/regex_internal.h [HAVE_LANGINFO_H || HAVE_LANGINFO_CODESET
|| _LIBC]: Include langinfo.h.

* posix/regex_internal.h (struct re_backref_cache_entry): Add "more"
field.
* posix/regexec.c (check_dst_limits): Hoist computation of the source
and destination bkref_idx out of the loop. Pass it to
check_dst_limits_calc_pos.
(check_dst_limits_calc_pos_1): New function, containing the recursive
loop of check_dst_limits_calc_pos; uses the "more" field of
struct re_backref_cache_entry to control the loop.
(check_dst_limits_calc_pos): Store into "boundaries" the position
relative to lim's start and end positions. Do not accept eclosures,
accept bkref_idx instead. Call check_dst_limits_calc_pos_1 to do the
work.
(sift_states_bkref): Use the "more" field of
struct re_backref_cache_entry to control the loop. A big "if" was
turned into a continue and the function was reindented.
(get_subexp): Use the "more" field of struct re_backref_cache_entry
to control the loop.
(match_ctx_add_entry): Initialize the bkref_ents' "more" field.
(search_cur_bkref_entry): Return -1 if out of bounds.

* posix/regexec.c (empty_set): Remove.
(sift_states_backward): Remove cur_src variable. Move inner loop
to build_sifted_states.
(build_sifted_states): Extract from sift_states_backward. Do not
use empty_set.
(update_cur_sifted_state): Do not use empty_set. Special case
dest_nodes->nelem == 0.

* posix/regex_internal.h (struct re_backref_cache_entry): Remove flag
field.
(struct re_sift_context_t): Remove cur_bkref, cls_subexp_idx,
Expand Down
80 changes: 69 additions & 11 deletions posix/regcomp.c
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,23 @@ weak_alias (__regerror, regerror)
#endif


#ifdef RE_ENABLE_I18N
/* The initializer below is spelled out for a 32-bit unsigned int only;
   refuse to build for any other width instead of producing a wrong map.  */
# if UINT_MAX != 0xffffffff
#  error "Add case for new unsigned int size"
# endif

/* Read-only map to single-byte characters, shared whenever the encoding
   is UTF-8.  Without it, memory would be allocated each time only to be
   filled with this very same pattern; since UTF-8 is the preferred
   encoding, avoiding that work is a worthwhile optimization.  */
static const bitset utf8_sb_map =
{
  /* Four all-ones 32-bit words set the first 128 bits.  Any further
     elements of the bitset are zero-initialized implicitly.  */
  0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
};
#endif


static void
free_dfa_content (re_dfa_t *dfa)
{
Expand Down Expand Up @@ -613,7 +630,8 @@ free_dfa_content (re_dfa_t *dfa)
}
re_free (dfa->state_table);
#ifdef RE_ENABLE_I18N
re_free (dfa->sb_char);
if (dfa->sb_char != utf8_sb_map)
re_free (dfa->sb_char);
#endif
#ifdef DEBUG
re_free (dfa->re_str);
Expand Down Expand Up @@ -824,6 +842,9 @@ init_dfa (dfa, pat_len)
int pat_len;
{
int table_size;
#ifndef _LIBC
char *codeset_name;
#endif

memset (dfa, '\0', sizeof (re_dfa_t));

Expand Down Expand Up @@ -853,22 +874,59 @@ init_dfa (dfa, pat_len)
dfa->is_utf8 = 1;
dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
!= 0);
#else
# ifdef HAVE_LANGINFO_CODESET
codeset_name = nl_langinfo (CODESET);
# else
codeset_name = getenv ("LC_ALL");
if (codeset_name == NULL || codeset[0] == '\0')
codeset_name = getenv ("LC_CTYPE");
if (codeset_name == NULL || codeset[0] == '\0')
codeset_name = getenv ("LANG");
if (codeset_name == NULL)
codeset_name = "";
else if (strchr (codeset_name, '.') != NULL)
codeset_name = strchr (codeset_name, '.') + 1;
# endif

if (strcasecmp (codeset_name, "UTF-8") == 0
|| strcasecmp (codeset_name, "UTF8") == 0)
dfa->is_utf8 = 1;

/* We check exhaustively in the loop below if this charset is a
superset of ASCII. */
dfa->map_notascii = 0;
#endif

#ifdef RE_ENABLE_I18N
if (dfa->mb_cur_max > 1)
{
int i, j, ch;

dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
if (BE (dfa->sb_char == NULL, 0))
return REG_ESPACE;
if (dfa->is_utf8)
memset (dfa->sb_char, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
else
for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
for (j = 0; j < UINT_BITS; ++j, ++ch)
if (__btowc (ch) != WEOF)
dfa->sb_char[i] |= 1 << j;
{
int i, j, ch;

dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
if (BE (dfa->sb_char == NULL, 0))
return REG_ESPACE;

/* Clear all bits, then set those corresponding to single
byte chars. */
bitset_empty (dfa->sb_char);

for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
for (j = 0; j < UINT_BITS; ++j, ++ch)
{
wchar_t wch = __btowc (ch);
if (wch != WEOF)
dfa->sb_char[i] |= 1 << j;
# ifndef _LIBC
if (isascii (ch) && wch != (wchar_t) ch)
dfa->map_notascii = 1;
# endif
}
}
}
#endif

Expand Down
16 changes: 5 additions & 11 deletions posix/regex_internal.c
Original file line number Diff line number Diff line change
Expand Up @@ -293,9 +293,8 @@ build_wcs_upper_buffer (pstr)
byte_idx = pstr->valid_len;
end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;

#ifdef _LIBC
/* The following optimization assumes that the wchar_t encoding is
always ISO 10646. */
/* The following optimization assumes that ASCII characters can be
mapped to wide characters with a simple cast. */
if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
{
while (byte_idx < end_idx)
Expand All @@ -309,8 +308,7 @@ build_wcs_upper_buffer (pstr)
pstr->mbs[byte_idx]
= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
/* The next step uses the assumption that wchar_t is encoded
with ISO 10646: all ASCII values can be converted like
this. */
ASCII-safe: all ASCII values can be converted like this. */
pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
++byte_idx;
continue;
Expand Down Expand Up @@ -368,14 +366,11 @@ build_wcs_upper_buffer (pstr)
return REG_NOERROR;
}
else
#endif
for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
{
wchar_t wc;
const char *p;
#ifdef _LIBC
offsets_needed:
#endif
offsets_needed:
remain_len = end_idx - byte_idx;
prev_st = pstr->cur_state;
if (BE (pstr->trans != NULL, 0))
Expand Down Expand Up @@ -647,7 +642,6 @@ re_string_reconstruct (pstr, idx, eflags)
int wcs_idx;
wint_t wc = WEOF;

#ifdef _LIBC
if (pstr->is_utf8)
{
const unsigned char *raw, *p, *q, *end;
Expand Down Expand Up @@ -687,7 +681,7 @@ re_string_reconstruct (pstr, idx, eflags)
break;
}
}
#endif

if (wc == WEOF)
pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
if (BE (pstr->valid_len, 0))
Expand Down
6 changes: 6 additions & 0 deletions posix/regex_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
#include <stdlib.h>
#include <string.h>

#if defined HAVE_LANGINFO_H || defined HAVE_LANGINFO_CODESET || defined _LIBC
# include <langinfo.h>
#endif
#if defined HAVE_LOCALE_H || defined _LIBC
# include <locale.h>
#endif
Expand Down Expand Up @@ -545,6 +548,9 @@ struct re_backref_cache_entry
int str_idx;
int subexp_from;
int subexp_to;
/* We need only one byte from the following field. If other small
fields are added the type could be changed to 'char'. */
int more;
};

typedef struct
Expand Down
Loading

0 comments on commit e40a38b

Please sign in to comment.