Update.

2004-11-08 Ulrich Drepper <drepper@redhat.com> * posix/regcomp.c (utf8_sb_map): Define. (free_dfa_content): Don't free dfa->sb_char if it's a pointer to utf8_sb_map. (init_dfa): Use utf8_sb_map instead of initializing memory when the encoding is UTF-8. * posix/regcomp.c (init_dfa): Get the codeset name outside glibc as well. Check if it is spelled UTF8 as well as UTF-8, and check case-insensitively. Set dfa->map_notascii manually when outside glibc. * posix/regex_internal.c (build_wcs_upper_buffer) [!_LIBC]: Enable optimizations based on map_notascii. * posix/regex_internal.h [HAVE_LANGINFO_H || HAVE_LANGINFO_CODESET || _LIBC]: Include langinfo.h. * posix/regex_internal.h (struct re_backref_cache_entry): Add "more" field. * posix/regexec.c (check_dst_limits): Hoist computation of the source and destination bkref_idx out of the loop. Pass it to check_dst_limits_calc_pos. (check_dst_limits_calc_pos_1): New function, containing the recursive loop of check_dst_limits_calc_pos; uses the "more" field of struct re_backref_cache to control the loop. (check_dst_limits_calc_pos): Store into "boundaries" the position relative to lim's start and end positions. Do not accept eclosures, accept bkref_idx instead. Call check_dst_limits_calc_pos_1 to do the work. (sift_states_bkref): Use the "more" field of struct re_backref_cache to control the loop. A big "if" was turned into a continue and the function was reindented. (get_subexp): Use the "more" field of struct re_backref_cache to control the loop. (match_ctx_add_entry): Initialize the bkref_ents' "more" field. (search_cur_bkref_entry): Return -1 if out of bounds. * posix/regexec.c (empty_set): Remove. (sift_states_backward): Remove cur_src variable. Move inner loop to build_sifted_states. (build_sifted_states): Extract from sift_states_backward. Do not use empty_set. (update_cur_sifted_state): Do not use empty_set. Special case dest_nodes->nelem == 0.
git-mirror · Nov 8, 2004 · e40a38b · e40a38b
1 parent d2c38eb
commit e40a38b
Show file tree

Hide file tree

Showing 5 changed files with 362 additions and 220 deletions.
diff --git a/ChangeLog b/ChangeLog
@@ -1,5 +1,50 @@
+2004-11-08  Ulrich Drepper  <drepper@redhat.com>
+
+	* posix/regcomp.c (utf8_sb_map): Define.
+	(free_dfa_content): Don't free dfa->sb_char if it's a pointer to
+	utf8_sb_map.
+	(init_dfa): Use utf8_sb_map instead of initializing memory when the
+	encoding is UTF-8.
+
 2004-11-03  Paolo Bonzini  <bonzini@gnu.org>
 
+	* posix/regcomp.c (init_dfa): Get the codeset name outside glibc as
+	well.  Check if it is spelled UTF8 as well as UTF-8, and check
+	case-insensitively.  Set dfa->map_notascii manually when outside
+	glibc.
+	* posix/regex_internal.c (build_wcs_upper_buffer) [!_LIBC]: Enable
+	optimizations based on map_notascii.
+	* posix/regex_internal.h [HAVE_LANGINFO_H || HAVE_LANGINFO_CODESET
+	|| _LIBC]: Include langinfo.h.
+
+	* posix/regex_internal.h (struct re_backref_cache_entry): Add "more"
+	field.
+	* posix/regexec.c (check_dst_limits): Hoist computation of the source
+	and destination bkref_idx out of the loop.  Pass it to
+	check_dst_limits_calc_pos.
+	(check_dst_limits_calc_pos_1): New function, containing the recursive
+	loop of check_dst_limits_calc_pos; uses the "more" field of
+	struct re_backref_cache to control the loop.
+	(check_dst_limits_calc_pos): Store into "boundaries" the position
+	relative to lim's start and end positions.  Do not accept eclosures,
+	accept bkref_idx instead.  Call check_dst_limits_calc_pos_1 to do the
+	work.
+	(sift_states_bkref): Use the "more" field of struct re_backref_cache
+	to control the loop.  A big "if" was turned into a continue and the
+	function was reindented.
+	(get_subexp): Use the "more" field of struct re_backref_cache
+	to control the loop.
+	(match_ctx_add_entry): Initialize the bkref_ents' "more" field.
+	(search_cur_bkref_entry): Return -1 if out of bounds.
+
+	* posix/regexec.c (empty_set): Remove.
+	(sift_states_backward): Remove cur_src variable.  Move inner loop
+	to build_sifted_states.
+	(build_sifted_states): Extract from sift_states_backward.  Do not
+	use empty_set.
+	(update_cur_sifted_state): Do not use empty_set.  Special case
+	dest_nodes->nelem == 0.
+
 	* posix/regex_internal.h (struct re_backref_cache_entry): Remove flag
 	field.
 	(struct re_sift_context_t): Remove cur_bkref, cls_subexp_idx,

diff --git a/posix/regcomp.c b/posix/regcomp.c
@@ -566,6 +566,23 @@ weak_alias (__regerror, regerror)
 #endif
 
 
+#ifdef RE_ENABLE_I18N
+/* Shared, read-only map of single-byte characters for UTF-8 locales.
+   init_dfa points dfa->sb_char at this array instead of allocating and
+   filling an identical bitset for every DFA, and free_dfa_content must
+   therefore never pass it to re_free.  */
+static const bitset utf8_sb_map =
+{
+  /* Set the first 128 bits; any remaining elements are implicitly zero.  */
+# if UINT_MAX == 0xffffffff
+  0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
+# else
+#  error "Add case for new unsigned int size"
+# endif
+};
+#endif
+
+
 static void
 free_dfa_content (re_dfa_t *dfa)
 {
@@ -613,7 +630,8 @@ free_dfa_content (re_dfa_t *dfa)
       }
   re_free (dfa->state_table);
 #ifdef RE_ENABLE_I18N
-  re_free (dfa->sb_char);
+  if (dfa->sb_char != utf8_sb_map)
+    re_free (dfa->sb_char);
 #endif
 #ifdef DEBUG
   re_free (dfa->re_str);
@@ -824,6 +842,9 @@ init_dfa (dfa, pat_len)
      int pat_len;
 {
   int table_size;
+#ifndef _LIBC
+  char *codeset_name;
+#endif
 
   memset (dfa, '\0', sizeof (re_dfa_t));
 
@@ -853,22 +874,59 @@ init_dfa (dfa, pat_len)
     dfa->is_utf8 = 1;
   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 		       != 0);
+#else
+# ifdef HAVE_LANGINFO_CODESET
+  codeset_name = nl_langinfo (CODESET);
+# else
+  codeset_name = getenv ("LC_ALL");
+  if (codeset_name == NULL || codeset_name[0] == '\0')
+    codeset_name = getenv ("LC_CTYPE");
+  if (codeset_name == NULL || codeset_name[0] == '\0')
+    codeset_name = getenv ("LANG");
+  if (codeset_name == NULL)
+    codeset_name = "";
+  else if (strchr (codeset_name, '.') !=  NULL)
+    codeset_name = strchr (codeset_name, '.') + 1;
+# endif
+
+  if (strcasecmp (codeset_name, "UTF-8") == 0
+      || strcasecmp (codeset_name, "UTF8") == 0)
+    dfa->is_utf8 = 1;
+
+  /* We check exhaustively in the loop below if this charset is a
+     superset of ASCII.  */
+  dfa->map_notascii = 0;
 #endif
+
 #ifdef RE_ENABLE_I18N
   if (dfa->mb_cur_max > 1)
     {
-      int i, j, ch;
-
-      dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
-      if (BE (dfa->sb_char == NULL, 0))
-	return REG_ESPACE;
       if (dfa->is_utf8)
-	memset (dfa->sb_char, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
+	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
       else
-	for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
-	  for (j = 0; j < UINT_BITS; ++j, ++ch)
-	    if (__btowc (ch) != WEOF)
-	      dfa->sb_char[i] |= 1 << j;
+	{
+	  int i, j, ch;
+
+	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
+	  if (BE (dfa->sb_char == NULL, 0))
+	    return REG_ESPACE;
+
+	  /* Clear all bits, then set those corresponding to single
+	     byte chars.  */
+	  bitset_empty (dfa->sb_char);
+
+	  for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
+	    for (j = 0; j < UINT_BITS; ++j, ++ch)
+	      {
+		wchar_t wch = __btowc (ch);
+		if (wch != WEOF)
+		  dfa->sb_char[i] |= 1 << j;
+# ifndef _LIBC
+		if (isascii (ch) && wch != (wchar_t) ch)
+		  dfa->map_notascii = 1;
+# endif
+	      }
+	}
     }
 #endif
 

diff --git a/posix/regex_internal.c b/posix/regex_internal.c
@@ -293,9 +293,8 @@ build_wcs_upper_buffer (pstr)
   byte_idx = pstr->valid_len;
   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 
-#ifdef _LIBC
-  /* The following optimization assumes that the wchar_t encoding is
-     always ISO 10646.  */
+  /* The following optimization assumes that ASCII characters can be
+     mapped to wide characters with a simple cast.  */
   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
     {
       while (byte_idx < end_idx)
@@ -309,8 +308,7 @@ build_wcs_upper_buffer (pstr)
 	      pstr->mbs[byte_idx]
 		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 	      /* The next step uses the assumption that wchar_t is encoded
-		 with ISO 10646: all ASCII values can be converted like
-		 this.  */
+		 ASCII-safe: all ASCII values can be converted like this.  */
 	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 	      ++byte_idx;
 	      continue;
@@ -368,14 +366,11 @@ build_wcs_upper_buffer (pstr)
       return REG_NOERROR;
     }
   else
-#endif
     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
       {
 	wchar_t wc;
 	const char *p;
-#ifdef _LIBC
-offsets_needed:
-#endif
+      offsets_needed:
 	remain_len = end_idx - byte_idx;
 	prev_st = pstr->cur_state;
 	if (BE (pstr->trans != NULL, 0))
@@ -647,7 +642,6 @@ re_string_reconstruct (pstr, idx, eflags)
 	      int wcs_idx;
 	      wint_t wc = WEOF;
 
-#ifdef _LIBC
 	      if (pstr->is_utf8)
 		{
 		  const unsigned char *raw, *p, *q, *end;
@@ -687,7 +681,7 @@ re_string_reconstruct (pstr, idx, eflags)
 			break;
 		      }
 		}
-#endif
+
 	      if (wc == WEOF)
 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 	      if (BE (pstr->valid_len, 0))

diff --git a/posix/regex_internal.h b/posix/regex_internal.h
@@ -27,6 +27,9 @@
 #include <stdlib.h>
 #include <string.h>
 
+#if defined HAVE_LANGINFO_H || defined HAVE_LANGINFO_CODESET || defined _LIBC
+# include <langinfo.h>
+#endif
 #if defined HAVE_LOCALE_H || defined _LIBC
 # include <locale.h>
 #endif
@@ -545,6 +548,9 @@ struct re_backref_cache_entry
   int str_idx;
   int subexp_from;
   int subexp_to;
+  /* We need only one byte from the following field.  If other small
+     fields are added the type could be changed to 'char'.  */
+  int more;
 };
 
 typedef struct