Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Fix ranges with multibyte characters as endpoints.
This is another bug in computing the fastmap.  It was reported by a user
of sed because it usually does not happen with _LIBC.  However, it is
there in that case too.

The bug is that whenever we have a range at the beginning of the regex,
the regex must be tested on any possible multibyte character.  The reason
_LIBC masks it is that there is generally a collation symbol for each
possible multibyte-character lead byte, so all the lead bytes are usually
already part of the fastmap.

The tests use Cyrillic characters as an example.  With _LIBC, they pass
without the patch too, but you can make them fail by removing the
collation symbol handling.
  • Loading branch information
Paolo Bonzini authored and Ulrich Drepper committed Nov 18, 2009
1 parent 7443244 commit 815d814
Show file tree
Hide file tree
Showing 4 changed files with 114 additions and 2 deletions.
8 changes: 8 additions & 0 deletions ChangeLog
@@ -1,3 +1,11 @@
2009-11-17 Paolo Bonzini <bonzini@gnu.org>

* posix/bug-regex30.c: New file.
* posix/Makefile: Add rules to build and run bug-regex30.
* posix/regcomp.c (re_compile_fastmap_iter): Add all multibyte
character lead bytes when there is a range in a COMPLEX_BRACKET.
Reported by Oleg Bylatov.

2009-11-17 Ulrich Drepper <drepper@redhat.com>

[BZ #10969]
Expand Down
3 changes: 2 additions & 1 deletion posix/Makefile
Expand Up @@ -82,7 +82,7 @@ tests := tstgetopt testfnm runtests runptests \
bug-regex17 bug-regex18 bug-regex19 bug-regex20 \
bug-regex21 bug-regex22 bug-regex23 bug-regex24 \
bug-regex25 bug-regex26 bug-regex27 bug-regex28 \
bug-regex29 \
bug-regex29 bug-regex30 \
tst-nice tst-nanosleep tst-regex2 \
transbug tst-rxspencer tst-pcre tst-boost \
bug-ga1 tst-vfork1 tst-vfork2 tst-vfork3 tst-waitid \
Expand Down Expand Up @@ -195,6 +195,7 @@ bug-regex22-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex23-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex25-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex26-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex30-ENV = LOCPATH=$(common-objpfx)localedata
tst-rxspencer-ARGS = --utf8 rxspencer/tests
tst-rxspencer-ENV = LOCPATH=$(common-objpfx)localedata
tst-pcre-ARGS = PCRE.tests
Expand Down
103 changes: 103 additions & 0 deletions posix/bug-regex30.c
@@ -0,0 +1,103 @@
/* Russian regular expression tests.
Copyright (C) 2009 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Paolo Bonzini <pbonzini@redhat.com>, 2009.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */

#include <sys/types.h>
#include <mcheck.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <locale.h>

/* Table of cases in which the pattern is expected to match the string.
   For each entry: the pattern and subject string, the flags given to
   regcomp, the number of regmatch_t slots requested from regexec, and
   the expected offsets for those slots.  */
struct
{
  const char *pattern;
  const char *string;
  int flags;
  int nmatch;
  regmatch_t rm[5];
} tests[] = {
  /* UTF-8 byte sequences used in the entries below:
       U+0413  \xd0\x93  CYRILLIC CAPITAL LETTER GHE
       U+0420  \xd0\xa0  CYRILLIC CAPITAL LETTER ER
       U+0430  \xd0\xb0  CYRILLIC SMALL LETTER A
       U+0433  \xd0\xb3  CYRILLIC SMALL LETTER GHE
       U+0440  \xd1\x80  CYRILLIC SMALL LETTER ER
       U+044F  \xd1\x8f  CYRILLIC SMALL LETTER YA  */

  /* Range small A .. small YA against small GHE.  */
  {
    .pattern = "[\xd0\xb0-\xd1\x8f]",
    .string = "\xd0\xb3",
    .flags = 0,
    .nmatch = 1,
    .rm = { { .rm_so = 0, .rm_eo = 2 } }
  },
  /* Same range against capital GHE, ignoring case.  */
  {
    .pattern = "[\xd0\xb0-\xd1\x8f]",
    .string = "\xd0\x93",
    .flags = REG_ICASE,
    .nmatch = 1,
    .rm = { { .rm_so = 0, .rm_eo = 2 } }
  },
  /* Range small ER .. small YA against capital ER, ignoring case.  */
  {
    .pattern = "[\xd1\x80-\xd1\x8f]",
    .string = "\xd0\xa0",
    .flags = REG_ICASE,
    .nmatch = 1,
    .rm = { { .rm_so = 0, .rm_eo = 2 } }
  },
};


/* Compile every pattern in TESTS under the de_DE.UTF-8 locale, run it
   against the associated string and compare the reported match offsets
   with the expected ones.

   Returns 0 when every entry compiles, matches and yields the expected
   offsets; returns 1 if the locale cannot be selected or if any entry
   fails.  A failing entry is reported on stdout and the remaining
   entries are still run.  */
static int
do_test (void)
{
  if (setlocale (LC_ALL, "de_DE.UTF-8") == NULL)
    {
      puts ("setlocale failed");
      return 1;
    }

  int ret = 0;

  for (size_t i = 0; i < sizeof (tests) / sizeof (tests[0]); ++i)
    {
      regex_t re;
      regmatch_t rm[5];
      int n = regcomp (&re, tests[i].pattern, tests[i].flags);
      if (n != 0)
	{
	  char buf[500];
	  regerror (n, &re, buf, sizeof (buf));
	  /* I is a size_t, so the matching conversion is %zu (not %zd).  */
	  printf ("regcomp %zu failed: %s\n", i, buf);
	  ret = 1;
	  continue;
	}

      if (regexec (&re, tests[i].string, tests[i].nmatch, rm, 0))
	{
	  printf ("regexec %zu failed\n", i);
	  ret = 1;
	  regfree (&re);
	  continue;
	}

      for (n = 0; n < tests[i].nmatch; ++n)
	if (rm[n].rm_so != tests[i].rm[n].rm_so
	    || rm[n].rm_eo != tests[i].rm[n].rm_eo)
	  {
	    /* An expected pair of -1/-1 ends the comparison for this
	       entry without counting as a failure.  */
	    if (tests[i].rm[n].rm_so == -1 && tests[i].rm[n].rm_eo == -1)
	      break;
	    printf ("regexec match failure rm[%d] %d..%d\n",
		    n, rm[n].rm_so, rm[n].rm_eo);
	    ret = 1;
	    break;
	  }

      regfree (&re);
    }

  return ret;
}

#define TEST_FUNCTION do_test ()
#include "../test-skeleton.c"
2 changes: 1 addition & 1 deletion posix/regcomp.c
Expand Up @@ -377,7 +377,7 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
applies to multibyte character sets; for single byte character
sets, the SIMPLE_BRACKET again suffices. */
if (dfa->mb_cur_max > 1
&& (cset->nchar_classes || cset->non_match
&& (cset->nchar_classes || cset->non_match || cset->nranges
# ifdef _LIBC
|| cset->nequiv_classes
# endif /* _LIBC */
Expand Down

0 comments on commit 815d814

Please sign in to comment.