Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
Fix ranges with multibyte characters as endpoints.
This is another bug in computing the fastmap. It was reported by a user of sed because it usually does not happen with _LIBC. However, it is there in that case too. The bug is that whenever we have a range at the beginning of the regex, the regex must be tested on any possible multibyte character. The reason _LIBC masks it is that in general there is a collation symbol for each possible multibyte-character lead byte, so all the lead bytes are in general already part of the fastmap. The tests use Cyrillic characters as an example. With _LIBC, they pass without the patch too, but you can make them fail by removing the collation-symbol handling.
- Loading branch information
Paolo Bonzini
authored and
Ulrich Drepper
committed
Nov 18, 2009
1 parent
7443244
commit 815d814
Showing
4 changed files
with
114 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
/* Russian regular expression tests. | ||
Copyright (C) 2009 Free Software Foundation, Inc. | ||
This file is part of the GNU C Library. | ||
Contributed by Paolo Bonzini <pbonzini@redhat.com>, 2009. | ||
The GNU C Library is free software; you can redistribute it and/or | ||
modify it under the terms of the GNU Lesser General Public | ||
License as published by the Free Software Foundation; either | ||
version 2.1 of the License, or (at your option) any later version. | ||
The GNU C Library is distributed in the hope that it will be useful, | ||
but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
Lesser General Public License for more details. | ||
You should have received a copy of the GNU Lesser General Public | ||
License along with the GNU C Library; if not, write to the Free | ||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | ||
02111-1307 USA. */ | ||
|
||
#include <sys/types.h> | ||
#include <mcheck.h> | ||
#include <regex.h> | ||
#include <stdio.h> | ||
#include <stdlib.h> | ||
#include <locale.h> | ||
|
||
/* Table of cases that are all expected to match.  Each entry holds the
   pattern to compile, the subject string, the regcomp flags, the number
   of match slots to verify, and the expected offsets for those slots.  */
struct
{
  const char *pattern;
  const char *string;
  int flags, nmatch;
  regmatch_t rm[5];
} tests[] = {
  /* Characters used below, with the byte sequences that encode them:
       U+0413 \xd0\x93 CYRILLIC CAPITAL LETTER GHE
       U+0420 \xd0\xa0 CYRILLIC CAPITAL LETTER ER
       U+0430 \xd0\xb0 CYRILLIC SMALL LETTER A
       U+0433 \xd0\xb3 CYRILLIC SMALL LETTER GHE
       U+0440 \xd1\x80 CYRILLIC SMALL LETTER ER
       U+044F \xd1\x8f CYRILLIC SMALL LETTER YA
     Every subject is a single two-byte character, hence the expected
     match span of 0..2.  */

  /* Small GHE against the range small A .. small YA.  */
  {
    .pattern = "[\xd0\xb0-\xd1\x8f]",
    .string = "\xd0\xb3",
    .flags = 0,
    .nmatch = 1,
    .rm = { { 0, 2 } }
  },

  /* Capital GHE against the range small A .. small YA, ignoring case.  */
  {
    .pattern = "[\xd0\xb0-\xd1\x8f]",
    .string = "\xd0\x93",
    .flags = REG_ICASE,
    .nmatch = 1,
    .rm = { { 0, 2 } }
  },

  /* Capital ER against the range small ER .. small YA, ignoring case.  */
  {
    .pattern = "[\xd1\x80-\xd1\x8f]",
    .string = "\xd0\xa0",
    .flags = REG_ICASE,
    .nmatch = 1,
    .rm = { { 0, 2 } }
  },
};
|
||
|
||
static int | ||
do_test (void) | ||
{ | ||
if (setlocale (LC_ALL, "de_DE.UTF-8") == NULL) | ||
{ | ||
puts ("setlocale failed"); | ||
return 1; | ||
} | ||
|
||
int ret = 0; | ||
|
||
for (size_t i = 0; i < sizeof (tests) / sizeof (tests[0]); ++i) | ||
{ | ||
regex_t re; | ||
regmatch_t rm[5]; | ||
int n = regcomp (&re, tests[i].pattern, tests[i].flags); | ||
if (n != 0) | ||
{ | ||
char buf[500]; | ||
regerror (n, &re, buf, sizeof (buf)); | ||
printf ("regcomp %zd failed: %s\n", i, buf); | ||
ret = 1; | ||
continue; | ||
} | ||
|
||
if (regexec (&re, tests[i].string, tests[i].nmatch, rm, 0)) | ||
{ | ||
printf ("regexec %zd failed\n", i); | ||
ret = 1; | ||
regfree (&re); | ||
continue; | ||
} | ||
|
||
for (n = 0; n < tests[i].nmatch; ++n) | ||
if (rm[n].rm_so != tests[i].rm[n].rm_so | ||
|| rm[n].rm_eo != tests[i].rm[n].rm_eo) | ||
{ | ||
if (tests[i].rm[n].rm_so == -1 && tests[i].rm[n].rm_eo == -1) | ||
break; | ||
printf ("regexec match failure rm[%d] %d..%d\n", | ||
n, rm[n].rm_so, rm[n].rm_eo); | ||
ret = 1; | ||
break; | ||
} | ||
|
||
regfree (&re); | ||
} | ||
|
||
return ret; | ||
} | ||
|
||
#define TEST_FUNCTION do_test () | ||
#include "../test-skeleton.c" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters