Skip to content

Commit

Permalink
[BZ #544]
Browse files Browse the repository at this point in the history
Update.
2004-11-18  Jakub Jelinek  <jakub@redhat.com>

	[BZ #544]
	* posix/regex.h (RE_NO_SUB): New define.
	* posix/regex_internal.h (OP_DELETED_SUBEXP): New.
	(re_dfa_t): Add subexp_map.
	* posix/regcomp.c (struct subexp_optimize): New type.
	(optimize_subexps): New routine.
	(re_compile_internal): Call it.
	(re_compile_pattern): Set preg->no_sub to 1 if RE_NO_SUB.
	(free_dfa_content): Free subexp_map.
	(calc_inveclosure, calc_eclosure): Skip OP_DELETED_SUBEXP
	nodes.
	* posix/regexec.c (re_search_internal): If subexp_map
	is not NULL, duplicate registers as needed.
	* posix/Makefile: Add rules to build and run tst-regex2.
	* posix/tst-regex2.c: New test.
	* posix/rxspencer/tests: Fix last two tests (\0 -> \1).
	Add some new tests for nested subexpressions.
  • Loading branch information
Ulrich Drepper committed Nov 18, 2004
1 parent 1b1d367 commit c06a695
Show file tree
Hide file tree
Showing 9 changed files with 408 additions and 6 deletions.
20 changes: 20 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,23 @@
2004-11-18 Jakub Jelinek <jakub@redhat.com>

[BZ #544]
* posix/regex.h (RE_NO_SUB): New define.
* posix/regex_internal.h (OP_DELETED_SUBEXP): New.
(re_dfa_t): Add subexp_map.
* posix/regcomp.c (struct subexp_optimize): New type.
(optimize_subexps): New routine.
(re_compile_internal): Call it.
(re_compile_pattern): Set preg->no_sub to 1 if RE_NO_SUB.
(free_dfa_content): Free subexp_map.
(calc_inveclosure, calc_eclosure): Skip OP_DELETED_SUBEXP
nodes.
* posix/regexec.c (re_search_internal): If subexp_map
is not NULL, duplicate registers as needed.
* posix/Makefile: Add rules to build and run tst-regex2.
* posix/tst-regex2.c: New test.
* posix/rxspencer/tests: Fix last two tests (\0 -> \1).
Add some new tests for nested subexpressions.

2004-11-18 Ulrich Drepper <drepper@redhat.com>

* libio/libio.h (_IO_FLAGS2_FORTIFY): Renamed from
Expand Down
5 changes: 4 additions & 1 deletion posix/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ tests := tstgetopt testfnm runtests runptests \
bug-regex13 bug-regex14 bug-regex15 bug-regex16 \
bug-regex17 bug-regex18 bug-regex19 bug-regex20 \
bug-regex21 bug-regex22 bug-regex23 bug-regex24 \
tst-nice tst-nanosleep \
tst-nice tst-nanosleep tst-regex2 \
transbug tst-rxspencer tst-pcre tst-boost \
bug-ga1 tst-vfork1 tst-vfork2 tst-waitid \
tst-getaddrinfo2 bug-glob1 bug-glob2
Expand Down Expand Up @@ -160,6 +160,7 @@ tst-fnmatch-ENV = LOCPATH=$(common-objpfx)localedata
tst-regexloc-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex1-ENV = LOCPATH=$(common-objpfx)localedata
tst-regex-ENV = LOCPATH=$(common-objpfx)localedata
tst-regex2-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex5-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex6-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex17-ENV = LOCPATH=$(common-objpfx)localedata
Expand Down Expand Up @@ -244,8 +245,10 @@ $(objpfx)tst-getconf.out: tst-getconf.sh $(objpfx)getconf

ifeq (yes,$(build-shared))
$(objpfx)tst-regex: $(common-objpfx)rt/librt.so
$(objpfx)tst-regex2: $(common-objpfx)rt/librt.so
else
$(objpfx)tst-regex: $(common-objpfx)rt/librt.a
$(objpfx)tst-regex2: $(common-objpfx)rt/librt.a
endif

$(objpfx)bug-ga2-mem: $(objpfx)bug-ga2.out
Expand Down
105 changes: 103 additions & 2 deletions posix/regcomp.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,14 @@ static reg_errcode_t create_initial_state (re_dfa_t *dfa);
#ifdef RE_ENABLE_I18N
static void optimize_utf8 (re_dfa_t *dfa);
#endif
/* State shared by every recursive call of optimize_subexps.  */
struct subexp_optimize
{
  /* DFA being compiled: its used_bkref_map is consulted and its
     subexp_map is allocated and filled in on demand.  */
  re_dfa_t *dfa;
  /* Token array of the DFA, indexed by a tree node's node_idx.  */
  re_token_t *nodes;
  /* Copy of the pattern buffer's no_sub flag.  */
  int no_sub;
  /* Copy of the pattern buffer's re_nsub; also the number of entries
     allocated for subexp_map.  */
  int re_nsub;
};
static bin_tree_t *optimize_subexps (struct subexp_optimize *so,
bin_tree_t *node, int sidx, int depth);
static reg_errcode_t analyze (re_dfa_t *dfa);
static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
Expand Down Expand Up @@ -238,8 +246,8 @@ re_compile_pattern (pattern, length, bufp)

/* And GNU code determines whether or not to get register information
by passing null for the REGS argument to re_match, etc., not by
setting no_sub. */
bufp->no_sub = 0;
setting no_sub, unless RE_NO_SUB is set. */
bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);

/* Match anchors at newline. */
bufp->newline_anchor = 1;
Expand Down Expand Up @@ -633,6 +641,7 @@ free_dfa_content (re_dfa_t *dfa)
if (dfa->sb_char != utf8_sb_map)
re_free (dfa->sb_char);
#endif
re_free (dfa->subexp_map);
#ifdef DEBUG
re_free (dfa->re_str);
#endif
Expand Down Expand Up @@ -810,6 +819,17 @@ re_compile_internal (preg, pattern, length, syntax)
optimize_utf8 (dfa);
#endif

/* The pattern contains at least one subexpression: let optimize_subexps
   walk the parse tree and drop the OPEN/CLOSE subexpression markers it
   considers redundant.  The (possibly different) root it returns
   replaces dfa->str_tree before the tree is analyzed.  */
if (preg->re_nsub > 0)
{
struct subexp_optimize so;

so.dfa = dfa;
so.nodes = dfa->nodes;
so.no_sub = preg->no_sub;
so.re_nsub = preg->re_nsub;
/* Start with no enclosing subexpression (-1) at nesting depth 0.  */
dfa->str_tree = optimize_subexps (&so, dfa->str_tree, -1, 0);
}

/* Analyze the tree and collect information which is necessary to
create the dfa. */
err = analyze (dfa);
Expand Down Expand Up @@ -1121,6 +1141,82 @@ optimize_utf8 (dfa)
}
#endif

static bin_tree_t *
optimize_subexps (so, node, sidx, depth)
struct subexp_optimize *so;
bin_tree_t *node;
int sidx, depth;
{
int idx, new_depth, new_sidx;
bin_tree_t *ret;
if (node == NULL)
return NULL;

new_depth = 0;
new_sidx = sidx;
if ((depth & 1) && node->type == CONCAT
&& node->right && node->right->type == 0
&& so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP)
{
new_depth = depth + 1;
if (new_depth == 2
|| (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map)
&& so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx)))
new_sidx = so->nodes[idx].opr.idx;
}
node->left = optimize_subexps (so, node->left, new_sidx, new_depth);
new_depth = (depth & 1) == 0 && node->type == CONCAT
&& node->left && node->left->type == 0
&& so->nodes[node->left->node_idx].type == OP_OPEN_SUBEXP
? depth + 1 : 0;
node->right = optimize_subexps (so, node->right, sidx, new_depth);

if (node->type != CONCAT)
return node;
if ((depth & 1) == 0
&& node->left
&& node->left->type == 0
&& so->nodes[idx = node->left->node_idx].type == OP_OPEN_SUBEXP)
ret = node->right;
else if ((depth & 1)
&& node->right
&& node->right->type == 0
&& so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP)
ret = node->left;
else
return node;

if (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map)
&& so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx))
return node;

if (!so->no_sub)
{
int i;

if (depth < 2)
return node;

if (so->dfa->subexp_map == NULL)
{
so->dfa->subexp_map = re_malloc (int, so->re_nsub);
if (so->dfa->subexp_map == NULL)
return node;

for (i = 0; i < so->re_nsub; i++)
so->dfa->subexp_map[i] = i;
}

i = so->nodes[idx].opr.idx;
assert (sidx < i);
so->dfa->subexp_map[i] = sidx;
}

so->nodes[idx].type = OP_DELETED_SUBEXP;
ret->parent = node->parent;
return ret;
}

/* Analyze the structure tree, and calculate "first", "next", "edest",
"eclosure", and "inveclosure". */

Expand Down Expand Up @@ -1525,6 +1621,8 @@ calc_inveclosure (dfa)
int src, idx, dest;
for (src = 0; src < dfa->nodes_len; ++src)
{
if (dfa->nodes[src].type == OP_DELETED_SUBEXP)
continue;
for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
{
dest = dfa->eclosures[src].elems[idx];
Expand Down Expand Up @@ -1560,6 +1658,9 @@ calc_eclosure (dfa)
#ifdef DEBUG
assert (dfa->eclosures[node_idx].nelem != -1);
#endif
if (dfa->nodes[node_idx].type == OP_DELETED_SUBEXP)
continue;

/* If we have already calculated, skip it. */
if (dfa->eclosures[node_idx].nelem != 0)
continue;
Expand Down
4 changes: 4 additions & 0 deletions posix/regex.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,10 @@ typedef unsigned long int reg_syntax_t;
immediately after an alternation or begin-group operator. */
#define RE_CONTEXT_INVALID_DUP (RE_CARET_ANCHORS_HERE << 1)

/* If this bit is set, then no_sub will be set to 1 during
re_compile_pattern. */
#define RE_NO_SUB (RE_CONTEXT_INVALID_DUP << 1)

/* This global variable defines the particular regexp syntax to use (for
some interfaces). When a regexp is compiled, the syntax used is
stored in the pattern buffer, so changing this does not affect
Expand Down
2 changes: 2 additions & 0 deletions posix/regex_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ typedef enum
OP_DUP_PLUS = EPSILON_BIT | 4,
OP_DUP_QUESTION = EPSILON_BIT | 5,
ANCHOR = EPSILON_BIT | 6,
OP_DELETED_SUBEXP = EPSILON_BIT | 7,

/* Tree type, these are used only by tree. */
CONCAT = 16,
Expand Down Expand Up @@ -644,6 +645,7 @@ struct re_dfa_t
int mb_cur_max;
bitset word_char;
reg_syntax_t syntax;
int *subexp_map;
#ifdef DEBUG
char* re_str;
#endif
Expand Down
12 changes: 12 additions & 0 deletions posix/regexec.c
Original file line number Diff line number Diff line change
Expand Up @@ -882,6 +882,18 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
pmatch[reg_idx].rm_so += match_first;
pmatch[reg_idx].rm_eo += match_first;
}

/* A non-NULL subexp_map means optimize_subexps removed some
   subexpressions; those have no registers of their own.  Fill each such
   register from the subexpression it was mapped to.  Subexpression
   REG_IDX is stored in pmatch[REG_IDX + 1], and only registers below
   nmatch are touched.  */
if (dfa->subexp_map)
for (reg_idx = 0;
reg_idx + 1 < nmatch && reg_idx < preg->re_nsub;
reg_idx++)
/* Entries still holding the identity mapping were not removed.  */
if (dfa->subexp_map[reg_idx] != reg_idx)
{
pmatch[reg_idx + 1].rm_so
= pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
pmatch[reg_idx + 1].rm_eo
= pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
}
}

free_return:
Expand Down
20 changes: 18 additions & 2 deletions posix/rxspencer/tests
Original file line number Diff line number Diff line change
Expand Up @@ -508,5 +508,21 @@ a*a*a*a*a*a*a* & aaaaaa aaaaaa
(\b){0} - x @x -
\(\b\)\{0,0\} b abc @abc -
a(\b){0}c - ac ac -
a(.*)b(\0){0}c - abc abc @bc,-
a(.*)b(\0){0}c - axbc axbc x,-
a(.*)b(\1){0}c - abc abc @bc,-
a(.*)b(\1){0}c - axbc axbc x,-

a\(\(b*\)\)c\1d b abbcbbd abbcbbd bb,bb
a\(\([bc]\)\)\2d b abcdabbd abbd b,b
a\(\(\(\([bc]\)\)\3\)\)*d b abbccd abbccd cc,cc,c,c
a(b)(c)d - abcd abcd b,c
a(((b)))c - abc abc b,b,b
a(((b|(((c))))))d - abd abd b,b,b,-,-,-
a(((b*|c|e)))d - abbd abbd bb,bb,bb
a((b|c)){0,0}d - ad ad -,-
a((b|c)){0,1}d - abd abd b,b
a((b|c)){0,2}d - abcd abcd c,c
a((b+|((c)*)))+d - abd abd b,b,-,-
a((b+|((c)*)))+d - abcd abcd c,c,c,c
(((\b))){0} - x @x -,-,-
a(((.*)))b((\2)){0}c - abc abc @bc,@bc,@bc,-,-
a(((.*)))b((\1)){0}c - axbc axbc x,x,x,-,-
Loading

0 comments on commit c06a695

Please sign in to comment.