From fb62eb7fab97cea880ea7fe4f341a4dfad14ab48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scharfe?= Date: Sat, 10 Jan 2009 00:08:40 +0100 Subject: [PATCH 1/2] grep -w: forward to next possible position after rejected match grep -w accepts matches between non-word characters, only. If a match from regexec() doesn't meet these criteria, grep continues its search after the first character of that match. We can be a bit smarter here and skip all positions that follow a word character first, as they can't match our criteria. This way we can consume characters quite cheaply and don't need to special-case the handling of the beginning of a line. Here's a contrived example command on msysgit (best of five runs): $ time git grep -w ...... v1.6.1 >/dev/null real 0m1.611s user 0m0.000s sys 0m0.015s With the patch it's quite a bit faster: $ time git grep -w ...... v1.6.1 >/dev/null real 0m1.179s user 0m0.000s sys 0m0.015s More common search patterns will gain a lot less, but it's a nice cleanup anyway. Signed-off-by: Rene Scharfe Signed-off-by: Junio C Hamano --- grep.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/grep.c b/grep.c index 49e931996..22a56b5d5 100644 --- a/grep.c +++ b/grep.c @@ -294,7 +294,6 @@ static struct { static int match_one_pattern(struct grep_opt *opt, struct grep_pat *p, char *bol, char *eol, enum grep_context ctx) { int hit = 0; - int at_true_bol = 1; int saved_ch = 0; regmatch_t pmatch[10]; @@ -337,7 +336,7 @@ static int match_one_pattern(struct grep_opt *opt, struct grep_pat *p, char *bol * either end of the line, or at word boundary * (i.e. the next char must not be a word char). 
*/ - if ( ((pmatch[0].rm_so == 0 && at_true_bol) || + if ( ((pmatch[0].rm_so == 0) || !word_char(bol[pmatch[0].rm_so-1])) && ((pmatch[0].rm_eo == (eol-bol)) || !word_char(bol[pmatch[0].rm_eo])) ) @@ -349,10 +348,14 @@ static int match_one_pattern(struct grep_opt *opt, struct grep_pat *p, char *bol /* There could be more than one match on the * line, and the first match might not be * strict word match. But later ones could be! + * Forward to the next possible start, i.e. the + * next position following a non-word char. */ bol = pmatch[0].rm_so + bol + 1; - at_true_bol = 0; - goto again; + while (word_char(bol[-1]) && bol < eol) + bol++; + if (bol < eol) + goto again; } } if (p->token == GREP_PATTERN_HEAD && saved_ch) From c822255cfc1ac83daeeeee1647e3c775450c830c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scharfe?= Date: Sat, 10 Jan 2009 00:18:34 +0100 Subject: [PATCH 2/2] grep: don't call regexec() for fixed strings Add the new flag "fixed" to struct grep_pat and set it if the pattern doesn't contain any regex control characters, in addition to when the flag -F/--fixed-strings was specified. This gives a nice speed up on msysgit, where regexec() seems to be extra slow. Before (best of five runs): $ time git grep grep v1.6.1 >/dev/null real 0m0.552s user 0m0.000s sys 0m0.000s $ time git grep -F grep v1.6.1 >/dev/null real 0m0.170s user 0m0.000s sys 0m0.015s With the patch: $ time git grep grep v1.6.1 >/dev/null real 0m0.173s user 0m0.000s sys 0m0.000s The difference is much smaller on Linux, but still measurable. 
Signed-off-by: Rene Scharfe Signed-off-by: Junio C Hamano --- grep.c | 29 +++++++++++++++++++++++++---- grep.h | 1 + 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/grep.c b/grep.c index 22a56b5d5..6485760ff 100644 --- a/grep.c +++ b/grep.c @@ -28,9 +28,31 @@ void append_grep_pattern(struct grep_opt *opt, const char *pat, p->next = NULL; } +static int isregexspecial(int c) +{ + return isspecial(c) || c == '$' || c == '(' || c == ')' || c == '+' || + c == '.' || c == '^' || c == '{' || c == '|'; +} + +static int is_fixed(const char *s) +{ + while (!isregexspecial(*s)) + s++; + return !*s; +} + static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) { - int err = regcomp(&p->regexp, p->pattern, opt->regflags); + int err; + + if (opt->fixed || is_fixed(p->pattern)) + p->fixed = 1; + if (opt->regflags & REG_ICASE) + p->fixed = 0; + if (p->fixed) + return; + + err = regcomp(&p->regexp, p->pattern, opt->regflags); if (err) { char errbuf[1024]; char where[1024]; @@ -159,8 +181,7 @@ void compile_grep_patterns(struct grep_opt *opt) case GREP_PATTERN: /* atom */ case GREP_PATTERN_HEAD: case GREP_PATTERN_BODY: - if (!opt->fixed) - compile_regexp(p, opt); + compile_regexp(p, opt); break; default: opt->extended = 1; @@ -314,7 +335,7 @@ static int match_one_pattern(struct grep_opt *opt, struct grep_pat *p, char *bol } again: - if (!opt->fixed) { + if (!p->fixed) { regex_t *exp = &p->regexp; hit = !regexec(exp, bol, ARRAY_SIZE(pmatch), pmatch, 0); diff --git a/grep.h b/grep.h index 45a222d90..5102ce335 100644 --- a/grep.h +++ b/grep.h @@ -30,6 +30,7 @@ struct grep_pat { const char *pattern; enum grep_header_field field; regex_t regexp; + unsigned fixed:1; }; enum grep_expr_node {