Skip to content

Commit

Permalink
grep: optimize built-in grep by skipping lines that do not hit
Browse files Browse the repository at this point in the history
The internal "grep" engine we use checks for hits line-by-line, instead of
letting the underlying regexec()/fixmatch() routines scan for the first
match from the rest of the buffer.  This was a major source of overhead
compared to the external grep.

Introduce a "look-ahead" mechanism to find the next line that would
potentially match by using regexec()/fixmatch() in the remainder of the
text to skip non-matching lines, and use it when the query criteria are
simple enough (i.e. punt for an advanced grep boolean expression like
"lines that have both X and Y but not Z" for now) and we are not running
under the "-v" (aka "--invert-match") option.

Note that "-L" (aka "--files-without-match") is not a reason to disable
this optimization.  Under the option, we are interested if the file has
any hit at all, and that is what we determine reliably with or without the
optimization.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
  • Loading branch information
Junio C Hamano committed Jan 12, 2010
1 parent cb57220 commit a26345b
Showing 1 changed file with 75 additions and 0 deletions.
75 changes: 75 additions & 0 deletions grep.c
Original file line number Diff line number Diff line change
Expand Up @@ -608,6 +608,65 @@ static void show_pre_context(struct grep_opt *opt, const char *name, char *buf,
}
}

/*
 * Decide whether the look-ahead shortcut may be used for this query.
 *
 * Returns 1 only when the query is not an extended (boolean) expression,
 * is not inverted ("-v"), and every entry in the pattern list is a plain
 * GREP_PATTERN token; returns 0 otherwise.
 */
static int should_lookahead(struct grep_opt *opt)
{
	struct grep_pat *pat;

	/* Extended expressions and inverted matches are not handled here. */
	if (opt->extended || opt->invert)
		return 0;

	/* Walk past plain patterns; stop at the first entry of another kind. */
	pat = opt->pattern_list;
	while (pat && pat->token == GREP_PATTERN)
		pat = pat->next;

	/* Reaching the end of the list means nothing disqualified us. */
	return !pat;
}

/*
 * Jump ahead to the line that holds the earliest match of any pattern.
 *
 * Every pattern in opt->pattern_list is run once against the text that
 * starts at *bol_p, and the smallest match start offset is remembered.
 *
 * Returns 1 when no pattern matches: *bol_p is advanced by *left_p and
 * *left_p is set to 0 (*lno_p is left untouched in this case).
 *
 * Returns 0 when some pattern matches: *bol_p is moved to the beginning
 * of the line containing the earliest match, *lno_p is bumped by the
 * number of newlines skipped, and *left_p is reduced by the number of
 * bytes skipped.
 *
 * NOTE(review): the matchers are not given a length, so this presumably
 * relies on the text at *bol_p being NUL-terminated -- confirm with the
 * caller.
 */
static int look_ahead(struct grep_opt *opt,
		      unsigned long *left_p,
		      unsigned *lno_p,
		      char **bol_p)
{
	char *start = *bol_p;
	char *line_start, *cursor;
	unsigned lineno = *lno_p;
	regoff_t first_hit = -1;
	struct grep_pat *pat = opt->pattern_list;

	while (pat) {
		regmatch_t match;
		int failed;

		if (pat->fixed)
			failed = fixmatch(pat->pattern, start, &match);
		else
			failed = regexec(&pat->regexp, start, 1, &match, 0);

		/* Keep the smallest valid start offset seen so far. */
		if (!failed && match.rm_so >= 0 && match.rm_eo >= 0 &&
		    (first_hit < 0 || match.rm_so < first_hit))
			first_hit = match.rm_so;

		pat = pat->next;
	}

	if (first_hit < 0) {
		/* Nothing matches anywhere in the rest: consume it all. */
		*bol_p = start + *left_p;
		*left_p = 0;
		return 1;
	}

	/* Back up from the match to the start of the line it sits on. */
	line_start = start + first_hit;
	while (start < line_start && line_start[-1] != '\n')
		line_start--;

	/* Account for every line we are about to skip over. */
	for (cursor = start; cursor != line_start; cursor++)
		lineno += (*cursor == '\n');

	*lno_p = lineno;
	*bol_p = line_start;
	*left_p -= line_start - start;
	return 0;
}

static int grep_buffer_1(struct grep_opt *opt, const char *name,
char *buf, unsigned long size, int collect_hits)
{
Expand All @@ -617,6 +676,7 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name,
unsigned last_hit = 0;
int binary_match_only = 0;
unsigned count = 0;
int try_lookahead = 0;
enum grep_context ctx = GREP_CONTEXT_HEAD;
xdemitconf_t xecfg;

Expand Down Expand Up @@ -645,11 +705,26 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name,
opt->priv = &xecfg;
}
}
try_lookahead = should_lookahead(opt);

while (left) {
char *eol, ch;
int hit;

/*
* look_ahead() skips quickly to the line that possibly
* has the next hit; don't call it if we need to do
* something more than just skipping the current line
* in response to an unmatch for the current line. E.g.
* inside a post-context window, we will show the current
* line as a context around the previous hit when it
* doesn't hit.
*/
if (try_lookahead
&& !(last_hit
&& lno <= last_hit + opt->post_context)
&& look_ahead(opt, &left, &lno, &bol))
break;
eol = end_of_line(bol, &left);
ch = *eol;
*eol = 0;
Expand Down

0 comments on commit a26345b

Please sign in to comment.