From 07b57e90f7c852c4fe212ab1d91058f27469a74b Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sat, 17 Jan 2009 17:29:42 +0100 Subject: [PATCH 1/9] Add color_fwrite_lines(), a function coloring each line individually We have to set the color before every line and reset it before every newline. Add a function color_fwrite_lines() which does that for us. Signed-off-by: Johannes Schindelin Signed-off-by: Junio C Hamano --- color.c | 28 ++++++++++++++++++++++++++++ color.h | 1 + 2 files changed, 29 insertions(+) diff --git a/color.c b/color.c index fc0b72ad5..d4ae83f9b 100644 --- a/color.c +++ b/color.c @@ -191,3 +191,31 @@ int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...) va_end(args); return r; } + +/* + * This function splits the buffer by newlines and colors the lines individually. + * + * Returns 0 on success. + */ +int color_fwrite_lines(FILE *fp, const char *color, + size_t count, const char *buf) +{ + if (!*color) + return fwrite(buf, count, 1, fp) != 1; + while (count) { + char *p = memchr(buf, '\n', count); + if (p != buf && (fputs(color, fp) < 0 || + fwrite(buf, p ? p - buf : count, 1, fp) != 1 || + fputs(COLOR_RESET, fp) < 0)) + return -1; + if (!p) + return 0; + if (fputc('\n', fp) < 0) + return -1; + count -= p + 1 - buf; + buf = p + 1; + } + return 0; +} + + diff --git a/color.h b/color.h index 6cf5c88aa..cd5c985eb 100644 --- a/color.h +++ b/color.h @@ -19,5 +19,6 @@ int git_config_colorbool(const char *var, const char *value, int stdout_is_tty); void color_parse(const char *var, const char *value, char *dst); int color_fprintf(FILE *fp, const char *color, const char *fmt, ...); int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...); +int color_fwrite_lines(FILE *fp, const char *color, size_t count, const char *buf); #endif /* COLOR_H */ From 23c1575f747393f9847874fd1ed72a44557459d1 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sat, 17 Jan 2009 17:29:43 +0100 Subject: [PATCH 2/9] color-words: refactor word splitting and use ALLOC_GROW() Word splitting is now performed by the function diff_words_fill(), avoiding having the same code twice. In the same spirit, avoid duplicating the code of ALLOC_GROW(). Signed-off-by: Johannes Schindelin Signed-off-by: Junio C Hamano --- diff.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/diff.c b/diff.c index d23548292..c111eef13 100644 --- a/diff.c +++ b/diff.c @@ -326,10 +326,7 @@ struct diff_words_buffer { static void diff_words_append(char *line, unsigned long len, struct diff_words_buffer *buffer) { - if (buffer->text.size + len > buffer->alloc) { - buffer->alloc = (buffer->text.size + len) * 3 / 2; - buffer->text.ptr = xrealloc(buffer->text.ptr, buffer->alloc); - } + ALLOC_GROW(buffer->text.ptr, buffer->text.size + len, buffer->alloc); line++; len--; memcpy(buffer->text.ptr + buffer->text.size, line, len); @@ -398,6 +395,22 @@ static void fn_out_diff_words_aux(void *priv, char *line, unsigned long len) } } +/* + * This function splits the words in buffer->text, and stores the list with + * newline separator into out. + */ +static void diff_words_fill(struct diff_words_buffer *buffer, mmfile_t *out) +{ + int i; + out->size = buffer->text.size; + out->ptr = xmalloc(out->size); + memcpy(out->ptr, buffer->text.ptr, out->size); + for (i = 0; i < out->size; i++) + if (isspace(out->ptr[i])) + out->ptr[i] = '\n'; + buffer->current = 0; +} + /* this executes the word diff on the accumulated buffers */ static void diff_words_show(struct diff_words_data *diff_words) { @@ -405,26 +418,11 @@ static void diff_words_show(struct diff_words_data *diff_words) xdemitconf_t xecfg; xdemitcb_t ecb; mmfile_t minus, plus; - int i; memset(&xpp, 0, sizeof(xpp)); memset(&xecfg, 0, sizeof(xecfg)); - minus.size = diff_words->minus.text.size; - minus.ptr = xmalloc(minus.size); - memcpy(minus.ptr, diff_words->minus.text.ptr, minus.size); - for (i = 0; i < minus.size; i++) - if (isspace(minus.ptr[i])) - minus.ptr[i] = '\n'; - diff_words->minus.current = 0; - - plus.size = diff_words->plus.text.size; - plus.ptr = xmalloc(plus.size); - memcpy(plus.ptr, diff_words->plus.text.ptr, plus.size); - for (i = 0; i < plus.size; i++) - if (isspace(plus.ptr[i])) - plus.ptr[i] = '\n'; - diff_words->plus.current = 0; - + diff_words_fill(&diff_words->minus, &minus); + diff_words_fill(&diff_words->plus, &plus); xpp.flags = XDF_NEED_MINIMAL; xecfg.ctxlen = diff_words->minus.alloc + diff_words->plus.alloc; xdi_diff_outf(&minus, &plus, fn_out_diff_words_aux, diff_words, From 2e5d2003b28820f88296e47a79eb440ca0295000 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sat, 17 Jan 2009 17:29:44 +0100 Subject: [PATCH 3/9] color-words: change algorithm to allow for 0-character word boundaries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Up until now, the color-words code assumed that word boundaries are identical to white space characters. Therefore, it could get away with a very simple scheme: it copied the hunks, substituted newlines for each white space character, called libxdiff with the processed text, and then identified the text to output by the offsets (which agreed since the original text had the same length). This code was ugly, for a number of reasons: - it was impossible to introduce 0-character word boundaries, - we had to print everything word by word, and - the code needed extra special handling of newlines in the removed part. Fix all of these issues by processing the text such that - we build word lists, separated by newlines, - we remember the original offsets for every word, and - after calling libxdiff on the wordlists, we parse the hunk headers, and find the corresponding offsets, and then - we print the removed/added parts in one go. The pre and post samples in the test were provided by Santi Béjar. Note that there is some strange special handling of hunk headers where one line range is 0 due to POSIX: in this case, the start is one too low. In other words a hunk header '@@ -1,0 +2 @@' actually means that the line must be added after the _second_ line of the pre text, _not_ the first. Signed-off-by: Johannes Schindelin Signed-off-by: Junio C Hamano --- diff.c | 157 ++++++++++++++++++++++++------------------ t/t4034-diff-words.sh | 66 ++++++++++++++++++ 2 files changed, 157 insertions(+), 66 deletions(-) create mode 100755 t/t4034-diff-words.sh diff --git a/diff.c b/diff.c index c111eef13..37c886a81 100644 --- a/diff.c +++ b/diff.c @@ -319,8 +319,10 @@ static int fill_mmfile(mmfile_t *mf, struct diff_filespec *one) struct diff_words_buffer { mmfile_t text; long alloc; - long current; /* output pointer */ - int suppressed_newline; + struct diff_words_orig { + const char *begin, *end; + } *orig; + int orig_nr, orig_alloc; }; static void diff_words_append(char *line, unsigned long len, @@ -335,80 +337,89 @@ static void diff_words_append(char *line, unsigned long len, struct diff_words_data { struct diff_words_buffer minus, plus; + const char *current_plus; FILE *file; }; -static void print_word(FILE *file, struct diff_words_buffer *buffer, int len, int color, - int suppress_newline) +static void fn_out_diff_words_aux(void *priv, char *line, unsigned long len) { - const char *ptr; - int eol = 0; + struct diff_words_data *diff_words = priv; + int minus_first, minus_len, plus_first, plus_len; + const char *minus_begin, *minus_end, *plus_begin, *plus_end; - if (len == 0) + if (line[0] != '@' || parse_hunk_header(line, len, + &minus_first, &minus_len, &plus_first, &plus_len)) return; - ptr = buffer->text.ptr + buffer->current; - buffer->current += len; + /* POSIX requires that first be decremented by one if len == 0... */ + if (minus_len) { + minus_begin = diff_words->minus.orig[minus_first].begin; + minus_end = + diff_words->minus.orig[minus_first + minus_len - 1].end; + } else + minus_begin = minus_end = + diff_words->minus.orig[minus_first].end; - if (ptr[len - 1] == '\n') { - eol = 1; - len--; - } + if (plus_len) { + plus_begin = diff_words->plus.orig[plus_first].begin; + plus_end = diff_words->plus.orig[plus_first + plus_len - 1].end; + } else + plus_begin = plus_end = diff_words->plus.orig[plus_first].end; - fputs(diff_get_color(1, color), file); - fwrite(ptr, len, 1, file); - fputs(diff_get_color(1, DIFF_RESET), file); + if (diff_words->current_plus != plus_begin) + fwrite(diff_words->current_plus, + plus_begin - diff_words->current_plus, 1, + diff_words->file); + if (minus_begin != minus_end) + color_fwrite_lines(diff_words->file, + diff_get_color(1, DIFF_FILE_OLD), + minus_end - minus_begin, minus_begin); + if (plus_begin != plus_end) + color_fwrite_lines(diff_words->file, + diff_get_color(1, DIFF_FILE_NEW), + plus_end - plus_begin, plus_begin); - if (eol) { - if (suppress_newline) - buffer->suppressed_newline = 1; - else - putc('\n', file); - } -} - -static void fn_out_diff_words_aux(void *priv, char *line, unsigned long len) -{ - struct diff_words_data *diff_words = priv; - - if (diff_words->minus.suppressed_newline) { - if (line[0] != '+') - putc('\n', diff_words->file); - diff_words->minus.suppressed_newline = 0; - } - - len--; - switch (line[0]) { - case '-': - print_word(diff_words->file, - &diff_words->minus, len, DIFF_FILE_OLD, 1); - break; - case '+': - print_word(diff_words->file, - &diff_words->plus, len, DIFF_FILE_NEW, 0); - break; - case ' ': - print_word(diff_words->file, - &diff_words->plus, len, DIFF_PLAIN, 0); - diff_words->minus.current += len; - break; - } + diff_words->current_plus = plus_end; } /* - * This function splits the words in buffer->text, and stores the list with - * newline separator into out. + * This function splits the words in buffer->text, stores the list with + * newline separator into out, and saves the offsets of the original words + * in buffer->orig. */ static void diff_words_fill(struct diff_words_buffer *buffer, mmfile_t *out) { - int i; - out->size = buffer->text.size; - out->ptr = xmalloc(out->size); - memcpy(out->ptr, buffer->text.ptr, out->size); - for (i = 0; i < out->size; i++) - if (isspace(out->ptr[i])) - out->ptr[i] = '\n'; - buffer->current = 0; + int i, j; + + out->size = 0; + out->ptr = xmalloc(buffer->text.size); + + /* fake an empty "0th" word */ + ALLOC_GROW(buffer->orig, 1, buffer->orig_alloc); + buffer->orig[0].begin = buffer->orig[0].end = buffer->text.ptr; + buffer->orig_nr = 1; + + for (i = 0; i < buffer->text.size; i++) { + if (isspace(buffer->text.ptr[i])) + continue; + for (j = i + 1; j < buffer->text.size && + !isspace(buffer->text.ptr[j]); j++) + ; /* find the end of the word */ + + /* store original boundaries */ + ALLOC_GROW(buffer->orig, buffer->orig_nr + 1, + buffer->orig_alloc); + buffer->orig[buffer->orig_nr].begin = buffer->text.ptr + i; + buffer->orig[buffer->orig_nr].end = buffer->text.ptr + j; + buffer->orig_nr++; + + /* store one word */ + memcpy(out->ptr + out->size, buffer->text.ptr + i, j - i); + out->ptr[out->size + j - i] = '\n'; + out->size += j - i + 1; + + i = j - 1; + } } /* this executes the word diff on the accumulated buffers */ @@ -419,22 +430,34 @@ static void diff_words_show(struct diff_words_data *diff_words) xdemitcb_t ecb; mmfile_t minus, plus; + /* special case: only removal */ + if (!diff_words->plus.text.size) { + color_fwrite_lines(diff_words->file, + diff_get_color(1, DIFF_FILE_OLD), + diff_words->minus.text.size, diff_words->minus.text.ptr); + diff_words->minus.text.size = 0; + return; + } + + diff_words->current_plus = diff_words->plus.text.ptr; + memset(&xpp, 0, sizeof(xpp)); memset(&xecfg, 0, sizeof(xecfg)); diff_words_fill(&diff_words->minus, &minus); diff_words_fill(&diff_words->plus, &plus); xpp.flags = XDF_NEED_MINIMAL; - xecfg.ctxlen = diff_words->minus.alloc + diff_words->plus.alloc; + xecfg.ctxlen = 0; xdi_diff_outf(&minus, &plus, fn_out_diff_words_aux, diff_words, &xpp, &xecfg, &ecb); free(minus.ptr); free(plus.ptr); + if (diff_words->current_plus != diff_words->plus.text.ptr + + diff_words->plus.text.size) + fwrite(diff_words->current_plus, + diff_words->plus.text.ptr + diff_words->plus.text.size + - diff_words->current_plus, 1, + diff_words->file); diff_words->minus.text.size = diff_words->plus.text.size = 0; - - if (diff_words->minus.suppressed_newline) { - putc('\n', diff_words->file); - diff_words->minus.suppressed_newline = 0; - } } typedef unsigned long (*sane_truncate_fn)(char *line, unsigned long len); @@ -458,7 +481,9 @@ static void free_diff_words_data(struct emit_callback *ecbdata) diff_words_show(ecbdata->diff_words); free (ecbdata->diff_words->minus.text.ptr); + free (ecbdata->diff_words->minus.orig); free (ecbdata->diff_words->plus.text.ptr); + free (ecbdata->diff_words->plus.orig); free(ecbdata->diff_words); ecbdata->diff_words = NULL; } diff --git a/t/t4034-diff-words.sh b/t/t4034-diff-words.sh new file mode 100755 index 000000000..b22195f8b --- /dev/null +++ b/t/t4034-diff-words.sh @@ -0,0 +1,66 @@ +#!/bin/sh + +test_description='word diff colors' + +. ./test-lib.sh + +test_expect_success setup ' + + git config diff.color.old red + git config diff.color.new green + +' + +decrypt_color () { + sed \ + -e 's/.\[1m//g' \ + -e 's/.\[31m//g' \ + -e 's/.\[32m//g' \ + -e 's/.\[36m//g' \ + -e 's/.\[m//g' +} + +word_diff () { + test_must_fail git diff --no-index "$@" pre post > output && + decrypt_color < output > output.decrypted && + test_cmp expect output.decrypted +} + +cat > pre <<\EOF +h(4) + +a = b + c +EOF + +cat > post <<\EOF +h(4),hh[44] + +a = b + c + +aa = a + +aeff = aeff * ( aaa ) +EOF + +cat > expect <<\EOF +diff --git a/pre b/post +index 330b04f..5ed8eff 100644 +--- a/pre ++++ b/post +@@ -1,3 +1,7 @@ +h(4)h(4),hh[44] + +a = b + c + +aa = a + +aeff = aeff * ( aaa ) +EOF + +test_expect_success 'word diff with runs of whitespace' ' + + word_diff --color-words + +' + +test_done From 2b6a5417d750d086d1da906e46de2b3ad8df6753 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sat, 17 Jan 2009 17:29:45 +0100 Subject: [PATCH 4/9] color-words: take an optional regular expression describing words In some applications, words are not delimited by white space. To allow for that, you can specify a regular expression describing what makes a word with git diff --color-words='[A-Za-z0-9]+' Note that words cannot contain newline characters. As suggested by Thomas Rast, the words are the exact matches of the regular expression. Note that a regular expression beginning with a '^' will match only a word at the beginning of the hunk, not a word at the beginning of a line, and is probably not what you want. This commit contains a quoting fix by Thomas Rast. Signed-off-by: Johannes Schindelin Signed-off-by: Junio C Hamano --- Documentation/diff-options.txt | 6 +++- diff.c | 64 +++++++++++++++++++++++++++++----- diff.h | 1 + t/t4034-diff-words.sh | 57 ++++++++++++++++++++++++++++++ 4 files changed, 118 insertions(+), 10 deletions(-) diff --git a/Documentation/diff-options.txt b/Documentation/diff-options.txt index 43793d750..2c1fa4b10 100644 --- a/Documentation/diff-options.txt +++ b/Documentation/diff-options.txt @@ -91,8 +91,12 @@ endif::git-format-patch[] Turn off colored diff, even when the configuration file gives the default to color output. ---color-words:: +--color-words[=regex]:: Show colored word diff, i.e. color words which have changed. ++ +Optionally, you can pass a regular expression that tells Git what the +words are that you are looking for; The default is to interpret any +stretch of non-whitespace as a word. --no-renames:: Turn off rename detection, even when the configuration diff --git a/diff.c b/diff.c index 37c886a81..9fb3d0df3 100644 --- a/diff.c +++ b/diff.c @@ -333,12 +333,14 @@ static void diff_words_append(char *line, unsigned long len, len--; memcpy(buffer->text.ptr + buffer->text.size, line, len); buffer->text.size += len; + buffer->text.ptr[buffer->text.size] = '\0'; } struct diff_words_data { struct diff_words_buffer minus, plus; const char *current_plus; FILE *file; + regex_t *word_regex; }; static void fn_out_diff_words_aux(void *priv, char *line, unsigned long len) @@ -382,17 +384,49 @@ static void fn_out_diff_words_aux(void *priv, char *line, unsigned long len) diff_words->current_plus = plus_end; } +/* This function starts looking at *begin, and returns 0 iff a word was found. */ +static int find_word_boundaries(mmfile_t *buffer, regex_t *word_regex, + int *begin, int *end) +{ + if (word_regex && *begin < buffer->size) { + regmatch_t match[1]; + if (!regexec(word_regex, buffer->ptr + *begin, 1, match, 0)) { + char *p = memchr(buffer->ptr + *begin + match[0].rm_so, + '\n', match[0].rm_eo - match[0].rm_so); + *end = p ? p - buffer->ptr : match[0].rm_eo + *begin; + *begin += match[0].rm_so; + return *begin >= *end; + } + return -1; + } + + /* find the next word */ + while (*begin < buffer->size && isspace(buffer->ptr[*begin])) + (*begin)++; + if (*begin >= buffer->size) + return -1; + + /* find the end of the word */ + *end = *begin + 1; + while (*end < buffer->size && !isspace(buffer->ptr[*end])) + (*end)++; + + return 0; +} + /* * This function splits the words in buffer->text, stores the list with * newline separator into out, and saves the offsets of the original words * in buffer->orig. */ -static void diff_words_fill(struct diff_words_buffer *buffer, mmfile_t *out) +static void diff_words_fill(struct diff_words_buffer *buffer, mmfile_t *out, + regex_t *word_regex) { int i, j; + long alloc = 0; out->size = 0; - out->ptr = xmalloc(buffer->text.size); + out->ptr = NULL; /* fake an empty "0th" word */ ALLOC_GROW(buffer->orig, 1, buffer->orig_alloc); @@ -400,11 +434,8 @@ static void diff_words_fill(struct diff_words_buffer *buffer, mmfile_t *out) buffer->orig_nr = 1; for (i = 0; i < buffer->text.size; i++) { - if (isspace(buffer->text.ptr[i])) - continue; - for (j = i + 1; j < buffer->text.size && - !isspace(buffer->text.ptr[j]); j++) - ; /* find the end of the word */ + if (find_word_boundaries(&buffer->text, word_regex, &i, &j)) + return; /* store original boundaries */ ALLOC_GROW(buffer->orig, buffer->orig_nr + 1, @@ -414,6 +445,7 @@ static void diff_words_fill(struct diff_words_buffer *buffer, mmfile_t *out) buffer->orig_nr++; /* store one word */ + ALLOC_GROW(out->ptr, out->size + j - i + 1, alloc); memcpy(out->ptr + out->size, buffer->text.ptr + i, j - i); out->ptr[out->size + j - i] = '\n'; out->size += j - i + 1; @@ -443,9 +475,10 @@ static void diff_words_show(struct diff_words_data *diff_words) memset(&xpp, 0, sizeof(xpp)); memset(&xecfg, 0, sizeof(xecfg)); - diff_words_fill(&diff_words->minus, &minus); - diff_words_fill(&diff_words->plus, &plus); + diff_words_fill(&diff_words->minus, &minus, diff_words->word_regex); + diff_words_fill(&diff_words->plus, &plus, diff_words->word_regex); xpp.flags = XDF_NEED_MINIMAL; + /* as only the hunk header will be parsed, we need a 0-context */ xecfg.ctxlen = 0; xdi_diff_outf(&minus, &plus, fn_out_diff_words_aux, diff_words, &xpp, &xecfg, &ecb); @@ -484,6 +517,7 @@ static void free_diff_words_data(struct emit_callback *ecbdata) free (ecbdata->diff_words->minus.orig); free (ecbdata->diff_words->plus.text.ptr); free (ecbdata->diff_words->plus.orig); + free(ecbdata->diff_words->word_regex); free(ecbdata->diff_words); ecbdata->diff_words = NULL; } @@ -1506,6 +1540,14 @@ static void builtin_diff(const char *name_a, ecbdata.diff_words = xcalloc(1, sizeof(struct diff_words_data)); ecbdata.diff_words->file = o->file; + if (o->word_regex) { + ecbdata.diff_words->word_regex = (regex_t *) + xmalloc(sizeof(regex_t)); + if (regcomp(ecbdata.diff_words->word_regex, + o->word_regex, REG_EXTENDED)) + die ("Invalid regular expression: %s", + o->word_regex); + } } xdi_diff_outf(&mf1, &mf2, fn_out_consume, &ecbdata, &xpp, &xecfg, &ecb); @@ -2517,6 +2559,10 @@ int diff_opt_parse(struct diff_options *options, const char **av, int ac) DIFF_OPT_CLR(options, COLOR_DIFF); else if (!strcmp(arg, "--color-words")) options->flags |= DIFF_OPT_COLOR_DIFF | DIFF_OPT_COLOR_DIFF_WORDS; + else if (!prefixcmp(arg, "--color-words=")) { + options->flags |= DIFF_OPT_COLOR_DIFF | DIFF_OPT_COLOR_DIFF_WORDS; + options->word_regex = arg + 14; + } else if (!strcmp(arg, "--exit-code")) DIFF_OPT_SET(options, EXIT_WITH_STATUS); else if (!strcmp(arg, "--quiet")) diff --git a/diff.h b/diff.h index 4d5a32781..23cd90c2e 100644 --- a/diff.h +++ b/diff.h @@ -98,6 +98,7 @@ struct diff_options { int stat_width; int stat_name_width; + const char *word_regex; /* this is set by diffcore for DIFF_FORMAT_PATCH */ int found_changes; diff --git a/t/t4034-diff-words.sh b/t/t4034-diff-words.sh index b22195f8b..487348630 100755 --- a/t/t4034-diff-words.sh +++ b/t/t4034-diff-words.sh @@ -63,4 +63,61 @@ test_expect_success 'word diff with runs of whitespace' ' ' +cat > expect <<\EOF +diff --git a/pre b/post +index 330b04f..5ed8eff 100644 +--- a/pre ++++ b/post +@@ -1,3 +1,7 @@ +h(4),hh[44] + +a = b + c + +aa = a + +aeff = aeff * ( aaa ) +EOF + +test_expect_success 'word diff with a regular expression' ' + + word_diff --color-words="[a-z]+" + +' + +echo 'aaa (aaa)' > pre +echo 'aaa (aaa) aaa' > post + +cat > expect <<\EOF +diff --git a/pre b/post +index c29453b..be22f37 100644 +--- a/pre ++++ b/post +@@ -1 +1 @@ +aaa (aaa) aaa +EOF + +test_expect_success 'test parsing words for newline' ' + + word_diff --color-words="a+" + +' + +echo '(:' > pre +echo '(' > post + +cat > expect <<\EOF +diff --git a/pre b/post +index 289cb9d..2d06f37 100644 +--- a/pre ++++ b/post +@@ -1 +1 @@ +(: +EOF + +test_expect_success 'test when words are only removed at the end' ' + + word_diff --color-words=. + +' + test_done From bf82940dbf12f066ba42a2a03a5bb626ba22c067 Mon Sep 17 00:00:00 2001 From: Thomas Rast Date: Sat, 17 Jan 2009 17:29:46 +0100 Subject: [PATCH 5/9] color-words: enable REG_NEWLINE to help user We silently truncate a match at the newline, which may lead to unexpected behaviour, e.g., when matching "<[^>]*>" against since then "" doesn't!) even though the regex said only angle-bracket-delimited things can be words. To alleviate the problem slightly, use REG_NEWLINE so that negated classes can't match a newline. Of course newlines can still be matched explicitly. Signed-off-by: Thomas Rast Signed-off-by: Junio C Hamano --- diff.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/diff.c b/diff.c index 9fb3d0df3..00c661f82 100644 --- a/diff.c +++ b/diff.c @@ -1544,7 +1544,8 @@ static void builtin_diff(const char *name_a, ecbdata.diff_words->word_regex = (regex_t *) xmalloc(sizeof(regex_t)); if (regcomp(ecbdata.diff_words->word_regex, - o->word_regex, REG_EXTENDED)) + o->word_regex, + REG_EXTENDED | REG_NEWLINE)) die ("Invalid regular expression: %s", o->word_regex); } From c4b252c3d894673968b144d8e10b79ef22c17b0a Mon Sep 17 00:00:00 2001 From: Thomas Rast Date: Sat, 17 Jan 2009 17:29:47 +0100 Subject: [PATCH 6/9] color-words: expand docs with precise semantics Signed-off-by: Thomas Rast Signed-off-by: Junio C Hamano --- Documentation/diff-options.txt | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/Documentation/diff-options.txt b/Documentation/diff-options.txt index 2c1fa4b10..8689a92d8 100644 --- a/Documentation/diff-options.txt +++ b/Documentation/diff-options.txt @@ -91,12 +91,17 @@ endif::git-format-patch[] Turn off colored diff, even when the configuration file gives the default to color output. ---color-words[=regex]:: - Show colored word diff, i.e. color words which have changed. +--color-words[=]:: + Show colored word diff, i.e., color words which have changed. + By default, words are separated by whitespace. + -Optionally, you can pass a regular expression that tells Git what the -words are that you are looking for; The default is to interpret any -stretch of non-whitespace as a word. +When a is specified, every non-overlapping match of the + is considered a word. Anything between these matches is +considered whitespace and ignored(!) for the purposes of finding +differences. You may want to append `|[^[:space:]]` to your regular +expression to make sure that it matches all non-whitespace characters. +A match that contains a newline is silently truncated(!) at the +newline. --no-renames:: Turn off rename detection, even when the configuration From 80c49c3de2d5a3aa12b0980a65f1163c8aef0c16 Mon Sep 17 00:00:00 2001 From: Thomas Rast Date: Sat, 17 Jan 2009 17:29:48 +0100 Subject: [PATCH 7/9] color-words: make regex configurable via attributes Make the --color-words splitting regular expression configurable via the diff driver's 'wordregex' attribute. The user can then set the driver on a file in .gitattributes. If a regex is given on the command line, it overrides the driver's setting. We also provide built-in regexes for the languages that already had funcname patterns, and add an appropriate diff driver entry for C/++. (The patterns are designed to run UTF-8 sequences into a single chunk to make sure they remain readable.) Signed-off-by: Thomas Rast Signed-off-by: Junio C Hamano --- Documentation/diff-options.txt | 4 ++ Documentation/gitattributes.txt | 21 +++++++++ diff.c | 10 +++++ t/t4034-diff-words.sh | 36 +++++++++++++++ userdiff.c | 78 ++++++++++++++++++++++++++------- userdiff.h | 1 + 6 files changed, 135 insertions(+), 15 deletions(-) diff --git a/Documentation/diff-options.txt b/Documentation/diff-options.txt index 8689a92d8..1edb82e8e 100644 --- a/Documentation/diff-options.txt +++ b/Documentation/diff-options.txt @@ -102,6 +102,10 @@ differences. You may want to append `|[^[:space:]]` to your regular expression to make sure that it matches all non-whitespace characters. A match that contains a newline is silently truncated(!) at the newline. ++ +The regex can also be set via a diff driver, see +linkgit:gitattributes[1]; giving it explicitly overrides any diff +driver setting. --no-renames:: Turn off rename detection, even when the configuration diff --git a/Documentation/gitattributes.txt b/Documentation/gitattributes.txt index 8af22ecca..ba3ba1273 100644 --- a/Documentation/gitattributes.txt +++ b/Documentation/gitattributes.txt @@ -317,6 +317,8 @@ patterns are available: - `bibtex` suitable for files with BibTeX coded references. +- `cpp` suitable for source code in the C and C++ languages. + - `html` suitable for HTML/XHTML documents. - `java` suitable for source code in the Java language. @@ -334,6 +336,25 @@ patterns are available: - `tex` suitable for source code for LaTeX documents. +Customizing word diff +^^^^^^^^^^^^^^^^^^^^^ + +You can customize the rules that `git diff --color-words` uses to +split words in a line, by specifying an appropriate regular expression +in the "diff.*.wordregex" configuration variable. For example, in TeX +a backslash followed by a sequence of letters forms a command, but +several such commands can be run together without intervening +whitespace. To separate them, use a regular expression such as + +------------------------ +[diff "tex"] + wordregex = "\\\\[a-zA-Z]+|[{}]|\\\\.|[^\\{}[:space:]]+" +------------------------ + +A built-in pattern is provided for all languages listed in the +previous section. + + Performing text diffs of binary files ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/diff.c b/diff.c index 00c661f82..9fcde963d 100644 --- a/diff.c +++ b/diff.c @@ -1380,6 +1380,12 @@ static const struct userdiff_funcname *diff_funcname_pattern(struct diff_filespe return one->driver->funcname.pattern ? &one->driver->funcname : NULL; } +static const char *userdiff_word_regex(struct diff_filespec *one) +{ + diff_filespec_load_driver(one); + return one->driver->word_regex; +} + void diff_set_mnemonic_prefix(struct diff_options *options, const char *a, const char *b) { if (!options->a_prefix) @@ -1540,6 +1546,10 @@ static void builtin_diff(const char *name_a, ecbdata.diff_words = xcalloc(1, sizeof(struct diff_words_data)); ecbdata.diff_words->file = o->file; + if (!o->word_regex) + o->word_regex = userdiff_word_regex(one); + if (!o->word_regex) + o->word_regex = userdiff_word_regex(two); if (o->word_regex) { ecbdata.diff_words->word_regex = (regex_t *) xmalloc(sizeof(regex_t)); diff --git a/t/t4034-diff-words.sh b/t/t4034-diff-words.sh index 487348630..744221bef 100755 --- a/t/t4034-diff-words.sh +++ b/t/t4034-diff-words.sh @@ -84,6 +84,41 @@ test_expect_success 'word diff with a regular expression' ' ' +test_expect_success 'set a diff driver' ' + git config diff.testdriver.wordregex "[^[:space:]]" && + cat < .gitattributes +pre diff=testdriver +post diff=testdriver +EOF +' + +test_expect_success 'option overrides default' ' + + word_diff --color-words="[a-z]+" + +' + +cat > expect <<\EOF +diff --git a/pre b/post +index 330b04f..5ed8eff 100644 +--- a/pre ++++ b/post +@@ -1,3 +1,7 @@ +h(4),hh[44] + +a = b + c + +aa = a + +aeff = aeff * ( aaa ) +EOF + +test_expect_success 'use default supplied by driver' ' + + word_diff --color-words + +' + echo 'aaa (aaa)' > pre echo 'aaa (aaa) aaa' > post @@ -100,6 +135,7 @@ test_expect_success 'test parsing words for newline' ' word_diff --color-words="a+" + ' echo '(:' > pre diff --git a/userdiff.c b/userdiff.c index 3681062eb..2b5550948 100644 --- a/userdiff.c +++ b/userdiff.c @@ -6,14 +6,20 @@ static struct userdiff_driver *drivers; static int ndrivers; static int drivers_alloc; -#define FUNCNAME(name, pattern) \ - { name, NULL, -1, { pattern, REG_EXTENDED } } +#define PATTERNS(name, pattern, wordregex) \ + { name, NULL, -1, { pattern, REG_EXTENDED }, wordregex } static struct userdiff_driver builtin_drivers[] = { -FUNCNAME("html", "^[ \t]*(<[Hh][1-6][ \t].*>.*)$"), -FUNCNAME("java", +PATTERNS("html", "^[ \t]*(<[Hh][1-6][ \t].*>.*)$", + "[^<>= \t]+|[^[:space:]]|[\x80-\xff]+"), +PATTERNS("java", "!^[ \t]*(catch|do|for|if|instanceof|new|return|switch|throw|while)\n" - "^[ \t]*(([ \t]*[A-Za-z_][A-Za-z_0-9]*){2,}[ \t]*\\([^;]*)$"), -FUNCNAME("objc", + "^[ \t]*(([ \t]*[A-Za-z_][A-Za-z_0-9]*){2,}[ \t]*\\([^;]*)$", + "[a-zA-Z_][a-zA-Z0-9_]*" + "|[-+0-9.e]+[fFlL]?|0[xXbB]?[0-9a-fA-F]+[lL]?" + "|[-+*/<>%&^|=!]=" + "|--|\\+\\+|<<=?|>>>?=?|&&|\\|\\|" + "|[^[:space:]]|[\x80-\xff]+"), +PATTERNS("objc", /* Negate C statements that can look like functions */ "!^[ \t]*(do|for|if|else|return|switch|while)\n" /* Objective-C methods */ @@ -21,20 +27,60 @@ FUNCNAME("objc", /* C functions */ "^[ \t]*(([ \t]*[A-Za-z_][A-Za-z_0-9]*){2,}[ \t]*\\([^;]*)$\n" /* Objective-C class/protocol definitions */ - "^(@(implementation|interface|protocol)[ \t].*)$"), -FUNCNAME("pascal", + "^(@(implementation|interface|protocol)[ \t].*)$", + /* -- */ + "[a-zA-Z_][a-zA-Z0-9_]*" + "|[-+0-9.e]+[fFlL]?|0[xXbB]?[0-9a-fA-F]+[lL]?" + "|[-+*/<>%&^|=!]=|--|\\+\\+|<<=?|>>=?|&&|\\|\\||::|->" + "|[^[:space:]]|[\x80-\xff]+"), +PATTERNS("pascal", "^((procedure|function|constructor|destructor|interface|" "implementation|initialization|finalization)[ \t]*.*)$" "\n" - "^(.*=[ \t]*(class|record).*)$"), -FUNCNAME("php", "^[\t ]*((function|class).*)"), -FUNCNAME("python", "^[ \t]*((class|def)[ \t].*)$"), -FUNCNAME("ruby", "^[ \t]*((class|module|def)[ \t].*)$"), -FUNCNAME("bibtex", "(@[a-zA-Z]{1,}[ \t]*\\{{0,1}[ \t]*[^ \t\"@',\\#}{~%]*).*$"), -FUNCNAME("tex", "^(\\\\((sub)*section|chapter|part)\\*{0,1}\\{.*)$"), + "^(.*=[ \t]*(class|record).*)$", + /* -- */ + "[a-zA-Z_][a-zA-Z0-9_]*" + "|[-+0-9.e]+|0[xXbB]?[0-9a-fA-F]+" + "|<>|<=|>=|:=|\\.\\." + "|[^[:space:]]|[\x80-\xff]+"), +PATTERNS("php", "^[\t ]*((function|class).*)", + /* -- */ + "[a-zA-Z_][a-zA-Z0-9_]*" + "|[-+0-9.e]+|0[xXbB]?[0-9a-fA-F]+" + "|[-+*/<>%&^|=!.]=|--|\\+\\+|<<=?|>>=?|===|&&|\\|\\||::|->" + "|[^[:space:]]|[\x80-\xff]+"), +PATTERNS("python", "^[ \t]*((class|def)[ \t].*)$", + /* -- */ + "[a-zA-Z_][a-zA-Z0-9_]*" + "|[-+0-9.e]+[jJlL]?|0[xX]?[0-9a-fA-F]+[lL]?" + "|[-+*/<>%&^|=!]=|//=?|<<=?|>>=?|\\*\\*=?" + "|[^[:space:]|[\x80-\xff]+"), + /* -- */ +PATTERNS("ruby", "^[ \t]*((class|module|def)[ \t].*)$", + /* -- */ + "(@|@@|\\$)?[a-zA-Z_][a-zA-Z0-9_]*" + "|[-+0-9.e]+|0[xXbB]?[0-9a-fA-F]+|\\?(\\\\C-)?(\\\\M-)?." + "|//=?|[-+*/<>%&^|=!]=|<<=?|>>=?|===|\\.{1,3}|::|[!=]~" + "|[^[:space:]|[\x80-\xff]+"), +PATTERNS("bibtex", "(@[a-zA-Z]{1,}[ \t]*\\{{0,1}[ \t]*[^ \t\"@',\\#}{~%]*).*$", + "[={}\"]|[^={}\" \t]+"), +PATTERNS("tex", "^(\\\\((sub)*section|chapter|part)\\*{0,1}\\{.*)$", + "\\\\[a-zA-Z@]+|\\\\.|[a-zA-Z0-9\x80-\xff]+|[^[:space:]]"), +PATTERNS("cpp", + /* Jump targets or access declarations */ + "!^[ \t]*[A-Za-z_][A-Za-z_0-9]*:.*$\n" + /* C/++ functions/methods at top level */ + "^([A-Za-z_][A-Za-z_0-9]*([ \t]+[A-Za-z_][A-Za-z_0-9]*([ \t]*::[ \t]*[^[:space:]]+)?){1,}[ \t]*\\([^;]*)$\n" + /* compound type at top level */ + "^((struct|class|enum)[^;]*)$", + /* -- */ + "[a-zA-Z_][a-zA-Z0-9_]*" + "|[-+0-9.e]+[fFlL]?|0[xXbB]?[0-9a-fA-F]+[lL]?" + "|[-+*/<>%&^|=!]=|--|\\+\\+|<<=?|>>=?|&&|\\|\\||::|->" + "|[^[:space:]]|[\x80-\xff]+"), { "default", NULL, -1, { NULL, 0 } }, }; -#undef FUNCNAME +#undef PATTERNS static struct userdiff_driver driver_true = { "diff=true", @@ -134,6 +180,8 @@ int userdiff_config(const char *k, const char *v) return parse_string(&drv->external, k, v); if ((drv = parse_driver(k, v, "textconv"))) return parse_string(&drv->textconv, k, v); + if ((drv = parse_driver(k, v, "wordregex"))) + return parse_string(&drv->word_regex, k, v); return 0; } diff --git a/userdiff.h b/userdiff.h index ba2945770..c3151594f 100644 --- a/userdiff.h +++ b/userdiff.h @@ -11,6 +11,7 @@ struct userdiff_driver { const char *external; int binary; struct userdiff_funcname funcname; + const char *word_regex; const char *textconv; }; From 98a4d87b87e9846eafd21ba232cc2b7ba3f718fc Mon Sep 17 00:00:00 2001 From: Boyd Stephen Smith Jr Date: Tue, 20 Jan 2009 21:46:57 -0600 Subject: [PATCH 8/9] color-words: Support diff.wordregex config option When diff is invoked with --color-words (w/o =regex), use the regular expression the user has configured as diff.wordregex. diff drivers configured via attributes take precedence over the diff.wordregex-words setting. If the user wants to change them, they have their own configuration variables. Signed-off-by: Boyd Stephen Smith Jr Signed-off-by: Junio C Hamano --- Documentation/config.txt | 6 +++++ Documentation/diff-options.txt | 7 +++--- diff.c | 5 ++++ t/t4034-diff-words.sh | 45 ++++++++++++++++++++++++++++++++-- 4 files changed, 58 insertions(+), 5 deletions(-) diff --git a/Documentation/config.txt b/Documentation/config.txt index 7408bb2d3..0ca983ac3 100644 --- a/Documentation/config.txt +++ b/Documentation/config.txt @@ -639,6 +639,12 @@ diff.suppress-blank-empty:: A boolean to inhibit the standard behavior of printing a space before each empty output line. Defaults to false. +diff.wordregex:: + A POSIX Extended Regular Expression used to determine what is a "word" + when performing word-by-word difference calculations. Character + sequences that match the regular expression are "words", all other + characters are *ignorable* whitespace. + fetch.unpackLimit:: If the number of objects fetched over the git native transfer is below this diff --git a/Documentation/diff-options.txt b/Documentation/diff-options.txt index 1edb82e8e..164e2c534 100644 --- a/Documentation/diff-options.txt +++ b/Documentation/diff-options.txt @@ -103,9 +103,10 @@ expression to make sure that it matches all non-whitespace characters. A match that contains a newline is silently truncated(!) at the newline. + -The regex can also be set via a diff driver, see -linkgit:gitattributes[1]; giving it explicitly overrides any diff -driver setting. +The regex can also be set via a diff driver or configuration option, see +linkgit:gitattributes[1] or linkgit:git-config[1]. Giving it explicitly +overrides any diff driver or configuration setting. Diff drivers +override configuration settings. --no-renames:: Turn off rename detection, even when the configuration diff --git a/diff.c b/diff.c index 9fcde963d..ed8b83c68 100644 --- a/diff.c +++ b/diff.c @@ -23,6 +23,7 @@ static int diff_detect_rename_default; static int diff_rename_limit_default = 200; static int diff_suppress_blank_empty; int diff_use_color_default = -1; +static const char *diff_word_regex_cfg; static const char *external_diff_cmd_cfg; int diff_auto_refresh_index = 1; static int diff_mnemonic_prefix; @@ -92,6 +93,8 @@ int git_diff_ui_config(const char *var, const char *value, void *cb) } if (!strcmp(var, "diff.external")) return git_config_string(&external_diff_cmd_cfg, var, value); + if (!strcmp(var, "diff.wordregex")) + return git_config_string(&diff_word_regex_cfg, var, value); return git_diff_basic_config(var, value, cb); } @@ -1550,6 +1553,8 @@ static void builtin_diff(const char *name_a, o->word_regex = userdiff_word_regex(one); if (!o->word_regex) o->word_regex = userdiff_word_regex(two); + if (!o->word_regex) + o->word_regex = diff_word_regex_cfg; if (o->word_regex) { ecbdata.diff_words->word_regex = (regex_t *) xmalloc(sizeof(regex_t)); diff --git a/t/t4034-diff-words.sh b/t/t4034-diff-words.sh index 744221bef..6bcc15308 100755 --- a/t/t4034-diff-words.sh +++ b/t/t4034-diff-words.sh @@ -77,6 +77,7 @@ a = b + c aeff = aeff * ( aaa ) EOF +cp expect expect.letter-runs-are-words test_expect_success 'word diff with a regular expression' ' @@ -92,7 +93,7 @@ post diff=testdriver EOF ' -test_expect_success 'option overrides default' ' +test_expect_success 'option overrides .gitattributes' ' word_diff --color-words="[a-z]+" @@ -112,13 +113,53 @@ a = b + c aeff = aeff * ( aaa ) EOF +cp expect expect.non-whitespace-is-word -test_expect_success 'use default supplied by driver' ' +test_expect_success 'use regex supplied by driver' ' word_diff --color-words ' +test_expect_success 'set diff.wordregex option' ' + git config diff.wordregex "[[:alnum:]]+" +' + +cp expect.letter-runs-are-words expect + +test_expect_success 'command-line overrides config' ' + word_diff --color-words="[a-z]+" +' + +cp expect.non-whitespace-is-word expect + +test_expect_success '.gitattributes override config' ' + word_diff --color-words +' + +test_expect_success 'remove diff driver regex' ' + git config --unset diff.testdriver.wordregex +' + +cat > expect <<\EOF +diff --git a/pre b/post +index 330b04f..5ed8eff 100644 +--- a/pre ++++ b/post +@@ -1,3 +1,7 @@ +h(4),hh[44] + +a = b + c + +aa = a + +aeff = aeff * ( aaa ) +EOF + +test_expect_success 'use configured regex' ' + word_diff --color-words +' + echo 'aaa (aaa)' > pre echo 'aaa (aaa) aaa' > post From ae3b970ac3e21324a95fea75213c2569180d74c6 Mon Sep 17 00:00:00 2001 From: Boyd Stephen Smith Jr Date: Tue, 20 Jan 2009 22:59:54 -0600 Subject: [PATCH 9/9] Change the spelling of "wordregex". Use "wordRegex" for configuration variable names. Use "word_regex" for C language tokens. Signed-off-by: Boyd Stephen Smith Jr. Signed-off-by: Junio C Hamano --- Documentation/config.txt | 2 +- Documentation/gitattributes.txt | 4 ++-- t/t4034-diff-words.sh | 8 ++++---- userdiff.c | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Documentation/config.txt b/Documentation/config.txt index 0ca983ac3..332213e65 100644 --- a/Documentation/config.txt +++ b/Documentation/config.txt @@ -639,7 +639,7 @@ diff.suppress-blank-empty:: A boolean to inhibit the standard behavior of printing a space before each empty output line. Defaults to false. -diff.wordregex:: +diff.wordRegex:: A POSIX Extended Regular Expression used to determine what is a "word" when performing word-by-word difference calculations. Character sequences that match the regular expression are "words", all other diff --git a/Documentation/gitattributes.txt b/Documentation/gitattributes.txt index ba3ba1273..227934f59 100644 --- a/Documentation/gitattributes.txt +++ b/Documentation/gitattributes.txt @@ -341,14 +341,14 @@ Customizing word diff You can customize the rules that `git diff --color-words` uses to split words in a line, by specifying an appropriate regular expression -in the "diff.*.wordregex" configuration variable. For example, in TeX +in the "diff.*.wordRegex" configuration variable. For example, in TeX a backslash followed by a sequence of letters forms a command, but several such commands can be run together without intervening whitespace. To separate them, use a regular expression such as ------------------------ [diff "tex"] - wordregex = "\\\\[a-zA-Z]+|[{}]|\\\\.|[^\\{}[:space:]]+" + wordRegex = "\\\\[a-zA-Z]+|[{}]|\\\\.|[^\\{}[:space:]]+" ------------------------ A built-in pattern is provided for all languages listed in the diff --git a/t/t4034-diff-words.sh b/t/t4034-diff-words.sh index 6bcc15308..4508effca 100755 --- a/t/t4034-diff-words.sh +++ b/t/t4034-diff-words.sh @@ -86,7 +86,7 @@ test_expect_success 'word diff with a regular expression' ' ' test_expect_success 'set a diff driver' ' - git config diff.testdriver.wordregex "[^[:space:]]" && + git config diff.testdriver.wordRegex "[^[:space:]]" && cat < .gitattributes pre diff=testdriver post diff=testdriver @@ -121,8 +121,8 @@ test_expect_success 'use regex supplied by driver' ' ' -test_expect_success 'set diff.wordregex option' ' - git config diff.wordregex "[[:alnum:]]+" +test_expect_success 'set diff.wordRegex option' ' + git config diff.wordRegex "[[:alnum:]]+" ' cp expect.letter-runs-are-words expect @@ -138,7 +138,7 @@ test_expect_success '.gitattributes override config' ' ' test_expect_success 'remove diff driver regex' ' - git config --unset diff.testdriver.wordregex + git config --unset diff.testdriver.wordRegex ' cat > expect <<\EOF diff --git a/userdiff.c b/userdiff.c index 2b5550948..d556da975 100644 --- a/userdiff.c +++ b/userdiff.c @@ -6,8 +6,8 @@ static struct userdiff_driver *drivers; static int ndrivers; static int drivers_alloc; -#define PATTERNS(name, pattern, wordregex) \ - { name, NULL, -1, { pattern, REG_EXTENDED }, wordregex } +#define PATTERNS(name, pattern, word_regex) \ + { name, NULL, -1, { pattern, REG_EXTENDED }, word_regex } static struct userdiff_driver builtin_drivers[] = { PATTERNS("html", "^[ \t]*(<[Hh][1-6][ \t].*>.*)$", "[^<>= \t]+|[^[:space:]]|[\x80-\xff]+"),