From 11f944dd6bdabd003325c85dc60b16389d012361 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 18 Feb 2011 19:55:19 -0800 Subject: [PATCH 1/3] for_each_hash: allow passing a 'void *data' pointer to callback For the find_exact_renames() function, this allows us to pass the diff_options structure pointer to the low-level routines. We will use that to distinguish between the "rename" and "copy" cases. Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- builtin/describe.c | 4 ++-- diffcore-rename.c | 14 ++++++++------ hash.c | 4 ++-- hash.h | 2 +- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/builtin/describe.c b/builtin/describe.c index 342129fdb..3ba26dc81 100644 --- a/builtin/describe.c +++ b/builtin/describe.c @@ -63,7 +63,7 @@ static inline struct commit_name *find_commit_name(const unsigned char *peeled) return n; } -static int set_util(void *chain) +static int set_util(void *chain, void *data) { struct commit_name *n; for (n = chain; n; n = n->next) { @@ -289,7 +289,7 @@ static void describe(const char *arg, int last_one) fprintf(stderr, "searching to describe %s\n", arg); if (!have_util) { - for_each_hash(&names, set_util); + for_each_hash(&names, set_util, NULL); have_util = 1; } diff --git a/diffcore-rename.c b/diffcore-rename.c index df41be56d..e5e88feb5 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -247,7 +247,8 @@ struct file_similarity { }; static int find_identical_files(struct file_similarity *src, - struct file_similarity *dst) + struct file_similarity *dst, + struct diff_options *options) { int renames = 0; @@ -306,11 +307,12 @@ static void free_similarity_list(struct file_similarity *p) } } -static int find_same_files(void *ptr) +static int find_same_files(void *ptr, void *data) { int ret; struct file_similarity *p = ptr; struct file_similarity *src = NULL, *dst = NULL; + struct diff_options *options = data; /* Split the hash list up into sources and destinations */ do { @@ -329,7 +331,7 @@ static int find_same_files(void *ptr) * If we have both sources *and* destinations, see if * we can match them up */ - ret = (src && dst) ? find_identical_files(src, dst) : 0; + ret = (src && dst) ? find_identical_files(src, dst, options) : 0; /* Free the hashes and return the number of renames found */ free_similarity_list(src); @@ -377,7 +379,7 @@ static void insert_file_table(struct hash_table *table, int src_dst, int index, * and then during the second round we try to match * cache-dirty entries as well. */ -static int find_exact_renames(void) +static int find_exact_renames(struct diff_options *options) { int i; struct hash_table file_table; @@ -390,7 +392,7 @@ static int find_exact_renames(void) insert_file_table(&file_table, 1, i, rename_dst[i].two); /* Find the renames */ - i = for_each_hash(&file_table, find_same_files); + i = for_each_hash(&file_table, find_same_files, options); /* .. and free the hash data structure */ free_hash(&file_table); @@ -467,7 +469,7 @@ void diffcore_rename(struct diff_options *options) * We really want to cull the candidates list early * with cheap tests in order to avoid doing deltas. */ - rename_count = find_exact_renames(); + rename_count = find_exact_renames(options); /* Did we only want exact renames? */ if (minimum_score == MAX_SCORE) diff --git a/hash.c b/hash.c index 1cd4c9d5c..749ecfe48 100644 --- a/hash.c +++ b/hash.c @@ -81,7 +81,7 @@ void **insert_hash(unsigned int hash, void *ptr, struct hash_table *table) return insert_hash_entry(hash, ptr, table); } -int for_each_hash(const struct hash_table *table, int (*fn)(void *)) +int for_each_hash(const struct hash_table *table, int (*fn)(void *, void *), void *data) { int sum = 0; unsigned int i; @@ -92,7 +92,7 @@ int for_each_hash(const struct hash_table *table, int (*fn)(void *)) void *ptr = array->ptr; array++; if (ptr) { - int val = fn(ptr); + int val = fn(ptr, data); if (val < 0) return val; sum += val; diff --git a/hash.h b/hash.h index 69e33a47b..b875ce67c 100644 --- a/hash.h +++ b/hash.h @@ -30,7 +30,7 @@ struct hash_table { extern void *lookup_hash(unsigned int hash, const struct hash_table *table); extern void **insert_hash(unsigned int hash, void *ptr, struct hash_table *table); -extern int for_each_hash(const struct hash_table *table, int (*fn)(void *)); +extern int for_each_hash(const struct hash_table *table, int (*fn)(void *, void *), void *data); extern void free_hash(struct hash_table *table); static inline void init_hash(struct hash_table *table) From 0940e5f211452ec2520d1b04233acddf0a872c06 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 18 Feb 2011 20:10:32 -0800 Subject: [PATCH 2/3] diffcore-rename: properly honor the difference between -M and -C We would allow rename detection to do copy detection even when asked purely for renames. That confuses users, but more importantly it can terminally confuse the recursive merge rename logic. Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- diffcore-rename.c | 53 +++++++++++++++++----------------- t/t4003-diff-rename-1.sh | 2 +- t/t4004-diff-rename-symlink.sh | 2 +- t/t4005-diff-rename-2.sh | 2 +- t/t4008-diff-break-rewrite.sh | 4 +-- t/t4009-diff-rename-4.sh | 2 +- 6 files changed, 32 insertions(+), 33 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index e5e88feb5..b9b039d4a 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -278,6 +278,8 @@ static int find_identical_files(struct file_similarity *src, } /* Give higher scores to sources that haven't been used already */ score = !source->rename_used; + if (source->rename_used && options->detect_rename != DIFF_DETECT_COPY) + continue; score += basename_same(source, target); if (score > best_score) { best = p; @@ -416,6 +418,27 @@ static void record_if_better(struct diff_score m[], struct diff_score *o) m[worst] = *o; } +static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, int copies) +{ + int count = 0, i; + + for (i = 0; i < dst_cnt * NUM_CANDIDATE_PER_DST; i++) { + struct diff_rename_dst *dst; + + if ((mx[i].dst < 0) || + (mx[i].score < minimum_score)) + break; /* there is no more usable pair. */ + dst = &rename_dst[mx[i].dst]; + if (dst->pair) + continue; /* already done, either exact or fuzzy. */ + if (!copies && rename_src[mx[i].src].one->rename_used) + continue; + record_rename_pair(mx[i].dst, mx[i].src, mx[i].score); + count++; + } + return count; +} + void diffcore_rename(struct diff_options *options) { int detect_rename = options->detect_rename; @@ -538,33 +561,9 @@ void diffcore_rename(struct diff_options *options) /* cost matrix sorted by most to least similar pair */ qsort(mx, dst_cnt * NUM_CANDIDATE_PER_DST, sizeof(*mx), score_compare); - for (i = 0; i < dst_cnt * NUM_CANDIDATE_PER_DST; i++) { - struct diff_rename_dst *dst; - - if ((mx[i].dst < 0) || - (mx[i].score < minimum_score)) - break; /* there is no more usable pair. */ - dst = &rename_dst[mx[i].dst]; - if (dst->pair) - continue; /* already done, either exact or fuzzy. */ - if (rename_src[mx[i].src].one->rename_used) - continue; - record_rename_pair(mx[i].dst, mx[i].src, mx[i].score); - rename_count++; - } - - for (i = 0; i < dst_cnt * NUM_CANDIDATE_PER_DST; i++) { - struct diff_rename_dst *dst; - - if ((mx[i].dst < 0) || - (mx[i].score < minimum_score)) - break; /* there is no more usable pair. */ - dst = &rename_dst[mx[i].dst]; - if (dst->pair) - continue; /* already done, either exact or fuzzy. */ - record_rename_pair(mx[i].dst, mx[i].src, mx[i].score); - rename_count++; - } + rename_count += find_renames(mx, dst_cnt, minimum_score, 0); + if (detect_rename == DIFF_DETECT_COPY) + rename_count += find_renames(mx, dst_cnt, minimum_score, 1); free(mx); cleanup: diff --git a/t/t4003-diff-rename-1.sh b/t/t4003-diff-rename-1.sh index c6130c401..bfa883563 100755 --- a/t/t4003-diff-rename-1.sh +++ b/t/t4003-diff-rename-1.sh @@ -29,7 +29,7 @@ test_expect_success \ # copy-and-edit one, and rename-and-edit the other. We do not say # anything about rezrov. -GIT_DIFF_OPTS=--unified=0 git diff-index -M -p $tree >current +GIT_DIFF_OPTS=--unified=0 git diff-index -C -p $tree >current cat >expected <<\EOF diff --git a/COPYING b/COPYING.1 copy from COPYING diff --git a/t/t4004-diff-rename-symlink.sh b/t/t4004-diff-rename-symlink.sh index 92a65f485..6e562c80d 100755 --- a/t/t4004-diff-rename-symlink.sh +++ b/t/t4004-diff-rename-symlink.sh @@ -35,7 +35,7 @@ test_expect_success SYMLINKS \ # a new creation. test_expect_success SYMLINKS 'setup diff output' " - GIT_DIFF_OPTS=--unified=0 git diff-index -M -p $tree >current && + GIT_DIFF_OPTS=--unified=0 git diff-index -C -p $tree >current && cat >expected <<\EOF diff --git a/bozbar b/bozbar new file mode 120000 diff --git a/t/t4005-diff-rename-2.sh b/t/t4005-diff-rename-2.sh index 1ba359d47..77d7f4946 100755 --- a/t/t4005-diff-rename-2.sh +++ b/t/t4005-diff-rename-2.sh @@ -29,7 +29,7 @@ test_expect_success \ # and COPYING.2 are based on COPYING, and do not say anything about # rezrov. -git diff-index -M $tree >current +git diff-index -C $tree >current cat >expected <<\EOF :100644 100644 6ff87c4664981e4397625791c8ea3bbb5f2279a3 0603b3238a076dc6c8022aedc6648fa523a17178 C1234 COPYING COPYING.1 diff --git a/t/t4008-diff-break-rewrite.sh b/t/t4008-diff-break-rewrite.sh index d79d9e1e7..73b4a24f5 100755 --- a/t/t4008-diff-break-rewrite.sh +++ b/t/t4008-diff-break-rewrite.sh @@ -173,8 +173,8 @@ test_expect_success \ 'compare_diff_raw expected current' test_expect_success \ - 'run diff with -B -M' \ - 'git diff-index -B -M "$tree" >current' + 'run diff with -B -C' \ + 'git diff-index -B -C "$tree" >current' cat >expected <<\EOF :100644 100644 f5deac7be59e7eeab8657fd9ae706fd6a57daed2 08bb2fb671deff4c03a4d4a0a1315dff98d5732c C095 file0 file1 diff --git a/t/t4009-diff-rename-4.sh b/t/t4009-diff-rename-4.sh index de3f17478..f22c8e3db 100755 --- a/t/t4009-diff-rename-4.sh +++ b/t/t4009-diff-rename-4.sh @@ -29,7 +29,7 @@ test_expect_success \ # and COPYING.2 are based on COPYING, and do not say anything about # rezrov. -git diff-index -z -M $tree >current +git diff-index -z -C $tree >current cat >expected <<\EOF :100644 100644 6ff87c4664981e4397625791c8ea3bbb5f2279a3 0603b3238a076dc6c8022aedc6648fa523a17178 C1234 From 3a4d67692b5a80213ca47a603fa5505a5990cc87 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 18 Feb 2011 20:12:06 -0800 Subject: [PATCH 3/3] diffcore-rename: improve estimate_similarity() heuristics The logic to quickly dismiss potential rename pairs was broken. It would too eagerly dismiss possible renames when all of the difference was due to pure new data (or deleted data). Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- diffcore-rename.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index b9b039d4a..0cd4c1305 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -170,7 +170,7 @@ static int estimate_similarity(struct diff_filespec *src, * and the final score computation below would not have a * divide-by-zero issue. */ - if (base_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE) + if (max_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE) return 0; if (!src->cnt_data && diff_populate_filespec(src, 0))