From 505f297989f4b1fc4c5cb5d0c94e783e385f8d6d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 25 Oct 2007 11:16:50 -0700 Subject: [PATCH 1/8] Add 'diffcore.h' to LIB_H The diffcore.h header file is included by more than just the internal diff generation files, and needs to be part of the proper dependencies. Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 72f5ef43c..ba969e439 100644 --- a/Makefile +++ b/Makefile @@ -290,7 +290,7 @@ LIB_H = \ run-command.h strbuf.h tag.h tree.h git-compat-util.h revision.h \ tree-walk.h log-tree.h dir.h path-list.h unpack-trees.h builtin.h \ utf8.h reflog-walk.h patch-ids.h attr.h decorate.h progress.h \ - mailmap.h remote.h transport.h + mailmap.h remote.h transport.h diffcore.h DIFF_OBJS = \ diff.o diff-lib.o diffcore-break.o diffcore-order.o \ @@ -917,7 +917,6 @@ git-http-push$X: revision.o http.o http-push.o $(GITLIBS) $(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H) $(patsubst git-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h) -$(DIFF_OBJS): diffcore.h $(LIB_FILE): $(LIB_OBJS) $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS) From cb1491b6bff20748532c9e50afc7f9d6896167a8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 25 Oct 2007 11:17:55 -0700 Subject: [PATCH 2/8] Split out "exact content match" phase of rename detection This makes the exact content match a separate function of its own. Partly to cut down a bit on the size of the diffcore_rename() function (which is too complex as it is), and partly because there are smarter ways to do this than an O(m*n) loop over it all, and that function should be rewritten to take that into account. Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- diffcore-rename.c | 90 +++++++++++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 34 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index 142e5376d..2077a9b98 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -262,6 +262,58 @@ static int compute_stays(struct diff_queue_struct *q, return 1; } +/* + * Find exact renames first. + * + * The first round matches up the up-to-date entries, + * and then during the second round we try to match + * cache-dirty entries as well. + * + * Note: the rest of the rename logic depends on this + * phase also populating all the filespecs for any + * entry that isn't matched up with an exact rename, + * see "is_exact_match()". + */ +static int find_exact_renames(void) +{ + int rename_count = 0; + int contents_too; + + for (contents_too = 0; contents_too < 2; contents_too++) { + int i; + + for (i = 0; i < rename_dst_nr; i++) { + struct diff_filespec *two = rename_dst[i].two; + int j; + + if (rename_dst[i].pair) + continue; /* dealt with an earlier round */ + for (j = 0; j < rename_src_nr; j++) { + int k; + struct diff_filespec *one = rename_src[j].one; + if (!is_exact_match(one, two, contents_too)) + continue; + + /* see if there is a basename match, too */ + for (k = j; k < rename_src_nr; k++) { + one = rename_src[k].one; + if (basename_same(one, two) && + is_exact_match(one, two, + contents_too)) { + j = k; + break; + } + } + + record_rename_pair(i, j, (int)MAX_SCORE); + rename_count++; + break; /* we are done with this entry */ + } + } + } + return rename_count; +} + void diffcore_rename(struct diff_options *options) { int detect_rename = options->detect_rename; @@ -270,12 +322,11 @@ void diffcore_rename(struct diff_options *options) struct diff_queue_struct *q = &diff_queued_diff; struct diff_queue_struct outq; struct diff_score *mx; - int i, j, rename_count, contents_too; + int i, j, rename_count; int num_create, num_src, dst_cnt; if (!minimum_score) minimum_score = DEFAULT_RENAME_SCORE; - rename_count = 0; for (i = 0; i < q->nr; i++) { struct diff_filepair *p = q->queue[i]; @@ -318,40 +369,11 @@ void diffcore_rename(struct diff_options *options) if (rename_dst_nr * rename_src_nr > rename_limit * rename_limit) goto cleanup; - /* We really want to cull the candidates list early + /* + * We really want to cull the candidates list early * with cheap tests in order to avoid doing deltas. - * The first round matches up the up-to-date entries, - * and then during the second round we try to match - * cache-dirty entries as well. */ - for (contents_too = 0; contents_too < 2; contents_too++) { - for (i = 0; i < rename_dst_nr; i++) { - struct diff_filespec *two = rename_dst[i].two; - if (rename_dst[i].pair) - continue; /* dealt with an earlier round */ - for (j = 0; j < rename_src_nr; j++) { - int k; - struct diff_filespec *one = rename_src[j].one; - if (!is_exact_match(one, two, contents_too)) - continue; - - /* see if there is a basename match, too */ - for (k = j; k < rename_src_nr; k++) { - one = rename_src[k].one; - if (basename_same(one, two) && - is_exact_match(one, two, - contents_too)) { - j = k; - break; - } - } - - record_rename_pair(i, j, (int)MAX_SCORE); - rename_count++; - break; /* we are done with this entry */ - } - } - } + rename_count = find_exact_renames(); /* Have we run out the created file pool? If so we can avoid * doing the delta matrix altogether. From 9fb88419ba85e641006c80db53620423f37f1c93 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 25 Oct 2007 11:19:10 -0700 Subject: [PATCH 3/8] Ref-count the filespecs used by diffcore Rather than copy the filespecs when introducing new versions of them (for rename or copy detection), use a refcount and increment the count when reusing the diff_filespec. This avoids unnecessary allocations, but the real reason behind this is a future enhancement: we will want to track shared data across the copy/rename detection. In order to efficiently notice when a filespec is used by a rename, the rename machinery wants to keep track of a rename usage count which is shared across all different users of the filespec. Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- diff.c | 15 +++++++++++---- diffcore-rename.c | 16 ++++++---------- diffcore.h | 2 ++ 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/diff.c b/diff.c index dfb8595b7..0b320f6b9 100644 --- a/diff.c +++ b/diff.c @@ -1440,9 +1440,18 @@ struct diff_filespec *alloc_filespec(const char *path) memset(spec, 0, sizeof(*spec)); spec->path = (char *)(spec + 1); memcpy(spec->path, path, namelen+1); + spec->count = 1; return spec; } +void free_filespec(struct diff_filespec *spec) +{ + if (!--spec->count) { + diff_free_filespec_data(spec); + free(spec); + } +} + void fill_filespec(struct diff_filespec *spec, const unsigned char *sha1, unsigned short mode) { @@ -2435,10 +2444,8 @@ struct diff_filepair *diff_queue(struct diff_queue_struct *queue, void diff_free_filepair(struct diff_filepair *p) { - diff_free_filespec_data(p->one); - diff_free_filespec_data(p->two); - free(p->one); - free(p->two); + free_filespec(p->one); + free_filespec(p->two); free(p); } diff --git a/diffcore-rename.c b/diffcore-rename.c index 2077a9b98..3da06b702 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -209,21 +209,19 @@ static int estimate_similarity(struct diff_filespec *src, static void record_rename_pair(int dst_index, int src_index, int score) { - struct diff_filespec *one, *two, *src, *dst; + struct diff_filespec *src, *dst; struct diff_filepair *dp; if (rename_dst[dst_index].pair) die("internal error: dst already matched."); src = rename_src[src_index].one; - one = alloc_filespec(src->path); - fill_filespec(one, src->sha1, src->mode); + src->count++; dst = rename_dst[dst_index].two; - two = alloc_filespec(dst->path); - fill_filespec(two, dst->sha1, dst->mode); + dst->count++; - dp = diff_queue(NULL, one, two); + dp = diff_queue(NULL, src, dst); dp->renamed_pair = 1; if (!strcmp(src->path, dst->path)) dp->score = rename_src[src_index].score; @@ -526,10 +524,8 @@ void diffcore_rename(struct diff_options *options) } } - for (i = 0; i < rename_dst_nr; i++) { - diff_free_filespec_data(rename_dst[i].two); - free(rename_dst[i].two); - } + for (i = 0; i < rename_dst_nr; i++) + free_filespec(rename_dst[i].two); free(rename_dst); rename_dst = NULL; diff --git a/diffcore.h b/diffcore.h index eb618b1ec..30055ac5a 100644 --- a/diffcore.h +++ b/diffcore.h @@ -29,6 +29,7 @@ struct diff_filespec { void *cnt_data; const char *funcname_pattern_ident; unsigned long size; + int count; /* Reference count */ int xfrm_flags; /* for use by the xfrm */ unsigned short mode; /* file mode */ unsigned sha1_valid : 1; /* if true, use sha1 and trust mode; @@ -43,6 +44,7 @@ struct diff_filespec { }; extern struct diff_filespec *alloc_filespec(const char *); +extern void free_filespec(struct diff_filespec *); extern void fill_filespec(struct diff_filespec *, const unsigned char *, unsigned short); From 644797119d7a3b7a043a51a9cccd8758f8451f91 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 25 Oct 2007 11:20:56 -0700 Subject: [PATCH 4/8] copy vs rename detection: avoid unnecessary O(n*m) loops The core rename detection had some rather stupid code to check if a pathname was used by a later modification or rename, which basically walked the whole pathname space for all renames for each rename, in order to tell whether it was a pure rename (no remaining users) or should be considered a copy (other users of the source file remaining). That's really silly, since we can just keep a count of users around, and replace all those complex and expensive loops with just testing that simple counter (but this all depends on the previous commit that shared the diff_filespec data structure by using a separate reference count). Note that the reference count is not the same as the rename count: they behave otherwise rather similarly, but the reference count is tied to the allocation (and decremented at de-allocation, so that when it turns zero we can get rid of the memory), while the rename count is tied to the renames and is decremented when we find a rename (so that when it turns zero we know that it was a rename, not a copy). Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- diff.c | 40 ++++++++++++---------------- diffcore-rename.c | 68 ++++++++++++----------------------------------- diffcore.h | 2 +- 3 files changed, 35 insertions(+), 75 deletions(-) diff --git a/diff.c b/diff.c index 0b320f6b9..af85b94d1 100644 --- a/diff.c +++ b/diff.c @@ -2597,9 +2597,9 @@ void diff_debug_filepair(const struct diff_filepair *p, int i) { diff_debug_filespec(p->one, i, "one"); diff_debug_filespec(p->two, i, "two"); - fprintf(stderr, "score %d, status %c stays %d broken %d\n", + fprintf(stderr, "score %d, status %c rename_used %d broken %d\n", p->score, p->status ? p->status : '?', - p->source_stays, p->broken_pair); + p->one->rename_used, p->broken_pair); } void diff_debug_queue(const char *msg, struct diff_queue_struct *q) @@ -2617,8 +2617,8 @@ void diff_debug_queue(const char *msg, struct diff_queue_struct *q) static void diff_resolve_rename_copy(void) { - int i, j; - struct diff_filepair *p, *pp; + int i; + struct diff_filepair *p; struct diff_queue_struct *q = &diff_queued_diff; diff_debug_queue("resolve-rename-copy", q); @@ -2640,27 +2640,21 @@ static void diff_resolve_rename_copy(void) * either in-place edit or rename/copy edit. */ else if (DIFF_PAIR_RENAME(p)) { - if (p->source_stays) { - p->status = DIFF_STATUS_COPIED; - continue; - } - /* See if there is some other filepair that - * copies from the same source as us. If so - * we are a copy. Otherwise we are either a - * copy if the path stays, or a rename if it - * does not, but we already handled "stays" case. + /* + * A rename might have re-connected a broken + * pair up, causing the pathnames to be the + * same again. If so, that's not a rename at + * all, just a modification.. + * + * Otherwise, see if this source was used for + * multiple renames, in which case we decrement + * the count, and call it a copy. */ - for (j = i + 1; j < q->nr; j++) { - pp = q->queue[j]; - if (strcmp(pp->one->path, p->one->path)) - continue; /* not us */ - if (!DIFF_PAIR_RENAME(pp)) - continue; /* not a rename/copy */ - /* pp is a rename/copy from the same source */ + if (!strcmp(p->one->path, p->two->path)) + p->status = DIFF_STATUS_MODIFIED; + else if (--p->one->rename_used > 0) p->status = DIFF_STATUS_COPIED; - break; - } - if (!p->status) + else p->status = DIFF_STATUS_RENAMED; } else if (hashcmp(p->one->sha1, p->two->sha1) || diff --git a/diffcore-rename.c b/diffcore-rename.c index 3da06b702..edb2424d1 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -55,12 +55,10 @@ static struct diff_rename_dst *locate_rename_dst(struct diff_filespec *two, static struct diff_rename_src { struct diff_filespec *one; unsigned short score; /* to remember the break score */ - unsigned src_path_left : 1; } *rename_src; static int rename_src_nr, rename_src_alloc; static struct diff_rename_src *register_rename_src(struct diff_filespec *one, - int src_path_left, unsigned short score) { int first, last; @@ -92,7 +90,6 @@ static struct diff_rename_src *register_rename_src(struct diff_filespec *one, (rename_src_nr - first - 1) * sizeof(*rename_src)); rename_src[first].one = one; rename_src[first].score = score; - rename_src[first].src_path_left = src_path_left; return &(rename_src[first]); } @@ -216,6 +213,7 @@ static void record_rename_pair(int dst_index, int src_index, int score) die("internal error: dst already matched."); src = rename_src[src_index].one; + src->rename_used++; src->count++; dst = rename_dst[dst_index].two; @@ -227,7 +225,6 @@ static void record_rename_pair(int dst_index, int src_index, int score) dp->score = rename_src[src_index].score; else dp->score = score; - dp->source_stays = rename_src[src_index].src_path_left; rename_dst[dst_index].pair = dp; } @@ -245,21 +242,6 @@ static int score_compare(const void *a_, const void *b_) return b->score - a->score; } -static int compute_stays(struct diff_queue_struct *q, - struct diff_filespec *one) -{ - int i; - for (i = 0; i < q->nr; i++) { - struct diff_filepair *p = q->queue[i]; - if (strcmp(one->path, p->two->path)) - continue; - if (DIFF_PAIR_RENAME(p)) { - return 0; /* something else is renamed into this */ - } - } - return 1; -} - /* * Find exact renames first. * @@ -338,15 +320,25 @@ void diffcore_rename(struct diff_options *options) locate_rename_dst(p->two, 1); } else if (!DIFF_FILE_VALID(p->two)) { - /* If the source is a broken "delete", and + /* + * If the source is a broken "delete", and * they did not really want to get broken, * that means the source actually stays. + * So we increment the "rename_used" score + * by one, to indicate ourselves as a user + */ + if (p->broken_pair && !p->score) + p->one->rename_used++; + register_rename_src(p->one, p->score); + } + else if (detect_rename == DIFF_DETECT_COPY) { + /* + * Increment the "rename_used" score by + * one, to indicate ourselves as a user. */ - int stays = (p->broken_pair && !p->score); - register_rename_src(p->one, stays, p->score); + p->one->rename_used++; + register_rename_src(p->one, p->score); } - else if (detect_rename == DIFF_DETECT_COPY) - register_rename_src(p->one, 1, p->score); } if (rename_dst_nr == 0 || rename_src_nr == 0) goto cleanup; /* nothing to do */ @@ -472,16 +464,7 @@ void diffcore_rename(struct diff_options *options) pair_to_free = p; } else { - for (j = 0; j < rename_dst_nr; j++) { - if (!rename_dst[j].pair) - continue; - if (strcmp(rename_dst[j].pair-> - one->path, - p->one->path)) - continue; - break; - } - if (j < rename_dst_nr) + if (p->one->rename_used) /* this path remains */ pair_to_free = p; } @@ -507,23 +490,6 @@ void diffcore_rename(struct diff_options *options) *q = outq; diff_debug_queue("done collapsing", q); - /* We need to see which rename source really stays here; - * earlier we only checked if the path is left in the result, - * but even if a path remains in the result, if that is coming - * from copying something else on top of it, then the original - * source is lost and does not stay. - */ - for (i = 0; i < q->nr; i++) { - struct diff_filepair *p = q->queue[i]; - if (DIFF_PAIR_RENAME(p) && p->source_stays) { - /* If one appears as the target of a rename-copy, - * then mark p->source_stays = 0; otherwise - * leave it as is. - */ - p->source_stays = compute_stays(q, p->one); - } - } - for (i = 0; i < rename_dst_nr; i++) free_filespec(rename_dst[i].two); diff --git a/diffcore.h b/diffcore.h index 30055ac5a..cc96c2073 100644 --- a/diffcore.h +++ b/diffcore.h @@ -31,6 +31,7 @@ struct diff_filespec { unsigned long size; int count; /* Reference count */ int xfrm_flags; /* for use by the xfrm */ + int rename_used; /* Count of rename users */ unsigned short mode; /* file mode */ unsigned sha1_valid : 1; /* if true, use sha1 and trust mode; * if false, use the name and read from @@ -58,7 +59,6 @@ struct diff_filepair { struct diff_filespec *two; unsigned short int score; char status; /* M C R N D U (see Documentation/diff-format.txt) */ - unsigned source_stays : 1; /* all of R/C are copies */ unsigned broken_pair : 1; unsigned renamed_pair : 1; unsigned is_unmerged : 1; From 9027f53cb5051bf83a0254e7f8aeb5d1a206de0b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 25 Oct 2007 11:23:26 -0700 Subject: [PATCH 5/8] Do linear-time/space rename logic for exact renames This implements a smarter rename detector for exact renames, which rather than doing a pairwise comparison (time O(m*n)) will just hash the files into a hash-table (size O(n+m)), and only do pairwise comparisons to renames that have the same hash (time O(n+m) except for unrealistic hash collissions, which we just cull aggressively). Admittedly the exact rename case is not nearly as interesting as the generic case, but it's an important case none-the-less. A similar general approach should work for the generic case too, but even then you do need to handle the exact renames/copies separately (to avoid the inevitable added cost factor that comes from the _size_ of the file), so this is worth doing. In the expectation that we will indeed do the same hashing trick for the general rename case, this code uses a generic hash-table implementation that can be used for other things too. In fact, we might be able to consolidate some of our existing hash tables with the new generic code in hash.[ch]. Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- Makefile | 4 +- diffcore-rename.c | 211 ++++++++++++++++++++++++++++++++-------------- hash.c | 110 ++++++++++++++++++++++++ hash.h | 43 ++++++++++ 4 files changed, 303 insertions(+), 65 deletions(-) create mode 100644 hash.c create mode 100644 hash.h diff --git a/Makefile b/Makefile index ba969e439..2e6fd8f21 100644 --- a/Makefile +++ b/Makefile @@ -290,7 +290,7 @@ LIB_H = \ run-command.h strbuf.h tag.h tree.h git-compat-util.h revision.h \ tree-walk.h log-tree.h dir.h path-list.h unpack-trees.h builtin.h \ utf8.h reflog-walk.h patch-ids.h attr.h decorate.h progress.h \ - mailmap.h remote.h transport.h diffcore.h + mailmap.h remote.h transport.h diffcore.h hash.h DIFF_OBJS = \ diff.o diff-lib.o diffcore-break.o diffcore-order.o \ @@ -300,7 +300,7 @@ DIFF_OBJS = \ LIB_OBJS = \ blob.o commit.o connect.o csum-file.o cache-tree.o base85.o \ date.o diff-delta.o entry.o exec_cmd.o ident.o \ - interpolate.o \ + interpolate.o hash.o \ lockfile.o \ patch-ids.o \ object.o pack-check.o pack-write.o patch-delta.o path.o pkt-line.o \ diff --git a/diffcore-rename.c b/diffcore-rename.c index edb2424d1..e7e370b2c 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -4,6 +4,7 @@ #include "cache.h" #include "diff.h" #include "diffcore.h" +#include "hash.h" /* Table of rename/copy destinations */ @@ -93,29 +94,6 @@ static struct diff_rename_src *register_rename_src(struct diff_filespec *one, return &(rename_src[first]); } -static int is_exact_match(struct diff_filespec *src, - struct diff_filespec *dst, - int contents_too) -{ - if (src->sha1_valid && dst->sha1_valid && - !hashcmp(src->sha1, dst->sha1)) - return 1; - if (!contents_too) - return 0; - if (diff_populate_filespec(src, 1) || diff_populate_filespec(dst, 1)) - return 0; - if (src->size != dst->size) - return 0; - if (src->sha1_valid && dst->sha1_valid) - return !hashcmp(src->sha1, dst->sha1); - if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0)) - return 0; - if (src->size == dst->size && - !memcmp(src->data, dst->data, src->size)) - return 1; - return 0; -} - static int basename_same(struct diff_filespec *src, struct diff_filespec *dst) { int src_len = strlen(src->path), dst_len = strlen(dst->path); @@ -242,56 +220,163 @@ static int score_compare(const void *a_, const void *b_) return b->score - a->score; } +struct file_similarity { + int src_dst, index; + struct diff_filespec *filespec; + struct file_similarity *next; +}; + +static int find_identical_files(struct file_similarity *src, + struct file_similarity *dst) +{ + int renames = 0; + + /* + * Walk over all the destinations ... + */ + do { + struct diff_filespec *one = dst->filespec; + struct file_similarity *p, *best; + int i = 100; + + /* + * .. to find the best source match + */ + best = NULL; + for (p = src; p; p = p->next) { + struct diff_filespec *two = p->filespec; + + /* False hash collission? */ + if (hashcmp(one->sha1, two->sha1)) + continue; + /* Non-regular files? If so, the modes must match! */ + if (!S_ISREG(one->mode) || !S_ISREG(two->mode)) { + if (one->mode != two->mode) + continue; + } + best = p; + if (basename_same(one, two)) + break; + + /* Too many identical alternatives? Pick one */ + if (!--i) + break; + } + if (best) { + record_rename_pair(dst->index, best->index, MAX_SCORE); + renames++; + } + } while ((dst = dst->next) != NULL); + return renames; +} + +/* + * Note: the rest of the rename logic depends on this + * phase also populating all the filespecs for any + * entry that isn't matched up with an exact rename. + */ +static void free_similarity_list(struct file_similarity *p) +{ + while (p) { + struct file_similarity *entry = p; + p = p->next; + + /* Stupid special case, see note above! */ + diff_populate_filespec(entry->filespec, 0); + free(entry); + } +} + +static int find_same_files(void *ptr) +{ + int ret; + struct file_similarity *p = ptr; + struct file_similarity *src = NULL, *dst = NULL; + + /* Split the hash list up into sources and destinations */ + do { + struct file_similarity *entry = p; + p = p->next; + if (entry->src_dst < 0) { + entry->next = src; + src = entry; + } else { + entry->next = dst; + dst = entry; + } + } while (p); + + /* + * If we have both sources *and* destinations, see if + * we can match them up + */ + ret = (src && dst) ? find_identical_files(src, dst) : 0; + + /* Free the hashes and return the number of renames found */ + free_similarity_list(src); + free_similarity_list(dst); + return ret; +} + +static unsigned int hash_filespec(struct diff_filespec *filespec) +{ + unsigned int hash; + if (!filespec->sha1_valid) { + if (diff_populate_filespec(filespec, 0)) + return 0; + hash_sha1_file(filespec->data, filespec->size, "blob", filespec->sha1); + } + memcpy(&hash, filespec->sha1, sizeof(hash)); + return hash; +} + +static void insert_file_table(struct hash_table *table, int src_dst, int index, struct diff_filespec *filespec) +{ + void **pos; + unsigned int hash; + struct file_similarity *entry = xmalloc(sizeof(*entry)); + + entry->src_dst = src_dst; + entry->index = index; + entry->filespec = filespec; + entry->next = NULL; + + hash = hash_filespec(filespec); + pos = insert_hash(hash, entry, table); + + /* We already had an entry there? */ + if (pos) { + entry->next = *pos; + *pos = entry; + } +} + /* * Find exact renames first. * * The first round matches up the up-to-date entries, * and then during the second round we try to match * cache-dirty entries as well. - * - * Note: the rest of the rename logic depends on this - * phase also populating all the filespecs for any - * entry that isn't matched up with an exact rename, - * see "is_exact_match()". */ static int find_exact_renames(void) { - int rename_count = 0; - int contents_too; - - for (contents_too = 0; contents_too < 2; contents_too++) { - int i; - - for (i = 0; i < rename_dst_nr; i++) { - struct diff_filespec *two = rename_dst[i].two; - int j; - - if (rename_dst[i].pair) - continue; /* dealt with an earlier round */ - for (j = 0; j < rename_src_nr; j++) { - int k; - struct diff_filespec *one = rename_src[j].one; - if (!is_exact_match(one, two, contents_too)) - continue; + int i; + struct hash_table file_table; - /* see if there is a basename match, too */ - for (k = j; k < rename_src_nr; k++) { - one = rename_src[k].one; - if (basename_same(one, two) && - is_exact_match(one, two, - contents_too)) { - j = k; - break; - } - } - - record_rename_pair(i, j, (int)MAX_SCORE); - rename_count++; - break; /* we are done with this entry */ - } - } - } - return rename_count; + init_hash(&file_table); + for (i = 0; i < rename_src_nr; i++) + insert_file_table(&file_table, -1, i, rename_src[i].one); + + for (i = 0; i < rename_dst_nr; i++) + insert_file_table(&file_table, 1, i, rename_dst[i].two); + + /* Find the renames */ + i = for_each_hash(&file_table, find_same_files); + + /* .. and free the hash data structure */ + free_hash(&file_table); + + return i; } void diffcore_rename(struct diff_options *options) diff --git a/hash.c b/hash.c new file mode 100644 index 000000000..7b492d4fc --- /dev/null +++ b/hash.c @@ -0,0 +1,110 @@ +/* + * Some generic hashing helpers. + */ +#include "cache.h" +#include "hash.h" + +/* + * Look up a hash entry in the hash table. Return the pointer to + * the existing entry, or the empty slot if none existed. The caller + * can then look at the (*ptr) to see whether it existed or not. + */ +static struct hash_table_entry *lookup_hash_entry(unsigned int hash, struct hash_table *table) +{ + unsigned int size = table->size, nr = hash % size; + struct hash_table_entry *array = table->array; + + while (array[nr].ptr) { + if (array[nr].hash == hash) + break; + nr++; + if (nr >= size) + nr = 0; + } + return array + nr; +} + + +/* + * Insert a new hash entry pointer into the table. + * + * If that hash entry already existed, return the pointer to + * the existing entry (and the caller can create a list of the + * pointers or do anything else). If it didn't exist, return + * NULL (and the caller knows the pointer has been inserted). + */ +static void **insert_hash_entry(unsigned int hash, void *ptr, struct hash_table *table) +{ + struct hash_table_entry *entry = lookup_hash_entry(hash, table); + + if (!entry->ptr) { + entry->ptr = ptr; + entry->hash = hash; + table->nr++; + return NULL; + } + return &entry->ptr; +} + +static void grow_hash_table(struct hash_table *table) +{ + unsigned int i; + unsigned int old_size = table->size, new_size; + struct hash_table_entry *old_array = table->array, *new_array; + + new_size = alloc_nr(old_size); + new_array = xcalloc(sizeof(struct hash_table_entry), new_size); + table->size = new_size; + table->array = new_array; + table->nr = 0; + for (i = 0; i < old_size; i++) { + unsigned int hash = old_array[i].hash; + void *ptr = old_array[i].ptr; + if (ptr) + insert_hash_entry(hash, ptr, table); + } + free(old_array); +} + +void *lookup_hash(unsigned int hash, struct hash_table *table) +{ + if (!table->array) + return NULL; + return &lookup_hash_entry(hash, table)->ptr; +} + +void **insert_hash(unsigned int hash, void *ptr, struct hash_table *table) +{ + unsigned int nr = table->nr; + if (nr >= table->size/2) + grow_hash_table(table); + return insert_hash_entry(hash, ptr, table); +} + +int for_each_hash(struct hash_table *table, int (*fn)(void *)) +{ + int sum = 0; + unsigned int i; + unsigned int size = table->size; + struct hash_table_entry *array = table->array; + + for (i = 0; i < size; i++) { + void *ptr = array->ptr; + array++; + if (ptr) { + int val = fn(ptr); + if (val < 0) + return val; + sum += val; + } + } + return sum; +} + +void free_hash(struct hash_table *table) +{ + free(table->array); + table->array = NULL; + table->size = 0; + table->nr = 0; +} diff --git a/hash.h b/hash.h new file mode 100644 index 000000000..a8b0fbb5b --- /dev/null +++ b/hash.h @@ -0,0 +1,43 @@ +#ifndef HASH_H +#define HASH_H + +/* + * These are some simple generic hash table helper functions. + * Not necessarily suitable for all users, but good for things + * where you want to just keep track of a list of things, and + * have a good hash to use on them. + * + * It keeps the hash table at roughly 50-75% free, so the memory + * cost of the hash table itself is roughly + * + * 3 * 2*sizeof(void *) * nr_of_objects + * + * bytes. + * + * FIXME: on 64-bit architectures, we waste memory. It would be + * good to have just 32-bit pointers, requiring a special allocator + * for hashed entries or something. + */ +struct hash_table_entry { + unsigned int hash; + void *ptr; +}; + +struct hash_table { + unsigned int size, nr; + struct hash_table_entry *array; +}; + +extern void *lookup_hash(unsigned int hash, struct hash_table *table); +extern void **insert_hash(unsigned int hash, void *ptr, struct hash_table *table); +extern int for_each_hash(struct hash_table *table, int (*fn)(void *)); +extern void free_hash(struct hash_table *table); + +static inline void init_hash(struct hash_table *table) +{ + table->size = 0; + table->nr = 0; + table->array = NULL; +} + +#endif From 17559a643ecef94834d930790498c6babe3e89a8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 25 Oct 2007 11:24:47 -0700 Subject: [PATCH 6/8] Do exact rename detection regardless of rename limits Now that the exact rename detection is linear-time (with a very small constant factor to boot), there is no longer any reason to limit it by the number of files involved. In some trivial testing, I created a repository with a directory that had a hundred thousand files in it (all with different contents), and then moved that directory to show the effects of renaming 100,000 files. With the new code, that resulted in [torvalds@woody big-rename]$ time ~/git/git show -C | wc -l 400006 real 0m2.071s user 0m1.520s sys 0m0.576s ie the code can correctly detect the hundred thousand renames in about 2 seconds (the number "400006" comes from four lines for each rename: diff --git a/really-big-dir/file-1-1-1-1-1 b/moved-big-dir/file-1-1-1-1-1 similarity index 100% rename from really-big-dir/file-1-1-1-1-1 rename to moved-big-dir/file-1-1-1-1-1 and the extra six lines is from a one-liner commit message and all the commit information and spacing). Most of those two seconds weren't even really the rename detection, it's really all the other stuff needed to get there. With the old code, this wouldn't have been practically possible. Doing a pairwise check of the ten billion possible pairs would have been prohibitively expensive. In fact, even with the rename limiter in place, the old code would waste a lot of time just on the diff_filespec checks, and despite not even trying to find renames, it used to look like: [torvalds@woody big-rename]$ time git show -C | wc -l 1400006 real 0m12.337s user 0m12.285s sys 0m0.192s ie we used to take 12 seconds for this load and not even do any rename detection! (The number 1400006 comes from fourteen lines per file moved: seven lines each for the delete and the create of a one-liner file, and the same extra six lines of commit information). Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- diffcore-rename.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index e7e370b2c..394693222 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -428,6 +428,12 @@ void diffcore_rename(struct diff_options *options) if (rename_dst_nr == 0 || rename_src_nr == 0) goto cleanup; /* nothing to do */ + /* + * We really want to cull the candidates list early + * with cheap tests in order to avoid doing deltas. + */ + rename_count = find_exact_renames(); + /* * This basically does a test for the rename matrix not * growing larger than a "rename_limit" square matrix, ie: @@ -444,12 +450,6 @@ void diffcore_rename(struct diff_options *options) if (rename_dst_nr * rename_src_nr > rename_limit * rename_limit) goto cleanup; - /* - * We really want to cull the candidates list early - * with cheap tests in order to avoid doing deltas. - */ - rename_count = find_exact_renames(); - /* Have we run out the created file pool? If so we can avoid * doing the delta matrix altogether. */ From 81ac051d6ac9deef9a34de496fb981469aae77f0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 26 Oct 2007 16:51:28 -0700 Subject: [PATCH 7/8] Fix ugly magic special case in exact rename detection For historical reasons, the exact rename detection had populated the filespecs for the entries it compared, and the rest of the similarity analysis depended on that. I hadn't even bothered to debug why that was the case when I re-did the rename detection, I just made the new one have the same broken behaviour, with a note about this special case. This fixes that fixme. The reason the exact rename detector needed to fill in the file sizes of the files it checked was that the _inexact_ rename detector was broken, and started comparing file sizes before it filled them in. Fixing that allows the exact phase to do the sane thing of never even caring (since all *it* cares about is really just the SHA1 itself, not the size nor the contents). It turns out that this also indirectly fixes a bug: trying to populate all the filespecs will run out of virtual memory if there is tons and tons of possible rename options. The fuzzy similarity analysis does the right thing in this regard, and free's the blob info after it has generated the hash tables, so the special case code caused more trouble than just some extra illogical code. Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- diffcore-rename.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index 394693222..7ed5ef81b 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -144,6 +144,20 @@ static int estimate_similarity(struct diff_filespec *src, if (!S_ISREG(src->mode) || !S_ISREG(dst->mode)) return 0; + /* + * Need to check that source and destination sizes are + * filled in before comparing them. + * + * If we already have "cnt_data" filled in, we know it's + * all good (avoid checking the size for zero, as that + * is a possible size - we really should have a flag to + * say whether the size is valid or not!) + */ + if (!src->cnt_data && diff_populate_filespec(src, 0)) + return 0; + if (!dst->cnt_data && diff_populate_filespec(dst, 0)) + return 0; + max_size = ((src->size > dst->size) ? src->size : dst->size); base_size = ((src->size < dst->size) ? src->size : dst->size); delta_size = max_size - base_size; @@ -159,11 +173,6 @@ static int estimate_similarity(struct diff_filespec *src, if (base_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE) return 0; - if ((!src->cnt_data && diff_populate_filespec(src, 0)) - || (!dst->cnt_data && diff_populate_filespec(dst, 0))) - return 0; /* error but caught downstream */ - - delta_limit = (unsigned long) (base_size * (MAX_SCORE-minimum_score) / MAX_SCORE); if (diffcore_count_changes(src, dst, @@ -270,19 +279,11 @@ static int find_identical_files(struct file_similarity *src, return renames; } -/* - * Note: the rest of the rename logic depends on this - * phase also populating all the filespecs for any - * entry that isn't matched up with an exact rename. - */ static void free_similarity_list(struct file_similarity *p) { while (p) { struct file_similarity *entry = p; p = p->next; - - /* Stupid special case, see note above! */ - diff_populate_filespec(entry->filespec, 0); free(entry); } } From 42899ac898f2b539fc762c8452da2c981ccbd815 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 26 Oct 2007 16:56:34 -0700 Subject: [PATCH 8/8] Do the fuzzy rename detection limits with the exact renames removed When we do the fuzzy rename detection, we don't care about the destinations that we already handled with the exact rename detector. And, in fact, the code already knew that - but the rename limiter, which used to run *before* exact renames were detected, did not. This fixes it so that the rename detection limiter now bases its decisions on the *remaining* rename counts, rather than the original ones. Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- diffcore-rename.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index 7ed5ef81b..f9ebea564 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -435,33 +435,37 @@ void diffcore_rename(struct diff_options *options) */ rename_count = find_exact_renames(); + /* Did we only want exact renames? */ + if (minimum_score == MAX_SCORE) + goto cleanup; + + /* + * Calculate how many renames are left (but all the source + * files still remain as options for rename/copies!) + */ + num_create = (rename_dst_nr - rename_count); + num_src = rename_src_nr; + + /* All done? */ + if (!num_create) + goto cleanup; + /* * This basically does a test for the rename matrix not * growing larger than a "rename_limit" square matrix, ie: * - * rename_dst_nr * rename_src_nr > rename_limit * rename_limit + * num_create * num_src > rename_limit * rename_limit * * but handles the potential overflow case specially (and we * assume at least 32-bit integers) */ if (rename_limit <= 0 || rename_limit > 32767) rename_limit = 32767; - if (rename_dst_nr > rename_limit && rename_src_nr > rename_limit) + if (num_create > rename_limit && num_src > rename_limit) goto cleanup; - if (rename_dst_nr * rename_src_nr > rename_limit * rename_limit) + if (num_create * num_src > rename_limit * rename_limit) goto cleanup; - /* Have we run out the created file pool? If so we can avoid - * doing the delta matrix altogether. - */ - if (rename_count == rename_dst_nr) - goto cleanup; - - if (minimum_score == MAX_SCORE) - goto cleanup; - - num_create = (rename_dst_nr - rename_count); - num_src = rename_src_nr; mx = xmalloc(sizeof(*mx) * num_create * num_src); for (dst_cnt = i = 0; i < rename_dst_nr; i++) { int base = dst_cnt * num_src;