From f8bb1d94311b7438e7e9831b51ffb047b7ae65d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 16 Aug 2014 10:08:02 +0700 Subject: [PATCH 1/5] wrapper.c: introduce gentle xmallocz that does not die() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- git-compat-util.h | 1 + wrapper.c | 68 ++++++++++++++++++++++++++++++++++++----------- 2 files changed, 53 insertions(+), 16 deletions(-) diff --git a/git-compat-util.h b/git-compat-util.h index f587749b7..8785fd30d 100644 --- a/git-compat-util.h +++ b/git-compat-util.h @@ -593,6 +593,7 @@ extern try_to_free_t set_try_to_free_routine(try_to_free_t); extern char *xstrdup(const char *str); extern void *xmalloc(size_t size); extern void *xmallocz(size_t size); +extern void *xmallocz_gently(size_t size); extern void *xmemdupz(const void *data, size_t len); extern char *xstrndup(const char *str, size_t len); extern void *xrealloc(void *ptr, size_t size); diff --git a/wrapper.c b/wrapper.c index bc1bfb860..dc9c8f4dd 100644 --- a/wrapper.c +++ b/wrapper.c @@ -9,16 +9,23 @@ static void do_nothing(size_t size) static void (*try_to_free_routine)(size_t size) = do_nothing; -static void memory_limit_check(size_t size) +static int memory_limit_check(size_t size, int gentle) { static int limit = -1; if (limit == -1) { const char *env = getenv("GIT_ALLOC_LIMIT"); limit = env ? atoi(env) * 1024 : 0; } - if (limit && size > limit) - die("attempting to allocate %"PRIuMAX" over limit %d", - (intmax_t)size, limit); + if (limit && size > limit) { + if (gentle) { + error("attempting to allocate %"PRIuMAX" over limit %d", + (intmax_t)size, limit); + return -1; + } else + die("attempting to allocate %"PRIuMAX" over limit %d", + (intmax_t)size, limit); + } + return 0; } try_to_free_t set_try_to_free_routine(try_to_free_t routine) @@ -42,11 +49,12 @@ char *xstrdup(const char *str) return ret; } -void *xmalloc(size_t size) +static void *do_xmalloc(size_t size, int gentle) { void *ret; - memory_limit_check(size); + if (memory_limit_check(size, gentle)) + return NULL; ret = malloc(size); if (!ret && !size) ret = malloc(1); @@ -55,9 +63,16 @@ void *xmalloc(size_t size) ret = malloc(size); if (!ret && !size) ret = malloc(1); - if (!ret) - die("Out of memory, malloc failed (tried to allocate %lu bytes)", - (unsigned long)size); + if (!ret) { + if (!gentle) + die("Out of memory, malloc failed (tried to allocate %lu bytes)", + (unsigned long)size); + else { + error("Out of memory, malloc failed (tried to allocate %lu bytes)", + (unsigned long)size); + return NULL; + } + } } #ifdef XMALLOC_POISON memset(ret, 0xA5, size); @@ -65,16 +80,37 @@ void *xmalloc(size_t size) return ret; } -void *xmallocz(size_t size) +void *xmalloc(size_t size) +{ + return do_xmalloc(size, 0); +} + +static void *do_xmallocz(size_t size, int gentle) { void *ret; - if (unsigned_add_overflows(size, 1)) - die("Data too large to fit into virtual memory space."); - ret = xmalloc(size + 1); - ((char*)ret)[size] = 0; + if (unsigned_add_overflows(size, 1)) { + if (gentle) { + error("Data too large to fit into virtual memory space."); + return NULL; + } else + die("Data too large to fit into virtual memory space."); + } + ret = do_xmalloc(size + 1, gentle); + if (ret) + ((char*)ret)[size] = 0; return ret; } +void *xmallocz(size_t size) +{ + return do_xmallocz(size, 0); +} + +void *xmallocz_gently(size_t size) +{ + return do_xmallocz(size, 1); +} + /* * xmemdupz() allocates (len + 1) bytes of memory, duplicates "len" bytes of * "data" to the allocated memory, zero terminates the allocated memory, @@ -96,7 +132,7 @@ void *xrealloc(void *ptr, size_t size) { void *ret; - memory_limit_check(size); + memory_limit_check(size, 0); ret = realloc(ptr, size); if (!ret && !size) ret = realloc(ptr, 1); @@ -115,7 +151,7 @@ void *xcalloc(size_t nmemb, size_t size) { void *ret; - memory_limit_check(size * nmemb); + memory_limit_check(size * nmemb, 0); ret = calloc(nmemb, size); if (!ret && (!nmemb || !size)) ret = calloc(1, 1); From 735efde838b68747e737e863427a4843a8efaddb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 16 Aug 2014 10:08:03 +0700 Subject: [PATCH 2/5] sha1_file.c: do not die failing to malloc in unpack_compressed_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fewer die() gives better control to the caller, provided that the caller _can_ handle it. And in unpack_compressed_entry() case, it can, because unpack_compressed_entry() already returns NULL if it fails to inflate data. A side effect from this is fsck continues to run when very large blobs are present (and do not fit in memory). Noticed-by: Dale R. Worley Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- sha1_file.c | 4 +++- t/t1050-large.sh | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/sha1_file.c b/sha1_file.c index 3f70b1d86..8db73f0c5 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -1923,7 +1923,9 @@ static void *unpack_compressed_entry(struct packed_git *p, git_zstream stream; unsigned char *buffer, *in; - buffer = xmallocz(size); + buffer = xmallocz_gently(size); + if (!buffer) + return NULL; memset(&stream, 0, sizeof(stream)); stream.next_out = buffer; stream.avail_out = size + 1; diff --git a/t/t1050-large.sh b/t/t1050-large.sh index aea493646..5642f84b8 100755 --- a/t/t1050-large.sh +++ b/t/t1050-large.sh @@ -163,4 +163,10 @@ test_expect_success 'zip achiving, deflate' ' git archive --format=zip HEAD >/dev/null ' +test_expect_success 'fsck' ' + test_must_fail git fsck 2>err && + n=$(grep "error: attempting to allocate .* over limit" err | wc -l) && + test "$n" -gt 1 +' + test_done From 8e5dd3d654ebb9e86e06b9ef5ca86bd6ebf44de1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 16 Aug 2014 10:08:04 +0700 Subject: [PATCH 3/5] diff.c: allow to pass more flags to diff_populate_filespec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- diff.c | 13 +++++++------ diffcore-rename.c | 6 ++++-- diffcore.h | 3 ++- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/diff.c b/diff.c index 867f034b8..f4b7421fa 100644 --- a/diff.c +++ b/diff.c @@ -376,7 +376,7 @@ static unsigned long diff_filespec_size(struct diff_filespec *one) { if (!DIFF_FILE_VALID(one)) return 0; - diff_populate_filespec(one, 1); + diff_populate_filespec(one, CHECK_SIZE_ONLY); return one->size; } @@ -1910,11 +1910,11 @@ static void show_dirstat(struct diff_options *options) diff_free_filespec_data(p->one); diff_free_filespec_data(p->two); } else if (DIFF_FILE_VALID(p->one)) { - diff_populate_filespec(p->one, 1); + diff_populate_filespec(p->one, CHECK_SIZE_ONLY); copied = added = 0; diff_free_filespec_data(p->one); } else if (DIFF_FILE_VALID(p->two)) { - diff_populate_filespec(p->two, 1); + diff_populate_filespec(p->two, CHECK_SIZE_ONLY); copied = 0; added = p->two->size; diff_free_filespec_data(p->two); @@ -2668,8 +2668,9 @@ static int diff_populate_gitlink(struct diff_filespec *s, int size_only) * grab the data for the blob (or file) for our own in-core comparison. * diff_filespec has data and size fields for this purpose. */ -int diff_populate_filespec(struct diff_filespec *s, int size_only) +int diff_populate_filespec(struct diff_filespec *s, unsigned int flags) { + int size_only = flags & CHECK_SIZE_ONLY; int err = 0; /* * demote FAIL to WARN to allow inspecting the situation @@ -4688,8 +4689,8 @@ static int diff_filespec_check_stat_unmatch(struct diff_filepair *p) !DIFF_FILE_VALID(p->two) || (p->one->sha1_valid && p->two->sha1_valid) || (p->one->mode != p->two->mode) || - diff_populate_filespec(p->one, 1) || - diff_populate_filespec(p->two, 1) || + diff_populate_filespec(p->one, CHECK_SIZE_ONLY) || + diff_populate_filespec(p->two, CHECK_SIZE_ONLY) || (p->one->size != p->two->size) || !diff_filespec_is_identical(p->one, p->two)) /* (2) */ p->skip_stat_unmatch_result = 1; diff --git a/diffcore-rename.c b/diffcore-rename.c index 2e44a3745..4e132f1fd 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -147,9 +147,11 @@ static int estimate_similarity(struct diff_filespec *src, * is a possible size - we really should have a flag to * say whether the size is valid or not!) */ - if (!src->cnt_data && diff_populate_filespec(src, 1)) + if (!src->cnt_data && + diff_populate_filespec(src, CHECK_SIZE_ONLY)) return 0; - if (!dst->cnt_data && diff_populate_filespec(dst, 1)) + if (!dst->cnt_data && + diff_populate_filespec(dst, CHECK_SIZE_ONLY)) return 0; max_size = ((src->size > dst->size) ? src->size : dst->size); diff --git a/diffcore.h b/diffcore.h index c876dac71..c80df18f3 100644 --- a/diffcore.h +++ b/diffcore.h @@ -55,7 +55,8 @@ extern void free_filespec(struct diff_filespec *); extern void fill_filespec(struct diff_filespec *, const unsigned char *, int, unsigned short); -extern int diff_populate_filespec(struct diff_filespec *, int); +#define CHECK_SIZE_ONLY 1 +extern int diff_populate_filespec(struct diff_filespec *, unsigned int); extern void diff_free_filespec_data(struct diff_filespec *); extern void diff_free_filespec_blob(struct diff_filespec *); extern int diff_filespec_is_binary(struct diff_filespec *); From 6bf3b813486b4528feca39d599c256f662defc14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 16 Aug 2014 10:08:05 +0700 Subject: [PATCH 4/5] diff --stat: mark any file larger than core.bigfilethreshold binary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Too large files may lead to failure to allocate memory. If it happens here, it could impact quite a few commands that involve diff. Moreover, too large files are inefficient to compare anyway (and most likely non-text), so mark them binary and skip looking at their content. Noticed-by: Dale R. Worley Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- Documentation/config.txt | 3 ++- Documentation/gitattributes.txt | 4 ++-- diff.c | 26 ++++++++++++++++++-------- diffcore.h | 1 + t/t1050-large.sh | 4 ++++ 5 files changed, 27 insertions(+), 11 deletions(-) diff --git a/Documentation/config.txt b/Documentation/config.txt index c55c22ab7..3b5b24aeb 100644 --- a/Documentation/config.txt +++ b/Documentation/config.txt @@ -499,7 +499,8 @@ core.bigFileThreshold:: Files larger than this size are stored deflated, without attempting delta compression. Storing large files without delta compression avoids excessive memory usage, at the - slight expense of increased disk usage. + slight expense of increased disk usage. Additionally files + larger than this size are always treated as binary. + Default is 512 MiB on all platforms. This should be reasonable for most projects as source code and other text files can still diff --git a/Documentation/gitattributes.txt b/Documentation/gitattributes.txt index 643c1ba92..9b45bda74 100644 --- a/Documentation/gitattributes.txt +++ b/Documentation/gitattributes.txt @@ -440,8 +440,8 @@ Unspecified:: A path to which the `diff` attribute is unspecified first gets its contents inspected, and if it looks like - text, it is treated as text. Otherwise it would - generate `Binary files differ`. + text and is smaller than core.bigFileThreshold, it is treated + as text. Otherwise it would generate `Binary files differ`. String:: diff --git a/diff.c b/diff.c index f4b7421fa..d381a6f44 100644 --- a/diff.c +++ b/diff.c @@ -2188,8 +2188,8 @@ int diff_filespec_is_binary(struct diff_filespec *one) one->is_binary = one->driver->binary; else { if (!one->data && DIFF_FILE_VALID(one)) - diff_populate_filespec(one, 0); - if (one->data) + diff_populate_filespec(one, CHECK_BINARY); + if (one->is_binary == -1 && one->data) one->is_binary = buffer_is_binary(one->data, one->size); if (one->is_binary == -1) @@ -2725,6 +2725,11 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags) } if (size_only) return 0; + if ((flags & CHECK_BINARY) && + s->size > big_file_threshold && s->is_binary == -1) { + s->is_binary = 1; + return 0; + } fd = open(s->path, O_RDONLY); if (fd < 0) goto err_empty; @@ -2746,16 +2751,21 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags) } else { enum object_type type; - if (size_only) { + if (size_only || (flags & CHECK_BINARY)) { type = sha1_object_info(s->sha1, &s->size); if (type < 0) die("unable to read %s", sha1_to_hex(s->sha1)); - } else { - s->data = read_sha1_file(s->sha1, &type, &s->size); - if (!s->data) - die("unable to read %s", sha1_to_hex(s->sha1)); - s->should_free = 1; + if (size_only) + return 0; + if (s->size > big_file_threshold && s->is_binary == -1) { + s->is_binary = 1; + return 0; + } } + s->data = read_sha1_file(s->sha1, &type, &s->size); + if (!s->data) + die("unable to read %s", sha1_to_hex(s->sha1)); + s->should_free = 1; } return 0; } diff --git a/diffcore.h b/diffcore.h index c80df18f3..33ea2de34 100644 --- a/diffcore.h +++ b/diffcore.h @@ -56,6 +56,7 @@ extern void fill_filespec(struct diff_filespec *, const unsigned char *, int, unsigned short); #define CHECK_SIZE_ONLY 1 +#define CHECK_BINARY 2 extern int diff_populate_filespec(struct diff_filespec *, unsigned int); extern void diff_free_filespec_data(struct diff_filespec *); extern void diff_free_filespec_blob(struct diff_filespec *); diff --git a/t/t1050-large.sh b/t/t1050-large.sh index 5642f84b8..00d2f33df 100755 --- a/t/t1050-large.sh +++ b/t/t1050-large.sh @@ -112,6 +112,10 @@ test_expect_success 'diff --raw' ' git diff --raw HEAD^ ' +test_expect_success 'diff --stat' ' + git diff --stat HEAD^ HEAD +' + test_expect_success 'hash-object' ' git hash-object large1 ' From 1aaf69e669b7fd67073d3024b386ac25ac77d0f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 16 Aug 2014 10:08:06 +0700 Subject: [PATCH 5/5] diff: shortcut for diff'ing two binary SHA-1 objects MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we are given two SHA-1 and asked to determine if they are different (but not _what_ differences), we know right away by comparing SHA-1. A side effect of this patch is, because large files are marked binary, diff-tree will not need to unpack them. 'diff-index --cached' will not either. But 'diff-files' still does. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- diff.c | 13 +++++++++++++ t/t1050-large.sh | 10 ++++++++++ 2 files changed, 23 insertions(+) diff --git a/diff.c b/diff.c index d381a6f44..b85bcfbf5 100644 --- a/diff.c +++ b/diff.c @@ -2324,6 +2324,19 @@ static void builtin_diff(const char *name_a, } else if (!DIFF_OPT_TST(o, TEXT) && ( (!textconv_one && diff_filespec_is_binary(one)) || (!textconv_two && diff_filespec_is_binary(two)) )) { + if (!one->data && !two->data && + S_ISREG(one->mode) && S_ISREG(two->mode) && + !DIFF_OPT_TST(o, BINARY)) { + if (!hashcmp(one->sha1, two->sha1)) { + if (must_show_header) + fprintf(o->file, "%s", header.buf); + goto free_ab_and_return; + } + fprintf(o->file, "%s", header.buf); + fprintf(o->file, "%sBinary files %s and %s differ\n", + line_prefix, lbl[0], lbl[1]); + goto free_ab_and_return; + } if (fill_mmfile(&mf1, one) < 0 || fill_mmfile(&mf2, two) < 0) die("unable to read files to diff"); /* Quite common confusing case */ diff --git a/t/t1050-large.sh b/t/t1050-large.sh index 00d2f33df..05a1e1d27 100755 --- a/t/t1050-large.sh +++ b/t/t1050-large.sh @@ -116,6 +116,16 @@ test_expect_success 'diff --stat' ' git diff --stat HEAD^ HEAD ' +test_expect_success 'diff' ' + git diff HEAD^ HEAD >actual && + grep "Binary files.*differ" actual +' + +test_expect_success 'diff --cached' ' + git diff --cached HEAD^ >actual && + grep "Binary files.*differ" actual +' + test_expect_success 'hash-object' ' git hash-object large1 '