From 681b07de11c9463f6d93639d9fff9eccd8629eb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Wed, 23 May 2012 21:09:46 +0700 Subject: [PATCH 1/4] index-pack: hash non-delta objects while reading from stream MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/index-pack.c | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/builtin/index-pack.c b/builtin/index-pack.c index dc2cfe6e6..a74485653 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -384,11 +384,27 @@ static void unlink_base_data(struct base_data *c) free_base_data(c); } -static void *unpack_entry_data(unsigned long offset, unsigned long size) +static int is_delta_type(enum object_type type) +{ + return (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA); +} + +static void *unpack_entry_data(unsigned long offset, unsigned long size, + enum object_type type, unsigned char *sha1) { int status; git_zstream stream; void *buf = xmalloc(size); + git_SHA_CTX c; + char hdr[32]; + int hdrlen; + + if (!is_delta_type(type)) { + hdrlen = sprintf(hdr, "%s %lu", typename(type), size) + 1; + git_SHA1_Init(&c); + git_SHA1_Update(&c, hdr, hdrlen); + } else + sha1 = NULL; memset(&stream, 0, sizeof(stream)); git_inflate_init(&stream); @@ -396,18 +412,25 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size) stream.avail_out = size; do { + unsigned char *last_out = stream.next_out; stream.next_in = fill(1); stream.avail_in = input_len; status = git_inflate(&stream, 0); use(input_len - stream.avail_in); + if (sha1) + git_SHA1_Update(&c, last_out, stream.next_out - last_out); } while (status == Z_OK); if (stream.total_out != size || status != Z_STREAM_END) bad_object(offset, _("inflate returned %d"), status); git_inflate_end(&stream); + if (sha1) + git_SHA1_Final(sha1, &c); return buf; } -static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_base) +static void *unpack_raw_entry(struct object_entry *obj, + union delta_base *delta_base, + unsigned char *sha1) { unsigned char *p; unsigned long size, c; @@ -467,7 +490,7 @@ static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_ } obj->hdr_size = consumed_bytes - obj->idx.offset; - data = unpack_entry_data(obj->idx.offset, obj->size); + data = unpack_entry_data(obj->idx.offset, obj->size, obj->type, sha1); obj->idx.crc32 = input_crc32; return data; } @@ -569,9 +592,8 @@ static void find_delta_children(const union delta_base *base, } static void sha1_object(const void *data, unsigned long size, - enum object_type type, unsigned char *sha1) + enum object_type type, const unsigned char *sha1) { - hash_sha1_file(data, size, typename(type), sha1); read_lock(); if (has_sha1_file(sha1)) { void *has_data; @@ -627,11 +649,6 @@ static void sha1_object(const void *data, unsigned long size, } } -static int is_delta_type(enum object_type type) -{ - return (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA); -} - /* * This function is part of find_unresolved_deltas(). There are two * walkers going in the opposite ways. @@ -711,6 +728,8 @@ static void resolve_delta(struct object_entry *delta_obj, free(delta_data); if (!result->data) bad_object(delta_obj->idx.offset, _("failed to apply delta")); + hash_sha1_file(result->data, result->size, + typename(delta_obj->real_type), delta_obj->idx.sha1); sha1_object(result->data, result->size, delta_obj->real_type, delta_obj->idx.sha1); counter_lock(); @@ -851,7 +870,7 @@ static void parse_pack_objects(unsigned char *sha1) nr_objects); for (i = 0; i < nr_objects; i++) { struct object_entry *obj = &objects[i]; - void *data = unpack_raw_entry(obj, &delta->base); + void *data = unpack_raw_entry(obj, &delta->base, obj->idx.sha1); obj->real_type = obj->type; if (is_delta_type(obj->type)) { nr_deltas++; From 9ec2dde9f3008ed3822c8e7510ab710bdb613873 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Wed, 23 May 2012 21:09:47 +0700 Subject: [PATCH 2/4] index-pack: use streaming interface on large blobs (most of the time) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit unpack_raw_entry() will not allocate and return decompressed blobs if they are larger than core.bigFileThreshold. sha1_object() may not be called on those objects because there's no actual content. sha1_object() is called later on those objects, where we can safely use get_data_from_pack() to retrieve blob content for checking. However we always do that when we definitely need the blob content. And we often don't. There are two cases when we may need object content. The first case is when we find an in-repo blob with the same SHA-1. We need to do collision test, byte-on-byte. If this test is on, the blob must be loaded on memory (i.e. no streaming). Normally (e.g. in fetch/pull/clone) this does not happen because git avoid to send objects that client already has. The other case is when --strict is specified and the object in question is not a blob, which can't happen in reality becase we deal with large _blobs_ here. Note: --verify (or git-verify-pack) a pack from current repository will trigger collision test on every object in the pack, which effectively disables this patch. This could be easily worked around by setting GIT_DIR to an imaginary place with no packs. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/index-pack.c | 52 +++++++++++++++++++++++++++++++++++++------- t/t1050-large.sh | 5 +++++ 2 files changed, 49 insertions(+), 8 deletions(-) diff --git a/builtin/index-pack.c b/builtin/index-pack.c index a74485653..c7c2b8865 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -392,9 +392,10 @@ static int is_delta_type(enum object_type type) static void *unpack_entry_data(unsigned long offset, unsigned long size, enum object_type type, unsigned char *sha1) { + static char fixed_buf[8192]; int status; git_zstream stream; - void *buf = xmalloc(size); + void *buf; git_SHA_CTX c; char hdr[32]; int hdrlen; @@ -405,11 +406,15 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size, git_SHA1_Update(&c, hdr, hdrlen); } else sha1 = NULL; + if (type == OBJ_BLOB && size > big_file_threshold) + buf = fixed_buf; + else + buf = xmalloc(size); memset(&stream, 0, sizeof(stream)); git_inflate_init(&stream); stream.next_out = buf; - stream.avail_out = size; + stream.avail_out = buf == fixed_buf ? sizeof(fixed_buf) : size; do { unsigned char *last_out = stream.next_out; @@ -419,13 +424,17 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size, use(input_len - stream.avail_in); if (sha1) git_SHA1_Update(&c, last_out, stream.next_out - last_out); + if (buf == fixed_buf) { + stream.next_out = buf; + stream.avail_out = sizeof(fixed_buf); + } } while (status == Z_OK); if (stream.total_out != size || status != Z_STREAM_END) bad_object(offset, _("inflate returned %d"), status); git_inflate_end(&stream); if (sha1) git_SHA1_Final(sha1, &c); - return buf; + return buf == fixed_buf ? NULL : buf; } static void *unpack_raw_entry(struct object_entry *obj, @@ -591,14 +600,21 @@ static void find_delta_children(const union delta_base *base, *last_index = last; } -static void sha1_object(const void *data, unsigned long size, - enum object_type type, const unsigned char *sha1) +static void sha1_object(const void *data, struct object_entry *obj_entry, + unsigned long size, enum object_type type, + const unsigned char *sha1) { + void *new_data = NULL; + + assert(data || obj_entry); + read_lock(); if (has_sha1_file(sha1)) { void *has_data; enum object_type has_type; unsigned long has_size; + if (!data) + data = new_data = get_data_from_pack(obj_entry); has_data = read_sha1_file(sha1, &has_type, &has_size); read_unlock(); if (!has_data) @@ -623,6 +639,9 @@ static void sha1_object(const void *data, unsigned long size, int eaten; void *buf = (void *) data; + if (!buf) + buf = new_data = get_data_from_pack(obj_entry); + /* * we do not need to free the memory here, as the * buf is deleted by the caller. @@ -647,6 +666,8 @@ static void sha1_object(const void *data, unsigned long size, } read_unlock(); } + + free(new_data); } /* @@ -730,7 +751,7 @@ static void resolve_delta(struct object_entry *delta_obj, bad_object(delta_obj->idx.offset, _("failed to apply delta")); hash_sha1_file(result->data, result->size, typename(delta_obj->real_type), delta_obj->idx.sha1); - sha1_object(result->data, result->size, delta_obj->real_type, + sha1_object(result->data, NULL, result->size, delta_obj->real_type, delta_obj->idx.sha1); counter_lock(); nr_resolved_deltas++; @@ -860,7 +881,7 @@ static void *threaded_second_pass(void *data) */ static void parse_pack_objects(unsigned char *sha1) { - int i; + int i, nr_delays = 0; struct delta_entry *delta = deltas; struct stat st; @@ -876,8 +897,12 @@ static void parse_pack_objects(unsigned char *sha1) nr_deltas++; delta->obj_no = i; delta++; + } else if (!data) { + /* large blobs, check later */ + obj->real_type = OBJ_BAD; + nr_delays++; } else - sha1_object(data, obj->size, obj->type, obj->idx.sha1); + sha1_object(data, NULL, obj->size, obj->type, obj->idx.sha1); free(data); display_progress(progress, i+1); } @@ -897,6 +922,17 @@ static void parse_pack_objects(unsigned char *sha1) if (S_ISREG(st.st_mode) && lseek(input_fd, 0, SEEK_CUR) - input_len != st.st_size) die(_("pack has junk at the end")); + + for (i = 0; i < nr_objects; i++) { + struct object_entry *obj = &objects[i]; + if (obj->real_type != OBJ_BAD) + continue; + obj->real_type = obj->type; + sha1_object(NULL, obj, obj->size, obj->type, obj->idx.sha1); + nr_delays--; + } + if (nr_delays) + die(_("confusion beyond insanity in parse_pack_objects()")); } /* diff --git a/t/t1050-large.sh b/t/t1050-large.sh index 55ed955ce..3f806889a 100755 --- a/t/t1050-large.sh +++ b/t/t1050-large.sh @@ -130,6 +130,11 @@ test_expect_success 'git-show a large file' ' ' +test_expect_success 'index-pack' ' + git clone file://"`pwd`"/.git foo && + GIT_DIR=non-existent git index-pack --strict --verify foo/.git/objects/pack/*.pack +' + test_expect_success 'repack' ' git repack -ad ' From 8a2e163ccd03a12696cbf7518c0b742d313b6f02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Wed, 23 May 2012 21:09:48 +0700 Subject: [PATCH 3/4] index-pack: factor out unpack core from get_data_from_pack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows caller to consume large inflated object with a fixed amount of memory. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/index-pack.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/builtin/index-pack.c b/builtin/index-pack.c index c7c2b8865..9129299cd 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -504,7 +504,9 @@ static void *unpack_raw_entry(struct object_entry *obj, return data; } -static void *get_data_from_pack(struct object_entry *obj) +static void *unpack_data(struct object_entry *obj, + int (*consume)(const unsigned char *, unsigned long, void *), + void *cb_data) { off_t from = obj[0].idx.offset + obj[0].hdr_size; unsigned long len = obj[1].idx.offset - from; @@ -512,15 +514,16 @@ static void *get_data_from_pack(struct object_entry *obj) git_zstream stream; int status; - data = xmalloc(obj->size); + data = xmalloc(consume ? 64*1024 : obj->size); inbuf = xmalloc((len < 64*1024) ? len : 64*1024); memset(&stream, 0, sizeof(stream)); git_inflate_init(&stream); stream.next_out = data; - stream.avail_out = obj->size; + stream.avail_out = consume ? 64*1024 : obj->size; do { + unsigned char *last_out = stream.next_out; ssize_t n = (len < 64*1024) ? len : 64*1024; n = pread(pack_fd, inbuf, n, from); if (n < 0) @@ -535,6 +538,15 @@ static void *get_data_from_pack(struct object_entry *obj) stream.next_in = inbuf; stream.avail_in = n; status = git_inflate(&stream, 0); + if (consume) { + if (consume(last_out, stream.next_out - last_out, cb_data)) { + free(inbuf); + free(data); + return NULL; + } + stream.next_out = data; + stream.avail_out = 64*1024; + } } while (len && status == Z_OK && !stream.avail_in); /* This has been inflated OK when first encountered, so... */ @@ -543,9 +555,18 @@ static void *get_data_from_pack(struct object_entry *obj) git_inflate_end(&stream); free(inbuf); + if (consume) { + free(data); + data = NULL; + } return data; } +static void *get_data_from_pack(struct object_entry *obj) +{ + return unpack_data(obj, NULL, NULL); +} + static int compare_delta_bases(const union delta_base *base1, const union delta_base *base2, enum object_type type1, From 4614043c8fd8972bcde4fbfce7a8d25dd6d206fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Thu, 24 May 2012 20:55:44 +0700 Subject: [PATCH 4/4] index-pack: use streaming interface for collision test on large blobs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When putting whole objects in core is unavoidable, try match object type and size first before actually inflating. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/index-pack.c | 82 +++++++++++++++++++++++++++++++++++++++--- t/t5300-pack-object.sh | 5 +++ 2 files changed, 82 insertions(+), 5 deletions(-) diff --git a/builtin/index-pack.c b/builtin/index-pack.c index 9129299cd..8b5c1eb33 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -9,6 +9,7 @@ #include "progress.h" #include "fsck.h" #include "exec_cmd.h" +#include "streaming.h" #include "thread-utils.h" static const char index_pack_usage[] = @@ -621,31 +622,102 @@ static void find_delta_children(const union delta_base *base, *last_index = last; } +struct compare_data { + struct object_entry *entry; + struct git_istream *st; + unsigned char *buf; + unsigned long buf_size; +}; + +static int compare_objects(const unsigned char *buf, unsigned long size, + void *cb_data) +{ + struct compare_data *data = cb_data; + + if (data->buf_size < size) { + free(data->buf); + data->buf = xmalloc(size); + data->buf_size = size; + } + + while (size) { + ssize_t len = read_istream(data->st, data->buf, size); + if (len == 0) + die(_("SHA1 COLLISION FOUND WITH %s !"), + sha1_to_hex(data->entry->idx.sha1)); + if (len < 0) + die(_("unable to read %s"), + sha1_to_hex(data->entry->idx.sha1)); + if (memcmp(buf, data->buf, len)) + die(_("SHA1 COLLISION FOUND WITH %s !"), + sha1_to_hex(data->entry->idx.sha1)); + size -= len; + buf += len; + } + return 0; +} + +static int check_collison(struct object_entry *entry) +{ + struct compare_data data; + enum object_type type; + unsigned long size; + + if (entry->size <= big_file_threshold || entry->type != OBJ_BLOB) + return -1; + + memset(&data, 0, sizeof(data)); + data.entry = entry; + data.st = open_istream(entry->idx.sha1, &type, &size, NULL); + if (!data.st) + return -1; + if (size != entry->size || type != entry->type) + die(_("SHA1 COLLISION FOUND WITH %s !"), + sha1_to_hex(entry->idx.sha1)); + unpack_data(entry, compare_objects, &data); + close_istream(data.st); + free(data.buf); + return 0; +} + static void sha1_object(const void *data, struct object_entry *obj_entry, unsigned long size, enum object_type type, const unsigned char *sha1) { void *new_data = NULL; + int collision_test_needed; assert(data || obj_entry); read_lock(); - if (has_sha1_file(sha1)) { + collision_test_needed = has_sha1_file(sha1); + read_unlock(); + + if (collision_test_needed && !data) { + read_lock(); + if (!check_collison(obj_entry)) + collision_test_needed = 0; + read_unlock(); + } + if (collision_test_needed) { void *has_data; enum object_type has_type; unsigned long has_size; - if (!data) - data = new_data = get_data_from_pack(obj_entry); + read_lock(); + has_type = sha1_object_info(sha1, &has_size); + if (has_type != type || has_size != size) + die(_("SHA1 COLLISION FOUND WITH %s !"), sha1_to_hex(sha1)); has_data = read_sha1_file(sha1, &has_type, &has_size); read_unlock(); + if (!data) + data = new_data = get_data_from_pack(obj_entry); if (!has_data) die(_("cannot read existing object %s"), sha1_to_hex(sha1)); if (size != has_size || type != has_type || memcmp(data, has_data, size) != 0) die(_("SHA1 COLLISION FOUND WITH %s !"), sha1_to_hex(sha1)); free(has_data); - } else - read_unlock(); + } if (strict) { read_lock(); diff --git a/t/t5300-pack-object.sh b/t/t5300-pack-object.sh index d9d856b87..300ed910a 100755 --- a/t/t5300-pack-object.sh +++ b/t/t5300-pack-object.sh @@ -418,4 +418,9 @@ test_expect_success \ 'test_must_fail git index-pack -o bad.idx test-3.pack 2>msg && grep "SHA1 COLLISION FOUND" msg' +test_expect_success \ + 'make sure index-pack detects the SHA1 collision (large blobs)' \ + 'test_must_fail git -c core.bigfilethreshold=1 index-pack -o bad.idx test-3.pack 2>msg && + grep "SHA1 COLLISION FOUND" msg' + test_done