From 6163cd8a2a8703bdbd30920731aec32c68c39453 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scharfe?= Date: Thu, 3 May 2012 08:51:00 +0700 Subject: [PATCH 01/10] streaming: void pointer instead of char pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow any kind of buffer to be fed to read_istream() without an explicit cast by making it's buf argument a void pointer. It's about arbitrary data, not only characters. Signed-off-by: Rene Scharfe Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- streaming.c | 2 +- streaming.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/streaming.c b/streaming.c index 7e7ee2be6..3a3cd1206 100644 --- a/streaming.c +++ b/streaming.c @@ -99,7 +99,7 @@ int close_istream(struct git_istream *st) return r; } -ssize_t read_istream(struct git_istream *st, char *buf, size_t sz) +ssize_t read_istream(struct git_istream *st, void *buf, size_t sz) { return st->vtbl->read(st, buf, sz); } diff --git a/streaming.h b/streaming.h index 3e827709c..1d05c2a46 100644 --- a/streaming.h +++ b/streaming.h @@ -10,7 +10,7 @@ struct git_istream; extern struct git_istream *open_istream(const unsigned char *, enum object_type *, unsigned long *, struct stream_filter *); extern int close_istream(struct git_istream *); -extern ssize_t read_istream(struct git_istream *, char *, size_t); +extern ssize_t read_istream(struct git_istream *, void *, size_t); extern int stream_blob_to_fd(int fd, const unsigned char *, struct stream_filter *, int can_seek); From d240d4102103215e9fe38431b1702a4024d2f1a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Thu, 3 May 2012 08:51:01 +0700 Subject: [PATCH 02/10] archive-tar: turn write_tar_entry into blob-writing only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before this patch write_tar_entry() can: - write global header by write_global_extended_header() calling write_tar_entry with with both sha1 and path == NULL - write extended header for symlinks, by write_tar_entry() calling itself with sha1 != NULL and path == NULL - write a normal blob. In this case both sha1 and path are valid. After this patch, the first two call sites are modified to write the header without calling write_tar_entry(). The function is now for writing blobs only. This simplifies handling when write_tar_entry() learns about large blobs. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- archive-tar.c | 78 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 26 deletions(-) diff --git a/archive-tar.c b/archive-tar.c index 20af0051a..1727ab90a 100644 --- a/archive-tar.c +++ b/archive-tar.c @@ -123,6 +123,43 @@ static size_t get_path_prefix(const char *path, size_t pathlen, size_t maxlen) return i; } +static void prepare_header(struct archiver_args *args, + struct ustar_header *header, + unsigned int mode, unsigned long size) +{ + sprintf(header->mode, "%07o", mode & 07777); + sprintf(header->size, "%011lo", S_ISREG(mode) ? size : 0); + sprintf(header->mtime, "%011lo", (unsigned long) args->time); + + sprintf(header->uid, "%07o", 0); + sprintf(header->gid, "%07o", 0); + strlcpy(header->uname, "root", sizeof(header->uname)); + strlcpy(header->gname, "root", sizeof(header->gname)); + sprintf(header->devmajor, "%07o", 0); + sprintf(header->devminor, "%07o", 0); + + memcpy(header->magic, "ustar", 6); + memcpy(header->version, "00", 2); + + sprintf(header->chksum, "%07o", ustar_header_chksum(header)); +} + +static int write_extended_header(struct archiver_args *args, + const unsigned char *sha1, + const void *buffer, unsigned long size) +{ + struct ustar_header header; + unsigned int mode; + memset(&header, 0, sizeof(header)); + *header.typeflag = TYPEFLAG_EXT_HEADER; + mode = 0100666; + sprintf(header.name, "%s.paxheader", sha1_to_hex(sha1)); + prepare_header(args, &header, mode, size); + write_blocked(&header, sizeof(header)); + write_blocked(buffer, size); + return 0; +} + static int write_tar_entry(struct archiver_args *args, const unsigned char *sha1, const char *path, size_t pathlen, unsigned int mode, void *buffer, unsigned long size) @@ -134,13 +171,9 @@ static int write_tar_entry(struct archiver_args *args, memset(&header, 0, sizeof(header)); if (!sha1) { - *header.typeflag = TYPEFLAG_GLOBAL_HEADER; - mode = 0100666; - strcpy(header.name, "pax_global_header"); + die("BUG: sha1 == NULL is not supported"); } else if (!path) { - *header.typeflag = TYPEFLAG_EXT_HEADER; - mode = 0100666; - sprintf(header.name, "%s.paxheader", sha1_to_hex(sha1)); + die("BUG: path == NULL is not supported"); } else { if (S_ISDIR(mode) || S_ISGITLINK(mode)) { *header.typeflag = TYPEFLAG_DIR; @@ -182,25 +215,11 @@ static int write_tar_entry(struct archiver_args *args, memcpy(header.linkname, buffer, size); } - sprintf(header.mode, "%07o", mode & 07777); - sprintf(header.size, "%011lo", S_ISREG(mode) ? size : 0); - sprintf(header.mtime, "%011lo", (unsigned long) args->time); - - sprintf(header.uid, "%07o", 0); - sprintf(header.gid, "%07o", 0); - strlcpy(header.uname, "root", sizeof(header.uname)); - strlcpy(header.gname, "root", sizeof(header.gname)); - sprintf(header.devmajor, "%07o", 0); - sprintf(header.devminor, "%07o", 0); - - memcpy(header.magic, "ustar", 6); - memcpy(header.version, "00", 2); - - sprintf(header.chksum, "%07o", ustar_header_chksum(&header)); + prepare_header(args, &header, mode, size); if (ext_header.len > 0) { - err = write_tar_entry(args, sha1, NULL, 0, 0, ext_header.buf, - ext_header.len); + err = write_extended_header(args, sha1, ext_header.buf, + ext_header.len); if (err) return err; } @@ -215,11 +234,18 @@ static int write_global_extended_header(struct archiver_args *args) { const unsigned char *sha1 = args->commit_sha1; struct strbuf ext_header = STRBUF_INIT; - int err; + struct ustar_header header; + unsigned int mode; + int err = 0; strbuf_append_ext_header(&ext_header, "comment", sha1_to_hex(sha1), 40); - err = write_tar_entry(args, NULL, NULL, 0, 0, ext_header.buf, - ext_header.len); + memset(&header, 0, sizeof(header)); + *header.typeflag = TYPEFLAG_GLOBAL_HEADER; + mode = 0100666; + strcpy(header.name, "pax_global_header"); + prepare_header(args, &header, mode, ext_header.len); + write_blocked(&header, sizeof(header)); + write_blocked(ext_header.buf, ext_header.len); strbuf_release(&ext_header); return err; } From 853907097af803c595f0c12fc355c907702eb7c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Thu, 3 May 2012 08:51:02 +0700 Subject: [PATCH 03/10] archive-tar: unindent write_tar_entry by one level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's used to be if (!sha1) { ... } else if (!path) { ... } else { ... } Now that the first two blocks are no-op. We can remove the if/else skeleton and put the else block back by one indent level. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- archive-tar.c | 56 +++++++++++++++++++++++---------------------------- 1 file changed, 25 insertions(+), 31 deletions(-) diff --git a/archive-tar.c b/archive-tar.c index 1727ab90a..6c8a0bd3b 100644 --- a/archive-tar.c +++ b/archive-tar.c @@ -170,40 +170,34 @@ static int write_tar_entry(struct archiver_args *args, memset(&header, 0, sizeof(header)); - if (!sha1) { - die("BUG: sha1 == NULL is not supported"); - } else if (!path) { - die("BUG: path == NULL is not supported"); + if (S_ISDIR(mode) || S_ISGITLINK(mode)) { + *header.typeflag = TYPEFLAG_DIR; + mode = (mode | 0777) & ~tar_umask; + } else if (S_ISLNK(mode)) { + *header.typeflag = TYPEFLAG_LNK; + mode |= 0777; + } else if (S_ISREG(mode)) { + *header.typeflag = TYPEFLAG_REG; + mode = (mode | ((mode & 0100) ? 0777 : 0666)) & ~tar_umask; } else { - if (S_ISDIR(mode) || S_ISGITLINK(mode)) { - *header.typeflag = TYPEFLAG_DIR; - mode = (mode | 0777) & ~tar_umask; - } else if (S_ISLNK(mode)) { - *header.typeflag = TYPEFLAG_LNK; - mode |= 0777; - } else if (S_ISREG(mode)) { - *header.typeflag = TYPEFLAG_REG; - mode = (mode | ((mode & 0100) ? 0777 : 0666)) & ~tar_umask; + return error("unsupported file mode: 0%o (SHA1: %s)", + mode, sha1_to_hex(sha1)); + } + if (pathlen > sizeof(header.name)) { + size_t plen = get_path_prefix(path, pathlen, + sizeof(header.prefix)); + size_t rest = pathlen - plen - 1; + if (plen > 0 && rest <= sizeof(header.name)) { + memcpy(header.prefix, path, plen); + memcpy(header.name, path + plen + 1, rest); } else { - return error("unsupported file mode: 0%o (SHA1: %s)", - mode, sha1_to_hex(sha1)); + sprintf(header.name, "%s.data", + sha1_to_hex(sha1)); + strbuf_append_ext_header(&ext_header, "path", + path, pathlen); } - if (pathlen > sizeof(header.name)) { - size_t plen = get_path_prefix(path, pathlen, - sizeof(header.prefix)); - size_t rest = pathlen - plen - 1; - if (plen > 0 && rest <= sizeof(header.name)) { - memcpy(header.prefix, path, plen); - memcpy(header.name, path + plen + 1, rest); - } else { - sprintf(header.name, "%s.data", - sha1_to_hex(sha1)); - strbuf_append_ext_header(&ext_header, "path", - path, pathlen); - } - } else - memcpy(header.name, path, pathlen); - } + } else + memcpy(header.name, path, pathlen); if (S_ISLNK(mode) && buffer) { if (size > sizeof(header.linkname)) { From 9cb513b7988c2fe443c47186e42dd827b76ddb14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Thu, 3 May 2012 08:51:03 +0700 Subject: [PATCH 04/10] archive: delegate blob reading to backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit archive-tar.c and archive-zip.c now perform conversion check, with help of sha1_file_to_archive() from archive.c This gives backends more freedom in dealing with (streaming) large blobs. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- archive-tar.c | 25 +++++++++++++++++++++---- archive-zip.c | 15 +++++++++++++-- archive.c | 28 +++++++++++----------------- archive.h | 10 +++++++++- 4 files changed, 54 insertions(+), 24 deletions(-) diff --git a/archive-tar.c b/archive-tar.c index 6c8a0bd3b..3be0cdf35 100644 --- a/archive-tar.c +++ b/archive-tar.c @@ -161,11 +161,15 @@ static int write_extended_header(struct archiver_args *args, } static int write_tar_entry(struct archiver_args *args, - const unsigned char *sha1, const char *path, size_t pathlen, - unsigned int mode, void *buffer, unsigned long size) + const unsigned char *sha1, + const char *path, size_t pathlen, + unsigned int mode) { struct ustar_header header; struct strbuf ext_header = STRBUF_INIT; + unsigned int old_mode = mode; + unsigned long size; + void *buffer; int err = 0; memset(&header, 0, sizeof(header)); @@ -199,7 +203,17 @@ static int write_tar_entry(struct archiver_args *args, } else memcpy(header.name, path, pathlen); - if (S_ISLNK(mode) && buffer) { + if (S_ISLNK(mode) || S_ISREG(mode)) { + enum object_type type; + buffer = sha1_file_to_archive(args, path, sha1, old_mode, &type, &size); + if (!buffer) + return error("cannot read %s", sha1_to_hex(sha1)); + } else { + buffer = NULL; + size = 0; + } + + if (S_ISLNK(mode)) { if (size > sizeof(header.linkname)) { sprintf(header.linkname, "see %s.paxheader", sha1_to_hex(sha1)); @@ -214,13 +228,16 @@ static int write_tar_entry(struct archiver_args *args, if (ext_header.len > 0) { err = write_extended_header(args, sha1, ext_header.buf, ext_header.len); - if (err) + if (err) { + free(buffer); return err; + } } strbuf_release(&ext_header); write_blocked(&header, sizeof(header)); if (S_ISREG(mode) && buffer && size > 0) write_blocked(buffer, size); + free(buffer); return err; } diff --git a/archive-zip.c b/archive-zip.c index 02d1f3787..716cc4271 100644 --- a/archive-zip.c +++ b/archive-zip.c @@ -121,8 +121,9 @@ static void *zlib_deflate(void *data, unsigned long size, } static int write_zip_entry(struct archiver_args *args, - const unsigned char *sha1, const char *path, size_t pathlen, - unsigned int mode, void *buffer, unsigned long size) + const unsigned char *sha1, + const char *path, size_t pathlen, + unsigned int mode) { struct zip_local_header header; struct zip_dir_header dirent; @@ -134,6 +135,8 @@ static int write_zip_entry(struct archiver_args *args, int method; unsigned char *out; void *deflated = NULL; + void *buffer; + unsigned long size; crc = crc32(0, NULL, 0); @@ -148,7 +151,14 @@ static int write_zip_entry(struct archiver_args *args, out = NULL; uncompressed_size = 0; compressed_size = 0; + buffer = NULL; + size = 0; } else if (S_ISREG(mode) || S_ISLNK(mode)) { + enum object_type type; + buffer = sha1_file_to_archive(args, path, sha1, mode, &type, &size); + if (!buffer) + return error("cannot read %s", sha1_to_hex(sha1)); + method = 0; attr2 = S_ISLNK(mode) ? ((mode | 0777) << 16) : (mode & 0111) ? ((mode) << 16) : 0; @@ -229,6 +239,7 @@ static int write_zip_entry(struct archiver_args *args, } free(deflated); + free(buffer); return 0; } diff --git a/archive.c b/archive.c index 1ee837d71..cd083eaf9 100644 --- a/archive.c +++ b/archive.c @@ -59,12 +59,15 @@ static void format_subst(const struct commit *commit, free(to_free); } -static void *sha1_file_to_archive(const char *path, const unsigned char *sha1, - unsigned int mode, enum object_type *type, - unsigned long *sizep, const struct commit *commit) +void *sha1_file_to_archive(const struct archiver_args *args, + const char *path, const unsigned char *sha1, + unsigned int mode, enum object_type *type, + unsigned long *sizep) { void *buffer; + const struct commit *commit = args->convert ? args->commit : NULL; + path += args->baselen; buffer = read_sha1_file(sha1, type, sizep); if (buffer && S_ISREG(mode)) { struct strbuf buf = STRBUF_INIT; @@ -109,12 +112,9 @@ static int write_archive_entry(const unsigned char *sha1, const char *base, write_archive_entry_fn_t write_entry = c->write_entry; struct git_attr_check check[2]; const char *path_without_prefix; - int convert = 0; int err; - enum object_type type; - unsigned long size; - void *buffer; + args->convert = 0; strbuf_reset(&path); strbuf_grow(&path, PATH_MAX); strbuf_add(&path, args->base, args->baselen); @@ -126,28 +126,22 @@ static int write_archive_entry(const unsigned char *sha1, const char *base, if (!git_check_attr(path_without_prefix, ARRAY_SIZE(check), check)) { if (ATTR_TRUE(check[0].value)) return 0; - convert = ATTR_TRUE(check[1].value); + args->convert = ATTR_TRUE(check[1].value); } if (S_ISDIR(mode) || S_ISGITLINK(mode)) { strbuf_addch(&path, '/'); if (args->verbose) fprintf(stderr, "%.*s\n", (int)path.len, path.buf); - err = write_entry(args, sha1, path.buf, path.len, mode, NULL, 0); + err = write_entry(args, sha1, path.buf, path.len, mode); if (err) return err; return (S_ISDIR(mode) ? READ_TREE_RECURSIVE : 0); } - buffer = sha1_file_to_archive(path_without_prefix, sha1, mode, - &type, &size, convert ? args->commit : NULL); - if (!buffer) - return error("cannot read %s", sha1_to_hex(sha1)); if (args->verbose) fprintf(stderr, "%.*s\n", (int)path.len, path.buf); - err = write_entry(args, sha1, path.buf, path.len, mode, buffer, size); - free(buffer); - return err; + return write_entry(args, sha1, path.buf, path.len, mode); } int write_archive_entries(struct archiver_args *args, @@ -167,7 +161,7 @@ int write_archive_entries(struct archiver_args *args, if (args->verbose) fprintf(stderr, "%.*s\n", (int)len, args->base); err = write_entry(args, args->tree->object.sha1, args->base, - len, 040777, NULL, 0); + len, 040777); if (err) return err; } diff --git a/archive.h b/archive.h index 2b0884f1e..895afcdc7 100644 --- a/archive.h +++ b/archive.h @@ -11,6 +11,7 @@ struct archiver_args { const char **pathspec; unsigned int verbose : 1; unsigned int worktree_attributes : 1; + unsigned int convert : 1; int compression_level; }; @@ -27,11 +28,18 @@ extern void register_archiver(struct archiver *); extern void init_tar_archiver(void); extern void init_zip_archiver(void); -typedef int (*write_archive_entry_fn_t)(struct archiver_args *args, const unsigned char *sha1, const char *path, size_t pathlen, unsigned int mode, void *buffer, unsigned long size); +typedef int (*write_archive_entry_fn_t)(struct archiver_args *args, + const unsigned char *sha1, + const char *path, size_t pathlen, + unsigned int mode); extern int write_archive_entries(struct archiver_args *args, write_archive_entry_fn_t write_entry); extern int write_archive(int argc, const char **argv, const char *prefix, int setup_prefix, const char *name_hint, int remote); const char *archive_format_from_filename(const char *filename); +extern void *sha1_file_to_archive(const struct archiver_args *args, + const char *path, const unsigned char *sha1, + unsigned int mode, enum object_type *type, + unsigned long *sizep); #endif /* ARCHIVE_H */ From 5544049def9a80bc5ea09a5649e13c1b56160518 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Thu, 3 May 2012 08:51:04 +0700 Subject: [PATCH 05/10] archive-tar: stream large blobs to tar file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit t5000 verifies output while t1050 makes sure the command always respects core.bigfilethreshold Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- archive-tar.c | 56 +++++++++++++++++++++++++++++++++++++++++---- t/t1050-large.sh | 4 ++++ t/t5000-tar-tree.sh | 6 +++++ 3 files changed, 61 insertions(+), 5 deletions(-) diff --git a/archive-tar.c b/archive-tar.c index 3be0cdf35..93387ea33 100644 --- a/archive-tar.c +++ b/archive-tar.c @@ -4,6 +4,7 @@ #include "cache.h" #include "tar.h" #include "archive.h" +#include "streaming.h" #include "run-command.h" #define RECORDSIZE (512) @@ -30,10 +31,9 @@ static void write_if_needed(void) * queues up writes, so that all our write(2) calls write exactly one * full block; pads writes to RECORDSIZE */ -static void write_blocked(const void *data, unsigned long size) +static void do_write_blocked(const void *data, unsigned long size) { const char *buf = data; - unsigned long tail; if (offset) { unsigned long chunk = BLOCKSIZE - offset; @@ -54,6 +54,11 @@ static void write_blocked(const void *data, unsigned long size) memcpy(block + offset, buf, size); offset += size; } +} + +static void finish_record(void) +{ + unsigned long tail; tail = offset % RECORDSIZE; if (tail) { memset(block + offset, 0, RECORDSIZE - tail); @@ -62,6 +67,12 @@ static void write_blocked(const void *data, unsigned long size) write_if_needed(); } +static void write_blocked(const void *data, unsigned long size) +{ + do_write_blocked(data, size); + finish_record(); +} + /* * The end of tar archives is marked by 2*512 nul bytes and after that * follows the rest of the block (if any). @@ -77,6 +88,33 @@ static void write_trailer(void) } } +/* + * queues up writes, so that all our write(2) calls write exactly one + * full block; pads writes to RECORDSIZE + */ +static int stream_blocked(const unsigned char *sha1) +{ + struct git_istream *st; + enum object_type type; + unsigned long sz; + char buf[BLOCKSIZE]; + ssize_t readlen; + + st = open_istream(sha1, &type, &sz, NULL); + if (!st) + return error("cannot stream blob %s", sha1_to_hex(sha1)); + for (;;) { + readlen = read_istream(st, buf, sizeof(buf)); + if (readlen <= 0) + break; + do_write_blocked(buf, readlen); + } + close_istream(st); + if (!readlen) + finish_record(); + return readlen; +} + /* * pax extended header records have the format "%u %s=%s\n". %u contains * the size of the whole string (including the %u), the first %s is the @@ -203,7 +241,11 @@ static int write_tar_entry(struct archiver_args *args, } else memcpy(header.name, path, pathlen); - if (S_ISLNK(mode) || S_ISREG(mode)) { + if (S_ISREG(mode) && !args->convert && + sha1_object_info(sha1, &size) == OBJ_BLOB && + size > big_file_threshold) + buffer = NULL; + else if (S_ISLNK(mode) || S_ISREG(mode)) { enum object_type type; buffer = sha1_file_to_archive(args, path, sha1, old_mode, &type, &size); if (!buffer) @@ -235,8 +277,12 @@ static int write_tar_entry(struct archiver_args *args, } strbuf_release(&ext_header); write_blocked(&header, sizeof(header)); - if (S_ISREG(mode) && buffer && size > 0) - write_blocked(buffer, size); + if (S_ISREG(mode) && size > 0) { + if (buffer) + write_blocked(buffer, size); + else + err = stream_blocked(sha1); + } free(buffer); return err; } diff --git a/t/t1050-large.sh b/t/t1050-large.sh index 4d127f19b..fe475542f 100755 --- a/t/t1050-large.sh +++ b/t/t1050-large.sh @@ -134,4 +134,8 @@ test_expect_success 'repack' ' git repack -ad ' +test_expect_success 'tar achiving' ' + git archive --format=tar HEAD >/dev/null +' + test_done diff --git a/t/t5000-tar-tree.sh b/t/t5000-tar-tree.sh index 527c9e754..d9b997f5d 100755 --- a/t/t5000-tar-tree.sh +++ b/t/t5000-tar-tree.sh @@ -84,6 +84,12 @@ test_expect_success \ 'git archive vs. git tar-tree' \ 'test_cmp b.tar b2.tar' +test_expect_success 'git archive on large files' ' + test_config core.bigfilethreshold 1 && + git archive HEAD >b3.tar && + test_cmp b.tar b3.tar +' + test_expect_success \ 'git archive in a bare repo' \ '(cd bare.git && git archive HEAD) >b3.tar' From 60df6bd19ad40e3eae2926f3785a63e670c150ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scharfe?= Date: Thu, 3 May 2012 08:51:05 +0700 Subject: [PATCH 06/10] archive-zip: remove uncompressed_size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We only need size and compressed_size. Signed-off-by: Rene Scharfe Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- archive-zip.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/archive-zip.c b/archive-zip.c index 716cc4271..400ba38c7 100644 --- a/archive-zip.c +++ b/archive-zip.c @@ -129,7 +129,6 @@ static int write_zip_entry(struct archiver_args *args, struct zip_dir_header dirent; unsigned long attr2; unsigned long compressed_size; - unsigned long uncompressed_size; unsigned long crc; unsigned long direntsize; int method; @@ -149,7 +148,7 @@ static int write_zip_entry(struct archiver_args *args, method = 0; attr2 = 16; out = NULL; - uncompressed_size = 0; + size = 0; compressed_size = 0; buffer = NULL; size = 0; @@ -166,7 +165,6 @@ static int write_zip_entry(struct archiver_args *args, method = 8; crc = crc32(crc, buffer, size); out = buffer; - uncompressed_size = size; compressed_size = size; } else { return error("unsupported file mode: 0%o (SHA1: %s)", mode, @@ -204,7 +202,7 @@ static int write_zip_entry(struct archiver_args *args, copy_le16(dirent.mdate, zip_date); copy_le32(dirent.crc32, crc); copy_le32(dirent.compressed_size, compressed_size); - copy_le32(dirent.size, uncompressed_size); + copy_le32(dirent.size, size); copy_le16(dirent.filename_length, pathlen); copy_le16(dirent.extra_length, 0); copy_le16(dirent.comment_length, 0); @@ -226,7 +224,7 @@ static int write_zip_entry(struct archiver_args *args, copy_le16(header.mdate, zip_date); copy_le32(header.crc32, crc); copy_le32(header.compressed_size, compressed_size); - copy_le32(header.size, uncompressed_size); + copy_le32(header.size, size); copy_le16(header.filename_length, pathlen); copy_le16(header.extra_length, 0); write_or_die(1, &header, ZIP_LOCAL_HEADER_SIZE); From ebf5374afa87afa334b040faec35144c2a3d03d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scharfe?= Date: Thu, 3 May 2012 08:51:06 +0700 Subject: [PATCH 07/10] archive-zip: factor out helpers for writing sizes and CRC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We're going to reuse them soon for streaming. Also, update the ZIP directory only at the very end, which will also make streaming easier. Signed-off-by: Rene Scharfe Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- archive-zip.c | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/archive-zip.c b/archive-zip.c index 400ba38c7..678569ab2 100644 --- a/archive-zip.c +++ b/archive-zip.c @@ -120,6 +120,26 @@ static void *zlib_deflate(void *data, unsigned long size, return buffer; } +static void set_zip_dir_data_desc(struct zip_dir_header *header, + unsigned long size, + unsigned long compressed_size, + unsigned long crc) +{ + copy_le32(header->crc32, crc); + copy_le32(header->compressed_size, compressed_size); + copy_le32(header->size, size); +} + +static void set_zip_header_data_desc(struct zip_local_header *header, + unsigned long size, + unsigned long compressed_size, + unsigned long crc) +{ + copy_le32(header->crc32, crc); + copy_le32(header->compressed_size, compressed_size); + copy_le32(header->size, size); +} + static int write_zip_entry(struct archiver_args *args, const unsigned char *sha1, const char *path, size_t pathlen, @@ -200,9 +220,7 @@ static int write_zip_entry(struct archiver_args *args, copy_le16(dirent.compression_method, method); copy_le16(dirent.mtime, zip_time); copy_le16(dirent.mdate, zip_date); - copy_le32(dirent.crc32, crc); - copy_le32(dirent.compressed_size, compressed_size); - copy_le32(dirent.size, size); + set_zip_dir_data_desc(&dirent, size, compressed_size, crc); copy_le16(dirent.filename_length, pathlen); copy_le16(dirent.extra_length, 0); copy_le16(dirent.comment_length, 0); @@ -210,11 +228,6 @@ static int write_zip_entry(struct archiver_args *args, copy_le16(dirent.attr1, 0); copy_le32(dirent.attr2, attr2); copy_le32(dirent.offset, zip_offset); - memcpy(zip_dir + zip_dir_offset, &dirent, ZIP_DIR_HEADER_SIZE); - zip_dir_offset += ZIP_DIR_HEADER_SIZE; - memcpy(zip_dir + zip_dir_offset, path, pathlen); - zip_dir_offset += pathlen; - zip_dir_entries++; copy_le32(header.magic, 0x04034b50); copy_le16(header.version, 10); @@ -222,9 +235,7 @@ static int write_zip_entry(struct archiver_args *args, copy_le16(header.compression_method, method); copy_le16(header.mtime, zip_time); copy_le16(header.mdate, zip_date); - copy_le32(header.crc32, crc); - copy_le32(header.compressed_size, compressed_size); - copy_le32(header.size, size); + set_zip_header_data_desc(&header, size, compressed_size, crc); copy_le16(header.filename_length, pathlen); copy_le16(header.extra_length, 0); write_or_die(1, &header, ZIP_LOCAL_HEADER_SIZE); @@ -239,6 +250,12 @@ static int write_zip_entry(struct archiver_args *args, free(deflated); free(buffer); + memcpy(zip_dir + zip_dir_offset, &dirent, ZIP_DIR_HEADER_SIZE); + zip_dir_offset += ZIP_DIR_HEADER_SIZE; + memcpy(zip_dir + zip_dir_offset, path, pathlen); + zip_dir_offset += pathlen; + zip_dir_entries++; + return 0; } From 2158f883d99a92f801534c91294305ccbe171f79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scharfe?= Date: Thu, 3 May 2012 08:51:07 +0700 Subject: [PATCH 08/10] archive-zip: streaming for stored files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Write a data descriptor containing the CRC of the entry and its sizes after streaming it out. For simplicity, do that only if we're storing files (option -0) for now. t5000 verifies output. t1050 makes sure the command always respects core.bigfilethreshold Signed-off-by: Rene Scharfe Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- archive-zip.c | 90 +++++++++++++++++++++++++++++++++++++++------ t/t1050-large.sh | 4 ++ t/t5000-tar-tree.sh | 6 +++ 3 files changed, 88 insertions(+), 12 deletions(-) diff --git a/archive-zip.c b/archive-zip.c index 678569ab2..1c6c39d42 100644 --- a/archive-zip.c +++ b/archive-zip.c @@ -3,6 +3,7 @@ */ #include "cache.h" #include "archive.h" +#include "streaming.h" static int zip_date; static int zip_time; @@ -15,6 +16,7 @@ static unsigned int zip_dir_offset; static unsigned int zip_dir_entries; #define ZIP_DIRECTORY_MIN_SIZE (1024 * 1024) +#define ZIP_STREAM (8) struct zip_local_header { unsigned char magic[4]; @@ -31,6 +33,14 @@ struct zip_local_header { unsigned char _end[1]; }; +struct zip_data_desc { + unsigned char magic[4]; + unsigned char crc32[4]; + unsigned char compressed_size[4]; + unsigned char size[4]; + unsigned char _end[1]; +}; + struct zip_dir_header { unsigned char magic[4]; unsigned char creator_version[2]; @@ -70,6 +80,7 @@ struct zip_dir_trailer { * we're interested in. */ #define ZIP_LOCAL_HEADER_SIZE offsetof(struct zip_local_header, _end) +#define ZIP_DATA_DESC_SIZE offsetof(struct zip_data_desc, _end) #define ZIP_DIR_HEADER_SIZE offsetof(struct zip_dir_header, _end) #define ZIP_DIR_TRAILER_SIZE offsetof(struct zip_dir_trailer, _end) @@ -120,6 +131,19 @@ static void *zlib_deflate(void *data, unsigned long size, return buffer; } +static void write_zip_data_desc(unsigned long size, + unsigned long compressed_size, + unsigned long crc) +{ + struct zip_data_desc trailer; + + copy_le32(trailer.magic, 0x08074b50); + copy_le32(trailer.crc32, crc); + copy_le32(trailer.compressed_size, compressed_size); + copy_le32(trailer.size, size); + write_or_die(1, &trailer, ZIP_DATA_DESC_SIZE); +} + static void set_zip_dir_data_desc(struct zip_dir_header *header, unsigned long size, unsigned long compressed_size, @@ -140,6 +164,8 @@ static void set_zip_header_data_desc(struct zip_local_header *header, copy_le32(header->size, size); } +#define STREAM_BUFFER_SIZE (1024 * 16) + static int write_zip_entry(struct archiver_args *args, const unsigned char *sha1, const char *path, size_t pathlen, @@ -155,6 +181,8 @@ static int write_zip_entry(struct archiver_args *args, unsigned char *out; void *deflated = NULL; void *buffer; + struct git_istream *stream = NULL; + unsigned long flags = 0; unsigned long size; crc = crc32(0, NULL, 0); @@ -173,25 +201,38 @@ static int write_zip_entry(struct archiver_args *args, buffer = NULL; size = 0; } else if (S_ISREG(mode) || S_ISLNK(mode)) { - enum object_type type; - buffer = sha1_file_to_archive(args, path, sha1, mode, &type, &size); - if (!buffer) - return error("cannot read %s", sha1_to_hex(sha1)); + enum object_type type = sha1_object_info(sha1, &size); method = 0; attr2 = S_ISLNK(mode) ? ((mode | 0777) << 16) : (mode & 0111) ? ((mode) << 16) : 0; - if (S_ISREG(mode) && args->compression_level != 0) + if (S_ISREG(mode) && args->compression_level != 0 && size > 0) method = 8; - crc = crc32(crc, buffer, size); - out = buffer; compressed_size = size; + + if (S_ISREG(mode) && type == OBJ_BLOB && !args->convert && + size > big_file_threshold && method == 0) { + stream = open_istream(sha1, &type, &size, NULL); + if (!stream) + return error("cannot stream blob %s", + sha1_to_hex(sha1)); + flags |= ZIP_STREAM; + out = buffer = NULL; + } else { + buffer = sha1_file_to_archive(args, path, sha1, mode, + &type, &size); + if (!buffer) + return error("cannot read %s", + sha1_to_hex(sha1)); + crc = crc32(crc, buffer, size); + out = buffer; + } } else { return error("unsupported file mode: 0%o (SHA1: %s)", mode, sha1_to_hex(sha1)); } - if (method == 8) { + if (buffer && method == 8) { deflated = zlib_deflate(buffer, size, args->compression_level, &compressed_size); if (deflated && compressed_size - 6 < size) { @@ -216,7 +257,7 @@ static int write_zip_entry(struct archiver_args *args, copy_le16(dirent.creator_version, S_ISLNK(mode) || (S_ISREG(mode) && (mode & 0111)) ? 0x0317 : 0); copy_le16(dirent.version, 10); - copy_le16(dirent.flags, 0); + copy_le16(dirent.flags, flags); copy_le16(dirent.compression_method, method); copy_le16(dirent.mtime, zip_time); copy_le16(dirent.mdate, zip_date); @@ -231,18 +272,43 @@ static int write_zip_entry(struct archiver_args *args, copy_le32(header.magic, 0x04034b50); copy_le16(header.version, 10); - copy_le16(header.flags, 0); + copy_le16(header.flags, flags); copy_le16(header.compression_method, method); copy_le16(header.mtime, zip_time); copy_le16(header.mdate, zip_date); - set_zip_header_data_desc(&header, size, compressed_size, crc); + if (flags & ZIP_STREAM) + set_zip_header_data_desc(&header, 0, 0, 0); + else + set_zip_header_data_desc(&header, size, compressed_size, crc); copy_le16(header.filename_length, pathlen); copy_le16(header.extra_length, 0); write_or_die(1, &header, ZIP_LOCAL_HEADER_SIZE); zip_offset += ZIP_LOCAL_HEADER_SIZE; write_or_die(1, path, pathlen); zip_offset += pathlen; - if (compressed_size > 0) { + if (stream && method == 0) { + unsigned char buf[STREAM_BUFFER_SIZE]; + ssize_t readlen; + + for (;;) { + readlen = read_istream(stream, buf, sizeof(buf)); + if (readlen <= 0) + break; + crc = crc32(crc, buf, readlen); + write_or_die(1, buf, readlen); + } + close_istream(stream); + if (readlen) + return readlen; + + compressed_size = size; + zip_offset += compressed_size; + + write_zip_data_desc(size, compressed_size, crc); + zip_offset += ZIP_DATA_DESC_SIZE; + + set_zip_dir_data_desc(&dirent, size, compressed_size, crc); + } else if (compressed_size > 0) { write_or_die(1, out, compressed_size); zip_offset += compressed_size; } diff --git a/t/t1050-large.sh b/t/t1050-large.sh index fe475542f..9db54b56b 100755 --- a/t/t1050-large.sh +++ b/t/t1050-large.sh @@ -138,4 +138,8 @@ test_expect_success 'tar achiving' ' git archive --format=tar HEAD >/dev/null ' +test_expect_success 'zip achiving, store only' ' + git archive --format=zip -0 HEAD >/dev/null +' + test_done diff --git a/t/t5000-tar-tree.sh b/t/t5000-tar-tree.sh index d9b997f5d..3b54c3862 100755 --- a/t/t5000-tar-tree.sh +++ b/t/t5000-tar-tree.sh @@ -244,6 +244,12 @@ test_expect_success UNZIP \ 'validate file contents with prefix' \ 'diff -r a e/prefix/a' +test_expect_success UNZIP 'git archive -0 --format=zip on large files' ' + test_config core.bigfilethreshold 1 && + git archive -0 --format=zip HEAD >large.zip && + (mkdir large && cd large && $UNZIP ../large.zip) +' + test_expect_success \ 'git archive --list outside of a git repo' \ 'GIT_DIR=some/non-existing/directory git archive --list' From c743c21591f9433fe784ac38902872701ce2e850 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scharfe?= Date: Thu, 3 May 2012 08:51:08 +0700 Subject: [PATCH 09/10] archive-zip: streaming for deflated files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After an entry has been streamed out, its CRC and sizes are written as part of a data descriptor. For simplicity, we make the buffer for the compressed chunks twice as big as for the uncompressed ones, to be sure the result fit in even if deflate makes them bigger. t5000 verifies output. t1050 makes sure the command always respects core.bigfilethreshold Signed-off-by: Rene Scharfe Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- archive-zip.c | 64 ++++++++++++++++++++++++++++++++++++++++++++- t/t1050-large.sh | 4 +++ t/t5000-tar-tree.sh | 7 +++++ 3 files changed, 74 insertions(+), 1 deletion(-) diff --git a/archive-zip.c b/archive-zip.c index 1c6c39d42..f5af81f90 100644 --- a/archive-zip.c +++ b/archive-zip.c @@ -211,7 +211,7 @@ static int write_zip_entry(struct archiver_args *args, compressed_size = size; if (S_ISREG(mode) && type == OBJ_BLOB && !args->convert && - size > big_file_threshold && method == 0) { + size > big_file_threshold) { stream = open_istream(sha1, &type, &size, NULL); if (!stream) return error("cannot stream blob %s", @@ -307,6 +307,68 @@ static int write_zip_entry(struct archiver_args *args, write_zip_data_desc(size, compressed_size, crc); zip_offset += ZIP_DATA_DESC_SIZE; + set_zip_dir_data_desc(&dirent, size, compressed_size, crc); + } else if (stream && method == 8) { + unsigned char buf[STREAM_BUFFER_SIZE]; + ssize_t readlen; + git_zstream zstream; + int result; + size_t out_len; + unsigned char compressed[STREAM_BUFFER_SIZE * 2]; + + memset(&zstream, 0, sizeof(zstream)); + git_deflate_init(&zstream, args->compression_level); + + compressed_size = 0; + zstream.next_out = compressed; + zstream.avail_out = sizeof(compressed); + + for (;;) { + readlen = read_istream(stream, buf, sizeof(buf)); + if (readlen <= 0) + break; + crc = crc32(crc, buf, readlen); + + zstream.next_in = buf; + zstream.avail_in = readlen; + result = git_deflate(&zstream, 0); + if (result != Z_OK) + die("deflate error (%d)", result); + out = compressed; + if (!compressed_size) + out += 2; + out_len = zstream.next_out - out; + + if (out_len > 0) { + write_or_die(1, out, out_len); + compressed_size += out_len; + zstream.next_out = compressed; + zstream.avail_out = sizeof(compressed); + } + + } + close_istream(stream); + if (readlen) + return readlen; + + zstream.next_in = buf; + zstream.avail_in = 0; + result = git_deflate(&zstream, Z_FINISH); + if (result != Z_STREAM_END) + die("deflate error (%d)", result); + + git_deflate_end(&zstream); + out = compressed; + if (!compressed_size) + out += 2; + out_len = zstream.next_out - out - 4; + write_or_die(1, out, out_len); + compressed_size += out_len; + zip_offset += compressed_size; + + write_zip_data_desc(size, compressed_size, crc); + zip_offset += ZIP_DATA_DESC_SIZE; + set_zip_dir_data_desc(&dirent, size, compressed_size, crc); } else if (compressed_size > 0) { write_or_die(1, out, compressed_size); diff --git a/t/t1050-large.sh b/t/t1050-large.sh index 9db54b56b..55ed955ce 100755 --- a/t/t1050-large.sh +++ b/t/t1050-large.sh @@ -142,4 +142,8 @@ test_expect_success 'zip achiving, store only' ' git archive --format=zip -0 HEAD >/dev/null ' +test_expect_success 'zip achiving, deflate' ' + git archive --format=zip HEAD >/dev/null +' + test_done diff --git a/t/t5000-tar-tree.sh b/t/t5000-tar-tree.sh index 3b54c3862..94f2ebac5 100755 --- a/t/t5000-tar-tree.sh +++ b/t/t5000-tar-tree.sh @@ -250,6 +250,13 @@ test_expect_success UNZIP 'git archive -0 --format=zip on large files' ' (mkdir large && cd large && $UNZIP ../large.zip) ' +test_expect_success UNZIP 'git archive --format=zip on large files' ' + test_config core.bigfilethreshold 1 && + git archive --format=zip HEAD >large-compressed.zip && + (mkdir large-compressed && cd large-compressed && $UNZIP ../large-compressed.zip) && + test_cmp large-compressed/a/bin/sh large/a/bin/sh +' + test_expect_success \ 'git archive --list outside of a git repo' \ 'GIT_DIR=some/non-existing/directory git archive --list' From 2dd42334dea6619c0774511beda9a02642088f67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Scharfe?= Date: Thu, 3 May 2012 10:52:16 +0200 Subject: [PATCH 10/10] t5000: rationalize unzip tests Factor out a function for checking the contents of ZIP archives. It extracts their contents and compares them to the original files. This removes some duplicate code. Tests that just create archives can lose their UNZIP prerequisite. Signed-off-by: Rene Scharfe Signed-off-by: Junio C Hamano --- t/t5000-tar-tree.sh | 79 ++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/t/t5000-tar-tree.sh b/t/t5000-tar-tree.sh index 94f2ebac5..ecf00edab 100755 --- a/t/t5000-tar-tree.sh +++ b/t/t5000-tar-tree.sh @@ -31,6 +31,26 @@ GUNZIP=${GUNZIP:-gzip -d} SUBSTFORMAT=%H%n +check_zip() { + zipfile=$1.zip + listfile=$1.lst + dir=$1 + dir_with_prefix=$dir/$2 + + test_expect_success UNZIP " extract ZIP archive" " + (mkdir $dir && cd $dir && $UNZIP ../$zipfile) + " + + test_expect_success UNZIP " validate filenames" " + (cd ${dir_with_prefix}a && find .) | sort >$listfile && + test_cmp a.lst $listfile + " + + test_expect_success UNZIP " validate file contents" " + diff -r a ${dir_with_prefix}a + " +} + test_expect_success \ 'populate workdir' \ 'mkdir a b c && @@ -181,10 +201,19 @@ test_expect_success \ test_cmp a/substfile2 g/prefix/a/substfile2 ' +$UNZIP -v >/dev/null 2>&1 +if [ $? -eq 127 ]; then + say "Skipping ZIP tests, because unzip was not found" +else + test_set_prereq UNZIP +fi + test_expect_success \ 'git archive --format=zip' \ 'git archive --format=zip HEAD >d.zip' +check_zip d + test_expect_success \ 'git archive --format=zip in a bare repo' \ '(cd bare.git && git archive --format=zip HEAD) >d1.zip' @@ -207,55 +236,25 @@ test_expect_success 'git archive with --output, override inferred format' ' test_cmp b.tar d4.zip ' -$UNZIP -v >/dev/null 2>&1 -if [ $? -eq 127 ]; then - say "Skipping ZIP tests, because unzip was not found" -else - test_set_prereq UNZIP -fi - -test_expect_success UNZIP \ - 'extract ZIP archive' \ - '(mkdir d && cd d && $UNZIP ../d.zip)' - -test_expect_success UNZIP \ - 'validate filenames' \ - '(cd d/a && find .) | sort >d.lst && - test_cmp a.lst d.lst' - -test_expect_success UNZIP \ - 'validate file contents' \ - 'diff -r a d/a' - test_expect_success \ 'git archive --format=zip with prefix' \ 'git archive --format=zip --prefix=prefix/ HEAD >e.zip' -test_expect_success UNZIP \ - 'extract ZIP archive with prefix' \ - '(mkdir e && cd e && $UNZIP ../e.zip)' +check_zip e prefix/ -test_expect_success UNZIP \ - 'validate filenames with prefix' \ - '(cd e/prefix/a && find .) | sort >e.lst && - test_cmp a.lst e.lst' +test_expect_success 'git archive -0 --format=zip on large files' ' + test_config core.bigfilethreshold 1 && + git archive -0 --format=zip HEAD >large.zip +' -test_expect_success UNZIP \ - 'validate file contents with prefix' \ - 'diff -r a e/prefix/a' +check_zip large -test_expect_success UNZIP 'git archive -0 --format=zip on large files' ' - test_config core.bigfilethreshold 1 && - git archive -0 --format=zip HEAD >large.zip && - (mkdir large && cd large && $UNZIP ../large.zip) +test_expect_success 'git archive --format=zip on large files' ' + test_config core.bigfilethreshold 1 && + git archive --format=zip HEAD >large-compressed.zip ' -test_expect_success UNZIP 'git archive --format=zip on large files' ' - test_config core.bigfilethreshold 1 && - git archive --format=zip HEAD >large-compressed.zip && - (mkdir large-compressed && cd large-compressed && $UNZIP ../large-compressed.zip) && - test_cmp large-compressed/a/bin/sh large/a/bin/sh -' +check_zip large-compressed test_expect_success \ 'git archive --list outside of a git repo' \