From c4ce46fc7ac1b59372aa935e641ca15b12359f5b Mon Sep 17 00:00:00 2001
From: Junio C Hamano <gitster@pobox.com>
Date: Sun, 8 May 2011 01:47:33 -0700
Subject: [PATCH 1/3] index_fd(): turn write_object and format_check arguments
 into one flag

The "format_check" parameter tucked after the existing parameters is too
ugly an afterthought to live in any reasonable API.

Combine it with the other boolean parameter "write_object" into a single
"flags" parameter.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/hash-object.c  |  5 ++++-
 builtin/update-index.c |  3 ++-
 cache.h                |  7 +++++--
 notes-merge.c          |  2 +-
 read-cache.c           |  4 ++--
 sha1_file.c            | 29 +++++++++++++----------------
 6 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/builtin/hash-object.c b/builtin/hash-object.c
index b96f46acf..33911fd5e 100644
--- a/builtin/hash-object.c
+++ b/builtin/hash-object.c
@@ -14,8 +14,11 @@ static void hash_fd(int fd, const char *type, int write_object, const char *path
 {
 	struct stat st;
 	unsigned char sha1[20];
+	unsigned flags = (HASH_FORMAT_CHECK |
+			  (write_object ? HASH_WRITE_OBJECT : 0));
+
 	if (fstat(fd, &st) < 0 ||
-	    index_fd(sha1, fd, &st, write_object, type_from_string(type), path, 1))
+	    index_fd(sha1, fd, &st, type_from_string(type), path, flags))
 		die(write_object
 		    ? "Unable to add %s to database"
 		    : "Unable to hash %s", path);
diff --git a/builtin/update-index.c b/builtin/update-index.c
index d7850c630..f14bc9083 100644
--- a/builtin/update-index.c
+++ b/builtin/update-index.c
@@ -99,7 +99,8 @@ static int add_one_path(struct cache_entry *old, const char *path, int len, stru
 	fill_stat_cache_info(ce, st);
 	ce->ce_mode = ce_mode_from_stat(old, st->st_mode);
 
-	if (index_path(ce->sha1, path, st, !info_only))
+	if (index_path(ce->sha1, path, st,
+		       info_only ? 0 : HASH_WRITE_OBJECT))
 		return -1;
 	option = allow_add ? ADD_CACHE_OK_TO_ADD : 0;
 	option |= allow_replace ? ADD_CACHE_OK_TO_REPLACE : 0;
diff --git a/cache.h b/cache.h
index 28899b7b7..bc3801395 100644
--- a/cache.h
+++ b/cache.h
@@ -518,8 +518,11 @@ struct pathspec {
 extern int init_pathspec(struct pathspec *, const char **);
 extern void free_pathspec(struct pathspec *);
 extern int ce_path_match(const struct cache_entry *ce, const struct pathspec *pathspec);
-extern int index_fd(unsigned char *sha1, int fd, struct stat *st, int write_object, enum object_type type, const char *path, int format_check);
-extern int index_path(unsigned char *sha1, const char *path, struct stat *st, int write_object);
+
+#define HASH_WRITE_OBJECT 1
+#define HASH_FORMAT_CHECK 2
+extern int index_fd(unsigned char *sha1, int fd, struct stat *st, enum object_type type, const char *path, unsigned flags);
+extern int index_path(unsigned char *sha1, const char *path, struct stat *st, unsigned flags);
 extern void fill_stat_cache_info(struct cache_entry *ce, struct stat *st);
 
 #define REFRESH_REALLY		0x0001	/* ignore_valid */
diff --git a/notes-merge.c b/notes-merge.c
index 28046a998..e1aaf43b4 100644
--- a/notes-merge.c
+++ b/notes-merge.c
@@ -707,7 +707,7 @@ int notes_merge_commit(struct notes_merge_options *o,
 		/* write file as blob, and add to partial_tree */
 		if (stat(ent->name, &st))
 			die_errno("Failed to stat '%s'", ent->name);
-		if (index_path(blob_sha1, ent->name, &st, 1))
+		if (index_path(blob_sha1, ent->name, &st, HASH_WRITE_OBJECT))
 			die("Failed to write blob object from '%s'", ent->name);
 		if (add_note(partial_tree, obj_sha1, blob_sha1, NULL))
 			die("Failed to add resolved note '%s' to notes tree",
diff --git a/read-cache.c b/read-cache.c
index f38471cac..4ac9a037f 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -92,7 +92,7 @@ static int ce_compare_data(struct cache_entry *ce, struct stat *st)
 
 	if (fd >= 0) {
 		unsigned char sha1[20];
-		if (!index_fd(sha1, fd, st, 0, OBJ_BLOB, ce->name, 0))
+		if (!index_fd(sha1, fd, st, OBJ_BLOB, ce->name, 0))
 			match = hashcmp(sha1, ce->sha1);
 		/* index_fd() closed the file descriptor already */
 	}
@@ -641,7 +641,7 @@ int add_to_index(struct index_state *istate, const char *path, struct stat *st,
 		return 0;
 	}
 	if (!intent_only) {
-		if (index_path(ce->sha1, path, st, 1))
+		if (index_path(ce->sha1, path, st, HASH_WRITE_OBJECT))
 			return error("unable to index file %s", path);
 	} else
 		record_intent_to_add(ce);
diff --git a/sha1_file.c b/sha1_file.c
index 889fe7183..17c179c9f 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -2581,10 +2581,11 @@ static void check_tag(const void *buf, size_t size)
 }
 
 static int index_mem(unsigned char *sha1, void *buf, size_t size,
-		     int write_object, enum object_type type,
-		     const char *path, int format_check)
+		     enum object_type type,
+		     const char *path, unsigned flags)
 {
 	int ret, re_allocated = 0;
+	int write_object = flags & HASH_WRITE_OBJECT;
 
 	if (!type)
 		type = OBJ_BLOB;
@@ -2600,7 +2601,7 @@ static int index_mem(unsigned char *sha1, void *buf, size_t size,
 			re_allocated = 1;
 		}
 	}
-	if (format_check) {
+	if (flags & HASH_FORMAT_CHECK) {
 		if (type == OBJ_TREE)
 			check_tree(buf, size);
 		if (type == OBJ_COMMIT)
@@ -2620,8 +2621,8 @@ static int index_mem(unsigned char *sha1, void *buf, size_t size,
 
 #define SMALL_FILE_SIZE (32*1024)
 
-int index_fd(unsigned char *sha1, int fd, struct stat *st, int write_object,
-	     enum object_type type, const char *path, int format_check)
+int index_fd(unsigned char *sha1, int fd, struct stat *st,
+	     enum object_type type, const char *path, unsigned flags)
 {
 	int ret;
 	size_t size = xsize_t(st->st_size);
@@ -2629,33 +2630,29 @@ int index_fd(unsigned char *sha1, int fd, struct stat *st, int write_object,
 	if (!S_ISREG(st->st_mode)) {
 		struct strbuf sbuf = STRBUF_INIT;
 		if (strbuf_read(&sbuf, fd, 4096) >= 0)
-			ret = index_mem(sha1, sbuf.buf, sbuf.len, write_object,
-					type, path, format_check);
+			ret = index_mem(sha1, sbuf.buf, sbuf.len, type,	path, flags);
 		else
 			ret = -1;
 		strbuf_release(&sbuf);
 	} else if (!size) {
-		ret = index_mem(sha1, NULL, size, write_object, type, path,
-				format_check);
+		ret = index_mem(sha1, NULL, size, type, path, flags);
 	} else if (size <= SMALL_FILE_SIZE) {
 		char *buf = xmalloc(size);
 		if (size == read_in_full(fd, buf, size))
-			ret = index_mem(sha1, buf, size, write_object, type,
-					path, format_check);
+			ret = index_mem(sha1, buf, size, type, path, flags);
 		else
 			ret = error("short read %s", strerror(errno));
 		free(buf);
 	} else {
 		void *buf = xmmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
-		ret = index_mem(sha1, buf, size, write_object, type, path,
-				format_check);
+		ret = index_mem(sha1, buf, size, type, path, flags);
 		munmap(buf, size);
 	}
 	close(fd);
 	return ret;
 }
 
-int index_path(unsigned char *sha1, const char *path, struct stat *st, int write_object)
+int index_path(unsigned char *sha1, const char *path, struct stat *st, unsigned flags)
 {
 	int fd;
 	struct strbuf sb = STRBUF_INIT;
@@ -2666,7 +2663,7 @@ int index_path(unsigned char *sha1, const char *path, struct stat *st, int write
 		if (fd < 0)
 			return error("open(\"%s\"): %s", path,
 				     strerror(errno));
-		if (index_fd(sha1, fd, st, write_object, OBJ_BLOB, path, 0) < 0)
+		if (index_fd(sha1, fd, st, OBJ_BLOB, path, flags) < 0)
 			return error("%s: failed to insert into database",
 				     path);
 		break;
@@ -2676,7 +2673,7 @@ int index_path(unsigned char *sha1, const char *path, struct stat *st, int write
 			return error("readlink(\"%s\"): %s", path,
 			             errstr);
 		}
-		if (!write_object)
+		if (!(flags & HASH_WRITE_OBJECT))
 			hash_sha1_file(sb.buf, sb.len, blob_type, sha1);
 		else if (write_sha1_file(sb.buf, sb.len, blob_type, sha1))
 			return error("%s: failed to insert into database",

From 7b41e1e15b2cce13deaafc0aab10580036346a5a Mon Sep 17 00:00:00 2001
From: Junio C Hamano <gitster@pobox.com>
Date: Sun, 8 May 2011 01:47:34 -0700
Subject: [PATCH 2/3] index_fd(): split into two helper functions

Split out the case where we do not know the size of the input (hence we
read everything into a strbuf before doing anything) to index_pipe(), and
the other case where we mmap or read the whole data to index_bulk().

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 sha1_file.c | 42 +++++++++++++++++++++++++++++++-----------
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/sha1_file.c b/sha1_file.c
index 17c179c9f..49416b029 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -2619,22 +2619,29 @@ static int index_mem(unsigned char *sha1, void *buf, size_t size,
 	return ret;
 }
 
+static int index_pipe(unsigned char *sha1, int fd, enum object_type type,
+		      const char *path, unsigned flags)
+{
+	struct strbuf sbuf = STRBUF_INIT;
+	int ret;
+
+	if (strbuf_read(&sbuf, fd, 4096) >= 0)
+		ret = index_mem(sha1, sbuf.buf, sbuf.len, type,	path, flags);
+	else
+		ret = -1;
+	strbuf_release(&sbuf);
+	return ret;
+}
+
 #define SMALL_FILE_SIZE (32*1024)
 
-int index_fd(unsigned char *sha1, int fd, struct stat *st,
-	     enum object_type type, const char *path, unsigned flags)
+static int index_core(unsigned char *sha1, int fd, size_t size,
+		      enum object_type type, const char *path,
+		      unsigned flags)
 {
 	int ret;
-	size_t size = xsize_t(st->st_size);
 
-	if (!S_ISREG(st->st_mode)) {
-		struct strbuf sbuf = STRBUF_INIT;
-		if (strbuf_read(&sbuf, fd, 4096) >= 0)
-			ret = index_mem(sha1, sbuf.buf, sbuf.len, type,	path, flags);
-		else
-			ret = -1;
-		strbuf_release(&sbuf);
-	} else if (!size) {
+	if (!size) {
 		ret = index_mem(sha1, NULL, size, type, path, flags);
 	} else if (size <= SMALL_FILE_SIZE) {
 		char *buf = xmalloc(size);
@@ -2648,6 +2655,19 @@ int index_fd(unsigned char *sha1, int fd, struct stat *st,
 		ret = index_mem(sha1, buf, size, type, path, flags);
 		munmap(buf, size);
 	}
+	return ret;
+}
+
+int index_fd(unsigned char *sha1, int fd, struct stat *st,
+	     enum object_type type, const char *path, unsigned flags)
+{
+	int ret;
+	size_t size = xsize_t(st->st_size);
+
+	if (!S_ISREG(st->st_mode))
+		ret = index_pipe(sha1, fd, type, path, flags);
+	else
+		ret = index_core(sha1, fd, size, type, path, flags);
 	close(fd);
 	return ret;
 }

From 4dd1fbc7b1df0030f813a05cee19cad2c7a9cbf9 Mon Sep 17 00:00:00 2001
From: Junio C Hamano <gitster@pobox.com>
Date: Sun, 8 May 2011 01:47:35 -0700
Subject: [PATCH 3/3] Bigfile: teach "git add" to send a large file straight to
 a pack

When adding a new content to the repository, we have always slurped
the blob in its entirety in-core first, and computed the object name
and compressed it into a loose object file.  Handling large binary
files (e.g.  video and audio asset for games) has been problematic
because of this design.

At the middle level of "git add" callchain is an internal API
index_fd() that takes an open file descriptor to read from the
working tree file being added with its size. Teach it to call out to
fast-import when adding a large blob.

The write-out codepath in entry.c::write_entry() should be taught to
stream, instead of reading everything in core. This should not be so
hard to implement, especially if we limit ourselves only to loose
object files and non-delta representation in packfiles.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 sha1_file.c      | 84 +++++++++++++++++++++++++++++++++++++++++++++++-
 t/t1050-large.sh | 27 ++++++++++++++++
 2 files changed, 110 insertions(+), 1 deletion(-)
 create mode 100755 t/t1050-large.sh

diff --git a/sha1_file.c b/sha1_file.c
index 49416b029..f0ca6a174 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -11,6 +11,7 @@
 #include "pack.h"
 #include "blob.h"
 #include "commit.h"
+#include "run-command.h"
 #include "tag.h"
 #include "tree.h"
 #include "tree-walk.h"
@@ -2658,6 +2659,85 @@ static int index_core(unsigned char *sha1, int fd, size_t size,
 	return ret;
 }
 
+/*
+ * This creates one packfile per large blob, because the caller
+ * immediately wants the result sha1, and fast-import can report the
+ * object name via marks mechanism only by closing the created
+ * packfile.
+ *
+ * This also bypasses the usual "convert-to-git" dance, and that is on
+ * purpose. We could write a streaming version of the converting
+ * functions and insert that before feeding the data to fast-import
+ * (or equivalent in-core API described above), but the primary
+ * motivation for trying to stream from the working tree file and to
+ * avoid mmaping it in core is to deal with large binary blobs, and
+ * by definition they do _not_ want to get any conversion.
+ */
+static int index_stream(unsigned char *sha1, int fd, size_t size,
+			enum object_type type, const char *path,
+			unsigned flags)
+{
+	struct child_process fast_import;
+	char export_marks[512];
+	const char *argv[] = { "fast-import", "--quiet", export_marks, NULL };
+	char tmpfile[512];
+	char fast_import_cmd[512];
+	char buf[512];
+	int len, tmpfd;
+
+	strcpy(tmpfile, git_path("hashstream_XXXXXX"));
+	tmpfd = git_mkstemp_mode(tmpfile, 0600);
+	if (tmpfd < 0)
+		die_errno("cannot create tempfile: %s", tmpfile);
+	if (close(tmpfd))
+		die_errno("cannot close tempfile: %s", tmpfile);
+	sprintf(export_marks, "--export-marks=%s", tmpfile);
+
+	memset(&fast_import, 0, sizeof(fast_import));
+	fast_import.in = -1;
+	fast_import.argv = argv;
+	fast_import.git_cmd = 1;
+	if (start_command(&fast_import))
+		die_errno("index-stream: git fast-import failed");
+
+	len = sprintf(fast_import_cmd, "blob\nmark :1\ndata %lu\n",
+		      (unsigned long) size);
+	write_or_whine(fast_import.in, fast_import_cmd, len,
+		       "index-stream: feeding fast-import");
+	while (size) {
+		char buf[10240];
+		size_t sz = size < sizeof(buf) ? size : sizeof(buf);
+		size_t actual;
+
+		actual = read_in_full(fd, buf, sz);
+		if (actual < 0)
+			die_errno("index-stream: reading input");
+		if (write_in_full(fast_import.in, buf, actual) != actual)
+			die_errno("index-stream: feeding fast-import");
+		size -= actual;
+	}
+	if (close(fast_import.in))
+		die_errno("index-stream: closing fast-import");
+	if (finish_command(&fast_import))
+		die_errno("index-stream: finishing fast-import");
+
+	tmpfd = open(tmpfile, O_RDONLY);
+	if (tmpfd < 0)
+		die_errno("index-stream: cannot open fast-import mark");
+	len = read(tmpfd, buf, sizeof(buf));
+	if (len < 0)
+		die_errno("index-stream: reading fast-import mark");
+	if (close(tmpfd) < 0)
+		die_errno("index-stream: closing fast-import mark");
+	if (unlink(tmpfile))
+		die_errno("index-stream: unlinking fast-import mark");
+	if (len != 44 ||
+	    memcmp(":1 ", buf, 3) ||
+	    get_sha1_hex(buf + 3, sha1))
+		die_errno("index-stream: unexpected fast-import mark: <%s>", buf);
+	return 0;
+}
+
 int index_fd(unsigned char *sha1, int fd, struct stat *st,
 	     enum object_type type, const char *path, unsigned flags)
 {
@@ -2666,8 +2746,10 @@ int index_fd(unsigned char *sha1, int fd, struct stat *st,
 
 	if (!S_ISREG(st->st_mode))
 		ret = index_pipe(sha1, fd, type, path, flags);
-	else
+	else if (size <= big_file_threshold || type != OBJ_BLOB)
 		ret = index_core(sha1, fd, size, type, path, flags);
+	else
+		ret = index_stream(sha1, fd, size, type, path, flags);
 	close(fd);
 	return ret;
 }
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
new file mode 100755
index 000000000..deba111bd
--- /dev/null
+++ b/t/t1050-large.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+# Copyright (c) 2011, Google Inc.
+
+test_description='adding and checking out large blobs'
+
+. ./test-lib.sh
+
+test_expect_success setup '
+	git config core.bigfilethreshold 200k &&
+	echo X | dd of=large bs=1k seek=2000
+'
+
+test_expect_success 'add a large file' '
+	git add large &&
+	# make sure we got a packfile and no loose objects
+	test -f .git/objects/pack/pack-*.pack &&
+	test ! -f .git/objects/??/??????????????????????????????????????
+'
+
+test_expect_success 'checkout a large file' '
+	large=$(git rev-parse :large) &&
+	git update-index --add --cacheinfo 100644 $large another &&
+	git checkout another &&
+	cmp large another ;# this must not be test_cmp
+'
+
+test_done