Skip to content

Commit

Permalink
convert: stream from fd to required clean filter to reduce used addre…
Browse files Browse the repository at this point in the history
…ss space

The data is streamed to the filter process anyway.  Better avoid mapping
the file if possible.  This is especially useful if a clean filter
reduces the size, for example if it computes a sha1 for binary data,
like git media.  The file size that the previous implementation could
handle was limited by the available address space; large files for
example could not be handled with (32-bit) msysgit.  The new
implementation can filter files of any size as long as the filter output
is small enough.

The new code path is only taken if the filter is required.  The filter
consumes data directly from the fd.  If it fails, the original data is
not immediately available.  The condition can easily be handled as
a fatal error, which is expected for a required filter anyway.

If the filter was not required, the condition would need to be handled
in a different way, like seeking to 0 and reading the data.  But this
would require more restructuring of the code and is probably not worth
it.  The obvious approach of falling back to reading all data would not
help achieving the main purpose of this patch, which is to handle large
files with limited address space.  If reading all data is an option, we
can simply take the old code path right away and mmap the entire file.

The environment variable GIT_MMAP_LIMIT, which has been introduced in
a previous commit is used to test that the expected code path is taken.
A related test that exercises required filters is modified to verify
that the data actually has been modified on its way from the file system
to the object store.

Signed-off-by: Steffen Prohaska <prohaska@zib.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
  • Loading branch information
Steffen Prohaska authored and Junio C Hamano committed Aug 28, 2014
1 parent b29763a commit 9035d75
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 12 deletions.
55 changes: 49 additions & 6 deletions convert.c
Original file line number Diff line number Diff line change
Expand Up @@ -312,11 +312,12 @@ static int crlf_to_worktree(const char *path, const char *src, size_t len,
struct filter_params {
const char *src;
unsigned long size;
int fd;
const char *cmd;
const char *path;
};

static int filter_buffer(int in, int out, void *data)
static int filter_buffer_or_fd(int in, int out, void *data)
{
/*
* Spawn cmd and feed the buffer contents through its stdin.
Expand Down Expand Up @@ -355,7 +356,12 @@ static int filter_buffer(int in, int out, void *data)

sigchain_push(SIGPIPE, SIG_IGN);

write_err = (write_in_full(child_process.in, params->src, params->size) < 0);
if (params->src) {
write_err = (write_in_full(child_process.in, params->src, params->size) < 0);
} else {
write_err = copy_fd(params->fd, child_process.in);
}

if (close(child_process.in))
write_err = 1;
if (write_err)
Expand All @@ -371,7 +377,7 @@ static int filter_buffer(int in, int out, void *data)
return (write_err || status);
}

static int apply_filter(const char *path, const char *src, size_t len,
static int apply_filter(const char *path, const char *src, size_t len, int fd,
struct strbuf *dst, const char *cmd)
{
/*
Expand All @@ -392,11 +398,12 @@ static int apply_filter(const char *path, const char *src, size_t len,
return 1;

memset(&async, 0, sizeof(async));
async.proc = filter_buffer;
async.proc = filter_buffer_or_fd;
async.data = &params;
async.out = -1;
params.src = src;
params.size = len;
params.fd = fd;
params.cmd = cmd;
params.path = path;

Expand Down Expand Up @@ -747,6 +754,25 @@ static void convert_attrs(struct conv_attrs *ca, const char *path)
}
}

int would_convert_to_git_filter_fd(const char *path)
{
struct conv_attrs ca;

convert_attrs(&ca, path);
if (!ca.drv)
return 0;

/*
* Apply a filter to an fd only if the filter is required to succeed.
* We must die if the filter fails, because the original data before
* filtering is not available.
*/
if (!ca.drv->required)
return 0;

return apply_filter(path, NULL, 0, -1, NULL, ca.drv->clean);
}

int convert_to_git(const char *path, const char *src, size_t len,
struct strbuf *dst, enum safe_crlf checksafe)
{
Expand All @@ -761,7 +787,7 @@ int convert_to_git(const char *path, const char *src, size_t len,
required = ca.drv->required;
}

ret |= apply_filter(path, src, len, dst, filter);
ret |= apply_filter(path, src, len, -1, dst, filter);
if (!ret && required)
die("%s: clean filter '%s' failed", path, ca.drv->name);

Expand All @@ -778,6 +804,23 @@ int convert_to_git(const char *path, const char *src, size_t len,
return ret | ident_to_git(path, src, len, dst, ca.ident);
}

void convert_to_git_filter_fd(const char *path, int fd, struct strbuf *dst,
enum safe_crlf checksafe)
{
struct conv_attrs ca;
convert_attrs(&ca, path);

assert(ca.drv);
assert(ca.drv->clean);

if (!apply_filter(path, NULL, 0, fd, dst, ca.drv->clean))
die("%s: clean filter '%s' failed", path, ca.drv->name);

ca.crlf_action = input_crlf_action(ca.crlf_action, ca.eol_attr);
crlf_to_git(path, dst->buf, dst->len, dst, ca.crlf_action, checksafe);
ident_to_git(path, dst->buf, dst->len, dst, ca.ident);
}

static int convert_to_working_tree_internal(const char *path, const char *src,
size_t len, struct strbuf *dst,
int normalizing)
Expand Down Expand Up @@ -811,7 +854,7 @@ static int convert_to_working_tree_internal(const char *path, const char *src,
}
}

ret_filter = apply_filter(path, src, len, dst, filter);
ret_filter = apply_filter(path, src, len, -1, dst, filter);
if (!ret_filter && required)
die("%s: smudge filter %s failed", path, ca.drv->name);

Expand Down
5 changes: 5 additions & 0 deletions convert.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ static inline int would_convert_to_git(const char *path)
{
return convert_to_git(path, NULL, 0, NULL, 0);
}
/* Precondition: would_convert_to_git_filter_fd(path) == true */
extern void convert_to_git_filter_fd(const char *path, int fd,
struct strbuf *dst,
enum safe_crlf checksafe);
extern int would_convert_to_git_filter_fd(const char *path);

/*****************************************************************
*
Expand Down
27 changes: 26 additions & 1 deletion sha1_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -3090,6 +3090,29 @@ static int index_mem(unsigned char *sha1, void *buf, size_t size,
return ret;
}

static int index_stream_convert_blob(unsigned char *sha1, int fd,
const char *path, unsigned flags)
{
int ret;
const int write_object = flags & HASH_WRITE_OBJECT;
struct strbuf sbuf = STRBUF_INIT;

assert(path);
assert(would_convert_to_git_filter_fd(path));

convert_to_git_filter_fd(path, fd, &sbuf,
write_object ? safe_crlf : SAFE_CRLF_FALSE);

if (write_object)
ret = write_sha1_file(sbuf.buf, sbuf.len, typename(OBJ_BLOB),
sha1);
else
ret = hash_sha1_file(sbuf.buf, sbuf.len, typename(OBJ_BLOB),
sha1);
strbuf_release(&sbuf);
return ret;
}

static int index_pipe(unsigned char *sha1, int fd, enum object_type type,
const char *path, unsigned flags)
{
Expand Down Expand Up @@ -3157,7 +3180,9 @@ int index_fd(unsigned char *sha1, int fd, struct stat *st,
int ret;
size_t size = xsize_t(st->st_size);

if (!S_ISREG(st->st_mode))
if (type == OBJ_BLOB && path && would_convert_to_git_filter_fd(path))
ret = index_stream_convert_blob(sha1, fd, path, flags);
else if (!S_ISREG(st->st_mode))
ret = index_pipe(sha1, fd, type, path, flags);
else if (size <= big_file_threshold || type != OBJ_BLOB ||
(path && would_convert_to_git(path)))
Expand Down
24 changes: 19 additions & 5 deletions t/t0021-conversion.sh
Original file line number Diff line number Diff line change
Expand Up @@ -153,17 +153,23 @@ test_expect_success 'filter shell-escaped filenames' '
:
'

test_expect_success 'required filter success' '
git config filter.required.smudge cat &&
git config filter.required.clean cat &&
test_expect_success 'required filter should filter data' '
git config filter.required.smudge ./rot13.sh &&
git config filter.required.clean ./rot13.sh &&
git config filter.required.required true &&
echo "*.r filter=required" >.gitattributes &&
echo test >test.r &&
cat test.o >test.r &&
git add test.r &&
rm -f test.r &&
git checkout -- test.r
git checkout -- test.r &&
cmp test.o test.r &&
./rot13.sh <test.o >expected &&
git cat-file blob :test.r >actual &&
cmp expected actual
'

test_expect_success 'required filter smudge failure' '
Expand All @@ -190,6 +196,14 @@ test_expect_success 'required filter clean failure' '
test_must_fail git add test.fc
'

test_expect_success 'filtering large input to small output should use little memory' '
git config filter.devnull.clean "cat >/dev/null" &&
git config filter.devnull.required true &&
for i in $(test_seq 1 30); do printf "%1048576d" 1; done >30MB &&
echo "30MB filter=devnull" >.gitattributes &&
GIT_MMAP_LIMIT=1m GIT_ALLOC_LIMIT=1m git add 30MB
'

test_expect_success EXPENSIVE 'filter large file' '
git config filter.largefile.smudge cat &&
git config filter.largefile.clean cat &&
Expand Down

0 comments on commit 9035d75

Please sign in to comment.