From e41352b24e29eba43d00a3fd117befaef1d594bc Mon Sep 17 00:00:00 2001 From: Marcus Griep <marcus@griep.us> Date: Tue, 12 Aug 2008 12:00:18 -0400 Subject: [PATCH 1/3] Git.pm: Add faculties to allow temp files to be cached This patch offers a generic interface to allow temp files to be cached while using an instance of the 'Git' package. If many temp files are created and destroyed during the execution of a program, this caching mechanism can help reduce the amount of files created and destroyed by the filesystem. The temp_acquire method provides a weak guarantee that a temp file will not be stolen by subsequent requests. If a file is locked when another acquire request is made, a simple error is thrown. Signed-off-by: Marcus Griep <marcus@griep.us> Acked-by: Eric Wong <normalperson@yhbt.net> --- perl/Git.pm | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 123 insertions(+), 2 deletions(-) diff --git a/perl/Git.pm b/perl/Git.pm index e1ca5b4a2..405f68fc3 100644 --- a/perl/Git.pm +++ b/perl/Git.pm @@ -57,7 +57,8 @@ require Exporter; command_output_pipe command_input_pipe command_close_pipe command_bidi_pipe command_close_bidi_pipe version exec_path hash_object git_cmd_try - remote_refs); + remote_refs + temp_acquire temp_release temp_reset); =head1 DESCRIPTION @@ -99,7 +100,9 @@ use Carp qw(carp croak); # but croak is bad - throw instead use Error qw(:try); use Cwd qw(abs_path); use IPC::Open2 qw(open2); - +use File::Temp (); +require File::Spec; +use Fcntl qw(SEEK_SET SEEK_CUR); } @@ -933,6 +936,124 @@ sub _close_cat_blob { delete @$self{@vars}; } + +{ # %TEMP_* Lexical Context + +my (%TEMP_LOCKS, %TEMP_FILES); + +=item temp_acquire ( NAME ) + +Attempts to retreive the temporary file mapped to the string C<NAME>. If an +associated temp file has not been created this session or was closed, it is +created, cached, and set for autoflush and binmode. + +Internally locks the file mapped to C<NAME>. This lock must be released with +C<temp_release()> when the temp file is no longer needed. Subsequent attempts +to retrieve temporary files mapped to the same C<NAME> while still locked will +cause an error. This locking mechanism provides a weak guarantee and is not +threadsafe. It does provide some error checking to help prevent temp file refs +writing over one another. + +In general, the L<File::Handle> returned should not be closed by consumers as +it defeats the purpose of this caching mechanism. If you need to close the temp +file handle, then you should use L<File::Temp> or another temp file faculty +directly. If a handle is closed and then requested again, then a warning will +issue. + +=cut + +sub temp_acquire { + my ($self, $name) = _maybe_self(@_); + + my $temp_fd = _temp_cache($name); + + $TEMP_LOCKS{$temp_fd} = 1; + $temp_fd; +} + +=item temp_release ( NAME ) + +=item temp_release ( FILEHANDLE ) + +Releases a lock acquired through C<temp_acquire()>. Can be called either with +the C<NAME> mapping used when acquiring the temp file or with the C<FILEHANDLE> +referencing a locked temp file. + +Warns if an attempt is made to release a file that is not locked. + +The temp file will be truncated before being released. This can help to reduce +disk I/O where the system is smart enough to detect the truncation while data +is in the output buffers. Beware that after the temp file is released and +truncated, any operations on that file may fail miserably until it is +re-acquired. All contents are lost between each release and acquire mapped to +the same string. + +=cut + +sub temp_release { + my ($self, $temp_fd, $trunc) = _maybe_self(@_); + + if (ref($temp_fd) ne 'File::Temp') { + $temp_fd = $TEMP_FILES{$temp_fd}; + } + unless ($TEMP_LOCKS{$temp_fd}) { + carp "Attempt to release temp file '", + $temp_fd, "' that has not been locked"; + } + temp_reset($temp_fd) if $trunc and $temp_fd->opened; + + $TEMP_LOCKS{$temp_fd} = 0; + undef; +} + +sub _temp_cache { + my ($name) = @_; + + my $temp_fd = \$TEMP_FILES{$name}; + if (defined $$temp_fd and $$temp_fd->opened) { + if ($TEMP_LOCKS{$$temp_fd}) { + throw Error::Simple("Temp file with moniker '", + $name, "' already in use"); + } + } else { + if (defined $$temp_fd) { + # then we're here because of a closed handle. + carp "Temp file '", $name, + "' was closed. Opening replacement."; + } + $$temp_fd = File::Temp->new( + TEMPLATE => 'Git_XXXXXX', + DIR => File::Spec->tmpdir + ) or throw Error::Simple("couldn't open new temp file"); + $$temp_fd->autoflush; + binmode $$temp_fd; + } + $$temp_fd; +} + +=item temp_reset ( FILEHANDLE ) + +Truncates and resets the position of the C<FILEHANDLE>. + +=cut + +sub temp_reset { + my ($self, $temp_fd) = _maybe_self(@_); + + truncate $temp_fd, 0 + or throw Error::Simple("couldn't truncate file"); + sysseek($temp_fd, 0, SEEK_SET) and seek($temp_fd, 0, SEEK_SET) + or throw Error::Simple("couldn't seek to beginning of file"); + sysseek($temp_fd, 0, SEEK_CUR) == 0 and tell($temp_fd) == 0 + or throw Error::Simple("expected file position to be reset"); +} + +sub END { + unlink values %TEMP_FILES if %TEMP_FILES; +} + +} # %TEMP_* Lexical Context + =back =head1 ERROR HANDLING From 0b19138ba3e4c129770565d364df65ec25ca0a8e Mon Sep 17 00:00:00 2001 From: Marcus Griep <marcus@griep.us> Date: Tue, 12 Aug 2008 12:00:53 -0400 Subject: [PATCH 2/3] git-svn: Make it incrementally faster by minimizing temp files Currently, git-svn would create a temp file on four occasions: 1. Reading a blob out of the object db 2. Creating a delta from svn 3. Hashing and writing a blob into the object db 4. Reading a blob out of the object db (in another place in code) Any time git-svn did the above, it would dutifully create and then delete said temp file. Unfortunately, this means that between 2-4 temporary files are created/deleted per file 'add/modify'-ed in svn (O(n)). This causes significant overhead and helps the inode counter to spin beautifully. By its nature, git-svn is a serial beast. Thus, reusing a temp file does not pose significant problems. "truncate and seek" takes much less time than "unlink and create". This patch centralizes the tempfile creation and holds onto the tempfile until they are deleted on exit. This significantly reduces file overhead, now requiring at most three (3) temp files per run (O(1)). Signed-off-by: Marcus Griep <marcus@griep.us> Acked-by: Eric Wong <normalperson@yhbt.net> --- git-svn.perl | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/git-svn.perl b/git-svn.perl index 4dc33801a..9eae5e8d8 100755 --- a/git-svn.perl +++ b/git-svn.perl @@ -1265,7 +1265,7 @@ sub md5sum { my $arg = shift; my $ref = ref $arg; my $md5 = Digest::MD5->new(); - if ($ref eq 'GLOB' || $ref eq 'IO::File') { + if ($ref eq 'GLOB' || $ref eq 'IO::File' || $ref eq 'File::Temp') { $md5->addfile($arg) or croak $!; } elsif ($ref eq 'SCALAR') { $md5->add($$arg) or croak $!; @@ -1328,6 +1328,7 @@ BEGIN } } + my (%LOCKFILES, %INDEX_FILES); END { unlink keys %LOCKFILES if %LOCKFILES; @@ -3230,13 +3231,11 @@ sub change_file_prop { sub apply_textdelta { my ($self, $fb, $exp) = @_; - my $fh = IO::File->new_tmpfile; - $fh->autoflush(1); + my $fh = Git::temp_acquire('svn_delta'); # $fh gets auto-closed() by SVN::TxDelta::apply(), # (but $base does not,) so dup() it for reading in close_file open my $dup, '<&', $fh or croak $!; - my $base = IO::File->new_tmpfile; - $base->autoflush(1); + my $base = Git::temp_acquire('git_blob'); if ($fb->{blob}) { print $base 'link ' if ($fb->{mode_a} == 120000); my $size = $::_repository->cat_blob($fb->{blob}, $base); @@ -3251,9 +3250,9 @@ sub apply_textdelta { } } seek $base, 0, 0 or croak $!; - $fb->{fh} = $dup; + $fb->{fh} = $fh; $fb->{base} = $base; - [ SVN::TxDelta::apply($base, $fh, undef, $fb->{path}, $fb->{pool}) ]; + [ SVN::TxDelta::apply($base, $dup, undef, $fb->{path}, $fb->{pool}) ]; } sub close_file { @@ -3282,22 +3281,25 @@ sub close_file { } } - my ($tmp_fh, $tmp_filename) = File::Temp::tempfile(UNLINK => 1); + my $tmp_fh = Git::temp_acquire('svn_hash'); my $result; while ($result = sysread($fh, my $string, 1024)) { my $wrote = syswrite($tmp_fh, $string, $result); defined($wrote) && $wrote == $result - or croak("write $tmp_filename: $!\n"); + or croak("write ", + $tmp_fh->filename, ": $!\n"); } defined $result or croak $!; - close $tmp_fh or croak $!; - close $fh or croak $!; - $hash = $::_repository->hash_and_insert_object($tmp_filename); - unlink($tmp_filename); + Git::temp_release($fh, 1); + + $hash = $::_repository->hash_and_insert_object( + $tmp_fh->filename); $hash =~ /^[a-f\d]{40}$/ or die "not a sha1: $hash\n"; - close $fb->{base} or croak $!; + + Git::temp_release($fb->{base}, 1); + Git::temp_release($tmp_fh, 1); } else { $hash = $fb->{blob} or die "no blob information\n"; } @@ -3667,7 +3669,7 @@ sub chg_file { } elsif ($m->{mode_b} !~ /755$/ && $m->{mode_a} =~ /755$/) { $self->change_file_prop($fbat,'svn:executable',undef); } - my $fh = IO::File->new_tmpfile or croak $!; + my $fh = Git::temp_acquire('git_blob'); if ($m->{mode_b} =~ /^120/) { print $fh 'link ' or croak $!; $self->change_file_prop($fbat,'svn:special','*'); @@ -3686,9 +3688,8 @@ sub chg_file { my $atd = $self->apply_textdelta($fbat, undef, $pool); my $got = SVN::TxDelta::send_stream($fh, @$atd, $pool); die "Checksum mismatch\nexpected: $exp\ngot: $got\n" if ($got ne $exp); + Git::temp_release($fh, 1); $pool->clear; - - close $fh or croak $!; } sub D { From 510b0945d041752db2f53dda00f1188308e2df4e Mon Sep 17 00:00:00 2001 From: Marcus Griep <marcus@griep.us> Date: Tue, 12 Aug 2008 12:45:39 -0400 Subject: [PATCH 3/3] git-svn: Reduce temp file usage when dealing with non-links Currently, in sub 'close_file', git-svn creates a temporary file and copies the contents of the blob to be written into it. This is useful for symlinks because svn stores symlinks in the form: link $FILE_PATH Git creates a blob only out of '$FILE_PATH' and uses file mode to indicate that the blob should be interpreted as a symlink. As git-hash-object is invoked with --stdin-paths, a duplicate of the link from svn must be created that leaves off the first five bytes, i.e. 'link '. However, this is wholly unnecessary for normal blobs, though, as we already have a temp file with their contents. Copying the entire file gains nothing, and effectively requires a file to be written twice before making it into the object db. This patch corrects that issue, holding onto the substr-like duplication for symlinks, but skipping it altogether for normal blobs by reusing the existing temp file. Signed-off-by: Marcus Griep <marcus@griep.us> Acked-by: Eric Wong <normalperson@yhbt.net> --- git-svn.perl | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/git-svn.perl b/git-svn.perl index 9eae5e8d8..099fd02b3 100755 --- a/git-svn.perl +++ b/git-svn.perl @@ -3268,38 +3268,36 @@ sub close_file { "expected: $exp\n got: $got\n"; } } - sysseek($fh, 0, 0) or croak $!; if ($fb->{mode_b} == 120000) { - eval { - sysread($fh, my $buf, 5) == 5 or croak $!; - $buf eq 'link ' or die "$path has mode 120000", - " but is not a link"; - }; - if ($@) { - warn "$@\n"; - sysseek($fh, 0, 0) or croak $!; - } - } - - my $tmp_fh = Git::temp_acquire('svn_hash'); - my $result; - while ($result = sysread($fh, my $string, 1024)) { - my $wrote = syswrite($tmp_fh, $string, $result); - defined($wrote) && $wrote == $result - or croak("write ", - $tmp_fh->filename, ": $!\n"); - } - defined $result or croak $!; + sysseek($fh, 0, 0) or croak $!; + sysread($fh, my $buf, 5) == 5 or croak $!; + unless ($buf eq 'link ') { + warn "$path has mode 120000", + " but is not a link\n"; + } else { + my $tmp_fh = Git::temp_acquire('svn_hash'); + my $res; + while ($res = sysread($fh, my $str, 1024)) { + my $out = syswrite($tmp_fh, $str, $res); + defined($out) && $out == $res + or croak("write ", + $tmp_fh->filename, + ": $!\n"); + } + defined $res or croak $!; - Git::temp_release($fh, 1); + ($fh, $tmp_fh) = ($tmp_fh, $fh); + Git::temp_release($tmp_fh, 1); + } + } $hash = $::_repository->hash_and_insert_object( - $tmp_fh->filename); + $fh->filename); $hash =~ /^[a-f\d]{40}$/ or die "not a sha1: $hash\n"; Git::temp_release($fb->{base}, 1); - Git::temp_release($tmp_fh, 1); + Git::temp_release($fh, 1); } else { $hash = $fb->{blob} or die "no blob information\n"; }