From e41352b24e29eba43d00a3fd117befaef1d594bc Mon Sep 17 00:00:00 2001
From: Marcus Griep <marcus@griep.us>
Date: Tue, 12 Aug 2008 12:00:18 -0400
Subject: [PATCH 1/3] Git.pm: Add faculties to allow temp files to be cached

This patch offers a generic interface to allow temp files to be
cached while using an instance of the 'Git' package. If many
temp files are created and destroyed during the execution of a
program, this caching mechanism can help reduce the amount of
files created and destroyed by the filesystem.

The temp_acquire method provides a weak guarantee that a temp
file will not be stolen by subsequent requests. If a file is
locked when another acquire request is made, a simple error is
thrown.

Signed-off-by: Marcus Griep <marcus@griep.us>
Acked-by: Eric Wong <normalperson@yhbt.net>
---
 perl/Git.pm | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 123 insertions(+), 2 deletions(-)

diff --git a/perl/Git.pm b/perl/Git.pm
index e1ca5b4a2..405f68fc3 100644
--- a/perl/Git.pm
+++ b/perl/Git.pm
@@ -57,7 +57,8 @@ require Exporter;
                 command_output_pipe command_input_pipe command_close_pipe
                 command_bidi_pipe command_close_bidi_pipe
                 version exec_path hash_object git_cmd_try
-                remote_refs);
+                remote_refs
+                temp_acquire temp_release temp_reset);
 
 
 =head1 DESCRIPTION
@@ -99,7 +100,9 @@ use Carp qw(carp croak); # but croak is bad - throw instead
 use Error qw(:try);
 use Cwd qw(abs_path);
 use IPC::Open2 qw(open2);
-
+use File::Temp ();
+require File::Spec;
+use Fcntl qw(SEEK_SET SEEK_CUR);
 }
 
 
@@ -933,6 +936,124 @@ sub _close_cat_blob {
 	delete @$self{@vars};
 }
 
+
+{ # %TEMP_* Lexical Context
+
+my (%TEMP_LOCKS, %TEMP_FILES);
+
+=item temp_acquire ( NAME )
+
+Attempts to retreive the temporary file mapped to the string C<NAME>. If an
+associated temp file has not been created this session or was closed, it is
+created, cached, and set for autoflush and binmode.
+
+Internally locks the file mapped to C<NAME>. This lock must be released with
+C<temp_release()> when the temp file is no longer needed. Subsequent attempts
+to retrieve temporary files mapped to the same C<NAME> while still locked will
+cause an error. This locking mechanism provides a weak guarantee and is not
+threadsafe. It does provide some error checking to help prevent temp file refs
+writing over one another.
+
+In general, the L<File::Handle> returned should not be closed by consumers as
+it defeats the purpose of this caching mechanism. If you need to close the temp
+file handle, then you should use L<File::Temp> or another temp file faculty
+directly. If a handle is closed and then requested again, then a warning will
+issue.
+
+=cut
+
+sub temp_acquire {
+	my ($self, $name) = _maybe_self(@_);
+
+	my $temp_fd = _temp_cache($name);
+
+	$TEMP_LOCKS{$temp_fd} = 1;
+	$temp_fd;
+}
+
+=item temp_release ( NAME )
+
+=item temp_release ( FILEHANDLE )
+
+Releases a lock acquired through C<temp_acquire()>. Can be called either with
+the C<NAME> mapping used when acquiring the temp file or with the C<FILEHANDLE>
+referencing a locked temp file.
+
+Warns if an attempt is made to release a file that is not locked.
+
+The temp file will be truncated before being released. This can help to reduce
+disk I/O where the system is smart enough to detect the truncation while data
+is in the output buffers. Beware that after the temp file is released and
+truncated, any operations on that file may fail miserably until it is
+re-acquired. All contents are lost between each release and acquire mapped to
+the same string.
+
+=cut
+
+sub temp_release {
+	my ($self, $temp_fd, $trunc) = _maybe_self(@_);
+
+	if (ref($temp_fd) ne 'File::Temp') {
+		$temp_fd = $TEMP_FILES{$temp_fd};
+	}
+	unless ($TEMP_LOCKS{$temp_fd}) {
+		carp "Attempt to release temp file '",
+			$temp_fd, "' that has not been locked";
+	}
+	temp_reset($temp_fd) if $trunc and $temp_fd->opened;
+
+	$TEMP_LOCKS{$temp_fd} = 0;
+	undef;
+}
+
+sub _temp_cache {
+	my ($name) = @_;
+
+	my $temp_fd = \$TEMP_FILES{$name};
+	if (defined $$temp_fd and $$temp_fd->opened) {
+		if ($TEMP_LOCKS{$$temp_fd}) {
+			throw Error::Simple("Temp file with moniker '",
+				$name, "' already in use");
+		}
+	} else {
+		if (defined $$temp_fd) {
+			# then we're here because of a closed handle.
+			carp "Temp file '", $name,
+				"' was closed. Opening replacement.";
+		}
+		$$temp_fd = File::Temp->new(
+			TEMPLATE => 'Git_XXXXXX',
+			DIR => File::Spec->tmpdir
+			) or throw Error::Simple("couldn't open new temp file");
+		$$temp_fd->autoflush;
+		binmode $$temp_fd;
+	}
+	$$temp_fd;
+}
+
+=item temp_reset ( FILEHANDLE )
+
+Truncates and resets the position of the C<FILEHANDLE>.
+
+=cut
+
+sub temp_reset {
+	my ($self, $temp_fd) = _maybe_self(@_);
+
+	truncate $temp_fd, 0
+		or throw Error::Simple("couldn't truncate file");
+	sysseek($temp_fd, 0, SEEK_SET) and seek($temp_fd, 0, SEEK_SET)
+		or throw Error::Simple("couldn't seek to beginning of file");
+	sysseek($temp_fd, 0, SEEK_CUR) == 0 and tell($temp_fd) == 0
+		or throw Error::Simple("expected file position to be reset");
+}
+
+sub END {
+	unlink values %TEMP_FILES if %TEMP_FILES;
+}
+
+} # %TEMP_* Lexical Context
+
 =back
 
 =head1 ERROR HANDLING

From 0b19138ba3e4c129770565d364df65ec25ca0a8e Mon Sep 17 00:00:00 2001
From: Marcus Griep <marcus@griep.us>
Date: Tue, 12 Aug 2008 12:00:53 -0400
Subject: [PATCH 2/3] git-svn: Make it incrementally faster by minimizing temp
 files

Currently, git-svn would create a temp file on four occasions:
1. Reading a blob out of the object db
2. Creating a delta from svn
3. Hashing and writing a blob into the object db
4. Reading a blob out of the object db (in another place in code)

Any time git-svn did the above, it would dutifully create and then
delete said temp file.  Unfortunately, this means that between 2-4
temporary files are created/deleted per file 'add/modify'-ed in
svn (O(n)).  This causes significant overhead and helps the inode
counter to spin beautifully.

By its nature, git-svn is a serial beast.  Thus, reusing a temp file
does not pose significant problems.  "truncate and seek" takes much
less time than "unlink and create".  This patch centralizes the
tempfile creation and holds onto the tempfile until they are deleted
on exit.  This significantly reduces file overhead, now requiring
at most three (3) temp files per run (O(1)).

Signed-off-by: Marcus Griep <marcus@griep.us>
Acked-by: Eric Wong <normalperson@yhbt.net>
---
 git-svn.perl | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/git-svn.perl b/git-svn.perl
index 4dc33801a..9eae5e8d8 100755
--- a/git-svn.perl
+++ b/git-svn.perl
@@ -1265,7 +1265,7 @@ sub md5sum {
 	my $arg = shift;
 	my $ref = ref $arg;
 	my $md5 = Digest::MD5->new();
-        if ($ref eq 'GLOB' || $ref eq 'IO::File') {
+        if ($ref eq 'GLOB' || $ref eq 'IO::File' || $ref eq 'File::Temp') {
 		$md5->addfile($arg) or croak $!;
 	} elsif ($ref eq 'SCALAR') {
 		$md5->add($$arg) or croak $!;
@@ -1328,6 +1328,7 @@ BEGIN
 	}
 }
 
+
 my (%LOCKFILES, %INDEX_FILES);
 END {
 	unlink keys %LOCKFILES if %LOCKFILES;
@@ -3230,13 +3231,11 @@ sub change_file_prop {
 
 sub apply_textdelta {
 	my ($self, $fb, $exp) = @_;
-	my $fh = IO::File->new_tmpfile;
-	$fh->autoflush(1);
+	my $fh = Git::temp_acquire('svn_delta');
 	# $fh gets auto-closed() by SVN::TxDelta::apply(),
 	# (but $base does not,) so dup() it for reading in close_file
 	open my $dup, '<&', $fh or croak $!;
-	my $base = IO::File->new_tmpfile;
-	$base->autoflush(1);
+	my $base = Git::temp_acquire('git_blob');
 	if ($fb->{blob}) {
 		print $base 'link ' if ($fb->{mode_a} == 120000);
 		my $size = $::_repository->cat_blob($fb->{blob}, $base);
@@ -3251,9 +3250,9 @@ sub apply_textdelta {
 		}
 	}
 	seek $base, 0, 0 or croak $!;
-	$fb->{fh} = $dup;
+	$fb->{fh} = $fh;
 	$fb->{base} = $base;
-	[ SVN::TxDelta::apply($base, $fh, undef, $fb->{path}, $fb->{pool}) ];
+	[ SVN::TxDelta::apply($base, $dup, undef, $fb->{path}, $fb->{pool}) ];
 }
 
 sub close_file {
@@ -3282,22 +3281,25 @@ sub close_file {
 			}
 		}
 
-		my ($tmp_fh, $tmp_filename) = File::Temp::tempfile(UNLINK => 1);
+		my $tmp_fh = Git::temp_acquire('svn_hash');
 		my $result;
 		while ($result = sysread($fh, my $string, 1024)) {
 			my $wrote = syswrite($tmp_fh, $string, $result);
 			defined($wrote) && $wrote == $result
-				or croak("write $tmp_filename: $!\n");
+				or croak("write ",
+					$tmp_fh->filename, ": $!\n");
 		}
 		defined $result or croak $!;
-		close $tmp_fh or croak $!;
 
-		close $fh or croak $!;
 
-		$hash = $::_repository->hash_and_insert_object($tmp_filename);
-		unlink($tmp_filename);
+		Git::temp_release($fh, 1);
+
+		$hash = $::_repository->hash_and_insert_object(
+				$tmp_fh->filename);
 		$hash =~ /^[a-f\d]{40}$/ or die "not a sha1: $hash\n";
-		close $fb->{base} or croak $!;
+
+		Git::temp_release($fb->{base}, 1);
+		Git::temp_release($tmp_fh, 1);
 	} else {
 		$hash = $fb->{blob} or die "no blob information\n";
 	}
@@ -3667,7 +3669,7 @@ sub chg_file {
 	} elsif ($m->{mode_b} !~ /755$/ && $m->{mode_a} =~ /755$/) {
 		$self->change_file_prop($fbat,'svn:executable',undef);
 	}
-	my $fh = IO::File->new_tmpfile or croak $!;
+	my $fh = Git::temp_acquire('git_blob');
 	if ($m->{mode_b} =~ /^120/) {
 		print $fh 'link ' or croak $!;
 		$self->change_file_prop($fbat,'svn:special','*');
@@ -3686,9 +3688,8 @@ sub chg_file {
 	my $atd = $self->apply_textdelta($fbat, undef, $pool);
 	my $got = SVN::TxDelta::send_stream($fh, @$atd, $pool);
 	die "Checksum mismatch\nexpected: $exp\ngot: $got\n" if ($got ne $exp);
+	Git::temp_release($fh, 1);
 	$pool->clear;
-
-	close $fh or croak $!;
 }
 
 sub D {

From 510b0945d041752db2f53dda00f1188308e2df4e Mon Sep 17 00:00:00 2001
From: Marcus Griep <marcus@griep.us>
Date: Tue, 12 Aug 2008 12:45:39 -0400
Subject: [PATCH 3/3] git-svn: Reduce temp file usage when dealing with
 non-links

Currently, in sub 'close_file', git-svn creates a temporary file and
copies the contents of the blob to be written into it. This is useful
for symlinks because svn stores symlinks in the form:

link $FILE_PATH

Git creates a blob only out of '$FILE_PATH' and uses file mode to
indicate that the blob should be interpreted as a symlink.

As git-hash-object is invoked with --stdin-paths, a duplicate of the
link from svn must be created that leaves off the first five bytes,
i.e. 'link '. However, this is wholly unnecessary for normal blobs,
though, as we already have a temp file with their contents. Copying
the entire file gains nothing, and effectively requires a file to be
written twice before making it into the object db.

This patch corrects that issue, holding onto the substr-like
duplication for symlinks, but skipping it altogether for normal blobs
by reusing the existing temp file.

Signed-off-by: Marcus Griep <marcus@griep.us>
Acked-by: Eric Wong <normalperson@yhbt.net>
---
 git-svn.perl | 46 ++++++++++++++++++++++------------------------
 1 file changed, 22 insertions(+), 24 deletions(-)

diff --git a/git-svn.perl b/git-svn.perl
index 9eae5e8d8..099fd02b3 100755
--- a/git-svn.perl
+++ b/git-svn.perl
@@ -3268,38 +3268,36 @@ sub close_file {
 				    "expected: $exp\n    got: $got\n";
 			}
 		}
-		sysseek($fh, 0, 0) or croak $!;
 		if ($fb->{mode_b} == 120000) {
-			eval {
-				sysread($fh, my $buf, 5) == 5 or croak $!;
-				$buf eq 'link ' or die "$path has mode 120000",
-						       " but is not a link";
-			};
-			if ($@) {
-				warn "$@\n";
-				sysseek($fh, 0, 0) or croak $!;
-			}
-		}
-
-		my $tmp_fh = Git::temp_acquire('svn_hash');
-		my $result;
-		while ($result = sysread($fh, my $string, 1024)) {
-			my $wrote = syswrite($tmp_fh, $string, $result);
-			defined($wrote) && $wrote == $result
-				or croak("write ",
-					$tmp_fh->filename, ": $!\n");
-		}
-		defined $result or croak $!;
+			sysseek($fh, 0, 0) or croak $!;
+			sysread($fh, my $buf, 5) == 5 or croak $!;
 
+			unless ($buf eq 'link ') {
+				warn "$path has mode 120000",
+						" but is not a link\n";
+			} else {
+				my $tmp_fh = Git::temp_acquire('svn_hash');
+				my $res;
+				while ($res = sysread($fh, my $str, 1024)) {
+					my $out = syswrite($tmp_fh, $str, $res);
+					defined($out) && $out == $res
+						or croak("write ",
+							$tmp_fh->filename,
+							": $!\n");
+				}
+				defined $res or croak $!;
 
-		Git::temp_release($fh, 1);
+				($fh, $tmp_fh) = ($tmp_fh, $fh);
+				Git::temp_release($tmp_fh, 1);
+			}
+		}
 
 		$hash = $::_repository->hash_and_insert_object(
-				$tmp_fh->filename);
+				$fh->filename);
 		$hash =~ /^[a-f\d]{40}$/ or die "not a sha1: $hash\n";
 
 		Git::temp_release($fb->{base}, 1);
-		Git::temp_release($tmp_fh, 1);
+		Git::temp_release($fh, 1);
 	} else {
 		$hash = $fb->{blob} or die "no blob information\n";
 	}