Skip to content

Commit

Permalink
make pack data reuse compatible with both delta types
Browse files Browse the repository at this point in the history
This is the missing part to git-pack-objects allowing it to reuse delta
data to/from any of the two delta types.  It can reuse delta from any
type, and it outputs base offsets when --allow-delta-base-offset is
provided and the base is also included in the pack.  Otherwise it
outputs base sha1 references just like it always did.

Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
  • Loading branch information
Nicolas Pitre authored and Junio C Hamano committed Sep 27, 2006
1 parent be6b191 commit 780e6e7
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 96 deletions.
204 changes: 130 additions & 74 deletions builtin-pack-objects.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ struct object_entry {
enum object_type type;
enum object_type in_pack_type; /* could be delta */
unsigned long delta_size; /* delta data size (uncompressed) */
#define in_pack_header_size delta_size /* only when reusing pack data */
struct object_entry *delta; /* delta base object */
struct packed_git *in_pack; /* already in pack */
unsigned int in_pack_offset;
Expand Down Expand Up @@ -86,17 +87,25 @@ static int object_ix_hashsz;
* Pack index for existing packs give us easy access to the offsets into
* corresponding pack file where each object's data starts, but the entries
* do not store the size of the compressed representation (uncompressed
* size is easily available by examining the pack entry header). We build
* a hashtable of existing packs (pack_revindex), and keep reverse index
* here -- pack index file is sorted by object name mapping to offset; this
* pack_revindex[].revindex array is an ordered list of offsets, so if you
* know the offset of an object, next offset is where its packed
* representation ends.
* size is easily available by examining the pack entry header). It is
* also rather expensive to find the sha1 for an object given its offset.
*
* We build a hashtable of existing packs (pack_revindex), and keep reverse
* index here -- pack index file is sorted by object name mapping to offset;
* this pack_revindex[].revindex array is a list of offset/index_nr pairs
* ordered by offset, so if you know the offset of an object, next offset
* is where its packed representation ends and the index_nr can be used to
* get the object sha1 from the main index.
*/
struct revindex_entry {
unsigned int offset;
unsigned int nr;
};
struct pack_revindex {
struct packed_git *p;
unsigned long *revindex;
} *pack_revindex = NULL;
struct revindex_entry *revindex;
};
static struct pack_revindex *pack_revindex;
static int pack_revindex_hashsz;

/*
Expand Down Expand Up @@ -143,14 +152,9 @@ static void prepare_pack_ix(void)

static int cmp_offset(const void *a_, const void *b_)
{
unsigned long a = *(unsigned long *) a_;
unsigned long b = *(unsigned long *) b_;
if (a < b)
return -1;
else if (a == b)
return 0;
else
return 1;
const struct revindex_entry *a = a_;
const struct revindex_entry *b = b_;
return (a->offset < b->offset) ? -1 : (a->offset > b->offset) ? 1 : 0;
}

/*
Expand All @@ -163,25 +167,27 @@ static void prepare_pack_revindex(struct pack_revindex *rix)
int i;
void *index = p->index_base + 256;

rix->revindex = xmalloc(sizeof(unsigned long) * (num_ent + 1));
rix->revindex = xmalloc(sizeof(*rix->revindex) * (num_ent + 1));
for (i = 0; i < num_ent; i++) {
unsigned int hl = *((unsigned int *)((char *) index + 24*i));
rix->revindex[i] = ntohl(hl);
rix->revindex[i].offset = ntohl(hl);
rix->revindex[i].nr = i;
}
/* This knows the pack format -- the 20-byte trailer
* follows immediately after the last object data.
*/
rix->revindex[num_ent] = p->pack_size - 20;
qsort(rix->revindex, num_ent, sizeof(unsigned long), cmp_offset);
rix->revindex[num_ent].offset = p->pack_size - 20;
rix->revindex[num_ent].nr = -1;
qsort(rix->revindex, num_ent, sizeof(*rix->revindex), cmp_offset);
}

static unsigned long find_packed_object_size(struct packed_git *p,
unsigned long ofs)
static struct revindex_entry * find_packed_object(struct packed_git *p,
unsigned int ofs)
{
int num;
int lo, hi;
struct pack_revindex *rix;
unsigned long *revindex;
struct revindex_entry *revindex;
num = pack_revindex_ix(p);
if (num < 0)
die("internal error: pack revindex uninitialized");
Expand All @@ -193,17 +199,31 @@ static unsigned long find_packed_object_size(struct packed_git *p,
hi = num_packed_objects(p) + 1;
do {
int mi = (lo + hi) / 2;
if (revindex[mi] == ofs) {
return revindex[mi+1] - ofs;
if (revindex[mi].offset == ofs) {
return revindex + mi;
}
else if (ofs < revindex[mi])
else if (ofs < revindex[mi].offset)
hi = mi;
else
lo = mi + 1;
} while (lo < hi);
die("internal error: pack revindex corrupt");
}

static unsigned long find_packed_object_size(struct packed_git *p,
unsigned long ofs)
{
struct revindex_entry *entry = find_packed_object(p, ofs);
return entry[1].offset - ofs;
}

static unsigned char *find_packed_object_name(struct packed_git *p,
unsigned long ofs)
{
struct revindex_entry *entry = find_packed_object(p, ofs);
return (unsigned char *)(p->index_base + 256) + 24 * entry->nr + 4;
}

static void *delta_against(void *buf, unsigned long size, struct object_entry *entry)
{
unsigned long othersize, delta_size;
Expand Down Expand Up @@ -249,6 +269,10 @@ static int encode_header(enum object_type type, unsigned long size, unsigned cha
return n;
}

/*
* we are going to reuse the existing object data as is. make
* sure it is not corrupt.
*/
static int check_inflate(unsigned char *data, unsigned long len, unsigned long expect)
{
z_stream stream;
Expand Down Expand Up @@ -280,32 +304,6 @@ static int check_inflate(unsigned char *data, unsigned long len, unsigned long e
return st;
}

/*
* we are going to reuse the existing pack entry data. make
* sure it is not corrupt.
*/
static int revalidate_pack_entry(struct object_entry *entry, unsigned char *data, unsigned long len)
{
enum object_type type;
unsigned long size, used;

if (pack_to_stdout)
return 0;

/* the caller has already called use_packed_git() for us,
* so it is safe to access the pack data from mmapped location.
* make sure the entry inflates correctly.
*/
used = unpack_object_header_gently(data, len, &type, &size);
if (!used)
return -1;
if (type == OBJ_REF_DELTA)
used += 20; /* skip base object name */
data += used;
len -= used;
return check_inflate(data, len, entry->size);
}

static int revalidate_loose_object(struct object_entry *entry,
unsigned char *map,
unsigned long mapsize)
Expand Down Expand Up @@ -339,7 +337,7 @@ static unsigned long write_object(struct sha1file *f,
obj_type = entry->type;
if (! entry->in_pack)
to_reuse = 0; /* can't reuse what we don't have */
else if (obj_type == OBJ_REF_DELTA)
else if (obj_type == OBJ_REF_DELTA || obj_type == OBJ_OFS_DELTA)
to_reuse = 1; /* check_object() decided it for us */
else if (obj_type != entry->in_pack_type)
to_reuse = 0; /* pack has delta which is unusable */
Expand Down Expand Up @@ -415,18 +413,38 @@ static unsigned long write_object(struct sha1file *f,
}
else {
struct packed_git *p = entry->in_pack;
use_packed_git(p);

datalen = find_packed_object_size(p, entry->in_pack_offset);
buf = (char *) p->pack_base + entry->in_pack_offset;
if (entry->delta) {
obj_type = (allow_ofs_delta && entry->delta->offset) ?
OBJ_OFS_DELTA : OBJ_REF_DELTA;
reused_delta++;
}
hdrlen = encode_header(obj_type, entry->size, header);
sha1write(f, header, hdrlen);
if (obj_type == OBJ_OFS_DELTA) {
unsigned long ofs = entry->offset - entry->delta->offset;
unsigned pos = sizeof(header) - 1;
header[pos] = ofs & 127;
while (ofs >>= 7)
header[--pos] = 128 | (--ofs & 127);
sha1write(f, header + pos, sizeof(header) - pos);
hdrlen += sizeof(header) - pos;
} else if (obj_type == OBJ_REF_DELTA) {
sha1write(f, entry->delta->sha1, 20);
hdrlen += 20;
}

if (revalidate_pack_entry(entry, buf, datalen))
use_packed_git(p);
buf = (char *) p->pack_base
+ entry->in_pack_offset
+ entry->in_pack_header_size;
datalen = find_packed_object_size(p, entry->in_pack_offset)
- entry->in_pack_header_size;
//fprintf(stderr, "reusing %d at %d header %d size %d\n", obj_type, entry->in_pack_offset, entry->in_pack_header_size, datalen);
if (!pack_to_stdout && check_inflate(buf, datalen, entry->size))
die("corrupt delta in pack %s", sha1_to_hex(entry->sha1));
sha1write(f, buf, datalen);
unuse_packed_git(p);
hdrlen = 0; /* not really */
if (obj_type == OBJ_REF_DELTA)
reused_delta++;
reused++;
}
if (entry->delta)
Expand Down Expand Up @@ -914,26 +932,64 @@ static void check_object(struct object_entry *entry)
char type[20];

if (entry->in_pack && !entry->preferred_base) {
unsigned char base[20];
unsigned long size;
struct object_entry *base_entry;
struct packed_git *p = entry->in_pack;
unsigned long left = p->pack_size - entry->in_pack_offset;
unsigned long size, used;
unsigned char *buf;
struct object_entry *base_entry = NULL;

use_packed_git(p);
buf = p->pack_base;
buf += entry->in_pack_offset;

/* We want in_pack_type even if we do not reuse delta.
* There is no point not reusing non-delta representations.
*/
check_reuse_pack_delta(entry->in_pack,
entry->in_pack_offset,
base, &size,
&entry->in_pack_type);
used = unpack_object_header_gently(buf, left,
&entry->in_pack_type, &size);
if (!used || left - used <= 20)
die("corrupt pack for %s", sha1_to_hex(entry->sha1));

/* Check if it is delta, and the base is also an object
* we are going to pack. If so we will reuse the existing
* delta.
*/
if (!no_reuse_delta &&
entry->in_pack_type == OBJ_REF_DELTA &&
(base_entry = locate_object_entry(base)) &&
(!base_entry->preferred_base)) {
if (!no_reuse_delta) {
unsigned char c, *base_name;
unsigned long ofs;
/* there is at least 20 bytes left in the pack */
switch (entry->in_pack_type) {
case OBJ_REF_DELTA:
base_name = buf + used;
used += 20;
break;
case OBJ_OFS_DELTA:
c = buf[used++];
ofs = c & 127;
while (c & 128) {
ofs += 1;
if (!ofs || ofs & ~(~0UL >> 7))
die("delta base offset overflow in pack for %s",
sha1_to_hex(entry->sha1));
c = buf[used++];
ofs = (ofs << 7) + (c & 127);
}
if (ofs >= entry->in_pack_offset)
die("delta base offset out of bound for %s",
sha1_to_hex(entry->sha1));
ofs = entry->in_pack_offset - ofs;
base_name = find_packed_object_name(p, ofs);
break;
default:
base_name = NULL;
}
if (base_name)
base_entry = locate_object_entry(base_name);
}
unuse_packed_git(p);
entry->in_pack_header_size = used;

if (base_entry && !base_entry->preferred_base) {

/* Depth value does not matter - find_deltas()
* will never consider reused delta as the
Expand All @@ -942,9 +998,9 @@ static void check_object(struct object_entry *entry)
*/

/* uncompressed size of the delta data */
entry->size = entry->delta_size = size;
entry->size = size;
entry->delta = base_entry;
entry->type = OBJ_REF_DELTA;
entry->type = entry->in_pack_type;

entry->delta_sibling = base_entry->delta_child;
base_entry->delta_child = entry;
Expand Down Expand Up @@ -1500,7 +1556,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
continue;
}
if (!strcmp("--delta-base-offset", arg)) {
allow_ofs_delta = no_reuse_delta = 1;
allow_ofs_delta = 1;
continue;
}
if (!strcmp("--stdout", arg)) {
Expand Down
3 changes: 0 additions & 3 deletions pack.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,4 @@ struct pack_header {
};

extern int verify_pack(struct packed_git *, int);
extern int check_reuse_pack_delta(struct packed_git *, unsigned long,
unsigned char *, unsigned long *,
enum object_type *);
#endif
19 changes: 0 additions & 19 deletions sha1_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -1000,25 +1000,6 @@ static unsigned long unpack_object_header(struct packed_git *p, unsigned long of
return offset + used;
}

int check_reuse_pack_delta(struct packed_git *p, unsigned long offset,
unsigned char *base, unsigned long *sizep,
enum object_type *kindp)
{
unsigned long ptr;
int status = -1;

use_packed_git(p);
ptr = offset;
ptr = unpack_object_header(p, ptr, kindp, sizep);
if (*kindp != OBJ_REF_DELTA)
goto done;
hashcpy(base, (unsigned char *) p->pack_base + ptr);
status = 0;
done:
unuse_packed_git(p);
return status;
}

void packed_object_info_detail(struct packed_git *p,
unsigned long offset,
char *type,
Expand Down

0 comments on commit 780e6e7

Please sign in to comment.