Skip to content

Commit

Permalink
diff-delta: produce optimal pack data
Browse files Browse the repository at this point in the history
Indexing based on adler32 has a match precision based on the block size
(currently 16).  Lowering the block size would produce smaller deltas
but the indexing memory and computing cost increases significantly.

For optimal delta result the indexing block size should be 3 with an
increment of 1 (instead of 16 and 16).  With such low params the adler32
becomes a clear overhead increasing the time for git-repack by a factor
of 3.  And with such small blocks the adler 32 is not very useful as the
whole of the block bits can be used directly.

This patch replaces the adler32 with an open coded index value based on
3 characters directly.  This gives sufficient bits for hashing and
allows for optimal delta with reasonable CPU cycles.

The resulting packs are 6% smaller on average.  The increase in CPU time
is about 25%.  But this cost is now hidden by the delta reuse patch
while the saving on data transfers is always there.

Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
  • Loading branch information
Nicolas Pitre authored and Junio C Hamano committed Mar 2, 2006
1 parent 5343cf1 commit cc5c59a
Showing 1 changed file with 30 additions and 47 deletions.
77 changes: 30 additions & 47 deletions diff-delta.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,61 +20,46 @@

#include <stdlib.h>
#include <string.h>
#include <zlib.h>
#include "delta.h"


/* block size: min = 16, max = 64k, power of 2 */
#define BLK_SIZE 16

#define MIN(a, b) ((a) < (b) ? (a) : (b))

#define GR_PRIME 0x9e370001
#define HASH(v, shift) (((unsigned int)(v) * GR_PRIME) >> (shift))

struct index {
const unsigned char *ptr;
unsigned int val;
struct index *next;
};

static struct index ** delta_index(const unsigned char *buf,
unsigned long bufsize,
unsigned int *hash_shift)
{
unsigned int hsize, hshift, entries, blksize, i;
unsigned long hsize;
unsigned int hshift, i;
const unsigned char *data;
struct index *entry, **hash;
void *mem;

/* determine index hash size */
entries = (bufsize + BLK_SIZE - 1) / BLK_SIZE;
hsize = entries / 4;
for (i = 4; (1 << i) < hsize && i < 16; i++);
hsize = bufsize / 4;
for (i = 8; (1 << i) < hsize && i < 16; i++);
hsize = 1 << i;
hshift = 32 - i;
hshift = i - 8;
*hash_shift = hshift;

/* allocate lookup index */
mem = malloc(hsize * sizeof(*hash) + entries * sizeof(*entry));
mem = malloc(hsize * sizeof(*hash) + bufsize * sizeof(*entry));
if (!mem)
return NULL;
hash = mem;
entry = mem + hsize * sizeof(*hash);
memset(hash, 0, hsize * sizeof(*hash));

/* then populate it */
data = buf + entries * BLK_SIZE - BLK_SIZE;
blksize = bufsize - (data - buf);
while (data >= buf) {
unsigned int val = adler32(0, data, blksize);
i = HASH(val, hshift);
entry->ptr = data;
entry->val = val;
data = buf + bufsize - 2;
while (data > buf) {
entry->ptr = --data;
i = data[0] ^ data[1] ^ (data[2] << hshift);
entry->next = hash[i];
hash[i] = entry++;
blksize = BLK_SIZE;
data -= BLK_SIZE;
}

return hash;
Expand Down Expand Up @@ -141,29 +126,27 @@ void *diff_delta(void *from_buf, unsigned long from_size,

while (data < top) {
unsigned int moff = 0, msize = 0;
unsigned int blksize = MIN(top - data, BLK_SIZE);
unsigned int val = adler32(0, data, blksize);
i = HASH(val, hash_shift);
for (entry = hash[i]; entry; entry = entry->next) {
const unsigned char *ref = entry->ptr;
const unsigned char *src = data;
unsigned int ref_size = ref_top - ref;
if (entry->val != val)
continue;
if (ref_size > top - src)
ref_size = top - src;
while (ref_size && *src++ == *ref) {
ref++;
ref_size--;
}
ref_size = ref - entry->ptr;
if (ref_size > msize) {
/* this is our best match so far */
moff = entry->ptr - ref_data;
msize = ref_size;
if (msize >= 0x10000) {
msize = 0x10000;
if (data + 2 < top) {
i = data[0] ^ data[1] ^ (data[2] << hash_shift);
for (entry = hash[i]; entry; entry = entry->next) {
const unsigned char *ref = entry->ptr;
const unsigned char *src = data;
unsigned int ref_size = ref_top - ref;
if (ref_size > top - src)
ref_size = top - src;
if (ref_size > 0x10000)
ref_size = 0x10000;
if (ref_size <= msize)
break;
while (ref_size && *src++ == *ref) {
ref++;
ref_size--;
}
ref_size = ref - entry->ptr;
if (msize < ref - entry->ptr) {
/* this is our best match so far */
msize = ref - entry->ptr;
moff = entry->ptr - ref_data;
}
}
}
Expand Down

0 comments on commit cc5c59a

Please sign in to comment.