-
Notifications
You must be signed in to change notification settings - Fork 0
Permalink
git/diff-delta.c
Newer
100644
489 lines (438 sloc)
15.4 KB
1
/*
2
* diff-delta.c: generate a delta between two buffers
3
*
4
* This code was greatly inspired by parts of LibXDiff from Davide Libenzi
5
* http://www.xmailserver.org/xdiff-lib.html
7
* Rewritten for GIT by Nicolas Pitre <nico@fluxnic.net>, (C) 2005-2007
9
* This code is free software; you can redistribute it and/or modify
10
* it under the terms of the GNU General Public License version 2 as
11
* published by the Free Software Foundation.
14
#include "git-compat-util.h"
15
#include "delta.h"
17
/* maximum hash entry list for the same hash bucket */
18
#define HASH_LIMIT 64
20
#define RABIN_SHIFT 23
21
#define RABIN_WINDOW 16
22
23
static const unsigned int T[256] = {
24
0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
25
0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
26
0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
27
0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
28
0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
29
0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
30
0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
31
0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
32
0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
33
0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
34
0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
35
0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
36
0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
37
0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
38
0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
39
0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
40
0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
41
0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
42
0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
43
0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
44
0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
45
0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
46
0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
47
0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
48
0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
49
0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
50
0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
51
0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
52
0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
53
0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
54
0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
55
0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
56
0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
57
0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
58
0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
59
0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
60
0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
61
0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
62
0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
63
0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
64
0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
65
0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
66
0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
67
};
68
69
static const unsigned int U[256] = {
70
0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
71
0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
72
0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
73
0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
74
0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
75
0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
76
0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
77
0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
78
0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
79
0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
80
0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
81
0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
82
0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
83
0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
84
0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
85
0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
86
0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
87
0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
88
0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
89
0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
90
0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
91
0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
92
0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
93
0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
94
0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
95
0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
96
0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
97
0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
98
0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
99
0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
100
0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
101
0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
102
0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
103
0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
104
0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
105
0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
106
0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
107
0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
108
0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
109
0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
110
0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
111
0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
112
0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
113
};
115
struct index_entry {
116
const unsigned char *ptr;
117
unsigned int val;
118
};
119
120
struct unpacked_index_entry {
121
struct index_entry entry;
122
struct unpacked_index_entry *next;
125
struct delta_index {
126
unsigned long memsize;
127
const void *src_buf;
128
unsigned long src_size;
129
unsigned int hash_mask;
130
struct index_entry *hash[FLEX_ARRAY];
131
};
132
133
struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
135
unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
136
const unsigned char *data, *buffer = buf;
137
struct delta_index *index;
138
struct unpacked_index_entry *entry, **hash;
139
struct index_entry *packed_entry, **packed_hash;
140
void *mem;
141
unsigned long memsize;
143
if (!buf || !bufsize)
144
return NULL;
145
146
/* Determine index hash size. Note that indexing skips the
147
first byte to allow for optimizing the Rabin's polynomial
148
initialization in create_delta(). */
149
entries = (bufsize - 1) / RABIN_WINDOW;
150
if (bufsize >= 0xffffffffUL) {
151
/*
152
* Current delta format can't encode offsets into
153
* reference buffer with more than 32 bits.
154
*/
155
entries = 0xfffffffeU / RABIN_WINDOW;
156
}
157
hsize = entries / 4;
158
for (i = 4; (1u << i) < hsize; i++);
159
hsize = 1 << i;
160
hmask = hsize - 1;
161
162
/* allocate lookup index */
163
memsize = sizeof(*hash) * hsize +
164
sizeof(*entry) * entries;
165
mem = malloc(memsize);
166
if (!mem)
167
return NULL;
168
hash = mem;
169
mem = hash + hsize;
170
entry = mem;
171
172
memset(hash, 0, hsize * sizeof(*hash));
173
174
/* allocate an array to count hash entries */
175
hash_count = calloc(hsize, sizeof(*hash_count));
176
if (!hash_count) {
177
free(hash);
178
return NULL;
179
}
180
181
/* then populate the index */
182
prev_val = ~0;
183
for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
184
data >= buffer;
185
data -= RABIN_WINDOW) {
186
unsigned int val = 0;
187
for (i = 1; i <= RABIN_WINDOW; i++)
188
val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
189
if (val == prev_val) {
190
/* keep the lowest of consecutive identical blocks */
191
entry[-1].entry.ptr = data + RABIN_WINDOW;
192
--entries;
193
} else {
194
prev_val = val;
195
i = val & hmask;
196
entry->entry.ptr = data + RABIN_WINDOW;
197
entry->entry.val = val;
198
entry->next = hash[i];
199
hash[i] = entry++;
200
hash_count[i]++;
201
}
202
}
204
/*
205
* Determine a limit on the number of entries in the same hash
206
* bucket. This guards us against pathological data sets causing
207
* really bad hash distribution with most entries in the same hash
208
* bucket that would bring us to O(m*n) computing costs (m and n
209
* corresponding to reference and target buffer sizes).
210
*
211
* Make sure none of the hash buckets has more entries than
212
* we're willing to test. Otherwise we cull the entry list
213
* uniformly to still preserve a good repartition across
214
* the reference buffer.
215
*/
216
for (i = 0; i < hsize; i++) {
217
int acc;
218
219
if (hash_count[i] <= HASH_LIMIT)
220
continue;
221
222
/* We leave exactly HASH_LIMIT entries in the bucket */
223
entries -= hash_count[i] - HASH_LIMIT;
224
225
entry = hash[i];
226
acc = 0;
227
228
/*
229
* Assume that this loop is gone through exactly
230
* HASH_LIMIT times and is entered and left with
231
* acc==0. So the first statement in the loop
232
* contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT
233
* to the accumulator, and the inner loop consequently
234
* is run (hash_count[i]-HASH_LIMIT) times, removing
235
* one element from the list each time. Since acc
236
* balances out to 0 at the final run, the inner loop
237
* body can't be left with entry==NULL. So we indeed
238
* encounter entry==NULL in the outer loop only.
239
*/
240
do {
241
acc += hash_count[i] - HASH_LIMIT;
242
if (acc > 0) {
243
struct unpacked_index_entry *keep = entry;
244
do {
245
entry = entry->next;
246
acc -= HASH_LIMIT;
247
} while (acc > 0);
248
keep->next = entry->next;
249
}
250
entry = entry->next;
251
} while (entry);
252
}
253
free(hash_count);
254
255
/*
256
* Now create the packed index in array form
257
* rather than linked lists.
258
*/
259
memsize = sizeof(*index)
260
+ sizeof(*packed_hash) * (hsize+1)
261
+ sizeof(*packed_entry) * entries;
262
mem = malloc(memsize);
263
if (!mem) {
264
free(hash);
265
return NULL;
266
}
267
268
index = mem;
269
index->memsize = memsize;
270
index->src_buf = buf;
271
index->src_size = bufsize;
272
index->hash_mask = hmask;
273
274
mem = index->hash;
275
packed_hash = mem;
276
mem = packed_hash + (hsize+1);
277
packed_entry = mem;
278
279
for (i = 0; i < hsize; i++) {
280
/*
281
* Coalesce all entries belonging to one linked list
282
* into consecutive array entries.
283
*/
284
packed_hash[i] = packed_entry;
285
for (entry = hash[i]; entry; entry = entry->next)
286
*packed_entry++ = entry->entry;
287
}
288
289
/* Sentinel value to indicate the length of the last hash bucket */
290
packed_hash[hsize] = packed_entry;
291
292
assert(packed_entry - (struct index_entry *)mem == entries);
293
free(hash);
294
295
return index;
296
}
297
298
void free_delta_index(struct delta_index *index)
299
{
300
free(index);
303
unsigned long sizeof_delta_index(struct delta_index *index)
304
{
305
if (index)
306
return index->memsize;
307
else
308
return 0;
309
}
310
311
/*
312
* The maximum size for any opcode sequence, including the initial header
313
* plus Rabin window plus biggest copy.
314
*/
315
#define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
316
317
void *
318
create_delta(const struct delta_index *index,
319
const void *trg_buf, unsigned long trg_size,
320
unsigned long *delta_size, unsigned long max_size)
322
unsigned int i, outpos, outsize, moff, msize, val;
324
const unsigned char *ref_data, *ref_top, *data, *top;
325
unsigned char *out;
327
if (!trg_buf || !trg_size)
328
return NULL;
329
330
outpos = 0;
331
outsize = 8192;
332
if (max_size && outsize >= max_size)
333
outsize = max_size + MAX_OP_SIZE + 1;
336
return NULL;
337
338
/* store reference buffer size */
339
i = index->src_size;
340
while (i >= 0x80) {
341
out[outpos++] = i | 0x80;
342
i >>= 7;
344
out[outpos++] = i;
345
346
/* store target buffer size */
347
i = trg_size;
348
while (i >= 0x80) {
349
out[outpos++] = i | 0x80;
350
i >>= 7;
352
out[outpos++] = i;
354
ref_data = index->src_buf;
355
ref_top = ref_data + index->src_size;
356
data = trg_buf;
357
top = (const unsigned char *) trg_buf + trg_size;
358
359
outpos++;
360
val = 0;
361
for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
362
out[outpos++] = *data;
363
val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
364
}
365
inscnt = i;
367
moff = 0;
368
msize = 0;
369
while (data < top) {
370
if (msize < 4096) {
371
struct index_entry *entry;
372
val ^= U[data[-RABIN_WINDOW]];
373
val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
374
i = val & index->hash_mask;
375
for (entry = index->hash[i]; entry < index->hash[i+1]; entry++) {
376
const unsigned char *ref = entry->ptr;
377
const unsigned char *src = data;
378
unsigned int ref_size = ref_top - ref;
379
if (entry->val != val)
380
continue;
381
if (ref_size > top - src)
382
ref_size = top - src;
383
if (ref_size <= msize)
384
break;
385
while (ref_size-- && *src++ == *ref)
386
ref++;
387
if (msize < ref - entry->ptr) {
388
/* this is our best match so far */
389
msize = ref - entry->ptr;
390
moff = entry->ptr - ref_data;
391
if (msize >= 4096) /* good enough */
392
break;
393
}
397
if (msize < 4) {
398
if (!inscnt)
399
outpos++;
400
out[outpos++] = *data++;
401
inscnt++;
402
if (inscnt == 0x7f) {
403
out[outpos - inscnt - 1] = inscnt;
404
inscnt = 0;
405
}
406
msize = 0;
408
unsigned int left;
409
unsigned char *op;
410
412
while (moff && ref_data[moff-1] == data[-1]) {
413
/* we can match one byte back */
414
msize++;
415
moff--;
416
data--;
417
outpos--;
418
if (--inscnt)
419
continue;
420
outpos--; /* remove count slot */
421
inscnt--; /* make it -1 */
422
break;
423
}
424
out[outpos - inscnt - 1] = inscnt;
425
inscnt = 0;
426
}
427
428
/* A copy op is currently limited to 64KB (pack v2) */
429
left = (msize < 0x10000) ? 0 : (msize - 0x10000);
430
msize -= left;
431
432
op = out + outpos++;
435
if (moff & 0x000000ff)
436
out[outpos++] = moff >> 0, i |= 0x01;
437
if (moff & 0x0000ff00)
438
out[outpos++] = moff >> 8, i |= 0x02;
439
if (moff & 0x00ff0000)
440
out[outpos++] = moff >> 16, i |= 0x04;
441
if (moff & 0xff000000)
442
out[outpos++] = moff >> 24, i |= 0x08;
444
if (msize & 0x00ff)
445
out[outpos++] = msize >> 0, i |= 0x10;
446
if (msize & 0xff00)
447
out[outpos++] = msize >> 8, i |= 0x20;
449
*op = i;
450
451
data += msize;
452
moff += msize;
453
msize = left;
454
455
if (msize < 4096) {
456
int j;
457
val = 0;
458
for (j = -RABIN_WINDOW; j < 0; j++)
459
val = ((val << 8) | data[j])
460
^ T[val >> RABIN_SHIFT];
461
}
464
if (outpos >= outsize - MAX_OP_SIZE) {
465
void *tmp = out;
466
outsize = outsize * 3 / 2;
467
if (max_size && outsize >= max_size)
468
outsize = max_size + MAX_OP_SIZE + 1;
469
if (max_size && outpos > max_size)
470
break;
471
out = realloc(out, outsize);
472
if (!out) {
473
free(tmp);
474
return NULL;
475
}
476
}
477
}
478
479
if (inscnt)
480
out[outpos - inscnt - 1] = inscnt;
481
482
if (max_size && outpos > max_size) {
483
free(out);
484
return NULL;
485
}
486
487
*delta_size = outpos;
488
return out;
489
}