monotone

monotone Mtn Source Tree

Root/xdelta.cc

1// copyright (C) 2002, 2003 graydon hoare <graydon@pobox.com>
2// all rights reserved.
3// licensed to the public under the terms of the GNU GPL (>= 2)
4// see the file COPYING for details
5
6// this file implements the xdelta algorithm, produces and consumes simple
7// copy/insert binary patches with the following structure:
8//
9// patch := (copy|insert)*
10// copy := 'C', ' ', pos=uint, ' ', len=uint, '\n'
11// insert := 'I', ' ', len=uint, '\n', payload=(byte x len), '\n'
12//
13// this means you can generally read the patch if you print it on stdout,
14// when it applies to text, but it can also apply to any binary, so the
15// hunk payload itself might look awful. I made it semi-ascii only to make
16// it slightly easier to debug; you really shouldn't read it normally. it's
17// a strict format with minimal checking, so it must be transport-encoded
18// to avoid whitespace munging.
19//
20// if you want to *read* a patch, you will like unidiff format much better.
21// take a look in diff_patch.(cc|hh) for a nice interface to that.
22
23#include <ext/hash_map>
24#include <algorithm>
25#include <vector>
26#include <set>
27#include <string>
28#include <sstream>
29
30#include <boost/shared_ptr.hpp>
31#include <boost/version.hpp>
32
33#include "adler32.hh"
34#include "numeric_vocab.hh"
35#include "sanity.hh"
36#include "xdelta.hh"
37
38using namespace std;
39using namespace __gnu_cxx;
40
41struct identity {size_t operator()(u32 const & v) const { return static_cast<size_t>(v);}};
42typedef pair<string::size_type, string::size_type> extent;
43typedef hash_map<u32, extent, identity> match_table;
44
45struct
46insn
47{
48 insn(char c) : code(insert), pos(0), len(0), payload("") { payload += c; }
49 insn(string s) : code(insert), pos(0), len(s.size()), payload(s) {}
50 insn(u32 p, u32 l) : code(copy), pos(p), len(l) {}
51 enum { insert, copy } code;
52 u32 pos, len;
53 string payload;
54};
55
56ostream & operator<<(ostream & ost, insn const & i)
57{
58 if (i.code == insn::insert)
59 {
60 ost << "I " << i.payload.size() << '\n';
61 ost.write(i.payload.data(), i.payload.size());
62 ost << '\n';
63 }
64 else
65 ost << "C " << i.pos << ' ' << i.len << '\n';
66 return ost;
67}
68
69
70static inline void
71init_match_table(string const & a,
72 string::size_type blocksz,
73 match_table & tab)
74{
75 string::size_type sz = a.size();
76 for (string::size_type i = 0; i < sz; i += blocksz)
77 {
78 string::size_type step = ((i + blocksz) >= sz) ? (sz - i) : blocksz;
79 u32 sum = adler32(reinterpret_cast<u8 const *>(a.data() + i), step).sum();
80 if (tab.find(sum) == tab.end())
81 tab.insert(make_pair(sum, make_pair(i, step)));
82 }
83 return;
84}
85
86
87static inline bool
88find_match(match_table const & matches,
89 vector<insn> & delta,
90 adler32 const & rolling,
91 string const & a,
92 string const & b,
93 string::size_type bpos,
94 string::size_type & apos,
95 string::size_type & alen,
96 string::size_type & badvance)
97{
98 u32 sum = rolling.sum();
99 match_table::const_iterator e = matches.find(sum);
100
101 // maybe we haven't seen it at all?
102 if (e == matches.end())
103 return false;
104
105 string::size_type tpos = e->second.first;
106 string::size_type tlen = e->second.second;
107
108 I(tpos < a.size());
109 I(tpos + tlen <= a.size());
110
111 // maybe it's a false match?
112 if (memcmp(a.data() + tpos, b.data() + bpos, tlen) != 0)
113 return false;
114
115 apos = tpos;
116 alen = tlen;
117 badvance = tlen;
118
119 // see if we can extend our match forwards
120 while((apos + alen >= 0)
121&& (bpos + badvance >= 0)
122&& (apos + alen < a.size())
123 && (bpos + badvance < b.size())
124 && (a[apos + alen] == b[bpos + badvance]))
125 {
126 ++alen;
127 ++badvance;
128 }
129
130 // see if we can extend backwards into a previous insert hunk
131 if (! delta.empty() && delta.back().code == insn::insert)
132 {
133 while(apos > 0
134 && bpos > 0
135 && a[apos - 1] == b[bpos - 1]
136 && !delta.back().payload.empty())
137{
138 I(a[apos - 1] == *(delta.back().payload.rbegin()));
139 I(delta.back().payload.size() > 0);
140 delta.back().payload.resize(delta.back().payload.size() - 1);
141 --apos;
142 --bpos;
143 ++alen;
144 // the significant thing here is that we do not move
145 // 'badvance' forward, just alen.
146}
147
148 // if we've extended back to consume the *entire* insert,
149 // let's do away with it altogether.
150 if (delta.back().payload.empty())
151{
152 delta.pop_back();
153}
154 }
155
156 I(memcmp(a.data() + apos, b.data() + bpos, alen) == 0);
157 return true;
158}
159
160static inline void
161insert_insn(vector<insn> & delta, char c)
162{
163 if (delta.empty() || delta.back().code == insn::copy)
164 delta.push_back(insn(c));
165 else
166 delta.back().payload += c;
167}
168
169
170static inline void
171copy_insn(vector<insn> & delta, string::size_type i, string::size_type matchlen)
172{
173 delta.push_back(insn(i, matchlen));
174}
175
176
177static void
178compute_delta_insns(string const & a,
179 string const & b,
180 vector<insn> & delta)
181{
182 string::size_type blocksz = 64;
183 match_table matches ((a.size() / blocksz) * 2);
184 init_match_table(a, blocksz, matches);
185
186 if (b.size() < blocksz)
187 {
188 for (string::size_type i = 0; i < b.size(); ++i)
189insert_insn(delta, b[i]);
190 return;
191 }
192
193 adler32 rolling(reinterpret_cast<u8 const *>(b.data()), blocksz);
194
195 for (string::size_type
196 sz = b.size(),
197 lo = 0,
198 hi = blocksz;
199 lo < sz; )
200 {
201 string::size_type apos = 0, alen = 1, badvance = 1;
202
203 bool found_match = find_match(matches, delta, rolling, a, b, lo, apos, alen, badvance);
204
205 if (found_match)
206 {
207 copy_insn(delta, apos, alen);
208 }
209 else
210{
211 I(apos + alen <= a.size());
212 I(alen == 1);
213 I(alen < blocksz);
214 I(lo >= 0);
215 I(lo < b.size());
216 insert_insn(delta, b[lo]);
217}
218
219 string::size_type next = lo;
220 for (; next < lo + badvance; ++next)
221{
222 I(next >= 0);
223 I(next < b.size());
224 rolling.out(static_cast<u8>(b[next]));
225 if (next + blocksz < b.size())
226 rolling.in(static_cast<u8>(b[next + blocksz]));
227}
228 lo = next;
229 hi = lo + blocksz;
230 }
231}
232
233
234// specialized form for manifest maps, which
235// are sorted, so have far more structure
236
237static void
238flush_copy(ostringstream & oss, u32 & pos, u32 & len)
239{
240 if (len != 0)
241 {
242 // flush any copy which is going on
243 oss << insn(pos, len);
244 pos = pos + len;
245 len = 0;
246 }
247}
248
249void
250compute_delta(manifest_map const & a,
251 manifest_map const & b,
252 string & delta)
253{
254 delta.clear();
255 ostringstream oss;
256
257 manifest_map::const_iterator i = a.begin();
258 manifest_map::const_iterator j = b.begin();
259
260 u32 pos = 0;
261 u32 len = 0;
262 while (j != b.end())
263 {
264 size_t isz = 0;
265
266 if (i != a.end())
267 isz = i->first().size() + 2 + i->second.inner()().size() + 1;
268
269 if (i != a.end() && i->first == j->first)
270{
271 if (i->second == j->second)
272 {
273 // this line was copied
274 len += isz;
275 }
276 else
277 {
278 // this line was changed, but the *entry* remained, so we
279 // treat it as a simultaneous delete + insert. that means
280 // advance pos over what used to be here, set len to 0, and
281 // copy the new data.
282 flush_copy(oss, pos, len);
283 pos += isz;
284 ostringstream ss;
285 ss << *j;
286 oss << insn(ss.str());
287 }
288 ++i; ++j;
289}
290 else
291{
292
293 flush_copy(oss, pos, len);
294
295 if (i != a.end() && i->first < j->first)
296 {
297 // this line was deleted
298 ++i;
299 pos += isz;
300 }
301
302 else
303 {
304 // this line was added
305 ostringstream ss;
306 ss << *j;
307 oss << insn(ss.str());
308 ++j;
309 }
310}
311 }
312
313 flush_copy(oss,pos,len);
314 delta = oss.str();
315}
316
317
318void
319compute_delta(string const & a,
320 string const & b,
321 string & delta)
322{
323 vector<insn> delta_insns;
324
325 // FIXME: in theory you can do empty files and empty deltas; write some
326 // tests to be sure you're doing it right, and in any case implement the
327 // necessary logic directly in this function, don't bother doing an
328 // xdelta. several places of the xdelta code prefer assertions which are
329 // only true with non-empty chunks anyways.
330
331 if (a == b)
332 {
333 delta.clear();
334 return;
335 }
336
337 if (a.size() == 0 && b.size() != 0)
338 delta_insns.push_back(insn(b));
339 else if (a.size() != 0 && b.size() == 0)
340 delta_insns.push_back(insn(0, 0));
341 else
342 {
343 I(a.size() > 0);
344 I(b.size() > 0);
345
346 L(F("computing binary delta instructions\n"));
347 compute_delta_insns(a, b, delta_insns);
348 L(F("computed binary delta instructions\n"));
349 }
350
351 ostringstream oss;
352 for (vector<insn>::const_iterator i = delta_insns.begin();
353 i != delta_insns.end(); ++i)
354 {
355 oss << *i;
356 }
357 delta = oss.str();
358}
359
360struct
361simple_applicator
362 : public delta_applicator
363{
364 string src;
365 string dst;
366 virtual ~simple_applicator () {}
367 virtual void begin(string const & base)
368 {
369 src = base;
370 dst.clear();
371 }
372 virtual void next()
373 {
374 swap(src,dst);
375 dst.clear();
376 }
377 virtual void finish(string & out)
378 {
379 out = src;
380 }
381
382 virtual void copy(string::size_type pos, string::size_type len)
383 {
384 dst.append(src, pos, len);
385 }
386 virtual void insert(string const & str)
387 {
388 dst.append(str);
389 }
390};
391
392void
393apply_delta(boost::shared_ptr<delta_applicator> da,
394 std::string const & delta)
395{
396 istringstream del(delta);
397 for (char c = del.get(); c == 'I' || c == 'C'; c = del.get())
398 {
399 I(del.good());
400 if (c == 'I')
401{
402 string::size_type len = string::npos;
403 del >> len;
404 I(del.good());
405 I(len != string::npos);
406 string tmp;
407 tmp.reserve(len);
408 I(del.get(c).good());
409 I(c == '\n');
410 while(len--)
411 {
412 I(del.get(c).good());
413 tmp += c;
414 }
415 I(del.get(c).good());
416 I(c == '\n');
417 da->insert(tmp);
418}
419 else
420{
421 string::size_type pos = string::npos, len = string::npos;
422 del >> pos >> len;
423 I(del.good());
424 I(len != string::npos);
425 I(del.get(c).good());
426 I(c == '\n');
427 da->copy(pos, len);
428}
429 }
430 I(del.eof());
431}
432
433void
434apply_delta(string const & a,
435 string const & delta,
436 string & b)
437{
438 boost::shared_ptr<delta_applicator> da(new simple_applicator());
439 da->begin(a);
440 apply_delta(da, delta);
441 da->next();
442 da->finish(b);
443}
444
445struct
446size_accumulating_delta_applicator :
447 public delta_applicator
448{
449 u64 & sz;
450 size_accumulating_delta_applicator(u64 & s) : sz(s) {}
451 virtual void begin(std::string const & base) {}
452 virtual void next() {}
453 virtual void finish(std::string & out) {}
454
455 virtual void copy(std::string::size_type pos,
456 std::string::size_type len)
457 { sz += len; }
458 virtual void insert(std::string const & str)
459 { sz += str.size(); }
460};
461
462
463u64
464measure_delta_target_size(std::string const & delta)
465{
466 u64 sz = 0;
467 boost::shared_ptr<delta_applicator> da(new size_accumulating_delta_applicator(sz));
468 apply_delta(da, delta);
469 return sz;
470}
471
472
473
474// piecewise-applicator stuff follows (warning: ugly)
475
476typedef string::size_type version_pos;
477typedef string::size_type piece_pos;
478typedef string::size_type length;
479typedef unsigned long piece_id;
480
481struct chunk
482{
483 length len; // how many chars in this chunk
484 piece_id piece; // which piece to take chars from
485 version_pos vpos; // position in the current version
486 piece_pos ppos; // position in piece to take chars from
487
488 chunk (length ln, piece_id p, version_pos vp, piece_pos pp) :
489 len(ln), piece(p), vpos(vp), ppos(pp)
490 {}
491
492 chunk subchunk(version_pos vp,
493 length ln,
494 length offset) const
495 {
496 I(ppos + offset >= ppos);
497 I(ppos + offset + ln <= ppos + len);
498
499 chunk c = *this;
500 c.len = ln;
501 c.vpos = vp;
502 c.ppos += offset;
503 return c;
504 }
505};
506
507typedef vector<chunk> version_spec;
508
509struct
510piece_table
511{
512 vector<string> pieces;
513
514 void clear()
515 {
516 pieces.clear();
517 }
518
519 piece_id insert(string const & p)
520 {
521 pieces.push_back(p);
522 return pieces.size() - 1;
523 }
524
525 void append(string & targ, piece_id p, piece_pos pp, length ln)
526 {
527 I(p >= 0);
528 I(p < pieces.size());
529 targ.append(pieces[p], pp, ln);
530 }
531
532 void build(version_spec const & in, string & out)
533 {
534 out.clear();
535 for (version_spec::const_iterator i = in.begin();
536 i != in.end(); ++i)
537 {
538append(out, i->piece, i->ppos, i->len);
539 }
540 }
541};
542
543static void
544apply_insert(piece_table & p, version_spec & out, string const & str)
545{
546 piece_id piece = p.insert(str);
547 version_pos vpos = 0;
548 if (!out.empty())
549 vpos = out.back().vpos + out.back().len;
550 out.push_back(chunk(str.size(), piece, vpos, 0));
551}
552
553struct
554chunk_less_than
555{
556 bool operator()(chunk const & ch, version_pos vp) const
557 {
558 // nb: ch.vpos + ch.len is the 0-based index of the first element *not*
559 // included in ch; thus we measure against ch.len - 1.
560 I(ch.len > 0);
561 return (ch.vpos + ch.len - 1) < vp;
562 }
563};
564
565static void
566apply_copy(version_spec const & in, version_spec & out,
567 version_pos src_vpos, length src_len)
568{
569 //
570 // this is a little tricky because there's *4* different extents
571 // we're talking about at any time:
572 //
573 //
574 // - the 'src' extent, which is 1 or more chunks in the previous version.
575 // its address in the previous version is given in terms of a version_pos
576 // + length value.
577 //
578 // - the 'dst' extent, which is 1 chunk in the new version. its address
579 // in the new version is given in terms of a version_pos + length value.
580 //
581 // - the portion of a piece referenced by the src extent, which we're
582 // selecting a subset of. this is given in terms of a piece_pos + length
583 // value, against a particular piece.
584 //
585 // - the portion of a piece going into the dst extent, which is the
586 // selected subset. this is given in terms of a piece_pos + length value,
587 // against a particular piece.
588 //
589
590 version_pos src_final = src_vpos + src_len;
591 version_pos dst_vpos = 0;
592 if (!out.empty())
593 dst_vpos = out.back().vpos + out.back().len;
594 version_pos dst_final = dst_vpos + src_len;
595 version_spec::const_iterator lo = lower_bound(in.begin(),
596in.end(),
597src_vpos,
598chunk_less_than());
599 for ( ; src_len > 0; ++lo)
600 {
601 I(lo != in.end());
602
603 // now we are iterating over src extents which cover the current dst
604 // extent. we found these src extents by calling lower_bound,
605 // above. note, this entire function is called once per dst extent.
606 //
607 // there's two possible arrangements of spanning src extents:
608 //
609 // [ src extent 1 ][ src extent 2 ]
610 // [ ... dst extent .. ]
611 //
612 // or
613 //
614 // [ ... src extent ... ]
615 // [ ... dst extent .. ]
616 //
617 // the following arithmetic should bite off the lowest chunk of
618 // either of these two scenarios, append it to the dst version
619 // vector, and advance the 2 pos' and 1 len value appropriately.
620
621 version_pos src_end = min ((src_vpos + src_len), (lo->vpos + lo->len));
622 version_pos offset = src_vpos - lo->vpos;
623 length seglen = src_end - src_vpos;
624
625 I(seglen > 0);
626 I(src_vpos >= lo->vpos);
627 I(src_vpos + seglen <= lo->vpos + lo->len);
628
629 out.push_back(lo->subchunk(dst_vpos, seglen, offset));
630 src_vpos += seglen;
631 dst_vpos += seglen;
632 src_len -= seglen;
633 I(src_len >= 0);
634 I(out.back().vpos + out.back().len == dst_vpos);
635 }
636
637 I(src_vpos == src_final);
638 I(dst_vpos == dst_final);
639 I(src_len == 0);
640}
641
642
643struct
644piecewise_applicator
645 : public delta_applicator
646{
647 piece_table pt;
648 boost::shared_ptr<version_spec> src;
649 boost::shared_ptr<version_spec> dst;
650
651 piecewise_applicator() :
652 src(new version_spec()),
653 dst(new version_spec())
654 {}
655
656 virtual ~piecewise_applicator () {}
657
658 virtual void begin(string const & base)
659 {
660 pt.clear();
661 piece_id piece = pt.insert(base);
662 src->clear();
663 src->push_back(chunk(base.size(), piece, 0, 0));
664 dst->clear();
665 }
666
667 virtual void next()
668 {
669 swap(src,dst);
670 dst->clear();
671 }
672
673 virtual void finish(string & out)
674 {
675 out.clear();
676 pt.build(*src, out);
677 }
678
679 virtual void copy(string::size_type pos, string::size_type len)
680 {
681 apply_copy(*src, *dst, pos, len);
682 }
683
684 virtual void insert(string const & str)
685 {
686 apply_insert(pt, *dst, str);
687 }
688};
689
690
691// these just hide our implementation types from outside
692
693boost::shared_ptr<delta_applicator>
694new_simple_applicator()
695{
696 return boost::shared_ptr<delta_applicator>(new simple_applicator());
697}
698
699boost::shared_ptr<delta_applicator>
700new_piecewise_applicator()
701{
702 return boost::shared_ptr<delta_applicator>(new piecewise_applicator());
703}
704
705#ifdef BUILD_UNIT_TESTS
706
707#include "unit_tests.hh"
708#ifdef WIN32
709#define BOOST_NO_STDC_NAMESPACE
710#endif
711#include <boost/random.hpp>
712
713boost::mt19937 xdelta_prng;
714
715#if BOOST_VERSION >= 103100
716boost::uniform_smallint<char> xdelta_chargen('a', 'z');
717boost::uniform_smallint<size_t> xdelta_sizegen(1024, 65536);
718boost::uniform_smallint<size_t> xdelta_editgen(3, 10);
719boost::uniform_smallint<size_t> xdelta_lengen(1, 256);
720#define PRNG xdelta_prng
721#else
722boost::uniform_smallint<boost::mt19937, char> xdelta_chargen(xdelta_prng, 'a', 'z');
723boost::uniform_smallint<boost::mt19937, size_t> xdelta_sizegen(xdelta_prng, 1024, 65536);
724boost::uniform_smallint<boost::mt19937, size_t> xdelta_editgen(xdelta_prng, 3, 10);
725boost::uniform_smallint<boost::mt19937, size_t> xdelta_lengen(xdelta_prng, 1, 256);
726#define PRNG
727#endif
728
729void
730xdelta_random_string(string & str)
731{
732 size_t sz = xdelta_sizegen(PRNG);
733 str.clear();
734 str.reserve(sz);
735 while(sz-- > 0)
736 {
737 str += xdelta_chargen(PRNG);
738 }
739}
740
741void
742xdelta_randomly_insert(string & str)
743{
744 size_t nedits = xdelta_editgen(PRNG);
745 while (nedits > 0)
746 {
747 size_t pos = xdelta_sizegen(PRNG) % str.size();
748 size_t len = xdelta_lengen(PRNG);
749 if (pos+len >= str.size())
750continue;
751 string tmp;
752 tmp.reserve(len);
753 for (size_t i = 0; i < len; ++i)
754tmp += xdelta_chargen(PRNG);
755str.insert(pos, tmp);
756 nedits--;
757 }
758}
759
760void
761xdelta_randomly_change(string & str)
762{
763 size_t nedits = xdelta_editgen(PRNG);
764 while (nedits > 0)
765 {
766 size_t pos = xdelta_sizegen(PRNG) % str.size();
767 size_t len = xdelta_lengen(PRNG);
768 if (pos+len >= str.size())
769continue;
770 for (size_t i = 0; i < len; ++i)
771str[pos+i] = xdelta_chargen(PRNG);
772 nedits--;
773 }
774}
775
776void
777xdelta_randomly_delete(string & str)
778{
779 size_t nedits = xdelta_editgen(PRNG);
780 while (nedits > 0)
781 {
782 size_t pos = xdelta_sizegen(PRNG) % str.size();
783 size_t len = xdelta_lengen(PRNG);
784 if (pos+len >= str.size())
785continue;
786 str.erase(pos, len);
787 --nedits;
788 }
789}
790
791void
792xdelta_random_simple_delta_test()
793{
794 for (int i = 0; i < 100; ++i)
795 {
796 string a, b, fdel, rdel, c, d;
797 xdelta_random_string(a);
798 b = a;
799 xdelta_randomly_change(b);
800 xdelta_randomly_insert(b);
801 xdelta_randomly_delete(b);
802 compute_delta(a, b, fdel);
803 compute_delta(b, a, rdel);
804 L(F("src %d, dst %d, fdel %d, rdel %d\n")
805% a.size() % b.size()% fdel.size() % rdel.size()) ;
806 if (fdel.size() == 0)
807{
808 L(F("confirming src == dst and rdel == 0\n"));
809 BOOST_CHECK(a == b);
810 BOOST_CHECK(rdel.size() == 0);
811}
812 else
813{
814 apply_delta(a, fdel, c);
815 apply_delta(b, rdel, d);
816 L(F("confirming dst1 %d, dst2 %d\n") % c.size() % d.size());
817 BOOST_CHECK(b == c);
818 BOOST_CHECK(a == d);
819}
820 }
821}
822
823void
824add_xdelta_tests(test_suite * suite)
825{
826 I(suite);
827 suite->add(BOOST_TEST_CASE(&xdelta_random_simple_delta_test));
828}
829
830#endif // BUILD_UNIT_TESTS

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status