monotone

monotone Mtn Source Tree

Root/xdelta.cc

1// copyright (C) 2002, 2003 graydon hoare <graydon@pobox.com>
2// all rights reserved.
3// licensed to the public under the terms of the GNU GPL (>= 2)
4// see the file COPYING for details
5
6// this file implements the xdelta algorithm, produces and consumes simple
7// copy/insert binary patches with the following structure:
8//
9// patch := (copy|insert)*
10// copy := 'C', ' ', pos=uint, ' ', len=uint, '\n'
11// insert := 'I', ' ', len=uint, '\n', payload=(byte x len), '\n'
12//
13// this means you can generally read the patch if you print it on stdout,
14// when it applies to text, but it can also apply to any binary, so the
15// hunk payload itself might look awful. I made it semi-ascii only to make
16// it slightly easier to debug; you really shouldn't read it normally. it's
17// a strict format with minimal checking, so it must be transport-encoded
18// to avoid whitespace munging.
19//
20// if you want to *read* a patch, you will like unidiff format much better.
21// take a look in diff_patch.(cc|hh) for a nice interface to that.
22
23#include <algorithm>
24#include <vector>
25#include <set>
26#include <string>
27#include <sstream>
28
29#include <boost/shared_ptr.hpp>
30#include <boost/version.hpp>
31
32#include "adler32.hh"
33#include "hash_map.hh"
34#include "numeric_vocab.hh"
35#include "sanity.hh"
36#include "xdelta.hh"
37
38using namespace std;
39
40struct identity {size_t operator()(u32 const & v) const { return static_cast<size_t>(v);}};
41typedef pair<string::size_type, string::size_type> extent;
42typedef hashmap::hash_map<u32, extent> match_table;
43
44struct
45insn
46{
47 insn(char c) : code(insert), pos(0), len(0), payload("") { payload += c; }
48 insn(string s) : code(insert), pos(0), len(s.size()), payload(s) {}
49 insn(u32 p, u32 l) : code(copy), pos(p), len(l) {}
50 enum { insert, copy } code;
51 u32 pos, len;
52 string payload;
53};
54
55ostream & operator<<(ostream & ost, insn const & i)
56{
57 if (i.code == insn::insert)
58 {
59 ost << "I " << i.payload.size() << '\n';
60 ost.write(i.payload.data(), i.payload.size());
61 ost << '\n';
62 }
63 else
64 ost << "C " << i.pos << ' ' << i.len << '\n';
65 return ost;
66}
67
68
69static inline void
70init_match_table(string const & a,
71 string::size_type blocksz,
72 match_table & tab)
73{
74 string::size_type sz = a.size();
75 for (string::size_type i = 0; i < sz; i += blocksz)
76 {
77 string::size_type step = ((i + blocksz) >= sz) ? (sz - i) : blocksz;
78 u32 sum = adler32(reinterpret_cast<u8 const *>(a.data() + i), step).sum();
79 if (tab.find(sum) == tab.end())
80 tab.insert(make_pair(sum, make_pair(i, step)));
81 }
82 return;
83}
84
85
86static inline bool
87find_match(match_table const & matches,
88 vector<insn> & delta,
89 adler32 const & rolling,
90 string const & a,
91 string const & b,
92 string::size_type bpos,
93 string::size_type & apos,
94 string::size_type & alen,
95 string::size_type & badvance)
96{
97 u32 sum = rolling.sum();
98 match_table::const_iterator e = matches.find(sum);
99
100 // maybe we haven't seen it at all?
101 if (e == matches.end())
102 return false;
103
104 string::size_type tpos = e->second.first;
105 string::size_type tlen = e->second.second;
106
107 I(tpos < a.size());
108 I(tpos + tlen <= a.size());
109
110 // maybe it's a false match?
111 if (memcmp(a.data() + tpos, b.data() + bpos, tlen) != 0)
112 return false;
113
114 apos = tpos;
115 alen = tlen;
116 badvance = tlen;
117
118 // see if we can extend our match forwards
119 while((apos + alen >= 0)
120 && (bpos + badvance >= 0)
121 && (apos + alen < a.size())
122 && (bpos + badvance < b.size())
123 && (a[apos + alen] == b[bpos + badvance]))
124 {
125 ++alen;
126 ++badvance;
127 }
128
129 // see if we can extend backwards into a previous insert hunk
130 if (! delta.empty() && delta.back().code == insn::insert)
131 {
132 while(apos > 0
133 && bpos > 0
134 && a[apos - 1] == b[bpos - 1]
135 && !delta.back().payload.empty())
136 {
137 I(a[apos - 1] == *(delta.back().payload.rbegin()));
138 I(delta.back().payload.size() > 0);
139 delta.back().payload.resize(delta.back().payload.size() - 1);
140 --apos;
141 --bpos;
142 ++alen;
143 // the significant thing here is that we do not move
144 // 'badvance' forward, just alen.
145 }
146
147 // if we've extended back to consume the *entire* insert,
148 // let's do away with it altogether.
149 if (delta.back().payload.empty())
150 {
151 delta.pop_back();
152 }
153 }
154
155 I(memcmp(a.data() + apos, b.data() + bpos, alen) == 0);
156 return true;
157}
158
159static inline void
160insert_insn(vector<insn> & delta, char c)
161{
162 if (delta.empty() || delta.back().code == insn::copy)
163 delta.push_back(insn(c));
164 else
165 delta.back().payload += c;
166}
167
168
169static inline void
170copy_insn(vector<insn> & delta, string::size_type i, string::size_type matchlen)
171{
172 delta.push_back(insn(i, matchlen));
173}
174
175
176static void
177compute_delta_insns(string const & a,
178 string const & b,
179 vector<insn> & delta)
180{
181 string::size_type blocksz = 64;
182 match_table matches ((a.size() / blocksz) * 2);
183 init_match_table(a, blocksz, matches);
184
185 if (b.size() < blocksz)
186 {
187 for (string::size_type i = 0; i < b.size(); ++i)
188 insert_insn(delta, b[i]);
189 return;
190 }
191
192 adler32 rolling(reinterpret_cast<u8 const *>(b.data()), blocksz);
193
194 for (string::size_type
195 sz = b.size(),
196 lo = 0,
197 hi = blocksz;
198 lo < sz; )
199 {
200 string::size_type apos = 0, alen = 1, badvance = 1;
201
202 bool found_match = find_match(matches, delta, rolling, a, b, lo, apos, alen, badvance);
203
204 if (found_match)
205 {
206 copy_insn(delta, apos, alen);
207 }
208 else
209 {
210 I(apos + alen <= a.size());
211 I(alen == 1);
212 I(alen < blocksz);
213 I(lo >= 0);
214 I(lo < b.size());
215 insert_insn(delta, b[lo]);
216 }
217
218 string::size_type next = lo;
219 for (; next < lo + badvance; ++next)
220 {
221 I(next >= 0);
222 I(next < b.size());
223 rolling.out(static_cast<u8>(b[next]));
224 if (next + blocksz < b.size())
225 rolling.in(static_cast<u8>(b[next + blocksz]));
226 }
227 lo = next;
228 hi = lo + blocksz;
229 }
230}
231
232
233// specialized form for manifest maps, which
234// are sorted, so have far more structure
235
236static void
237flush_copy(ostringstream & oss, u32 & pos, u32 & len)
238{
239 if (len != 0)
240 {
241 // flush any copy which is going on
242 oss << insn(pos, len);
243 pos = pos + len;
244 len = 0;
245 }
246}
247
248void
249compute_delta(manifest_map const & a,
250 manifest_map const & b,
251 string & delta)
252{
253 delta.clear();
254 ostringstream oss;
255
256 manifest_map::const_iterator i = a.begin();
257 manifest_map::const_iterator j = b.begin();
258
259 u32 pos = 0;
260 u32 len = 0;
261 while (j != b.end())
262 {
263 size_t isz = 0;
264
265 if (i != a.end())
266 isz = i->first.as_internal().size() + 2 + i->second.inner()().size() + 1;
267
268 if (i != a.end() && i->first == j->first)
269 {
270 if (i->second == j->second)
271 {
272 // this line was copied
273 len += isz;
274 }
275 else
276 {
277 // this line was changed, but the *entry* remained, so we
278 // treat it as a simultaneous delete + insert. that means
279 // advance pos over what used to be here, set len to 0, and
280 // copy the new data.
281 flush_copy(oss, pos, len);
282 pos += isz;
283 ostringstream ss;
284 ss << *j;
285 oss << insn(ss.str());
286 }
287 ++i; ++j;
288 }
289 else
290 {
291
292 flush_copy(oss, pos, len);
293
294 if (i != a.end() && i->first < j->first)
295 {
296 // this line was deleted
297 ++i;
298 pos += isz;
299 }
300
301 else
302 {
303 // this line was added
304 ostringstream ss;
305 ss << *j;
306 oss << insn(ss.str());
307 ++j;
308 }
309 }
310 }
311
312 flush_copy(oss,pos,len);
313 delta = oss.str();
314}
315
316
317void
318compute_delta(string const & a,
319 string const & b,
320 string & delta)
321{
322 vector<insn> delta_insns;
323
324 // FIXME: in theory you can do empty files and empty deltas; write some
325 // tests to be sure you're doing it right, and in any case implement the
326 // necessary logic directly in this function, don't bother doing an
327 // xdelta. several places of the xdelta code prefer assertions which are
328 // only true with non-empty chunks anyways.
329
330 if (a == b)
331 {
332 delta.clear();
333 return;
334 }
335
336 if (a.size() == 0 && b.size() != 0)
337 delta_insns.push_back(insn(b));
338 else if (a.size() != 0 && b.size() == 0)
339 delta_insns.push_back(insn(0, 0));
340 else
341 {
342 I(a.size() > 0);
343 I(b.size() > 0);
344
345 L(F("computing binary delta instructions\n"));
346 compute_delta_insns(a, b, delta_insns);
347 L(F("computed binary delta instructions\n"));
348 }
349
350 ostringstream oss;
351 for (vector<insn>::const_iterator i = delta_insns.begin();
352 i != delta_insns.end(); ++i)
353 {
354 oss << *i;
355 }
356 delta = oss.str();
357}
358
359struct
360simple_applicator
361 : public delta_applicator
362{
363 string src;
364 string dst;
365 virtual ~simple_applicator () {}
366 virtual void begin(string const & base)
367 {
368 src = base;
369 dst.clear();
370 }
371 virtual void next()
372 {
373 swap(src,dst);
374 dst.clear();
375 }
376 virtual void finish(string & out)
377 {
378 out = src;
379 }
380
381 virtual void copy(string::size_type pos, string::size_type len)
382 {
383 dst.append(src, pos, len);
384 }
385 virtual void insert(string const & str)
386 {
387 dst.append(str);
388 }
389};
390
391void
392apply_delta(boost::shared_ptr<delta_applicator> da,
393 std::string const & delta)
394{
395 istringstream del(delta);
396 for (char c = del.get(); c == 'I' || c == 'C'; c = del.get())
397 {
398 I(del.good());
399 if (c == 'I')
400 {
401 string::size_type len = string::npos;
402 del >> len;
403 I(del.good());
404 I(len != string::npos);
405 string tmp;
406 tmp.reserve(len);
407 I(del.get(c).good());
408 I(c == '\n');
409 while(len--)
410 {
411 I(del.get(c).good());
412 tmp += c;
413 }
414 I(del.get(c).good());
415 I(c == '\n');
416 da->insert(tmp);
417 }
418 else
419 {
420 string::size_type pos = string::npos, len = string::npos;
421 del >> pos >> len;
422 I(del.good());
423 I(len != string::npos);
424 I(del.get(c).good());
425 I(c == '\n');
426 da->copy(pos, len);
427 }
428 }
429 I(del.eof());
430}
431
432void
433apply_delta(string const & a,
434 string const & delta,
435 string & b)
436{
437 boost::shared_ptr<delta_applicator> da(new simple_applicator());
438 da->begin(a);
439 apply_delta(da, delta);
440 da->next();
441 da->finish(b);
442}
443
444struct
445size_accumulating_delta_applicator :
446 public delta_applicator
447{
448 u64 & sz;
449 size_accumulating_delta_applicator(u64 & s) : sz(s) {}
450 virtual void begin(std::string const & base) {}
451 virtual void next() {}
452 virtual void finish(std::string & out) {}
453
454 virtual void copy(std::string::size_type pos,
455 std::string::size_type len)
456 { sz += len; }
457 virtual void insert(std::string const & str)
458 { sz += str.size(); }
459};
460
461
462u64
463measure_delta_target_size(std::string const & delta)
464{
465 u64 sz = 0;
466 boost::shared_ptr<delta_applicator> da(new size_accumulating_delta_applicator(sz));
467 apply_delta(da, delta);
468 return sz;
469}
470
471
472
473// piecewise-applicator stuff follows (warning: ugly)
474
475typedef string::size_type version_pos;
476typedef string::size_type piece_pos;
477typedef string::size_type length;
478typedef unsigned long piece_id;
479
480struct chunk
481{
482 length len; // how many chars in this chunk
483 piece_id piece; // which piece to take chars from
484 version_pos vpos; // position in the current version
485 piece_pos ppos; // position in piece to take chars from
486
487 chunk (length ln, piece_id p, version_pos vp, piece_pos pp) :
488 len(ln), piece(p), vpos(vp), ppos(pp)
489 {}
490
491 chunk subchunk(version_pos vp,
492 length ln,
493 length offset) const
494 {
495 I(ppos + offset >= ppos);
496 I(ppos + offset + ln <= ppos + len);
497
498 chunk c = *this;
499 c.len = ln;
500 c.vpos = vp;
501 c.ppos += offset;
502 return c;
503 }
504};
505
506typedef vector<chunk> version_spec;
507
508struct
509piece_table
510{
511 vector<string> pieces;
512
513 void clear()
514 {
515 pieces.clear();
516 }
517
518 piece_id insert(string const & p)
519 {
520 pieces.push_back(p);
521 return pieces.size() - 1;
522 }
523
524 void append(string & targ, piece_id p, piece_pos pp, length ln)
525 {
526 I(p >= 0);
527 I(p < pieces.size());
528 targ.append(pieces[p], pp, ln);
529 }
530
531 void build(version_spec const & in, string & out)
532 {
533 out.clear();
534 for (version_spec::const_iterator i = in.begin();
535 i != in.end(); ++i)
536 {
537 append(out, i->piece, i->ppos, i->len);
538 }
539 }
540};
541
542static void
543apply_insert(piece_table & p, version_spec & out, string const & str)
544{
545 piece_id piece = p.insert(str);
546 version_pos vpos = 0;
547 if (!out.empty())
548 vpos = out.back().vpos + out.back().len;
549 out.push_back(chunk(str.size(), piece, vpos, 0));
550}
551
552struct
553chunk_less_than
554{
555 bool operator()(chunk const & ch, version_pos vp) const
556 {
557 // nb: ch.vpos + ch.len is the 0-based index of the first element *not*
558 // included in ch; thus we measure against ch.len - 1.
559 I(ch.len > 0);
560 return (ch.vpos + ch.len - 1) < vp;
561 }
562};
563
564static void
565apply_copy(version_spec const & in, version_spec & out,
566 version_pos src_vpos, length src_len)
567{
568 //
569 // this is a little tricky because there's *4* different extents
570 // we're talking about at any time:
571 //
572 //
573 // - the 'src' extent, which is 1 or more chunks in the previous version.
574 // its address in the previous version is given in terms of a version_pos
575 // + length value.
576 //
577 // - the 'dst' extent, which is 1 chunk in the new version. its address
578 // in the new version is given in terms of a version_pos + length value.
579 //
580 // - the portion of a piece referenced by the src extent, which we're
581 // selecting a subset of. this is given in terms of a piece_pos + length
582 // value, against a particular piece.
583 //
584 // - the portion of a piece going into the dst extent, which is the
585 // selected subset. this is given in terms of a piece_pos + length value,
586 // against a particular piece.
587 //
588
589 version_pos src_final = src_vpos + src_len;
590 version_pos dst_vpos = 0;
591 if (!out.empty())
592 dst_vpos = out.back().vpos + out.back().len;
593 version_pos dst_final = dst_vpos + src_len;
594 version_spec::const_iterator lo = lower_bound(in.begin(),
595 in.end(),
596 src_vpos,
597 chunk_less_than());
598 for ( ; src_len > 0; ++lo)
599 {
600 I(lo != in.end());
601
602 // now we are iterating over src extents which cover the current dst
603 // extent. we found these src extents by calling lower_bound,
604 // above. note, this entire function is called once per dst extent.
605 //
606 // there's two possible arrangements of spanning src extents:
607 //
608 // [ src extent 1 ][ src extent 2 ]
609 // [ ... dst extent .. ]
610 //
611 // or
612 //
613 // [ ... src extent ... ]
614 // [ ... dst extent .. ]
615 //
616 // the following arithmetic should bite off the lowest chunk of
617 // either of these two scenarios, append it to the dst version
618 // vector, and advance the 2 pos' and 1 len value appropriately.
619
620 version_pos src_end = min ((src_vpos + src_len), (lo->vpos + lo->len));
621 version_pos offset = src_vpos - lo->vpos;
622 length seglen = src_end - src_vpos;
623
624 I(seglen > 0);
625 I(src_vpos >= lo->vpos);
626 I(src_vpos + seglen <= lo->vpos + lo->len);
627
628 out.push_back(lo->subchunk(dst_vpos, seglen, offset));
629 src_vpos += seglen;
630 dst_vpos += seglen;
631 src_len -= seglen;
632 I(src_len >= 0);
633 I(out.back().vpos + out.back().len == dst_vpos);
634 }
635
636 I(src_vpos == src_final);
637 I(dst_vpos == dst_final);
638 I(src_len == 0);
639}
640
641
642struct
643piecewise_applicator
644 : public delta_applicator
645{
646 piece_table pt;
647 boost::shared_ptr<version_spec> src;
648 boost::shared_ptr<version_spec> dst;
649
650 piecewise_applicator() :
651 src(new version_spec()),
652 dst(new version_spec())
653 {}
654
655 virtual ~piecewise_applicator () {}
656
657 virtual void begin(string const & base)
658 {
659 pt.clear();
660 piece_id piece = pt.insert(base);
661 src->clear();
662 src->push_back(chunk(base.size(), piece, 0, 0));
663 dst->clear();
664 }
665
666 virtual void next()
667 {
668 swap(src,dst);
669 dst->clear();
670 }
671
672 virtual void finish(string & out)
673 {
674 out.clear();
675 pt.build(*src, out);
676 }
677
678 virtual void copy(string::size_type pos, string::size_type len)
679 {
680 apply_copy(*src, *dst, pos, len);
681 }
682
683 virtual void insert(string const & str)
684 {
685 apply_insert(pt, *dst, str);
686 }
687};
688
689
690// these just hide our implementation types from outside
691
692boost::shared_ptr<delta_applicator>
693new_simple_applicator()
694{
695 return boost::shared_ptr<delta_applicator>(new simple_applicator());
696}
697
698boost::shared_ptr<delta_applicator>
699new_piecewise_applicator()
700{
701 return boost::shared_ptr<delta_applicator>(new piecewise_applicator());
702}
703
704#ifdef BUILD_UNIT_TESTS
705
706#include "unit_tests.hh"
707#ifdef WIN32
708#define BOOST_NO_STDC_NAMESPACE
709#endif
710#include <boost/random.hpp>
711
712boost::mt19937 xdelta_prng;
713
714#if BOOST_VERSION >= 103100
715boost::uniform_smallint<char> xdelta_chargen('a', 'z');
716boost::uniform_smallint<size_t> xdelta_sizegen(1024, 65536);
717boost::uniform_smallint<size_t> xdelta_editgen(3, 10);
718boost::uniform_smallint<size_t> xdelta_lengen(1, 256);
719#define PRNG xdelta_prng
720#else
721boost::uniform_smallint<boost::mt19937, char> xdelta_chargen(xdelta_prng, 'a', 'z');
722boost::uniform_smallint<boost::mt19937, size_t> xdelta_sizegen(xdelta_prng, 1024, 65536);
723boost::uniform_smallint<boost::mt19937, size_t> xdelta_editgen(xdelta_prng, 3, 10);
724boost::uniform_smallint<boost::mt19937, size_t> xdelta_lengen(xdelta_prng, 1, 256);
725#define PRNG
726#endif
727
728void
729xdelta_random_string(string & str)
730{
731 size_t sz = xdelta_sizegen(PRNG);
732 str.clear();
733 str.reserve(sz);
734 while(sz-- > 0)
735 {
736 str += xdelta_chargen(PRNG);
737 }
738}
739
740void
741xdelta_randomly_insert(string & str)
742{
743 size_t nedits = xdelta_editgen(PRNG);
744 while (nedits > 0)
745 {
746 size_t pos = xdelta_sizegen(PRNG) % str.size();
747 size_t len = xdelta_lengen(PRNG);
748 if (pos+len >= str.size())
749 continue;
750 string tmp;
751 tmp.reserve(len);
752 for (size_t i = 0; i < len; ++i)
753 tmp += xdelta_chargen(PRNG);
754 str.insert(pos, tmp);
755 nedits--;
756 }
757}
758
759void
760xdelta_randomly_change(string & str)
761{
762 size_t nedits = xdelta_editgen(PRNG);
763 while (nedits > 0)
764 {
765 size_t pos = xdelta_sizegen(PRNG) % str.size();
766 size_t len = xdelta_lengen(PRNG);
767 if (pos+len >= str.size())
768 continue;
769 for (size_t i = 0; i < len; ++i)
770 str[pos+i] = xdelta_chargen(PRNG);
771 nedits--;
772 }
773}
774
775void
776xdelta_randomly_delete(string & str)
777{
778 size_t nedits = xdelta_editgen(PRNG);
779 while (nedits > 0)
780 {
781 size_t pos = xdelta_sizegen(PRNG) % str.size();
782 size_t len = xdelta_lengen(PRNG);
783 if (pos+len >= str.size())
784 continue;
785 str.erase(pos, len);
786 --nedits;
787 }
788}
789
790void
791xdelta_random_simple_delta_test()
792{
793 for (int i = 0; i < 100; ++i)
794 {
795 string a, b, fdel, rdel, c, d;
796 xdelta_random_string(a);
797 b = a;
798 xdelta_randomly_change(b);
799 xdelta_randomly_insert(b);
800 xdelta_randomly_delete(b);
801 compute_delta(a, b, fdel);
802 compute_delta(b, a, rdel);
803 L(boost::format("src %d, dst %d, fdel %d, rdel %d\n")
804 % a.size() % b.size()% fdel.size() % rdel.size()) ;
805 if (fdel.size() == 0)
806 {
807 L(boost::format("confirming src == dst and rdel == 0\n"));
808 BOOST_CHECK(a == b);
809 BOOST_CHECK(rdel.size() == 0);
810 }
811 else
812 {
813 apply_delta(a, fdel, c);
814 apply_delta(b, rdel, d);
815 L(boost::format("confirming dst1 %d, dst2 %d\n") % c.size() % d.size());
816 BOOST_CHECK(b == c);
817 BOOST_CHECK(a == d);
818 }
819 }
820}
821
822void
823add_xdelta_tests(test_suite * suite)
824{
825 I(suite);
826 suite->add(BOOST_TEST_CASE(&xdelta_random_simple_delta_test));
827}
828
829#endif // BUILD_UNIT_TESTS

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status