monotone

monotone Mtn Source Tree

Root/xdelta.cc

1// copyright (C) 2002, 2003 graydon hoare <graydon@pobox.com>
2// all rights reserved.
3// licensed to the public under the terms of the GNU GPL (>= 2)
4// see the file COPYING for details
5
6// this file implements the xdelta algorithm, produces and consumes simple
7// copy/insert binary patches with the following structure:
8//
9// patch := (copy|insert)*
10// copy := 'C', ' ', pos=uint, ' ', len=uint, '\n'
11// insert := 'I', ' ', len=uint, '\n', payload=(byte x len), '\n'
12//
13// this means you can generally read the patch if you print it on stdout,
14// when it applies to text, but it can also apply to any binary, so the
15// hunk payload itself might look awful. I made it semi-ascii only to make
16// it slightly easier to debug; you really shouldn't read it normally. it's
17// a strict format with minimal checking, so it must be transport-encoded
18// to avoid whitespace munging.
19//
20// if you want to *read* a patch, you will like unidiff format much better.
21// take a look in diff_patch.(cc|hh) for a nice interface to that.
22
23#include <ext/hash_map>
24#include <algorithm>
25#include <vector>
26#include <set>
27#include <string>
28#include <sstream>
29#include <boost/shared_ptr.hpp>
30
31#include "adler32.hh"
32#include "numeric_vocab.hh"
33#include "sanity.hh"
34#include "xdelta.hh"
35
36using namespace std;
37using namespace __gnu_cxx;
38
39struct identity {size_t operator()(u32 const & v) const { return static_cast<size_t>(v);}};
40typedef pair<string::size_type, string::size_type> extent;
41typedef hash_map<u32, extent, identity> match_table;
42
43struct insn
44{
45 insn(char c) : code(insert), pos(0), len(0), payload("") { payload += c; }
46 insn(string s) : code(insert), pos(0), len(s.size()), payload(s) {}
47 insn(u32 p, u32 l) : code(copy), pos(p), len(l) {}
48 enum { insert, copy } code;
49 u32 pos, len;
50 string payload;
51};
52
53ostream & operator<<(ostream & ost, insn const & i)
54{
55 if (i.code == insn::insert)
56 {
57 ost << "I " << i.payload.size() << '\n';
58 ost.write(i.payload.data(), i.payload.size());
59 ost << '\n';
60 }
61 else
62 ost << "C " << i.pos << ' ' << i.len << '\n';
63 return ost;
64}
65
66
67static inline void
68init_match_table(string const & a,
69 string::size_type blocksz,
70 match_table & tab)
71{
72 string::size_type sz = a.size();
73 for (string::size_type i = 0; i < sz; i += blocksz)
74 {
75 string::size_type step = ((i + blocksz) >= sz) ? (sz - i) : blocksz;
76 u32 sum = adler32(a.data() + i, step).sum();
77 if (tab.find(sum) == tab.end())
78 tab.insert(make_pair(sum, make_pair(i, step)));
79 }
80 return;
81}
82
83
84static inline void
85find_match(match_table const & matches,
86 vector<insn> & delta,
87 adler32 const & rolling,
88 string const & a,
89 string const & b,
90 string::size_type bpos,
91 string::size_type & apos,
92 string::size_type & alen,
93 string::size_type & badvance)
94{
95 u32 sum = rolling.sum();
96 match_table::const_iterator e = matches.find(sum);
97
98 // maybe we haven't seen it at all?
99 if (e == matches.end())
100 return;
101
102 string::size_type tpos = e->second.first;
103 string::size_type tlen = e->second.second;
104
105 I(tpos < a.size());
106 I(tpos + tlen <= a.size());
107
108 // maybe it's a false match?
109 if (memcmp(a.data() + tpos, b.data() + bpos, tlen) != 0)
110 return;
111
112 apos = tpos;
113 alen = tlen;
114 badvance = tlen;
115
116 // see if we can extend our match forwards
117 while((apos + alen >= 0)
118&& (bpos + badvance >= 0)
119&& (apos + alen < a.size())
120 && (bpos + badvance < b.size())
121 && (a[apos + alen] == b[bpos + badvance]))
122 {
123 ++alen;
124 ++badvance;
125 }
126
127 // see if we can extend backwards into a previous insert hunk
128 if (! delta.empty() && delta.back().code == insn::insert)
129 {
130 while(apos > 0
131 && bpos > 0
132 && a[apos - 1] == b[bpos - 1]
133 && !delta.back().payload.empty())
134{
135 I(a[apos - 1] == *(delta.back().payload.rbegin()));
136 I(delta.back().payload.size() > 0);
137 delta.back().payload.resize(delta.back().payload.size() - 1);
138 --apos;
139 --bpos;
140 ++alen;
141 // the significant thing here is that we do not move
142 // 'badvance' forward, just alen.
143}
144
145 // if we've extended back to consume the *entire* insert,
146 // let's do away with it altogether.
147 if (delta.back().payload.empty())
148{
149 delta.pop_back();
150}
151 }
152
153 I(memcmp(a.data() + apos, b.data() + bpos, alen) == 0);
154 return;
155}
156
157static inline void
158insert_insn(vector<insn> & delta, char c)
159{
160 if (delta.empty() || delta.back().code == insn::copy)
161 delta.push_back(insn(c));
162 else
163 delta.back().payload += c;
164}
165
166
167static inline void
168copy_insn(vector<insn> & delta, string::size_type i, string::size_type matchlen)
169{
170 delta.push_back(insn(i, matchlen));
171}
172
173
174static void
175compute_delta_insns(string const & a,
176 string const & b,
177 vector<insn> & delta)
178{
179 string::size_type blocksz = 64;
180 match_table matches ((a.size() / blocksz) * 2);
181 init_match_table(a, blocksz, matches);
182
183 if (b.size() < blocksz)
184 {
185 for (string::size_type i = 0; i < b.size(); ++i)
186insert_insn(delta, b[i]);
187 return;
188 }
189
190 adler32 rolling(b.data(), blocksz);
191
192 for (string::size_type
193 sz = b.size(),
194 lo = 0,
195 hi = blocksz;
196 lo < sz; )
197 {
198 string::size_type apos = 0, alen = 1, badvance = 1;
199
200 find_match(matches, delta, rolling, a, b, lo, apos, alen, badvance);
201
202 if (alen < blocksz &&
203 (apos + alen < a.size()))
204{
205 I(alen == 1);
206 I(lo >= 0);
207 I(lo < b.size());
208 insert_insn(delta, b[lo]);
209}
210 else
211 copy_insn(delta, apos, alen);
212
213 string::size_type next = lo;
214 for (; next < lo + badvance; ++next)
215{
216 I(next >= 0);
217 I(next < b.size());
218 rolling.out(b[next]);
219 if (next + blocksz < b.size())
220 rolling.in(b[next + blocksz]);
221}
222 lo = next;
223 hi = lo + blocksz;
224 }
225}
226
227
228// specialized form for manifest maps, which
229// are sorted, so have far more structure
230
231static void
232flush_copy(ostringstream & oss, u32 & pos, u32 & len)
233{
234 if (len != 0)
235 {
236 // flush any copy which is going on
237 oss << insn(pos, len);
238 pos = pos + len;
239 len = 0;
240 }
241}
242
243void
244compute_delta(manifest_map const & a,
245 manifest_map const & b,
246 string & delta)
247{
248 delta.clear();
249 ostringstream oss;
250
251 manifest_map::const_iterator i = a.begin();
252 manifest_map::const_iterator j = b.begin();
253
254 u32 pos = 0;
255 u32 len = 0;
256 while (j != b.end())
257 {
258 size_t isz = 0;
259
260 if (i != a.end())
261 isz = i->first().size() + 2 + i->second.inner()().size() + 1;
262
263 if (i != a.end() && i->first == j->first)
264{
265 if (i->second == j->second)
266 {
267 // this line was copied
268 len += isz;
269 }
270 else
271 {
272 // this line was changed, but the *entry* remained, so we
273 // treat it as a simultaneous delete + insert. that means
274 // advance pos over what used to be here, set len to 0, and
275 // copy the new data.
276 flush_copy(oss, pos, len);
277 pos += isz;
278 ostringstream ss;
279 ss << *j;
280 oss << insn(ss.str());
281 }
282 ++i; ++j;
283}
284 else
285{
286
287 flush_copy(oss, pos, len);
288
289 if (i != a.end() && i->first < j->first)
290 {
291 // this line was deleted
292 ++i;
293 pos += isz;
294 }
295
296 else
297 {
298 // this line was added
299 ostringstream ss;
300 ss << *j;
301 oss << insn(ss.str());
302 ++j;
303 }
304}
305 }
306
307 flush_copy(oss,pos,len);
308 delta = oss.str();
309}
310
311
312void
313compute_delta(string const & a,
314 string const & b,
315 string & delta)
316{
317 vector<insn> delta_insns;
318
319 // FIXME: in theory you can do empty files and empty deltas; write some
320 // tests to be sure you're doing it right, and in any case implement the
321 // necessary logic directly in this function, don't bother doing an
322 // xdelta. several places of the xdelta code prefer assertions which are
323 // only true with non-empty chunks anyways.
324
325 if (a == b)
326 {
327 delta.clear();
328 return;
329 }
330
331 if (a.size() == 0 && b.size() != 0)
332 delta_insns.push_back(insn(b));
333 else if (a.size() != 0 && b.size() == 0)
334 delta_insns.push_back(insn(0, 0));
335 else
336 {
337 I(a.size() > 0);
338 I(b.size() > 0);
339
340 L("computing binary delta instructions\n");
341 compute_delta_insns(a, b, delta_insns);
342 L("computed binary delta instructions\n");
343 }
344
345 ostringstream oss;
346 for (vector<insn>::const_iterator i = delta_insns.begin();
347 i != delta_insns.end(); ++i)
348 {
349 oss << *i;
350 }
351 delta = oss.str();
352}
353
354struct simple_applicator : public delta_applicator
355{
356 string src;
357 string dst;
358 virtual ~simple_applicator () {}
359 virtual void begin(string const & base)
360 {
361 src = base;
362 dst.clear();
363 }
364 virtual void next()
365 {
366 swap(src,dst);
367 dst.clear();
368 }
369 virtual void finish(string & out)
370 {
371 out = src;
372 }
373
374 virtual void copy(string::size_type pos, string::size_type len)
375 {
376 dst.append(src, pos, len);
377 }
378 virtual void insert(string const & str)
379 {
380 dst.append(str);
381 }
382};
383
384void apply_delta(boost::shared_ptr<delta_applicator> da,
385 std::string const & delta)
386{
387 istringstream del(delta);
388 for (char c = del.get(); c == 'I' || c == 'C'; c = del.get())
389 {
390 I(del.good());
391 if (c == 'I')
392{
393 string::size_type len = string::npos;
394 del >> len;
395 I(del.good());
396 I(len != string::npos);
397 string tmp;
398 tmp.reserve(len);
399 I(del.get(c).good());
400 I(c == '\n');
401 while(len--)
402 {
403 I(del.get(c).good());
404 tmp += c;
405 }
406 I(del.get(c).good());
407 I(c == '\n');
408 da->insert(tmp);
409}
410 else
411{
412 string::size_type pos = string::npos, len = string::npos;
413 del >> pos >> len;
414 I(del.good());
415 I(len != string::npos);
416 I(del.get(c).good());
417 I(c == '\n');
418 da->copy(pos, len);
419}
420 }
421 I(del.eof());
422}
423
424void
425apply_delta(string const & a,
426 string const & delta,
427 string & b)
428{
429 boost::shared_ptr<delta_applicator> da(new simple_applicator());
430 da->begin(a);
431 apply_delta(da, delta);
432 da->next();
433 da->finish(b);
434}
435
436struct size_accumulating_delta_applicator :
437 public delta_applicator
438{
439 u64 & sz;
440 size_accumulating_delta_applicator(u64 & s) : sz(s) {}
441 virtual void begin(std::string const & base) {}
442 virtual void next() {}
443 virtual void finish(std::string & out) {}
444
445 virtual void copy(std::string::size_type pos,
446 std::string::size_type len)
447 { sz += len; }
448 virtual void insert(std::string const & str)
449 { sz += str.size(); }
450};
451
452
453u64 measure_delta_target_size(std::string const & delta)
454{
455 u64 sz = 0;
456 boost::shared_ptr<delta_applicator> da(new size_accumulating_delta_applicator(sz));
457 apply_delta(da, delta);
458 return sz;
459}
460
461
462
463// piecewise-applicator stuff follows (warning: ugly)
464
465typedef string::size_type version_pos;
466typedef string::size_type piece_pos;
467typedef string::size_type length;
468typedef unsigned long piece_id;
469
470struct chunk
471{
472 length len; // how many chars in this chunk
473 piece_id piece; // which piece to take chars from
474 version_pos vpos; // position in the current version
475 piece_pos ppos; // position in piece to take chars from
476
477 chunk (length ln, piece_id p, version_pos vp, piece_pos pp) :
478 len(ln), piece(p), vpos(vp), ppos(pp)
479 {}
480
481 chunk subchunk(version_pos vp,
482 length ln,
483 length offset) const
484 {
485 I(ppos + offset >= ppos);
486 I(ppos + offset + ln <= ppos + len);
487
488 chunk c = *this;
489 c.len = ln;
490 c.vpos = vp;
491 c.ppos += offset;
492 return c;
493 }
494};
495
496typedef vector<chunk> version_spec;
497
498struct piece_table
499{
500 vector<string> pieces;
501
502 void clear()
503 {
504 pieces.clear();
505 }
506
507 piece_id insert(string const & p)
508 {
509 pieces.push_back(p);
510 return pieces.size() - 1;
511 }
512
513 void append(string & targ, piece_id p, piece_pos pp, length ln)
514 {
515 I(p >= 0);
516 I(p < pieces.size());
517 targ.append(pieces[p], pp, ln);
518 }
519
520 void build(version_spec const & in, string & out)
521 {
522 out.clear();
523 for (version_spec::const_iterator i = in.begin();
524 i != in.end(); ++i)
525 {
526append(out, i->piece, i->ppos, i->len);
527 }
528 }
529};
530
531static void
532apply_insert(piece_table & p, version_spec & out, string const & str)
533{
534 piece_id piece = p.insert(str);
535 version_pos vpos = 0;
536 if (!out.empty())
537 vpos = out.back().vpos + out.back().len;
538 out.push_back(chunk(str.size(), piece, vpos, 0));
539}
540
541struct chunk_less_than
542{
543 bool operator()(chunk const & ch, version_pos vp) const
544 {
545 // nb: ch.vpos + ch.len is the 0-based index of the first element *not*
546 // included in ch; thus we measure against ch.len - 1.
547 I(ch.len > 0);
548 return (ch.vpos + ch.len - 1) < vp;
549 }
550};
551
552static void
553apply_copy(version_spec const & in, version_spec & out,
554 version_pos src_vpos, length src_len)
555{
556 //
557 // this is a little tricky because there's *4* different extents
558 // we're talking about at any time:
559 //
560 //
561 // - the 'src' extent, which is 1 or more chunks in the previous version.
562 // its address in the previous version is given in terms of a version_pos
563 // + length value.
564 //
565 // - the 'dst' extent, which is 1 chunk in the new version. its address
566 // in the new version is given in terms of a version_pos + length value.
567 //
568 // - the portion of a piece referenced by the src extent, which we're
569 // selecting a subset of. this is given in terms of a piece_pos + length
570 // value, against a particular piece.
571 //
572 // - the portion of a piece going into the dst extent, which is the
573 // selected subset. this is given in terms of a piece_pos + length value,
574 // against a particular piece.
575 //
576
577 version_pos src_final = src_vpos + src_len;
578 version_pos dst_vpos = 0;
579 if (!out.empty())
580 dst_vpos = out.back().vpos + out.back().len;
581 version_pos dst_final = dst_vpos + src_len;
582 version_spec::const_iterator lo = lower_bound(in.begin(),
583in.end(),
584src_vpos,
585chunk_less_than());
586 for ( ; src_len > 0; ++lo)
587 {
588 I(lo != in.end());
589
590 // now we are iterating over src extents which cover the current dst
591 // extent. we found these src extents by calling lower_bound,
592 // above. note, this entire function is called once per dst extent.
593 //
594 // there's two possible arrangements of spanning src extents:
595 //
596 // [ src extent 1 ][ src extent 2 ]
597 // [ ... dst extent .. ]
598 //
599 // or
600 //
601 // [ ... src extent ... ]
602 // [ ... dst extent .. ]
603 //
604 // the following arithmetic should bite off the lowest chunk of
605 // either of these two scenarios, append it to the dst version
606 // vector, and advance the 2 pos' and 1 len value appropriately.
607
608 version_pos src_end = min ((src_vpos + src_len), (lo->vpos + lo->len));
609 version_pos offset = src_vpos - lo->vpos;
610 length seglen = src_end - src_vpos;
611
612 I(seglen > 0);
613 I(src_vpos >= lo->vpos);
614 I(src_vpos + seglen <= lo->vpos + lo->len);
615
616 out.push_back(lo->subchunk(dst_vpos, seglen, offset));
617 src_vpos += seglen;
618 dst_vpos += seglen;
619 src_len -= seglen;
620 I(src_len >= 0);
621 I(out.back().vpos + out.back().len == dst_vpos);
622 }
623
624 I(src_vpos == src_final);
625 I(dst_vpos == dst_final);
626 I(src_len == 0);
627}
628
629
630struct piecewise_applicator : public delta_applicator
631{
632 piece_table pt;
633 boost::shared_ptr<version_spec> src;
634 boost::shared_ptr<version_spec> dst;
635
636 piecewise_applicator() :
637 src(new version_spec()),
638 dst(new version_spec())
639 {}
640
641 virtual ~piecewise_applicator () {}
642
643 virtual void begin(string const & base)
644 {
645 pt.clear();
646 piece_id piece = pt.insert(base);
647 src->clear();
648 src->push_back(chunk(base.size(), piece, 0, 0));
649 dst->clear();
650 }
651
652 virtual void next()
653 {
654 swap(src,dst);
655 dst->clear();
656 }
657
658 virtual void finish(string & out)
659 {
660 out.clear();
661 pt.build(*src, out);
662 }
663
664 virtual void copy(string::size_type pos, string::size_type len)
665 {
666 apply_copy(*src, *dst, pos, len);
667 }
668
669 virtual void insert(string const & str)
670 {
671 apply_insert(pt, *dst, str);
672 }
673};
674
675
676// these just hide our implementation types from outside
677
678boost::shared_ptr<delta_applicator> new_simple_applicator()
679{
680 return boost::shared_ptr<delta_applicator>(new simple_applicator());
681}
682
683boost::shared_ptr<delta_applicator> new_piecewise_applicator()
684{
685 return boost::shared_ptr<delta_applicator>(new piecewise_applicator());
686}
687
688#ifdef BUILD_UNIT_TESTS
689
690#include "unit_tests.hh"
691#include <boost/random.hpp>
692
693boost::mt19937 xdelta_prng;
694boost::uniform_smallint<boost::mt19937, char> xdelta_chargen(xdelta_prng, 'a', 'z');
695boost::uniform_smallint<boost::mt19937, size_t> xdelta_sizegen(xdelta_prng, 1024, 65536);
696boost::uniform_smallint<boost::mt19937, size_t> xdelta_editgen(xdelta_prng, 3, 10);
697boost::uniform_smallint<boost::mt19937, size_t> xdelta_lengen(xdelta_prng, 1, 256);
698
699void xdelta_random_string(string & str)
700{
701 size_t sz = xdelta_sizegen();
702 str.clear();
703 str.reserve(sz);
704 while(sz-- > 0)
705 {
706 str += xdelta_chargen();
707 }
708}
709
710void xdelta_randomly_insert(string & str)
711{
712 size_t nedits = xdelta_editgen();
713 while (nedits > 0)
714 {
715 size_t pos = xdelta_sizegen() % str.size();
716 size_t len = xdelta_lengen();
717 if (pos+len >= str.size())
718continue;
719 string tmp;
720 tmp.reserve(len);
721 for (size_t i = 0; i < len; ++i)
722tmp += xdelta_chargen();
723str.insert(pos, tmp);
724 nedits--;
725 }
726}
727
728void xdelta_randomly_change(string & str)
729{
730 size_t nedits = xdelta_editgen();
731 while (nedits > 0)
732 {
733 size_t pos = xdelta_sizegen() % str.size();
734 size_t len = xdelta_lengen();
735 if (pos+len >= str.size())
736continue;
737 for (size_t i = 0; i < len; ++i)
738str[pos+i] = xdelta_chargen();
739 nedits--;
740 }
741}
742
743void xdelta_randomly_delete(string & str)
744{
745 size_t nedits = xdelta_editgen();
746 while (nedits > 0)
747 {
748 size_t pos = xdelta_sizegen() % str.size();
749 size_t len = xdelta_lengen();
750 if (pos+len >= str.size())
751continue;
752 str.erase(pos, len);
753 --nedits;
754 }
755}
756
757void xdelta_random_simple_delta_test()
758{
759 for (int i = 0; i < 100; ++i)
760 {
761 string a, b, fdel, rdel, c, d;
762 xdelta_random_string(a);
763 b = a;
764 xdelta_randomly_change(b);
765 xdelta_randomly_insert(b);
766 xdelta_randomly_delete(b);
767 compute_delta(a, b, fdel);
768 compute_delta(b, a, rdel);
769 L(F("src %d, dst %d, fdel %d, rdel %d\n")
770% a.size() % b.size()% fdel.size() % rdel.size()) ;
771 if (fdel.size() == 0)
772{
773 L(F("confirming src == dst and rdel == 0\n"));
774 BOOST_CHECK(a == b);
775 BOOST_CHECK(rdel.size() == 0);
776}
777 else
778{
779 apply_delta(a, fdel, c);
780 apply_delta(b, rdel, d);
781 L(F("confirming dst1 %d, dst2 %d\n") % c.size() % d.size());
782 BOOST_CHECK(b == c);
783 BOOST_CHECK(a == d);
784}
785 }
786}
787
788void add_xdelta_tests(test_suite * suite)
789{
790 I(suite);
791 suite->add(BOOST_TEST_CASE(&xdelta_random_simple_delta_test));
792}
793
794#endif // BUILD_UNIT_TESTS

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status