monotone

monotone Mtn Source Tree

Root/src/diff_output.cc

1// Copyright (C) 2002 Graydon Hoare <graydon@pobox.com>
2// 2008, 2012 Stephen Leake <stephen_leake@stephe-leake.org>
3//
4// This program is made available under the GNU GPL version 2.0 or
5// greater. See the accompanying file COPYING for details.
6//
7// This program is distributed WITHOUT ANY WARRANTY; without even the
8// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
9// PURPOSE.
10
11#include "base.hh"
12#include "diff_output.hh"
13#include "file_io.hh"
14#include "interner.hh"
15#include "lcs.hh"
16#include "pcrewrap.hh"
17#include "simplestring_xform.hh"
18
19#include <ostream>
20#include <iterator>
21#include <boost/scoped_ptr.hpp>
22
23using std::max;
24using std::min;
25using std::ostream;
26using std::ostream_iterator;
27using std::string;
28using std::vector;
29using boost::scoped_ptr;
30
31// This file handles printing out various diff formats for the case where
32// someone wants to *read* a diff rather than apply it. The actual diff
33// computation is done in lcs.cc.
34
// Abstract base for the diff writers below.  walk_hunk_consumer feeds it
// a stream of insert/delete/advance events; subclasses buffer those into
// hunks and print them in a specific textual diff format.
struct hunk_consumer
{
  vector<string> const & a;   // lines of the first (old) file
  vector<string> const & b;   // lines of the second (new) file
  size_t ctx;                 // number of context lines around each hunk
  ostream & ost;              // destination stream for formatted output
  // Compiled "show enclosing scope" regex; null when no pattern given.
  boost::scoped_ptr<pcre::regex const> encloser_re;
  // Current hunk: starting line and length so far in each file.
  size_t a_begin, b_begin, a_len, b_len;
  // Cumulative line-number drift of file b relative to file a,
  // accumulated across previously flushed hunks.
  long skew;

  // Memoization for find_encloser: last matching line found, and last
  // line scanned, so repeated searches never rescan the same region.
  vector<string>::const_reverse_iterator encloser_last_match;
  vector<string>::const_reverse_iterator encloser_last_search;

  // Emit the pending hunk (if any) and restart accumulation at 'pos'.
  virtual void flush_hunk(size_t pos) = 0;
  // Move to line 'newpos' of file a, emitting context / flushing as needed.
  virtual void advance_to(size_t newpos) = 0;
  // Record line 'b_pos' of file b as an insertion.
  virtual void insert_at(size_t b_pos) = 0;
  // Record line 'a_pos' of file a as a deletion.
  virtual void delete_at(size_t a_pos) = 0;
  // Find the nearest line before 'pos' matching encloser_re (defined below).
  virtual void find_encloser(size_t pos, string & encloser);
  virtual ~hunk_consumer() {}
  hunk_consumer(vector<string> const & a,
                vector<string> const & b,
                size_t ctx,
                ostream & ost,
                string const & encloser_pattern)
    : a(a), b(b), ctx(ctx), ost(ost), encloser_re(0),
      a_begin(0), b_begin(0), a_len(0), b_len(0), skew(0),
      encloser_last_match(a.rend()), encloser_last_search(a.rend())
  {
    // An empty pattern means "no enclosing-scope annotation".
    if (encloser_pattern != "")
      encloser_re.reset(new pcre::regex(encloser_pattern, origin::user));
  }
};
67
/* Find, and write to ENCLOSER, the nearest line before POS which matches
   ENCLOSER_PATTERN.  We remember the last line scanned, and the last line
   matched, to avoid duplication of effort across calls. */

void
hunk_consumer::find_encloser(size_t pos, string & encloser)
{
  typedef vector<string>::const_reverse_iterator riter;

  // Precondition: encloser_last_search <= pos <= a.size().
  I(pos <= a.size());
  // static_cast<> to silence compiler unsigned vs. signed comparison
  // warning, after first making sure that the static_cast is safe.
  I(a.rend() - encloser_last_search >= 0);
  I(pos >= static_cast<size_t>(a.rend() - encloser_last_search));

  if (!encloser_re)
    return;

  riter last = encloser_last_search;
  // riter(a.begin() + pos) dereferences to the element *before* pos,
  // which is exactly where a backward search from pos should start.
  riter i = riter(a.begin() + pos);

  encloser_last_search = i;

  // i is a reverse_iterator, so this loop goes backward through the vector.
  for (; i != last; i++)
    if (encloser_re->match(*i, origin::user))
      {
        encloser_last_match = i;
        break;
      }

  // No line has ever matched: leave ENCLOSER untouched (empty).
  if (encloser_last_match == a.rend())
    return;

  L(FL("find_encloser: from %u matching %d, \"%s\"")
    % pos % (a.rend() - encloser_last_match) % *encloser_last_match);

  // the number 40 is chosen to match GNU diff. it could safely be
  // increased up to about 60 without overflowing the standard
  // terminal width.
  encloser = string(" ") + (*encloser_last_match).substr(0, 40);
}
111
112void walk_hunk_consumer(vector<long, QA(long)> const & lcs,
113 vector<long, QA(long)> const & lines1,
114 vector<long, QA(long)> const & lines2,
115 hunk_consumer & cons)
116{
117
118 size_t a = 0, b = 0;
119 if (lcs.begin() == lcs.end())
120 {
121 // degenerate case: files have nothing in common
122 cons.advance_to(0);
123 while (a < lines1.size())
124 cons.delete_at(a++);
125 while (b < lines2.size())
126 cons.insert_at(b++);
127 cons.flush_hunk(a);
128 }
129 else
130 {
131 // normal case: files have something in common
132 for (vector<long, QA(long)>::const_iterator i = lcs.begin();
133 i != lcs.end(); ++i, ++a, ++b)
134 {
135 if (idx(lines1, a) == *i && idx(lines2, b) == *i)
136 continue;
137
138 cons.advance_to(a);
139 while (idx(lines1,a) != *i)
140 cons.delete_at(a++);
141 while (idx(lines2,b) != *i)
142 cons.insert_at(b++);
143 }
144 if (a < lines1.size())
145 {
146 cons.advance_to(a);
147 while(a < lines1.size())
148 cons.delete_at(a++);
149 }
150 if (b < lines2.size())
151 {
152 cons.advance_to(a);
153 while(b < lines2.size())
154 cons.insert_at(b++);
155 }
156 cons.flush_hunk(a);
157 }
158}
159
// hunk_consumer that renders hunks in unified-diff format
// ("@@ -l,n +l,n @@" headers, '+'/'-'/' ' line prefixes).
struct unidiff_hunk_writer : public hunk_consumer
{
  // Buffered lines of the current hunk, each already carrying its
  // '+', '-' or ' ' prefix; flushed wholesale by flush_hunk.
  vector<string> hunk;

  virtual void flush_hunk(size_t pos);
  virtual void advance_to(size_t newpos);
  virtual void insert_at(size_t b_pos);
  virtual void delete_at(size_t a_pos);
  virtual ~unidiff_hunk_writer() {}
  unidiff_hunk_writer(vector<string> const & a,
                      vector<string> const & b,
                      size_t ctx,
                      ostream & ost,
                      string const & encloser_pattern)
    : hunk_consumer(a, b, ctx, ost, encloser_pattern)
  {}
};
177
178void unidiff_hunk_writer::insert_at(size_t b_pos)
179{
180 b_len++;
181 hunk.push_back(string("+") + b[b_pos]);
182}
183
184void unidiff_hunk_writer::delete_at(size_t a_pos)
185{
186 a_len++;
187 hunk.push_back(string("-") + a[a_pos]);
188}
189
// Emit the queued hunk (if any) to the output stream in unified-diff
// format, then reset hunk state so accumulation restarts at line 'pos'
// of the first file.
void unidiff_hunk_writer::flush_hunk(size_t pos)
{
  if (!hunk.empty())
    {
      // insert trailing context
      size_t a_pos = a_begin + a_len;
      for (size_t i = 0; (i < ctx) && (a_pos + i < a.size()); ++i)
        {
          hunk.push_back(string(" ") + a[a_pos + i]);
          a_len++;
          b_len++;
        }

      // write hunk to stream
      // Matching GNU diff conventions: a side with no lines prints
      // "-0,0"/"+0,0", and the ",<len>" part is omitted when the
      // length is exactly 1.
      if (a_len == 0)
        ost << "@@ -0,0";
      else
        {
          ost << "@@ -" << a_begin+1;
          if (a_len > 1)
            ost << ',' << a_len;
        }

      if (b_len == 0)
        ost << " +0,0";
      else
        {
          ost << " +" << b_begin+1;
          if (b_len > 1)
            ost << ',' << b_len;
        }

      {
        // Locate the first changed line (context lines begin with a
        // space) so find_encloser searches backward from the right
        // spot for the enclosing-scope annotation on the @@ line.
        string encloser;
        ptrdiff_t first_mod = 0;
        vector<string>::const_iterator i;
        for (i = hunk.begin(); i != hunk.end(); i++)
          if ((*i)[0] != ' ')
            {
              first_mod = i - hunk.begin();
              break;
            }

        find_encloser(a_begin + first_mod, encloser);
        ost << " @@" << encloser << '\n';
      }
      copy(hunk.begin(), hunk.end(), ostream_iterator<string>(ost, "\n"));
    }

  // reset hunk; 'skew' tracks how far the second file's numbering has
  // drifted from the first's due to hunks emitted so far.
  hunk.clear();
  skew += b_len - a_len;
  a_begin = pos;
  b_begin = pos + skew;
  a_len = 0;
  b_len = 0;
}
247
// Move the writer's position in the first file to 'newpos'.  If the gap
// since the end of the pending hunk exceeds two contexts' worth of
// lines (so the hunks' context regions could not merge), flush and
// start a new hunk with fresh leading context; otherwise emit the
// intervening lines as shared context within the same hunk.
void unidiff_hunk_writer::advance_to(size_t newpos)
{
  if (a_begin + a_len + (2 * ctx) < newpos || hunk.empty())
    {
      flush_hunk(newpos);

      // insert new leading context
      // max(ctx, newpos) - ctx clamps the start of the context window
      // at 0 without unsigned underflow.  Each pushed line moves the
      // hunk origin back by one, so the a_begin--/b_begin-- below run
      // at most newpos times and cannot wrap either.
      for (size_t p = max(ctx, newpos) - ctx;
           p < min(a.size(), newpos); ++p)
        {
          hunk.push_back(string(" ") + a[p]);
          a_begin--; a_len++;
          b_begin--; b_len++;
        }
    }
  else
    {
      // pad intermediate context
      while(a_begin + a_len < newpos)
        {
          hunk.push_back(string(" ") + a[a_begin + a_len]);
          a_len++;
          b_len++;
        }
    }
}
274
// hunk_consumer that renders hunks in context-diff format
// ("***" / "---" section headers, two-column hunk bodies).
struct cxtdiff_hunk_writer : public hunk_consumer
{
  // For context diffs, we have to queue up calls to insert_at/delete_at
  // until we hit an advance_to, so that we can get the tags right: an
  // unpaired insert gets a + in the left margin, an unpaired delete a -,
  // but if they are paired, they both get !. Hence, we have both the
  // 'inserts' and 'deletes' queues of line numbers, and the 'from_file' and
  // 'to_file' queues of line strings.
  vector<size_t> inserts;     // pending insertion line numbers (file b)
  vector<size_t> deletes;     // pending deletion line numbers (file a)
  vector<string> from_file;   // rendered old-file column of current hunk
  vector<string> to_file;     // rendered new-file column of current hunk
  // Whether the current hunk contains any real change on each side;
  // a side with no changes prints only its header, no body.
  bool have_insertions;
  bool have_deletions;

  virtual void flush_hunk(size_t pos);
  virtual void advance_to(size_t newpos);
  virtual void insert_at(size_t b_pos);
  virtual void delete_at(size_t a_pos);
  // Turn the queued inserts/deletes into tagged output lines.
  void flush_pending_mods();
  virtual ~cxtdiff_hunk_writer() {}
  cxtdiff_hunk_writer(vector<string> const & a,
                      vector<string> const & b,
                      size_t ctx,
                      ostream & ost,
                      string const & encloser_pattern)
    : hunk_consumer(a, b, ctx, ost, encloser_pattern),
      have_insertions(false), have_deletions(false)
  {}
};
305
306void cxtdiff_hunk_writer::insert_at(size_t b_pos)
307{
308 inserts.push_back(b_pos);
309 have_insertions = true;
310}
311
312void cxtdiff_hunk_writer::delete_at(size_t a_pos)
313{
314 deletes.push_back(a_pos);
315 have_deletions = true;
316}
317
// Emit the queued hunk (if any) in context-diff format, then reset
// state so accumulation restarts at line 'pos' of the first file.
void cxtdiff_hunk_writer::flush_hunk(size_t pos)
{
  // Tag and render any still-queued inserts/deletes first.
  flush_pending_mods();

  if (have_deletions || have_insertions)
    {
      // insert trailing context -- separately for each side, since the
      // two columns of a context diff are printed independently.
      // (Context diffs prefix common text with two spaces.)
      size_t ctx_start = a_begin + a_len;
      for (size_t i = 0; (i < ctx) && (ctx_start + i < a.size()); ++i)
        {
          from_file.push_back(string("  ") + a[ctx_start + i]);
          a_len++;
        }

      ctx_start = b_begin + b_len;
      for (size_t i = 0; (i < ctx) && (ctx_start + i < b.size()); ++i)
        {
          to_file.push_back(string("  ") + b[ctx_start + i]);
          b_len++;
        }

      {
        // Find the first modified line on either side (context lines
        // begin with a space) so the enclosing-scope search starts at
        // the right place; a side with no changes contributes its full
        // length, making min() pick the other side.
        string encloser;
        ptrdiff_t first_insert = b_len;
        ptrdiff_t first_delete = a_len;
        vector<string>::const_iterator i;

        if (have_deletions)
          for (i = from_file.begin(); i != from_file.end(); i++)
            if ((*i)[0] != ' ')
              {
                first_delete = i - from_file.begin();
                break;
              }
        if (have_insertions)
          for (i = to_file.begin(); i != to_file.end(); i++)
            if ((*i)[0] != ' ')
              {
                first_insert = i - to_file.begin();
                break;
              }

        find_encloser(a_begin + min(first_insert, first_delete),
                      encloser);

        ost << "***************" << encloser << '\n';
      }

      // Headers give inclusive 1-based line ranges; a changeless side
      // prints its header but omits the body lines.
      ost << "*** " << (a_begin + 1) << ',' << (a_begin + a_len) << " ****\n";
      if (have_deletions)
        copy(from_file.begin(), from_file.end(), ostream_iterator<string>(ost, "\n"));

      ost << "--- " << (b_begin + 1) << ',' << (b_begin + b_len) << " ----\n";
      if (have_insertions)
        copy(to_file.begin(), to_file.end(), ostream_iterator<string>(ost, "\n"));
    }

  // reset hunk; 'skew' tracks how far the second file's numbering has
  // drifted from the first's due to hunks emitted so far.
  to_file.clear();
  from_file.clear();
  have_insertions = false;
  have_deletions = false;
  skew += b_len - a_len;
  a_begin = pos;
  b_begin = pos + skew;
  a_len = 0;
  b_len = 0;
}
386
387void cxtdiff_hunk_writer::flush_pending_mods()
388{
389 // nothing to flush?
390 if (inserts.empty() && deletes.empty())
391 return;
392
393 string prefix;
394
395 // if we have just insertions to flush, prefix them with "+"; if
396 // just deletions, prefix with "-"; if both, prefix with "!"
397 if (inserts.empty() && !deletes.empty())
398 prefix = "-";
399 else if (deletes.empty() && !inserts.empty())
400 prefix = "+";
401 else
402 prefix = "!";
403
404 for (vector<size_t>::const_iterator i = deletes.begin();
405 i != deletes.end(); ++i)
406 {
407 from_file.push_back(prefix + string(" ") + a[*i]);
408 a_len++;
409 }
410 for (vector<size_t>::const_iterator i = inserts.begin();
411 i != inserts.end(); ++i)
412 {
413 to_file.push_back(prefix + string(" ") + b[*i]);
414 b_len++;
415 }
416
417 // clear pending mods
418 inserts.clear();
419 deletes.clear();
420}
421
// Move the writer's position in the first file to 'newpos', flushing
// the pending hunk and starting a new one (with leading context) if the
// gap exceeds two contexts' worth of lines, or padding the gap with
// shared context otherwise.
void cxtdiff_hunk_writer::advance_to(size_t newpos)
{
  // We must first flush out pending mods because otherwise our calculation
  // of whether we need to generate a new hunk header will be way off.
  // It is correct (i.e. consistent with diff(1)) to reset the +/-/!
  // generation algorithm between sub-components of a single hunk.
  flush_pending_mods();

  if (a_begin + a_len + (2 * ctx) < newpos)
    {
      flush_hunk(newpos);

      // insert new leading context
      // NOTE(review): if newpos < ctx this subtraction wraps (size_t)
      // and the whole block is skipped; that case looks unreachable
      // because reaching here already requires newpos > 2*ctx -- but
      // confirm before relying on it.
      if (newpos - ctx < a.size())
        {
          for (size_t i = ctx; i > 0; --i)
            {
              // The original test was (newpos - i < 0), but since newpos
              // is size_t (unsigned), it will never go negative. Testing
              // that newpos is smaller than i is the same test, really.
              if (newpos < i)
                continue;

              // note that context diffs prefix common text with two
              // spaces, whereas unified diffs use a single space
              from_file.push_back(string("  ") + a[newpos - i]);
              to_file.push_back(string("  ") + a[newpos - i]);
              a_begin--; a_len++;
              b_begin--; b_len++;
            }
        }
    }
  else
    // pad intermediate context
    while (a_begin + a_len < newpos)
      {
        from_file.push_back(string("  ") + a[a_begin + a_len]);
        to_file.push_back(string("  ") + a[a_begin + a_len]);
        a_len++;
        b_len++;
      }
}
464
// Write a human-readable diff between data1 and data2 to 'ost'.
//
// filename1/filename2 and id1/id2 appear only in the two header lines;
// 'type' selects unified or context output ('external_diff' is not
// handled here); 'pattern' (possibly empty) is the regex used to
// annotate hunks with an enclosing-scope line.  Binary or
// manually-merged files get a one-line "is binary" note instead of a
// textual diff.
void
make_diff(string const & filename1,
          string const & filename2,
          file_id const & id1,
          file_id const & id2,
          data const & data1,
          data const & data2,
          bool is_manual_merge,
          ostream & ost,
          diff_type type,
          string const & pattern)
{
  if (is_manual_merge || guess_binary(data1()) || guess_binary(data2()))
    {
      // If a file has been removed, filename2 will be "/dev/null".
      // It doesn't make sense to output that.
      if (filename2 == "/dev/null")
        ost << "# " << filename1 << " is binary\n";
      else
        ost << "# " << filename2 << " is binary\n";
      return;
    }

  vector<string> lines1, lines2;
  split_into_lines(data1(), lines1, split_flags::diff_compat);
  split_into_lines(data2(), lines2, split_flags::diff_compat);

  // Intern each line to a small integer so the LCS computation works
  // on cheap comparisons instead of full string compares.
  vector<long, QA(long)> left_interned;
  vector<long, QA(long)> right_interned;
  vector<long, QA(long)> lcs;

  interner<long> in;

  left_interned.reserve(lines1.size());
  for (vector<string>::const_iterator i = lines1.begin();
       i != lines1.end(); ++i)
    left_interned.push_back(in.intern(*i));

  right_interned.reserve(lines2.size());
  for (vector<string>::const_iterator i = lines2.begin();
       i != lines2.end(); ++i)
    right_interned.push_back(in.intern(*i));

  // The LCS can be at most as long as the shorter file.
  lcs.reserve(min(lines1.size(),lines2.size()));
  longest_common_subsequence(left_interned.begin(), left_interned.end(),
                             right_interned.begin(), right_interned.end(),
                             back_inserter(lcs));

  // The existence of various hacky diff parsers in the world somewhat
  // constrains what output we can use. Here are some notes on how various
  // tools interpret the header lines of a diff file:
  //
  // interdiff/filterdiff (patchutils):
  //   Attempt to parse a timestamp after each whitespace. If they succeed,
  //   then they take the filename as everything up to the whitespace they
  //   succeeded at, and the timestamp as everything after. If they fail,
  //   then they take the filename to be everything up to the first
  //   whitespace. Have hardcoded that /dev/null and timestamps at the
  //   epoch (in any timezone) indicate a file that did not exist.
  //
  //   filterdiff filters on the first filename line. interdiff matches on
  //   the first filename line.
  // PatchReader perl library (used by Bugzilla):
  //   Takes the filename to be everything up to the first tab; requires
  //   that there be a tab. Determines the filename based on the first
  //   filename line.
  // diffstat:
  //   Can handle pretty much everything; tries to read up to the first tab
  //   to get the filename. Knows that "/dev/null", "", and anything
  //   beginning "/tmp/" are meaningless. Uses the second filename line.
  // patch:
  //   If there is a tab, considers everything up to that tab to be the
  //   filename. If there is not a tab, considers everything up to the
  //   first whitespace to be the filename.
  //
  //   Contains comment: 'If the [file]name is "/dev/null", ignore the name
  //   and mark the file as being nonexistent. The name "/dev/null" appears
  //   in patches regardless of how NULL_DEVICE is spelled.' Also detects
  //   timestamps at the epoch as indicating that a file does not exist.
  //
  //   Uses the first filename line as the target, unless it is /dev/null or
  //   has an epoch timestamp in which case it uses the second.
  // trac:
  //   Anything up to the first whitespace, or end of line, is considered
  //   filename. Does not care about timestamp. Uses the shorter of the
  //   two filenames as the filename (!).
  //
  // Conclusions:
  // -- You must have a tab, both to prevent PatchReader blowing up, and
  //    to make it possible to have filenames with spaces in them.
  //    (Filenames with tabs in them are always impossible to properly
  //    express; FIXME what should be done if one occurs?)
  // -- What comes after that tab matters not at all, though it probably
  //    shouldn't look like a timestamp, or have any trailing part that
  //    looks like a timestamp, unless it really is a timestamp. Simply
  //    having a trailing tab should work fine.
  // -- If you need to express that some file does not exist, you should
  //    use /dev/null as the path. patch(1) goes so far as to claim that
  //    this is part of the diff format definition.
  // -- If you want your patches to actually _work_ with patch(1), then
  //    renames are basically hopeless (you can do them by hand _after_
  //    running patch), adds work so long as the first line says either
  //    the new file's name or "/dev/null", nothing else, and deletes work
  //    if the new file name is "/dev/null", nothing else.
  switch (type)
    {
      case unified_diff:
      {
        // Header format: "--- name<TAB>id" per the tab rules above.
        ost << "--- " << filename1 << '\t'
            << id1 << '\n';
        ost << "+++ " << filename2 << '\t'
            << id2 << '\n';

        // 3 lines of context, matching the diff(1) default.
        unidiff_hunk_writer hunks(lines1, lines2, 3, ost, pattern);
        walk_hunk_consumer(lcs, left_interned, right_interned, hunks);
        break;
      }
      case context_diff:
      {
        ost << "*** " << filename1 << '\t'
            << id1 << '\n';
        ost << "--- " << filename2 << '\t'
            << id2 << '\n';

        cxtdiff_hunk_writer hunks(lines1, lines2, 3, ost, pattern);
        walk_hunk_consumer(lcs, left_interned, right_interned, hunks);
        break;
      }
      default:
      {
        // should never reach this; the external_diff type is not
        // handled by this function.
        I(false);
      }
    }
}
601
602
603// Local Variables:
604// mode: C++
605// fill-column: 76
606// c-file-style: "gnu"
607// indent-tabs-mode: nil
608// End:
609// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status