monotone

monotone Mtn Source Tree

Root/graph.cc

#include "base.hh"

#include <iterator>
#include <list>
#include <map>
#include <utility>

#include <boost/shared_ptr.hpp>

#include "sanity.hh"
#include "graph.hh"
#include "safe_map.hh"
#include "numeric_vocab.hh"
#include "hash_map.hh"
#include "vocab_hash.hh"
13
14using boost::shared_ptr;
15using std::string;
16using std::vector;
17using std::set;
18using std::pair;
19using std::map;
20using std::multimap;
21using std::make_pair;
22using std::list;
23
24using hashmap::hash_set;
25
void
get_reconstruction_path(std::string const & start,
                        reconstruction_graph const & graph,
                        reconstruction_path & path)
{
  // This function does a breadth-first search from a starting point, until it
  // finds some node that matches an arbitrary condition. The intended usage
  // is for finding reconstruction paths in a database of deltas -- we start
  // from the node we want to reconstruct, and follow existing deltas outward
  // until we reach a full-text base. We return the shortest path from
  // 'start' to a base version.
  //
  // The algorithm involves keeping a set of parallel linear paths, starting
  // from 'start', that move forward through the DAG until we hit a base.
  //
  // On each iteration, we extend every active path by one step. If our
  // extension involves a fork, we duplicate the path. If any path
  // contains a cycle, we fault.
  //
  // If, by extending a path C, we enter a node which another path
  // D has already seen, we kill path C. This avoids the possibility of
  // exponential growth in the number of paths due to extensive forking
  // and merging.

  // Long ago, we used to do this with the boost graph library, but it
  // involved loading too much of the storage graph into memory at any
  // moment. this imperative version only loads the descendants of the
  // reconstruction node, so it is much cheaper in terms of memory.

  vector< shared_ptr<reconstruction_path> > live_paths;

  // Seed the search with a single path containing just 'start'.
  {
    shared_ptr<reconstruction_path> pth0 = shared_ptr<reconstruction_path>(new reconstruction_path());
    pth0->push_back(start);
    live_paths.push_back(pth0);
  }

  shared_ptr<reconstruction_path> selected_path;
  set<string> seen_nodes;

  while (!selected_path)
    {
      vector< shared_ptr<reconstruction_path> > next_paths;

      // Every round must make progress: an empty live set with no selected
      // path would mean there is no reachable base from 'start'.
      I(!live_paths.empty());
      for (vector<shared_ptr<reconstruction_path> >::const_iterator i = live_paths.begin();
           i != live_paths.end(); ++i)
        {
          shared_ptr<reconstruction_path> pth = *i;
          string tip = pth->back();

          if (graph.is_base(tip))
            {
              // Found a base: this 'break' leaves the for loop, and the
              // '!selected_path' test then ends the while loop.
              selected_path = pth;
              break;
            }
          else
            {
              // This tip is not a root, so extend the path.
              set<string> next;
              graph.get_next(tip, next);
              I(!next.empty());

              // Replicate the path if there's a fork.
              bool first = true;
              for (set<string>::const_iterator j = next.begin(); j != next.end(); ++j)
                {
                  L(FL("considering %s -> %s") % tip % *j);
                  if (seen_nodes.find(*j) == seen_nodes.end())
                    {
                      shared_ptr<reconstruction_path> pthN;
                      if (first)
                        {
                          // Reuse (and mutate in place) the existing path
                          // object for the first successor, to avoid a copy.
                          pthN = pth;
                          first = false;
                        }
                      else
                        {
                          // NOTE: this is not the first iteration of the loop, and
                          // the first iteration appended one item to pth. So, we
                          // want to remove one before we use it. (Why not just
                          // copy every time? Because that makes this into an
                          // O(n^2) algorithm, in the common case where there is
                          // only one direction to go at each stop.)
                          pthN = shared_ptr<reconstruction_path>(new reconstruction_path(*pth));
                          I(!pthN->empty());
                          pthN->pop_back();
                        }
                      // check for a cycle... not that anything would break if
                      // there were one, but it's nice to let us know we have a bug
                      for (reconstruction_path::const_iterator k = pthN->begin(); k != pthN->end(); ++k)
                        I(*k != *j);
                      pthN->push_back(*j);
                      next_paths.push_back(pthN);
                      seen_nodes.insert(*j);
                    }
                }
            }
        }

      I(selected_path || !next_paths.empty());
      live_paths = next_paths;
    }

  path = *selected_path;
}
132
133
134#ifdef BUILD_UNIT_TESTS
135
136#include <map>
137#include "unit_tests.hh"
138#include "randomizer.hh"
139
140#include "lexical_cast.hh"
141
142using boost::lexical_cast;
143using std::pair;
144
145typedef std::multimap<string, string> rg_map;
146struct mock_reconstruction_graph : public reconstruction_graph
147{
148 rg_map ancestry;
149 set<string> bases;
150 mock_reconstruction_graph(rg_map const & ancestry, set<string> const & bases)
151 : ancestry(ancestry), bases(bases)
152 {}
153 virtual bool is_base(string const & node) const
154 {
155 return bases.find(node) != bases.end();
156 }
157 virtual void get_next(string const & from, set<string> & next) const
158 {
159 typedef rg_map::const_iterator ci;
160 pair<ci, ci> range = ancestry.equal_range(from);
161 for (ci i = range.first; i != range.second; ++i)
162 next.insert(i->second);
163 }
164};
165
166static void
167make_random_reconstruction_graph(size_t num_nodes, size_t num_random_edges,
168 size_t num_random_bases,
169 vector<string> & all_nodes, rg_map & ancestry,
170 set<string> & bases,
171 randomizer & rng)
172{
173 for (size_t i = 0; i != num_nodes; ++i)
174 all_nodes.push_back(lexical_cast<string>(i));
175 // We put a single long chain of edges in, to make sure that everything is
176 // reconstructable somehow.
177 for (size_t i = 1; i != num_nodes; ++i)
178 ancestry.insert(make_pair(idx(all_nodes, i - 1), idx(all_nodes, i)));
179 bases.insert(all_nodes.back());
180 // Then we insert a bunch of random edges too. These edges always go
181 // forwards, to avoid creating cycles (which make get_reconstruction_path
182 // unhappy).
183 for (size_t i = 0; i != num_random_edges; ++i)
184 {
185 size_t from_idx = rng.uniform(all_nodes.size() - 1);
186 size_t to_idx = from_idx + 1 + rng.uniform(all_nodes.size() - 1 - from_idx);
187 ancestry.insert(make_pair(idx(all_nodes, from_idx),
188 idx(all_nodes, to_idx)));
189 }
190 // And a bunch of random bases.
191 for (size_t i = 0; i != num_random_bases; ++i)
192 bases.insert(idx(all_nodes, rng.uniform(all_nodes.size())));
193}
194
195static void
196check_reconstruction_path(string const & start, reconstruction_graph const & graph,
197 reconstruction_path const & path)
198{
199 I(!path.empty());
200 I(*path.begin() == start);
201 reconstruction_path::const_iterator last = path.end();
202 --last;
203 I(graph.is_base(*last));
204 for (reconstruction_path::const_iterator i = path.begin(); i != last; ++i)
205 {
206 set<string> children;
207 graph.get_next(*i, children);
208 reconstruction_path::const_iterator next = i;
209 ++next;
210 I(children.find(*next) != children.end());
211 }
212}
213
214static void
215run_get_reconstruction_path_tests_on_random_graph(size_t num_nodes,
216 size_t num_random_edges,
217 size_t num_random_bases,
218 randomizer & rng)
219{
220 vector<string> all_nodes;
221 rg_map ancestry;
222 set<string> bases;
223 make_random_reconstruction_graph(num_nodes, num_random_edges, num_random_bases,
224 all_nodes, ancestry, bases,
225 rng);
226 mock_reconstruction_graph graph(ancestry, bases);
227 for (vector<string>::const_iterator i = all_nodes.begin();
228 i != all_nodes.end(); ++i)
229 {
230 reconstruction_path path;
231 get_reconstruction_path(*i, graph, path);
232 check_reconstruction_path(*i, graph, path);
233 }
234}
235
// Exercise get_reconstruction_path on several random graphs.
// Arguments are (num_nodes, num_random_edges, num_random_bases).
UNIT_TEST(graph, random_get_reconstruction_path)
{
  randomizer rng;
  // Some arbitrary numbers.
  run_get_reconstruction_path_tests_on_random_graph(100, 100, 10, rng);
  run_get_reconstruction_path_tests_on_random_graph(100, 200, 5, rng);
  run_get_reconstruction_path_tests_on_random_graph(1000, 1000, 50, rng);
  run_get_reconstruction_path_tests_on_random_graph(1000, 2000, 100, rng);
}
245
246#endif // BUILD_UNIT_TESTS
247
248
249// graph is a parent->child map
250void toposort_rev_ancestry(rev_ancestry_map const & graph,
251 vector<revision_id> & revisions)
252{
253 typedef multimap<revision_id, revision_id>::const_iterator gi;
254 typedef map<revision_id, int>::iterator pi;
255
256 revisions.clear();
257 // determine the number of parents for each rev
258 map<revision_id, int> pcount;
259 for (gi i = graph.begin(); i != graph.end(); ++i)
260 pcount.insert(make_pair(i->first, 0));
261 for (gi i = graph.begin(); i != graph.end(); ++i)
262 ++(pcount[i->second]);
263
264 // find the set of graph roots
265 list<revision_id> roots;
266 for (pi i = pcount.begin(); i != pcount.end(); ++i)
267 if(i->second==0)
268 roots.push_back(i->first);
269
270 while (!roots.empty())
271 {
272 revision_id cur = roots.front();
273 roots.pop_front();
274 if (!null_id(cur))
275 revisions.push_back(cur);
276
277 for(gi i = graph.lower_bound(cur);
278 i != graph.upper_bound(cur); i++)
279 if(--(pcount[i->second]) == 0)
280 roots.push_back(i->second);
281 }
282}
283
284
285// get_uncommon_ancestors
286typedef std::pair<rev_height, revision_id> height_rev_pair;
287
288static void
289advance_frontier(set<height_rev_pair> & frontier,
290 hash_set<revision_id> & seen,
291 rev_graph const & rg)
292{
293 const height_rev_pair h_node = *frontier.rbegin();
294 const revision_id & node(h_node.second);
295 frontier.erase(h_node);
296 set<revision_id> parents;
297 rg.get_parents(node, parents);
298 for (set<revision_id>::const_iterator r = parents.begin();
299 r != parents.end(); r++)
300 {
301 if (seen.find(*r) == seen.end())
302 {
303 rev_height h;
304 rg.get_height(*r, h);
305 frontier.insert(make_pair(h, *r));
306 seen.insert(*r);
307 }
308 }
309}
310
// Compute the ancestors of 'a' that are not ancestors of 'b', and vice
// versa, writing them into a_uncommon_ancs / b_uncommon_ancs (both are
// cleared first). Traversal is driven by revision heights via three
// frontiers (a's, b's, and a 'common' frontier for nodes both have
// reached), always advancing whichever frontier is currently leaf-most.
void
get_uncommon_ancestors(revision_id const & a,
                       revision_id const & b,
                       rev_graph const & rg,
                       set<revision_id> & a_uncommon_ancs,
                       set<revision_id> & b_uncommon_ancs)
{
  a_uncommon_ancs.clear();
  b_uncommon_ancs.clear();

  // We extend a frontier from each revision until it reaches
  // a revision that has been seen by the other frontier. By
  // traversing in ascending height order we can ensure that
  // any common ancestor will have been 'seen' by both sides
  // before it is traversed.

  set<height_rev_pair> a_frontier, b_frontier, common_frontier;
  {
    rev_height h;
    rg.get_height(a, h);
    a_frontier.insert(make_pair(h, a));
    rg.get_height(b, h);
    b_frontier.insert(make_pair(h, b));
  }

  hash_set<revision_id> a_seen, b_seen, common_seen;
  a_seen.insert(a);
  b_seen.insert(b);

  while (!a_frontier.empty() || !b_frontier.empty())
    {
      // We take the leaf-most (ie highest) height entry from any frontier.
      // Note: the default height is the lowest possible.
      rev_height a_height, b_height, common_height;
      if (!a_frontier.empty())
        a_height = a_frontier.rbegin()->first;
      if (!b_frontier.empty())
        b_height = b_frontier.rbegin()->first;
      if (!common_frontier.empty())
        common_height = common_frontier.rbegin()->first;

      if (a_height > b_height && a_height > common_height)
        {
          // a's frontier is strictly leaf-most: its top entry cannot have
          // been reached by b yet, so record it as uncommon to a.
          a_uncommon_ancs.insert(a_frontier.rbegin()->second);
          advance_frontier(a_frontier, a_seen, rg);
        }
      else if (b_height > a_height && b_height > common_height)
        {
          // Symmetric case for b.
          b_uncommon_ancs.insert(b_frontier.rbegin()->second);
          advance_frontier(b_frontier, b_seen, rg);
        }
      else if (common_height > a_height && common_height > b_height)
        {
          // The common frontier is leaf-most: advance it so nodes reached
          // by both sides keep being absorbed.
          advance_frontier(common_frontier, common_seen, rg);
        }
      else if (a_height == b_height) // may or may not also == common_height
        {
          // if both frontiers are the same, then we can safely say that
          // we've found all uncommon ancestors. This stopping condition
          // can result in traversing more nodes than required, but is simple.
          if (a_frontier == b_frontier)
            break;

          // Equal top heights on both sides: move the top entry to the
          // common frontier and drop it from both a's and b's frontiers.
          // (NOTE(review): this treats the two tops as the same revision,
          // i.e. assumes equal heights imply equal revisions here --
          // confirm against rev_height's uniqueness guarantees.)
          common_frontier.insert(*a_frontier.rbegin());
          a_frontier.erase(*a_frontier.rbegin());
          b_frontier.erase(*b_frontier.rbegin());
        }
      else if (a_height == common_height)
        {
          // a's top is already tracked by the common frontier; drop it.
          a_frontier.erase(*a_frontier.rbegin());
        }
      else if (b_height == common_height)
        {
          // b's top is already tracked by the common frontier; drop it.
          b_frontier.erase(*b_frontier.rbegin());
        }
      else
        I(false);
    }
}
390
391#ifdef BUILD_UNIT_TESTS
392
393#include <map>
394#include "unit_tests.hh"
395#include "randomizer.hh"
396#include "roster.hh"
397
398
399static void
400get_all_ancestors(revision_id const & start, rev_ancestry_map const & child_to_parent_map,
401 set<revision_id> & ancestors)
402{
403 ancestors.clear();
404 vector<revision_id> frontier;
405 frontier.push_back(start);
406 while (!frontier.empty())
407 {
408 revision_id rid = frontier.back();
409 frontier.pop_back();
410 if (ancestors.find(rid) != ancestors.end())
411 continue;
412 safe_insert(ancestors, rid);
413 typedef rev_ancestry_map::const_iterator ci;
414 pair<ci,ci> range = child_to_parent_map.equal_range(rid);
415 for (ci i = range.first; i != range.second; ++i)
416 frontier.push_back(i->second);
417 }
418}
419
// Test double for rev_graph: wraps a child->parent multimap and assigns
// synthetic heights so get_uncommon_ancestors can be driven in-memory.
struct mock_rev_graph : rev_graph
{
  mock_rev_graph(rev_ancestry_map const & child_to_parent_map)
    : child_to_parent_map(child_to_parent_map)
  {
    // assign sensible heights
    height_map.clear();

    // toposort expects parent->child
    rev_ancestry_map parent_to_child;
    for (rev_ancestry_map::const_iterator i = child_to_parent_map.begin();
         i != child_to_parent_map.end(); i++)
      {
        parent_to_child.insert(make_pair(i->second, i->first));
      }
    vector<revision_id> topo_revs;
    toposort_rev_ancestry(parent_to_child, topo_revs);

    // this is ugly but works. just give each one a sequential number.
    // Numbering in topological order means parents are numbered before
    // their children (presumably child_height(num) orders heights by num
    // -- confirm against rev_height).
    rev_height top = rev_height::root_height();
    u32 num = 1;
    for (vector<revision_id>::const_iterator r = topo_revs.begin();
         r != topo_revs.end(); r++, num++)
      {
        height_map.insert(make_pair(*r, top.child_height(num)));
      }
  }

  // Report 'node's parents from the multimap, skipping the null revision
  // (which marks a synthetic root edge).
  virtual void get_parents(revision_id const & node, set<revision_id> & parents) const
  {
    parents.clear();
    for (rev_ancestry_map::const_iterator i = child_to_parent_map.lower_bound(node);
         i != child_to_parent_map.upper_bound(node); i++)
      {
        if (!null_id(i->second))
          safe_insert(parents, i->second);
      }
  }

  // Not used by get_uncommon_ancestors; deliberately faults if called.
  virtual void get_children(revision_id const & node, set<revision_id> & parents) const
  {
    // not required
    I(false);
  }

  // Return the synthetic height assigned in the constructor; safe_get
  // faults if 'rev' was never assigned one.
  virtual void get_height(revision_id const & rev, rev_height & h) const
  {
    MM(rev);
    h = safe_get(height_map, rev);
  }


  rev_ancestry_map const & child_to_parent_map;
  map<revision_id, rev_height> height_map;
};
475
476
477static void
478run_a_get_uncommon_ancestors_test(rev_ancestry_map const & child_to_parent_map,
479 revision_id const & left, revision_id const & right)
480{
481 set<revision_id> true_left_ancestors, true_right_ancestors;
482 get_all_ancestors(left, child_to_parent_map, true_left_ancestors);
483 get_all_ancestors(right, child_to_parent_map, true_right_ancestors);
484 set<revision_id> true_left_uncommon_ancestors, true_right_uncommon_ancestors;
485 MM(true_left_uncommon_ancestors);
486 MM(true_right_uncommon_ancestors);
487 set_difference(true_left_ancestors.begin(), true_left_ancestors.end(),
488 true_right_ancestors.begin(), true_right_ancestors.end(),
489 inserter(true_left_uncommon_ancestors, true_left_uncommon_ancestors.begin()));
490 set_difference(true_right_ancestors.begin(), true_right_ancestors.end(),
491 true_left_ancestors.begin(), true_left_ancestors.end(),
492 inserter(true_right_uncommon_ancestors, true_right_uncommon_ancestors.begin()));
493
494 set<revision_id> calculated_left_uncommon_ancestors, calculated_right_uncommon_ancestors;
495 MM(calculated_left_uncommon_ancestors);
496 MM(calculated_right_uncommon_ancestors);
497 mock_rev_graph rg(child_to_parent_map);
498 get_uncommon_ancestors(left, right, rg,
499 calculated_left_uncommon_ancestors,
500 calculated_right_uncommon_ancestors);
501 I(calculated_left_uncommon_ancestors == true_left_uncommon_ancestors);
502 I(calculated_right_uncommon_ancestors == true_right_uncommon_ancestors);
503 get_uncommon_ancestors(right, left, rg,
504 calculated_right_uncommon_ancestors,
505 calculated_left_uncommon_ancestors);
506 I(calculated_left_uncommon_ancestors == true_left_uncommon_ancestors);
507 I(calculated_right_uncommon_ancestors == true_right_uncommon_ancestors);
508}
509
510UNIT_TEST(graph, get_uncommon_ancestors_nasty_convexity_case)
511{
512 // This tests the nasty case described in the giant comment above
513 // get_uncommon_ancestors:
514 //
515 // 9
516 // |\ . Extraneous dots brought to you by the
517 // 8 \ . Committee to Shut Up the C Preprocessor
518 // /| \ . (CSUCPP), and viewers like you and me.
519 // / | |
520 // / 7 |
521 // | | |
522 // | 6 |
523 // | | |
524 // | 5 |
525 // | | |
526 // | 4 |
527 // | | |
528 // | : | <-- insert arbitrarily many revisions at the ellipsis
529 // | : |
530 // | | |
531 // 1 2 3
532 // \ / \ /
533 // L R
534
535 rev_ancestry_map child_to_parent_map;
536 revision_id left(fake_id()), right(fake_id());
537 revision_id one(fake_id()), two(fake_id()), eight(fake_id()), three(fake_id()), nine(fake_id());
538 MM(left);
539 MM(right);
540 MM(one);
541 MM(two);
542 MM(three);
543 MM(eight);
544 MM(nine);
545 child_to_parent_map.insert(make_pair(left, one));
546 child_to_parent_map.insert(make_pair(one, eight));
547 child_to_parent_map.insert(make_pair(eight, nine));
548 child_to_parent_map.insert(make_pair(right, three));
549 child_to_parent_map.insert(make_pair(three, nine));
550
551 revision_id middle(fake_id());
552 child_to_parent_map.insert(make_pair(left, two));
553 child_to_parent_map.insert(make_pair(right, two));
554 // We insert a _lot_ of revisions at the ellipsis, to make sure that
555 // whatever sort of step-size is used on the expansion, we can't take the
556 // entire middle portion in one big gulp and make the test pointless.
557 for (int i = 0; i != 1000; ++i)
558 {
559 revision_id next(fake_id());
560 child_to_parent_map.insert(make_pair(middle, next));
561 middle = next;
562 }
563 child_to_parent_map.insert(make_pair(middle, eight));
564
565 run_a_get_uncommon_ancestors_test(child_to_parent_map, left, right);
566}
567
// Tuning knobs for the random graph generator below.
double const new_root_freq = 0.05;  // chance a new node starts a fresh root
double const merge_node_freq = 0.2; // chance a new node is a two-parent merge
double const skip_up_freq = 0.5;    // chance to keep climbing when picking an ancestor
571
572static revision_id
573pick_node_from_set(set<revision_id> const & heads,
574 randomizer & rng)
575{
576 I(!heads.empty());
577 size_t which_start = rng.uniform(heads.size());
578 set<revision_id>::const_iterator i = heads.begin();
579 for (size_t j = 0; j != which_start; ++j)
580 ++i;
581 return *i;
582}
583
584static revision_id
585pick_node_or_ancestor(set<revision_id> const & heads,
586 rev_ancestry_map const & child_to_parent_map,
587 randomizer & rng)
588{
589 revision_id rev = pick_node_from_set(heads, rng);
590 // now we recurse up from this starting point
591 while (rng.bernoulli(skip_up_freq))
592 {
593 typedef rev_ancestry_map::const_iterator ci;
594 pair<ci,ci> range = child_to_parent_map.equal_range(rev);
595 if (range.first == range.second)
596 break;
597 ci second = range.first;
598 ++second;
599 if (second == range.second)
600 // there was only one parent
601 rev = range.first->second;
602 else
603 {
604 // there are two parents, pick one randomly
605 if (rng.flip())
606 rev = range.first->second;
607 else
608 rev = second->second;
609 }
610 }
611 return rev;
612}
613
614static void
615make_random_graph(size_t num_nodes,
616 rev_ancestry_map & child_to_parent_map,
617 vector<revision_id> & nodes,
618 randomizer & rng)
619{
620 set<revision_id> heads;
621
622 for (size_t i = 0; i != num_nodes; ++i)
623 {
624 revision_id new_rid = revision_id(fake_id());
625 nodes.push_back(new_rid);
626 set<revision_id> parents;
627 if (heads.empty() || rng.bernoulli(new_root_freq))
628 parents.insert(revision_id());
629 else if (rng.bernoulli(merge_node_freq) && heads.size() > 1)
630 {
631 // maybe we'll pick the same node twice and end up not doing a
632 // merge, oh well...
633 parents.insert(pick_node_from_set(heads, rng));
634 parents.insert(pick_node_from_set(heads, rng));
635 }
636 else
637 {
638 parents.insert(pick_node_or_ancestor(heads, child_to_parent_map, rng));
639 }
640 for (set<revision_id>::const_iterator j = parents.begin();
641 j != parents.end(); ++j)
642 {
643 heads.erase(*j);
644 child_to_parent_map.insert(std::make_pair(new_rid, *j));
645 }
646 safe_insert(heads, new_rid);
647 }
648}
649
650static void
651run_a_get_uncommon_ancestors_random_test(size_t num_nodes,
652 size_t iterations,
653 randomizer & rng)
654{
655 rev_ancestry_map child_to_parent_map;
656 vector<revision_id> nodes;
657 make_random_graph(num_nodes, child_to_parent_map, nodes, rng);
658 for (size_t i = 0; i != iterations; ++i)
659 {
660 L(FL("get_uncommon_ancestors: random test %s-%s") % num_nodes % i);
661 revision_id left = idx(nodes, rng.uniform(nodes.size()));
662 revision_id right = idx(nodes, rng.uniform(nodes.size()));
663 run_a_get_uncommon_ancestors_test(child_to_parent_map, left, right);
664 }
665}
666
// Randomized stress test for get_uncommon_ancestors.
// Arguments are (num_nodes, iterations).
UNIT_TEST(graph, get_uncommon_ancestors_randomly)
{
  randomizer rng;
  run_a_get_uncommon_ancestors_random_test(100, 100, rng);
  run_a_get_uncommon_ancestors_random_test(1000, 100, rng);
  run_a_get_uncommon_ancestors_random_test(10000, 1000, rng);
}
674
675
676#endif // BUILD_UNIT_TESTS
677
678// Local Variables:
679// mode: C++
680// fill-column: 76
681// c-file-style: "gnu"
682// indent-tabs-mode: nil
683// End:
684// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status