Root/rcs_import.cc

1// -*- mode: C++; c-file-style: "gnu"; indent-tabs-mode: nil -*-
2// copyright (C) 2002, 2003, 2004 graydon hoare <graydon@pobox.com>
3// all rights reserved.
4// licensed to the public under the terms of the GNU GPL (>= 2)
5// see the file COPYING for details
6
7#include <algorithm>
8#include <iostream>
9#include <iterator>
10#include <list>
11#include <map>
12#include <set>
13#include <sstream>
14#include <stack>
15#include <stdexcept>
16#include <string>
17#include <vector>
18
19#include <unistd.h>
20
21#include <boost/shared_ptr.hpp>
22#include <boost/scoped_ptr.hpp>
23#include <boost/lexical_cast.hpp>
24#include <boost/tokenizer.hpp>
25
26#include "app_state.hh"
27#include "cert.hh"
28#include "constants.hh"
29#include "cycle_detector.hh"
30#include "database.hh"
31#include "file_io.hh"
32#include "keys.hh"
33#include "interner.hh"
34#include "manifest.hh"
35#include "packet.hh"
36#include "rcs_file.hh"
37#include "sanity.hh"
38#include "transforms.hh"
39#include "ui.hh"
40#include "platform.hh"
41#include "paths.hh"
42
43using namespace std;
44using boost::shared_ptr;
45using boost::scoped_ptr;
46
47// cvs history recording stuff
48
49typedef unsigned long cvs_branchname;
50typedef unsigned long cvs_author;
51typedef unsigned long cvs_changelog;
52typedef unsigned long cvs_version;
53typedef unsigned long cvs_path;
54typedef unsigned long cvs_tag;
55
56struct cvs_history;
57
58struct
59cvs_commit
60{
61 cvs_commit(rcs_file const & r,
62 string const & rcs_version,
63 file_id const & ident,
64 cvs_history & cvs);
65
66 bool is_synthetic_branch_root;
67 time_t time;
68 bool alive;
69 cvs_author author;
70 cvs_changelog changelog;
71 cvs_version version;
72 cvs_path path;
73 vector<cvs_tag> tags;
74
75 bool operator<(cvs_commit const & other) const
76 {
77 return time < other.time;
78 }
79};
80
81struct
82cvs_branch
83{
84 bool has_a_branchpoint;
85 bool has_a_commit;
86 time_t last_branchpoint;
87 time_t first_commit;
88
89 map<cvs_path, cvs_version> live_at_beginning;
90 vector<cvs_commit> lineage;
91
92 cvs_branch()
93 : has_a_branchpoint(false),
94 has_a_commit(false),
95 last_branchpoint(0),
96 first_commit(0)
97 {
98 }
99
100 void note_commit(time_t now)
101 {
102 if (!has_a_commit)
103 {
104 first_commit = now;
105 }
106 else
107 {
108 if (now < first_commit)
109 first_commit = now;
110 }
111 has_a_commit = true;
112 }
113
114 void note_branchpoint(time_t now)
115 {
116 has_a_branchpoint = true;
117 if (now > last_branchpoint)
118 last_branchpoint = now;
119 }
120
121 time_t beginning() const
122 {
123 I(has_a_branchpoint || has_a_commit);
124 if (has_a_commit)
125 {
126 I(first_commit != 0);
127 return first_commit;
128 }
129 else
130 {
131 I(last_branchpoint != 0);
132 return last_branchpoint;
133 }
134 }
135
136 void append_commit(cvs_commit const & c)
137 {
138 I(c.time != 0);
139 note_commit(c.time);
140 lineage.push_back(c);
141 }
142};
143
144struct
145cvs_history
146{
147
148 interner<unsigned long> branch_interner;
149 interner<unsigned long> author_interner;
150 interner<unsigned long> changelog_interner;
151 interner<unsigned long> file_version_interner;
152 interner<unsigned long> path_interner;
153 interner<unsigned long> tag_interner;
154 interner<unsigned long> manifest_version_interner;
155
156 cycle_detector<unsigned long> manifest_cycle_detector;
157
158 // assume admin has foo:X.Y.0.N in it, then
159 // this multimap contains entries of the form
160 // X.Y -> foo
161 multimap<string, string> branchpoints;
162
163 // and this map contains entries of the form
164 // X.Y.N.1 -> foo
165 map<string, string> branch_first_entries;
166
167 // branch name -> branch
168 map<string, shared_ptr<cvs_branch> > branches;
169 shared_ptr<cvs_branch> trunk;
170
171 // stack of branches we're injecting states into
172 stack< shared_ptr<cvs_branch> > stk;
173 stack< cvs_branchname > bstk;
174
175 // tag -> time, revision
176 //
177 // used to resolve the *last* revision which has a given tag
178 // applied; this is the revision which wins the tag.
179 map<unsigned long, pair<time_t, revision_id> > resolved_tags;
180
181 file_path curr_file;
182 cvs_path curr_file_interned;
183
184 string base_branch;
185
186 ticker n_versions;
187 ticker n_tree_branches;
188
189 cvs_history();
190 void set_filename(string const & file,
191 file_id const & ident);
192
193 void index_branchpoint_symbols(rcs_file const & r);
194
195 void push_branch(string const & branch_name, bool private_branch);
196 void pop_branch();
197};
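// A hedged illustration of the two symbol maps above (file and branch names
// are hypothetical, not taken from any real repository): if foo.c,v carries
// the symbol "beta -> 1.6.0.3", then after index_branchpoint_symbols() runs,
// branchpoints holds the entry "1.6" -> "beta" and branch_first_entries
// holds "1.6.3.1" -> "beta".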
198
199
200static bool
201is_sbr(shared_ptr<rcs_delta> dl,
202 shared_ptr<rcs_deltatext> dt)
203{
204
205 // CVS abuses the RCS format a bit (ha!) when storing a file which
206 // was only added on a branch: on the root of the branch there'll be
207 // a commit with dead state, empty text, and a log message
208 // containing the string "file foo was initially added on branch
209 // bar". We recognize and ignore these cases, as they do not
210 // "really" represent commits to be clustered together.
211
212 if (dl->state != "dead")
213 return false;
214
215 if (!dt->text.empty())
216 return false;
217
218 string log_bit = "was initially added on branch";
219 string::const_iterator i = search(dt->log.begin(),
220 dt->log.end(),
221 log_bit.begin(),
222 log_bit.end());
223
224 return i != dt->log.end();
225}
226
227
228cvs_commit::cvs_commit(rcs_file const & r,
229 string const & rcs_version,
230 file_id const & ident,
231 cvs_history & cvs)
232{
233 map<string, shared_ptr<rcs_delta> >::const_iterator delta =
234 r.deltas.find(rcs_version);
235 I(delta != r.deltas.end());
236
237 map<string, shared_ptr<rcs_deltatext> >::const_iterator deltatext =
238 r.deltatexts.find(rcs_version);
239 I(deltatext != r.deltatexts.end());
240
241 struct tm t;
242 // We need to initialize t to all zeros, because strptime has a habit of
243 // leaving bits of the data structure alone, letting garbage sneak into
244 // our output.
245 memset(&t, 0, sizeof(t));
246 char const * dp = delta->second->date.c_str();
247 L(F("Calculating time of %s\n") % dp);
248#ifdef HAVE_STRPTIME
249 if (strptime(dp, "%y.%m.%d.%H.%M.%S", &t) == NULL)
250 I(strptime(dp, "%Y.%m.%d.%H.%M.%S", &t) != NULL);
251#else
252 I(sscanf(dp, "%d.%d.%d.%d.%d.%d", &(t.tm_year), &(t.tm_mon),
253 &(t.tm_mday), &(t.tm_hour), &(t.tm_min), &(t.tm_sec))==6);
254 t.tm_mon--;
255  // Some RCS files have two-digit years, others four; struct tm always
256  // wants the year expressed as an offset from 1900.
257 if (t.tm_year > 1900)
258 t.tm_year-=1900;
259#endif
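  // For illustration only (the dates are hypothetical): both parse paths
  // above must normalize the same instant, whether the RCS file spells it
  // "97.11.30.22.04.05" (two-digit year, i.e. 1997) or
  // "2003.11.30.22.04.05" (four-digit year), before mktime() is applied.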
260 time = mktime(&t);
261 L(boost::format("= %i\n") % time);
262
263 is_synthetic_branch_root = is_sbr(delta->second,
264 deltatext->second);
265
266 alive = delta->second->state != "dead";
267 if (is_synthetic_branch_root)
268 changelog = cvs.changelog_interner.intern("synthetic branch root changelog");
269 else
270 changelog = cvs.changelog_interner.intern(deltatext->second->log);
271 author = cvs.author_interner.intern(delta->second->author);
272 path = cvs.curr_file_interned;
273 version = cvs.file_version_interner.intern(ident.inner()());
274
275 typedef multimap<string,string>::const_iterator ity;
276 pair<ity,ity> range = r.admin.symbols.equal_range(rcs_version);
277 for (ity i = range.first; i != range.second; ++i)
278 {
279 if (i->first == rcs_version)
280 {
281 L(F("version %s -> tag %s\n") % rcs_version % i->second);
282 tags.push_back(cvs.tag_interner.intern(i->second));
283 }
284 }
285
286}
287
288// piece table stuff
289#include "piece_table.hh"
290
291static void
292construct_version(piece::piece_table const & source_lines,
293 string const & dest_version,
294 piece::piece_table & dest_lines,
295 rcs_file const & r)
296{
297 I(r.deltas.find(dest_version) != r.deltas.end());
298// shared_ptr<rcs_delta> delta = r.deltas.find(dest_version)->second;
299
300 I(r.deltatexts.find(dest_version) != r.deltatexts.end());
301 shared_ptr<rcs_deltatext> deltatext = r.deltatexts.find(dest_version)->second;
302
303 piece::apply_diff(source_lines, dest_lines, deltatext->text);
304}
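// A sketch of the data flow here, assuming the piece_table interface
// declared in piece_table.hh: the head revision's full text is indexed into
// a piece table once, and each construct_version() call applies the RCS
// 'a'/'d' edit commands found in deltatext->text to source_lines, producing
// dest_lines for the next (older, or branch-root) revision without copying
// the underlying line data.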
305
306// FIXME: should these be someplace else? using 'friend' to reach into the
307// DB is stupid, but it's also stupid to put raw edge insert methods on the
308// DB itself. or is it? hmm.. encapsulation vs. usage guidance..
309void
310rcs_put_raw_file_edge(hexenc<id> const & old_id,
311 hexenc<id> const & new_id,
312 delta const & del,
313 database & db)
314{
315 if (old_id == new_id)
316 {
317 L(F("skipping identity file edge\n"));
318 return;
319 }
320
321 if (db.file_version_exists(old_id))
322 {
323 // we already have a way to get to this old version,
324 // no need to insert another reconstruction path
325 L(F("existing path to %s found, skipping\n") % old_id);
326 }
327 else
328 {
329 I(db.exists(new_id, "files")
330 || db.delta_exists(new_id, "file_deltas"));
331 db.put_delta(old_id, new_id, del, "file_deltas");
332 }
333}
334
335void
336rcs_put_raw_manifest_edge(hexenc<id> const & old_id,
337 hexenc<id> const & new_id,
338 delta const & del,
339 database & db)
340{
341 if (old_id == new_id)
342 {
343 L(F("skipping identity manifest edge\n"));
344 return;
345 }
346
347 if (db.manifest_version_exists(old_id))
348 {
349 // we already have a way to get to this old version,
350 // no need to insert another reconstruction path
351 L(F("existing path to %s found, skipping\n") % old_id);
352 }
353 else
354 {
355 db.put_delta(old_id, new_id, del, "manifest_deltas");
356 }
357}
358
359
360static void
361insert_into_db(data const & curr_data,
362 hexenc<id> const & curr_id,
363 piece::piece_table const & next_lines,
364 data & next_data,
365 hexenc<id> & next_id,
366 database & db)
367{
368 // inserting into the DB
369 // note: curr_lines is a "new" (base) version
370 // and next_lines is an "old" (derived) version.
371 // all storage edges go from new -> old.
372 {
373 string tmp;
374 piece::build_string(next_lines, tmp);
375 next_data = tmp;
376 }
377 delta del;
378 diff(curr_data, next_data, del);
379 calculate_ident(next_data, next_id);
380 rcs_put_raw_file_edge(next_id, curr_id, del, db);
381}
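// A concrete reading of the comment above (version numbers are illustrative):
// if version 1.3 of a file is already present and 1.2 is reconstructed next,
// diff(curr_data /* 1.3 */, next_data /* 1.2 */, del) yields the delta that
// rebuilds 1.2 from 1.3, and rcs_put_raw_file_edge() records it against the
// newer version, matching the "new -> old" storage-edge convention noted
// in the comment above.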
382
383
384
385/*
386
387please read this exhaustingly long comment and understand it
388before mucking with the branch inference logic.
389
390we are processing a file version. a branch might begin here. if
391the current version is X.Y, then there is a branch B starting
392here iff there is a symbol in the admin section called X.Y.0.Z,
393where Z is the branch number (or if there is a private branch
394called X.Y.Z, which is either an import branch or some private
395RCS cruft).
396
397the version X.Y is then considered the branchpoint of B in the
398current file. this does *not* mean that the CVS key -- an
399abstraction representing whole-tree operations -- of X.Y is the
400branchpoint across the CVS archive we're processing.
401
402in fact, CVS does not record the occurrence of a branching
403action (tag -b). we have no idea who executed that command and
404when. what we know instead is the commit X.Y immediately
405preceeding the branch -- CVS consideres this the branchpoint --
406in this file's reduced view of history. we also know the first
407commit X.Y.Z.1 inside the branch (which might not exist).
408
409our old strategy was to consider all branches nested in a
410hierarchy, which was a super-tree of all the branch trees in all
411the CVS files in a repository. this involved considering X.Y as
412the parent version of branch X.Y.Z, and selecting "the"
413branchpoint connecting the two as the least CVS key X.Y.Z.1
414committed inside the branch B.
415
416this was a mistake, for two significant reasons.
417
418first, some files do not *have* any commit inside the branch B,
419only a branchpoint X.Y.0.Z. this branchpoint is actually the
420last commit *before* the user branched, and could be a very old
421commit, long before the branch was formed, so it is useless in
422determining the branch structure.
423
424second, some files do not have a branch B, or worse, have
425branched into B from an "ancestor" branch A, where a different
426file branches into B from a different ancestor branch C. in
427other words, while there *is* a tree structure within the X.Y.Z
428branches of each file, there is *no* shared tree structure
429between the branch names across a repository. in one file A can
430be an ancestor of B, in another file B can be an ancestor of A.
431
432thus, we give up on establishing a hierarchy between branches
433altogether. all branches exist in a flat namespace, and all are
434direct descendants of the empty revision at the root of
435history. each branchpoint symbol mentioned in the
436administrative section of a file is considered the root of a new
437lineage.
438
439*/
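// A worked example of the numbering discussed above (file and branch names
// are hypothetical): if foo.c carries the symbol "stable-1 -> 1.4.0.2", then
// 1.4 is foo.c's branchpoint for "stable-1" and the first commit on that
// branch, if it exists, is 1.4.2.1.  Another file bar.c might carry
// "stable-1 -> 1.9.0.4", giving an unrelated branchpoint 1.9 and first
// commit 1.9.4.1; since the two files need not even agree on which branch
// "stable-1" sprouted from, no tree-wide hierarchy is inferred and every
// branch hangs directly off the root of history.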
440
441
442static void
443process_branch(string const & begin_version,
444 piece::piece_table const & begin_lines,
445 data const & begin_data,
446 hexenc<id> const & begin_id,
447 rcs_file const & r,
448 database & db,
449 cvs_history & cvs)
450{
451 string curr_version = begin_version;
452 scoped_ptr< piece::piece_table > next_lines(new piece::piece_table);
453 scoped_ptr< piece::piece_table > curr_lines(new piece::piece_table
454 (begin_lines.begin(),
455 begin_lines.end()));
456 data curr_data(begin_data), next_data;
457 hexenc<id> curr_id(begin_id), next_id;
458
459 while(! (r.deltas.find(curr_version) == r.deltas.end()))
460 {
461 L(F("version %s has %d lines\n") % curr_version % curr_lines->size());
462
463 cvs_commit curr_commit(r, curr_version, curr_id, cvs);
464 if (!curr_commit.is_synthetic_branch_root)
465 {
466 cvs.stk.top()->append_commit(curr_commit);
467 ++cvs.n_versions;
468 }
469
470 string next_version = r.deltas.find(curr_version)->second->next;
471
472 if (! next_version.empty())
473 {
474 L(F("following RCS edge %s -> %s\n") % curr_version % next_version);
475
476 construct_version(*curr_lines, next_version, *next_lines, r);
477 L(F("constructed RCS version %s, inserting into database\n") %
478 next_version);
479
480 insert_into_db(curr_data, curr_id,
481 *next_lines, next_data, next_id, db);
482 }
483
484 // mark the beginning-of-branch time and state of this file if
485 // we're at a branchpoint
486 typedef multimap<string,string>::const_iterator ity;
487 pair<ity,ity> range = cvs.branchpoints.equal_range(curr_version);
488 if (range.first != cvs.branchpoints.end()
489 && range.first->first == curr_version)
490 {
491 for (ity i = range.first; i != range.second; ++i)
492 {
493 cvs.push_branch(i->second, false);
494 shared_ptr<cvs_branch> b = cvs.stk.top();
495 if (curr_commit.alive)
496 b->live_at_beginning[cvs.curr_file_interned] = curr_commit.version;
497 b->note_branchpoint(curr_commit.time);
498 cvs.pop_branch();
499 }
500 }
501
502
503 // recursively follow any branch commits coming from the branchpoint
504 boost::shared_ptr<rcs_delta> curr_delta = r.deltas.find(curr_version)->second;
505 for(vector<string>::const_iterator i = curr_delta->branches.begin();
506 i != curr_delta->branches.end(); ++i)
507 {
508 string branch;
509 data branch_data;
510 hexenc<id> branch_id;
511 piece::piece_table branch_lines;
512 bool priv = false;
513 map<string, string>::const_iterator be = cvs.branch_first_entries.find(*i);
514
515 if (be != cvs.branch_first_entries.end())
516 branch = be->second;
517 else
518 priv = true;
519
520 L(F("following RCS branch %s = '%s'\n") % (*i) % branch);
521
522 construct_version(*curr_lines, *i, branch_lines, r);
523 insert_into_db(curr_data, curr_id,
524 branch_lines, branch_data, branch_id, db);
525
526 cvs.push_branch(branch, priv);
527 process_branch(*i, branch_lines, branch_data, branch_id, r, db, cvs);
528 cvs.pop_branch();
529
530 L(F("finished RCS branch %s = '%s'\n") % (*i) % branch);
531 }
532
533 if (!r.deltas.find(curr_version)->second->next.empty())
534 {
535 // advance
536 curr_data = next_data;
537 curr_id = next_id;
538 curr_version = next_version;
539 swap(next_lines, curr_lines);
540 next_lines->clear();
541 }
542 else break;
543 }
544}
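// A hedged sketch of the traversal performed above (revision numbers are
// illustrative): starting from the head, say 1.5, process_branch() walks the
// trunk deltas 1.5 -> 1.4 -> ... -> 1.1, and whenever a delta lists branch
// children (say 1.4.2.1 under 1.4) it recurses into that branch before
// continuing down the trunk, pushing and popping the matching cvs_branch on
// cvs.stk around the recursive call.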
545
546
547static void
548import_rcs_file_with_cvs(string const & filename, database & db, cvs_history & cvs)
549{
550 rcs_file r;
551 L(F("parsing RCS file %s\n") % filename);
552 parse_rcs_file(filename, r);
553 L(F("parsed RCS file %s OK\n") % filename);
554
555 {
556 piece::piece_table head_lines;
557 I(r.deltatexts.find(r.admin.head) != r.deltatexts.end());
558 I(r.deltas.find(r.admin.head) != r.deltas.end());
559
560 hexenc<id> id;
561 data dat(r.deltatexts.find(r.admin.head)->second->text);
562 calculate_ident(dat, id);
563 file_id fid = id;
564
565 cvs.set_filename (filename, fid);
566 cvs.index_branchpoint_symbols (r);
567
568 if (! db.file_version_exists (fid))
569 {
570 db.put_file(fid, dat);
571 }
572
573 {
574 // create the head state in case it is a loner
575 // cvs_key k;
576 // shared_ptr<cvs_state> s;
577 // L(F("noting head version %s : %s\n") % cvs.curr_file % r.admin.head);
578 // cvs.find_key_and_state (r, r.admin.head, k, s);
579 }
580
581 piece::reset();
582 piece::index_deltatext(r.deltatexts.find(r.admin.head)->second->text, head_lines);
583 process_branch(r.admin.head, head_lines, dat, id, r, db, cvs);
584 piece::reset();
585 }
586
587 ui.set_tick_trailer("");
588}
589
590
591void
592test_parse_rcs_file(system_path const & filename, database & db)
593{
594 cvs_history cvs;
595
596 I(! filename.empty());
597 assert_path_is_file(filename);
598
599 P(F("parsing RCS file %s\n") % filename);
600 rcs_file r;
601 parse_rcs_file(filename.as_external(), r);
602 P(F("parsed RCS file %s OK\n") % filename);
603}
604
605
606// CVS importing stuff follows
607
608
609static void
610split_version(string const & v, vector<string> & vs)
611{
612 vs.clear();
613 boost::char_separator<char> sep(".");
614 typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
615 tokenizer tokens(v, sep);
616 copy(tokens.begin(), tokens.end(), back_inserter(vs));
617}
618
619static void
620join_version(vector<string> const & vs, string & v)
621{
622 v.clear();
623 for (vector<string>::const_iterator i = vs.begin();
624 i != vs.end(); ++i)
625 {
626 if (i != vs.begin())
627 v += ".";
628 v += *i;
629 }
630}
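// A hedged usage sketch (values are illustrative): split_version("1.3.0.2",
// vs) leaves vs == {"1", "3", "0", "2"}, and join_version on {"1", "3"}
// yields "1.3"; index_branchpoint_symbols() below relies on exactly this
// round trip to turn symbol numbers into branchpoint and first-entry keys.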
631
632cvs_history::cvs_history() :
633 n_versions("versions", "v", 1),
634 n_tree_branches("branches", "b", 1)
635{
636}
637
638void
639cvs_history::set_filename(string const & file,
640 file_id const & ident)
641{
642 L(F("importing file '%s'\n") % file);
643 I(file.size() > 2);
644 I(file.substr(file.size() - 2) == string(",v"));
645 string ss = file;
646 ui.set_tick_trailer(ss);
647 ss.resize(ss.size() - 2);
648 // remove Attic/ if present
649 std::string::size_type last_slash=ss.rfind('/');
650 if (last_slash!=std::string::npos && last_slash>=5
651 && ss.substr(last_slash-5,6)=="Attic/")
652 ss.erase(last_slash-5,6);
653 curr_file = file_path_internal(ss);
654 curr_file_interned = path_interner.intern(ss);
655}
656
657void cvs_history::index_branchpoint_symbols(rcs_file const & r)
658{
659 branchpoints.clear();
660 branch_first_entries.clear();
661
662 for (std::multimap<std::string, std::string>::const_iterator i =
663 r.admin.symbols.begin(); i != r.admin.symbols.end(); ++i)
664 {
665 std::string const & num = i->first;
666 std::string const & sym = i->second;
667
668 vector<string> components;
669 split_version(num, components);
670
671 vector<string> first_entry_components;
672 vector<string> branchpoint_components;
673
674 if (components.size() > 2 &&
675 (components.size() % 2 == 1))
676 {
677 // this is a "vendor" branch
678 //
679 // such as "1.1.1", where "1.1" is the branchpoint and
680 // "1.1.1.1" will be the first commit on it.
681
682 first_entry_components = components;
683 first_entry_components.push_back("1");
684
685 branchpoint_components = components;
686 branchpoint_components.erase(branchpoint_components.end() - 1,
687 branchpoint_components.end());
688
689 }
690
691 else if (components.size() > 2 &&
692 (components.size() % 2 == 0) &&
693 components[components.size() - 2] == string("0"))
694 {
695 // this is a "normal" branch
696 //
697 // such as "1.3.0.2", where "1.3" is the branchpoint and
698 // "1.3.2.1"
699
700 first_entry_components = components;
701 first_entry_components[first_entry_components.size() - 2]
702 = first_entry_components[first_entry_components.size() - 1];
703 first_entry_components[first_entry_components.size() - 1]
704 = string("1");
705
706 branchpoint_components = components;
707 branchpoint_components.erase(branchpoint_components.end() - 2,
708 branchpoint_components.end());
709 }
710
711 string first_entry_version;
712 join_version(first_entry_components, first_entry_version);
713
714 L(F("first version in branch %s would be %s\n")
715 % sym % first_entry_version);
716 branch_first_entries.insert(make_pair(first_entry_version, sym));
717
718 string branchpoint_version;
719 join_version(branchpoint_components, branchpoint_version);
720
721 L(F("file branchpoint for %s at %s\n") % sym % branchpoint_version);
722 branchpoints.insert(make_pair(branchpoint_version, sym));
723 }
724}
725
726
727
728void
729cvs_history::push_branch(string const & branch_name, bool private_branch)
730{
731 shared_ptr<cvs_branch> branch;
732
733 string bname = base_branch + "." + branch_name;
734 I(stk.size() > 0);
735
736 if (private_branch)
737 {
738 branch = shared_ptr<cvs_branch>(new cvs_branch());
739 stk.push(branch);
740 bstk.push(branch_interner.intern(""));
741 return;
742 }
743 else
744 {
745 map<string, shared_ptr<cvs_branch> >::const_iterator b = branches.find(bname);
746 if (b == branches.end())
747 {
748 branch = shared_ptr<cvs_branch>(new cvs_branch());
749 branches.insert(make_pair(bname, branch));
750 ++n_tree_branches;
751 }
752 else
753 branch = b->second;
754
755 stk.push(branch);
756 bstk.push(branch_interner.intern(bname));
757 }
758}
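// For illustration (branch names hypothetical): with a base --branch of
// "com.example.project" and a CVS branch symbol "beta", commits pushed via
// push_branch("beta", false) accumulate under the monotone branch name
// "com.example.project.beta".  A private branch (an RCS branch with no
// symbol) instead gets a throwaway in-memory cvs_branch that is never
// entered into the branches map, so it yields no revisions of its own.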
759
760void
761cvs_history::pop_branch()
762{
763 I(stk.size() > 1);
764 stk.pop();
765 bstk.pop();
766}
767
768
769class
770cvs_tree_walker
771 : public tree_walker
772{
773 cvs_history & cvs;
774 database & db;
775public:
776 cvs_tree_walker(cvs_history & c, database & d) :
777 cvs(c), db(d)
778 {
779 }
780 virtual void visit_file(file_path const & path)
781 {
782 string file = path.as_external();
783 if (file.substr(file.size() - 2) == string(",v"))
784 {
785 try
786 {
787 import_rcs_file_with_cvs(file, db, cvs);
788 }
789 catch (oops const & o)
790 {
791 W(F("error reading RCS file %s: %s\n") % file % o.what());
792 }
793 }
794 else
795 L(F("skipping non-RCS file %s\n") % file);
796 }
797 virtual ~cvs_tree_walker() {}
798};
799
800
801
802
803//
804// our task here is to produce a sequence of revision descriptions
805// from the per-file commit records we have. we do this by rolling
806// forwards through the temporally sorted file-commit list
807// accumulating file-commits into revisions and flushing the
808// revisions when we feel they are "complete".
809//
810// revisions have to have a time associated with them. this time
811// will be the first time of any commit associated with the
812// revision. they have an author and a changelog, which is shared
813// by all the file-commits in the revision.
814//
815// there might be multiple revisions overlapping in time. this is
816// legal wrt. CVS. we keep a set, and search all members of the set
817// for the best match.
818//
819// consider this situation of overlapping revisions:
820//
821// +---------------+ +---------------+ +---------------+
822// | rev #1 @ 0011 | | rev #2 @ 0012 | | rev #3 @ 0013 |
823// |~~~~~~~~~~~~~~~| |~~~~~~~~~~~~~~~| |~~~~~~~~~~~~~~~|
824// | patch foo.txt | | patch bar.txt | | patch baz.txt |
825// +---------------+ +---------------+ +---------------+
826//
827// suppose you have this situation and you run across a "patch
828// bar.txt" commit at timestamp 0014. what do you do?
829//
830// - you know that rev #2 cannot accept this commit, simply because
831//   two commits on the same file make *two* revisions, not one.
832//
833// - perhaps rev #3 could accept it; after all, it could be that the
834// commit associated with rev #2 released its commit lock, and the
835// commit associated with rev #3 quickly updated and committed at
836// 0013, finishing off at 0014.
837//
838// - can rev #1 accept it? no. because CVS calculated the version it
839// expected to see in bar.txt before calling up the server, when
840// committing rev #1. the version it expected to see was the version
841//   in bar.txt *before* time 0012; that is, before rev #2 had any effect
842// on bar.txt. when it contacted the server, the commit associated
843// with rev #1 would have aborted if it had seen any other number.
844//   so rev #1 cannot have started before rev #2's edit to bar.txt and
845//   then also include its own, later edit to bar.txt.
846//
847// so we have only one case where bar.txt can be accepted. if the
848// commit is not accepted into a legal rev (outside the window,
849// wrong changelog/author) it starts a new revision.
850//
851// as we scan forwards, if we hit timestamps which lie beyond rev #n's
852// window, we flush rev #n.
853//
854// if there are multiple coincident and legal revs to direct a
855// commit to (all with the same author/changelog), we direct the
856// commit to the rev with the closest initial timestamp. that is,
857// the *latest* beginning time.
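// A small worked illustration of the window rule implemented below (the
// timestamps are made up; the actual window width comes from
// constants::cvs_window): commits to a.c at t=1000 and to b.c at t=1100 by
// the same author with the same changelog coalesce into one cluster, but a
// second commit to a.c at t=1150, even if still inside the window, must open
// a new cluster, because a cluster may touch each path at most once.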
858
859struct
860cvs_cluster
861{
862 time_t first_time;
863 cvs_author author;
864 cvs_changelog changelog;
865 set<cvs_tag> tags;
866
867 cvs_cluster(time_t t,
868 cvs_author a,
869 cvs_changelog c)
870 : first_time(t),
871 author(a),
872 changelog(c)
873 {}
874
875 struct entry
876 {
877 bool live;
878 cvs_version version;
879 time_t time;
880 entry(bool l, cvs_version v, time_t t)
881 : live(l),
882 version(v),
883 time(t)
884 {}
885 };
886
887 typedef map<cvs_path, entry> entry_map;
888 entry_map entries;
889};
890
891
892struct
893cluster_consumer
894{
895 cvs_history & cvs;
896 app_state & app;
897 string const & branchname;
898 cvs_branch const & branch;
899 map<cvs_path, cvs_version> live_files;
900 ticker & n_manifests;
901 ticker & n_revisions;
902
903 struct prepared_revision
904 {
905 prepared_revision(revision_id i,
906 shared_ptr<revision_set> r,
907 cvs_cluster const & c);
908 revision_id rid;
909 shared_ptr<revision_set> rev;
910 time_t time;
911 cvs_author author;
912 cvs_changelog changelog;
913 vector<cvs_tag> tags;
914 };
915
916 vector<prepared_revision> preps;
917
918 manifest_map parent_map, child_map;
919 manifest_id parent_mid, child_mid;
920 revision_id parent_rid, child_rid;
921
922 cluster_consumer(cvs_history & cvs,
923 app_state & app,
924 string const & branchname,
925 cvs_branch const & branch,
926 ticker & n_manifests,
927 ticker & n_revs);
928
929 void consume_cluster(cvs_cluster const & c,
930 bool head_p);
931 void build_change_set(cvs_cluster const & c,
932 change_set & cs);
933 void store_manifest_edge(bool head_p);
934 void store_auxiliary_certs(prepared_revision const & p);
935 void store_revisions();
936};
937
938typedef shared_ptr<cvs_cluster>
939cluster_ptr;
940
941struct
942cluster_ptr_lt
943{
944 bool operator()(cluster_ptr const & a,
945 cluster_ptr const & b) const
946 {
947 return a->first_time < b->first_time;
948 }
949};
950
951typedef set<cluster_ptr, cluster_ptr_lt>
952cluster_set;
953
954void
955import_branch(cvs_history & cvs,
956 app_state & app,
957 string const & branchname,
958 shared_ptr<cvs_branch> const & branch,
959 ticker & n_manifests,
960 ticker & n_revs)
961{
962 cluster_set clusters;
963 cluster_consumer cons(cvs, app, branchname, *branch, n_manifests, n_revs);
964 unsigned long commits_remaining = branch->lineage.size();
965
966 // step 1: sort the lineage
967 stable_sort(branch->lineage.begin(), branch->lineage.end());
968
969 for (vector<cvs_commit>::const_iterator i = branch->lineage.begin();
970 i != branch->lineage.end(); ++i)
971 {
972 commits_remaining--;
973
974 L(F("examining next commit [t:%d] [p:%s] [a:%s] [c:%s]\n")
975 % i->time
976 % cvs.path_interner.lookup(i->path)
977 % cvs.author_interner.lookup(i->author)
978 % cvs.changelog_interner.lookup(i->changelog));
979
980 // step 2: expire all clusters from the beginning of the set which
981 // have passed the window size
982 while (!clusters.empty())
983 {
984 cluster_set::const_iterator j = clusters.begin();
985 if ((*j)->first_time + constants::cvs_window < i->time)
986 {
987 L(F("expiring cluster\n"));
988 cons.consume_cluster(**j, false);
989 clusters.erase(j);
990 }
991 else
992 break;
993 }
994
995 // step 3: find the last still-live cluster to have touched this
996 // file
997 time_t time_of_last_cluster_touching_this_file = 0;
998
999 unsigned clu = 0;
1000 for (cluster_set::const_iterator j = clusters.begin();
1001 j != clusters.end(); ++j)
1002 {
1003 L(F("examining cluster %d to see if it touched %d\n")
1004 % clu++
1005 % i->path);
1006
1007 cvs_cluster::entry_map::const_iterator k = (*j)->entries.find(i->path);
1008 if ((k != (*j)->entries.end())
1009 && (k->second.time > time_of_last_cluster_touching_this_file))
1010 {
1011 L(F("found cluster touching %d: [t:%d] [a:%d] [c:%d]\n")
1012 % i->path
1013 % (*j)->first_time
1014 % (*j)->author
1015 % (*j)->changelog);
1016 time_of_last_cluster_touching_this_file = (*j)->first_time;
1017 }
1018 }
1019 L(F("last modification time is %d\n")
1020 % time_of_last_cluster_touching_this_file);
1021
1022 // step 4: find a cluster which starts on or after the
1023 // last_modify_time, which doesn't modify the file in question,
1024 // and which contains the same author and changelog as our
1025 // commit
1026 cluster_ptr target;
1027 for (cluster_set::const_iterator j = clusters.begin();
1028 j != clusters.end(); ++j)
1029 {
1030 if (((*j)->first_time >= time_of_last_cluster_touching_this_file)
1031 && ((*j)->author == i->author)
1032 && ((*j)->changelog == i->changelog)
1033 && ((*j)->entries.find(i->path) == (*j)->entries.end()))
1034 {
1035 L(F("picked existing cluster [t:%d] [a:%d] [c:%d]\n")
1036 % (*j)->first_time
1037 % (*j)->author
1038 % (*j)->changelog);
1039
1040 target = (*j);
1041 }
1042 }
1043
1044      // if we still haven't found an eligible cluster,
1045      // this commit probably starts a new one; make
1046      // a fresh cluster for it.
1047 if (!target)
1048 {
1049 L(F("building new cluster [t:%d] [a:%d] [c:%d]\n")
1050 % i->time
1051 % i->author
1052 % i->changelog);
1053
1054 target = cluster_ptr(new cvs_cluster(i->time,
1055 i->author,
1056 i->changelog));
1057 clusters.insert(target);
1058 }
1059
1060 I(target);
1061 target->entries.insert(make_pair(i->path,
1062 cvs_cluster::entry(i->alive,
1063 i->version,
1064 i->time)));
1065 for (vector<cvs_tag>::const_iterator j = i->tags.begin();
1066 j != i->tags.end(); ++j)
1067 {
1068 target->tags.insert(*j);
1069 }
1070 }
1071
1072
1073  // now we are done with this lineage; flush all remaining clusters
1074 L(F("finished branch commits, writing all pending clusters\n"));
1075 while (!clusters.empty())
1076 {
1077 cons.consume_cluster(**clusters.begin(), clusters.size() == 1);
1078 clusters.erase(clusters.begin());
1079 }
1080 L(F("finished writing pending clusters\n"));
1081
1082 cons.store_revisions();
1083
1084}
1085
1086
1087void
1088import_cvs_repo(system_path const & cvsroot,
1089 app_state & app)
1090{
1091 N(!directory_exists(cvsroot / "CVSROOT"),
1092 F("%s appears to be a CVS repository root directory\n"
1093 "try importing a module instead, with 'cvs_import %s/<module_name>")
1094 % cvsroot % cvsroot);
1095
1096 {
1097 // early short-circuit to avoid failure after lots of work
1098 rsa_keypair_id key;
1099 get_user_key(key, app);
1100 require_password(key, app);
1101 }
1102
1103 cvs_history cvs;
1104 N(app.branch_name() != "", F("need base --branch argument for importing"));
1105 cvs.base_branch = app.branch_name();
1106
1107 // push the trunk
1108 cvs.trunk = shared_ptr<cvs_branch>(new cvs_branch());
1109 cvs.stk.push(cvs.trunk);
1110 cvs.bstk.push(cvs.branch_interner.intern(cvs.base_branch));
1111
1112 {
1113 transaction_guard guard(app.db);
1114 cvs_tree_walker walker(cvs, app.db);
1115 require_path_is_directory(cvsroot,
1116 F("path %s does not exist") % cvsroot,
1117 F("'%s' is not a directory") % cvsroot);
1118 app.db.ensure_open();
1119 change_current_working_dir(cvsroot);
1120 walk_tree(file_path(), walker);
1121 guard.commit();
1122 }
1123
1124 I(cvs.stk.size() == 1);
1125
1126 ticker n_revs(_("revisions"), "r", 1);
1127 ticker n_manifests(_("manifests"), "m", 1);
1128
1129 while (cvs.branches.size() > 0)
1130 {
1131 transaction_guard guard(app.db);
1132 map<string, shared_ptr<cvs_branch> >::const_iterator i = cvs.branches.begin();
1133 string branchname = i->first;
1134 shared_ptr<cvs_branch> branch = i->second;
1135 L(F("branch %s has %d entries\n") % branchname % branch->lineage.size());
1136 import_branch(cvs, app, branchname, branch, n_manifests, n_revs);
1137
1138 // free up some memory
1139 cvs.branches.erase(branchname);
1140 guard.commit();
1141 }
1142
1143 {
1144 transaction_guard guard(app.db);
1145 L(F("trunk has %d entries\n") % cvs.trunk->lineage.size());
1146 import_branch(cvs, app, cvs.base_branch, cvs.trunk, n_manifests, n_revs);
1147 guard.commit();
1148 }
1149
1150 // now we have a "last" rev for each tag
1151 {
1152 ticker n_tags(_("tags"), "t", 1);
1153 packet_db_writer dbw(app);
1154 transaction_guard guard(app.db);
1155 for (map<unsigned long, pair<time_t, revision_id> >::const_iterator i = cvs.resolved_tags.begin();
1156 i != cvs.resolved_tags.end(); ++i)
1157 {
1158 string tag = cvs.tag_interner.lookup(i->first);
1159 ui.set_tick_trailer("marking tag " + tag);
1160 cert_revision_tag(i->second.second, tag, app, dbw);
1161 ++n_tags;
1162 }
1163 guard.commit();
1164 }
1165
1166
1167 return;
1168
1169}
1170
1171cluster_consumer::cluster_consumer(cvs_history & cvs,
1172 app_state & app,
1173 string const & branchname,
1174 cvs_branch const & branch,
1175 ticker & n_mans,
1176 ticker & n_revs)
1177 : cvs(cvs),
1178 app(app),
1179 branchname(branchname),
1180 branch(branch),
1181 n_manifests(n_mans),
1182 n_revisions(n_revs)
1183{
1184 if (!branch.live_at_beginning.empty())
1185 {
1186 cvs_author synthetic_author =
1187 cvs.author_interner.intern("cvs_import");
1188
1189 cvs_changelog synthetic_cl =
1190 cvs.changelog_interner.intern("beginning of branch "
1191 + branchname);
1192
1193 time_t synthetic_time = branch.beginning();
1194 cvs_cluster initial_cluster(synthetic_time,
1195 synthetic_author,
1196 synthetic_cl);
1197
1198 L(F("initial cluster on branch %s has %d live entries\n") %
1199 branchname % branch.live_at_beginning.size());
1200
1201 for (map<cvs_path, cvs_version>::const_iterator i = branch.live_at_beginning.begin();
1202 i != branch.live_at_beginning.end(); ++i)
1203 {
1204 cvs_cluster::entry e(true, i->second, synthetic_time);
1205 L(F("initial cluster contains %s at %s\n") %
1206 cvs.path_interner.lookup(i->first) %
1207 cvs.file_version_interner.lookup(i->second));
1208 initial_cluster.entries.insert(make_pair(i->first, e));
1209 }
1210 consume_cluster(initial_cluster, branch.lineage.empty());
1211 }
1212}
1213
1214cluster_consumer::prepared_revision::prepared_revision(revision_id i,
1215 shared_ptr<revision_set> r,
1216 cvs_cluster const & c)
1217 : rid(i),
1218 rev(r),
1219 time(c.first_time),
1220 author(c.author),
1221 changelog(c.changelog)
1222{
1223 for (set<cvs_tag>::const_iterator i = c.tags.begin();
1224 i != c.tags.end(); ++i)
1225 {
1226 tags.push_back(*i);
1227 }
1228}
1229
1230
1231void
1232cluster_consumer::store_revisions()
1233{
1234 for (vector<prepared_revision>::const_iterator i = preps.begin();
1235 i != preps.end(); ++i)
1236 {
1237 if (! app.db.revision_exists(i->rid))
1238 {
1239 data tmp;
1240 write_revision_set(*(i->rev), tmp);
1241 app.db.put_revision(i->rid, *(i->rev));
1242 store_auxiliary_certs(*i);
1243 ++n_revisions;
1244 }
1245 }
1246}
1247
1248void
1249cluster_consumer::store_manifest_edge(bool head_p)
1250{
1251 L(F("storing manifest '%s' (base %s)\n") % parent_mid % child_mid);
1252 ++n_manifests;
1253
1254 if (head_p)
1255 {
1256 L(F("storing head %s\n") % child_mid);
1257 // a branch has one very important manifest: the head. this is
1258 // the "newest" of all manifests within the branch (including
1259 // the trunk), and we store it in its entirety, before the
1260 // cluster consumer is destroyed.
1261 if (! app.db.manifest_version_exists(child_mid))
1262 {
1263 manifest_data mdat;
1264 write_manifest_map(child_map, mdat);
1265 app.db.put_manifest(child_mid, mdat);
1266 }
1267 }
1268
1269 if (null_id(parent_mid))
1270 {
1271 L(F("skipping delta to null manifest\n"));
1272 return;
1273 }
1274
1275 unsigned long older, newer;
1276
1277 older = cvs.manifest_version_interner.intern(parent_mid.inner()());
1278 newer = cvs.manifest_version_interner.intern(child_mid.inner()());
1279
1280 if (cvs.manifest_cycle_detector.edge_makes_cycle(older,newer))
1281 {
1282
1283 L(F("skipping cyclical manifest delta %s -> %s\n")
1284 % parent_mid % child_mid);
1285      // we are potentially breaking the chain one would use to
1286      // reconstruct the parent manifest, so make sure it exists in full.
1287 if (!app.db.manifest_version_exists(parent_mid))
1288 {
1289 L(F("writing full manifest %s\n") % parent_mid);
1290 manifest_data mdat;
1291 write_manifest_map(parent_map, mdat);
1292 app.db.put_manifest(parent_mid, mdat);
1293 }
1294 return;
1295 }
1296
1297 cvs.manifest_cycle_detector.put_edge(older,newer);
1298
1299 L(F("storing manifest delta %s -> %s\n")
1300 % child_mid % parent_mid);
1301
1302 // the ancestry-based 'child' is a 'new' version as far as the
1303 // storage system is concerned; that is to say that the
1304 // ancestry-based 'parent' is a temporally older tree version, which
1305 // can be constructed from the 'newer' child. so the delta should
1306 // run from child (new) -> parent (old).
1307
1308 delta del;
1309 diff(child_map, parent_map, del);
1310 rcs_put_raw_manifest_edge(parent_mid.inner(),
1311 child_mid.inner(),
1312 del, app.db);
1313}
1314
1315void
1316cluster_consumer::store_auxiliary_certs(prepared_revision const & p)
1317{
1318 packet_db_writer dbw(app);
1319
1320 for (vector<cvs_tag>::const_iterator i = p.tags.begin();
1321 i != p.tags.end(); ++i)
1322 {
1323 map<unsigned long, pair<time_t, revision_id> >::const_iterator j
1324 = cvs.resolved_tags.find(*i);
1325
1326 if (j != cvs.resolved_tags.end())
1327 {
1328 if (j->second.first < p.time)
1329 {
1330 // move the tag forwards
1331 cvs.resolved_tags.erase(*i);
1332 cvs.resolved_tags.insert(make_pair(*i, make_pair(p.time, p.rid)));
1333 }
1334 }
1335 else
1336 {
1337 cvs.resolved_tags.insert(make_pair(*i, make_pair(p.time, p.rid)));
1338 }
1339 }
1340
1341 cert_revision_in_branch(p.rid, cert_value(branchname), app, dbw);
1342 cert_revision_author(p.rid, cvs.author_interner.lookup(p.author), app, dbw);
1343 cert_revision_changelog(p.rid, cvs.changelog_interner.lookup(p.changelog), app, dbw);
1344 cert_revision_date_time(p.rid, p.time, app, dbw);
1345}
1346
1347void
1348cluster_consumer::build_change_set(cvs_cluster const & c,
1349 change_set & cs)
1350{
1351 for (cvs_cluster::entry_map::const_iterator i = c.entries.begin();
1352 i != c.entries.end(); ++i)
1353 {
1354 file_path pth = file_path_internal(cvs.path_interner.lookup(i->first));
1355 file_id fid(cvs.file_version_interner.lookup(i->second.version));
1356 if (i->second.live)
1357 {
1358 map<cvs_path, cvs_version>::const_iterator e = live_files.find(i->first);
1359 if (e == live_files.end())
1360 {
1361 L(F("adding entry state '%s' on '%s'\n") % fid % pth);
1362 cs.add_file(pth, fid);
1363 live_files[i->first] = i->second.version;
1364 }
1365 else if (e->second != i->second.version)
1366 {
1367 file_id old_fid(cvs.file_version_interner.lookup(e->second));
1368 L(F("applying state delta on '%s' : '%s' -> '%s'\n")
1369 % pth % old_fid % fid);
1370 cs.apply_delta(pth, old_fid, fid);
1371 live_files[i->first] = i->second.version;
1372 }
1373 }
1374 else
1375 {
1376 map<cvs_path, cvs_version>::const_iterator e = live_files.find(i->first);
1377 if (e != live_files.end())
1378 {
1379 L(F("deleting entry state '%s' on '%s'\n") % fid % pth);
1380 cs.delete_file(pth);
1381 live_files.erase(i->first);
1382 }
1383 }
1384 }
1385}
1386
1387void
1388cluster_consumer::consume_cluster(cvs_cluster const & c,
1389 bool head_p)
1390{
1391 // we should never have an empty cluster; it's *possible* to have
1392 // an empty changeset (say on a vendor import) but every cluster
1393 // should have been created by at least one file commit, even
1394 // if the commit made no changes. it's a logical inconsistency if
1395 // you have an empty cluster.
1396 I(!c.entries.empty());
1397
1398 L(F("BEGIN consume_cluster()\n"));
1399 shared_ptr<revision_set> rev(new revision_set());
1400 boost::shared_ptr<change_set> cs(new change_set());
1401 build_change_set(c, *cs);
1402
1403 apply_change_set(*cs, child_map);
1404 calculate_ident(child_map, child_mid);
1405
1406 rev->new_manifest = child_mid;
1407 rev->edges.insert(make_pair(parent_rid, make_pair(parent_mid, cs)));
1408 calculate_ident(*rev, child_rid);
1409
1410 store_manifest_edge(head_p);
1411
1412 preps.push_back(prepared_revision(child_rid, rev, c));
1413
1414 // now apply same change set to parent_map, making parent_map == child_map
1415 apply_change_set(*cs, parent_map);
1416 parent_mid = child_mid;
1417 parent_rid = child_rid;
1418 L(F("END consume_cluster('%s') (parent '%s')\n") % child_rid % rev->edges.begin()->first);
1419}
