monotone

monotone Mtn Source Tree

Root/src/rcs_import.cc

1// Copyright (C) 2002 Graydon Hoare <graydon@pobox.com>
2//
3// This program is made available under the GNU GPL version 2.0 or
4// greater. See the accompanying file COPYING for details.
5//
6// This program is distributed WITHOUT ANY WARRANTY; without even the
7// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
8// PURPOSE.
9
10#include "base.hh"
11#include <algorithm>
12#include <iterator>
13#include <list>
14#include <map>
15#include <set>
16#include <sstream>
17#include <stack>
18#include <stdexcept>
19#include "vector.hh"
20#include <cstring> // memset
21
22#include <boost/shared_ptr.hpp>
23#include <boost/scoped_ptr.hpp>
24#include "lexical_cast.hh"
25#include <boost/tokenizer.hpp>
26
27#include "cert.hh"
28#include "constants.hh"
29#include "cycle_detector.hh"
30#include "database.hh"
31#include "dates.hh"
32#include "file_io.hh"
33#include "interner.hh"
34#include "paths.hh"
35#include "platform-wrapped.hh"
36#include "project.hh"
37#include "rcs_file.hh"
38#include "revision.hh"
39#include "safe_map.hh"
40#include "sanity.hh"
41#include "transforms.hh"
42#include "ui.hh"
43#include "roster.hh"
44#include "xdelta.hh"
45
46using std::make_pair;
47using std::map;
48using std::multimap;
49using std::out_of_range;
50using std::pair;
51using std::search;
52using std::set;
53using std::sscanf;
54using std::stable_sort;
55using std::stack;
56using std::string;
57using std::vector;
58
59using boost::scoped_ptr;
60using boost::shared_ptr;
61using boost::lexical_cast;
62
63// cvs history recording stuff
64
// Interned-string handles: small integers produced by the
// interner<unsigned long> members of cvs_history, standing in for
// branch names, authors, changelogs, file versions, paths and tags.
typedef unsigned long cvs_branchname;
typedef unsigned long cvs_author;
typedef unsigned long cvs_changelog;
typedef unsigned long cvs_version;
typedef unsigned long cvs_path;
typedef unsigned long cvs_tag;

struct cvs_history;
73
// One commit to one file, as recovered from a single RCS delta.  All
// string data is held as interned handles into the owning cvs_history.
struct
cvs_commit
{
  cvs_commit(rcs_file const & r,
             string const & rcs_version,
             file_id const & ident,
             cvs_history & cvs);

  // true for CVS's placeholder "was initially added on branch" dead
  // revisions (see is_sbr below); such commits are not real commits
  bool is_synthetic_branch_root;
  time_t time;
  bool alive;           // false when the delta's state is "dead"
  cvs_author author;
  cvs_changelog changelog;
  cvs_version version;
  cvs_path path;
  vector<cvs_tag> tags;

  // commits order by time alone, for the stable_sort in import_branch
  bool operator<(cvs_commit const & other) const
  {
    return time < other.time;
  }
};
96
97struct
98cvs_branch
99{
100 bool has_a_branchpoint;
101 bool has_a_commit;
102 time_t last_branchpoint;
103 time_t first_commit;
104
105 map<cvs_path, cvs_version> live_at_beginning;
106 vector<cvs_commit> lineage;
107
108 cvs_branch()
109 : has_a_branchpoint(false),
110 has_a_commit(false),
111 last_branchpoint(0),
112 first_commit(0)
113 {
114 }
115
116 void note_commit(time_t now)
117 {
118 if (!has_a_commit)
119 {
120 first_commit = now;
121 }
122 else
123 {
124 if (now < first_commit)
125 first_commit = now;
126 }
127 has_a_commit = true;
128 }
129
130 void note_branchpoint(time_t now)
131 {
132 has_a_branchpoint = true;
133 if (now > last_branchpoint)
134 last_branchpoint = now;
135 }
136
137 time_t beginning() const
138 {
139 I(has_a_branchpoint || has_a_commit);
140 if (has_a_commit)
141 {
142 I(first_commit != 0);
143 return first_commit;
144 }
145 else
146 {
147 I(last_branchpoint != 0);
148 return last_branchpoint;
149 }
150 }
151
152 void append_commit(cvs_commit const & c)
153 {
154 I(c.time != 0);
155 note_commit(c.time);
156 lineage.push_back(c);
157 }
158};
159
// Whole-import state: interners for all repeated strings, the branch
// tables built from RCS symbols, and the stack of branches currently
// being populated by process_branch.
struct
cvs_history
{

  // interners map strings to small integer handles so the rest of
  // the import can store and compare them cheaply
  interner<unsigned long> branch_interner;
  interner<unsigned long> author_interner;
  interner<unsigned long> changelog_interner;
  interner<unsigned long> file_version_interner;
  interner<unsigned long> path_interner;
  interner<unsigned long> tag_interner;

  // assume admin has foo:X.Y.0.N in it, then
  // this multimap contains entries of the form
  // X.Y -> foo
  multimap<string, string> branchpoints;

  // and this map contains entries of the form
  // X.Y.N.1 -> foo
  map<string, string> branch_first_entries;

  // branch name -> branch
  map<string, shared_ptr<cvs_branch> > branches;
  shared_ptr<cvs_branch> trunk;

  // stack of branches we're injecting states into
  stack< shared_ptr<cvs_branch> > stk;
  stack< cvs_branchname > bstk;

  // tag -> time, revision
  //
  // used to resolve the *last* revision which has a given tag
  // applied; this is the revision which wins the tag.
  map<unsigned long, pair<time_t, revision_id> > resolved_tags;

  // the RCS file currently being imported, as path and interned handle
  file_path curr_file;
  cvs_path curr_file_interned;

  string base_branch;

  // progress tickers: versions and distinct branches seen so far
  ticker n_versions;
  ticker n_tree_branches;

  cvs_history();
  void set_filename(string const & file,
                    file_id const & ident);

  void index_branchpoint_symbols(rcs_file const & r);

  void push_branch(string const & branch_name, bool private_branch);
  void pop_branch();
};
211
212
213static bool
214is_sbr(shared_ptr<rcs_delta> dl,
215 shared_ptr<rcs_deltatext> dt)
216{
217
218 // CVS abuses the RCS format a bit (ha!) when storing a file which
219 // was only added on a branch: on the root of the branch there'll be
220 // a commit with dead state, empty text, and a log message
221 // containing the string "file foo was initially added on branch
222 // bar". We recognize and ignore these cases, as they do not
223 // "really" represent commits to be clustered together.
224
225 if (dl->state != "dead")
226 return false;
227
228 if (!dt->text.empty())
229 return false;
230
231 string log_bit = "was initially added on branch";
232 string::const_iterator i = search(dt->log.begin(),
233 dt->log.end(),
234 log_bit.begin(),
235 log_bit.end());
236
237 return i != dt->log.end();
238}
239
240
// Build a cvs_commit from one RCS delta: decode its timestamp, intern
// its author/changelog/path/version, and collect any symbols naming
// exactly this RCS version as tags.
cvs_commit::cvs_commit(rcs_file const & r,
                       string const & rcs_version,
                       file_id const & ident,
                       cvs_history & cvs)
{
  map<string, shared_ptr<rcs_delta> >::const_iterator delta =
    r.deltas.find(rcs_version);
  I(delta != r.deltas.end());

  map<string, shared_ptr<rcs_deltatext> >::const_iterator deltatext =
    r.deltatexts.find(rcs_version);
  I(deltatext != r.deltatexts.end());

  struct tm t;
  // We need to initialize t to all zeros, because strptime has a habit of
  // leaving bits of the data structure alone, letting garbage sneak into
  // our output.
  memset(&t, 0, sizeof(t));
  char const * dp = delta->second->date.c_str();
  L(FL("Calculating time of %s") % dp);
#ifdef HAVE_STRPTIME
  // RCS dates come with either 2-digit (%y) or 4-digit (%Y) years:
  // try the short form first, then insist on the long form
  if (strptime(dp, "%y.%m.%d.%H.%M.%S", &t) == NULL)
    I(strptime(dp, "%Y.%m.%d.%H.%M.%S", &t) != NULL);
#else
  I(sscanf(dp, "%d.%d.%d.%d.%d.%d", &(t.tm_year), &(t.tm_mon),
           &(t.tm_mday), &(t.tm_hour), &(t.tm_min), &(t.tm_sec))==6);
  t.tm_mon--;
  // Apparently some RCS files have 2 digit years, others four; tm always
  // wants a 2 (or 3) digit year (years since 1900).
  if (t.tm_year > 1900)
    t.tm_year-=1900;
#endif
  // NOTE(review): mktime interprets t in the importer's local
  // timezone, while RCS dates are nominally UTC -- timestamps may be
  // skewed by the local UTC offset; confirm whether this is intended.
  time = mktime(&t);
  L(FL("= %i") % time);

  is_synthetic_branch_root = is_sbr(delta->second,
                                    deltatext->second);

  alive = delta->second->state != "dead";
  if (is_synthetic_branch_root)
    changelog = cvs.changelog_interner.intern("synthetic branch root changelog");
  else
    changelog = cvs.changelog_interner.intern(deltatext->second->log);
  author = cvs.author_interner.intern(delta->second->author);
  path = cvs.curr_file_interned;
  version = cvs.file_version_interner.intern(ident.inner()());

  // every symbol bound to exactly this version number is a tag on it
  typedef multimap<string,string>::const_iterator ity;
  pair<ity,ity> range = r.admin.symbols.equal_range(rcs_version);
  for (ity i = range.first; i != range.second; ++i)
    {
      if (i->first == rcs_version)
        {
          L(FL("version %s -> tag %s") % rcs_version % i->second);
          tags.push_back(cvs.tag_interner.intern(i->second));
        }
    }

}
300
301
302// piece table stuff
303
304struct piece;
305
// Shared storage for all deltatexts indexed while reconstructing one
// RCS file; 'piece' objects refer into 'texts' by index rather than
// owning copies of their line data.
struct
piece_store
{
  vector< shared_ptr<rcs_deltatext> > texts;
  void index_deltatext(shared_ptr<rcs_deltatext> const & dt,
                       vector<piece> & pieces);
  void build_string(vector<piece> const & pieces,
                    string & out);
  // drop all indexed texts; invalidates every outstanding piece
  void reset() { texts.clear(); }
};
316
// FIXME: kludge, I was lazy and did not make this
// a properly scoped variable.
//
// Backing store shared by all 'piece' objects; reset() is called
// around each file's processing in import_rcs_file_with_cvs.
static piece_store global_pieces;
321
322
323struct
324piece
325{
326 piece(string::size_type p, string::size_type l, unsigned long id) :
327 pos(p), len(l), string_id(id) {}
328 string::size_type pos;
329 string::size_type len;
330 unsigned long string_id;
331 string operator*() const
332 {
333 return string(global_pieces.texts.at(string_id)->text.data() + pos, len);
334 }
335};
336
337
338void
339piece_store::build_string(vector<piece> const & pieces,
340 string & out)
341{
342 out.clear();
343 out.reserve(pieces.size() * 60);
344 for(vector<piece>::const_iterator i = pieces.begin();
345 i != pieces.end(); ++i)
346 out.append(texts.at(i->string_id)->text, i->pos, i->len);
347}
348
349void
350piece_store::index_deltatext(shared_ptr<rcs_deltatext> const & dt,
351 vector<piece> & pieces)
352{
353 pieces.clear();
354 pieces.reserve(dt->text.size() / 30);
355 texts.push_back(dt);
356 unsigned long id = texts.size() - 1;
357 string::size_type begin = 0;
358 string::size_type end = dt->text.find('\n');
359 while(end != string::npos)
360 {
361 // nb: the piece includes the '\n'
362 pieces.push_back(piece(begin, (end - begin) + 1, id));
363 begin = end + 1;
364 end = dt->text.find('\n', begin);
365 }
366 if (begin != dt->text.size())
367 {
368 // the text didn't end with '\n', so neither does the piece
369 end = dt->text.size();
370 pieces.push_back(piece(begin, end - begin, id));
371 }
372}
373
374
// Apply one RCS edit-script hunk.  On entry 'i' points at a directive
// line ("a<pos> <len>" or "d<pos> <len>") inside the delta; on return
// it has advanced past the directive and any payload lines.  'cursor'
// tracks how many source lines have been copied into dest so far.
static void
process_one_hunk(vector< piece > const & source,
                 vector< piece > & dest,
                 vector< piece >::const_iterator & i,
                 int & cursor)
{
  string directive = **i;
  assert(directive.size() > 1);
  ++i;

  try
    {
      char code;
      int pos, len;
      if (sscanf(directive.c_str(), " %c %d %d", &code, &pos, &len) != 3)
        throw oops("illformed directive '" + directive + "'");

      if (code == 'a')
        {
          // 'ax y' means "copy from source to dest until cursor == x, then
          // copy y lines from delta, leaving cursor where it is"
          while (cursor < pos)
            dest.push_back(source.at(cursor++));
          I(cursor == pos);
          while (len--)
            dest.push_back(*i++);
        }
      else if (code == 'd')
        {
          // 'dx y' means "copy from source to dest until cursor == x-1,
          // then increment cursor by y, ignoring those y lines"
          while (cursor < (pos - 1))
            dest.push_back(source.at(cursor++));
          I(cursor == pos - 1);
          cursor += len;
        }
      else
        throw oops("unknown directive '" + directive + "'");
    }
  catch (out_of_range &)
    {
      // vector::at threw: the edit script referenced lines outside
      // the source; rethrow with enough context to diagnose the file
      throw oops("out_of_range while processing " + directive
                 + " with source.size() == "
                 + lexical_cast<string>(source.size())
                 + " and cursor == "
                 + lexical_cast<string>(cursor));
    }
}
423
424static void
425construct_version(vector< piece > const & source_lines,
426 string const & dest_version,
427 vector< piece > & dest_lines,
428 rcs_file const & r)
429{
430 dest_lines.clear();
431 dest_lines.reserve(source_lines.size());
432
433 I(r.deltas.find(dest_version) != r.deltas.end());
434 shared_ptr<rcs_delta> delta = r.deltas.find(dest_version)->second;
435
436 I(r.deltatexts.find(dest_version) != r.deltatexts.end());
437 shared_ptr<rcs_deltatext> deltatext = r.deltatexts.find(dest_version)->second;
438
439 vector<piece> deltalines;
440 global_pieces.index_deltatext(deltatext, deltalines);
441
442 int cursor = 0;
443 for (vector<piece>::const_iterator i = deltalines.begin();
444 i != deltalines.end(); )
445 process_one_hunk(source_lines, dest_lines, i, cursor);
446 while (cursor < static_cast<int>(source_lines.size()))
447 dest_lines.push_back(source_lines[cursor++]);
448}
449
450// FIXME: should these be someplace else? using 'friend' to reach into the
451// DB is stupid, but it's also stupid to put raw edge insert methods on the
452// DB itself. or is it? hmm.. encapsulation vs. usage guidance..
453void
454rcs_put_raw_file_edge(database & db,
455 file_id const & old_id,
456 file_id const & new_id,
457 delta const & del)
458{
459 if (old_id == new_id)
460 {
461 L(FL("skipping identity file edge"));
462 return;
463 }
464
465 if (db.file_version_exists(old_id))
466 {
467 // we already have a way to get to this old version,
468 // no need to insert another reconstruction path
469 L(FL("existing path to %s found, skipping") % old_id);
470 }
471 else
472 {
473 I(db.file_or_manifest_base_exists(new_id, "files")
474 || db.delta_exists(new_id.inner(), "file_deltas"));
475 db.put_file_delta(old_id, new_id, file_delta(del));
476 }
477}
478
479
480static void
481insert_into_db(database & db, data const & curr_data,
482 file_id const & curr_id,
483 vector< piece > const & next_lines,
484 data & next_data,
485 file_id & next_id)
486{
487 // inserting into the DB
488 // note: curr_lines is a "new" (base) version
489 // and next_lines is an "old" (derived) version.
490 // all storage edges go from new -> old.
491 {
492 string tmp;
493 global_pieces.build_string(next_lines, tmp);
494 next_data = data(tmp, origin::internal);
495 }
496 delta del;
497 diff(curr_data, next_data, del);
498 calculate_ident(file_data(next_data), next_id);
499 rcs_put_raw_file_edge(db, next_id, curr_id, del);
500}
501
502
503
504/*
505
506please read this exhaustingly long comment and understand it
507before mucking with the branch inference logic.
508
509we are processing a file version. a branch might begin here. if
510the current version is X.Y, then there is a branch B starting
511here iff there is a symbol in the admin section called X.Y.0.Z,
512where Z is the branch number (or if there is a private branch
513called X.Y.Z, which is either an import branch or some private
514RCS cruft).
515
516the version X.Y is then considered the branchpoint of B in the
517current file. this does *not* mean that the CVS key -- an
518abstraction representing whole-tree operations -- of X.Y is the
519branchpoint across the CVS archive we're processing.
520
521in fact, CVS does not record the occurrence of a branching
522action (tag -b). we have no idea who executed that command and
523when. what we know instead is the commit X.Y immediately
// preceding the branch -- CVS considers this the branchpoint --
525in this file's reduced view of history. we also know the first
526commit X.Y.Z.1 inside the branch (which might not exist).
527
528our old strategy was to consider all branches nested in a
529hierarchy, which was a super-tree of all the branch trees in all
530the CVS files in a repository. this involved considering X.Y as
// the parent version of branch X.Y.Z, and selecting "the"
532branchpoint connecting the two as the least CVS key X.Y.Z.1
533committed inside the branch B.
534
535this was a mistake, for two significant reasons.
536
537first, some files do not *have* any commit inside the branch B,
538only a branchpoint X.Y.0.Z. this branchpoint is actually the
539last commit *before* the user branched, and could be a very old
540commit, long before the branch was formed, so it is useless in
541determining the branch structure.
542
543second, some files do not have a branch B, or worse, have
544branched into B from an "ancestor" branch A, where a different
545file branches into B from a different ancestor branch C. in
546other words, while there *is* a tree structure within the X.Y.Z
547branches of each file, there is *no* shared tree structure
548between the branch names across a repository. in one file A can
549be an ancestor of B, in another file B can be an ancestor of A.
550
551thus, we give up on establishing a hierarchy between branches
552altogether. all branches exist in a flat namespace, and all are
// direct descendants of the empty revision at the root of
554history. each branchpoint symbol mentioned in the
555administrative section of a file is considered the root of a new
556lineage.
557
558*/
559
560
// Walk one RCS delta chain ('next' links) starting at begin_version:
// append each real commit to the branch on top of cvs.stk,
// reconstruct each successive version's text, store file deltas, and
// recurse into every branch rooted at each version along the way.
static void
process_branch(database & db,
               string const & begin_version,
               vector< piece > const & begin_lines,
               data const & begin_data,
               file_id const & begin_id,
               rcs_file const & r,
               cvs_history & cvs)
{
  string curr_version = begin_version;
  scoped_ptr< vector< piece > > next_lines(new vector<piece>);
  scoped_ptr< vector< piece > > curr_lines(new vector<piece>
                                           (begin_lines.begin(),
                                            begin_lines.end()));
  data curr_data(begin_data), next_data;
  file_id curr_id(begin_id), next_id;

  while(! (r.deltas.find(curr_version) == r.deltas.end()))
    {
      L(FL("version %s has %d lines") % curr_version % curr_lines->size());

      // synthetic branch roots (see is_sbr) are bookkeeping entries,
      // not real commits, and are kept out of the lineage
      cvs_commit curr_commit(r, curr_version, file_id(curr_id), cvs);
      if (!curr_commit.is_synthetic_branch_root)
        {
          cvs.stk.top()->append_commit(curr_commit);
          ++cvs.n_versions;
        }

      string next_version = r.deltas.find(curr_version)->second->next;

      if (! next_version.empty())
        {
          L(FL("following RCS edge %s -> %s") % curr_version % next_version);

          construct_version(*curr_lines, next_version, *next_lines, r);
          L(FL("constructed RCS version %s, inserting into database") %
            next_version);

          insert_into_db(db, curr_data, curr_id,
                         *next_lines, next_data, next_id);
        }

      // mark the beginning-of-branch time and state of this file if
      // we're at a branchpoint
      typedef multimap<string,string>::const_iterator ity;
      pair<ity,ity> range = cvs.branchpoints.equal_range(curr_version);
      if (range.first != cvs.branchpoints.end()
          && range.first->first == curr_version)
        {
          for (ity i = range.first; i != range.second; ++i)
            {
              cvs.push_branch(i->second, false);
              shared_ptr<cvs_branch> b = cvs.stk.top();
              if (curr_commit.alive)
                b->live_at_beginning[cvs.curr_file_interned] = curr_commit.version;
              b->note_branchpoint(curr_commit.time);
              cvs.pop_branch();
            }
        }


      // recursively follow any branch commits coming from the branchpoint
      shared_ptr<rcs_delta> curr_delta = r.deltas.find(curr_version)->second;
      for(vector<string>::const_iterator i = curr_delta->branches.begin();
          i != curr_delta->branches.end(); ++i)
        {
          string branch;
          data branch_data;
          file_id branch_id;
          vector< piece > branch_lines;
          bool priv = false;
          map<string, string>::const_iterator be = cvs.branch_first_entries.find(*i);

          // a branch first-entry with no named symbol is treated as a
          // private branch (pushed under an empty name)
          if (be != cvs.branch_first_entries.end())
            branch = be->second;
          else
            priv = true;

          L(FL("following RCS branch %s = '%s'") % (*i) % branch);

          construct_version(*curr_lines, *i, branch_lines, r);
          insert_into_db(db, curr_data, curr_id,
                         branch_lines, branch_data, branch_id);

          cvs.push_branch(branch, priv);
          process_branch(db, *i, branch_lines, branch_data, branch_id, r, cvs);
          cvs.pop_branch();

          L(FL("finished RCS branch %s = '%s'") % (*i) % branch);
        }

      if (!r.deltas.find(curr_version)->second->next.empty())
        {
          // advance along the delta chain: the "next" version becomes
          // current, and the line buffers are swapped rather than copied
          curr_data = next_data;
          curr_id = next_id;
          curr_version = next_version;
          swap(next_lines, curr_lines);
          next_lines->clear();
        }
      else break;
    }
}
664
665
666static void
667import_rcs_file_with_cvs(database & db, string const & filename,
668 cvs_history & cvs)
669{
670 rcs_file r;
671 L(FL("parsing RCS file %s") % filename);
672 parse_rcs_file(filename, r);
673 L(FL("parsed RCS file %s OK") % filename);
674
675 {
676 vector< piece > head_lines;
677 I(r.deltatexts.find(r.admin.head) != r.deltatexts.end());
678 I(r.deltas.find(r.admin.head) != r.deltas.end());
679
680 file_id fid;
681 file_data dat(r.deltatexts.find(r.admin.head)->second->text,
682 origin::user);
683 calculate_ident(dat, fid);
684
685 cvs.set_filename(filename, fid);
686 cvs.index_branchpoint_symbols (r);
687 db.put_file(fid, dat);
688
689 {
690 // create the head state in case it is a loner
691 // cvs_key k;
692 // shared_ptr<cvs_state> s;
693 // L(FL("noting head version %s : %s") % cvs.curr_file % r.admin.head);
694 // cvs.find_key_and_state (r, r.admin.head, k, s);
695 }
696
697 global_pieces.reset();
698 global_pieces.index_deltatext(r.deltatexts.find(r.admin.head)->second, head_lines);
699 process_branch(db, r.admin.head, head_lines, dat.inner(), fid, r, cvs);
700 global_pieces.reset();
701 }
702
703 ui.set_tick_trailer("");
704}
705
706void
707test_parse_rcs_file(system_path const & filename)
708{
709 cvs_history cvs;
710
711 I(! filename.empty());
712 assert_path_is_file(filename);
713
714 P(F("parsing RCS file '%s'") % filename);
715 rcs_file r;
716 parse_rcs_file(filename.as_external(), r);
717 P(F("parsed RCS file '%s' OK") % filename);
718}
719
720
721// CVS importing stuff follows
722
723
// Split a dotted RCS version string ("1.3.0.2") into its components.
// Empty components (from leading, trailing or doubled dots) are
// dropped, matching the behaviour of the boost::char_separator
// tokenizer this replaces; a plain stdlib scan avoids pulling in
// boost.tokenizer for a trivial split.
static void
split_version(std::string const & v, std::vector<std::string> & vs)
{
  vs.clear();
  std::string::size_type begin = 0;
  while (begin < v.size())
    {
      std::string::size_type dot = v.find('.', begin);
      if (dot == std::string::npos)
        dot = v.size();
      if (dot > begin)
        vs.push_back(v.substr(begin, dot - begin));
      begin = dot + 1;
    }
}
733
734static void
735join_version(vector<string> const & vs, string & v)
736{
737 v.clear();
738 for (vector<string>::const_iterator i = vs.begin();
739 i != vs.end(); ++i)
740 {
741 if (i != vs.begin())
742 v += ".";
743 v += *i;
744 }
745}
746
// Tickers give the user progress feedback during import: one counts
// file versions processed, the other distinct branches seen.
cvs_history::cvs_history() :
  n_versions("versions", "v", 1),
  n_tree_branches("branches", "b", 1)
{
}
752
753void
754cvs_history::set_filename(string const & file,
755 file_id const & ident)
756{
757 L(FL("importing file '%s'") % file);
758 I(file.size() > 2);
759 I(file.substr(file.size() - 2) == string(",v"));
760 string ss = file;
761 ui.set_tick_trailer(ss);
762 ss.resize(ss.size() - 2);
763 // remove Attic/ if present
764 string::size_type last_slash=ss.rfind('/');
765 if (last_slash!=string::npos && last_slash>=5
766 && ss.substr(last_slash-5,6)=="Attic/")
767 ss.erase(last_slash-5,6);
768 curr_file = file_path_internal(ss);
769 curr_file_interned = path_interner.intern(ss);
770}
771
// Scan the RCS admin symbol table and record, for each branch symbol,
// both the version the branch forks from (branchpoints: X.Y -> name)
// and the version that would be its first commit
// (branch_first_entries: X.Y.N.1 -> name).
void cvs_history::index_branchpoint_symbols(rcs_file const & r)
{
  branchpoints.clear();
  branch_first_entries.clear();

  for (multimap<string, string>::const_iterator i =
         r.admin.symbols.begin(); i != r.admin.symbols.end(); ++i)
    {
      string const & num = i->first;
      string const & sym = i->second;

      vector<string> components;
      split_version(num, components);

      vector<string> first_entry_components;
      vector<string> branchpoint_components;

      if (components.size() > 2 &&
          (components.size() % 2 == 1))
        {
          // this is a "vendor" branch
          //
          // such as "1.1.1", where "1.1" is the branchpoint and
          // "1.1.1.1" will be the first commit on it.

          first_entry_components = components;
          first_entry_components.push_back("1");

          branchpoint_components = components;
          branchpoint_components.erase(branchpoint_components.end() - 1,
                                       branchpoint_components.end());

        }

      else if (components.size() > 2 &&
               (components.size() % 2 == 0) &&
               components[components.size() - 2] == string("0"))
        {
          // this is a "normal" branch
          //
          // such as "1.3.0.2", where "1.3" is the branchpoint and
          // "1.3.2.1" will be the first commit on it

          first_entry_components = components;
          first_entry_components[first_entry_components.size() - 2]
            = first_entry_components[first_entry_components.size() - 1];
          first_entry_components[first_entry_components.size() - 1]
            = string("1");

          branchpoint_components = components;
          branchpoint_components.erase(branchpoint_components.end() - 2,
                                       branchpoint_components.end());
        }

      // NOTE(review): for symbols that are plain tags (neither case
      // above matches), both component vectors stay empty, so an
      // empty-string key is inserted into both maps below.  Lookups
      // in this file use only non-empty version strings, so these
      // entries appear inert -- confirm before iterating the maps.

      string first_entry_version;
      join_version(first_entry_components, first_entry_version);

      L(FL("first version in branch %s would be %s")
        % sym % first_entry_version);
      branch_first_entries.insert(make_pair(first_entry_version, sym));

      string branchpoint_version;
      join_version(branchpoint_components, branchpoint_version);

      L(FL("file branchpoint for %s at %s") % sym % branchpoint_version);
      branchpoints.insert(make_pair(branchpoint_version, sym));
    }
}
840
841
842
843void
844cvs_history::push_branch(string const & branch_name, bool private_branch)
845{
846 shared_ptr<cvs_branch> branch;
847
848 string bname = base_branch + "." + branch_name;
849 I(!stk.empty());
850
851 if (private_branch)
852 {
853 branch = shared_ptr<cvs_branch>(new cvs_branch());
854 stk.push(branch);
855 bstk.push(branch_interner.intern(""));
856 return;
857 }
858 else
859 {
860 map<string, shared_ptr<cvs_branch> >::const_iterator b = branches.find(bname);
861 if (b == branches.end())
862 {
863 branch = shared_ptr<cvs_branch>(new cvs_branch());
864 branches.insert(make_pair(bname, branch));
865 ++n_tree_branches;
866 }
867 else
868 branch = b->second;
869
870 stk.push(branch);
871 bstk.push(branch_interner.intern(bname));
872 }
873}
874
875void
876cvs_history::pop_branch()
877{
878 I(stk.size() > 1);
879 stk.pop();
880 bstk.pop();
881}
882
883
884class
885cvs_tree_walker
886 : public tree_walker
887{
888 cvs_history & cvs;
889 database & db;
890public:
891 cvs_tree_walker(cvs_history & c, database & d) :
892 cvs(c), db(d)
893 {
894 }
895 virtual void visit_file(file_path const & path)
896 {
897 string file = path.as_external();
898 if (file.substr(file.size() - 2) == string(",v"))
899 {
900 try
901 {
902 import_rcs_file_with_cvs(db, file, cvs);
903 }
904 catch (oops const & o)
905 {
906 W(F("error reading RCS file '%s': %s") % file % o.what());
907 }
908 }
909 else
910 L(FL("skipping non-RCS file '%s'") % file);
911 }
912 virtual ~cvs_tree_walker() {}
913};
914
915
916
917
918//
919// our task here is to produce a sequence of revision descriptions
920// from the per-file commit records we have. we do this by rolling
921// forwards through the temporally sorted file-commit list
922// accumulating file-commits into revisions and flushing the
923// revisions when we feel they are "complete".
924//
925// revisions have to have a time associated with them. this time
926// will be the first time of any commit associated with the
927// revision. they have an author and a changelog, which is shared
928// by all the file-commits in the revision.
929//
930// there might be multiple revisions overlapping in time. this is
931// legal wrt. CVS. we keep a set, and search all members of the set
932// for the best match.
933//
934// consider this situation of overlapping revisions:
935//
936// +---------------+ +---------------+ +---------------+
937// | rev #1 @ 0011 | | rev #2 @ 0012 | | rev #3 @ 0013 |
938// |~~~~~~~~~~~~~~~| |~~~~~~~~~~~~~~~| |~~~~~~~~~~~~~~~|
939// | patch foo.txt | | patch bar.txt | | patch baz.txt |
940// +---------------+ +---------------+ +---------------+
941//
942// suppose you have this situation and you run across a "patch
943// bar.txt" commit at timestamp 0014. what do you do?
944//
945// - you know that rev #2 cannot accept this commit, simply because
946// two commits on the same file makes *two* revisions, not one.
947//
948// - perhaps rev #3 could accept it; after all, it could be that the
949// commit associated with rev #2 released its commit lock, and the
950// commit associated with rev #3 quickly updated and committed at
951// 0013, finishing off at 0014.
952//
// - can rev #1 accept it? no. because CVS calculated the version it
954// expected to see in bar.txt before calling up the server, when
955// committing rev #1. the version it expected to see was the version
// in bar.txt *before* time 0012; that is, before rev #2 had any effect
957// on bar.txt. when it contacted the server, the commit associated
958// with rev #1 would have aborted if it had seen any other number.
959// so rev #1 could not start before an edit to bar.txt and then
960// include its own edit to bar.txt.
961//
962// so we have only one case where bar.txt can be accepted. if the
963// commit is not accepted into a legal rev (outside the window,
964// wrong changelog/author) it starts a new revision.
965//
966// as we scan forwards, if we hit timestamps which lie beyond rev #n's
967// window, we flush rev #n.
968//
969// if there are multiple coincident and legal revs to direct a
970// commit to (all with the same author/changelog), we direct the
971// commit to the rev with the closest initial timestamp. that is,
972// the *latest* beginning time.
973
// A cluster is a candidate revision under construction: per-file
// entries sharing one author and changelog, begun at first_time.
// Clusters are flushed into real revisions once the commit window
// has passed (see import_branch below).
struct
cvs_cluster
{
  time_t first_time;
  cvs_author author;
  cvs_changelog changelog;
  set<cvs_tag> tags;

  cvs_cluster(time_t t,
              cvs_author a,
              cvs_changelog c)
    : first_time(t),
      author(a),
      changelog(c)
  {}

  // one file's contribution: whether the file is live (not dead),
  // which version it is at, and when it was touched
  struct entry
  {
    bool live;
    cvs_version version;
    time_t time;
    entry(bool l, cvs_version v, time_t t)
      : live(l),
        version(v),
        time(t)
    {}
  };

  typedef map<cvs_path, entry> entry_map;
  entry_map entries;
};
1005
1006
// Consumes finished clusters for one branch, turning each into a
// monotone revision: builds csets against a running roster, queues
// prepared revisions in 'preps', and later writes them (with their
// certs) via store_revisions / store_auxiliary_certs.
struct
cluster_consumer
{
  cvs_history & cvs;
  key_store & keys;
  project_t & project;

  string const & branchname;
  cvs_branch const & branch;
  set<file_path> created_dirs;
  map<cvs_path, cvs_version> live_files;
  ticker & n_revisions;

  // a revision assembled but not yet written to the database,
  // together with the metadata needed for its certs
  struct prepared_revision
  {
    prepared_revision(revision_id i,
                      shared_ptr<revision_t> r,
                      cvs_cluster const & c);
    revision_id rid;
    shared_ptr<revision_t> rev;
    time_t time;
    cvs_author author;
    cvs_changelog changelog;
    vector<cvs_tag> tags;
  };

  vector<prepared_revision> preps;

  // current roster state and the parent -> child edge being built
  roster_t ros;
  temp_node_id_source nis;
  editable_roster_base editable_ros;
  revision_id parent_rid, child_rid;

  cluster_consumer(project_t & project,
                   key_store & keys,
                   cvs_history & cvs,
                   string const & branchname,
                   cvs_branch const & branch,
                   ticker & n_revs);

  void consume_cluster(cvs_cluster const & c);
  void add_missing_parents(file_path const & sp, cset & cs);
  void build_cset(cvs_cluster const & c, cset & cs);
  void store_auxiliary_certs(prepared_revision const & p);
  void store_revisions();
};
1053
typedef shared_ptr<cvs_cluster>
cluster_ptr;

// Orders clusters by starting time for use as the cluster_set
// comparator.
//
// NOTE(review): two clusters with identical first_time compare
// equivalent under this ordering, so std::set would treat the second
// as a duplicate and refuse to insert it -- confirm that coincident
// start times cannot occur or are handled by the caller.
struct
cluster_ptr_lt
{
  bool operator()(cluster_ptr const & a,
                  cluster_ptr const & b) const
  {
    return a->first_time < b->first_time;
  }
};

typedef set<cluster_ptr, cluster_ptr_lt>
cluster_set;
1069
1070void
1071import_branch(project_t & project,
1072 key_store & keys,
1073 cvs_history & cvs,
1074 string const & branchname,
1075 shared_ptr<cvs_branch> const & branch,
1076 ticker & n_revs)
1077{
1078 cluster_set clusters;
1079 cluster_consumer cons(project, keys, cvs, branchname, *branch, n_revs);
1080 unsigned long commits_remaining = branch->lineage.size();
1081
1082 // step 1: sort the lineage
1083 stable_sort(branch->lineage.begin(), branch->lineage.end());
1084
1085 for (vector<cvs_commit>::const_iterator i = branch->lineage.begin();
1086 i != branch->lineage.end(); ++i)
1087 {
1088 commits_remaining--;
1089
1090 L(FL("examining next commit [t:%d] [p:%s] [a:%s] [c:%s]")
1091 % i->time
1092 % cvs.path_interner.lookup(i->path)
1093 % cvs.author_interner.lookup(i->author)
1094 % cvs.changelog_interner.lookup(i->changelog));
1095
1096 // step 2: expire all clusters from the beginning of the set which
1097 // have passed the window size
1098 while (!clusters.empty())
1099 {
1100 cluster_set::iterator j = clusters.begin();
1101 if ((*j)->first_time + constants::cvs_window < i->time)
1102 {
1103 L(FL("expiring cluster"));
1104 cons.consume_cluster(**j);
1105 clusters.erase(j);
1106 }
1107 else
1108 break;
1109 }
1110
1111 // step 3: find the last still-live cluster to have touched this
1112 // file
1113 time_t time_of_last_cluster_touching_this_file = 0;
1114
1115 unsigned clu = 0;
1116 for (cluster_set::const_iterator j = clusters.begin();
1117 j != clusters.end(); ++j)
1118 {
1119 L(FL("examining cluster %d to see if it touched %d")
1120 % clu++
1121 % i->path);
1122
1123 cvs_cluster::entry_map::const_iterator k = (*j)->entries.find(i->path);
1124 if ((k != (*j)->entries.end())
1125 && (k->second.time > time_of_last_cluster_touching_this_file))
1126 {
1127 L(FL("found cluster touching %d: [t:%d] [a:%d] [c:%d]")
1128 % i->path
1129 % (*j)->first_time
1130 % (*j)->author
1131 % (*j)->changelog);
1132 time_of_last_cluster_touching_this_file = (*j)->first_time;
1133 }
1134 }
1135 L(FL("last modification time is %d")
1136 % time_of_last_cluster_touching_this_file);
1137
1138 // step 4: find a cluster which starts on or after the
1139 // last_modify_time, which doesn't modify the file in question,
1140 // and which contains the same author and changelog as our
1141 // commit
1142 cluster_ptr target;
1143 for (cluster_set::const_iterator j = clusters.begin();
1144 j != clusters.end(); ++j)
1145 {
1146 if (((*j)->first_time >= time_of_last_cluster_touching_this_file)
1147 && ((*j)->author == i->author)
1148 && ((*j)->changelog == i->changelog)
1149 && ((*j)->entries.find(i->path) == (*j)->entries.end()))
1150 {
1151 L(FL("picked existing cluster [t:%d] [a:%d] [c:%d]")
1152 % (*j)->first_time
1153 % (*j)->author
1154 % (*j)->changelog);
1155
1156 target = (*j);
1157 }
1158 }
1159
1160 // if we're still not finding an active cluster,
1161 // this is probably the first commit in it. make
1162 // a new one.
1163 if (!target)
1164 {
1165 L(FL("building new cluster [t:%d] [a:%d] [c:%d]")
1166 % i->time
1167 % i->author
1168 % i->changelog);
1169
1170 target = cluster_ptr(new cvs_cluster(i->time,
1171 i->author,
1172 i->changelog));
1173 clusters.insert(target);
1174 }
1175
1176 I(target);
1177 target->entries.insert(make_pair(i->path,
1178 cvs_cluster::entry(i->alive,
1179 i->version,
1180 i->time)));
1181 for (vector<cvs_tag>::const_iterator j = i->tags.begin();
1182 j != i->tags.end(); ++j)
1183 {
1184 target->tags.insert(*j);
1185 }
1186 }
1187
1188
1189 // now we are done this lineage; flush all remaining clusters
1190 L(FL("finished branch commits, writing all pending clusters"));
1191 while (!clusters.empty())
1192 {
1193 cons.consume_cluster(**clusters.begin());
1194 clusters.erase(clusters.begin());
1195 }
1196 L(FL("finished writing pending clusters"));
1197
1198 cons.store_revisions();
1199
1200}
1201
1202void
1203import_cvs_repo(project_t & project,
1204 key_store & keys,
1205 system_path const & cvsroot,
1206 branch_name const & branchname)
1207
1208{
1209 E(!directory_exists(cvsroot / "CVSROOT"), origin::user,
1210 F("'%s' appears to be a CVS repository root directory\n"
1211 "try importing a module instead, with 'cvs_import %s/<module_name>'")
1212 % cvsroot % cvsroot);
1213
1214 cvs_history cvs;
1215
1216 cvs.base_branch = branchname();
1217
1218 // push the trunk
1219 cvs.trunk = shared_ptr<cvs_branch>(new cvs_branch());
1220 cvs.stk.push(cvs.trunk);
1221 cvs.bstk.push(cvs.branch_interner.intern(cvs.base_branch));
1222
1223 {
1224 transaction_guard guard(project.db);
1225 cvs_tree_walker walker(cvs, project.db);
1226 project.db.ensure_open();
1227 change_current_working_dir(cvsroot);
1228 walk_tree(file_path(), walker);
1229 guard.commit();
1230 }
1231
1232 I(cvs.stk.size() == 1);
1233
1234 ticker n_revs(_("revisions"), "r", 1);
1235
1236 while (!cvs.branches.empty())
1237 {
1238 transaction_guard guard(project.db);
1239 map<string, shared_ptr<cvs_branch> >::const_iterator i = cvs.branches.begin();
1240 string branchname = i->first;
1241 shared_ptr<cvs_branch> branch = i->second;
1242 L(FL("branch %s has %d entries") % branchname % branch->lineage.size());
1243 import_branch(project, keys, cvs, branchname, branch, n_revs);
1244
1245 // free up some memory
1246 cvs.branches.erase(branchname);
1247 guard.commit();
1248 }
1249
1250 {
1251 transaction_guard guard(project.db);
1252 L(FL("trunk has %d entries") % cvs.trunk->lineage.size());
1253 import_branch(project, keys, cvs, cvs.base_branch, cvs.trunk, n_revs);
1254 guard.commit();
1255 }
1256
1257 // now we have a "last" rev for each tag
1258 {
1259 ticker n_tags(_("tags"), "t", 1);
1260 transaction_guard guard(project.db);
1261 for (map<unsigned long, pair<time_t, revision_id> >::const_iterator i = cvs.resolved_tags.begin();
1262 i != cvs.resolved_tags.end(); ++i)
1263 {
1264 string tag = cvs.tag_interner.lookup(i->first);
1265 ui.set_tick_trailer("marking tag " + tag);
1266 project.put_tag(keys, i->second.second, tag);
1267 ++n_tags;
1268 }
1269 guard.commit();
1270 }
1271}
1272
// Construct a consumer for one branch.  If files were already alive
// when this branch began, synthesize an initial cluster that adds all
// of them, attributed to a fake "cvs_import" author with a descriptive
// changelog -- this becomes the branch's synthetic root revision.
cluster_consumer::cluster_consumer(project_t & project,
                                   key_store & keys,
                                   cvs_history & cvs,
                                   string const & branchname,
                                   cvs_branch const & branch,
                                   ticker & n_revs)
  : cvs(cvs),
    keys(keys),
    project(project),
    branchname(branchname),
    branch(branch),
    n_revisions(n_revs),
    editable_ros(ros, nis)
{
  if (!branch.live_at_beginning.empty())
    {
      cvs_author synthetic_author =
        cvs.author_interner.intern("cvs_import");

      cvs_changelog synthetic_cl =
        cvs.changelog_interner.intern("beginning of branch "
                                      + branchname);

      // Use the branch's own start time so the synthetic root sorts
      // before all real commits on the branch.
      time_t synthetic_time = branch.beginning();
      cvs_cluster initial_cluster(synthetic_time,
                                  synthetic_author,
                                  synthetic_cl);

      L(FL("initial cluster on branch %s has %d live entries") %
        branchname % branch.live_at_beginning.size());

      // Mark every pre-existing file as live at its recorded version.
      for (map<cvs_path, cvs_version>::const_iterator i = branch.live_at_beginning.begin();
           i != branch.live_at_beginning.end(); ++i)
        {
          cvs_cluster::entry e(true, i->second, synthetic_time);
          L(FL("initial cluster contains %s at %s") %
            cvs.path_interner.lookup(i->first) %
            cvs.file_version_interner.lookup(i->second));
          initial_cluster.entries.insert(make_pair(i->first, e));
        }
      // Turn the synthetic cluster into the first prepared revision.
      consume_cluster(initial_cluster);
    }
}
1316
1317cluster_consumer::prepared_revision::prepared_revision(revision_id i,
1318 shared_ptr<revision_t> r,
1319 cvs_cluster const & c)
1320 : rid(i),
1321 rev(r),
1322 time(c.first_time),
1323 author(c.author),
1324 changelog(c.changelog)
1325{
1326 for (set<cvs_tag>::const_iterator i = c.tags.begin();
1327 i != c.tags.end(); ++i)
1328 {
1329 tags.push_back(*i);
1330 }
1331}
1332
1333
1334void
1335cluster_consumer::store_revisions()
1336{
1337 for (vector<prepared_revision>::const_iterator i = preps.begin();
1338 i != preps.end(); ++i)
1339 if (project.db.put_revision(i->rid, *(i->rev)))
1340 {
1341 store_auxiliary_certs(*i);
1342 ++n_revisions;
1343 }
1344}
1345
1346void
1347cluster_consumer::store_auxiliary_certs(prepared_revision const & p)
1348{
1349 for (vector<cvs_tag>::const_iterator i = p.tags.begin();
1350 i != p.tags.end(); ++i)
1351 {
1352 map<unsigned long, pair<time_t, revision_id> >::const_iterator j
1353 = cvs.resolved_tags.find(*i);
1354
1355 if (j != cvs.resolved_tags.end())
1356 {
1357 if (j->second.first < p.time)
1358 {
1359 // move the tag forwards
1360 cvs.resolved_tags.erase(*i);
1361 cvs.resolved_tags.insert(make_pair(*i, make_pair(p.time, p.rid)));
1362 }
1363 }
1364 else
1365 {
1366 cvs.resolved_tags.insert(make_pair(*i, make_pair(p.time, p.rid)));
1367 }
1368 }
1369
1370 project.put_standard_certs(keys, p.rid,
1371 branch_name(branchname, origin::user),
1372 utf8(cvs.changelog_interner.lookup(p.changelog),
1373 origin::internal),
1374 date_t(s64(p.time) * 1000),
1375 cvs.author_interner.lookup(p.author));
1376}
1377
1378void
1379cluster_consumer::add_missing_parents(file_path const & path, cset & cs)
1380{
1381 if (created_dirs.find(path) != created_dirs.end())
1382 return;
1383
1384 if (!path.empty())
1385 add_missing_parents(path.dirname(), cs);
1386
1387 safe_insert(created_dirs, path);
1388 safe_insert(cs.dirs_added, path);
1389}
1390
// Translate one cluster into a changeset against the file states
// tracked in live_files: entries not yet live become additions, live
// entries at a new version become deltas, and dead entries become
// deletions.  live_files is updated to reflect the result.
void
cluster_consumer::build_cset(cvs_cluster const & c,
                             cset & cs)
{
  for (cvs_cluster::entry_map::const_iterator i = c.entries.begin();
       i != c.entries.end(); ++i)
    {
      file_path pth = file_path_internal(cvs.path_interner.lookup(i->first));

      file_id fid(cvs.file_version_interner.lookup(i->second.version),
                  origin::internal);
      if (i->second.live)
        {
          map<cvs_path, cvs_version>::const_iterator e = live_files.find(i->first);
          if (e == live_files.end())
            {
              // Not currently live: this is a file addition; make sure
              // all parent directories exist first.
              add_missing_parents(pth.dirname(), cs);
              L(FL("adding entry state '%s' on '%s'")
                % fid % pth);
              safe_insert(cs.files_added, make_pair(pth, fid));
              live_files[i->first] = i->second.version;
            }
          else if (e->second != i->second.version)
            {
              // Live at a different version: record a content delta.
              file_id old_fid(cvs.file_version_interner.lookup(e->second),
                              origin::internal);
              L(FL("applying state delta on '%s' : '%s' -> '%s'")
                % pth
                % old_fid
                % fid);
              safe_insert(cs.deltas_applied,
                          make_pair(pth, make_pair(old_fid, fid)));
              live_files[i->first] = i->second.version;
            }
          // NB: live at the same version means there is nothing to record.
        }
      else
        {
          // Dead entry: delete it only if we currently consider the
          // file live; a dead entry we never saw alive is a no-op.
          map<cvs_path, cvs_version>::const_iterator e = live_files.find(i->first);
          if (e != live_files.end())
            {
              L(FL("deleting entry state '%s' on '%s'")
                % fid % pth);
              safe_insert(cs.nodes_deleted, pth);
              live_files.erase(i->first);
            }
        }
    }
}
1439
1440void
1441cluster_consumer::consume_cluster(cvs_cluster const & c)
1442{
1443 // we should never have an empty cluster; it's *possible* to have
1444 // an empty changeset (say on a vendor import) but every cluster
1445 // should have been created by at least one file commit, even
1446 // if the commit made no changes. it's a logical inconsistency if
1447 // you have an empty cluster.
1448 I(!c.entries.empty());
1449
1450 shared_ptr<revision_t> rev(new revision_t());
1451 shared_ptr<cset> cs(new cset());
1452 build_cset(c, *cs);
1453
1454 cs->apply_to(editable_ros);
1455 manifest_id child_mid;
1456 calculate_ident(ros, child_mid);
1457 rev->made_for = made_for_database;
1458 rev->new_manifest = child_mid;
1459 rev->edges.insert(make_pair(parent_rid, cs));
1460 calculate_ident(*rev, child_rid);
1461
1462 preps.push_back(prepared_revision(child_rid, rev, c));
1463
1464 parent_rid = child_rid;
1465}
1466
1467// Local Variables:
1468// mode: C++
1469// fill-column: 76
1470// c-file-style: "gnu"
1471// indent-tabs-mode: nil
1472// End:
1473// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status