monotone

monotone Mtn Source Tree

Root/rcs_import.cc

1// Copyright (C) 2002 Graydon Hoare <graydon@pobox.com>
2//
3// This program is made available under the GNU GPL version 2.0 or
4// greater. See the accompanying file COPYING for details.
5//
6// This program is distributed WITHOUT ANY WARRANTY; without even the
7// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
8// PURPOSE.
9
10#include "base.hh"
11#include <algorithm>
12#include <iterator>
13#include <list>
14#include <map>
15#include <set>
16#include <sstream>
17#include <stack>
18#include <stdexcept>
19#include "vector.hh"
20#include <cstring> // memset
21
22#include <boost/shared_ptr.hpp>
23#include <boost/scoped_ptr.hpp>
24#include "lexical_cast.hh"
25#include <boost/tokenizer.hpp>
26
27#include "cert.hh"
28#include "constants.hh"
29#include "cycle_detector.hh"
30#include "database.hh"
31#include "file_io.hh"
32#include "interner.hh"
33#include "paths.hh"
34#include "platform-wrapped.hh"
35#include "project.hh"
36#include "rcs_file.hh"
37#include "revision.hh"
38#include "safe_map.hh"
39#include "sanity.hh"
40#include "transforms.hh"
41#include "ui.hh"
42#include "roster.hh"
43#include "xdelta.hh"
44
45using std::make_pair;
46using std::map;
47using std::multimap;
48using std::out_of_range;
49using std::pair;
50using std::search;
51using std::set;
52using std::sscanf;
53using std::stable_sort;
54using std::stack;
55using std::string;
56using std::vector;
57
58using boost::scoped_ptr;
59using boost::shared_ptr;
60using boost::lexical_cast;
61
62// cvs history recording stuff
63
// Interned-symbol handles: each of these is an index into one of the
// interner<unsigned long> tables held in cvs_history, used in place of
// the corresponding string for cheap copies and comparisons.
typedef unsigned long cvs_branchname;
typedef unsigned long cvs_author;
typedef unsigned long cvs_changelog;
typedef unsigned long cvs_version;
typedef unsigned long cvs_path;
typedef unsigned long cvs_tag;

struct cvs_history;
72
// One commit to one file, extracted from an RCS delta/deltatext pair.
struct
cvs_commit
{
  // Build a commit record for 'rcs_version' of file 'r'; 'ident' is
  // the monotone file id of the reconstructed content. Author,
  // changelog and tags are interned through 'cvs'.
  cvs_commit(rcs_file const & r,
             string const & rcs_version,
             file_id const & ident,
             cvs_history & cvs);

  bool is_synthetic_branch_root;  // CVS "was initially added on branch" stub (see is_sbr)
  time_t time;                    // commit time parsed from the RCS date field
  bool alive;                     // false when the delta state is "dead"
  cvs_author author;              // interned handles (see typedefs above)
  cvs_changelog changelog;
  cvs_version version;
  cvs_path path;
  vector<cvs_tag> tags;           // tags naming exactly this version

  // Commits order by time only; used to sort a branch's lineage.
  bool operator<(cvs_commit const & other) const
  {
    return time < other.time;
  }
};
95
96struct
97cvs_branch
98{
99 bool has_a_branchpoint;
100 bool has_a_commit;
101 time_t last_branchpoint;
102 time_t first_commit;
103
104 map<cvs_path, cvs_version> live_at_beginning;
105 vector<cvs_commit> lineage;
106
107 cvs_branch()
108 : has_a_branchpoint(false),
109 has_a_commit(false),
110 last_branchpoint(0),
111 first_commit(0)
112 {
113 }
114
115 void note_commit(time_t now)
116 {
117 if (!has_a_commit)
118 {
119 first_commit = now;
120 }
121 else
122 {
123 if (now < first_commit)
124 first_commit = now;
125 }
126 has_a_commit = true;
127 }
128
129 void note_branchpoint(time_t now)
130 {
131 has_a_branchpoint = true;
132 if (now > last_branchpoint)
133 last_branchpoint = now;
134 }
135
136 time_t beginning() const
137 {
138 I(has_a_branchpoint || has_a_commit);
139 if (has_a_commit)
140 {
141 I(first_commit != 0);
142 return first_commit;
143 }
144 else
145 {
146 I(last_branchpoint != 0);
147 return last_branchpoint;
148 }
149 }
150
151 void append_commit(cvs_commit const & c)
152 {
153 I(c.time != 0);
154 note_commit(c.time);
155 lineage.push_back(c);
156 }
157};
158
// Global state of one CVS-repository import: interner tables, the
// per-file branchpoint indexes, the set of discovered branches, and
// the branch stack used while walking a single RCS file.
struct
cvs_history
{

  // Interner tables mapping strings to the small integer handles
  // (cvs_branchname, cvs_author, ...) used throughout the import.
  interner<unsigned long> branch_interner;
  interner<unsigned long> author_interner;
  interner<unsigned long> changelog_interner;
  interner<unsigned long> file_version_interner;
  interner<unsigned long> path_interner;
  interner<unsigned long> tag_interner;

  // assume admin has foo:X.Y.0.N in it, then
  // this multimap contains entries of the form
  // X.Y -> foo
  multimap<string, string> branchpoints;

  // and this map contains entries of the form
  // X.Y.N.1 -> foo
  map<string, string> branch_first_entries;

  // branch name -> branch
  map<string, shared_ptr<cvs_branch> > branches;
  shared_ptr<cvs_branch> trunk;

  // stack of branches we're injecting states into
  stack< shared_ptr<cvs_branch> > stk;
  stack< cvs_branchname > bstk;

  // tag -> time, revision
  //
  // used to resolve the *last* revision which has a given tag
  // applied; this is the revision which wins the tag.
  map<unsigned long, pair<time_t, revision_id> > resolved_tags;

  // File currently being imported (with ",v" and "Attic/" stripped)
  // and its interned handle; both set by set_filename().
  file_path curr_file;
  cvs_path curr_file_interned;

  // Prefix joined with "." to CVS branch names to form full branch
  // names (see push_branch).
  string base_branch;

  // Progress tickers: file versions imported, branches discovered.
  ticker n_versions;
  ticker n_tree_branches;

  cvs_history();
  // Record the file being imported; strips the ",v" suffix and any
  // "Attic/" path component.
  void set_filename(string const & file,
                    file_id const & ident);

  // Rebuild branchpoints and branch_first_entries from r.admin.symbols.
  void index_branchpoint_symbols(rcs_file const & r);

  // Enter/leave a branch scope on stk/bstk; push_branch creates the
  // branch record on first sight (private branches get anonymous,
  // throwaway records).
  void push_branch(string const & branch_name, bool private_branch);
  void pop_branch();
};
210
211
212static bool
213is_sbr(shared_ptr<rcs_delta> dl,
214 shared_ptr<rcs_deltatext> dt)
215{
216
217 // CVS abuses the RCS format a bit (ha!) when storing a file which
218 // was only added on a branch: on the root of the branch there'll be
219 // a commit with dead state, empty text, and a log message
220 // containing the string "file foo was initially added on branch
221 // bar". We recognize and ignore these cases, as they do not
222 // "really" represent commits to be clustered together.
223
224 if (dl->state != "dead")
225 return false;
226
227 if (!dt->text.empty())
228 return false;
229
230 string log_bit = "was initially added on branch";
231 string::const_iterator i = search(dt->log.begin(),
232 dt->log.end(),
233 log_bit.begin(),
234 log_bit.end());
235
236 return i != dt->log.end();
237}
238
239
// Extract one commit's worth of data (time, author, changelog, tags,
// liveness) for 'rcs_version' from the parsed RCS file 'r'.
cvs_commit::cvs_commit(rcs_file const & r,
                       string const & rcs_version,
                       file_id const & ident,
                       cvs_history & cvs)
{
  // Both the delta (metadata) and deltatext (log/text) records must
  // exist for the requested version.
  map<string, shared_ptr<rcs_delta> >::const_iterator delta =
    r.deltas.find(rcs_version);
  I(delta != r.deltas.end());

  map<string, shared_ptr<rcs_deltatext> >::const_iterator deltatext =
    r.deltatexts.find(rcs_version);
  I(deltatext != r.deltatexts.end());

  struct tm t;
  // We need to initialize t to all zeros, because strptime has a habit of
  // leaving bits of the data structure alone, letting garbage sneak into
  // our output.
  memset(&t, 0, sizeof(t));
  char const * dp = delta->second->date.c_str();
  L(FL("Calculating time of %s") % dp);
#ifdef HAVE_STRPTIME
  // RCS dates come with either two- or four-digit years; try the
  // two-digit form first, then insist on the four-digit form.
  if (strptime(dp, "%y.%m.%d.%H.%M.%S", &t) == NULL)
    I(strptime(dp, "%Y.%m.%d.%H.%M.%S", &t) != NULL);
#else
  I(sscanf(dp, "%d.%d.%d.%d.%d.%d", &(t.tm_year), &(t.tm_mon),
           &(t.tm_mday), &(t.tm_hour), &(t.tm_min), &(t.tm_sec))==6);
  t.tm_mon--;   // struct tm months are 0-based
  // Apparently some RCS files have 2 digit years, others four; tm always
  // wants a 2 (or 3) digit year (years since 1900).
  if (t.tm_year > 1900)
    t.tm_year-=1900;
#endif
  time = mktime(&t);
  L(FL("= %i") % time);

  is_synthetic_branch_root = is_sbr(delta->second,
                                    deltatext->second);

  alive = delta->second->state != "dead";
  // Synthetic branch roots get a canned changelog so they all cluster
  // together rather than by their per-file log messages.
  if (is_synthetic_branch_root)
    changelog = cvs.changelog_interner.intern("synthetic branch root changelog");
  else
    changelog = cvs.changelog_interner.intern(deltatext->second->log);
  author = cvs.author_interner.intern(delta->second->author);
  path = cvs.curr_file_interned;
  version = cvs.file_version_interner.intern(ident.inner()());

  // Every admin-section symbol naming exactly this version is a tag
  // on this commit.
  typedef multimap<string,string>::const_iterator ity;
  pair<ity,ity> range = r.admin.symbols.equal_range(rcs_version);
  for (ity i = range.first; i != range.second; ++i)
    {
      if (i->first == rcs_version)
        {
          L(FL("version %s -> tag %s") % rcs_version % i->second);
          tags.push_back(cvs.tag_interner.intern(i->second));
        }
    }

}
299
300
// piece table stuff
//
// RCS stores each version as line-edit hunks against a neighbouring
// version. The piece table lets us reconstruct versions by shuffling
// (pos, len) references into the stored deltatexts instead of copying
// line strings around.

struct piece;

struct
piece_store
{
  // All deltatexts indexed so far; a piece refers to one by index.
  vector< shared_ptr<rcs_deltatext> > texts;
  // Split dt->text into one piece per line (defined below).
  void index_deltatext(shared_ptr<rcs_deltatext> const & dt,
                       vector<piece> & pieces);
  // Concatenate the given pieces into 'out'.
  void build_string(vector<piece> const & pieces,
                    string & out);
  void reset() { texts.clear(); }
};
315
// FIXME: kludge, I was lazy and did not make this
// a properly scoped variable.

static piece_store global_pieces;


// A (pos, len) view of one line inside one of global_pieces' stored
// texts; copying a piece never copies text.
struct
piece
{
  piece(string::size_type p, string::size_type l, unsigned long id) :
    pos(p), len(l), string_id(id) {}
  string::size_type pos;       // offset into the referenced text
  string::size_type len;       // line length (includes '\n' when present)
  unsigned long string_id;     // index into global_pieces.texts
  // Materialize this piece as a fresh string (copies from the store).
  string operator*() const
  {
    return string(global_pieces.texts.at(string_id)->text.data() + pos, len);
  }
};
335
336
337void
338piece_store::build_string(vector<piece> const & pieces,
339 string & out)
340{
341 out.clear();
342 out.reserve(pieces.size() * 60);
343 for(vector<piece>::const_iterator i = pieces.begin();
344 i != pieces.end(); ++i)
345 out.append(texts.at(i->string_id)->text, i->pos, i->len);
346}
347
348void
349piece_store::index_deltatext(shared_ptr<rcs_deltatext> const & dt,
350 vector<piece> & pieces)
351{
352 pieces.clear();
353 pieces.reserve(dt->text.size() / 30);
354 texts.push_back(dt);
355 unsigned long id = texts.size() - 1;
356 string::size_type begin = 0;
357 string::size_type end = dt->text.find('\n');
358 while(end != string::npos)
359 {
360 // nb: the piece includes the '\n'
361 pieces.push_back(piece(begin, (end - begin) + 1, id));
362 begin = end + 1;
363 end = dt->text.find('\n', begin);
364 }
365 if (begin != dt->text.size())
366 {
367 // the text didn't end with '\n', so neither does the piece
368 end = dt->text.size();
369 pieces.push_back(piece(begin, end - begin, id));
370 }
371}
372
373
// Apply one RCS edit directive. On entry 'i' points at the directive
// line ("a<pos> <len>" or "d<pos> <len>"); on exit it has been
// advanced past the directive and, for 'a', past the <len> inserted
// lines. 'cursor' tracks how many source lines have been copied to
// dest so far, and is shared across successive hunks of one delta.
static void
process_one_hunk(vector< piece > const & source,
                 vector< piece > & dest,
                 vector< piece >::const_iterator & i,
                 int & cursor)
{
  string directive = **i;
  // NOTE(review): bare assert, unlike the I() used elsewhere in this file
  assert(directive.size() > 1);
  ++i;

  try
    {
      char code;
      int pos, len;
      if (sscanf(directive.c_str(), " %c %d %d", &code, &pos, &len) != 3)
        throw oops("illformed directive '" + directive + "'");

      if (code == 'a')
        {
          // 'ax y' means "copy from source to dest until cursor == x, then
          // copy y lines from delta, leaving cursor where it is"
          while (cursor < pos)
            dest.push_back(source.at(cursor++));
          I(cursor == pos);
          while (len--)
            dest.push_back(*i++);
        }
      else if (code == 'd')
        {
          // 'dx y' means "copy from source to dest until cursor == x-1,
          // then increment cursor by y, ignoring those y lines"
          while (cursor < (pos - 1))
            dest.push_back(source.at(cursor++));
          I(cursor == pos - 1);
          cursor += len;
        }
      else
        throw oops("unknown directive '" + directive + "'");
    }
  catch (out_of_range &)
    {
      // source.at() threw: the directive referenced lines beyond the
      // source; report with enough context to debug the RCS file.
      throw oops("out_of_range while processing " + directive
                 + " with source.size() == "
                 + lexical_cast<string>(source.size())
                 + " and cursor == "
                 + lexical_cast<string>(cursor));
    }
}
422
423static void
424construct_version(vector< piece > const & source_lines,
425 string const & dest_version,
426 vector< piece > & dest_lines,
427 rcs_file const & r)
428{
429 dest_lines.clear();
430 dest_lines.reserve(source_lines.size());
431
432 I(r.deltas.find(dest_version) != r.deltas.end());
433 shared_ptr<rcs_delta> delta = r.deltas.find(dest_version)->second;
434
435 I(r.deltatexts.find(dest_version) != r.deltatexts.end());
436 shared_ptr<rcs_deltatext> deltatext = r.deltatexts.find(dest_version)->second;
437
438 vector<piece> deltalines;
439 global_pieces.index_deltatext(deltatext, deltalines);
440
441 int cursor = 0;
442 for (vector<piece>::const_iterator i = deltalines.begin();
443 i != deltalines.end(); )
444 process_one_hunk(source_lines, dest_lines, i, cursor);
445 while (cursor < static_cast<int>(source_lines.size()))
446 dest_lines.push_back(source_lines[cursor++]);
447}
448
449// FIXME: should these be someplace else? using 'friend' to reach into the
450// DB is stupid, but it's also stupid to put raw edge insert methods on the
451// DB itself. or is it? hmm.. encapsulation vs. usage guidance..
452void
453rcs_put_raw_file_edge(database & db,
454 file_id const & old_id,
455 file_id const & new_id,
456 delta const & del)
457{
458 if (old_id == new_id)
459 {
460 L(FL("skipping identity file edge"));
461 return;
462 }
463
464 if (db.file_version_exists(old_id))
465 {
466 // we already have a way to get to this old version,
467 // no need to insert another reconstruction path
468 L(FL("existing path to %s found, skipping") % old_id);
469 }
470 else
471 {
472 I(db.file_or_manifest_base_exists(new_id, "files")
473 || db.delta_exists(new_id.inner(), "file_deltas"));
474 db.put_file_delta(old_id, new_id, file_delta(del));
475 }
476}
477
478
479static void
480insert_into_db(database & db, data const & curr_data,
481 file_id const & curr_id,
482 vector< piece > const & next_lines,
483 data & next_data,
484 file_id & next_id)
485{
486 // inserting into the DB
487 // note: curr_lines is a "new" (base) version
488 // and next_lines is an "old" (derived) version.
489 // all storage edges go from new -> old.
490 {
491 string tmp;
492 global_pieces.build_string(next_lines, tmp);
493 next_data = data(tmp);
494 }
495 delta del;
496 diff(curr_data, next_data, del);
497 calculate_ident(file_data(next_data), next_id);
498 rcs_put_raw_file_edge(db, next_id, curr_id, del);
499}
500
501
502
503/*
504
505please read this exhaustingly long comment and understand it
506before mucking with the branch inference logic.
507
508we are processing a file version. a branch might begin here. if
509the current version is X.Y, then there is a branch B starting
510here iff there is a symbol in the admin section called X.Y.0.Z,
511where Z is the branch number (or if there is a private branch
512called X.Y.Z, which is either an import branch or some private
513RCS cruft).
514
515the version X.Y is then considered the branchpoint of B in the
516current file. this does *not* mean that the CVS key -- an
517abstraction representing whole-tree operations -- of X.Y is the
518branchpoint across the CVS archive we're processing.
519
520in fact, CVS does not record the occurrence of a branching
521action (tag -b). we have no idea who executed that command and
522when. what we know instead is the commit X.Y immediately
523preceding the branch -- CVS considers this the branchpoint --
524in this file's reduced view of history. we also know the first
525commit X.Y.Z.1 inside the branch (which might not exist).
526
527our old strategy was to consider all branches nested in a
528hierarchy, which was a super-tree of all the branch trees in all
529the CVS files in a repository. this involved considering X.Y as
530the parent version of branch X.Y.Z, and selecting "the"
531branchpoint connecting the two as the least CVS key X.Y.Z.1
532committed inside the branch B.
533
534this was a mistake, for two significant reasons.
535
536first, some files do not *have* any commit inside the branch B,
537only a branchpoint X.Y.0.Z. this branchpoint is actually the
538last commit *before* the user branched, and could be a very old
539commit, long before the branch was formed, so it is useless in
540determining the branch structure.
541
542second, some files do not have a branch B, or worse, have
543branched into B from an "ancestor" branch A, where a different
544file branches into B from a different ancestor branch C. in
545other words, while there *is* a tree structure within the X.Y.Z
546branches of each file, there is *no* shared tree structure
547between the branch names across a repository. in one file A can
548be an ancestor of B, in another file B can be an ancestor of A.
549
550thus, we give up on establishing a hierarchy between branches
551altogether. all branches exist in a flat namespace, and all are
552direct descendants of the empty revision at the root of
553history. each branchpoint symbol mentioned in the
554administrative section of a file is considered the root of a new
555lineage.
556
557*/
558
559
// Walk one RCS branch of file 'r', starting at 'begin_version' with
// its already-reconstructed lines/data/id. For each version: record a
// cvs_commit on the current branch (top of cvs.stk), reconstruct and
// store the next (older) version, note any branchpoints rooted here,
// and recurse into child branches. Stops when a version has no 'next'.
static void
process_branch(database & db,
               string const & begin_version,
               vector< piece > const & begin_lines,
               data const & begin_data,
               file_id const & begin_id,
               rcs_file const & r,
               cvs_history & cvs)
{
  string curr_version = begin_version;
  // two line buffers, swapped as we advance along the delta chain
  scoped_ptr< vector< piece > > next_lines(new vector<piece>);
  scoped_ptr< vector< piece > > curr_lines(new vector<piece>
                                           (begin_lines.begin(),
                                            begin_lines.end()));
  data curr_data(begin_data), next_data;
  file_id curr_id(begin_id), next_id;

  while(! (r.deltas.find(curr_version) == r.deltas.end()))
    {
      L(FL("version %s has %d lines") % curr_version % curr_lines->size());

      // synthetic branch roots (see is_sbr) are not real commits
      cvs_commit curr_commit(r, curr_version, file_id(curr_id), cvs);
      if (!curr_commit.is_synthetic_branch_root)
        {
          cvs.stk.top()->append_commit(curr_commit);
          ++cvs.n_versions;
        }

      string next_version = r.deltas.find(curr_version)->second->next;

      if (! next_version.empty())
        {
          L(FL("following RCS edge %s -> %s") % curr_version % next_version);

          construct_version(*curr_lines, next_version, *next_lines, r);
          L(FL("constructed RCS version %s, inserting into database") %
            next_version);

          insert_into_db(db, curr_data, curr_id,
                         *next_lines, next_data, next_id);
        }

      // mark the beginning-of-branch time and state of this file if
      // we're at a branchpoint
      typedef multimap<string,string>::const_iterator ity;
      pair<ity,ity> range = cvs.branchpoints.equal_range(curr_version);
      if (range.first != cvs.branchpoints.end()
          && range.first->first == curr_version)
        {
          for (ity i = range.first; i != range.second; ++i)
            {
              // briefly enter the branch just to record its start state
              cvs.push_branch(i->second, false);
              shared_ptr<cvs_branch> b = cvs.stk.top();
              if (curr_commit.alive)
                b->live_at_beginning[cvs.curr_file_interned] = curr_commit.version;
              b->note_branchpoint(curr_commit.time);
              cvs.pop_branch();
            }
        }


      // recursively follow any branch commits coming from the branchpoint
      shared_ptr<rcs_delta> curr_delta = r.deltas.find(curr_version)->second;
      for(vector<string>::const_iterator i = curr_delta->branches.begin();
          i != curr_delta->branches.end(); ++i)
        {
          string branch;
          data branch_data;
          file_id branch_id;
          vector< piece > branch_lines;
          bool priv = false;
          // branches without a symbolic name are treated as private
          map<string, string>::const_iterator be = cvs.branch_first_entries.find(*i);

          if (be != cvs.branch_first_entries.end())
            branch = be->second;
          else
            priv = true;

          L(FL("following RCS branch %s = '%s'") % (*i) % branch);

          construct_version(*curr_lines, *i, branch_lines, r);
          insert_into_db(db, curr_data, curr_id,
                        branch_lines, branch_data, branch_id);

          cvs.push_branch(branch, priv);
          process_branch(db, *i, branch_lines, branch_data, branch_id, r, cvs);
          cvs.pop_branch();

          L(FL("finished RCS branch %s = '%s'") % (*i) % branch);
        }

      if (!r.deltas.find(curr_version)->second->next.empty())
        {
          // advance
          curr_data = next_data;
          curr_id = next_id;
          curr_version = next_version;
          swap(next_lines, curr_lines);
          next_lines->clear();
        }
      else break;
    }
}
663
664
665static void
666import_rcs_file_with_cvs(database & db, string const & filename,
667 cvs_history & cvs)
668{
669 rcs_file r;
670 L(FL("parsing RCS file %s") % filename);
671 parse_rcs_file(filename, r);
672 L(FL("parsed RCS file %s OK") % filename);
673
674 {
675 vector< piece > head_lines;
676 I(r.deltatexts.find(r.admin.head) != r.deltatexts.end());
677 I(r.deltas.find(r.admin.head) != r.deltas.end());
678
679 file_id fid;
680 file_data dat(r.deltatexts.find(r.admin.head)->second->text);
681 calculate_ident(dat, fid);
682
683 cvs.set_filename(filename, fid);
684 cvs.index_branchpoint_symbols (r);
685 db.put_file(fid, dat);
686
687 {
688 // create the head state in case it is a loner
689 // cvs_key k;
690 // shared_ptr<cvs_state> s;
691 // L(FL("noting head version %s : %s") % cvs.curr_file % r.admin.head);
692 // cvs.find_key_and_state (r, r.admin.head, k, s);
693 }
694
695 global_pieces.reset();
696 global_pieces.index_deltatext(r.deltatexts.find(r.admin.head)->second, head_lines);
697 process_branch(db, r.admin.head, head_lines, dat.inner(), fid, r, cvs);
698 global_pieces.reset();
699 }
700
701 ui.set_tick_trailer("");
702}
703
704void
705test_parse_rcs_file(system_path const & filename)
706{
707 cvs_history cvs;
708
709 I(! filename.empty());
710 assert_path_is_file(filename);
711
712 P(F("parsing RCS file %s") % filename);
713 rcs_file r;
714 parse_rcs_file(filename.as_external(), r);
715 P(F("parsed RCS file %s OK") % filename);
716}
717
718
719// CVS importing stuff follows
720
721
// Split a CVS version string such as "1.3.0.2" on '.' into its
// components. Empty components (from leading, trailing or doubled
// dots) are skipped — matching the default behaviour of the
// boost::char_separator tokenizer this hand-rolled split replaces,
// while dropping the boost::tokenizer dependency.
static void
split_version(string const & v, vector<string> & vs)
{
  vs.clear();
  string::size_type begin = 0;
  while (begin <= v.size())
    {
      string::size_type dot = v.find('.', begin);
      string::size_type end = (dot == string::npos) ? v.size() : dot;
      if (end > begin)
        vs.push_back(v.substr(begin, end - begin));
      if (dot == string::npos)
        break;
      begin = dot + 1;
    }
}
731
// Inverse of split_version: glue the components back together with
// '.' separators into 'v' (which is cleared first).
static void
join_version(vector<string> const & vs, string & v)
{
  v.clear();
  bool first = true;
  for (vector<string>::const_iterator i = vs.begin();
       i != vs.end(); ++i)
    {
      if (!first)
        v += ".";
      v += *i;
      first = false;
    }
}
744
// Initialize the progress tickers: imported file versions ("v") and
// discovered branches ("b"), both counting in steps of 1.
cvs_history::cvs_history() :
  n_versions("versions", "v", 1),
  n_tree_branches("branches", "b", 1)
{
}
750
751void
752cvs_history::set_filename(string const & file,
753 file_id const & ident)
754{
755 L(FL("importing file '%s'") % file);
756 I(file.size() > 2);
757 I(file.substr(file.size() - 2) == string(",v"));
758 string ss = file;
759 ui.set_tick_trailer(ss);
760 ss.resize(ss.size() - 2);
761 // remove Attic/ if present
762 string::size_type last_slash=ss.rfind('/');
763 if (last_slash!=string::npos && last_slash>=5
764 && ss.substr(last_slash-5,6)=="Attic/")
765 ss.erase(last_slash-5,6);
766 curr_file = file_path_internal(ss);
767 curr_file_interned = path_interner.intern(ss);
768}
769
770void cvs_history::index_branchpoint_symbols(rcs_file const & r)
771{
772 branchpoints.clear();
773 branch_first_entries.clear();
774
775 for (multimap<string, string>::const_iterator i =
776 r.admin.symbols.begin(); i != r.admin.symbols.end(); ++i)
777 {
778 string const & num = i->first;
779 string const & sym = i->second;
780
781 vector<string> components;
782 split_version(num, components);
783
784 vector<string> first_entry_components;
785 vector<string> branchpoint_components;
786
787 if (components.size() > 2 &&
788 (components.size() % 2 == 1))
789 {
790 // this is a "vendor" branch
791 //
792 // such as "1.1.1", where "1.1" is the branchpoint and
793 // "1.1.1.1" will be the first commit on it.
794
795 first_entry_components = components;
796 first_entry_components.push_back("1");
797
798 branchpoint_components = components;
799 branchpoint_components.erase(branchpoint_components.end() - 1,
800 branchpoint_components.end());
801
802 }
803
804 else if (components.size() > 2 &&
805 (components.size() % 2 == 0) &&
806 components[components.size() - 2] == string("0"))
807 {
808 // this is a "normal" branch
809 //
810 // such as "1.3.0.2", where "1.3" is the branchpoint and
811 // "1.3.2.1"
812
813 first_entry_components = components;
814 first_entry_components[first_entry_components.size() - 2]
815 = first_entry_components[first_entry_components.size() - 1];
816 first_entry_components[first_entry_components.size() - 1]
817 = string("1");
818
819 branchpoint_components = components;
820 branchpoint_components.erase(branchpoint_components.end() - 2,
821 branchpoint_components.end());
822 }
823
824 string first_entry_version;
825 join_version(first_entry_components, first_entry_version);
826
827 L(FL("first version in branch %s would be %s")
828 % sym % first_entry_version);
829 branch_first_entries.insert(make_pair(first_entry_version, sym));
830
831 string branchpoint_version;
832 join_version(branchpoint_components, branchpoint_version);
833
834 L(FL("file branchpoint for %s at %s") % sym % branchpoint_version);
835 branchpoints.insert(make_pair(branchpoint_version, sym));
836 }
837}
838
839
840
841void
842cvs_history::push_branch(string const & branch_name, bool private_branch)
843{
844 shared_ptr<cvs_branch> branch;
845
846 string bname = base_branch + "." + branch_name;
847 I(stk.size() > 0);
848
849 if (private_branch)
850 {
851 branch = shared_ptr<cvs_branch>(new cvs_branch());
852 stk.push(branch);
853 bstk.push(branch_interner.intern(""));
854 return;
855 }
856 else
857 {
858 map<string, shared_ptr<cvs_branch> >::const_iterator b = branches.find(bname);
859 if (b == branches.end())
860 {
861 branch = shared_ptr<cvs_branch>(new cvs_branch());
862 branches.insert(make_pair(bname, branch));
863 ++n_tree_branches;
864 }
865 else
866 branch = b->second;
867
868 stk.push(branch);
869 bstk.push(branch_interner.intern(bname));
870 }
871}
872
// Leave the current branch scope.
void
cvs_history::pop_branch()
{
  // > 1, not > 0: the bottom entry (presumably the trunk pushed by
  // the import driver — confirm against the caller) must stay.
  I(stk.size() > 1);
  stk.pop();
  bstk.pop();
}
880
881
882class
883cvs_tree_walker
884 : public tree_walker
885{
886 cvs_history & cvs;
887 database & db;
888public:
889 cvs_tree_walker(cvs_history & c, database & d) :
890 cvs(c), db(d)
891 {
892 }
893 virtual void visit_file(file_path const & path)
894 {
895 string file = path.as_external();
896 if (file.substr(file.size() - 2) == string(",v"))
897 {
898 try
899 {
900 import_rcs_file_with_cvs(db, file, cvs);
901 }
902 catch (oops const & o)
903 {
904 W(F("error reading RCS file %s: %s") % file % o.what());
905 }
906 }
907 else
908 L(FL("skipping non-RCS file %s") % file);
909 }
910 virtual ~cvs_tree_walker() {}
911};
912
913
914
915
916//
917// our task here is to produce a sequence of revision descriptions
918// from the per-file commit records we have. we do this by rolling
919// forwards through the temporally sorted file-commit list
920// accumulating file-commits into revisions and flushing the
921// revisions when we feel they are "complete".
922//
923// revisions have to have a time associated with them. this time
924// will be the first time of any commit associated with the
925// revision. they have an author and a changelog, which is shared
926// by all the file-commits in the revision.
927//
928// there might be multiple revisions overlapping in time. this is
929// legal wrt. CVS. we keep a set, and search all members of the set
930// for the best match.
931//
932// consider this situation of overlapping revisions:
933//
934// +---------------+ +---------------+ +---------------+
935// | rev #1 @ 0011 | | rev #2 @ 0012 | | rev #3 @ 0013 |
936// |~~~~~~~~~~~~~~~| |~~~~~~~~~~~~~~~| |~~~~~~~~~~~~~~~|
937// | patch foo.txt | | patch bar.txt | | patch baz.txt |
938// +---------------+ +---------------+ +---------------+
939//
940// suppose you have this situation and you run across a "patch
941// bar.txt" commit at timestamp 0014. what do you do?
942//
943// - you know that rev #2 cannot accept this commit, simply because
944// two commits on the same file makes *two* revisions, not one.
945//
946// - perhaps rev #3 could accept it; after all, it could be that the
947// commit associated with rev #2 released its commit lock, and the
948// commit associated with rev #3 quickly updated and committed at
949// 0013, finishing off at 0014.
950//
951// - can rev #1 accept it? no. because CVS calculated the version it
952// expected to see in bar.txt before calling up the server, when
953// committing rev #1. the version it expected to see was the version
954// in bar.txt *before* time 0012; that is, before rev #2 had any effect
955// on bar.txt. when it contacted the server, the commit associated
956// with rev #1 would have aborted if it had seen any other number.
957// so rev #1 could not start before an edit to bar.txt and then
958// include its own edit to bar.txt.
959//
960// so we have only one case where bar.txt can be accepted. if the
961// commit is not accepted into a legal rev (outside the window,
962// wrong changelog/author) it starts a new revision.
963//
964// as we scan forwards, if we hit timestamps which lie beyond rev #n's
965// window, we flush rev #n.
966//
967// if there are multiple coincident and legal revs to direct a
968// commit to (all with the same author/changelog), we direct the
969// commit to the rev with the closest initial timestamp. that is,
970// the *latest* beginning time.
971
// A cluster is a group of per-file commits destined to become one
// monotone revision: shared author and changelog, at most one entry
// per file, close together in time (see the long comment above).
struct
cvs_cluster
{
  time_t first_time;      // time of the earliest commit in the cluster
  cvs_author author;
  cvs_changelog changelog;
  set<cvs_tag> tags;      // union of the member commits' tags

  cvs_cluster(time_t t,
              cvs_author a,
              cvs_changelog c)
    : first_time(t),
      author(a),
      changelog(c)
  {}

  // One file's contribution to the cluster.
  struct entry
  {
    bool live;            // false when the file is dead at this version
    cvs_version version;
    time_t time;
    entry(bool l, cvs_version v, time_t t)
      : live(l),
        version(v),
        time(t)
    {}
  };

  typedef map<cvs_path, entry> entry_map;
  entry_map entries;
};
1003
1004
// Consumes the clusters of one branch, turning each into a monotone
// revision; prepared revisions accumulate in 'preps' and are written
// out by store_revisions().
struct
cluster_consumer
{
  cvs_history & cvs;
  key_store & keys;
  project_t & project;

  string const & branchname;
  cvs_branch const & branch;
  // presumably tracks directories already added to the roster and the
  // live version per file — confirm in consume_cluster/build_cset
  set<file_path> created_dirs;
  map<cvs_path, cvs_version> live_files;
  ticker & n_revisions;

  // A revision ready to be written, with the cert inputs
  // (time/author/changelog/tags) captured from its source cluster.
  struct prepared_revision
  {
    prepared_revision(revision_id i,
                      shared_ptr<revision_t> r,
                      cvs_cluster const & c);
    revision_id rid;
    shared_ptr<revision_t> rev;
    time_t time;
    cvs_author author;
    cvs_changelog changelog;
    vector<cvs_tag> tags;
  };

  vector<prepared_revision> preps;

  // Rolling roster state: parent_rid/child_rid bracket the revision
  // currently being built.
  roster_t ros;
  temp_node_id_source nis;
  editable_roster_base editable_ros;
  revision_id parent_rid, child_rid;

  cluster_consumer(project_t & project,
                   key_store & keys,
                   cvs_history & cvs,
                   string const & branchname,
                   cvs_branch const & branch,
                   ticker & n_revs);

  void consume_cluster(cvs_cluster const & c);
  void add_missing_parents(file_path const & sp, cset & cs);
  void build_cset(cvs_cluster const & c, cset & cs);
  void store_auxiliary_certs(prepared_revision const & p);
  void store_revisions();
};
1051
typedef shared_ptr<cvs_cluster>
cluster_ptr;

// Orders clusters by start time, for the cluster_set below.
// NOTE(review): two distinct clusters with identical first_time
// compare equivalent under this ordering, so set::insert would
// silently drop the second one — confirm callers never create two
// clusters with the same timestamp.
struct
cluster_ptr_lt
{
  bool operator()(cluster_ptr const & a,
                  cluster_ptr const & b) const
  {
    return a->first_time < b->first_time;
  }
};

typedef set<cluster_ptr, cluster_ptr_lt>
cluster_set;
1067
1068void
1069import_branch(project_t & project,
1070 key_store & keys,
1071 cvs_history & cvs,
1072 string const & branchname,
1073 shared_ptr<cvs_branch> const & branch,
1074 ticker & n_revs)
1075{
1076 cluster_set clusters;
1077 cluster_consumer cons(project, keys, cvs, branchname, *branch, n_revs);
1078 unsigned long commits_remaining = branch->lineage.size();
1079
1080 // step 1: sort the lineage
1081 stable_sort(branch->lineage.begin(), branch->lineage.end());
1082
1083 for (vector<cvs_commit>::const_iterator i = branch->lineage.begin();
1084 i != branch->lineage.end(); ++i)
1085 {
1086 commits_remaining--;
1087
1088 L(FL("examining next commit [t:%d] [p:%s] [a:%s] [c:%s]")
1089 % i->time
1090 % cvs.path_interner.lookup(i->path)
1091 % cvs.author_interner.lookup(i->author)
1092 % cvs.changelog_interner.lookup(i->changelog));
1093
1094 // step 2: expire all clusters from the beginning of the set which
1095 // have passed the window size
1096 while (!clusters.empty())
1097 {
1098 cluster_set::iterator j = clusters.begin();
1099 if ((*j)->first_time + constants::cvs_window < i->time)
1100 {
1101 L(FL("expiring cluster"));
1102 cons.consume_cluster(**j);
1103 clusters.erase(j);
1104 }
1105 else
1106 break;
1107 }
1108
1109 // step 3: find the last still-live cluster to have touched this
1110 // file
1111 time_t time_of_last_cluster_touching_this_file = 0;
1112
1113 unsigned clu = 0;
1114 for (cluster_set::const_iterator j = clusters.begin();
1115 j != clusters.end(); ++j)
1116 {
1117 L(FL("examining cluster %d to see if it touched %d")
1118 % clu++
1119 % i->path);
1120
1121 cvs_cluster::entry_map::const_iterator k = (*j)->entries.find(i->path);
1122 if ((k != (*j)->entries.end())
1123 && (k->second.time > time_of_last_cluster_touching_this_file))
1124 {
1125 L(FL("found cluster touching %d: [t:%d] [a:%d] [c:%d]")
1126 % i->path
1127 % (*j)->first_time
1128 % (*j)->author
1129 % (*j)->changelog);
1130 time_of_last_cluster_touching_this_file = (*j)->first_time;
1131 }
1132 }
1133 L(FL("last modification time is %d")
1134 % time_of_last_cluster_touching_this_file);
1135
1136 // step 4: find a cluster which starts on or after the
1137 // last_modify_time, which doesn't modify the file in question,
1138 // and which contains the same author and changelog as our
1139 // commit
1140 cluster_ptr target;
1141 for (cluster_set::const_iterator j = clusters.begin();
1142 j != clusters.end(); ++j)
1143 {
1144 if (((*j)->first_time >= time_of_last_cluster_touching_this_file)
1145 && ((*j)->author == i->author)
1146 && ((*j)->changelog == i->changelog)
1147 && ((*j)->entries.find(i->path) == (*j)->entries.end()))
1148 {
1149 L(FL("picked existing cluster [t:%d] [a:%d] [c:%d]")
1150 % (*j)->first_time
1151 % (*j)->author
1152 % (*j)->changelog);
1153
1154 target = (*j);
1155 }
1156 }
1157
1158 // if we're still not finding an active cluster,
1159 // this is probably the first commit in it. make
1160 // a new one.
1161 if (!target)
1162 {
1163 L(FL("building new cluster [t:%d] [a:%d] [c:%d]")
1164 % i->time
1165 % i->author
1166 % i->changelog);
1167
1168 target = cluster_ptr(new cvs_cluster(i->time,
1169 i->author,
1170 i->changelog));
1171 clusters.insert(target);
1172 }
1173
1174 I(target);
1175 target->entries.insert(make_pair(i->path,
1176 cvs_cluster::entry(i->alive,
1177 i->version,
1178 i->time)));
1179 for (vector<cvs_tag>::const_iterator j = i->tags.begin();
1180 j != i->tags.end(); ++j)
1181 {
1182 target->tags.insert(*j);
1183 }
1184 }
1185
1186
1187 // now we are done this lineage; flush all remaining clusters
1188 L(FL("finished branch commits, writing all pending clusters"));
1189 while (!clusters.empty())
1190 {
1191 cons.consume_cluster(**clusters.begin());
1192 clusters.erase(clusters.begin());
1193 }
1194 L(FL("finished writing pending clusters"));
1195
1196 cons.store_revisions();
1197
1198}
1199
// Top-level CVS import driver.  Phases: (1) walk the RCS tree under
// 'cvsroot' with cvs_tree_walker to build the in-memory cvs_history,
// (2) import each named branch (freeing it afterwards), (3) import the
// trunk into 'branchname' itself, (4) put a tag cert on the revision
// each resolved tag last pointed at.  Each phase runs in its own
// database transaction.
void
import_cvs_repo(project_t & project,
                key_store & keys,
                system_path const & cvsroot,
                branch_name const & branchname)

{
  // refuse a repository root: the caller must point at a module
  N(!directory_exists(cvsroot / "CVSROOT"),
    F("%s appears to be a CVS repository root directory\n"
      "try importing a module instead, with 'cvs_import %s/<module_name>")
    % cvsroot % cvsroot);

  cvs_history cvs;

  cvs.base_branch = branchname();

  // push the trunk
  cvs.trunk = shared_ptr<cvs_branch>(new cvs_branch());
  cvs.stk.push(cvs.trunk);
  cvs.bstk.push(cvs.branch_interner.intern(cvs.base_branch));

  // phase 1: scan all RCS files, populating cvs.trunk / cvs.branches
  {
    transaction_guard guard(project.db);
    cvs_tree_walker walker(cvs, project.db);
    project.db.ensure_open();
    change_current_working_dir(cvsroot);
    walk_tree(file_path(), walker);
    guard.commit();
  }

  // the walker must have left only the trunk on the branch stack
  I(cvs.stk.size() == 1);

  ticker n_revs(_("revisions"), "r", 1);

  // phase 2: import branches one at a time, erasing each from the map
  // when done so its memory can be reclaimed
  while (cvs.branches.size() > 0)
    {
      transaction_guard guard(project.db);
      map<string, shared_ptr<cvs_branch> >::const_iterator i = cvs.branches.begin();
      string branchname = i->first;
      shared_ptr<cvs_branch> branch = i->second;
      L(FL("branch %s has %d entries") % branchname % branch->lineage.size());
      import_branch(project, keys, cvs, branchname, branch, n_revs);

      // free up some memory
      cvs.branches.erase(branchname);
      guard.commit();
    }

  // phase 3: the trunk goes into the base branch
  {
    transaction_guard guard(project.db);
    L(FL("trunk has %d entries") % cvs.trunk->lineage.size());
    import_branch(project, keys, cvs, cvs.base_branch, cvs.trunk, n_revs);
    guard.commit();
  }

  // phase 4: now we have a "last" rev for each tag
  {
    ticker n_tags(_("tags"), "t", 1);
    transaction_guard guard(project.db);
    for (map<unsigned long, pair<time_t, revision_id> >::const_iterator i = cvs.resolved_tags.begin();
         i != cvs.resolved_tags.end(); ++i)
      {
        string tag = cvs.tag_interner.lookup(i->first);
        ui.set_tick_trailer("marking tag " + tag);
        project.put_tag(keys, i->second.second, tag);
        ++n_tags;
      }
    guard.commit();
  }
}
1270
// Set up a consumer for one branch.  If files were already live when
// the branch came into existence (branch.live_at_beginning), emit one
// synthetic cluster -- author "cvs_import", changelog "beginning of
// branch <name>", timestamped at the branch's beginning -- that adds
// all of them, so the branch's first real commit sees the right
// parent tree.
cluster_consumer::cluster_consumer(project_t & project,
                                   key_store & keys,
                                   cvs_history & cvs,
                                   string const & branchname,
                                   cvs_branch const & branch,
                                   ticker & n_revs)
  : cvs(cvs),
    keys(keys),
    project(project),
    branchname(branchname),
    branch(branch),
    n_revisions(n_revs),
    editable_ros(ros, nis)
{
  if (!branch.live_at_beginning.empty())
    {
      cvs_author synthetic_author =
        cvs.author_interner.intern("cvs_import");

      cvs_changelog synthetic_cl =
        cvs.changelog_interner.intern("beginning of branch "
                                      + branchname);

      time_t synthetic_time = branch.beginning();
      cvs_cluster initial_cluster(synthetic_time,
                                  synthetic_author,
                                  synthetic_cl);

      L(FL("initial cluster on branch %s has %d live entries") %
        branchname % branch.live_at_beginning.size());

      // every file live at branch start becomes a live entry in the
      // synthetic cluster, at the version it had at that point
      for (map<cvs_path, cvs_version>::const_iterator i = branch.live_at_beginning.begin();
           i != branch.live_at_beginning.end(); ++i)
        {
          cvs_cluster::entry e(true, i->second, synthetic_time);
          L(FL("initial cluster contains %s at %s") %
            cvs.path_interner.lookup(i->first) %
            cvs.file_version_interner.lookup(i->second));
          initial_cluster.entries.insert(make_pair(i->first, e));
        }
      consume_cluster(initial_cluster);
    }
}
1314
1315cluster_consumer::prepared_revision::prepared_revision(revision_id i,
1316 shared_ptr<revision_t> r,
1317 cvs_cluster const & c)
1318 : rid(i),
1319 rev(r),
1320 time(c.first_time),
1321 author(c.author),
1322 changelog(c.changelog)
1323{
1324 for (set<cvs_tag>::const_iterator i = c.tags.begin();
1325 i != c.tags.end(); ++i)
1326 {
1327 tags.push_back(*i);
1328 }
1329}
1330
1331
1332void
1333cluster_consumer::store_revisions()
1334{
1335 for (vector<prepared_revision>::const_iterator i = preps.begin();
1336 i != preps.end(); ++i)
1337 if (project.db.put_revision(i->rid, *(i->rev)))
1338 {
1339 store_auxiliary_certs(*i);
1340 ++n_revisions;
1341 }
1342}
1343
1344void
1345cluster_consumer::store_auxiliary_certs(prepared_revision const & p)
1346{
1347 for (vector<cvs_tag>::const_iterator i = p.tags.begin();
1348 i != p.tags.end(); ++i)
1349 {
1350 map<unsigned long, pair<time_t, revision_id> >::const_iterator j
1351 = cvs.resolved_tags.find(*i);
1352
1353 if (j != cvs.resolved_tags.end())
1354 {
1355 if (j->second.first < p.time)
1356 {
1357 // move the tag forwards
1358 cvs.resolved_tags.erase(*i);
1359 cvs.resolved_tags.insert(make_pair(*i, make_pair(p.time, p.rid)));
1360 }
1361 }
1362 else
1363 {
1364 cvs.resolved_tags.insert(make_pair(*i, make_pair(p.time, p.rid)));
1365 }
1366 }
1367
1368 project.put_standard_certs(keys, p.rid,
1369 branch_name(branchname),
1370 utf8(cvs.changelog_interner.lookup(p.changelog)),
1371 date_t::from_unix_epoch(p.time),
1372 cvs.author_interner.lookup(p.author));
1373}
1374
1375void
1376cluster_consumer::add_missing_parents(file_path const & path, cset & cs)
1377{
1378 if (created_dirs.find(path) != created_dirs.end())
1379 return;
1380
1381 if (!path.empty())
1382 add_missing_parents(path.dirname(), cs);
1383
1384 safe_insert(created_dirs, path);
1385 safe_insert(cs.dirs_added, path);
1386}
1387
// Translate one cluster into a cset relative to the consumer's current
// view of the tree (live_files maps path -> version for files that are
// currently live):
//   - entry live, path unknown        -> files_added (plus any missing
//                                        parent directories)
//   - entry live, version changed     -> deltas_applied
//   - entry dead, path currently live -> nodes_deleted
// An entry restating the version we already track produces no change.
void
cluster_consumer::build_cset(cvs_cluster const & c,
                             cset & cs)
{
  for (cvs_cluster::entry_map::const_iterator i = c.entries.begin();
       i != c.entries.end(); ++i)
    {
      file_path pth = file_path_internal(cvs.path_interner.lookup(i->first));

      file_id fid(cvs.file_version_interner.lookup(i->second.version));
      if (i->second.live)
        {
          map<cvs_path, cvs_version>::const_iterator e = live_files.find(i->first);
          if (e == live_files.end())
            {
              // new file: parents must exist before the add
              add_missing_parents(pth.dirname(), cs);
              L(FL("adding entry state '%s' on '%s'")
                % fid % pth);
              safe_insert(cs.files_added, make_pair(pth, fid));
              live_files[i->first] = i->second.version;
            }
          else if (e->second != i->second.version)
            {
              // existing file at a different version: record a delta
              file_id old_fid(cvs.file_version_interner.lookup(e->second));
              L(FL("applying state delta on '%s' : '%s' -> '%s'")
                % pth
                % old_fid
                % fid);
              safe_insert(cs.deltas_applied,
                          make_pair(pth, make_pair(old_fid, fid)));
              live_files[i->first] = i->second.version;
            }
        }
      else
        {
          // dead entry: delete only if we currently consider it live
          map<cvs_path, cvs_version>::const_iterator e = live_files.find(i->first);
          if (e != live_files.end())
            {
              L(FL("deleting entry state '%s' on '%s'")
                % fid % pth);
              safe_insert(cs.nodes_deleted, pth);
              live_files.erase(i->first);
            }
        }
    }
}
1434
// Turn one cluster into a prepared (not yet stored) revision: build its
// cset, apply it to the consumer's running roster to get the new
// manifest, chain the revision onto the previous one via parent_rid,
// and queue it in preps for store_revisions().
void
cluster_consumer::consume_cluster(cvs_cluster const & c)
{
  // we should never have an empty cluster; it's *possible* to have
  // an empty changeset (say on a vendor import) but every cluster
  // should have been created by at least one file commit, even
  // if the commit made no changes. it's a logical inconsistency if
  // you have an empty cluster.
  I(!c.entries.empty());

  shared_ptr<revision_t> rev(new revision_t());
  shared_ptr<cset> cs(new cset());
  build_cset(c, *cs);

  // evolve the in-memory roster and hash it for the new manifest id
  cs->apply_to(editable_ros);
  manifest_id child_mid;
  calculate_ident(ros, child_mid);
  rev->made_for = made_for_database;
  rev->new_manifest = child_mid;
  rev->edges.insert(make_pair(parent_rid, cs));
  calculate_ident(*rev, child_rid);

  // written to the database later, in store_revisions()
  preps.push_back(prepared_revision(child_rid, rev, c));

  // the next cluster's revision descends from this one
  parent_rid = child_rid;
}
1461
1462// Local Variables:
1463// mode: C++
1464// fill-column: 76
1465// c-file-style: "gnu"
1466// indent-tabs-mode: nil
1467// End:
1468// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status