monotone

monotone Mtn Source Tree

Root/rcs_import.cc

1// Copyright (C) 2002 Graydon Hoare <graydon@pobox.com>
2//
3// This program is made available under the GNU GPL version 2.0 or
4// greater. See the accompanying file COPYING for details.
5//
6// This program is distributed WITHOUT ANY WARRANTY; without even the
7// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
8// PURPOSE.
9
10#include <algorithm>
11#include <iterator>
12#include <list>
13#include <map>
14#include <set>
15#include <sstream>
16#include <stack>
17#include <stdexcept>
18#include <string>
19#include <vector>
20
21#include <unistd.h>
22
23#include <boost/shared_ptr.hpp>
24#include <boost/scoped_ptr.hpp>
25#include <boost/lexical_cast.hpp>
26#include <boost/tokenizer.hpp>
27
28#include "app_state.hh"
29#include "cert.hh"
30#include "constants.hh"
31#include "cycle_detector.hh"
32#include "database.hh"
33#include "file_io.hh"
34#include "interner.hh"
35#include "keys.hh"
36#include "packet.hh"
37#include "paths.hh"
38#include "platform-wrapped.hh"
39#include "project.hh"
40#include "rcs_file.hh"
41#include "revision.hh"
42#include "safe_map.hh"
43#include "sanity.hh"
44#include "transforms.hh"
45#include "ui.hh"
46
47using std::make_pair;
48using std::map;
49using std::multimap;
50using std::out_of_range;
51using std::pair;
52using std::search;
53using std::set;
54using std::sscanf;
55using std::stable_sort;
56using std::stack;
57using std::string;
58using std::vector;
59
60using boost::scoped_ptr;
61using boost::shared_ptr;
62using boost::lexical_cast;
63
64// cvs history recording stuff
65
66typedef unsigned long cvs_branchname;
67typedef unsigned long cvs_author;
68typedef unsigned long cvs_changelog;
69typedef unsigned long cvs_version;
70typedef unsigned long cvs_path;
71typedef unsigned long cvs_tag;
72
73struct cvs_history;
74
struct
cvs_commit
{
  // One commit to one file, as recovered from a single RCS delta.
  // Instances are accumulated into cvs_branch::lineage and later
  // clustered into whole-tree revisions.
  cvs_commit(rcs_file const & r,
             string const & rcs_version,
             file_id const & ident,
             cvs_history & cvs);

  // true for CVS's placeholder "file foo was initially added on branch
  // bar" deltas (see is_sbr below), which are not real commits.
  bool is_synthetic_branch_root;
  time_t time;              // commit time parsed from the RCS date field
  bool alive;               // false if the delta state is "dead" (file removed)
  cvs_author author;        // interned author name
  cvs_changelog changelog;  // interned log message
  cvs_version version;      // interned file content id
  cvs_path path;            // interned path of the file this commit touches
  vector<cvs_tag> tags;     // interned tags attached to this exact version

  // Commits order by time so a branch's lineage can be sorted
  // chronologically before clustering.
  bool operator<(cvs_commit const & other) const
  {
    return time < other.time;
  }
};
97
98struct
99cvs_branch
100{
101 bool has_a_branchpoint;
102 bool has_a_commit;
103 time_t last_branchpoint;
104 time_t first_commit;
105
106 map<cvs_path, cvs_version> live_at_beginning;
107 vector<cvs_commit> lineage;
108
109 cvs_branch()
110 : has_a_branchpoint(false),
111 has_a_commit(false),
112 last_branchpoint(0),
113 first_commit(0)
114 {
115 }
116
117 void note_commit(time_t now)
118 {
119 if (!has_a_commit)
120 {
121 first_commit = now;
122 }
123 else
124 {
125 if (now < first_commit)
126 first_commit = now;
127 }
128 has_a_commit = true;
129 }
130
131 void note_branchpoint(time_t now)
132 {
133 has_a_branchpoint = true;
134 if (now > last_branchpoint)
135 last_branchpoint = now;
136 }
137
138 time_t beginning() const
139 {
140 I(has_a_branchpoint || has_a_commit);
141 if (has_a_commit)
142 {
143 I(first_commit != 0);
144 return first_commit;
145 }
146 else
147 {
148 I(last_branchpoint != 0);
149 return last_branchpoint;
150 }
151 }
152
153 void append_commit(cvs_commit const & c)
154 {
155 I(c.time != 0);
156 note_commit(c.time);
157 lineage.push_back(c);
158 }
159};
160
struct
cvs_history
{

  // Interner tables map recurring strings (branch names, authors, log
  // messages, file content ids, paths, tags) to small integer handles so
  // commit records stay cheap to copy and compare.
  interner<unsigned long> branch_interner;
  interner<unsigned long> author_interner;
  interner<unsigned long> changelog_interner;
  interner<unsigned long> file_version_interner;
  interner<unsigned long> path_interner;
  interner<unsigned long> tag_interner;

  // assume admin has foo:X.Y.0.N in it, then
  // this multimap contains entries of the form
  // X.Y -> foo
  multimap<string, string> branchpoints;

  // and this map contains entries of the form
  // X.Y.N.1 -> foo
  map<string, string> branch_first_entries;

  // branch name -> branch
  map<string, shared_ptr<cvs_branch> > branches;
  shared_ptr<cvs_branch> trunk;

  // stack of branches we're injecting states into
  stack< shared_ptr<cvs_branch> > stk;
  stack< cvs_branchname > bstk;

  // tag -> time, revision
  //
  // used to resolve the *last* revision which has a given tag
  // applied; this is the revision which wins the tag.
  map<unsigned long, pair<time_t, revision_id> > resolved_tags;

  // the ,v file currently being imported, as a path and interned.
  file_path curr_file;
  cvs_path curr_file_interned;

  // prefix joined (with '.') to CVS branch symbols to form the full
  // monotone branch name (see push_branch).
  string base_branch;

  // progress tickers: file versions and distinct branches seen so far.
  ticker n_versions;
  ticker n_tree_branches;

  cvs_history();
  void set_filename(string const & file,
                    file_id const & ident);

  void index_branchpoint_symbols(rcs_file const & r);

  void push_branch(string const & branch_name, bool private_branch);
  void pop_branch();
};
212
213
214static bool
215is_sbr(shared_ptr<rcs_delta> dl,
216 shared_ptr<rcs_deltatext> dt)
217{
218
219 // CVS abuses the RCS format a bit (ha!) when storing a file which
220 // was only added on a branch: on the root of the branch there'll be
221 // a commit with dead state, empty text, and a log message
222 // containing the string "file foo was initially added on branch
223 // bar". We recognize and ignore these cases, as they do not
224 // "really" represent commits to be clustered together.
225
226 if (dl->state != "dead")
227 return false;
228
229 if (!dt->text.empty())
230 return false;
231
232 string log_bit = "was initially added on branch";
233 string::const_iterator i = search(dt->log.begin(),
234 dt->log.end(),
235 log_bit.begin(),
236 log_bit.end());
237
238 return i != dt->log.end();
239}
240
241
cvs_commit::cvs_commit(rcs_file const & r,
                       string const & rcs_version,
                       file_id const & ident,
                       cvs_history & cvs)
{
  // Build the commit record for one RCS version: locate its delta and
  // deltatext, parse the commit time, and intern author/changelog/tags.
  map<string, shared_ptr<rcs_delta> >::const_iterator delta =
    r.deltas.find(rcs_version);
  I(delta != r.deltas.end());

  map<string, shared_ptr<rcs_deltatext> >::const_iterator deltatext =
    r.deltatexts.find(rcs_version);
  I(deltatext != r.deltatexts.end());

  struct tm t;
  // We need to initialize t to all zeros, because strptime has a habit of
  // leaving bits of the data structure alone, letting garbage sneak into
  // our output.
  memset(&t, 0, sizeof(t));
  char const * dp = delta->second->date.c_str();
  L(FL("Calculating time of %s") % dp);
#ifdef HAVE_STRPTIME
  // RCS dates come with either 2-digit or 4-digit years; try the short
  // form first, then insist on the long form.
  if (strptime(dp, "%y.%m.%d.%H.%M.%S", &t) == NULL)
    I(strptime(dp, "%Y.%m.%d.%H.%M.%S", &t) != NULL);
#else
  I(sscanf(dp, "%d.%d.%d.%d.%d.%d", &(t.tm_year), &(t.tm_mon),
           &(t.tm_mday), &(t.tm_hour), &(t.tm_min), &(t.tm_sec))==6);
  t.tm_mon--;
  // Apparently some RCS files have 2 digit years, others four; tm always
  // wants a 2 (or 3) digit year (years since 1900).
  if (t.tm_year > 1900)
    t.tm_year-=1900;
#endif
  time = mktime(&t);
  L(FL("= %i") % time);

  is_synthetic_branch_root = is_sbr(delta->second,
                                    deltatext->second);

  alive = delta->second->state != "dead";
  // Synthetic branch roots get a fixed changelog so they never merge
  // into a real commit cluster by log-message match.
  if (is_synthetic_branch_root)
    changelog = cvs.changelog_interner.intern("synthetic branch root changelog");
  else
    changelog = cvs.changelog_interner.intern(deltatext->second->log);
  author = cvs.author_interner.intern(delta->second->author);
  path = cvs.curr_file_interned;
  version = cvs.file_version_interner.intern(ident.inner()());

  // Any symbol naming this exact version number is a tag on this commit.
  typedef multimap<string,string>::const_iterator ity;
  pair<ity,ity> range = r.admin.symbols.equal_range(rcs_version);
  for (ity i = range.first; i != range.second; ++i)
    {
      if (i->first == rcs_version)
        {
          L(FL("version %s -> tag %s") % rcs_version % i->second);
          tags.push_back(cvs.tag_interner.intern(i->second));
        }
    }

}
301
302
303// piece table stuff
304
struct piece;

struct
piece_store
{
  // Backing store for piece tables: owns every deltatext registered so
  // far and hands out (text id, offset, length) views over their lines.
  vector< shared_ptr<rcs_deltatext> > texts;
  // Split dt's text into one piece per line, appending dt to 'texts'.
  void index_deltatext(shared_ptr<rcs_deltatext> const & dt,
                       vector<piece> & pieces);
  // Concatenate the pieces' slices back into a full text.
  void build_string(vector<piece> const & pieces,
                    string & out);
  // Drop all stored texts (done between RCS files).
  void reset() { texts.clear(); }
};

// FIXME: kludge, I was lazy and did not make this
// a properly scoped variable.

static piece_store global_pieces;
322
323
324struct
325piece
326{
327 piece(string::size_type p, string::size_type l, unsigned long id) :
328 pos(p), len(l), string_id(id) {}
329 string::size_type pos;
330 string::size_type len;
331 unsigned long string_id;
332 string operator*() const
333 {
334 return string(global_pieces.texts.at(string_id)->text.data() + pos, len);
335 }
336};
337
338
339void
340piece_store::build_string(vector<piece> const & pieces,
341 string & out)
342{
343 out.clear();
344 out.reserve(pieces.size() * 60);
345 for(vector<piece>::const_iterator i = pieces.begin();
346 i != pieces.end(); ++i)
347 out.append(texts.at(i->string_id)->text, i->pos, i->len);
348}
349
350void
351piece_store::index_deltatext(shared_ptr<rcs_deltatext> const & dt,
352 vector<piece> & pieces)
353{
354 pieces.clear();
355 pieces.reserve(dt->text.size() / 30);
356 texts.push_back(dt);
357 unsigned long id = texts.size() - 1;
358 string::size_type begin = 0;
359 string::size_type end = dt->text.find('\n');
360 while(end != string::npos)
361 {
362 // nb: the piece includes the '\n'
363 pieces.push_back(piece(begin, (end - begin) + 1, id));
364 begin = end + 1;
365 end = dt->text.find('\n', begin);
366 }
367 if (begin != dt->text.size())
368 {
369 // the text didn't end with '\n', so neither does the piece
370 end = dt->text.size();
371 pieces.push_back(piece(begin, end - begin, id));
372 }
373}
374
375
static void
process_one_hunk(vector< piece > const & source,
                 vector< piece > & dest,
                 vector< piece >::const_iterator & i,
                 int & cursor)
{
  // Apply one RCS edit directive ("a<pos> <len>" or "d<pos> <len>") from
  // a delta stream.  On entry 'i' points at the directive line; on exit
  // it has been advanced past the directive and any inserted lines.
  // 'cursor' tracks how far through 'source' has been copied so far.
  string directive = **i;
  assert(directive.size() > 1);
  ++i;

  try
    {
      char code;
      int pos, len;
      if (sscanf(directive.c_str(), " %c %d %d", &code, &pos, &len) != 3)
        throw oops("illformed directive '" + directive + "'");

      if (code == 'a')
        {
          // 'ax y' means "copy from source to dest until cursor == x, then
          // copy y lines from delta, leaving cursor where it is"
          while (cursor < pos)
            dest.push_back(source.at(cursor++));
          I(cursor == pos);
          while (len--)
            dest.push_back(*i++);
        }
      else if (code == 'd')
        {
          // 'dx y' means "copy from source to dest until cursor == x-1,
          // then increment cursor by y, ignoring those y lines"
          while (cursor < (pos - 1))
            dest.push_back(source.at(cursor++));
          I(cursor == pos - 1);
          cursor += len;
        }
      else
        throw oops("unknown directive '" + directive + "'");
    }
  catch (out_of_range &)
    {
      // Translate vector range errors into a diagnostic that records the
      // directive and the state we were in when it failed.
      throw oops("out_of_range while processing " + directive
                 + " with source.size() == "
                 + lexical_cast<string>(source.size())
                 + " and cursor == "
                 + lexical_cast<string>(cursor));
    }
}
424
425static void
426construct_version(vector< piece > const & source_lines,
427 string const & dest_version,
428 vector< piece > & dest_lines,
429 rcs_file const & r)
430{
431 dest_lines.clear();
432 dest_lines.reserve(source_lines.size());
433
434 I(r.deltas.find(dest_version) != r.deltas.end());
435 shared_ptr<rcs_delta> delta = r.deltas.find(dest_version)->second;
436
437 I(r.deltatexts.find(dest_version) != r.deltatexts.end());
438 shared_ptr<rcs_deltatext> deltatext = r.deltatexts.find(dest_version)->second;
439
440 vector<piece> deltalines;
441 global_pieces.index_deltatext(deltatext, deltalines);
442
443 int cursor = 0;
444 for (vector<piece>::const_iterator i = deltalines.begin();
445 i != deltalines.end(); )
446 process_one_hunk(source_lines, dest_lines, i, cursor);
447 while (cursor < static_cast<int>(source_lines.size()))
448 dest_lines.push_back(source_lines[cursor++]);
449}
450
451// FIXME: should these be someplace else? using 'friend' to reach into the
452// DB is stupid, but it's also stupid to put raw edge insert methods on the
453// DB itself. or is it? hmm.. encapsulation vs. usage guidance..
454void
455rcs_put_raw_file_edge(hexenc<id> const & old_id,
456 hexenc<id> const & new_id,
457 delta const & del,
458 database & db)
459{
460 if (old_id == new_id)
461 {
462 L(FL("skipping identity file edge"));
463 return;
464 }
465
466 if (db.file_version_exists(file_id(old_id)))
467 {
468 // we already have a way to get to this old version,
469 // no need to insert another reconstruction path
470 L(FL("existing path to %s found, skipping") % old_id);
471 }
472 else
473 {
474 I(db.file_or_manifest_base_exists(new_id, "files")
475 || db.delta_exists(new_id(), "file_deltas"));
476 db.put_file_delta(file_id(old_id), file_id(new_id), file_delta(del));
477 }
478}
479
480
481static void
482insert_into_db(data const & curr_data,
483 hexenc<id> const & curr_id,
484 vector< piece > const & next_lines,
485 data & next_data,
486 hexenc<id> & next_id,
487 database & db)
488{
489 // inserting into the DB
490 // note: curr_lines is a "new" (base) version
491 // and next_lines is an "old" (derived) version.
492 // all storage edges go from new -> old.
493 {
494 string tmp;
495 global_pieces.build_string(next_lines, tmp);
496 next_data = data(tmp);
497 }
498 delta del;
499 diff(curr_data, next_data, del);
500 calculate_ident(next_data, next_id);
501 rcs_put_raw_file_edge(next_id, curr_id, del, db);
502}
503
504
505
506/*
507
508please read this exhaustingly long comment and understand it
509before mucking with the branch inference logic.
510
511we are processing a file version. a branch might begin here. if
512the current version is X.Y, then there is a branch B starting
513here iff there is a symbol in the admin section called X.Y.0.Z,
514where Z is the branch number (or if there is a private branch
515called X.Y.Z, which is either an import branch or some private
516RCS cruft).
517
518the version X.Y is then considered the branchpoint of B in the
519current file. this does *not* mean that the CVS key -- an
520abstraction representing whole-tree operations -- of X.Y is the
521branchpoint across the CVS archive we're processing.
522
523in fact, CVS does not record the occurrence of a branching
524action (tag -b). we have no idea who executed that command and
525when. what we know instead is the commit X.Y immediately
preceding the branch -- CVS considers this the branchpoint --
527in this file's reduced view of history. we also know the first
528commit X.Y.Z.1 inside the branch (which might not exist).
529
530our old strategy was to consider all branches nested in a
531hierarchy, which was a super-tree of all the branch trees in all
532the CVS files in a repository. this involved considering X.Y as
the parent version of branch X.Y.Z, and selecting "the"
534branchpoint connecting the two as the least CVS key X.Y.Z.1
535committed inside the branch B.
536
537this was a mistake, for two significant reasons.
538
539first, some files do not *have* any commit inside the branch B,
540only a branchpoint X.Y.0.Z. this branchpoint is actually the
541last commit *before* the user branched, and could be a very old
542commit, long before the branch was formed, so it is useless in
543determining the branch structure.
544
545second, some files do not have a branch B, or worse, have
546branched into B from an "ancestor" branch A, where a different
547file branches into B from a different ancestor branch C. in
548other words, while there *is* a tree structure within the X.Y.Z
549branches of each file, there is *no* shared tree structure
550between the branch names across a repository. in one file A can
551be an ancestor of B, in another file B can be an ancestor of A.
552
553thus, we give up on establishing a hierarchy between branches
554altogether. all branches exist in a flat namespace, and all are
direct descendants of the empty revision at the root of
556history. each branchpoint symbol mentioned in the
557administrative section of a file is considered the root of a new
558lineage.
559
560*/
561
562
static void
process_branch(string const & begin_version,
               vector< piece > const & begin_lines,
               data const & begin_data,
               hexenc<id> const & begin_id,
               rcs_file const & r,
               database & db,
               cvs_history & cvs)
{
  // Walk the RCS 'next' chain starting at begin_version, recording each
  // version as a cvs_commit on the branch currently on top of cvs.stk,
  // storing file deltas in the database, and recursing into every branch
  // that forks off each version.  RCS 'next' pointers on the trunk run
  // from the head towards older versions, so "advancing" here moves back
  // in time.
  string curr_version = begin_version;
  scoped_ptr< vector< piece > > next_lines(new vector<piece>);
  scoped_ptr< vector< piece > > curr_lines(new vector<piece>
                                           (begin_lines.begin(),
                                            begin_lines.end()));
  data curr_data(begin_data), next_data;
  hexenc<id> curr_id(begin_id), next_id;

  while(! (r.deltas.find(curr_version) == r.deltas.end()))
    {
      L(FL("version %s has %d lines") % curr_version % curr_lines->size());

      // Record this version as a commit on the current branch, unless it
      // is CVS's synthetic "added on branch" placeholder.
      cvs_commit curr_commit(r, curr_version, file_id(curr_id), cvs);
      if (!curr_commit.is_synthetic_branch_root)
        {
          cvs.stk.top()->append_commit(curr_commit);
          ++cvs.n_versions;
        }

      string next_version = r.deltas.find(curr_version)->second->next;

      if (! next_version.empty())
        {
          L(FL("following RCS edge %s -> %s") % curr_version % next_version);

          construct_version(*curr_lines, next_version, *next_lines, r);
          L(FL("constructed RCS version %s, inserting into database") %
            next_version);

          insert_into_db(curr_data, curr_id,
                         *next_lines, next_data, next_id, db);
        }

      // mark the beginning-of-branch time and state of this file if
      // we're at a branchpoint
      typedef multimap<string,string>::const_iterator ity;
      pair<ity,ity> range = cvs.branchpoints.equal_range(curr_version);
      if (range.first != cvs.branchpoints.end()
          && range.first->first == curr_version)
        {
          for (ity i = range.first; i != range.second; ++i)
            {
              cvs.push_branch(i->second, false);
              shared_ptr<cvs_branch> b = cvs.stk.top();
              if (curr_commit.alive)
                b->live_at_beginning[cvs.curr_file_interned] = curr_commit.version;
              b->note_branchpoint(curr_commit.time);
              cvs.pop_branch();
            }
        }


      // recursively follow any branch commits coming from the branchpoint
      shared_ptr<rcs_delta> curr_delta = r.deltas.find(curr_version)->second;
      for(vector<string>::const_iterator i = curr_delta->branches.begin();
          i != curr_delta->branches.end(); ++i)
        {
          string branch;
          data branch_data;
          hexenc<id> branch_id;
          vector< piece > branch_lines;
          bool priv = false;
          // a branch number with no named symbol is a private branch
          map<string, string>::const_iterator be = cvs.branch_first_entries.find(*i);

          if (be != cvs.branch_first_entries.end())
            branch = be->second;
          else
            priv = true;

          L(FL("following RCS branch %s = '%s'") % (*i) % branch);

          construct_version(*curr_lines, *i, branch_lines, r);
          insert_into_db(curr_data, curr_id,
                         branch_lines, branch_data, branch_id, db);

          cvs.push_branch(branch, priv);
          process_branch(*i, branch_lines, branch_data, branch_id, r, db, cvs);
          cvs.pop_branch();

          L(FL("finished RCS branch %s = '%s'") % (*i) % branch);
        }

      if (!r.deltas.find(curr_version)->second->next.empty())
        {
          // advance: the "next" version constructed above becomes current
          curr_data = next_data;
          curr_id = next_id;
          curr_version = next_version;
          swap(next_lines, curr_lines);
          next_lines->clear();
        }
      else break;
    }
}
666
667
static void
import_rcs_file_with_cvs(string const & filename, database & db, cvs_history & cvs)
{
  // Parse one ,v file, store its head version in the database, and walk
  // its whole delta tree via process_branch, recording commits and
  // branch structure into 'cvs'.
  rcs_file r;
  L(FL("parsing RCS file %s") % filename);
  parse_rcs_file(filename, r);
  L(FL("parsed RCS file %s OK") % filename);

  {
    vector< piece > head_lines;
    I(r.deltatexts.find(r.admin.head) != r.deltatexts.end());
    I(r.deltas.find(r.admin.head) != r.deltas.end());

    // The head deltatext holds the full text of the newest version;
    // every other version is reconstructed from it by applying deltas.
    hexenc<id> id;
    data dat(r.deltatexts.find(r.admin.head)->second->text);
    calculate_ident(dat, id);
    file_id fid(id);

    cvs.set_filename (filename, fid);
    cvs.index_branchpoint_symbols (r);

    if (! db.file_version_exists (fid))
      {
        db.put_file(fid, file_data(dat));
      }

    {
      // create the head state in case it is a loner
      // cvs_key k;
      // shared_ptr<cvs_state> s;
      // L(FL("noting head version %s : %s") % cvs.curr_file % r.admin.head);
      // cvs.find_key_and_state (r, r.admin.head, k, s);
    }

    // Piece tables are per-file; reset the global store around the walk.
    global_pieces.reset();
    global_pieces.index_deltatext(r.deltatexts.find(r.admin.head)->second, head_lines);
    process_branch(r.admin.head, head_lines, dat, id, r, db, cvs);
    global_pieces.reset();
  }

  ui.set_tick_trailer("");
}
710
711
712void
713test_parse_rcs_file(system_path const & filename, database & db)
714{
715 cvs_history cvs;
716
717 I(! filename.empty());
718 assert_path_is_file(filename);
719
720 P(F("parsing RCS file %s") % filename);
721 rcs_file r;
722 parse_rcs_file(filename.as_external(), r);
723 P(F("parsed RCS file %s OK") % filename);
724}
725
726
727// CVS importing stuff follows
728
729
static void
split_version(std::string const & v, std::vector<std::string> & vs)
{
  // Break an RCS version number such as "1.3.0.2" into its dot-separated
  // components.  Empty components (from leading, trailing, or doubled
  // dots) are discarded, matching boost::char_separator's default
  // behaviour used previously.
  vs.clear();
  std::istringstream in(v);
  std::string component;
  while (std::getline(in, component, '.'))
    {
      if (!component.empty())
        vs.push_back(component);
    }
}
739
static void
join_version(std::vector<std::string> const & vs, std::string & v)
{
  // Reassemble version components into dotted form,
  // e.g. {"1","3","0","2"} -> "1.3.0.2".
  v.clear();
  bool first = true;
  for (std::vector<std::string>::const_iterator c = vs.begin();
       c != vs.end(); ++c)
    {
      if (!first)
        v += ".";
      v += *c;
      first = false;
    }
}
752
// Set up the progress tickers; they advance as file versions and
// branches are discovered during the import walk.
cvs_history::cvs_history() :
  n_versions("versions", "v", 1),
  n_tree_branches("branches", "b", 1)
{
}
758
759void
760cvs_history::set_filename(string const & file,
761 file_id const & ident)
762{
763 L(FL("importing file '%s'") % file);
764 I(file.size() > 2);
765 I(file.substr(file.size() - 2) == string(",v"));
766 string ss = file;
767 ui.set_tick_trailer(ss);
768 ss.resize(ss.size() - 2);
769 // remove Attic/ if present
770 string::size_type last_slash=ss.rfind('/');
771 if (last_slash!=string::npos && last_slash>=5
772 && ss.substr(last_slash-5,6)=="Attic/")
773 ss.erase(last_slash-5,6);
774 curr_file = file_path_internal(ss);
775 curr_file_interned = path_interner.intern(ss);
776}
777
void cvs_history::index_branchpoint_symbols(rcs_file const & r)
{
  // Scan the RCS admin symbol table and record, for every branch symbol,
  // both the version at which the branch forks off ('branchpoints') and
  // the version number its first commit would carry
  // ('branch_first_entries').  See the long comment above process_branch
  // for why branches form a flat namespace.
  branchpoints.clear();
  branch_first_entries.clear();

  for (multimap<string, string>::const_iterator i =
         r.admin.symbols.begin(); i != r.admin.symbols.end(); ++i)
    {
      string const & num = i->first;
      string const & sym = i->second;

      vector<string> components;
      split_version(num, components);

      vector<string> first_entry_components;
      vector<string> branchpoint_components;

      if (components.size() > 2 &&
          (components.size() % 2 == 1))
        {
          // this is a "vendor" branch
          //
          // such as "1.1.1", where "1.1" is the branchpoint and
          // "1.1.1.1" will be the first commit on it.

          first_entry_components = components;
          first_entry_components.push_back("1");

          branchpoint_components = components;
          branchpoint_components.erase(branchpoint_components.end() - 1,
                                       branchpoint_components.end());

        }

      else if (components.size() > 2 &&
               (components.size() % 2 == 0) &&
               components[components.size() - 2] == string("0"))
        {
          // this is a "normal" branch
          //
          // such as "1.3.0.2", where "1.3" is the branchpoint and
          // "1.3.2.1" will be the first commit on it.

          first_entry_components = components;
          first_entry_components[first_entry_components.size() - 2]
            = first_entry_components[first_entry_components.size() - 1];
          first_entry_components[first_entry_components.size() - 1]
            = string("1");

          branchpoint_components = components;
          branchpoint_components.erase(branchpoint_components.end() - 2,
                                       branchpoint_components.end());
        }

      // NOTE(review): for plain (non-branch) tag symbols neither case
      // above fires, so both component vectors stay empty and the
      // unconditional inserts below add entries keyed by the empty
      // string.  Lookups appear to use only non-empty version numbers,
      // so this looks like harmless clutter -- confirm before relying
      // on the sizes of these maps.

      string first_entry_version;
      join_version(first_entry_components, first_entry_version);

      L(FL("first version in branch %s would be %s")
        % sym % first_entry_version);
      branch_first_entries.insert(make_pair(first_entry_version, sym));

      string branchpoint_version;
      join_version(branchpoint_components, branchpoint_version);

      L(FL("file branchpoint for %s at %s") % sym % branchpoint_version);
      branchpoints.insert(make_pair(branchpoint_version, sym));
    }
}
846
847
848
849void
850cvs_history::push_branch(string const & branch_name, bool private_branch)
851{
852 shared_ptr<cvs_branch> branch;
853
854 string bname = base_branch + "." + branch_name;
855 I(stk.size() > 0);
856
857 if (private_branch)
858 {
859 branch = shared_ptr<cvs_branch>(new cvs_branch());
860 stk.push(branch);
861 bstk.push(branch_interner.intern(""));
862 return;
863 }
864 else
865 {
866 map<string, shared_ptr<cvs_branch> >::const_iterator b = branches.find(bname);
867 if (b == branches.end())
868 {
869 branch = shared_ptr<cvs_branch>(new cvs_branch());
870 branches.insert(make_pair(bname, branch));
871 ++n_tree_branches;
872 }
873 else
874 branch = b->second;
875
876 stk.push(branch);
877 bstk.push(branch_interner.intern(bname));
878 }
879}
880
void
cvs_history::pop_branch()
{
  // Leave the current branch scope.  The bottom stack entry must never
  // be popped -- hence size() > 1, not > 0.
  I(stk.size() > 1);
  stk.pop();
  bstk.pop();
}
888
889
890class
891cvs_tree_walker
892 : public tree_walker
893{
894 cvs_history & cvs;
895 database & db;
896public:
897 cvs_tree_walker(cvs_history & c, database & d) :
898 cvs(c), db(d)
899 {
900 }
901 virtual void visit_file(file_path const & path)
902 {
903 string file = path.as_external();
904 if (file.substr(file.size() - 2) == string(",v"))
905 {
906 try
907 {
908 import_rcs_file_with_cvs(file, db, cvs);
909 }
910 catch (oops const & o)
911 {
912 W(F("error reading RCS file %s: %s") % file % o.what());
913 }
914 }
915 else
916 L(FL("skipping non-RCS file %s") % file);
917 }
918 virtual ~cvs_tree_walker() {}
919};
920
921
922
923
924//
925// our task here is to produce a sequence of revision descriptions
926// from the per-file commit records we have. we do this by rolling
927// forwards through the temporally sorted file-commit list
928// accumulating file-commits into revisions and flushing the
929// revisions when we feel they are "complete".
930//
931// revisions have to have a time associated with them. this time
932// will be the first time of any commit associated with the
933// revision. they have an author and a changelog, which is shared
934// by all the file-commits in the revision.
935//
936// there might be multiple revisions overlapping in time. this is
937// legal wrt. CVS. we keep a set, and search all members of the set
938// for the best match.
939//
940// consider this situation of overlapping revisions:
941//
942// +---------------+ +---------------+ +---------------+
943// | rev #1 @ 0011 | | rev #2 @ 0012 | | rev #3 @ 0013 |
944// |~~~~~~~~~~~~~~~| |~~~~~~~~~~~~~~~| |~~~~~~~~~~~~~~~|
945// | patch foo.txt | | patch bar.txt | | patch baz.txt |
946// +---------------+ +---------------+ +---------------+
947//
948// suppose you have this situation and you run across a "patch
949// bar.txt" commit at timestamp 0014. what do you do?
950//
951// - you know that rev #2 cannot accept this commit, simply because
952// two commits on the same file makes *two* revisions, not one.
953//
954// - perhaps rev #3 could accept it; after all, it could be that the
955// commit associated with rev #2 released its commit lock, and the
956// commit associated with rev #3 quickly updated and committed at
957// 0013, finishing off at 0014.
958//
// - can rev #1 accept it? no. because CVS calculated the version it
960// expected to see in bar.txt before calling up the server, when
961// committing rev #1. the version it expected to see was the version
// in bar.txt *before* time 0012; that is, before rev #2 had any effect
963// on bar.txt. when it contacted the server, the commit associated
964// with rev #1 would have aborted if it had seen any other number.
965// so rev #1 could not start before an edit to bar.txt and then
966// include its own edit to bar.txt.
967//
968// so we have only one case where bar.txt can be accepted. if the
969// commit is not accepted into a legal rev (outside the window,
970// wrong changelog/author) it starts a new revision.
971//
972// as we scan forwards, if we hit timestamps which lie beyond rev #n's
973// window, we flush rev #n.
974//
975// if there are multiple coincident and legal revs to direct a
976// commit to (all with the same author/changelog), we direct the
977// commit to the rev with the closest initial timestamp. that is,
978// the *latest* beginning time.
979
struct
cvs_cluster
{
  // A tentative whole-tree revision: per-file entries sharing one author
  // and changelog, gathered within the commit time window (see the long
  // comment above for the acceptance rules).
  time_t first_time;      // time of the earliest commit in the cluster
  cvs_author author;
  cvs_changelog changelog;
  set<cvs_tag> tags;      // union of the tags on all member commits

  cvs_cluster(time_t t,
              cvs_author a,
              cvs_changelog c)
    : first_time(t),
      author(a),
      changelog(c)
  {}

  // One file's contribution to this cluster.
  struct entry
  {
    bool live;            // false if this commit deletes the file
    cvs_version version;
    time_t time;
    entry(bool l, cvs_version v, time_t t)
      : live(l),
        version(v),
        time(t)
    {}
  };

  typedef map<cvs_path, entry> entry_map;
  entry_map entries;
};
1011
1012
struct
cluster_consumer
{
  // Consumes expired cvs_clusters for a single branch, turning each one
  // into a prepared monotone revision (roster edit plus cert metadata).
  // Revisions and their certs are written out in a batch afterwards via
  // store_revisions / store_auxiliary_certs.
  cvs_history & cvs;
  app_state & app;
  string const & branchname;
  cvs_branch const & branch;
  set<split_path> created_dirs;           // directories already added to the roster
  map<cvs_path, cvs_version> live_files;  // current live version per file
  ticker & n_revisions;

  // A fully-built revision plus the metadata needed to cert it later.
  struct prepared_revision
  {
    prepared_revision(revision_id i,
                      shared_ptr<revision_t> r,
                      cvs_cluster const & c);
    revision_id rid;
    shared_ptr<revision_t> rev;
    time_t time;
    cvs_author author;
    cvs_changelog changelog;
    vector<cvs_tag> tags;
  };

  vector<prepared_revision> preps;

  // Working roster advanced one cluster at a time; parent_rid/child_rid
  // name the revision edge currently being built.
  roster_t ros;
  temp_node_id_source nis;
  editable_roster_base editable_ros;
  revision_id parent_rid, child_rid;

  cluster_consumer(cvs_history & cvs,
                   app_state & app,
                   string const & branchname,
                   cvs_branch const & branch,
                   ticker & n_revs);

  void consume_cluster(cvs_cluster const & c);
  void add_missing_parents(split_path const & sp, cset & cs);
  void build_cset(cvs_cluster const & c, cset & cs);
  void store_auxiliary_certs(prepared_revision const & p);
  void store_revisions();
};
1056
typedef shared_ptr<cvs_cluster>
cluster_ptr;

// Orders clusters by starting time so the active cluster set can expire
// its oldest member first.
// NOTE(review): two clusters with identical first_time compare
// equivalent under this ordering, so inserting a second same-time
// cluster into a std::set is a silent no-op -- confirm this is intended.
struct
cluster_ptr_lt
{
  bool operator()(cluster_ptr const & a,
                  cluster_ptr const & b) const
  {
    return a->first_time < b->first_time;
  }
};

typedef set<cluster_ptr, cluster_ptr_lt>
cluster_set;
1072
// Convert the per-file commits recorded on one branch into monotone
// revisions.  Commits are grouped into "clusters": sets of edits by the
// same author with the same changelog, falling within
// constants::cvs_window seconds of each other and touching each file at
// most once.  Each cluster is handed to cluster_consumer, which turns
// it into one revision.
void
import_branch(cvs_history & cvs,
              app_state & app,
              string const & branchname,
              shared_ptr<cvs_branch> const & branch,
              ticker & n_revs)
{
  cluster_set clusters;
  cluster_consumer cons(cvs, app, branchname, *branch, n_revs);
  // NOTE(review): commits_remaining is decremented below but never read
  // in this function -- appears vestigial.
  unsigned long commits_remaining = branch->lineage.size();

  // step 1: sort the lineage (by cvs_commit's ordering; stable so
  // equal-keyed commits keep their relative order)
  stable_sort(branch->lineage.begin(), branch->lineage.end());

  for (vector<cvs_commit>::const_iterator i = branch->lineage.begin();
       i != branch->lineage.end(); ++i)
    {
      commits_remaining--;

      L(FL("examining next commit [t:%d] [p:%s] [a:%s] [c:%s]")
        % i->time
        % cvs.path_interner.lookup(i->path)
        % cvs.author_interner.lookup(i->author)
        % cvs.changelog_interner.lookup(i->changelog));

      // step 2: expire all clusters from the beginning of the set which
      // have passed the window size.  clusters is ordered by first_time,
      // so we can stop at the first one still inside the window.
      while (!clusters.empty())
        {
          cluster_set::iterator j = clusters.begin();
          if ((*j)->first_time + constants::cvs_window < i->time)
            {
              L(FL("expiring cluster"));
              cons.consume_cluster(**j);
              clusters.erase(j);
            }
          else
            break;
        }

      // step 3: find the last still-live cluster to have touched this
      // file.  a new edit to this file must not land in a cluster that
      // starts before that time, or the file's edits would be emitted
      // out of order.
      time_t time_of_last_cluster_touching_this_file = 0;

      unsigned clu = 0;
      for (cluster_set::const_iterator j = clusters.begin();
           j != clusters.end(); ++j)
        {
          L(FL("examining cluster %d to see if it touched %d")
            % clu++
            % i->path);

          cvs_cluster::entry_map::const_iterator k = (*j)->entries.find(i->path);
          if ((k != (*j)->entries.end())
              && (k->second.time > time_of_last_cluster_touching_this_file))
            {
              L(FL("found cluster touching %d: [t:%d] [a:%d] [c:%d]")
                % i->path
                % (*j)->first_time
                % (*j)->author
                % (*j)->changelog);
              time_of_last_cluster_touching_this_file = (*j)->first_time;
            }
        }
      L(FL("last modification time is %d")
        % time_of_last_cluster_touching_this_file);

      // step 4: find a cluster which starts on or after the
      // last_modify_time, which doesn't modify the file in question,
      // and which contains the same author and changelog as our
      // commit.  note: the scan deliberately does not break on the
      // first hit, so the *last* (newest) eligible cluster wins.
      cluster_ptr target;
      for (cluster_set::const_iterator j = clusters.begin();
           j != clusters.end(); ++j)
        {
          if (((*j)->first_time >= time_of_last_cluster_touching_this_file)
              && ((*j)->author == i->author)
              && ((*j)->changelog == i->changelog)
              && ((*j)->entries.find(i->path) == (*j)->entries.end()))
            {
              L(FL("picked existing cluster [t:%d] [a:%d] [c:%d]")
                % (*j)->first_time
                % (*j)->author
                % (*j)->changelog);

              target = (*j);
            }
        }

      // if we're still not finding an active cluster,
      // this is probably the first commit in it. make
      // a new one.
      if (!target)
        {
          L(FL("building new cluster [t:%d] [a:%d] [c:%d]")
            % i->time
            % i->author
            % i->changelog);

          target = cluster_ptr(new cvs_cluster(i->time,
                                               i->author,
                                               i->changelog));
          clusters.insert(target);
        }

      // fold this commit (and any tags riding on it) into the chosen
      // cluster.
      I(target);
      target->entries.insert(make_pair(i->path,
                                       cvs_cluster::entry(i->alive,
                                                          i->version,
                                                          i->time)));
      for (vector<cvs_tag>::const_iterator j = i->tags.begin();
           j != i->tags.end(); ++j)
        {
          target->tags.insert(*j);
        }
    }


  // now we are done this lineage; flush all remaining clusters
  L(FL("finished branch commits, writing all pending clusters"));
  while (!clusters.empty())
    {
      cons.consume_cluster(**clusters.begin());
      clusters.erase(clusters.begin());
    }
  L(FL("finished writing pending clusters"));

  // write the prepared revisions (and their certs) to the database
  cons.store_revisions();

}
1203
// Top-level driver for CVS import: walk an RCS tree rooted at cvsroot
// (a module directory, not the repository root), accumulating per-file
// histories into a cvs_history, then convert each discovered branch --
// and finally the trunk -- into monotone revisions, and lastly write
// one tag cert per resolved CVS tag.
void
import_cvs_repo(system_path const & cvsroot,
                app_state & app)
{
  N(!directory_exists(cvsroot / "CVSROOT"),
    F("%s appears to be a CVS repository root directory\n"
      "try importing a module instead, with 'cvs_import %s/<module_name>")
    % cvsroot % cvsroot);

  {
    // early short-circuit to avoid failure after lots of work
    rsa_keypair_id key;
    get_user_key(key, app);
    require_password(key, app);
  }

  cvs_history cvs;
  N(app.opts.branchname() != "", F("need base --branch argument for importing"));
  cvs.base_branch = app.opts.branchname();

  // push the trunk (the walker's stack scopes nest branches under it)
  cvs.trunk = shared_ptr<cvs_branch>(new cvs_branch());
  cvs.stk.push(cvs.trunk);
  cvs.bstk.push(cvs.branch_interner.intern(cvs.base_branch));

  // phase 1: walk the RCS tree, letting cvs_tree_walker collect file
  // history into cvs; all within one database transaction.
  {
    transaction_guard guard(app.db);
    cvs_tree_walker walker(cvs, app.db);
    require_path_is_directory(cvsroot,
                              F("path %s does not exist") % cvsroot,
                              F("'%s' is not a directory") % cvsroot);
    app.db.ensure_open();
    change_current_working_dir(cvsroot);
    walk_tree(file_path(), walker);
    guard.commit();
  }

  // after the walk only the trunk entry pushed above should remain
  I(cvs.stk.size() == 1);

  ticker n_revs(_("revisions"), "r", 1);

  // phase 2: import each side branch, one transaction per branch,
  // erasing each branch after import to free memory.
  while (cvs.branches.size() > 0)
    {
      transaction_guard guard(app.db);
      map<string, shared_ptr<cvs_branch> >::const_iterator i = cvs.branches.begin();
      string branchname = i->first;
      shared_ptr<cvs_branch> branch = i->second;
      L(FL("branch %s has %d entries") % branchname % branch->lineage.size());
      import_branch(cvs, app, branchname, branch, n_revs);

      // free up some memory
      cvs.branches.erase(branchname);
      guard.commit();
    }

  // phase 3: import the trunk itself under the base branch name
  {
    transaction_guard guard(app.db);
    L(FL("trunk has %d entries") % cvs.trunk->lineage.size());
    import_branch(cvs, app, cvs.base_branch, cvs.trunk, n_revs);
    guard.commit();
  }

  // phase 4: now we have a "last" rev for each tag; write the tag certs
  {
    ticker n_tags(_("tags"), "t", 1);
    packet_db_writer dbw(app);
    transaction_guard guard(app.db);
    for (map<unsigned long, pair<time_t, revision_id> >::const_iterator i = cvs.resolved_tags.begin();
         i != cvs.resolved_tags.end(); ++i)
      {
        string tag = cvs.tag_interner.lookup(i->first);
        ui.set_tick_trailer("marking tag " + tag);
        app.get_project().put_tag(i->second.second, tag, dbw);
        ++n_tags;
      }
    guard.commit();
  }


  return;
}
1285
1286cluster_consumer::cluster_consumer(cvs_history & cvs,
1287 app_state & app,
1288 string const & branchname,
1289 cvs_branch const & branch,
1290 ticker & n_revs)
1291 : cvs(cvs),
1292 app(app),
1293 branchname(branchname),
1294 branch(branch),
1295 n_revisions(n_revs),
1296 editable_ros(ros, nis)
1297{
1298 if (!branch.live_at_beginning.empty())
1299 {
1300 cvs_author synthetic_author =
1301 cvs.author_interner.intern("cvs_import");
1302
1303 cvs_changelog synthetic_cl =
1304 cvs.changelog_interner.intern("beginning of branch "
1305 + branchname);
1306
1307 time_t synthetic_time = branch.beginning();
1308 cvs_cluster initial_cluster(synthetic_time,
1309 synthetic_author,
1310 synthetic_cl);
1311
1312 L(FL("initial cluster on branch %s has %d live entries") %
1313 branchname % branch.live_at_beginning.size());
1314
1315 for (map<cvs_path, cvs_version>::const_iterator i = branch.live_at_beginning.begin();
1316 i != branch.live_at_beginning.end(); ++i)
1317 {
1318 cvs_cluster::entry e(true, i->second, synthetic_time);
1319 L(FL("initial cluster contains %s at %s") %
1320 cvs.path_interner.lookup(i->first) %
1321 cvs.file_version_interner.lookup(i->second));
1322 initial_cluster.entries.insert(make_pair(i->first, e));
1323 }
1324 consume_cluster(initial_cluster);
1325 }
1326}
1327
1328cluster_consumer::prepared_revision::prepared_revision(revision_id i,
1329 shared_ptr<revision_t> r,
1330 cvs_cluster const & c)
1331 : rid(i),
1332 rev(r),
1333 time(c.first_time),
1334 author(c.author),
1335 changelog(c.changelog)
1336{
1337 for (set<cvs_tag>::const_iterator i = c.tags.begin();
1338 i != c.tags.end(); ++i)
1339 {
1340 tags.push_back(*i);
1341 }
1342}
1343
1344
1345void
1346cluster_consumer::store_revisions()
1347{
1348 for (vector<prepared_revision>::const_iterator i = preps.begin();
1349 i != preps.end(); ++i)
1350 {
1351 if (! app.db.revision_exists(i->rid))
1352 {
1353 data tmp;
1354 write_revision(*(i->rev), tmp);
1355 app.db.put_revision(i->rid, *(i->rev));
1356 store_auxiliary_certs(*i);
1357 ++n_revisions;
1358 }
1359 }
1360}
1361
1362void
1363cluster_consumer::store_auxiliary_certs(prepared_revision const & p)
1364{
1365 packet_db_writer dbw(app);
1366
1367 for (vector<cvs_tag>::const_iterator i = p.tags.begin();
1368 i != p.tags.end(); ++i)
1369 {
1370 map<unsigned long, pair<time_t, revision_id> >::const_iterator j
1371 = cvs.resolved_tags.find(*i);
1372
1373 if (j != cvs.resolved_tags.end())
1374 {
1375 if (j->second.first < p.time)
1376 {
1377 // move the tag forwards
1378 cvs.resolved_tags.erase(*i);
1379 cvs.resolved_tags.insert(make_pair(*i, make_pair(p.time, p.rid)));
1380 }
1381 }
1382 else
1383 {
1384 cvs.resolved_tags.insert(make_pair(*i, make_pair(p.time, p.rid)));
1385 }
1386 }
1387
1388 app.get_project().put_standard_certs(p.rid,
1389 branch_name(branchname),
1390 utf8(cvs.changelog_interner.lookup(p.changelog)),
1391 date_t::from_unix_epoch(p.time),
1392 utf8(cvs.author_interner.lookup(p.author)),
1393 dbw);
1394}
1395
1396void
1397cluster_consumer::add_missing_parents(split_path const & sp, cset & cs)
1398{
1399 split_path tmp(sp);
1400 if (tmp.empty())
1401 return;
1402 tmp.pop_back();
1403 while (!tmp.empty())
1404 {
1405 if (created_dirs.find(tmp) == created_dirs.end())
1406 {
1407 safe_insert(created_dirs, tmp);
1408 safe_insert(cs.dirs_added, tmp);
1409 }
1410 tmp.pop_back();
1411 }
1412}
1413
1414void
1415cluster_consumer::build_cset(cvs_cluster const & c,
1416 cset & cs)
1417{
1418 for (cvs_cluster::entry_map::const_iterator i = c.entries.begin();
1419 i != c.entries.end(); ++i)
1420 {
1421 file_path pth = file_path_internal(cvs.path_interner.lookup(i->first));
1422 split_path sp;
1423 pth.split(sp);
1424
1425 file_id fid(cvs.file_version_interner.lookup(i->second.version));
1426 if (i->second.live)
1427 {
1428 map<cvs_path, cvs_version>::const_iterator e = live_files.find(i->first);
1429 if (e == live_files.end())
1430 {
1431 add_missing_parents(sp, cs);
1432 L(FL("adding entry state '%s' on '%s'") % fid % pth);
1433 safe_insert(cs.files_added, make_pair(sp, fid));
1434 live_files[i->first] = i->second.version;
1435 }
1436 else if (e->second != i->second.version)
1437 {
1438 file_id old_fid(cvs.file_version_interner.lookup(e->second));
1439 L(FL("applying state delta on '%s' : '%s' -> '%s'")
1440 % pth % old_fid % fid);
1441 safe_insert(cs.deltas_applied,
1442 make_pair(sp, make_pair(old_fid, fid)));
1443 live_files[i->first] = i->second.version;
1444 }
1445 }
1446 else
1447 {
1448 map<cvs_path, cvs_version>::const_iterator e = live_files.find(i->first);
1449 if (e != live_files.end())
1450 {
1451 L(FL("deleting entry state '%s' on '%s'") % fid % pth);
1452 safe_insert(cs.nodes_deleted, sp);
1453 live_files.erase(i->first);
1454 }
1455 }
1456 }
1457}
1458
1459void
1460cluster_consumer::consume_cluster(cvs_cluster const & c)
1461{
1462 // we should never have an empty cluster; it's *possible* to have
1463 // an empty changeset (say on a vendor import) but every cluster
1464 // should have been created by at least one file commit, even
1465 // if the commit made no changes. it's a logical inconsistency if
1466 // you have an empty cluster.
1467 I(!c.entries.empty());
1468
1469 shared_ptr<revision_t> rev(new revision_t());
1470 shared_ptr<cset> cs(new cset());
1471 build_cset(c, *cs);
1472
1473 cs->apply_to(editable_ros);
1474 manifest_id child_mid;
1475 calculate_ident(ros, child_mid);
1476 rev->made_for = made_for_database;
1477 rev->new_manifest = child_mid;
1478 rev->edges.insert(make_pair(parent_rid, cs));
1479 calculate_ident(*rev, child_rid);
1480
1481 preps.push_back(prepared_revision(child_rid, rev, c));
1482
1483 parent_rid = child_rid;
1484}
1485
1486// Local Variables:
1487// mode: C++
1488// fill-column: 76
1489// c-file-style: "gnu"
1490// indent-tabs-mode: nil
1491// End:
1492// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status