rcs_import.cc (monotone source tree)

// Copyright (C) 2002 Graydon Hoare <graydon@pobox.com>
//
// This program is made available under the GNU GPL version 2.0 or
// greater. See the accompanying file COPYING for details.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the
// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE.

#include <algorithm>
#include <iostream>
#include <iterator>
#include <list>
#include <map>
#include <set>
#include <sstream>
#include <stack>
#include <stdexcept>
#include <string>
#include <vector>

#include <unistd.h>
#include <cstdio>

#include <boost/shared_ptr.hpp>
#include <boost/scoped_ptr.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/tokenizer.hpp>

#include "app_state.hh"
#include "cert.hh"
#include "constants.hh"
#include "cycle_detector.hh"
#include "database.hh"
#include "file_io.hh"
#include "interner.hh"
#include "keys.hh"
#include "packet.hh"
#include "paths.hh"
#include "platform.hh"
#include "rcs_file.hh"
#include "revision.hh"
#include "safe_map.hh"
#include "sanity.hh"
#include "transforms.hh"
#include "ui.hh"

using std::make_pair;
using std::map;
using std::multimap;
using std::out_of_range;
using std::pair;
using std::search;
using std::set;
using std::sscanf;
using std::stable_sort;
using std::stack;
using std::string;
using std::vector;

using boost::scoped_ptr;
using boost::shared_ptr;
using boost::lexical_cast;

// cvs history recording stuff

typedef unsigned long cvs_branchname;
typedef unsigned long cvs_author;
typedef unsigned long cvs_changelog;
typedef unsigned long cvs_version;
typedef unsigned long cvs_path;
typedef unsigned long cvs_tag;

struct cvs_history;

struct
cvs_commit
{
  cvs_commit(rcs_file const & r,
             string const & rcs_version,
             file_id const & ident,
             cvs_history & cvs);

  bool is_synthetic_branch_root;
  time_t time;
  bool alive;
  cvs_author author;
  cvs_changelog changelog;
  cvs_version version;
  cvs_path path;
  vector<cvs_tag> tags;

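  // note: ordering is by timestamp alone; the stable_sort applied to a
  // branch's lineage in import_branch below therefore preserves the
  // per-file scan order for commits that share a timestamp.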
  bool operator<(cvs_commit const & other) const
  {
    return time < other.time;
  }
};

struct
cvs_branch
{
  bool has_a_branchpoint;
  bool has_a_commit;
  time_t last_branchpoint;
  time_t first_commit;

  map<cvs_path, cvs_version> live_at_beginning;
  vector<cvs_commit> lineage;

  cvs_branch()
    : has_a_branchpoint(false),
      has_a_commit(false),
      last_branchpoint(0),
      first_commit(0)
  {
  }

  void note_commit(time_t now)
  {
    if (!has_a_commit)
      {
        first_commit = now;
      }
    else
      {
        if (now < first_commit)
          first_commit = now;
      }
    has_a_commit = true;
  }

  void note_branchpoint(time_t now)
  {
    has_a_branchpoint = true;
    if (now > last_branchpoint)
      last_branchpoint = now;
  }

  time_t beginning() const
  {
    I(has_a_branchpoint || has_a_commit);
    if (has_a_commit)
      {
        I(first_commit != 0);
        return first_commit;
      }
    else
      {
        I(last_branchpoint != 0);
        return last_branchpoint;
      }
  }

  void append_commit(cvs_commit const & c)
  {
    I(c.time != 0);
    note_commit(c.time);
    lineage.push_back(c);
  }
};

struct
cvs_history
{

  interner<unsigned long> branch_interner;
  interner<unsigned long> author_interner;
  interner<unsigned long> changelog_interner;
  interner<unsigned long> file_version_interner;
  interner<unsigned long> path_interner;
  interner<unsigned long> tag_interner;

  // assume admin has foo:X.Y.0.N in it, then
  // this multimap contains entries of the form
  // X.Y -> foo
  multimap<string, string> branchpoints;

  // and this map contains entries of the form
  // X.Y.N.1 -> foo
  map<string, string> branch_first_entries;

  // branch name -> branch
  map<string, shared_ptr<cvs_branch> > branches;
  shared_ptr<cvs_branch> trunk;

  // stack of branches we're injecting states into
  stack< shared_ptr<cvs_branch> > stk;
  stack< cvs_branchname > bstk;

  // tag -> time, revision
  //
  // used to resolve the *last* revision which has a given tag
  // applied; this is the revision which wins the tag.
  map<unsigned long, pair<time_t, revision_id> > resolved_tags;

  file_path curr_file;
  cvs_path curr_file_interned;

  string base_branch;

  ticker n_versions;
  ticker n_tree_branches;

  cvs_history();
  void set_filename(string const & file,
                    file_id const & ident);

  void index_branchpoint_symbols(rcs_file const & r);

  void push_branch(string const & branch_name, bool private_branch);
  void pop_branch();
};


static bool
is_sbr(shared_ptr<rcs_delta> dl,
       shared_ptr<rcs_deltatext> dt)
{

  // CVS abuses the RCS format a bit (ha!) when storing a file which
  // was only added on a branch: on the root of the branch there'll be
  // a commit with dead state, empty text, and a log message
  // containing the string "file foo was initially added on branch
  // bar". We recognize and ignore these cases, as they do not
  // "really" represent commits to be clustered together.

  if (dl->state != "dead")
    return false;

  if (!dt->text.empty())
    return false;

  string log_bit = "was initially added on branch";
  string::const_iterator i = search(dt->log.begin(),
                                    dt->log.end(),
                                    log_bit.begin(),
                                    log_bit.end());

  return i != dt->log.end();
}


cvs_commit::cvs_commit(rcs_file const & r,
                       string const & rcs_version,
                       file_id const & ident,
                       cvs_history & cvs)
{
  map<string, shared_ptr<rcs_delta> >::const_iterator delta =
    r.deltas.find(rcs_version);
  I(delta != r.deltas.end());

  map<string, shared_ptr<rcs_deltatext> >::const_iterator deltatext =
    r.deltatexts.find(rcs_version);
  I(deltatext != r.deltatexts.end());

  struct tm t;
  // We need to initialize t to all zeros, because strptime has a habit of
  // leaving bits of the data structure alone, letting garbage sneak into
  // our output.
  memset(&t, 0, sizeof(t));
  char const * dp = delta->second->date.c_str();
  L(FL("Calculating time of %s") % dp);
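  // RCS dates are dotted "[Y]Y.mm.dd.HH.MM.SS" strings; files written
  // before 2000 generally carry two-digit years while later ones carry
  // four, hence the two patterns tried below.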
#ifdef HAVE_STRPTIME
  if (strptime(dp, "%y.%m.%d.%H.%M.%S", &t) == NULL)
    I(strptime(dp, "%Y.%m.%d.%H.%M.%S", &t) != NULL);
#else
  I(sscanf(dp, "%d.%d.%d.%d.%d.%d", &(t.tm_year), &(t.tm_mon),
           &(t.tm_mday), &(t.tm_hour), &(t.tm_min), &(t.tm_sec))==6);
  t.tm_mon--;
  // Apparently some RCS files have 2 digit years, others four; tm always
  // wants a 2 (or 3) digit year (years since 1900).
  if (t.tm_year > 1900)
    t.tm_year-=1900;
#endif
  time = mktime(&t);
  L(FL("= %i") % time);

  is_synthetic_branch_root = is_sbr(delta->second,
                                    deltatext->second);

  alive = delta->second->state != "dead";
  if (is_synthetic_branch_root)
    changelog = cvs.changelog_interner.intern("synthetic branch root changelog");
  else
    changelog = cvs.changelog_interner.intern(deltatext->second->log);
  author = cvs.author_interner.intern(delta->second->author);
  path = cvs.curr_file_interned;
  version = cvs.file_version_interner.intern(ident.inner()());

  typedef multimap<string,string>::const_iterator ity;
  pair<ity,ity> range = r.admin.symbols.equal_range(rcs_version);
  for (ity i = range.first; i != range.second; ++i)
    {
      if (i->first == rcs_version)
        {
          L(FL("version %s -> tag %s") % rcs_version % i->second);
          tags.push_back(cvs.tag_interner.intern(i->second));
        }
    }

}


// piece table stuff

struct piece;

struct
piece_store
{
  vector< shared_ptr<rcs_deltatext> > texts;
  void index_deltatext(shared_ptr<rcs_deltatext> const & dt,
                       vector<piece> & pieces);
  void build_string(vector<piece> const & pieces,
                    string & out);
  void reset() { texts.clear(); }
};

// FIXME: kludge, I was lazy and did not make this
// a properly scoped variable.

static piece_store global_pieces;


struct
piece
{
  piece(string::size_type p, string::size_type l, unsigned long id) :
    pos(p), len(l), string_id(id) {}
  string::size_type pos;
  string::size_type len;
  unsigned long string_id;
  string operator*() const
  {
    return string(global_pieces.texts.at(string_id)->text.data() + pos, len);
  }
};
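// to illustrate the representation: after index_deltatext() has stored a
// three-line text "a\nb\nc\n" as texts[0], the corresponding pieces are
// (pos 0, len 2), (pos 2, len 2) and (pos 4, len 2), all referring to
// string_id 0; a piece is a window into a stored deltatext, not a copy
// of the line.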


void
piece_store::build_string(vector<piece> const & pieces,
                          string & out)
{
  out.clear();
  out.reserve(pieces.size() * 60);
  for(vector<piece>::const_iterator i = pieces.begin();
      i != pieces.end(); ++i)
    out.append(texts.at(i->string_id)->text, i->pos, i->len);
}

void
piece_store::index_deltatext(shared_ptr<rcs_deltatext> const & dt,
                             vector<piece> & pieces)
{
  pieces.clear();
  pieces.reserve(dt->text.size() / 30);
  texts.push_back(dt);
  unsigned long id = texts.size() - 1;
  string::size_type begin = 0;
  string::size_type end = dt->text.find('\n');
  while(end != string::npos)
    {
      // nb: the piece includes the '\n'
      pieces.push_back(piece(begin, (end - begin) + 1, id));
      begin = end + 1;
      end = dt->text.find('\n', begin);
    }
  if (begin != dt->text.size())
    {
      // the text didn't end with '\n', so neither does the piece
      end = dt->text.size();
      pieces.push_back(piece(begin, end - begin, id));
    }
}


static void
process_one_hunk(vector< piece > const & source,
                 vector< piece > & dest,
                 vector< piece >::const_iterator & i,
                 int & cursor)
{
  string directive = **i;
  assert(directive.size() > 1);
  ++i;

  try
    {
      char code;
      int pos, len;
      if (sscanf(directive.c_str(), " %c %d %d", &code, &pos, &len) != 3)
        throw oops("illformed directive '" + directive + "'");

      if (code == 'a')
        {
          // 'ax y' means "copy from source to dest until cursor == x, then
          // copy y lines from delta, leaving cursor where it is"
          while (cursor < pos)
            dest.push_back(source.at(cursor++));
          I(cursor == pos);
          while (len--)
            dest.push_back(*i++);
        }
      else if (code == 'd')
        {
          // 'dx y' means "copy from source to dest until cursor == x-1,
          // then increment cursor by y, ignoring those y lines"
          while (cursor < (pos - 1))
            dest.push_back(source.at(cursor++));
          I(cursor == pos - 1);
          cursor += len;
        }
      else
        throw oops("unknown directive '" + directive + "'");
    }
  catch (out_of_range &)
    {
      throw oops("out_of_range while processing " + directive
                 + " with source.size() == "
                 + lexical_cast<string>(source.size())
                 + " and cursor == "
                 + lexical_cast<string>(cursor));
    }
}
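
// a made-up example of the directive stream: applying the deltatext
//
//   d2 1
//   a3 1
//   new line
//
// to a three-line source (L1, L2, L3) copies L1, drops L2, copies L3,
// and then appends "new line", yielding (L1, L3, "new line").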

static void
construct_version(vector< piece > const & source_lines,
                  string const & dest_version,
                  vector< piece > & dest_lines,
                  rcs_file const & r)
{
  dest_lines.clear();
  dest_lines.reserve(source_lines.size());

  I(r.deltas.find(dest_version) != r.deltas.end());
  shared_ptr<rcs_delta> delta = r.deltas.find(dest_version)->second;

  I(r.deltatexts.find(dest_version) != r.deltatexts.end());
  shared_ptr<rcs_deltatext> deltatext = r.deltatexts.find(dest_version)->second;

  vector<piece> deltalines;
  global_pieces.index_deltatext(deltatext, deltalines);

  int cursor = 0;
  for (vector<piece>::const_iterator i = deltalines.begin();
       i != deltalines.end(); )
    process_one_hunk(source_lines, dest_lines, i, cursor);
  while (cursor < static_cast<int>(source_lines.size()))
    dest_lines.push_back(source_lines[cursor++]);
}

// FIXME: should these be someplace else? using 'friend' to reach into the
// DB is stupid, but it's also stupid to put raw edge insert methods on the
// DB itself. or is it? hmm.. encapsulation vs. usage guidance..
void
rcs_put_raw_file_edge(hexenc<id> const & old_id,
                      hexenc<id> const & new_id,
                      delta const & del,
                      database & db)
{
  if (old_id == new_id)
    {
      L(FL("skipping identity file edge"));
      return;
    }

  if (db.file_version_exists(old_id))
    {
      // we already have a way to get to this old version,
      // no need to insert another reconstruction path
      L(FL("existing path to %s found, skipping") % old_id);
    }
  else
    {
      I(db.exists(new_id, "files")
        || db.delta_exists(new_id, "file_deltas"));
      db.put_delta(old_id, new_id, del, "file_deltas");
    }
}


static void
insert_into_db(data const & curr_data,
               hexenc<id> const & curr_id,
               vector< piece > const & next_lines,
               data & next_data,
               hexenc<id> & next_id,
               database & db)
{
  // inserting into the DB
  // note: curr_lines is a "new" (base) version
  // and next_lines is an "old" (derived) version.
  // all storage edges go from new -> old.
  {
    string tmp;
    global_pieces.build_string(next_lines, tmp);
    next_data = tmp;
  }
  delta del;
  diff(curr_data, next_data, del);
  calculate_ident(next_data, next_id);
  rcs_put_raw_file_edge(next_id, curr_id, del, db);
}



/*

please read this exhaustingly long comment and understand it
before mucking with the branch inference logic.

we are processing a file version. a branch might begin here. if
the current version is X.Y, then there is a branch B starting
here iff there is a symbol in the admin section called X.Y.0.Z,
where Z is the branch number (or if there is a private branch
called X.Y.Z, which is either an import branch or some private
RCS cruft).

the version X.Y is then considered the branchpoint of B in the
current file. this does *not* mean that the CVS key -- an
abstraction representing whole-tree operations -- of X.Y is the
branchpoint across the CVS archive we're processing.

in fact, CVS does not record the occurrence of a branching
action (tag -b). we have no idea who executed that command and
when. what we know instead is the commit X.Y immediately
preceding the branch -- CVS considers this the branchpoint --
in this file's reduced view of history. we also know the first
commit X.Y.Z.1 inside the branch (which might not exist).

our old strategy was to consider all branches nested in a
hierarchy, which was a super-tree of all the branch trees in all
the CVS files in a repository. this involved considering X.Y as
the parent version of branch X.Y.Z, and selecting "the"
branchpoint connecting the two as the least CVS key X.Y.Z.1
committed inside the branch B.

this was a mistake, for two significant reasons.

first, some files do not *have* any commit inside the branch B,
only a branchpoint X.Y.0.Z. this branchpoint is actually the
last commit *before* the user branched, and could be a very old
commit, long before the branch was formed, so it is useless in
determining the branch structure.

second, some files do not have a branch B, or worse, have
branched into B from an "ancestor" branch A, where a different
file branches into B from a different ancestor branch C. in
other words, while there *is* a tree structure within the X.Y.Z
branches of each file, there is *no* shared tree structure
between the branch names across a repository. in one file A can
be an ancestor of B, in another file B can be an ancestor of A.

thus, we give up on establishing a hierarchy between branches
altogether. all branches exist in a flat namespace, and all are
direct descendants of the empty revision at the root of
history. each branchpoint symbol mentioned in the
administrative section of a file is considered the root of a new
lineage.

*/
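
// as a concrete illustration (with a made-up branch name): a symbol
// entry "FOO_BRANCH:1.4.0.2" in a file's admin section makes 1.4 the
// branchpoint of FOO_BRANCH within that file and 1.4.2.1 the first
// commit inside it; index_branchpoint_symbols() below records exactly
// this pair of mappings, and they are only ever used per-file, never
// to build a cross-file branch hierarchy.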


static void
process_branch(string const & begin_version,
               vector< piece > const & begin_lines,
               data const & begin_data,
               hexenc<id> const & begin_id,
               rcs_file const & r,
               database & db,
               cvs_history & cvs)
{
  string curr_version = begin_version;
  scoped_ptr< vector< piece > > next_lines(new vector<piece>);
  scoped_ptr< vector< piece > > curr_lines(new vector<piece>
                                           (begin_lines.begin(),
                                            begin_lines.end()));
  data curr_data(begin_data), next_data;
  hexenc<id> curr_id(begin_id), next_id;

  while(! (r.deltas.find(curr_version) == r.deltas.end()))
    {
      L(FL("version %s has %d lines") % curr_version % curr_lines->size());

      cvs_commit curr_commit(r, curr_version, curr_id, cvs);
      if (!curr_commit.is_synthetic_branch_root)
        {
          cvs.stk.top()->append_commit(curr_commit);
          ++cvs.n_versions;
        }

      string next_version = r.deltas.find(curr_version)->second->next;

      if (! next_version.empty())
        {
          L(FL("following RCS edge %s -> %s") % curr_version % next_version);

          construct_version(*curr_lines, next_version, *next_lines, r);
          L(FL("constructed RCS version %s, inserting into database") %
            next_version);

          insert_into_db(curr_data, curr_id,
                         *next_lines, next_data, next_id, db);
        }

      // mark the beginning-of-branch time and state of this file if
      // we're at a branchpoint
      typedef multimap<string,string>::const_iterator ity;
      pair<ity,ity> range = cvs.branchpoints.equal_range(curr_version);
      if (range.first != cvs.branchpoints.end()
          && range.first->first == curr_version)
        {
          for (ity i = range.first; i != range.second; ++i)
            {
              cvs.push_branch(i->second, false);
              shared_ptr<cvs_branch> b = cvs.stk.top();
              if (curr_commit.alive)
                b->live_at_beginning[cvs.curr_file_interned] = curr_commit.version;
              b->note_branchpoint(curr_commit.time);
              cvs.pop_branch();
            }
        }


      // recursively follow any branch commits coming from the branchpoint
      shared_ptr<rcs_delta> curr_delta = r.deltas.find(curr_version)->second;
      for(vector<string>::const_iterator i = curr_delta->branches.begin();
          i != curr_delta->branches.end(); ++i)
        {
          string branch;
          data branch_data;
          hexenc<id> branch_id;
          vector< piece > branch_lines;
          bool priv = false;
          map<string, string>::const_iterator be = cvs.branch_first_entries.find(*i);

          if (be != cvs.branch_first_entries.end())
            branch = be->second;
          else
            priv = true;

          L(FL("following RCS branch %s = '%s'") % (*i) % branch);

          construct_version(*curr_lines, *i, branch_lines, r);
          insert_into_db(curr_data, curr_id,
                         branch_lines, branch_data, branch_id, db);

          cvs.push_branch(branch, priv);
          process_branch(*i, branch_lines, branch_data, branch_id, r, db, cvs);
          cvs.pop_branch();

          L(FL("finished RCS branch %s = '%s'") % (*i) % branch);
        }

      if (!r.deltas.find(curr_version)->second->next.empty())
        {
          // advance
          curr_data = next_data;
          curr_id = next_id;
          curr_version = next_version;
          swap(next_lines, curr_lines);
          next_lines->clear();
        }
      else break;
    }
}


static void
import_rcs_file_with_cvs(string const & filename, database & db, cvs_history & cvs)
{
  rcs_file r;
  L(FL("parsing RCS file %s") % filename);
  parse_rcs_file(filename, r);
  L(FL("parsed RCS file %s OK") % filename);

  {
    vector< piece > head_lines;
    I(r.deltatexts.find(r.admin.head) != r.deltatexts.end());
    I(r.deltas.find(r.admin.head) != r.deltas.end());

    hexenc<id> id;
    data dat(r.deltatexts.find(r.admin.head)->second->text);
    calculate_ident(dat, id);
    file_id fid = id;

    cvs.set_filename (filename, fid);
    cvs.index_branchpoint_symbols (r);

    if (! db.file_version_exists (fid))
      {
        db.put_file(fid, dat);
      }

    {
      // create the head state in case it is a loner
      //      cvs_key k;
      //      shared_ptr<cvs_state> s;
      //      L(FL("noting head version %s : %s") % cvs.curr_file % r.admin.head);
      //      cvs.find_key_and_state (r, r.admin.head, k, s);
    }

    global_pieces.reset();
    global_pieces.index_deltatext(r.deltatexts.find(r.admin.head)->second, head_lines);
    process_branch(r.admin.head, head_lines, dat, id, r, db, cvs);
    global_pieces.reset();
  }

  ui.set_tick_trailer("");
}


void
test_parse_rcs_file(system_path const & filename, database & db)
{
  cvs_history cvs;

  I(! filename.empty());
  assert_path_is_file(filename);

  P(F("parsing RCS file %s") % filename);
  rcs_file r;
  parse_rcs_file(filename.as_external(), r);
  P(F("parsed RCS file %s OK") % filename);
}


// CVS importing stuff follows


static void
split_version(string const & v, vector<string> & vs)
{
  vs.clear();
  boost::char_separator<char> sep(".");
  typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
  tokenizer tokens(v, sep);
  copy(tokens.begin(), tokens.end(), back_inserter(vs));
}

static void
join_version(vector<string> const & vs, string & v)
{
  v.clear();
  for (vector<string>::const_iterator i = vs.begin();
       i != vs.end(); ++i)
    {
      if (i != vs.begin())
        v += ".";
      v += *i;
    }
}
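
// for instance, split_version("1.3.0.2") produces the components
// {"1", "3", "0", "2"}, and join_version turns such a vector back into
// the dotted form; index_branchpoint_symbols() below manipulates branch
// numbers entirely in terms of these component vectors.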

cvs_history::cvs_history() :
  n_versions("versions", "v", 1),
  n_tree_branches("branches", "b", 1)
{
}

void
cvs_history::set_filename(string const & file,
                          file_id const & ident)
{
  L(FL("importing file '%s'") % file);
  I(file.size() > 2);
  I(file.substr(file.size() - 2) == string(",v"));
  string ss = file;
  ui.set_tick_trailer(ss);
  ss.resize(ss.size() - 2);
  // remove Attic/ if present
  string::size_type last_slash=ss.rfind('/');
  if (last_slash!=string::npos && last_slash>=5
      && ss.substr(last_slash-5,6)=="Attic/")
    ss.erase(last_slash-5,6);
  curr_file = file_path_internal(ss);
  curr_file_interned = path_interner.intern(ss);
}

void cvs_history::index_branchpoint_symbols(rcs_file const & r)
{
  branchpoints.clear();
  branch_first_entries.clear();

  for (multimap<string, string>::const_iterator i =
         r.admin.symbols.begin(); i != r.admin.symbols.end(); ++i)
    {
      string const & num = i->first;
      string const & sym = i->second;

      vector<string> components;
      split_version(num, components);

      vector<string> first_entry_components;
      vector<string> branchpoint_components;

      if (components.size() > 2 &&
          (components.size() % 2 == 1))
        {
          // this is a "vendor" branch
          //
          // such as "1.1.1", where "1.1" is the branchpoint and
          // "1.1.1.1" will be the first commit on it.

          first_entry_components = components;
          first_entry_components.push_back("1");

          branchpoint_components = components;
          branchpoint_components.erase(branchpoint_components.end() - 1,
                                       branchpoint_components.end());

        }

      else if (components.size() > 2 &&
               (components.size() % 2 == 0) &&
               components[components.size() - 2] == string("0"))
        {
          // this is a "normal" branch
          //
          // such as "1.3.0.2", where "1.3" is the branchpoint and
          // "1.3.2.1" will be the first commit on it.

          first_entry_components = components;
          first_entry_components[first_entry_components.size() - 2]
            = first_entry_components[first_entry_components.size() - 1];
          first_entry_components[first_entry_components.size() - 1]
            = string("1");

          branchpoint_components = components;
          branchpoint_components.erase(branchpoint_components.end() - 2,
                                       branchpoint_components.end());
        }

      string first_entry_version;
      join_version(first_entry_components, first_entry_version);

      L(FL("first version in branch %s would be %s")
        % sym % first_entry_version);
      branch_first_entries.insert(make_pair(first_entry_version, sym));

      string branchpoint_version;
      join_version(branchpoint_components, branchpoint_version);

      L(FL("file branchpoint for %s at %s") % sym % branchpoint_version);
      branchpoints.insert(make_pair(branchpoint_version, sym));
    }
}



void
cvs_history::push_branch(string const & branch_name, bool private_branch)
{
  shared_ptr<cvs_branch> branch;

  string bname = base_branch + "." + branch_name;
  I(stk.size() > 0);

  if (private_branch)
    {
      branch = shared_ptr<cvs_branch>(new cvs_branch());
      stk.push(branch);
      bstk.push(branch_interner.intern(""));
      return;
    }
  else
    {
      map<string, shared_ptr<cvs_branch> >::const_iterator b = branches.find(bname);
      if (b == branches.end())
        {
          branch = shared_ptr<cvs_branch>(new cvs_branch());
          branches.insert(make_pair(bname, branch));
          ++n_tree_branches;
        }
      else
        branch = b->second;

      stk.push(branch);
      bstk.push(branch_interner.intern(bname));
    }
}

void
cvs_history::pop_branch()
{
  I(stk.size() > 1);
  stk.pop();
  bstk.pop();
}


class
cvs_tree_walker
  : public tree_walker
{
  cvs_history & cvs;
  database & db;
public:
  cvs_tree_walker(cvs_history & c, database & d) :
    cvs(c), db(d)
  {
  }
  virtual void visit_file(file_path const & path)
  {
    string file = path.as_external();
    if (file.substr(file.size() - 2) == string(",v"))
      {
        try
          {
            import_rcs_file_with_cvs(file, db, cvs);
          }
        catch (oops const & o)
          {
            W(F("error reading RCS file %s: %s") % file % o.what());
          }
      }
    else
      L(FL("skipping non-RCS file %s") % file);
  }
  virtual ~cvs_tree_walker() {}
};




//
// our task here is to produce a sequence of revision descriptions
// from the per-file commit records we have. we do this by rolling
// forwards through the temporally sorted file-commit list
// accumulating file-commits into revisions and flushing the
// revisions when we feel they are "complete".
//
// revisions have to have a time associated with them. this time
// will be the first time of any commit associated with the
// revision. they have an author and a changelog, which is shared
// by all the file-commits in the revision.
//
// there might be multiple revisions overlapping in time. this is
// legal wrt. CVS. we keep a set, and search all members of the set
// for the best match.
//
// consider this situation of overlapping revisions:
//
// +---------------+   +---------------+   +---------------+
// | rev #1 @ 0011 |   | rev #2 @ 0012 |   | rev #3 @ 0013 |
// |~~~~~~~~~~~~~~~|   |~~~~~~~~~~~~~~~|   |~~~~~~~~~~~~~~~|
// | patch foo.txt |   | patch bar.txt |   | patch baz.txt |
// +---------------+   +---------------+   +---------------+
//
// suppose you have this situation and you run across a "patch
// bar.txt" commit at timestamp 0014. what do you do?
//
// - you know that rev #2 cannot accept this commit, simply because
//   two commits on the same file makes *two* revisions, not one.
//
// - perhaps rev #3 could accept it; after all, it could be that the
//   commit associated with rev #2 released its commit lock, and the
//   commit associated with rev #3 quickly updated and committed at
//   0013, finishing off at 0014.
//
// - can rev #1 accept it? no. because CVS calculated the version it
//   expected to see in bar.txt before calling up the server, when
//   committing rev #1. the version it expected to see was the version
//   in bar.txt *before* time 0012; that is, before rev #2 had any effect
//   on bar.txt. when it contacted the server, the commit associated
//   with rev #1 would have aborted if it had seen any other number.
//   so rev #1 could not start before an edit to bar.txt and then
//   include its own edit to bar.txt.
//
// so we have only one case where bar.txt can be accepted. if the
// commit is not accepted into a legal rev (outside the window,
// wrong changelog/author) it starts a new revision.
//
// as we scan forwards, if we hit timestamps which lie beyond rev #n's
// window, we flush rev #n.
//
// if there are multiple coincident and legal revs to direct a
// commit to (all with the same author/changelog), we direct the
// commit to the rev with the closest initial timestamp. that is,
// the *latest* beginning time.
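//
// concretely, two commits by the same author with the same log message,
// touching different files and landing within constants::cvs_window
// seconds of one another, are folded into one cluster and hence one
// synthesized revision; a second commit to the same file can never land
// in the same cluster.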

struct
cvs_cluster
{
  time_t first_time;
  cvs_author author;
  cvs_changelog changelog;
  set<cvs_tag> tags;

  cvs_cluster(time_t t,
              cvs_author a,
              cvs_changelog c)
    : first_time(t),
      author(a),
      changelog(c)
  {}

  struct entry
  {
    bool live;
    cvs_version version;
    time_t time;
    entry(bool l, cvs_version v, time_t t)
      : live(l),
        version(v),
        time(t)
    {}
  };

  typedef map<cvs_path, entry> entry_map;
  entry_map entries;
};


struct
cluster_consumer
{
  cvs_history & cvs;
  app_state & app;
  string const & branchname;
  cvs_branch const & branch;
  set<split_path> created_dirs;
  map<cvs_path, cvs_version> live_files;
  ticker & n_revisions;

  struct prepared_revision
  {
    prepared_revision(revision_id i,
                      shared_ptr<revision_t> r,
                      cvs_cluster const & c);
    revision_id rid;
    shared_ptr<revision_t> rev;
    time_t time;
    cvs_author author;
    cvs_changelog changelog;
    vector<cvs_tag> tags;
  };

  vector<prepared_revision> preps;

  roster_t ros;
  temp_node_id_source nis;
  editable_roster_base editable_ros;
  revision_id parent_rid, child_rid;

  cluster_consumer(cvs_history & cvs,
                   app_state & app,
                   string const & branchname,
                   cvs_branch const & branch,
                   ticker & n_revs);

  void consume_cluster(cvs_cluster const & c);
  void add_missing_parents(split_path const & sp, cset & cs);
  void build_cset(cvs_cluster const & c, cset & cs);
  void store_auxiliary_certs(prepared_revision const & p);
  void store_revisions();
};

typedef shared_ptr<cvs_cluster>
cluster_ptr;

struct
cluster_ptr_lt
{
  bool operator()(cluster_ptr const & a,
                  cluster_ptr const & b) const
  {
    return a->first_time < b->first_time;
  }
};

typedef set<cluster_ptr, cluster_ptr_lt>
cluster_set;

void
import_branch(cvs_history & cvs,
              app_state & app,
              string const & branchname,
              shared_ptr<cvs_branch> const & branch,
              ticker & n_revs)
{
  cluster_set clusters;
  cluster_consumer cons(cvs, app, branchname, *branch, n_revs);
  unsigned long commits_remaining = branch->lineage.size();

  // step 1: sort the lineage
  stable_sort(branch->lineage.begin(), branch->lineage.end());

  for (vector<cvs_commit>::const_iterator i = branch->lineage.begin();
       i != branch->lineage.end(); ++i)
    {
      commits_remaining--;

      L(FL("examining next commit [t:%d] [p:%s] [a:%s] [c:%s]")
        % i->time
        % cvs.path_interner.lookup(i->path)
        % cvs.author_interner.lookup(i->author)
        % cvs.changelog_interner.lookup(i->changelog));

      // step 2: expire all clusters from the beginning of the set which
      // have passed the window size
      while (!clusters.empty())
        {
          cluster_set::iterator j = clusters.begin();
          if ((*j)->first_time + constants::cvs_window < i->time)
            {
              L(FL("expiring cluster"));
              cons.consume_cluster(**j);
              clusters.erase(j);
            }
          else
            break;
        }

      // step 3: find the last still-live cluster to have touched this
      // file
      time_t time_of_last_cluster_touching_this_file = 0;

      unsigned clu = 0;
      for (cluster_set::const_iterator j = clusters.begin();
           j != clusters.end(); ++j)
        {
          L(FL("examining cluster %d to see if it touched %d")
            % clu++
            % i->path);

          cvs_cluster::entry_map::const_iterator k = (*j)->entries.find(i->path);
          if ((k != (*j)->entries.end())
              && (k->second.time > time_of_last_cluster_touching_this_file))
            {
              L(FL("found cluster touching %d: [t:%d] [a:%d] [c:%d]")
                % i->path
                % (*j)->first_time
                % (*j)->author
                % (*j)->changelog);
              time_of_last_cluster_touching_this_file = (*j)->first_time;
            }
        }
      L(FL("last modification time is %d")
        % time_of_last_cluster_touching_this_file);

      // step 4: find a cluster which starts on or after the
      // last_modify_time, which doesn't modify the file in question,
      // and which contains the same author and changelog as our
      // commit
      cluster_ptr target;
      for (cluster_set::const_iterator j = clusters.begin();
           j != clusters.end(); ++j)
        {
          if (((*j)->first_time >= time_of_last_cluster_touching_this_file)
              && ((*j)->author == i->author)
              && ((*j)->changelog == i->changelog)
              && ((*j)->entries.find(i->path) == (*j)->entries.end()))
            {
              L(FL("picked existing cluster [t:%d] [a:%d] [c:%d]")
                % (*j)->first_time
                % (*j)->author
                % (*j)->changelog);

              target = (*j);
            }
        }

      // if we still haven't found an active cluster, this commit
      // probably starts a new one; make a new cluster for it.
      if (!target)
        {
          L(FL("building new cluster [t:%d] [a:%d] [c:%d]")
            % i->time
            % i->author
            % i->changelog);

          target = cluster_ptr(new cvs_cluster(i->time,
                                               i->author,
                                               i->changelog));
          clusters.insert(target);
        }

      I(target);
      target->entries.insert(make_pair(i->path,
                                       cvs_cluster::entry(i->alive,
                                                          i->version,
                                                          i->time)));
      for (vector<cvs_tag>::const_iterator j = i->tags.begin();
           j != i->tags.end(); ++j)
        {
          target->tags.insert(*j);
        }
    }


  // now we are done with this lineage; flush all remaining clusters
  L(FL("finished branch commits, writing all pending clusters"));
  while (!clusters.empty())
    {
      cons.consume_cluster(**clusters.begin());
      clusters.erase(clusters.begin());
    }
  L(FL("finished writing pending clusters"));

  cons.store_revisions();

}

void
import_cvs_repo(system_path const & cvsroot,
                app_state & app)
{
  N(!directory_exists(cvsroot / "CVSROOT"),
    F("%s appears to be a CVS repository root directory\n"
      "try importing a module instead, with 'cvs_import %s/<module_name>")
    % cvsroot % cvsroot);

  {
    // early short-circuit to avoid failure after lots of work
    rsa_keypair_id key;
    get_user_key(key, app);
    require_password(key, app);
  }

  cvs_history cvs;
  N(app.branch_name() != "", F("need base --branch argument for importing"));
  cvs.base_branch = app.branch_name();

  // push the trunk
  cvs.trunk = shared_ptr<cvs_branch>(new cvs_branch());
  cvs.stk.push(cvs.trunk);
  cvs.bstk.push(cvs.branch_interner.intern(cvs.base_branch));

  {
    transaction_guard guard(app.db);
    cvs_tree_walker walker(cvs, app.db);
    require_path_is_directory(cvsroot,
                              F("path %s does not exist") % cvsroot,
                              F("'%s' is not a directory") % cvsroot);
    app.db.ensure_open();
    change_current_working_dir(cvsroot);
    walk_tree(file_path(), walker);
    guard.commit();
  }

  I(cvs.stk.size() == 1);

  ticker n_revs(_("revisions"), "r", 1);

  while (cvs.branches.size() > 0)
    {
      transaction_guard guard(app.db);
      map<string, shared_ptr<cvs_branch> >::const_iterator i = cvs.branches.begin();
      string branchname = i->first;
      shared_ptr<cvs_branch> branch = i->second;
      L(FL("branch %s has %d entries") % branchname % branch->lineage.size());
      import_branch(cvs, app, branchname, branch, n_revs);

      // free up some memory
      cvs.branches.erase(branchname);
      guard.commit();
    }

  {
    transaction_guard guard(app.db);
    L(FL("trunk has %d entries") % cvs.trunk->lineage.size());
    import_branch(cvs, app, cvs.base_branch, cvs.trunk, n_revs);
    guard.commit();
  }

  // now we have a "last" rev for each tag
  {
    ticker n_tags(_("tags"), "t", 1);
    packet_db_writer dbw(app);
    transaction_guard guard(app.db);
    for (map<unsigned long, pair<time_t, revision_id> >::const_iterator i = cvs.resolved_tags.begin();
         i != cvs.resolved_tags.end(); ++i)
      {
        string tag = cvs.tag_interner.lookup(i->first);
        ui.set_tick_trailer("marking tag " + tag);
        cert_revision_tag(i->second.second, tag, app, dbw);
        ++n_tags;
      }
    guard.commit();
  }


  return;
}

cluster_consumer::cluster_consumer(cvs_history & cvs,
                                   app_state & app,
                                   string const & branchname,
                                   cvs_branch const & branch,
                                   ticker & n_revs)
  : cvs(cvs),
    app(app),
    branchname(branchname),
    branch(branch),
    n_revisions(n_revs),
    editable_ros(ros, nis)
{
  if (!branch.live_at_beginning.empty())
    {
      cvs_author synthetic_author =
        cvs.author_interner.intern("cvs_import");

      cvs_changelog synthetic_cl =
        cvs.changelog_interner.intern("beginning of branch "
                                      + branchname);

      time_t synthetic_time = branch.beginning();
      cvs_cluster initial_cluster(synthetic_time,
                                  synthetic_author,
                                  synthetic_cl);

      L(FL("initial cluster on branch %s has %d live entries") %
        branchname % branch.live_at_beginning.size());

      for (map<cvs_path, cvs_version>::const_iterator i = branch.live_at_beginning.begin();
           i != branch.live_at_beginning.end(); ++i)
        {
          cvs_cluster::entry e(true, i->second, synthetic_time);
          L(FL("initial cluster contains %s at %s") %
            cvs.path_interner.lookup(i->first) %
            cvs.file_version_interner.lookup(i->second));
          initial_cluster.entries.insert(make_pair(i->first, e));
        }
      consume_cluster(initial_cluster);
    }
}

cluster_consumer::prepared_revision::prepared_revision(revision_id i,
                                                       shared_ptr<revision_t> r,
                                                       cvs_cluster const & c)
  : rid(i),
    rev(r),
    time(c.first_time),
    author(c.author),
    changelog(c.changelog)
{
  for (set<cvs_tag>::const_iterator i = c.tags.begin();
       i != c.tags.end(); ++i)
    {
      tags.push_back(*i);
    }
}


void
cluster_consumer::store_revisions()
{
  for (vector<prepared_revision>::const_iterator i = preps.begin();
       i != preps.end(); ++i)
    {
      if (! app.db.revision_exists(i->rid))
        {
          data tmp;
          write_revision(*(i->rev), tmp);
          app.db.put_revision(i->rid, *(i->rev));
          store_auxiliary_certs(*i);
          ++n_revisions;
        }
    }
}

void
cluster_consumer::store_auxiliary_certs(prepared_revision const & p)
{
  packet_db_writer dbw(app);

  for (vector<cvs_tag>::const_iterator i = p.tags.begin();
       i != p.tags.end(); ++i)
    {
      map<unsigned long, pair<time_t, revision_id> >::const_iterator j
        = cvs.resolved_tags.find(*i);

      if (j != cvs.resolved_tags.end())
        {
          if (j->second.first < p.time)
            {
              // move the tag forwards
              cvs.resolved_tags.erase(*i);
              cvs.resolved_tags.insert(make_pair(*i, make_pair(p.time, p.rid)));
            }
        }
      else
        {
          cvs.resolved_tags.insert(make_pair(*i, make_pair(p.time, p.rid)));
        }
    }

  cert_revision_in_branch(p.rid, cert_value(branchname), app, dbw);
  cert_revision_author(p.rid, cvs.author_interner.lookup(p.author), app, dbw);
  cert_revision_changelog(p.rid, cvs.changelog_interner.lookup(p.changelog), app, dbw);
  cert_revision_date_time(p.rid, p.time, app, dbw);
}

void
cluster_consumer::add_missing_parents(split_path const & sp, cset & cs)
{
  split_path tmp(sp);
  if (tmp.empty())
    return;
  tmp.pop_back();
  while (!tmp.empty())
    {
      if (created_dirs.find(tmp) == created_dirs.end())
        {
          safe_insert(created_dirs, tmp);
          safe_insert(cs.dirs_added, tmp);
        }
      tmp.pop_back();
    }
}

void
cluster_consumer::build_cset(cvs_cluster const & c,
                             cset & cs)
{
  for (cvs_cluster::entry_map::const_iterator i = c.entries.begin();
       i != c.entries.end(); ++i)
    {
      file_path pth = file_path_internal(cvs.path_interner.lookup(i->first));
      split_path sp;
      pth.split(sp);

      file_id fid(cvs.file_version_interner.lookup(i->second.version));
      if (i->second.live)
        {
          map<cvs_path, cvs_version>::const_iterator e = live_files.find(i->first);
          if (e == live_files.end())
            {
              add_missing_parents(sp, cs);
              L(FL("adding entry state '%s' on '%s'") % fid % pth);
              safe_insert(cs.files_added, make_pair(sp, fid));
              live_files[i->first] = i->second.version;
            }
          else if (e->second != i->second.version)
            {
              file_id old_fid(cvs.file_version_interner.lookup(e->second));
              L(FL("applying state delta on '%s' : '%s' -> '%s'")
                % pth % old_fid % fid);
              safe_insert(cs.deltas_applied,
                          make_pair(sp, make_pair(old_fid, fid)));
              live_files[i->first] = i->second.version;
            }
        }
      else
        {
          map<cvs_path, cvs_version>::const_iterator e = live_files.find(i->first);
          if (e != live_files.end())
            {
              L(FL("deleting entry state '%s' on '%s'") % fid % pth);
              safe_insert(cs.nodes_deleted, sp);
              live_files.erase(i->first);
            }
        }
    }
}

void
cluster_consumer::consume_cluster(cvs_cluster const & c)
{
  // we should never have an empty cluster; it's *possible* to have
  // an empty changeset (say on a vendor import) but every cluster
  // should have been created by at least one file commit, even
  // if the commit made no changes. it's a logical inconsistency if
  // you have an empty cluster.
  I(!c.entries.empty());

  shared_ptr<revision_t> rev(new revision_t());
  shared_ptr<cset> cs(new cset());
  build_cset(c, *cs);

  cs->apply_to(editable_ros);
  manifest_id child_mid;
  calculate_ident(ros, child_mid);
  rev->new_manifest = child_mid;
  rev->edges.insert(make_pair(parent_rid, cs));
  calculate_ident(*rev, child_rid);

  preps.push_back(prepared_revision(child_rid, rev, c));

  parent_rid = child_rid;
}

// Local Variables:
//   mode: C++
//   fill-column: 76
//   c-file-style: "gnu"
//   indent-tabs-mode: nil
// End:
// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:
