monotone

monotone Mtn Source Tree

Root/lcs.cc

1/*
2
3 this is a pretty direct translation (with only vague understanding,
4 unfortunately) of aubrey jaffer's most recent O(NP) edit-script
5 calculation algorithm, which performs quite a bit better than myers,
6 manber and miller's O(NP) simple edit *distance* algorithm. this one
7 builds the entire *script* that fast.
8
9 the following is jaffer's copyright and license statement. it probably
10 still has some legal relevance here, as this is a highly derivative
11 work. if not, the portions of this file which are "mine" (if any exist)
12 are subject to copyright (C) 2003 graydon hoare, licensed to the public
13 under the GPL v2+. see the file COPYING for details. if you want to see
14 more of the original file jaffer's work came from, see the SLIB
15 repository on savannah.nongnu.org, or look in the journal of
16 computational biology. apparantly it's submitted for publication there
17 too.
18
19 ---
20
21 "differ.scm" O(NP) Sequence Comparison Algorithm.
22 Copyright (C) 2001, 2002, 2003 Aubrey Jaffer
23
24 Permission to copy this software, to modify it, to redistribute it, to
25 distribute modified versions, and to use it for any purpose is granted,
26 subject to the following restrictions and understandings.
27
28 1. Any copy made of this software must include this copyright notice
29 in full.
30
31 2. I have made no warrantee or representation that the operation of
32 this software will be error-free, and I am under no obligation to
33 provide any services, by way of maintenance, update, or otherwise.
34
35 3. In conjunction with products arising from the use of this material,
36 there shall be no use of my name in any advertising, promotional, or
37 sales literature without prior written consent in each case.
38
39*/
40
41#include <algorithm>
42#include <vector>
43
44#include "lcs.hh"
45#include "sanity.hh"
46
47using std::back_insert_iterator;
48using std::back_inserter;
49using std::copy;
50using std::iterator_traits;
51using std::max;
52using std::min;
53using std::sort;
54using std::vector;
55
56struct work_vec
57{
58 long lo;
59 long hi;
60 static vector<long, QA(long)> vec;
61 work_vec(long lo, long hi) :
62 lo(lo), hi(hi)
63 {
64 // I(hi >= lo);
65 size_t len = (hi - lo) + 1;
66 vec.resize(len);
67 vec.assign(len, -1);
68 }
69
70 inline long & operator[](long t)
71 {
72 // I(t >= lo && t <= hi);
73 return vec[t-lo];
74 }
75};
76
77vector<long, QA(long)> work_vec::vec;
78
79template <typename A,
80 typename B,
81 typename LCS>
82struct jaffer_edit_calculator
83{
84
85 typedef vector<long, QA(long)> cost_vec;
86 typedef vector<long, QA(long)> edit_vec;
87
88 template <typename T>
89 struct subarray
90 {
91 typedef typename iterator_traits<T>::value_type vt;
92
93 T base; // underlying representation
94 long start; // current extent
95 long end; // current extent
96
97 subarray(T b, long s, long e) :
98 base(b), start(s), end(e) {}
99
100 inline long size() const
101 {
102 if (end < start)
103 return start - end;
104 else
105 return end - start;
106 }
107
108 inline subarray subset(long s, long e) const
109 {
110 return subarray(base + min(start, end), s, e);
111 }
112
113 inline vt const & operator[](size_t idx) const
114 {
115 if (end < start)
116 return *(base + (start - (idx + 1)));
117 else
118 return *(base + (start + idx));
119 }
120 };
121
122 static long run(work_vec & fp, long k,
123 subarray<A> const & a, long m,
124 subarray<B> const & b, long n,
125 cost_vec & CC, long p)
126 {
127 long cost = k + 2*p;
128
129 // do the run
130 long y = max(fp[k-1]+1, fp[k+1]);
131 long x = y - k;
132
133 I(y >= 0);
134 I(x >= 0);
135
136 while (true)
137 {
138 // record costs along the way
139 long xcst = m - x;
140 if (y < static_cast<long>(CC.size()) && xcst >= 0)
141 {
142 CC[y] = min(xcst + cost, CC[y]);
143 }
144 if (x < m && y < n && a[x] == b[y])
145 {
146 ++x; ++y;
147 }
148 else
149 break;
150 }
151
152 fp[k] = y;
153 return y;
154 }
155
156 // 'compare' here is the core myers, manber and miller algorithm.
157 static long compare(cost_vec & costs,
158 subarray<A> const & a, long m,
159 subarray<B> const & b, long n,
160 long p_lim,
161 bool full_scan = true)
162 {
163 long lo = -(m+1), hi = (1+n);
164 if (full_scan)
165 {
166 lo = -(p_lim + 1);
167 hi = p_lim + 1 + (n-m);
168 }
169 work_vec fp(lo, hi);
170
171 long p = 0;
172 long delta = n - m;
173
174 for (; p <= p_lim; ++p)
175 {
176
177 // lower sweep
178 for (long k = -p; k < delta; ++k)
179 run(fp, k, a, m, b, n, costs, p);
180
181 // upper sweep
182 for (long k = delta + p; k > delta; --k)
183 run(fp, k, a, m, b, n, costs, p);
184
185 // middle
186 long fpval = run(fp, delta, a, m, b, n, costs, p);
187
188 // we can bail early if not doing a full scan
189 if (!full_scan && n <= fpval)
190 break;
191 }
192
193 return delta + 2*p;
194 }
195
196 static long divide_and_conquer(subarray<A> const & a, long start_a, long end_a,
197 subarray<B> const & b, long start_b, long end_b,
198 edit_vec & edits,
199 unsigned long edx,
200 long polarity,
201 long p_lim)
202 {
203 long mid_a = (start_a + end_a) / 2;
204 long len_b = end_b - start_b;
205 long len_a = end_a - start_a;
206 long tcst = (2 * p_lim) + (len_b - len_a);
207
208 I(start_a >= 0);
209 I(start_a <= a.size());
210 I(start_b >= 0);
211 I(start_b <= b.size());
212 I(end_a >= 0);
213 I(end_a <= a.size());
214 I(end_b >= 0);
215 I(end_b <= b.size());
216
217 cost_vec cc(len_b + 1, len_a + len_b);
218 cost_vec rr(len_b + 1, len_a + len_b);
219
220 compare (cc,
221 a.subset(start_a, mid_a), (mid_a - start_a),
222 b.subset(start_b, end_b), len_b, min(p_lim, len_a));
223
224 compare (rr,
225 a.subset(end_a, mid_a), (end_a - mid_a),
226 b.subset(end_b, start_b), len_b, min(p_lim, len_a));
227
228 long b_split = mid_split(len_a, len_b, rr, cc, tcst);
229
230 long est_c = cc[b_split];
231 long est_r = rr[len_b - b_split];
232
233 long cost_c = diff_to_et (a, start_a, mid_a,
234 b, start_b, start_b + b_split,
235 edits, edx, polarity,
236 (est_c - (b_split - (mid_a - start_a))) / 2);
237
238 I(cost_c == est_c);
239
240 long cost_r = diff_to_et (a, mid_a, end_a,
241 b, start_b + b_split, end_b,
242 edits, est_c + edx, polarity,
243 (est_r - ((len_b - b_split) - (end_a - mid_a))) / 2);
244
245 I(cost_r == est_r);
246
247 return est_r + est_c;
248 }
249
250 static long mid_split(long m, long n,
251 cost_vec const & rr,
252 cost_vec const & cc,
253 long cost)
254 {
255 long cdx = 1 + n/2;
256 long rdx = n/2;
257 while (true)
258 {
259 I (rdx >= 0);
260
261 if (cost == (cc[rdx] + rr[n-rdx]))
262 return rdx;
263 if (cost == (cc[cdx] + rr[n-cdx]))
264 return cdx;
265 --rdx;
266 ++cdx;
267 }
268 }
269
270
271 static void order_edits(edit_vec const & edits,
272 long sign,
273 edit_vec & nedits)
274 {
275 nedits.clear();
276 nedits.resize(edits.size());
277 long cost = edits.size();
278
279 if (cost == 0)
280 {
281 nedits = edits;
282 return;
283 }
284
285 edit_vec sedits = edits;
286 sort(sedits.begin(), sedits.end());
287
288 long idx0;
289 for (idx0 = 0; idx0 < cost && sedits[idx0] < 0; ++idx0);
290 long len_a = max(0L, -sedits[0]);
291 long len_b = sedits[cost-1];
292
293 long ddx = idx0 - 1;
294 long idx = idx0;
295 long ndx = 0;
296 long adx = 0;
297 long bdx = 0;
298
299 while (bdx < len_b || adx < len_a)
300 {
301
302 long del = (ddx < 0) ? 0 : sedits[ddx];
303 long ins = (idx >= cost) ? 0 : sedits[idx];
304
305 if (del < 0 && adx >= (-1 - del) &&
306 ins > 0 && bdx >= (-1 + ins))
307 {
308 nedits[ndx] = del;
309 nedits[ndx+1] = ins;
310 --ddx; ++idx; ndx += 2; ++adx; ++bdx;
311 }
312 else if (del < 0 && adx >= (-1 - del))
313 {
314 nedits[ndx] = del;
315 --ddx; ++ndx; ++adx;
316 }
317 else if (ins > 0 && bdx >= (-1 + ins))
318 {
319 nedits[ndx] = ins;
320 ++idx; ++ndx; ++bdx;
321 }
322 else
323 {
324 ++adx; ++bdx;
325 }
326 }
327 }
328
329 // trims and calls diff_to_ez
330 static long diff_to_et(subarray<A> const & a, long start_a, long end_a,
331 subarray<B> const & b, long start_b, long end_b,
332 vector<long, QA(long)> & edits,
333 long edx,
334 long polarity,
335 long p_lim)
336 {
337
338 I(start_a >= 0);
339 I(start_a <= a.size());
340 I(start_b >= 0);
341 I(start_b <= b.size());
342 I(end_a >= 0);
343 I(end_a <= a.size());
344 I(end_b >= 0);
345 I(end_b <= b.size());
346
347 I(end_a - start_a >= p_lim);
348
349 long bsx, bdx, asx, adx;
350
351 for (bdx = end_b - 1, adx = end_a - 1;
352 (start_b <= bdx) && (start_a <= adx) && (a[adx] == b[bdx]);
353 --bdx, --adx);
354
355 for (bsx = start_b, asx = start_a;
356 (bsx < bdx) && (asx < adx) && (a[asx] == b[bsx]);
357 ++bsx, ++asx);
358
359 // we've trimmed; now call diff_to_ez.
360
361 long delta = (bdx - bsx) - (adx - asx);
362 if (delta < 0)
363 return diff_to_ez (b, bsx, bdx+1,
364 a, asx, adx+1,
365 edits, edx, -polarity, delta + p_lim);
366 else
367 return diff_to_ez (a, asx, adx+1,
368 b, bsx, bdx+1,
369 edits, edx, polarity, p_lim);
370 }
371
372 static long diff_to_ez(subarray<A> const & a, long start_a, long end_a,
373 subarray<B> const & b, long start_b, long end_b,
374 vector<long, QA(long)> & edits,
375 long edx1,
376 long polarity,
377 long p_lim)
378 {
379
380 I(start_a >= 0);
381 I(start_a <= a.size());
382 I(start_b >= 0);
383 I(start_b <= b.size());
384 I(end_a >= 0);
385 I(end_a <= a.size());
386 I(end_b >= 0);
387 I(end_b <= b.size());
388
389 long len_a = end_a - start_a;
390 long len_b = end_b - start_b;
391
392 I(len_a <= len_b);
393
394 // easy case #1: B inserts only
395 if (p_lim == 0)
396 {
397 // A == B, no edits
398 if (len_a == len_b)
399 return 0;
400
401 long adx = start_a;
402 long bdx = start_b;
403 long edx0 = edx1;
404
405 while (true)
406 {
407 if (bdx >= end_b)
408 return len_b - len_a;
409
410 if (adx >= end_a)
411 {
412 for (long idx = bdx, edx = edx0;
413 idx < end_b;
414 ++idx, ++edx)
415 edits[edx] = polarity * (idx+1);
416
417 return len_b - len_a;
418 }
419
420 if (a[adx] == b[bdx])
421 {
422 ++adx; ++bdx;
423 }
424 else
425 {
426 edits[edx0] = polarity * (bdx+1);
427 ++bdx; ++edx0;
428 }
429 }
430 }
431
432 // easy case #2: delete all A, insert all B
433 else if (len_a <= p_lim)
434 {
435 I(len_a == p_lim);
436
437 long edx0 = edx1;
438 for (long idx = start_a; idx < end_a; ++idx, ++edx0)
439 edits[edx0] = polarity * (-1 - idx);
440
441 for (long jdx = start_b; jdx < end_b; ++jdx, ++edx0)
442 edits[edx0] = polarity * (jdx + 1);
443
444 return len_a + len_b;
445 }
446
447 // hard case: recurse on subproblems
448 else
449 {
450 return divide_and_conquer (a, start_a, end_a,
451 b, start_b, end_b,
452 edits, edx1, polarity, p_lim);
453 }
454 }
455
456 static void diff_to_edits(subarray<A> const & a, long m,
457 subarray<B> const & b, long n,
458 vector<long, QA(long)> & edits,
459 long p_lim)
460 {
461 I(m <= n);
462 cost_vec costs(m+n); // scratch array, ignored
463 long edit_distance = compare(costs, a, m, b, n, p_lim, false);
464
465 edits.clear();
466 edits.resize(edit_distance, 0);
467 long cost = diff_to_et(a, 0, m,
468 b, 0, n,
469 edits, 0, 1, (edit_distance - (n-m)) / 2);
470 I(cost == edit_distance);
471 }
472
473 static void edits_to_lcs (vector<long, QA(long)> const & edits,
474 subarray<A> const a, long m, long n,
475 LCS output)
476 {
477 long edx = 0, sdx = 0, adx = 0;
478 typedef typename iterator_traits<A>::value_type vt;
479 vector<vt, QA(vt)> lcs(((m + n) - edits.size()) / 2);
480 while (true)
481 {
482 long edit = (edx < static_cast<long>(edits.size())) ? edits[edx] : 0;
483
484 if (adx >= m)
485 break;
486 else if (edit > 0)
487 { ++edx; }
488 else if (edit == 0)
489 { lcs[sdx++] = a[adx++]; }
490 else if (adx >= (-1 - edit))
491 { ++edx; ++adx; }
492 else
493 { lcs[sdx++] = a[adx++]; }
494 }
495
496 copy(lcs.begin(), lcs.end(), output);
497 }
498};
499
500
501template <typename A,
502 typename B,
503 typename LCS>
504void _edit_script(A begin_a, A end_a,
505 B begin_b, B end_b,
506 long p_lim,
507 vector<long, QA(long)> & edits_out,
508 LCS ignored_out)
509{
510 typedef jaffer_edit_calculator<A,B,LCS> calc_t;
511 long len_a = end_a - begin_a;
512 long len_b = end_b - begin_b;
513 typename calc_t::edit_vec edits, ordered;
514
515 typename calc_t::template subarray<A> a(begin_a, 0, len_a);
516 typename calc_t::template subarray<B> b(begin_b, 0, len_b);
517
518 if (len_b < len_a)
519 {
520 calc_t::diff_to_edits (b, len_b, a, len_a, edits, p_lim);
521 calc_t::order_edits (edits, -1, ordered);
522 for (size_t i = 0; i < ordered.size(); ++i)
523 ordered[i] *= -1;
524 }
525 else
526 {
527 calc_t::diff_to_edits (a, len_a, b, len_b, edits, p_lim);
528 calc_t::order_edits (edits, 1, ordered);
529 }
530
531 edits_out.clear();
532 edits_out.reserve(ordered.size());
533 copy(ordered.begin(), ordered.end(), back_inserter(edits_out));
534}
535
536
537template <typename A,
538 typename B,
539 typename LCS>
540void _longest_common_subsequence(A begin_a, A end_a,
541 B begin_b, B end_b,
542 long p_lim,
543 LCS out)
544{
545 typedef jaffer_edit_calculator<A,B,LCS> calc_t;
546 long len_a = end_a - begin_a;
547 long len_b = end_b - begin_b;
548 typename calc_t::edit_vec edits, ordered;
549
550 typename calc_t::template subarray<A> a(begin_a, 0, len_a);
551 typename calc_t::template subarray<B> b(begin_b, 0, len_b);
552
553 if (len_b < len_a)
554 {
555 calc_t::diff_to_edits(b, len_b, a, len_a, edits, p_lim);
556 calc_t::order_edits(edits, -1, ordered);
557 calc_t::edits_to_lcs(ordered, b, len_b, len_a, out);
558 }
559 else
560 {
561 calc_t::diff_to_edits(a, len_a, b, len_b, edits, p_lim);
562 calc_t::order_edits(edits, 1, ordered);
563 calc_t::edits_to_lcs(ordered, a, len_a, len_b, out);
564 }
565}
566
567
568void
569longest_common_subsequence(vector<long, QA(long)>::const_iterator begin_a,
570 vector<long, QA(long)>::const_iterator end_a,
571 vector<long, QA(long)>::const_iterator begin_b,
572 vector<long, QA(long)>::const_iterator end_b,
573 long p_lim,
574 back_insert_iterator< vector<long, QA(long)> > lcs)
575{
576 _longest_common_subsequence(begin_a, end_a,
577 begin_b, end_b,
578 p_lim, lcs);
579}
580
581void
582edit_script(vector<long, QA(long)>::const_iterator begin_a,
583 vector<long, QA(long)>::const_iterator end_a,
584 vector<long, QA(long)>::const_iterator begin_b,
585 vector<long, QA(long)>::const_iterator end_b,
586 long p_lim,
587 vector<long, QA(long)> & edits_out)
588{
589 vector<long, QA(long)> lcs;
590 _edit_script(begin_a, end_a,
591 begin_b, end_b,
592 p_lim, edits_out,
593 back_inserter(lcs));
594}
595
596// Local Variables:
597// mode: C++
598// fill-column: 76
599// c-file-style: "gnu"
600// indent-tabs-mode: nil
601// End:
602// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status