monotone

monotone Mtn Source Tree

Root/lcs.cc

1/*
2
3 this is a pretty direct translation (with only vague understanding,
4 unfortunately) of aubrey jaffer's most recent O(NP) edit-script
5 calculation algorithm, which performs quite a bit better than myers,
6 manber and miller's O(NP) simple edit *distance* algorithm. this one
7 builds the entire *script* that fast.
8
9 the following is jaffer's copyright and license statement. it probably
10 still has some legal relevance here, as this is a highly derivative
11 work. if not, the portions of this file which are "mine" (if any exist)
12 are subject to copyright (C) 2003 graydon hoare, licensed to the public
13 under the GPL v2+. see the file COPYING for details. if you want to see
14 more of the original file jaffer's work came from, see the SLIB
15 repository on savannah.nongnu.org, or look in the journal of
16 computational biology. apparantly it's submitted for publication there
17 too.
18
19 ---
20
21 "differ.scm" O(NP) Sequence Comparison Algorithm.
22 Copyright (C) 2001, 2002, 2003 Aubrey Jaffer
23
24 Permission to copy this software, to modify it, to redistribute it, to
25 distribute modified versions, and to use it for any purpose is granted,
26 subject to the following restrictions and understandings.
27
28 1. Any copy made of this software must include this copyright notice
29 in full.
30
31 2. I have made no warrantee or representation that the operation of
32 this software will be error-free, and I am under no obligation to
33 provide any services, by way of maintenance, update, or otherwise.
34
35 3. In conjunction with products arising from the use of this material,
36 there shall be no use of my name in any advertising, promotional, or
37 sales literature without prior written consent in each case.
38
39*/
40
41#include "base.hh"
42#include <algorithm>
43#include <vector>
44
45#include "lcs.hh"
46#include "sanity.hh"
47
48using std::back_insert_iterator;
49using std::back_inserter;
50using std::copy;
51using std::iterator_traits;
52using std::max;
53using std::min;
54using std::sort;
55using std::vector;
56
57struct work_vec
58{
59 long lo;
60 long hi;
61 static vector<long, QA(long)> vec;
62 work_vec(long lo, long hi) :
63 lo(lo), hi(hi)
64 {
65 // I(hi >= lo);
66 size_t len = (hi - lo) + 1;
67 vec.resize(len);
68 vec.assign(len, -1);
69 }
70
71 inline long & operator[](long t)
72 {
73 // I(t >= lo && t <= hi);
74 return vec[t-lo];
75 }
76};
77
78vector<long, QA(long)> work_vec::vec;
79
80template <typename A,
81 typename B,
82 typename LCS>
83struct jaffer_edit_calculator
84{
85
86 typedef vector<long, QA(long)> cost_vec;
87 typedef vector<long, QA(long)> edit_vec;
88
89 template <typename T>
90 struct subarray
91 {
92 typedef typename iterator_traits<T>::value_type vt;
93
94 T base; // underlying representation
95 long start; // current extent
96 long end; // current extent
97
98 subarray(T b, long s, long e) :
99 base(b), start(s), end(e) {}
100
101 inline long size() const
102 {
103 if (end < start)
104 return start - end;
105 else
106 return end - start;
107 }
108
109 inline subarray subset(long s, long e) const
110 {
111 return subarray(base + min(start, end), s, e);
112 }
113
114 inline vt const & operator[](size_t idx) const
115 {
116 if (end < start)
117 return *(base + (start - (idx + 1)));
118 else
119 return *(base + (start + idx));
120 }
121 };
122
123 static long run(work_vec & fp, long k,
124 subarray<A> const & a, long m,
125 subarray<B> const & b, long n,
126 cost_vec & CC, long p)
127 {
128 long cost = k + 2*p;
129
130 // do the run
131 long y = max(fp[k-1]+1, fp[k+1]);
132 long x = y - k;
133
134 I(y >= 0);
135 I(x >= 0);
136
137 while (true)
138 {
139 // record costs along the way
140 long xcst = m - x;
141 if (y < static_cast<long>(CC.size()) && xcst >= 0)
142 {
143 CC[y] = min(xcst + cost, CC[y]);
144 }
145 if (x < m && y < n && a[x] == b[y])
146 {
147 ++x; ++y;
148 }
149 else
150 break;
151 }
152
153 fp[k] = y;
154 return y;
155 }
156
157 // 'compare' here is the core myers, manber and miller algorithm.
158 static long compare(cost_vec & costs,
159 subarray<A> const & a, long m,
160 subarray<B> const & b, long n,
161 long p_lim,
162 bool full_scan = true)
163 {
164 long lo = -(m+1), hi = (1+n);
165 if (full_scan)
166 {
167 lo = -(p_lim + 1);
168 hi = p_lim + 1 + (n-m);
169 }
170 work_vec fp(lo, hi);
171
172 long p = 0;
173 long delta = n - m;
174
175 for (; p <= p_lim; ++p)
176 {
177
178 // lower sweep
179 for (long k = -p; k < delta; ++k)
180 run(fp, k, a, m, b, n, costs, p);
181
182 // upper sweep
183 for (long k = delta + p; k > delta; --k)
184 run(fp, k, a, m, b, n, costs, p);
185
186 // middle
187 long fpval = run(fp, delta, a, m, b, n, costs, p);
188
189 // we can bail early if not doing a full scan
190 if (!full_scan && n <= fpval)
191 break;
192 }
193
194 return delta + 2*p;
195 }
196
197 static long divide_and_conquer(subarray<A> const & a, long start_a, long end_a,
198 subarray<B> const & b, long start_b, long end_b,
199 edit_vec & edits,
200 unsigned long edx,
201 long polarity,
202 long p_lim)
203 {
204 long mid_a = (start_a + end_a) / 2;
205 long len_b = end_b - start_b;
206 long len_a = end_a - start_a;
207 long tcst = (2 * p_lim) + (len_b - len_a);
208
209 I(start_a >= 0);
210 I(start_a <= a.size());
211 I(start_b >= 0);
212 I(start_b <= b.size());
213 I(end_a >= 0);
214 I(end_a <= a.size());
215 I(end_b >= 0);
216 I(end_b <= b.size());
217
218 cost_vec cc(len_b + 1, len_a + len_b);
219 cost_vec rr(len_b + 1, len_a + len_b);
220
221 compare (cc,
222 a.subset(start_a, mid_a), (mid_a - start_a),
223 b.subset(start_b, end_b), len_b, min(p_lim, len_a));
224
225 compare (rr,
226 a.subset(end_a, mid_a), (end_a - mid_a),
227 b.subset(end_b, start_b), len_b, min(p_lim, len_a));
228
229 long b_split = mid_split(len_a, len_b, rr, cc, tcst);
230
231 long est_c = cc[b_split];
232 long est_r = rr[len_b - b_split];
233
234 long cost_c = diff_to_et (a, start_a, mid_a,
235 b, start_b, start_b + b_split,
236 edits, edx, polarity,
237 (est_c - (b_split - (mid_a - start_a))) / 2);
238
239 I(cost_c == est_c);
240
241 long cost_r = diff_to_et (a, mid_a, end_a,
242 b, start_b + b_split, end_b,
243 edits, est_c + edx, polarity,
244 (est_r - ((len_b - b_split) - (end_a - mid_a))) / 2);
245
246 I(cost_r == est_r);
247
248 return est_r + est_c;
249 }
250
251 static long mid_split(long m, long n,
252 cost_vec const & rr,
253 cost_vec const & cc,
254 long cost)
255 {
256 long cdx = 1 + n/2;
257 long rdx = n/2;
258 while (true)
259 {
260 I (rdx >= 0);
261
262 if (cost == (cc[rdx] + rr[n-rdx]))
263 return rdx;
264 if (cost == (cc[cdx] + rr[n-cdx]))
265 return cdx;
266 --rdx;
267 ++cdx;
268 }
269 }
270
271
272 static void order_edits(edit_vec const & edits,
273 long sign,
274 edit_vec & nedits)
275 {
276 nedits.clear();
277 nedits.resize(edits.size());
278 long cost = edits.size();
279
280 if (cost == 0)
281 {
282 nedits = edits;
283 return;
284 }
285
286 edit_vec sedits = edits;
287 sort(sedits.begin(), sedits.end());
288
289 long idx0;
290 for (idx0 = 0; idx0 < cost && sedits[idx0] < 0; ++idx0);
291 long len_a = max(0L, -sedits[0]);
292 long len_b = sedits[cost-1];
293
294 long ddx = idx0 - 1;
295 long idx = idx0;
296 long ndx = 0;
297 long adx = 0;
298 long bdx = 0;
299
300 while (bdx < len_b || adx < len_a)
301 {
302
303 long del = (ddx < 0) ? 0 : sedits[ddx];
304 long ins = (idx >= cost) ? 0 : sedits[idx];
305
306 if (del < 0 && adx >= (-1 - del) &&
307 ins > 0 && bdx >= (-1 + ins))
308 {
309 nedits[ndx] = del;
310 nedits[ndx+1] = ins;
311 --ddx; ++idx; ndx += 2; ++adx; ++bdx;
312 }
313 else if (del < 0 && adx >= (-1 - del))
314 {
315 nedits[ndx] = del;
316 --ddx; ++ndx; ++adx;
317 }
318 else if (ins > 0 && bdx >= (-1 + ins))
319 {
320 nedits[ndx] = ins;
321 ++idx; ++ndx; ++bdx;
322 }
323 else
324 {
325 ++adx; ++bdx;
326 }
327 }
328 }
329
330 // trims and calls diff_to_ez
331 static long diff_to_et(subarray<A> const & a, long start_a, long end_a,
332 subarray<B> const & b, long start_b, long end_b,
333 vector<long, QA(long)> & edits,
334 long edx,
335 long polarity,
336 long p_lim)
337 {
338
339 I(start_a >= 0);
340 I(start_a <= a.size());
341 I(start_b >= 0);
342 I(start_b <= b.size());
343 I(end_a >= 0);
344 I(end_a <= a.size());
345 I(end_b >= 0);
346 I(end_b <= b.size());
347
348 I(end_a - start_a >= p_lim);
349
350 long bsx, bdx, asx, adx;
351
352 for (bdx = end_b - 1, adx = end_a - 1;
353 (start_b <= bdx) && (start_a <= adx) && (a[adx] == b[bdx]);
354 --bdx, --adx);
355
356 for (bsx = start_b, asx = start_a;
357 (bsx < bdx) && (asx < adx) && (a[asx] == b[bsx]);
358 ++bsx, ++asx);
359
360 // we've trimmed; now call diff_to_ez.
361
362 long delta = (bdx - bsx) - (adx - asx);
363 if (delta < 0)
364 return diff_to_ez (b, bsx, bdx+1,
365 a, asx, adx+1,
366 edits, edx, -polarity, delta + p_lim);
367 else
368 return diff_to_ez (a, asx, adx+1,
369 b, bsx, bdx+1,
370 edits, edx, polarity, p_lim);
371 }
372
373 static long diff_to_ez(subarray<A> const & a, long start_a, long end_a,
374 subarray<B> const & b, long start_b, long end_b,
375 vector<long, QA(long)> & edits,
376 long edx1,
377 long polarity,
378 long p_lim)
379 {
380
381 I(start_a >= 0);
382 I(start_a <= a.size());
383 I(start_b >= 0);
384 I(start_b <= b.size());
385 I(end_a >= 0);
386 I(end_a <= a.size());
387 I(end_b >= 0);
388 I(end_b <= b.size());
389
390 long len_a = end_a - start_a;
391 long len_b = end_b - start_b;
392
393 I(len_a <= len_b);
394
395 // easy case #1: B inserts only
396 if (p_lim == 0)
397 {
398 // A == B, no edits
399 if (len_a == len_b)
400 return 0;
401
402 long adx = start_a;
403 long bdx = start_b;
404 long edx0 = edx1;
405
406 while (true)
407 {
408 if (bdx >= end_b)
409 return len_b - len_a;
410
411 if (adx >= end_a)
412 {
413 for (long idx = bdx, edx = edx0;
414 idx < end_b;
415 ++idx, ++edx)
416 edits[edx] = polarity * (idx+1);
417
418 return len_b - len_a;
419 }
420
421 if (a[adx] == b[bdx])
422 {
423 ++adx; ++bdx;
424 }
425 else
426 {
427 edits[edx0] = polarity * (bdx+1);
428 ++bdx; ++edx0;
429 }
430 }
431 }
432
433 // easy case #2: delete all A, insert all B
434 else if (len_a <= p_lim)
435 {
436 I(len_a == p_lim);
437
438 long edx0 = edx1;
439 for (long idx = start_a; idx < end_a; ++idx, ++edx0)
440 edits[edx0] = polarity * (-1 - idx);
441
442 for (long jdx = start_b; jdx < end_b; ++jdx, ++edx0)
443 edits[edx0] = polarity * (jdx + 1);
444
445 return len_a + len_b;
446 }
447
448 // hard case: recurse on subproblems
449 else
450 {
451 return divide_and_conquer (a, start_a, end_a,
452 b, start_b, end_b,
453 edits, edx1, polarity, p_lim);
454 }
455 }
456
457 static void diff_to_edits(subarray<A> const & a, long m,
458 subarray<B> const & b, long n,
459 vector<long, QA(long)> & edits,
460 long p_lim)
461 {
462 I(m <= n);
463 cost_vec costs(m+n); // scratch array, ignored
464 long edit_distance = compare(costs, a, m, b, n, p_lim, false);
465
466 edits.clear();
467 edits.resize(edit_distance, 0);
468 long cost = diff_to_et(a, 0, m,
469 b, 0, n,
470 edits, 0, 1, (edit_distance - (n-m)) / 2);
471 I(cost == edit_distance);
472 }
473
474 static void edits_to_lcs (vector<long, QA(long)> const & edits,
475 subarray<A> const a, long m, long n,
476 LCS output)
477 {
478 long edx = 0, sdx = 0, adx = 0;
479 typedef typename iterator_traits<A>::value_type vt;
480 vector<vt, QA(vt)> lcs(((m + n) - edits.size()) / 2);
481 while (true)
482 {
483 long edit = (edx < static_cast<long>(edits.size())) ? edits[edx] : 0;
484
485 if (adx >= m)
486 break;
487 else if (edit > 0)
488 { ++edx; }
489 else if (edit == 0)
490 { lcs[sdx++] = a[adx++]; }
491 else if (adx >= (-1 - edit))
492 { ++edx; ++adx; }
493 else
494 { lcs[sdx++] = a[adx++]; }
495 }
496
497 copy(lcs.begin(), lcs.end(), output);
498 }
499};
500
501
502template <typename A,
503 typename B,
504 typename LCS>
505void _edit_script(A begin_a, A end_a,
506 B begin_b, B end_b,
507 long p_lim,
508 vector<long, QA(long)> & edits_out,
509 LCS ignored_out)
510{
511 typedef jaffer_edit_calculator<A,B,LCS> calc_t;
512 long len_a = end_a - begin_a;
513 long len_b = end_b - begin_b;
514 typename calc_t::edit_vec edits, ordered;
515
516 typename calc_t::template subarray<A> a(begin_a, 0, len_a);
517 typename calc_t::template subarray<B> b(begin_b, 0, len_b);
518
519 if (len_b < len_a)
520 {
521 calc_t::diff_to_edits (b, len_b, a, len_a, edits, p_lim);
522 calc_t::order_edits (edits, -1, ordered);
523 for (size_t i = 0; i < ordered.size(); ++i)
524 ordered[i] *= -1;
525 }
526 else
527 {
528 calc_t::diff_to_edits (a, len_a, b, len_b, edits, p_lim);
529 calc_t::order_edits (edits, 1, ordered);
530 }
531
532 edits_out.clear();
533 edits_out.reserve(ordered.size());
534 copy(ordered.begin(), ordered.end(), back_inserter(edits_out));
535}
536
537
538template <typename A,
539 typename B,
540 typename LCS>
541void _longest_common_subsequence(A begin_a, A end_a,
542 B begin_b, B end_b,
543 long p_lim,
544 LCS out)
545{
546 typedef jaffer_edit_calculator<A,B,LCS> calc_t;
547 long len_a = end_a - begin_a;
548 long len_b = end_b - begin_b;
549 typename calc_t::edit_vec edits, ordered;
550
551 typename calc_t::template subarray<A> a(begin_a, 0, len_a);
552 typename calc_t::template subarray<B> b(begin_b, 0, len_b);
553
554 if (len_b < len_a)
555 {
556 calc_t::diff_to_edits(b, len_b, a, len_a, edits, p_lim);
557 calc_t::order_edits(edits, -1, ordered);
558 calc_t::edits_to_lcs(ordered, b, len_b, len_a, out);
559 }
560 else
561 {
562 calc_t::diff_to_edits(a, len_a, b, len_b, edits, p_lim);
563 calc_t::order_edits(edits, 1, ordered);
564 calc_t::edits_to_lcs(ordered, a, len_a, len_b, out);
565 }
566}
567
568
569void
570longest_common_subsequence(vector<long, QA(long)>::const_iterator begin_a,
571 vector<long, QA(long)>::const_iterator end_a,
572 vector<long, QA(long)>::const_iterator begin_b,
573 vector<long, QA(long)>::const_iterator end_b,
574 long p_lim,
575 back_insert_iterator< vector<long, QA(long)> > lcs)
576{
577 _longest_common_subsequence(begin_a, end_a,
578 begin_b, end_b,
579 p_lim, lcs);
580}
581
582void
583edit_script(vector<long, QA(long)>::const_iterator begin_a,
584 vector<long, QA(long)>::const_iterator end_a,
585 vector<long, QA(long)>::const_iterator begin_b,
586 vector<long, QA(long)>::const_iterator end_b,
587 long p_lim,
588 vector<long, QA(long)> & edits_out)
589{
590 vector<long, QA(long)> lcs;
591 _edit_script(begin_a, end_a,
592 begin_b, end_b,
593 p_lim, edits_out,
594 back_inserter(lcs));
595}
596
597// Local Variables:
598// mode: C++
599// fill-column: 76
600// c-file-style: "gnu"
601// indent-tabs-mode: nil
602// End:
603// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status