monotone

monotone Mtn Source Tree

Root/lcs.cc

1/*
2
3 this is a pretty direct translation (with only vague understanding,
4 unfortunately) of aubrey jaffer's most recent O(NP) edit-script
5 calculation algorithm, which performs quite a bit better than myers,
6 manber and miller's O(NP) simple edit *distance* algorithm. this one
7 builds the entire *script* that fast.
8
9 the following is jaffer's copyright and license statement. it probably
10 still has some legal relevance here, as this is a highly derivative
11 work. if not, the portions of this file which are "mine" (if any exist)
12 are subject to copyright (C) 2003 graydon hoare, licensed to the public
13 under the GPL v2+. see the file COPYING for details. if you want to see
14 more of the original file jaffer's work came from, see the SLIB
15 repository on savannah.nongnu.org, his website at
16 http://www.swiss.ai.mit.edu/~jaffer/, or look in the journal of
17 computational biology. apparantly it's submitted for publication there
18 too.
19
20 ---
21
22 "differ.scm" O(NP) Sequence Comparison Algorithm.
23 Copyright (C) 2001, 2002, 2003 Aubrey Jaffer
24
25 Permission to copy this software, to modify it, to redistribute it, to
26 distribute modified versions, and to use it for any purpose is granted,
27 subject to the following restrictions and understandings.
28
29 1. Any copy made of this software must include this copyright notice
30 in full.
31
32 2. I have made no warrantee or representation that the operation of
33 this software will be error-free, and I am under no obligation to
34 provide any services, by way of maintenance, update, or otherwise.
35
36 3. In conjunction with products arising from the use of this material,
37 there shall be no use of my name in any advertising, promotional, or
38 sales literature without prior written consent in each case.
39
40*/
41
42#include "base.hh"
43#include <algorithm>
44#include "vector.hh"
45
46#include "lcs.hh"
47#include "sanity.hh"
48
49using std::back_insert_iterator;
50using std::back_inserter;
51using std::copy;
52using std::iterator_traits;
53using std::max;
54using std::min;
55using std::sort;
56using std::vector;
57
58struct work_vec
59{
60 long lo;
61 long hi;
62 static vector<long, QA(long)> vec;
63 work_vec(long lo, long hi) :
64 lo(lo), hi(hi)
65 {
66 // I(hi >= lo);
67 size_t len = (hi - lo) + 1;
68 vec.resize(len);
69 vec.assign(len, -1);
70 }
71
72 inline long & operator[](long t)
73 {
74 // I(t >= lo && t <= hi);
75 return vec[t-lo];
76 }
77};
78
79vector<long, QA(long)> work_vec::vec;
80
81template <typename A,
82 typename B,
83 typename LCS>
84struct jaffer_edit_calculator
85{
86
87 typedef vector<long, QA(long)> cost_vec;
88 typedef vector<long, QA(long)> edit_vec;
89
90 template <typename T>
91 struct subarray
92 {
93 typedef typename iterator_traits<T>::value_type vt;
94
95 T base; // underlying representation
96 long start; // current extent
97 long end; // current extent
98
99 subarray(T b, long s, long e) :
100 base(b), start(s), end(e) {}
101
102 inline long size() const
103 {
104 if (end < start)
105 return start - end;
106 else
107 return end - start;
108 }
109
110 inline subarray subset(long s, long e) const
111 {
112 return subarray(base + min(start, end), s, e);
113 }
114
115 inline vt const & operator[](size_t idx) const
116 {
117 if (end < start)
118 return *(base + (start - (idx + 1)));
119 else
120 return *(base + (start + idx));
121 }
122 };
123
124 static long run(work_vec & fp, long k,
125 subarray<A> const & a, long m,
126 subarray<B> const & b, long n,
127 cost_vec & CC, long p)
128 {
129 long cost = k + 2*p;
130
131 // do the run
132 long y = max(fp[k-1]+1, fp[k+1]);
133 long x = y - k;
134
135 I(y >= 0);
136 I(x >= 0);
137
138 while (true)
139 {
140 // record costs along the way
141 long xcst = m - x;
142 if (y < static_cast<long>(CC.size()) && xcst >= 0)
143 {
144 CC[y] = min(xcst + cost, CC[y]);
145 }
146 if (x < m && y < n && a[x] == b[y])
147 {
148 ++x; ++y;
149 }
150 else
151 break;
152 }
153
154 fp[k] = y;
155 return y;
156 }
157
158 // 'compare' here is the core myers, manber and miller algorithm.
159 static long compare(cost_vec & costs,
160 subarray<A> const & a, long m,
161 subarray<B> const & b, long n,
162 long p_lim,
163 bool full_scan = true)
164 {
165 long lo = -(m+1), hi = (1+n);
166 if (full_scan)
167 {
168 lo = -(p_lim + 1);
169 hi = p_lim + 1 + (n-m);
170 }
171 work_vec fp(lo, hi);
172
173 long p = 0;
174 long delta = n - m;
175
176 for (; p <= p_lim; ++p)
177 {
178
179 // lower sweep
180 for (long k = -p; k < delta; ++k)
181 run(fp, k, a, m, b, n, costs, p);
182
183 // upper sweep
184 for (long k = delta + p; k > delta; --k)
185 run(fp, k, a, m, b, n, costs, p);
186
187 // middle
188 long fpval = run(fp, delta, a, m, b, n, costs, p);
189
190 // we can bail early if not doing a full scan
191 if (!full_scan && n <= fpval)
192 break;
193 }
194
195 return delta + 2*p;
196 }
197
198 static long divide_and_conquer(subarray<A> const & a, long start_a, long end_a,
199 subarray<B> const & b, long start_b, long end_b,
200 edit_vec & edits,
201 unsigned long edx,
202 long polarity,
203 long p_lim)
204 {
205 long mid_a = (start_a + end_a) / 2;
206 long len_b = end_b - start_b;
207 long len_a = end_a - start_a;
208 long tcst = (2 * p_lim) + (len_b - len_a);
209
210 I(start_a >= 0);
211 I(start_a <= a.size());
212 I(start_b >= 0);
213 I(start_b <= b.size());
214 I(end_a >= 0);
215 I(end_a <= a.size());
216 I(end_b >= 0);
217 I(end_b <= b.size());
218
219 cost_vec cc(len_b + 1, len_a + len_b);
220 cost_vec rr(len_b + 1, len_a + len_b);
221
222 compare (cc,
223 a.subset(start_a, mid_a), (mid_a - start_a),
224 b.subset(start_b, end_b), len_b, min(p_lim, len_a));
225
226 compare (rr,
227 a.subset(end_a, mid_a), (end_a - mid_a),
228 b.subset(end_b, start_b), len_b, min(p_lim, len_a));
229
230 long b_split = mid_split(len_a, len_b, rr, cc, tcst);
231
232 long est_c = cc[b_split];
233 long est_r = rr[len_b - b_split];
234
235 long cost_c = diff_to_et (a, start_a, mid_a,
236 b, start_b, start_b + b_split,
237 edits, edx, polarity,
238 (est_c - (b_split - (mid_a - start_a))) / 2);
239
240 I(cost_c == est_c);
241
242 long cost_r = diff_to_et (a, mid_a, end_a,
243 b, start_b + b_split, end_b,
244 edits, est_c + edx, polarity,
245 (est_r - ((len_b - b_split) - (end_a - mid_a))) / 2);
246
247 I(cost_r == est_r);
248
249 return est_r + est_c;
250 }
251
252 static long mid_split(long m, long n,
253 cost_vec const & rr,
254 cost_vec const & cc,
255 long cost)
256 {
257 long cdx = 1 + n/2;
258 long rdx = n/2;
259 while (true)
260 {
261 I (rdx >= 0);
262
263 if (cost == (cc[rdx] + rr[n-rdx]))
264 return rdx;
265 if (cost == (cc[cdx] + rr[n-cdx]))
266 return cdx;
267 --rdx;
268 ++cdx;
269 }
270 }
271
272
273 static void order_edits(edit_vec const & edits,
274 long sign,
275 edit_vec & nedits)
276 {
277 nedits.clear();
278 nedits.resize(edits.size());
279 long cost = edits.size();
280
281 if (cost == 0)
282 {
283 nedits = edits;
284 return;
285 }
286
287 edit_vec sedits = edits;
288 sort(sedits.begin(), sedits.end());
289
290 long idx0;
291 for (idx0 = 0; idx0 < cost && sedits[idx0] < 0; ++idx0) ;
292 long len_a = max(0L, -sedits[0]);
293 long len_b = sedits[cost-1];
294
295 long ddx = idx0 - 1;
296 long idx = idx0;
297 long ndx = 0;
298 long adx = 0;
299 long bdx = 0;
300
301 while (bdx < len_b || adx < len_a)
302 {
303
304 long del = (ddx < 0) ? 0 : sedits[ddx];
305 long ins = (idx >= cost) ? 0 : sedits[idx];
306
307 if (del < 0 && adx >= (-1 - del) &&
308 ins > 0 && bdx >= (-1 + ins))
309 {
310 nedits[ndx] = del;
311 nedits[ndx+1] = ins;
312 --ddx; ++idx; ndx += 2; ++adx; ++bdx;
313 }
314 else if (del < 0 && adx >= (-1 - del))
315 {
316 nedits[ndx] = del;
317 --ddx; ++ndx; ++adx;
318 }
319 else if (ins > 0 && bdx >= (-1 + ins))
320 {
321 nedits[ndx] = ins;
322 ++idx; ++ndx; ++bdx;
323 }
324 else
325 {
326 ++adx; ++bdx;
327 }
328 }
329 }
330
331 // trims and calls diff_to_ez
332 static long diff_to_et(subarray<A> const & a, long start_a, long end_a,
333 subarray<B> const & b, long start_b, long end_b,
334 vector<long, QA(long)> & edits,
335 long edx,
336 long polarity,
337 long p_lim)
338 {
339
340 I(start_a >= 0);
341 I(start_a <= a.size());
342 I(start_b >= 0);
343 I(start_b <= b.size());
344 I(end_a >= 0);
345 I(end_a <= a.size());
346 I(end_b >= 0);
347 I(end_b <= b.size());
348
349 I(end_a - start_a >= p_lim);
350
351 long bsx, bdx, asx, adx;
352
353 for (bdx = end_b - 1, adx = end_a - 1;
354 (start_b <= bdx) && (start_a <= adx) && (a[adx] == b[bdx]);
355 --bdx, --adx) ;
356
357 for (bsx = start_b, asx = start_a;
358 (bsx < bdx) && (asx < adx) && (a[asx] == b[bsx]);
359 ++bsx, ++asx) ;
360
361 // we've trimmed; now call diff_to_ez.
362
363 long delta = (bdx - bsx) - (adx - asx);
364 if (delta < 0)
365 return diff_to_ez (b, bsx, bdx+1,
366 a, asx, adx+1,
367 edits, edx, -polarity, delta + p_lim);
368 else
369 return diff_to_ez (a, asx, adx+1,
370 b, bsx, bdx+1,
371 edits, edx, polarity, p_lim);
372 }
373
374 static long diff_to_ez(subarray<A> const & a, long start_a, long end_a,
375 subarray<B> const & b, long start_b, long end_b,
376 vector<long, QA(long)> & edits,
377 long edx1,
378 long polarity,
379 long p_lim)
380 {
381
382 I(start_a >= 0);
383 I(start_a <= a.size());
384 I(start_b >= 0);
385 I(start_b <= b.size());
386 I(end_a >= 0);
387 I(end_a <= a.size());
388 I(end_b >= 0);
389 I(end_b <= b.size());
390
391 long len_a = end_a - start_a;
392 long len_b = end_b - start_b;
393
394 I(len_a <= len_b);
395
396 // easy case #1: B inserts only
397 if (p_lim == 0)
398 {
399 // A == B, no edits
400 if (len_a == len_b)
401 return 0;
402
403 long adx = start_a;
404 long bdx = start_b;
405 long edx0 = edx1;
406
407 while (true)
408 {
409 if (bdx >= end_b)
410 return len_b - len_a;
411
412 if (adx >= end_a)
413 {
414 for (long idx = bdx, edx = edx0;
415 idx < end_b;
416 ++idx, ++edx)
417 edits[edx] = polarity * (idx+1);
418
419 return len_b - len_a;
420 }
421
422 if (a[adx] == b[bdx])
423 {
424 ++adx; ++bdx;
425 }
426 else
427 {
428 edits[edx0] = polarity * (bdx+1);
429 ++bdx; ++edx0;
430 }
431 }
432 }
433
434 // easy case #2: delete all A, insert all B
435 else if (len_a <= p_lim)
436 {
437 I(len_a == p_lim);
438
439 long edx0 = edx1;
440 for (long idx = start_a; idx < end_a; ++idx, ++edx0)
441 edits[edx0] = polarity * (-1 - idx);
442
443 for (long jdx = start_b; jdx < end_b; ++jdx, ++edx0)
444 edits[edx0] = polarity * (jdx + 1);
445
446 return len_a + len_b;
447 }
448
449 // hard case: recurse on subproblems
450 else
451 {
452 return divide_and_conquer (a, start_a, end_a,
453 b, start_b, end_b,
454 edits, edx1, polarity, p_lim);
455 }
456 }
457
458 static void diff_to_edits(subarray<A> const & a, long m,
459 subarray<B> const & b, long n,
460 vector<long, QA(long)> & edits,
461 long p_lim)
462 {
463 I(m <= n);
464 cost_vec costs(m+n); // scratch array, ignored
465 long edit_distance = compare(costs, a, m, b, n, p_lim, false);
466
467 edits.clear();
468 edits.resize(edit_distance, 0);
469 long cost = diff_to_et(a, 0, m,
470 b, 0, n,
471 edits, 0, 1, (edit_distance - (n-m)) / 2);
472 I(cost == edit_distance);
473 }
474
475 static void edits_to_lcs (vector<long, QA(long)> const & edits,
476 subarray<A> const a, long m, long n,
477 LCS output)
478 {
479 long edx = 0, sdx = 0, adx = 0;
480 typedef typename iterator_traits<A>::value_type vt;
481 vector<vt, QA(vt)> lcs(((m + n) - edits.size()) / 2);
482 while (true)
483 {
484 long edit = (edx < static_cast<long>(edits.size())) ? edits[edx] : 0;
485
486 if (adx >= m)
487 break;
488 else if (edit > 0)
489 { ++edx; }
490 else if (edit == 0)
491 { lcs[sdx++] = a[adx++]; }
492 else if (adx >= (-1 - edit))
493 { ++edx; ++adx; }
494 else
495 { lcs[sdx++] = a[adx++]; }
496 }
497
498 copy(lcs.begin(), lcs.end(), output);
499 }
500};
501
502
503template <typename A,
504 typename B,
505 typename LCS>
506void _edit_script(A begin_a, A end_a,
507 B begin_b, B end_b,
508 long p_lim,
509 vector<long, QA(long)> & edits_out,
510 LCS ignored_out)
511{
512 typedef jaffer_edit_calculator<A,B,LCS> calc_t;
513 long len_a = end_a - begin_a;
514 long len_b = end_b - begin_b;
515 typename calc_t::edit_vec edits, ordered;
516
517 typename calc_t::template subarray<A> a(begin_a, 0, len_a);
518 typename calc_t::template subarray<B> b(begin_b, 0, len_b);
519
520 if (len_b < len_a)
521 {
522 calc_t::diff_to_edits (b, len_b, a, len_a, edits, p_lim);
523 calc_t::order_edits (edits, -1, ordered);
524 for (size_t i = 0; i < ordered.size(); ++i)
525 ordered[i] *= -1;
526 }
527 else
528 {
529 calc_t::diff_to_edits (a, len_a, b, len_b, edits, p_lim);
530 calc_t::order_edits (edits, 1, ordered);
531 }
532
533 edits_out.clear();
534 edits_out.reserve(ordered.size());
535 copy(ordered.begin(), ordered.end(), back_inserter(edits_out));
536}
537
538
539template <typename A,
540 typename B,
541 typename LCS>
542void _longest_common_subsequence(A begin_a, A end_a,
543 B begin_b, B end_b,
544 long p_lim,
545 LCS out)
546{
547 typedef jaffer_edit_calculator<A,B,LCS> calc_t;
548 long len_a = end_a - begin_a;
549 long len_b = end_b - begin_b;
550 typename calc_t::edit_vec edits, ordered;
551
552 typename calc_t::template subarray<A> a(begin_a, 0, len_a);
553 typename calc_t::template subarray<B> b(begin_b, 0, len_b);
554
555 if (len_b < len_a)
556 {
557 calc_t::diff_to_edits(b, len_b, a, len_a, edits, p_lim);
558 calc_t::order_edits(edits, -1, ordered);
559 calc_t::edits_to_lcs(ordered, b, len_b, len_a, out);
560 }
561 else
562 {
563 calc_t::diff_to_edits(a, len_a, b, len_b, edits, p_lim);
564 calc_t::order_edits(edits, 1, ordered);
565 calc_t::edits_to_lcs(ordered, a, len_a, len_b, out);
566 }
567}
568
569
570void
571longest_common_subsequence(vector<long, QA(long)>::const_iterator begin_a,
572 vector<long, QA(long)>::const_iterator end_a,
573 vector<long, QA(long)>::const_iterator begin_b,
574 vector<long, QA(long)>::const_iterator end_b,
575 long p_lim,
576 back_insert_iterator< vector<long, QA(long)> > lcs)
577{
578 _longest_common_subsequence(begin_a, end_a,
579 begin_b, end_b,
580 p_lim, lcs);
581}
582
583void
584edit_script(vector<long, QA(long)>::const_iterator begin_a,
585 vector<long, QA(long)>::const_iterator end_a,
586 vector<long, QA(long)>::const_iterator begin_b,
587 vector<long, QA(long)>::const_iterator end_b,
588 long p_lim,
589 vector<long, QA(long)> & edits_out)
590{
591 vector<long, QA(long)> lcs;
592 _edit_script(begin_a, end_a,
593 begin_b, end_b,
594 p_lim, edits_out,
595 back_inserter(lcs));
596}
597
598// Local Variables:
599// mode: C++
600// fill-column: 76
601// c-file-style: "gnu"
602// indent-tabs-mode: nil
603// End:
604// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status