monotone

monotone Mtn Source Tree

Root/lcs.cc

1/*
2
3 this is a pretty direct translation (with only vague understanding,
4 unfortunately) of aubrey jaffer's most recent O(NP) edit-script
5 calculation algorithm, which performs quite a bit better than myers,
6 manber and miller's O(NP) simple edit *distance* algorithm. this one
7 builds the entire *script* that fast.
8
9 the following is jaffer's copyright and license statement. it probably
10 still has some legal relevance here, as this is a highly derivative
11 work. if not, the portions of this file which are "mine" (if any exist)
12 are subject to copyright (C) 2003 graydon hoare, licensed to the public
13 under the GPL v2+. see the file COPYING for details. if you want to see
14 more of the original file jaffer's work came from, see the SLIB
15 repository on savannah.nongnu.org, or look in the journal of
16 computational biology. apparantly it's submitted for publication there
17 too.
18
19 ---
20
21 "differ.scm" O(NP) Sequence Comparison Algorithm.
22 Copyright (C) 2001, 2002, 2003 Aubrey Jaffer
23
24 Permission to copy this software, to modify it, to redistribute it, to
25 distribute modified versions, and to use it for any purpose is granted,
26 subject to the following restrictions and understandings.
27
28 1. Any copy made of this software must include this copyright notice
29 in full.
30
31 2. I have made no warrantee or representation that the operation of
32 this software will be error-free, and I am under no obligation to
33 provide any services, by way of maintenance, update, or otherwise.
34
35 3. In conjunction with products arising from the use of this material,
36 there shall be no use of my name in any advertising, promotional, or
37 sales literature without prior written consent in each case.
38
39*/
40
41#include "lcs.hh"
42
43#include <vector>
44#include <algorithm>
45#include <iostream>
46#include "sanity.hh"
47
48struct work_vec
49{
50 long lo;
51 long hi;
52 static std::vector<long, QA(long)> vec;
53 work_vec(long lo, long hi) :
54 lo(lo), hi(hi)
55 {
56 // I(hi >= lo);
57 size_t len = (hi - lo) + 1;
58 vec.resize(len);
59 vec.assign(len, -1);
60 }
61
62 inline long & operator[](long t)
63 {
64 // I(t >= lo && t <= hi);
65 return vec[t-lo];
66 }
67};
68
69std::vector<long, QA(long)> work_vec::vec;
70
71template <typename A,
72 typename B,
73 typename LCS>
74struct jaffer_edit_calculator
75{
76
77 typedef std::vector<long, QA(long)> cost_vec;
78 typedef std::vector<long, QA(long)> edit_vec;
79
80 template <typename T>
81 struct subarray
82 {
83 typedef typename std::iterator_traits<T>::value_type vt;
84
85 T base; // underlying representation
86 long start; // current extent
87 long end; // current extent
88
89 subarray(T b, long s, long e) :
90 base(b), start(s), end(e) {}
91
92 inline long size() const
93 {
94 if (end < start)
95return start - end;
96 else
97return end - start;
98 }
99
100 inline subarray subset(long s, long e) const
101 {
102 return subarray(base + std::min(start, end), s, e);
103 }
104
105 inline vt const & operator[](size_t idx) const
106 {
107 if (end < start)
108 return *(base + (start - (idx + 1)));
109 else
110 return *(base + (start + idx));
111 }
112 };
113
114 static long run(work_vec & fp, long k,
115 subarray<A> const & a, long m,
116 subarray<B> const & b, long n,
117 cost_vec & CC, long p)
118 {
119 long cost = k + 2*p;
120
121 // do the run
122 long y = std::max(fp[k-1]+1, fp[k+1]);
123 long x = y - k;
124
125 I(y >= 0);
126 I(x >= 0);
127
128 while (true)
129 {
130// record costs along the way
131long xcst = m - x;
132if (y < static_cast<long>(CC.size()) && xcst >= 0)
133 {
134 CC[y] = std::min(xcst + cost, CC[y]);
135 }
136if (x < m && y < n && a[x] == b[y])
137 {
138 ++x; ++y;
139 }
140else
141 break;
142 }
143
144 fp[k] = y;
145 return y;
146 }
147
148 // 'compare' here is the core myers, manber and miller algorithm.
149 static long compare(cost_vec & costs,
150 subarray<A> const & a, long m,
151 subarray<B> const & b, long n,
152 long p_lim,
153 bool full_scan = true)
154 {
155 long lo = -(m+1), hi = (1+n);
156 if (full_scan)
157 {
158lo = -(p_lim + 1);
159hi = p_lim + 1 + (n-m);
160 }
161 work_vec fp(lo, hi);
162
163 long p = 0;
164 long delta = n - m;
165
166 for (; p <= p_lim; ++p)
167 {
168
169// lower sweep
170for (long k = -p; k < delta; ++k)
171 run(fp, k, a, m, b, n, costs, p);
172
173// upper sweep
174for (long k = delta + p; k > delta; --k)
175 run(fp, k, a, m, b, n, costs, p);
176
177// middle
178long fpval = run(fp, delta, a, m, b, n, costs, p);
179
180// we can bail early if not doing a full scan
181if (!full_scan && n <= fpval)
182 break;
183 }
184
185 return delta + 2*p;
186 }
187
188 static long divide_and_conquer(subarray<A> const & a, long start_a, long end_a,
189 subarray<B> const & b, long start_b, long end_b,
190 edit_vec & edits,
191 unsigned long edx,
192 long polarity,
193 long p_lim)
194 {
195 long mid_a = (start_a + end_a) / 2;
196 long len_b = end_b - start_b;
197 long len_a = end_a - start_a;
198 long tcst = (2 * p_lim) + (len_b - len_a);
199
200 I(start_a >= 0);
201 I(start_a <= a.size());
202 I(start_b >= 0);
203 I(start_b <= b.size());
204 I(end_a >= 0);
205 I(end_a <= a.size());
206 I(end_b >= 0);
207 I(end_b <= b.size());
208
209 cost_vec cc(len_b + 1, len_a + len_b);
210 cost_vec rr(len_b + 1, len_a + len_b);
211
212 compare (cc,
213 a.subset(start_a, mid_a), (mid_a - start_a),
214 b.subset(start_b, end_b), len_b, std::min(p_lim, len_a));
215
216 compare (rr,
217 a.subset(end_a, mid_a), (end_a - mid_a),
218 b.subset(end_b, start_b), len_b, std::min(p_lim, len_a));
219
220 long b_split = mid_split(len_a, len_b, rr, cc, tcst);
221
222 long est_c = cc[b_split];
223 long est_r = rr[len_b - b_split];
224
225 long cost_c = diff_to_et (a, start_a, mid_a,
226 b, start_b, start_b + b_split,
227 edits, edx, polarity,
228 (est_c - (b_split - (mid_a - start_a))) / 2);
229
230 I(cost_c == est_c);
231
232 long cost_r = diff_to_et (a, mid_a, end_a,
233 b, start_b + b_split, end_b,
234 edits, est_c + edx, polarity,
235 (est_r - ((len_b - b_split) - (end_a - mid_a))) / 2);
236
237 I(cost_r == est_r);
238
239 return est_r + est_c;
240 }
241
242 static long mid_split(long m, long n,
243cost_vec const & rr,
244cost_vec const & cc,
245long cost)
246 {
247 long cdx = 1 + n/2;
248 long rdx = n/2;
249 while (true)
250 {
251I (rdx >= 0);
252
253if (cost == (cc[rdx] + rr[n-rdx]))
254 return rdx;
255if (cost == (cc[cdx] + rr[n-cdx]))
256 return cdx;
257--rdx;
258++cdx;
259 }
260 }
261
262
263 static void order_edits(edit_vec const & edits,
264 long sign,
265 edit_vec & nedits)
266 {
267 nedits.clear();
268 nedits.resize(edits.size());
269 long cost = edits.size();
270
271 if (cost == 0)
272 {
273nedits = edits;
274return;
275 }
276
277 edit_vec sedits = edits;
278 std::sort(sedits.begin(), sedits.end());
279
280 long idx0;
281 for (idx0 = 0; idx0 < cost && sedits[idx0] < 0; ++idx0);
282 long len_a = std::max(0L, -sedits[0]);
283 long len_b = sedits[cost-1];
284
285 long ddx = idx0 - 1;
286 long idx = idx0;
287 long ndx = 0;
288 long adx = 0;
289 long bdx = 0;
290
291 while (bdx < len_b || adx < len_a)
292 {
293
294long del = (ddx < 0) ? 0 : sedits[ddx];
295long ins = (idx >= cost) ? 0 : sedits[idx];
296
297if (del < 0 && adx >= (-1 - del) &&
298 ins > 0 && bdx >= (-1 + ins))
299 {
300 nedits[ndx] = del;
301 nedits[ndx+1] = ins;
302 --ddx; ++idx; ndx += 2; ++adx; ++bdx;
303 }
304else if (del < 0 && adx >= (-1 - del))
305 {
306 nedits[ndx] = del;
307 --ddx; ++ndx; ++adx;
308 }
309else if (ins > 0 && bdx >= (-1 + ins))
310 {
311 nedits[ndx] = ins;
312 ++idx; ++ndx; ++bdx;
313 }
314else
315 {
316 ++adx; ++bdx;
317 }
318 }
319 }
320
321 // trims and calls diff_to_ez
322 static long diff_to_et(subarray<A> const & a, long start_a, long end_a,
323 subarray<B> const & b, long start_b, long end_b,
324 std::vector<long, QA(long)> & edits,
325 long edx,
326 long polarity,
327 long p_lim)
328 {
329
330 I(start_a >= 0);
331 I(start_a <= a.size());
332 I(start_b >= 0);
333 I(start_b <= b.size());
334 I(end_a >= 0);
335 I(end_a <= a.size());
336 I(end_b >= 0);
337 I(end_b <= b.size());
338
339 I(end_a - start_a >= p_lim);
340
341 long bsx, bdx, asx, adx;
342
343 for (bdx = end_b - 1, adx = end_a - 1;
344 (start_b <= bdx) && (start_a <= adx) && (a[adx] == b[bdx]);
345 --bdx, --adx);
346
347 for (bsx = start_b, asx = start_a;
348 (bsx < bdx) && (asx < adx) && (a[asx] == b[bsx]);
349 ++bsx, ++asx);
350
351 // we've trimmed; now call diff_to_ez.
352
353 long delta = (bdx - bsx) - (adx - asx);
354 if (delta < 0)
355 return diff_to_ez (b, bsx, bdx+1,
356 a, asx, adx+1,
357 edits, edx, -polarity, delta + p_lim);
358 else
359 return diff_to_ez (a, asx, adx+1,
360 b, bsx, bdx+1,
361 edits, edx, polarity, p_lim);
362 }
363
364 static long diff_to_ez(subarray<A> const & a, long start_a, long end_a,
365 subarray<B> const & b, long start_b, long end_b,
366 std::vector<long, QA(long)> & edits,
367 long edx1,
368 long polarity,
369 long p_lim)
370 {
371
372 I(start_a >= 0);
373 I(start_a <= a.size());
374 I(start_b >= 0);
375 I(start_b <= b.size());
376 I(end_a >= 0);
377 I(end_a <= a.size());
378 I(end_b >= 0);
379 I(end_b <= b.size());
380
381 long len_a = end_a - start_a;
382 long len_b = end_b - start_b;
383
384 I(len_a <= len_b);
385
386 // easy case #1: B inserts only
387 if (p_lim == 0)
388 {
389// A == B, no edits
390if (len_a == len_b)
391 return 0;
392
393long adx = start_a;
394long bdx = start_b;
395long edx0 = edx1;
396
397while (true)
398 {
399 if (bdx >= end_b)
400 return len_b - len_a;
401
402 if (adx >= end_a)
403 {
404for (long idx = bdx, edx = edx0;
405 idx < end_b;
406 ++idx, ++edx)
407 edits[edx] = polarity * (idx+1);
408
409return len_b - len_a;
410 }
411
412 if (a[adx] == b[bdx])
413 {
414++adx; ++bdx;
415 }
416 else
417 {
418edits[edx0] = polarity * (bdx+1);
419++bdx; ++edx0;
420 }
421 }
422 }
423
424 // easy case #2: delete all A, insert all B
425 else if (len_a <= p_lim)
426 {
427I(len_a == p_lim);
428
429long edx0 = edx1;
430for (long idx = start_a; idx < end_a; ++idx, ++edx0)
431 edits[edx0] = polarity * (-1 - idx);
432
433for (long jdx = start_b; jdx < end_b; ++jdx, ++edx0)
434 edits[edx0] = polarity * (jdx + 1);
435
436return len_a + len_b;
437 }
438
439 // hard case: recurse on subproblems
440 else
441 {
442return divide_and_conquer (a, start_a, end_a,
443 b, start_b, end_b,
444 edits, edx1, polarity, p_lim);
445 }
446 }
447
448 static void diff_to_edits(subarray<A> const & a, long m,
449 subarray<B> const & b, long n,
450 std::vector<long, QA(long)> & edits,
451 long p_lim)
452 {
453 I(m <= n);
454 cost_vec costs(m+n); // scratch array, ignored
455 long edit_distance = compare(costs, a, m, b, n, p_lim, false);
456
457 edits.clear();
458 edits.resize(edit_distance, 0);
459 long cost = diff_to_et(a, 0, m,
460 b, 0, n,
461 edits, 0, 1, (edit_distance - (n-m)) / 2);
462 I(cost == edit_distance);
463 }
464
465 static void edits_to_lcs (std::vector<long, QA(long)> const & edits,
466 subarray<A> const a, long m, long n,
467 LCS output)
468 {
469 long edx = 0, sdx = 0, adx = 0;
470 typedef typename std::iterator_traits<A>::value_type vt;
471 std::vector<vt, QA(vt)> lcs(((m + n) - edits.size()) / 2);
472 while (true)
473 {
474long edit = (edx < static_cast<long>(edits.size())) ? edits[edx] : 0;
475
476if (adx >= m)
477 break;
478else if (edit > 0)
479 { ++edx; }
480else if (edit == 0)
481 { lcs[sdx++] = a[adx++]; }
482else if (adx >= (-1 - edit))
483 { ++edx; ++adx; }
484else
485 { lcs[sdx++] = a[adx++]; }
486 }
487
488 std::copy(lcs.begin(), lcs.end(), output);
489 }
490};
491
492
493template <typename A,
494 typename B,
495 typename LCS>
496void _edit_script(A begin_a, A end_a,
497 B begin_b, B end_b,
498 long p_lim,
499 std::vector<long, QA(long)> & edits_out,
500 LCS ignored_out)
501{
502 typedef jaffer_edit_calculator<A,B,LCS> calc_t;
503 long len_a = end_a - begin_a;
504 long len_b = end_b - begin_b;
505 typename calc_t::edit_vec edits, ordered;
506
507 typename calc_t::template subarray<A> a(begin_a, 0, len_a);
508 typename calc_t::template subarray<B> b(begin_b, 0, len_b);
509
510 if (len_b < len_a)
511 {
512 calc_t::diff_to_edits (b, len_b, a, len_a, edits, p_lim);
513 calc_t::order_edits (edits, -1, ordered);
514 for (size_t i = 0; i < ordered.size(); ++i)
515ordered[i] *= -1;
516 }
517 else
518 {
519 calc_t::diff_to_edits (a, len_a, b, len_b, edits, p_lim);
520 calc_t::order_edits (edits, 1, ordered);
521 }
522
523 edits_out.clear();
524 edits_out.reserve(ordered.size());
525 copy(ordered.begin(), ordered.end(), back_inserter(edits_out));
526}
527
528
529template <typename A,
530 typename B,
531 typename LCS>
532void _longest_common_subsequence(A begin_a, A end_a,
533 B begin_b, B end_b,
534 long p_lim,
535 LCS out)
536{
537 typedef jaffer_edit_calculator<A,B,LCS> calc_t;
538 long len_a = end_a - begin_a;
539 long len_b = end_b - begin_b;
540 typename calc_t::edit_vec edits, ordered;
541
542 typename calc_t::template subarray<A> a(begin_a, 0, len_a);
543 typename calc_t::template subarray<B> b(begin_b, 0, len_b);
544
545 if (len_b < len_a)
546 {
547 calc_t::diff_to_edits (b, len_b, a, len_a, edits, p_lim);
548 calc_t::order_edits (edits, -1, ordered);
549 calc_t::edits_to_lcs (ordered, b, len_b, len_a, out);
550 }
551 else
552 {
553 calc_t::diff_to_edits (a, len_a, b, len_b, edits, p_lim);
554 calc_t::order_edits (edits, 1, ordered);
555 calc_t::edits_to_lcs (ordered, a, len_a, len_b, out);
556 }
557}
558
559
560void
561longest_common_subsequence(std::vector<long, QA(long)>::const_iterator begin_a,
562 std::vector<long, QA(long)>::const_iterator end_a,
563 std::vector<long, QA(long)>::const_iterator begin_b,
564 std::vector<long, QA(long)>::const_iterator end_b,
565 long p_lim,
566 std::back_insert_iterator< std::vector<long, QA(long)> > lcs)
567{
568 _longest_common_subsequence(begin_a, end_a,
569 begin_b, end_b,
570 p_lim, lcs);
571}
572
573void
574edit_script(std::vector<long, QA(long)>::const_iterator begin_a,
575 std::vector<long, QA(long)>::const_iterator end_a,
576 std::vector<long, QA(long)>::const_iterator begin_b,
577 std::vector<long, QA(long)>::const_iterator end_b,
578 long p_lim,
579 std::vector<long, QA(long)> & edits_out)
580{
581 std::vector<long, QA(long)> lcs;
582 _edit_script(begin_a, end_a,
583 begin_b, end_b,
584 p_lim, edits_out,
585 back_inserter(lcs));
586}

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status