monotone

monotone Mtn Source Tree

Root/lcs.hh

1#ifndef __LCS_HH__
2#define __LCS_HH__
3
4/*
5
6 this is a pretty direct translation (with only vague understanding,
7 unfortunately) of aubrey jaffer's most recent O(NP) edit-script
8 calculation algorithm, which performs quite a bit better than myers,
9 manber and miller's O(NP) simple edit *distance* algorithm. this one
10 builds the entire *script* that fast.
11
12 the following is jaffer's copyright and license statement. it probably
13 still has some legal relevance here, as this is a highly derivative
14 work. if not, the portions of this file which are "mine" (if any exist)
15 are subject to copyright (C) 2003 graydon hoare, licensed to the public
16 under the GPL v2+. see the file COPYING for details. if you want to see
17 more of the original file jaffer's work came from, see the SLIB
18 repository on savannah.nongnu.org, or look in the journal of
19 computational biology. apparantly it's submitted for publication there
20 too.
21
22 ---
23
24 "differ.scm" O(NP) Sequence Comparison Algorithm.
25 Copyright (C) 2001, 2002, 2003 Aubrey Jaffer
26
27 Permission to copy this software, to modify it, to redistribute it, to
28 distribute modified versions, and to use it for any purpose is granted,
29 subject to the following restrictions and understandings.
30
31 1. Any copy made of this software must include this copyright notice
32 in full.
33
34 2. I have made no warrantee or representation that the operation of
35 this software will be error-free, and I am under no obligation to
36 provide any services, by way of maintenance, update, or otherwise.
37
38 3. In conjunction with products arising from the use of this material,
39 there shall be no use of my name in any advertising, promotional, or
40 sales literature without prior written consent in each case.
41
42*/
43
44#include <vector>
45#include <algorithm>
46#include <iostream>
47#include "sanity.hh"
48
49template <typename A,
50 typename B,
51 typename LCS>
52struct jaffer_edit_calculator
53{
54
55 typedef std::vector<long> cost_vec;
56 typedef std::vector<long> edit_vec;
57
58 template <typename T>
59 struct subarray
60 {
61 typedef typename std::iterator_traits<T>::value_type vt;
62
63 T base; // underlying representation
64 long start; // current extent
65 long end; // current extent
66
67 subarray(T b, long s, long e) :
68 base(b), start(s), end(e) {}
69
70 long size() const
71 {
72 if (end < start)
73return start - end;
74 else
75return end - start;
76 }
77
78 subarray subset(long s, long e) const
79 {
80 return subarray(base + std::min(start, end), s, e);
81 }
82
83 vt const & operator[](size_t idx) const
84 {
85 if (end < start)
86 return *(base + (start - (idx + 1)));
87 else
88 return *(base + (start + idx));
89 }
90 };
91
92 struct work_vec
93 {
94 long lo;
95 long hi;
96 std::vector<long> vec;
97 work_vec(long lo, long hi) :
98 lo(lo), hi(hi), vec((hi - lo) + 1, -1)
99 {}
100 long & operator[](long t)
101 {
102 I(t >= lo);
103 I(t <= hi);
104 I((t-lo) <= static_cast<long>(vec.size()));
105 return vec[t-lo];
106 }
107 };
108
109
110 static long run(work_vec & fp, long k,
111 subarray<A> const & a, long m,
112 subarray<B> const & b, long n,
113 cost_vec & CC, long p)
114 {
115 long cost = k + 2*p;
116
117 // do the run
118 long y = std::max(fp[k-1]+1, fp[k+1]);
119 long x = y - k;
120
121 I(y >= 0);
122 I(x >= 0);
123
124 while (true)
125 {
126// record costs along the way
127long xcst = m - x;
128if (y < static_cast<long>(CC.size()) && xcst >= 0)
129 {
130 CC[y] = std::min(xcst + cost, CC[y]);
131 }
132if (x < m && y < n && a[x] == b[y])
133 {
134 ++x; ++y;
135 }
136else
137 break;
138 }
139
140 fp[k] = y;
141 return y;
142 }
143
144 // 'compare' here is the core myers, manber and miller algorithm.
145 static long compare(cost_vec & costs,
146 subarray<A> const & a, long m,
147 subarray<B> const & b, long n,
148 long p_lim,
149 bool full_scan = true)
150 {
151 long lo = -(m+1), hi = (1+n);
152 if (full_scan)
153 {
154lo = -(p_lim + 1);
155hi = p_lim + 1 + (n-m);
156 }
157 work_vec fp(lo, hi);
158
159 long p = 0;
160 long delta = n - m;
161
162 for (; p <= p_lim; ++p)
163 {
164
165// lower sweep
166for (long k = -p; k < delta; ++k)
167 run(fp, k, a, m, b, n, costs, p);
168
169// upper sweep
170for (long k = delta + p; k > delta; --k)
171 run(fp, k, a, m, b, n, costs, p);
172
173// middle
174long fpval = run(fp, delta, a, m, b, n, costs, p);
175
176// we can bail early if not doing a full scan
177if (!full_scan && n <= fpval)
178 break;
179 }
180
181 return delta + 2*p;
182 }
183
184 static long divide_and_conquer(subarray<A> const & a, long start_a, long end_a,
185 subarray<B> const & b, long start_b, long end_b,
186 edit_vec & edits,
187 unsigned long edx,
188 long polarity,
189 long p_lim)
190 {
191 long mid_a = (start_a + end_a) / 2;
192 long len_b = end_b - start_b;
193 long len_a = end_a - start_a;
194 long tcst = (2 * p_lim) + (len_b - len_a);
195
196 I(start_a >= 0);
197 I(start_a <= a.size());
198 I(start_b >= 0);
199 I(start_b <= b.size());
200 I(end_a >= 0);
201 I(end_a <= a.size());
202 I(end_b >= 0);
203 I(end_b <= b.size());
204
205 cost_vec cc(len_b + 1, len_a + len_b);
206 cost_vec rr(len_b + 1, len_a + len_b);
207
208 compare (cc,
209 a.subset(start_a, mid_a), (mid_a - start_a),
210 b.subset(start_b, end_b), len_b, std::min(p_lim, len_a));
211
212 compare (rr,
213 a.subset(end_a, mid_a), (end_a - mid_a),
214 b.subset(end_b, start_b), len_b, std::min(p_lim, len_a));
215
216 long b_split = mid_split(len_a, len_b, rr, cc, tcst);
217
218 long est_c = cc[b_split];
219 long est_r = rr[len_b - b_split];
220
221 long cost_c = diff_to_et (a, start_a, mid_a,
222 b, start_b, start_b + b_split,
223 edits, edx, polarity,
224 (est_c - (b_split - (mid_a - start_a))) / 2);
225
226 I(cost_c == est_c);
227
228 long cost_r = diff_to_et (a, mid_a, end_a,
229 b, start_b + b_split, end_b,
230 edits, est_c + edx, polarity,
231 (est_r - ((len_b - b_split) - (end_a - mid_a))) / 2);
232
233 I(cost_r == est_r);
234
235 return est_r + est_c;
236 }
237
238 static long mid_split(long m, long n,
239cost_vec const & rr,
240cost_vec const & cc,
241long cost)
242 {
243 long cdx = 1 + n/2;
244 long rdx = n/2;
245 while (true)
246 {
247I (rdx >= 0);
248
249if (cost == (cc[rdx] + rr[n-rdx]))
250 return rdx;
251if (cost == (cc[cdx] + rr[n-cdx]))
252 return cdx;
253--rdx;
254++cdx;
255 }
256 }
257
258
259 static void order_edits(edit_vec const & edits,
260 long sign,
261 edit_vec & nedits)
262 {
263 nedits.clear();
264 nedits.resize(edits.size());
265 long cost = edits.size();
266
267 if (cost == 0)
268 {
269nedits = edits;
270return;
271 }
272
273 edit_vec sedits = edits;
274 std::sort(sedits.begin(), sedits.end());
275
276 long idx0;
277 for (idx0 = 0; idx0 < cost && sedits[idx0] < 0; ++idx0);
278 long len_a = std::max(0L, -sedits[0]);
279 long len_b = sedits[cost-1];
280
281 long ddx = idx0 - 1;
282 long idx = idx0;
283 long ndx = 0;
284 long adx = 0;
285 long bdx = 0;
286
287 while (bdx < len_b || adx < len_a)
288 {
289
290long del = (ddx < 0) ? 0 : sedits[ddx];
291long ins = (idx >= cost) ? 0 : sedits[idx];
292
293if (del < 0 && adx >= (-1 - del) &&
294 ins > 0 && bdx >= (-1 + ins))
295 {
296 nedits[ndx] = del;
297 nedits[ndx+1] = ins;
298 --ddx; ++idx; ndx += 2; ++adx; ++bdx;
299 }
300else if (del < 0 && adx >= (-1 - del))
301 {
302 nedits[ndx] = del;
303 --ddx; ++ndx; ++adx;
304 }
305else if (ins > 0 && bdx >= (-1 + ins))
306 {
307 nedits[ndx] = ins;
308 ++idx; ++ndx; ++bdx;
309 }
310else
311 {
312 ++adx; ++bdx;
313 }
314 }
315 }
316
317 // trims and calls diff_to_ez
318 static long diff_to_et(subarray<A> const & a, long start_a, long end_a,
319 subarray<B> const & b, long start_b, long end_b,
320 std::vector<long> & edits,
321 long edx,
322 long polarity,
323 long p_lim)
324 {
325
326 I(start_a >= 0);
327 I(start_a <= a.size());
328 I(start_b >= 0);
329 I(start_b <= b.size());
330 I(end_a >= 0);
331 I(end_a <= a.size());
332 I(end_b >= 0);
333 I(end_b <= b.size());
334
335 I(end_a - start_a >= p_lim);
336
337 long bsx, bdx, asx, adx;
338
339 for (bdx = end_b - 1, adx = end_a - 1;
340 (start_b <= bdx) && (start_a <= adx) && (a[adx] == b[bdx]);
341 --bdx, --adx);
342
343 for (bsx = start_b, asx = start_a;
344 (bsx < bdx) && (asx < adx) && (a[asx] == b[bsx]);
345 ++bsx, ++asx);
346
347 // we've trimmed; now call diff_to_ez.
348
349 long delta = (bdx - bsx) - (adx - asx);
350 if (delta < 0)
351 return diff_to_ez (b, bsx, bdx+1,
352 a, asx, adx+1,
353 edits, edx, -polarity, delta + p_lim);
354 else
355 return diff_to_ez (a, asx, adx+1,
356 b, bsx, bdx+1,
357 edits, edx, polarity, p_lim);
358 }
359
360 static long diff_to_ez(subarray<A> const & a, long start_a, long end_a,
361 subarray<B> const & b, long start_b, long end_b,
362 std::vector<long> & edits,
363 long edx1,
364 long polarity,
365 long p_lim)
366 {
367
368 I(start_a >= 0);
369 I(start_a <= a.size());
370 I(start_b >= 0);
371 I(start_b <= b.size());
372 I(end_a >= 0);
373 I(end_a <= a.size());
374 I(end_b >= 0);
375 I(end_b <= b.size());
376
377 long len_a = end_a - start_a;
378 long len_b = end_b - start_b;
379
380 I(len_a <= len_b);
381
382 // easy case #1: B inserts only
383 if (p_lim == 0)
384 {
385// A == B, no edits
386if (len_a == len_b)
387 return 0;
388
389long adx = start_a;
390long bdx = start_b;
391long edx0 = edx1;
392
393while (true)
394 {
395 if (bdx >= end_b)
396 return len_b - len_a;
397
398 if (adx >= end_a)
399 {
400for (long idx = bdx, edx = edx0;
401 idx < end_b;
402 ++idx, ++edx)
403 edits[edx] = polarity * (idx+1);
404
405return len_b - len_a;
406 }
407
408 if (a[adx] == b[bdx])
409 {
410++adx; ++bdx;
411 }
412 else
413 {
414edits[edx0] = polarity * (bdx+1);
415++bdx; ++edx0;
416 }
417 }
418 }
419
420 // easy case #2: delete all A, insert all B
421 else if (len_a <= p_lim)
422 {
423I(len_a == p_lim);
424
425long edx0 = edx1;
426for (long idx = start_a; idx < end_a; ++idx, ++edx0)
427 edits[edx0] = polarity * (-1 - idx);
428
429for (long jdx = start_b; jdx < end_b; ++jdx, ++edx0)
430 edits[edx0] = polarity * (jdx + 1);
431
432return len_a + len_b;
433 }
434
435 // hard case: recurse on subproblems
436 else
437 {
438return divide_and_conquer (a, start_a, end_a,
439 b, start_b, end_b,
440 edits, edx1, polarity, p_lim);
441 }
442 }
443
444 static void diff_to_edits(subarray<A> const & a, long m,
445 subarray<B> const & b, long n,
446 std::vector<long> & edits,
447 long p_lim)
448 {
449 I(m <= n);
450 cost_vec costs(m+n); // scratch array, ignored
451 long edit_distance = compare(costs, a, m, b, n, p_lim, false);
452
453 edits.clear();
454 edits.resize(edit_distance, 0);
455 long cost = diff_to_et(a, 0, m,
456 b, 0, n,
457 edits, 0, 1, (edit_distance - (n-m)) / 2);
458 I(cost == edit_distance);
459 }
460
461 static void edits_to_lcs (std::vector<long> const & edits,
462 subarray<A> const a, long m, long n,
463 LCS output)
464 {
465 long edx = 0, sdx = 0, adx = 0;
466 typedef typename std::iterator_traits<A>::value_type vt;
467 std::vector<vt> lcs(((m + n) - edits.size()) / 2);
468 while (true)
469 {
470long edit = (edx < static_cast<long>(edits.size())) ? edits[edx] : 0;
471
472if (adx >= m)
473 break;
474else if (edit > 0)
475 { ++edx; }
476else if (edit == 0)
477 { lcs[sdx++] = a[adx++]; }
478else if (adx >= (-1 - edit))
479 { ++edx; ++adx; }
480else
481 { lcs[sdx++] = a[adx++]; }
482 }
483
484 std::copy(lcs.begin(), lcs.end(), output);
485 }
486};
487
488
489template <typename A,
490 typename B,
491 typename LCS>
492
493void edit_script(A begin_a, A end_a,
494 B begin_b, B end_b,
495 long p_lim,
496 std::vector<long> & edits_out,
497 LCS ignored_out)
498{
499 typedef jaffer_edit_calculator<A,B,LCS> calc_t;
500 long len_a = end_a - begin_a;
501 long len_b = end_b - begin_b;
502 typename calc_t::edit_vec edits, ordered;
503
504 typename calc_t::subarray<A> a(begin_a, 0, len_a);
505 typename calc_t::subarray<B> b(begin_b, 0, len_b);
506
507 if (len_b < len_a)
508 {
509 calc_t::diff_to_edits (b, len_b, a, len_a, edits, p_lim);
510 calc_t::order_edits (edits, -1, ordered);
511 for (size_t i = 0; i < ordered.size(); ++i)
512ordered[i] *= -1;
513 }
514 else
515 {
516 calc_t::diff_to_edits (a, len_a, b, len_b, edits, p_lim);
517 calc_t::order_edits (edits, 1, ordered);
518 }
519
520 edits_out.clear();
521 edits_out.reserve(ordered.size());
522 copy(ordered.begin(), ordered.end(), back_inserter(edits_out));
523}
524
525
526template <typename A,
527 typename B,
528 typename LCS>
529void longest_common_subsequence(A begin_a, A end_a,
530B begin_b, B end_b,
531long p_lim,
532LCS out)
533{
534 typedef jaffer_edit_calculator<A,B,LCS> calc_t;
535 long len_a = end_a - begin_a;
536 long len_b = end_b - begin_b;
537 typename calc_t::edit_vec edits, ordered;
538
539 typename calc_t::subarray<A> a(begin_a, 0, len_a);
540 typename calc_t::subarray<B> b(begin_b, 0, len_b);
541
542 if (len_b < len_a)
543 {
544 calc_t::diff_to_edits (b, len_b, a, len_a, edits, p_lim);
545 calc_t::order_edits (edits, -1, ordered);
546 calc_t::edits_to_lcs (ordered, b, len_b, len_a, out);
547 }
548 else
549 {
550 calc_t::diff_to_edits (a, len_a, b, len_b, edits, p_lim);
551 calc_t::order_edits (edits, 1, ordered);
552 calc_t::edits_to_lcs (ordered, a, len_a, len_b, out);
553 }
554}
555
556#endif // __LCS_HH__

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status