1 /** Longest Common Subsequence, typically used as a base for writing diff/compare algorithms.
2 
3 	Copyright: Per Nordlöw 2014-.
4 	License: $(WEB boost.org/LICENSE_1_0.txt, Boost License 1.0).
5 	Authors: $(WEB Per Nordlöw)
6 
7 	See_Also: https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
8 	See_Also: https://en.wikipedia.org/wiki/Diff
9 */
10 
11 module nxt.lcs;
12 
13 /** Longest Common Subsequence (LCS).
14 	See_Also: http://rosettacode.org/wiki/Longest_common_subsequence#Recursive_version
15 */
16 T[] lcsR(T)(in T[] a,
17 			in T[] b) pure nothrow
18 {
19 	import std.range.primitives : empty;
20 	if (a.empty || b.empty)
21 		return null;			// undefined
22 	if (a[0] == b[0])
23 		return a[0] ~ lcsR(a[1 .. $],
24 						   b[1 .. $]);
25 	const longest = (T[] x, T[] y) => x.length > y.length ? x : y;
26 	return longest(lcsR(a, b[1 .. $]),
27 				   lcsR(a[1 .. $], b));
28 }
29 
30 /** Longest Common Subsequence (LCS).
31 	Faster Dynamic Programming Version.
32 	See_Also: http://rosettacode.org/wiki/Longest_common_subsequence#Faster_dynamic_programming_version
33 */
34 T[] lcsDP(T)(in T[] a,
35 			 in T[] b) pure /* nothrow */
36 {
37 	import std.algorithm.comparison : max;
38 
39 	auto L = new uint[][](a.length + 1, b.length + 1);
40 
41 	foreach (immutable i; 0 .. a.length)
42 		foreach (immutable j; 0 .. b.length)
43 			L[i + 1][j + 1] = (a[i] == b[j]) ? (1 + L[i][j]) : max(L[i + 1][j], L[i][j + 1]);
44 
45 	import core.internal.traits : Unqual;
46 	Unqual!T[] result;
47 
48 	for (auto i = a.length, j = b.length; i > 0 && j > 0; )
49 	{
50 		if (a[i - 1] == b[j - 1])
51 		{
52 			result ~= a[i - 1];
53 			i--;
54 			j--;
55 		}
56 		else
57 		{
58 			if (L[i][j - 1] < L[i - 1][j])
59 				i--;
60 			else
61 				j--;
62 		}
63 	}
64 
65 	import std.algorithm : reverse;
66 	result.reverse();		   // not nothrow
67 	return result;
68 }
69 
70 /** Get LCS Lengths. */
71 uint[] lcsLengths(R)(R xs,
72 					 R ys) pure nothrow
73 {
74 	import std.algorithm.comparison : max;
75 
76 	auto prev = new typeof(return)(1 + ys.length);
77 	auto curr = new typeof(return)(1 + ys.length);
78 
79 	foreach (immutable x; xs)
80 	{
81 		import std.algorithm: swap;
82 		swap(curr, prev);
83 		size_t i = 0;
84 		foreach (immutable y; ys)
85 		{
86 			curr[i + 1] = (x == y) ? prev[i] + 1 : max(curr[i], prev[i + 1]);
87 			i++;
88 		}
89 	}
90 
91 	return curr;
92 }
93 
94 void lcsDo(T)(in T[] xs,
95 			  in T[] ys,
96 			  bool[] xsInLCS,
97 			  in size_t idx = 0) pure nothrow
98 {
99 	immutable nx = xs.length;
100 	immutable ny = ys.length;
101 
102 	if (nx == 0)
103 		return;
104 
105 	if (nx == 1)
106 	{
107 		import std.algorithm: canFind;
108 		if (ys.canFind(xs[0]))
109 			xsInLCS[idx] = true;
110 	}
111 	else
112 	{
113 		immutable mid = nx / 2;
114 		const xb = xs[0.. mid];
115 		const xe = xs[mid .. $];
116 		immutable ll_b = lcsLengths(xb, ys);
117 
118 		import std.range: retro;
119 		const ll_e = lcsLengths(xe.retro,
120 								ys.retro); // retro is slow with DMD
121 
122 		// import std.algorithm.comparison : max;
123 		//immutable k = iota(ny + 1)
124 		//			  .reduce!(max!(j => ll_b[j] + ll_e[ny - j]));
125 		import std.range: iota;
126 		import std.algorithm: minPos;
127 		import std.typecons: tuple;
128 		immutable k = iota(ny + 1).minPos!((i, j) => tuple(ll_b[i] + ll_e[ny - i]) >
129 										   tuple(ll_b[j] + ll_e[ny - j]))[0];
130 
131 		lcsDo(xb, ys[0 .. k], xsInLCS, idx);
132 		lcsDo(xe, ys[k .. $], xsInLCS, idx + mid);
133 	}
134 }
135 
136 /** Longest Common Subsequence (LCS) using Hirschberg.
137 	Linear-Space Faster Dynamic Programming Version.
138 
139 	Time Complexity: O(m*n)
140 	Space Complexity: O(min(m,n))
141 
142 	To speed up this code on DMD remove the memory allocations from $(D lcsLengths), and
143 	do not use the $(D retro) range (replace it with $(D foreach_reverse))
144 
145 	See_Also: https://en.wikipedia.org/wiki/Hirschberg%27s_algorithm
146 	See_Also: http://rosettacode.org/wiki/Longest_common_subsequence#Hirschberg_algorithm_version
147 */
148 const(T)[] lcs(T)(in T[] xs,
149 				  in T[] ys) pure /*nothrow*/
150 {
151 	auto xsInLCS = new bool[xs.length];
152 	lcsDo(xs, ys, xsInLCS);
153 	import std.range : zip;
154 	import std.algorithm.iteration : filter, map;
155 	import std.array : array;
156 	return zip(xs, xsInLCS).filter!q{ a[1] }.map!q{ a[0] }.array; // Not nothrow.
157 }
158 
159 import std.string : representation, assumeUTF;
160 
161 immutable(char)[] lcs(inout(char)[] a, inout(char)[] b) pure /*nothrow*/
162 	=> cast(typeof(return))(lcs(a.representation, b.representation).assumeUTF);
163 
164 unittest {
165 	immutable x = "thisisatest";
166 	immutable y = "testing123testing";
167 	immutable z = "tsitest";
168 	assert(z == lcsR(x, y));
169 	assert(z == lcsDP(x, y));
170 	assert(z == lcs(x, y));
171 	assert("" == lcs("", ""));
172 }
173 
174 unittest {
175 	immutable x = [1, 2, 3];
176 	immutable y = [4, 5, 6];
177 	immutable z = [];
178 	assert(z == lcsR(x, y));
179 	assert(z == lcsDP(x, y));
180 	assert(z == lcs(x, y));
181 }
182 
183 unittest {
184 	immutable x = [1, 2, 3];
185 	immutable y = [2, 3, 4];
186 	immutable z = [2, 3];
187 	assert(z == lcsR(x, y));
188 	assert(z == lcsDP(x, y));
189 	assert(z == lcs(x, y));
190 }
191 
192 unittest {
193 	alias T = int;
194 	size_t n = 300;
195 	auto x = new T[n];
196 	auto y = new T[n];
197 	assert(lcs(x, y).length == n);
198 }
199 
200 version (nxt_benchmark) unittest {
201 	import std.algorithm : levenshteinDistance, levenshteinDistanceAndPath;
202 	import nxt.random_ex: randInPlaceBlockwise;
203 
204 	alias T = int;
205 	size_t n = 300;
206 	auto x = new T[n];
207 	auto y = new T[n];
208 
209 	x.randInPlaceBlockwise;
210 	y.randInPlaceBlockwise;
211 
212 	void bLCS() { const d = lcs(x, y); }
213 	void bLD() { const d = levenshteinDistance(x, y); }
214 	void bLDAP() { const dp = levenshteinDistanceAndPath(x, y); }
215 
216 	import std.datetime: benchmark;
217 	import std.algorithm: map;
218 	import std.array: array;
219 
220 	auto r = benchmark!(bLCS, bLD, bLDAP)(1);
221 	auto q = r.array.map!(a => a.msecs);
222 
223 	import std.stdio: writeln;
224 	writeln(q, " milliseconds");
225 }