nxt.horspool source code

1 /** Boyer–Moore–Horspool Algorithm
2 	See_Also: https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore%E2%80%93Horspool_algorithm
3 	Copyright: Per Nordlöw 2014-.
4 	License: $(WEB boost.org/LICENSE_1_0.txt, Boost License 1.0).
5 	Authors: $(WEB Per Nordlöw)
6  */
7 module nxt.horspool;
8 
9 import std.range.primitives : isRandomAccessRange;
10 
11 /** Returns a pointer to the first occurrence of "needle"
12  * within "haystack", or [] if not found. Works like
13  * memmem().
14  *
15  * Note: In this example needle is a C string. The ending
16  * 0x00 will be cut off, so you could call this example with
17  * boyermoore_horspool_memmem(haystack, hlen, "abc", sizeof("abc"))
18  */
19 Range boyerMooreHorspoolFind(Range)(Range haystack,
20 									in Range needle)
21 	if (isRandomAccessRange!Range)
22 {
23 	import std.range.primitives : ElementType;
24 	alias T = ElementType!Range;
25 	size_t scan = 0;
26 	size_t[T.max + 1] skips; // "bad" character index skips shift
27 
28 	/* Sanity checks on the parameters */
29 	/* if (needle.length <= 0 || !haystack || !needle) return []; */
30 
31 	/* ---- Preprocess ---- */
32 	/* Initialize the table to default value */
33 	/* When a character is encountered that does not occur
34 	 * in the needle, we can safely skip ahead for the whole
35 	 * length of the needle. */
36 	for (scan = 0; scan <= T.max; ++scan)
37 		skips[scan] = needle.length;
38 	const size_t last = needle.length - 1; // last index of C-style array
39 	/* populate with needle analysis */
40 	for (scan = 0; scan < last; ++scan)
41 		skips[needle[scan]] = last - scan;
42 
43 	/* ---- Do the matching ---- */
44 	while (haystack.length >= needle.length) { // while the needle can still be within it
45 		/* scan from the end of the needle */
46 		for (scan = last; haystack[scan] == needle[scan]; --scan)
47 			if (scan == 0) /* If the first byte matches, we've found it. */
48 				return haystack[0..needle.length];
49 		/* otherwise, we need to skip some bytes and start again.
50 		   Note that here we are getting the skip value based on the last byte
51 		   of needle, no matter where we didn't match. So if needle is: "abcd"
52 		   then we are skipping based on 'd' and that value will be 4, and
53 		   for "abcdd" we again skip on 'd' but the value will be only 1.
54 		   The alternative of pretending that the mismatched character was
55 		   the last character is slower in the normal case (E.g. finding
56 		   "abcd" in "...azcd..." gives 4 by using 'd' but only
57 		   4-2==2 using 'z'. */
58 		/* hlen	 -= skips[haystack[last]]; */
59 		/* haystack += skips[haystack[last]]; */
60 		haystack = haystack[skips[haystack[last]]..$]; // NOTE: Is this to slow for now?
61 	}
62 	return [];
63 }
64 
65 pure nothrow @safe unittest {
66 	alias boyerMooreHorspoolFind find;
67 	ubyte[] haystack = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9];
68 	assert(find(haystack, cast(ubyte[])[10]) == []);
69 	assert(find(haystack, cast(ubyte[])[2, 3]) == [2, 3]);
70 	assert(find(haystack, cast(ubyte[])[0]) == [0]);
71 	assert(find(haystack, cast(ubyte[])[9]) == [9]);
72 	assert(find(cast(ubyte[])[], cast(ubyte[])[9]) == []);
73 	assert(haystack.boyerMooreHorspoolFind(cast(ubyte[])[1, 0]) == []);
74 }