1 /** Boyer–Moore–Horspool Algorithm 2 See_Also: https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore%E2%80%93Horspool_algorithm 3 Copyright: Per Nordlöw 2014-. 4 License: $(WEB boost.org/LICENSE_1_0.txt, Boost License 1.0). 5 Authors: $(WEB Per Nordlöw) 6 */ 7 module nxt.horspool; 8 9 import std.range.primitives : isRandomAccessRange; 10 11 /** Returns a pointer to the first occurrence of "needle" 12 * within "haystack", or [] if not found. Works like 13 * memmem(). 14 * 15 * Note: In this example needle is a C string. The ending 16 * 0x00 will be cut off, so you could call this example with 17 * boyermoore_horspool_memmem(haystack, hlen, "abc", sizeof("abc")) 18 */ 19 Range boyerMooreHorspoolFind(Range)(Range haystack, 20 in Range needle) 21 if (isRandomAccessRange!Range) 22 { 23 import std.range.primitives : ElementType; 24 alias T = ElementType!Range; 25 size_t scan = 0; 26 size_t[T.max + 1] skips; // "bad" character index skips shift 27 28 /* Sanity checks on the parameters */ 29 /* if (needle.length <= 0 || !haystack || !needle) return []; */ 30 31 /* ---- Preprocess ---- */ 32 /* Initialize the table to default value */ 33 /* When a character is encountered that does not occur 34 * in the needle, we can safely skip ahead for the whole 35 * length of the needle. */ 36 for (scan = 0; scan <= T.max; ++scan) 37 skips[scan] = needle.length; 38 const size_t last = needle.length - 1; // last index of C-style array 39 /* populate with needle analysis */ 40 for (scan = 0; scan < last; ++scan) 41 skips[needle[scan]] = last - scan; 42 43 /* ---- Do the matching ---- */ 44 while (haystack.length >= needle.length) { // while the needle can still be within it 45 /* scan from the end of the needle */ 46 for (scan = last; haystack[scan] == needle[scan]; --scan) 47 if (scan == 0) /* If the first byte matches, we've found it. */ 48 return haystack[0..needle.length]; 49 /* otherwise, we need to skip some bytes and start again. 50 Note that here we are getting the skip value based on the last byte 51 of needle, no matter where we didn't match. So if needle is: "abcd" 52 then we are skipping based on 'd' and that value will be 4, and 53 for "abcdd" we again skip on 'd' but the value will be only 1. 54 The alternative of pretending that the mismatched character was 55 the last character is slower in the normal case (E.g. finding 56 "abcd" in "...azcd..." gives 4 by using 'd' but only 57 4-2==2 using 'z'. */ 58 /* hlen -= skips[haystack[last]]; */ 59 /* haystack += skips[haystack[last]]; */ 60 haystack = haystack[skips[haystack[last]]..$]; // NOTE: Is this to slow for now? 61 } 62 return []; 63 } 64 65 @safe pure nothrow unittest 66 { 67 alias boyerMooreHorspoolFind find; 68 ubyte[] haystack = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; 69 assert(find(haystack, cast(ubyte[])[10]) == []); 70 assert(find(haystack, cast(ubyte[])[2, 3]) == [2, 3]); 71 assert(find(haystack, cast(ubyte[])[0]) == [0]); 72 assert(find(haystack, cast(ubyte[])[9]) == [9]); 73 assert(find(cast(ubyte[])[], cast(ubyte[])[9]) == []); 74 assert(haystack.boyerMooreHorspoolFind(cast(ubyte[])[1, 0]) == []); 75 }