1 /** First and Higher Order Statistics: $(LUCKY Histograms) and $(LUCKY N-grams).
2
3 Copyright: Per Nordlöw 2018-.
4 License: $(WEB boost.org/LICENSE_1_0.txt, Boost License 1.0).
5 Authors: $(WEB Per Nordlöw)
6
7 TODO Replace static ifs with static final switch when Issue 6921 is fixed
8
9 TODO Remove overloads using std.string:representation and use sparse maps
10 over dchar for strings.
11 */
12 module nxt.ngram;
13
14 import nxt.static_bitarray;
15 import std.range: InputRange, ElementType, isInputRange;
16 import std.traits: isSomeChar, isUnsigned, isIntegral, isFloatingPoint, Unqual, isStaticArray, isIterable, isAssociativeArray, CommonType;
17 import std.stdint: uint64_t;
18 import std.typecons: Tuple, tuple;
19 import nxt.predicates: allZero, allEqualTo;
20 import nxt.nesses: denseness;
21 import nxt.rational: Rational;
22 // import msgpack;
23 import std.numeric: dotProduct;
24 import std..string: representation;
25 import std.conv: to;
26
27 // version = print;
28
29 /** N-Gram Model Kind. */
30 enum Kind
31 {
32 saturated, // Standard Saturated Integers. Saturation is more robust than default integer wrap-around.
33 binary // Binary Gram.
34 }
35
36 /** N-Gram Model Storage. */
37 enum Storage
38 {
39 denseDynamic,// Dynamically allocated dense storage.
40 denseStatic, // Statically allocated dense storage.
41 sparse, // N-Dimensional/Level D Maps/Dictionaries.
42 }
43
44 /** N-Gram Model Symmetry. */
45 enum Symmetry
46 {
47 ordered, // Order between elements matters.
48 unordered, // Order of elements is ignored (relaxed). Tuples are sorted before inserted.
49 }
50
51 /**
52 Computes $(LUCKY N-Gram) Model of Input Range $(D range).
53
54 If N == 1 this N-gram becomes a standard $(LUCKY Histogram), also called a
55 $(LUCKY Unigram).
56
57 If Bin/Bucket Type $(D RequestedBinType), requested by user, is $(D void) it
58 will be chosen automatically based on element type of $(D Range).
59
60 TODO Add optimized put() be precalculating maximum number of elements that
61 can be inserted without any risk overflow.
62 */
63 struct NGram(ValueType,
64 int N = 1,
65 Kind kind = Kind.saturated,
66 Storage storage = Storage.denseDynamic,
67 Symmetry symmetry = Symmetry.ordered,
68 RequestedBinType = void,
69 Range) if (is(RequestedBinType == void) ||
70 isUnsigned!RequestedBinType ||
71 isFloatingPoint!RequestedBinType)
72 {
73 this(in Range range) @safe pure nothrow { put(range); }
74
75 auto ref value() @property @safe pure inout nothrow { return _bins; }
76
77 alias Q = Rational!ulong;
78
79 @property static int order() @safe pure nothrow { return N; }
80
81 enum isBinary = (kind == Kind.binary);
82 enum isDense = (storage == Storage.denseStatic ||
83 storage == Storage.denseDynamic);
84 enum isSparse = !isDense;
85 enum cacheDeepDenseness = false;
86
87 enum getStorage = storage;
88
89 /** Returns: Documentation of the type of $(D this). */
90 string typeName() @property const
91 {
92 string prefix;
93 static if (N == 1) { prefix = "Uni"; }
94 else static if (N == 2) { prefix = "Bi"; }
95 else static if (N == 3) { prefix = "Tri"; }
96 else static if (N == 4) { prefix = "Qua"; }
97 else static if (N == 5) { prefix = "Pen"; }
98 else { prefix = to!string(N) ~ "-"; }
99 string rval;
100 static if (isBinary)
101 {
102 rval ~= "Bit";
103 }
104 else
105 {
106 rval ~= BinType.stringof;
107 }
108 rval ~= "-" ~ prefix ~ "Gram over " ~ ValueType.stringof ~ "s";
109 return rval;
110 }
111
112 string toString() @property const
113 {
114 string rval = typeName ~ ": ";
115
116 // print contents
117 static if (isAA)
118 {
119 rval ~= to!string(_bins);
120 }
121 else if (N >= 2 ||
122 this.denseness < Q(1, 2)) // if less than half of the bins are occupied
123 {
124 // show as sparse
125 rval ~= "[";
126
127 // TODO Replace these with recursion?
128 static if (N == 1)
129 {
130 bool begun = false;
131 foreach (ix, elt; _bins) // TODO Make static_bitarray support ref foreach and make elt const ref
132 {
133 if (elt)
134 {
135 if (begun) { rval ~= ", "; }
136 rval ~= "{" ~ to!string(ix) ~ "}#" ~ to!string(elt);
137 begun = true;
138 }
139 }
140 }
141 else static if (N == 2)
142 {
143 bool begun = false;
144 foreach (ix0, const ref elt0; _bins)
145 {
146 foreach (ix1, elt1; elt0) // TODO Make static_bitarray support ref foreach and make elt1 const ref
147 {
148 if (elt1)
149 {
150 if (begun) { rval ~= ", "; }
151 rval ~= ("{" ~
152 to!string(ix0) ~ "," ~
153 to!string(ix1) ~ "}#" ~
154 to!string(elt1));
155 begun = true;
156 }
157 }
158 }
159 }
160 else static if (N == 3)
161 {
162 bool begun = false;
163 foreach (ix0, const ref elt0; _bins)
164 {
165 foreach (ix1, const ref elt1; elt0)
166 {
167 foreach (ix2, elt2; elt1) { // TODO Make static_bitarray support ref foreach and make elt2 const ref
168 if (elt2)
169 {
170 if (begun) { rval ~= ", "; }
171 rval ~= ("{" ~
172 to!string(ix0) ~ "," ~
173 to!string(ix1) ~ "," ~
174 to!string(ix2) ~ "}#" ~
175 to!string(elt2));
176 begun = true;
177 }
178 }
179 }
180 }
181 }
182 else static if (N == 4)
183 {
184 bool begun = false;
185 foreach (ix0, const ref elt0; _bins)
186 {
187 foreach (ix1, const ref elt1; elt0)
188 {
189 foreach (ix2, elt2; elt1)
190 {
191 foreach (ix3, elt3; elt2) // TODO Make static_bitarray support ref foreach and make elt2 const ref
192 {
193 if (elt3)
194 {
195 if (begun) { rval ~= ", "; }
196 rval ~= ("{" ~
197 to!string(ix0) ~ "," ~
198 to!string(ix1) ~ "," ~
199 to!string(ix2) ~ "," ~
200 to!string(ix3) ~ "}#" ~
201 to!string(elt3));
202 begun = true;
203 }
204 }
205 }
206 }
207 }
208 }
209 else static if (N == 5)
210 {
211 bool begun = false;
212 foreach (ix0, const ref elt0; _bins)
213 {
214 foreach (ix1, const ref elt1; elt0)
215 {
216 foreach (ix2, const ref elt2; elt1)
217 {
218 foreach (ix3, const ref elt3; elt2) // TODO Make static_bitarray support ref foreach and make elt2 const ref
219 {
220 foreach (ix4, elt4; elt3) // TODO Make static_bitarray support ref foreach and make elt2 const ref
221 {
222 if (elt4)
223 {
224 if (begun) { rval ~= ", "; }
225 rval ~= ("{" ~
226 to!string(ix0) ~ "," ~
227 to!string(ix1) ~ "," ~
228 to!string(ix2) ~ "," ~
229 to!string(ix3) ~ "," ~
230 to!string(ix4) ~ "}#" ~
231 to!string(elt4));
232 begun = true;
233 }
234 }
235 }
236 }
237 }
238 }
239 }
240 else
241 {
242 static assert(0, "N >= 5 not supported");
243 }
244 rval ~= "]";
245 }
246 else
247 {
248 rval ~= to!string(_bins); // default
249 }
250
251 static if (N == 1)
252 {
253 rval ~= " denseness:" ~ to!string(denseness);
254 }
255 else
256 {
257 rval ~= (" shallowDenseness:" ~ to!string(denseness(0)) ~
258 " deepDenseness:" ~ to!string(denseness(-1)));
259 }
260
261 return rval;
262 }
263
264 void reset() @safe nothrow
265 {
266 import nxt.algorithm_ex: reset;
267 _bins.reset();
268 }
269 // alias reset = clear;
270
271 bool empty() const pure @property @trusted nothrow
272 {
273 static if (isAA)
274 {
275 return _bins.length == 0;
276 }
277 else
278 {
279 return _bins.allZero;
280 }
281 }
282
283 static if (N >= 2 &&
284 storage != Storage.sparse)
285 {
286 Q shallowDenseness() const pure @property @trusted nothrow
287 {
288 static if (N == 2)
289 {
290 ulong x = 0;
291 foreach (const ref elt0; 0..combE) // every possible first ngram element
292 {
293 bool used = false;
294 foreach (elt1; 0..combE) // every possible second ngram element
295 {
296 if (_bins[elt0][elt1]) // if any bit is used
297 {
298 used = true; // we flag it and exit
299 break;
300 }
301 }
302 if (used) ++x;
303 }
304 return Q(x, combE);
305 }
306 else
307 {
308 return Q(1, 1);
309 }
310 }
311 }
312
313 typeof(this) opAssign(RhsRange)(in NGram!(ValueType, N, kind, storage, symmetry, RequestedBinType, RhsRange) rhs)
314 if (isInputRange!RhsRange) // if (is(Unqual!Range == Unqual!RhsRange))
315 {
316 static if (storage == Storage.denseDynamic)
317 _bins = rhs._bins.dup;
318 else static if (storage == Storage.sparse)
319 _bins = cast(BinType[ValueType[N]])(rhs._bins.dup); // TODO How can we prevent this cast?
320 else static if (storage == Storage.denseStatic)
321 _bins = rhs._bins;
322 else
323 static assert(0, "Cannot handle storage of type " ~ to!string(storage));
324 return this;
325 }
326
327 static if (N == 1)
328 {
329 void normalize() @safe pure /* nothrow */
330 {
331 static if (kind != Kind.binary)
332 {
333 static if (isFloatingPoint!ValueType)
334 {
335 immutable scaleFactor = (cast(real)1) / _bins.length; // precalc scaling
336 }
337 foreach (ref elt; _bins) // TODO Why can this throw for associative arrays?
338 {
339 static if (isIntegral!ValueType ||
340 isSomeChar!ValueType)
341 {
342 elt /= 2; // drop least significant bit
343 }
344 else static if (isFloatingPoint!ValueType)
345 {
346 elt =* scaleFactor;
347 }
348 }
349 }
350 }
351 }
352
353 /** Invalidate Local Statistics. */
354 void invalidateStats() @safe nothrow
355 {
356 static if (isAA && cacheDeepDenseness) { _deepDenseness = 0; }
357 }
358
359 /** Saturated Increment $(D x) by one. */
360 ref T incs(T)(ref T x) @safe nothrow if (isIntegral!T) { if (x != T.max) { ++x; invalidateStats(); } return x; }
361 /** Saturated Increment $(D x) by one. */
362 ref T incs(T)(ref T x) @safe nothrow if (isFloatingPoint!T) { invalidateStats(); return ++x; }
363
364 static if (kind == Kind.saturated)
365 {
366 /** Increase bin of $(D ng).
367 Bin of ng must already be allocated if its stored in a map.
368 */
369 ref NGram inc(ValueType[N] ng) @safe pure nothrow
370 {
371 static if (isAA)
372 {
373 incs(_bins[ng]); // just one case!
374 }
375 else
376 {
377 static if (N == 1)
378 {
379 static if (storage == Storage.denseDynamic)
380 {
381 if (!_bins) { _bins = new BinType[noABins]; }
382 }
383 incs(_bins[ng[0]]);
384 }
385 else static if (N == 2)
386 {
387 static if (storage == Storage.denseDynamic)
388 {
389 if (!_bins) { _bins = new BinType[][noABins]; }
390 if (!_bins[ng[0]]) { _bins[ng[0]] = new BinType[noABins]; }
391 }
392 incs(_bins[ng[0]][ng[1]]);
393 }
394 else static if (N == 3)
395 {
396 static if (storage == Storage.denseDynamic)
397 {
398 if (!_bins) { _bins = new BinType[][][noABins]; }
399 if (!_bins[ng[0]]) { _bins[ng[0]] = new BinType[][noABins]; }
400 if (!_bins[ng[0]][ng[1]]) { _bins[ng[0]][ng[1]] = new BinType[noABins]; }
401 }
402 incs(_bins[ng[0]][ng[1]][ng[2]]);
403 }
404 else static if (N == 4)
405 {
406 static if (storage == Storage.denseDynamic)
407 {
408 if (!_bins) { _bins = new BinType[][][][noABins]; }
409 if (!_bins[ng[0]]) { _bins[ng[0]] = new BinType[][][noABins]; }
410 if (!_bins[ng[0]][ng[1]]) { _bins[ng[0]][ng[1]] = new BinType[][noABins]; }
411 if (!_bins[ng[0]][ng[1]][ng[2]]) { _bins[ng[0]][ng[1]][ng[2]] = new BinType[noABins]; }
412 }
413 incs(_bins[ng[0]][ng[1]][ng[2]][ng[3]]);
414 }
415 else static if (N == 5)
416 {
417 static if (storage == Storage.denseDynamic)
418 {
419 if (!_bins) { _bins = new BinType[][][][][noABins]; }
420 if (!_bins[ng[0]]) { _bins[ng[0]] = new BinType[][][][noABins]; }
421 if (!_bins[ng[0]][ng[1]]) { _bins[ng[0]][ng[1]] = new BinType[][][noABins]; }
422 if (!_bins[ng[0]][ng[1]][ng[2]]) { _bins[ng[0]][ng[1]][ng[2]] = new BinType[][noABins]; }
423 if (!_bins[ng[0]][ng[1]][ng[2]][ng[3]]) { _bins[ng[0]][ng[1]][ng[2]][ng[3]] = new BinType[noABins]; }
424 }
425 incs(_bins[ng[0]][ng[1]][ng[2]][ng[3]][ng[4]]);
426 }
427 else
428 {
429 static assert(0, "N >= 6 not supported");
430 }
431 }
432 return this;
433 }
434 }
435
436 /** Returns: Bin Count of NGram $(D ng). */
437 BinType opIndex(T)(in T ng) const @safe pure nothrow if (isIntegral!T ||
438 isStaticArray!T)
439 {
440 static if (isAA)
441 {
442 if (ng in _bins)
443 {
444 return _bins[ng];
445 }
446 else
447 {
448 return 0; // empty means zero bin
449 }
450 }
451 else
452 {
453 static if (N == 1) { return _bins[ng]; }
454 else static if (N == 2) { return _bins[ng[0]][ng[1]]; }
455 else static if (N == 3) { return _bins[ng[0]][ng[1]][ng[2]]; }
456 else static if (N == 4) { return _bins[ng[0]][ng[1]][ng[2]][ng[3]]; }
457 else static if (N == 5) { return _bins[ng[0]][ng[1]][ng[2]][ng[3]][ng[4]]; }
458 else { static assert(0, "N >= 6 not supported"); }
459 }
460 }
461
462 /* ref BinType opIndexAssign(Index)(BinType b, Index i) @trusted pure nothrow if (isIntegral!Index) in { */
463
464 /* } */
465
466 /** Returns: Bin Count of NGram $(D ng). */
467 auto opIndexAssign(T)(in T ng) const @safe pure nothrow if (isIntegral!T ||
468 isStaticArray!T)
469 {
470 static if (isAA)
471 {
472 if (ng in _bins)
473 {
474 return _bins[ng];
475 }
476 else
477 {
478 return 0; // empty means zero bin
479 }
480 }
481 else
482 {
483 static if (N == 1) { return _bins[ng]; }
484 else static if (N == 2) { return _bins[ng[0]][ng[1]]; }
485 else static if (N == 3) { return _bins[ng[0]][ng[1]][ng[2]]; }
486 else static if (N == 4) { return _bins[ng[0]][ng[1]][ng[2]][ng[3]]; }
487 else static if (N == 5) { return _bins[ng[0]][ng[1]][ng[2]][ng[3]][ng[4]]; }
488 else { static assert(0, "N >= 6 not supported"); }
489 }
490 }
491
492 /** Put NGram Element $(D ng) in $(D this).
493 */
494 ref NGram put(ValueType[N] ng) @safe pure nothrow
495 {
496 static if (isBinary)
497 {
498 static if (N == 1) { _bins[ng[0]] = true; }
499 else static if (N == 2) { _bins[ng[0]][ng[1]] = true; }
500 else static if (N == 3) { _bins[ng[0]][ng[1]][ng[2]] = true; }
501 else static if (N == 4) { _bins[ng[0]][ng[1]][ng[2]][ng[3]] = true; }
502 else static if (N == 5) { _bins[ng[0]][ng[1]][ng[2]][ng[3]][ng[4]] = true; }
503 else { static assert(0, "N >= 6 not supported"); }
504 }
505 else
506 {
507 static if (isAA)
508 {
509 if (ng in _bins)
510 {
511 inc(ng); // increase bin
512 }
513 else
514 {
515 _bins[ng] = 1; // initial bin count
516 }
517 }
518 else
519 {
520 inc(ng);
521 }
522 }
523 invalidateStats();
524 return this;
525 }
526
527 /** Put NGram Elements $(D range) in $(D this).
528 */
529 ref NGram put(T)(in T range) @safe pure nothrow if (isIterable!T)
530 {
531 static if (N == 1)
532 {
533 foreach (ix, const ref elt; range)
534 {
535 put([elt]);
536 }
537 }
538 else static if (N == 2)
539 {
540 ValueType prev; // previous element
541 foreach (ix, const ref elt; range)
542 {
543 if (ix >= N-1) { put([prev, elt]); }
544 prev = elt;
545 }
546 }
547 else static if (N == 3)
548 {
549 ValueType pprev, prev; // previous elements
550 foreach (ix, const ref elt; range)
551 {
552 if (ix >= N-1) { put([pprev, prev, elt]); }
553 pprev = prev;
554 prev = elt;
555 }
556 }
557 else static if (N == 4)
558 {
559 ValueType ppprev, pprev, prev; // previous elements
560 foreach (ix, const ref elt; range)
561 {
562 if (ix >= N-1) { put([ppprev, pprev, prev, elt]); }
563 ppprev = pprev;
564 pprev = prev;
565 prev = elt;
566 }
567 }
568 else static if (N == 5)
569 {
570 ValueType pppprev, ppprev, pprev, prev; // previous elements
571 foreach (ix, const ref elt; range)
572 {
573 if (ix >= N-1) { put([pppprev, ppprev, pprev, prev, elt]); }
574 pppprev = ppprev;
575 ppprev = pprev;
576 pprev = prev;
577 prev = elt;
578 }
579 }
580 return this;
581 }
582
583 /** Scalar (Dot) Product Match $(D this) with $(D rhs). */
584 CommonType!(ulong, BinType) matchDenser(rhsValueType,
585 Kind rhsKind = Kind.saturated,
586 Storage rhsStorage = Storage.denseDynamic,
587 RhsRange)(in NGram!(rhsValueType,
588 N,
589 rhsKind,
590 rhsStorage,
591 symmetry,
592 RequestedBinType,
593 RhsRange) rhs) const @trusted pure /* nothrow */
594 {
595 static if (this.isDense && rhs.isDense)
596 {
597 static if (N == 1) return dotProduct(this._bins, rhs._bins);
598 else static assert(0, "N >= " ~ to!string(N) ~ " not supported");
599 }
600 else static if (this.isSparse)
601 {
602 typeof(return) sum = 0;
603 foreach (ix, const ref bin; _bins) // assume lhs is sparsest
604 {
605 sum += bin*rhs[ix];
606 }
607 return sum;
608 }
609 else static if (rhs.isSparse)
610 {
611 typeof(return) sum = 0;
612 foreach (ix, const ref bin; rhs._bins) // assume lhs is sparsest
613 {
614 sum += this[ix]*bin;
615 }
616 return sum;
617 }
618 else
619 {
620 static assert(0, "Combination of " ~ typeof(storage).stringof ~ " and " ~
621 typeof(rhsStorage).stringof ~ " not supported");
622 return 0;
623 }
624 }
625
626 auto opBinary(string op,
627 string file = __FILE__, int line = __LINE__)(in NGram rhs) const @trusted pure /* nothrow */
628 {
629 NGram tmp;
630 static if (this.isSparse ||
631 rhs.isSparse)
632 {
633 void doIt(in NGram a, in NGram b) // more functional than orderInPlace
634 {
635 foreach (ix, const ref bin; a._bins)
636 {
637 if (ix in b._bins)
638 {
639 mixin("tmp._bins[ix] = bin " ~ op ~ " b._bins[ix];"); // propagate pointwise operation
640 }
641 }
642 }
643 if (this.length < rhs.length) { doIt(this, rhs); }
644 else { doIt(rhs, this); }
645 }
646 else static if (this.isBinary &&
647 rhs.isBinary)
648 {
649 mixin("tmp._bins = _bins " ~ op ~ "rhs._bins;"); // bitsets cannot be sliced (yet)
650 }
651 else static if (this.isDense &&
652 rhs.isDense &&
653 N == 1)
654 {
655 assert(this.length == rhs.length);
656 import std.range: zip;
657 import std.algorithm: map;
658 import std.array: array;
659 tmp._bins = zip(this._bins[], rhs._bins[]).map!(a => mixin("a[0] " ~ op ~ "a[1]"))().array; // TODO Use zipWith when Issue 8715 is fixed
660 /* foreach (ix, let; this._bins) { // TODO Reuse Phobos algorithm or range */
661 /* mixin("tmp[ix] = _bins[ix] " ~ op ~ "rhs._bins[ix];"); */
662 /* } */
663 }
664 else
665 {
666 static assert(0, "Combination of " ~ to!string(this.getStorage) ~ " and " ~
667 to!string(rhs.getStorage) ~ " and N == " ~ to!string(N) ~ " not supported");
668 }
669 return tmp;
670 }
671
672 /** Determine Bin (Counter) Type. */
673 static if (isBinary)
674 {
675 alias BinType = bool;
676 }
677 else static if (is(RequestedBinType == void))
678 {
679 alias BinType = uint64_t; // Bin (counter) type. Count long by default.
680 }
681 else
682 {
683 alias BinType = RequestedBinType;
684 }
685
686 enum bitsE = 8 * ValueType.sizeof; /** Number of Bits in ValueType (element). TODO This may have to be fixed for ref types. */
687 enum combE = 2^^bitsE; /** Number of combinations in element. */
688 enum noABins = combE; // Number of bins per array dimension
689 enum noBins = combE^^N; /** Maximum number of bins (possible). */
690
691 // Determine storage structure base on element type.
692 static if (storage == Storage.sparse)
693 {
694 BinType[ValueType[N]] _bins;
695 }
696 else static if (isBinary &&
697 N*bitsE <= 32) // safely fits on stack
698 {
699 static if (storage == Storage.denseStatic)
700 {
701 static if (N == 1) StaticBitArray!noABins _bins;
702 else static if (N == 2) StaticBitArray!noABins[noABins] _bins;
703 else static if (N == 3) StaticBitArray!noABins[noABins][noABins] _bins;
704 else static assert(0, "Dense static N >= 4 does no safely fit on stack");
705 }
706 else static if (storage == Storage.denseDynamic)
707 {
708 static if (N == 1) StaticBitArray!noABins _bins;
709 else static if (N == 2) StaticBitArray!noABins[] _bins;
710 else static if (N == 3) StaticBitArray!noABins[][] _bins;
711 else static if (N == 4) StaticBitArray!noABins[][][] _bins;
712 else static if (N == 5) StaticBitArray!noABins[][][][] _bins;
713 else static assert(0, "N >= 6 not supported");
714 }
715 else static if (storage == Storage.sparse) // shouldn't happen
716 {
717 }
718 }
719 else static if (storage == Storage.denseStatic &&
720 (isUnsigned!ValueType ||
721 isSomeChar!ValueType) &&
722 N*bitsE <= 16) // safely fits on stack
723 {
724 static if (N == 1) BinType[noABins] _bins;
725 else static if (N == 2) BinType[noABins][noABins] _bins;
726 else static assert(0, "Dense static N >= 3 does not safely fit on stack");
727 }
728 else static if (storage == Storage.denseDynamic &&
729 (isUnsigned!ValueType ||
730 isSomeChar!ValueType)) // safely fits on stack
731 {
732 static if (N == 1) BinType[] _bins;
733 else static if (N == 2) BinType[][] _bins;
734 else static if (N == 3) BinType[][][] _bins;
735 else static if (N == 4) BinType[][][][] _bins;
736 else static if (N == 5) BinType[][][][][] _bins;
737 else static assert(0, "N >= 6 not supported");
738 }
739 else
740 {
741 BinType[ValueType[N]] _bins;
742 }
743
744 private alias _bins this;
745
746 enum isAA = isAssociativeArray!(typeof(_bins));
747
748 static if (isAA && cacheDeepDenseness)
749 {
750 ulong _deepDenseness; // Cache deep denseness in associative arrays
751 }
752
753 Q densenessUncached(int depth = -1) const pure @property @trusted nothrow
754 {
755 static if (isBinary)
756 {
757 static if (N >= 2)
758 {
759 if (N == 2 && depth == 0) // if shallow wanted
760 {
761 return shallowDenseness();
762 }
763 }
764 return _bins.denseness;
765 }
766 else
767 {
768 static if (isAA)
769 {
770 return Q(_bins.length, noBins);
771 }
772 else
773 {
774 return _bins.denseness(depth);
775 }
776 }
777 }
778
779 /** Returns: Number of Non-Zero Elements in $(D range). */
780 Q denseness(int depth = -1) const pure @property @safe nothrow
781 {
782 static if (isAA && cacheDeepDenseness)
783 {
784 if (depth == -1) // -1 means deepDenseness
785 {
786 if (!_deepDenseness) // if not yet defined.
787 {
788 unqual(this).densenessUncached(depth).numerator; // calculate it
789 }
790 return Q(_deepDenseness, noBins);
791 }
792 }
793 return densenessUncached(depth);
794 }
795 }
796
797 /**
798 Computes $(LUCKY Bi-Gram) of $(D range).
799 */
800 auto ngram(int N = 2,
801 Kind kind = Kind.saturated,
802 Storage storage = Storage.denseDynamic,
803 Symmetry symmetry = Symmetry.ordered,
804 RequestedBinType = void,
805 Range)(in Range range) @safe pure nothrow if (isIterable!Range)
806 {
807 return NGram!(Unqual!(ElementType!Range), N,
808 kind, storage, symmetry, RequestedBinType, Unqual!(Range))(range);
809 }
810 auto sparseUIntNGramOverRepresentation(int N = 2,
811 Kind kind = Kind.saturated,
812 Symmetry symmetry = Symmetry.ordered)(in string range) @safe pure nothrow
813 {
814 auto y = range.representation;
815 return ngram!(N, kind, Storage.sparse, symmetry, uint)(y);
816 }
817
818 auto histogram(Kind kind = Kind.saturated,
819 Storage storage = Storage.denseDynamic,
820 Symmetry symmetry = Symmetry.ordered,
821 RequestedBinType = void,
822 Range)(in Range range) @safe pure nothrow if (isIterable!Range)
823 {
824 return NGram!(Unqual!(ElementType!Range), 1, kind, storage, symmetry, RequestedBinType, Unqual!(Range))(range);
825 }
826 alias Hist = NGram;
827 alias hist = histogram;
828 alias unigram = histogram;
829
830 unittest
831 {
832 const ubyte[] x = [1, 2, 3, 4, 5];
833 const h = x.histogram!(Kind.saturated, Storage.denseStatic);
834 const hh = h + h;
835
836 auto h2 = x.histogram!(Kind.saturated, Storage.denseStatic);
837 h2.put(x);
838 assert(hh == h2);
839 }
840
841 unittest
842 {
843 const ubyte[] x;
844 const hDS = x.histogram!(Kind.saturated, Storage.denseStatic); assert(hDS.empty);
845
846 const hDD = x.histogram!(Kind.saturated, Storage.denseDynamic); assert(hDD.empty);
847
848 auto h = x.histogram!(Kind.saturated, Storage.sparse); assert(h.empty);
849 const ubyte[5] x1 = [1, 2, 3, 4, 5];
850 h.put(x1); assert(!h.empty);
851 assert(h.matchDenser(h) == x1.length);
852 auto hh = h + h;
853 assert(hh.matchDenser(hh) == 4*x1.length);
854
855 h.destroy(); assert(h.empty);
856
857 auto hU32 = x.histogram!(Kind.saturated, Storage.sparse, Symmetry.ordered, uint);
858 }
859
860 unittest
861 {
862 alias ValueType = ubyte;
863 const ValueType[3] x = [11, 12, 13];
864
865 auto h = x.histogram!(Kind.saturated, Storage.denseStatic);
866 assert(h[0] == 0);
867 assert(h[11] == 1 && h[12] == 1 && h[13] == 1);
868 assert(3 == h.matchDenser(h));
869
870 h.put(x);
871 assert(h[11] == 2 && h[12] == 2 && h[13] == 2);
872 assert(12 == h.matchDenser(h));
873
874 version(print) dbg(h);
875
876 auto hD = x.histogram!(Kind.saturated, Storage.denseDynamic);
877 version(print) dbg(hD);
878 }
879
880 unittest
881 {
882 alias ValueType = ulong;
883 const ValueType[3] x = [1, 2, 3];
884 auto h = x.histogram!(Kind.saturated, Storage.denseStatic);
885 h.put([4, 5, 6]);
886 assert(h.length == 6);
887 }
888
889 /**
890 Computes $(LUCKY Binary Histogram) of $(D range).
891 */
892 auto bistogram(Range)(in Range range) @safe pure nothrow if (isIterable!Range)
893
894 {
895 return NGram!(Unqual!(ElementType!Range), 1,
896 Kind.binary,
897 Storage.denseStatic,
898 Symmetry.ordered,
899 void,
900 Unqual!Range)(range);
901 }
902 alias bunigram = bistogram;
903
904 unittest
905 {
906 const ubyte[] x;
907 const b = x.bistogram;
908 assert(b.empty);
909
910 // test const unqualification
911 NGram!(ubyte, 1, Kind.binary, Storage.denseStatic, Symmetry.ordered, void, const(ubyte)[]) cb;
912 cb = b;
913 assert(cb == b);
914
915 // test immutable unqualification
916 NGram!(ubyte, 1, Kind.binary, Storage.denseStatic, Symmetry.ordered, void, immutable(ubyte)[]) ib;
917 ib = b;
918 assert(ib == b);
919 }
920
921 unittest
922 {
923 const ubyte[3] x = [1, 2, 3];
924 const h_ = x.bistogram;
925 assert(8*h_.sizeof == 2^^8);
926 const h = h_;
927 assert(h[1] && h[2] && h[3]);
928 assert(!h[0]);
929 assert(!h[4]);
930
931 const h2_ = h_;
932 assert((h2_ & h_) == h2_);
933 assert((h2_ | h_) == h2_);
934 assert((h2_.value & h_.value) == h2_.value);
935 assert((h2_.value | h_.value) == h2_.value);
936 }
937 unittest
938 {
939 const ushort[3] x = [1, 2, 3];
940 const h_ = x.bistogram;
941 assert(8*h_.sizeof == 2^^16);
942 const h = h_;
943 assert(h[1] && h[2] && h[3]);
944 assert(!h[0]);
945 assert(!h[4]);
946 }
947
948 /**
949 Computes $(LUCKY Binary (Occurrence) NGram) of Bytes in Input String $(D x).
950 */
951 auto bistogramOverRepresentation(in string x) pure nothrow
952
953 {
954 return bistogram(x.representation);
955 }
956 unittest
957 {
958 const x = "abcdef";
959 auto h = x.bistogramOverRepresentation;
960 size_t ix = 0;
961 foreach (const bin; h)
962 {
963 if (ix >= cast(ubyte)'a' &&
964 ix <= cast(ubyte)'f')
965 {
966 assert(bin);
967 }
968 else
969 {
970 assert(!bin);
971 }
972 ++ix;
973 }
974 version(print) dbg(h);
975 }
976
977 /**
978 Computes $(LUCKY Bi-Gram) of $(D range).
979 */
980 auto bigram(Kind kind = Kind.saturated,
981 Storage storage = Storage.denseDynamic,
982 Symmetry symmetry = Symmetry.ordered,
983 RequestedBinType = void,
984 Range)(in Range range) @safe pure nothrow if (isIterable!Range)
985 {
986 return NGram!(Unqual!(ElementType!Range), 2,
987 kind, storage, symmetry, RequestedBinType, Unqual!(Range))(range);
988 }
989 auto sparseUIntBigram(Kind kind = Kind.saturated,
990 Symmetry symmetry = Symmetry.ordered,
991 Range)(in Range range) @safe pure nothrow if (isIterable!Range)
992 {
993 return NGram!(Unqual!(ElementType!Range), 2,
994 kind, Storage.sparse, symmetry, uint, Unqual!(Range))(range);
995 }
996
997 unittest
998 {
999 const ubyte[] x = [1, 2, 3, 3, 3, 4, 4, 5];
1000
1001 auto bSp = x.bigram!(Kind.saturated, Storage.sparse);
1002 alias SparseNGram = Unqual!(typeof(bSp));
1003
1004 // check msgpacking
1005 // auto bSPBytes = bSp.pack();
1006 // SparseNGram bSp_;
1007 // bSPBytes.unpack(bSp_);
1008 // assert(bSp == bSp_);
1009
1010 auto bS = x.bigram!(Kind.saturated, Storage.denseStatic);
1011 const bSCopy = bS;
1012 bS.put(x);
1013 assert(bS != bSCopy);
1014
1015 const bD = x.bigram!(Kind.saturated, Storage.denseDynamic);
1016 version(print) dbg(bD);
1017
1018 const bb = x.bigram!(Kind.binary, Storage.denseStatic);
1019 version(print) dbg(typeof(bb._bins).stringof);
1020
1021 assert(bb.denseness(0).numerator == 4);
1022 // assert(bb.denseness(-1).numerator == 6);
1023 }
1024 unittest
1025 {
1026 const ubyte[] x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
1027
1028 const bS = x.bigram!(Kind.saturated, Storage.denseStatic);
1029 version(print) dbg(bS);
1030 assert(bS.denseness.numerator == x.length - bS.order + 1);
1031
1032 const bB = x.bigram!(Kind.binary, Storage.denseStatic);
1033 version(print) dbg(bB);
1034 }
1035
1036 unittest
1037 {
1038 const x = [1, 2, 3, 4, 5];
1039 auto h = x.bigram!(Kind.saturated, Storage.sparse);
1040 assert(h.matchDenser(h) == x.length - 1);
1041 }
1042
1043 /**
1044 Computes $(LUCKY Tri-Gram) of $(D range).
1045 */
1046 auto trigram(Kind kind = Kind.saturated,
1047 Storage storage = Storage.denseDynamic,
1048 Symmetry symmetry = Symmetry.ordered,
1049 RequestedBinType = void,
1050 Range)(in Range range) @safe pure nothrow if (isIterable!Range)
1051 {
1052 return NGram!(Unqual!(ElementType!Range), 3,
1053 kind, storage, symmetry, RequestedBinType, Unqual!(Range))(range);
1054 }
1055 auto sparseUIntTrigram(Kind kind = Kind.saturated,
1056 Symmetry symmetry = Symmetry.ordered,
1057 Range)(in Range range) @safe pure nothrow if (isIterable!Range)
1058 {
1059 return NGram!(Unqual!(ElementType!Range), 3,
1060 kind, Storage.sparse, symmetry, uint, Unqual!(Range))(range);
1061 }
1062 unittest
1063 {
1064 const ubyte[] x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
1065
1066 const bS = x.trigram!(Kind.saturated, Storage.denseStatic);
1067 version(print) dbg(bS);
1068 assert(bS.denseness.numerator == x.length - bS.order + 1);
1069
1070 const bD = x.trigram!(Kind.saturated, Storage.denseDynamic);
1071 version(print) dbg(bD);
1072
1073 const bB = x.trigram!(Kind.binary, Storage.denseStatic);
1074 /* version(print) dbg(bB); */
1075 }
1076
1077 /**
1078 Computes $(LUCKY Qua-Gram) of $(D range).
1079 */
1080 auto quagram(Kind kind = Kind.saturated,
1081 Storage storage = Storage.denseDynamic,
1082 Symmetry symmetry = Symmetry.ordered,
1083 RequestedBinType = void,
1084 Range)(in Range range) @safe pure nothrow if (isIterable!Range)
1085 {
1086 return NGram!(Unqual!(ElementType!Range), 4,
1087 kind, storage, symmetry, RequestedBinType, Unqual!(Range))(range);
1088 }
1089 auto sparseUIntQuagram(Kind kind = Kind.saturated,
1090 Symmetry symmetry = Symmetry.ordered,
1091 Range)(in Range range) @safe pure nothrow if (isIterable!Range)
1092 {
1093 return NGram!(Unqual!(ElementType!Range), 4,
1094 kind, Storage.sparse, symmetry, uint, Unqual!(Range))(range);
1095 }
1096 unittest
1097 {
1098 const ubyte[] x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
1099
1100 const bD = x.quagram!(Kind.saturated, Storage.denseDynamic);
1101 version(print) dbg(bD);
1102 assert(bD.denseness.numerator == x.length - bD.order + 1);
1103 }
1104 auto sparseUIntQuagramOverRepresentation(in string x) pure nothrow
1105 {
1106 return sparseUIntQuagram(x.representation);
1107 }
1108
1109 /**
1110 Computes $(LUCKY Pen-Gram) of $(D range).
1111 */
1112 auto pengram(Kind kind = Kind.saturated,
1113 Storage storage = Storage.denseDynamic,
1114 Symmetry symmetry = Symmetry.ordered,
1115 RequestedBinType = void,
1116 Range)(in Range range) @safe pure nothrow if (isIterable!Range)
1117 {
1118 return NGram!(Unqual!(ElementType!Range), 5,
1119 kind, storage, symmetry, RequestedBinType, Unqual!(Range))(range);
1120 }
1121 unittest
1122 {
1123 const ubyte[] x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
1124
1125 const bS = x.pengram!(Kind.saturated, Storage.sparse);
1126 const bS_ = bS;
1127 assert(bS_ == bS);
1128
1129 // skipping denseStatic because it doesn't fit stack anyway
1130
1131 const bD = x.pengram!(Kind.saturated, Storage.denseDynamic);
1132 version(print) dbg(bD);
1133 assert(bD.denseness.numerator == x.length - bD.order + 1);
1134 }