1 /** First and Higher Order Statistics: $(LUCKY Histograms) and $(LUCKY N-grams). 2 3 Copyright: Per Nordlöw 2022-. 4 License: $(WEB boost.org/LICENSE_1_0.txt, Boost License 1.0). 5 Authors: $(WEB Per Nordlöw) 6 7 TODO: Replace static ifs with static final switch when Issue 6921 is fixed 8 9 TODO: Remove overloads using std.string:representation and use sparse maps 10 over dchar for strings. 11 */ 12 module nxt.ngram; 13 14 // version = print; 15 16 import nxt.container.static_bitarray; 17 import std.range: InputRange, ElementType, isInputRange; 18 import std.traits: isSomeChar, isUnsigned, isIntegral, isFloatingPoint, Unqual, isIterable, isAssociativeArray, CommonType; 19 import std.stdint: uint64_t; 20 import std.typecons: Tuple, tuple; 21 import nxt.predicates: allZero, allEqualTo; 22 import nxt.nesses: denseness; 23 import nxt.rational: Rational; 24 // import msgpack; 25 import std.numeric: dotProduct; 26 import std.string: representation; 27 import std.conv: to; 28 29 /** N-Gram Model Kind. */ 30 enum Kind 31 { 32 saturated, // Standard Saturated Integers. Saturation is more robust than default integer wrap-around. 33 binary // Binary Gram. 34 } 35 36 /** N-Gram Model Storage. */ 37 enum Storage 38 { 39 denseDynamic,// Dynamically allocated dense storage. 40 denseStatic, // Statically allocated dense storage. 41 sparse, // N-Dimensional/Level D Maps/Dictionaries. 42 } 43 44 /** N-Gram Model Symmetry. */ 45 enum Symmetry 46 { 47 ordered, // Order between elements matters. 48 unordered, // Order of elements is ignored (relaxed). Tuples are sorted before inserted. 49 } 50 51 /** 52 Computes $(LUCKY N-Gram) Model of Input Range $(D range). 53 54 If N == 1 this N-gram becomes a standard $(LUCKY Histogram), also called a 55 $(LUCKY Unigram). 56 57 If Bin/Bucket Type $(D RequestedBinType), requested by user, is $(D void) it 58 will be chosen automatically based on element type of $(D Range). 59 60 TODO: Add optimized put() be precalculating maximum number of elements that 61 can be inserted without any risk overflow. 62 */ 63 struct NGram(ValueType, 64 int N = 1, 65 Kind kind = Kind.saturated, 66 Storage storage = Storage.denseDynamic, 67 Symmetry symmetry = Symmetry.ordered, 68 RequestedBinType = void, 69 Range) if (is(RequestedBinType == void) || 70 isUnsigned!RequestedBinType || 71 isFloatingPoint!RequestedBinType) 72 { 73 this(in Range range) @safe pure nothrow { put(range); } 74 75 auto ref value() @property @safe pure inout nothrow { return _bins; } 76 77 alias Q = Rational!ulong; 78 79 @property static int order() @safe pure nothrow { return N; } 80 81 enum isBinary = (kind == Kind.binary); 82 enum isDense = (storage == Storage.denseStatic || 83 storage == Storage.denseDynamic); 84 enum isSparse = !isDense; 85 enum cacheDeepDenseness = false; 86 87 enum getStorage = storage; 88 89 /** Returns: Documentation of the type of $(D this). */ 90 string typeName() @property const 91 { 92 string prefix; 93 static if (N == 1) { prefix = "Uni"; } 94 else static if (N == 2) { prefix = "Bi"; } 95 else static if (N == 3) { prefix = "Tri"; } 96 else static if (N == 4) { prefix = "Qua"; } 97 else static if (N == 5) { prefix = "Pen"; } 98 else { prefix = to!string(N) ~ "-"; } 99 string rval; 100 static if (isBinary) 101 { 102 rval ~= "Bit"; 103 } 104 else 105 { 106 rval ~= BinType.stringof; 107 } 108 rval ~= "-" ~ prefix ~ "Gram over " ~ ValueType.stringof ~ "s"; 109 return rval; 110 } 111 112 string toString() @property const 113 { 114 string rval = typeName ~ ": "; 115 116 // print contents 117 static if (isAA) 118 { 119 rval ~= to!string(_bins); 120 } 121 else if (N >= 2 || 122 this.denseness < Q(1, 2)) // if less than half of the bins are occupied 123 { 124 // show as sparse 125 rval ~= "["; 126 127 /+ TODO: Replace these with recursion? +/ 128 static if (N == 1) 129 { 130 bool begun = false; 131 foreach (ix, elt; _bins) /+ TODO: Make static_bitarray support ref foreach and make elt const ref +/ 132 { 133 if (elt) 134 { 135 if (begun) { rval ~= ", "; } 136 rval ~= "{" ~ to!string(ix) ~ "}#" ~ to!string(elt); 137 begun = true; 138 } 139 } 140 } 141 else static if (N == 2) 142 { 143 bool begun = false; 144 foreach (ix0, const ref elt0; _bins) 145 { 146 foreach (ix1, elt1; elt0) /+ TODO: Make static_bitarray support ref foreach and make elt1 const ref +/ 147 { 148 if (elt1) 149 { 150 if (begun) { rval ~= ", "; } 151 rval ~= ("{" ~ 152 to!string(ix0) ~ "," ~ 153 to!string(ix1) ~ "}#" ~ 154 to!string(elt1)); 155 begun = true; 156 } 157 } 158 } 159 } 160 else static if (N == 3) 161 { 162 bool begun = false; 163 foreach (ix0, const ref elt0; _bins) 164 { 165 foreach (ix1, const ref elt1; elt0) 166 { 167 foreach (ix2, elt2; elt1) { /+ TODO: Make static_bitarray support ref foreach and make elt2 const ref +/ 168 if (elt2) 169 { 170 if (begun) { rval ~= ", "; } 171 rval ~= ("{" ~ 172 to!string(ix0) ~ "," ~ 173 to!string(ix1) ~ "," ~ 174 to!string(ix2) ~ "}#" ~ 175 to!string(elt2)); 176 begun = true; 177 } 178 } 179 } 180 } 181 } 182 else static if (N == 4) 183 { 184 bool begun = false; 185 foreach (ix0, const ref elt0; _bins) 186 { 187 foreach (ix1, const ref elt1; elt0) 188 { 189 foreach (ix2, elt2; elt1) 190 { 191 foreach (ix3, elt3; elt2) /+ TODO: Make static_bitarray support ref foreach and make elt2 const ref +/ 192 { 193 if (elt3) 194 { 195 if (begun) { rval ~= ", "; } 196 rval ~= ("{" ~ 197 to!string(ix0) ~ "," ~ 198 to!string(ix1) ~ "," ~ 199 to!string(ix2) ~ "," ~ 200 to!string(ix3) ~ "}#" ~ 201 to!string(elt3)); 202 begun = true; 203 } 204 } 205 } 206 } 207 } 208 } 209 else static if (N == 5) 210 { 211 bool begun = false; 212 foreach (ix0, const ref elt0; _bins) 213 { 214 foreach (ix1, const ref elt1; elt0) 215 { 216 foreach (ix2, const ref elt2; elt1) 217 { 218 foreach (ix3, const ref elt3; elt2) /+ TODO: Make static_bitarray support ref foreach and make elt2 const ref +/ 219 { 220 foreach (ix4, elt4; elt3) /+ TODO: Make static_bitarray support ref foreach and make elt2 const ref +/ 221 { 222 if (elt4) 223 { 224 if (begun) { rval ~= ", "; } 225 rval ~= ("{" ~ 226 to!string(ix0) ~ "," ~ 227 to!string(ix1) ~ "," ~ 228 to!string(ix2) ~ "," ~ 229 to!string(ix3) ~ "," ~ 230 to!string(ix4) ~ "}#" ~ 231 to!string(elt4)); 232 begun = true; 233 } 234 } 235 } 236 } 237 } 238 } 239 } 240 else 241 { 242 static assert(0, "N >= 5 not supported"); 243 } 244 rval ~= "]"; 245 } 246 else 247 { 248 rval ~= to!string(_bins); // default 249 } 250 251 static if (N == 1) 252 { 253 rval ~= " denseness:" ~ to!string(denseness); 254 } 255 else 256 { 257 rval ~= (" shallowDenseness:" ~ to!string(denseness(0)) ~ 258 " deepDenseness:" ~ to!string(denseness(-1))); 259 } 260 261 return rval; 262 } 263 264 void reset() @safe nothrow 265 { 266 import nxt.algorithm_ex: reset; 267 _bins.reset(); 268 } 269 // alias reset = clear; 270 271 bool empty() const @property pure @trusted nothrow 272 { 273 static if (isAA) 274 { 275 return _bins.length == 0; 276 } 277 else 278 { 279 return _bins.allZero; 280 } 281 } 282 283 static if (N >= 2 && 284 storage != Storage.sparse) 285 { 286 Q shallowDenseness() const pure @property @trusted nothrow 287 { 288 static if (N == 2) 289 { 290 ulong x = 0; 291 foreach (const ref elt0; 0..combE) // every possible first ngram element 292 { 293 bool used = false; 294 foreach (elt1; 0..combE) // every possible second ngram element 295 { 296 if (_bins[elt0][elt1]) // if any bit is used 297 { 298 used = true; // we flag it and exit 299 break; 300 } 301 } 302 if (used) ++x; 303 } 304 return Q(x, combE); 305 } 306 else 307 { 308 return Q(1, 1); 309 } 310 } 311 } 312 313 typeof(this) opAssign(RhsRange)(in NGram!(ValueType, N, kind, storage, symmetry, RequestedBinType, RhsRange) rhs) 314 if (isInputRange!RhsRange) // if (is(Unqual!Range == Unqual!RhsRange)) 315 { 316 static if (storage == Storage.denseDynamic) 317 _bins = rhs._bins.dup; 318 else static if (storage == Storage.sparse) 319 _bins = cast(BinType[ValueType[N]])(rhs._bins.dup); /+ TODO: How can we prevent this cast? +/ 320 else static if (storage == Storage.denseStatic) 321 _bins = rhs._bins; 322 else 323 static assert(0, "Cannot handle storage of type " ~ to!string(storage)); 324 return this; 325 } 326 327 static if (N == 1) 328 { 329 void normalize() @safe pure /* nothrow */ 330 { 331 static if (kind != Kind.binary) 332 { 333 static if (isFloatingPoint!ValueType) 334 { 335 immutable scaleFactor = (cast(real)1) / _bins.length; // precalc scaling 336 } 337 foreach (ref elt; _bins) /+ TODO: Why can this throw for associative arrays? +/ 338 { 339 static if (isIntegral!ValueType || 340 isSomeChar!ValueType) 341 { 342 elt /= 2; // drop least significant bit 343 } 344 else static if (isFloatingPoint!ValueType) 345 { 346 elt =* scaleFactor; 347 } 348 } 349 } 350 } 351 } 352 353 /** Invalidate Local Statistics. */ 354 void invalidateStats() @safe nothrow 355 { 356 static if (isAA && cacheDeepDenseness) { _deepDenseness = 0; } 357 } 358 359 /** Saturated Increment $(D x) by one. */ 360 ref T incs(T)(ref T x) @safe nothrow if (isIntegral!T) { if (x != T.max) { ++x; invalidateStats(); } return x; } 361 /** Saturated Increment $(D x) by one. */ 362 ref T incs(T)(ref T x) @safe nothrow if (isFloatingPoint!T) { invalidateStats(); return ++x; } 363 364 static if (kind == Kind.saturated) 365 { 366 /** Increase bin of $(D ng). 367 Bin of ng must already be allocated if its stored in a map. 368 */ 369 ref NGram inc(ValueType[N] ng) @safe pure nothrow 370 { 371 static if (isAA) 372 { 373 incs(_bins[ng]); // just one case! 374 } 375 else 376 { 377 static if (N == 1) 378 { 379 static if (storage == Storage.denseDynamic) 380 { 381 if (!_bins) { _bins = new BinType[noABins]; } 382 } 383 incs(_bins[ng[0]]); 384 } 385 else static if (N == 2) 386 { 387 static if (storage == Storage.denseDynamic) 388 { 389 if (!_bins) { _bins = new BinType[][noABins]; } 390 if (!_bins[ng[0]]) { _bins[ng[0]] = new BinType[noABins]; } 391 } 392 incs(_bins[ng[0]][ng[1]]); 393 } 394 else static if (N == 3) 395 { 396 static if (storage == Storage.denseDynamic) 397 { 398 if (!_bins) { _bins = new BinType[][][noABins]; } 399 if (!_bins[ng[0]]) { _bins[ng[0]] = new BinType[][noABins]; } 400 if (!_bins[ng[0]][ng[1]]) { _bins[ng[0]][ng[1]] = new BinType[noABins]; } 401 } 402 incs(_bins[ng[0]][ng[1]][ng[2]]); 403 } 404 else static if (N == 4) 405 { 406 static if (storage == Storage.denseDynamic) 407 { 408 if (!_bins) { _bins = new BinType[][][][noABins]; } 409 if (!_bins[ng[0]]) { _bins[ng[0]] = new BinType[][][noABins]; } 410 if (!_bins[ng[0]][ng[1]]) { _bins[ng[0]][ng[1]] = new BinType[][noABins]; } 411 if (!_bins[ng[0]][ng[1]][ng[2]]) { _bins[ng[0]][ng[1]][ng[2]] = new BinType[noABins]; } 412 } 413 incs(_bins[ng[0]][ng[1]][ng[2]][ng[3]]); 414 } 415 else static if (N == 5) 416 { 417 static if (storage == Storage.denseDynamic) 418 { 419 if (!_bins) { _bins = new BinType[][][][][noABins]; } 420 if (!_bins[ng[0]]) { _bins[ng[0]] = new BinType[][][][noABins]; } 421 if (!_bins[ng[0]][ng[1]]) { _bins[ng[0]][ng[1]] = new BinType[][][noABins]; } 422 if (!_bins[ng[0]][ng[1]][ng[2]]) { _bins[ng[0]][ng[1]][ng[2]] = new BinType[][noABins]; } 423 if (!_bins[ng[0]][ng[1]][ng[2]][ng[3]]) { _bins[ng[0]][ng[1]][ng[2]][ng[3]] = new BinType[noABins]; } 424 } 425 incs(_bins[ng[0]][ng[1]][ng[2]][ng[3]][ng[4]]); 426 } 427 else 428 { 429 static assert(0, "N >= 6 not supported"); 430 } 431 } 432 return this; 433 } 434 } 435 436 /** Returns: Bin Count of NGram $(D ng). */ 437 BinType opIndex(T)(in T ng) const @safe pure nothrow if (isIntegral!T || 438 __traits(isStaticArray, T)) 439 { 440 static if (isAA) 441 { 442 if (ng in _bins) 443 { 444 return _bins[ng]; 445 } 446 else 447 { 448 return 0; // empty means zero bin 449 } 450 } 451 else 452 { 453 static if (N == 1) { return _bins[ng]; } 454 else static if (N == 2) { return _bins[ng[0]][ng[1]]; } 455 else static if (N == 3) { return _bins[ng[0]][ng[1]][ng[2]]; } 456 else static if (N == 4) { return _bins[ng[0]][ng[1]][ng[2]][ng[3]]; } 457 else static if (N == 5) { return _bins[ng[0]][ng[1]][ng[2]][ng[3]][ng[4]]; } 458 else { static assert(0, "N >= 6 not supported"); } 459 } 460 } 461 462 /* ref BinType opIndexAssign(Index)(BinType b, Index i) @trusted pure nothrow if (isIntegral!Index) in { */ 463 464 /* } */ 465 466 /** Returns: Bin Count of NGram $(D ng). */ 467 auto opIndexAssign(T)(in T ng) const @safe pure nothrow if (isIntegral!T || 468 __traits(isStaticArray, T)) 469 { 470 static if (isAA) 471 { 472 if (ng in _bins) 473 { 474 return _bins[ng]; 475 } 476 else 477 { 478 return 0; // empty means zero bin 479 } 480 } 481 else 482 { 483 static if (N == 1) { return _bins[ng]; } 484 else static if (N == 2) { return _bins[ng[0]][ng[1]]; } 485 else static if (N == 3) { return _bins[ng[0]][ng[1]][ng[2]]; } 486 else static if (N == 4) { return _bins[ng[0]][ng[1]][ng[2]][ng[3]]; } 487 else static if (N == 5) { return _bins[ng[0]][ng[1]][ng[2]][ng[3]][ng[4]]; } 488 else { static assert(0, "N >= 6 not supported"); } 489 } 490 } 491 492 /** Put NGram Element $(D ng) in $(D this). 493 */ 494 ref NGram put(ValueType[N] ng) @safe pure nothrow 495 { 496 static if (isBinary) 497 { 498 static if (N == 1) { _bins[ng[0]] = true; } 499 else static if (N == 2) { _bins[ng[0]][ng[1]] = true; } 500 else static if (N == 3) { _bins[ng[0]][ng[1]][ng[2]] = true; } 501 else static if (N == 4) { _bins[ng[0]][ng[1]][ng[2]][ng[3]] = true; } 502 else static if (N == 5) { _bins[ng[0]][ng[1]][ng[2]][ng[3]][ng[4]] = true; } 503 else { static assert(0, "N >= 6 not supported"); } 504 } 505 else 506 { 507 static if (isAA) 508 { 509 if (ng in _bins) 510 { 511 inc(ng); // increase bin 512 } 513 else 514 { 515 _bins[ng] = 1; // initial bin count 516 } 517 } 518 else 519 { 520 inc(ng); 521 } 522 } 523 invalidateStats(); 524 return this; 525 } 526 527 /** Put NGram Elements $(D range) in $(D this). 528 */ 529 ref NGram put(T)(in T range) @safe pure nothrow if (isIterable!T) 530 { 531 static if (N == 1) 532 { 533 foreach (ix, const ref elt; range) 534 { 535 put([elt]); 536 } 537 } 538 else static if (N == 2) 539 { 540 ValueType prev; // previous element 541 foreach (ix, const ref elt; range) 542 { 543 if (ix >= N-1) { put([prev, elt]); } 544 prev = elt; 545 } 546 } 547 else static if (N == 3) 548 { 549 ValueType pprev, prev; // previous elements 550 foreach (ix, const ref elt; range) 551 { 552 if (ix >= N-1) { put([pprev, prev, elt]); } 553 pprev = prev; 554 prev = elt; 555 } 556 } 557 else static if (N == 4) 558 { 559 ValueType ppprev, pprev, prev; // previous elements 560 foreach (ix, const ref elt; range) 561 { 562 if (ix >= N-1) { put([ppprev, pprev, prev, elt]); } 563 ppprev = pprev; 564 pprev = prev; 565 prev = elt; 566 } 567 } 568 else static if (N == 5) 569 { 570 ValueType pppprev, ppprev, pprev, prev; // previous elements 571 foreach (ix, const ref elt; range) 572 { 573 if (ix >= N-1) { put([pppprev, ppprev, pprev, prev, elt]); } 574 pppprev = ppprev; 575 ppprev = pprev; 576 pprev = prev; 577 prev = elt; 578 } 579 } 580 return this; 581 } 582 583 /** Scalar (Dot) Product Match $(D this) with $(D rhs). */ 584 CommonType!(ulong, BinType) matchDenser(rhsValueType, 585 Kind rhsKind = Kind.saturated, 586 Storage rhsStorage = Storage.denseDynamic, 587 RhsRange)(in NGram!(rhsValueType, 588 N, 589 rhsKind, 590 rhsStorage, 591 symmetry, 592 RequestedBinType, 593 RhsRange) rhs) const @trusted pure /* nothrow */ 594 { 595 static if (this.isDense && rhs.isDense) 596 { 597 static if (N == 1) return dotProduct(this._bins, rhs._bins); 598 else static assert(0, "N >= " ~ to!string(N) ~ " not supported"); 599 } 600 else static if (this.isSparse) 601 { 602 typeof(return) sum = 0; 603 foreach (ix, const ref bin; _bins) // assume lhs is sparsest 604 { 605 sum += bin*rhs[ix]; 606 } 607 return sum; 608 } 609 else static if (rhs.isSparse) 610 { 611 typeof(return) sum = 0; 612 foreach (ix, const ref bin; rhs._bins) // assume lhs is sparsest 613 { 614 sum += this[ix]*bin; 615 } 616 return sum; 617 } 618 else 619 { 620 static assert(0, "Combination of " ~ typeof(storage).stringof ~ " and " ~ 621 typeof(rhsStorage).stringof ~ " not supported"); 622 return 0; 623 } 624 } 625 626 auto opBinary(string op, 627 string file = __FILE__, int line = __LINE__)(in NGram rhs) const @trusted pure /* nothrow */ 628 { 629 NGram tmp; 630 static if (this.isSparse || 631 rhs.isSparse) 632 { 633 void doIt(in NGram a, in NGram b) // more functional than orderInPlace 634 { 635 foreach (ix, const ref bin; a._bins) 636 { 637 if (ix in b._bins) 638 { 639 mixin("tmp._bins[ix] = bin " ~ op ~ " b._bins[ix];"); // propagate pointwise operation 640 } 641 } 642 } 643 if (this.length < rhs.length) { doIt(this, rhs); } 644 else { doIt(rhs, this); } 645 } 646 else static if (this.isBinary && 647 rhs.isBinary) 648 { 649 mixin("tmp._bins = _bins " ~ op ~ "rhs._bins;"); // bitsets cannot be sliced (yet) 650 } 651 else static if (this.isDense && 652 rhs.isDense && 653 N == 1) 654 { 655 assert(this.length == rhs.length); 656 import std.range: zip; 657 import std.algorithm: map; 658 import std.array: array; 659 tmp._bins = zip(this._bins[], rhs._bins[]).map!(a => mixin("a[0] " ~ op ~ "a[1]"))().array; /+ TODO: Use zipWith when Issue 8715 is fixed +/ 660 /* foreach (ix, let; this._bins) { /+ TODO: Reuse Phobos algorithm or range */ +/ 661 /* mixin("tmp[ix] = _bins[ix] " ~ op ~ "rhs._bins[ix];"); */ 662 /* } */ 663 } 664 else 665 { 666 static assert(0, "Combination of " ~ to!string(this.getStorage) ~ " and " ~ 667 to!string(rhs.getStorage) ~ " and N == " ~ to!string(N) ~ " not supported"); 668 } 669 return tmp; 670 } 671 672 /** Determine Bin (Counter) Type. */ 673 static if (isBinary) 674 { 675 alias BinType = bool; 676 } 677 else static if (is(RequestedBinType == void)) 678 { 679 alias BinType = uint64_t; // Bin (counter) type. Count long by default. 680 } 681 else 682 { 683 alias BinType = RequestedBinType; 684 } 685 686 enum bitsE = 8 * ValueType.sizeof; /** Number of Bits in ValueType (element). TODO: This may have to be fixed for ref types. */ 687 enum combE = 2^^bitsE; /** Number of combinations in element. */ 688 enum noABins = combE; // Number of bins per array dimension 689 enum noBins = combE^^N; /** Maximum number of bins (possible). */ 690 691 // Determine storage structure base on element type. 692 static if (storage == Storage.sparse) 693 { 694 BinType[ValueType[N]] _bins; 695 } 696 else static if (isBinary && 697 N*bitsE <= 32) // safely fits on stack 698 { 699 static if (storage == Storage.denseStatic) 700 { 701 static if (N == 1) StaticBitArray!noABins _bins; 702 else static if (N == 2) StaticBitArray!noABins[noABins] _bins; 703 else static if (N == 3) StaticBitArray!noABins[noABins][noABins] _bins; 704 else static assert(0, "Dense static N >= 4 does no safely fit on stack"); 705 } 706 else static if (storage == Storage.denseDynamic) 707 { 708 static if (N == 1) StaticBitArray!noABins _bins; 709 else static if (N == 2) StaticBitArray!noABins[] _bins; 710 else static if (N == 3) StaticBitArray!noABins[][] _bins; 711 else static if (N == 4) StaticBitArray!noABins[][][] _bins; 712 else static if (N == 5) StaticBitArray!noABins[][][][] _bins; 713 else static assert(0, "N >= 6 not supported"); 714 } 715 else static if (storage == Storage.sparse) // shouldn't happen 716 { 717 } 718 } 719 else static if (storage == Storage.denseStatic && 720 (isUnsigned!ValueType || 721 isSomeChar!ValueType) && 722 N*bitsE <= 16) // safely fits on stack 723 { 724 static if (N == 1) BinType[noABins] _bins; 725 else static if (N == 2) BinType[noABins][noABins] _bins; 726 else static assert(0, "Dense static N >= 3 does not safely fit on stack"); 727 } 728 else static if (storage == Storage.denseDynamic && 729 (isUnsigned!ValueType || 730 isSomeChar!ValueType)) // safely fits on stack 731 { 732 static if (N == 1) BinType[] _bins; 733 else static if (N == 2) BinType[][] _bins; 734 else static if (N == 3) BinType[][][] _bins; 735 else static if (N == 4) BinType[][][][] _bins; 736 else static if (N == 5) BinType[][][][][] _bins; 737 else static assert(0, "N >= 6 not supported"); 738 } 739 else 740 { 741 BinType[ValueType[N]] _bins; 742 } 743 744 private alias _bins this; 745 746 enum isAA = isAssociativeArray!(typeof(_bins)); 747 748 static if (isAA && cacheDeepDenseness) 749 { 750 ulong _deepDenseness; // Cache deep denseness in associative arrays 751 } 752 753 Q densenessUncached(int depth = -1) const pure @property @trusted nothrow 754 { 755 static if (isBinary) 756 { 757 static if (N >= 2) 758 { 759 if (N == 2 && depth == 0) // if shallow wanted 760 { 761 return shallowDenseness(); 762 } 763 } 764 return _bins.denseness; 765 } 766 else 767 { 768 static if (isAA) 769 { 770 return Q(_bins.length, noBins); 771 } 772 else 773 { 774 return _bins.denseness(depth); 775 } 776 } 777 } 778 779 /** Returns: Number of Non-Zero Elements in $(D range). */ 780 Q denseness(int depth = -1) const pure @property @safe nothrow 781 { 782 static if (isAA && cacheDeepDenseness) 783 { 784 if (depth == -1) // -1 means deepDenseness 785 { 786 if (!_deepDenseness) // if not yet defined. 787 { 788 unqual(this).densenessUncached(depth).numerator; // calculate it 789 } 790 return Q(_deepDenseness, noBins); 791 } 792 } 793 return densenessUncached(depth); 794 } 795 } 796 797 /** 798 Computes $(LUCKY Bi-Gram) of $(D range). 799 */ 800 auto ngram(int N = 2, 801 Kind kind = Kind.saturated, 802 Storage storage = Storage.denseDynamic, 803 Symmetry symmetry = Symmetry.ordered, 804 RequestedBinType = void, 805 Range)(in Range range) @safe pure nothrow if (isIterable!Range) 806 { 807 return NGram!(Unqual!(ElementType!Range), N, 808 kind, storage, symmetry, RequestedBinType, Unqual!(Range))(range); 809 } 810 auto sparseUIntNGramOverRepresentation(int N = 2, 811 Kind kind = Kind.saturated, 812 Symmetry symmetry = Symmetry.ordered)(in string range) @safe pure nothrow 813 { 814 auto y = range.representation; 815 return ngram!(N, kind, Storage.sparse, symmetry, uint)(y); 816 } 817 818 auto histogram(Kind kind = Kind.saturated, 819 Storage storage = Storage.denseDynamic, 820 Symmetry symmetry = Symmetry.ordered, 821 RequestedBinType = void, 822 Range)(in Range range) @safe pure nothrow if (isIterable!Range) 823 { 824 return NGram!(Unqual!(ElementType!Range), 1, kind, storage, symmetry, RequestedBinType, Unqual!(Range))(range); 825 } 826 alias Hist = NGram; 827 alias hist = histogram; 828 alias unigram = histogram; 829 830 unittest { 831 const ubyte[] x = [1, 2, 3, 4, 5]; 832 const h = x.histogram!(Kind.saturated, Storage.denseStatic); 833 const hh = h + h; 834 835 auto h2 = x.histogram!(Kind.saturated, Storage.denseStatic); 836 h2.put(x); 837 assert(hh == h2); 838 } 839 840 unittest { 841 const ubyte[] x; 842 const hDS = x.histogram!(Kind.saturated, Storage.denseStatic); assert(hDS.empty); 843 844 const hDD = x.histogram!(Kind.saturated, Storage.denseDynamic); assert(hDD.empty); 845 846 auto h = x.histogram!(Kind.saturated, Storage.sparse); assert(h.empty); 847 const ubyte[5] x1 = [1, 2, 3, 4, 5]; 848 h.put(x1); assert(!h.empty); 849 assert(h.matchDenser(h) == x1.length); 850 auto hh = h + h; 851 assert(hh.matchDenser(hh) == 4*x1.length); 852 853 h.destroy(); assert(h.empty); 854 855 auto hU32 = x.histogram!(Kind.saturated, Storage.sparse, Symmetry.ordered, uint); 856 } 857 858 unittest { 859 alias ValueType = ubyte; 860 const ValueType[3] x = [11, 12, 13]; 861 862 auto h = x.histogram!(Kind.saturated, Storage.denseStatic); 863 assert(h[0] == 0); 864 assert(h[11] == 1 && h[12] == 1 && h[13] == 1); 865 assert(3 == h.matchDenser(h)); 866 867 h.put(x); 868 assert(h[11] == 2 && h[12] == 2 && h[13] == 2); 869 assert(12 == h.matchDenser(h)); 870 871 version (print) dbg(h); 872 873 auto hD = x.histogram!(Kind.saturated, Storage.denseDynamic); 874 version (print) dbg(hD); 875 } 876 877 unittest { 878 alias ValueType = ulong; 879 const ValueType[3] x = [1, 2, 3]; 880 auto h = x.histogram!(Kind.saturated, Storage.denseStatic); 881 h.put([4, 5, 6]); 882 assert(h.length == 6); 883 } 884 885 /** 886 Computes $(LUCKY Binary Histogram) of $(D range). 887 */ 888 auto bistogram(Range)(in Range range) @safe pure nothrow if (isIterable!Range) 889 890 { 891 return NGram!(Unqual!(ElementType!Range), 1, 892 Kind.binary, 893 Storage.denseStatic, 894 Symmetry.ordered, 895 void, 896 Unqual!Range)(range); 897 } 898 alias bunigram = bistogram; 899 900 unittest { 901 const ubyte[] x; 902 const b = x.bistogram; 903 assert(b.empty); 904 905 // test const unqualification 906 NGram!(ubyte, 1, Kind.binary, Storage.denseStatic, Symmetry.ordered, void, const(ubyte)[]) cb; 907 cb = b; 908 assert(cb == b); 909 910 // test immutable unqualification 911 NGram!(ubyte, 1, Kind.binary, Storage.denseStatic, Symmetry.ordered, void, immutable(ubyte)[]) ib; 912 ib = b; 913 assert(ib == b); 914 } 915 916 unittest { 917 const ubyte[3] x = [1, 2, 3]; 918 const h_ = x.bistogram; 919 assert(8*h_.sizeof == 2^^8); 920 const h = h_; 921 assert(h[1] && h[2] && h[3]); 922 assert(!h[0]); 923 assert(!h[4]); 924 925 const h2_ = h_; 926 assert((h2_ & h_) == h2_); 927 assert((h2_ | h_) == h2_); 928 assert((h2_.value & h_.value) == h2_.value); 929 assert((h2_.value | h_.value) == h2_.value); 930 } 931 unittest { 932 const ushort[3] x = [1, 2, 3]; 933 const h_ = x.bistogram; 934 assert(8*h_.sizeof == 2^^16); 935 const h = h_; 936 assert(h[1] && h[2] && h[3]); 937 assert(!h[0]); 938 assert(!h[4]); 939 } 940 941 /** 942 Computes $(LUCKY Binary (Occurrence) NGram) of Bytes in Input String $(D x). 943 */ 944 auto bistogramOverRepresentation(in string x) pure nothrow 945 946 { 947 return bistogram(x.representation); 948 } 949 unittest { 950 const x = "abcdef"; 951 auto h = x.bistogramOverRepresentation; 952 size_t ix = 0; 953 foreach (const bin; h) 954 { 955 if (ix >= cast(ubyte)'a' && 956 ix <= cast(ubyte)'f') 957 { 958 assert(bin); 959 } 960 else 961 { 962 assert(!bin); 963 } 964 ++ix; 965 } 966 version (print) dbg(h); 967 } 968 969 /** 970 Computes $(LUCKY Bi-Gram) of $(D range). 971 */ 972 auto bigram(Kind kind = Kind.saturated, 973 Storage storage = Storage.denseDynamic, 974 Symmetry symmetry = Symmetry.ordered, 975 RequestedBinType = void, 976 Range)(in Range range) @safe pure nothrow if (isIterable!Range) 977 { 978 return NGram!(Unqual!(ElementType!Range), 2, 979 kind, storage, symmetry, RequestedBinType, Unqual!(Range))(range); 980 } 981 auto sparseUIntBigram(Kind kind = Kind.saturated, 982 Symmetry symmetry = Symmetry.ordered, 983 Range)(in Range range) @safe pure nothrow if (isIterable!Range) 984 { 985 return NGram!(Unqual!(ElementType!Range), 2, 986 kind, Storage.sparse, symmetry, uint, Unqual!(Range))(range); 987 } 988 989 unittest { 990 const ubyte[] x = [1, 2, 3, 3, 3, 4, 4, 5]; 991 992 auto bSp = x.bigram!(Kind.saturated, Storage.sparse); 993 alias SparseNGram = Unqual!(typeof(bSp)); 994 995 // check msgpacking 996 // auto bSPBytes = bSp.pack(); 997 // SparseNGram bSp_; 998 // bSPBytes.unpack(bSp_); 999 // assert(bSp == bSp_); 1000 1001 auto bS = x.bigram!(Kind.saturated, Storage.denseStatic); 1002 const bSCopy = bS; 1003 bS.put(x); 1004 assert(bS != bSCopy); 1005 1006 const bD = x.bigram!(Kind.saturated, Storage.denseDynamic); 1007 version (print) dbg(bD); 1008 1009 const bb = x.bigram!(Kind.binary, Storage.denseStatic); 1010 version (print) dbg(typeof(bb._bins).stringof); 1011 1012 assert(bb.denseness(0).numerator == 4); 1013 // assert(bb.denseness(-1).numerator == 6); 1014 } 1015 unittest { 1016 const ubyte[] x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; 1017 1018 const bS = x.bigram!(Kind.saturated, Storage.denseStatic); 1019 version (print) dbg(bS); 1020 assert(bS.denseness.numerator == x.length - bS.order + 1); 1021 1022 const bB = x.bigram!(Kind.binary, Storage.denseStatic); 1023 version (print) dbg(bB); 1024 } 1025 1026 unittest { 1027 const x = [1, 2, 3, 4, 5]; 1028 auto h = x.bigram!(Kind.saturated, Storage.sparse); 1029 assert(h.matchDenser(h) == x.length - 1); 1030 } 1031 1032 /** 1033 Computes $(LUCKY Tri-Gram) of $(D range). 1034 */ 1035 auto trigram(Kind kind = Kind.saturated, 1036 Storage storage = Storage.denseDynamic, 1037 Symmetry symmetry = Symmetry.ordered, 1038 RequestedBinType = void, 1039 Range)(in Range range) @safe pure nothrow if (isIterable!Range) 1040 { 1041 return NGram!(Unqual!(ElementType!Range), 3, 1042 kind, storage, symmetry, RequestedBinType, Unqual!(Range))(range); 1043 } 1044 auto sparseUIntTrigram(Kind kind = Kind.saturated, 1045 Symmetry symmetry = Symmetry.ordered, 1046 Range)(in Range range) @safe pure nothrow if (isIterable!Range) 1047 { 1048 return NGram!(Unqual!(ElementType!Range), 3, 1049 kind, Storage.sparse, symmetry, uint, Unqual!(Range))(range); 1050 } 1051 unittest { 1052 const ubyte[] x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; 1053 1054 const bS = x.trigram!(Kind.saturated, Storage.denseStatic); 1055 version (print) dbg(bS); 1056 assert(bS.denseness.numerator == x.length - bS.order + 1); 1057 1058 const bD = x.trigram!(Kind.saturated, Storage.denseDynamic); 1059 version (print) dbg(bD); 1060 1061 const bB = x.trigram!(Kind.binary, Storage.denseStatic); 1062 /* version (print) dbg(bB); */ 1063 } 1064 1065 /** 1066 Computes $(LUCKY Qua-Gram) of $(D range). 1067 */ 1068 auto quagram(Kind kind = Kind.saturated, 1069 Storage storage = Storage.denseDynamic, 1070 Symmetry symmetry = Symmetry.ordered, 1071 RequestedBinType = void, 1072 Range)(in Range range) @safe pure nothrow if (isIterable!Range) 1073 { 1074 return NGram!(Unqual!(ElementType!Range), 4, 1075 kind, storage, symmetry, RequestedBinType, Unqual!(Range))(range); 1076 } 1077 auto sparseUIntQuagram(Kind kind = Kind.saturated, 1078 Symmetry symmetry = Symmetry.ordered, 1079 Range)(in Range range) @safe pure nothrow if (isIterable!Range) 1080 { 1081 return NGram!(Unqual!(ElementType!Range), 4, 1082 kind, Storage.sparse, symmetry, uint, Unqual!(Range))(range); 1083 } 1084 unittest { 1085 const ubyte[] x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; 1086 1087 const bD = x.quagram!(Kind.saturated, Storage.denseDynamic); 1088 version (print) dbg(bD); 1089 assert(bD.denseness.numerator == x.length - bD.order + 1); 1090 } 1091 auto sparseUIntQuagramOverRepresentation(in string x) pure nothrow 1092 { 1093 return sparseUIntQuagram(x.representation); 1094 } 1095 1096 /** 1097 Computes $(LUCKY Pen-Gram) of $(D range). 1098 */ 1099 auto pengram(Kind kind = Kind.saturated, 1100 Storage storage = Storage.denseDynamic, 1101 Symmetry symmetry = Symmetry.ordered, 1102 RequestedBinType = void, 1103 Range)(in Range range) @safe pure nothrow if (isIterable!Range) 1104 { 1105 return NGram!(Unqual!(ElementType!Range), 5, 1106 kind, storage, symmetry, RequestedBinType, Unqual!(Range))(range); 1107 } 1108 unittest { 1109 const ubyte[] x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; 1110 1111 const bS = x.pengram!(Kind.saturated, Storage.sparse); 1112 const bS_ = bS; 1113 assert(bS_ == bS); 1114 1115 // skipping denseStatic because it doesn't fit stack anyway 1116 1117 const bD = x.pengram!(Kind.saturated, Storage.denseDynamic); 1118 version (print) dbg(bD); 1119 assert(bD.denseness.numerator == x.length - bD.order + 1); 1120 }