1 /** First and Higher Order Statistics: $(LUCKY Histograms) and $(LUCKY N-grams). 2 3 Copyright: Per Nordlöw 2018-. 4 License: $(WEB boost.org/LICENSE_1_0.txt, Boost License 1.0). 5 Authors: $(WEB Per Nordlöw) 6 7 TODO Replace static ifs with static final switch when Issue 6921 is fixed 8 9 TODO Remove overloads using std.string:representation and use sparse maps 10 over dchar for strings. 11 */ 12 module nxt.ngram; 13 14 import nxt.static_bitarray; 15 import std.range: InputRange, ElementType, isInputRange; 16 import std.traits: isSomeChar, isUnsigned, isIntegral, isFloatingPoint, Unqual, isStaticArray, isIterable, isAssociativeArray, CommonType; 17 import std.stdint: uint64_t; 18 import std.typecons: Tuple, tuple; 19 import nxt.predicates: allZero, allEqualTo; 20 import nxt.nesses: denseness; 21 import nxt.rational: Rational; 22 // import msgpack; 23 import std.numeric: dotProduct; 24 import std..string: representation; 25 import std.conv: to; 26 27 // version = print; 28 29 /** N-Gram Model Kind. */ 30 enum Kind 31 { 32 saturated, // Standard Saturated Integers. Saturation is more robust than default integer wrap-around. 33 binary // Binary Gram. 34 } 35 36 /** N-Gram Model Storage. */ 37 enum Storage 38 { 39 denseDynamic,// Dynamically allocated dense storage. 40 denseStatic, // Statically allocated dense storage. 41 sparse, // N-Dimensional/Level D Maps/Dictionaries. 42 } 43 44 /** N-Gram Model Symmetry. */ 45 enum Symmetry 46 { 47 ordered, // Order between elements matters. 48 unordered, // Order of elements is ignored (relaxed). Tuples are sorted before inserted. 49 } 50 51 /** 52 Computes $(LUCKY N-Gram) Model of Input Range $(D range). 53 54 If N == 1 this N-gram becomes a standard $(LUCKY Histogram), also called a 55 $(LUCKY Unigram). 56 57 If Bin/Bucket Type $(D RequestedBinType), requested by user, is $(D void) it 58 will be chosen automatically based on element type of $(D Range). 59 60 TODO Add optimized put() be precalculating maximum number of elements that 61 can be inserted without any risk overflow. 62 */ 63 struct NGram(ValueType, 64 int N = 1, 65 Kind kind = Kind.saturated, 66 Storage storage = Storage.denseDynamic, 67 Symmetry symmetry = Symmetry.ordered, 68 RequestedBinType = void, 69 Range) if (is(RequestedBinType == void) || 70 isUnsigned!RequestedBinType || 71 isFloatingPoint!RequestedBinType) 72 { 73 this(in Range range) @safe pure nothrow { put(range); } 74 75 auto ref value() @property @safe pure inout nothrow { return _bins; } 76 77 alias Q = Rational!ulong; 78 79 @property static int order() @safe pure nothrow { return N; } 80 81 enum isBinary = (kind == Kind.binary); 82 enum isDense = (storage == Storage.denseStatic || 83 storage == Storage.denseDynamic); 84 enum isSparse = !isDense; 85 enum cacheDeepDenseness = false; 86 87 enum getStorage = storage; 88 89 /** Returns: Documentation of the type of $(D this). */ 90 string typeName() @property const 91 { 92 string prefix; 93 static if (N == 1) { prefix = "Uni"; } 94 else static if (N == 2) { prefix = "Bi"; } 95 else static if (N == 3) { prefix = "Tri"; } 96 else static if (N == 4) { prefix = "Qua"; } 97 else static if (N == 5) { prefix = "Pen"; } 98 else { prefix = to!string(N) ~ "-"; } 99 string rval; 100 static if (isBinary) 101 { 102 rval ~= "Bit"; 103 } 104 else 105 { 106 rval ~= BinType.stringof; 107 } 108 rval ~= "-" ~ prefix ~ "Gram over " ~ ValueType.stringof ~ "s"; 109 return rval; 110 } 111 112 string toString() @property const 113 { 114 string rval = typeName ~ ": "; 115 116 // print contents 117 static if (isAA) 118 { 119 rval ~= to!string(_bins); 120 } 121 else if (N >= 2 || 122 this.denseness < Q(1, 2)) // if less than half of the bins are occupied 123 { 124 // show as sparse 125 rval ~= "["; 126 127 // TODO Replace these with recursion? 128 static if (N == 1) 129 { 130 bool begun = false; 131 foreach (ix, elt; _bins) // TODO Make static_bitarray support ref foreach and make elt const ref 132 { 133 if (elt) 134 { 135 if (begun) { rval ~= ", "; } 136 rval ~= "{" ~ to!string(ix) ~ "}#" ~ to!string(elt); 137 begun = true; 138 } 139 } 140 } 141 else static if (N == 2) 142 { 143 bool begun = false; 144 foreach (ix0, const ref elt0; _bins) 145 { 146 foreach (ix1, elt1; elt0) // TODO Make static_bitarray support ref foreach and make elt1 const ref 147 { 148 if (elt1) 149 { 150 if (begun) { rval ~= ", "; } 151 rval ~= ("{" ~ 152 to!string(ix0) ~ "," ~ 153 to!string(ix1) ~ "}#" ~ 154 to!string(elt1)); 155 begun = true; 156 } 157 } 158 } 159 } 160 else static if (N == 3) 161 { 162 bool begun = false; 163 foreach (ix0, const ref elt0; _bins) 164 { 165 foreach (ix1, const ref elt1; elt0) 166 { 167 foreach (ix2, elt2; elt1) { // TODO Make static_bitarray support ref foreach and make elt2 const ref 168 if (elt2) 169 { 170 if (begun) { rval ~= ", "; } 171 rval ~= ("{" ~ 172 to!string(ix0) ~ "," ~ 173 to!string(ix1) ~ "," ~ 174 to!string(ix2) ~ "}#" ~ 175 to!string(elt2)); 176 begun = true; 177 } 178 } 179 } 180 } 181 } 182 else static if (N == 4) 183 { 184 bool begun = false; 185 foreach (ix0, const ref elt0; _bins) 186 { 187 foreach (ix1, const ref elt1; elt0) 188 { 189 foreach (ix2, elt2; elt1) 190 { 191 foreach (ix3, elt3; elt2) // TODO Make static_bitarray support ref foreach and make elt2 const ref 192 { 193 if (elt3) 194 { 195 if (begun) { rval ~= ", "; } 196 rval ~= ("{" ~ 197 to!string(ix0) ~ "," ~ 198 to!string(ix1) ~ "," ~ 199 to!string(ix2) ~ "," ~ 200 to!string(ix3) ~ "}#" ~ 201 to!string(elt3)); 202 begun = true; 203 } 204 } 205 } 206 } 207 } 208 } 209 else static if (N == 5) 210 { 211 bool begun = false; 212 foreach (ix0, const ref elt0; _bins) 213 { 214 foreach (ix1, const ref elt1; elt0) 215 { 216 foreach (ix2, const ref elt2; elt1) 217 { 218 foreach (ix3, const ref elt3; elt2) // TODO Make static_bitarray support ref foreach and make elt2 const ref 219 { 220 foreach (ix4, elt4; elt3) // TODO Make static_bitarray support ref foreach and make elt2 const ref 221 { 222 if (elt4) 223 { 224 if (begun) { rval ~= ", "; } 225 rval ~= ("{" ~ 226 to!string(ix0) ~ "," ~ 227 to!string(ix1) ~ "," ~ 228 to!string(ix2) ~ "," ~ 229 to!string(ix3) ~ "," ~ 230 to!string(ix4) ~ "}#" ~ 231 to!string(elt4)); 232 begun = true; 233 } 234 } 235 } 236 } 237 } 238 } 239 } 240 else 241 { 242 static assert(0, "N >= 5 not supported"); 243 } 244 rval ~= "]"; 245 } 246 else 247 { 248 rval ~= to!string(_bins); // default 249 } 250 251 static if (N == 1) 252 { 253 rval ~= " denseness:" ~ to!string(denseness); 254 } 255 else 256 { 257 rval ~= (" shallowDenseness:" ~ to!string(denseness(0)) ~ 258 " deepDenseness:" ~ to!string(denseness(-1))); 259 } 260 261 return rval; 262 } 263 264 void reset() @safe nothrow 265 { 266 import nxt.algorithm_ex: reset; 267 _bins.reset(); 268 } 269 // alias reset = clear; 270 271 bool empty() const pure @property @trusted nothrow 272 { 273 static if (isAA) 274 { 275 return _bins.length == 0; 276 } 277 else 278 { 279 return _bins.allZero; 280 } 281 } 282 283 static if (N >= 2 && 284 storage != Storage.sparse) 285 { 286 Q shallowDenseness() const pure @property @trusted nothrow 287 { 288 static if (N == 2) 289 { 290 ulong x = 0; 291 foreach (const ref elt0; 0..combE) // every possible first ngram element 292 { 293 bool used = false; 294 foreach (elt1; 0..combE) // every possible second ngram element 295 { 296 if (_bins[elt0][elt1]) // if any bit is used 297 { 298 used = true; // we flag it and exit 299 break; 300 } 301 } 302 if (used) ++x; 303 } 304 return Q(x, combE); 305 } 306 else 307 { 308 return Q(1, 1); 309 } 310 } 311 } 312 313 typeof(this) opAssign(RhsRange)(in NGram!(ValueType, N, kind, storage, symmetry, RequestedBinType, RhsRange) rhs) 314 if (isInputRange!RhsRange) // if (is(Unqual!Range == Unqual!RhsRange)) 315 { 316 static if (storage == Storage.denseDynamic) 317 _bins = rhs._bins.dup; 318 else static if (storage == Storage.sparse) 319 _bins = cast(BinType[ValueType[N]])(rhs._bins.dup); // TODO How can we prevent this cast? 320 else static if (storage == Storage.denseStatic) 321 _bins = rhs._bins; 322 else 323 static assert(0, "Cannot handle storage of type " ~ to!string(storage)); 324 return this; 325 } 326 327 static if (N == 1) 328 { 329 void normalize() @safe pure /* nothrow */ 330 { 331 static if (kind != Kind.binary) 332 { 333 static if (isFloatingPoint!ValueType) 334 { 335 immutable scaleFactor = (cast(real)1) / _bins.length; // precalc scaling 336 } 337 foreach (ref elt; _bins) // TODO Why can this throw for associative arrays? 338 { 339 static if (isIntegral!ValueType || 340 isSomeChar!ValueType) 341 { 342 elt /= 2; // drop least significant bit 343 } 344 else static if (isFloatingPoint!ValueType) 345 { 346 elt =* scaleFactor; 347 } 348 } 349 } 350 } 351 } 352 353 /** Invalidate Local Statistics. */ 354 void invalidateStats() @safe nothrow 355 { 356 static if (isAA && cacheDeepDenseness) { _deepDenseness = 0; } 357 } 358 359 /** Saturated Increment $(D x) by one. */ 360 ref T incs(T)(ref T x) @safe nothrow if (isIntegral!T) { if (x != T.max) { ++x; invalidateStats(); } return x; } 361 /** Saturated Increment $(D x) by one. */ 362 ref T incs(T)(ref T x) @safe nothrow if (isFloatingPoint!T) { invalidateStats(); return ++x; } 363 364 static if (kind == Kind.saturated) 365 { 366 /** Increase bin of $(D ng). 367 Bin of ng must already be allocated if its stored in a map. 368 */ 369 ref NGram inc(ValueType[N] ng) @safe pure nothrow 370 { 371 static if (isAA) 372 { 373 incs(_bins[ng]); // just one case! 374 } 375 else 376 { 377 static if (N == 1) 378 { 379 static if (storage == Storage.denseDynamic) 380 { 381 if (!_bins) { _bins = new BinType[noABins]; } 382 } 383 incs(_bins[ng[0]]); 384 } 385 else static if (N == 2) 386 { 387 static if (storage == Storage.denseDynamic) 388 { 389 if (!_bins) { _bins = new BinType[][noABins]; } 390 if (!_bins[ng[0]]) { _bins[ng[0]] = new BinType[noABins]; } 391 } 392 incs(_bins[ng[0]][ng[1]]); 393 } 394 else static if (N == 3) 395 { 396 static if (storage == Storage.denseDynamic) 397 { 398 if (!_bins) { _bins = new BinType[][][noABins]; } 399 if (!_bins[ng[0]]) { _bins[ng[0]] = new BinType[][noABins]; } 400 if (!_bins[ng[0]][ng[1]]) { _bins[ng[0]][ng[1]] = new BinType[noABins]; } 401 } 402 incs(_bins[ng[0]][ng[1]][ng[2]]); 403 } 404 else static if (N == 4) 405 { 406 static if (storage == Storage.denseDynamic) 407 { 408 if (!_bins) { _bins = new BinType[][][][noABins]; } 409 if (!_bins[ng[0]]) { _bins[ng[0]] = new BinType[][][noABins]; } 410 if (!_bins[ng[0]][ng[1]]) { _bins[ng[0]][ng[1]] = new BinType[][noABins]; } 411 if (!_bins[ng[0]][ng[1]][ng[2]]) { _bins[ng[0]][ng[1]][ng[2]] = new BinType[noABins]; } 412 } 413 incs(_bins[ng[0]][ng[1]][ng[2]][ng[3]]); 414 } 415 else static if (N == 5) 416 { 417 static if (storage == Storage.denseDynamic) 418 { 419 if (!_bins) { _bins = new BinType[][][][][noABins]; } 420 if (!_bins[ng[0]]) { _bins[ng[0]] = new BinType[][][][noABins]; } 421 if (!_bins[ng[0]][ng[1]]) { _bins[ng[0]][ng[1]] = new BinType[][][noABins]; } 422 if (!_bins[ng[0]][ng[1]][ng[2]]) { _bins[ng[0]][ng[1]][ng[2]] = new BinType[][noABins]; } 423 if (!_bins[ng[0]][ng[1]][ng[2]][ng[3]]) { _bins[ng[0]][ng[1]][ng[2]][ng[3]] = new BinType[noABins]; } 424 } 425 incs(_bins[ng[0]][ng[1]][ng[2]][ng[3]][ng[4]]); 426 } 427 else 428 { 429 static assert(0, "N >= 6 not supported"); 430 } 431 } 432 return this; 433 } 434 } 435 436 /** Returns: Bin Count of NGram $(D ng). */ 437 BinType opIndex(T)(in T ng) const @safe pure nothrow if (isIntegral!T || 438 isStaticArray!T) 439 { 440 static if (isAA) 441 { 442 if (ng in _bins) 443 { 444 return _bins[ng]; 445 } 446 else 447 { 448 return 0; // empty means zero bin 449 } 450 } 451 else 452 { 453 static if (N == 1) { return _bins[ng]; } 454 else static if (N == 2) { return _bins[ng[0]][ng[1]]; } 455 else static if (N == 3) { return _bins[ng[0]][ng[1]][ng[2]]; } 456 else static if (N == 4) { return _bins[ng[0]][ng[1]][ng[2]][ng[3]]; } 457 else static if (N == 5) { return _bins[ng[0]][ng[1]][ng[2]][ng[3]][ng[4]]; } 458 else { static assert(0, "N >= 6 not supported"); } 459 } 460 } 461 462 /* ref BinType opIndexAssign(Index)(BinType b, Index i) @trusted pure nothrow if (isIntegral!Index) in { */ 463 464 /* } */ 465 466 /** Returns: Bin Count of NGram $(D ng). */ 467 auto opIndexAssign(T)(in T ng) const @safe pure nothrow if (isIntegral!T || 468 isStaticArray!T) 469 { 470 static if (isAA) 471 { 472 if (ng in _bins) 473 { 474 return _bins[ng]; 475 } 476 else 477 { 478 return 0; // empty means zero bin 479 } 480 } 481 else 482 { 483 static if (N == 1) { return _bins[ng]; } 484 else static if (N == 2) { return _bins[ng[0]][ng[1]]; } 485 else static if (N == 3) { return _bins[ng[0]][ng[1]][ng[2]]; } 486 else static if (N == 4) { return _bins[ng[0]][ng[1]][ng[2]][ng[3]]; } 487 else static if (N == 5) { return _bins[ng[0]][ng[1]][ng[2]][ng[3]][ng[4]]; } 488 else { static assert(0, "N >= 6 not supported"); } 489 } 490 } 491 492 /** Put NGram Element $(D ng) in $(D this). 493 */ 494 ref NGram put(ValueType[N] ng) @safe pure nothrow 495 { 496 static if (isBinary) 497 { 498 static if (N == 1) { _bins[ng[0]] = true; } 499 else static if (N == 2) { _bins[ng[0]][ng[1]] = true; } 500 else static if (N == 3) { _bins[ng[0]][ng[1]][ng[2]] = true; } 501 else static if (N == 4) { _bins[ng[0]][ng[1]][ng[2]][ng[3]] = true; } 502 else static if (N == 5) { _bins[ng[0]][ng[1]][ng[2]][ng[3]][ng[4]] = true; } 503 else { static assert(0, "N >= 6 not supported"); } 504 } 505 else 506 { 507 static if (isAA) 508 { 509 if (ng in _bins) 510 { 511 inc(ng); // increase bin 512 } 513 else 514 { 515 _bins[ng] = 1; // initial bin count 516 } 517 } 518 else 519 { 520 inc(ng); 521 } 522 } 523 invalidateStats(); 524 return this; 525 } 526 527 /** Put NGram Elements $(D range) in $(D this). 528 */ 529 ref NGram put(T)(in T range) @safe pure nothrow if (isIterable!T) 530 { 531 static if (N == 1) 532 { 533 foreach (ix, const ref elt; range) 534 { 535 put([elt]); 536 } 537 } 538 else static if (N == 2) 539 { 540 ValueType prev; // previous element 541 foreach (ix, const ref elt; range) 542 { 543 if (ix >= N-1) { put([prev, elt]); } 544 prev = elt; 545 } 546 } 547 else static if (N == 3) 548 { 549 ValueType pprev, prev; // previous elements 550 foreach (ix, const ref elt; range) 551 { 552 if (ix >= N-1) { put([pprev, prev, elt]); } 553 pprev = prev; 554 prev = elt; 555 } 556 } 557 else static if (N == 4) 558 { 559 ValueType ppprev, pprev, prev; // previous elements 560 foreach (ix, const ref elt; range) 561 { 562 if (ix >= N-1) { put([ppprev, pprev, prev, elt]); } 563 ppprev = pprev; 564 pprev = prev; 565 prev = elt; 566 } 567 } 568 else static if (N == 5) 569 { 570 ValueType pppprev, ppprev, pprev, prev; // previous elements 571 foreach (ix, const ref elt; range) 572 { 573 if (ix >= N-1) { put([pppprev, ppprev, pprev, prev, elt]); } 574 pppprev = ppprev; 575 ppprev = pprev; 576 pprev = prev; 577 prev = elt; 578 } 579 } 580 return this; 581 } 582 583 /** Scalar (Dot) Product Match $(D this) with $(D rhs). */ 584 CommonType!(ulong, BinType) matchDenser(rhsValueType, 585 Kind rhsKind = Kind.saturated, 586 Storage rhsStorage = Storage.denseDynamic, 587 RhsRange)(in NGram!(rhsValueType, 588 N, 589 rhsKind, 590 rhsStorage, 591 symmetry, 592 RequestedBinType, 593 RhsRange) rhs) const @trusted pure /* nothrow */ 594 { 595 static if (this.isDense && rhs.isDense) 596 { 597 static if (N == 1) return dotProduct(this._bins, rhs._bins); 598 else static assert(0, "N >= " ~ to!string(N) ~ " not supported"); 599 } 600 else static if (this.isSparse) 601 { 602 typeof(return) sum = 0; 603 foreach (ix, const ref bin; _bins) // assume lhs is sparsest 604 { 605 sum += bin*rhs[ix]; 606 } 607 return sum; 608 } 609 else static if (rhs.isSparse) 610 { 611 typeof(return) sum = 0; 612 foreach (ix, const ref bin; rhs._bins) // assume lhs is sparsest 613 { 614 sum += this[ix]*bin; 615 } 616 return sum; 617 } 618 else 619 { 620 static assert(0, "Combination of " ~ typeof(storage).stringof ~ " and " ~ 621 typeof(rhsStorage).stringof ~ " not supported"); 622 return 0; 623 } 624 } 625 626 auto opBinary(string op, 627 string file = __FILE__, int line = __LINE__)(in NGram rhs) const @trusted pure /* nothrow */ 628 { 629 NGram tmp; 630 static if (this.isSparse || 631 rhs.isSparse) 632 { 633 void doIt(in NGram a, in NGram b) // more functional than orderInPlace 634 { 635 foreach (ix, const ref bin; a._bins) 636 { 637 if (ix in b._bins) 638 { 639 mixin("tmp._bins[ix] = bin " ~ op ~ " b._bins[ix];"); // propagate pointwise operation 640 } 641 } 642 } 643 if (this.length < rhs.length) { doIt(this, rhs); } 644 else { doIt(rhs, this); } 645 } 646 else static if (this.isBinary && 647 rhs.isBinary) 648 { 649 mixin("tmp._bins = _bins " ~ op ~ "rhs._bins;"); // bitsets cannot be sliced (yet) 650 } 651 else static if (this.isDense && 652 rhs.isDense && 653 N == 1) 654 { 655 assert(this.length == rhs.length); 656 import std.range: zip; 657 import std.algorithm: map; 658 import std.array: array; 659 tmp._bins = zip(this._bins[], rhs._bins[]).map!(a => mixin("a[0] " ~ op ~ "a[1]"))().array; // TODO Use zipWith when Issue 8715 is fixed 660 /* foreach (ix, let; this._bins) { // TODO Reuse Phobos algorithm or range */ 661 /* mixin("tmp[ix] = _bins[ix] " ~ op ~ "rhs._bins[ix];"); */ 662 /* } */ 663 } 664 else 665 { 666 static assert(0, "Combination of " ~ to!string(this.getStorage) ~ " and " ~ 667 to!string(rhs.getStorage) ~ " and N == " ~ to!string(N) ~ " not supported"); 668 } 669 return tmp; 670 } 671 672 /** Determine Bin (Counter) Type. */ 673 static if (isBinary) 674 { 675 alias BinType = bool; 676 } 677 else static if (is(RequestedBinType == void)) 678 { 679 alias BinType = uint64_t; // Bin (counter) type. Count long by default. 680 } 681 else 682 { 683 alias BinType = RequestedBinType; 684 } 685 686 enum bitsE = 8 * ValueType.sizeof; /** Number of Bits in ValueType (element). TODO This may have to be fixed for ref types. */ 687 enum combE = 2^^bitsE; /** Number of combinations in element. */ 688 enum noABins = combE; // Number of bins per array dimension 689 enum noBins = combE^^N; /** Maximum number of bins (possible). */ 690 691 // Determine storage structure base on element type. 692 static if (storage == Storage.sparse) 693 { 694 BinType[ValueType[N]] _bins; 695 } 696 else static if (isBinary && 697 N*bitsE <= 32) // safely fits on stack 698 { 699 static if (storage == Storage.denseStatic) 700 { 701 static if (N == 1) StaticBitArray!noABins _bins; 702 else static if (N == 2) StaticBitArray!noABins[noABins] _bins; 703 else static if (N == 3) StaticBitArray!noABins[noABins][noABins] _bins; 704 else static assert(0, "Dense static N >= 4 does no safely fit on stack"); 705 } 706 else static if (storage == Storage.denseDynamic) 707 { 708 static if (N == 1) StaticBitArray!noABins _bins; 709 else static if (N == 2) StaticBitArray!noABins[] _bins; 710 else static if (N == 3) StaticBitArray!noABins[][] _bins; 711 else static if (N == 4) StaticBitArray!noABins[][][] _bins; 712 else static if (N == 5) StaticBitArray!noABins[][][][] _bins; 713 else static assert(0, "N >= 6 not supported"); 714 } 715 else static if (storage == Storage.sparse) // shouldn't happen 716 { 717 } 718 } 719 else static if (storage == Storage.denseStatic && 720 (isUnsigned!ValueType || 721 isSomeChar!ValueType) && 722 N*bitsE <= 16) // safely fits on stack 723 { 724 static if (N == 1) BinType[noABins] _bins; 725 else static if (N == 2) BinType[noABins][noABins] _bins; 726 else static assert(0, "Dense static N >= 3 does not safely fit on stack"); 727 } 728 else static if (storage == Storage.denseDynamic && 729 (isUnsigned!ValueType || 730 isSomeChar!ValueType)) // safely fits on stack 731 { 732 static if (N == 1) BinType[] _bins; 733 else static if (N == 2) BinType[][] _bins; 734 else static if (N == 3) BinType[][][] _bins; 735 else static if (N == 4) BinType[][][][] _bins; 736 else static if (N == 5) BinType[][][][][] _bins; 737 else static assert(0, "N >= 6 not supported"); 738 } 739 else 740 { 741 BinType[ValueType[N]] _bins; 742 } 743 744 private alias _bins this; 745 746 enum isAA = isAssociativeArray!(typeof(_bins)); 747 748 static if (isAA && cacheDeepDenseness) 749 { 750 ulong _deepDenseness; // Cache deep denseness in associative arrays 751 } 752 753 Q densenessUncached(int depth = -1) const pure @property @trusted nothrow 754 { 755 static if (isBinary) 756 { 757 static if (N >= 2) 758 { 759 if (N == 2 && depth == 0) // if shallow wanted 760 { 761 return shallowDenseness(); 762 } 763 } 764 return _bins.denseness; 765 } 766 else 767 { 768 static if (isAA) 769 { 770 return Q(_bins.length, noBins); 771 } 772 else 773 { 774 return _bins.denseness(depth); 775 } 776 } 777 } 778 779 /** Returns: Number of Non-Zero Elements in $(D range). */ 780 Q denseness(int depth = -1) const pure @property @safe nothrow 781 { 782 static if (isAA && cacheDeepDenseness) 783 { 784 if (depth == -1) // -1 means deepDenseness 785 { 786 if (!_deepDenseness) // if not yet defined. 787 { 788 unqual(this).densenessUncached(depth).numerator; // calculate it 789 } 790 return Q(_deepDenseness, noBins); 791 } 792 } 793 return densenessUncached(depth); 794 } 795 } 796 797 /** 798 Computes $(LUCKY Bi-Gram) of $(D range). 799 */ 800 auto ngram(int N = 2, 801 Kind kind = Kind.saturated, 802 Storage storage = Storage.denseDynamic, 803 Symmetry symmetry = Symmetry.ordered, 804 RequestedBinType = void, 805 Range)(in Range range) @safe pure nothrow if (isIterable!Range) 806 { 807 return NGram!(Unqual!(ElementType!Range), N, 808 kind, storage, symmetry, RequestedBinType, Unqual!(Range))(range); 809 } 810 auto sparseUIntNGramOverRepresentation(int N = 2, 811 Kind kind = Kind.saturated, 812 Symmetry symmetry = Symmetry.ordered)(in string range) @safe pure nothrow 813 { 814 auto y = range.representation; 815 return ngram!(N, kind, Storage.sparse, symmetry, uint)(y); 816 } 817 818 auto histogram(Kind kind = Kind.saturated, 819 Storage storage = Storage.denseDynamic, 820 Symmetry symmetry = Symmetry.ordered, 821 RequestedBinType = void, 822 Range)(in Range range) @safe pure nothrow if (isIterable!Range) 823 { 824 return NGram!(Unqual!(ElementType!Range), 1, kind, storage, symmetry, RequestedBinType, Unqual!(Range))(range); 825 } 826 alias Hist = NGram; 827 alias hist = histogram; 828 alias unigram = histogram; 829 830 unittest 831 { 832 const ubyte[] x = [1, 2, 3, 4, 5]; 833 const h = x.histogram!(Kind.saturated, Storage.denseStatic); 834 const hh = h + h; 835 836 auto h2 = x.histogram!(Kind.saturated, Storage.denseStatic); 837 h2.put(x); 838 assert(hh == h2); 839 } 840 841 unittest 842 { 843 const ubyte[] x; 844 const hDS = x.histogram!(Kind.saturated, Storage.denseStatic); assert(hDS.empty); 845 846 const hDD = x.histogram!(Kind.saturated, Storage.denseDynamic); assert(hDD.empty); 847 848 auto h = x.histogram!(Kind.saturated, Storage.sparse); assert(h.empty); 849 const ubyte[5] x1 = [1, 2, 3, 4, 5]; 850 h.put(x1); assert(!h.empty); 851 assert(h.matchDenser(h) == x1.length); 852 auto hh = h + h; 853 assert(hh.matchDenser(hh) == 4*x1.length); 854 855 h.destroy(); assert(h.empty); 856 857 auto hU32 = x.histogram!(Kind.saturated, Storage.sparse, Symmetry.ordered, uint); 858 } 859 860 unittest 861 { 862 alias ValueType = ubyte; 863 const ValueType[3] x = [11, 12, 13]; 864 865 auto h = x.histogram!(Kind.saturated, Storage.denseStatic); 866 assert(h[0] == 0); 867 assert(h[11] == 1 && h[12] == 1 && h[13] == 1); 868 assert(3 == h.matchDenser(h)); 869 870 h.put(x); 871 assert(h[11] == 2 && h[12] == 2 && h[13] == 2); 872 assert(12 == h.matchDenser(h)); 873 874 version(print) dbg(h); 875 876 auto hD = x.histogram!(Kind.saturated, Storage.denseDynamic); 877 version(print) dbg(hD); 878 } 879 880 unittest 881 { 882 alias ValueType = ulong; 883 const ValueType[3] x = [1, 2, 3]; 884 auto h = x.histogram!(Kind.saturated, Storage.denseStatic); 885 h.put([4, 5, 6]); 886 assert(h.length == 6); 887 } 888 889 /** 890 Computes $(LUCKY Binary Histogram) of $(D range). 891 */ 892 auto bistogram(Range)(in Range range) @safe pure nothrow if (isIterable!Range) 893 894 { 895 return NGram!(Unqual!(ElementType!Range), 1, 896 Kind.binary, 897 Storage.denseStatic, 898 Symmetry.ordered, 899 void, 900 Unqual!Range)(range); 901 } 902 alias bunigram = bistogram; 903 904 unittest 905 { 906 const ubyte[] x; 907 const b = x.bistogram; 908 assert(b.empty); 909 910 // test const unqualification 911 NGram!(ubyte, 1, Kind.binary, Storage.denseStatic, Symmetry.ordered, void, const(ubyte)[]) cb; 912 cb = b; 913 assert(cb == b); 914 915 // test immutable unqualification 916 NGram!(ubyte, 1, Kind.binary, Storage.denseStatic, Symmetry.ordered, void, immutable(ubyte)[]) ib; 917 ib = b; 918 assert(ib == b); 919 } 920 921 unittest 922 { 923 const ubyte[3] x = [1, 2, 3]; 924 const h_ = x.bistogram; 925 assert(8*h_.sizeof == 2^^8); 926 const h = h_; 927 assert(h[1] && h[2] && h[3]); 928 assert(!h[0]); 929 assert(!h[4]); 930 931 const h2_ = h_; 932 assert((h2_ & h_) == h2_); 933 assert((h2_ | h_) == h2_); 934 assert((h2_.value & h_.value) == h2_.value); 935 assert((h2_.value | h_.value) == h2_.value); 936 } 937 unittest 938 { 939 const ushort[3] x = [1, 2, 3]; 940 const h_ = x.bistogram; 941 assert(8*h_.sizeof == 2^^16); 942 const h = h_; 943 assert(h[1] && h[2] && h[3]); 944 assert(!h[0]); 945 assert(!h[4]); 946 } 947 948 /** 949 Computes $(LUCKY Binary (Occurrence) NGram) of Bytes in Input String $(D x). 950 */ 951 auto bistogramOverRepresentation(in string x) pure nothrow 952 953 { 954 return bistogram(x.representation); 955 } 956 unittest 957 { 958 const x = "abcdef"; 959 auto h = x.bistogramOverRepresentation; 960 size_t ix = 0; 961 foreach (const bin; h) 962 { 963 if (ix >= cast(ubyte)'a' && 964 ix <= cast(ubyte)'f') 965 { 966 assert(bin); 967 } 968 else 969 { 970 assert(!bin); 971 } 972 ++ix; 973 } 974 version(print) dbg(h); 975 } 976 977 /** 978 Computes $(LUCKY Bi-Gram) of $(D range). 979 */ 980 auto bigram(Kind kind = Kind.saturated, 981 Storage storage = Storage.denseDynamic, 982 Symmetry symmetry = Symmetry.ordered, 983 RequestedBinType = void, 984 Range)(in Range range) @safe pure nothrow if (isIterable!Range) 985 { 986 return NGram!(Unqual!(ElementType!Range), 2, 987 kind, storage, symmetry, RequestedBinType, Unqual!(Range))(range); 988 } 989 auto sparseUIntBigram(Kind kind = Kind.saturated, 990 Symmetry symmetry = Symmetry.ordered, 991 Range)(in Range range) @safe pure nothrow if (isIterable!Range) 992 { 993 return NGram!(Unqual!(ElementType!Range), 2, 994 kind, Storage.sparse, symmetry, uint, Unqual!(Range))(range); 995 } 996 997 unittest 998 { 999 const ubyte[] x = [1, 2, 3, 3, 3, 4, 4, 5]; 1000 1001 auto bSp = x.bigram!(Kind.saturated, Storage.sparse); 1002 alias SparseNGram = Unqual!(typeof(bSp)); 1003 1004 // check msgpacking 1005 // auto bSPBytes = bSp.pack(); 1006 // SparseNGram bSp_; 1007 // bSPBytes.unpack(bSp_); 1008 // assert(bSp == bSp_); 1009 1010 auto bS = x.bigram!(Kind.saturated, Storage.denseStatic); 1011 const bSCopy = bS; 1012 bS.put(x); 1013 assert(bS != bSCopy); 1014 1015 const bD = x.bigram!(Kind.saturated, Storage.denseDynamic); 1016 version(print) dbg(bD); 1017 1018 const bb = x.bigram!(Kind.binary, Storage.denseStatic); 1019 version(print) dbg(typeof(bb._bins).stringof); 1020 1021 assert(bb.denseness(0).numerator == 4); 1022 // assert(bb.denseness(-1).numerator == 6); 1023 } 1024 unittest 1025 { 1026 const ubyte[] x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; 1027 1028 const bS = x.bigram!(Kind.saturated, Storage.denseStatic); 1029 version(print) dbg(bS); 1030 assert(bS.denseness.numerator == x.length - bS.order + 1); 1031 1032 const bB = x.bigram!(Kind.binary, Storage.denseStatic); 1033 version(print) dbg(bB); 1034 } 1035 1036 unittest 1037 { 1038 const x = [1, 2, 3, 4, 5]; 1039 auto h = x.bigram!(Kind.saturated, Storage.sparse); 1040 assert(h.matchDenser(h) == x.length - 1); 1041 } 1042 1043 /** 1044 Computes $(LUCKY Tri-Gram) of $(D range). 1045 */ 1046 auto trigram(Kind kind = Kind.saturated, 1047 Storage storage = Storage.denseDynamic, 1048 Symmetry symmetry = Symmetry.ordered, 1049 RequestedBinType = void, 1050 Range)(in Range range) @safe pure nothrow if (isIterable!Range) 1051 { 1052 return NGram!(Unqual!(ElementType!Range), 3, 1053 kind, storage, symmetry, RequestedBinType, Unqual!(Range))(range); 1054 } 1055 auto sparseUIntTrigram(Kind kind = Kind.saturated, 1056 Symmetry symmetry = Symmetry.ordered, 1057 Range)(in Range range) @safe pure nothrow if (isIterable!Range) 1058 { 1059 return NGram!(Unqual!(ElementType!Range), 3, 1060 kind, Storage.sparse, symmetry, uint, Unqual!(Range))(range); 1061 } 1062 unittest 1063 { 1064 const ubyte[] x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; 1065 1066 const bS = x.trigram!(Kind.saturated, Storage.denseStatic); 1067 version(print) dbg(bS); 1068 assert(bS.denseness.numerator == x.length - bS.order + 1); 1069 1070 const bD = x.trigram!(Kind.saturated, Storage.denseDynamic); 1071 version(print) dbg(bD); 1072 1073 const bB = x.trigram!(Kind.binary, Storage.denseStatic); 1074 /* version(print) dbg(bB); */ 1075 } 1076 1077 /** 1078 Computes $(LUCKY Qua-Gram) of $(D range). 1079 */ 1080 auto quagram(Kind kind = Kind.saturated, 1081 Storage storage = Storage.denseDynamic, 1082 Symmetry symmetry = Symmetry.ordered, 1083 RequestedBinType = void, 1084 Range)(in Range range) @safe pure nothrow if (isIterable!Range) 1085 { 1086 return NGram!(Unqual!(ElementType!Range), 4, 1087 kind, storage, symmetry, RequestedBinType, Unqual!(Range))(range); 1088 } 1089 auto sparseUIntQuagram(Kind kind = Kind.saturated, 1090 Symmetry symmetry = Symmetry.ordered, 1091 Range)(in Range range) @safe pure nothrow if (isIterable!Range) 1092 { 1093 return NGram!(Unqual!(ElementType!Range), 4, 1094 kind, Storage.sparse, symmetry, uint, Unqual!(Range))(range); 1095 } 1096 unittest 1097 { 1098 const ubyte[] x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; 1099 1100 const bD = x.quagram!(Kind.saturated, Storage.denseDynamic); 1101 version(print) dbg(bD); 1102 assert(bD.denseness.numerator == x.length - bD.order + 1); 1103 } 1104 auto sparseUIntQuagramOverRepresentation(in string x) pure nothrow 1105 { 1106 return sparseUIntQuagram(x.representation); 1107 } 1108 1109 /** 1110 Computes $(LUCKY Pen-Gram) of $(D range). 1111 */ 1112 auto pengram(Kind kind = Kind.saturated, 1113 Storage storage = Storage.denseDynamic, 1114 Symmetry symmetry = Symmetry.ordered, 1115 RequestedBinType = void, 1116 Range)(in Range range) @safe pure nothrow if (isIterable!Range) 1117 { 1118 return NGram!(Unqual!(ElementType!Range), 5, 1119 kind, storage, symmetry, RequestedBinType, Unqual!(Range))(range); 1120 } 1121 unittest 1122 { 1123 const ubyte[] x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; 1124 1125 const bS = x.pengram!(Kind.saturated, Storage.sparse); 1126 const bS_ = bS; 1127 assert(bS_ == bS); 1128 1129 // skipping denseStatic because it doesn't fit stack anyway 1130 1131 const bD = x.pengram!(Kind.saturated, Storage.denseDynamic); 1132 version(print) dbg(bD); 1133 assert(bD.denseness.numerator == x.length - bD.order + 1); 1134 }