/** Stemming algorithms
 */
module nxt.stemming;

import std.algorithm.comparison: among;
import std.algorithm.searching : endsWith, canFind;
import std.range: empty;
import std.traits: isSomeString;
import std.typecons : Tuple, tuple;

import nxt.iso_639_1 : Language;
import nxt.lingua : isEnglishVowel, isSwedishVowel, isSwedishConsonant, isEnglishConsonant;
import nxt.skip_ex : skipOverBack;

/** Porter stemmer for English words stored in a string of type `S`.
 *
 * The instance keeps the word being processed in `_b` together with three
 * cursors (`_k0`, `_k`, `_j`); `stem` resets them on every call, so one
 * instance can be reused for many words (but not concurrently).
 */
public class Stemmer(S)
if (isSomeString!S)
{
    /** Stem the word `p`.
     *
     * Runs steps 1ab, 1c, 2, 3, 4 and 5 in that order on a private copy of
     * the word and returns the resulting stem.
     *
     * Params:
     *   p = the word to stem.
     * Returns: the stem. Words of length 0, 1 or 2 are returned unchanged.
     */
    public S stem(S p)
    {
        _b = p;
        _k = cast(ptrdiff_t) p.length - 1;
        _k0 = 0;
        _j = 0; // do not carry a cursor over from a previous word

        /** strings of length 1 or 2 don't go through the stemming process,
         * although no mention is made of this in the published
         * algorithm. Remove the line to match the published algorithm.
         */
        if (_k <= _k0 + 1)
            return _b;

        step1ab();
        step1c();
        step2();
        step3();
        step4();
        step5();
        return _b[_k0 .. _k + 1];
    }

private:
    S _b;                // buffer for the word
    ptrdiff_t _k = 0;    // offset of the last character of the current word
    ptrdiff_t _k0 = 0;   // offset of the first character of the current word
    ptrdiff_t _j = 0;    // offset within the string (end of the stem found by endsWith)

    /**
     * Returns true if `_b[i]` is a consonant.
     *
     * 'y' counts as a consonant at the start of the word and otherwise
     * whenever the preceding character is not a consonant.
     */
    bool isConsonant(ptrdiff_t i)
    {
        if (_b[i].isEnglishVowel)
            return false;
        if (_b[i] == 'y')
        {
            if (i == _k0)
            {
                return true;
            }
            else
            {
                return !isConsonant(i - 1);
            }
        }
        return true;
    }

    /** Return the number of consonant sequences between k0 and j.
     * if c is a consonant sequence and v a vowel sequence, and <..>
     * indicates arbitrary presence,
     *
     *   <c><v>       gives 0
     *   <c>vc<v>     gives 1
     *   <c>vcvc<v>   gives 2
     *   <c>vcvcvc<v> gives 3
     *
     */
    size_t m()
    {
        size_t n = 0;
        ptrdiff_t i = _k0;

        // Skip the optional leading consonant sequence.
        while (true)
        {
            if (i > _j)
            {
                return n;
            }
            if (!isConsonant(i))
            {
                break;
            }
            i++;
        }
        i++;
        while (true)
        {
            // Skip a vowel sequence ...
            while (true)
            {
                if (i > _j)
                {
                    return n;
                }
                if (isConsonant(i))
                {
                    break;
                }
                i++;
            }
            i++;
            n++; // ... which was followed by a consonant: one more "vc".
            // Skip the consonant sequence.
            while (true)
            {
                if (i > _j)
                {
                    return n;
                }
                if (!isConsonant(i))
                {
                    break;
                }
                i++;
            }
            i++;
        }
    }

    /** Returns true if k0...j contains a vowel. */
    bool hasVowelInStem()
    {
        for (ptrdiff_t i = _k0; i < _j + 1; i++)
        {
            if (!isConsonant(i))
                return true;
        }
        return false;
    }

    /** Returns true if j, j-1 contains a double consonant
     */
    bool doublec(ptrdiff_t j)
    {
        if (j < (_k0 + 1))
            return false;
        if (_b[j] != _b[j-1])
            return false;
        return isConsonant(j);
    }

    /** Returns true if i-2,i-1,i has the form consonant - vowel - consonant
     * and also if the second c is not w,x or y. this is used when trying to
     * restore an e at the end of a short word, e.g.
     *
     *   cav(e), lov(e), hop(e), crim(e), but
     *   snow, box, tray.
     *
     */
    bool cvc(ptrdiff_t i)
    {
        if (i < (_k0 + 2) || !isConsonant(i) || isConsonant(i-1) || !isConsonant(i-2))
            return false;
        if (_b[i] == 'w' || _b[i] == 'x' || _b[i] == 'y')
            return false;
        return true;
    }

    /** Return true if k0,...k ends with the string s.
     *
     * On success `_j` is set to the offset of the last character before the
     * suffix (which is `_k0 - 1` when the suffix is the whole word). On
     * failure `_j` is left untouched.
     */
    bool endsWith(T)(T s)
    if (isSomeString!T)
    {
        const len = cast(ptrdiff_t) s.length;

        // Check the length first so that no index below is out of range.
        if (len > (_k - _k0 + 1))
            return false;
        // Cheap rejection on the last character before comparing slices.
        if (len > 0 && s[len - 1] != _b[_k])
            return false;

        const a = _k - len + 1;
        const b = _k + 1;

        if (_b[a..b] != s)
        {
            return false;
        }
        _j = _k - len;

        return true;
    }

    /** Sets (j+1),...k to the characters in the string s, readjusting k.
     *
     * Characters after the replaced span are kept in the buffer, so the
     * buffer length is unchanged; only `_k` marks the new end of the word.
     */
    void setto(T)(T s)
    if (isSomeString!T)
    {
        _b = _b[0.._j+1] ~ s ~ _b[_j + s.length + 1 .. _b.length];
        _k = _j + s.length;
    }

    /** Replace the suffix found by the last successful `endsWith` with `s`,
     * but only if the remaining stem has m() > 0. Used further down.
     */
    void r(T)(T s)
    if (isSomeString!T)
    {
        if (m() > 0)
            setto(s);
    }

    /** Gets rid of plurals and -ed or -ing. e.g.
     *
     *   caresses -> caress
     *   ponies   -> poni
     *   cats     -> cat
     *   matting  -> mat
     *   mating   -> mate
     */
    void step1ab()
    {
        if (_b[_k] == 's')
        {
            if (endsWith("sses"))
            {
                _k = _k - 2;
            }
            else if (endsWith("ies"))
            {
                setto("i");
            }
            else if (_b[_k - 1] != 's')
            {
                _k--;
            }
        }
        if (endsWith("eed"))
        {
            if (m() > 0)
                _k--;
        }
        else if ((endsWith("ed") || endsWith("ing")) && hasVowelInStem())
        {
            _k = _j;
            if (endsWith("at"))
            {
                setto("ate");
            }
            else if (endsWith("bl"))
            {
                setto("ble");
            }
            else if (endsWith("iz"))
            {
                setto("ize");
            }
            else if (doublec(_k))
            {
                _k--;
                // -ll, -ss and -zz keep their double consonant.
                if (_b[_k] == 'l' || _b[_k] == 's' || _b[_k] == 'z')
                    _k++;
            }
            else if (m() == 1 && cvc(_k))
            {
                setto("e");
            }
        }
    }

    /**
     * step1c() turns terminal y to i when there is another vowel in the stem.
     * Words ending in "day" are left alone.
     */
    void step1c()
    {
        if (endsWith("y") &&
            !endsWith("day") &&
            hasVowelInStem())
        {
            _b = _b[0.._k] ~ 'i' ~ _b[_k+1 .. _b.length];
        }
    }

    /**
     * step2() maps double suffices to single ones.
     * so -ization (= -ize plus -ation) maps to -ize etc. note that the
     * string before the suffix must give m() > 0.
     */
    void step2()
    {
        // A one-character word has no penultimate character to dispatch on;
        // without this guard `_b[_k - 1]` is out of range (e.g. for "ies",
        // which step1ab reduces to "i").
        if (_k <= _k0)
            return;

        if (_b[_k - 1] == 'a')
        {
            if (endsWith("ational"))
                r("ate");
            else if (endsWith("tional"))
                r("tion");
        }
        else if (_b[_k - 1] == 'c')
        {
            if (endsWith("enci"))
                r("ence");
            else if (endsWith("anci"))
                r("ance");
        }
        else if (_b[_k - 1] == 'e')
        {
            if (endsWith("izer"))
                r("ize");
        }
        else if (_b[_k - 1] == 'l')
        {
            if (endsWith("bli"))
                r("ble");
            /* --DEPARTURE--
             * To match the published algorithm, replace this phrase with
             * if (endsWith("abli"))
             *	   r("able");
             */
            else if (endsWith("alli"))
                r("al");
            else if (endsWith("entli"))
                r("ent");
            else if (endsWith("eli"))
                r("e");
            else if (endsWith("ousli"))
                r("ous");
        }
        else if (_b[_k - 1] == 'o')
        {
            if (endsWith("ization"))
                r("ize");
            else if (endsWith("ation") || endsWith("ator"))
                r("ate");
        }
        else if (_b[_k - 1] == 's')
        {
            if (endsWith("alism"))
                r("al");
            else if (endsWith("iveness"))
                r("ive");
            else if (endsWith("fulness"))
                r("ful");
            else if (endsWith("ousness"))
                r("ous");
        }
        else if (_b[_k - 1] == 't')
        {
            if (endsWith("aliti"))
                r("al");
            else if (endsWith("iviti"))
                r("ive");
            else if (endsWith("biliti"))
                r("ble");
        }
        else if (_b[_k - 1] == 'g')
        {
            /**
             * --DEPARTURE--
             * To match the published algorithm, delete this phrase
             */
            if (endsWith("logi"))
                r("log");
        }
    }

    /**
     * step3() deals with -ic-, -full, -ness etc. similar strategy to step2.
     */
    void step3()
    {
        if (_b[_k] == 'e')
        {
            if (endsWith("icate")) r("ic");
            else if (endsWith("ative")) r("");
            else if (endsWith("alize")) r("al");
        }
        else if (_b[_k] == 'i')
        {
            if (endsWith("iciti")) r("ic");
        }
        else if (_b[_k] == 'l')
        {
            if (endsWith("ical")) r("ic");
            else if (endsWith("ful")) r("");
        }
        else if (_b[_k] == 's')
        {
            if (endsWith("ness")) r("");
        }
    }

    /**
     * step4() takes off -ant, -ence etc., in context <c>vcvc<v>.
     */
    void step4()
    {
        /* fixes bug 1 */
        if (_k == 0)
            return;
        switch (_b[_k - 1])
        {
        case 'a':
            if (endsWith("al"))
                break;
            return;
        case 'c':
            if (endsWith("ance") || endsWith("ence"))
                break;
            return;
        case 'e':
            if (endsWith("er"))
                break;
            return;
        case 'i':
            if (endsWith("ic"))
                break;
            return;
        case 'l':
            if (endsWith("able") || endsWith("ible"))
                break;
            return;
        case 'n':
            if (endsWith("ant") || endsWith("ement") || endsWith("ment") || endsWith("ent"))
                break;
            return;
        case 'o':
            if (endsWith("ion") && _j >= 0 && (_b[_j] == 's' || _b[_j] == 't'))
            {
                /* _j >= 0 fixes bug 2 */
                break;
            }
            if (endsWith("ou"))
                break;
            return;
        case 's':
            if (endsWith("ism"))
                break;
            return;
        case 't':
            if (endsWith("ate") || endsWith("iti"))
                break;
            return;
        case 'u':
            if (endsWith("ous"))
                break;
            return;
        case 'v':
            if (endsWith("ive"))
                break;
            return;
        case 'z':
            if (endsWith("ize"))
                break;
            return;
        default:
            return;
        }

        // A suffix matched; drop it only if the remaining stem is long enough.
        if (m() > 1)
            _k = _j;
    }

    /**
     * step5() removes a final -e if m() > 1 (or if m() == 1 and the stem
     * does not end consonant-vowel-consonant), and changes -ll to -l if
     * m() > 1. The word "false" is exempt from the -e removal.
     */
    void step5()
    {
        _j = _k;
        // The slice must include the final 'e' (index _k) for the comparison
        // with "false" to be able to match.
        if (_b[_k] == 'e' &&
            _b[_k0 .. _k + 1] != `false`)
        {
            auto a = m();
            if (a > 1 || (a == 1 && !cvc(_k - 1)))
                _k--;
        }
        if (_b[_k] == 'l' && doublec(_k) && m() > 1)
            _k--;
    }
}

unittest {
    scope stemmer = new Stemmer!string();

    assert(stemmer.stem("") == "");
    assert(stemmer.stem("x") == "x");
    assert(stemmer.stem("xyz") == "xyz");
    assert(stemmer.stem("win") == "win");
    /+ TODO: assert(stemmer.stem("winner") == "win"); +/
    assert(stemmer.stem("winning") == "win");
    assert(stemmer.stem("farted") == "fart");
    assert(stemmer.stem("caresses") == "caress");
    assert(stemmer.stem("ponies") == "poni");
    assert(stemmer.stem("ties") == "ti");
    assert(stemmer.stem("caress") == "caress");
    assert(stemmer.stem("cats") == "cat");
    assert(stemmer.stem("feed") == "feed");
    assert(stemmer.stem("matting") == "mat");
    assert(stemmer.stem("mating") == "mate");
    assert(stemmer.stem("meeting") == "meet");
    assert(stemmer.stem("milling") == "mill");
    assert(stemmer.stem("messing") == "mess");
    assert(stemmer.stem("meetings") == "meet");
    assert(stemmer.stem("neutralize") == "neutral");
    assert(stemmer.stem("relational") == "relat");
    assert(stemmer.stem("relational") == "relat");
    assert(stemmer.stem("intricate") == "intric");

    assert(stemmer.stem("connection") == "connect");
    assert(stemmer.stem("connective") == "connect");
    assert(stemmer.stem("connecting") == "connect");

    assert(stemmer.stem("agreed") == "agre");
    assert(stemmer.stem("disabled") == "disabl");
    assert(stemmer.stem("gentle") == "gentl");
    assert(stemmer.stem("gently") == "gentli");
    assert(stemmer.stem("served") == "serv");
    assert(stemmer.stem("competes") == "compet");

    assert(stemmer.stem("fullnessful") == "fullness");
    assert(stemmer.stem(stemmer.stem("fullnessful")) == "full");

    assert(stemmer.stem("bee") == "bee");

    assert(stemmer.stem("dogs") == "dog");
    assert(stemmer.stem("churches") == "church");
    assert(stemmer.stem("hardrock") == "hardrock");

    // "false" is explicitly exempt from final -e removal in step5.
    assert(stemmer.stem("false") == "false");

    // Words that step 1 shrinks to a single character must not index
    // before the start of the buffer in step 2.
    assert(stemmer.stem("ies") == "i");
    assert(stemmer.stem("aed") == "a");
}

import nxt.debugio;

/** Stem Swedish Word $(D s).
 *
 * Rule-based suffix stripping: the suffix checks below are tried in order
 * and the first one that applies determines the result. If none applies,
 * `s` is returned unchanged.
 */
auto ref stemSwedish(S)(S s)
if (isSomeString!S)
{
    enum ar = `ar`;
    enum or = `or`;
    enum er = `er`;
    enum ya = `ya`;

    enum en = `en`;
    enum ern = `ern`;
    enum an = `an`;
    enum na = `na`;
    enum et = `et`;
    enum aste = `aste`;
    enum are = `are`;
    enum ast = `ast`;
    enum iserad = `iserad`;
    enum de = `de`;
    enum ing = `ing`;
    enum igt = `igt`;
    enum llt = `llt`;

    // Whole-word exceptions.
    switch (s)
    {
        case `samtida`: return `samtid`;
        default: break;
    }

    if (s.endsWith(`n`))
    {
        if (s.endsWith(en))
        {
            const t = s[0 .. $ - en.length];
            if (s.among!(`även`))
            {
                return s;
            }
            else if (t.among!(`sann`))
            {
                return t;
            }
            else if (t.endsWith(`mm`, `nn`))
            {
                return t[0 .. $ - 1];
            }
            return t;
        }
        if (s.endsWith(ern))
        {
            return s[0 .. $ - 1];
        }
        if (s.endsWith(an))
        {
            const t = s[0 .. $ - an.length];
            if (t.length >= 3 &&
                t.endsWith(`tt`, `mp`, `ck`, `st`))
            {
                return s[0 ..$ - 1];
            }
            else if (t.length >= 2 &&
                     t.endsWith(`n`, `p`))
            {
                return s[0 ..$ - 1];
            }
            else if (t.length < 3)
            {
                return s;
            }
            return t;
        }
    }

    if (s.endsWith(igt))
    {
        return s[0 .. $ - 1];
    }

    if (s.endsWith(ya))
    {
        return s[0 .. $ - 1];
    }

    if (s.endsWith(na))
    {
        if (s.among!(`sina`, `dina`, `mina`))
        {
            return s[0 .. $ - 1];
        }
        auto t = s[0 .. $ - na.length];
        if (t.endsWith(`r`))
        {
            if (t.endsWith(ar, or, er))
            {
                // ar, or and er all have length 2.
                const u = t[0 .. $ - ar.length];
                if (u.canFind!(a => a.isSwedishVowel))
                {
                    return u;
                }
                else
                {
                    return t[0 .. $ - 1];
                }
            }
        }
    }

    if (s.endsWith(et))
    {
        const t = s[0 .. $ - et.length];
        if (t.length >= 3 &&
            t[$ - 3].isSwedishConsonant &&
            t[$ - 2].isSwedishConsonant &&
            t[$ - 1].isSwedishConsonant)
        {
            return s[0 .. $ - 1];
        }
        else if (t.endsWith(`ck`))
        {
            return s[0 .. $ - 1];
        }

        return t;
    }

    if (s.endsWith(ar, or, er))
    {
        // ar, or and er all have length 2.
        const t = s[0 .. $ - ar.length];
        if (t.canFind!(a => a.isSwedishVowel))
        {
            if (t.endsWith(`mm`, `nn`))
            {
                return t[0 .. $ - 1];
            }
            else
            {
                return t;
            }
        }
        else
        {
            return s[0 .. $ - 1];
        }
    }

    if (s.endsWith(aste))
    {
        const t = s[0 .. $ - aste.length];
        if (t.among!(`sann`))
        {
            return t;
        }
        if (t.endsWith(`mm`, `nn`))
        {
            return t[0 .. $ - 1];
        }
        if (t.canFind!(a => a.isSwedishVowel))
        {
            return t;
        }
    }

    if (s.endsWith(are, ast))
    {
        // are and ast both have length 3.
        const t = s[0 .. $ - are.length];
        if (t.among!(`sann`))
        {
            return t;
        }
        if (t.endsWith(`mm`, `nn`))
        {
            return t[0 .. $ - 1];
        }
        if (t.canFind!(a => a.isSwedishVowel))
        {
            return t;
        }
    }

    if (s.endsWith(iserad))
    {
        const t = s[0 .. $ - iserad.length];
        if (!t.endsWith(`n`))
        {
            return t;
        }
    }

    if (s.endsWith(de))
    {
        enum ande = `ande`;
        if (s.endsWith(ande))
        {
            const t = s[0 .. $ - ande.length];
            if (t.empty)
            {
                return s;
            }
            else if (t[$ - 1].isSwedishConsonant)
            {
                return s[0 .. $ - 3];
            }
            return t;
        }
        if (s.among!(`hade`))
        {
            return s;
        }
        const t = s[0 .. $ - de.length];
        return t;
    }

    if (s.endsWith(ing))
    {
        enum ning = `ning`;
        if (s.endsWith(ning))
        {
            const t = s[0 .. $ - ning.length];
            if (!t.endsWith(`n`) &&
                t != `tid`)
            {
                return t;
            }
        }
        return s[0 .. $ - ing.length];
    }

    if (s.endsWith(llt))
    {
        return s[0 .. $ - 1];
    }

    return s;
}

unittest {
    // import nxt.assert_ex;

    assert("rumpan".stemSwedish == "rumpa");
    assert("sopan".stemSwedish == "sopa");
    assert("kistan".stemSwedish == "kista");

    assert("karl".stemSwedish == "karl");

    assert("grenen".stemSwedish == "gren");
    assert("busen".stemSwedish == "bus");
    assert("husen".stemSwedish == "hus");
    assert("räven".stemSwedish == "räv");
    assert("dunken".stemSwedish == "dunk");
    assert("männen".stemSwedish == "män");
    assert("manen".stemSwedish == "man");
    assert("mannen".stemSwedish == "man");

    assert("skalet".stemSwedish == "skal");
    assert("karet".stemSwedish == "kar");
    assert("taket".stemSwedish == "tak");
    assert("stinget".stemSwedish == "sting");

    assert("äpplet".stemSwedish == "äpple");

    assert("jakt".stemSwedish == "jakt");

    assert("sot".stemSwedish == "sot");
    assert("sotare".stemSwedish == "sot");

    assert("klok".stemSwedish == "klok");
    assert("klokare".stemSwedish == "klok");
    assert("klokast".stemSwedish == "klok");

    assert("stark".stemSwedish == "stark");
    assert("starkare".stemSwedish == "stark");
    assert("starkast".stemSwedish == "stark");

    assert("kort".stemSwedish == "kort");
    assert("kortare".stemSwedish == "kort");
    assert("kortast".stemSwedish == "kort");

    assert("rolig".stemSwedish == "rolig");
    assert("roligare".stemSwedish == "rolig");
    assert("roligast".stemSwedish == "rolig");

    assert("dum".stemSwedish == "dum");
    assert("dummare".stemSwedish == "dum");
    assert("dummast".stemSwedish == "dum");
    assert("dummaste".stemSwedish == "dum");
    assert("senaste".stemSwedish == "sen");

    assert("sanning".stemSwedish == "sann");
    assert("sann".stemSwedish == "sann");
    assert("sannare".stemSwedish == "sann");
    assert("sannare".stemSwedish == "sann");

    assert("stare".stemSwedish == "stare");
    assert("kvast".stemSwedish == "kvast");

    assert("täcket".stemSwedish == "täcke");
    assert("räcket".stemSwedish == "räcke");

    assert("van".stemSwedish == "van");
    assert("dan".stemSwedish == "dan");
    assert("man".stemSwedish == "man");
    assert("ovan".stemSwedish == "ovan");
    assert("stan".stemSwedish == "stan");
    assert("klan".stemSwedish == "klan");

    assert("klockan".stemSwedish == "klocka");
    assert("klockande".stemSwedish == "klocka");
    assert("sockan".stemSwedish == "socka");
    assert("rockan".stemSwedish == "rocka");
    assert("rock".stemSwedish == "rock");

    assert("agenter".stemSwedish == "agent");
    assert("agenterna".stemSwedish == "agent");
    assert("regenter".stemSwedish == "regent");
    assert("regenterna".stemSwedish == "regent");

    assert("brodern".stemSwedish == "broder");
    assert("kärnan".stemSwedish == "kärna");

    assert("skorna".stemSwedish == "sko");

    assert("inträffade".stemSwedish == "inträffa");
    assert("roa".stemSwedish == "roa");
    assert("roade".stemSwedish == "roa");
    assert("hade".stemSwedish == "hade");
    assert("hades".stemSwedish == "hades");

    assert("fullt".stemSwedish == "full");

    assert("kanaliserad".stemSwedish == "kanal");
    assert("alkoholiserad".stemSwedish == "alkohol");

    assert("roande".stemSwedish == "ro");

    /* assertEqual("ror".stemSwedish, "ro"); */
    /* assertEqual("öbor".stemSwedish, "öbo"); */

    assert("ande".stemSwedish == "ande");

    assert("störande".stemSwedish == "störa");
    assert("nekande".stemSwedish == "neka");
    assert("jagande".stemSwedish == "jaga");
    assert("stimulerande".stemSwedish == "stimulera");

    assert("karlar".stemSwedish == "karl");
    assert("lagar".stemSwedish == "lag");

    assert("sina".stemSwedish == "sin");
    assert("dina".stemSwedish == "din");
    assert("mina".stemSwedish == "min");

    assert("även".stemSwedish == "även");

    assert("samtida".stemSwedish == "samtid");

    assert("trattar".stemSwedish == "tratt");

    assert("katter".stemSwedish == "katt");
    assert("dagar".stemSwedish == "dag");
    assert("öar".stemSwedish == "ö");
    assert("åar".stemSwedish == "å");
    assert("ängar".stemSwedish == "äng");

    assert("spelar".stemSwedish == "spel");
    assert("drar".stemSwedish == "dra");

    assert("kullar".stemSwedish == "kull");
    assert("kullarna".stemSwedish == "kull");

    assert("mamma".stemSwedish == "mamma");

    assert("bestyr".stemSwedish == "bestyr");

    assert("krya".stemSwedish == "kry");
    assert("nya".stemSwedish == "ny");

    assert("lemmar".stemSwedish == "lem");

    /* assertEqual("ämnar".stemSwedish, "ämna"); */
    /* assert("rämnar".stemSwedish == "rämna"); */
    /* assert("lämnar".stemSwedish == "lämna"); */
}

/** Stem Norwegian word $(D s) by passing it through `skipOverBack` with the
 * suffix `ede` and returning the (possibly modified) `s`.
 */
auto ref stemNorvegian(S)(S s)
if (isSomeString!S)
{
    s.skipOverBack(`ede`);
    return s;
}

/** Stem $(D s) in Language $(D lang).
 *
 * If `lang` is `unknown`, English, Swedish and Norwegian are tried in that
 * order and the first result whose length differs from `s` is returned; if
 * none changes the length, the result of the last attempt is returned.
 * Any language other than `sv` and `no` is stemmed with the English
 * `Stemmer`.
 *
 * Returns: a tuple of the stem and the language used to produce it.
 *
 * NOTE(review): `Language_ISO_639_1` is not among the imports visible in this
 * file (only `Language` is imported from `nxt.iso_639_1`) — confirm that this
 * template can still be instantiated.
 */
Tuple!(S, Language_ISO_639_1) stemIn(S)(S s, Language_ISO_639_1 lang = Language_ISO_639_1.init)
if (isSomeString!S)
{
    typeof(return) t;
    switch (lang) with (Language_ISO_639_1)
    {
        case unknown:
            t = s.stemIn(en); if (t[0].length != s.length) return t;
            t = s.stemIn(sv); if (t[0].length != s.length) return t;
            t = s.stemIn(no); if (t[0].length != s.length) return t;
            break;
        case sv: t = tuple(s.stemSwedish, sv); break;
        case no: t = tuple(s.stemNorvegian, no); break;
        case en:
        default:
            // Instantiate for the caller's string type rather than
            // hard-coding `string`.
            auto stemmer = new Stemmer!S();
            t = tuple(stemmer.stem(s), lang);
    }
    return t;
}

/** Destructively Stem $(D s) in Language $(D lang).
 *
 * Returns: a tuple of a flag telling whether the length of `s` changed, and
 * the language reported by `stemIn`.
 */
Tuple!(bool, Language_ISO_639_1) stemize(S)(ref S s, Language_ISO_639_1 lang = Language_ISO_639_1.init)
if (isSomeString!S)
{
    const n = s.length;
    auto t = s.stemIn(lang);
    s = t[0];
    return tuple(n != s.length, t[1]);
}

/** Return Stem of $(D s) using Porter's algorithm
 *
 * A heavily reduced variant: one plural rule chain (step 1a) followed by a
 * single chain of suffix rewrites, of which at most one is applied.
 *
 * See_Also: https://en.wikipedia.org/wiki/I_m_still_remembering
 * See_Also: https://en.wikipedia.org/wiki/Martin_Porter
 * See_Also: https://www.youtube.com/watch?v=2s7f8mBwnko&list=PL6397E4B26D00A269&index=4.
 */
S alternativePorterStemEnglish(S)(S s)
if (isSomeString!S)
{
    /* Step 1a */
    if (s.endsWith(`sses`)) { s = s[0 .. $-2]; }
    else if (s.endsWith(`ies`)) { s = s[0 .. $-2]; }
    else if (s.endsWith(`ss`)) { }
    else if (s.endsWith(`s`)) { s = s[0 .. $-1]; }

    /* Step 2 */
    if (s.endsWith(`ational`)) { s = s[0 .. $-7] ~ `ate`; }
    else if (s.endsWith(`izer`)) { s = s[0 .. $-1]; }
    else if (s.endsWith(`ator`)) { s = s[0 .. $-2] ~ `e`; }

    /* Step 3 (same else-chain as step 2, so it only runs if step 2 did not match) */
    else if (s.endsWith(`al`)) { s = s[0 .. $-2] ~ `e`; }
    else if (s.endsWith(`able`)) { s = s[0 .. $-4]; }
    else if (s.endsWith(`ate`)) { s = s[0 .. $-3] ~ `e`; }

    return s;
}

unittest {
    assert(`caresses`.alternativePorterStemEnglish == `caress`);
    assert(`ponies`.alternativePorterStemEnglish == `poni`);
    assert(`caress`.alternativePorterStemEnglish == `caress`);
    assert(`cats`.alternativePorterStemEnglish == `cat`);

    assert(`relational`.alternativePorterStemEnglish == `relate`);
    assert(`digitizer`.alternativePorterStemEnglish == `digitize`);
    assert(`operator`.alternativePorterStemEnglish == `operate`);

    assert(`revival`.alternativePorterStemEnglish == `revive`);
    assert(`adjustable`.alternativePorterStemEnglish == `adjust`);
    assert(`activate`.alternativePorterStemEnglish == `active`);
}