/** Stemming algorithms.
 *
 * Contains a class-based English stemmer (`Stemmer`) following Porter's
 * algorithm, a rule-based Swedish stemmer (`stemSwedish`), a minimal Norwegian
 * stemmer (`stemNorvegian`), language dispatch helpers (`stemIn`, `stemize`)
 * and a simplified alternative English stemmer
 * (`alternativePorterStemEnglish`).
 */
module nxt.stemming;

import std.algorithm.comparison: among;
import std.algorithm.searching : endsWith, canFind;
import std.range: empty;
import std.traits: isSomeString;
import std.typecons : Tuple, tuple;

import nxt.languages : Lang;
import nxt.lingua : isEnglishVowel, isSwedishVowel, isSwedishConsonant, isEnglishConsonant;
import nxt.skip_ex : skipOverBack;

/** English stemmer following Porter's algorithm.
 *
 * The word being stemmed is kept in `_b`; the part still considered to be the
 * word is `_b[_k0 .. _k + 1]`. The individual steps shorten or rewrite that
 * range. An instance can be reused for several calls to `stem`, but it keeps
 * per-call state in its fields.
 */
public class Stemmer(S)
if (isSomeString!S)
{
    /** Stem the word `p`.
     *
     * Words of length 0, 1 or 2 are returned unchanged. Longer words are run
     * through steps 1ab, 1c, 2, 3, 4 and 5 in that order.
     *
     * Params:
     *   p = the word to stem
     * Returns: the stem, i.e. `_b[_k0 .. _k + 1]` after all steps have run.
     */
    public S stem(S p)
    {
        _b = p;
        // Explicit signed arithmetic: an empty word gives _k == -1 instead of
        // relying on unsigned wraparound of `p.length - 1`.
        _k = cast(ptrdiff_t) p.length - 1;
        _k0 = 0;
        _j = 0; // do not carry the offset over from a previous call

        /** strings of length 1 or 2 don't go through the stemming process,
         * although no mention is made of this in the published
         * algorithm. Remove the line to match the published algorithm.
         */
        if (_k <= _k0 + 1)
            return _b;

        step1ab();
        step1c();
        step2();
        step3();
        step4();
        step5();
        return _b[_k0 .. _k + 1];
    }

private:
    S _b;               // buffer for the word
    ptrdiff_t _k = 0;   // index of the last character of the current word
    ptrdiff_t _k0 = 0;  // index of the first character of the current word
    ptrdiff_t _j = 0;   // offset within the string, set by endsWith() and step5()

    /** Returns true if `_b[i]` is a consonant.
     *
     * A 'y' counts as a consonant when it is the first character
     * (`i == _k0`) or when the character before it is not a consonant.
     */
    bool isConsonant(ptrdiff_t i)
    {
        if (_b[i].isEnglishVowel)
            return false;
        if (_b[i] == 'y')
        {
            if (i == _k0)
                return true;
            return !isConsonant(i - 1);
        }
        return true;
    }

    /** Return the number of consonant sequences between k0 and j.
     * if c is a consonant sequence and v a vowel sequence, and <..>
     * indicates arbitrary presence,
     *
     *   <c><v>       gives 0
     *   <c>vc<v>     gives 1
     *   <c>vcvc<v>   gives 2
     *   <c>vcvcvc<v> gives 3
     *
     */
    size_t m()
    {
        ptrdiff_t n = 0;
        ptrdiff_t i = _k0;

        // Skip the optional leading consonant sequence.
        while (true)
        {
            if (i > _j)
                return n;
            if (!isConsonant(i))
                break;
            i++;
        }
        i++;
        while (true)
        {
            // Skip a vowel sequence.
            while (true)
            {
                if (i > _j)
                    return n;
                if (isConsonant(i))
                    break;
                i++;
            }
            i++;
            n++; // one more vowel-consonant transition seen
            // Skip the consonant sequence that follows.
            while (true)
            {
                if (i > _j)
                    return n;
                if (!isConsonant(i))
                    break;
                i++;
            }
            i++;
        }
    }

    /** Returns true if k0...j contains a vowel. */
    bool hasVowelInStem()
    {
        for (ptrdiff_t i = _k0; i < _j + 1; i++)
        {
            if (!isConsonant(i))
                return true;
        }
        return false;
    }

    /** Returns true if j, j-1 contains a double consonant
     */
    bool doublec(ptrdiff_t j)
    {
        if (j < (_k0 + 1))
            return false;
        if (_b[j] != _b[j-1])
            return false;
        return isConsonant(j);
    }

    /** Returns true if i-2,i-1,i has the form consonant - vowel - consonant
     * and also if the second c is not w,x or y. this is used when trying to
     * restore an e at the end of a short word, e.g.
     *
     *    cav(e), lov(e), hop(e), crim(e), but
     *    snow, box, tray.
     *
     */
    bool cvc(ptrdiff_t i)
    {
        if (i < (_k0 + 2) || !isConsonant(i) || isConsonant(i-1) || !isConsonant(i-2))
            return false;
        if (_b[i] == 'w' || _b[i] == 'x' || _b[i] == 'y')
            return false;
        return true;
    }

    /** Return true if k0,...k ends with the string s.
     *
     * On success `_j` is set to the index of the last character before the
     * suffix (`_k - s.length`); on failure `_j` is left untouched.
     * `s` must not be empty.
     */
    bool endsWith(T)(T s)
    if (isSomeString!T)
    {
        const len = s.length;

        // Cheap rejection on the last character before slicing.
        if (s[len - 1] != _b[_k])
            return false;
        if (len > (_k - _k0 + 1))
            return false;

        const a = _k - len + 1;
        const b = _k + 1;

        if (_b[a..b] != s)
            return false;
        _j = _k - len;

        return true;
    }

    /** Sets (j+1),...k to the characters in the string s, readjusting k. */
    void setto(T)(T s)
    if (isSomeString!T)
    {
        _b = _b[0.._j+1] ~ s ~ _b[_j + s.length + 1 .. _b.length];
        _k = _j + s.length;
    }

    /** Replace the suffix found by the last successful endsWith() with `s`,
     * but only if the remaining stem has m() > 0.
     */
    void r(T)(T s)
    if (isSomeString!T)
    {
        if (m() > 0)
            setto(s);
    }

    /** Gets rid of plurals and -ed or -ing. */
    void step1ab()
    {
        if (_b[_k] == 's')
        {
            if (endsWith("sses"))
                _k = _k - 2;
            else if (endsWith("ies"))
                setto("i");
            else if (_b[_k - 1] != 's')
                _k--;
        }
        if (endsWith("eed"))
        {
            if (m() > 0)
                _k--;
        }
        else if ((endsWith("ed") || endsWith("ing")) && hasVowelInStem())
        {
            _k = _j;
            if (endsWith("at"))
                setto("ate");
            else if (endsWith("bl"))
                setto("ble");
            else if (endsWith("iz"))
                setto("ize");
            else if (doublec(_k))
            {
                _k--;
                // Keep the double consonant for -ll, -ss and -zz.
                if (_b[_k] == 'l' || _b[_k] == 's' || _b[_k] == 'z')
                    _k++;
            }
            else if (m() == 1 && cvc(_k))
                setto("e");
        }
    }

    /**
     * step1c() turns terminal y to i when there is another vowel in the stem.
     * Words ending in "day" are left alone.
     */
    void step1c()
    {
        if (endsWith("y") &&
            !endsWith("day") &&
            hasVowelInStem())
        {
            _b = _b[0.._k] ~ 'i' ~ _b[_k+1 .. _b.length];
        }
    }

    /**
     * step2() maps double suffices to single ones.
     * so -ization (= -ize plus -ation) maps to -ize etc. note that the
     * string before the suffix must give m() > 0.
     */
    void step2()
    {
        // Step 1 can leave a single character (e.g. "aed" -> "a"); there is
        // then no penultimate character to dispatch on and `_b[_k - 1]` would
        // be out of range.
        if (_k <= _k0)
            return;

        if (_b[_k - 1] == 'a')
        {
            if (endsWith("ational"))
                r("ate");
            else if (endsWith("tional"))
                r("tion");
        }
        else if (_b[_k - 1] == 'c')
        {
            if (endsWith("enci"))
                r("ence");
            else if (endsWith("anci"))
                r("ance");
        }
        else if (_b[_k - 1] == 'e')
        {
            if (endsWith("izer"))
                r("ize");
        }
        else if (_b[_k - 1] == 'l')
        {
            if (endsWith("bli"))
                r("ble");
            /* --DEPARTURE--
             * To match the published algorithm, replace this phrase with
             * if (endsWith("abli"))
             *	   r("able");
             */
            else if (endsWith("alli"))
                r("al");
            else if (endsWith("entli"))
                r("ent");
            else if (endsWith("eli"))
                r("e");
            else if (endsWith("ousli"))
                r("ous");
        }
        else if (_b[_k - 1] == 'o')
        {
            if (endsWith("ization"))
                r("ize");
            else if (endsWith("ation") || endsWith("ator"))
                r("ate");
        }
        else if (_b[_k - 1] == 's')
        {
            if (endsWith("alism"))
                r("al");
            else if (endsWith("iveness"))
                r("ive");
            else if (endsWith("fulness"))
                r("ful");
            else if (endsWith("ousness"))
                r("ous");
        }
        else if (_b[_k - 1] == 't')
        {
            if (endsWith("aliti"))
                r("al");
            else if (endsWith("iviti"))
                r("ive");
            else if (endsWith("biliti"))
                r("ble");
        }
        else if (_b[_k - 1] == 'g')
        {
            /**
             * --DEPARTURE--
             * To match the published algorithm, delete this phrase
             */
            if (endsWith("logi"))
                r("log");
        }
    }

    /**
     * step3() deals with -ic-, -full, -ness etc. similar strategy to step2.
     */
    void step3()
    {
        if (_b[_k] == 'e')
        {
            if (endsWith("icate")) r("ic");
            else if (endsWith("ative")) r("");
            else if (endsWith("alize")) r("al");
        }
        else if (_b[_k] == 'i')
        {
            if (endsWith("iciti")) r("ic");
        }
        else if (_b[_k] == 'l')
        {
            if (endsWith("ical")) r("ic");
            else if (endsWith("ful")) r("");
        }
        else if (_b[_k] == 's')
        {
            if (endsWith("ness")) r("");
        }
    }

    /**
     * step4() takes off -ant, -ence etc., in context <c>vcvc<v>.
     */
    void step4()
    {
        /* fixes bug 1 */
        if (_k == 0)
            return;
        switch (_b[_k - 1])
        {
        case 'a':
            if (endsWith("al"))
                break;
            return;
        case 'c':
            if (endsWith("ance") || endsWith("ence"))
                break;
            return;
        case 'e':
            if (endsWith("er"))
                break;
            return;
        case 'i':
            if (endsWith("ic"))
                break;
            return;
        case 'l':
            if (endsWith("able") || endsWith("ible"))
                break;
            return;
        case 'n':
            if (endsWith("ant") || endsWith("ement") || endsWith("ment") || endsWith("ent"))
                break;
            return;
        case 'o':
            if (endsWith("ion") && _j >= 0 && (_b[_j] == 's' || _b[_j] == 't'))
            {
                /* _j >= 0 fixes bug 2 */
                break;
            }
            if (endsWith("ou"))
                break;
            return;
        case 's':
            if (endsWith("ism"))
                break;
            return;
        case 't':
            if (endsWith("ate") || endsWith("iti"))
                break;
            return;
        case 'u':
            if (endsWith("ous"))
                break;
            return;
        case 'v':
            if (endsWith("ive"))
                break;
            return;
        case 'z':
            if (endsWith("ize"))
                break;
            return;
        default:
            return;
        }

        // A suffix matched; drop it only if the remaining stem is long enough.
        if (m() > 1)
            _k = _j;
    }

    /**
     * step5() removes a final -e if m() > 1, and changes -ll to -l if m() > 1.
     * The word "false" is exempt from the final -e removal.
     */
    void step5()
    {
        _j = _k;
        // Compare the whole current word (including its final 'e'); comparing
        // only `_b[0 .. _k]` could never equal "false" for the word "false".
        if (_b[_k] == 'e' &&
            _b[_k0 .. _k + 1] != `false`)
        {
            auto a = m();
            if (a > 1 || (a == 1 && !cvc(_k - 1)))
                _k--;
        }
        if (_b[_k] == 'l' && doublec(_k) && m() > 1)
            _k--;
    }
}

unittest
{
    scope stemmer = new Stemmer!string();

    assert(stemmer.stem("") == "");
    assert(stemmer.stem("x") == "x");
    assert(stemmer.stem("xyz") == "xyz");
    assert(stemmer.stem("win") == "win");
    // TODO assert(stemmer.stem("winner") == "win");
    assert(stemmer.stem("winning") == "win");
    assert(stemmer.stem("farted") == "fart");
    assert(stemmer.stem("caresses") == "caress");
    assert(stemmer.stem("ponies") == "poni");
    assert(stemmer.stem("ties") == "ti");
    assert(stemmer.stem("caress") == "caress");
    assert(stemmer.stem("cats") == "cat");
    assert(stemmer.stem("feed") == "feed");
    assert(stemmer.stem("matting") == "mat");
    assert(stemmer.stem("mating") == "mate");
    assert(stemmer.stem("meeting") == "meet");
    assert(stemmer.stem("milling") == "mill");
    assert(stemmer.stem("messing") == "mess");
    assert(stemmer.stem("meetings") == "meet");
    assert(stemmer.stem("neutralize") == "neutral");
    assert(stemmer.stem("relational") == "relat");
    assert(stemmer.stem("relational") == "relat");
    assert(stemmer.stem("intricate") == "intric");

    assert(stemmer.stem("connection") == "connect");
    assert(stemmer.stem("connective") == "connect");
    assert(stemmer.stem("connecting") == "connect");

    assert(stemmer.stem("agreed") == "agre");
    assert(stemmer.stem("disabled") == "disabl");
    assert(stemmer.stem("gentle") == "gentl");
    assert(stemmer.stem("gently") == "gentli");
    assert(stemmer.stem("served") == "serv");
    assert(stemmer.stem("competes") == "compet");

    assert(stemmer.stem("fullnessful") == "fullness");
    assert(stemmer.stem(stemmer.stem("fullnessful")) == "full");

    assert(stemmer.stem("bee") == "bee");

    assert(stemmer.stem("dogs") == "dog");
    assert(stemmer.stem("churches") == "church");
    assert(stemmer.stem("hardrock") == "hardrock");

    // "false" is exempt from final -e removal in step5.
    assert(stemmer.stem("false") == "false");

    // Words reduced to a single character by step 1 must not index before
    // the start of the buffer in step2.
    assert(stemmer.stem("aed") == "a");
    assert(stemmer.stem("ies") == "i");
}

import nxt.dbgio;

/** Stem Swedish Word $(D s).
 *
 * Applies a fixed sequence of suffix rules (definite forms, plurals,
 * comparatives/superlatives, participles, ...) and returns at the first rule
 * that applies. If no rule applies, $(D s) is returned unchanged.
 */
auto ref stemSwedish(S)(S s)
if (isSomeString!S)
{
    enum ar = `ar`;
    enum or = `or`;
    enum er = `er`;
    enum ya = `ya`;

    enum en = `en`;
    enum ern = `ern`;
    enum an = `an`;
    enum na = `na`;
    enum et = `et`;
    enum aste = `aste`;
    enum are = `are`;
    enum ast = `ast`;
    enum iserad = `iserad`;
    enum de = `de`;
    enum ing = `ing`;
    enum igt = `igt`;
    enum llt = `llt`;

    // Whole-word exceptions.
    switch (s)
    {
    case `samtida`: return `samtid`;
    default: break;
    }

    if (s.endsWith(`n`))
    {
        if (s.endsWith(en))
        {
            const t = s[0 .. $ - en.length];
            if (s.among!(`även`))
            {
                return s;
            }
            else if (t.among!(`sann`))
            {
                return t;
            }
            else if (t.endsWith(`mm`, `nn`))
            {
                // Collapse the doubled consonant: "mannen" -> "man".
                return t[0 .. $ - 1];
            }
            return t;
        }
        if (s.endsWith(ern))
        {
            return s[0 .. $ - 1];
        }
        if (s.endsWith(an))
        {
            const t = s[0 .. $ - an.length];
            if (t.length >= 3 &&
                t.endsWith(`tt`, `mp`, `ck`, `st`))
            {
                return s[0 ..$ - 1];
            }
            else if (t.length >= 2 &&
                     t.endsWith(`n`, `p`))
            {
                return s[0 ..$ - 1];
            }
            else if (t.length < 3)
            {
                return s;
            }
            return t;
        }
    }

    if (s.endsWith(igt))
    {
        return s[0 .. $ - 1];
    }

    if (s.endsWith(ya))
    {
        return s[0 .. $ - 1];
    }

    if (s.endsWith(na))
    {
        if (s.among!(`sina`, `dina`, `mina`))
        {
            return s[0 .. $ - 1];
        }
        auto t = s[0 .. $ - na.length];
        if (t.endsWith(`r`))
        {
            if (t.endsWith(ar, or, er))
            {
                const u = t[0 .. $ - ar.length];
                if (u.canFind!(a => a.isSwedishVowel))
                {
                    return u;
                }
                else
                {
                    return t[0 .. $ - 1];
                }
            }
        }
        // Otherwise fall through to the remaining rules.
    }

    if (s.endsWith(et))
    {
        const t = s[0 .. $ - et.length];
        if (t.length >= 3 &&
            t[$ - 3].isSwedishConsonant &&
            t[$ - 2].isSwedishConsonant &&
            t[$ - 1].isSwedishConsonant)
        {
            return s[0 .. $ - 1];
        }
        else if (t.endsWith(`ck`))
        {
            return s[0 .. $ - 1];
        }

        return t;
    }

    if (s.endsWith(ar, or, er))
    {
        const t = s[0 .. $ - ar.length];
        if (t.canFind!(a => a.isSwedishVowel))
        {
            if (t.endsWith(`mm`, `nn`))
            {
                return t[0 .. $ - 1];
            }
            else
            {
                return t;
            }
        }
        else
        {
            return s[0 .. $ - 1];
        }
    }

    if (s.endsWith(aste))
    {
        const t = s[0 .. $ - aste.length];
        if (t.among!(`sann`))
        {
            return t;
        }
        if (t.endsWith(`mm`, `nn`))
        {
            return t[0 .. $ - 1];
        }
        if (t.canFind!(a => a.isSwedishVowel))
        {
            return t;
        }
    }

    if (s.endsWith(are, ast))
    {
        const t = s[0 .. $ - are.length];
        if (t.among!(`sann`))
        {
            return t;
        }
        if (t.endsWith(`mm`, `nn`))
        {
            return t[0 .. $ - 1];
        }
        if (t.canFind!(a => a.isSwedishVowel))
        {
            return t;
        }
    }

    if (s.endsWith(iserad))
    {
        const t = s[0 .. $ - iserad.length];
        if (!t.endsWith(`n`))
        {
            return t;
        }
    }

    if (s.endsWith(de))
    {
        enum ande = `ande`;
        if (s.endsWith(ande))
        {
            const t = s[0 .. $ - ande.length];
            if (t.empty)
            {
                return s;
            }
            else if (t[$ - 1].isSwedishConsonant)
            {
                return s[0 .. $ - 3];
            }
            return t;
        }
        if (s.among!(`hade`))
        {
            return s;
        }
        const t = s[0 .. $ - de.length];
        return t;
    }

    if (s.endsWith(ing))
    {
        enum ning = `ning`;
        if (s.endsWith(ning))
        {
            const t = s[0 .. $ - ning.length];
            if (!t.endsWith(`n`) &&
                t != `tid`)
            {
                return t;
            }
        }
        return s[0 .. $ - ing.length];
    }

    if (s.endsWith(llt))
    {
        return s[0 .. $ - 1];
    }

    return s;
}

unittest
{
    // import nxt.assert_ex;

    assert("rumpan".stemSwedish == "rumpa");
    assert("sopan".stemSwedish == "sopa");
    assert("kistan".stemSwedish == "kista");

    assert("karl".stemSwedish == "karl");

    assert("grenen".stemSwedish == "gren");
    assert("busen".stemSwedish == "bus");
    assert("husen".stemSwedish == "hus");
    assert("räven".stemSwedish == "räv");
    assert("dunken".stemSwedish == "dunk");
    assert("männen".stemSwedish == "män");
    assert("manen".stemSwedish == "man");
    assert("mannen".stemSwedish == "man");

    assert("skalet".stemSwedish == "skal");
    assert("karet".stemSwedish == "kar");
    assert("taket".stemSwedish == "tak");
    assert("stinget".stemSwedish == "sting");

    assert("äpplet".stemSwedish == "äpple");

    assert("jakt".stemSwedish == "jakt");

    assert("sot".stemSwedish == "sot");
    assert("sotare".stemSwedish == "sot");

    assert("klok".stemSwedish == "klok");
    assert("klokare".stemSwedish == "klok");
    assert("klokast".stemSwedish == "klok");

    assert("stark".stemSwedish == "stark");
    assert("starkare".stemSwedish == "stark");
    assert("starkast".stemSwedish == "stark");

    assert("kort".stemSwedish == "kort");
    assert("kortare".stemSwedish == "kort");
    assert("kortast".stemSwedish == "kort");

    assert("rolig".stemSwedish == "rolig");
    assert("roligare".stemSwedish == "rolig");
    assert("roligast".stemSwedish == "rolig");

    assert("dum".stemSwedish == "dum");
    assert("dummare".stemSwedish == "dum");
    assert("dummast".stemSwedish == "dum");
    assert("dummaste".stemSwedish == "dum");
    assert("senaste".stemSwedish == "sen");

    assert("sanning".stemSwedish == "sann");
    assert("sann".stemSwedish == "sann");
    assert("sannare".stemSwedish == "sann");
    assert("sannare".stemSwedish == "sann");

    assert("stare".stemSwedish == "stare");
    assert("kvast".stemSwedish == "kvast");

    assert("täcket".stemSwedish == "täcke");
    assert("räcket".stemSwedish == "räcke");

    assert("van".stemSwedish == "van");
    assert("dan".stemSwedish == "dan");
    assert("man".stemSwedish == "man");
    assert("ovan".stemSwedish == "ovan");
    assert("stan".stemSwedish == "stan");
    assert("klan".stemSwedish == "klan");

    assert("klockan".stemSwedish == "klocka");
    assert("klockande".stemSwedish == "klocka");
    assert("sockan".stemSwedish == "socka");
    assert("rockan".stemSwedish == "rocka");
    assert("rock".stemSwedish == "rock");

    assert("agenter".stemSwedish == "agent");
    assert("agenterna".stemSwedish == "agent");
    assert("regenter".stemSwedish == "regent");
    assert("regenterna".stemSwedish == "regent");

    assert("brodern".stemSwedish == "broder");
    assert("kärnan".stemSwedish == "kärna");

    assert("skorna".stemSwedish == "sko");

    assert("inträffade".stemSwedish == "inträffa");
    assert("roa".stemSwedish == "roa");
    assert("roade".stemSwedish == "roa");
    assert("hade".stemSwedish == "hade");
    assert("hades".stemSwedish == "hades");

    assert("fullt".stemSwedish == "full");

    assert("kanaliserad".stemSwedish == "kanal");
    assert("alkoholiserad".stemSwedish == "alkohol");

    assert("roande".stemSwedish == "ro");

    /* assertEqual("ror".stemSwedish, "ro"); */
    /* assertEqual("öbor".stemSwedish, "öbo"); */

    assert("ande".stemSwedish == "ande");

    assert("störande".stemSwedish == "störa");
    assert("nekande".stemSwedish == "neka");
    assert("jagande".stemSwedish == "jaga");
    assert("stimulerande".stemSwedish == "stimulera");

    assert("karlar".stemSwedish == "karl");
    assert("lagar".stemSwedish == "lag");

    assert("sina".stemSwedish == "sin");
    assert("dina".stemSwedish == "din");
    assert("mina".stemSwedish == "min");

    assert("även".stemSwedish == "även");

    assert("samtida".stemSwedish == "samtid");

    assert("trattar".stemSwedish == "tratt");

    assert("katter".stemSwedish == "katt");
    assert("dagar".stemSwedish == "dag");
    assert("öar".stemSwedish == "ö");
    assert("åar".stemSwedish == "å");
    assert("ängar".stemSwedish == "äng");

    assert("spelar".stemSwedish == "spel");
    assert("drar".stemSwedish == "dra");

    assert("kullar".stemSwedish == "kull");
    assert("kullarna".stemSwedish == "kull");

    assert("mamma".stemSwedish == "mamma");

    assert("bestyr".stemSwedish == "bestyr");

    assert("krya".stemSwedish == "kry");
    assert("nya".stemSwedish == "ny");

    assert("lemmar".stemSwedish == "lem");

    /* assertEqual("ämnar".stemSwedish, "ämna"); */
    /* assert("rämnar".stemSwedish == "rämna"); */
    /* assert("lämnar".stemSwedish == "lämna"); */
}

/** Stem Norwegian word $(D s).
 *
 * Calls `skipOverBack` with the suffix `ede` on $(D s) and returns $(D s).
 * NOTE(review): presumably `skipOverBack` strips a trailing `ede` in place
 * when present; confirm against `nxt.skip_ex`.
 */
auto ref stemNorvegian(S)(S s)
if (isSomeString!S)
{
    s.skipOverBack(`ede`);
    return s;
}

/** Stem $(D s) in Language $(D lang).
 *
 * For `Lang.unknown`, English, Swedish and Norwegian are tried in that order
 * and the first result whose length differs from $(D s) is returned; if none
 * differs, the result of the last (Norwegian) attempt is returned. For
 * `Lang.sv` and `Lang.no` the corresponding stemmer is used. `Lang.en` and
 * every other language use the English `Stemmer` and are tagged with the
 * $(D lang) that was passed in.
 *
 * Returns: a tuple of the stem and the language it was stemmed in.
 */
Tuple!(S, Lang) stemIn(S)(S s, Lang lang = Lang.init)
if (isSomeString!S)
{
    typeof(return) t;
    switch (lang) with (Lang)
    {
    case unknown:
        t = s.stemIn(en); if (t[0].length != s.length) return t;
        t = s.stemIn(sv); if (t[0].length != s.length) return t;
        t = s.stemIn(no); if (t[0].length != s.length) return t;
        break;
    case sv: t = tuple(s.stemSwedish, sv); break;
    case no: t = tuple(s.stemNorvegian, no); break;
    case en:
    default:
        auto stemmer = new Stemmer!string();
        t = tuple(stemmer.stem(s), lang);
    }
    return t;
}

/** Destructively Stem $(D s) in Language $(D lang).
 *
 * Replaces $(D s) with the stem returned by `stemIn`.
 *
 * Returns: a tuple of a flag telling whether the length of $(D s) changed
 * and the language reported by `stemIn`.
 */
Tuple!(bool, Lang) stemize(S)(ref S s, Lang lang = Lang.init)
if (isSomeString!S)
{
    const n = s.length;
    auto t = s.stemIn(lang);
    s = t[0];
    return tuple(n != s.length, t[1]);
}

/** Return Stem of $(D s) using Porter's algorithm
 *
 * Simplified variant: only a handful of the suffix rules are applied. Step 1a
 * always runs; after it at most one rule out of the combined step 2 / step 3
 * chain is applied.
 *
 * See_Also: https://en.wikipedia.org/wiki/I_m_still_remembering
 * See_Also: https://en.wikipedia.org/wiki/Martin_Porter
 * See_Also: https://www.youtube.com/watch?v=2s7f8mBwnko&list=PL6397E4B26D00A269&index=4.
 */
S alternativePorterStemEnglish(S)(S s)
if (isSomeString!S)
{
    /* Step 1a */
    if (s.endsWith(`sses`)) { s = s[0 .. $-2]; }
    else if (s.endsWith(`ies`)) { s = s[0 .. $-2]; }
    else if (s.endsWith(`ss`)) { }
    else if (s.endsWith(`s`)) { s = s[0 .. $-1]; }

    /* Step 2 */
    if (s.endsWith(`ational`)) { s = s[0 .. $-7] ~ `ate`; }
    else if (s.endsWith(`izer`)) { s = s[0 .. $-1]; }
    else if (s.endsWith(`ator`)) { s = s[0 .. $-2] ~ `e`; }

    /* Step 3 */
    else if (s.endsWith(`al`)) { s = s[0 .. $-2] ~ `e`; }
    else if (s.endsWith(`able`)) { s = s[0 .. $-4]; }
    else if (s.endsWith(`ate`)) { s = s[0 .. $-3] ~ `e`; }

    return s;
}

unittest
{
    assert(`caresses`.alternativePorterStemEnglish == `caress`);
    assert(`ponies`.alternativePorterStemEnglish == `poni`);
    assert(`caress`.alternativePorterStemEnglish == `caress`);
    assert(`cats`.alternativePorterStemEnglish == `cat`);

    assert(`relational`.alternativePorterStemEnglish == `relate`);
    assert(`digitizer`.alternativePorterStemEnglish == `digitize`);
    assert(`operator`.alternativePorterStemEnglish == `operate`);

    assert(`revival`.alternativePorterStemEnglish == `revive`);
    assert(`adjustable`.alternativePorterStemEnglish == `adjust`);
    assert(`activate`.alternativePorterStemEnglish == `active`);
}