1 /** Stemming algorithms
2 */
3 module nxt.stemming;
4
5 import std.algorithm.comparison: among;
6 import std.algorithm.searching : endsWith, canFind;
7 import std.range: empty;
8 import std.traits: isSomeString;
9 import std.typecons : Tuple, tuple;
10
11 import nxt.languages : Lang;
12 import nxt.lingua : isEnglishVowel, isSwedishVowel, isSwedishConsonant, isEnglishConsonant;
13 import nxt.skip_ex : skipOverBack;
14
/** Porter stemmer for English words held in a string of type `S`.
 *
 * The word being processed and three offsets into it are kept as private
 * state; every call to `stem` overwrites that state.
 *
 * The implementation follows the classic step1ab .. step5 structure and
 * contains a few deliberate departures from the published algorithm, each
 * marked with `--DEPARTURE--` or explained at the place where it happens.
 */
public class Stemmer(S)
if (isSomeString!S)
{
    /** Stem the word `p`.
     *
     * Params:
     *   p = word to stem. All suffix tests compare against lower-case
     *       ASCII literals.
     *
     * Returns: `p` unchanged when it has fewer than three code units,
     * otherwise the slice `_b[_k0 .. _k + 1]` of the working buffer after
     * all steps have run. The result is never longer than `p`.
     */
    public S stem(S p)
    {
        _b = p;
        _k = cast(ptrdiff_t) p.length - 1; // -1 for the empty string
        _k0 = 0;
        _j = 0; // start every word with a fresh suffix cursor

        /* Strings of length 1 or 2 don't go through the stemming process,
         * although no mention is made of this in the published algorithm.
         * Remove the check to match the published algorithm.
         */
        if (_k <= _k0 + 1)
            return _b;

        step1ab();
        step1c();
        step2();
        step3();
        step4();
        step5();
        return _b[_k0 .. _k + 1];
    }

private:
    S _b;               // buffer holding the word being stemmed
    ptrdiff_t _k = 0;   // offset of the last character of the current word
    ptrdiff_t _k0 = 0;  // offset of the first character (always 0 here)
    ptrdiff_t _j = 0;   // offset of the last stem character, set by endsWith

    /** Returns true if `_b[i]` is a consonant.
     *
     * A 'y' counts as a consonant at the start of the word or when it
     * follows a vowel; after a consonant it counts as a vowel.
     */
    bool isConsonant(ptrdiff_t i)
    {
        if (_b[i].isEnglishVowel)
            return false;
        if (_b[i] == 'y')
            return i == _k0 || !isConsonant(i - 1);
        return true;
    }

    /** Returns the number of vowel-consonant sequences between `_k0` and `_j`.
     *
     * If c is a consonant sequence and v a vowel sequence, and <..>
     * indicates arbitrary presence,
     *
     *   <c><v>       gives 0
     *   <c>vc<v>     gives 1
     *   <c>vcvc<v>   gives 2
     *   <c>vcvcvc<v> gives 3
     */
    size_t m()
    {
        ptrdiff_t n = 0;
        ptrdiff_t i = _k0;

        // Skip the optional leading consonant sequence.
        while (true)
        {
            if (i > _j)
                return n;
            if (!isConsonant(i))
                break;
            i++;
        }
        i++;

        while (true)
        {
            // Skip a vowel sequence ...
            while (true)
            {
                if (i > _j)
                    return n;
                if (isConsonant(i))
                    break;
                i++;
            }
            i++;
            n++; // ... which was followed by a consonant: one more "vc".

            // Skip the consonant sequence.
            while (true)
            {
                if (i > _j)
                    return n;
                if (!isConsonant(i))
                    break;
                i++;
            }
            i++;
        }
    }

    /** Returns true if `_k0 .. _j` (inclusive) contains a vowel. */
    bool hasVowelInStem()
    {
        for (ptrdiff_t i = _k0; i < _j + 1; i++)
        {
            if (!isConsonant(i))
                return true;
        }
        return false;
    }

    /** Returns true if positions `j - 1` and `j` hold the same consonant. */
    bool doublec(ptrdiff_t j)
    {
        if (j < (_k0 + 1))
            return false;
        if (_b[j] != _b[j - 1])
            return false;
        return isConsonant(j);
    }

    /** Returns true if `i-2, i-1, i` has the form consonant - vowel -
     * consonant and the second consonant is not w, x or y. This is used
     * when trying to restore an e at the end of a short word, e.g.
     *
     *   cav(e), lov(e), hop(e), crim(e), but
     *   snow, box, tray.
     */
    bool cvc(ptrdiff_t i)
    {
        if (i < (_k0 + 2) || !isConsonant(i) || isConsonant(i - 1) || !isConsonant(i - 2))
            return false;
        if (_b[i] == 'w' || _b[i] == 'x' || _b[i] == 'y')
            return false;
        return true;
    }

    /** Returns true if `_k0 .. _k` ends with the non-empty string `s`.
     *
     * On success `_j` is set to the offset just before the suffix (which
     * is -1 when the suffix is the whole word); on failure `_j` is left
     * untouched.
     */
    bool endsWith(T)(T s)
    if (isSomeString!T)
    {
        const len = s.length;

        // Cheap rejection on the last character before slicing.
        if (s[len - 1] != _b[_k])
            return false;
        if (len > (_k - _k0 + 1))
            return false;

        const a = _k - len + 1;
        const b = _k + 1;

        if (_b[a .. b] != s)
            return false;

        _j = _k - len;
        return true;
    }

    /** Overwrites the characters after `_j` with `s` and sets `_k` to the
     * end of `s`. The buffer keeps its length: whatever followed the
     * overwritten span is kept behind the new end as unused tail.
     */
    void setto(T)(T s)
    if (isSomeString!T)
    {
        _b = _b[0 .. _j + 1] ~ s ~ _b[_j + s.length + 1 .. _b.length];
        _k = _j + s.length;
    }

    /** Replaces the current suffix with `s` if the stem has `m() > 0`. */
    void r(T)(T s)
    if (isSomeString!T)
    {
        if (m() > 0)
            setto(s);
    }

    /** Gets rid of plurals and -ed or -ing. */
    void step1ab()
    {
        if (_b[_k] == 's')
        {
            if (endsWith("sses"))
            {
                _k = _k - 2;
            }
            else if (endsWith("ies"))
            {
                setto("i");
            }
            else if (_b[_k - 1] != 's')
            {
                _k--;
            }
        }
        if (endsWith("eed"))
        {
            if (m() > 0)
                _k--;
        }
        else if ((endsWith("ed") || endsWith("ing")) && hasVowelInStem())
        {
            _k = _j;
            if (endsWith("at"))
            {
                setto("ate");
            }
            else if (endsWith("bl"))
            {
                setto("ble");
            }
            else if (endsWith("iz"))
            {
                setto("ize");
            }
            else if (doublec(_k))
            {
                _k--;
                // -ll, -ss and -zz keep their double consonant.
                if (_b[_k] == 'l' || _b[_k] == 's' || _b[_k] == 'z')
                    _k++;
            }
            else if (m() == 1 && cvc(_k))
            {
                setto("e");
            }
        }
    }

    /** Turns terminal y to i when there is another vowel in the stem.
     * Words ending in "day" are left alone.
     */
    void step1c()
    {
        if (endsWith("y") &&
            !endsWith("day") &&
            hasVowelInStem())
        {
            _b = _b[0 .. _k] ~ 'i' ~ _b[_k + 1 .. _b.length];
        }
    }

    /** Maps double suffixes to single ones, so -ization (= -ize plus
     * -ation) maps to -ize etc. The string before the suffix must give
     * `m() > 0`.
     */
    void step2()
    {
        // A one-letter word (e.g. "ies" reduced to "i" by step1ab) has no
        // penultimate character; indexing _b[_k - 1] would be out of range.
        if (_k == _k0)
            return;

        switch (_b[_k - 1])
        {
        case 'a':
            if (endsWith("ational"))
                r("ate");
            else if (endsWith("tional"))
                r("tion");
            break;
        case 'c':
            if (endsWith("enci"))
                r("ence");
            else if (endsWith("anci"))
                r("ance");
            break;
        case 'e':
            if (endsWith("izer"))
                r("ize");
            break;
        case 'l':
            if (endsWith("bli"))
                r("ble");
            /* --DEPARTURE--
             * To match the published algorithm, replace this phrase with
             *   if (endsWith("abli"))
             *       r("able");
             */
            else if (endsWith("alli"))
                r("al");
            else if (endsWith("entli"))
                r("ent");
            else if (endsWith("eli"))
                r("e");
            else if (endsWith("ousli"))
                r("ous");
            break;
        case 'o':
            if (endsWith("ization"))
                r("ize");
            else if (endsWith("ation") || endsWith("ator"))
                r("ate");
            break;
        case 's':
            if (endsWith("alism"))
                r("al");
            else if (endsWith("iveness"))
                r("ive");
            else if (endsWith("fulness"))
                r("ful");
            else if (endsWith("ousness"))
                r("ous");
            break;
        case 't':
            if (endsWith("aliti"))
                r("al");
            else if (endsWith("iviti"))
                r("ive");
            else if (endsWith("biliti"))
                r("ble");
            break;
        case 'g':
            /* --DEPARTURE--
             * To match the published algorithm, delete this phrase.
             */
            if (endsWith("logi"))
                r("log");
            break;
        default:
            break;
        }
    }

    /** Deals with -ic-, -full, -ness etc. Similar strategy to step2. */
    void step3()
    {
        switch (_b[_k])
        {
        case 'e':
            if (endsWith("icate")) r("ic");
            else if (endsWith("ative")) r("");
            else if (endsWith("alize")) r("al");
            break;
        case 'i':
            if (endsWith("iciti")) r("ic");
            break;
        case 'l':
            if (endsWith("ical")) r("ic");
            else if (endsWith("ful")) r("");
            break;
        case 's':
            if (endsWith("ness")) r("");
            break;
        default:
            break;
        }
    }

    /** Takes off -ant, -ence etc., in context <c>vcvc<v>. */
    void step4()
    {
        /* fixes bug 1: a one-letter word has no penultimate character */
        if (_k == _k0)
            return;

        switch (_b[_k - 1])
        {
        case 'a':
            if (endsWith("al"))
                break;
            return;
        case 'c':
            if (endsWith("ance") || endsWith("ence"))
                break;
            return;
        case 'e':
            if (endsWith("er"))
                break;
            return;
        case 'i':
            if (endsWith("ic"))
                break;
            return;
        case 'l':
            if (endsWith("able") || endsWith("ible"))
                break;
            return;
        case 'n':
            if (endsWith("ant") || endsWith("ement") || endsWith("ment") || endsWith("ent"))
                break;
            return;
        case 'o':
            /* _j >= 0 fixes bug 2: "ion" may be the whole word */
            if (endsWith("ion") && _j >= 0 && (_b[_j] == 's' || _b[_j] == 't'))
                break;
            if (endsWith("ou"))
                break;
            return;
        case 's':
            if (endsWith("ism"))
                break;
            return;
        case 't':
            if (endsWith("ate") || endsWith("iti"))
                break;
            return;
        case 'u':
            if (endsWith("ous"))
                break;
            return;
        case 'v':
            if (endsWith("ive"))
                break;
            return;
        case 'z':
            if (endsWith("ize"))
                break;
            return;
        default:
            return;
        }

        // Only reached when one of the suffixes above matched.
        if (m() > 1)
            _k = _j;
    }

    /** Removes a final -e if `m() > 1` (or `m() == 1` and the stem is not
     * cvc), and changes -ll to -l if `m() > 1`.
     *
     * --DEPARTURE-- the word "false" keeps its final e.
     */
    void step5()
    {
        _j = _k;
        // Compare the whole current word (including the final 'e'); a
        // slice that stops before _k could never equal "false".
        if (_b[_k] == 'e' &&
            _b[_k0 .. _k + 1] != "false")
        {
            const a = m();
            if (a > 1 || (a == 1 && !cvc(_k - 1)))
                _k--;
        }
        if (_b[_k] == 'l' && doublec(_k) && m() > 1)
            _k--;
    }
}
477
/// Exercises `Stemmer!string.stem` on short words, plurals, -ed/-ing forms
/// and derivational suffixes.
unittest
{
    scope stemmer = new Stemmer!string();

    with (stemmer)
    {
        // words too short to be stemmed, or without a known suffix
        assert(stem("") == "");
        assert(stem("x") == "x");
        assert(stem("xyz") == "xyz");
        assert(stem("win") == "win");
        // TODO assert(stem("winner") == "win");

        // -ing / -ed
        assert(stem("winning") == "win");
        assert(stem("farted") == "fart");

        // plurals
        assert(stem("caresses") == "caress");
        assert(stem("ponies") == "poni");
        assert(stem("ties") == "ti");
        assert(stem("caress") == "caress");
        assert(stem("cats") == "cat");

        assert(stem("feed") == "feed");
        assert(stem("matting") == "mat");
        assert(stem("mating") == "mate");
        assert(stem("meeting") == "meet");
        assert(stem("milling") == "mill");
        assert(stem("messing") == "mess");
        assert(stem("meetings") == "meet");

        // derivational suffixes
        assert(stem("neutralize") == "neutral");
        assert(stem("relational") == "relat");
        assert(stem("relational") == "relat");
        assert(stem("intricate") == "intric");

        assert(stem("connection") == "connect");
        assert(stem("connective") == "connect");
        assert(stem("connecting") == "connect");

        assert(stem("agreed") == "agre");
        assert(stem("disabled") == "disabl");
        assert(stem("gentle") == "gentl");
        assert(stem("gently") == "gentli");
        assert(stem("served") == "serv");
        assert(stem("competes") == "compet");

        // stemming a stem strips one more layer
        assert(stem("fullnessful") == "fullness");
        assert(stem(stem("fullnessful")) == "full");

        assert(stem("bee") == "bee");

        assert(stem("dogs") == "dog");
        assert(stem("churches") == "church");
        assert(stem("hardrock") == "hardrock");

        // TODO assert(stem("false") == "false");
    }
}
528
529 import nxt.dbgio;
530
/** Stem the Swedish word $(D s).
 *
 * The word is matched against a fixed sequence of suffix rules (definite
 * forms in -en/-ern/-an/-na/-et, plurals in -ar/-or/-er, comparatives in
 * -are/-ast/-aste, -iserad, -de/-ande, -ing/-ning, -igt, -ya, -llt). The
 * first rule that applies decides the result; a rule that matches the
 * suffix but none of its sub-cases falls through to the following rules.
 *
 * Returns: the stem, or $(D s) itself when no rule applies.
 */
auto ref stemSwedish(S)(S s)
if (isSomeString!S)
{
    // Suffixes recognised below.
    enum ar = `ar`;
    enum or = `or`;
    enum er = `er`;
    enum ya = `ya`;

    enum en = `en`;
    enum ern = `ern`;
    enum an = `an`;
    enum na = `na`;
    enum et = `et`;
    enum aste = `aste`;
    enum are = `are`;
    enum ast = `ast`;
    enum iserad = `iserad`;
    enum de = `de`;
    enum ing = `ing`;
    enum igt = `igt`;
    enum llt = `llt`;

    // Irregular word handled before any suffix rule.
    if (s == `samtida`)
        return `samtid`;

    // --- words ending in -n: -en, -ern, -an ---------------------------
    if (s.endsWith(`n`))
    {
        if (s.endsWith(en))
        {
            const base = s[0 .. $ - en.length];
            if (s == `även`)
                return s;
            // A doubled m/n is reduced to one, except for "sann".
            if (base != `sann` && base.endsWith(`mm`, `nn`))
                return base[0 .. $ - 1];
            return base;
        }
        if (s.endsWith(ern))
            return s[0 .. $ - 1];
        if (s.endsWith(an))
        {
            const base = s[0 .. $ - an.length];
            // After these consonant endings only the final n is dropped.
            const dropOnlyN =
                (base.length >= 3 && base.endsWith(`tt`, `mp`, `ck`, `st`)) ||
                (base.length >= 2 && base.endsWith(`n`, `p`));
            if (dropOnlyN)
                return s[0 .. $ - 1];
            if (base.length < 3)
                return s; // too short to be stemmed
            return base;
        }
    }

    // --- -igt and -ya lose their last letter --------------------------
    if (s.endsWith(igt) || s.endsWith(ya))
        return s[0 .. $ - 1];

    // --- -na -----------------------------------------------------------
    if (s.endsWith(na))
    {
        if (s.among!(`sina`, `dina`, `mina`))
            return s[0 .. $ - 1];

        auto rest = s[0 .. $ - na.length];
        if (rest.endsWith(`r`) && rest.endsWith(ar, or, er))
        {
            const core = rest[0 .. $ - ar.length];
            if (core.canFind!(c => c.isSwedishVowel))
                return core;
            return rest[0 .. $ - 1];
        }
    }

    // --- -et -----------------------------------------------------------
    if (s.endsWith(et))
    {
        const base = s[0 .. $ - et.length];
        // After three consonants, or after "ck", only the t is dropped.
        const dropOnlyT =
            (base.length >= 3 &&
             base[$ - 3].isSwedishConsonant &&
             base[$ - 2].isSwedishConsonant &&
             base[$ - 1].isSwedishConsonant) ||
            base.endsWith(`ck`);
        if (dropOnlyT)
            return s[0 .. $ - 1];
        return base;
    }

    // --- -ar, -or, -er -------------------------------------------------
    if (s.endsWith(ar, or, er))
    {
        const base = s[0 .. $ - ar.length];
        if (!base.canFind!(c => c.isSwedishVowel))
            return s[0 .. $ - 1]; // keep the vowel, drop only the r
        if (base.endsWith(`mm`, `nn`))
            return base[0 .. $ - 1];
        return base;
    }

    // --- -aste ---------------------------------------------------------
    if (s.endsWith(aste))
    {
        const base = s[0 .. $ - aste.length];
        if (base == `sann`)
            return base;
        if (base.endsWith(`mm`, `nn`))
            return base[0 .. $ - 1];
        if (base.canFind!(c => c.isSwedishVowel))
            return base;
    }

    // --- -are, -ast (both three letters long) ---------------------------
    if (s.endsWith(are, ast))
    {
        const base = s[0 .. $ - are.length];
        if (base == `sann`)
            return base;
        if (base.endsWith(`mm`, `nn`))
            return base[0 .. $ - 1];
        if (base.canFind!(c => c.isSwedishVowel))
            return base;
    }

    // --- -iserad -------------------------------------------------------
    if (s.endsWith(iserad))
    {
        const base = s[0 .. $ - iserad.length];
        if (!base.endsWith(`n`))
            return base;
    }

    // --- -de, -ande ----------------------------------------------------
    if (s.endsWith(de))
    {
        enum ande = `ande`;
        if (s.endsWith(ande))
        {
            const base = s[0 .. $ - ande.length];
            if (base.length == 0)
                return s; // the word is exactly "ande"
            if (base[$ - 1].isSwedishConsonant)
                return s[0 .. $ - 3]; // keep the a
            return base;
        }
        if (s == `hade`)
            return s;
        const base = s[0 .. $ - de.length];
        return base;
    }

    // --- -ing, -ning ---------------------------------------------------
    if (s.endsWith(ing))
    {
        enum ning = `ning`;
        if (s.endsWith(ning))
        {
            const base = s[0 .. $ - ning.length];
            if (!base.endsWith(`n`) &&
                base != `tid`)
                return base;
        }
        return s[0 .. $ - ing.length];
    }

    // --- -llt ----------------------------------------------------------
    if (s.endsWith(llt))
        return s[0 .. $ - 1];

    return s;
}
766
/// Exercises `stemSwedish` on definite forms, plurals, comparatives,
/// participles and words that must be left untouched.
unittest
{
    // import nxt.assert_ex;

    // -an
    assert(stemSwedish("rumpan") == "rumpa");
    assert(stemSwedish("sopan") == "sopa");
    assert(stemSwedish("kistan") == "kista");

    assert(stemSwedish("karl") == "karl");

    // -en
    assert(stemSwedish("grenen") == "gren");
    assert(stemSwedish("busen") == "bus");
    assert(stemSwedish("husen") == "hus");
    assert(stemSwedish("räven") == "räv");
    assert(stemSwedish("dunken") == "dunk");
    assert(stemSwedish("männen") == "män");
    assert(stemSwedish("manen") == "man");
    assert(stemSwedish("mannen") == "man");

    // -et
    assert(stemSwedish("skalet") == "skal");
    assert(stemSwedish("karet") == "kar");
    assert(stemSwedish("taket") == "tak");
    assert(stemSwedish("stinget") == "sting");

    assert(stemSwedish("äpplet") == "äpple");

    assert(stemSwedish("jakt") == "jakt");

    assert(stemSwedish("sot") == "sot");
    assert(stemSwedish("sotare") == "sot");

    // comparatives and superlatives
    assert(stemSwedish("klok") == "klok");
    assert(stemSwedish("klokare") == "klok");
    assert(stemSwedish("klokast") == "klok");

    assert(stemSwedish("stark") == "stark");
    assert(stemSwedish("starkare") == "stark");
    assert(stemSwedish("starkast") == "stark");

    assert(stemSwedish("kort") == "kort");
    assert(stemSwedish("kortare") == "kort");
    assert(stemSwedish("kortast") == "kort");

    assert(stemSwedish("rolig") == "rolig");
    assert(stemSwedish("roligare") == "rolig");
    assert(stemSwedish("roligast") == "rolig");

    assert(stemSwedish("dum") == "dum");
    assert(stemSwedish("dummare") == "dum");
    assert(stemSwedish("dummast") == "dum");
    assert(stemSwedish("dummaste") == "dum");
    assert(stemSwedish("senaste") == "sen");

    assert(stemSwedish("sanning") == "sann");
    assert(stemSwedish("sann") == "sann");
    assert(stemSwedish("sannare") == "sann");
    assert(stemSwedish("sannare") == "sann");

    // look like suffixed forms but have no vowel left in the stem
    assert(stemSwedish("stare") == "stare");
    assert(stemSwedish("kvast") == "kvast");

    assert(stemSwedish("täcket") == "täcke");
    assert(stemSwedish("räcket") == "räcke");

    // short words in -an are kept
    assert(stemSwedish("van") == "van");
    assert(stemSwedish("dan") == "dan");
    assert(stemSwedish("man") == "man");
    assert(stemSwedish("ovan") == "ovan");
    assert(stemSwedish("stan") == "stan");
    assert(stemSwedish("klan") == "klan");

    assert(stemSwedish("klockan") == "klocka");
    assert(stemSwedish("klockande") == "klocka");
    assert(stemSwedish("sockan") == "socka");
    assert(stemSwedish("rockan") == "rocka");
    assert(stemSwedish("rock") == "rock");

    // plurals, indefinite and definite
    assert(stemSwedish("agenter") == "agent");
    assert(stemSwedish("agenterna") == "agent");
    assert(stemSwedish("regenter") == "regent");
    assert(stemSwedish("regenterna") == "regent");

    assert(stemSwedish("brodern") == "broder");
    assert(stemSwedish("kärnan") == "kärna");

    assert(stemSwedish("skorna") == "sko");

    // past tense in -de
    assert(stemSwedish("inträffade") == "inträffa");
    assert(stemSwedish("roa") == "roa");
    assert(stemSwedish("roade") == "roa");
    assert(stemSwedish("hade") == "hade");
    assert(stemSwedish("hades") == "hades");

    assert(stemSwedish("fullt") == "full");

    assert(stemSwedish("kanaliserad") == "kanal");
    assert(stemSwedish("alkoholiserad") == "alkohol");

    assert(stemSwedish("roande") == "ro");

    // disabled:
    // assertEqual(stemSwedish("ror"), "ro");
    // assertEqual(stemSwedish("öbor"), "öbo");

    // present participles in -ande
    assert(stemSwedish("ande") == "ande");

    assert(stemSwedish("störande") == "störa");
    assert(stemSwedish("nekande") == "neka");
    assert(stemSwedish("jagande") == "jaga");
    assert(stemSwedish("stimulerande") == "stimulera");

    assert(stemSwedish("karlar") == "karl");
    assert(stemSwedish("lagar") == "lag");

    // possessives
    assert(stemSwedish("sina") == "sin");
    assert(stemSwedish("dina") == "din");
    assert(stemSwedish("mina") == "min");

    assert(stemSwedish("även") == "även");

    assert(stemSwedish("samtida") == "samtid");

    assert(stemSwedish("trattar") == "tratt");

    assert(stemSwedish("katter") == "katt");
    assert(stemSwedish("dagar") == "dag");
    assert(stemSwedish("öar") == "ö");
    assert(stemSwedish("åar") == "å");
    assert(stemSwedish("ängar") == "äng");

    assert(stemSwedish("spelar") == "spel");
    assert(stemSwedish("drar") == "dra");

    assert(stemSwedish("kullar") == "kull");
    assert(stemSwedish("kullarna") == "kull");

    assert(stemSwedish("mamma") == "mamma");

    assert(stemSwedish("bestyr") == "bestyr");

    assert(stemSwedish("krya") == "kry");
    assert(stemSwedish("nya") == "ny");

    assert(stemSwedish("lemmar") == "lem");

    // disabled:
    // assertEqual(stemSwedish("ämnar"), "ämna");
    // assert(stemSwedish("rämnar") == "rämna");
    // assert(stemSwedish("lämnar") == "lämna");
}
915
/** Stem the Norwegian word $(D s).
 *
 * Applies `skipOverBack` with the suffix `ede` to the local copy of
 * $(D s) and returns that copy.
 * NOTE(review): presumably `skipOverBack` drops a trailing `ede` when
 * present - confirm against `nxt.skip_ex`.
 */
auto ref stemNorvegian(S)(S s)
if (isSomeString!S)
{
    skipOverBack(s, `ede`); // result intentionally ignored; `s` is returned either way
    return s;
}
922
/** Stem $(D s) in language $(D lang).
 *
 * For `Lang.unknown` the English, Swedish and Norwegian stemmers are tried
 * in that order and the first result that differs from $(D s) is returned
 * together with the language that produced it. If none of them changes the
 * word, the (unchanged) result of the last attempt is returned, tagged
 * with `Lang.no`.
 *
 * Any language other than `sv` and `no` is stemmed with the English
 * `Stemmer` and tagged with $(D lang) as passed in.
 *
 * Params:
 *   s    = word to stem
 *   lang = language of $(D s)
 *
 * Returns: tuple of the stem and the language used.
 */
Tuple!(S, Lang) stemIn(S)(S s, Lang lang = Lang.init)
if (isSomeString!S)
{
    typeof(return) t;
    switch (lang) with (Lang)
    {
    case unknown:
        // Compare contents, not lengths: a stemmer may rewrite a word
        // without shortening it (e.g. a terminal y turned into i).
        t = s.stemIn(en);
        if (t[0] != s)
            return t;
        t = s.stemIn(sv);
        if (t[0] != s)
            return t;
        t = s.stemIn(no);
        if (t[0] != s)
            return t;
        break;
    case sv:
        t = tuple(s.stemSwedish, sv);
        break;
    case no:
        t = tuple(s.stemNorvegian, no);
        break;
    case en:
    default:
        // Instantiate for the caller's string type rather than for `string`.
        auto stemmer = new Stemmer!S();
        t = tuple(stemmer.stem(s), lang);
    }
    return t;
}
946
/** Destructively stem $(D s) in language $(D lang).
 *
 * $(D s) is replaced by the stem returned from `stemIn`.
 *
 * Params:
 *   s    = word to stem; overwritten with its stem
 *   lang = language of $(D s), forwarded to `stemIn`
 *
 * Returns: tuple of a flag telling whether $(D s) was changed and the
 * language reported by `stemIn`.
 */
Tuple!(bool, Lang) stemize(S)(ref S s, Lang lang = Lang.init)
if (isSomeString!S)
{
    auto t = s.stemIn(lang);
    // Compare contents rather than lengths so that same-length rewrites
    // are reported as a change too.
    bool changed = t[0] != s;
    s = t[0];
    return tuple(changed, t[1]);
}
956
/** Return the stem of $(D s) using a reduced set of Porter's rules.
 *
 * Step 1a strips plural endings (-sses, -ies, -s but not -ss). After that
 * at most one of the step 2 / step 3 rules is applied, the first one whose
 * suffix matches: -ational, -izer, -ator, -al, -able, -ate.
 *
 * See_Also: https://en.wikipedia.org/wiki/I_m_still_remembering
 * See_Also: https://en.wikipedia.org/wiki/Martin_Porter
 * See_Also: https://www.youtube.com/watch?v=2s7f8mBwnko&list=PL6397E4B26D00A269&index=4.
 */
S alternativePorterStemEnglish(S)(S s)
if (isSomeString!S)
{
    /* Step 1a: plurals */
    if (s.endsWith(`sses`) || s.endsWith(`ies`))
        s = s[0 .. $ - 2];
    else if (!s.endsWith(`ss`) && s.endsWith(`s`))
        s = s[0 .. $ - 1];

    /* Step 2 */
    if (s.endsWith(`ational`))
        return s[0 .. $ - 7] ~ `ate`;
    if (s.endsWith(`izer`))
        return s[0 .. $ - 1];
    if (s.endsWith(`ator`))
        return s[0 .. $ - 2] ~ `e`;

    /* Step 3: only reached when no step 2 rule matched */
    if (s.endsWith(`al`))
        return s[0 .. $ - 2] ~ `e`;
    if (s.endsWith(`able`))
        return s[0 .. $ - 4];
    if (s.endsWith(`ate`))
        return s[0 .. $ - 3] ~ `e`;

    return s;
}
983
/// Exercises each rule of `alternativePorterStemEnglish` once.
unittest
{
    // step 1a: plurals
    assert(alternativePorterStemEnglish(`caresses`) == `caress`);
    assert(alternativePorterStemEnglish(`ponies`) == `poni`);
    assert(alternativePorterStemEnglish(`caress`) == `caress`);
    assert(alternativePorterStemEnglish(`cats`) == `cat`);

    // step 2
    assert(alternativePorterStemEnglish(`relational`) == `relate`);
    assert(alternativePorterStemEnglish(`digitizer`) == `digitize`);
    assert(alternativePorterStemEnglish(`operator`) == `operate`);

    // step 3
    assert(alternativePorterStemEnglish(`revival`) == `revive`);
    assert(alternativePorterStemEnglish(`adjustable`) == `adjust`);
    assert(alternativePorterStemEnglish(`activate`) == `active`);
}