1 #!/usr/bin/env rdmd-dev
2 
3 module nxt.syllables;
4 
5 import nxt.languages : Lang;
6 import std.traits: isSomeString;
7 import std.uni: byGrapheme;
8 
9 /** Count Number of Syllables in $(D s) interpreted in language $(D lang).
10 
11     The Algorithm:
12 
13     - If number of letters <= 3 : return 1. Incorrect for Ira, weapon:usi.
14 
15     - If doesn’t end with “ted” or “tes” or “ses” or “ied” or “ies”, discard
16       “es” and “ed” at the end. If it has only 1 vowel or 1 set of consecutive
17       vowels, discard. (like “speed”, “fled” etc.)
18 
19      - Discard trailing “e”, except where ending is “le” and isn’t in the
20        le_except array
21 
22     - Check if consecutive vowels exists, triplets or pairs, count them as one.
23 
24     - Count remaining vowels in the word.
25 
26     - Add one if begins with “mc”
27 
    - Add one if ends with “y” but is not surrounded by vowel. (ex. “mickey”)
29 
30     - Add one if “y” is surrounded by non-vowels and is not in the last
31       word. (ex. “python”)
32 
33     - If begins with “tri-” or “bi-” and is followed by a vowel, add one. (so
34       that “ia” at “triangle” won’t be mistreated by step 4)
35 
36     - If ends with “-ian”, should be counted as two syllables, except for
37       “-tian” and “-cian”. (ex. “indian” and “politician” should be handled
38       differently and shouldn’t be mistreated by step 4)
39 
40     - If begins with “co-” and is followed by a vowel, check if it exists in the
41       double syllable dictionary, if not, check if in single dictionary and act
42       accordingly. (co_one and co_two dictionaries handle it. Ex. “coach” and
43       “coapt” shouldn’t be treated equally by step 4)
44 
45     - If starts with “pre-” and is followed by a vowel, check if exists in the
46       double syllable dictionary, if not, check if in single dictionary and act
47       accordingly. (similar to step 11, but very weak dictionary for the moment)
48 
49     - Check for “-n’t” and cross match with dictionary to add
50       syllable. (ex. “doesn’t”, “couldn’t”)
51 
52     - Handling the exceptional words. (ex. “serious”, “fortunately”)
53 
54     Like I said earlier, this isn’t perfect, so there are some steps to add or
55     modify, but it works just “fine”. Some exceptions should be added such as
56     “evacuate”, “ambulances”, “shuttled”, “anyone” etc… Also it can’t handle
    some compound words like “facebook”. Counting only “face” would result
58     correctly “1″, and “book” would also come out correct, but due to the “e”
59     letter not being detected as a “silent e”, “facebook” will return “3
60     syllables.”
61 
62     See_Also: http://eayd.in/?p=232
63     See_Also: http://forum.dlang.org/thread/ovzcetxbrdblpmyizdjr@forum.dlang.org#post-ovzcetxbrdblpmyizdjr:40forum.dlang.org
64  */
uint countSyllables(S)(S s, Lang lang = Lang.en)
if (isSomeString!S)
{
    import std.algorithm.searching : canFind, endsWith, startsWith;
    import std.conv : to;
    import std.string : toLower;

    // NOTE: `lang` is currently ignored; only the English heuristic described
    // in the function's documentation comment is implemented.

    // Words that need one extra syllable.
    static immutable dstring[] exceptionAdd = ["serious"d, "crucial"d];
    // Words that need one syllable less.
    static immutable dstring[] exceptionDel = ["fortunately"d, "unfortunately"d];
    // "co" + vowel prefixes pronounced as a single syllable.
    static immutable dstring[] coOne = [
        "cool"d, "coach"d, "coat"d, "coal"d, "count"d, "coin"d, "coarse"d,
        "coup"d, "coif"d, "cook"d, "coign"d, "coiffe"d, "coof"d, "court"d];
    // "co" + vowel prefixes pronounced as two syllables.
    static immutable dstring[] coTwo = ["coapt"d, "coed"d, "coinci"d];
    // "pre" + vowel prefixes pronounced as a single syllable.
    static immutable dstring[] preOne = ["preach"d];
    // Words ending in "le" whose trailing "e" is silent.
    static immutable dstring[] leExcept = [
        "whole"d, "mobile"d, "pole"d, "male"d, "female"d, "hale"d, "pale"d,
        "tale"d, "sale"d, "aisle"d, "whale"d, "while"d];
    // Contractions in which "n't" forms a syllable of its own.
    static immutable dstring[] negative = [
        "doesn't"d, "isn't"d, "shouldn't"d, "couldn't"d, "wouldn't"d];

    static bool isVowel(in dchar c) pure nothrow @safe @nogc
    {
        return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u';
    }

    // Work on code points so that lengths and indices mean "letters"
    // regardless of whether the input is a string, wstring or dstring.
    dstring word = s.toLower.to!dstring;

    // An empty word has no syllables.
    if (word.length == 0)
        return 0;

    // Rule 1: very short words are treated as a single syllable.
    // From here on `word.length >= 4`, so `word[2]`, `word[3]` and
    // `word[$ - 2]` are always valid.
    if (word.length <= 3)
        return 1;

    // True if the first 4, 5 or 6 letters of `word` appear in `list`.
    bool hasListedPrefix(const(dstring)[] list)
    {
        foreach (size_t n; 4 .. 7)
        {
            const end = n < word.length ? n : word.length;
            if (list.canFind(word[0 .. end]))
                return true;
        }
        return false;
    }

    // Single pass over the word gathering the counts the rules need.
    // For each run of k consecutive vowels there are k/2 non-overlapping
    // pairs and k/3 non-overlapping triplets.
    uint vowelCount = 0;     // total number of vowels
    uint pairCount = 0;      // non-overlapping vowel pairs
    uint tripletCount = 0;   // non-overlapping vowel triplets
    uint vowelThenOther = 0; // vowels directly followed by a non-vowel
    uint run = 0;            // length of the current vowel run
    foreach (const dchar c; word)
    {
        if (isVowel(c))
        {
            ++vowelCount;
            ++run;
        }
        else
        {
            if (run != 0)
                ++vowelThenOther;
            pairCount += run / 2;
            tripletCount += run / 3;
            run = 0;
        }
    }
    pairCount += run / 2;
    tripletCount += run / 3;

    uint added = 0;     // syllables added by the rules below
    uint discarded = 0; // vowels that do not form a syllable of their own

    // Rule 2: a trailing "es"/"ed" is silent unless the word ends with
    // "ted", "tes", "ses", "ied" or "ies", or it is the only vowel group
    // (like "speed", "fled").
    if (word.endsWith("es"d, "ed"d) &&
        (pairCount > 1 || vowelThenOther > 1) &&
        !word.endsWith("ted"d, "tes"d, "ses"d, "ied"d, "ies"d))
        ++discarded;

    // Rule 3: a trailing "e" is silent, except in a "le" ending that is
    // not listed in `leExcept`.
    if (word[$ - 1] == 'e' &&
        !(word.endsWith("le"d) && !leExcept.canFind(word)))
        ++discarded;

    // Rule 4: consecutive vowels (pairs or triplets) count as one.
    discarded += pairCount + tripletCount;

    // Rule 6: leading "mc" adds a syllable.
    if (word.startsWith("mc"d))
        ++added;

    // Rule 7: trailing "y" not preceded by a vowel adds a syllable.
    if (word[$ - 1] == 'y' && !isVowel(word[$ - 2]))
        ++added;

    // Rule 8: an inner "y" with non-vowels on both sides adds a syllable.
    foreach (immutable i; 1 .. word.length - 1)
    {
        if (word[i] == 'y' && !isVowel(word[i - 1]) && !isVowel(word[i + 1]))
            ++added;
    }

    // Rule 9: "tri"/"bi" followed by a vowel: the vowel pair that rule 4
    // merged is really two syllables (e.g. "triangle").
    if (word.startsWith("tri"d) && isVowel(word[3]))
        ++added;
    if (word.startsWith("bi"d) && isVowel(word[2]))
        ++added;

    // Rule 10: "-ian" is two syllables, except for "-cian" and "-tian".
    if (word.endsWith("ian"d) && !word.endsWith("cian"d, "tian"d))
        ++added;

    // Rule 11: "co" followed by a vowel is two syllables unless the prefix
    // is listed as single-syllable only.
    if (word.startsWith("co"d) && isVowel(word[2]))
    {
        if (hasListedPrefix(coTwo) || !hasListedPrefix(coOne))
            ++added;
    }

    // Rule 12: "pre" followed by a vowel is two syllables unless listed.
    if (word.startsWith("pre"d) && isVowel(word[3]))
    {
        const end = word.length < 6 ? word.length : 6;
        if (!preOne.canFind(word[0 .. end]))
            ++added;
    }

    // Rule 13: known "-n't" contractions gain a syllable.
    if (negative.canFind(word))
        ++added;

    // Rule 14: explicit exceptions.
    if (exceptionDel.canFind(word))
        ++discarded;
    if (exceptionAdd.canFind(word))
        ++added;

    // Rule 5 combined with the adjustments above. Computed in a signed type
    // and clamped so the unsigned result can never wrap around.
    const long total = cast(long) vowelCount - discarded + added;
    return total > 0 ? cast(uint) total : 0;
}

///
unittest
{
    assert(countSyllables("") == 0);
    assert(countSyllables("the") == 1);
    assert(countSyllables("Hello") == 2);
    assert(countSyllables("speed") == 1);
    assert(countSyllables("jumped") == 1);
    assert(countSyllables("coach") == 1);
    assert(countSyllables("mickey") == 2);
    assert(countSyllables("python"w) == 2);
    assert(countSyllables("indian"d) == 3);
    assert(countSyllables("triangle") == 3);
    assert(countSyllables("serious") == 3);
    assert(countSyllables("doesn't") == 2);
}
82 
83 /* what about the word ira? */
/* #1) if letters <= 3 : return 1 */
85 /*     if len(word) <= 3 : */
86 /* syls = 1 */
87 /* return syls */
88 
89 /* #2) if doesn't end with "ted" or "tes" or "ses" or "ied" or "ies", discard "es" and "ed" at the end. */
90 /*     # if it has only 1 vowel or 1 set of consecutive vowels, discard. (like "speed", "fled" etc.) */
91 
92 /*     if word[-2:] == "es" or word[-2:] == "ed" : */
93 /*         doubleAndtripple_1 = len(re.findall(r'[eaoui][eaoui]',word)) */
94 /*         if doubleAndtripple_1 > 1 or len(re.findall(r'[eaoui][^eaoui]',word)) > 1 : */
95 /*             if word[-3:] == "ted" or word[-3:] == "tes" or word[-3:] == "ses" or word[-3:] == "ied" or word[-3:] == "ies" : */
96 /*                 pass */
97 /*             else : */
98 /*                 disc+=1 */
99 
100 /*     #3) discard trailing "e", except where ending is "le" */
101 
102 /*     le_except = ['whole','mobile','pole','male','female','hale','pale','tale','sale','aisle','whale','while'] */
103 
104 /*     if word[-1:] == "e" : */
105 /*         if word[-2:] == "le" and word not in le_except : */
106 /*             pass */
107 
108 /*         else : */
109 /*             disc+=1 */
110 
111 /*     #4) check if consecutive vowels exists, triplets or pairs, count them as one. */
112 
113 /*     doubleAndtripple = len(re.findall(r'[eaoui][eaoui]',word)) */
114 /*     tripple = len(re.findall(r'[eaoui][eaoui][eaoui]',word)) */
115 /*     disc+=doubleAndtripple + tripple */
116 
117 /*     #5) count remaining vowels in word. */
118 /*     numVowels = len(re.findall(r'[eaoui]',word)) */
119 
120 /*     #6) add one if starts with "mc" */
121 /*     if word[:2] == "mc" : */
122 /*         syls+=1 */
123 
/*     #7) add one if ends with "y" but is not surrounded by vowel */
125 /*     if word[-1:] == "y" and word[-2] not in "aeoui" : */
126 /*         syls +=1 */
127 
128 /*     #8) add one if "y" is surrounded by non-vowels and is not in the last word. */
129 
130 /*     for i,j in enumerate(word) : */
131 /*         if j == "y" : */
132 /*             if (i != 0) and (i != len(word)-1) : */
133 /*                 if word[i-1] not in "aeoui" and word[i+1] not in "aeoui" : */
134 /*                     syls+=1 */
135 
136 /*     #9) if starts with "tri-" or "bi-" and is followed by a vowel, add one. */
137 
138 /*     if word[:3] == "tri" and word[3] in "aeoui" : */
139 /*         syls+=1 */
140 
141 /*     if word[:2] == "bi" and word[2] in "aeoui" : */
142 /*         syls+=1 */
143 
144 /*     #10) if ends with "-ian", should be counted as two syllables, except for "-tian" and "-cian" */
145 
146 /*     if word[-3:] == "ian" : */
147 /*     #and (word[-4:] != "cian" or word[-4:] != "tian") : */
148 /*         if word[-4:] == "cian" or word[-4:] == "tian" : */
149 /*             pass */
150 /*         else : */
151 /*             syls+=1 */
152 
153 /*     #11) if starts with "co-" and is followed by a vowel, check if exists in the double syllable dictionary, if not, check if in single dictionary and act accordingly. */
154 
155 /*     if word[:2] == "co" and word[2] in 'eaoui' : */
156 
157 /*         if word[:4] in co_two or word[:5] in co_two or word[:6] in co_two : */
158 /*             syls+=1 */
159 /*         elif word[:4] in co_one or word[:5] in co_one or word[:6] in co_one : */
160 /*             pass */
161 /*         else : */
162 /*             syls+=1 */
163 
164 /*     #12) if starts with "pre-" and is followed by a vowel, check if exists in the double syllable dictionary, if not, check if in single dictionary and act accordingly. */
165 
166 /*     if word[:3] == "pre" and word[3] in 'eaoui' : */
167 /*         if word[:6] in pre_one : */
168 /*             pass */
169 /*         else : */
170 /*             syls+=1 */
171 
172 /*     #13) check for "-n't" and cross match with dictionary to add syllable. */
173 
174 /*     negative = ["doesn't", "isn't", "shouldn't", "couldn't","wouldn't"] */
175 
176 /*     if word[-3:] == "n't" : */
177 /*         if word in negative : */
178 /*             syls+=1 */
179 /*         else : */
180 /*             pass */
181 
182 /*     #14) Handling the exceptional words. */
183 
184 /*     if word in exception_del : */
185 /*         disc+=1 */
186 
187 /*     if word in exception_add : */
188 /*         syls+=1 */
189 
190 /*     # calculate the output */
191 /*     return numVowels - disc + syls */