Line data Source code
/* phonetic.c - generic replacement algorithms for phonetic transformation
2 : Copyright (C) 2000 Bjoern Jacke
3 :
4 : This library is free software; you can redistribute it and/or
5 : modify it under the terms of the GNU Lesser General Public
6 : License version 2.1 as published by the Free Software Foundation;
7 :
8 : This library is distributed in the hope that it will be useful,
9 : but WITHOUT ANY WARRANTY; without even the implied warranty of
10 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 : Lesser General Public License for more details.
12 :
13 : You should have received a copy of the GNU Lesser General Public
14 : License along with this library; If not, see
15 : <http://www.gnu.org/licenses/>.
16 :
17 : Changelog:
18 :
19 : 2000-01-05 Bjoern Jacke <bjoern at j3e.de>
Initial Release inspired by the article about phonetic
21 : transformations out of c't 25/1999
22 :
23 : 2007-07-26 Bjoern Jacke <bjoern at j3e.de>
24 : Released under MPL/GPL/LGPL tri-license for Hunspell
25 :
26 : 2007-08-23 Laszlo Nemeth <nemeth at OOo>
27 : Porting from Aspell to Hunspell using C-like structs
28 : */
29 :
30 : #include <stdlib.h>
31 : #include <string.h>
32 : #include <stdio.h>
33 : #include <ctype.h>
34 :
35 : #include "csutil.hxx"
36 : #include "phonet.hxx"
37 :
38 0 : void init_phonet_hash(phonetable& parms) {
39 0 : for (int i = 0; i < HASHSIZE; i++) {
40 0 : parms.hash[i] = -1;
41 : }
42 :
43 0 : for (int i = 0; parms.rules[i][0] != '\0'; i += 2) {
44 : /** set hash value **/
45 0 : int k = (unsigned char)parms.rules[i][0];
46 :
47 0 : if (parms.hash[k] < 0) {
48 0 : parms.hash[k] = i;
49 : }
50 : }
51 0 : }
52 :
// Like strcpy, but safe when the two strings overlap, in either direction.
//
// Copies the NUL-terminated string at src (terminator included) to dest.
// memmove is used instead of a hand-written forward copy: a forward copy is
// only correct when dest < src, whereas memmove handles every overlap and
// gives the same result for the dest < src case.
static inline void strmove(char* dest, char* src) {
  memmove(dest, src, strlen(src) + 1);
}
60 :
// Alphabetic test that treats every byte with the high bit set as a letter.
//
// Bytes below 128 are classified by isalpha() and its result is returned
// unchanged (non-zero for letters, zero otherwise); every byte at or above
// 128 yields 1.
static int myisalpha(char ch) {
  const bool below_128 = (unsigned char)ch < 128;
  return below_128 ? isalpha(ch) : 1;
}
66 :
67 : /* Do phonetic transformation. */
68 : /* phonetic transcription algorithm */
69 : /* see: http://aspell.net/man-html/Phonetic-Code.html */
70 : /* convert string to uppercase before this call */
71 0 : std::string phonet(const std::string& inword, phonetable& parms) {
72 :
73 0 : int i, k = 0, p, z;
74 0 : int k0, n0, p0 = -333;
75 : char c;
76 : typedef unsigned char uchar;
77 :
78 0 : size_t len = inword.size();
79 0 : if (len > MAXPHONETUTF8LEN)
80 0 : return std::string();
81 : char word[MAXPHONETUTF8LEN + 1];
82 0 : strncpy(word, inword.c_str(), MAXPHONETUTF8LEN);
83 0 : word[MAXPHONETUTF8LEN] = '\0';
84 :
85 0 : std::string target;
86 : /** check word **/
87 0 : i = z = 0;
88 0 : while ((c = word[i]) != '\0') {
89 0 : int n = parms.hash[(uchar)c];
90 0 : int z0 = 0;
91 :
92 0 : if (n >= 0 && !parms.rules[n].empty()) {
93 : /** check all rules for the same letter **/
94 0 : while (parms.rules[n][0] == c) {
95 : /** check whole string **/
96 0 : k = 1; /** number of found letters **/
97 0 : p = 5; /** default priority **/
98 0 : const char*s = parms.rules[n].c_str();
99 0 : s++; /** important for (see below) "*(s-1)" **/
100 :
101 0 : while (*s != '\0' && word[i + k] == *s && !isdigit((unsigned char)*s) &&
102 0 : strchr("(-<^$", *s) == NULL) {
103 0 : k++;
104 0 : s++;
105 : }
106 0 : if (*s == '(') {
107 : /** check letters in "(..)" **/
108 0 : if (myisalpha(word[i + k]) // ...could be implied?
109 0 : && strchr(s + 1, word[i + k]) != NULL) {
110 0 : k++;
111 0 : while (*s != ')')
112 0 : s++;
113 0 : s++;
114 : }
115 : }
116 0 : p0 = (int)*s;
117 0 : k0 = k;
118 0 : while (*s == '-' && k > 1) {
119 0 : k--;
120 0 : s++;
121 : }
122 0 : if (*s == '<')
123 0 : s++;
124 0 : if (isdigit((unsigned char)*s)) {
125 : /** determine priority **/
126 0 : p = *s - '0';
127 0 : s++;
128 : }
129 0 : if (*s == '^' && *(s + 1) == '^')
130 0 : s++;
131 :
132 0 : if (*s == '\0' || (*s == '^' && (i == 0 || !myisalpha(word[i - 1])) &&
133 0 : (*(s + 1) != '$' || (!myisalpha(word[i + k0])))) ||
134 0 : (*s == '$' && i > 0 && myisalpha(word[i - 1]) &&
135 0 : (!myisalpha(word[i + k0])))) {
136 : /** search for followup rules, if: **/
137 : /** parms.followup and k > 1 and NO '-' in searchstring **/
138 0 : char c0 = word[i + k - 1];
139 0 : n0 = parms.hash[(uchar)c0];
140 :
141 : // if (parms.followup && k > 1 && n0 >= 0
142 0 : if (k > 1 && n0 >= 0 && p0 != (int)'-' && word[i + k] != '\0' && !parms.rules[n0].empty()) {
143 : /** test follow-up rule for "word[i+k]" **/
144 0 : while (parms.rules[n0][0] == c0) {
145 : /** check whole string **/
146 0 : k0 = k;
147 0 : p0 = 5;
148 0 : s = parms.rules[n0].c_str();
149 0 : s++;
150 0 : while (*s != '\0' && word[i + k0] == *s &&
151 0 : !isdigit((unsigned char)*s) &&
152 0 : strchr("(-<^$", *s) == NULL) {
153 0 : k0++;
154 0 : s++;
155 : }
156 0 : if (*s == '(') {
157 : /** check letters **/
158 0 : if (myisalpha(word[i + k0]) &&
159 0 : strchr(s + 1, word[i + k0]) != NULL) {
160 0 : k0++;
161 0 : while (*s != ')' && *s != '\0')
162 0 : s++;
163 0 : if (*s == ')')
164 0 : s++;
165 : }
166 : }
167 0 : while (*s == '-') {
168 : /** "k0" gets NOT reduced **/
169 : /** because "if (k0 == k)" **/
170 0 : s++;
171 : }
172 0 : if (*s == '<')
173 0 : s++;
174 0 : if (isdigit((unsigned char)*s)) {
175 0 : p0 = *s - '0';
176 0 : s++;
177 : }
178 :
179 0 : if (*s == '\0'
180 : /** *s == '^' cuts **/
181 0 : || (*s == '$' && !myisalpha(word[i + k0]))) {
182 0 : if (k0 == k) {
183 : /** this is just a piece of the string **/
184 0 : n0 += 2;
185 0 : continue;
186 : }
187 :
188 0 : if (p0 < p) {
189 : /** priority too low **/
190 0 : n0 += 2;
191 0 : continue;
192 : }
193 : /** rule fits; stop search **/
194 0 : break;
195 : }
196 0 : n0 += 2;
197 : } /** End of "while (parms.rules[n0][0] == c0)" **/
198 :
199 0 : if (p0 >= p && parms.rules[n0][0] == c0) {
200 0 : n += 2;
201 0 : continue;
202 : }
203 : } /** end of follow-up stuff **/
204 :
205 : /** replace string **/
206 0 : s = parms.rules[n + 1].c_str();
207 0 : p0 = (!parms.rules[n].empty() &&
208 0 : strchr(parms.rules[n].c_str() + 1, '<') != NULL)
209 0 : ? 1
210 : : 0;
211 0 : if (p0 == 1 && z == 0) {
212 : /** rule with '<' is used **/
213 0 : if (!target.empty() && *s != '\0' &&
214 0 : (target[target.size()-1] == c || target[target.size()-1] == *s)) {
215 0 : target.erase(target.size() - 1);
216 : }
217 0 : z0 = 1;
218 0 : z = 1;
219 0 : k0 = 0;
220 0 : while (*s != '\0' && word[i + k0] != '\0') {
221 0 : word[i + k0] = *s;
222 0 : k0++;
223 0 : s++;
224 : }
225 0 : if (k > k0)
226 0 : strmove(&word[0] + i + k0, &word[0] + i + k);
227 :
228 : /** new "actual letter" **/
229 0 : c = word[i];
230 : } else { /** no '<' rule used **/
231 0 : i += k - 1;
232 0 : z = 0;
233 0 : while (*s != '\0' && *(s + 1) != '\0' && target.size() < len) {
234 0 : if (target.empty() || target[target.size()-1] != *s) {
235 0 : target.push_back(*s);
236 : }
237 0 : s++;
238 : }
239 : /** new "actual letter" **/
240 0 : c = *s;
241 0 : if (!parms.rules[n].empty() &&
242 0 : strstr(parms.rules[n].c_str() + 1, "^^") != NULL) {
243 0 : if (c != '\0') {
244 0 : target.push_back(c);
245 : }
246 0 : strmove(&word[0], &word[0] + i + 1);
247 0 : i = 0;
248 0 : z0 = 1;
249 : }
250 : }
251 0 : break;
252 : } /** end of follow-up stuff **/
253 0 : n += 2;
254 : } /** end of while (parms.rules[n][0] == c) **/
255 : } /** end of if (n >= 0) **/
256 0 : if (z0 == 0) {
257 0 : if (k && !p0 && target.size() < len && c != '\0') {
258 : /** condense only double letters **/
259 0 : target.push_back(c);
260 : /// printf("\n setting \n");
261 : }
262 :
263 0 : i++;
264 0 : z = 0;
265 0 : k = 0;
266 : }
267 : } /** end of while ((c = word[i]) != '\0') **/
268 :
269 0 : return target;
270 : } /** end of function "phonet" **/
|