Line data Source code
1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* This Source Code Form is subject to the terms of the Mozilla Public
3 : * License, v. 2.0. If a copy of the MPL was not distributed with this
4 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 :
6 : #include "GreekCasing.h"
7 : #include "nsUnicharUtils.h"
8 : #include "nsUnicodeProperties.h"
9 :
10 : // Custom uppercase mapping for Greek; see bug 307039 for details
// Code points referenced by GreekCasing::UpperCase below.
//
// Lowercase vowels. Each vowel is listed plain, then with TONOS (a U+03xx
// value) and with OXIA (a U+1Fxx value); iota and upsilon additionally have
// DIALYTIKA forms, alone and combined with tonos/oxia.
11 : #define GREEK_LOWER_ALPHA 0x03B1
12 : #define GREEK_LOWER_ALPHA_TONOS 0x03AC
13 : #define GREEK_LOWER_ALPHA_OXIA 0x1F71
14 : #define GREEK_LOWER_EPSILON 0x03B5
15 : #define GREEK_LOWER_EPSILON_TONOS 0x03AD
16 : #define GREEK_LOWER_EPSILON_OXIA 0x1F73
17 : #define GREEK_LOWER_ETA 0x03B7
18 : #define GREEK_LOWER_ETA_TONOS 0x03AE
19 : #define GREEK_LOWER_ETA_OXIA 0x1F75
20 : #define GREEK_LOWER_IOTA 0x03B9
21 : #define GREEK_LOWER_IOTA_TONOS 0x03AF
22 : #define GREEK_LOWER_IOTA_OXIA 0x1F77
23 : #define GREEK_LOWER_IOTA_DIALYTIKA 0x03CA
24 : #define GREEK_LOWER_IOTA_DIALYTIKA_TONOS 0x0390
25 : #define GREEK_LOWER_IOTA_DIALYTIKA_OXIA 0x1FD3
26 : #define GREEK_LOWER_OMICRON 0x03BF
27 : #define GREEK_LOWER_OMICRON_TONOS 0x03CC
28 : #define GREEK_LOWER_OMICRON_OXIA 0x1F79
29 : #define GREEK_LOWER_UPSILON 0x03C5
30 : #define GREEK_LOWER_UPSILON_TONOS 0x03CD
31 : #define GREEK_LOWER_UPSILON_OXIA 0x1F7B
32 : #define GREEK_LOWER_UPSILON_DIALYTIKA 0x03CB
33 : #define GREEK_LOWER_UPSILON_DIALYTIKA_TONOS 0x03B0
34 : #define GREEK_LOWER_UPSILON_DIALYTIKA_OXIA 0x1FE3
35 : #define GREEK_LOWER_OMEGA 0x03C9
36 : #define GREEK_LOWER_OMEGA_TONOS 0x03CE
37 : #define GREEK_LOWER_OMEGA_OXIA 0x1F7D
// Uppercase vowels without an accent (plain, plus the dialytika forms of
// iota and upsilon). These are the values UpperCase returns for vowels that
// have had their accent stripped.
38 : #define GREEK_UPPER_ALPHA 0x0391
39 : #define GREEK_UPPER_EPSILON 0x0395
40 : #define GREEK_UPPER_ETA 0x0397
41 : #define GREEK_UPPER_IOTA 0x0399
42 : #define GREEK_UPPER_IOTA_DIALYTIKA 0x03AA
43 : #define GREEK_UPPER_OMICRON 0x039F
44 : #define GREEK_UPPER_UPSILON 0x03A5
45 : #define GREEK_UPPER_UPSILON_DIALYTIKA 0x03AB
46 : #define GREEK_UPPER_OMEGA 0x03A9
// Uppercase vowels that already carry a tonos/oxia in the input.
47 : #define GREEK_UPPER_ALPHA_TONOS 0x0386
48 : #define GREEK_UPPER_ALPHA_OXIA 0x1FBB
49 : #define GREEK_UPPER_EPSILON_TONOS 0x0388
50 : #define GREEK_UPPER_EPSILON_OXIA 0x1FC9
51 : #define GREEK_UPPER_ETA_TONOS 0x0389
52 : #define GREEK_UPPER_ETA_OXIA 0x1FCB
53 : #define GREEK_UPPER_IOTA_TONOS 0x038A
54 : #define GREEK_UPPER_IOTA_OXIA 0x1FDB
55 : #define GREEK_UPPER_OMICRON_TONOS 0x038C
56 : #define GREEK_UPPER_OMICRON_OXIA 0x1FF9
57 : #define GREEK_UPPER_UPSILON_TONOS 0x038E
58 : #define GREEK_UPPER_UPSILON_OXIA 0x1FEB
59 : #define GREEK_UPPER_OMEGA_TONOS 0x038F
60 : #define GREEK_UPPER_OMEGA_OXIA 0x1FFB
// Combining marks that UpperCase handles explicitly (decomposed input).
61 : #define COMBINING_ACUTE_ACCENT 0x0301
62 : #define COMBINING_DIAERESIS 0x0308
63 : #define COMBINING_ACUTE_TONE_MARK 0x0341
64 : #define COMBINING_GREEK_DIALYTIKA_TONOS 0x0344
65 :
65 :
66 : namespace mozilla {
67 :
// Greek-specific uppercasing of a single code point, driven by a small state
// machine that the caller carries from one call to the next in aState.
//
// Accents (tonos/oxia, precomposed or as a combining acute) are stripped from
// vowels, and a lowercase iota or upsilon that follows certain accented
// vowels is returned with a dialytika instead.
//
// aCh               the code point to convert.
// aState            in/out: the state left by the previous call; updated to
//                   reflect aCh (last vowel seen, whether it was accented,
//                   diaeresis seen, in a word, or at start).
// aMarkEtaPos       out: always reset to false on entry; set to true only when
//                   an eta-with-tonos is seen in kStart and is returned still
//                   accented.
// aUpdateMarkedEta  out: always reset to false on entry; set to true when the
//                   previous call left kEtaAccMarked and aCh is a letter or a
//                   mark.
//                   NOTE(review): presumably the caller then replaces the
//                   marked eta-tonos with a plain eta -- confirm against the
//                   caller.
//
// Returns the uppercase code point, or uint32_t(-1) when aCh is a combining
// acute that must be omitted from the result string.
68 : uint32_t
69 0 : GreekCasing::UpperCase(uint32_t aCh, GreekCasing::State& aState,
70 : bool& aMarkEtaPos, bool& aUpdateMarkedEta)
71 : {
72 0 : aMarkEtaPos = false;
73 0 : aUpdateMarkedEta = false;
74 :
// The general category is needed twice: to resolve a pending marked eta just
// below, and to pick kInWord/kStart for characters not handled by the switch.
75 0 : uint8_t category = unicode::GetGeneralCategory(aCh);
76 :
// The previous call returned an eta that kept its tonos provisionally. If the
// current character is a letter or mark, flag that via aUpdateMarkedEta.
// Either way, from here on the state is that of an ordinary accented eta.
77 0 : if (aState == kEtaAccMarked) {
78 0 : switch (category) {
79 : case HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER:
80 : case HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER:
81 : case HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER:
82 : case HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER:
83 : case HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER:
84 : case HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK:
85 : case HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK:
86 : case HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK:
87 0 : aUpdateMarkedEta = true;
88 0 : break;
89 : default:
90 0 : break;
91 : }
92 0 : aState = kEtaAcc;
93 : }
94 :
// Greek-specific handling. Every case either returns directly or breaks out
// to the generic category-based handling after the switch.
95 0 : switch (aCh) {
// unaccented vowels: record which vowel was seen so that a following
// combining acute can be recognised and removed
96 : case GREEK_UPPER_ALPHA:
97 : case GREEK_LOWER_ALPHA:
98 0 : aState = kAlpha;
99 0 : return GREEK_UPPER_ALPHA;
100 :
101 : case GREEK_UPPER_EPSILON:
102 : case GREEK_LOWER_EPSILON:
103 0 : aState = kEpsilon;
104 0 : return GREEK_UPPER_EPSILON;
105 :
106 : case GREEK_UPPER_ETA:
107 : case GREEK_LOWER_ETA:
108 0 : aState = kEta;
109 0 : return GREEK_UPPER_ETA;
110 :
// uppercase iota is never given a dialytika here; only the lowercase form
// (handled further down) takes part in the diphthong check
111 : case GREEK_UPPER_IOTA:
112 0 : aState = kIota;
113 0 : return GREEK_UPPER_IOTA;
114 :
115 : case GREEK_UPPER_OMICRON:
116 : case GREEK_LOWER_OMICRON:
117 0 : aState = kOmicron;
118 0 : return GREEK_UPPER_OMICRON;
119 :
// uppercase upsilon: remember whether it directly follows an unaccented
// omicron, so that an accent on it is treated as completing a diphthong
120 : case GREEK_UPPER_UPSILON:
121 0 : switch (aState) {
122 : case kOmicron:
123 0 : aState = kOmicronUpsilon;
124 0 : break;
125 : default:
126 0 : aState = kUpsilon;
127 0 : break;
128 : }
129 0 : return GREEK_UPPER_UPSILON;
130 :
131 : case GREEK_UPPER_OMEGA:
132 : case GREEK_LOWER_OMEGA:
133 0 : aState = kOmega;
134 0 : return GREEK_UPPER_OMEGA;
135 :
136 : // iota and upsilon may be the second vowel of a diphthong
// lowercase iota directly after an accented alpha, epsilon, omicron or
// upsilon is returned with a dialytika
137 : case GREEK_LOWER_IOTA:
138 0 : switch (aState) {
139 : case kAlphaAcc:
140 : case kEpsilonAcc:
141 : case kOmicronAcc:
142 : case kUpsilonAcc:
143 0 : aState = kInWord;
144 0 : return GREEK_UPPER_IOTA_DIALYTIKA;
145 : default:
146 0 : break;
147 : }
148 0 : aState = kIota;
149 0 : return GREEK_UPPER_IOTA;
150 :
// lowercase upsilon directly after an accented alpha, epsilon, eta or
// omicron is returned with a dialytika; otherwise track omicron+upsilon as
// for the uppercase form above
151 : case GREEK_LOWER_UPSILON:
152 0 : switch (aState) {
153 : case kAlphaAcc:
154 : case kEpsilonAcc:
155 : case kEtaAcc:
156 : case kOmicronAcc:
157 0 : aState = kInWord;
158 0 : return GREEK_UPPER_UPSILON_DIALYTIKA;
159 : case kOmicron:
160 0 : aState = kOmicronUpsilon;
161 0 : break;
162 : default:
163 0 : aState = kUpsilon;
164 0 : break;
165 : }
166 0 : return GREEK_UPPER_UPSILON;
167 :
// an explicit diaeresis (precomposed or combining) is kept; the standard
// uppercase mapping is used, and the state lets a following combining acute
// be dropped
168 : case GREEK_UPPER_IOTA_DIALYTIKA:
169 : case GREEK_LOWER_IOTA_DIALYTIKA:
170 : case GREEK_UPPER_UPSILON_DIALYTIKA:
171 : case GREEK_LOWER_UPSILON_DIALYTIKA:
172 : case COMBINING_DIAERESIS:
173 0 : aState = kDiaeresis;
174 0 : return ToUpperCase(aCh);
175 :
176 : // remove accent if it follows a vowel or diaeresis,
177 : // and set appropriate state for diphthong detection
// (in any other state the inner switch falls out and the accent is handled
// by the generic code after the outer switch)
178 : case COMBINING_ACUTE_ACCENT:
179 : case COMBINING_ACUTE_TONE_MARK:
180 0 : switch (aState) {
181 : case kAlpha:
182 0 : aState = kAlphaAcc;
183 0 : return uint32_t(-1); // omit this char from result string
184 : case kEpsilon:
185 0 : aState = kEpsilonAcc;
186 0 : return uint32_t(-1);
187 : case kEta:
188 0 : aState = kEtaAcc;
189 0 : return uint32_t(-1);
190 : case kIota:
191 0 : aState = kIotaAcc;
192 0 : return uint32_t(-1);
193 : case kOmicron:
194 0 : aState = kOmicronAcc;
195 0 : return uint32_t(-1);
196 : case kUpsilon:
197 0 : aState = kUpsilonAcc;
198 0 : return uint32_t(-1);
199 : case kOmicronUpsilon:
200 0 : aState = kInWord; // this completed a diphthong
201 0 : return uint32_t(-1);
202 : case kOmega:
203 0 : aState = kOmegaAcc;
204 0 : return uint32_t(-1);
205 : case kDiaeresis:
206 0 : aState = kInWord;
207 0 : return uint32_t(-1);
208 : default:
209 0 : break;
210 : }
211 0 : break;
212 :
213 : // combinations with diaeresis+accent just strip the accent,
214 : // and reset to the in-word state (don't form diphthong with following vowel)
215 : case GREEK_LOWER_IOTA_DIALYTIKA_TONOS:
216 : case GREEK_LOWER_IOTA_DIALYTIKA_OXIA:
217 0 : aState = kInWord;
218 0 : return GREEK_UPPER_IOTA_DIALYTIKA;
219 :
220 : case GREEK_LOWER_UPSILON_DIALYTIKA_TONOS:
221 : case GREEK_LOWER_UPSILON_DIALYTIKA_OXIA:
222 0 : aState = kInWord;
223 0 : return GREEK_UPPER_UPSILON_DIALYTIKA;
224 :
// the combined dialytika-tonos mark is reduced to a bare combining diaeresis
225 : case COMBINING_GREEK_DIALYTIKA_TONOS:
226 0 : aState = kInWord;
227 0 : return COMBINING_DIAERESIS;
228 :
229 : // strip accents from vowels, and note the vowel seen so that we can detect
230 : // diphthongs where diaeresis needs to be added
231 : case GREEK_LOWER_ALPHA_TONOS:
232 : case GREEK_LOWER_ALPHA_OXIA:
233 : case GREEK_UPPER_ALPHA_TONOS:
234 : case GREEK_UPPER_ALPHA_OXIA:
235 0 : aState = kAlphaAcc;
236 0 : return GREEK_UPPER_ALPHA;
237 :
238 : case GREEK_LOWER_EPSILON_TONOS:
239 : case GREEK_LOWER_EPSILON_OXIA:
240 : case GREEK_UPPER_EPSILON_TONOS:
241 : case GREEK_UPPER_EPSILON_OXIA:
242 0 : aState = kEpsilonAcc;
243 0 : return GREEK_UPPER_EPSILON;
244 :
// eta with tonos is special only in kStart: it is returned with the tonos
// intact and flagged via aMarkEtaPos, and the next call decides (through
// aUpdateMarkedEta) whether that was right. The oxia forms below never take
// this path and are always stripped.
245 : case GREEK_LOWER_ETA_TONOS:
246 : case GREEK_UPPER_ETA_TONOS:
247 0 : if (aState == kStart) {
248 0 : aState = kEtaAccMarked;
249 0 : aMarkEtaPos = true; // mark in case we need to remove the tonos later
250 0 : return GREEK_UPPER_ETA_TONOS; // treat as disjunctive eta for now
251 : }
252 : // if not in initial state, fall through to strip the accent
253 : MOZ_FALLTHROUGH;
254 :
255 : case GREEK_LOWER_ETA_OXIA:
256 : case GREEK_UPPER_ETA_OXIA:
257 0 : aState = kEtaAcc;
258 0 : return GREEK_UPPER_ETA;
259 :
260 : case GREEK_LOWER_IOTA_TONOS:
261 : case GREEK_LOWER_IOTA_OXIA:
262 : case GREEK_UPPER_IOTA_TONOS:
263 : case GREEK_UPPER_IOTA_OXIA:
264 0 : aState = kIotaAcc;
265 0 : return GREEK_UPPER_IOTA;
266 :
267 : case GREEK_LOWER_OMICRON_TONOS:
268 : case GREEK_LOWER_OMICRON_OXIA:
269 : case GREEK_UPPER_OMICRON_TONOS:
270 : case GREEK_UPPER_OMICRON_OXIA:
271 0 : aState = kOmicronAcc;
272 0 : return GREEK_UPPER_OMICRON;
273 :
// accented upsilon right after an unaccented omicron ends the diphthong, so
// no kUpsilonAcc state is recorded in that case
274 : case GREEK_LOWER_UPSILON_TONOS:
275 : case GREEK_LOWER_UPSILON_OXIA:
276 : case GREEK_UPPER_UPSILON_TONOS:
277 : case GREEK_UPPER_UPSILON_OXIA:
278 0 : switch (aState) {
279 : case kOmicron:
280 0 : aState = kInWord; // this completed a diphthong
281 0 : break;
282 : default:
283 0 : aState = kUpsilonAcc;
284 0 : break;
285 : }
286 0 : return GREEK_UPPER_UPSILON;
287 :
288 : case GREEK_LOWER_OMEGA_TONOS:
289 : case GREEK_LOWER_OMEGA_OXIA:
290 : case GREEK_UPPER_OMEGA_TONOS:
291 : case GREEK_UPPER_OMEGA_OXIA:
292 0 : aState = kOmegaAcc;
293 0 : return GREEK_UPPER_OMEGA;
294 : }
295 :
296 : // all other characters just reset the state to either kStart or kInWord,
297 : // and use standard mappings
// (letters and marks count as being inside a word; everything else returns
// the machine to kStart)
298 0 : switch (category) {
299 : case HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER:
300 : case HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER:
301 : case HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER:
302 : case HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER:
303 : case HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER:
304 : case HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK:
305 : case HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK:
306 : case HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK:
307 0 : aState = kInWord;
308 0 : break;
309 : default:
310 0 : aState = kStart;
311 0 : break;
312 : }
313 :
314 0 : return ToUpperCase(aCh);
315 : }
316 :
317 : } // namespace mozilla
|