Line data Source code
1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* This Source Code Form is subject to the terms of the Mozilla Public
3 : * License, v. 2.0. If a copy of the MPL was not distributed with this
4 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 :
6 : /******************************************************************************
7 :
8 : This file provides a finite state machine to support Irish Gaelic uppercasing
9 : rules.
10 :
11 : The caller will need to iterate through a string, passing a State variable
12 : along with the current character to each UpperCase call and checking the flags
13 : that are returned:
14 :
15 : If aMarkPos is true, caller must remember the current index in the string as
16 : a possible target for a future action.
17 :
18 : If aAction is non-zero, then one or more characters from the marked index are
19 : to be modified:
20 : 1 lowercase the marked letter
21 : 2 lowercase the marked letter and its successor
22 : 3 lowercase the marked letter, and delete its successor
23 :
24 :
25 : ### Rules from https://bugzilla.mozilla.org/show_bug.cgi?id=1014639,
26 : ### comments 1 and 4:
27 :
28 : v = [a,á,e,é,i,í,o,ó,u,ú]
29 : V = [A,Á,E,É,I,Í,O,Ó,U,Ú]
30 :
31 : bhf -> bhF
32 : bhF -> bhF
33 : bp -> bP
34 : bP -> bP
35 : dt -> dT
36 : dT -> dT
37 : gc -> gC
38 : gC -> gC
39 : h{V} -> h{V}
40 : mb -> mB
41 : mB -> mB
42 : n-{v} -> n{V}
43 : n{V} -> n{V}
44 : nd -> nD
45 : nD -> nD
46 : ng -> nG
47 : nG -> nG
48 : t-{v} -> t{V}
49 : t{V} -> t{V}
50 : ts{v} -> tS{V}
51 : tS{v} -> tS{V}
52 : tS{V} -> tS{V}
53 : tsl -> tSL
54 : tSl -> tSL
55 : tSL -> tSL
56 : tsn -> tSN
57 : tSn -> tSN
58 : tSN -> tSN
59 : tsr -> tSR
60 : tSr -> tSR
61 : tSR -> tSR
62 :
63 : ### Create table of states and actions for each input class.
64 :
65 : Start (non-word) state is #; generic in-word state is _, once we know there's
66 : no special action to do in this word.
67 :
68 : # _ b bh d g h m n n- t t- ts
69 : input\state
70 : b b' _ _ _ _ _ _ 1 _ _ _ _ _
71 : B _ _ _ _ _ _ _ 1 _ _ _ _ _
72 : c _ _ _ _ _ 1 _ _ _ _ _ _ _
73 : C _ _ _ _ _ 1 _ _ _ _ _ _ _
74 : d d' _ _ _ _ _ _ _ 1 _ _ _ _
75 : D _ _ _ _ _ _ _ _ 1 _ _ _ _
76 : f _ _ _ 2 _ _ _ _ _ _ _ _ _
77 : F _ _ _ 2 _ _ _ _ _ _ _ _ _
78 : g g' _ _ _ _ _ _ _ 1 _ _ _ _
79 : G _ _ _ _ _ _ _ _ 1 _ _ _ _
80 : h h' _ bh _ _ _ _ _ _ _ _ _ _
81 : l _ _ _ _ _ _ _ _ _ _ _ _ 1
82 : L _ _ _ _ _ _ _ _ _ _ _ _ 1
83 : m m' _ _ _ _ _ _ _ _ _ _ _ _
84 : n n' _ _ _ _ _ _ _ _ _ _ _ 1
85 : N _ _ _ _ _ _ _ _ _ _ _ _ 1
86 : p _ _ 1 _ _ _ _ _ _ _ _ _ _
87 : P _ _ 1 _ _ _ _ _ _ _ _ _ _
88 : r _ _ _ _ _ _ _ _ _ _ _ _ 1
89 : R _ _ _ _ _ _ _ _ _ _ _ _ 1
90 : s _ _ _ _ _ _ _ _ _ _ ts _ _
91 : S _ _ _ _ _ _ _ _ _ _ ts _ _
92 : t t' _ _ _ 1 _ _ _ _ _ _ _ _
93 : T _ _ _ _ 1 _ _ _ _ _ _ _ _
94 : vowel _ _ _ _ _ _ _ _ _ 1d _ 1d 1
95 : Vowel _ _ _ _ _ _ 1 _ 1 _ 1 _ 1
96 : hyph _ _ _ _ _ _ _ _ n- _ t- _ _
97 : letter _ _ _ _ _ _ _ _ _ _ _ _ _
98 : other # # # # # # # # # # # # #
99 :
100 : Actions:
101 : 1 lowercase one letter at start of word
102 : 2 lowercase two letters at start of word
103 : 1d lowercase one letter at start of word, and delete next
104 : (and then go to state _, nothing further to do in this word)
105 :
106 : else just go to the given state; suffix ' indicates mark start-of-word.
107 :
108 : ### Consolidate identical states and classes:
109 :
110 : 0 1 2 3 4 5 6 7 8 9 A B
111 : # _ b bh d g h m n [nt]- t ts
112 : input\state
113 : b b' _ _ _ _ _ _ 1 _ _ _ _
114 : B _ _ _ _ _ _ _ 1 _ _ _ _
115 : [cC] _ _ _ _ _ 1 _ _ _ _ _ _
116 : d d' _ _ _ _ _ _ _ 1 _ _ _
117 : [DG] _ _ _ _ _ _ _ _ 1 _ _ _
118 : [fF] _ _ _ 2 _ _ _ _ _ _ _ _
119 : g g' _ _ _ _ _ _ _ 1 _ _ _
120 : h h' _ bh _ _ _ _ _ _ _ _ _
121 : [lLNrR] _ _ _ _ _ _ _ _ _ _ _ 1
122 : m m' _ _ _ _ _ _ _ _ _ _ _
123 : n n' _ _ _ _ _ _ _ _ _ _ 1
124 : [pP] _ _ 1 _ _ _ _ _ _ _ _ _
125 : [sS] _ _ _ _ _ _ _ _ _ _ ts _
126 : t t' _ _ _ 1 _ _ _ _ _ _ _
127 : T _ _ _ _ 1 _ _ _ _ _ _ _
128 : vowel _ _ _ _ _ _ _ _ _ 1d _ 1
129 : Vowel _ _ _ _ _ _ 1 _ 1 _ 1 1
130 : hyph _ _ _ _ _ _ _ _ [nt]- _ [nt]- _
131 : letter _ _ _ _ _ _ _ _ _ _ _ _
132 : other # # # # # # # # # # # #
133 :
134 : So we have 20 input classes, and 12 states.
135 :
136 : State table array will contain bytes that encode action and new state:
137 :
138 : 0x80 - bit flag: mark start-of-word position
139 : 0x40 - currently unused
140 : 0x30 - action mask: 4 values
141 : 0x00 - do nothing
142 : 0x10 - lowercase one letter
143 : 0x20 - lowercase two letters
144 : 0x30 - lowercase one, delete one
145 : 0x0F - next-state mask
146 : ******************************************************************************/
147 :
148 : #include "IrishCasing.h"
149 :
150 : #include "nsUnicodeProperties.h"
151 : #include "nsUnicharUtils.h"
152 :
153 : namespace mozilla {
154 :
155 : const uint8_t
156 : IrishCasing::sUppercaseStateTable[kNumClasses][kNumStates] = {
157 : // # _ b bh d g h m n [nt]- t ts
158 : { 0x82, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01 }, // b
159 : { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01 }, // B
160 : { 0x01, 0x01, 0x01, 0x01, 0x01, 0x10, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // [cC]
161 : { 0x84, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01 }, // d
162 : { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01 }, // [DG]
163 : { 0x01, 0x01, 0x01, 0x21, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // [fF]
164 : { 0x85, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01 }, // g
165 : { 0x86, 0x01, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // h
166 : { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11 }, // [lLNrR]
167 : { 0x87, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // m
168 : { 0x88, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11 }, // n
169 : { 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // [pP]
170 : { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x0B, 0x01 }, // [sS]
171 : { 0x8A, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // t
172 : { 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // T
173 : { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x31, 0x01, 0x11 }, // vowel
174 : { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x11, 0x01, 0x11, 0x11 }, // Vowel
175 : { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x09, 0x01, 0x09, 0x01 }, // hyph
176 : { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // letter
177 : { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } // other
178 : };
179 :
// Non-ASCII code points that the character classifier has to recognise.
// These are typed constants rather than preprocessor macros so that they obey
// normal scoping rules; as const integrals with constant initialisers they
// can still be used in comparisons and in `case` labels.

// Hyphens that, like ASCII '-', may separate an n-/t- prefix from a vowel.
static const uint32_t HYPHEN          = 0x2010; // U+2010 HYPHEN
static const uint32_t NO_BREAK_HYPHEN = 0x2011; // U+2011 NON-BREAKING HYPHEN

// Lowercase vowels carrying an acute accent.
static const uint32_t a_ACUTE = 0x00e1;
static const uint32_t e_ACUTE = 0x00e9;
static const uint32_t i_ACUTE = 0x00ed;
static const uint32_t o_ACUTE = 0x00f3;
static const uint32_t u_ACUTE = 0x00fa;

// Uppercase vowels carrying an acute accent.
static const uint32_t A_ACUTE = 0x00c1;
static const uint32_t E_ACUTE = 0x00c9;
static const uint32_t I_ACUTE = 0x00cd;
static const uint32_t O_ACUTE = 0x00d3;
static const uint32_t U_ACUTE = 0x00da;
192 :
193 : const uint8_t IrishCasing::sLcClasses[26] = {
194 : kClass_vowel, kClass_b, kClass_cC, kClass_d, kClass_vowel,
195 : kClass_fF, kClass_g, kClass_h, kClass_vowel, kClass_letter,
196 : kClass_letter, kClass_lLNrR, kClass_m, kClass_n, kClass_vowel,
197 : kClass_pP, kClass_letter, kClass_lLNrR, kClass_sS, kClass_t,
198 : kClass_vowel, kClass_letter, kClass_letter, kClass_letter, kClass_letter,
199 : kClass_letter
200 : };
201 :
202 : const uint8_t IrishCasing::sUcClasses[26] = {
203 : kClass_Vowel, kClass_B, kClass_cC, kClass_DG, kClass_Vowel,
204 : kClass_fF, kClass_DG, kClass_letter, kClass_Vowel, kClass_letter,
205 : kClass_letter, kClass_lLNrR, kClass_letter, kClass_lLNrR, kClass_Vowel,
206 : kClass_pP, kClass_letter, kClass_lLNrR, kClass_sS, kClass_T,
207 : kClass_Vowel, kClass_letter, kClass_letter, kClass_letter, kClass_letter,
208 : kClass_letter
209 : };
210 :
211 : uint8_t
212 0 : IrishCasing::GetClass(uint32_t aCh)
213 : {
214 : using mozilla::unicode::GetGenCategory;
215 0 : if (aCh >= 'a' && aCh <= 'z') {
216 0 : return sLcClasses[aCh - 'a'];
217 0 : } else if (aCh >= 'A' && aCh <= 'Z') {
218 0 : return sUcClasses[aCh - 'A'];
219 0 : } else if (GetGenCategory(aCh) == nsUGenCategory::kLetter) {
220 0 : if (aCh == a_ACUTE || aCh == e_ACUTE || aCh == i_ACUTE ||
221 0 : aCh == o_ACUTE || aCh == u_ACUTE) {
222 0 : return kClass_vowel;
223 0 : } else if (aCh == A_ACUTE || aCh == E_ACUTE || aCh == I_ACUTE ||
224 0 : aCh == O_ACUTE || aCh == U_ACUTE) {
225 0 : return kClass_Vowel;
226 : } else {
227 0 : return kClass_letter;
228 : }
229 0 : } else if (aCh == '-' || aCh == HYPHEN || aCh == NO_BREAK_HYPHEN) {
230 0 : return kClass_hyph;
231 : } else {
232 0 : return kClass_other;
233 : }
234 : }
235 :
236 : uint32_t
237 0 : IrishCasing::UpperCase(uint32_t aCh, State& aState,
238 : bool& aMarkPos, uint8_t& aAction)
239 : {
240 0 : uint8_t cls = GetClass(aCh);
241 0 : uint8_t stateEntry = sUppercaseStateTable[cls][aState];
242 0 : aMarkPos = !!(stateEntry & kMarkPositionFlag);
243 0 : aAction = (stateEntry & kActionMask) >> kActionShift;
244 0 : aState = State(stateEntry & kNextStateMask);
245 :
246 0 : return ToUpperCase(aCh);
247 : }
248 :
249 : } // namespace mozilla
|