Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : * Copyright (C) 2013-2015, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : *******************************************************************************
8 : * collationruleparser.cpp
9 : *
10 : * (replaced the former ucol_tok.cpp)
11 : *
12 : * created on: 2013apr10
13 : * created by: Markus W. Scherer
14 : */
15 :
16 : #include "unicode/utypes.h"
17 :
18 : #if !UCONFIG_NO_COLLATION
19 :
20 : #include "unicode/normalizer2.h"
21 : #include "unicode/parseerr.h"
22 : #include "unicode/uchar.h"
23 : #include "unicode/ucol.h"
24 : #include "unicode/uloc.h"
25 : #include "unicode/unistr.h"
26 : #include "unicode/utf16.h"
27 : #include "charstr.h"
28 : #include "cmemory.h"
29 : #include "collation.h"
30 : #include "collationdata.h"
31 : #include "collationruleparser.h"
32 : #include "collationsettings.h"
33 : #include "collationtailoring.h"
34 : #include "cstring.h"
35 : #include "patternprops.h"
36 : #include "uassert.h"
37 : #include "uvectr32.h"
38 :
39 : U_NAMESPACE_BEGIN
40 :
41 : namespace {
42 :
43 : static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 }; // "[before"
44 : const int32_t BEFORE_LENGTH = 7;
45 :
46 : } // namespace
47 :
48 0 : CollationRuleParser::Sink::~Sink() {}
49 :
50 : void
51 0 : CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
52 :
53 : void
54 0 : CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
55 :
56 0 : CollationRuleParser::Importer::~Importer() {}
57 :
58 0 : CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
59 0 : : nfd(*Normalizer2::getNFDInstance(errorCode)),
60 0 : nfc(*Normalizer2::getNFCInstance(errorCode)),
61 : rules(NULL), baseData(base), settings(NULL),
62 : parseError(NULL), errorReason(NULL),
63 : sink(NULL), importer(NULL),
64 0 : ruleIndex(0) {
65 0 : }
66 :
67 0 : CollationRuleParser::~CollationRuleParser() {
68 0 : }
69 :
70 : void
71 0 : CollationRuleParser::parse(const UnicodeString &ruleString,
72 : CollationSettings &outSettings,
73 : UParseError *outParseError,
74 : UErrorCode &errorCode) {
75 0 : if(U_FAILURE(errorCode)) { return; }
76 0 : settings = &outSettings;
77 0 : parseError = outParseError;
78 0 : if(parseError != NULL) {
79 0 : parseError->line = 0;
80 0 : parseError->offset = -1;
81 0 : parseError->preContext[0] = 0;
82 0 : parseError->postContext[0] = 0;
83 : }
84 0 : errorReason = NULL;
85 0 : parse(ruleString, errorCode);
86 : }
87 :
88 : void
89 0 : CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
90 0 : if(U_FAILURE(errorCode)) { return; }
91 0 : rules = &ruleString;
92 0 : ruleIndex = 0;
93 :
94 0 : while(ruleIndex < rules->length()) {
95 0 : UChar c = rules->charAt(ruleIndex);
96 0 : if(PatternProps::isWhiteSpace(c)) {
97 0 : ++ruleIndex;
98 0 : continue;
99 : }
100 0 : switch(c) {
101 : case 0x26: // '&'
102 0 : parseRuleChain(errorCode);
103 0 : break;
104 : case 0x5b: // '['
105 0 : parseSetting(errorCode);
106 0 : break;
107 : case 0x23: // '#' starts a comment, until the end of the line
108 0 : ruleIndex = skipComment(ruleIndex + 1);
109 0 : break;
110 : case 0x40: // '@' is equivalent to [backwards 2]
111 0 : settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
112 0 : UCOL_ON, 0, errorCode);
113 0 : ++ruleIndex;
114 0 : break;
115 : case 0x21: // '!' used to turn on Thai/Lao character reversal
116 : // Accept but ignore. The root collator has contractions
117 : // that are equivalent to the character reversal, where appropriate.
118 0 : ++ruleIndex;
119 0 : break;
120 : default:
121 0 : setParseError("expected a reset or setting or comment", errorCode);
122 0 : break;
123 : }
124 0 : if(U_FAILURE(errorCode)) { return; }
125 : }
126 : }
127 :
128 : void
129 0 : CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
130 0 : int32_t resetStrength = parseResetAndPosition(errorCode);
131 0 : UBool isFirstRelation = TRUE;
132 : for(;;) {
133 0 : int32_t result = parseRelationOperator(errorCode);
134 0 : if(U_FAILURE(errorCode)) { return; }
135 0 : if(result < 0) {
136 0 : if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
137 : // '#' starts a comment, until the end of the line
138 0 : ruleIndex = skipComment(ruleIndex + 1);
139 0 : continue;
140 : }
141 0 : if(isFirstRelation) {
142 0 : setParseError("reset not followed by a relation", errorCode);
143 : }
144 0 : return;
145 : }
146 0 : int32_t strength = result & STRENGTH_MASK;
147 0 : if(resetStrength < UCOL_IDENTICAL) {
148 : // reset-before rule chain
149 0 : if(isFirstRelation) {
150 0 : if(strength != resetStrength) {
151 0 : setParseError("reset-before strength differs from its first relation", errorCode);
152 0 : return;
153 : }
154 : } else {
155 0 : if(strength < resetStrength) {
156 0 : setParseError("reset-before strength followed by a stronger relation", errorCode);
157 0 : return;
158 : }
159 : }
160 : }
161 0 : int32_t i = ruleIndex + (result >> OFFSET_SHIFT); // skip over the relation operator
162 0 : if((result & STARRED_FLAG) == 0) {
163 0 : parseRelationStrings(strength, i, errorCode);
164 : } else {
165 0 : parseStarredCharacters(strength, i, errorCode);
166 : }
167 0 : if(U_FAILURE(errorCode)) { return; }
168 0 : isFirstRelation = FALSE;
169 0 : }
170 : }
171 :
172 : int32_t
173 0 : CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
174 0 : if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
175 0 : int32_t i = skipWhiteSpace(ruleIndex + 1);
176 : int32_t j;
177 : UChar c;
178 : int32_t resetStrength;
179 0 : if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&
180 0 : (j = i + BEFORE_LENGTH) < rules->length() &&
181 0 : PatternProps::isWhiteSpace(rules->charAt(j)) &&
182 0 : ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&
183 0 : 0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&
184 0 : rules->charAt(j + 1) == 0x5d) {
185 : // &[before n] with n=1 or 2 or 3
186 0 : resetStrength = UCOL_PRIMARY + (c - 0x31);
187 0 : i = skipWhiteSpace(j + 2);
188 : } else {
189 0 : resetStrength = UCOL_IDENTICAL;
190 : }
191 0 : if(i >= rules->length()) {
192 0 : setParseError("reset without position", errorCode);
193 0 : return UCOL_DEFAULT;
194 : }
195 0 : UnicodeString str;
196 0 : if(rules->charAt(i) == 0x5b) { // '['
197 0 : i = parseSpecialPosition(i, str, errorCode);
198 : } else {
199 0 : i = parseTailoringString(i, str, errorCode);
200 : }
201 0 : sink->addReset(resetStrength, str, errorReason, errorCode);
202 0 : if(U_FAILURE(errorCode)) { setErrorContext(); }
203 0 : ruleIndex = i;
204 0 : return resetStrength;
205 : }
206 :
207 : int32_t
208 0 : CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
209 0 : if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
210 0 : ruleIndex = skipWhiteSpace(ruleIndex);
211 0 : if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
212 : int32_t strength;
213 0 : int32_t i = ruleIndex;
214 0 : UChar c = rules->charAt(i++);
215 0 : switch(c) {
216 : case 0x3c: // '<'
217 0 : if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<
218 0 : ++i;
219 0 : if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<
220 0 : ++i;
221 0 : if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<<
222 0 : ++i;
223 0 : strength = UCOL_QUATERNARY;
224 : } else {
225 0 : strength = UCOL_TERTIARY;
226 : }
227 : } else {
228 0 : strength = UCOL_SECONDARY;
229 : }
230 : } else {
231 0 : strength = UCOL_PRIMARY;
232 : }
233 0 : if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'
234 0 : ++i;
235 0 : strength |= STARRED_FLAG;
236 : }
237 0 : break;
238 : case 0x3b: // ';' same as <<
239 0 : strength = UCOL_SECONDARY;
240 0 : break;
241 : case 0x2c: // ',' same as <<<
242 0 : strength = UCOL_TERTIARY;
243 0 : break;
244 : case 0x3d: // '='
245 0 : strength = UCOL_IDENTICAL;
246 0 : if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'
247 0 : ++i;
248 0 : strength |= STARRED_FLAG;
249 : }
250 0 : break;
251 : default:
252 0 : return UCOL_DEFAULT;
253 : }
254 0 : return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
255 : }
256 :
257 : void
258 0 : CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
259 : // Parse
260 : // prefix | str / extension
261 : // where prefix and extension are optional.
262 0 : UnicodeString prefix, str, extension;
263 0 : i = parseTailoringString(i, str, errorCode);
264 0 : if(U_FAILURE(errorCode)) { return; }
265 0 : UChar next = (i < rules->length()) ? rules->charAt(i) : 0;
266 0 : if(next == 0x7c) { // '|' separates the context prefix from the string.
267 0 : prefix = str;
268 0 : i = parseTailoringString(i + 1, str, errorCode);
269 0 : if(U_FAILURE(errorCode)) { return; }
270 0 : next = (i < rules->length()) ? rules->charAt(i) : 0;
271 : }
272 0 : if(next == 0x2f) { // '/' separates the string from the extension.
273 0 : i = parseTailoringString(i + 1, extension, errorCode);
274 : }
275 0 : if(!prefix.isEmpty()) {
276 0 : UChar32 prefix0 = prefix.char32At(0);
277 0 : UChar32 c = str.char32At(0);
278 0 : if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
279 : setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
280 0 : errorCode);
281 0 : return;
282 : }
283 : }
284 0 : sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
285 0 : if(U_FAILURE(errorCode)) { setErrorContext(); }
286 0 : ruleIndex = i;
287 : }
288 :
289 : void
290 0 : CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
291 0 : UnicodeString empty, raw;
292 0 : i = parseString(skipWhiteSpace(i), raw, errorCode);
293 0 : if(U_FAILURE(errorCode)) { return; }
294 0 : if(raw.isEmpty()) {
295 0 : setParseError("missing starred-relation string", errorCode);
296 0 : return;
297 : }
298 0 : UChar32 prev = -1;
299 0 : int32_t j = 0;
300 : for(;;) {
301 0 : while(j < raw.length()) {
302 0 : UChar32 c = raw.char32At(j);
303 0 : if(!nfd.isInert(c)) {
304 0 : setParseError("starred-relation string is not all NFD-inert", errorCode);
305 0 : return;
306 : }
307 0 : sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
308 0 : if(U_FAILURE(errorCode)) {
309 0 : setErrorContext();
310 0 : return;
311 : }
312 0 : j += U16_LENGTH(c);
313 0 : prev = c;
314 : }
315 0 : if(i >= rules->length() || rules->charAt(i) != 0x2d) { // '-'
316 0 : break;
317 : }
318 0 : if(prev < 0) {
319 0 : setParseError("range without start in starred-relation string", errorCode);
320 0 : return;
321 : }
322 0 : i = parseString(i + 1, raw, errorCode);
323 0 : if(U_FAILURE(errorCode)) { return; }
324 0 : if(raw.isEmpty()) {
325 0 : setParseError("range without end in starred-relation string", errorCode);
326 0 : return;
327 : }
328 0 : UChar32 c = raw.char32At(0);
329 0 : if(c < prev) {
330 0 : setParseError("range start greater than end in starred-relation string", errorCode);
331 0 : return;
332 : }
333 : // range prev-c
334 0 : UnicodeString s;
335 0 : while(++prev <= c) {
336 0 : if(!nfd.isInert(prev)) {
337 0 : setParseError("starred-relation string range is not all NFD-inert", errorCode);
338 0 : return;
339 : }
340 0 : if(U_IS_SURROGATE(prev)) {
341 0 : setParseError("starred-relation string range contains a surrogate", errorCode);
342 0 : return;
343 : }
344 0 : if(0xfffd <= prev && prev <= 0xffff) {
345 0 : setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
346 0 : return;
347 : }
348 0 : s.setTo(prev);
349 0 : sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
350 0 : if(U_FAILURE(errorCode)) {
351 0 : setErrorContext();
352 0 : return;
353 : }
354 : }
355 0 : prev = -1;
356 0 : j = U16_LENGTH(c);
357 0 : }
358 0 : ruleIndex = skipWhiteSpace(i);
359 : }
360 :
361 : int32_t
362 0 : CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
363 0 : i = parseString(skipWhiteSpace(i), raw, errorCode);
364 0 : if(U_SUCCESS(errorCode) && raw.isEmpty()) {
365 0 : setParseError("missing relation string", errorCode);
366 : }
367 0 : return skipWhiteSpace(i);
368 : }
369 :
370 : int32_t
371 0 : CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
372 0 : if(U_FAILURE(errorCode)) { return i; }
373 0 : raw.remove();
374 0 : while(i < rules->length()) {
375 0 : UChar32 c = rules->charAt(i++);
376 0 : if(isSyntaxChar(c)) {
377 0 : if(c == 0x27) { // apostrophe
378 0 : if(i < rules->length() && rules->charAt(i) == 0x27) {
379 : // Double apostrophe, encodes a single one.
380 0 : raw.append((UChar)0x27);
381 0 : ++i;
382 0 : continue;
383 : }
384 : // Quote literal text until the next single apostrophe.
385 : for(;;) {
386 0 : if(i == rules->length()) {
387 0 : setParseError("quoted literal text missing terminating apostrophe", errorCode);
388 0 : return i;
389 : }
390 0 : c = rules->charAt(i++);
391 0 : if(c == 0x27) {
392 0 : if(i < rules->length() && rules->charAt(i) == 0x27) {
393 : // Double apostrophe inside quoted literal text,
394 : // still encodes a single apostrophe.
395 0 : ++i;
396 : } else {
397 0 : break;
398 : }
399 : }
400 0 : raw.append((UChar)c);
401 : }
402 0 : } else if(c == 0x5c) { // backslash
403 0 : if(i == rules->length()) {
404 0 : setParseError("backslash escape at the end of the rule string", errorCode);
405 0 : return i;
406 : }
407 0 : c = rules->char32At(i);
408 0 : raw.append(c);
409 0 : i += U16_LENGTH(c);
410 : } else {
411 : // Any other syntax character terminates a string.
412 0 : --i;
413 0 : break;
414 : }
415 0 : } else if(PatternProps::isWhiteSpace(c)) {
416 : // Unquoted white space terminates a string.
417 0 : --i;
418 0 : break;
419 : } else {
420 0 : raw.append((UChar)c);
421 : }
422 : }
423 0 : for(int32_t j = 0; j < raw.length();) {
424 0 : UChar32 c = raw.char32At(j);
425 0 : if(U_IS_SURROGATE(c)) {
426 0 : setParseError("string contains an unpaired surrogate", errorCode);
427 0 : return i;
428 : }
429 0 : if(0xfffd <= c && c <= 0xffff) {
430 0 : setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
431 0 : return i;
432 : }
433 0 : j += U16_LENGTH(c);
434 : }
435 0 : return i;
436 : }
437 :
namespace {

// Names of the special reset positions.
// parseSpecialPosition() encodes a match as POS_BASE + the index into
// this table, so the order of the entries is significant.
static const char *const positions[] = {
    "first tertiary ignorable",  "last tertiary ignorable",
    "first secondary ignorable", "last secondary ignorable",
    "first primary ignorable",   "last primary ignorable",
    "first variable",            "last variable",
    "first regular",             "last regular",
    "first implicit",            "last implicit",
    "first trailing",            "last trailing"
};

}  // namespace
458 :
459 : int32_t
460 0 : CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
461 0 : if(U_FAILURE(errorCode)) { return 0; }
462 0 : UnicodeString raw;
463 0 : int32_t j = readWords(i + 1, raw);
464 0 : if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) { // words end with ]
465 0 : ++j;
466 0 : for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {
467 0 : if(raw == UnicodeString(positions[pos], -1, US_INV)) {
468 0 : str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + pos));
469 0 : return j;
470 : }
471 : }
472 0 : if(raw == UNICODE_STRING_SIMPLE("top")) {
473 0 : str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_REGULAR));
474 0 : return j;
475 : }
476 0 : if(raw == UNICODE_STRING_SIMPLE("variable top")) {
477 0 : str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_VARIABLE));
478 0 : return j;
479 : }
480 : }
481 0 : setParseError("not a valid special reset position", errorCode);
482 0 : return i;
483 : }
484 :
485 : void
486 0 : CollationRuleParser::parseSetting(UErrorCode &errorCode) {
487 0 : if(U_FAILURE(errorCode)) { return; }
488 0 : UnicodeString raw;
489 0 : int32_t i = ruleIndex + 1;
490 0 : int32_t j = readWords(i, raw);
491 0 : if(j <= i || raw.isEmpty()) {
492 0 : setParseError("expected a setting/option at '['", errorCode);
493 : }
494 0 : if(rules->charAt(j) == 0x5d) { // words end with ]
495 0 : ++j;
496 0 : if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
497 0 : (raw.length() == 7 || raw.charAt(7) == 0x20)) {
498 0 : parseReordering(raw, errorCode);
499 0 : ruleIndex = j;
500 0 : return;
501 : }
502 0 : if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
503 0 : settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
504 0 : UCOL_ON, 0, errorCode);
505 0 : ruleIndex = j;
506 0 : return;
507 : }
508 0 : UnicodeString v;
509 0 : int32_t valueIndex = raw.lastIndexOf((UChar)0x20);
510 0 : if(valueIndex >= 0) {
511 0 : v.setTo(raw, valueIndex + 1);
512 0 : raw.truncate(valueIndex);
513 : }
514 0 : if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
515 0 : int32_t value = UCOL_DEFAULT;
516 0 : UChar c = v.charAt(0);
517 0 : if(0x31 <= c && c <= 0x34) { // 1..4
518 0 : value = UCOL_PRIMARY + (c - 0x31);
519 0 : } else if(c == 0x49) { // 'I'
520 0 : value = UCOL_IDENTICAL;
521 : }
522 0 : if(value != UCOL_DEFAULT) {
523 0 : settings->setStrength(value, 0, errorCode);
524 0 : ruleIndex = j;
525 0 : return;
526 : }
527 0 : } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
528 0 : UColAttributeValue value = UCOL_DEFAULT;
529 0 : if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
530 0 : value = UCOL_NON_IGNORABLE;
531 0 : } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
532 0 : value = UCOL_SHIFTED;
533 : }
534 0 : if(value != UCOL_DEFAULT) {
535 0 : settings->setAlternateHandling(value, 0, errorCode);
536 0 : ruleIndex = j;
537 0 : return;
538 : }
539 0 : } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
540 0 : int32_t value = UCOL_DEFAULT;
541 0 : if(v == UNICODE_STRING_SIMPLE("space")) {
542 0 : value = CollationSettings::MAX_VAR_SPACE;
543 0 : } else if(v == UNICODE_STRING_SIMPLE("punct")) {
544 0 : value = CollationSettings::MAX_VAR_PUNCT;
545 0 : } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
546 0 : value = CollationSettings::MAX_VAR_SYMBOL;
547 0 : } else if(v == UNICODE_STRING_SIMPLE("currency")) {
548 0 : value = CollationSettings::MAX_VAR_CURRENCY;
549 : }
550 0 : if(value != UCOL_DEFAULT) {
551 0 : settings->setMaxVariable(value, 0, errorCode);
552 0 : settings->variableTop = baseData->getLastPrimaryForGroup(
553 : UCOL_REORDER_CODE_FIRST + value);
554 0 : U_ASSERT(settings->variableTop != 0);
555 0 : ruleIndex = j;
556 0 : return;
557 : }
558 0 : } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
559 0 : UColAttributeValue value = UCOL_DEFAULT;
560 0 : if(v == UNICODE_STRING_SIMPLE("off")) {
561 0 : value = UCOL_OFF;
562 0 : } else if(v == UNICODE_STRING_SIMPLE("lower")) {
563 0 : value = UCOL_LOWER_FIRST;
564 0 : } else if(v == UNICODE_STRING_SIMPLE("upper")) {
565 0 : value = UCOL_UPPER_FIRST;
566 : }
567 0 : if(value != UCOL_DEFAULT) {
568 0 : settings->setCaseFirst(value, 0, errorCode);
569 0 : ruleIndex = j;
570 0 : return;
571 : }
572 0 : } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
573 0 : UColAttributeValue value = getOnOffValue(v);
574 0 : if(value != UCOL_DEFAULT) {
575 0 : settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
576 0 : ruleIndex = j;
577 0 : return;
578 : }
579 0 : } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
580 0 : UColAttributeValue value = getOnOffValue(v);
581 0 : if(value != UCOL_DEFAULT) {
582 0 : settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
583 0 : ruleIndex = j;
584 0 : return;
585 : }
586 0 : } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
587 0 : UColAttributeValue value = getOnOffValue(v);
588 0 : if(value != UCOL_DEFAULT) {
589 0 : settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
590 0 : ruleIndex = j;
591 0 : return;
592 : }
593 0 : } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
594 0 : UColAttributeValue value = getOnOffValue(v);
595 0 : if(value != UCOL_DEFAULT) {
596 0 : if(value == UCOL_ON) {
597 0 : setParseError("[hiraganaQ on] is not supported", errorCode);
598 : }
599 0 : ruleIndex = j;
600 0 : return;
601 : }
602 0 : } else if(raw == UNICODE_STRING_SIMPLE("import")) {
603 0 : CharString lang;
604 0 : lang.appendInvariantChars(v, errorCode);
605 0 : if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
606 : // BCP 47 language tag -> ICU locale ID
607 : char localeID[ULOC_FULLNAME_CAPACITY];
608 : int32_t parsedLength;
609 0 : int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY,
610 0 : &parsedLength, &errorCode);
611 0 : if(U_FAILURE(errorCode) ||
612 0 : parsedLength != lang.length() || length >= ULOC_FULLNAME_CAPACITY) {
613 0 : errorCode = U_ZERO_ERROR;
614 0 : setParseError("expected language tag in [import langTag]", errorCode);
615 0 : return;
616 : }
617 : // localeID minus all keywords
618 : char baseID[ULOC_FULLNAME_CAPACITY];
619 0 : length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
620 0 : if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
621 0 : errorCode = U_ZERO_ERROR;
622 0 : setParseError("expected language tag in [import langTag]", errorCode);
623 0 : return;
624 : }
625 0 : if(length == 3 && uprv_memcmp(baseID, "und", 3) == 0) {
626 0 : uprv_strcpy(baseID, "root");
627 : }
628 : // @collation=type, or length=0 if not specified
629 : char collationType[ULOC_KEYWORDS_CAPACITY];
630 : length = uloc_getKeywordValue(localeID, "collation",
631 : collationType, ULOC_KEYWORDS_CAPACITY,
632 0 : &errorCode);
633 0 : if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
634 0 : errorCode = U_ZERO_ERROR;
635 0 : setParseError("expected language tag in [import langTag]", errorCode);
636 0 : return;
637 : }
638 0 : if(importer == NULL) {
639 0 : setParseError("[import langTag] is not supported", errorCode);
640 : } else {
641 0 : UnicodeString importedRules;
642 0 : importer->getRules(baseID, length > 0 ? collationType : "standard",
643 0 : importedRules, errorReason, errorCode);
644 0 : if(U_FAILURE(errorCode)) {
645 0 : if(errorReason == NULL) {
646 0 : errorReason = "[import langTag] failed";
647 : }
648 0 : setErrorContext();
649 0 : return;
650 : }
651 0 : const UnicodeString *outerRules = rules;
652 0 : int32_t outerRuleIndex = ruleIndex;
653 0 : parse(importedRules, errorCode);
654 0 : if(U_FAILURE(errorCode)) {
655 0 : if(parseError != NULL) {
656 0 : parseError->offset = outerRuleIndex;
657 : }
658 : }
659 0 : rules = outerRules;
660 0 : ruleIndex = j;
661 : }
662 0 : return;
663 : }
664 0 : } else if(rules->charAt(j) == 0x5b) { // words end with [
665 0 : UnicodeSet set;
666 0 : j = parseUnicodeSet(j, set, errorCode);
667 0 : if(U_FAILURE(errorCode)) { return; }
668 0 : if(raw == UNICODE_STRING_SIMPLE("optimize")) {
669 0 : sink->optimize(set, errorReason, errorCode);
670 0 : if(U_FAILURE(errorCode)) { setErrorContext(); }
671 0 : ruleIndex = j;
672 0 : return;
673 0 : } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
674 0 : sink->suppressContractions(set, errorReason, errorCode);
675 0 : if(U_FAILURE(errorCode)) { setErrorContext(); }
676 0 : ruleIndex = j;
677 0 : return;
678 : }
679 : }
680 0 : setParseError("not a valid setting/option", errorCode);
681 : }
682 :
683 : void
684 0 : CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
685 0 : if(U_FAILURE(errorCode)) { return; }
686 0 : int32_t i = 7; // after "reorder"
687 0 : if(i == raw.length()) {
688 : // empty [reorder] with no codes
689 0 : settings->resetReordering();
690 0 : return;
691 : }
692 : // Parse the codes in [reorder aa bb cc].
693 0 : UVector32 reorderCodes(errorCode);
694 0 : if(U_FAILURE(errorCode)) { return; }
695 0 : CharString word;
696 0 : while(i < raw.length()) {
697 0 : ++i; // skip the word-separating space
698 0 : int32_t limit = raw.indexOf((UChar)0x20, i);
699 0 : if(limit < 0) { limit = raw.length(); }
700 0 : word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
701 0 : if(U_FAILURE(errorCode)) { return; }
702 0 : int32_t code = getReorderCode(word.data());
703 0 : if(code < 0) {
704 0 : setParseError("unknown script or reorder code", errorCode);
705 0 : return;
706 : }
707 0 : reorderCodes.addElement(code, errorCode);
708 0 : if(U_FAILURE(errorCode)) { return; }
709 0 : i = limit;
710 : }
711 0 : settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
712 : }
713 :
714 : static const char *const gSpecialReorderCodes[] = {
715 : "space", "punct", "symbol", "currency", "digit"
716 : };
717 :
718 : int32_t
719 0 : CollationRuleParser::getReorderCode(const char *word) {
720 0 : for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) {
721 0 : if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {
722 0 : return UCOL_REORDER_CODE_FIRST + i;
723 : }
724 : }
725 0 : int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
726 0 : if(script >= 0) {
727 0 : return script;
728 : }
729 0 : if(uprv_stricmp(word, "others") == 0) {
730 0 : return UCOL_REORDER_CODE_OTHERS; // same as Zzzz = USCRIPT_UNKNOWN
731 : }
732 0 : return -1;
733 : }
734 :
735 : UColAttributeValue
736 0 : CollationRuleParser::getOnOffValue(const UnicodeString &s) {
737 0 : if(s == UNICODE_STRING_SIMPLE("on")) {
738 0 : return UCOL_ON;
739 0 : } else if(s == UNICODE_STRING_SIMPLE("off")) {
740 0 : return UCOL_OFF;
741 : } else {
742 0 : return UCOL_DEFAULT;
743 : }
744 : }
745 :
746 : int32_t
747 0 : CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
748 : // Collect a UnicodeSet pattern between a balanced pair of [brackets].
749 0 : int32_t level = 0;
750 0 : int32_t j = i;
751 : for(;;) {
752 0 : if(j == rules->length()) {
753 0 : setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
754 0 : return j;
755 : }
756 0 : UChar c = rules->charAt(j++);
757 0 : if(c == 0x5b) { // '['
758 0 : ++level;
759 0 : } else if(c == 0x5d) { // ']'
760 0 : if(--level == 0) { break; }
761 : }
762 0 : }
763 0 : set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);
764 0 : if(U_FAILURE(errorCode)) {
765 0 : errorCode = U_ZERO_ERROR;
766 0 : setParseError("not a valid UnicodeSet pattern", errorCode);
767 0 : return j;
768 : }
769 0 : j = skipWhiteSpace(j);
770 0 : if(j == rules->length() || rules->charAt(j) != 0x5d) {
771 0 : setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
772 0 : return j;
773 : }
774 0 : return ++j;
775 : }
776 :
777 : int32_t
778 0 : CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
779 : static const UChar sp = 0x20;
780 0 : raw.remove();
781 0 : i = skipWhiteSpace(i);
782 : for(;;) {
783 0 : if(i >= rules->length()) { return 0; }
784 0 : UChar c = rules->charAt(i);
785 0 : if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) { // syntax except -_
786 0 : if(raw.isEmpty()) { return i; }
787 0 : if(raw.endsWith(&sp, 1)) { // remove trailing space
788 0 : raw.truncate(raw.length() - 1);
789 : }
790 0 : return i;
791 : }
792 0 : if(PatternProps::isWhiteSpace(c)) {
793 0 : raw.append(sp);
794 0 : i = skipWhiteSpace(i + 1);
795 : } else {
796 0 : raw.append(c);
797 0 : ++i;
798 : }
799 0 : }
800 : }
801 :
802 : int32_t
803 0 : CollationRuleParser::skipComment(int32_t i) const {
804 : // skip to past the newline
805 0 : while(i < rules->length()) {
806 0 : UChar c = rules->charAt(i++);
807 : // LF or FF or CR or NEL or LS or PS
808 0 : if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
809 : // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
810 : // NLF (new line function) = CR or LF or CR+LF or NEL.
811 : // No need to collect all of CR+LF because a following LF will be ignored anyway.
812 : break;
813 : }
814 : }
815 0 : return i;
816 : }
817 :
818 : void
819 0 : CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
820 0 : if(U_FAILURE(errorCode)) { return; }
821 : // Error code consistent with the old parser (from ca. 2001),
822 : // rather than U_PARSE_ERROR;
823 0 : errorCode = U_INVALID_FORMAT_ERROR;
824 0 : errorReason = reason;
825 0 : if(parseError != NULL) { setErrorContext(); }
826 : }
827 :
828 : void
829 0 : CollationRuleParser::setErrorContext() {
830 0 : if(parseError == NULL) { return; }
831 :
832 : // Note: This relies on the calling code maintaining the ruleIndex
833 : // at a position that is useful for debugging.
834 : // For example, at the beginning of a reset or relation etc.
835 0 : parseError->offset = ruleIndex;
836 0 : parseError->line = 0; // We are not counting line numbers.
837 :
838 : // before ruleIndex
839 0 : int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
840 0 : if(start < 0) {
841 0 : start = 0;
842 0 : } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
843 0 : ++start;
844 : }
845 0 : int32_t length = ruleIndex - start;
846 0 : rules->extract(start, length, parseError->preContext);
847 0 : parseError->preContext[length] = 0;
848 :
849 : // starting from ruleIndex
850 0 : length = rules->length() - ruleIndex;
851 0 : if(length >= U_PARSE_CONTEXT_LEN) {
852 0 : length = U_PARSE_CONTEXT_LEN - 1;
853 0 : if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
854 0 : --length;
855 : }
856 : }
857 0 : rules->extract(ruleIndex, length, parseError->postContext);
858 0 : parseError->postContext[length] = 0;
859 : }
860 :
861 : UBool
862 0 : CollationRuleParser::isSyntaxChar(UChar32 c) {
863 0 : return 0x21 <= c && c <= 0x7e &&
864 0 : (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
865 0 : (0x5b <= c && c <= 0x60) || (0x7b <= c));
866 : }
867 :
868 : int32_t
869 0 : CollationRuleParser::skipWhiteSpace(int32_t i) const {
870 0 : while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {
871 0 : ++i;
872 : }
873 0 : return i;
874 : }
875 :
876 : U_NAMESPACE_END
877 :
878 : #endif // !UCONFIG_NO_COLLATION
|