Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : * Copyright (C) 2013-2014, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : *******************************************************************************
8 : * collationruleparser.h
9 : *
10 : * created on: 2013apr10
11 : * created by: Markus W. Scherer
12 : */
13 :
14 : #ifndef __COLLATIONRULEPARSER_H__
15 : #define __COLLATIONRULEPARSER_H__
16 :
17 : #include "unicode/utypes.h"
18 :
19 : #if !UCONFIG_NO_COLLATION
20 :
21 : #include "unicode/ucol.h"
22 : #include "unicode/uniset.h"
23 : #include "unicode/unistr.h"
24 :
25 : struct UParseError;
26 :
27 : U_NAMESPACE_BEGIN
28 :
29 : struct CollationData;
30 : struct CollationTailoring;
31 :
32 : class Locale;
33 : class Normalizer2;
34 :
35 : struct CollationSettings;
36 :
37 : class U_I18N_API CollationRuleParser : public UMemory { // Parser for collation rule strings; also declares the Sink and Importer interfaces used with it.
38 : public:
39 : /** Special reset positions. */
40 : enum Position {
41 : FIRST_TERTIARY_IGNORABLE,
42 : LAST_TERTIARY_IGNORABLE,
43 : FIRST_SECONDARY_IGNORABLE,
44 : LAST_SECONDARY_IGNORABLE,
45 : FIRST_PRIMARY_IGNORABLE,
46 : LAST_PRIMARY_IGNORABLE,
47 : FIRST_VARIABLE,
48 : LAST_VARIABLE,
49 : FIRST_REGULAR,
50 : LAST_REGULAR,
51 : FIRST_IMPLICIT,
52 : LAST_IMPLICIT,
53 : FIRST_TRAILING,
54 : LAST_TRAILING
55 : };
56 :
57 : /**
58 : * First character of contractions that encode special reset positions.
59 : * U+FFFE cannot be tailored via rule syntax.
60 : *
61 : * The second contraction character is POS_BASE + Position.
62 : */
63 : static const UChar POS_LEAD = 0xfffe;
64 : /**
65 : * Base for the second character of contractions that encode special reset positions.
66 : * Braille characters U+28xx are printable and normalization-inert.
67 : * @see POS_LEAD
68 : */
69 : static const UChar POS_BASE = 0x2800;
70 : /** Abstract interface: addReset() and addRelation() are pure virtual and must be implemented by subclasses. */
71 0 : class U_I18N_API Sink : public UObject {
72 : public:
73 : virtual ~Sink();
74 : /**
75 : * Adds a reset.
76 : * strength=UCOL_IDENTICAL for &str.
77 : * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3.
78 : */
79 : virtual void addReset(int32_t strength, const UnicodeString &str,
80 : const char *&errorReason, UErrorCode &errorCode) = 0;
81 : /**
82 : * Adds a relation with strength and prefix | str / extension.
83 : */
84 : virtual void addRelation(int32_t strength, const UnicodeString &prefix,
85 : const UnicodeString &str, const UnicodeString &extension,
86 : const char *&errorReason, UErrorCode &errorCode) = 0;
87 : /** Not pure virtual, so overriding is optional; the definition lives outside this header. */
88 : virtual void suppressContractions(const UnicodeSet &set, const char *&errorReason,
89 : UErrorCode &errorCode);
90 : /** Not pure virtual, so overriding is optional; the definition lives outside this header. */
91 : virtual void optimize(const UnicodeSet &set, const char *&errorReason,
92 : UErrorCode &errorCode);
93 : };
94 : /** Abstract interface with a single pure virtual method, getRules(). */
95 0 : class U_I18N_API Importer : public UObject {
96 : public:
97 : virtual ~Importer();
98 : virtual void getRules(
99 : const char *localeID, const char *collationType,
100 : UnicodeString &rules,
101 : const char *&errorReason, UErrorCode &errorCode) = 0; // NOTE(review): presumably outputs the rules for localeID/collationType into `rules` -- confirm
102 : };
103 :
104 : /**
105 : * Constructor.
106 : * The Sink must be set before parsing.
107 : * The Importer can be set, otherwise [import locale] syntax is not supported.
108 : */
109 : CollationRuleParser(const CollationData *base, UErrorCode &errorCode);
110 : ~CollationRuleParser();
111 :
112 : /**
113 : * Sets the pointer to a Sink object.
114 : * The pointer is aliased: Pointer copy without cloning or taking ownership.
115 : */
116 0 : void setSink(Sink *sinkAlias) {
117 0 : sink = sinkAlias;
118 0 : }
119 :
120 : /**
121 : * Sets the pointer to an Importer object.
122 : * The pointer is aliased: Pointer copy without cloning or taking ownership.
123 : */
124 0 : void setImporter(Importer *importerAlias) {
125 0 : importer = importerAlias;
126 0 : }
127 : /** Parses ruleString. NOTE(review): presumably updates outSettings and fills *outParseError on failure -- confirm against the implementation. */
128 : void parse(const UnicodeString &ruleString,
129 : CollationSettings &outSettings,
130 : UParseError *outParseError,
131 : UErrorCode &errorCode);
132 : /** Returns the current value of the errorReason member. */
133 0 : const char *getErrorReason() const { return errorReason; }
134 :
135 : /**
136 : * Gets a script or reorder code from its string representation.
137 : * @return the script/reorder code, or
138 : * -1 if not recognized
139 : */
140 : static int32_t getReorderCode(const char *word);
141 :
142 : private:
143 : /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */
144 : static const int32_t STRENGTH_MASK = 0xf;
145 : static const int32_t STARRED_FLAG = 0x10; // first bit above STRENGTH_MASK; NOTE(review): presumably marks a starred relation -- confirm
146 : static const int32_t OFFSET_SHIFT = 8; // NOTE(review): presumably the shift for an offset packed above the flag bits -- confirm
147 : /** Private overload without the settings/parse-error outputs. NOTE(review): presumably invoked by the public parse() -- confirm. */
148 : void parse(const UnicodeString &ruleString, UErrorCode &errorCode);
149 : void parseRuleChain(UErrorCode &errorCode);
150 : int32_t parseResetAndPosition(UErrorCode &errorCode);
151 : int32_t parseRelationOperator(UErrorCode &errorCode);
152 : void parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode);
153 : void parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode);
154 : int32_t parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
155 : int32_t parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
156 :
157 : /**
158 : * Sets str to a contraction of U+FFFE and (U+2800 + Position).
159 : * @return rule index after the special reset position
160 : */
161 : int32_t parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode);
162 : void parseSetting(UErrorCode &errorCode);
163 : void parseReordering(const UnicodeString &raw, UErrorCode &errorCode);
164 : static UColAttributeValue getOnOffValue(const UnicodeString &s);
165 :
166 : int32_t parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode);
167 : int32_t readWords(int32_t i, UnicodeString &raw) const;
168 : int32_t skipComment(int32_t i) const;
169 :
170 : void setParseError(const char *reason, UErrorCode &errorCode);
171 : void setErrorContext();
172 :
173 : /**
174 : * ASCII [:P:] and [:S:]:
175 : * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E]
176 : */
177 : static UBool isSyntaxChar(UChar32 c);
178 : int32_t skipWhiteSpace(int32_t i) const;
179 :
180 : const Normalizer2 &nfd, &nfc; // both members are references to const Normalizer2
181 :
182 : const UnicodeString *rules; // NOTE(review): presumably the rule string currently being parsed -- confirm
183 : const CollationData *const baseData; // const pointer to const data: cannot be reseated after construction
184 : CollationSettings *settings;
185 : UParseError *parseError;
186 : const char *errorReason; // exposed read-only through getErrorReason()
187 :
188 : Sink *sink; // aliased, not owned; assigned by setSink()
189 : Importer *importer; // aliased, not owned; assigned by setImporter()
190 :
191 : int32_t ruleIndex; // NOTE(review): presumably the current position within *rules -- confirm
192 : };
193 :
194 : U_NAMESPACE_END
195 :
196 : #endif // !UCONFIG_NO_COLLATION
197 : #endif // __COLLATIONRULEPARSER_H__
|