Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : * Copyright (C) 2012-2014, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : *******************************************************************************
8 : * collationdatabuilder.h
9 : *
10 : * created on: 2012apr01
11 : * created by: Markus W. Scherer
12 : */
13 :
14 : #ifndef __COLLATIONDATABUILDER_H__
15 : #define __COLLATIONDATABUILDER_H__
16 :
17 : #include "unicode/utypes.h"
18 :
19 : #if !UCONFIG_NO_COLLATION
20 :
21 : #include "unicode/uniset.h"
22 : #include "unicode/unistr.h"
23 : #include "unicode/uversion.h"
24 : #include "collation.h"
25 : #include "collationdata.h"
26 : #include "collationsettings.h"
27 : #include "normalizer2impl.h"
28 : #include "utrie2.h"
29 : #include "uvectr32.h"
30 : #include "uvectr64.h"
31 : #include "uvector.h"
32 :
33 : U_NAMESPACE_BEGIN
34 :
35 : struct ConditionalCE32; // element type stored in CollationDataBuilder::conditionalCE32s
36 :
37 : class CollationFastLatinBuilder; // pointed to by CollationDataBuilder::fastLatinBuilder
38 : class CopyHelper; // declared a friend of CollationDataBuilder
39 : class DataBuilderCollationIterator; // friend of CollationDataBuilder; type of its collIter member
40 : class UCharsTrieBuilder; // parameter type of CollationDataBuilder::addContextTrie()
41 :
42 : /**
43 : * Low-level CollationData builder.
44 : * Takes (character, CE) pairs and builds them into runtime data structures.
45 : * Supports characters with context prefixes and contraction suffixes.
46 : */
47 : class U_I18N_API CollationDataBuilder : public UObject {
48 : public:
49 : /**
50 : * Collation element modifier. Interface class for a modifier
51 : * that changes a tailoring builder's temporary CEs to final CEs.
52 : * Called for every non-special CE32 and every expansion CE.
53 : */
54 0 : class CEModifier : public UObject {
55 : public:
56 : virtual ~CEModifier();
57 : /** Returns a new CE to replace the non-special input CE32, or else Collation::NO_CE. */
58 : virtual int64_t modifyCE32(uint32_t ce32) const = 0;
59 : /** Returns a new CE to replace the input CE, or else Collation::NO_CE. */
60 : virtual int64_t modifyCE(int64_t ce) const = 0;
61 : };
62 :
63 : CollationDataBuilder(UErrorCode &errorCode); // NOTE(review): errorCode is presumably the ICU in/out error code, as documented on the range setters below
64 :
65 : virtual ~CollationDataBuilder();
66 :
67 : void initForTailoring(const CollationData *b, UErrorCode &errorCode); // NOTE(review): b is presumably the base data being tailored (cf. the 'base' member) -- confirm in the .cpp
68 :
69 : virtual UBool isCompressibleLeadByte(uint32_t b) const; // virtual, so subclasses may override; called by isCompressiblePrimary()
70 :
71 0 : inline UBool isCompressiblePrimary(uint32_t p) const { // convenience wrapper around isCompressibleLeadByte()
72 0 : return isCompressibleLeadByte(p >> 24); // passes the most significant byte of the 32-bit primary
73 : }
74 :
75 : /**
76 : * @return TRUE if this builder has mappings (e.g., add() has been called)
77 : */
78 0 : UBool hasMappings() const { return modified; } // simply reports the 'modified' member
79 :
80 : /**
81 : * @return TRUE if c has CEs in this builder
82 : */
83 : UBool isAssigned(UChar32 c) const;
84 :
85 : /**
86 : * @return the three-byte primary if c maps to a single such CE and has no context data,
87 : * otherwise returns 0.
88 : */
89 : uint32_t getLongPrimaryIfSingleCE(UChar32 c) const;
90 :
91 : /**
92 : * @return the single CE for c.
93 : * Sets an error code if c does not have a single CE.
94 : */
95 : int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const;
96 :
97 : void add(const UnicodeString &prefix, const UnicodeString &s, // equivalent to addCE32(prefix, s, encodeCEs(ces, cesLength)); see encodeCEs() below
98 : const int64_t ces[], int32_t cesLength,
99 : UErrorCode &errorCode);
100 :
101 : /**
102 : * Encodes the ces as either the returned ce32 by itself,
103 : * or by storing an expansion, with the returned ce32 referring to that.
104 : *
105 : * add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength))
106 : */
107 : virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode);
108 : void addCE32(const UnicodeString &prefix, const UnicodeString &s, // public overload taking an already-encoded ce32; distinct from the protected addCE32(uint32_t, UErrorCode &)
109 : uint32_t ce32, UErrorCode &errorCode);
110 :
111 : /**
112 : * Sets three-byte-primary CEs for a range of code points in code point order,
113 : * if it is worth doing; otherwise no change is made.
114 : * None of the code points in the range should have complex mappings so far
115 : * (expansions/contractions/prefixes).
116 : * @param start first code point
117 : * @param end last code point (inclusive)
118 : * @param primary primary weight for 'start'
119 : * @param step per-code point primary-weight increment
120 : * @param errorCode ICU in/out error code
121 : * @return TRUE if an OFFSET_TAG range was used for start..end
122 : */
123 : UBool maybeSetPrimaryRange(UChar32 start, UChar32 end,
124 : uint32_t primary, int32_t step,
125 : UErrorCode &errorCode);
126 :
127 : /**
128 : * Sets three-byte-primary CEs for a range of code points in code point order.
129 : * Sets range values if that is worth doing, or else individual values.
130 : * None of the code points in the range should have complex mappings so far
131 : * (expansions/contractions/prefixes).
132 : * @param start first code point
133 : * @param end last code point (inclusive)
134 : * @param primary primary weight for 'start'
135 : * @param step per-code point primary-weight increment
136 : * @param errorCode ICU in/out error code
137 : * @return the next primary after 'end': start primary incremented by ((end-start)+1)*step
138 : */
139 : uint32_t setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end,
140 : uint32_t primary, int32_t step,
141 : UErrorCode &errorCode);
142 :
143 : /**
144 : * Copies all mappings from the src builder, with modifications.
145 : * This builder here must not be built yet, and should be empty.
146 : */
147 : void copyFrom(const CollationDataBuilder &src, const CEModifier &modifier,
148 : UErrorCode &errorCode);
149 :
150 : void optimize(const UnicodeSet &set, UErrorCode &errorCode);
151 : void suppressContractions(const UnicodeSet &set, UErrorCode &errorCode);
152 :
153 0 : void enableFastLatin() { fastLatinEnabled = TRUE; } // only sets the flag; NOTE(review): presumably consulted during build() -- confirm in the .cpp
154 : virtual void build(CollationData &data, UErrorCode &errorCode);
155 :
156 : /**
157 : * Looks up CEs for s and appends them to the ces array.
158 : * Does not handle normalization: s should be in FCD form.
159 : *
160 : * Does not write completely ignorable CEs.
161 : * Does not write beyond Collation::MAX_EXPANSION_LENGTH.
162 : *
163 : * @return incremented cesLength
164 : */
165 : int32_t getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength);
166 : int32_t getCEs(const UnicodeString &prefix, const UnicodeString &s, // NOTE(review): presumably the same lookup with s taken in the context of prefix -- confirm in the .cpp
167 : int64_t ces[], int32_t cesLength);
168 :
169 : protected:
170 : friend class CopyHelper; // friends: these helper classes get access to the non-public members below
171 : friend class DataBuilderCollationIterator;
172 :
173 : uint32_t getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const;
174 :
175 : int32_t addCE(int64_t ce, UErrorCode &errorCode); // NOTE(review): the int32_t results of the add*() helpers are presumably indexes into the matching vectors -- confirm in the .cpp
176 : int32_t addCE32(uint32_t ce32, UErrorCode &errorCode);
177 : int32_t addConditionalCE32(const UnicodeString &context, uint32_t ce32, UErrorCode &errorCode);
178 :
179 0 : inline ConditionalCE32 *getConditionalCE32(int32_t index) const { // typed accessor for the untyped conditionalCE32s vector
180 0 : return static_cast<ConditionalCE32 *>(conditionalCE32s[index]); // no bounds check here; index is passed straight to the vector
181 : }
182 0 : inline ConditionalCE32 *getConditionalCE32ForCE32(uint32_t ce32) const { // same lookup, keyed by a CE32
183 0 : return getConditionalCE32(Collation::indexFromCE32(ce32)); // uses the index extracted from ce32
184 : }
185 :
186 0 : static uint32_t makeBuilderContextCE32(int32_t index) { // builds a CE32 that carries BUILDER_DATA_TAG plus the given index
187 0 : return Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, index);
188 : }
189 0 : static inline UBool isBuilderContextCE32(uint32_t ce32) { // tests whether ce32 carries BUILDER_DATA_TAG
190 0 : return Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG);
191 : }
192 :
193 : static uint32_t encodeOneCEAsCE32(int64_t ce);
194 : uint32_t encodeOneCE(int64_t ce, UErrorCode &errorCode);
195 : uint32_t encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode);
196 : uint32_t encodeExpansion32(const int32_t newCE32s[], int32_t length, UErrorCode &errorCode);
197 :
198 : uint32_t copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, UErrorCode &errorCode);
199 : /**
200 : * Copies base contractions to a list of ConditionalCE32.
201 : * Sets cond->next to the index of the first new item
202 : * and returns the index of the last new item.
203 : */
204 : int32_t copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32,
205 : ConditionalCE32 *cond, UErrorCode &errorCode);
206 :
207 : UBool getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode);
208 : void setDigitTags(UErrorCode &errorCode);
209 : void setLeadSurrogates(UErrorCode &errorCode);
210 :
211 : void buildMappings(CollationData &data, UErrorCode &errorCode);
212 :
213 : void clearContexts();
214 : void buildContexts(UErrorCode &errorCode);
215 : uint32_t buildContext(ConditionalCE32 *head, UErrorCode &errorCode);
216 : int32_t addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder,
217 : UErrorCode &errorCode);
218 :
219 : void buildFastLatinTable(CollationData &data, UErrorCode &errorCode);
220 :
221 : int32_t getCEs(const UnicodeString &s, int32_t start, int64_t ces[], int32_t cesLength); // protected overload with an extra 'start' offset parameter
222 :
223 0 : static UChar32 jamoCpFromIndex(int32_t i) { // maps a 0-based Jamo index to a code point: first the L range, then V, then T
224 : // 0 <= i < CollationData::JAMO_CE32S_LENGTH = 19 + 21 + 27
225 0 : if(i < Hangul::JAMO_L_COUNT) { return Hangul::JAMO_L_BASE + i; }
226 0 : i -= Hangul::JAMO_L_COUNT; // rebase i so that 0 is the first V index
227 0 : if(i < Hangul::JAMO_V_COUNT) { return Hangul::JAMO_V_BASE + i; }
228 0 : i -= Hangul::JAMO_V_COUNT; // rebase i so that 0 is the first T index
229 : // i < 27
230 0 : return Hangul::JAMO_T_BASE + 1 + i; // +1: the first T code point returned is one past JAMO_T_BASE
231 : }
232 :
233 : /** @see Collation::BUILDER_DATA_TAG */
234 : static const uint32_t IS_BUILDER_JAMO_CE32 = 0x100;
235 :
236 : const Normalizer2Impl &nfcImpl;
237 : const CollationData *base;
238 : const CollationSettings *baseSettings;
239 : UTrie2 *trie;
240 : UVector32 ce32s;
241 : UVector64 ce64s;
242 : UVector conditionalCE32s; // vector of ConditionalCE32; read through getConditionalCE32()
243 : // Characters that have context (prefixes or contraction suffixes).
244 : UnicodeSet contextChars;
245 : // Serialized UCharsTrie structures for finalized contexts.
246 : UnicodeString contexts;
247 : UnicodeSet unsafeBackwardSet;
248 : UBool modified; // value reported by hasMappings()
249 :
250 : UBool fastLatinEnabled; // set to TRUE by enableFastLatin()
251 : CollationFastLatinBuilder *fastLatinBuilder;
252 :
253 : DataBuilderCollationIterator *collIter;
254 : };
255 :
256 : U_NAMESPACE_END
257 :
258 : #endif // !UCONFIG_NO_COLLATION
259 : #endif // __COLLATIONDATABUILDER_H__
|