Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : * Copyright (C) 1996-2015, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : *******************************************************************************
8 : * collationcompare.cpp
9 : *
10 : * created on: 2012feb14 with new and old collation code
11 : * created by: Markus W. Scherer
12 : */
13 :
14 : #include "unicode/utypes.h"
15 :
16 : #if !UCONFIG_NO_COLLATION
17 :
18 : #include "unicode/ucol.h"
19 : #include "cmemory.h"
20 : #include "collation.h"
21 : #include "collationcompare.h"
22 : #include "collationiterator.h"
23 : #include "collationsettings.h"
24 : #include "uassert.h"
25 :
26 : U_NAMESPACE_BEGIN
27 :
28 : UCollationResult // Returns UCOL_LESS / UCOL_EQUAL / UCOL_GREATER for left relative to right.
29 0 : CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right,
30 : const CollationSettings &settings, // Read here for: options, variableTop, hasReordering()/reorder().
31 : UErrorCode &errorCode) { // In/out: checked on entry and again after the primary pass.
32 0 : if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } // Already failed on entry: do no work.
33 : // Overview: compares level by level (primary, secondary, optional case level, tertiary, quaternary). The first difference found decides the result; the strength in settings.options limits how many levels are examined.
34 0 : int32_t options = settings.options;
35 : uint32_t variableTop;
36 0 : if((options & CollationSettings::ALTERNATE_MASK) == 0) {
37 0 : variableTop = 0; // Unsigned "primary < 0" is never true, so no CE is treated as variable below.
38 : } else {
39 : // +1 so that we can use "<" and primary ignorables test out early.
40 0 : variableTop = settings.variableTop + 1;
41 : }
42 0 : UBool anyVariable = FALSE; // Set when either side yields a variable CE; consulted before the quaternary pass.
43 :
44 : // Fetch CEs, compare primaries, store secondary & tertiary weights.
45 : for(;;) {
46 : // We fetch CEs until we get a non-ignorable primary or reach the end.
47 : uint32_t leftPrimary;
48 0 : do {
49 0 : int64_t ce = left.nextCE(errorCode);
50 0 : leftPrimary = (uint32_t)(ce >> 32); // The primary weight is the upper 32 bits of the 64-bit CE.
51 0 : if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
52 : // Variable CE, shift it to quaternary level.
53 : // Ignore all following primary ignorables, and shift further variable CEs.
54 0 : anyVariable = TRUE;
55 0 : do {
56 : // Store only the primary of the variable CE.
57 0 : left.setCurrentCE(ce & INT64_C(0xffffffff00000000));
58 : for(;;) {
59 0 : ce = left.nextCE(errorCode);
60 0 : leftPrimary = (uint32_t)(ce >> 32);
61 0 : if(leftPrimary == 0) {
62 0 : left.setCurrentCE(0); // Overwrite the primary ignorable with an all-zero CE so later passes skip it.
63 : } else {
64 0 : break;
65 : }
66 : }
67 0 : } while(leftPrimary < variableTop &&
68 : leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
69 : }
70 0 : } while(leftPrimary == 0);
71 : // Same fetch-and-shift loop for the right-hand iterator.
72 : uint32_t rightPrimary;
73 0 : do {
74 0 : int64_t ce = right.nextCE(errorCode);
75 0 : rightPrimary = (uint32_t)(ce >> 32);
76 0 : if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
77 : // Variable CE, shift it to quaternary level.
78 : // Ignore all following primary ignorables, and shift further variable CEs.
79 0 : anyVariable = TRUE;
80 0 : do {
81 : // Store only the primary of the variable CE.
82 0 : right.setCurrentCE(ce & INT64_C(0xffffffff00000000));
83 : for(;;) {
84 0 : ce = right.nextCE(errorCode);
85 0 : rightPrimary = (uint32_t)(ce >> 32);
86 0 : if(rightPrimary == 0) {
87 0 : right.setCurrentCE(0);
88 : } else {
89 0 : break;
90 : }
91 : }
92 0 : } while(rightPrimary < variableTop &&
93 : rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
94 : }
95 0 : } while(rightPrimary == 0);
96 : // Both sides now hold a non-zero primary (a real weight or the end marker).
97 0 : if(leftPrimary != rightPrimary) {
98 : // Return the primary difference, with script reordering.
99 0 : if(settings.hasReordering()) {
100 0 : leftPrimary = settings.reorder(leftPrimary);
101 0 : rightPrimary = settings.reorder(rightPrimary);
102 : }
103 0 : return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
104 : }
105 0 : if(leftPrimary == Collation::NO_CE_PRIMARY) { break; } // Primaries are equal here, so both sides ended together.
106 0 : }
107 0 : if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } // errorCode was handed to nextCE() above, so re-check before reading CEs back.
108 : // From here on, the CEs consumed above are re-read by index through getCE(); NOTE(review): this relies on the iterators buffering every CE returned by nextCE() - confirm in CollationIterator.
109 : // Compare the buffered secondary & tertiary weights.
110 : // We might skip the secondary level but continue with the case level
111 : // which is turned on separately.
112 0 : if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
113 0 : if((options & CollationSettings::BACKWARD_SECONDARY) == 0) {
114 0 : int32_t leftIndex = 0;
115 0 : int32_t rightIndex = 0;
116 : for(;;) {
117 : uint32_t leftSecondary;
118 0 : do {
119 0 : leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16; // Secondary weight: bits 31..16 of the CE.
120 0 : } while(leftSecondary == 0); // Skip CEs whose secondary weight is zero.
121 :
122 : uint32_t rightSecondary;
123 0 : do {
124 0 : rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16;
125 0 : } while(rightSecondary == 0);
126 :
127 0 : if(leftSecondary != rightSecondary) {
128 0 : return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
129 : }
130 0 : if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; } // Equal and at the end marker: this level is done.
131 0 : }
132 : } else {
133 : // The backwards secondary level compares secondary weights backwards
134 : // within segments separated by the merge separator (U+FFFE, weight 02).
135 0 : int32_t leftStart = 0;
136 0 : int32_t rightStart = 0;
137 : for(;;) {
138 : // Find the merge separator or the NO_CE terminator.
139 : uint32_t p;
140 0 : int32_t leftLimit = leftStart;
141 0 : while((p = (uint32_t)(left.getCE(leftLimit) >> 32)) >
142 0 : Collation::MERGE_SEPARATOR_PRIMARY ||
143 : p == 0) {
144 0 : ++leftLimit;
145 : }
146 0 : int32_t rightLimit = rightStart;
147 0 : while((p = (uint32_t)(right.getCE(rightLimit) >> 32)) >
148 0 : Collation::MERGE_SEPARATOR_PRIMARY ||
149 : p == 0) {
150 0 : ++rightLimit;
151 : }
152 : // p now holds the primary of the CE that ended the right-hand segment; it is tested after the segment comparison.
153 : // Compare the segments.
154 0 : int32_t leftIndex = leftLimit;
155 0 : int32_t rightIndex = rightLimit;
156 : for(;;) {
157 0 : int32_t leftSecondary = 0;
158 0 : while(leftSecondary == 0 && leftIndex > leftStart) { // Walk backwards to the previous non-zero secondary, stopping at the segment start.
159 0 : leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16;
160 : }
161 :
162 0 : int32_t rightSecondary = 0;
163 0 : while(rightSecondary == 0 && rightIndex > rightStart) {
164 0 : rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16;
165 : }
166 :
167 0 : if(leftSecondary != rightSecondary) {
168 0 : return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
169 : }
170 0 : if(leftSecondary == 0) { break; } // Both segments exhausted (a zero here means the segment start was reached).
171 0 : }
172 :
173 : // Did we reach the end of either string?
174 : // Both strings have the same number of merge separators,
175 : // or else there would have been a primary-level difference.
176 0 : U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit));
177 0 : if(p == Collation::NO_CE_PRIMARY) { break; }
178 : // Skip both merge separators and continue.
179 0 : leftStart = leftLimit + 1;
180 0 : rightStart = rightLimit + 1;
181 0 : }
182 : }
183 : }
184 : // Case level: runs whenever CASE_LEVEL is set, independently of whether the secondary level ran above.
185 0 : if((options & CollationSettings::CASE_LEVEL) != 0) {
186 0 : int32_t strength = CollationSettings::getStrength(options);
187 0 : int32_t leftIndex = 0;
188 0 : int32_t rightIndex = 0;
189 : for(;;) {
190 : uint32_t leftCase, leftLower32, rightCase; // leftLower32 keeps the unmasked left value for the end-marker test below.
191 0 : if(strength == UCOL_PRIMARY) {
192 : // Primary+caseLevel: Ignore case level weights of primary ignorables.
193 : // Otherwise we would get a-umlaut > a
194 : // which is not desirable for accent-insensitive sorting.
195 : // Check for (lower 32 bits) == 0 as well because variable CEs are stored
196 : // with only primary weights.
197 : int64_t ce;
198 0 : do {
199 0 : ce = left.getCE(leftIndex++);
200 0 : leftCase = (uint32_t)ce;
201 0 : } while((uint32_t)(ce >> 32) == 0 || leftCase == 0);
202 0 : leftLower32 = leftCase;
203 0 : leftCase &= 0xc000; // Keep only bits 15..14, which this level compares as the case weight.
204 :
205 0 : do {
206 0 : ce = right.getCE(rightIndex++);
207 0 : rightCase = (uint32_t)ce;
208 0 : } while((uint32_t)(ce >> 32) == 0 || rightCase == 0);
209 0 : rightCase &= 0xc000;
210 : } else {
211 : // Secondary+caseLevel: By analogy with the above,
212 : // ignore case level weights of secondary ignorables.
213 : //
214 : // Note: A tertiary CE has uppercase case bits (0.0.ut)
215 : // to keep tertiary+caseFirst well-formed.
216 : //
217 : // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
218 : // Otherwise a tertiary CE's uppercase would be no greater than
219 : // a primary/secondary CE's uppercase.
220 : // (See UCA well-formedness condition 2.)
221 : // We could construct a special case weight higher than uppercase,
222 : // but it's simpler to always ignore case weights of secondary ignorables,
223 : // turning 0.0.ut into 0.0.0.t.
224 : // (See LDML Collation, Case Parameters.)
225 0 : do {
226 0 : leftCase = (uint32_t)left.getCE(leftIndex++);
227 0 : } while(leftCase <= 0xffff); // <= 0xffff means bits 31..16 (the secondary weight) are all zero.
228 0 : leftLower32 = leftCase;
229 0 : leftCase &= 0xc000;
230 :
231 0 : do {
232 0 : rightCase = (uint32_t)right.getCE(rightIndex++);
233 0 : } while(rightCase <= 0xffff);
234 0 : rightCase &= 0xc000;
235 : }
236 :
237 : // No need to handle NO_CE and MERGE_SEPARATOR specially:
238 : // There is one case weight for each previous-level weight,
239 : // so level length differences were handled there.
240 0 : if(leftCase != rightCase) {
241 0 : if((options & CollationSettings::UPPER_FIRST) == 0) {
242 0 : return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
243 : } else {
244 0 : return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; // UPPER_FIRST inverts the case-weight order.
245 : }
246 : }
247 0 : if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; } // Only the left side is tested; see the comment above on equal level lengths.
248 0 : }
249 : }
250 0 : if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
251 : // Tertiary level.
252 0 : uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options);
253 :
254 0 : int32_t leftIndex = 0;
255 0 : int32_t rightIndex = 0;
256 0 : uint32_t anyQuaternaries = 0; // OR of every lower-32-bit value read in this pass; its bits 7..6 are tested before the quaternary pass.
257 : for(;;) {
258 : uint32_t leftLower32, leftTertiary;
259 0 : do {
260 0 : leftLower32 = (uint32_t)left.getCE(leftIndex++);
261 0 : anyQuaternaries |= leftLower32;
262 0 : U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
263 : (leftLower32 & 0xc0c0) == 0);
264 0 : leftTertiary = leftLower32 & tertiaryMask;
265 0 : } while(leftTertiary == 0); // Skip CEs with no bits under tertiaryMask.
266 :
267 : uint32_t rightLower32, rightTertiary;
268 0 : do {
269 0 : rightLower32 = (uint32_t)right.getCE(rightIndex++);
270 0 : anyQuaternaries |= rightLower32;
271 0 : U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
272 : (rightLower32 & 0xc0c0) == 0);
273 0 : rightTertiary = rightLower32 & tertiaryMask;
274 0 : } while(rightTertiary == 0);
275 :
276 0 : if(leftTertiary != rightTertiary) {
277 0 : if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
278 : // Pass through NO_CE and keep real tertiary weights larger than that.
279 : // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
280 : // to keep tertiary CEs well-formed.
281 : // Their case+tertiary weights must be greater than those of
282 : // primary and secondary CEs.
283 0 : if(leftTertiary > Collation::NO_CE_WEIGHT16) {
284 0 : if(leftLower32 > 0xffff) { // Bits 31..16 are non-zero, i.e. not a tertiary CE.
285 0 : leftTertiary ^= 0xc000;
286 : } else {
287 0 : leftTertiary += 0x4000;
288 : }
289 : }
290 0 : if(rightTertiary > Collation::NO_CE_WEIGHT16) {
291 0 : if(rightLower32 > 0xffff) {
292 0 : rightTertiary ^= 0xc000;
293 : } else {
294 0 : rightTertiary += 0x4000;
295 : }
296 : }
297 : }
298 0 : return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
299 : }
300 0 : if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; }
301 0 : }
302 0 : if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
303 : // Quaternary level.
304 0 : if(!anyVariable && (anyQuaternaries & 0xc0) == 0) {
305 : // If there are no "variable" CEs and no non-zero quaternary weights,
306 : // then there are no quaternary differences.
307 0 : return UCOL_EQUAL;
308 : }
309 :
310 0 : leftIndex = 0;
311 0 : rightIndex = 0;
312 : for(;;) {
313 : uint32_t leftQuaternary;
314 0 : do {
315 0 : int64_t ce = left.getCE(leftIndex++);
316 0 : leftQuaternary = (uint32_t)ce & 0xffff;
317 0 : if(leftQuaternary <= Collation::NO_CE_WEIGHT16) {
318 : // Variable primary or completely ignorable or NO_CE.
319 0 : leftQuaternary = (uint32_t)(ce >> 32); // Use the primary (upper 32 bits) as the quaternary weight.
320 : } else {
321 : // Regular CE, not tertiary ignorable.
322 : // Preserve the quaternary weight in bits 7..6.
323 0 : leftQuaternary |= 0xffffff3f; // Sets every bit except 7..6, so the result is at least 0xffffff3f.
324 : }
325 0 : } while(leftQuaternary == 0); // Skip CEs that contribute nothing at this level.
326 :
327 : uint32_t rightQuaternary;
328 0 : do {
329 0 : int64_t ce = right.getCE(rightIndex++);
330 0 : rightQuaternary = (uint32_t)ce & 0xffff;
331 0 : if(rightQuaternary <= Collation::NO_CE_WEIGHT16) {
332 : // Variable primary or completely ignorable or NO_CE.
333 0 : rightQuaternary = (uint32_t)(ce >> 32);
334 : } else {
335 : // Regular CE, not tertiary ignorable.
336 : // Preserve the quaternary weight in bits 7..6.
337 0 : rightQuaternary |= 0xffffff3f;
338 : }
339 0 : } while(rightQuaternary == 0);
340 :
341 0 : if(leftQuaternary != rightQuaternary) {
342 : // Return the difference, with script reordering.
343 0 : if(settings.hasReordering()) {
344 0 : leftQuaternary = settings.reorder(leftQuaternary);
345 0 : rightQuaternary = settings.reorder(rightQuaternary);
346 : }
347 0 : return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
348 : }
349 0 : if(leftQuaternary == Collation::NO_CE_PRIMARY) { break; } // The end marker arrives via the primary branch above, so compare against NO_CE_PRIMARY.
350 0 : }
351 0 : return UCOL_EQUAL; // No difference on any examined level.
352 : }
353 :
354 : U_NAMESPACE_END
355 :
356 : #endif // !UCONFIG_NO_COLLATION
|