Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : * Copyright (C) 1996-2014, International Business Machines Corporation and
6 : * others. All Rights Reserved.
7 : *******************************************************************************
8 : */
9 :
10 : /*
11 : * File coleitr.cpp
12 : *
13 : * Created by: Helena Shih
14 : *
15 : * Modification History:
16 : *
17 : * Date Name Description
18 : *
19 : * 6/23/97 helena Adding comments to make code more readable.
20 : * 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java
21 : * 12/10/99 aliu Ported Thai collation support from Java.
22 : * 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h)
23 : * 02/19/01 swquek Removed CollationElementIterator() since it is
24 : * private constructor and no calls are made to it
25 : * 2012-2014 markus Rewritten in C++ again.
26 : */
27 :
28 : #include "unicode/utypes.h"
29 :
30 : #if !UCONFIG_NO_COLLATION
31 :
32 : #include "unicode/chariter.h"
33 : #include "unicode/coleitr.h"
34 : #include "unicode/tblcoll.h"
35 : #include "unicode/ustring.h"
36 : #include "cmemory.h"
37 : #include "collation.h"
38 : #include "collationdata.h"
39 : #include "collationiterator.h"
40 : #include "collationsets.h"
41 : #include "collationtailoring.h"
42 : #include "uassert.h"
43 : #include "uhash.h"
44 : #include "utf16collationiterator.h"
45 : #include "uvectr32.h"
46 :
47 : /* Constants --------------------------------------------------------------- */
48 :
49 : U_NAMESPACE_BEGIN
50 :
51 0 : UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator)
52 :
53 : /* CollationElementIterator public constructor/destructor ------------------ */
54 :
55 0 : CollationElementIterator::CollationElementIterator(
56 0 : const CollationElementIterator& other)
57 0 : : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offsets_(NULL) {
58 0 : *this = other;
59 0 : }
60 :
61 0 : CollationElementIterator::~CollationElementIterator()
62 : {
63 0 : delete iter_;
64 0 : delete offsets_;
65 0 : }
66 :
67 : /* CollationElementIterator public methods --------------------------------- */
68 :
69 : namespace {
70 :
/**
 * Builds the first old-style 32-bit CE from the two 32-bit halves of a 64-bit CE.
 * @param p       upper 32 bits of the 64-bit CE
 * @param lower32 lower 32 bits of the 64-bit CE
 * @return bits 31..16 of p, followed by bits 31..24 of lower32 (as bits 15..8)
 *         and bits 15..8 of lower32 (as bits 7..0)
 */
uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
    const uint32_t upperBits = p & 0xffff0000;
    const uint32_t middleByte = (lower32 >> 16) & 0xff00;
    const uint32_t lowByte = (lower32 >> 8) & 0xff;
    return upperBits | middleByte | lowByte;
}
/**
 * Builds the second old-style 32-bit CE from the two 32-bit halves of a 64-bit CE.
 * @param p       upper 32 bits of the 64-bit CE
 * @param lower32 lower 32 bits of the 64-bit CE
 * @return bits 15..0 of p (as bits 31..16), followed by bits 23..16 of lower32
 *         (as bits 15..8) and bits 5..0 of lower32
 */
uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
    const uint32_t upperBits = p << 16;
    const uint32_t middleByte = (lower32 >> 8) & 0xff00;
    const uint32_t lowBits = lower32 & 0x3f;
    return upperBits | middleByte | lowBits;
}
77 0 : UBool ceNeedsTwoParts(int64_t ce) {
78 0 : return (ce & INT64_C(0xffff00ff003f)) != 0;
79 : }
80 :
81 : } // namespace
82 :
83 0 : int32_t CollationElementIterator::getOffset() const
84 : {
85 0 : if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) {
86 : // CollationIterator::previousCE() decrements the CEs length
87 : // while it pops CEs from its internal buffer.
88 0 : int32_t i = iter_->getCEsLength();
89 0 : if (otherHalf_ != 0) {
90 : // Return the trailing CE offset while we are in the middle of a 64-bit CE.
91 0 : ++i;
92 : }
93 0 : U_ASSERT(i < offsets_->size());
94 0 : return offsets_->elementAti(i);
95 : }
96 0 : return iter_->getOffset();
97 : }
98 :
99 : /**
100 : * Get the ordering priority of the next character in the string.
101 : * @return the next character's ordering. Returns NULLORDER if an error has
102 : * occured or if the end of string has been reached
103 : */
104 0 : int32_t CollationElementIterator::next(UErrorCode& status)
105 : {
106 0 : if (U_FAILURE(status)) { return NULLORDER; }
107 0 : if (dir_ > 1) {
108 : // Continue forward iteration. Test this first.
109 0 : if (otherHalf_ != 0) {
110 0 : uint32_t oh = otherHalf_;
111 0 : otherHalf_ = 0;
112 0 : return oh;
113 : }
114 0 : } else if (dir_ == 1) {
115 : // next() after setOffset()
116 0 : dir_ = 2;
117 0 : } else if (dir_ == 0) {
118 : // The iter_ is already reset to the start of the text.
119 0 : dir_ = 2;
120 : } else /* dir_ < 0 */ {
121 : // illegal change of direction
122 0 : status = U_INVALID_STATE_ERROR;
123 0 : return NULLORDER;
124 : }
125 : // No need to keep all CEs in the buffer when we iterate.
126 0 : iter_->clearCEsIfNoneRemaining();
127 0 : int64_t ce = iter_->nextCE(status);
128 0 : if (ce == Collation::NO_CE) { return NULLORDER; }
129 : // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
130 0 : uint32_t p = (uint32_t)(ce >> 32);
131 0 : uint32_t lower32 = (uint32_t)ce;
132 0 : uint32_t firstHalf = getFirstHalf(p, lower32);
133 0 : uint32_t secondHalf = getSecondHalf(p, lower32);
134 0 : if (secondHalf != 0) {
135 0 : otherHalf_ = secondHalf | 0xc0; // continuation CE
136 : }
137 0 : return firstHalf;
138 : }
139 :
140 0 : UBool CollationElementIterator::operator!=(
141 : const CollationElementIterator& other) const
142 : {
143 0 : return !(*this == other);
144 : }
145 :
146 0 : UBool CollationElementIterator::operator==(
147 : const CollationElementIterator& that) const
148 : {
149 0 : if (this == &that) {
150 0 : return TRUE;
151 : }
152 :
153 : return
154 0 : (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) &&
155 0 : otherHalf_ == that.otherHalf_ &&
156 0 : normalizeDir() == that.normalizeDir() &&
157 0 : string_ == that.string_ &&
158 0 : *iter_ == *that.iter_;
159 : }
160 :
161 : /**
162 : * Get the ordering priority of the previous collation element in the string.
163 : * @param status the error code status.
164 : * @return the previous element's ordering. Returns NULLORDER if an error has
165 : * occured or if the start of string has been reached.
166 : */
167 0 : int32_t CollationElementIterator::previous(UErrorCode& status)
168 : {
169 0 : if (U_FAILURE(status)) { return NULLORDER; }
170 0 : if (dir_ < 0) {
171 : // Continue backwards iteration. Test this first.
172 0 : if (otherHalf_ != 0) {
173 0 : uint32_t oh = otherHalf_;
174 0 : otherHalf_ = 0;
175 0 : return oh;
176 : }
177 0 : } else if (dir_ == 0) {
178 0 : iter_->resetToOffset(string_.length());
179 0 : dir_ = -1;
180 0 : } else if (dir_ == 1) {
181 : // previous() after setOffset()
182 0 : dir_ = -1;
183 : } else /* dir_ > 1 */ {
184 : // illegal change of direction
185 0 : status = U_INVALID_STATE_ERROR;
186 0 : return NULLORDER;
187 : }
188 0 : if (offsets_ == NULL) {
189 0 : offsets_ = new UVector32(status);
190 0 : if (offsets_ == NULL) {
191 0 : status = U_MEMORY_ALLOCATION_ERROR;
192 0 : return NULLORDER;
193 : }
194 : }
195 : // If we already have expansion CEs, then we also have offsets.
196 : // Otherwise remember the trailing offset in case we need to
197 : // write offsets for an artificial expansion.
198 0 : int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0;
199 0 : int64_t ce = iter_->previousCE(*offsets_, status);
200 0 : if (ce == Collation::NO_CE) { return NULLORDER; }
201 : // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
202 0 : uint32_t p = (uint32_t)(ce >> 32);
203 0 : uint32_t lower32 = (uint32_t)ce;
204 0 : uint32_t firstHalf = getFirstHalf(p, lower32);
205 0 : uint32_t secondHalf = getSecondHalf(p, lower32);
206 0 : if (secondHalf != 0) {
207 0 : if (offsets_->isEmpty()) {
208 : // When we convert a single 64-bit CE into two 32-bit CEs,
209 : // we need to make this artificial expansion behave like a normal expansion.
210 : // See CollationIterator::previousCE().
211 0 : offsets_->addElement(iter_->getOffset(), status);
212 0 : offsets_->addElement(limitOffset, status);
213 : }
214 0 : otherHalf_ = firstHalf;
215 0 : return secondHalf | 0xc0; // continuation CE
216 : }
217 0 : return firstHalf;
218 : }
219 :
220 : /**
221 : * Resets the cursor to the beginning of the string.
222 : */
223 0 : void CollationElementIterator::reset()
224 : {
225 0 : iter_ ->resetToOffset(0);
226 0 : otherHalf_ = 0;
227 0 : dir_ = 0;
228 0 : }
229 :
230 0 : void CollationElementIterator::setOffset(int32_t newOffset,
231 : UErrorCode& status)
232 : {
233 0 : if (U_FAILURE(status)) { return; }
234 0 : if (0 < newOffset && newOffset < string_.length()) {
235 0 : int32_t offset = newOffset;
236 0 : do {
237 0 : UChar c = string_.charAt(offset);
238 0 : if (!rbc_->isUnsafe(c) ||
239 0 : (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) {
240 0 : break;
241 : }
242 : // Back up to before this unsafe character.
243 0 : --offset;
244 0 : } while (offset > 0);
245 0 : if (offset < newOffset) {
246 : // We might have backed up more than necessary.
247 : // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
248 : // but for text "chu" setOffset(2) should remain at 2
249 : // although we initially back up to offset 0.
250 : // Find the last safe offset no greater than newOffset by iterating forward.
251 0 : int32_t lastSafeOffset = offset;
252 0 : do {
253 0 : iter_->resetToOffset(lastSafeOffset);
254 0 : do {
255 0 : iter_->nextCE(status);
256 0 : if (U_FAILURE(status)) { return; }
257 0 : } while ((offset = iter_->getOffset()) == lastSafeOffset);
258 0 : if (offset <= newOffset) {
259 0 : lastSafeOffset = offset;
260 : }
261 0 : } while (offset < newOffset);
262 0 : newOffset = lastSafeOffset;
263 : }
264 : }
265 0 : iter_->resetToOffset(newOffset);
266 0 : otherHalf_ = 0;
267 0 : dir_ = 1;
268 : }
269 :
270 : /**
271 : * Sets the source to the new source string.
272 : */
273 0 : void CollationElementIterator::setText(const UnicodeString& source,
274 : UErrorCode& status)
275 : {
276 0 : if (U_FAILURE(status)) {
277 0 : return;
278 : }
279 :
280 0 : string_ = source;
281 0 : const UChar *s = string_.getBuffer();
282 : CollationIterator *newIter;
283 0 : UBool numeric = rbc_->settings->isNumeric();
284 0 : if (rbc_->settings->dontCheckFCD()) {
285 0 : newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
286 : } else {
287 0 : newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
288 : }
289 0 : if (newIter == NULL) {
290 0 : status = U_MEMORY_ALLOCATION_ERROR;
291 0 : return;
292 : }
293 0 : delete iter_;
294 0 : iter_ = newIter;
295 0 : otherHalf_ = 0;
296 0 : dir_ = 0;
297 : }
298 :
299 : // Sets the source to the new character iterator.
300 0 : void CollationElementIterator::setText(CharacterIterator& source,
301 : UErrorCode& status)
302 : {
303 0 : if (U_FAILURE(status))
304 0 : return;
305 :
306 0 : source.getText(string_);
307 0 : setText(string_, status);
308 : }
309 :
310 0 : int32_t CollationElementIterator::strengthOrder(int32_t order) const
311 : {
312 0 : UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength();
313 : // Mask off the unwanted differences.
314 0 : if (s == UCOL_PRIMARY) {
315 0 : order &= 0xffff0000;
316 : }
317 0 : else if (s == UCOL_SECONDARY) {
318 0 : order &= 0xffffff00;
319 : }
320 :
321 0 : return order;
322 : }
323 :
324 : /* CollationElementIterator private constructors/destructors --------------- */
325 :
326 : /**
327 : * This is the "real" constructor for this class; it constructs an iterator
328 : * over the source text using the specified collator
329 : */
330 0 : CollationElementIterator::CollationElementIterator(
331 : const UnicodeString &source,
332 : const RuleBasedCollator *coll,
333 0 : UErrorCode &status)
334 0 : : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
335 0 : setText(source, status);
336 0 : }
337 :
338 : /**
339 : * This is the "real" constructor for this class; it constructs an iterator over
340 : * the source text using the specified collator
341 : */
342 0 : CollationElementIterator::CollationElementIterator(
343 : const CharacterIterator &source,
344 : const RuleBasedCollator *coll,
345 0 : UErrorCode &status)
346 0 : : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
347 : // We only call source.getText() which should be const anyway.
348 0 : setText(const_cast<CharacterIterator &>(source), status);
349 0 : }
350 :
351 : /* CollationElementIterator private methods -------------------------------- */
352 :
353 0 : const CollationElementIterator& CollationElementIterator::operator=(
354 : const CollationElementIterator& other)
355 : {
356 0 : if (this == &other) {
357 0 : return *this;
358 : }
359 :
360 : CollationIterator *newIter;
361 : const FCDUTF16CollationIterator *otherFCDIter =
362 0 : dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_);
363 0 : if(otherFCDIter != NULL) {
364 0 : newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer());
365 : } else {
366 : const UTF16CollationIterator *otherIter =
367 0 : dynamic_cast<const UTF16CollationIterator *>(other.iter_);
368 0 : if(otherIter != NULL) {
369 0 : newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer());
370 : } else {
371 0 : newIter = NULL;
372 : }
373 : }
374 0 : if(newIter != NULL) {
375 0 : delete iter_;
376 0 : iter_ = newIter;
377 0 : rbc_ = other.rbc_;
378 0 : otherHalf_ = other.otherHalf_;
379 0 : dir_ = other.dir_;
380 :
381 0 : string_ = other.string_;
382 : }
383 0 : if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) {
384 0 : UErrorCode errorCode = U_ZERO_ERROR;
385 0 : if(offsets_ == NULL) {
386 0 : offsets_ = new UVector32(other.offsets_->size(), errorCode);
387 : }
388 0 : if(offsets_ != NULL) {
389 0 : offsets_->assign(*other.offsets_, errorCode);
390 : }
391 : }
392 0 : return *this;
393 : }
394 :
395 : namespace {
396 :
397 : class MaxExpSink : public ContractionsAndExpansions::CESink {
398 : public:
399 0 : MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {}
400 : virtual ~MaxExpSink();
401 0 : virtual void handleCE(int64_t /*ce*/) {}
402 0 : virtual void handleExpansion(const int64_t ces[], int32_t length) {
403 0 : if (length <= 1) {
404 : // We do not need to add single CEs into the map.
405 0 : return;
406 : }
407 0 : int32_t count = 0; // number of CE "halves"
408 0 : for (int32_t i = 0; i < length; ++i) {
409 0 : count += ceNeedsTwoParts(ces[i]) ? 2 : 1;
410 : }
411 : // last "half" of the last CE
412 0 : int64_t ce = ces[length - 1];
413 0 : uint32_t p = (uint32_t)(ce >> 32);
414 0 : uint32_t lower32 = (uint32_t)ce;
415 0 : uint32_t lastHalf = getSecondHalf(p, lower32);
416 0 : if (lastHalf == 0) {
417 0 : lastHalf = getFirstHalf(p, lower32);
418 0 : U_ASSERT(lastHalf != 0);
419 : } else {
420 0 : lastHalf |= 0xc0; // old-style continuation CE
421 : }
422 0 : if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) {
423 0 : uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode);
424 : }
425 : }
426 :
427 : private:
428 : UHashtable *maxExpansions;
429 : UErrorCode &errorCode;
430 : };
431 :
432 0 : MaxExpSink::~MaxExpSink() {}
433 :
434 : } // namespace
435 :
436 : UHashtable *
437 0 : CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) {
438 0 : if (U_FAILURE(errorCode)) { return NULL; }
439 : UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong,
440 0 : uhash_compareLong, &errorCode);
441 0 : if (U_FAILURE(errorCode)) { return NULL; }
442 0 : MaxExpSink sink(maxExpansions, errorCode);
443 0 : ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode);
444 0 : if (U_FAILURE(errorCode)) {
445 0 : uhash_close(maxExpansions);
446 0 : return NULL;
447 : }
448 0 : return maxExpansions;
449 : }
450 :
451 : int32_t
452 0 : CollationElementIterator::getMaxExpansion(int32_t order) const {
453 0 : return getMaxExpansion(rbc_->tailoring->maxExpansions, order);
454 : }
455 :
456 : int32_t
457 0 : CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) {
458 0 : if (order == 0) { return 1; }
459 : int32_t max;
460 0 : if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) {
461 0 : return max;
462 : }
463 0 : if ((order & 0xc0) == 0xc0) {
464 : // old-style continuation CE
465 0 : return 2;
466 : } else {
467 0 : return 1;
468 : }
469 : }
470 :
471 : U_NAMESPACE_END
472 :
473 : #endif /* #if !UCONFIG_NO_COLLATION */
|