Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : * Copyright (C) 2010-2012, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : *******************************************************************************
8 : * file name: ucharstrie.h
9 : * encoding: UTF-8
10 : * tab size: 8 (not used)
11 : * indentation:4
12 : *
13 : * created on: 2010nov14
14 : * created by: Markus W. Scherer
15 : */
16 :
17 : #ifndef __UCHARSTRIE_H__
18 : #define __UCHARSTRIE_H__
19 :
20 : /**
21 : * \file
22 : * \brief C++ API: Trie for mapping Unicode strings (or 16-bit-unit sequences)
23 : * to integer values.
24 : */
25 :
26 : #include "unicode/utypes.h"
27 : #include "unicode/unistr.h"
28 : #include "unicode/uobject.h"
29 : #include "unicode/ustringtrie.h"
30 :
31 : U_NAMESPACE_BEGIN
32 :
33 : class Appendable;
34 : class UCharsTrieBuilder;
35 : class UVector32;
36 :
37 : /**
38 : * Light-weight, non-const reader class for a UCharsTrie.
39 : * Traverses a char16_t-serialized data structure with minimal state,
40 : * for mapping strings (16-bit-unit sequences) to non-negative integer values.
41 : *
42 : * This class owns the serialized trie data only if it was constructed by
43 : * the builder's build() method.
44 : * The public constructor and the copy constructor only alias the data (only copy the pointer).
45 : * There is no assignment operator.
46 : *
47 : * This class is not intended for public subclassing.
48 : * @stable ICU 4.8
49 : */
50 : class U_COMMON_API UCharsTrie : public UMemory {
51 : public:
52 : /**
53 : * Constructs a UCharsTrie reader instance.
54 : *
55 : * The trieUChars must contain a copy of a char16_t sequence from the UCharsTrieBuilder,
56 : * starting with the first char16_t of that sequence.
57 : * The UCharsTrie object will not read more char16_ts than
58 : * the UCharsTrieBuilder generated in the corresponding build() call.
59 : *
60 : * The array is not copied/cloned and must not be modified while
61 : * the UCharsTrie object is in use.
62 : *
63 : * @param trieUChars The char16_t array that contains the serialized trie.
64 : * @stable ICU 4.8
65 : */
66 0 : UCharsTrie(ConstChar16Ptr trieUChars)
67 0 : : ownedArray_(NULL), uchars_(trieUChars),
68 0 : pos_(uchars_), remainingMatchLength_(-1) {}
69 :
70 : /**
71 : * Destructor.
72 : * @stable ICU 4.8
73 : */
74 : ~UCharsTrie();
75 :
76 : /**
77 : * Copy constructor, copies the other trie reader object and its state,
78 : * but not the char16_t array which will be shared. (Shallow copy.)
79 : * @param other Another UCharsTrie object.
80 : * @stable ICU 4.8
81 : */
82 : UCharsTrie(const UCharsTrie &other)
83 : : ownedArray_(NULL), uchars_(other.uchars_),
84 : pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
85 :
86 : /**
87 : * Resets this trie to its initial state.
88 : * @return *this
89 : * @stable ICU 4.8
90 : */
91 0 : UCharsTrie &reset() {
92 0 : pos_=uchars_;
93 0 : remainingMatchLength_=-1;
94 0 : return *this;
95 : }
96 :
97 : /**
98 : * UCharsTrie state object, for saving a trie's current state
99 : * and resetting the trie back to this state later.
100 : * @stable ICU 4.8
101 : */
102 : class State : public UMemory {
103 : public:
104 : /**
105 : * Constructs an empty State.
106 : * @stable ICU 4.8
107 : */
108 0 : State() { uchars=NULL; }
109 : private:
110 : friend class UCharsTrie;
111 :
112 : const char16_t *uchars;
113 : const char16_t *pos;
114 : int32_t remainingMatchLength;
115 : };
116 :
117 : /**
118 : * Saves the state of this trie.
119 : * @param state The State object to hold the trie's state.
120 : * @return *this
121 : * @see resetToState
122 : * @stable ICU 4.8
123 : */
124 0 : const UCharsTrie &saveState(State &state) const {
125 0 : state.uchars=uchars_;
126 0 : state.pos=pos_;
127 0 : state.remainingMatchLength=remainingMatchLength_;
128 0 : return *this;
129 : }
130 :
131 : /**
132 : * Resets this trie to the saved state.
133 : * If the state object contains no state, or the state of a different trie,
134 : * then this trie remains unchanged.
135 : * @param state The State object which holds a saved trie state.
136 : * @return *this
137 : * @see saveState
138 : * @see reset
139 : * @stable ICU 4.8
140 : */
141 0 : UCharsTrie &resetToState(const State &state) {
142 0 : if(uchars_==state.uchars && uchars_!=NULL) {
143 0 : pos_=state.pos;
144 0 : remainingMatchLength_=state.remainingMatchLength;
145 : }
146 0 : return *this;
147 : }
148 :
149 : /**
150 : * Determines whether the string so far matches, whether it has a value,
151 : * and whether another input char16_t can continue a matching string.
152 : * @return The match/value Result.
153 : * @stable ICU 4.8
154 : */
155 : UStringTrieResult current() const;
156 :
157 : /**
158 : * Traverses the trie from the initial state for this input char16_t.
159 : * Equivalent to reset().next(uchar).
160 : * @param uchar Input char value. Values below 0 and above 0xffff will never match.
161 : * @return The match/value Result.
162 : * @stable ICU 4.8
163 : */
164 0 : inline UStringTrieResult first(int32_t uchar) {
165 0 : remainingMatchLength_=-1;
166 0 : return nextImpl(uchars_, uchar);
167 : }
168 :
169 : /**
170 : * Traverses the trie from the initial state for the
171 : * one or two UTF-16 code units for this input code point.
172 : * Equivalent to reset().nextForCodePoint(cp).
173 : * @param cp A Unicode code point 0..0x10ffff.
174 : * @return The match/value Result.
175 : * @stable ICU 4.8
176 : */
177 : UStringTrieResult firstForCodePoint(UChar32 cp);
178 :
179 : /**
180 : * Traverses the trie from the current state for this input char16_t.
181 : * @param uchar Input char value. Values below 0 and above 0xffff will never match.
182 : * @return The match/value Result.
183 : * @stable ICU 4.8
184 : */
185 : UStringTrieResult next(int32_t uchar);
186 :
187 : /**
188 : * Traverses the trie from the current state for the
189 : * one or two UTF-16 code units for this input code point.
190 : * @param cp A Unicode code point 0..0x10ffff.
191 : * @return The match/value Result.
192 : * @stable ICU 4.8
193 : */
194 : UStringTrieResult nextForCodePoint(UChar32 cp);
195 :
196 : /**
197 : * Traverses the trie from the current state for this string.
198 : * Equivalent to
199 : * \code
200 : * Result result=current();
201 : * for(each c in s)
202 : * if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH;
203 : * result=next(c);
204 : * return result;
205 : * \endcode
206 : * @param s A string. Can be NULL if length is 0.
207 : * @param length The length of the string. Can be -1 if NUL-terminated.
208 : * @return The match/value Result.
209 : * @stable ICU 4.8
210 : */
211 : UStringTrieResult next(ConstChar16Ptr s, int32_t length);
212 :
213 : /**
214 : * Returns a matching string's value if called immediately after
215 : * current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE.
216 : * getValue() can be called multiple times.
217 : *
218 : * Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE!
219 : * @return The value for the string so far.
220 : * @stable ICU 4.8
221 : */
222 0 : inline int32_t getValue() const {
223 0 : const char16_t *pos=pos_;
224 0 : int32_t leadUnit=*pos++;
225 : // U_ASSERT(leadUnit>=kMinValueLead);
226 0 : return leadUnit&kValueIsFinal ?
227 0 : readValue(pos, leadUnit&0x7fff) : readNodeValue(pos, leadUnit);
228 : }
229 :
230 : /**
231 : * Determines whether all strings reachable from the current state
232 : * map to the same value.
233 : * @param uniqueValue Receives the unique value, if this function returns TRUE.
234 : * (output-only)
235 : * @return TRUE if all strings reachable from the current state
236 : * map to the same value.
237 : * @stable ICU 4.8
238 : */
239 : inline UBool hasUniqueValue(int32_t &uniqueValue) const {
240 : const char16_t *pos=pos_;
241 : // Skip the rest of a pending linear-match node.
242 : return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
243 : }
244 :
245 : /**
246 : * Finds each char16_t which continues the string from the current state.
247 : * That is, each char16_t c for which it would be next(c)!=USTRINGTRIE_NO_MATCH now.
248 : * @param out Each next char16_t is appended to this object.
249 : * @return the number of char16_ts which continue the string from here
250 : * @stable ICU 4.8
251 : */
252 : int32_t getNextUChars(Appendable &out) const;
253 :
254 : /**
255 : * Iterator for all of the (string, value) pairs in a UCharsTrie.
256 : * @stable ICU 4.8
257 : */
258 : class U_COMMON_API Iterator : public UMemory {
259 : public:
260 : /**
261 : * Iterates from the root of a char16_t-serialized UCharsTrie.
262 : * @param trieUChars The trie char16_ts.
263 : * @param maxStringLength If 0, the iterator returns full strings.
264 : * Otherwise, the iterator returns strings with this maximum length.
265 : * @param errorCode Standard ICU error code. Its input value must
266 : * pass the U_SUCCESS() test, or else the function returns
267 : * immediately. Check for U_FAILURE() on output or use with
268 : * function chaining. (See User Guide for details.)
269 : * @stable ICU 4.8
270 : */
271 : Iterator(ConstChar16Ptr trieUChars, int32_t maxStringLength, UErrorCode &errorCode);
272 :
273 : /**
274 : * Iterates from the current state of the specified UCharsTrie.
275 : * @param trie The trie whose state will be copied for iteration.
276 : * @param maxStringLength If 0, the iterator returns full strings.
277 : * Otherwise, the iterator returns strings with this maximum length.
278 : * @param errorCode Standard ICU error code. Its input value must
279 : * pass the U_SUCCESS() test, or else the function returns
280 : * immediately. Check for U_FAILURE() on output or use with
281 : * function chaining. (See User Guide for details.)
282 : * @stable ICU 4.8
283 : */
284 : Iterator(const UCharsTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
285 :
286 : /**
287 : * Destructor.
288 : * @stable ICU 4.8
289 : */
290 : ~Iterator();
291 :
292 : /**
293 : * Resets this iterator to its initial state.
294 : * @return *this
295 : * @stable ICU 4.8
296 : */
297 : Iterator &reset();
298 :
299 : /**
300 : * @return TRUE if there are more elements.
301 : * @stable ICU 4.8
302 : */
303 : UBool hasNext() const;
304 :
305 : /**
306 : * Finds the next (string, value) pair if there is one.
307 : *
308 : * If the string is truncated to the maximum length and does not
309 : * have a real value, then the value is set to -1.
310 : * In this case, this "not a real value" is indistinguishable from
311 : * a real value of -1.
312 : * @param errorCode Standard ICU error code. Its input value must
313 : * pass the U_SUCCESS() test, or else the function returns
314 : * immediately. Check for U_FAILURE() on output or use with
315 : * function chaining. (See User Guide for details.)
316 : * @return TRUE if there is another element.
317 : * @stable ICU 4.8
318 : */
319 : UBool next(UErrorCode &errorCode);
320 :
321 : /**
322 : * @return The string for the last successful next().
323 : * @stable ICU 4.8
324 : */
325 0 : const UnicodeString &getString() const { return str_; }
326 : /**
327 : * @return The value for the last successful next().
328 : * @stable ICU 4.8
329 : */
330 0 : int32_t getValue() const { return value_; }
331 :
332 : private:
333 0 : UBool truncateAndStop() {
334 0 : pos_=NULL;
335 0 : value_=-1; // no real value for str
336 0 : return TRUE;
337 : }
338 :
339 : const char16_t *branchNext(const char16_t *pos, int32_t length, UErrorCode &errorCode);
340 :
341 : const char16_t *uchars_;
342 : const char16_t *pos_;
343 : const char16_t *initialPos_;
344 : int32_t remainingMatchLength_;
345 : int32_t initialRemainingMatchLength_;
346 : UBool skipValue_; // Skip intermediate value which was already delivered.
347 :
348 : UnicodeString str_;
349 : int32_t maxLength_;
350 : int32_t value_;
351 :
352 : // The stack stores pairs of integers for backtracking to another
353 : // outbound edge of a branch node.
354 : // The first integer is an offset from uchars_.
355 : // The second integer has the str_.length() from before the node in bits 15..0,
356 : // and the remaining branch length in bits 31..16.
357 : // (We could store the remaining branch length minus 1 in bits 30..16 and not use the sign bit,
358 : // but the code looks more confusing that way.)
359 : UVector32 *stack_;
360 : };
361 :
362 : private:
363 : friend class UCharsTrieBuilder;
364 :
365 : /**
366 : * Constructs a UCharsTrie reader instance.
367 : * Unlike the public constructor which just aliases an array,
368 : * this constructor adopts the builder's array.
369 : * This constructor is only called by the builder.
370 : */
371 0 : UCharsTrie(char16_t *adoptUChars, const char16_t *trieUChars)
372 0 : : ownedArray_(adoptUChars), uchars_(trieUChars),
373 0 : pos_(uchars_), remainingMatchLength_(-1) {}
374 :
375 : // No assignment operator.
376 : UCharsTrie &operator=(const UCharsTrie &other);
377 :
378 0 : inline void stop() {
379 0 : pos_=NULL;
380 0 : }
381 :
382 : // Reads a compact 32-bit integer.
383 : // pos is already after the leadUnit, and the lead unit has bit 15 reset.
384 0 : static inline int32_t readValue(const char16_t *pos, int32_t leadUnit) {
385 : int32_t value;
386 0 : if(leadUnit<kMinTwoUnitValueLead) {
387 0 : value=leadUnit;
388 0 : } else if(leadUnit<kThreeUnitValueLead) {
389 0 : value=((leadUnit-kMinTwoUnitValueLead)<<16)|*pos;
390 : } else {
391 0 : value=(pos[0]<<16)|pos[1];
392 : }
393 0 : return value;
394 : }
395 0 : static inline const char16_t *skipValue(const char16_t *pos, int32_t leadUnit) {
396 0 : if(leadUnit>=kMinTwoUnitValueLead) {
397 0 : if(leadUnit<kThreeUnitValueLead) {
398 0 : ++pos;
399 : } else {
400 0 : pos+=2;
401 : }
402 : }
403 0 : return pos;
404 : }
405 0 : static inline const char16_t *skipValue(const char16_t *pos) {
406 0 : int32_t leadUnit=*pos++;
407 0 : return skipValue(pos, leadUnit&0x7fff);
408 : }
409 :
410 0 : static inline int32_t readNodeValue(const char16_t *pos, int32_t leadUnit) {
411 : // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
412 : int32_t value;
413 0 : if(leadUnit<kMinTwoUnitNodeValueLead) {
414 0 : value=(leadUnit>>6)-1;
415 0 : } else if(leadUnit<kThreeUnitNodeValueLead) {
416 0 : value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|*pos;
417 : } else {
418 0 : value=(pos[0]<<16)|pos[1];
419 : }
420 0 : return value;
421 : }
422 0 : static inline const char16_t *skipNodeValue(const char16_t *pos, int32_t leadUnit) {
423 : // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
424 0 : if(leadUnit>=kMinTwoUnitNodeValueLead) {
425 0 : if(leadUnit<kThreeUnitNodeValueLead) {
426 0 : ++pos;
427 : } else {
428 0 : pos+=2;
429 : }
430 : }
431 0 : return pos;
432 : }
433 :
434 0 : static inline const char16_t *jumpByDelta(const char16_t *pos) {
435 0 : int32_t delta=*pos++;
436 0 : if(delta>=kMinTwoUnitDeltaLead) {
437 0 : if(delta==kThreeUnitDeltaLead) {
438 0 : delta=(pos[0]<<16)|pos[1];
439 0 : pos+=2;
440 : } else {
441 0 : delta=((delta-kMinTwoUnitDeltaLead)<<16)|*pos++;
442 : }
443 : }
444 0 : return pos+delta;
445 : }
446 :
447 0 : static const char16_t *skipDelta(const char16_t *pos) {
448 0 : int32_t delta=*pos++;
449 0 : if(delta>=kMinTwoUnitDeltaLead) {
450 0 : if(delta==kThreeUnitDeltaLead) {
451 0 : pos+=2;
452 : } else {
453 0 : ++pos;
454 : }
455 : }
456 0 : return pos;
457 : }
458 :
459 0 : static inline UStringTrieResult valueResult(int32_t node) {
460 0 : return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node>>15));
461 : }
462 :
463 : // Handles a branch node for both next(uchar) and next(string).
464 : UStringTrieResult branchNext(const char16_t *pos, int32_t length, int32_t uchar);
465 :
466 : // Requires remainingLength_<0.
467 : UStringTrieResult nextImpl(const char16_t *pos, int32_t uchar);
468 :
469 : // Helper functions for hasUniqueValue().
470 : // Recursively finds a unique value (or whether there is not a unique one)
471 : // from a branch.
472 : static const char16_t *findUniqueValueFromBranch(const char16_t *pos, int32_t length,
473 : UBool haveUniqueValue, int32_t &uniqueValue);
474 : // Recursively finds a unique value (or whether there is not a unique one)
475 : // starting from a position on a node lead unit.
476 : static UBool findUniqueValue(const char16_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
477 :
478 : // Helper functions for getNextUChars().
479 : // getNextUChars() when pos is on a branch node.
480 : static void getNextBranchUChars(const char16_t *pos, int32_t length, Appendable &out);
481 :
482 : // UCharsTrie data structure
483 : //
484 : // The trie consists of a series of char16_t-serialized nodes for incremental
485 : // Unicode string/char16_t sequence matching. (char16_t=16-bit unsigned integer)
486 : // The root node is at the beginning of the trie data.
487 : //
488 : // Types of nodes are distinguished by their node lead unit ranges.
489 : // After each node, except a final-value node, another node follows to
490 : // encode match values or continue matching further units.
491 : //
492 : // Node types:
493 : // - Final-value node: Stores a 32-bit integer in a compact, variable-length format.
494 : // The value is for the string/char16_t sequence so far.
495 : // - Match node, optionally with an intermediate value in a different compact format.
496 : // The value, if present, is for the string/char16_t sequence so far.
497 : //
498 : // Aside from the value, which uses the node lead unit's high bits:
499 : //
500 : // - Linear-match node: Matches a number of units.
501 : // - Branch node: Branches to other nodes according to the current input unit.
502 : // The node unit is the length of the branch (number of units to select from)
503 : // minus 1. It is followed by a sub-node:
504 : // - If the length is at most kMaxBranchLinearSubNodeLength, then
505 : // there are length-1 (key, value) pairs and then one more comparison unit.
506 : // If one of the key units matches, then the value is either a final value for
507 : // the string so far, or a "jump" delta to the next node.
508 : // If the last unit matches, then matching continues with the next node.
509 : // (Values have the same encoding as final-value nodes.)
510 : // - If the length is greater than kMaxBranchLinearSubNodeLength, then
511 : // there is one unit and one "jump" delta.
512 : // If the input unit is less than the sub-node unit, then "jump" by delta to
513 : // the next sub-node which will have a length of length/2.
514 : // (The delta has its own compact encoding.)
515 : // Otherwise, skip the "jump" delta to the next sub-node
516 : // which will have a length of length-length/2.
517 :
518 : // Match-node lead unit values, after masking off intermediate-value bits:
519 :
520 : // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
521 : // the length is one more than the next unit.
522 :
523 : // For a branch sub-node with at most this many entries, we drop down
524 : // to a linear search.
525 : static const int32_t kMaxBranchLinearSubNodeLength=5;
526 :
527 : // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
528 : static const int32_t kMinLinearMatch=0x30;
529 : static const int32_t kMaxLinearMatchLength=0x10;
530 :
531 : // Match-node lead unit bits 14..6 for the optional intermediate value.
532 : // If these bits are 0, then there is no intermediate value.
533 : // Otherwise, see the *NodeValue* constants below.
534 : static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x0040
535 : static const int32_t kNodeTypeMask=kMinValueLead-1; // 0x003f
536 :
537 : // A final-value node has bit 15 set.
538 : static const int32_t kValueIsFinal=0x8000;
539 :
540 : // Compact value: After testing and masking off bit 15, use the following thresholds.
541 : static const int32_t kMaxOneUnitValue=0x3fff;
542 :
543 : static const int32_t kMinTwoUnitValueLead=kMaxOneUnitValue+1; // 0x4000
544 : static const int32_t kThreeUnitValueLead=0x7fff;
545 :
546 : static const int32_t kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1; // 0x3ffeffff
547 :
548 : // Compact intermediate-value integer, lead unit shared with a branch or linear-match node.
549 : static const int32_t kMaxOneUnitNodeValue=0xff;
550 : static const int32_t kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6); // 0x4040
551 : static const int32_t kThreeUnitNodeValueLead=0x7fc0;
552 :
553 : static const int32_t kMaxTwoUnitNodeValue=
554 : ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1; // 0xfdffff
555 :
556 : // Compact delta integers.
557 : static const int32_t kMaxOneUnitDelta=0xfbff;
558 : static const int32_t kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1; // 0xfc00
559 : static const int32_t kThreeUnitDeltaLead=0xffff;
560 :
561 : static const int32_t kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff
562 :
563 : char16_t *ownedArray_;
564 :
565 : // Fixed value referencing the UCharsTrie words.
566 : const char16_t *uchars_;
567 :
568 : // Iterator variables.
569 :
570 : // Pointer to next trie unit to read. NULL if no more matches.
571 : const char16_t *pos_;
572 : // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
573 : int32_t remainingMatchLength_;
574 : };
575 :
576 : U_NAMESPACE_END
577 :
578 : #endif // __UCHARSTRIE_H__
|