Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : * Copyright (C) 2010-2012, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : *******************************************************************************
8 : * file name: bytestrie.h
9 : * encoding: UTF-8
10 : * tab size: 8 (not used)
11 : * indentation:4
12 : *
13 : * created on: 2010sep25
14 : * created by: Markus W. Scherer
15 : */
16 :
17 : #ifndef __BYTESTRIE_H__
18 : #define __BYTESTRIE_H__
19 :
20 : /**
21 : * \file
22 : * \brief C++ API: Trie for mapping byte sequences to integer values.
23 : */
24 :
25 : #include "unicode/utypes.h"
26 : #include "unicode/stringpiece.h"
27 : #include "unicode/uobject.h"
28 : #include "unicode/ustringtrie.h"
29 :
30 : U_NAMESPACE_BEGIN
31 :
32 : class ByteSink;
33 : class BytesTrieBuilder;
34 : class CharString;
35 : class UVector32;
36 :
37 : /**
38 : * Light-weight, non-const reader class for a BytesTrie.
39 : * Traverses a byte-serialized data structure with minimal state,
40 : * for mapping byte sequences to non-negative integer values.
41 : *
42 : * This class owns the serialized trie data only if it was constructed by
43 : * the builder's build() method.
44 : * The public constructor and the copy constructor only alias the data (only copy the pointer).
45 : * There is no assignment operator.
46 : *
47 : * This class is not intended for public subclassing.
48 : * @stable ICU 4.8
49 : */
50 : class U_COMMON_API BytesTrie : public UMemory {
51 : public:
52 : /**
53 : * Constructs a BytesTrie reader instance.
54 : *
55 : * The trieBytes must contain a copy of a byte sequence from the BytesTrieBuilder,
56 : * starting with the first byte of that sequence.
57 : * The BytesTrie object will not read more bytes than
58 : * the BytesTrieBuilder generated in the corresponding build() call.
59 : *
60 : * The array is not copied/cloned and must not be modified while
61 : * the BytesTrie object is in use.
62 : *
63 : * @param trieBytes The byte array that contains the serialized trie.
64 : * @stable ICU 4.8
65 : */
66 0 : BytesTrie(const void *trieBytes)
67 0 : : ownedArray_(NULL), bytes_(static_cast<const uint8_t *>(trieBytes)),
68 0 : pos_(bytes_), remainingMatchLength_(-1) {}
69 :
70 : /**
71 : * Destructor.
72 : * @stable ICU 4.8
73 : */
74 : ~BytesTrie();
75 :
76 : /**
77 : * Copy constructor, copies the other trie reader object and its state,
78 : * but not the byte array which will be shared. (Shallow copy.)
79 : * @param other Another BytesTrie object.
80 : * @stable ICU 4.8
81 : */
82 : BytesTrie(const BytesTrie &other)
83 : : ownedArray_(NULL), bytes_(other.bytes_),
84 : pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
85 :
86 : /**
87 : * Resets this trie to its initial state.
88 : * @return *this
89 : * @stable ICU 4.8
90 : */
91 : BytesTrie &reset() {
92 : pos_=bytes_;
93 : remainingMatchLength_=-1;
94 : return *this;
95 : }
96 :
97 : /**
98 : * BytesTrie state object, for saving a trie's current state
99 : * and resetting the trie back to this state later.
100 : * @stable ICU 4.8
101 : */
102 : class State : public UMemory {
103 : public:
104 : /**
105 : * Constructs an empty State.
106 : * @stable ICU 4.8
107 : */
108 : State() { bytes=NULL; }
109 : private:
110 : friend class BytesTrie;
111 :
112 : const uint8_t *bytes;
113 : const uint8_t *pos;
114 : int32_t remainingMatchLength;
115 : };
116 :
117 : /**
118 : * Saves the state of this trie.
119 : * @param state The State object to hold the trie's state.
120 : * @return *this
121 : * @see resetToState
122 : * @stable ICU 4.8
123 : */
124 : const BytesTrie &saveState(State &state) const {
125 : state.bytes=bytes_;
126 : state.pos=pos_;
127 : state.remainingMatchLength=remainingMatchLength_;
128 : return *this;
129 : }
130 :
131 : /**
132 : * Resets this trie to the saved state.
133 : * If the state object contains no state, or the state of a different trie,
134 : * then this trie remains unchanged.
135 : * @param state The State object which holds a saved trie state.
136 : * @return *this
137 : * @see saveState
138 : * @see reset
139 : * @stable ICU 4.8
140 : */
141 : BytesTrie &resetToState(const State &state) {
142 : if(bytes_==state.bytes && bytes_!=NULL) {
143 : pos_=state.pos;
144 : remainingMatchLength_=state.remainingMatchLength;
145 : }
146 : return *this;
147 : }
148 :
149 : /**
150 : * Determines whether the byte sequence so far matches, whether it has a value,
151 : * and whether another input byte can continue a matching byte sequence.
152 : * @return The match/value Result.
153 : * @stable ICU 4.8
154 : */
155 : UStringTrieResult current() const;
156 :
157 : /**
158 : * Traverses the trie from the initial state for this input byte.
159 : * Equivalent to reset().next(inByte).
160 : * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
161 : * Values below -0x100 and above 0xff will never match.
162 : * @return The match/value Result.
163 : * @stable ICU 4.8
164 : */
165 : inline UStringTrieResult first(int32_t inByte) {
166 : remainingMatchLength_=-1;
167 : if(inByte<0) {
168 : inByte+=0x100;
169 : }
170 : return nextImpl(bytes_, inByte);
171 : }
172 :
173 : /**
174 : * Traverses the trie from the current state for this input byte.
175 : * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
176 : * Values below -0x100 and above 0xff will never match.
177 : * @return The match/value Result.
178 : * @stable ICU 4.8
179 : */
180 : UStringTrieResult next(int32_t inByte);
181 :
182 : /**
183 : * Traverses the trie from the current state for this byte sequence.
184 : * Equivalent to
185 : * \code
186 : * Result result=current();
187 : * for(each c in s)
188 : * if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH;
189 : * result=next(c);
190 : * return result;
191 : * \endcode
192 : * @param s A string or byte sequence. Can be NULL if length is 0.
193 : * @param length The length of the byte sequence. Can be -1 if NUL-terminated.
194 : * @return The match/value Result.
195 : * @stable ICU 4.8
196 : */
197 : UStringTrieResult next(const char *s, int32_t length);
198 :
199 : /**
200 : * Returns a matching byte sequence's value if called immediately after
201 : * current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE.
202 : * getValue() can be called multiple times.
203 : *
204 : * Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE!
205 : * @return The value for the byte sequence so far.
206 : * @stable ICU 4.8
207 : */
208 0 : inline int32_t getValue() const {
209 0 : const uint8_t *pos=pos_;
210 0 : int32_t leadByte=*pos++;
211 : // U_ASSERT(leadByte>=kMinValueLead);
212 0 : return readValue(pos, leadByte>>1);
213 : }
214 :
215 : /**
216 : * Determines whether all byte sequences reachable from the current state
217 : * map to the same value.
218 : * @param uniqueValue Receives the unique value, if this function returns TRUE.
219 : * (output-only)
220 : * @return TRUE if all byte sequences reachable from the current state
221 : * map to the same value.
222 : * @stable ICU 4.8
223 : */
224 : inline UBool hasUniqueValue(int32_t &uniqueValue) const {
225 : const uint8_t *pos=pos_;
226 : // Skip the rest of a pending linear-match node.
227 : return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
228 : }
229 :
230 : /**
231 : * Finds each byte which continues the byte sequence from the current state.
232 : * That is, each byte b for which it would be next(b)!=USTRINGTRIE_NO_MATCH now.
233 : * @param out Each next byte is appended to this object.
234 : * (Only uses the out.Append(s, length) method.)
235 : * @return the number of bytes which continue the byte sequence from here
236 : * @stable ICU 4.8
237 : */
238 : int32_t getNextBytes(ByteSink &out) const;
239 :
240 : /**
241 : * Iterator for all of the (byte sequence, value) pairs in a BytesTrie.
242 : * @stable ICU 4.8
243 : */
244 : class U_COMMON_API Iterator : public UMemory {
245 : public:
246 : /**
247 : * Iterates from the root of a byte-serialized BytesTrie.
248 : * @param trieBytes The trie bytes.
249 : * @param maxStringLength If 0, the iterator returns full strings/byte sequences.
250 : * Otherwise, the iterator returns strings with this maximum length.
251 : * @param errorCode Standard ICU error code. Its input value must
252 : * pass the U_SUCCESS() test, or else the function returns
253 : * immediately. Check for U_FAILURE() on output or use with
254 : * function chaining. (See User Guide for details.)
255 : * @stable ICU 4.8
256 : */
257 : Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode);
258 :
259 : /**
260 : * Iterates from the current state of the specified BytesTrie.
261 : * @param trie The trie whose state will be copied for iteration.
262 : * @param maxStringLength If 0, the iterator returns full strings/byte sequences.
263 : * Otherwise, the iterator returns strings with this maximum length.
264 : * @param errorCode Standard ICU error code. Its input value must
265 : * pass the U_SUCCESS() test, or else the function returns
266 : * immediately. Check for U_FAILURE() on output or use with
267 : * function chaining. (See User Guide for details.)
268 : * @stable ICU 4.8
269 : */
270 : Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
271 :
272 : /**
273 : * Destructor.
274 : * @stable ICU 4.8
275 : */
276 : ~Iterator();
277 :
278 : /**
279 : * Resets this iterator to its initial state.
280 : * @return *this
281 : * @stable ICU 4.8
282 : */
283 : Iterator &reset();
284 :
285 : /**
286 : * @return TRUE if there are more elements.
287 : * @stable ICU 4.8
288 : */
289 : UBool hasNext() const;
290 :
291 : /**
292 : * Finds the next (byte sequence, value) pair if there is one.
293 : *
294 : * If the byte sequence is truncated to the maximum length and does not
295 : * have a real value, then the value is set to -1.
296 : * In this case, this "not a real value" is indistinguishable from
297 : * a real value of -1.
298 : * @param errorCode Standard ICU error code. Its input value must
299 : * pass the U_SUCCESS() test, or else the function returns
300 : * immediately. Check for U_FAILURE() on output or use with
301 : * function chaining. (See User Guide for details.)
302 : * @return TRUE if there is another element.
303 : * @stable ICU 4.8
304 : */
305 : UBool next(UErrorCode &errorCode);
306 :
307 : /**
308 : * @return The NUL-terminated byte sequence for the last successful next().
309 : * @stable ICU 4.8
310 : */
311 : StringPiece getString() const;
312 : /**
313 : * @return The value for the last successful next().
314 : * @stable ICU 4.8
315 : */
316 : int32_t getValue() const { return value_; }
317 :
318 : private:
319 : UBool truncateAndStop();
320 :
321 : const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode);
322 :
323 : const uint8_t *bytes_;
324 : const uint8_t *pos_;
325 : const uint8_t *initialPos_;
326 : int32_t remainingMatchLength_;
327 : int32_t initialRemainingMatchLength_;
328 :
329 : CharString *str_;
330 : int32_t maxLength_;
331 : int32_t value_;
332 :
333 : // The stack stores pairs of integers for backtracking to another
334 : // outbound edge of a branch node.
335 : // The first integer is an offset from bytes_.
336 : // The second integer has the str_->length() from before the node in bits 15..0,
337 : // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.)
338 : // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24,
339 : // but the code looks more confusing that way.)
340 : UVector32 *stack_;
341 : };
342 :
343 : private:
344 : friend class BytesTrieBuilder;
345 :
346 : /**
347 : * Constructs a BytesTrie reader instance.
348 : * Unlike the public constructor which just aliases an array,
349 : * this constructor adopts the builder's array.
350 : * This constructor is only called by the builder.
351 : */
352 0 : BytesTrie(void *adoptBytes, const void *trieBytes)
353 0 : : ownedArray_(static_cast<uint8_t *>(adoptBytes)),
354 : bytes_(static_cast<const uint8_t *>(trieBytes)),
355 0 : pos_(bytes_), remainingMatchLength_(-1) {}
356 :
357 : // No assignment operator.
358 : BytesTrie &operator=(const BytesTrie &other);
359 :
360 0 : inline void stop() {
361 0 : pos_=NULL;
362 0 : }
363 :
364 : // Reads a compact 32-bit integer.
365 : // pos is already after the leadByte, and the lead byte is already shifted right by 1.
366 : static int32_t readValue(const uint8_t *pos, int32_t leadByte);
367 0 : static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) {
368 : // U_ASSERT(leadByte>=kMinValueLead);
369 0 : if(leadByte>=(kMinTwoByteValueLead<<1)) {
370 0 : if(leadByte<(kMinThreeByteValueLead<<1)) {
371 0 : ++pos;
372 0 : } else if(leadByte<(kFourByteValueLead<<1)) {
373 0 : pos+=2;
374 : } else {
375 0 : pos+=3+((leadByte>>1)&1);
376 : }
377 : }
378 0 : return pos;
379 : }
380 0 : static inline const uint8_t *skipValue(const uint8_t *pos) {
381 0 : int32_t leadByte=*pos++;
382 0 : return skipValue(pos, leadByte);
383 : }
384 :
385 : // Reads a jump delta and jumps.
386 : static const uint8_t *jumpByDelta(const uint8_t *pos);
387 :
388 0 : static inline const uint8_t *skipDelta(const uint8_t *pos) {
389 0 : int32_t delta=*pos++;
390 0 : if(delta>=kMinTwoByteDeltaLead) {
391 0 : if(delta<kMinThreeByteDeltaLead) {
392 0 : ++pos;
393 0 : } else if(delta<kFourByteDeltaLead) {
394 0 : pos+=2;
395 : } else {
396 0 : pos+=3+(delta&1);
397 : }
398 : }
399 0 : return pos;
400 : }
401 :
402 0 : static inline UStringTrieResult valueResult(int32_t node) {
403 0 : return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node&kValueIsFinal));
404 : }
405 :
406 : // Handles a branch node for both next(byte) and next(string).
407 : UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte);
408 :
409 : // Requires remainingLength_<0.
410 : UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte);
411 :
412 : // Helper functions for hasUniqueValue().
413 : // Recursively finds a unique value (or whether there is not a unique one)
414 : // from a branch.
415 : static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
416 : UBool haveUniqueValue, int32_t &uniqueValue);
417 : // Recursively finds a unique value (or whether there is not a unique one)
418 : // starting from a position on a node lead byte.
419 : static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
420 :
421 : // Helper functions for getNextBytes().
422 : // getNextBytes() when pos is on a branch node.
423 : static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out);
424 : static void append(ByteSink &out, int c);
425 :
426 : // BytesTrie data structure
427 : //
428 : // The trie consists of a series of byte-serialized nodes for incremental
429 : // string/byte sequence matching. The root node is at the beginning of the trie data.
430 : //
431 : // Types of nodes are distinguished by their node lead byte ranges.
432 : // After each node, except a final-value node, another node follows to
433 : // encode match values or continue matching further bytes.
434 : //
435 : // Node types:
436 : // - Value node: Stores a 32-bit integer in a compact, variable-length format.
437 : // The value is for the string/byte sequence so far.
438 : // One node bit indicates whether the value is final or whether
439 : // matching continues with the next node.
440 : // - Linear-match node: Matches a number of bytes.
441 : // - Branch node: Branches to other nodes according to the current input byte.
442 : // The node byte is the length of the branch (number of bytes to select from)
443 : // minus 1. It is followed by a sub-node:
444 : // - If the length is at most kMaxBranchLinearSubNodeLength, then
445 : // there are length-1 (key, value) pairs and then one more comparison byte.
446 : // If one of the key bytes matches, then the value is either a final value for
447 : // the string/byte sequence so far, or a "jump" delta to the next node.
448 : // If the last byte matches, then matching continues with the next node.
449 : // (Values have the same encoding as value nodes.)
450 : // - If the length is greater than kMaxBranchLinearSubNodeLength, then
451 : // there is one byte and one "jump" delta.
452 : // If the input byte is less than the sub-node byte, then "jump" by delta to
453 : // the next sub-node which will have a length of length/2.
454 : // (The delta has its own compact encoding.)
455 : // Otherwise, skip the "jump" delta to the next sub-node
456 : // which will have a length of length-length/2.
457 :
458 : // Node lead byte values.
459 :
460 : // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise
461 : // the length is one more than the next byte.
462 :
463 : // For a branch sub-node with at most this many entries, we drop down
464 : // to a linear search.
465 : static const int32_t kMaxBranchLinearSubNodeLength=5;
466 :
467 : // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node.
468 : static const int32_t kMinLinearMatch=0x10;
469 : static const int32_t kMaxLinearMatchLength=0x10;
470 :
471 : // 20..ff: Variable-length value node.
472 : // If odd, the value is final. (Otherwise, intermediate value or jump delta.)
473 : // Then shift-right by 1 bit.
474 : // The remaining lead byte value indicates the number of following bytes (0..4)
475 : // and contains the value's top bits.
476 : static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20
477 : // It is a final value if bit 0 is set.
478 : static const int32_t kValueIsFinal=1;
479 :
480 : // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds.
481 : static const int32_t kMinOneByteValueLead=kMinValueLead/2; // 0x10
482 : static const int32_t kMaxOneByteValue=0x40; // At least 6 bits in the first byte.
483 :
484 : static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51
485 : static const int32_t kMaxTwoByteValue=0x1aff;
486 :
487 : static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c
488 : static const int32_t kFourByteValueLead=0x7e;
489 :
490 : // A little more than Unicode code points. (0x11ffff)
491 : static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;
492 :
493 : static const int32_t kFiveByteValueLead=0x7f;
494 :
495 : // Compact delta integers.
496 : static const int32_t kMaxOneByteDelta=0xbf;
497 : static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0
498 : static const int32_t kMinThreeByteDeltaLead=0xf0;
499 : static const int32_t kFourByteDeltaLead=0xfe;
500 : static const int32_t kFiveByteDeltaLead=0xff;
501 :
502 : static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff
503 : static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff
504 :
505 : uint8_t *ownedArray_;
506 :
507 : // Fixed value referencing the BytesTrie bytes.
508 : const uint8_t *bytes_;
509 :
510 : // Iterator variables.
511 :
512 : // Pointer to next trie byte to read. NULL if no more matches.
513 : const uint8_t *pos_;
514 : // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
515 : int32_t remainingMatchLength_;
516 : };
517 :
518 : U_NAMESPACE_END
519 :
520 : #endif // __BYTESTRIE_H__
|