LCOV - code coverage report
Current view: top level - intl/icu/source/common/unicode - bytestrie.h (source / functions) Hit Total Coverage
Test: output.info Lines: 0 35 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 8 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // © 2016 and later: Unicode, Inc. and others.
       2             : // License & terms of use: http://www.unicode.org/copyright.html
       3             : /*
       4             : *******************************************************************************
       5             : *   Copyright (C) 2010-2012, International Business Machines
       6             : *   Corporation and others.  All Rights Reserved.
       7             : *******************************************************************************
       8             : *   file name:  bytestrie.h
       9             : *   encoding:   UTF-8
      10             : *   tab size:   8 (not used)
      11             : *   indentation:4
      12             : *
      13             : *   created on: 2010sep25
      14             : *   created by: Markus W. Scherer
      15             : */
      16             : 
      17             : #ifndef __BYTESTRIE_H__
      18             : #define __BYTESTRIE_H__
      19             : 
      20             : /**
      21             :  * \file
      22             :  * \brief C++ API: Trie for mapping byte sequences to integer values.
      23             :  */
      24             : 
      25             : #include "unicode/utypes.h"
      26             : #include "unicode/stringpiece.h"
      27             : #include "unicode/uobject.h"
      28             : #include "unicode/ustringtrie.h"
      29             : 
      30             : U_NAMESPACE_BEGIN
      31             : 
      32             : class ByteSink;
      33             : class BytesTrieBuilder;
      34             : class CharString;
      35             : class UVector32;
      36             : 
      37             : /**
      38             :  * Light-weight, non-const reader class for a BytesTrie.
      39             :  * Traverses a byte-serialized data structure with minimal state,
      40             :  * for mapping byte sequences to non-negative integer values.
      41             :  *
      42             :  * This class owns the serialized trie data only if it was constructed by
      43             :  * the builder's build() method.
      44             :  * The public constructor and the copy constructor only alias the data (only copy the pointer).
      45             :  * There is no assignment operator.
      46             :  *
      47             :  * This class is not intended for public subclassing.
      48             :  * @stable ICU 4.8
      49             :  */
      50             : class U_COMMON_API BytesTrie : public UMemory {
      51             : public:
      52             :     /**
      53             :      * Constructs a BytesTrie reader instance.
      54             :      *
      55             :      * The trieBytes must contain a copy of a byte sequence from the BytesTrieBuilder,
      56             :      * starting with the first byte of that sequence.
      57             :      * The BytesTrie object will not read more bytes than
      58             :      * the BytesTrieBuilder generated in the corresponding build() call.
      59             :      *
      60             :      * The array is not copied/cloned and must not be modified while
      61             :      * the BytesTrie object is in use.
      62             :      *
      63             :      * @param trieBytes The byte array that contains the serialized trie.
      64             :      * @stable ICU 4.8
      65             :      */
      66           0 :     BytesTrie(const void *trieBytes)
      67           0 :             : ownedArray_(NULL), bytes_(static_cast<const uint8_t *>(trieBytes)),
      68           0 :               pos_(bytes_), remainingMatchLength_(-1) {}
      69             : 
      70             :     /**
      71             :      * Destructor.
      72             :      * @stable ICU 4.8
      73             :      */
      74             :     ~BytesTrie();
      75             : 
      76             :     /**
      77             :      * Copy constructor, copies the other trie reader object and its state,
      78             :      * but not the byte array which will be shared. (Shallow copy.)
      79             :      * @param other Another BytesTrie object.
      80             :      * @stable ICU 4.8
      81             :      */
      82             :     BytesTrie(const BytesTrie &other)
      83             :             : ownedArray_(NULL), bytes_(other.bytes_),
      84             :               pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
      85             : 
      86             :     /**
      87             :      * Resets this trie to its initial state.
      88             :      * @return *this
      89             :      * @stable ICU 4.8
      90             :      */
      91             :     BytesTrie &reset() {
      92             :         pos_=bytes_;
      93             :         remainingMatchLength_=-1;
      94             :         return *this;
      95             :     }
      96             : 
      97             :     /**
      98             :      * BytesTrie state object, for saving a trie's current state
      99             :      * and resetting the trie back to this state later.
     100             :      * @stable ICU 4.8
     101             :      */
     102             :     class State : public UMemory {
     103             :     public:
     104             :         /**
     105             :          * Constructs an empty State.
     106             :          * @stable ICU 4.8
     107             :          */
     108             :         State() { bytes=NULL; }
     109             :     private:
     110             :         friend class BytesTrie;
     111             : 
     112             :         const uint8_t *bytes;
     113             :         const uint8_t *pos;
     114             :         int32_t remainingMatchLength;
     115             :     };
     116             : 
     117             :     /**
     118             :      * Saves the state of this trie.
     119             :      * @param state The State object to hold the trie's state.
     120             :      * @return *this
     121             :      * @see resetToState
     122             :      * @stable ICU 4.8
     123             :      */
     124             :     const BytesTrie &saveState(State &state) const {
     125             :         state.bytes=bytes_;
     126             :         state.pos=pos_;
     127             :         state.remainingMatchLength=remainingMatchLength_;
     128             :         return *this;
     129             :     }
     130             : 
     131             :     /**
     132             :      * Resets this trie to the saved state.
     133             :      * If the state object contains no state, or the state of a different trie,
     134             :      * then this trie remains unchanged.
     135             :      * @param state The State object which holds a saved trie state.
     136             :      * @return *this
     137             :      * @see saveState
     138             :      * @see reset
     139             :      * @stable ICU 4.8
     140             :      */
     141             :     BytesTrie &resetToState(const State &state) {
     142             :         if(bytes_==state.bytes && bytes_!=NULL) {
     143             :             pos_=state.pos;
     144             :             remainingMatchLength_=state.remainingMatchLength;
     145             :         }
     146             :         return *this;
     147             :     }
     148             : 
     149             :     /**
     150             :      * Determines whether the byte sequence so far matches, whether it has a value,
     151             :      * and whether another input byte can continue a matching byte sequence.
     152             :      * @return The match/value Result.
     153             :      * @stable ICU 4.8
     154             :      */
     155             :     UStringTrieResult current() const;
     156             : 
     157             :     /**
     158             :      * Traverses the trie from the initial state for this input byte.
     159             :      * Equivalent to reset().next(inByte).
     160             :      * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
     161             :      *               Values below -0x100 and above 0xff will never match.
     162             :      * @return The match/value Result.
     163             :      * @stable ICU 4.8
     164             :      */
     165             :     inline UStringTrieResult first(int32_t inByte) {
     166             :         remainingMatchLength_=-1;
     167             :         if(inByte<0) {
     168             :             inByte+=0x100;
     169             :         }
     170             :         return nextImpl(bytes_, inByte);
     171             :     }
     172             : 
     173             :     /**
     174             :      * Traverses the trie from the current state for this input byte.
     175             :      * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
     176             :      *               Values below -0x100 and above 0xff will never match.
     177             :      * @return The match/value Result.
     178             :      * @stable ICU 4.8
     179             :      */
     180             :     UStringTrieResult next(int32_t inByte);
     181             : 
     182             :     /**
     183             :      * Traverses the trie from the current state for this byte sequence.
     184             :      * Equivalent to
     185             :      * \code
     186             :      * Result result=current();
     187             :      * for(each c in s)
     188             :      *   if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH;
     189             :      *   result=next(c);
     190             :      * return result;
     191             :      * \endcode
     192             :      * @param s A string or byte sequence. Can be NULL if length is 0.
     193             :      * @param length The length of the byte sequence. Can be -1 if NUL-terminated.
     194             :      * @return The match/value Result.
     195             :      * @stable ICU 4.8
     196             :      */
     197             :     UStringTrieResult next(const char *s, int32_t length);
     198             : 
     199             :     /**
     200             :      * Returns a matching byte sequence's value if called immediately after
     201             :      * current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE.
     202             :      * getValue() can be called multiple times.
     203             :      *
     204             :      * Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE!
     205             :      * @return The value for the byte sequence so far.
     206             :      * @stable ICU 4.8
     207             :      */
     208           0 :     inline int32_t getValue() const {
     209           0 :         const uint8_t *pos=pos_;
     210           0 :         int32_t leadByte=*pos++;
     211             :         // U_ASSERT(leadByte>=kMinValueLead);
     212           0 :         return readValue(pos, leadByte>>1);
     213             :     }
     214             : 
     215             :     /**
     216             :      * Determines whether all byte sequences reachable from the current state
     217             :      * map to the same value.
     218             :      * @param uniqueValue Receives the unique value, if this function returns TRUE.
     219             :      *                    (output-only)
     220             :      * @return TRUE if all byte sequences reachable from the current state
     221             :      *         map to the same value.
     222             :      * @stable ICU 4.8
     223             :      */
     224             :     inline UBool hasUniqueValue(int32_t &uniqueValue) const {
     225             :         const uint8_t *pos=pos_;
     226             :         // Skip the rest of a pending linear-match node.
     227             :         return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
     228             :     }
     229             : 
     230             :     /**
     231             :      * Finds each byte which continues the byte sequence from the current state.
     232             :      * That is, each byte b for which it would be next(b)!=USTRINGTRIE_NO_MATCH now.
     233             :      * @param out Each next byte is appended to this object.
     234             :      *            (Only uses the out.Append(s, length) method.)
     235             :      * @return the number of bytes which continue the byte sequence from here
     236             :      * @stable ICU 4.8
     237             :      */
     238             :     int32_t getNextBytes(ByteSink &out) const;
     239             : 
     240             :     /**
     241             :      * Iterator for all of the (byte sequence, value) pairs in a BytesTrie.
     242             :      * @stable ICU 4.8
     243             :      */
     244             :     class U_COMMON_API Iterator : public UMemory {
     245             :     public:
     246             :         /**
     247             :          * Iterates from the root of a byte-serialized BytesTrie.
     248             :          * @param trieBytes The trie bytes.
     249             :          * @param maxStringLength If 0, the iterator returns full strings/byte sequences.
     250             :          *                        Otherwise, the iterator returns strings with this maximum length.
     251             :          * @param errorCode Standard ICU error code. Its input value must
     252             :          *                  pass the U_SUCCESS() test, or else the function returns
     253             :          *                  immediately. Check for U_FAILURE() on output or use with
     254             :          *                  function chaining. (See User Guide for details.)
     255             :          * @stable ICU 4.8
     256             :          */
     257             :         Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode);
     258             : 
     259             :         /**
     260             :          * Iterates from the current state of the specified BytesTrie.
     261             :          * @param trie The trie whose state will be copied for iteration.
     262             :          * @param maxStringLength If 0, the iterator returns full strings/byte sequences.
     263             :          *                        Otherwise, the iterator returns strings with this maximum length.
     264             :          * @param errorCode Standard ICU error code. Its input value must
     265             :          *                  pass the U_SUCCESS() test, or else the function returns
     266             :          *                  immediately. Check for U_FAILURE() on output or use with
     267             :          *                  function chaining. (See User Guide for details.)
     268             :          * @stable ICU 4.8
     269             :          */
     270             :         Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
     271             : 
     272             :         /**
     273             :          * Destructor.
     274             :          * @stable ICU 4.8
     275             :          */
     276             :         ~Iterator();
     277             : 
     278             :         /**
     279             :          * Resets this iterator to its initial state.
     280             :          * @return *this
     281             :          * @stable ICU 4.8
     282             :          */
     283             :         Iterator &reset();
     284             : 
     285             :         /**
     286             :          * @return TRUE if there are more elements.
     287             :          * @stable ICU 4.8
     288             :          */
     289             :         UBool hasNext() const;
     290             : 
     291             :         /**
     292             :          * Finds the next (byte sequence, value) pair if there is one.
     293             :          *
     294             :          * If the byte sequence is truncated to the maximum length and does not
     295             :          * have a real value, then the value is set to -1.
     296             :          * In this case, this "not a real value" is indistinguishable from
     297             :          * a real value of -1.
     298             :          * @param errorCode Standard ICU error code. Its input value must
     299             :          *                  pass the U_SUCCESS() test, or else the function returns
     300             :          *                  immediately. Check for U_FAILURE() on output or use with
     301             :          *                  function chaining. (See User Guide for details.)
     302             :          * @return TRUE if there is another element.
     303             :          * @stable ICU 4.8
     304             :          */
     305             :         UBool next(UErrorCode &errorCode);
     306             : 
     307             :         /**
     308             :          * @return The NUL-terminated byte sequence for the last successful next().
     309             :          * @stable ICU 4.8
     310             :          */
     311             :         StringPiece getString() const;
     312             :         /**
     313             :          * @return The value for the last successful next().
     314             :          * @stable ICU 4.8
     315             :          */
     316             :         int32_t getValue() const { return value_; }
     317             : 
     318             :     private:
     319             :         UBool truncateAndStop();
     320             : 
     321             :         const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode);
     322             : 
     323             :         const uint8_t *bytes_;
     324             :         const uint8_t *pos_;
     325             :         const uint8_t *initialPos_;
     326             :         int32_t remainingMatchLength_;
     327             :         int32_t initialRemainingMatchLength_;
     328             : 
     329             :         CharString *str_;
     330             :         int32_t maxLength_;
     331             :         int32_t value_;
     332             : 
     333             :         // The stack stores pairs of integers for backtracking to another
     334             :         // outbound edge of a branch node.
     335             :         // The first integer is an offset from bytes_.
     336             :         // The second integer has the str_->length() from before the node in bits 15..0,
     337             :         // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.)
     338             :         // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24,
     339             :         // but the code looks more confusing that way.)
     340             :         UVector32 *stack_;
     341             :     };
     342             : 
     343             : private:
     344             :     friend class BytesTrieBuilder;
     345             : 
     346             :     /**
     347             :      * Constructs a BytesTrie reader instance.
     348             :      * Unlike the public constructor which just aliases an array,
     349             :      * this constructor adopts the builder's array.
     350             :      * This constructor is only called by the builder.
     351             :      */
     352           0 :     BytesTrie(void *adoptBytes, const void *trieBytes)
     353           0 :             : ownedArray_(static_cast<uint8_t *>(adoptBytes)),
     354             :               bytes_(static_cast<const uint8_t *>(trieBytes)),
     355           0 :               pos_(bytes_), remainingMatchLength_(-1) {}
     356             : 
     357             :     // No assignment operator.
     358             :     BytesTrie &operator=(const BytesTrie &other);
     359             : 
     360           0 :     inline void stop() {
     361           0 :         pos_=NULL;
     362           0 :     }
     363             : 
     364             :     // Reads a compact 32-bit integer.
     365             :     // pos is already after the leadByte, and the lead byte is already shifted right by 1.
     366             :     static int32_t readValue(const uint8_t *pos, int32_t leadByte);
     367           0 :     static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) {
     368             :         // U_ASSERT(leadByte>=kMinValueLead);
     369           0 :         if(leadByte>=(kMinTwoByteValueLead<<1)) {
     370           0 :             if(leadByte<(kMinThreeByteValueLead<<1)) {
     371           0 :                 ++pos;
     372           0 :             } else if(leadByte<(kFourByteValueLead<<1)) {
     373           0 :                 pos+=2;
     374             :             } else {
     375           0 :                 pos+=3+((leadByte>>1)&1);
     376             :             }
     377             :         }
     378           0 :         return pos;
     379             :     }
     380           0 :     static inline const uint8_t *skipValue(const uint8_t *pos) {
     381           0 :         int32_t leadByte=*pos++;
     382           0 :         return skipValue(pos, leadByte);
     383             :     }
     384             : 
     385             :     // Reads a jump delta and jumps.
     386             :     static const uint8_t *jumpByDelta(const uint8_t *pos);
     387             : 
     388           0 :     static inline const uint8_t *skipDelta(const uint8_t *pos) {
     389           0 :         int32_t delta=*pos++;
     390           0 :         if(delta>=kMinTwoByteDeltaLead) {
     391           0 :             if(delta<kMinThreeByteDeltaLead) {
     392           0 :                 ++pos;
     393           0 :             } else if(delta<kFourByteDeltaLead) {
     394           0 :                 pos+=2;
     395             :             } else {
     396           0 :                 pos+=3+(delta&1);
     397             :             }
     398             :         }
     399           0 :         return pos;
     400             :     }
     401             : 
     402           0 :     static inline UStringTrieResult valueResult(int32_t node) {
     403           0 :         return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node&kValueIsFinal));
     404             :     }
     405             : 
     406             :     // Handles a branch node for both next(byte) and next(string).
     407             :     UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte);
     408             : 
     409             :     // Requires remainingLength_<0.
     410             :     UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte);
     411             : 
     412             :     // Helper functions for hasUniqueValue().
     413             :     // Recursively finds a unique value (or whether there is not a unique one)
     414             :     // from a branch.
     415             :     static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
     416             :                                                     UBool haveUniqueValue, int32_t &uniqueValue);
     417             :     // Recursively finds a unique value (or whether there is not a unique one)
     418             :     // starting from a position on a node lead byte.
     419             :     static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
     420             : 
     421             :     // Helper functions for getNextBytes().
     422             :     // getNextBytes() when pos is on a branch node.
     423             :     static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out);
     424             :     static void append(ByteSink &out, int c);
     425             : 
     426             :     // BytesTrie data structure
     427             :     //
     428             :     // The trie consists of a series of byte-serialized nodes for incremental
     429             :     // string/byte sequence matching. The root node is at the beginning of the trie data.
     430             :     //
     431             :     // Types of nodes are distinguished by their node lead byte ranges.
     432             :     // After each node, except a final-value node, another node follows to
     433             :     // encode match values or continue matching further bytes.
     434             :     //
     435             :     // Node types:
     436             :     //  - Value node: Stores a 32-bit integer in a compact, variable-length format.
     437             :     //    The value is for the string/byte sequence so far.
     438             :     //    One node bit indicates whether the value is final or whether
     439             :     //    matching continues with the next node.
     440             :     //  - Linear-match node: Matches a number of bytes.
     441             :     //  - Branch node: Branches to other nodes according to the current input byte.
     442             :     //    The node byte is the length of the branch (number of bytes to select from)
     443             :     //    minus 1. It is followed by a sub-node:
     444             :     //    - If the length is at most kMaxBranchLinearSubNodeLength, then
     445             :     //      there are length-1 (key, value) pairs and then one more comparison byte.
     446             :     //      If one of the key bytes matches, then the value is either a final value for
     447             :     //      the string/byte sequence so far, or a "jump" delta to the next node.
     448             :     //      If the last byte matches, then matching continues with the next node.
     449             :     //      (Values have the same encoding as value nodes.)
     450             :     //    - If the length is greater than kMaxBranchLinearSubNodeLength, then
     451             :     //      there is one byte and one "jump" delta.
     452             :     //      If the input byte is less than the sub-node byte, then "jump" by delta to
     453             :     //      the next sub-node which will have a length of length/2.
     454             :     //      (The delta has its own compact encoding.)
     455             :     //      Otherwise, skip the "jump" delta to the next sub-node
     456             :     //      which will have a length of length-length/2.
     457             : 
     458             :     // Node lead byte values.
     459             : 
     460             :     // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise
     461             :     // the length is one more than the next byte.
     462             : 
     463             :     // For a branch sub-node with at most this many entries, we drop down
     464             :     // to a linear search.
     465             :     static const int32_t kMaxBranchLinearSubNodeLength=5;
     466             : 
     467             :     // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node.
     468             :     static const int32_t kMinLinearMatch=0x10;
     469             :     static const int32_t kMaxLinearMatchLength=0x10;
     470             : 
     471             :     // 20..ff: Variable-length value node.
     472             :     // If odd, the value is final. (Otherwise, intermediate value or jump delta.)
     473             :     // Then shift-right by 1 bit.
     474             :     // The remaining lead byte value indicates the number of following bytes (0..4)
     475             :     // and contains the value's top bits.
     476             :     static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength;  // 0x20
     477             :     // It is a final value if bit 0 is set.
     478             :     static const int32_t kValueIsFinal=1;
     479             : 
     480             :     // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds.
     481             :     static const int32_t kMinOneByteValueLead=kMinValueLead/2;  // 0x10
     482             :     static const int32_t kMaxOneByteValue=0x40;  // At least 6 bits in the first byte.
     483             : 
     484             :     static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1;  // 0x51
     485             :     static const int32_t kMaxTwoByteValue=0x1aff;
     486             : 
     487             :     static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1;  // 0x6c
     488             :     static const int32_t kFourByteValueLead=0x7e;
     489             : 
     490             :     // A little more than Unicode code points. (0x11ffff)
     491             :     static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;
     492             : 
     493             :     static const int32_t kFiveByteValueLead=0x7f;
     494             : 
     495             :     // Compact delta integers.
     496             :     static const int32_t kMaxOneByteDelta=0xbf;
     497             :     static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1;  // 0xc0
     498             :     static const int32_t kMinThreeByteDeltaLead=0xf0;
     499             :     static const int32_t kFourByteDeltaLead=0xfe;
     500             :     static const int32_t kFiveByteDeltaLead=0xff;
     501             : 
     502             :     static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1;  // 0x2fff
     503             :     static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1;  // 0xdffff
     504             : 
     505             :     uint8_t *ownedArray_;
     506             : 
     507             :     // Fixed value referencing the BytesTrie bytes.
     508             :     const uint8_t *bytes_;
     509             : 
     510             :     // Iterator variables.
     511             : 
     512             :     // Pointer to next trie byte to read. NULL if no more matches.
     513             :     const uint8_t *pos_;
     514             :     // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
     515             :     int32_t remainingMatchLength_;
     516             : };
     517             : 
     518             : U_NAMESPACE_END
     519             : 
     520             : #endif  // __BYTESTRIE_H__

Generated by: LCOV version 1.13