LCOV - code coverage report
Current view: top level - intl/icu/source/common - bytestriebuilder.cpp (source / functions) Hit Total Coverage
Test: output.info Lines: 0 278 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 34 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // © 2016 and later: Unicode, Inc. and others.
       2             : // License & terms of use: http://www.unicode.org/copyright.html
       3             : /*
       4             : *******************************************************************************
       5             : *   Copyright (C) 2010-2012, International Business Machines
       6             : *   Corporation and others.  All Rights Reserved.
       7             : *******************************************************************************
       8             : *   file name:  bytestriebuilder.cpp
       9             : *   encoding:   UTF-8
      10             : *   tab size:   8 (not used)
      11             : *   indentation:4
      12             : *
      13             : *   created on: 2010sep25
      14             : *   created by: Markus W. Scherer
      15             : */
      16             : 
      17             : #include "unicode/utypes.h"
      18             : #include "unicode/bytestrie.h"
      19             : #include "unicode/bytestriebuilder.h"
      20             : #include "unicode/stringpiece.h"
      21             : #include "charstr.h"
      22             : #include "cmemory.h"
      23             : #include "uhash.h"
      24             : #include "uarrsort.h"
      25             : #include "uassert.h"
      26             : #include "ustr_imp.h"
      27             : 
      28             : U_NAMESPACE_BEGIN
      29             : 
      30             : /*
      31             :  * Note: This builder implementation stores (bytes, value) pairs with full copies
      32             :  * of the byte sequences, until the BytesTrie is built.
      33             :  * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
      34             :  */
      35             : 
      36             : class BytesTrieElement : public UMemory {
      37             : public:
      38             :     // Use compiler's default constructor, initializes nothing.
      39             : 
      40             :     void setTo(StringPiece s, int32_t val, CharString &strings, UErrorCode &errorCode);
      41             : 
      42           0 :     StringPiece getString(const CharString &strings) const {
      43           0 :         int32_t offset=stringOffset;
      44             :         int32_t length;
      45           0 :         if(offset>=0) {
      46           0 :             length=(uint8_t)strings[offset++];
      47             :         } else {
      48           0 :             offset=~offset;
      49           0 :             length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
      50           0 :             offset+=2;
      51             :         }
      52           0 :         return StringPiece(strings.data()+offset, length);
      53             :     }
      54           0 :     int32_t getStringLength(const CharString &strings) const {
      55           0 :         int32_t offset=stringOffset;
      56           0 :         if(offset>=0) {
      57           0 :             return (uint8_t)strings[offset];
      58             :         } else {
      59           0 :             offset=~offset;
      60           0 :             return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
      61             :         }
      62             :     }
      63             : 
      64           0 :     char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
      65             : 
      66           0 :     int32_t getValue() const { return value; }
      67             : 
      68             :     int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;
      69             : 
      70             : private:
      71           0 :     const char *data(const CharString &strings) const {
      72           0 :         int32_t offset=stringOffset;
      73           0 :         if(offset>=0) {
      74           0 :             ++offset;
      75             :         } else {
      76           0 :             offset=~offset+2;
      77             :         }
      78           0 :         return strings.data()+offset;
      79             :     }
      80             : 
      81             :     // If the stringOffset is non-negative, then the first strings byte contains
      82             :     // the string length.
      83             :     // If the stringOffset is negative, then the first two strings bytes contain
      84             :     // the string length (big-endian), and the offset needs to be bit-inverted.
      85             :     // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
      86             :     int32_t stringOffset;
      87             :     int32_t value;
      88             : };
      89             : 
      90             : void
      91           0 : BytesTrieElement::setTo(StringPiece s, int32_t val,
      92             :                         CharString &strings, UErrorCode &errorCode) {
      93           0 :     if(U_FAILURE(errorCode)) {
      94           0 :         return;
      95             :     }
      96           0 :     int32_t length=s.length();
      97           0 :     if(length>0xffff) {
      98             :         // Too long: We store the length in 1 or 2 bytes.
      99           0 :         errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
     100           0 :         return;
     101             :     }
     102           0 :     int32_t offset=strings.length();
     103           0 :     if(length>0xff) {
     104           0 :         offset=~offset;
     105           0 :         strings.append((char)(length>>8), errorCode);
     106             :     }
     107           0 :     strings.append((char)length, errorCode);
     108           0 :     stringOffset=offset;
     109           0 :     value=val;
     110           0 :     strings.append(s, errorCode);
     111             : }
     112             : 
     113             : int32_t
     114           0 : BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {
     115             :     // TODO: add StringPiece::compare(), see ticket #8187
     116           0 :     StringPiece thisString=getString(strings);
     117           0 :     StringPiece otherString=other.getString(strings);
     118           0 :     int32_t lengthDiff=thisString.length()-otherString.length();
     119             :     int32_t commonLength;
     120           0 :     if(lengthDiff<=0) {
     121           0 :         commonLength=thisString.length();
     122             :     } else {
     123           0 :         commonLength=otherString.length();
     124             :     }
     125           0 :     int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength);
     126           0 :     return diff!=0 ? diff : lengthDiff;
     127             : }
     128             : 
     129           0 : BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)
     130             :         : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0),
     131           0 :           bytes(NULL), bytesCapacity(0), bytesLength(0) {
     132           0 :     if(U_FAILURE(errorCode)) {
     133           0 :         return;
     134             :     }
     135           0 :     strings=new CharString();
     136           0 :     if(strings==NULL) {
     137           0 :         errorCode=U_MEMORY_ALLOCATION_ERROR;
     138             :     }
     139             : }
     140             : 
     141           0 : BytesTrieBuilder::~BytesTrieBuilder() {
     142           0 :     delete strings;
     143           0 :     delete[] elements;
     144           0 :     uprv_free(bytes);
     145           0 : }
     146             : 
     147             : BytesTrieBuilder &
     148           0 : BytesTrieBuilder::add(StringPiece s, int32_t value, UErrorCode &errorCode) {
     149           0 :     if(U_FAILURE(errorCode)) {
     150           0 :         return *this;
     151             :     }
     152           0 :     if(bytesLength>0) {
     153             :         // Cannot add elements after building.
     154           0 :         errorCode=U_NO_WRITE_PERMISSION;
     155           0 :         return *this;
     156             :     }
     157           0 :     if(elementsLength==elementsCapacity) {
     158             :         int32_t newCapacity;
     159           0 :         if(elementsCapacity==0) {
     160           0 :             newCapacity=1024;
     161             :         } else {
     162           0 :             newCapacity=4*elementsCapacity;
     163             :         }
     164           0 :         BytesTrieElement *newElements=new BytesTrieElement[newCapacity];
     165           0 :         if(newElements==NULL) {
     166           0 :             errorCode=U_MEMORY_ALLOCATION_ERROR;
     167           0 :             return *this; // error instead of dereferencing null
     168             :         }
     169           0 :         if(elementsLength>0) {
     170           0 :             uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(BytesTrieElement));
     171             :         }
     172           0 :         delete[] elements;
     173           0 :         elements=newElements;
     174           0 :         elementsCapacity=newCapacity;
     175             :     }
     176           0 :     elements[elementsLength++].setTo(s, value, *strings, errorCode);
     177           0 :     return *this;
     178             : }
     179             : 
     180             : U_CDECL_BEGIN
     181             : 
     182             : static int32_t U_CALLCONV
     183           0 : compareElementStrings(const void *context, const void *left, const void *right) {
     184           0 :     const CharString *strings=static_cast<const CharString *>(context);
     185           0 :     const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left);
     186           0 :     const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right);
     187           0 :     return leftElement->compareStringTo(*rightElement, *strings);
     188             : }
     189             : 
     190             : U_CDECL_END
     191             : 
     192             : BytesTrie *
     193           0 : BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
     194           0 :     buildBytes(buildOption, errorCode);
     195           0 :     BytesTrie *newTrie=NULL;
     196           0 :     if(U_SUCCESS(errorCode)) {
     197           0 :         newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));
     198           0 :         if(newTrie==NULL) {
     199           0 :             errorCode=U_MEMORY_ALLOCATION_ERROR;
     200             :         } else {
     201           0 :             bytes=NULL;  // The new trie now owns the array.
     202           0 :             bytesCapacity=0;
     203             :         }
     204             :     }
     205           0 :     return newTrie;
     206             : }
     207             : 
     208             : StringPiece
     209           0 : BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
     210           0 :     buildBytes(buildOption, errorCode);
     211           0 :     StringPiece result;
     212           0 :     if(U_SUCCESS(errorCode)) {
     213           0 :         result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
     214             :     }
     215           0 :     return result;
     216             : }
     217             : 
     218             : void
     219           0 : BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
     220           0 :     if(U_FAILURE(errorCode)) {
     221           0 :         return;
     222             :     }
     223           0 :     if(bytes!=NULL && bytesLength>0) {
     224             :         // Already built.
     225           0 :         return;
     226             :     }
     227           0 :     if(bytesLength==0) {
     228           0 :         if(elementsLength==0) {
     229           0 :             errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
     230           0 :             return;
     231             :         }
     232           0 :         uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement),
     233           0 :                       compareElementStrings, strings,
     234             :                       FALSE,  // need not be a stable sort
     235           0 :                       &errorCode);
     236           0 :         if(U_FAILURE(errorCode)) {
     237           0 :             return;
     238             :         }
     239             :         // Duplicate strings are not allowed.
     240           0 :         StringPiece prev=elements[0].getString(*strings);
     241           0 :         for(int32_t i=1; i<elementsLength; ++i) {
     242           0 :             StringPiece current=elements[i].getString(*strings);
     243           0 :             if(prev==current) {
     244           0 :                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
     245           0 :                 return;
     246             :             }
     247           0 :             prev=current;
     248             :         }
     249             :     }
     250             :     // Create and byte-serialize the trie for the elements.
     251           0 :     bytesLength=0;
     252           0 :     int32_t capacity=strings->length();
     253           0 :     if(capacity<1024) {
     254           0 :         capacity=1024;
     255             :     }
     256           0 :     if(bytesCapacity<capacity) {
     257           0 :         uprv_free(bytes);
     258           0 :         bytes=static_cast<char *>(uprv_malloc(capacity));
     259           0 :         if(bytes==NULL) {
     260           0 :             errorCode=U_MEMORY_ALLOCATION_ERROR;
     261           0 :             bytesCapacity=0;
     262           0 :             return;
     263             :         }
     264           0 :         bytesCapacity=capacity;
     265             :     }
     266           0 :     StringTrieBuilder::build(buildOption, elementsLength, errorCode);
     267           0 :     if(bytes==NULL) {
     268           0 :         errorCode=U_MEMORY_ALLOCATION_ERROR;
     269             :     }
     270             : }
     271             : 
     272             : BytesTrieBuilder &
     273           0 : BytesTrieBuilder::clear() {
     274           0 :     strings->clear();
     275           0 :     elementsLength=0;
     276           0 :     bytesLength=0;
     277           0 :     return *this;
     278             : }
     279             : 
     280             : int32_t
     281           0 : BytesTrieBuilder::getElementStringLength(int32_t i) const {
     282           0 :     return elements[i].getStringLength(*strings);
     283             : }
     284             : 
     285             : UChar
     286           0 : BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
     287           0 :     return (uint8_t)elements[i].charAt(byteIndex, *strings);
     288             : }
     289             : 
     290             : int32_t
     291           0 : BytesTrieBuilder::getElementValue(int32_t i) const {
     292           0 :     return elements[i].getValue();
     293             : }
     294             : 
     295             : int32_t
     296           0 : BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
     297           0 :     const BytesTrieElement &firstElement=elements[first];
     298           0 :     const BytesTrieElement &lastElement=elements[last];
     299           0 :     int32_t minStringLength=firstElement.getStringLength(*strings);
     300           0 :     while(++byteIndex<minStringLength &&
     301           0 :             firstElement.charAt(byteIndex, *strings)==
     302           0 :             lastElement.charAt(byteIndex, *strings)) {}
     303           0 :     return byteIndex;
     304             : }
     305             : 
     306             : int32_t
     307           0 : BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
     308           0 :     int32_t length=0;  // Number of different bytes at byteIndex.
     309           0 :     int32_t i=start;
     310           0 :     do {
     311           0 :         char byte=elements[i++].charAt(byteIndex, *strings);
     312           0 :         while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) {
     313           0 :             ++i;
     314             :         }
     315           0 :         ++length;
     316           0 :     } while(i<limit);
     317           0 :     return length;
     318             : }
     319             : 
     320             : int32_t
     321           0 : BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
     322           0 :     do {
     323           0 :         char byte=elements[i++].charAt(byteIndex, *strings);
     324           0 :         while(byte==elements[i].charAt(byteIndex, *strings)) {
     325           0 :             ++i;
     326             :         }
     327             :     } while(--count>0);
     328           0 :     return i;
     329             : }
     330             : 
     331             : int32_t
     332           0 : BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const {
     333           0 :     char b=(char)byte;
     334           0 :     while(b==elements[i].charAt(byteIndex, *strings)) {
     335           0 :         ++i;
     336             :     }
     337           0 :     return i;
     338             : }
     339             : 
     340           0 : BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
     341           0 :         : LinearMatchNode(len, nextNode), s(bytes) {
     342           0 :     hash=hash*37+ustr_hashCharsN(bytes, len);
     343           0 : }
     344             : 
     345             : UBool
     346           0 : BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
     347           0 :     if(this==&other) {
     348           0 :         return TRUE;
     349             :     }
     350           0 :     if(!LinearMatchNode::operator==(other)) {
     351           0 :         return FALSE;
     352             :     }
     353           0 :     const BTLinearMatchNode &o=(const BTLinearMatchNode &)other;
     354           0 :     return 0==uprv_memcmp(s, o.s, length);
     355             : }
     356             : 
     357             : void
     358           0 : BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {
     359           0 :     BytesTrieBuilder &b=(BytesTrieBuilder &)builder;
     360           0 :     next->write(builder);
     361           0 :     b.write(s, length);
     362           0 :     offset=b.write(b.getMinLinearMatch()+length-1);
     363           0 : }
     364             : 
     365             : StringTrieBuilder::Node *
     366           0 : BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
     367             :                                         Node *nextNode) const {
     368             :     return new BTLinearMatchNode(
     369           0 :             elements[i].getString(*strings).data()+byteIndex,
     370             :             length,
     371           0 :             nextNode);
     372             : }
     373             : 
     374             : UBool
     375           0 : BytesTrieBuilder::ensureCapacity(int32_t length) {
     376           0 :     if(bytes==NULL) {
     377           0 :         return FALSE;  // previous memory allocation had failed
     378             :     }
     379           0 :     if(length>bytesCapacity) {
     380           0 :         int32_t newCapacity=bytesCapacity;
     381           0 :         do {
     382           0 :             newCapacity*=2;
     383           0 :         } while(newCapacity<=length);
     384           0 :         char *newBytes=static_cast<char *>(uprv_malloc(newCapacity));
     385           0 :         if(newBytes==NULL) {
     386             :             // unable to allocate memory
     387           0 :             uprv_free(bytes);
     388           0 :             bytes=NULL;
     389           0 :             bytesCapacity=0;
     390           0 :             return FALSE;
     391             :         }
     392           0 :         uprv_memcpy(newBytes+(newCapacity-bytesLength),
     393           0 :                     bytes+(bytesCapacity-bytesLength), bytesLength);
     394           0 :         uprv_free(bytes);
     395           0 :         bytes=newBytes;
     396           0 :         bytesCapacity=newCapacity;
     397             :     }
     398           0 :     return TRUE;
     399             : }
     400             : 
     401             : int32_t
     402           0 : BytesTrieBuilder::write(int32_t byte) {
     403           0 :     int32_t newLength=bytesLength+1;
     404           0 :     if(ensureCapacity(newLength)) {
     405           0 :         bytesLength=newLength;
     406           0 :         bytes[bytesCapacity-bytesLength]=(char)byte;
     407             :     }
     408           0 :     return bytesLength;
     409             : }
     410             : 
     411             : int32_t
     412           0 : BytesTrieBuilder::write(const char *b, int32_t length) {
     413           0 :     int32_t newLength=bytesLength+length;
     414           0 :     if(ensureCapacity(newLength)) {
     415           0 :         bytesLength=newLength;
     416           0 :         uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
     417             :     }
     418           0 :     return bytesLength;
     419             : }
     420             : 
     421             : int32_t
     422           0 : BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
     423           0 :     return write(elements[i].getString(*strings).data()+byteIndex, length);
     424             : }
     425             : 
     426             : int32_t
     427           0 : BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
     428           0 :     if(0<=i && i<=BytesTrie::kMaxOneByteValue) {
     429           0 :         return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);
     430             :     }
     431             :     char intBytes[5];
     432           0 :     int32_t length=1;
     433           0 :     if(i<0 || i>0xffffff) {
     434           0 :         intBytes[0]=(char)BytesTrie::kFiveByteValueLead;
     435           0 :         intBytes[1]=(char)((uint32_t)i>>24);
     436           0 :         intBytes[2]=(char)((uint32_t)i>>16);
     437           0 :         intBytes[3]=(char)((uint32_t)i>>8);
     438           0 :         intBytes[4]=(char)i;
     439           0 :         length=5;
     440             :     // } else if(i<=BytesTrie::kMaxOneByteValue) {
     441             :     //     intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);
     442             :     } else {
     443           0 :         if(i<=BytesTrie::kMaxTwoByteValue) {
     444           0 :             intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8));
     445             :         } else {
     446           0 :             if(i<=BytesTrie::kMaxThreeByteValue) {
     447           0 :                 intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16));
     448             :             } else {
     449           0 :                 intBytes[0]=(char)BytesTrie::kFourByteValueLead;
     450           0 :                 intBytes[1]=(char)(i>>16);
     451           0 :                 length=2;
     452             :             }
     453           0 :             intBytes[length++]=(char)(i>>8);
     454             :         }
     455           0 :         intBytes[length++]=(char)i;
     456             :     }
     457           0 :     intBytes[0]=(char)((intBytes[0]<<1)|isFinal);
     458           0 :     return write(intBytes, length);
     459             : }
     460             : 
     461             : int32_t
     462           0 : BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
     463           0 :     int32_t offset=write(node);
     464           0 :     if(hasValue) {
     465           0 :         offset=writeValueAndFinal(value, FALSE);
     466             :     }
     467           0 :     return offset;
     468             : }
     469             : 
     470             : int32_t
     471           0 : BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
     472           0 :     int32_t i=bytesLength-jumpTarget;
     473           0 :     U_ASSERT(i>=0);
     474           0 :     if(i<=BytesTrie::kMaxOneByteDelta) {
     475           0 :         return write(i);
     476             :     }
     477             :     char intBytes[5];
     478             :     int32_t length;
     479           0 :     if(i<=BytesTrie::kMaxTwoByteDelta) {
     480           0 :         intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8));
     481           0 :         length=1;
     482             :     } else {
     483           0 :         if(i<=BytesTrie::kMaxThreeByteDelta) {
     484           0 :             intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16));
     485           0 :             length=2;
     486             :         } else {
     487           0 :             if(i<=0xffffff) {
     488           0 :                 intBytes[0]=(char)BytesTrie::kFourByteDeltaLead;
     489           0 :                 length=3;
     490             :             } else {
     491           0 :                 intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead;
     492           0 :                 intBytes[1]=(char)(i>>24);
     493           0 :                 length=4;
     494             :             }
     495           0 :             intBytes[1]=(char)(i>>16);
     496             :         }
     497           0 :         intBytes[1]=(char)(i>>8);
     498             :     }
     499           0 :     intBytes[length++]=(char)i;
     500           0 :     return write(intBytes, length);
     501             : }
     502             : 
     503             : U_NAMESPACE_END

Generated by: LCOV version 1.13