Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : * Copyright (C) 2010-2012, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : *******************************************************************************
8 : * file name: bytestriebuilder.cpp
9 : * encoding: UTF-8
10 : * tab size: 8 (not used)
11 : * indentation:4
12 : *
13 : * created on: 2010sep25
14 : * created by: Markus W. Scherer
15 : */
16 :
17 : #include "unicode/utypes.h"
18 : #include "unicode/bytestrie.h"
19 : #include "unicode/bytestriebuilder.h"
20 : #include "unicode/stringpiece.h"
21 : #include "charstr.h"
22 : #include "cmemory.h"
23 : #include "uhash.h"
24 : #include "uarrsort.h"
25 : #include "uassert.h"
26 : #include "ustr_imp.h"
27 :
28 : U_NAMESPACE_BEGIN
29 :
30 : /*
31 : * Note: This builder implementation stores (bytes, value) pairs with full copies
32 : * of the byte sequences, until the BytesTrie is built.
33 : * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
34 : */
35 :
36 : class BytesTrieElement : public UMemory {
37 : public:
38 : // Use compiler's default constructor, initializes nothing.
39 :
40 : void setTo(StringPiece s, int32_t val, CharString &strings, UErrorCode &errorCode);
41 :
42 0 : StringPiece getString(const CharString &strings) const {
43 0 : int32_t offset=stringOffset;
44 : int32_t length;
45 0 : if(offset>=0) {
46 0 : length=(uint8_t)strings[offset++];
47 : } else {
48 0 : offset=~offset;
49 0 : length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
50 0 : offset+=2;
51 : }
52 0 : return StringPiece(strings.data()+offset, length);
53 : }
54 0 : int32_t getStringLength(const CharString &strings) const {
55 0 : int32_t offset=stringOffset;
56 0 : if(offset>=0) {
57 0 : return (uint8_t)strings[offset];
58 : } else {
59 0 : offset=~offset;
60 0 : return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
61 : }
62 : }
63 :
64 0 : char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
65 :
66 0 : int32_t getValue() const { return value; }
67 :
68 : int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;
69 :
70 : private:
71 0 : const char *data(const CharString &strings) const {
72 0 : int32_t offset=stringOffset;
73 0 : if(offset>=0) {
74 0 : ++offset;
75 : } else {
76 0 : offset=~offset+2;
77 : }
78 0 : return strings.data()+offset;
79 : }
80 :
81 : // If the stringOffset is non-negative, then the first strings byte contains
82 : // the string length.
83 : // If the stringOffset is negative, then the first two strings bytes contain
84 : // the string length (big-endian), and the offset needs to be bit-inverted.
85 : // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
86 : int32_t stringOffset;
87 : int32_t value;
88 : };
89 :
90 : void
91 0 : BytesTrieElement::setTo(StringPiece s, int32_t val,
92 : CharString &strings, UErrorCode &errorCode) {
93 0 : if(U_FAILURE(errorCode)) {
94 0 : return;
95 : }
96 0 : int32_t length=s.length();
97 0 : if(length>0xffff) {
98 : // Too long: We store the length in 1 or 2 bytes.
99 0 : errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
100 0 : return;
101 : }
102 0 : int32_t offset=strings.length();
103 0 : if(length>0xff) {
104 0 : offset=~offset;
105 0 : strings.append((char)(length>>8), errorCode);
106 : }
107 0 : strings.append((char)length, errorCode);
108 0 : stringOffset=offset;
109 0 : value=val;
110 0 : strings.append(s, errorCode);
111 : }
112 :
113 : int32_t
114 0 : BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {
115 : // TODO: add StringPiece::compare(), see ticket #8187
116 0 : StringPiece thisString=getString(strings);
117 0 : StringPiece otherString=other.getString(strings);
118 0 : int32_t lengthDiff=thisString.length()-otherString.length();
119 : int32_t commonLength;
120 0 : if(lengthDiff<=0) {
121 0 : commonLength=thisString.length();
122 : } else {
123 0 : commonLength=otherString.length();
124 : }
125 0 : int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength);
126 0 : return diff!=0 ? diff : lengthDiff;
127 : }
128 :
129 0 : BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)
130 : : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0),
131 0 : bytes(NULL), bytesCapacity(0), bytesLength(0) {
132 0 : if(U_FAILURE(errorCode)) {
133 0 : return;
134 : }
135 0 : strings=new CharString();
136 0 : if(strings==NULL) {
137 0 : errorCode=U_MEMORY_ALLOCATION_ERROR;
138 : }
139 : }
140 :
141 0 : BytesTrieBuilder::~BytesTrieBuilder() {
142 0 : delete strings;
143 0 : delete[] elements;
144 0 : uprv_free(bytes);
145 0 : }
146 :
147 : BytesTrieBuilder &
148 0 : BytesTrieBuilder::add(StringPiece s, int32_t value, UErrorCode &errorCode) {
149 0 : if(U_FAILURE(errorCode)) {
150 0 : return *this;
151 : }
152 0 : if(bytesLength>0) {
153 : // Cannot add elements after building.
154 0 : errorCode=U_NO_WRITE_PERMISSION;
155 0 : return *this;
156 : }
157 0 : if(elementsLength==elementsCapacity) {
158 : int32_t newCapacity;
159 0 : if(elementsCapacity==0) {
160 0 : newCapacity=1024;
161 : } else {
162 0 : newCapacity=4*elementsCapacity;
163 : }
164 0 : BytesTrieElement *newElements=new BytesTrieElement[newCapacity];
165 0 : if(newElements==NULL) {
166 0 : errorCode=U_MEMORY_ALLOCATION_ERROR;
167 0 : return *this; // error instead of dereferencing null
168 : }
169 0 : if(elementsLength>0) {
170 0 : uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(BytesTrieElement));
171 : }
172 0 : delete[] elements;
173 0 : elements=newElements;
174 0 : elementsCapacity=newCapacity;
175 : }
176 0 : elements[elementsLength++].setTo(s, value, *strings, errorCode);
177 0 : return *this;
178 : }
179 :
180 : U_CDECL_BEGIN
181 :
182 : static int32_t U_CALLCONV
183 0 : compareElementStrings(const void *context, const void *left, const void *right) {
184 0 : const CharString *strings=static_cast<const CharString *>(context);
185 0 : const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left);
186 0 : const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right);
187 0 : return leftElement->compareStringTo(*rightElement, *strings);
188 : }
189 :
190 : U_CDECL_END
191 :
192 : BytesTrie *
193 0 : BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
194 0 : buildBytes(buildOption, errorCode);
195 0 : BytesTrie *newTrie=NULL;
196 0 : if(U_SUCCESS(errorCode)) {
197 0 : newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));
198 0 : if(newTrie==NULL) {
199 0 : errorCode=U_MEMORY_ALLOCATION_ERROR;
200 : } else {
201 0 : bytes=NULL; // The new trie now owns the array.
202 0 : bytesCapacity=0;
203 : }
204 : }
205 0 : return newTrie;
206 : }
207 :
208 : StringPiece
209 0 : BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
210 0 : buildBytes(buildOption, errorCode);
211 0 : StringPiece result;
212 0 : if(U_SUCCESS(errorCode)) {
213 0 : result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
214 : }
215 0 : return result;
216 : }
217 :
218 : void
219 0 : BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
220 0 : if(U_FAILURE(errorCode)) {
221 0 : return;
222 : }
223 0 : if(bytes!=NULL && bytesLength>0) {
224 : // Already built.
225 0 : return;
226 : }
227 0 : if(bytesLength==0) {
228 0 : if(elementsLength==0) {
229 0 : errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
230 0 : return;
231 : }
232 0 : uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement),
233 0 : compareElementStrings, strings,
234 : FALSE, // need not be a stable sort
235 0 : &errorCode);
236 0 : if(U_FAILURE(errorCode)) {
237 0 : return;
238 : }
239 : // Duplicate strings are not allowed.
240 0 : StringPiece prev=elements[0].getString(*strings);
241 0 : for(int32_t i=1; i<elementsLength; ++i) {
242 0 : StringPiece current=elements[i].getString(*strings);
243 0 : if(prev==current) {
244 0 : errorCode=U_ILLEGAL_ARGUMENT_ERROR;
245 0 : return;
246 : }
247 0 : prev=current;
248 : }
249 : }
250 : // Create and byte-serialize the trie for the elements.
251 0 : bytesLength=0;
252 0 : int32_t capacity=strings->length();
253 0 : if(capacity<1024) {
254 0 : capacity=1024;
255 : }
256 0 : if(bytesCapacity<capacity) {
257 0 : uprv_free(bytes);
258 0 : bytes=static_cast<char *>(uprv_malloc(capacity));
259 0 : if(bytes==NULL) {
260 0 : errorCode=U_MEMORY_ALLOCATION_ERROR;
261 0 : bytesCapacity=0;
262 0 : return;
263 : }
264 0 : bytesCapacity=capacity;
265 : }
266 0 : StringTrieBuilder::build(buildOption, elementsLength, errorCode);
267 0 : if(bytes==NULL) {
268 0 : errorCode=U_MEMORY_ALLOCATION_ERROR;
269 : }
270 : }
271 :
272 : BytesTrieBuilder &
273 0 : BytesTrieBuilder::clear() {
274 0 : strings->clear();
275 0 : elementsLength=0;
276 0 : bytesLength=0;
277 0 : return *this;
278 : }
279 :
280 : int32_t
281 0 : BytesTrieBuilder::getElementStringLength(int32_t i) const {
282 0 : return elements[i].getStringLength(*strings);
283 : }
284 :
285 : UChar
286 0 : BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
287 0 : return (uint8_t)elements[i].charAt(byteIndex, *strings);
288 : }
289 :
290 : int32_t
291 0 : BytesTrieBuilder::getElementValue(int32_t i) const {
292 0 : return elements[i].getValue();
293 : }
294 :
295 : int32_t
296 0 : BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
297 0 : const BytesTrieElement &firstElement=elements[first];
298 0 : const BytesTrieElement &lastElement=elements[last];
299 0 : int32_t minStringLength=firstElement.getStringLength(*strings);
300 0 : while(++byteIndex<minStringLength &&
301 0 : firstElement.charAt(byteIndex, *strings)==
302 0 : lastElement.charAt(byteIndex, *strings)) {}
303 0 : return byteIndex;
304 : }
305 :
306 : int32_t
307 0 : BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
308 0 : int32_t length=0; // Number of different bytes at byteIndex.
309 0 : int32_t i=start;
310 0 : do {
311 0 : char byte=elements[i++].charAt(byteIndex, *strings);
312 0 : while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) {
313 0 : ++i;
314 : }
315 0 : ++length;
316 0 : } while(i<limit);
317 0 : return length;
318 : }
319 :
320 : int32_t
321 0 : BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
322 0 : do {
323 0 : char byte=elements[i++].charAt(byteIndex, *strings);
324 0 : while(byte==elements[i].charAt(byteIndex, *strings)) {
325 0 : ++i;
326 : }
327 : } while(--count>0);
328 0 : return i;
329 : }
330 :
331 : int32_t
332 0 : BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const {
333 0 : char b=(char)byte;
334 0 : while(b==elements[i].charAt(byteIndex, *strings)) {
335 0 : ++i;
336 : }
337 0 : return i;
338 : }
339 :
340 0 : BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
341 0 : : LinearMatchNode(len, nextNode), s(bytes) {
342 0 : hash=hash*37+ustr_hashCharsN(bytes, len);
343 0 : }
344 :
345 : UBool
346 0 : BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
347 0 : if(this==&other) {
348 0 : return TRUE;
349 : }
350 0 : if(!LinearMatchNode::operator==(other)) {
351 0 : return FALSE;
352 : }
353 0 : const BTLinearMatchNode &o=(const BTLinearMatchNode &)other;
354 0 : return 0==uprv_memcmp(s, o.s, length);
355 : }
356 :
357 : void
358 0 : BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {
359 0 : BytesTrieBuilder &b=(BytesTrieBuilder &)builder;
360 0 : next->write(builder);
361 0 : b.write(s, length);
362 0 : offset=b.write(b.getMinLinearMatch()+length-1);
363 0 : }
364 :
365 : StringTrieBuilder::Node *
366 0 : BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
367 : Node *nextNode) const {
368 : return new BTLinearMatchNode(
369 0 : elements[i].getString(*strings).data()+byteIndex,
370 : length,
371 0 : nextNode);
372 : }
373 :
374 : UBool
375 0 : BytesTrieBuilder::ensureCapacity(int32_t length) {
376 0 : if(bytes==NULL) {
377 0 : return FALSE; // previous memory allocation had failed
378 : }
379 0 : if(length>bytesCapacity) {
380 0 : int32_t newCapacity=bytesCapacity;
381 0 : do {
382 0 : newCapacity*=2;
383 0 : } while(newCapacity<=length);
384 0 : char *newBytes=static_cast<char *>(uprv_malloc(newCapacity));
385 0 : if(newBytes==NULL) {
386 : // unable to allocate memory
387 0 : uprv_free(bytes);
388 0 : bytes=NULL;
389 0 : bytesCapacity=0;
390 0 : return FALSE;
391 : }
392 0 : uprv_memcpy(newBytes+(newCapacity-bytesLength),
393 0 : bytes+(bytesCapacity-bytesLength), bytesLength);
394 0 : uprv_free(bytes);
395 0 : bytes=newBytes;
396 0 : bytesCapacity=newCapacity;
397 : }
398 0 : return TRUE;
399 : }
400 :
401 : int32_t
402 0 : BytesTrieBuilder::write(int32_t byte) {
403 0 : int32_t newLength=bytesLength+1;
404 0 : if(ensureCapacity(newLength)) {
405 0 : bytesLength=newLength;
406 0 : bytes[bytesCapacity-bytesLength]=(char)byte;
407 : }
408 0 : return bytesLength;
409 : }
410 :
411 : int32_t
412 0 : BytesTrieBuilder::write(const char *b, int32_t length) {
413 0 : int32_t newLength=bytesLength+length;
414 0 : if(ensureCapacity(newLength)) {
415 0 : bytesLength=newLength;
416 0 : uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
417 : }
418 0 : return bytesLength;
419 : }
420 :
421 : int32_t
422 0 : BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
423 0 : return write(elements[i].getString(*strings).data()+byteIndex, length);
424 : }
425 :
426 : int32_t
427 0 : BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
428 0 : if(0<=i && i<=BytesTrie::kMaxOneByteValue) {
429 0 : return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);
430 : }
431 : char intBytes[5];
432 0 : int32_t length=1;
433 0 : if(i<0 || i>0xffffff) {
434 0 : intBytes[0]=(char)BytesTrie::kFiveByteValueLead;
435 0 : intBytes[1]=(char)((uint32_t)i>>24);
436 0 : intBytes[2]=(char)((uint32_t)i>>16);
437 0 : intBytes[3]=(char)((uint32_t)i>>8);
438 0 : intBytes[4]=(char)i;
439 0 : length=5;
440 : // } else if(i<=BytesTrie::kMaxOneByteValue) {
441 : // intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);
442 : } else {
443 0 : if(i<=BytesTrie::kMaxTwoByteValue) {
444 0 : intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8));
445 : } else {
446 0 : if(i<=BytesTrie::kMaxThreeByteValue) {
447 0 : intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16));
448 : } else {
449 0 : intBytes[0]=(char)BytesTrie::kFourByteValueLead;
450 0 : intBytes[1]=(char)(i>>16);
451 0 : length=2;
452 : }
453 0 : intBytes[length++]=(char)(i>>8);
454 : }
455 0 : intBytes[length++]=(char)i;
456 : }
457 0 : intBytes[0]=(char)((intBytes[0]<<1)|isFinal);
458 0 : return write(intBytes, length);
459 : }
460 :
461 : int32_t
462 0 : BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
463 0 : int32_t offset=write(node);
464 0 : if(hasValue) {
465 0 : offset=writeValueAndFinal(value, FALSE);
466 : }
467 0 : return offset;
468 : }
469 :
470 : int32_t
471 0 : BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
472 0 : int32_t i=bytesLength-jumpTarget;
473 0 : U_ASSERT(i>=0);
474 0 : if(i<=BytesTrie::kMaxOneByteDelta) {
475 0 : return write(i);
476 : }
477 : char intBytes[5];
478 : int32_t length;
479 0 : if(i<=BytesTrie::kMaxTwoByteDelta) {
480 0 : intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8));
481 0 : length=1;
482 : } else {
483 0 : if(i<=BytesTrie::kMaxThreeByteDelta) {
484 0 : intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16));
485 0 : length=2;
486 : } else {
487 0 : if(i<=0xffffff) {
488 0 : intBytes[0]=(char)BytesTrie::kFourByteDeltaLead;
489 0 : length=3;
490 : } else {
491 0 : intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead;
492 0 : intBytes[1]=(char)(i>>24);
493 0 : length=4;
494 : }
495 0 : intBytes[1]=(char)(i>>16);
496 : }
497 0 : intBytes[1]=(char)(i>>8);
498 : }
499 0 : intBytes[length++]=(char)i;
500 0 : return write(intBytes, length);
501 : }
502 :
503 : U_NAMESPACE_END
|