Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : * Copyright (C) 2010-2012,2014, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : *******************************************************************************
8 : * file name: stringtriebuilder.h
9 : * encoding: UTF-8
10 : * tab size: 8 (not used)
11 : * indentation:4
12 : *
13 : * created on: 2010dec24
14 : * created by: Markus W. Scherer
15 : */
16 :
17 : #ifndef __STRINGTRIEBUILDER_H__
18 : #define __STRINGTRIEBUILDER_H__
19 :
20 : #include "unicode/utypes.h"
21 : #include "unicode/uobject.h"
22 :
23 : /**
24 : * \file
25 : * \brief C++ API: Builder API for trie builders
26 : */
27 :
28 : // Forward declaration.
29 : struct UHashtable;
30 : typedef struct UHashtable UHashtable;
31 :
32 : /**
33 : * Build options for BytesTrieBuilder and CharsTrieBuilder.
34 : * @stable ICU 4.8
35 : */
36 : enum UStringTrieBuildOption {
37 : /**
38 : * Builds a trie quickly.
39 : * @stable ICU 4.8
40 : */
41 : USTRINGTRIE_BUILD_FAST,
42 : /**
43 : * Builds a trie more slowly, attempting to generate
44 : * a shorter but equivalent serialization.
45 : * This build option also uses more memory.
46 : *
47 : * This option can be effective when many integer values are the same
48 : * and string/byte sequence suffixes can be shared.
49 : * Runtime speed is not expected to improve.
50 : * @stable ICU 4.8
51 : */
52 : USTRINGTRIE_BUILD_SMALL
53 : };
54 :
55 : U_NAMESPACE_BEGIN
56 :
57 : /**
58 : * Base class for string trie builder classes.
59 : *
60 : * This class is not intended for public subclassing.
61 : * @stable ICU 4.8
62 : */
63 : class U_COMMON_API StringTrieBuilder : public UObject {
64 : public:
65 : #ifndef U_HIDE_INTERNAL_API
66 : /** @internal */
67 : static UBool hashNode(const void *node);
68 : /** @internal */
69 : static UBool equalNodes(const void *left, const void *right);
70 : #endif /* U_HIDE_INTERNAL_API */
71 :
72 : protected:
73 : // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
74 : // or else the compiler will create a public default constructor.
75 : /** @internal */
76 : StringTrieBuilder();
77 : /** @internal */
78 : virtual ~StringTrieBuilder();
79 :
80 : #ifndef U_HIDE_INTERNAL_API
81 : /** @internal */
82 : void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
83 : /** @internal */
84 : void deleteCompactBuilder();
85 :
86 : /** @internal */
87 : void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
88 :
89 : /** @internal */
90 : int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
91 : /** @internal */
92 : int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
93 : #endif /* U_HIDE_INTERNAL_API */
94 :
95 : class Node;
96 :
97 : #ifndef U_HIDE_INTERNAL_API
98 : /** @internal */
99 : Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
100 : /** @internal */
101 : Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
102 : int32_t length, UErrorCode &errorCode);
103 : #endif /* U_HIDE_INTERNAL_API */
104 :
105 : /** @internal */
106 : virtual int32_t getElementStringLength(int32_t i) const = 0;
107 : /** @internal */
108 : virtual char16_t getElementUnit(int32_t i, int32_t unitIndex) const = 0;
109 : /** @internal */
110 : virtual int32_t getElementValue(int32_t i) const = 0;
111 :
112 : // Finds the first unit index after this one where
113 : // the first and last element have different units again.
114 : /** @internal */
115 : virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
116 :
117 : // Number of different units at unitIndex.
118 : /** @internal */
119 : virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
120 : /** @internal */
121 : virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
122 : /** @internal */
123 : virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, char16_t unit) const = 0;
124 :
125 : /** @internal */
126 : virtual UBool matchNodesCanHaveValues() const = 0;
127 :
128 : /** @internal */
129 : virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
130 : /** @internal */
131 : virtual int32_t getMinLinearMatch() const = 0;
132 : /** @internal */
133 : virtual int32_t getMaxLinearMatchLength() const = 0;
134 :
135 : #ifndef U_HIDE_INTERNAL_API
136 : // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
137 : /** @internal */
138 : static const int32_t kMaxBranchLinearSubNodeLength=5;
139 :
140 : // Maximum number of nested split-branch levels for a branch on all 2^16 possible char16_t units.
141 : // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
142 : /** @internal */
143 : static const int32_t kMaxSplitBranchLevels=14;
144 :
145 : /**
146 : * Makes sure that there is only one unique node registered that is
147 : * equivalent to newNode.
148 : * @param newNode Input node. The builder takes ownership.
149 : * @param errorCode ICU in/out UErrorCode.
150 : Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
151 : * @return newNode if it is the first of its kind, or
152 : * an equivalent node if newNode is a duplicate.
153 : * @internal
154 : */
155 : Node *registerNode(Node *newNode, UErrorCode &errorCode);
156 : /**
157 : * Makes sure that there is only one unique FinalValueNode registered
158 : * with this value.
159 : * Avoids creating a node if the value is a duplicate.
160 : * @param value A final value.
161 : * @param errorCode ICU in/out UErrorCode.
162 : Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
163 : * @return A FinalValueNode with the given value.
164 : * @internal
165 : */
166 : Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
167 : #endif /* U_HIDE_INTERNAL_API */
168 :
169 : /*
170 : * C++ note:
171 : * registerNode() and registerFinalValue() take ownership of their input nodes,
172 : * and only return owned nodes.
173 : * If they see a failure UErrorCode, they will delete the input node.
174 : * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
175 : * If there is a failure, they return NULL.
176 : *
177 : * NULL Node pointers can be safely passed into other Nodes because
178 : * they call the static Node::hashCode() which checks for a NULL pointer first.
179 : *
180 : * Therefore, as long as builder functions register a new node,
181 : * they need to check for failures only before explicitly dereferencing
182 : * a Node pointer, or before setting a new UErrorCode.
183 : */
184 :
185 : // Hash set of nodes, maps from nodes to integer 1.
186 : /** @internal */
187 : UHashtable *nodes;
188 :
189 : // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
190 : // it is needed for layout of other objects.
191 : /** @internal */
192 0 : class Node : public UObject {
193 : public:
194 0 : Node(int32_t initialHash) : hash(initialHash), offset(0) {}
195 0 : inline int32_t hashCode() const { return hash; }
196 : // Handles node==NULL.
197 0 : static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
198 : // Base class operator==() compares the actual class types.
199 : virtual UBool operator==(const Node &other) const;
200 : inline UBool operator!=(const Node &other) const { return !operator==(other); }
201 : /**
202 : * Traverses the Node graph and numbers branch edges, with rightmost edges first.
203 : * This is to avoid writing a duplicate node twice.
204 : *
205 : * Branch nodes in this trie data structure are not symmetric.
206 : * Most branch edges "jump" to other nodes but the rightmost branch edges
207 : * just continue without a jump.
208 : * Therefore, write() must write the rightmost branch edge last
209 : * (trie units are written backwards), and must write it at that point even if
210 : * it is a duplicate of a node previously written elsewhere.
211 : *
212 : * This function visits and marks right branch edges first.
213 : * Edges are numbered with increasingly negative values because we share the
214 : * offset field which gets positive values when nodes are written.
215 : * A branch edge also remembers the first number for any of its edges.
216 : *
217 : * When a further-left branch edge has a number in the range of the rightmost
218 : * edge's numbers, then it will be written as part of the required right edge
219 : * and we can avoid writing it first.
220 : *
221 : * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
222 : * edge numbers.
223 : *
224 : * @param edgeNumber The first edge number for this node and its sub-nodes.
225 : * @return An edge number that is at least the maximum-negative
226 : * of the input edge number and the numbers of this node and all of its sub-nodes.
227 : */
228 : virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
229 : // write() must set the offset to a positive value.
230 : virtual void write(StringTrieBuilder &builder) = 0;
231 : // See markRightEdgesFirst.
232 0 : inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
233 : StringTrieBuilder &builder) {
234 : // Note: Edge numbers are negative, lastRight<=firstRight.
235 : // If offset>0 then this node and its sub-nodes have been written already
236 : // and we need not write them again.
237 : // If this node is part of the unwritten right branch edge,
238 : // then we wait until that is written.
239 0 : if(offset<0 && (offset<lastRight || firstRight<offset)) {
240 0 : write(builder);
241 : }
242 0 : }
243 0 : inline int32_t getOffset() const { return offset; }
244 : protected:
245 : int32_t hash;
246 : int32_t offset;
247 : };
248 :
249 : #ifndef U_HIDE_INTERNAL_API
250 : // This class should not be overridden because
251 : // registerFinalValue() compares a stack-allocated FinalValueNode
252 : // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
253 : // with the input node, and the
254 : // !Node::operator==(other) used inside FinalValueNode::operator==(other)
255 : // will be false if the typeid's are different.
256 : /** @internal */
257 0 : class FinalValueNode : public Node {
258 : public:
259 0 : FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
260 : virtual UBool operator==(const Node &other) const;
261 : virtual void write(StringTrieBuilder &builder);
262 : protected:
263 : int32_t value;
264 : };
265 : #endif /* U_HIDE_INTERNAL_API */
266 :
267 : // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
268 : // it is needed for layout of other objects.
269 : /**
270 : * @internal
271 : */
272 0 : class ValueNode : public Node {
273 : public:
274 0 : ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
275 : virtual UBool operator==(const Node &other) const;
276 0 : void setValue(int32_t v) {
277 0 : hasValue=TRUE;
278 0 : value=v;
279 0 : hash=hash*37+v;
280 0 : }
281 : protected:
282 : UBool hasValue;
283 : int32_t value;
284 : };
285 :
286 : #ifndef U_HIDE_INTERNAL_API
287 : /**
288 : * @internal
289 : */
290 0 : class IntermediateValueNode : public ValueNode {
291 : public:
292 0 : IntermediateValueNode(int32_t v, Node *nextNode)
293 0 : : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
294 : virtual UBool operator==(const Node &other) const;
295 : virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
296 : virtual void write(StringTrieBuilder &builder);
297 : protected:
298 : Node *next;
299 : };
300 : #endif /* U_HIDE_INTERNAL_API */
301 :
302 : // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
303 : // it is needed for layout of other objects.
304 : /**
305 : * @internal
306 : */
307 0 : class LinearMatchNode : public ValueNode {
308 : public:
309 0 : LinearMatchNode(int32_t len, Node *nextNode)
310 0 : : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
311 0 : length(len), next(nextNode) {}
312 : virtual UBool operator==(const Node &other) const;
313 : virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
314 : protected:
315 : int32_t length;
316 : Node *next;
317 : };
318 :
319 : #ifndef U_HIDE_INTERNAL_API
320 : /**
321 : * @internal
322 : */
323 0 : class BranchNode : public Node {
324 : public:
325 0 : BranchNode(int32_t initialHash) : Node(initialHash) {}
326 : protected:
327 : int32_t firstEdgeNumber;
328 : };
329 :
330 : /**
331 : * @internal
332 : */
333 0 : class ListBranchNode : public BranchNode {
334 : public:
335 0 : ListBranchNode() : BranchNode(0x444444), length(0) {}
336 : virtual UBool operator==(const Node &other) const;
337 : virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
338 : virtual void write(StringTrieBuilder &builder);
339 : // Adds a unit with a final value.
340 0 : void add(int32_t c, int32_t value) {
341 0 : units[length]=(char16_t)c;
342 0 : equal[length]=NULL;
343 0 : values[length]=value;
344 0 : ++length;
345 0 : hash=(hash*37+c)*37+value;
346 0 : }
347 : // Adds a unit which leads to another match node.
348 0 : void add(int32_t c, Node *node) {
349 0 : units[length]=(char16_t)c;
350 0 : equal[length]=node;
351 0 : values[length]=0;
352 0 : ++length;
353 0 : hash=(hash*37+c)*37+hashCode(node);
354 0 : }
355 : protected:
356 : Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value".
357 : int32_t length;
358 : int32_t values[kMaxBranchLinearSubNodeLength];
359 : char16_t units[kMaxBranchLinearSubNodeLength];
360 : };
361 :
362 : /**
363 : * @internal
364 : */
365 0 : class SplitBranchNode : public BranchNode {
366 : public:
367 0 : SplitBranchNode(char16_t middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
368 0 : : BranchNode(((0x555555*37+middleUnit)*37+
369 0 : hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
370 0 : unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
371 : virtual UBool operator==(const Node &other) const;
372 : virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
373 : virtual void write(StringTrieBuilder &builder);
374 : protected:
375 : char16_t unit;
376 : Node *lessThan;
377 : Node *greaterOrEqual;
378 : };
379 :
380 : // Branch head node, for writing the actual node lead unit.
381 : /** @internal */
382 0 : class BranchHeadNode : public ValueNode {
383 : public:
384 0 : BranchHeadNode(int32_t len, Node *subNode)
385 0 : : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
386 0 : length(len), next(subNode) {}
387 : virtual UBool operator==(const Node &other) const;
388 : virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
389 : virtual void write(StringTrieBuilder &builder);
390 : protected:
391 : int32_t length;
392 : Node *next; // A branch sub-node.
393 : };
394 : #endif /* U_HIDE_INTERNAL_API */
395 :
396 : /** @internal */
397 : virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
398 : Node *nextNode) const = 0;
399 :
400 : /** @internal */
401 : virtual int32_t write(int32_t unit) = 0;
402 : /** @internal */
403 : virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
404 : /** @internal */
405 : virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
406 : /** @internal */
407 : virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
408 : /** @internal */
409 : virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
410 : };
411 :
412 : U_NAMESPACE_END
413 :
414 : #endif // __STRINGTRIEBUILDER_H__
|