ICU 58.2  58.2
stringtriebuilder.h
Go to the documentation of this file.
1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2010-2012,2014, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: stringtriebuilder.h
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2010dec24
14 * created by: Markus W. Scherer
15 */
16 
17 #ifndef __STRINGTRIEBUILDER_H__
18 #define __STRINGTRIEBUILDER_H__
19 
20 #include "unicode/utypes.h"
21 #include "unicode/uobject.h"
22 
28 // Forward declaration.
29 struct UHashtable;
30 typedef struct UHashtable UHashtable;
31 
53 };
54 
56 
64 public:
65 #ifndef U_HIDE_INTERNAL_API
66 
67  static UBool hashNode(const void *node);
69  static UBool equalNodes(const void *left, const void *right);
70 #endif /* U_HIDE_INTERNAL_API */
71 
72 protected:
73  // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
74  // or else the compiler will create a public default constructor.
78  virtual ~StringTrieBuilder();
79 
80 #ifndef U_HIDE_INTERNAL_API
81 
82  void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
84  void deleteCompactBuilder();
85 
87  void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
88 
90  int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
92  int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
93 #endif /* U_HIDE_INTERNAL_API */
94 
95  class Node;
96 
97 #ifndef U_HIDE_INTERNAL_API
98 
99  Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
101  Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
102  int32_t length, UErrorCode &errorCode);
103 #endif /* U_HIDE_INTERNAL_API */
104 
106  virtual int32_t getElementStringLength(int32_t i) const = 0;
108  virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
110  virtual int32_t getElementValue(int32_t i) const = 0;
111 
112  // Finds the first unit index after this one where
113  // the first and last element have different units again.
115  virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
116 
117  // Number of different units at unitIndex.
119  virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
121  virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
123  virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
124 
126  virtual UBool matchNodesCanHaveValues() const = 0;
127 
129  virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
131  virtual int32_t getMinLinearMatch() const = 0;
133  virtual int32_t getMaxLinearMatchLength() const = 0;
134 
135 #ifndef U_HIDE_INTERNAL_API
136  // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
138  static const int32_t kMaxBranchLinearSubNodeLength=5;
139 
140  // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
141  // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
143  static const int32_t kMaxSplitBranchLevels=14;
144 
155  Node *registerNode(Node *newNode, UErrorCode &errorCode);
166  Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
167 #endif /* U_HIDE_INTERNAL_API */
168 
169  /*
170  * C++ note:
171  * registerNode() and registerFinalValue() take ownership of their input nodes,
172  * and only return owned nodes.
173  * If they see a failure UErrorCode, they will delete the input node.
174  * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
175  * If there is a failure, they return NULL.
176  *
177  * NULL Node pointers can be safely passed into other Nodes because
178  * they call the static Node::hashCode() which checks for a NULL pointer first.
179  *
180  * Therefore, as long as builder functions register a new node,
181  * they need to check for failures only before explicitly dereferencing
182  * a Node pointer, or before setting a new UErrorCode.
183  */
184 
185  // Hash set of nodes, maps from nodes to integer 1.
187  UHashtable *nodes;
188 
189  // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
190  // it is needed for layout of other objects.
192  class Node : public UObject {
193  public:
194  Node(int32_t initialHash) : hash(initialHash), offset(0) {}
195  inline int32_t hashCode() const { return hash; }
196  // Handles node==NULL.
197  static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
198  // Base class operator==() compares the actual class types.
199  virtual UBool operator==(const Node &other) const;
200  inline UBool operator!=(const Node &other) const { return !operator==(other); }
228  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
229  // write() must set the offset to a positive value.
230  virtual void write(StringTrieBuilder &builder) = 0;
231  // See markRightEdgesFirst.
232  inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
233  StringTrieBuilder &builder) {
234  // Note: Edge numbers are negative, lastRight<=firstRight.
235  // If offset>0 then this node and its sub-nodes have been written already
236  // and we need not write them again.
237  // If this node is part of the unwritten right branch edge,
238  // then we wait until that is written.
239  if(offset<0 && (offset<lastRight || firstRight<offset)) {
240  write(builder);
241  }
242  }
243  inline int32_t getOffset() const { return offset; }
244  protected:
245  int32_t hash;
246  int32_t offset;
247  };
248 
249 #ifndef U_HIDE_INTERNAL_API
250  // This class should not be overridden because
251  // registerFinalValue() compares a stack-allocated FinalValueNode
252  // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
253  // with the input node, and the
254  // !Node::operator==(other) used inside FinalValueNode::operator==(other)
255  // will be false if the typeid's are different.
257  class FinalValueNode : public Node {
258  public:
259  FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
260  virtual UBool operator==(const Node &other) const;
261  virtual void write(StringTrieBuilder &builder);
262  protected:
263  int32_t value;
264  };
265 #endif /* U_HIDE_INTERNAL_API */
266 
267  // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
268  // it is needed for layout of other objects.
272  class ValueNode : public Node {
273  public:
274  ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
275  virtual UBool operator==(const Node &other) const;
276  void setValue(int32_t v) {
277  hasValue=TRUE;
278  value=v;
279  hash=hash*37+v;
280  }
281  protected:
282  UBool hasValue;
283  int32_t value;
284  };
285 
286 #ifndef U_HIDE_INTERNAL_API
287 
291  public:
292  IntermediateValueNode(int32_t v, Node *nextNode)
293  : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
294  virtual UBool operator==(const Node &other) const;
295  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
296  virtual void write(StringTrieBuilder &builder);
297  protected:
298  Node *next;
299  };
300 #endif /* U_HIDE_INTERNAL_API */
301 
302  // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
303  // it is needed for layout of other objects.
307  class LinearMatchNode : public ValueNode {
308  public:
309  LinearMatchNode(int32_t len, Node *nextNode)
310  : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
311  length(len), next(nextNode) {}
312  virtual UBool operator==(const Node &other) const;
313  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
314  protected:
315  int32_t length;
316  Node *next;
317  };
318 
319 #ifndef U_HIDE_INTERNAL_API
320 
323  class BranchNode : public Node {
324  public:
325  BranchNode(int32_t initialHash) : Node(initialHash) {}
326  protected:
327  int32_t firstEdgeNumber;
328  };
329 
333  class ListBranchNode : public BranchNode {
334  public:
335  ListBranchNode() : BranchNode(0x444444), length(0) {}
336  virtual UBool operator==(const Node &other) const;
337  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
338  virtual void write(StringTrieBuilder &builder);
339  // Adds a unit with a final value.
340  void add(int32_t c, int32_t value) {
341  units[length]=(UChar)c;
342  equal[length]=NULL;
343  values[length]=value;
344  ++length;
345  hash=(hash*37+c)*37+value;
346  }
347  // Adds a unit which leads to another match node.
348  void add(int32_t c, Node *node) {
349  units[length]=(UChar)c;
350  equal[length]=node;
351  values[length]=0;
352  ++length;
353  hash=(hash*37+c)*37+hashCode(node);
354  }
355  protected:
356  Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value".
357  int32_t length;
358  int32_t values[kMaxBranchLinearSubNodeLength];
359  UChar units[kMaxBranchLinearSubNodeLength];
360  };
361 
365  class SplitBranchNode : public BranchNode {
366  public:
367  SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
368  : BranchNode(((0x555555*37+middleUnit)*37+
369  hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
370  unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
371  virtual UBool operator==(const Node &other) const;
372  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
373  virtual void write(StringTrieBuilder &builder);
374  protected:
375  UChar unit;
376  Node *lessThan;
377  Node *greaterOrEqual;
378  };
379 
380  // Branch head node, for writing the actual node lead unit.
382  class BranchHeadNode : public ValueNode {
383  public:
384  BranchHeadNode(int32_t len, Node *subNode)
385  : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
386  length(len), next(subNode) {}
387  virtual UBool operator==(const Node &other) const;
388  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
389  virtual void write(StringTrieBuilder &builder);
390  protected:
391  int32_t length;
392  Node *next; // A branch sub-node.
393  };
394 #endif /* U_HIDE_INTERNAL_API */
395 
397  virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
398  Node *nextNode) const = 0;
399 
401  virtual int32_t write(int32_t unit) = 0;
403  virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
405  virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
407  virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
409  virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
410 };
411 
413 
414 #endif // __STRINGTRIEBUILDER_H__
Builds a trie more slowly, attempting to generate a shorter but equivalent serialization.
Base class for string trie builder classes.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:131
UBool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
Definition: stringpiece.h:220
Builds a trie quickly.
#define NULL
Define NULL if necessary, to 0 for C++ and to ((void *)0) for C.
Definition: utypes.h:188
#define TRUE
The TRUE value of a UBool.
Definition: umachine.h:263
C++ API: Common ICU base class UObject.
uint16_t UChar
Define UChar to be UCHAR_TYPE, if that is #defined (for example, to char16_t), or wchar_t if that is ...
Definition: umachine.h:337
UStringTrieBuildOption
Build options for BytesTrieBuilder and CharsTrieBuilder.
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:132
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:396
Basic definitions for ICU, for both C and C++ APIs.
#define FALSE
The FALSE value of a UBool.
Definition: umachine.h:267
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:359
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:223
int8_t UBool
The ICU boolean type.
Definition: umachine.h:259