ICU 53.1  53.1
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ucharstrie.h
Go to the documentation of this file.
1 /*
2 *******************************************************************************
3 * Copyright (C) 2010-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: ucharstrie.h
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2010nov14
12 * created by: Markus W. Scherer
13 */
14 
15 #ifndef __UCHARSTRIE_H__
16 #define __UCHARSTRIE_H__
17 
24 #include "unicode/utypes.h"
25 #include "unicode/unistr.h"
26 #include "unicode/uobject.h"
27 #include "unicode/ustringtrie.h"
28 
30 
31 class Appendable;
32 class UCharsTrieBuilder;
33 class UVector32;
34 
49 public:
64  UCharsTrie(const UChar *trieUChars)
65  : ownedArray_(NULL), uchars_(trieUChars),
66  pos_(uchars_), remainingMatchLength_(-1) {}
67 
72  ~UCharsTrie();
73 
80  UCharsTrie(const UCharsTrie &other)
81  : ownedArray_(NULL), uchars_(other.uchars_),
82  pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
83 
90  pos_=uchars_;
91  remainingMatchLength_=-1;
92  return *this;
93  }
94 
100  class State : public UMemory {
101  public:
106  State() { uchars=NULL; }
107  private:
108  friend class UCharsTrie;
109 
110  const UChar *uchars;
111  const UChar *pos;
112  int32_t remainingMatchLength;
113  };
114 
122  const UCharsTrie &saveState(State &state) const {
123  state.uchars=uchars_;
124  state.pos=pos_;
125  state.remainingMatchLength=remainingMatchLength_;
126  return *this;
127  }
128 
139  UCharsTrie &resetToState(const State &state) {
140  if(uchars_==state.uchars && uchars_!=NULL) {
141  pos_=state.pos;
142  remainingMatchLength_=state.remainingMatchLength;
143  }
144  return *this;
145  }
146 
153  UStringTrieResult current() const;
154 
162  inline UStringTrieResult first(int32_t uchar) {
163  remainingMatchLength_=-1;
164  return nextImpl(uchars_, uchar);
165  }
166 
175  UStringTrieResult firstForCodePoint(UChar32 cp);
176 
183  UStringTrieResult next(int32_t uchar);
184 
192  UStringTrieResult nextForCodePoint(UChar32 cp);
193 
209  UStringTrieResult next(const UChar *s, int32_t length);
210 
220  inline int32_t getValue() const {
221  const UChar *pos=pos_;
222  int32_t leadUnit=*pos++;
223  // U_ASSERT(leadUnit>=kMinValueLead);
224  return leadUnit&kValueIsFinal ?
225  readValue(pos, leadUnit&0x7fff) : readNodeValue(pos, leadUnit);
226  }
227 
237  inline UBool hasUniqueValue(int32_t &uniqueValue) const {
238  const UChar *pos=pos_;
239  // Skip the rest of a pending linear-match node.
240  return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
241  }
242 
250  int32_t getNextUChars(Appendable &out) const;
251 
256  class U_COMMON_API Iterator : public UMemory {
257  public:
269  Iterator(const UChar *trieUChars, int32_t maxStringLength, UErrorCode &errorCode);
270 
282  Iterator(const UCharsTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
283 
288  ~Iterator();
289 
295  Iterator &reset();
296 
301  UBool hasNext() const;
302 
317  UBool next(UErrorCode &errorCode);
318 
323  const UnicodeString &getString() const { return str_; }
328  int32_t getValue() const { return value_; }
329 
330  private:
331  UBool truncateAndStop() {
332  pos_=NULL;
333  value_=-1; // no real value for str
334  return TRUE;
335  }
336 
337  const UChar *branchNext(const UChar *pos, int32_t length, UErrorCode &errorCode);
338 
339  const UChar *uchars_;
340  const UChar *pos_;
341  const UChar *initialPos_;
342  int32_t remainingMatchLength_;
343  int32_t initialRemainingMatchLength_;
344  UBool skipValue_; // Skip intermediate value which was already delivered.
345 
346  UnicodeString str_;
347  int32_t maxLength_;
348  int32_t value_;
349 
350  // The stack stores pairs of integers for backtracking to another
351  // outbound edge of a branch node.
352  // The first integer is an offset from uchars_.
353  // The second integer has the str_.length() from before the node in bits 15..0,
354  // and the remaining branch length in bits 31..16.
355  // (We could store the remaining branch length minus 1 in bits 30..16 and not use the sign bit,
356  // but the code looks more confusing that way.)
357  UVector32 *stack_;
358  };
359 
360 private:
361  friend class UCharsTrieBuilder;
362 
369  UCharsTrie(UChar *adoptUChars, const UChar *trieUChars)
370  : ownedArray_(adoptUChars), uchars_(trieUChars),
371  pos_(uchars_), remainingMatchLength_(-1) {}
372 
373  // No assignment operator.
374  UCharsTrie &operator=(const UCharsTrie &other);
375 
376  inline void stop() {
377  pos_=NULL;
378  }
379 
380  // Reads a compact 32-bit integer.
381  // pos is already after the leadUnit, and the lead unit has bit 15 reset.
382  static inline int32_t readValue(const UChar *pos, int32_t leadUnit) {
383  int32_t value;
384  if(leadUnit<kMinTwoUnitValueLead) {
385  value=leadUnit;
386  } else if(leadUnit<kThreeUnitValueLead) {
387  value=((leadUnit-kMinTwoUnitValueLead)<<16)|*pos;
388  } else {
389  value=(pos[0]<<16)|pos[1];
390  }
391  return value;
392  }
393  static inline const UChar *skipValue(const UChar *pos, int32_t leadUnit) {
394  if(leadUnit>=kMinTwoUnitValueLead) {
395  if(leadUnit<kThreeUnitValueLead) {
396  ++pos;
397  } else {
398  pos+=2;
399  }
400  }
401  return pos;
402  }
403  static inline const UChar *skipValue(const UChar *pos) {
404  int32_t leadUnit=*pos++;
405  return skipValue(pos, leadUnit&0x7fff);
406  }
407 
408  static inline int32_t readNodeValue(const UChar *pos, int32_t leadUnit) {
409  // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
410  int32_t value;
411  if(leadUnit<kMinTwoUnitNodeValueLead) {
412  value=(leadUnit>>6)-1;
413  } else if(leadUnit<kThreeUnitNodeValueLead) {
414  value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|*pos;
415  } else {
416  value=(pos[0]<<16)|pos[1];
417  }
418  return value;
419  }
420  static inline const UChar *skipNodeValue(const UChar *pos, int32_t leadUnit) {
421  // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
422  if(leadUnit>=kMinTwoUnitNodeValueLead) {
423  if(leadUnit<kThreeUnitNodeValueLead) {
424  ++pos;
425  } else {
426  pos+=2;
427  }
428  }
429  return pos;
430  }
431 
432  static inline const UChar *jumpByDelta(const UChar *pos) {
433  int32_t delta=*pos++;
434  if(delta>=kMinTwoUnitDeltaLead) {
435  if(delta==kThreeUnitDeltaLead) {
436  delta=(pos[0]<<16)|pos[1];
437  pos+=2;
438  } else {
439  delta=((delta-kMinTwoUnitDeltaLead)<<16)|*pos++;
440  }
441  }
442  return pos+delta;
443  }
444 
445  static const UChar *skipDelta(const UChar *pos) {
446  int32_t delta=*pos++;
447  if(delta>=kMinTwoUnitDeltaLead) {
448  if(delta==kThreeUnitDeltaLead) {
449  pos+=2;
450  } else {
451  ++pos;
452  }
453  }
454  return pos;
455  }
456 
457  static inline UStringTrieResult valueResult(int32_t node) {
459  }
460 
461  // Handles a branch node for both next(uchar) and next(string).
462  UStringTrieResult branchNext(const UChar *pos, int32_t length, int32_t uchar);
463 
464  // Requires remainingLength_<0.
465  UStringTrieResult nextImpl(const UChar *pos, int32_t uchar);
466 
467  // Helper functions for hasUniqueValue().
468  // Recursively finds a unique value (or whether there is not a unique one)
469  // from a branch.
470  static const UChar *findUniqueValueFromBranch(const UChar *pos, int32_t length,
471  UBool haveUniqueValue, int32_t &uniqueValue);
472  // Recursively finds a unique value (or whether there is not a unique one)
473  // starting from a position on a node lead unit.
474  static UBool findUniqueValue(const UChar *pos, UBool haveUniqueValue, int32_t &uniqueValue);
475 
476  // Helper functions for getNextUChars().
477  // getNextUChars() when pos is on a branch node.
478  static void getNextBranchUChars(const UChar *pos, int32_t length, Appendable &out);
479 
480  // UCharsTrie data structure
481  //
482  // The trie consists of a series of UChar-serialized nodes for incremental
483  // Unicode string/UChar sequence matching. (UChar=16-bit unsigned integer)
484  // The root node is at the beginning of the trie data.
485  //
486  // Types of nodes are distinguished by their node lead unit ranges.
487  // After each node, except a final-value node, another node follows to
488  // encode match values or continue matching further units.
489  //
490  // Node types:
491  // - Final-value node: Stores a 32-bit integer in a compact, variable-length format.
492  // The value is for the string/UChar sequence so far.
493  // - Match node, optionally with an intermediate value in a different compact format.
494  // The value, if present, is for the string/UChar sequence so far.
495  //
496  // Aside from the value, which uses the node lead unit's high bits:
497  //
498  // - Linear-match node: Matches a number of units.
499  // - Branch node: Branches to other nodes according to the current input unit.
500  // The node unit is the length of the branch (number of units to select from)
501  // minus 1. It is followed by a sub-node:
502  // - If the length is at most kMaxBranchLinearSubNodeLength, then
503  // there are length-1 (key, value) pairs and then one more comparison unit.
504  // If one of the key units matches, then the value is either a final value for
505  // the string so far, or a "jump" delta to the next node.
506  // If the last unit matches, then matching continues with the next node.
507  // (Values have the same encoding as final-value nodes.)
508  // - If the length is greater than kMaxBranchLinearSubNodeLength, then
509  // there is one unit and one "jump" delta.
510  // If the input unit is less than the sub-node unit, then "jump" by delta to
511  // the next sub-node which will have a length of length/2.
512  // (The delta has its own compact encoding.)
513  // Otherwise, skip the "jump" delta to the next sub-node
514  // which will have a length of length-length/2.
515 
516  // Match-node lead unit values, after masking off intermediate-value bits:
517 
518  // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
519  // the length is one more than the next unit.
520 
521  // For a branch sub-node with at most this many entries, we drop down
522  // to a linear search.
523  static const int32_t kMaxBranchLinearSubNodeLength=5;
524 
525  // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
526  static const int32_t kMinLinearMatch=0x30;
527  static const int32_t kMaxLinearMatchLength=0x10;
528 
529  // Match-node lead unit bits 14..6 for the optional intermediate value.
530  // If these bits are 0, then there is no intermediate value.
531  // Otherwise, see the *NodeValue* constants below.
532  static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x0040
533  static const int32_t kNodeTypeMask=kMinValueLead-1; // 0x003f
534 
535  // A final-value node has bit 15 set.
536  static const int32_t kValueIsFinal=0x8000;
537 
538  // Compact value: After testing and masking off bit 15, use the following thresholds.
539  static const int32_t kMaxOneUnitValue=0x3fff;
540 
541  static const int32_t kMinTwoUnitValueLead=kMaxOneUnitValue+1; // 0x4000
542  static const int32_t kThreeUnitValueLead=0x7fff;
543 
544  static const int32_t kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1; // 0x3ffeffff
545 
546  // Compact intermediate-value integer, lead unit shared with a branch or linear-match node.
547  static const int32_t kMaxOneUnitNodeValue=0xff;
548  static const int32_t kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6); // 0x4040
549  static const int32_t kThreeUnitNodeValueLead=0x7fc0;
550 
551  static const int32_t kMaxTwoUnitNodeValue=
552  ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1; // 0xfdffff
553 
554  // Compact delta integers.
555  static const int32_t kMaxOneUnitDelta=0xfbff;
556  static const int32_t kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1; // 0xfc00
557  static const int32_t kThreeUnitDeltaLead=0xffff;
558 
559  static const int32_t kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff
560 
561  UChar *ownedArray_;
562 
563  // Fixed value referencing the UCharsTrie words.
564  const UChar *uchars_;
565 
566  // Iterator variables.
567 
568  // Pointer to next trie unit to read. NULL if no more matches.
569  const UChar *pos_;
570  // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
571  int32_t remainingMatchLength_;
572 };
573 
575 
576 #endif // __UCHARSTRIE_H__
UCharsTrie & resetToState(const State &state)
Resets this trie to the saved state.
Definition: ucharstrie.h:139
UBool hasUniqueValue(int32_t &uniqueValue) const
Determines whether all strings reachable from the current state map to the same value.
Definition: ucharstrie.h:237
C++ API: Unicode String.
UStringTrieResult
Return values for BytesTrie::next(), UCharsTrie::next() and similar methods.
Definition: ustringtrie.h:33
UCharsTrie(const UCharsTrie &other)
Copy constructor, copies the other trie reader object and its state, but not the UChar array which wi...
Definition: ucharstrie.h:80
int32_t getValue() const
Returns a matching string's value if called immediately after current()/first()/next() returned USTRI...
Definition: ucharstrie.h:220
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:129
int32_t getValue() const
Definition: ucharstrie.h:328
UCharsTrie & reset()
Resets this trie to its initial state.
Definition: ucharstrie.h:89
Iterator for all of the (string, value) pairs in a UCharsTrie.
Definition: ucharstrie.h:256
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:298
#define NULL
Define NULL if necessary, to 0 for C++ and to ((void *)0) for C.
Definition: utypes.h:186
#define TRUE
The TRUE value of a UBool.
Definition: umachine.h:204
C++ API: Common ICU base class UObject.
uint16_t UChar
Define UChar to be UCHAR_TYPE, if that is #defined (for example, to char16_t), or wchar_t if that is ...
Definition: umachine.h:278
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:130
State()
Constructs an empty State.
Definition: ucharstrie.h:106
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:476
UCharsTrie state object, for saving a trie's current state and resetting the trie back to this state ...
Definition: ucharstrie.h:100
C API: Helper definitions for dictionary trie APIs.
Basic definitions for ICU, for both C and C++ APIs.
#define FALSE
The FALSE value of a UBool.
Definition: umachine.h:208
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:357
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:245
The input unit(s) continued a matching string and there is a value for the string so far...
Definition: ustringtrie.h:64
UStringTrieResult first(int32_t uchar)
Traverses the trie from the initial state for this input UChar.
Definition: ucharstrie.h:162
UCharsTrie(const UChar *trieUChars)
Constructs a UCharsTrie reader instance.
Definition: ucharstrie.h:64
const UnicodeString & getString() const
Definition: ucharstrie.h:323
UMemory is the common ICU base class.
Definition: uobject.h:110
Light-weight, non-const reader class for a UCharsTrie.
Definition: ucharstrie.h:48
const UCharsTrie & saveState(State &state) const
Saves the state of this trie.
Definition: ucharstrie.h:122
int8_t UBool
The ICU boolean type.
Definition: umachine.h:200
Base class for objects to which Unicode characters and strings can be appended.
Definition: appendable.h:49