ICU 49.1.1  49.1.1
stringtriebuilder.h
00001 /*
00002 *******************************************************************************
00003 *   Copyright (C) 2010-2012, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 *******************************************************************************
00006 *   file name:  stringtriebuilder.h
00007 *   encoding:   US-ASCII
00008 *   tab size:   8 (not used)
00009 *   indentation:4
00010 *
00011 *   created on: 2010dec24
00012 *   created by: Markus W. Scherer
00013 */
00014 
00015 #ifndef __STRINGTRIEBUILDER_H__
00016 #define __STRINGTRIEBUILDER_H__
00017 
00018 #include "unicode/utypes.h"
00019 #include "unicode/uobject.h"
00020 
00021 // Forward declaration.
00022 struct UHashtable;
00023 typedef struct UHashtable UHashtable;
00024 
00029 enum UStringTrieBuildOption {
00034     USTRINGTRIE_BUILD_FAST,
00045     USTRINGTRIE_BUILD_SMALL
00046 };
00047 
00048 U_NAMESPACE_BEGIN
00049 
00056 class U_COMMON_API StringTrieBuilder : public UObject {
00057 public:
00058 #ifndef U_HIDE_INTERNAL_API
00059 
00060     static UBool hashNode(const void *node);
00062     static UBool equalNodes(const void *left, const void *right);
00063 #endif  /* U_HIDE_INTERNAL_API */
00064 
00065 protected:
00066     // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
00067     // or else the compiler will create a public default constructor.
00069     StringTrieBuilder();
00071     virtual ~StringTrieBuilder();
00072 
00073 #ifndef U_HIDE_INTERNAL_API
00074 
00075     void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
00077     void deleteCompactBuilder();
00078 
00080     void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
00081 
00083     int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
00085     int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
00086 #endif  /* U_HIDE_INTERNAL_API */
00087 
00088     class Node;
00089 
00090 #ifndef U_HIDE_INTERNAL_API
00091 
00092     Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
00094     Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
00095                             int32_t length, UErrorCode &errorCode);
00096 #endif  /* U_HIDE_INTERNAL_API */
00097 
00099     virtual int32_t getElementStringLength(int32_t i) const = 0;
00101     virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
00103     virtual int32_t getElementValue(int32_t i) const = 0;
00104 
00105     // Finds the first unit index after this one where
00106     // the first and last element have different units again.
00108     virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
00109 
00110     // Number of different units at unitIndex.
00112     virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
00114     virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
00116     virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
00117 
00119     virtual UBool matchNodesCanHaveValues() const = 0;
00120 
00122     virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
00124     virtual int32_t getMinLinearMatch() const = 0;
00126     virtual int32_t getMaxLinearMatchLength() const = 0;
00127 
00128 #ifndef U_HIDE_INTERNAL_API
00129     // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
00131     static const int32_t kMaxBranchLinearSubNodeLength=5;
00132 
00133     // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
00134     // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
00136     static const int32_t kMaxSplitBranchLevels=14;
00137 
00148     Node *registerNode(Node *newNode, UErrorCode &errorCode);
00159     Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
00160 
00161     /*
00162      * C++ note:
00163      * registerNode() and registerFinalValue() take ownership of their input nodes,
00164      * and only return owned nodes.
00165      * If they see a failure UErrorCode, they will delete the input node.
00166      * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
00167      * If there is a failure, they return NULL.
00168      *
00169      * NULL Node pointers can be safely passed into other Nodes because
00170      * they call the static Node::hashCode() which checks for a NULL pointer first.
00171      *
00172      * Therefore, as long as builder functions register a new node,
00173      * they need to check for failures only before explicitly dereferencing
00174      * a Node pointer, or before setting a new UErrorCode.
00175      */
00176 
00177     // Hash set of nodes, maps from nodes to integer 1.
00179     UHashtable *nodes;
00180 
00182     class Node : public UObject {
00183     public:
00184         Node(int32_t initialHash) : hash(initialHash), offset(0) {}
00185         inline int32_t hashCode() const { return hash; }
00186         // Handles node==NULL.
00187         static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
00188         // Base class operator==() compares the actual class types.
00189         virtual UBool operator==(const Node &other) const;
00190         inline UBool operator!=(const Node &other) const { return !operator==(other); }
00218         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
00219         // write() must set the offset to a positive value.
00220         virtual void write(StringTrieBuilder &builder) = 0;
00221         // See markRightEdgesFirst.
00222         inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
00223                                                StringTrieBuilder &builder) {
00224             // Note: Edge numbers are negative, lastRight<=firstRight.
00225             // If offset>0 then this node and its sub-nodes have been written already
00226             // and we need not write them again.
00227             // If this node is part of the unwritten right branch edge,
00228             // then we wait until that is written.
00229             if(offset<0 && (offset<lastRight || firstRight<offset)) {
00230                 write(builder);
00231             }
00232         }
00233         inline int32_t getOffset() const { return offset; }
00234     protected:
00235         int32_t hash;
00236         int32_t offset;
00237     private:
00238         // No ICU "poor man's RTTI" for this class nor its subclasses.
00239         virtual UClassID getDynamicClassID() const;
00240     };
00241 
00242     // This class should not be overridden because
00243     // registerFinalValue() compares a stack-allocated FinalValueNode
00244     // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
00245     // with the input node, and the
00246     // !Node::operator==(other) used inside FinalValueNode::operator==(other)
00247     // will be false if the typeid's are different.
00249     class FinalValueNode : public Node {
00250     public:
00251         FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
00252         virtual UBool operator==(const Node &other) const;
00253         virtual void write(StringTrieBuilder &builder);
00254     protected:
00255         int32_t value;
00256     };
00257 
00259     class ValueNode : public Node {
00260     public:
00261         ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
00262         virtual UBool operator==(const Node &other) const;
00263         void setValue(int32_t v) {
00264             hasValue=TRUE;
00265             value=v;
00266             hash=hash*37+v;
00267         }
00268     protected:
00269         UBool hasValue;
00270         int32_t value;
00271     };
00272 
00274     class IntermediateValueNode : public ValueNode {
00275     public:
00276         IntermediateValueNode(int32_t v, Node *nextNode)
00277                 : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
00278         virtual UBool operator==(const Node &other) const;
00279         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
00280         virtual void write(StringTrieBuilder &builder);
00281     protected:
00282         Node *next;
00283     };
00284 
00286     class LinearMatchNode : public ValueNode {
00287     public:
00288         LinearMatchNode(int32_t len, Node *nextNode)
00289                 : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
00290                   length(len), next(nextNode) {}
00291         virtual UBool operator==(const Node &other) const;
00292         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
00293     protected:
00294         int32_t length;
00295         Node *next;
00296     };
00297 
00299     class BranchNode : public Node {
00300     public:
00301         BranchNode(int32_t initialHash) : Node(initialHash) {}
00302     protected:
00303         int32_t firstEdgeNumber;
00304     };
00305 
00307     class ListBranchNode : public BranchNode {
00308     public:
00309         ListBranchNode() : BranchNode(0x444444), length(0) {}
00310         virtual UBool operator==(const Node &other) const;
00311         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
00312         virtual void write(StringTrieBuilder &builder);
00313         // Adds a unit with a final value.
00314         void add(int32_t c, int32_t value) {
00315             units[length]=(UChar)c;
00316             equal[length]=NULL;
00317             values[length]=value;
00318             ++length;
00319             hash=(hash*37+c)*37+value;
00320         }
00321         // Adds a unit which leads to another match node.
00322         void add(int32_t c, Node *node) {
00323             units[length]=(UChar)c;
00324             equal[length]=node;
00325             values[length]=0;
00326             ++length;
00327             hash=(hash*37+c)*37+hashCode(node);
00328         }
00329     protected:
00330         Node *equal[kMaxBranchLinearSubNodeLength];  // NULL means "has final value".
00331         int32_t length;
00332         int32_t values[kMaxBranchLinearSubNodeLength];
00333         UChar units[kMaxBranchLinearSubNodeLength];
00334     };
00335 
00337     class SplitBranchNode : public BranchNode {
00338     public:
00339         SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
00340                 : BranchNode(((0x555555*37+middleUnit)*37+
00341                               hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
00342                   unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
00343         virtual UBool operator==(const Node &other) const;
00344         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
00345         virtual void write(StringTrieBuilder &builder);
00346     protected:
00347         UChar unit;
00348         Node *lessThan;
00349         Node *greaterOrEqual;
00350     };
00351 
00352     // Branch head node, for writing the actual node lead unit.
00354     class BranchHeadNode : public ValueNode {
00355     public:
00356         BranchHeadNode(int32_t len, Node *subNode)
00357                 : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
00358                   length(len), next(subNode) {}
00359         virtual UBool operator==(const Node &other) const;
00360         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
00361         virtual void write(StringTrieBuilder &builder);
00362     protected:
00363         int32_t length;
00364         Node *next;  // A branch sub-node.
00365     };
00366 #endif  /* U_HIDE_INTERNAL_API */
00367 
00369     virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
00370                                         Node *nextNode) const = 0;
00371 
00373     virtual int32_t write(int32_t unit) = 0;
00375     virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
00377     virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
00379     virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
00381     virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
00382 
00383 private:
00384     // No ICU "poor man's RTTI" for this class nor its subclasses.
00385     virtual UClassID getDynamicClassID() const;
00386 };
00387 
00388 U_NAMESPACE_END
00389 
00390 #endif  // __STRINGTRIEBUILDER_H__