stringtriebuilder.h

Go to the documentation of this file.
00001 /*
00002 *******************************************************************************
00003 *   Copyright (C) 2010-2012, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 *******************************************************************************
00006 *   file name:  stringtriebuilder.h
00007 *   encoding:   US-ASCII
00008 *   tab size:   8 (not used)
00009 *   indentation:4
00010 *
00011 *   created on: 2010dec24
00012 *   created by: Markus W. Scherer
00013 */
00014 
00015 #ifndef __STRINGTRIEBUILDER_H__
00016 #define __STRINGTRIEBUILDER_H__
00017 
00018 #include "unicode/utypes.h"
00019 #include "unicode/uobject.h"
00020 
00026 // Forward declaration.
00027 struct UHashtable;
00028 typedef struct UHashtable UHashtable;
00029 
00034 enum UStringTrieBuildOption {
00039     USTRINGTRIE_BUILD_FAST,
00050     USTRINGTRIE_BUILD_SMALL
00051 };
00052 
00053 U_NAMESPACE_BEGIN
00054 
00061 class U_COMMON_API StringTrieBuilder : public UObject {
00062 public:
00063 #ifndef U_HIDE_INTERNAL_API
00064 
00065     static UBool hashNode(const void *node);
00067     static UBool equalNodes(const void *left, const void *right);
00068 #endif  /* U_HIDE_INTERNAL_API */
00069 
00070 protected:
00071     // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
00072     // or else the compiler will create a public default constructor.
00074     StringTrieBuilder();
00076     virtual ~StringTrieBuilder();
00077 
00078 #ifndef U_HIDE_INTERNAL_API
00079 
00080     void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
00082     void deleteCompactBuilder();
00083 
00085     void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
00086 
00088     int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
00090     int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
00091 #endif  /* U_HIDE_INTERNAL_API */
00092 
00093     class Node;
00094 
00095 #ifndef U_HIDE_INTERNAL_API
00096 
00097     Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
00099     Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
00100                             int32_t length, UErrorCode &errorCode);
00101 #endif  /* U_HIDE_INTERNAL_API */
00102 
00104     virtual int32_t getElementStringLength(int32_t i) const = 0;
00106     virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
00108     virtual int32_t getElementValue(int32_t i) const = 0;
00109 
00110     // Finds the first unit index after this one where
00111     // the first and last element have different units again.
00113     virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
00114 
00115     // Number of different units at unitIndex.
00117     virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
00119     virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
00121     virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
00122 
00124     virtual UBool matchNodesCanHaveValues() const = 0;
00125 
00127     virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
00129     virtual int32_t getMinLinearMatch() const = 0;
00131     virtual int32_t getMaxLinearMatchLength() const = 0;
00132 
00133 #ifndef U_HIDE_INTERNAL_API
00134     // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
00136     static const int32_t kMaxBranchLinearSubNodeLength=5;
00137 
00138     // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
00139     // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
00141     static const int32_t kMaxSplitBranchLevels=14;
00142 
00153     Node *registerNode(Node *newNode, UErrorCode &errorCode);
00164     Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
00165 
00166     /*
00167      * C++ note:
00168      * registerNode() and registerFinalValue() take ownership of their input nodes,
00169      * and only return owned nodes.
00170      * If they see a failure UErrorCode, they will delete the input node.
00171      * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
00172      * If there is a failure, they return NULL.
00173      *
00174      * NULL Node pointers can be safely passed into other Nodes because
00175      * they call the static Node::hashCode() which checks for a NULL pointer first.
00176      *
00177      * Therefore, as long as builder functions register a new node,
00178      * they need to check for failures only before explicitly dereferencing
00179      * a Node pointer, or before setting a new UErrorCode.
00180      */
00181 
00182     // Hash set of nodes, maps from nodes to integer 1.
00184     UHashtable *nodes;
00185 
00187     class Node : public UObject {
00188     public:
00189         Node(int32_t initialHash) : hash(initialHash), offset(0) {}
00190         inline int32_t hashCode() const { return hash; }
00191         // Handles node==NULL.
00192         static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
00193         // Base class operator==() compares the actual class types.
00194         virtual UBool operator==(const Node &other) const;
00195         inline UBool operator!=(const Node &other) const { return !operator==(other); }
00223         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
00224         // write() must set the offset to a positive value.
00225         virtual void write(StringTrieBuilder &builder) = 0;
00226         // See markRightEdgesFirst.
00227         inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
00228                                                StringTrieBuilder &builder) {
00229             // Note: Edge numbers are negative, lastRight<=firstRight.
00230             // If offset>0 then this node and its sub-nodes have been written already
00231             // and we need not write them again.
00232             // If this node is part of the unwritten right branch edge,
00233             // then we wait until that is written.
00234             if(offset<0 && (offset<lastRight || firstRight<offset)) {
00235                 write(builder);
00236             }
00237         }
00238         inline int32_t getOffset() const { return offset; }
00239     protected:
00240         int32_t hash;
00241         int32_t offset;
00242     private:
00243         // No ICU "poor man's RTTI" for this class nor its subclasses.
00244         virtual UClassID getDynamicClassID() const;
00245     };
00246 
00247     // This class should not be overridden because
00248     // registerFinalValue() compares a stack-allocated FinalValueNode
00249     // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
00250     // with the input node, and the
00251     // !Node::operator==(other) used inside FinalValueNode::operator==(other)
00252     // will be false if the typeid's are different.
00254     class FinalValueNode : public Node {
00255     public:
00256         FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
00257         virtual UBool operator==(const Node &other) const;
00258         virtual void write(StringTrieBuilder &builder);
00259     protected:
00260         int32_t value;
00261     };
00262 
00266     class ValueNode : public Node {
00267     public:
00268         ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
00269         virtual UBool operator==(const Node &other) const;
00270         void setValue(int32_t v) {
00271             hasValue=TRUE;
00272             value=v;
00273             hash=hash*37+v;
00274         }
00275     protected:
00276         UBool hasValue;
00277         int32_t value;
00278     };
00279 
00283     class IntermediateValueNode : public ValueNode {
00284     public:
00285         IntermediateValueNode(int32_t v, Node *nextNode)
00286                 : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
00287         virtual UBool operator==(const Node &other) const;
00288         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
00289         virtual void write(StringTrieBuilder &builder);
00290     protected:
00291         Node *next;
00292     };
00293 
00297     class LinearMatchNode : public ValueNode {
00298     public:
00299         LinearMatchNode(int32_t len, Node *nextNode)
00300                 : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
00301                   length(len), next(nextNode) {}
00302         virtual UBool operator==(const Node &other) const;
00303         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
00304     protected:
00305         int32_t length;
00306         Node *next;
00307     };
00308 
00312     class BranchNode : public Node {
00313     public:
00314         BranchNode(int32_t initialHash) : Node(initialHash) {}
00315     protected:
00316         int32_t firstEdgeNumber;
00317     };
00318 
00322     class ListBranchNode : public BranchNode {
00323     public:
00324         ListBranchNode() : BranchNode(0x444444), length(0) {}
00325         virtual UBool operator==(const Node &other) const;
00326         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
00327         virtual void write(StringTrieBuilder &builder);
00328         // Adds a unit with a final value.
00329         void add(int32_t c, int32_t value) {
00330             units[length]=(UChar)c;
00331             equal[length]=NULL;
00332             values[length]=value;
00333             ++length;
00334             hash=(hash*37+c)*37+value;
00335         }
00336         // Adds a unit which leads to another match node.
00337         void add(int32_t c, Node *node) {
00338             units[length]=(UChar)c;
00339             equal[length]=node;
00340             values[length]=0;
00341             ++length;
00342             hash=(hash*37+c)*37+hashCode(node);
00343         }
00344     protected:
00345         Node *equal[kMaxBranchLinearSubNodeLength];  // NULL means "has final value".
00346         int32_t length;
00347         int32_t values[kMaxBranchLinearSubNodeLength];
00348         UChar units[kMaxBranchLinearSubNodeLength];
00349     };
00350 
00354     class SplitBranchNode : public BranchNode {
00355     public:
00356         SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
00357                 : BranchNode(((0x555555*37+middleUnit)*37+
00358                               hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
00359                   unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
00360         virtual UBool operator==(const Node &other) const;
00361         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
00362         virtual void write(StringTrieBuilder &builder);
00363     protected:
00364         UChar unit;
00365         Node *lessThan;
00366         Node *greaterOrEqual;
00367     };
00368 
00369     // Branch head node, for writing the actual node lead unit.
00371     class BranchHeadNode : public ValueNode {
00372     public:
00373         BranchHeadNode(int32_t len, Node *subNode)
00374                 : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
00375                   length(len), next(subNode) {}
00376         virtual UBool operator==(const Node &other) const;
00377         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
00378         virtual void write(StringTrieBuilder &builder);
00379     protected:
00380         int32_t length;
00381         Node *next;  // A branch sub-node.
00382     };
00383 #endif  /* U_HIDE_INTERNAL_API */
00384 
00386     virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
00387                                         Node *nextNode) const = 0;
00388 
00390     virtual int32_t write(int32_t unit) = 0;
00392     virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
00394     virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
00396     virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
00398     virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
00399 
00400 private:
00401     // No ICU "poor man's RTTI" for this class nor its subclasses.
00402     virtual UClassID getDynamicClassID() const;
00403 };
00404 
00405 U_NAMESPACE_END
00406 
00407 #endif  // __STRINGTRIEBUILDER_H__

Generated on 25 Nov 2014 for ICU 50.1.2 by  doxygen 1.4.7