00001 //File: $Id$ 00002 // Author: John Wu <John.Wu at ACM.org> 00003 // Copyright 2006-2011 the Regents of the University of California 00004 #ifndef IBIS_KEYWORDS_H 00005 #define IBIS_KEYWORDS_H 00006 00007 00008 00009 00010 00011 #include "index.h" // base index class 00012 #include "category.h" // definitions of string-valued columns 00013 00078 class ibis::keywords : public ibis::index { 00079 public: 00080 virtual ~keywords() {clear();} 00081 explicit keywords(const ibis::column* c, const char* f=0); 00082 keywords(const ibis::column* c, ibis::text::tokenizer& tkn, 00083 const char* f=0); 00084 keywords(const ibis::column* c, ibis::fileManager::storage* st); 00085 00086 virtual INDEX_TYPE type() const {return KEYWORDS;} 00087 virtual const char* name() const {return "keywords";} 00088 virtual void binBoundaries(std::vector<double>& b) const {b.clear();} 00089 virtual void binWeights(std::vector<uint32_t>& b) const; 00090 virtual double getMin() const {return DBL_MAX;} 00091 virtual double getMax() const {return -DBL_MAX;} 00092 virtual double getSum() const {return -DBL_MAX;} 00094 long search(const char* kw, ibis::bitvector& hits) const; 00096 long search(const char* kw) const; 00097 00098 virtual void print(std::ostream& out) const; 00099 virtual int write(const char* dt) const; 00100 virtual int read(const char* idxfile); 00101 virtual int read(ibis::fileManager::storage* st); 00102 virtual long append(const char* dt, const char* df, uint32_t nnew); 00103 00104 using ibis::index::evaluate; 00105 using ibis::index::estimate; 00106 using ibis::index::undecidable; 00107 virtual long evaluate(const ibis::qContinuousRange& expr, 00108 ibis::bitvector& hits) const; 00109 virtual void estimate(const ibis::qContinuousRange& expr, 00110 ibis::bitvector& lower, 00111 ibis::bitvector& upper) const; 00112 virtual uint32_t estimate(const ibis::qContinuousRange& expr) const; 00115 virtual float undecidable(const ibis::qContinuousRange&, 00116 ibis::bitvector& iffy) const { 00117 iffy.clear(); 00118 return 0.0; 00119 } 00120 virtual double estimateCost(const ibis::qContinuousRange& expr) const; 00121 virtual double estimateCost(const ibis::qDiscreteRange& expr) const; 00122 00123 class tokenizer; 00124 00125 protected: 00126 virtual size_t getSerialSize() const throw(); 00127 int readTermDocFile(const ibis::column* idcol, const char* f); 00128 inline char readTerm(const char*& buf, std::string &key) const; 00129 inline uint32_t readUInt(const char*& buf) const; 00130 int readTDLine(std::istream& in, std::string& key, 00131 std::vector<uint32_t>& idlist, 00132 char* buf, uint32_t nbuf) const; 00133 void setBits(std::vector<uint32_t>& pos, ibis::bitvector& bvec) const; 00134 int parseTextFile(ibis::text::tokenizer &tkn, const char *f); 00135 00137 void clear(); 00138 00139 private: 00140 ibis::dictionary terms; //< A dictionary for the terms. 00141 }; // class ibis::keywords 00142 00148 inline char ibis::keywords::readTerm(const char*& buf, 00149 std::string &keyword) const { 00150 while (isspace(*buf)) // skip leading space 00151 ++ buf; 00152 while (isprint(*buf)) { // loop through all printable till the delimiter 00153 if (*buf == ':') { 00154 return *buf; 00155 } 00156 else if (isspace(*buf)) { 00157 for (++ buf; isspace(*buf); ++ buf); 00158 if (*buf == ':') { 00159 return *buf; 00160 } 00161 else { 00162 keyword += ' '; 00163 keyword += *buf; 00164 ++ buf; 00165 } 00166 } 00167 else { 00168 keyword += *buf; 00169 ++ buf; 00170 } 00171 } 00172 return *buf; 00173 } // ibis::keywords::readTerm 00174 00176 inline uint32_t ibis::keywords::readUInt(const char*& buf) const { 00177 uint32_t res = 0; 00178 while (*buf && ! isdigit(*buf)) // skip leading non-digit 00179 ++ buf; 00180 00181 while (isdigit(*buf)) { 00182 res = res * 10 + (*buf - '0'); 00183 ++ buf; 00184 } 00185 return res; 00186 } // ibis::keywords::readUInt 00187 00189 class ibis::keywords::tokenizer : public ibis::text::tokenizer { 00190 public: 00196 tokenizer(const char *d=ibis::util::delimiters) : delim_(d) {} 00198 virtual ~tokenizer() {} 00199 00200 virtual int operator()(std::vector<const char*>& tkns, char *buf); 00201 00202 private: 00203 std::string delim_; 00204 }; // class ibis::keywords::tokenizer 00205 #endif
![]() |