00001 //File: $Id$ 00002 // Author: John Wu <John.Wu at ACM.org> 00003 // Copyright 2006-2012 the Regents of the University of California 00004 #ifndef IBIS_KEYWORDS_H 00005 #define IBIS_KEYWORDS_H 00006 00007 00008 00009 00010 00011 #include "index.h" // base index class 00012 #include "category.h" // definitions of string-valued columns 00013 00078 class ibis::keywords : public ibis::index { 00079 public: 00080 virtual ~keywords() {clear();} 00081 explicit keywords(const ibis::column* c, const char* f=0); 00082 keywords(const ibis::column* c, ibis::text::tokenizer& tkn, 00083 const char* f=0); 00084 keywords(const ibis::column* c, ibis::fileManager::storage* st); 00085 00086 virtual INDEX_TYPE type() const {return KEYWORDS;} 00087 virtual const char* name() const {return "keywords";} 00088 virtual void binBoundaries(std::vector<double>& b) const {b.clear();} 00089 virtual void binWeights(std::vector<uint32_t>& b) const; 00090 virtual double getMin() const {return DBL_MAX;} 00091 virtual double getMax() const {return -DBL_MAX;} 00092 virtual double getSum() const {return -DBL_MAX;} 00094 long search(const char* kw, ibis::bitvector& hits) const; 00096 long search(const char* kw) const; 00097 00098 virtual void print(std::ostream& out) const; 00099 virtual int write(const char* dt) const; 00100 virtual int read(const char* idxfile); 00101 virtual int read(ibis::fileManager::storage* st); 00102 virtual long append(const char* dt, const char* df, uint32_t nnew); 00103 00104 using ibis::index::evaluate; 00105 using ibis::index::estimate; 00106 using ibis::index::undecidable; 00107 virtual long evaluate(const ibis::qContinuousRange& expr, 00108 ibis::bitvector& hits) const; 00109 virtual void estimate(const ibis::qContinuousRange& expr, 00110 ibis::bitvector& lower, 00111 ibis::bitvector& upper) const; 00112 virtual uint32_t estimate(const ibis::qContinuousRange& expr) const; 00115 virtual float undecidable(const ibis::qContinuousRange&, 00116 ibis::bitvector& iffy) const { 00117 iffy.clear(); 00118 return 0.0; 00119 } 00120 virtual double estimateCost(const ibis::qContinuousRange& expr) const; 00121 virtual double estimateCost(const ibis::qDiscreteRange& expr) const; 00122 00123 virtual long select(const ibis::qContinuousRange&, void*) const { 00124 return -1;} 00125 virtual long select(const ibis::qContinuousRange&, void*, 00126 ibis::bitvector&) const { 00127 return -1;} 00128 00129 class tokenizer; 00130 00131 protected: 00132 virtual size_t getSerialSize() const throw(); 00133 int readTermDocFile(const ibis::column* idcol, const char* f); 00134 inline char readTerm(const char*& buf, std::string &key) const; 00135 inline uint32_t readUInt(const char*& buf) const; 00136 int readTDLine(std::istream& in, std::string& key, 00137 std::vector<uint32_t>& idlist, 00138 char* buf, uint32_t nbuf) const; 00139 void setBits(std::vector<uint32_t>& pos, ibis::bitvector& bvec) const; 00140 int parseTextFile(ibis::text::tokenizer &tkn, const char *f); 00141 00143 void clear(); 00144 00145 private: 00146 ibis::dictionary terms; //< A dictionary for the terms. 00147 }; // class ibis::keywords 00148 00154 inline char ibis::keywords::readTerm(const char*& buf, 00155 std::string &keyword) const { 00156 while (isspace(*buf)) // skip leading space 00157 ++ buf; 00158 while (isprint(*buf)) { // loop through all printable till the delimiter 00159 if (*buf == ':') { 00160 return *buf; 00161 } 00162 else if (isspace(*buf)) { 00163 for (++ buf; isspace(*buf); ++ buf); 00164 if (*buf == ':') { 00165 return *buf; 00166 } 00167 else { 00168 keyword += ' '; 00169 keyword += *buf; 00170 ++ buf; 00171 } 00172 } 00173 else { 00174 keyword += *buf; 00175 ++ buf; 00176 } 00177 } 00178 return *buf; 00179 } // ibis::keywords::readTerm 00180 00182 inline uint32_t ibis::keywords::readUInt(const char*& buf) const { 00183 uint32_t res = 0; 00184 while (*buf && ! isdigit(*buf)) // skip leading non-digit 00185 ++ buf; 00186 00187 while (isdigit(*buf)) { 00188 res = res * 10 + (*buf - '0'); 00189 ++ buf; 00190 } 00191 return res; 00192 } // ibis::keywords::readUInt 00193 00195 class ibis::keywords::tokenizer : public ibis::text::tokenizer { 00196 public: 00202 tokenizer(const char *d=ibis::util::delimiters) : delim_(d) {} 00204 virtual ~tokenizer() {} 00205 00206 virtual int operator()(std::vector<const char*>& tkns, char *buf); 00207 00208 private: 00209 std::string delim_; 00210 }; // class ibis::keywords::tokenizer 00211 #endif
![]() |