ikeywords.h
Go to the documentation of this file.
00001 //File: $Id$
00002 // Author: John Wu <John.Wu at ACM.org>
00003 // Copyright 2006-2011 the Regents of the University of California
00004 #ifndef IBIS_KEYWORDS_H
00005 #define IBIS_KEYWORDS_H
00006 
00007 
00008 
00009 
00010 
00011 #include "index.h"      // base index class
00012 #include "category.h"   // definitions of string-valued columns
00013 
/// Index class ibis::keywords, derived from ibis::index.
///
/// It holds a dictionary of terms (member @c terms) and offers keyword
/// look-ups through the two search() overloads.  Apart from a few trivial
/// inline bodies, the member functions are only declared here; their
/// definitions are not part of this header.
00078 class ibis::keywords : public ibis::index {
00079 public:
          /// Destructor.  Invokes clear().
00080     virtual ~keywords() {clear();}
          /// Constructor taking the column @c c and an optional name @c f
          /// (null by default).
          /// NOTE(review): @c f is presumably a file or directory name --
          /// confirm against the implementation.
00081     explicit keywords(const ibis::column* c, const char* f=0);
          /// Constructor that additionally receives a tokenizer @c tkn.
          /// NOTE(review): presumably the tokenizer splits the column text
          /// into terms (cf. parseTextFile) -- confirm.
00082     keywords(const ibis::column* c, ibis::text::tokenizer& tkn,
00083              const char* f=0);
          /// Constructor taking the column @c c and a storage object @c st.
          /// NOTE(review): presumably rebuilds the index from previously
          /// serialized content -- confirm.
00084     keywords(const ibis::column* c, ibis::fileManager::storage* st);
00085 
          /// Reports the index type; always KEYWORDS.
00086     virtual INDEX_TYPE type() const {return KEYWORDS;}
          /// Reports the name of this index type; always "keywords".
00087     virtual const char* name() const {return "keywords";}
          /// Empties @c b; this class reports no bin boundaries.
00088     virtual void binBoundaries(std::vector<double>& b) const {b.clear();}
          /// Fills @c b with bin weights.  Defined outside this header.
00089     virtual void binWeights(std::vector<uint32_t>& b) const;
          /// getMin(), getMax() and getSum() return the fixed values
          /// DBL_MAX, -DBL_MAX and -DBL_MAX respectively; nothing is
          /// computed.
          /// NOTE(review): presumably sentinels saying that numeric
          /// statistics are not meaningful for this index -- confirm.
00090     virtual double getMin() const {return DBL_MAX;}
00091     virtual double getMax() const {return -DBL_MAX;}
00092     virtual double getSum() const {return -DBL_MAX;}
          /// Looks up the keyword @c kw; @c hits is the output bit vector.
          /// The meaning of the return value is not visible in this header.
00094     long search(const char* kw, ibis::bitvector& hits) const;
          /// Looks up the keyword @c kw without producing a bit vector.
          /// NOTE(review): presumably returns a count or an estimate of the
          /// matches -- confirm.
00096     long search(const char* kw) const;
00097 
          /// Output, persistence and append operations.  All are defined
          /// outside this header.
00098     virtual void print(std::ostream& out) const;
00099     virtual int write(const char* dt) const;
00100     virtual int read(const char* idxfile);
00101     virtual int read(ibis::fileManager::storage* st);
00102     virtual long append(const char* dt, const char* df, uint32_t nnew);
00103 
          // Re-expose the base-class overloads of these names, which would
          // otherwise be hidden by the overloads declared below.
00104     using ibis::index::evaluate;
00105     using ibis::index::estimate;
00106     using ibis::index::undecidable;
          /// Evaluates the range condition @c expr with result @c hits.
          /// Defined outside this header.
00107     virtual long evaluate(const ibis::qContinuousRange& expr,
00108                           ibis::bitvector& hits) const;
          /// Estimates the range condition @c expr, producing the two bit
          /// vectors @c lower and @c upper.  Defined outside this header.
00109     virtual void estimate(const ibis::qContinuousRange& expr,
00110                           ibis::bitvector& lower,
00111                           ibis::bitvector& upper) const;
          /// Single-value form of estimate().  Defined outside this header.
00112     virtual uint32_t estimate(const ibis::qContinuousRange& expr) const;
          /// Clears @c iffy and returns 0: this implementation never marks
          /// any entry as undecidable.
00115     virtual float undecidable(const ibis::qContinuousRange&,
00116                               ibis::bitvector& iffy) const {
00117         iffy.clear();
00118         return 0.0;
00119     }
          /// Cost estimates for the two kinds of range expressions.
          /// Defined outside this header.
00120     virtual double estimateCost(const ibis::qContinuousRange& expr) const;
00121     virtual double estimateCost(const ibis::qDiscreteRange& expr) const;
00122 
          /// Nested tokenizer class; its definition follows this class in
          /// the same header.
00123     class tokenizer;
00124 
00125 protected:
          /// Reports the serialized size.  Declared not to throw.
00126     virtual size_t getSerialSize() const throw();
          /// Reads a term-document file named @c f; @c idcol is a column.
          /// NOTE(review): @c idcol presumably supplies the document
          /// identifiers -- confirm.
00127     int readTermDocFile(const ibis::column* idcol, const char* f);
          /// Extracts one term from @c buf into @c key (inline, defined
          /// below in this header).
00128     inline char readTerm(const char*& buf, std::string &key) const;
          /// Extracts one unsigned decimal integer from @c buf (inline,
          /// defined below in this header).
00129     inline uint32_t readUInt(const char*& buf) const;
          /// Reads one line from @c in into @c key and @c idlist, using
          /// the caller-provided buffer @c buf of @c nbuf bytes.
          /// NOTE(review): presumably one "term: id, id, ..." record --
          /// confirm the format against the implementation.
00130     int readTDLine(std::istream& in, std::string& key,
00131                    std::vector<uint32_t>& idlist,
00132                    char* buf, uint32_t nbuf) const;
          /// Produces the bit vector @c bvec from the positions @c pos.
          /// Note that @c pos is taken by non-const reference.
00133     void setBits(std::vector<uint32_t>& pos, ibis::bitvector& bvec) const;
          /// Processes the text file @c f with the tokenizer @c tkn.
00134     int parseTextFile(ibis::text::tokenizer &tkn, const char *f);
00135 
          /// Clears this object; also invoked by the destructor.
00137     void clear();
00138 
00139 private:
00140     ibis::dictionary terms;     ///< A dictionary for the terms.
00141 }; // class ibis::keywords
00142 
00148 inline char ibis::keywords::readTerm(const char*& buf,
00149                                      std::string &keyword) const {
00150     while (isspace(*buf)) // skip leading space
00151         ++ buf;
00152     while (isprint(*buf)) { // loop through all printable till the delimiter
00153         if (*buf == ':') {
00154             return *buf;
00155         }
00156         else if (isspace(*buf)) {
00157             for (++ buf; isspace(*buf); ++ buf);
00158             if (*buf == ':') {
00159                 return *buf;
00160             }
00161             else {
00162                 keyword += ' ';
00163                 keyword += *buf;
00164                 ++ buf;
00165             }
00166         }
00167         else {
00168             keyword += *buf;
00169             ++ buf;
00170         }
00171     }
00172     return *buf;
00173 } // ibis::keywords::readTerm
00174 
00176 inline uint32_t ibis::keywords::readUInt(const char*& buf) const {
00177     uint32_t res = 0;
00178     while (*buf && ! isdigit(*buf)) // skip leading non-digit
00179         ++ buf;
00180 
00181     while (isdigit(*buf)) {
00182         res = res * 10 + (*buf - '0');
00183         ++ buf;
00184     }
00185     return res;
00186 } // ibis::keywords::readUInt
00187 
00189 class ibis::keywords::tokenizer : public ibis::text::tokenizer {
00190 public:
00196     tokenizer(const char *d=ibis::util::delimiters) : delim_(d) {}
00198     virtual ~tokenizer() {}
00199 
00200     virtual int operator()(std::vector<const char*>& tkns, char *buf);
00201 
00202 private:
00203     std::string delim_; 
00204 }; // class ibis::keywords::tokenizer
00205 #endif

Make It A Bit Faster
Contact us
Disclaimers
FastBit source code
FastBit mailing list archive