ikeywords.h
Go to the documentation of this file.
00001 //File: $Id$
00002 // Author: John Wu <John.Wu at ACM.org>
00003 // Copyright 2006-2012 the Regents of the University of California
00004 #ifndef IBIS_KEYWORDS_H
00005 #define IBIS_KEYWORDS_H
00006 
00007 
00008 
00009 
00010 
00011 #include "index.h"      // base index class
00012 #include "category.h"   // definitions of string-valued columns
00013 
/// An index class derived from ibis::index that reports its type as
/// KEYWORDS and its name as "keywords".  The only data member declared here
/// is a dictionary of terms.
/// NOTE(review): presumably this maps each term (keyword) of a text column
/// to the rows containing it; the implementation is not in this header --
/// confirm against the .cpp file.
00078 class ibis::keywords : public ibis::index {
00079 public:
          /// Destructor.  Calls clear().
00080     virtual ~keywords() {clear();}
          /// Constructors; all are defined outside this header.  The file
          /// name argument @c f is optional and defaults to a null pointer.
          /// NOTE(review): presumably build the index for column @c c from a
          /// file, from text split by the tokenizer @c tkn, or from an
          /// existing storage object @c st -- confirm.
00081     explicit keywords(const ibis::column* c, const char* f=0);
00082     keywords(const ibis::column* c, ibis::text::tokenizer& tkn,
00083              const char* f=0);
00084     keywords(const ibis::column* c, ibis::fileManager::storage* st);
00085 
          /// Returns the type tag KEYWORDS.
00086     virtual INDEX_TYPE type() const {return KEYWORDS;}
          /// Returns the string "keywords".
00087     virtual const char* name() const {return "keywords";}
          /// Always leaves @c b empty; no bin boundaries are reported.
00088     virtual void binBoundaries(std::vector<double>& b) const {b.clear();}
          /// Defined elsewhere.
00089     virtual void binWeights(std::vector<uint32_t>& b) const;
          /// getMin/getMax/getSum return fixed sentinel values (DBL_MAX,
          /// -DBL_MAX and -DBL_MAX respectively).
          /// NOTE(review): presumably because numerical statistics are not
          /// meaningful for this index -- confirm.
00090     virtual double getMin() const {return DBL_MAX;}
00091     virtual double getMax() const {return -DBL_MAX;}
00092     virtual double getSum() const {return -DBL_MAX;}
          /// Defined elsewhere.  NOTE(review): presumably looks up keyword
          /// @c kw and marks the matching rows in @c hits; the meaning of
          /// the return value is not visible here -- confirm.
00094     long search(const char* kw, ibis::bitvector& hits) const;
          /// Defined elsewhere.  NOTE(review): presumably the counting /
          /// estimating variant of the two-argument search -- confirm.
00096     long search(const char* kw) const;
00097 
          /// Output and persistence functions; all are defined elsewhere.
00098     virtual void print(std::ostream& out) const;
00099     virtual int write(const char* dt) const;
00100     virtual int read(const char* idxfile);
00101     virtual int read(ibis::fileManager::storage* st);
00102     virtual long append(const char* dt, const char* df, uint32_t nnew);
00103 
          // Re-expose the base class overloads of these names; without the
          // using-declarations the overloads declared below would hide them.
00104     using ibis::index::evaluate;
00105     using ibis::index::estimate;
00106     using ibis::index::undecidable;
          /// Query functions for a continuous range expression; defined
          /// elsewhere.
00107     virtual long evaluate(const ibis::qContinuousRange& expr,
00108                           ibis::bitvector& hits) const;
00109     virtual void estimate(const ibis::qContinuousRange& expr,
00110                           ibis::bitvector& lower,
00111                           ibis::bitvector& upper) const;
00112     virtual uint32_t estimate(const ibis::qContinuousRange& expr) const;
          /// Always clears @c iffy and returns 0; the range expression is
          /// ignored.  NOTE(review): presumably because this index never
          /// leaves rows undecided -- confirm.
00115     virtual float undecidable(const ibis::qContinuousRange&,
00116                               ibis::bitvector& iffy) const {
00117         iffy.clear();
00118         return 0.0;
00119     }
          /// Cost estimates for the two kinds of range expressions; defined
          /// elsewhere.
00120     virtual double estimateCost(const ibis::qContinuousRange& expr) const;
00121     virtual double estimateCost(const ibis::qDiscreteRange& expr) const;
00122 
          /// Both select overloads ignore their arguments and return -1.
          /// NOTE(review): presumably -1 signals "not supported" -- confirm
          /// against ibis::index.
00123     virtual long select(const ibis::qContinuousRange&, void*) const {
00124         return -1;}
00125     virtual long select(const ibis::qContinuousRange&, void*,
00126                         ibis::bitvector&) const {
00127         return -1;}
00128 
          /// Nested tokenizer class; defined after this class in this header.
00129     class tokenizer;
00130 
00131 protected:
          /// Defined elsewhere; declared as non-throwing.
00132     virtual size_t getSerialSize() const throw();
          /// Defined elsewhere.  NOTE(review): presumably reads a
          /// term-document file @c f, using @c idcol to interpret document
          /// ids -- confirm.
00133     int readTermDocFile(const ibis::column* idcol, const char* f);
          /// Inline parsing helpers; defined below in this header.  Both
          /// advance @c buf past the characters they consume.
00134     inline char readTerm(const char*& buf, std::string &key) const;
00135     inline uint32_t readUInt(const char*& buf) const;
          /// Defined elsewhere.  NOTE(review): presumably reads one line of
          /// a term-document file into @c key and @c idlist, using @c buf
          /// (of @c nbuf bytes) as scratch space -- confirm.
00136     int readTDLine(std::istream& in, std::string& key,
00137                    std::vector<uint32_t>& idlist,
00138                    char* buf, uint32_t nbuf) const;
          /// Defined elsewhere.  NOTE(review): presumably turns the
          /// positions in @c pos into set bits of @c bvec -- confirm.
00139     void setBits(std::vector<uint32_t>& pos, ibis::bitvector& bvec) const;
          /// Defined elsewhere.  NOTE(review): presumably tokenizes the text
          /// file @c f with @c tkn to build the index -- confirm.
00140     int parseTextFile(ibis::text::tokenizer &tkn, const char *f);
00141 
          /// Defined elsewhere; invoked by the destructor.
00143     void clear();
00144 
00145 private:
00146     ibis::dictionary terms;     ///< A dictionary for the terms.
00147 }; // class ibis::keywords
00148 
00154 inline char ibis::keywords::readTerm(const char*& buf,
00155                                      std::string &keyword) const {
00156     while (isspace(*buf)) // skip leading space
00157         ++ buf;
00158     while (isprint(*buf)) { // loop through all printable till the delimiter
00159         if (*buf == ':') {
00160             return *buf;
00161         }
00162         else if (isspace(*buf)) {
00163             for (++ buf; isspace(*buf); ++ buf);
00164             if (*buf == ':') {
00165                 return *buf;
00166             }
00167             else {
00168                 keyword += ' ';
00169                 keyword += *buf;
00170                 ++ buf;
00171             }
00172         }
00173         else {
00174             keyword += *buf;
00175             ++ buf;
00176         }
00177     }
00178     return *buf;
00179 } // ibis::keywords::readTerm
00180 
00182 inline uint32_t ibis::keywords::readUInt(const char*& buf) const {
00183     uint32_t res = 0;
00184     while (*buf && ! isdigit(*buf)) // skip leading non-digit
00185         ++ buf;
00186 
00187     while (isdigit(*buf)) {
00188         res = res * 10 + (*buf - '0');
00189         ++ buf;
00190     }
00191     return res;
00192 } // ibis::keywords::readUInt
00193 
00195 class ibis::keywords::tokenizer : public ibis::text::tokenizer {
00196 public:
00202     tokenizer(const char *d=ibis::util::delimiters) : delim_(d) {}
00204     virtual ~tokenizer() {}
00205 
00206     virtual int operator()(std::vector<const char*>& tkns, char *buf);
00207 
00208 private:
00209     std::string delim_; 
00210 }; // class ibis::keywords::tokenizer
00211 #endif

Make It A Bit Faster
Contact us
Disclaimers
FastBit source code
FastBit mailing list archive