index.h
Go to the documentation of this file.
00001 //File: $Id$
00002 // Author: John Wu <John.Wu at ACM.org>
00003 //         Lawrence Berkeley National Laboratory
00004 // Copyright 2000-2012 the Regents of the University of California
00005 #ifndef IBIS_INDEX_H
00006 #define IBIS_INDEX_H
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 
00023 
00024 #if defined(_WIN32) && defined(_MSC_VER)
00025 #pragma warning(disable:4786)   // some identifier longer than 256 characters
00026 #endif
00027 #include "qExpr.h"
00028 #include "bitvector.h"
00029 
00030 #include <string>
00031 #include <limits>       // std::numeric_limits
00032 
00033 namespace ibis { // the concrete classes of index hierarchy
00034     class bin;   // equal size bins (equality encoded bitmap index)
00035     class range; // one-sided range encoded (cumulative) index
00036     class mesa;  // interval encoded index (two-sided ranges)
00037     class ambit; // two-level, cumulative index (cumulate on all levels)
00038     class pale;  // two-level, cumulative on the fine level only
00039     class pack;  // two-level, cumulative on the coarse level only
00040     class zone;  // two-level, do not cumulate on either levels
00041     class relic; // the basic bitmap index (one bitmap per distinct value)
00042     class slice; // the bit slice index (binary encoding of ibis::relic)
00043     class fade;  // a more controlled slice (multicomponent range code)
00044     class sbiad; // Italian translation of fade (multicomponent interval code)
00045     class sapid; // closest word to sbiad in English (multicomponent equality
00046                  // code)
00047     class egale; // French word for "equal" (multicomponent equality code on
00048                  // bins)
00049     class moins; // French word for "less" (multicomponent range code on bins)
00050     class entre; // French word for "in between" (multicomponent interval code
00051                  // on bins)
00052     class bak;   // Dutch word for "bin" (simple equality encoding for
00053                  // values mapped to reduced precision floating-point
00054                  // values)
00055     class bak2;  // a variation of bak that splits each bak bin in two
00056 //     class waaier;// Dutch word for "range" (simple range encoding for values
00057 //               // mapped to reduced precision floating-point values)
00058 //     class tussen;// Dutch word for "in between" (simple interval encoding
00059 //               // for values mapped to reduce precision floating-point
00060 //               // values)
00061 //     class uguale;// Italian word for "equal" (multicomponent version of bak)
00062 //     class meno;  // Italian word for "less" (multicomponent version of waaier)
00063 //     class fra;   // Italian word for "in between" (multicomponent version of
00064 //               // tussen)
00065     class keywords;// A boolean version of term-document matrix.
00066     class mesh;  // Composite index on 2-D regular mesh
00067     class band;  // Composite index on 2-D bands.
00068     class direkte;// Directly use the integer values as bin numbers.
00069     class bylt;  // Unbinned version of pack.
00070     class zona;  // Unbinned version of zone.
00071     class fuzz;  // Unbinned version of interval-equality encoding.
00072     class fuge;  // Binned version of interval-equality encoding.
00073 } // namespace ibis
00074 
00081 class ibis::index {
00082 public:
00086     enum INDEX_TYPE {
00087         BINNING=0,      
00088         RANGE,  
00089         MESA,   
00090         AMBIT,  
00091         PALE,   
00092         PACK,   
00093         ZONE,   
00094         RELIC,  
00095         ROSTER, 
00096         SLICE,  
00097         FADE,   
00098         SBIAD,  
00099         SAPID,  
00100         EGALE,  
00101         MOINS,  
00102         ENTRE,  
00103         BAK,    
00104         BAK2,   
00105 
00106 
00107         KEYWORDS,       
00108         MESH,   
00109         BAND,   
00110         DIREKTE,
00111         GENERIC,
00112         BYLT,   
00113         FUZZ,   
00114         ZONA,   
00115         FUGE,   
00116         EXTERN  
00117     };
00118 
00119     static index* create(const column* c, const char* name=0,
00120                          const char* spec=0, int inEntirety=0);
00121     static bool isIndex(const char* f, INDEX_TYPE t);
00123     virtual ~index () {clear();};
00124 
00126     virtual INDEX_TYPE type() const = 0;
00129     virtual const char* name() const = 0;
00130 
00133     virtual long evaluate(const ibis::qContinuousRange& expr,
00134                           ibis::bitvector& hits) const = 0;
00136     virtual long select(const ibis::qContinuousRange&, void*) const =0;
00138     virtual long select(const ibis::qContinuousRange&, void*,
00139                         ibis::bitvector&) const =0;
00142     virtual long evaluate(const ibis::qDiscreteRange&,
00143                           ibis::bitvector&) const {
00144         return -1;}
00145 
00156     virtual void estimate(const ibis::qContinuousRange&,
00157                           ibis::bitvector& lower,
00158                           ibis::bitvector& upper) const {
00159         lower.set(0, nrows); upper.set(1, nrows);}
00161     virtual uint32_t estimate(const ibis::qContinuousRange&) const {
00162         return nrows;}
00170     virtual float undecidable(const ibis::qContinuousRange&,
00171                               ibis::bitvector&) const {return 0.5;}
00172 
00175     virtual void estimate(const ibis::qDiscreteRange& expr,
00176                           ibis::bitvector& lower,
00177                           ibis::bitvector& upper) const;
00178     virtual uint32_t estimate(const ibis::qDiscreteRange& expr) const;
00179     virtual float undecidable(const ibis::qDiscreteRange& expr,
00180                               ibis::bitvector& iffy) const;
00181 
00183     virtual void estimate(const ibis::index& idx2,
00184                           const ibis::deprecatedJoin& expr,
00185                           ibis::bitvector64& lower,
00186                           ibis::bitvector64& upper) const;
00189     virtual void estimate(const ibis::index& idx2,
00190                           const ibis::deprecatedJoin& expr,
00191                           const ibis::bitvector& mask,
00192                           ibis::bitvector64& lower,
00193                           ibis::bitvector64& upper) const;
00194     virtual void estimate(const ibis::index& idx2,
00195                           const ibis::deprecatedJoin& expr,
00196                           const ibis::bitvector& mask,
00197                           const ibis::qRange* const range1,
00198                           const ibis::qRange* const range2,
00199                           ibis::bitvector64& lower,
00200                           ibis::bitvector64& upper) const;
00202     virtual int64_t estimate(const ibis::index& idx2,
00203                              const ibis::deprecatedJoin& expr) const;
00206     virtual int64_t estimate(const ibis::index& idx2,
00207                              const ibis::deprecatedJoin& expr,
00208                              const ibis::bitvector& mask) const;
00209     virtual int64_t estimate(const ibis::index& idx2,
00210                              const ibis::deprecatedJoin& expr,
00211                              const ibis::bitvector& mask,
00212                              const ibis::qRange* const range1,
00213                              const ibis::qRange* const range2) const;
00214 
00216     virtual void estimate(const ibis::deprecatedJoin& expr,
00217                           const ibis::bitvector& mask,
00218                           const ibis::qRange* const range1,
00219                           const ibis::qRange* const range2,
00220                           ibis::bitvector64& lower,
00221                           ibis::bitvector64& upper) const;
00222     virtual int64_t estimate(const ibis::deprecatedJoin& expr,
00223                              const ibis::bitvector& mask,
00224                              const ibis::qRange* const range1,
00225                              const ibis::qRange* const range2) const;
00226 
00228     virtual double estimateCost(const ibis::qContinuousRange&) const {
00229         return (offset32.empty() ? (nrows<<3) : offset32.back());}
00231     virtual double estimateCost(const ibis::qDiscreteRange&) const {
00232         return (offset32.empty() ? (nrows<<3) : offset32.back());}
00233 
00236     virtual void print(std::ostream& out) const = 0;
00240     virtual int write(const char* name) const = 0;
00244     virtual int read(const char* name) = 0;
00247     virtual int read(ibis::fileManager::storage* st) = 0;
00248 
00250     virtual long append(const char*, const char*, uint32_t) {return -1;}
00251 
00252     inline float sizeInBytes() const;
00254     virtual void speedTest(std::ostream&) const {};
00256     virtual uint32_t numBitvectors() const {return bits.size();}
00258     virtual const ibis::bitvector* getBitvector(uint32_t i) const {
00259         if (i < bits.size()) {
00260             if (bits[i] == 0)
00261                 activate(i);
00262             return bits[i];
00263         }
00264         else {
00265             return 0;
00266         }
00267     }
00268 
00271     virtual void binBoundaries(std::vector<double>&) const {return;}
00272     virtual void binWeights(std::vector<uint32_t>&) const {return;}
00273 
00275     virtual long getCumulativeDistribution
00276     (std::vector<double>& bds, std::vector<uint32_t>& cts) const;
00278     virtual long getDistribution
00279     (std::vector<double>& bbs, std::vector<uint32_t>& cts) const;
00281     virtual double getMin() const {
00282         return std::numeric_limits<double>::quiet_NaN();}
00284     virtual double getMax() const {
00285         return std::numeric_limits<double>::quiet_NaN();}
00289     virtual double getSum() const {
00290         return std::numeric_limits<double>::quiet_NaN();}
00292     uint32_t getNRows() const {return nrows;}
00293 
00294     void addBins(uint32_t ib, uint32_t ie, ibis::bitvector& res) const;
00295     void addBins(uint32_t ib, uint32_t ie, ibis::bitvector& res,
00296                  const ibis::bitvector& tot) const;
00297     void sumBins(uint32_t ib, uint32_t ie, ibis::bitvector& res) const;
00298     void sumBins(uint32_t ib, uint32_t ie, ibis::bitvector& res,
00299                  uint32_t ib0, uint32_t ie0) const;
00300     void sumBins(const ibis::array_t<uint32_t> &, ibis::bitvector&) const;
00301 
00307     virtual int expandRange(ibis::qContinuousRange&) const {return 0;}
00308     virtual int contractRange(ibis::qContinuousRange&) const {return 0;}
00309 
00310     typedef std::map< double, ibis::bitvector* > VMap;
00311     typedef std::map< double, uint32_t > histogram;
00312     template <typename E>
00313     static void mapValues(const array_t<E>& val, VMap& bmap);
00314     template <typename E>
00315     static void mapValues(const array_t<E>& val, histogram& hist,
00316                           uint32_t count=0);
00317     template <typename E>
00318     static void mapValues(const array_t<E>& val, array_t<E>& bounds,
00319                           std::vector<uint32_t>& cnts);
00320     template <typename E1, typename E2>
00321     static void mapValues(const array_t<E1>& val1, const array_t<E2>& val2,
00322                           array_t<E1>& bnd1, array_t<E2>& bnd2,
00323                           std::vector<uint32_t>& cnts);
00326     static void divideCounts(array_t<uint32_t>& bounds,
00327                              const array_t<uint32_t>& cnt);
00328 
00329     // three static functions to perform the task of summing up bit sequences
00330     static void addBits(const array_t<bitvector*>& bits,
00331                         uint32_t ib, uint32_t ie, ibis::bitvector& res);
00332     static void sumBits(const array_t<bitvector*>& bits,
00333                         uint32_t ib, uint32_t ie, ibis::bitvector& res);
00334     static void sumBits(const array_t<bitvector*>& bits,
00335                         const ibis::bitvector& tot, uint32_t ib, uint32_t ie,
00336                         ibis::bitvector& res);
00337     // a static function to assign bases for multicomponent schemes
00338     static void setBases(array_t<uint32_t>& bases, uint32_t card,
00339                          uint32_t nbase = 2);
00340 
00341 protected:
00342     // shared members for all indexes
00344     const ibis::column* col;
00347     mutable ibis::fileManager::storage* str;
00349     mutable const char* fname;
00351     mutable array_t<int32_t> offset32;
00355     mutable array_t<int64_t> offset64;
00357     mutable array_t<ibis::bitvector*> bits;
00361     uint32_t nrows;
00362 
00366     index(const ibis::column* c=0) : col(c), str(0), fname(0), nrows(0) {}
00367     index(const ibis::column* c, ibis::fileManager::storage* s);
00368 
00369     void dataFileName(std::string& name, const char* f=0) const;
00370     void indexFileName(std::string& name, const char* f=0) const;
00371     static void indexFileName(std::string& name, const ibis::column* col1,
00372                               const ibis::column* col2, const char* f=0);
00373 
00375     virtual void activate() const;
00377     virtual void activate(uint32_t i) const;
00380     virtual void activate(uint32_t i, uint32_t j) const;
00382     virtual void clear();
00383 
00385     // both VMap and histogram assume that all supported data types can be
00386     // safely stored in double
00389     void mapValues(const char* f, VMap& bmap) const;
00391     void mapValues(const char* f, histogram& hist, uint32_t count=0) const;
00392 
00393     void computeMinMax(const char* f, double& min, double& max) const;
00395     void optionalUnpack(array_t<ibis::bitvector*>& bits,
00396                         const char* opt);
00397 
00398     int initOffsets(int fdes, const char offsize, size_t start,
00399                     uint32_t nobs);
00400     int initOffsets(ibis::fileManager::storage* st, size_t start,
00401                     uint32_t nobs);
00402     void initBitmaps(int fdes);
00403     void initBitmaps(ibis::fileManager::storage* st);
00404 
00405     class barrel;
00406 
00407 private:
00408     index(const index&); // no copy constructor
00409     index& operator=(const index&); // no assignment operator
00410 }; // ibis::index
00411 
00412 
00415 class ibis::index::barrel : public ibis::math::barrel {
00416 public:
00417     barrel(const ibis::math::term* t) : ibis::math::barrel(t) {}
00418 
00419     void setValue(uint32_t i, double v) {varvalues[i] = v;}
00420 }; // ibis::index::barrel
00421 
00422 
00425 inline float ibis::index::sizeInBytes() const {
00426     if (offset64.size() > bits.size()) {
00427         return (float)offset64[bits.size()];
00428     }
00429     else if (offset32.size() > bits.size()) {
00430         return (float)offset32[bits.size()];
00431     }
00432     else if (str != 0) {
00433         return (float)str->size();
00434     }
00435     else if (*fname != 0) {
00436         return (float)ibis::util::getFileSize(fname);
00437     }
00438     else {
00439         return FLT_MAX;
00440     }
00441 } // ibis::index::sizeInBytes
00442 
00443 #endif // IBIS_INDEX_H

Make It A Bit Faster
Contact us
Disclaimers
FastBit source code
FastBit mailing list archive