00001 //File: $Id$ 00002 // Author: John Wu <John.Wu at ACM.org> 00003 // Lawrence Berkeley National Laboratory 00004 // Copyright 2000-2012 the Regents of the University of California 00005 #ifndef IBIS_INDEX_H 00006 #define IBIS_INDEX_H 00007 00008 00009 00010 00011 00012 00013 00014 00015 00016 00017 00018 00019 00020 00021 00022 00023 00024 #if defined(_WIN32) && defined(_MSC_VER) 00025 #pragma warning(disable:4786) // some identifier longer than 256 characters 00026 #endif 00027 #include "qExpr.h" 00028 #include "bitvector.h" 00029 00030 #include <string> 00031 #include <limits> // std::numeric_limits 00032 00033 namespace ibis { // the concrete classes of index hierarchy 00034 class bin; // equal size bins (equality encoded bitmap index) 00035 class range; // one-sided range encoded (cumulative) index 00036 class mesa; // interval encoded index (two-sided ranges) 00037 class ambit; // two-level, cumulative index (cumulate on all levels) 00038 class pale; // two-level, cumulative on the fine level only 00039 class pack; // two-level, cumulative on the coarse level only 00040 class zone; // two-level, do not cumulate on either levels 00041 class relic; // the basic bitmap index (one bitmap per distinct value) 00042 class slice; // the bit slice index (binary encoding of ibis::relic) 00043 class fade; // a more controlled slice (multicomponent range code) 00044 class sbiad; // Italian translation of fade (multicomponent interval code) 00045 class sapid; // closest word to sbiad in English (multicomponent equality 00046 // code) 00047 class egale; // French word for "equal" (multicomponent equality code on 00048 // bins) 00049 class moins; // French word for "less" (multicomponent range code on bins) 00050 class entre; // French word for "in between" (multicomponent interval code 00051 // on bins) 00052 class bak; // Dutch word for "bin" (simple equality encoding for 00053 // values mapped to reduced precision floating-point 00054 // values) 00055 class bak2; // a variation of bak that splits each bak bin in two 00056 // class waaier;// Dutch word for "range" (simple range encoding for values 00057 // // mapped to reduced precision floating-point values) 00058 // class tussen;// Dutch word for "in between" (simple interval encoding 00059 // // for values mapped to reduce precision floating-point 00060 // // values) 00061 // class uguale;// Italian word for "equal" (multicomponent version of bak) 00062 // class meno; // Italian word for "less" (multicomponent version of waaier) 00063 // class fra; // Italian word for "in between" (multicomponent version of 00064 // // tussen) 00065 class keywords;// A boolean version of term-document matrix. 00066 class mesh; // Composite index on 2-D regular mesh 00067 class band; // Composite index on 2-D bands. 00068 class direkte;// Directly use the integer values as bin numbers. 00069 class bylt; // Unbinned version of pack. 00070 class zona; // Unbinned version of zone. 00071 class fuzz; // Unbinned version of interval-equality encoding. 00072 class fuge; // Binned version of interval-equality encoding. 00073 } // namespace ibis 00074 00081 class ibis::index { 00082 public: 00086 enum INDEX_TYPE { 00087 BINNING=0, 00088 RANGE, 00089 MESA, 00090 AMBIT, 00091 PALE, 00092 PACK, 00093 ZONE, 00094 RELIC, 00095 ROSTER, 00096 SLICE, 00097 FADE, 00098 SBIAD, 00099 SAPID, 00100 EGALE, 00101 MOINS, 00102 ENTRE, 00103 BAK, 00104 BAK2, 00105 00106 00107 KEYWORDS, 00108 MESH, 00109 BAND, 00110 DIREKTE, 00111 GENERIC, 00112 BYLT, 00113 FUZZ, 00114 ZONA, 00115 FUGE, 00116 EXTERN 00117 }; 00118 00119 static index* create(const column* c, const char* name=0, 00120 const char* spec=0, int inEntirety=0); 00121 static bool isIndex(const char* f, INDEX_TYPE t); 00123 virtual ~index () {clear();}; 00124 00126 virtual INDEX_TYPE type() const = 0; 00129 virtual const char* name() const = 0; 00130 00133 virtual long evaluate(const ibis::qContinuousRange& expr, 00134 ibis::bitvector& hits) const = 0; 00136 virtual long select(const ibis::qContinuousRange&, void*) const =0; 00138 virtual long select(const ibis::qContinuousRange&, void*, 00139 ibis::bitvector&) const =0; 00142 virtual long evaluate(const ibis::qDiscreteRange&, 00143 ibis::bitvector&) const { 00144 return -1;} 00145 00156 virtual void estimate(const ibis::qContinuousRange&, 00157 ibis::bitvector& lower, 00158 ibis::bitvector& upper) const { 00159 lower.set(0, nrows); upper.set(1, nrows);} 00161 virtual uint32_t estimate(const ibis::qContinuousRange&) const { 00162 return nrows;} 00170 virtual float undecidable(const ibis::qContinuousRange&, 00171 ibis::bitvector&) const {return 0.5;} 00172 00175 virtual void estimate(const ibis::qDiscreteRange& expr, 00176 ibis::bitvector& lower, 00177 ibis::bitvector& upper) const; 00178 virtual uint32_t estimate(const ibis::qDiscreteRange& expr) const; 00179 virtual float undecidable(const ibis::qDiscreteRange& expr, 00180 ibis::bitvector& iffy) const; 00181 00183 virtual void estimate(const ibis::index& idx2, 00184 const ibis::deprecatedJoin& expr, 00185 ibis::bitvector64& lower, 00186 ibis::bitvector64& upper) const; 00189 virtual void estimate(const ibis::index& idx2, 00190 const ibis::deprecatedJoin& expr, 00191 const ibis::bitvector& mask, 00192 ibis::bitvector64& lower, 00193 ibis::bitvector64& upper) const; 00194 virtual void estimate(const ibis::index& idx2, 00195 const ibis::deprecatedJoin& expr, 00196 const ibis::bitvector& mask, 00197 const ibis::qRange* const range1, 00198 const ibis::qRange* const range2, 00199 ibis::bitvector64& lower, 00200 ibis::bitvector64& upper) const; 00202 virtual int64_t estimate(const ibis::index& idx2, 00203 const ibis::deprecatedJoin& expr) const; 00206 virtual int64_t estimate(const ibis::index& idx2, 00207 const ibis::deprecatedJoin& expr, 00208 const ibis::bitvector& mask) const; 00209 virtual int64_t estimate(const ibis::index& idx2, 00210 const ibis::deprecatedJoin& expr, 00211 const ibis::bitvector& mask, 00212 const ibis::qRange* const range1, 00213 const ibis::qRange* const range2) const; 00214 00216 virtual void estimate(const ibis::deprecatedJoin& expr, 00217 const ibis::bitvector& mask, 00218 const ibis::qRange* const range1, 00219 const ibis::qRange* const range2, 00220 ibis::bitvector64& lower, 00221 ibis::bitvector64& upper) const; 00222 virtual int64_t estimate(const ibis::deprecatedJoin& expr, 00223 const ibis::bitvector& mask, 00224 const ibis::qRange* const range1, 00225 const ibis::qRange* const range2) const; 00226 00228 virtual double estimateCost(const ibis::qContinuousRange&) const { 00229 return (offset32.empty() ? (nrows<<3) : offset32.back());} 00231 virtual double estimateCost(const ibis::qDiscreteRange&) const { 00232 return (offset32.empty() ? (nrows<<3) : offset32.back());} 00233 00236 virtual void print(std::ostream& out) const = 0; 00240 virtual int write(const char* name) const = 0; 00244 virtual int read(const char* name) = 0; 00247 virtual int read(ibis::fileManager::storage* st) = 0; 00248 00250 virtual long append(const char*, const char*, uint32_t) {return -1;} 00251 00252 inline float sizeInBytes() const; 00254 virtual void speedTest(std::ostream&) const {}; 00256 virtual uint32_t numBitvectors() const {return bits.size();} 00258 virtual const ibis::bitvector* getBitvector(uint32_t i) const { 00259 if (i < bits.size()) { 00260 if (bits[i] == 0) 00261 activate(i); 00262 return bits[i]; 00263 } 00264 else { 00265 return 0; 00266 } 00267 } 00268 00271 virtual void binBoundaries(std::vector<double>&) const {return;} 00272 virtual void binWeights(std::vector<uint32_t>&) const {return;} 00273 00275 virtual long getCumulativeDistribution 00276 (std::vector<double>& bds, std::vector<uint32_t>& cts) const; 00278 virtual long getDistribution 00279 (std::vector<double>& bbs, std::vector<uint32_t>& cts) const; 00281 virtual double getMin() const { 00282 return std::numeric_limits<double>::quiet_NaN();} 00284 virtual double getMax() const { 00285 return std::numeric_limits<double>::quiet_NaN();} 00289 virtual double getSum() const { 00290 return std::numeric_limits<double>::quiet_NaN();} 00292 uint32_t getNRows() const {return nrows;} 00293 00294 void addBins(uint32_t ib, uint32_t ie, ibis::bitvector& res) const; 00295 void addBins(uint32_t ib, uint32_t ie, ibis::bitvector& res, 00296 const ibis::bitvector& tot) const; 00297 void sumBins(uint32_t ib, uint32_t ie, ibis::bitvector& res) const; 00298 void sumBins(uint32_t ib, uint32_t ie, ibis::bitvector& res, 00299 uint32_t ib0, uint32_t ie0) const; 00300 void sumBins(const ibis::array_t<uint32_t> &, ibis::bitvector&) const; 00301 00307 virtual int expandRange(ibis::qContinuousRange&) const {return 0;} 00308 virtual int contractRange(ibis::qContinuousRange&) const {return 0;} 00309 00310 typedef std::map< double, ibis::bitvector* > VMap; 00311 typedef std::map< double, uint32_t > histogram; 00312 template <typename E> 00313 static void mapValues(const array_t<E>& val, VMap& bmap); 00314 template <typename E> 00315 static void mapValues(const array_t<E>& val, histogram& hist, 00316 uint32_t count=0); 00317 template <typename E> 00318 static void mapValues(const array_t<E>& val, array_t<E>& bounds, 00319 std::vector<uint32_t>& cnts); 00320 template <typename E1, typename E2> 00321 static void mapValues(const array_t<E1>& val1, const array_t<E2>& val2, 00322 array_t<E1>& bnd1, array_t<E2>& bnd2, 00323 std::vector<uint32_t>& cnts); 00326 static void divideCounts(array_t<uint32_t>& bounds, 00327 const array_t<uint32_t>& cnt); 00328 00329 // three static functions to perform the task of summing up bit sequences 00330 static void addBits(const array_t<bitvector*>& bits, 00331 uint32_t ib, uint32_t ie, ibis::bitvector& res); 00332 static void sumBits(const array_t<bitvector*>& bits, 00333 uint32_t ib, uint32_t ie, ibis::bitvector& res); 00334 static void sumBits(const array_t<bitvector*>& bits, 00335 const ibis::bitvector& tot, uint32_t ib, uint32_t ie, 00336 ibis::bitvector& res); 00337 // a static function to assign bases for multicomponent schemes 00338 static void setBases(array_t<uint32_t>& bases, uint32_t card, 00339 uint32_t nbase = 2); 00340 00341 protected: 00342 // shared members for all indexes 00344 const ibis::column* col; 00347 mutable ibis::fileManager::storage* str; 00349 mutable const char* fname; 00351 mutable array_t<int32_t> offset32; 00355 mutable array_t<int64_t> offset64; 00357 mutable array_t<ibis::bitvector*> bits; 00361 uint32_t nrows; 00362 00366 index(const ibis::column* c=0) : col(c), str(0), fname(0), nrows(0) {} 00367 index(const ibis::column* c, ibis::fileManager::storage* s); 00368 00369 void dataFileName(std::string& name, const char* f=0) const; 00370 void indexFileName(std::string& name, const char* f=0) const; 00371 static void indexFileName(std::string& name, const ibis::column* col1, 00372 const ibis::column* col2, const char* f=0); 00373 00375 virtual void activate() const; 00377 virtual void activate(uint32_t i) const; 00380 virtual void activate(uint32_t i, uint32_t j) const; 00382 virtual void clear(); 00383 00385 // both VMap and histogram assume that all supported data types can be 00386 // safely stored in double 00389 void mapValues(const char* f, VMap& bmap) const; 00391 void mapValues(const char* f, histogram& hist, uint32_t count=0) const; 00392 00393 void computeMinMax(const char* f, double& min, double& max) const; 00395 void optionalUnpack(array_t<ibis::bitvector*>& bits, 00396 const char* opt); 00397 00398 int initOffsets(int fdes, const char offsize, size_t start, 00399 uint32_t nobs); 00400 int initOffsets(ibis::fileManager::storage* st, size_t start, 00401 uint32_t nobs); 00402 void initBitmaps(int fdes); 00403 void initBitmaps(ibis::fileManager::storage* st); 00404 00405 class barrel; 00406 00407 private: 00408 index(const index&); // no copy constructor 00409 index& operator=(const index&); // no assignment operator 00410 }; // ibis::index 00411 00412 00415 class ibis::index::barrel : public ibis::math::barrel { 00416 public: 00417 barrel(const ibis::math::term* t) : ibis::math::barrel(t) {} 00418 00419 void setValue(uint32_t i, double v) {varvalues[i] = v;} 00420 }; // ibis::index::barrel 00421 00422 00425 inline float ibis::index::sizeInBytes() const { 00426 if (offset64.size() > bits.size()) { 00427 return (float)offset64[bits.size()]; 00428 } 00429 else if (offset32.size() > bits.size()) { 00430 return (float)offset32[bits.size()]; 00431 } 00432 else if (str != 0) { 00433 return (float)str->size(); 00434 } 00435 else if (*fname != 0) { 00436 return (float)ibis::util::getFileSize(fname); 00437 } 00438 else { 00439 return FLT_MAX; 00440 } 00441 } // ibis::index::sizeInBytes 00442 00443 #endif // IBIS_INDEX_H
![]() |