Go to the documentation of this file.
1 //File: $Id$
2 // Author: John Wu <John.Wu at>
3 // Lawrence Berkeley National Laboratory
4 // Copyright (c) 2000-2016 the Regents of the University of California
5 #ifndef IBIS_INDEX_H
6 #define IBIS_INDEX_H
7 #if defined(_WIN32) && defined(_MSC_VER)
25 #pragma warning(disable:4786) // some identifier longer than 256 characters
26 #endif
27 #include "qExpr.h"
28 #include "bitvector.h"
30 #include <string>
31 #include <limits> // std::numeric_limits
33 namespace ibis { // the concrete classes of index hierarchy
34  class bin; // equal size bins (equality encoded bitmap index)
35  class range; // one-sided range encoded (cumulative) index
36  class mesa; // interval encoded index (two-sided ranges)
37  class ambit; // two-level, cumulative index (cumulate on all levels)
38  class pale; // two-level, cumulative on the fine level only
39  class pack; // two-level, cumulative on the coarse level only
40  class zone; // two-level, do not cumulate on either levels
41  class relic; // the basic bitmap index (one bitmap per distinct value)
42  class skive; // a binary encoded index with repacking of keyvalues
43  class slice; // the bit slice index (binary encoding of ibis::relic)
44  class fade; // a more controlled slice (multicomponent range code)
45  class sbiad; // Italian translation of fade (multicomponent interval code)
46  class sapid; // closest word to sbiad in English (multicomponent equality
47  // code)
48  class egale; // French word for "equal" (multicomponent equality code on
49  // bins)
50  class moins; // French word for "less" (multicomponent range code on bins)
51  class entre; // French word for "in between" (multicomponent interval code
52  // on bins)
53  class bak; // Dutch word for "bin" (simple equality encoding for
54  // values mapped to reduced precision floating-point
55  // values)
56  class bak2; // a variation of bak that splits each bak bin in two
57 // class waaier;// Dutch word for "range" (simple range encoding for values
58 // // mapped to reduced precision floating-point values)
59 // class tussen;// Dutch word for "in between" (simple interval encoding
60 // // for values mapped to reduce precision floating-point
61 // // values)
62 // class uguale;// Italian word for "equal" (multicomponent version of bak)
63 // class meno; // Italian word for "less" (multicomponent version of waaier)
64 // class fra; // Italian word for "in between" (multicomponent version of
65 // // tussen)
66  class keywords;// A boolean version of term-document matrix.
67  class mesh; // Composite index on 2-D regular mesh
68  class band; // Composite index on 2-D bands.
69  class direkte;// Directly use the integer values as bin numbers.
70  class bylt; // Unbinned version of pack.
71  class zona; // Unbinned version of zone.
72  class fuzz; // Unbinned version of interval-equality encoding.
73  class fuge; // Binned version of interval-equality encoding.
74 } // namespace ibis
82 class ibis::index {
83 public:
86  enum INDEX_TYPE {
151  };
153  static index* create(const column* c, const char* name=0,
154  const char* spec=0, int inEntirety=0);
155  static bool isIndex(const char* f, INDEX_TYPE t);
158  virtual ~index () {clear();};
161  virtual INDEX_TYPE type() const = 0;
164  virtual const char* name() const = 0;
168  virtual long evaluate(const ibis::qContinuousRange& expr,
169  ibis::bitvector& hits) const = 0;
171  virtual long select(const ibis::qContinuousRange&, void*) const =0;
173  virtual long select(const ibis::qContinuousRange&, void*,
174  ibis::bitvector&) const =0;
177  virtual long evaluate(const ibis::qDiscreteRange&,
178  ibis::bitvector&) const {
179  return -1;}
191  virtual void estimate(const ibis::qContinuousRange&,
192  ibis::bitvector& lower,
193  ibis::bitvector& upper) const {
194  lower.set(0, nrows); upper.set(1, nrows);}
196  virtual uint32_t estimate(const ibis::qContinuousRange&) const {
197  return nrows;}
205  virtual float undecidable(const ibis::qContinuousRange &expr,
206  ibis::bitvector &iffy) const {return 0.5;}
210  virtual void estimate(const ibis::qDiscreteRange& expr,
211  ibis::bitvector& lower,
212  ibis::bitvector& upper) const;
213  virtual uint32_t estimate(const ibis::qDiscreteRange& expr) const;
214  virtual float undecidable(const ibis::qDiscreteRange& expr,
215  ibis::bitvector& iffy) const;
218  virtual void estimate(const ibis::index& idx2,
219  const ibis::deprecatedJoin& expr,
220  ibis::bitvector64& lower,
221  ibis::bitvector64& upper) const;
224  virtual void estimate(const ibis::index& idx2,
225  const ibis::deprecatedJoin& expr,
226  const ibis::bitvector& mask,
227  ibis::bitvector64& lower,
228  ibis::bitvector64& upper) const;
229  virtual void estimate(const ibis::index& idx2,
230  const ibis::deprecatedJoin& expr,
231  const ibis::bitvector& mask,
232  const ibis::qRange* const range1,
233  const ibis::qRange* const range2,
234  ibis::bitvector64& lower,
235  ibis::bitvector64& upper) const;
237  virtual int64_t estimate(const ibis::index& idx2,
238  const ibis::deprecatedJoin& expr) const;
241  virtual int64_t estimate(const ibis::index& idx2,
242  const ibis::deprecatedJoin& expr,
243  const ibis::bitvector& mask) const;
244  virtual int64_t estimate(const ibis::index& idx2,
245  const ibis::deprecatedJoin& expr,
246  const ibis::bitvector& mask,
247  const ibis::qRange* const range1,
248  const ibis::qRange* const range2) const;
251  virtual void estimate(const ibis::deprecatedJoin& expr,
252  const ibis::bitvector& mask,
253  const ibis::qRange* const range1,
254  const ibis::qRange* const range2,
255  ibis::bitvector64& lower,
256  ibis::bitvector64& upper) const;
257  virtual int64_t estimate(const ibis::deprecatedJoin& expr,
258  const ibis::bitvector& mask,
259  const ibis::qRange* const range1,
260  const ibis::qRange* const range2) const;
263  virtual double estimateCost(const ibis::qContinuousRange&) const {
264  return (offset32.empty() ? (nrows<<3) : offset32.back());}
266  virtual double estimateCost(const ibis::qDiscreteRange&) const {
267  return (offset32.empty() ? (nrows<<3) : offset32.back());}
271  virtual void print(std::ostream& out) const = 0;
275  virtual int write(const char* name) const = 0;
277  virtual int write(ibis::array_t<double> &,
279  ibis::array_t<uint32_t> &) const =0;
282  virtual void serialSizes(uint64_t&, uint64_t&, uint64_t&) const =0;
286  virtual int read(const char* name) = 0;
289  virtual int read(ibis::fileManager::storage* st) = 0;
292  virtual long append(const char*, const char*, uint32_t) {return -1;}
294  virtual index* dup() const = 0;
296  float sizeInBytes() const;
298  virtual void speedTest(std::ostream&) const {};
300  virtual uint32_t numBitvectors() const {return bits.size();}
302  virtual const ibis::bitvector* getBitvector(uint32_t i) const {
303  if (i < bits.size()) {
304  if (bits[i] == 0)
305  activate(i);
306  return bits[i];
307  }
308  else {
309  return 0;
310  }
311  }
315  virtual void binBoundaries(std::vector<double>&) const {return;}
316  virtual void binWeights(std::vector<uint32_t>&) const {return;}
319  virtual long getCumulativeDistribution
320  (std::vector<double>& bds, std::vector<uint32_t>& cts) const;
322  virtual long getDistribution
323  (std::vector<double>& bbs, std::vector<uint32_t>& cts) const;
325  virtual double getMin() const {
326  return std::numeric_limits<double>::quiet_NaN();}
328  virtual double getMax() const {
329  return std::numeric_limits<double>::quiet_NaN();}
333  virtual double getSum() const {
334  return std::numeric_limits<double>::quiet_NaN();}
336  uint32_t getNRows() const {return nrows;}
340  bool empty() const {return (bits.empty() || nrows == 0);}
342  void addBins(uint32_t ib, uint32_t ie, ibis::bitvector& res) const;
343  void addBins(uint32_t ib, uint32_t ie, ibis::bitvector& res,
344  const ibis::bitvector& tot) const;
345  void sumBins(uint32_t ib, uint32_t ie, ibis::bitvector& res) const;
346  void sumBins(uint32_t ib, uint32_t ie, ibis::bitvector& res,
347  uint32_t ib0, uint32_t ie0) const;
348  void sumBins(uint32_t ib, uint32_t ie, ibis::bitvector& res,
349  uint32_t *buf) const;
350  void sumBins(const ibis::array_t<uint32_t> &, ibis::bitvector&) const;
357  virtual int expandRange(ibis::qContinuousRange&) const {return 0;}
358  virtual int contractRange(ibis::qContinuousRange&) const {return 0;}
360  typedef std::map< double, ibis::bitvector* > VMap;
361  typedef std::map< double, uint32_t > histogram;
362  template <typename E>
363  static void mapValues(const array_t<E>& val, VMap& bmap);
364  template <typename E>
365  static void mapValues(const array_t<E>& val, histogram& hist,
366  uint32_t count=0);
367  template <typename E>
368  static void mapValues(const array_t<E>& val, array_t<E>& bounds,
369  std::vector<uint32_t>& cnts);
370  template <typename E1, typename E2>
371  static void mapValues(const array_t<E1>& val1, const array_t<E2>& val2,
372  array_t<E1>& bnd1, array_t<E2>& bnd2,
373  std::vector<uint32_t>& cnts);
376  static void divideCounts(array_t<uint32_t>& bounds,
377  const array_t<uint32_t>& cnt);
379  // three static functions to perform the task of summing up bit sequences
380  static void addBits(const array_t<bitvector*>& bits,
381  uint32_t ib, uint32_t ie, ibis::bitvector& res);
382  static void sumBits(const array_t<bitvector*>& bits,
383  uint32_t ib, uint32_t ie, ibis::bitvector& res);
384  static void sumBits(const array_t<bitvector*>& bits,
385  const ibis::bitvector& tot, uint32_t ib, uint32_t ie,
386  ibis::bitvector& res);
388  static void setBases(array_t<uint32_t>& bases, uint32_t card,
389  uint32_t nbase = 2);
390  static void printHeader(std::ostream&, const char*);
392 protected:
393  // forward declarations.
394  class barrel;
397  // shared members for all indexes
399  const ibis::column* col;
404  mutable const char* fname;
418  uint32_t nrows;
423  index(const ibis::column* c=0)
424  : col(c), str(0), fname(0), breader(0), nrows(0) {}
426  index(const index&);
427  index& operator=(const index&);
429  void dataFileName(std::string& name, const char* f=0) const;
430  void indexFileName(std::string& name, const char* f=0) const;
431  static void indexFileName(std::string& name, const ibis::column* col1,
432  const ibis::column* col2, const char* f=0);
435  virtual void activate() const;
437  virtual void activate(uint32_t i) const;
440  virtual void activate(uint32_t i, uint32_t j) const;
442  virtual void clear();
443  virtual size_t getSerialSize() const throw();
446  // both VMap and histogram assume that all supported data types can be
447  // safely stored in double
450  void mapValues(const char* f, VMap& bmap) const;
452  void mapValues(const char* f, histogram& hist, uint32_t count=0) const;
454  void computeMinMax(const char* f, double& min, double& max) const;
456  void optionalUnpack(array_t<ibis::bitvector*>& bits,
457  const char* opt);
459  int initOffsets(int64_t *, size_t);
460  int initOffsets(int fdes, const char offsize, size_t start,
461  uint32_t nobs);
462  int initOffsets(ibis::fileManager::storage *st, size_t start,
463  uint32_t nobs);
464  void initBitmaps(int fdes);
465  void initBitmaps(ibis::fileManager::storage *st);
466  void initBitmaps(uint32_t *st);
467  void initBitmaps(void *ctx, FastBitReadBitmaps rd);
469 private:
471  static index* readOld(const column*, const char*,
472  fileManager::storage*, INDEX_TYPE);
473  static index* buildNew(const column*, const char*, const char*);
474 }; // ibis::index
479 class ibis::index::barrel : public ibis::math::barrel {
480 public:
481  barrel(const ibis::math::term* t) : ibis::math::barrel(t) {}
483  void setValue(uint32_t i, double v) {varvalues[i] = v;}
484 }; // ibis::index::barrel
489 public:
492  : _context(ctx), _reader(rd) {}
500  int read(uint64_t b, uint64_t c, ibis::fileManager::buffer<uint32_t> &buf) {
501  if (c == 0) return 0; // nothing to read
502  if (buf.size() < c) {
503  buf.resize(c);
504  if (buf.size() < c) {
505  LOGGER(ibis::gVerbose > 1)
506  << "Warning -- bitmapReader(" << _context << ", "
507  << reinterpret_cast<void*>(_reader) << ") failed to "
508  "allocate enough space to read " << c << " elements "
509  "from the given context";
510  return -1;
511  }
512  }
514  return _reader(_context, b, c, buf.address());
515  }
523  int read(uint64_t b, uint64_t c, ibis::array_t<uint32_t> &buf) {
524  if (c == 0) return 0; // nothing to read
525  if (buf.size() < c) {
526  buf.resize(c);
527  if (buf.size() < c) {
528  LOGGER(ibis::gVerbose > 1)
529  << "Warning -- bitmapReader(" << _context << ", "
530  << reinterpret_cast<void*>(_reader) << ") failed to "
531  "allocate enough space to read " << c << " elements "
532  "from the given context";
533  return -1;
534  }
535  }
537  return _reader(_context, b, c, buf.begin());
538  }
540 private:
541  void *_context;
542  FastBitReadBitmaps _reader;
544  // Default constructor. Declared, but not defined.
545  bitmapReader();
546 }; // ibis::index::bitmapReader
547 #endif // IBIS_INDEX_H
virtual double getMin() const
The minimum value recorded in the index.
Definition: index.h:325
static void setBases(array_t< uint32_t > &bases, uint32_t card, uint32_t nbase=2)
Fill the array bases with the values that cover the range [0, card).
Definition: index.cpp:8650
ibis::bylt, unbinned range-equality encoding.
Definition: index.h:140
A class to represent simple range conditions.
Definition: qExpr.h:207
size_t resize(size_t sz=0)
Increase the size of the buffer.
Definition: fileManager.cpp:1794
Definition: index.h:93
ibis::fuge, binned interval-equality encoding.
Definition: index.h:146
const char * fname
The name of the file containing the index.
Definition: index.h:404
void sumBins(uint32_t ib, uint32_t ie, ibis::bitvector &res) const
Sum up bits[ib:ie-1] and place the result in res.
Definition: index.cpp:6450
size_t size() const
Definition: array_t.h:69
virtual long append(const char *, const char *, uint32_t)
Extend the index.
Definition: index.h:292
virtual long evaluate(const ibis::qDiscreteRange &, ibis::bitvector &) const
To evaluate the exact hits.
Definition: index.h:177
float sizeInBytes() const
Estiamte the size of this index object measured in bytes.
Definition: index.cpp:1352
ibis::zone, equality-equality encoding on bins.
Definition: index.h:103
static void sumBits(const array_t< bitvector * > &bits, uint32_t ib, uint32_t ie, ibis::bitvector &res)
Sum up pile[ib:ie-1] and place the result in res.
Definition: index.cpp:7349
bitmapReader * breader
The functor to read serialized bitmaps from a more complex source.
Definition: index.h:406
A barrel to hold a list of variables.
Definition: qExpr.h:760
uint32_t nrows
The number of rows represented by the index.
Definition: index.h:418
Simple range condition.
Definition: qExpr.h:252
virtual void speedTest(std::ostream &) const
Time some logical operations and print out their speed.
Definition: index.h:298
The storage class treats all memory as char*.
Definition: fileManager.h:237
virtual const char * name() const =0
Returns the name of the index, similar to the function type, but returns a string instead...
static bool isIndex(const char *f, INDEX_TYPE t)
Is the named file an index file? Read the header of the named file to determine if it contains an ind...
Definition: index.cpp:1426
array_t< ibis::bitvector * > bits
A list of bitvectors.
Definition: index.h:414
ibis::fileManager::storage * str
The underlying storage.
Definition: index.h:402
bitmapReader(void *ctx, FastBitReadBitmaps rd)
Definition: index.h:491
ibis::bak, reduced precision mapping, equality code.
Definition: index.h:123
index & operator=(const index &)
Assignment operator.
Definition: index.cpp:1299
Definition: index.h:91
The current implementation of FastBit is code named IBIS; most data structures and functions are in t...
Definition: bord.h:16
ibis::bak2, splits each BAK bin in three, one less than the mapped value, one greater than the mapped...
Definition: index.h:128
virtual void estimate(const ibis::qContinuousRange &, ibis::bitvector &lower, ibis::bitvector &upper) const
Computes an approximation of hits as a pair of lower and upper bounds.
Definition: index.h:191
The class to represent a column of a data partition.
Definition: column.h:65
ibis::pale, equality-range encoding on bins.
Definition: index.h:99
virtual long evaluate(const ibis::qContinuousRange &expr, ibis::bitvector &hits) const =0
To evaluate the exact hits.
A data structure to represent a sequence of bits.
Definition: bitvector64.h:54
not used.
Definition: index.h:132
ibis::sapid, multicomponent equality encoding (unbinned).
Definition: index.h:115
size_t size() const
The number of elements in the buffer. NOT the number of bytes.
Definition: fileManager.h:142
int read(uint64_t b, uint64_t c, ibis::fileManager::buffer< uint32_t > &buf)
The main function to read the serialized bitmaps.
Definition: index.h:500
ibis::slice, bit-sliced index.
Definition: index.h:148
virtual float undecidable(const ibis::qContinuousRange &expr, ibis::bitvector &iffy) const
Mark the position of the rows that can not be decided with this index.
Definition: index.h:205
virtual index * dup() const =0
Duplicate the content of an index object.
void dataFileName(std::string &name, const char *f=0) const
Generate data file name from "f".
Definition: index.cpp:1464
ibis::zona, unbinned equality-equality encoding.
Definition: index.h:144
The base index class.
Definition: index.h:82
virtual uint32_t estimate(const ibis::qContinuousRange &) const
Returns an upper bound on the number of hits.
Definition: index.h:196
static index * create(const column *c, const char *name=0, const char *spec=0, int inEntirety=0)
Index factory.
Definition: index.cpp:168
virtual void clear()
Clear the existing content.
Definition: index.cpp:1313
Define the query expression.
virtual long getCumulativeDistribution(std::vector< double > &bds, std::vector< uint32_t > &cts) const
Cumulative distribution of the data.
Definition: index.cpp:2634
A buffer is intended to be a temporary workspace in memory.
Definition: fileManager.h:128
The integer values of this enum type are used in the index files to differentiate the indexes...
Definition: index.h:86
static void divideCounts(array_t< uint32_t > &bounds, const array_t< uint32_t > &cnt)
Determine how to split the array cnt, so that each group has roughly the same total value...
Definition: index.cpp:3648
uint32_t getNRows() const
Return the number of rows represented by this object.
Definition: index.h:336
T * address() const
Address of the buffer allocated.
Definition: fileManager.h:140
virtual uint32_t numBitvectors() const
Returns the number of bit vectors used by the index.
Definition: index.h:300
virtual double estimateCost(const ibis::qContinuousRange &) const
Estimate the cost of evaluating a range condition.
Definition: index.h:263
This fileManager is intended to allow different objects to share the same open file.
Definition: fileManager.h:23
ibis::entre, multicomponent interval encoding on bins.
Definition: index.h:121
virtual void binBoundaries(std::vector< double > &) const
The function binBoundaries and binWeights return bin boundaries and counts of each bin respectively...
Definition: index.h:315
A specialization that adds function setValue.
Definition: index.h:479
virtual ~index()
The destructor.
Definition: index.h:158
virtual void print(std::ostream &out) const =0
Prints human readable information.
ibis::pack, range-equality encoding on bins.
Definition: index.h:101
not used.
Definition: index.h:138
virtual double estimateCost(const ibis::qDiscreteRange &) const
Estimate the cost of evaluating a range condition.
Definition: index.h:266
virtual void serialSizes(uint64_t &, uint64_t &, uint64_t &) const =0
Compute the size of arrays that would be generated by the serializatioin function (write)...
array_t< int64_t > offset64
Starting positions of the bitvectors.
Definition: index.h:412
void optionalUnpack(array_t< ibis::bitvector * > &bits, const char *opt)
A function to decide whether to uncompress the bitvectors.
Definition: index.cpp:8730
int initOffsets(int64_t *, size_t)
Initialize the offsets from the given data array.
Definition: index.cpp:4213
void resize(size_t n)
Change the size of the array to have elements.
Definition: array_t.cpp:1469
int read(uint64_t b, uint64_t c, ibis::array_t< uint32_t > &buf)
The main function to read the serialized bitmaps.
Definition: index.h:523
virtual long getDistribution(std::vector< double > &bbs, std::vector< uint32_t > &cts) const
Binned distribution of the data.
Definition: index.cpp:2615
externally defined index.
Definition: index.h:150
ibis::skive, binary encoding with recoding of key values.
Definition: index.h:109
virtual double getSum() const
Compute the approximate sum of all the values indexed.
Definition: index.h:333
bool empty() const
The index object is considered empty if there is no bitmap or getNRows returns 0. ...
Definition: index.h:340
ibis::moins, multicomponent range encoding on bins.
Definition: index.h:119
The abstract base class for arithmetic terms.
Definition: qExpr.h:728
array_t< int32_t > offset32
Starting positions of the bitvectors.
Definition: index.h:408
ibis::roster, RID list.
Definition: index.h:107
void addBins(uint32_t ib, uint32_t ie, ibis::bitvector &res) const
Add the sum of bits[ib] through bits[ie-1] to res.
Definition: index.cpp:5537
ibis::sbiad, multicomponent interval encoding (unbinned).
Definition: index.h:113
ibis::fade, multicomponent range encoding (unbinned).
Definition: index.h:111
ibis::relic, the basic bitmap index.
Definition: index.h:105
void initBitmaps(int fdes)
Prepare the bitmaps using the given file descriptor.
Definition: index.cpp:4298
A data structure to represent a sequence of bits.
Definition: bitvector.h:62
virtual int read(const char *name)=0
Reconstructs an index from the named file.
Definition of Word-Aligned Hybrid code.
Definition: index.h:95
A simple container to hold the function pointer given by user for reading the serialized bitmaps...
Definition: index.h:488
virtual void activate() const
Regenerate all bitvectors from the underlying storage.
Definition: index.cpp:4638
void indexFileName(std::string &name, const char *f=0) const
Generates index file name from "f".
Definition: index.cpp:1473
virtual int expandRange(ibis::qContinuousRange &) const
The functions expandRange and contractRange expands or contracts the boundaries of a range condition ...
Definition: index.h:357
const ibis::column * col
Pointer to the column this index is for.
Definition: index.h:395
virtual size_t getSerialSize() const
Compute the size of the serialized version of the index.
Definition: index.cpp:1341
index(const ibis::column *c=0)
Default constructor.
Definition: index.h:423
ibis::ambit, range-range two level encoding on bins.
Definition: index.h:97
virtual INDEX_TYPE type() const =0
Returns an index type identifier.
not used.
Definition: index.h:134
virtual double getMax() const
The maximum value recorded in the index.
Definition: index.h:328
int(* FastBitReadBitmaps)(void *context, uint64_t start, uint64_t count, uint32_t *data)
A function prototype for delayed index reconstruction.
Definition: const.h:341
A discrete range expression.
Definition: qExpr.h:337
ibis::fuzz, unbinned interval-equality encoding.
Definition: index.h:142
ibis::direkte, hash value to bitmaps.
Definition: index.h:136
virtual int write(const char *name) const =0
Save index to a file.
ibis::egale, multicomponent equality encoding on bins.
Definition: index.h:117
ibis::keywords, boolean term-document matrix.
Definition: index.h:130
static void addBits(const array_t< bitvector * > &bits, uint32_t ib, uint32_t ie, ibis::bitvector &res)
Add the pile[ib:ie-1] to res.
Definition: index.cpp:7186
A join is defined by two names and a numerical expression.
Definition: qExpr.h:1240
virtual const ibis::bitvector * getBitvector(uint32_t i) const
Return a pointer to the ith bitvector used in the index (may be 0).
Definition: index.h:302
virtual long select(const ibis::qContinuousRange &, void *) const =0
Evaluate the range condition and select values.
void set(int val, word_t n)
Create a vector with n bits of value val (cf.
Definition: bitvector.cpp:256

Make It A Bit Faster
Contact us
FastBit source code
FastBit mailing list archive