ikeywords.h
Go to the documentation of this file.
1 //File: $Id$
2 // Author: John Wu <John.Wu at ACM.org>
3 // Copyright (c) 2006-2016 the Regents of the University of California
4 #ifndef IBIS_KEYWORDS_H
5 #define IBIS_KEYWORDS_H
6 
11 #include "index.h" // base index class
12 #include "category.h" // definitions of string-valued columns
13 
/// Class ibis::keywords defines a boolean term-document matrix.
/// It is a specialization of ibis::index, the base index class.
///
/// NOTE(review): this text is a source-browser listing; the leading number
/// on each line is the listing's own line number, and a few listing lines
/// (100, 120, 155-156) are absent from this extract.
96 class ibis::keywords : public ibis::index {
97 public:
 /// Destructor.  Releases the current content through clear().
98  virtual ~keywords() {clear();}
 /// Constructor.
99  explicit keywords(const ibis::column* c, const char* f=0);
 // NOTE(review): listing line 100 is missing, so the declaration that ends
 // on the next line has lost its first line -- restore from the real header.
101  const char* f=0);
103 
 /// Returns an index type identifier; always KEYWORDS for this class.
104  virtual INDEX_TYPE type() const {return KEYWORDS;}
 /// Returns the name of the index, similar to the function type, but
 /// returns a string instead; always "keywords" for this class.
105  virtual const char* name() const {return "keywords";}
 /// The functions binBoundaries and binWeights return bin boundaries and
 /// counts of each bin respectively.  This override only empties @c b.
106  virtual void binBoundaries(std::vector<double>& b) const {b.clear();}
107  virtual void binWeights(std::vector<uint32_t>& b) const;
 /// The minimum value recorded in the index.  Always returns DBL_MAX here.
108  virtual double getMin() const {return DBL_MAX;}
 /// The maximum value recorded in the index.  Always returns -DBL_MAX here.
109  virtual double getMax() const {return -DBL_MAX;}
 /// Compute the approximate sum of all the values indexed.  Always
 /// returns -DBL_MAX here.
110  virtual double getSum() const {return -DBL_MAX;}
 /// Match a particular keyword.
111  long search(const char*, ibis::bitvector&) const;
 // The three overloads below are declared only; their exact semantics are
 // not visible in this listing.
112  long search(const char*) const;
113  long search(const std::vector<std::string>&, ibis::bitvector&) const;
114  long search(const std::vector<std::string>&) const;
115 
 /// Duplicate the content of an index object.
116  virtual index* dup() const;
 /// Prints human readable information.
117  virtual void print(std::ostream& out) const;
 /// Compute the size of arrays that would be generated by the
 /// serialization function (write).
118  virtual void serialSizes(uint64_t&, uint64_t&, uint64_t&) const;
 /// Save index to three arrays.  Serialize the index in memory.
 // NOTE(review): listing line 120 is missing; the member summary gives the
 // middle parameter as ibis::array_t<int64_t>& -- confirm against the header.
119  virtual int write(ibis::array_t<double> &,
121  ibis::array_t<uint32_t> &) const;
122  virtual int write(const char* dt) const;
 /// Reconstructs an index from the named file.
123  virtual int read(const char* idxfile);
 // Overload taking a fileManager storage object (the storage class treats
 // all memory as char*).
124  virtual int read(ibis::fileManager::storage* st);
 /// Extend the index.
125  virtual long append(const char* dt, const char* df, uint32_t nnew);
126 
 // Keep the base-class overloads of evaluate and estimate visible next to
 // the ones declared below.
127  using ibis::index::evaluate;
128  using ibis::index::estimate;
 /// To evaluate the exact hits.
130  virtual long evaluate(const ibis::qContinuousRange& expr,
131  ibis::bitvector& hits) const;
 /// Computes an approximation of hits as a pair of lower and upper bounds.
132  virtual void estimate(const ibis::qContinuousRange& expr,
133  ibis::bitvector& lower,
134  ibis::bitvector& upper) const;
135  virtual uint32_t estimate(const ibis::qContinuousRange& expr) const;
 /// This class and its derived classes should produce exact answers,
 /// therefore no undecidable rows.  Empties @c iffy and returns 0.
138  virtual float undecidable(const ibis::qContinuousRange &,
139  ibis::bitvector &iffy) const {
140  iffy.clear();
141  return 0.0;
142  }
 /// Estimate the cost of evaluating a range condition.
143  virtual double estimateCost(const ibis::qContinuousRange& expr) const;
144  virtual double estimateCost(const ibis::qDiscreteRange& expr) const;
145 
 /// Evaluate the range condition and select values.  This override does
 /// no work and always returns -1.
146  virtual long select(const ibis::qContinuousRange&, void*) const {
147  return -1;}
 /// Evaluate the range condition, select values, and record the
 /// positions.  This override does no work and always returns -1.
148  virtual long select(const ibis::qContinuousRange&, void*,
149  ibis::bitvector&) const {
150  return -1;}
151 
 /// A simple tokenizer used to extract keywords.
152  class tokenizer;
153 
154 protected:
 // NOTE(review): listing lines 155-156 are missing.  The member summary
 // lists "ibis::dictionary terms" (a dictionary for the terms) as a member
 // of this class; presumably it is declared here -- TODO confirm.
157 
 /// Estimate the size of the .idx file.
158  virtual size_t getSerialSize() const throw();
 /// Reads a term-document list from an external file.
159  int readTermDocFile(const ibis::column* idcol, const char* f);
 /// Extract the term from a line of input term-document file.
160  inline char readTerm(const char*& buf, std::string &key) const;
 /// Extract the next integer in an input line.
161  inline uint32_t readUInt(const char*& buf) const;
 /// Read one line from the term-document file.
162  int readTDLine(std::istream& in, std::string& key,
163  std::vector<uint32_t>& idlist,
164  char* buf, uint32_t nbuf) const;
 /// Turn on the specified positions in a bitvector.
165  void setBits(std::vector<uint32_t>& pos, ibis::bitvector& bvec) const;
 /// Parse the text file to build a keyword index.
166  int parseTextFile(ibis::text::tokenizer &tkn, const char *f);
167 
 /// Clear the current content.
168  void clear();
 /// Reorder the terms in the dictionary.
169  void reorderTerms();
170 }; // class ibis::keywords
171 
177 inline char ibis::keywords::readTerm(const char*& buf,
178  std::string &keyword) const {
179  while (isspace(*buf)) // skip leading space
180  ++ buf;
181  while (isprint(*buf)) { // loop through all printable till the delimiter
182  if (*buf == ':') {
183  return *buf;
184  }
185  else if (isspace(*buf)) {
186  for (++ buf; isspace(*buf); ++ buf);
187  if (*buf == ':') {
188  return *buf;
189  }
190  else {
191  keyword += ' ';
192  keyword += *buf;
193  ++ buf;
194  }
195  }
196  else {
197  keyword += *buf;
198  ++ buf;
199  }
200  }
201  return *buf;
202 } // ibis::keywords::readTerm
203 
205 inline uint32_t ibis::keywords::readUInt(const char*& buf) const {
206  uint32_t res = 0;
207  while (*buf && ! isdigit(*buf)) // skip leading non-digit
208  ++ buf;
209 
210  while (isdigit(*buf)) {
211  res = res * 10 + (*buf - '0');
212  ++ buf;
213  }
214  return res;
215 } // ibis::keywords::readUInt
216 
/// A simple tokenizer used to extract keywords.
// NOTE(review): the class-head line (listing line 220) is missing from this
// extract; only the class body is visible here.
221 public:
 /// Constructor.  The delimiters default to ibis::util::delimiters
 /// (delimiters used to separate a string of names).
222  tokenizer(const char *d=ibis::util::delimiters);
 /// Destructor.
224  virtual ~tokenizer() {}
225 
 /// Tokenizer.  Declared only; the implementation is in ikeywords.cpp.
226  virtual int operator()(std::vector<const char*>& tkns, char *buf);
227 
228 protected:
 /// The list of delimiters.  May be empty.
230  std::string delim_;
231 }; // class ibis::keywords::tokenizer
232 #endif
virtual double getMin() const
The minimum value recorded in the index.
Definition: ikeywords.h:108
A simple tokenizer used to extract keywords.
Definition: ikeywords.h:220
void clear()
Clear the current content.
Definition: ikeywords.cpp:844
Class ibis::keywords defines a boolean term-document matrix.
Definition: ikeywords.h:96
Definition of the common functions of an index.
uint32_t readUInt(const char *&buf) const
Extract the next integer in an input line.
Definition: ikeywords.h:205
virtual size_t getSerialSize() const
Estimate the size of the .idx file.
Definition: ikeywords.cpp:1030
Defines three specializations of the column class.
virtual int read(const char *idxfile)
Reconstructs an index from the named file.
Definition: ikeywords.cpp:714
virtual ~tokenizer()
Destructor.
Definition: ikeywords.h:224
std::string delim_
The list of delimiters. May be empty.
Definition: ikeywords.h:230
Simple range condition.
Definition: qExpr.h:252
virtual double getMax() const
The maximum value recorded in the index.
Definition: ikeywords.h:109
The storage class treats all memory as char*.
Definition: fileManager.h:237
virtual void print(std::ostream &out) const
Prints human readable information.
Definition: ikeywords.cpp:501
virtual long append(const char *dt, const char *df, uint32_t nnew)
Extend the index.
Definition: ikeywords.cpp:860
long search(const char *, ibis::bitvector &) const
Match a particular keyword.
Definition: ikeywords.cpp:886
virtual void estimate(const ibis::qContinuousRange &expr, ibis::bitvector &lower, ibis::bitvector &upper) const
Computes an approximation of hits as a pair of lower and upper bounds.
Definition: ikeywords.cpp:874
STL namespace.
The current implementation of FastBit is code named IBIS; most data structures and functions are in t...
Definition: bord.h:16
virtual void estimate(const ibis::qContinuousRange &, ibis::bitvector &lower, ibis::bitvector &upper) const
Computes an approximation of hits as a pair of lower and upper bounds.
Definition: index.h:191
The class to represent a column of a data partition.
Definition: column.h:65
virtual long evaluate(const ibis::qContinuousRange &expr, ibis::bitvector &hits) const =0
To evaluate the exact hits.
virtual void serialSizes(uint64_t &, uint64_t &, uint64_t &) const
Compute the size of arrays that would be generated by the serialization function (write)...
Definition: ikeywords.cpp:695
ibis::dictionary terms
A dictionary for the terms.
Definition: ikeywords.h:152
void reorderTerms()
Reorder the terms in the dictionary.
Definition: ikeywords.cpp:851
tokenizer(const char *d=ibis::util::delimiters)
Constructor.
Definition: ikeywords.cpp:1043
virtual float undecidable(const ibis::qContinuousRange &expr, ibis::bitvector &iffy) const
Mark the position of the rows that can not be decided with this index.
Definition: index.h:205
The base index class.
Definition: index.h:82
virtual void binBoundaries(std::vector< double > &b) const
The function binBoundaries and binWeights return bin boundaries and counts of each bin respectively...
Definition: ikeywords.h:106
virtual int operator()(std::vector< const char * > &tkns, char *buf)
Tokenizer.
Definition: ikeywords.cpp:1058
virtual index * dup() const
Duplicate the content of an index object.
Definition: ikeywords.cpp:166
INDEX_TYPE
The integer values of this enum type are used in the index files to differentiate the indexes...
Definition: index.h:86
virtual long select(const ibis::qContinuousRange &, void *, ibis::bitvector &) const
Evaluate the range condition, select values, and record the positions.
Definition: ikeywords.h:148
A data structure for storing null-terminated text.
Definition: category.h:27
virtual long select(const ibis::qContinuousRange &, void *) const
Evaluate the range condition and select values.
Definition: ikeywords.h:146
virtual double getSum() const
Compute the approximate sum of all the values indexed.
Definition: ikeywords.h:110
int readTDLine(std::istream &in, std::string &key, std::vector< uint32_t > &idlist, char *buf, uint32_t nbuf) const
Read one line from the term-document file.
Definition: ikeywords.cpp:289
A tokenizer class to turn a string buffer into tokens.
Definition: category.h:81
virtual int write(ibis::array_t< double > &, ibis::array_t< int64_t > &, ibis::array_t< uint32_t > &) const
Save index to three arrays. Serialize the index in memory.
Definition: ikeywords.cpp:668
A data structure to represent a sequence of bits.
Definition: bitvector.h:62
int parseTextFile(ibis::text::tokenizer &tkn, const char *f)
Parse the text file to build a keyword index.
Definition: ikeywords.cpp:366
virtual float undecidable(const ibis::qContinuousRange &, ibis::bitvector &iffy) const
This class and its derived classes should produce exact answers, therefore no undecidable rows...
Definition: ikeywords.h:138
const char * delimiters
Delimiters used to separate a string of names.
Definition: util.cpp:71
virtual const char * name() const
Returns the name of the index, similar to the function type, but returns a string instead...
Definition: ikeywords.h:105
keywords(const ibis::column *c, const char *f=0)
Constructor.
Definition: ikeywords.cpp:18
virtual INDEX_TYPE type() const
Returns an index type identifier.
Definition: ikeywords.h:104
void setBits(std::vector< uint32_t > &pos, ibis::bitvector &bvec) const
Turn on the specified positions in a bitvector.
Definition: ikeywords.cpp:354
char readTerm(const char *&buf, std::string &key) const
Extract the term from a line of input term-document file.
Definition: ikeywords.h:177
virtual double estimateCost(const ibis::qContinuousRange &expr) const
Estimate the cost of evaluating a range condition.
Definition: ikeywords.cpp:977
void clear()
Remove the existing content of a bitvector.
Definition: bitvector.cpp:243
A discrete range expression.
Definition: qExpr.h:337
Provide a dual-directional mapping between strings and integers.
Definition: dict-0.h:19
int readTermDocFile(const ibis::column *idcol, const char *f)
Reads a term-document list from an external file.
Definition: ikeywords.cpp:173
ibis::keywords, boolean term-document matrix.
Definition: index.h:130
virtual long evaluate(const ibis::qContinuousRange &expr, ibis::bitvector &hits) const
To evaluate the exact hits.
Definition: ikeywords.cpp:866

Make It A Bit Faster
Contact us
Disclaimers
FastBit source code
FastBit mailing list archive