Class ibis::keywords defines a boolean term-document matrix. More...
#include <ikeywords.h>
Classes | |
class | tokenizer |
A simple tokenizer used to extract keywords. More... | |
Public Member Functions | |
virtual long | append (const char *dt, const char *df, uint32_t nnew) |
Extend the index. | |
virtual void | binBoundaries (std::vector< double > &b) const |
The functions binBoundaries and binWeights return the bin boundaries and the counts of each bin, respectively. More... | |
virtual void | binWeights (std::vector< uint32_t > &b) const |
virtual index * | dup () const |
Duplicate the content of an index object. | |
virtual void | estimate (const ibis::qContinuousRange &expr, ibis::bitvector &lower, ibis::bitvector &upper) const |
Computes an approximation of hits as a pair of lower and upper bounds. More... | |
virtual uint32_t | estimate (const ibis::qContinuousRange &expr) const |
Returns an upper bound on the number of hits. | |
virtual double | estimateCost (const ibis::qContinuousRange &expr) const |
Estimate the cost of evaluating a range condition. | |
virtual double | estimateCost (const ibis::qDiscreteRange &expr) const |
Estimate the cost of evaluating a range condition. | |
virtual long | evaluate (const ibis::qContinuousRange &expr, ibis::bitvector &hits) const |
To evaluate the exact hits. More... | |
virtual double | getMax () const |
The maximum value recorded in the index. | |
virtual double | getMin () const |
The minimum value recorded in the index. | |
virtual double | getSum () const |
Compute the approximate sum of all the values indexed. More... | |
keywords (const ibis::column *c, const char *f=0) | |
Constructor. More... | |
keywords (const ibis::column *c, ibis::text::tokenizer &tkn, const char *f=0) | |
Constructor. More... | |
keywords (const ibis::column *c, ibis::fileManager::storage *st) | |
Constructor. Reconstruct a keyword index from an existing file. | |
virtual const char * | name () const |
Returns the name of the index, similar to the function type, but returns a string instead. More... | |
virtual void | print (std::ostream &out) const |
Prints human readable information. More... | |
virtual int | read (const char *idxfile) |
Reconstructs an index from the named file. More... | |
virtual int | read (ibis::fileManager::storage *st) |
Reconstructs an index from an array of bytes. More... | |
long | search (const char *, ibis::bitvector &) const |
Match a particular keyword. | |
long | search (const char *) const |
Estimate the number of matches. | |
long | search (const std::vector< std::string > &, ibis::bitvector &) const |
Match all given keywords. | |
long | search (const std::vector< std::string > &) const |
Estimate the number of matches. An upper bound. | |
virtual long | select (const ibis::qContinuousRange &, void *) const |
Evaluate the range condition and select values. | |
virtual long | select (const ibis::qContinuousRange &, void *, ibis::bitvector &) const |
Evaluate the range condition, select values, and record the positions. | |
virtual void | serialSizes (uint64_t &, uint64_t &, uint64_t &) const |
Compute the size of arrays that would be generated by the serialization function (write). More... | |
virtual INDEX_TYPE | type () const |
Returns an index type identifier. | |
virtual float | undecidable (const ibis::qContinuousRange &, ibis::bitvector &iffy) const |
This class and its derived classes should produce exact answers, therefore no undecidable rows. More... | |
virtual int | write (ibis::array_t< double > &, ibis::array_t< int64_t > &, ibis::array_t< uint32_t > &) const |
Save index to three arrays. Serialize the index in memory. | |
virtual int | write (const char *dt) const |
Write the boolean term-document matrix as two files, xx.terms for the terms and xx.idx for the bitmaps that mark the positions. More... | |
Public Member Functions inherited from ibis::index | |
void | addBins (uint32_t ib, uint32_t ie, ibis::bitvector &res) const |
Add the sum of bits [ib] through bits [ie-1] to res . More... | |
void | addBins (uint32_t ib, uint32_t ie, ibis::bitvector &res, const ibis::bitvector &tot) const |
Compute the sum of bit vectors [ib , ie ). More... | |
virtual int | contractRange (ibis::qContinuousRange &) const |
bool | empty () const |
The index object is considered empty if there is no bitmap or getNRows returns 0. More... | |
virtual void | estimate (const ibis::qDiscreteRange &expr, ibis::bitvector &lower, ibis::bitvector &upper) const |
Estimate the hits for discrete ranges, i.e., those translated from 'a IN (x, y, ..)'. More... | |
virtual uint32_t | estimate (const ibis::qDiscreteRange &expr) const |
virtual void | estimate (const ibis::index &idx2, const ibis::deprecatedJoin &expr, ibis::bitvector64 &lower, ibis::bitvector64 &upper) const |
Estimate the pairs for the range join operator. | |
virtual void | estimate (const ibis::index &idx2, const ibis::deprecatedJoin &expr, const ibis::bitvector &mask, ibis::bitvector64 &lower, ibis::bitvector64 &upper) const |
Estimate the pairs for the range join operator. More... | |
virtual void | estimate (const ibis::index &idx2, const ibis::deprecatedJoin &expr, const ibis::bitvector &mask, const ibis::qRange *const range1, const ibis::qRange *const range2, ibis::bitvector64 &lower, ibis::bitvector64 &upper) const |
virtual int64_t | estimate (const ibis::index &idx2, const ibis::deprecatedJoin &expr) const |
Estimate an upper bound for the number of pairs. | |
virtual int64_t | estimate (const ibis::index &idx2, const ibis::deprecatedJoin &expr, const ibis::bitvector &mask) const |
Estimate an upper bound for the number of pairs produced from marked records. More... | |
virtual int64_t | estimate (const ibis::index &idx2, const ibis::deprecatedJoin &expr, const ibis::bitvector &mask, const ibis::qRange *const range1, const ibis::qRange *const range2) const |
virtual void | estimate (const ibis::deprecatedJoin &expr, const ibis::bitvector &mask, const ibis::qRange *const range1, const ibis::qRange *const range2, ibis::bitvector64 &lower, ibis::bitvector64 &upper) const |
Evaluating a join condition with one (likely composite) index. | |
virtual int64_t | estimate (const ibis::deprecatedJoin &expr, const ibis::bitvector &mask, const ibis::qRange *const range1, const ibis::qRange *const range2) const |
virtual long | evaluate (const ibis::qDiscreteRange &, ibis::bitvector &) const |
To evaluate the exact hits. More... | |
virtual int | expandRange (ibis::qContinuousRange &) const |
The functions expandRange and contractRange expand or contract the boundaries of a range condition so that the new range will have exact answers using the function estimate. More... | |
virtual const ibis::bitvector * | getBitvector (uint32_t i) const |
Return a pointer to the ith bitvector used in the index (may be 0). | |
virtual long | getCumulativeDistribution (std::vector< double > &bds, std::vector< uint32_t > &cts) const |
Cumulative distribution of the data. More... | |
virtual long | getDistribution (std::vector< double > &bbs, std::vector< uint32_t > &cts) const |
Binned distribution of the data. More... | |
uint32_t | getNRows () const |
Return the number of rows represented by this object. | |
virtual uint32_t | numBitvectors () const |
Returns the number of bit vectors used by the index. | |
float | sizeInBytes () const |
Estimate the size of this index object measured in bytes. More... | |
virtual void | speedTest (std::ostream &) const |
Time some logical operations and print out their speed. | |
void | sumBins (uint32_t ib, uint32_t ie, ibis::bitvector &res) const |
Sum up bits[ib:ie-1] and place the result in res. More... | |
void | sumBins (uint32_t ib, uint32_t ie, ibis::bitvector &res, uint32_t ib0, uint32_t ie0) const |
Compute a new sum for bit vectors [ib, ie) by taking advantage of the old sum for bitvectors [ib0, ie0). More... | |
void | sumBins (uint32_t ib, uint32_t ie, ibis::bitvector &res, uint32_t *buf) const |
Sum up bits[ib:ie-1] and place the result in res. More... | |
void | sumBins (const ibis::array_t< uint32_t > &, ibis::bitvector &) const |
Sum up the bits in the specified bins. | |
virtual float | undecidable (const ibis::qDiscreteRange &expr, ibis::bitvector &iffy) const |
virtual | ~index () |
The destructor. | |
Protected Member Functions | |
void | clear () |
Clear the current content. | |
virtual size_t | getSerialSize () const throw () |
Estimate the size of the .idx file. More... | |
int | parseTextFile (ibis::text::tokenizer &tkn, const char *f) |
Parse the text file to build a keyword index. More... | |
int | readTDLine (std::istream &in, std::string &key, std::vector< uint32_t > &idlist, char *buf, uint32_t nbuf) const |
Read one line from the term-document file. More... | |
char | readTerm (const char *&buf, std::string &key) const |
Extract the term from a line of input term-document file. More... | |
int | readTermDocFile (const ibis::column *idcol, const char *f) |
Reads a term-document list from an external file. More... | |
uint32_t | readUInt (const char *&buf) const |
Extract the next integer in an input line. | |
void | reorderTerms () |
Reorder the terms in the dictionary. More... | |
void | setBits (std::vector< uint32_t > &pos, ibis::bitvector &bvec) const |
Turn on the specified positions in a bitvector. | |
Protected Member Functions inherited from ibis::index | |
virtual void | activate () const |
Regenerate all bitvectors from the underlying storage. More... | |
virtual void | activate (uint32_t i) const |
Regenerate the ith bitvector from the underlying storage. | |
virtual void | activate (uint32_t i, uint32_t j) const |
Regenerate bitvectors i (inclusive) through j (exclusive) from the underlying storage. More... | |
void | computeMinMax (const char *f, double &min, double &max) const |
void | dataFileName (std::string &name, const char *f=0) const |
Generate data file name from "f". More... | |
index (const ibis::column *c=0) | |
Default constructor. More... | |
index (const ibis::column *c, ibis::fileManager::storage *s) | |
Constructor with a storage object. More... | |
index (const index &) | |
Copy constructor. | |
void | indexFileName (std::string &name, const char *f=0) const |
Generates index file name from "f". More... | |
void | initBitmaps (int fdes) |
Prepare the bitmaps using the given file descriptor. More... | |
void | initBitmaps (ibis::fileManager::storage *st) |
Prepare bitmaps from the given storage object. More... | |
void | initBitmaps (uint32_t *st) |
Prepare bitmaps from the given raw pointer. More... | |
void | initBitmaps (void *ctx, FastBitReadBitmaps rd) |
Prepare bitmaps from the user provided function pointer and context. More... | |
int | initOffsets (int64_t *, size_t) |
Initialize the offsets from the given data array. More... | |
int | initOffsets (int fdes, const char offsize, size_t start, uint32_t nobs) |
Read in the offset array. More... | |
int | initOffsets (ibis::fileManager::storage *st, size_t start, uint32_t nobs) |
Regenerate the offsets array from the given storage object. More... | |
void | mapValues (const char *f, VMap &bmap) const |
Map the positions of each individual value. More... | |
void | mapValues (const char *f, histogram &hist, uint32_t count=0) const |
Generate a histogram. More... | |
index & | operator= (const index &) |
Assignment operator. | |
void | optionalUnpack (array_t< ibis::bitvector * > &bits, const char *opt) |
A function to decide whether to uncompress the bitvectors. More... | |
Protected Attributes | |
ibis::dictionary | terms |
A dictionary for the terms. | |
Protected Attributes inherited from ibis::index | |
array_t< ibis::bitvector * > | bits |
A list of bitvectors. | |
bitmapReader * | breader |
The functor to read serialized bitmaps from a more complex source. | |
const ibis::column * | col |
Pointer to the column this index is for. | |
const char * | fname |
The name of the file containing the index. | |
uint32_t | nrows |
The number of rows represented by the index. More... | |
array_t< int32_t > | offset32 |
Starting positions of the bitvectors. | |
array_t< int64_t > | offset64 |
Starting positions of the bitvectors. More... | |
ibis::fileManager::storage * | str |
The underlying storage. More... | |
Additional Inherited Members | |
Public Types inherited from ibis::index | |
typedef std::map< double, uint32_t > | histogram |
enum | INDEX_TYPE { BINNING =0, RANGE, MESA, AMBIT, PALE, PACK, ZONE, RELIC, ROSTER, SKIVE, FADE, SBIAD, SAPID, EGALE, MOINS, ENTRE, BAK, BAK2, KEYWORDS, MESH, BAND, DIREKTE, GENERIC, BYLT, FUZZ, ZONA, FUGE, SLICE, EXTERN } |
The integer values of this enum type are used in the index files to differentiate the indexes. More... | |
typedef std::map< double, ibis::bitvector * > | VMap |
Static Public Member Functions inherited from ibis::index | |
static void | addBits (const array_t< bitvector * > &bits, uint32_t ib, uint32_t ie, ibis::bitvector &res) |
Add the pile [ib:ie-1] to res . More... | |
static index * | create (const column *c, const char *name=0, const char *spec=0, int inEntirety=0) |
Index factory. More... | |
static void | divideCounts (array_t< uint32_t > &bounds, const array_t< uint32_t > &cnt) |
Determine how to split the array cnt , so that each group has roughly the same total value. More... | |
static bool | isIndex (const char *f, INDEX_TYPE t) |
Is the named file an index file? Read the header of the named file to determine if it contains an index of the specified type. More... | |
template<typename E > | |
static void | mapValues (const array_t< E > &val, VMap &bmap) |
template<typename E > | |
static void | mapValues (const array_t< E > &val, histogram &hist, uint32_t count=0) |
template<typename E > | |
static void | mapValues (const array_t< E > &val, array_t< E > &bounds, std::vector< uint32_t > &cnts) |
template<typename E1 , typename E2 > | |
static void | mapValues (const array_t< E1 > &val1, const array_t< E2 > &val2, array_t< E1 > &bnd1, array_t< E2 > &bnd2, std::vector< uint32_t > &cnts) |
Compute a two-dimensional histogram. More... | |
static void | printHeader (std::ostream &, const char *) |
static void | setBases (array_t< uint32_t > &bases, uint32_t card, uint32_t nbase=2) |
Fill the array bases with the values that cover the range [0, card). More... | |
static void | sumBits (const array_t< bitvector * > &bits, uint32_t ib, uint32_t ie, ibis::bitvector &res) |
Sum up pile [ib:ie-1] and place the result in res . More... | |
static void | sumBits (const array_t< bitvector * > &bits, const ibis::bitvector &tot, uint32_t ib, uint32_t ie, ibis::bitvector &res) |
Sum up pile [ib:ie-1] and add the result to res . More... | |
Static Protected Member Functions inherited from ibis::index | |
static void | indexFileName (std::string &name, const ibis::column *col1, const ibis::column *col2, const char *f=0) |
Generate the index file name for the composite index formed on two columns. More... | |
Class ibis::keywords defines a boolean term-document matrix.
The terms are stored in an ibis::dictionary and the columns of the matrix are stored in a series of bitvectors. The name term-document matrix is borrowed from literature about indexing documents. In this context, a document is a row of the text column and each document ID is either stored in another column of unsigned integers or simply the ordinal number of the row.
The current implementation can either read a term-document list or parse the binary string values with a list of delimiters to extract the key words. It first checks for the presence of a term-document list which can be explicitly or implicitly specified. Here are the options.
Note that the filename given above can be either a fully qualified name or a name in the same directory as the data file.
If a term-document list is provided, the document id used in the list may be specified explicitly through docIdName either in the index specification or in a configuration file. An example of index specification is as follows
In a configuration file, the syntax for specifying a docIdName is as follows.
For example,
If an ID column is not specified, the integer IDs in the .tdlist file are assumed to be the row numbers.
If the term-document list is not specified, one may specify a list of delimiters for the tokenizer to parse the text values. The list of delimiters can be specified in either the index option or through a configuration file. Here is an example indexing option
The following is an example line in a configuration file (say, ibis.rc)
This particular choice is suitable for indexing set-valued columns, where the values are stored as comma-separated ASCII text strings. For example, in a data set about automobiles from different manufacturers, the column about color choices might have the following values:
/// "model A", "red, white, tan", ... /// "model B", "black, maroon, silver", ... /// "model C", "red, light blue, sky blue, white, pink", ... ///
In this case, the second column about color choices could be read in as TEXT and indexed with a keywords index using a comma as the delimiter for the built-in tokenizer.
There are two different ways of building a keyword index and they can each be specified explicitly or implicitly. The precedence is as follows: an explicitly specified option takes precedence over an implicitly specified option, and a term-document list takes precedence over the built-in parser.
|
explicit |
Constructor.
It first tries to read the terms (.terms) and the tdmat (.idx) files if they both exist. If that fails, it will attempt to build an index using the externally provided term-document list or by parsing the text with a specified list of delimiters.
References ibis::index::bits, ibis::CATEGORY, clear(), ibis::index::clear(), ibis::index::col, ibis::index::dataFileName(), ibis::util::getFileSize(), ibis::INT, ibis::index::optionalUnpack(), parseTextFile(), print(), read(), readTermDocFile(), reorderTerms(), ibis::dictionary::size(), terms, ibis::TEXT, ibis::column::type(), and ibis::UINT.
ibis::keywords::keywords | ( | const ibis::column * | c, |
ibis::text::tokenizer & | tkn, | ||
const char * | f = 0 |
||
) |
Constructor.
Construct a new keyword index using the user-provided tokenizer. The tokenizer must be derived from ibis::text::tokenizer.
References ibis::index::bits, ibis::index::col, ibis::index::optionalUnpack(), parseTextFile(), print(), and reorderTerms().
|
inlinevirtual |
The functions binBoundaries and binWeights return the bin boundaries and the counts of each bin, respectively.
Reimplemented from ibis::index.
|
virtual |
Computes an approximation of hits as a pair of lower and upper bounds.
expr | the query expression to be evaluated. |
lower | a bitvector marking a subset of the hits. All rows marked with one (1) are definitely hits. |
upper | a bitvector marking a superset of the hits. All hits are marked with one, but some of the rows marked one may not be hits. If the variable upper is empty, the variable lower is assumed to contain the exact answer. |
Reimplemented from ibis::index.
References ibis::bitvector::set().
|
virtual |
To evaluate the exact hits.
On success, return the number of hits, otherwise a negative value is returned.
Implements ibis::index.
|
protectedvirtual |
Estimate the size of the .idx file.
The .idx file contains only the bitmaps without the actual terms. The bitmap offsets are assumed to be 8-byte long.
Reimplemented from ibis::index.
|
inlinevirtual |
Compute the approximate sum of all the values indexed.
If it decides that computing the sum directly from the vertical partition is more efficient, it will return NaN immediately.
Reimplemented from ibis::index.
|
inlinevirtual |
Returns the name of the index, similar to the function type, but returns a string instead.
Implements ibis::index.
|
protected |
Parse the text file to build a keyword index.
This function is called by the constructor of the class to build a new keyword index.
References ibis::fileManager::buffer< T >::address(), ibis::util::clear(), ibis::fileManager::buffer< T >::resize(), ibis::bitvector::setBit(), ibis::fileManager::buffer< T >::size(), and UnixOpen.
Referenced by keywords().
|
virtual |
Prints human readable information.
Outputs information about the index as text to the specified output stream.
Implements ibis::index.
References ibis::util::compactValue().
Referenced by keywords().
|
virtual |
Reconstructs an index from the named file.
The name can be the directory containing an index file. In this case, the name of the index file must be the name of the column followed by ".idx" suffix.
Implements ibis::index.
References ibis::index::clear(), ibis::fileManager::instance(), ibis::index::KEYWORDS, ibis::fileManager::recordPages(), ibis::util::strnewdup(), and UnixOpen.
Referenced by keywords().
|
virtual |
Reconstructs an index from an array of bytes.
Intended for internal use only!
Implements ibis::index.
References ibis::fileManager::storage::begin(), ibis::util::clear(), and ibis::index::KEYWORDS.
|
protected |
Read one line from the term-document file.
The caller has already opened the file. This function reads one line from the input stream and extracts the keyword and the list of ids.
References ibis::util::readUInt().
|
inlineprotected |
Extract the term from a line of input term-document file.
A keyword is any number of printable characters. Returns the first non-space character following the keyword, which should be the delimiter ':'. Consecutive spaces in the keyword are replaced with a single plain space character.
|
protected |
Reads a term-document list from an external file.
Returns the number of terms found if successful, otherwise returns a negative number to indicate error.
References ibis::fileManager::buffer< T >::address(), ibis::bitvector::adjustSize(), ibis::bitvector::cnt(), ibis::roster::locate(), ibis::column::name(), ibis::fileManager::buffer< T >::size(), and ibis::util::strnewdup().
Referenced by keywords().
|
protected |
Reorder the terms in the dictionary.
Afterwards, the terms will be ordered alphabetically in the dictionary.
References ibis::array_t< T >::size(), ibis::array_t< T >::sort(), and ibis::array_t< T >::swap().
Referenced by keywords().
|
virtual |
Compute the size of arrays that would be generated by the serialization function (write).
Implements ibis::index.
|
inlinevirtual |
This class and its derived classes should produce exact answers, therefore no undecidable rows.
Reimplemented from ibis::index.
References ibis::bitvector::clear().
|
virtual |
Write the boolean term-document matrix as two files, xx.terms for the terms and xx.idx for the bitmaps that mark the positions.
Implements ibis::index.
References ibis::fileManager::flushFile(), ibis::fileManager::instance(), ibis::util::flock::isLocked(), ibis::index::KEYWORDS, and UnixOpen.