ibis::column Class Reference

The class to represent a column of a data partition. More...

#include <column.h>

Inheritance diagram for ibis::column:

ibis::bord::column ibis::text ibis::category

List of all members.

Public Member Functions

virtual long append (const char *dt, const char *df, const uint32_t nold, const uint32_t nnew, const uint32_t nbuf, char *buf)
 Append new data in directory df to the end of existing data in dt.
void binWeights (std::vector< uint32_t > &) const
 Retrieve the number of rows in each bin.
 column (const column &rhs)
 copy constructor
 column (const part *tbl, ibis::TYPE_T t, const char *name, const char *desc="", double low=DBL_MAX, double high=-DBL_MAX)
 Construct a new column of specified type.
 column (const part *tbl, FILE *file)
 Reconstitute a column from the content of a file.
virtual void computeMinMax (const char *dir, double &min, double &max) const
 Compute the actual min/max of the data in directory dir.
virtual void computeMinMax (const char *dir)
virtual void computeMinMax ()
 Compute the actual min/max values by actually going through all the values.
int contractRange (ibis::qContinuousRange &rng) const
const char * dataFileName (std::string &fname, const char *dir=0) const
 Name of the data file in the given data directory.
void description (const char *d)
const char * description () const
int elementSize () const
virtual double estimateCost (const ibis::qMultiString &cmp) const
virtual double estimateCost (const ibis::qString &cmp) const
virtual double estimateCost (const ibis::qDiscreteRange &cmp) const
virtual double estimateCost (const ibis::qContinuousRange &cmp) const
 Estimate the cost of evaluating the query expression.
virtual long estimateRange (const ibis::qDiscreteRange &cmp) const
virtual long estimateRange (const ibis::qContinuousRange &cmp) const
 Use an index to compute an upper bound on the number of hits.
virtual long estimateRange (const ibis::qDiscreteRange &cmp, ibis::bitvector &low, ibis::bitvector &high) const
virtual long estimateRange (const ibis::qContinuousRange &cmp, ibis::bitvector &low, ibis::bitvector &high) const
 Compute a lower bound and an upper bound on the number of hits using the bitmap index.
virtual long evaluateRange (const ibis::qDiscreteRange &cmp, const ibis::bitvector &mask, ibis::bitvector &res) const
virtual long evaluateRange (const ibis::qContinuousRange &cmp, const ibis::bitvector &mask, ibis::bitvector &res) const
 Attempt to compute the exact answer.
int expandRange (ibis::qContinuousRange &rng) const
 expand/contract range condition so that the new ranges fall exactly on the bin boundaries
virtual const char * findString (const char *str) const
 Determine if the input string is one of the records.
virtual double getActualMax () const
 Compute the actual maximum value by reading the data or examining the index.
virtual double getActualMin () const
 Compute the actual minimum value by reading the data or examining the index.
long getCumulativeDistribution (std::vector< double > &bounds, std::vector< uint32_t > &counts) const
 Compute the actual data distribution.
long getDistribution (std::vector< double > &bbs, std::vector< uint32_t > &counts) const
 Count the number of records in each bin.
array_t< double > * getDoubleArray () const
array_t< float > * getFloatArray () const
array_t< int32_t > * getIntArray () const
 Return all rows of the column as an array_t object.
void getNullMask (bitvector &mask) const
template<typename T>
int getRawData (array_t< T > &vals) const
ibis::fileManager::storage * getRawData () const
virtual void getString (uint32_t i, std::string &val) const
 Return the string value for the ith row.
virtual double getSum () const
 Compute the sum of all values by reading the data.
virtual float getUndecidable (const ibis::qDiscreteRange &cmp, ibis::bitvector &iffy) const
virtual float getUndecidable (const ibis::qContinuousRange &cmp, ibis::bitvector &iffy) const
 Compute the locations of the rows that cannot be decided by the index.
virtual long indexSize () const
 Compute the index size (in bytes).
void indexSpec (const char *spec)
 Set the index specification.
const char * indexSpec () const
 Retrieve the index specification.
void indexSpeedTest () const
 Perform a set of built-in tests to determine the speed of common operations.
bool isFloat () const
bool isInteger () const
bool isNumeric () const
virtual void loadIndex (const char *opt=0) const throw ()
 Load the index associated with the column.
void logMessage (const char *event, const char *fmt,...) const
void logWarning (const char *event, const char *fmt,...) const
void lowerBound (double d)
const double & lowerBound () const
const char * name () const
const char * nullMaskName (std::string &fname) const
 Name of the NULL mask file.
uint32_t numBins () const
 Retrieve the number of bins used.
const part * partition () const
void preferredBounds (std::vector< double > &) const
 Retrieve the bin boundaries of the index currently in use.
virtual void print (std::ostream &out) const
void purgeIndexFile (const char *dir=0) const
 Purge the index files associated with the current column.
virtual long saveSelected (const ibis::bitvector &sel, const char *dest, char *buf, uint32_t nbuf)
 Save only the rows marked 1. Replace the data file in dest.
virtual array_t< char > * selectBytes (const bitvector &mask) const
 Return selected rows of the column as an array_t object.
virtual array_t< double > * selectDoubles (const bitvector &mask) const
 Put the selected values into an array as doubles.
virtual array_t< float > * selectFloats (const bitvector &mask) const
 Put selected values of a float column into an array.
virtual array_t< int32_t > * selectInts (const bitvector &mask) const
virtual array_t< int64_t > * selectLongs (const bitvector &mask) const
 Can be called on all integral types.
virtual array_t< int16_t > * selectShorts (const bitvector &mask) const
 Can convert all integers 2-byte or less in length.
virtual std::vector
< std::string > * 
selectStrings (const bitvector &mask) const
virtual array_t< unsigned char > * selectUBytes (const bitvector &mask) const
virtual array_t< uint32_t > * selectUInts (const bitvector &mask) const
 Can be called on columns of unsigned integral types, UINT, CATEGORY, USHORT, and UBYTE.
virtual array_t< uint64_t > * selectULongs (const bitvector &mask) const
 Can be called on all unsigned integral types.
virtual array_t< uint16_t > * selectUShorts (const bitvector &mask) const
template<typename T>
long selectValues (const bitvector &mask, array_t< T > &vals, array_t< uint32_t > &inds) const
 Select the values marked in the bitvector mask.
template<typename T>
long selectValues (const bitvector &mask, array_t< T > &vals) const
 Select the values marked in the bitvector mask.
long truncateData (const char *dir, uint32_t nent, ibis::bitvector &mask) const
 truncate the number of data entries in the named dir to nent.
ibis::TYPE_T type () const
void unloadIndex () const
 Unload the index associated with the column.
void upperBound (double d)
const double & upperBound () const
virtual void write (FILE *file) const
 Write the current content to the TDC file.
virtual long writeData (const char *dir, uint32_t nold, uint32_t nnew, ibis::bitvector &mask, const void *va1, const void *va2=0)
 Record the content in array va1 to directory dir. Extend the mask.

Protected Member Functions

template<typename T>
void actualMinMax (const array_t< T > &vals, const ibis::bitvector &mask, double &min, double &max) const
void actualMinMax (const char *fname, const ibis::bitvector &mask, double &min, double &max) const
 Given the name of the data file, compute the actual minimum and the maximum value.
template<typename T>
computeMax (const array_t< T > &vals, const ibis::bitvector &mask) const
double computeMax () const
 Read the base data to compute the maximum value.
template<typename T>
computeMin (const array_t< T > &vals, const ibis::bitvector &mask) const
double computeMin () const
 Read the data values and compute the minimum value.
template<typename T>
double computeSum (const array_t< T > &vals, const ibis::bitvector &mask) const
double computeSum () const
 Read the base data to compute the total sum.
void logError (const char *event, const char *fmt,...) const
 Print messages starting with "Error" and throw a string exception.
long string2int (int fptr, dictionary &dic, uint32_t nbuf, char *buf, array_t< uint32_t > &out) const
 Convert strings in the opened file to a list of integers with the aid of a dictionary.

Protected Attributes

ibis::index * idx
 The index for this column. It is not considered a must-have member.
double lower
std::string m_bins
std::string m_desc
std::string m_name
ibis::TYPE_T m_type
ibis::bitvector mask_
const part * thePart
double upper

Friends

class indexLock
class mutexLock
class readLock
class softWriteLock
class writeLock

Classes

class  indexLock
 A class for controlling access of the index object of a column. More...
class  info
 Some basic information about a column. More...
class  mutexLock
 Provide a mutual exclusion lock on an ibis::column. More...
class  readLock
 Provide a read lock on an ibis::column object. More...
class  softWriteLock
 Provide a write lock on an ibis::column object. More...
class  writeLock
 Provide a write lock on an ibis::column object. More...


Detailed Description

The class to represent a column of a data partition.

FastBit represents user data as tables (each table may be divided into multiple partitions) where each table consists of a number of columns. Internally, the data values for each column are stored separately from the others. In relational algebra terms, this is equivalent to projecting out each attribute of a relation separately. It increases the efficiency of searching on a relatively small number of attributes compared to the horizontal data organization used in typical relational database systems.


Constructor & Destructor Documentation

ibis::column::column ( const part *  tbl,
FILE *  file 
)

Reconstitute a column from the content of a file.

Read the basic information about a column from file.

Note:
Assume the calling program has read "Begin Property/Column" already.

A well-formed column must have a valid name, i.e., ! m_name.empty().

References ibis::BYTE, ibis::CATEGORY, ibis::DOUBLE, ibis::FLOAT, ibis::gVerbose, ibis::INT, ibis::LONG, ibis::SHORT, ibis::TEXT, ibis::UBYTE, ibis::UINT, ibis::ULONG, and ibis::USHORT.

ibis::column::column ( const part *  tbl,
ibis::TYPE_T  t,
const char *  name,
const char *  desc = "",
double  low = DBL_MAX,
double  high = -DBL_MAX 
)

Construct a new column of specified type.

Construct a new column object based on type and name.

ibis::column::column ( const column &  rhs  ) 

copy constructor

The copy constructor.

Note:
The rwlock can not be copied.

The index is not copied either because of reference counting difficulties.


Member Function Documentation

void ibis::column::actualMinMax ( const char *  name,
const ibis::bitvector &  mask,
double &  min,
double &  max 
) const [protected]

Given the name of the data file, compute the actual minimum and the maximum value.

Compute the actual minimum and maximum values.

Given a data file name, read its content to compute the actual minimum and the maximum of the data values. Only deal with four types of values, unsigned int, signed int, float and double.

References ibis::BYTE, ibis::DOUBLE, ibis::FLOAT, ibis::fileManager::getFile(), ibis::gVerbose, ibis::fileManager::instance(), ibis::INT, ibis::LONG, ibis::part::nRows(), ibis::SHORT, ibis::TYPESTRING, ibis::UBYTE, ibis::UINT, ibis::ULONG, and ibis::USHORT.

Referenced by computeMinMax().

long ibis::column::append ( const char *  dt,
const char *  df,
const uint32_t  nold,
const uint32_t  nnew,
const uint32_t  nbuf,
char *  buf 
) [virtual]

Append new data in directory df to the end of existing data in dt.

Append the content of file in df to end of file in dt.

Note:
Since this function does not compute the minimum and the maximum of the new values, it is important that the minimum and the maximum are present in the corresponding table.tdc file. For new data without minimum and maximum, some test functions may fail.

Reimplemented in ibis::text, and ibis::category.

References ibis::bitvector::adjustSize(), ibis::index::append(), ibis::util::logger::buffer(), ibis::bitvector::cnt(), ibis::index::create(), ibis::part::currentDataDir(), ibis::fileManager::flushFile(), ibis::index::getNRows(), ibis::part::getState(), ibis::gVerbose, ibis::fileManager::instance(), ibis::part::nRows(), ibis::OID, ibis::index::print(), ibis::bitvector::read(), ibis::bitvector::size(), ibis::part::timestamp(), ibis::index::write(), and ibis::bitvector::write().

Referenced by ibis::part::appendToBackup().

void ibis::column::computeMinMax ( const char *  dir,
double &  min,
double &  max 
) const [virtual]

Compute the actual min/max of the data in directory dir.

Go through the values in data directory dir and compute the actual min and max.

Report the actual min/max found back through output arguments min and max.

Reimplemented in ibis::bord::column.

References actualMinMax(), ibis::part::currentDataDir(), and dataFileName().

void ibis::column::computeMinMax (  )  [virtual]

Compute the actual min/max values by actually going through all the values.

This function reads the data in the active data directory and modifies the member variables to record the actual min/max.

Reimplemented in ibis::bord::column.

References actualMinMax(), ibis::part::currentDataDir(), and dataFileName().

Referenced by ibis::index::create().

const char * ibis::column::dataFileName ( std::string &  fname,
const char *  dir = 0 
) const

Name of the data file in the given data directory.

In the normal case, the pointer returned is fname.c_str(), so there is no need for the caller to free this pointer.

If the directory name is not given, the directory is assumed to be the current data directory of the data partition.

In case of error, it returns a nil pointer.

References ibis::part::currentDataDir().

Referenced by computeMax(), computeMin(), computeMinMax(), computeSum(), getDoubleArray(), getFloatArray(), getIntArray(), indexSize(), selectBytes(), selectDoubles(), selectFloats(), selectLongs(), selectShorts(), selectUInts(), selectULongs(), and selectValues().

long ibis::column::estimateRange ( const ibis::qContinuousRange &  cmp  )  const [virtual]

Use an index to compute an upper bound on the number of hits.

If no index can be computed, it will return the number of rows as the upper bound.

References ibis::index::estimate(), idx, ibis::part::nRows(), and unloadIndex().

long ibis::column::estimateRange ( const ibis::qContinuousRange &  cmp,
ibis::bitvector &  low,
ibis::bitvector &  high 
) const [virtual]

Compute a lower bound and an upper bound on the number of hits using the bitmap index.

If no index is available a new one will be built. If no index can be built, the lower bound will contain nothing and the upper bound will contain everything. The two bounds are returned as bitmaps which mark the qualified rows as one, where the lower bound is stored in 'low' and the upper bound is stored in 'high'. If the bitvector 'high' has fewer bits than 'low', the bitvector 'low' is assumed to have an exact solution. This function always returns zero (0).

References ibis::bitvector::adjustSize(), ibis::bitvector::cnt(), ibis::bitvector::copy(), ibis::index::estimate(), ibis::gVerbose, idx, ibis::part::name(), ibis::part::nRows(), ibis::bitvector::set(), ibis::bitvector::size(), and unloadIndex().

long ibis::column::evaluateRange ( const ibis::qContinuousRange &  cmp,
const ibis::bitvector &  mask,
ibis::bitvector &  res 
) const [virtual]

Attempt to compute the exact answer.

If successful, return the number of hits, otherwise return a negative value.

Reimplemented in ibis::bord::column.

References ibis::bitvector::adjustSize(), ibis::bitvector::clear(), ibis::bitvector::cnt(), ibis::bitvector::copy(), ibis::part::doScan(), ibis::index::estimate(), ibis::gVerbose, idx, ibis::part::name(), ibis::bitvector::size(), and unloadIndex().

virtual const char* ibis::column::findString ( const char *  str  )  const [inline, virtual]

Determine if the input string is one of the records.

If yes, return the pointer to the incoming string, otherwise return nil.

Reimplemented in ibis::text.

double ibis::column::getActualMax (  )  const [virtual]

Compute the actual maximum value by reading the data or examining the index.

It returns -DBL_MAX in case of error.

References computeMax().

Referenced by ibis::query::addJoinConstraints(), ibis::part::coarsenBins(), ibis::part::get2DDistributionD(), and ibis::part::get2DDistributionI().

double ibis::column::getActualMin (  )  const [virtual]

Compute the actual minimum value by reading the data or examining the index.

It returns DBL_MAX in case of error.

References computeMin().

Referenced by ibis::query::addJoinConstraints(), ibis::part::get2DDistributionD(), and ibis::part::get2DDistributionI().

long ibis::column::getCumulativeDistribution ( std::vector< double > &  bounds,
std::vector< uint32_t > &  counts 
) const

Compute the actual data distribution.

It will generate an index for the column if one is not already available. The value in counts[i] is the number of values less than bounds[i]. If there are no NULL values in the column, the array counts will start with 0 and end with the number of rows in the data. The array bounds will end with a value that is greater than the actual maximum value.

long ibis::column::getDistribution ( std::vector< double > &  bbs,
std::vector< uint32_t > &  counts 
) const

Count the number of records in each bin.

The array bbs contains bin boundaries that define the following bins:

    (..., bbs[0]) [bbs[0], bbs[1]) ... [bbs.back(), ...).
Because of the two open bins at the end, N bin boundaries define N+1 bins. The array counts has one more element than bbs. This function returns the number of bins. If this function was executed successfully, the return value should be the same as the size of array counts, and one larger than the size of array bbs.

array_t< double > * ibis::column::getDoubleArray (  )  const

array_t< float > * ibis::column::getFloatArray (  )  const

array_t< int32_t > * ibis::column::getIntArray (  )  const

Return all rows of the column as an array_t object.

Caller is responsible for deleting the returned object.

References dataFileName(), ibis::fileManager::getFile(), ibis::fileManager::instance(), ibis::INT, and ibis::UINT.

virtual void ibis::column::getString ( uint32_t  i,
std::string &  val 
) const [inline, virtual]

Return the string value for the ith row.

Only valid for ibis::text and ibis::category.

Reimplemented in ibis::text, and ibis::category.

Referenced by ibis::mensa::getColumnAsStrings().

float ibis::column::getUndecidable ( const ibis::qContinuousRange &  cmp,
ibis::bitvector &  iffy 
) const [virtual]

Compute the locations of the rows that cannot be decided by the index.

Returns the fraction of rows that might satisfy the specified range condition.

References idx, ibis::index::undecidable(), and unloadIndex().

long ibis::column::indexSize (  )  const [virtual]

Compute the index size (in bytes).

Return a negative value if the index file does not exist.

References dataFileName().

Referenced by ibis::part::get2DDistribution().

void ibis::column::indexSpeedTest (  )  const

Perform a set of built-in tests to determine the speed of common operations.

References ibis::util::logger::buffer(), idx, and ibis::index::speedTest().

void ibis::column::loadIndex ( const char *  opt = 0  )  const throw () [virtual]

Load the index associated with the column.

Note:
Only the meta data about the index is loaded into memory. Bitmaps associated with the index are read into memory as needed.

References ibis::util::logger::buffer(), ibis::index::create(), ibis::part::currentDataDir(), ibis::index::getNRows(), ibis::gParameters(), ibis::gVerbose, idx, ibis::part::name(), ibis::part::nRows(), ibis::index::print(), purgeIndexFile(), and ibis::part::updateTDC().

Referenced by fastbit_build_index().

const char * ibis::column::nullMaskName ( std::string &  fname  )  const

Name of the NULL mask file.

On successful completion of this function, the return value is the result of fname.c_str(), otherwise a nil pointer is returned to indicate error.

References ibis::part::currentDataDir().

uint32_t ibis::column::numBins (  )  const

Retrieve the number of bins used.

References ibis::part::indexSpec().

array_t< char > * ibis::column::selectBytes ( const bitvector &  mask  )  const [virtual]

array_t< double > * ibis::column::selectDoubles ( const bitvector &  mask  )  const [virtual]

array_t< int64_t > * ibis::column::selectLongs ( const bitvector &  mask  )  const [virtual]

array_t< int16_t > * ibis::column::selectShorts ( const bitvector &  mask  )  const [virtual]

array_t< uint32_t > * ibis::column::selectUInts ( const bitvector &  mask  )  const [virtual]

template<typename T>
long ibis::column::selectValues ( const bitvector &  mask,
array_t< T > &  vals,
array_t< uint32_t > &  inds 
) const [inline]

Select the values marked in the bitvector mask.

Select all values marked 1 in the mask and pack them into the output array vals and fill the array inds with the positions of the values selected. On a successful execution, it returns the number of values selected. If it returns zero (0), the contents of vals and inds are not modified. If it returns a negative number, the contents of arrays vals and inds are not guaranteed to be in any particular state.

References ibis::part::accessHint(), array_t< T >::begin(), ibis::bitvector::bytes(), ibis::bitvector::cnt(), dataFileName(), ibis::bitvector::firstIndexSet(), ibis::fileManager::getFile(), ibis::gVerbose, ibis::fileManager::instance(), ibis::part::name(), ibis::part::nRows(), ibis::fileManager::pageSize(), array_t< T >::push_back(), ibis::fileManager::recordPages(), array_t< T >::reserve(), array_t< T >::resize(), array_t< T >::size(), ibis::bitvector::size(), and ibis::fileManager::tryGetFile().

template<typename T>
long ibis::column::selectValues ( const bitvector &  mask,
array_t< T > &  vals 
) const [inline]

Select the values marked in the bitvector mask.

Select all values marked 1 in the mask and pack them into the output array vals