A data structure for storing null-terminated text. More...
#include <category.h>
Classes | |
struct | tokenizer |
A tokenizer class to turn a string buffer into tokens. More... | |
Public Member Functions | |
virtual long | append (const char *dt, const char *df, const uint32_t nold, const uint32_t nnew, uint32_t nbuf, char *buf) |
Append the data file stored in directory df to the corresponding data file in directory dt . | |
virtual long | append (const void *, const ibis::bitvector &) |
Append the records in vals to the current working dataset. | |
void | delimitersForKeywordIndex (std::string &) const |
virtual double | estimateCost (const ibis::qString &cmp) const |
Estimate the cost of evaluating a string lookup. | |
virtual double | estimateCost (const ibis::qMultiString &cmp) const |
Estimate the cost of looking up a group of strings. | |
virtual const char * | findString (const char *str) const |
If the input string is found in the data file, it is returned, else this function returns 0. | |
virtual void | getString (uint32_t i, std::string &val) const |
Return the string value for the i-th row. | |
const column * | IDColumnForKeywordIndex () const |
Locate the ID column for processing term-document list provided by the user. | |
virtual long | keywordSearch (const char *str, ibis::bitvector &hits) const |
virtual long | keywordSearch (const char *str) const |
virtual void | loadIndex (const char *iopt=0, int ropt=0) const throw () |
Load the index associated with the column. | |
virtual long | patternSearch (const char *, ibis::bitvector &) const |
virtual long | patternSearch (const char *) const |
virtual void | print (std::ostream &out) const |
Print header info. | |
virtual long | saveSelected (const ibis::bitvector &sel, const char *dest, char *buf, uint32_t nbuf) |
Write the selected values to the specified directory. | |
virtual array_t< int64_t > * | selectLongs (const bitvector &mask) const |
Return the starting positions of strings marked 1 in the mask. | |
virtual std::vector < std::string > * | selectStrings (const bitvector &mask) const |
Retrieve the string values from the rows marked 1 in mask. | |
virtual array_t< uint32_t > * | selectUInts (const bitvector &mask) const |
Return the positions of records marked 1 in the mask. | |
virtual long | stringSearch (const char *str, ibis::bitvector &hits) const |
Given a string literal, return a bitvector that marks the strings that match it. | |
virtual long | stringSearch (const std::vector< std::string > &strs, ibis::bitvector &hits) const |
Given a group of string literals, return a bitvector that marks the strings matching any one of the input strings. | |
virtual long | stringSearch (const char *str) const |
virtual long | stringSearch (const std::vector< std::string > &strs) const |
void | TDListForKeywordIndex (std::string &) const |
text (const part *tbl, FILE *file) | |
text (const part *tbl, const char *name, ibis::TYPE_T t=ibis::TEXT) | |
Construct a text object for a data partition with the given name. | |
text (const ibis::column &col) | |
Copy constructor. Copy from a column with TEXT type. | |
virtual void | write (FILE *file) const |
Write the metadata entry. | |
Protected Member Functions | |
void | readString (uint32_t i, std::string &val) const |
Read the string value of the i-th row. | |
int | readString (std::string &, int, long, long, char *, uint32_t, uint32_t &, off_t &) const |
Read one string from an open file. | |
void | startPositions (const char *dir, char *buf, uint32_t nbuf) const |
Locate the starting position of each string. | |
int | writeStrings (const char *to, const char *from, const char *spto, const char *spfrom, ibis::bitvector &msk, const ibis::bitvector &sel, char *buf, uint32_t nbuf) const |
Write the selected strings. |
A data structure for storing null-terminated text.
The only type of search supported on this type of data is keyword search. The keyword search operation is implemented through a boolean term-document matrix (ibis::keywords) that has to be generated externally.
long ibis::text::append | ( | const char * | dt, |
const char * | df, | ||
const uint32_t | nold, | ||
const uint32_t | nnew, | ||
uint32_t | nbuf, | ||
char * | buf | ||
) | [virtual] |
Append the data file stored in directory df to the corresponding data file in directory dt.
Use the buffer buf to copy data in large chunks.
Reimplemented from ibis::column.
Reimplemented in ibis::category.
References FASTBIT_DIRSEP, ibis::gVerbose, and UnixOpen.
virtual long ibis::text::append | ( | const void * | vals, |
const ibis::bitvector & | msk | ||
) | [inline, virtual] |
Append the records in vals to the current working dataset.
The 'void*' in this function follows the convention of the function getValuesArray (not writeData), i.e., for the ten fixed-size elementary data types, it is array_t<type>* and for string-valued columns it is std::vector<std::string>*.
Return the number of entries actually written to disk or a negative number to indicate error conditions.
Reimplemented from ibis::column.
Reimplemented in ibis::category.
const char * ibis::text::findString | ( | const char * | str | ) | const [virtual] |
If the input string is found in the data file, it is returned, else this function returns 0.
It needs to keep both the data file and the starting position file open at the same time.
Reimplemented from ibis::column.
References ibis::fileManager::buffer< T >::address(), FASTBIT_DIRSEP, ibis::gVerbose, ibis::fileManager::instance(), ibis::fileManager::recordPages(), and ibis::fileManager::buffer< T >::size().
virtual void ibis::text::getString | ( | uint32_t | , |
std::string & | |||
) | const [inline, virtual] |
Return the string value for the i-th row.
Only implemented for ibis::text and ibis::category.
Reimplemented from ibis::column.
Reimplemented in ibis::category.
References readString().
Referenced by ibis::mensa::cursor::dumpIJ().
const ibis::column * ibis::text::IDColumnForKeywordIndex | ( | ) | const |
Locate the ID column for processing term-document list provided by the user.
This function checks indexSpec first for docIDName=xx for the name of the ID column, then checks the global parameter <table-name>.<column-name>.docIDName.
References ibis::util::getString(), ibis::gParameters(), and ibis::column::name().
void ibis::text::loadIndex | ( | const char * | iopt = 0 , |
int | ropt = 0 |
||
) | const throw () [virtual] |
Load the index associated with the column.
iopt | This option is passed to ibis::index::create to be used if a new index is to be created. |
ropt | This option is passed to ibis::index::create to control the reading operations for reconstituting the index object from an index file. |
Reimplemented from ibis::column.
Reimplemented in ibis::category.
References ibis::column::loadIndex().
void ibis::text::readString | ( | uint32_t | i, |
std::string & | ret | ||
) | const [protected] |
Read the string value of the i-th row.
It goes through a two-stage process that reads from two files: first the .sp file, which gives the position of the string in the second file, and then the second file, which contains the actual string values (with nil terminators).
This can be quite slow!
References FASTBIT_DIRSEP, ibis::gVerbose, ibis::fileManager::instance(), ibis::fileManager::recordPages(), and UnixOpen.
Referenced by getString().
int ibis::text::readString | ( | std::string & | res, |
int | fdes, | ||
long | be, | ||
long | en, | ||
char * | buf, | ||
uint32_t | nbuf, | ||
uint32_t & | inbuf, | ||
off_t & | boffset | ||
) | const [protected] |
Read one string from an open file.
The string starts at position be and ends at en.
The content may be in the array buf.
Returns 0 if successful, otherwise returns a negative number to indicate error.
References ibis::gVerbose.
long ibis::text::saveSelected | ( | const ibis::bitvector & | sel, |
const char * | dest, | ||
char * | buf, | ||
uint32_t | nbuf | ||
) | [virtual] |
Write the selected values to the specified directory.
If the destination directory is the current data directory, the file containing existing string values will be renamed to be column-name.old, otherwise, the file in the destination directory is simply overwritten. In case of error, a negative number is returned, otherwise, the number of rows saved to the new file is returned.
Reimplemented from ibis::column.
References FASTBIT_DIRSEP, ibis::fileManager::flushFile(), and ibis::fileManager::instance().
ibis::array_t< int64_t > * ibis::text::selectLongs | ( | const bitvector & | mask | ) | const [virtual] |
Return the starting positions of strings marked 1 in the mask.
The starting positions of the selected string values are stored in the returned array.
Reimplemented from ibis::column.
References FASTBIT_DIRSEP, ibis::fileManager::getFile(), ibis::util::getFileSize(), ibis::fileManager::instance(), ibis::bitvector::indexSet::nIndices(), ibis::array_t< T >::push_back(), ibis::array_t< T >::size(), and ibis::bitvector::size().
std::vector< std::string > * ibis::text::selectStrings | ( | const bitvector & | mask | ) | const [virtual] |
Retrieve the string values from the rows marked 1 in mask.
Reimplemented from ibis::column.
Reimplemented in ibis::category.
References ibis::fileManager::buffer< T >::address(), ibis::bitvector::cnt(), FASTBIT_DIRSEP, ibis::util::getFileSize(), ibis::gVerbose, ibis::bitvector::indexSet::nIndices(), ibis::util::readString(), ibis::array_t< T >::size(), ibis::fileManager::buffer< T >::size(), ibis::bitvector::size(), and UnixOpen.
Referenced by ibis::category::selectStrings().
ibis::array_t< uint32_t > * ibis::text::selectUInts | ( | const bitvector & | mask | ) | const [virtual] |
Return the positions of records marked 1 in the mask.
This indicates to ibis::bundle that every string value is distinct.
It also forces the sorting procedure to produce an order following the order of the entries in the table. This makes the printout of an ibis::text field considerably less useful than others!
Reimplemented from ibis::column.
Reimplemented in ibis::category.
References ibis::bitvector::indexSet::nIndices(), and ibis::array_t< T >::push_back().
void ibis::text::startPositions | ( | const char * | dir, |
char * | buf, | ||
uint32_t | nbuf | ||
) | const [protected] |
Locate the starting position of each string.
It uses the data file located in the named directory dir.
If dir is a nil pointer, the directory defaults to the current working directory of the data partition.
It writes the starting positions as int64_t integers to a file with .sp as extension.
Argument buf (with nbuf bytes) is used as temporary work space. If nbuf = 0, this function allocates its own working space.
References ibis::fileManager::buffer< T >::address(), FASTBIT_DIRSEP, ibis::gVerbose, and ibis::fileManager::buffer< T >::size().
Referenced by text().
long ibis::text::stringSearch | ( | const char * | str, |
ibis::bitvector & | hits | ||
) | const [virtual] |
Given a string literal, return a bitvector that marks the strings that match it.
Reimplemented from ibis::column.
Reimplemented in ibis::category.
References ibis::fileManager::buffer< T >::address(), ibis::bitvector::adjustSize(), ibis::bitvector::clear(), ibis::bitvector::cnt(), FASTBIT_DIRSEP, ibis::gVerbose, ibis::fileManager::instance(), ibis::fileManager::recordPages(), ibis::bitvector::setBit(), ibis::fileManager::buffer< T >::size(), and ibis::bitvector::size().
Referenced by ibis::category::stringSearch().
long ibis::text::stringSearch | ( | const std::vector< std::string > & | strs, |
ibis::bitvector & | hits | ||
) | const [virtual] |
Given a group of string literals, return a bitvector that marks the strings matching any one of the input strings.
Reimplemented from ibis::column.
Reimplemented in ibis::category.
References ibis::fileManager::buffer< T >::address(), ibis::bitvector::adjustSize(), ibis::bitvector::clear(), ibis::bitvector::cnt(), FASTBIT_DIRSEP, ibis::gVerbose, ibis::fileManager::instance(), ibis::fileManager::recordPages(), ibis::bitvector::set(), ibis::bitvector::setBit(), ibis::fileManager::buffer< T >::size(), and ibis::bitvector::size().
void ibis::text::write | ( | FILE * | file | ) | const [virtual] |
Write the metadata entry.
Write the current metadata to -part.txt of the data partition.
Reimplemented from ibis::column.
Reimplemented in ibis::category.
References ibis::TYPESTRING.
int ibis::text::writeStrings | ( | const char * | to, |
const char * | from, | ||
const char * | spto, | ||
const char * | spfrom, | ||
ibis::bitvector & | msk, | ||
const ibis::bitvector & | sel, | ||
char * | buf, | ||
uint32_t | nbuf | ||
) | const [protected] |
Write the selected strings.
The caller manages the necessary locks for accessing this function.
References ibis::fileManager::buffer< T >::address(), ibis::bitvector::adjustSize(), ibis::bitvector::cnt(), ibis::gVerbose, ibis::bitvector::indexSet::nIndices(), ibis::fileManager::buffer< T >::size(), ibis::bitvector::subset(), and UnixOpen.
![]() |