// construct a bitmap index from current data ibis::bak::bak(const ibis::column* c, const char* f) : ibis::bin() { if (c == 0) return; // nothing can be done col = c; try { if (f) { // f is not null read(f); } if (nobs == 0) { bakMap bmap; mapValues(f, bmap); construct(bmap); optionalUnpack(bits, col->indexSpec()); if (ibis::gVerbose > 4) { ibis::util::logger lg; print(lg()); } } } catch (...) { clear(); throw; } } // constructor
// this function simply recreates the index using the current data in dt // directory long ibis::bak::append(const char* dt, const char* df, uint32_t nnew) { if (nnew == 0) return 0; clear(); // clear the current content and rebuild index in dt bakMap bmap; mapValues(dt, bmap); construct(bmap); optionalUnpack(bits, col->indexSpec()); //write(dt); // record the new index if (ibis::gVerbose > 2) { ibis::util::logger lg; print(lg()); } return nnew; } // ibis::bak::append
/// Generate a new sbiad index by passing through the data twice. /// - (1) scan the data to generate a list of distinct values and their count. /// - (2) scan the data a second time to produce the bit vectors. void ibis::sbiad::construct2(const char* f, const uint32_t nbase) { { // a block to limit the scope of hst histogram hst; mapValues(f, hst); // scan the data to produce the histogram if (hst.empty()) // no data, of course no index return; // convert histogram into two arrays const uint32_t nhst = hst.size(); vals.resize(nhst); cnts.resize(nhst); histogram::const_iterator it = hst.begin(); for (uint32_t i = 0; i < nhst; ++i) { vals[i] = (*it).first; cnts[i] = (*it).second; ++ it; } } // determine the base sizes setBases(bases, vals.size(), nbase); const uint32_t nb = bases.size(); int ierr; // allocate the correct number of bitvectors uint32_t nobs = 0; for (uint32_t ii = 0; ii < nb; ++ii) nobs += bases[ii]; bits.resize(nobs); for (uint32_t ii = 0; ii < nobs; ++ii) bits[ii] = new ibis::bitvector; std::string fnm; // name of the data file dataFileName(fnm, f); nrows = col->partition()->nRows(); ibis::bitvector mask; { // name of mask file associated with the data file array_t<ibis::bitvector::word_t> arr; std::string mname(fnm); mname += ".msk"; if (ibis::fileManager::instance().getFile(mname.c_str(), arr) == 0) mask.copy(ibis::bitvector(arr)); // convert arr to a bitvector else mask.set(1, nrows); // default mask } // need to do different things for different columns switch (col->type()) { case ibis::TEXT: case ibis::UINT: {// unsigned int array_t<uint32_t> val; if (! fnm.empty()) ierr = ibis::fileManager::instance().getFile(fnm.c_str(), val); else ierr = col->getValuesArray(&val); if (ierr < 0 || val.empty()) { LOGGER(ibis::gVerbose > 0) << "Warning -- sbiad::construct2 failed to retrieve any value"; break; } if (val.size() > mask.size()) { col->logWarning("sbiad::construct", "the data file \"%s\" " "contains more elements (%lu) then expected " "(%lu)", fnm.c_str(), static_cast<long unsigned>(val.size()), static_cast<long unsigned>(mask.size())); mask.adjustSize(nrows, nrows); } ibis::bitvector::indexSet iset = mask.firstIndexSet(); uint32_t nind = iset.nIndices(); const ibis::bitvector::word_t *iix = iset.indices(); while (nind) { if (iset.isRange()) { // a range uint32_t k = (iix[1] < nrows ? iix[1] : nrows); for (uint32_t i = *iix; i < k; ++i) setBit(i, val[i]); } else if (*iix+ibis::bitvector::bitsPerLiteral() < nrows) { // a list of indices for (uint32_t i = 0; i < nind; ++i) { uint32_t k = iix[i]; setBit(k, val[k]); } } else { for (uint32_t i = 0; i < nind; ++i) { uint32_t k = iix[i]; if (k < nrows) setBit(k, val[k]); } } ++iset; nind = iset.nIndices(); if (*iix >= nrows) nind = 0; } // while (nind) break;} case ibis::INT: {// signed int array_t<int32_t> val; if (! fnm.empty()) ierr = ibis::fileManager::instance().getFile(fnm.c_str(), val); else ierr = col->getValuesArray(&val); if (ierr < 0 || val.empty()) { LOGGER(ibis::gVerbose > 0) << "Warning -- sbiad::construct2 failed to retrieve any value"; break; } if (val.size() > mask.size()) { col->logWarning("sbiad::construct", "the data file \"%s\" " "contains more elements (%lu) then expected " "(%lu)", fnm.c_str(), static_cast<long unsigned>(val.size()), static_cast<long unsigned>(mask.size())); mask.adjustSize(nrows, nrows); } ibis::bitvector::indexSet iset = mask.firstIndexSet(); uint32_t nind = iset.nIndices(); const ibis::bitvector::word_t *iix = iset.indices(); while (nind) { if (iset.isRange()) { // a range uint32_t k = (iix[1] < nrows ? iix[1] : nrows); for (uint32_t i = *iix; i < k; ++i) setBit(i, val[i]); } else if (*iix+ibis::bitvector::bitsPerLiteral() < nrows) { // a list of indices for (uint32_t i = 0; i < nind; ++i) { uint32_t k = iix[i]; setBit(k, val[k]); } } else { for (uint32_t i = 0; i < nind; ++i) { uint32_t k = iix[i]; if (k < nrows) setBit(k, val[k]); } } ++iset; nind = iset.nIndices(); if (*iix >= nrows) nind = 0; } // while (nind) break;} case ibis::ULONG: {// unsigned long int array_t<uint64_t> val; if (! fnm.empty()) ierr = ibis::fileManager::instance().getFile(fnm.c_str(), val); else ierr = col->getValuesArray(&val); if (ierr < 0 || val.empty()) { LOGGER(ibis::gVerbose > 0) << "Warning -- sbiad::construct2 failed to retrieve any value"; break; } if (val.size() > mask.size()) { col->logWarning("sbiad::construct", "the data file \"%s\" " "contains more elements (%lu) then expected " "(%lu)", fnm.c_str(), static_cast<long unsigned>(val.size()), static_cast<long unsigned>(mask.size())); mask.adjustSize(nrows, nrows); } ibis::bitvector::indexSet iset = mask.firstIndexSet(); uint32_t nind = iset.nIndices(); const ibis::bitvector::word_t *iix = iset.indices(); while (nind) { if (iset.isRange()) { // a range uint32_t k = (iix[1] < nrows ? iix[1] : nrows); for (uint32_t i = *iix; i < k; ++i) setBit(i, val[i]); } else if (*iix+ibis::bitvector::bitsPerLiteral() < nrows) { // a list of indices for (uint32_t i = 0; i < nind; ++i) { uint32_t k = iix[i]; setBit(k, val[k]); } } else { for (uint32_t i = 0; i < nind; ++i) { uint32_t k = iix[i]; if (k < nrows) setBit(k, val[k]); } } ++iset; nind = iset.nIndices(); if (*iix >= nrows) nind = 0; } // while (nind) break;} case ibis::LONG: {// signed long int array_t<int64_t> val; if (! fnm.empty()) ierr = ibis::fileManager::instance().getFile(fnm.c_str(), val); else ierr = col->getValuesArray(&val); if (ierr < 0 || val.empty()) { LOGGER(ibis::gVerbose > 0) << "Warning -- sbiad::construct2 failed to retrieve any value"; break; } if (val.size() > mask.size()) { col->logWarning("sbiad::construct", "the data file \"%s\" " "contains more elements (%lu) then expected " "(%lu)", fnm.c_str(), static_cast<long unsigned>(val.size()), static_cast<long unsigned>(mask.size())); mask.adjustSize(nrows, nrows); } ibis::bitvector::indexSet iset = mask.firstIndexSet(); uint32_t nind = iset.nIndices(); const ibis::bitvector::word_t *iix = iset.indices(); while (nind) { if (iset.isRange()) { // a range uint32_t k = (iix[1] < nrows ? iix[1] : nrows); for (uint32_t i = *iix; i < k; ++i) setBit(i, val[i]); } else if (*iix+ibis::bitvector::bitsPerLiteral() < nrows) { // a list of indices for (uint32_t i = 0; i < nind; ++i) { uint32_t k = iix[i]; setBit(k, val[k]); } } else { for (uint32_t i = 0; i < nind; ++i) { uint32_t k = iix[i]; if (k < nrows) setBit(k, val[k]); } } ++iset; nind = iset.nIndices(); if (*iix >= nrows) nind = 0; } // while (nind) break;} case ibis::USHORT: {// unsigned short int array_t<uint16_t> val; if (! fnm.empty()) ierr = ibis::fileManager::instance().getFile(fnm.c_str(), val); else ierr = col->getValuesArray(&val); if (ierr < 0 || val.empty()) { LOGGER(ibis::gVerbose > 0) << "Warning -- sbiad::construct2 failed to retrieve any value"; break; } if (val.size() > mask.size()) { col->logWarning("sbiad::construct", "the data file \"%s\" " "contains more elements (%lu) then expected " "(%lu)", fnm.c_str(), static_cast<long unsigned>(val.size()), static_cast<long unsigned>(mask.size())); mask.adjustSize(nrows, nrows); } ibis::bitvector::indexSet iset = mask.firstIndexSet(); uint32_t nind = iset.nIndices(); const ibis::bitvector::word_t *iix = iset.indices(); while (nind) { if (iset.isRange()) { // a range uint32_t k = (iix[1] < nrows ? iix[1] : nrows); for (uint32_t i = *iix; i < k; ++i) setBit(i, val[i]); } else if (*iix+ibis::bitvector::bitsPerLiteral() < nrows) { // a list of indices for (uint32_t i = 0; i < nind; ++i) { uint32_t k = iix[i]; setBit(k, val[k]); } } else { for (uint32_t i = 0; i < nind; ++i) { uint32_t k = iix[i]; if (k < nrows) setBit(k, val[k]); } } ++iset; nind = iset.nIndices(); if (*iix >= nrows) nind = 0; } // while (nind) break;} case ibis::SHORT: {// signed short int array_t<int16_t> val; if (! fnm.empty()) ierr = ibis::fileManager::instance().getFile(fnm.c_str(), val); else ierr = col->getValuesArray(&val); if (ierr < 0 || val.empty()) { LOGGER(ibis::gVerbose > 0) << "Warning -- sbiad::construct2 failed to retrieve any value"; break; } if (val.size() > mask.size()) { col->logWarning("sbiad::construct", "the data file \"%s\" " "contains more elements (%lu) then expected " "(%lu)", fnm.c_str(), static_cast<long unsigned>(val.size()), static_cast<long unsigned>(mask.size())); mask.adjustSize(nrows, nrows); } ibis::bitvector::indexSet iset = mask.firstIndexSet(); uint32_t nind = iset.nIndices(); const ibis::bitvector::word_t *iix = iset.indices(); while (nind) { if (iset.isRange()) { // a range uint32_t k = (iix[1] < nrows ? iix[1] : nrows); for (uint32_t i = *iix; i < k; ++i) setBit(i, val[i]); } else if (*iix+ibis::bitvector::bitsPerLiteral() < nrows) { // a list of indices for (uint32_t i = 0; i < nind; ++i) { uint32_t k = iix[i]; setBit(k, val[k]); } } else { for (uint32_t i = 0; i < nind; ++i) { uint32_t k = iix[i]; if (k < nrows) setBit(k, val[k]); } } ++iset; nind = iset.nIndices(); if (*iix >= nrows) nind = 0; } // while (nind) break;} case ibis::UBYTE: {// unsigned char array_t<unsigned char> val; if (! fnm.empty()) ierr = ibis::fileManager::instance().getFile(fnm.c_str(), val); else ierr = col->getValuesArray(&val); if (ierr < 0 || val.empty()) { LOGGER(ibis::gVerbose > 0) << "Warning -- sbiad::construct2 failed to retrieve any value"; break; } if (val.size() > mask.size()) { col->logWarning("sbiad::construct", "the data file \"%s\" " "contains more elements (%lu) then expected " "(%lu)", fnm.c_str(), static_cast<long unsigned>(val.size()), static_cast<long unsigned>(mask.size())); mask.adjustSize(nrows, nrows); } ibis::bitvector::indexSet iset = mask.firstIndexSet(); uint32_t nind = iset.nIndices(); const ibis::bitvector::word_t *iix = iset.indices(); while (nind) { if (iset.isRange()) { // a range uint32_t k = (iix[1] < nrows ? iix[1] : nrows); for (uint32_t i = *iix; i < k; ++i) setBit(i, val[i]); } else if (*iix+ibis::bitvector::bitsPerLiteral() < nrows) { // a list of indices for (uint32_t i = 0; i < nind; ++i) { uint32_t k = iix[i]; setBit(k, val[k]); } } else { for (uint32_t i = 0; i < nind; ++i) { uint32_t k = iix[i]; if (k < nrows) setBit(k, val[k]); } } ++iset; nind = iset.nIndices(); if (*iix >= nrows) nind = 0; } // while (nind) break;} case ibis::BYTE: {// signed char array_t<signed char> val; if (! fnm.empty()) ierr = ibis::fileManager::instance().getFile(fnm.c_str(), val); else ierr = col->getValuesArray(&val); if (ierr < 0 || val.empty()) { LOGGER(ibis::gVerbose > 0) << "Warning -- sbiad::construct2 failed to retrieve any value"; break; } if (val.size() > mask.size()) { col->logWarning("sbiad::construct", "the data file \"%s\" " "contains more elements (%lu) then expected " "(%lu)", fnm.c_str(), static_cast<long unsigned>(val.size()), static_cast<long unsigned>(mask.size())); mask.adjustSize(nrows, nrows); } ibis::bitvector::indexSet iset = mask.firstIndexSet(); uint32_t nind = iset.nIndices(); const ibis::bitvector::word_t *iix = iset.indices(); while (nind) { if (iset.isRange()) { // a range uint32_t k = (iix[1] < nrows ? iix[1] : nrows); for (uint32_t i = *iix; i < k; ++i) setBit(i, val[i]); } else if (*iix+ibis::bitvector::bitsPerLiteral() < nrows) { // a list of indices for (uint32_t i = 0; i < nind; ++i) { uint32_t k = iix[i]; setBit(k, val[k]); } } else { for (uint32_t i = 0; i < nind; ++i) { uint32_t k = iix[i]; if (k < nrows) setBit(k, val[k]); } } ++iset; nind = iset.nIndices(); if (*iix >= nrows) nind = 0; } // while (nind) break;} case ibis::FLOAT: {// (4-byte) floating-point values array_t<float> val; if (! fnm.empty()) ierr = ibis::fileManager::instance().getFile(fnm.c_str(), val); else ierr = col->getValuesArray(&val); if (ierr < 0 || val.empty()) { LOGGER(ibis::gVerbose > 0) << "Warning -- sbiad::construct2 failed to retrieve any value"; break; } if (val.size() > mask.size()) { col->logWarning("sbiad::construct", "the data file \"%s\" " "contains more elements (%lu) then expected " "(%lu)", fnm.c_str(), static_cast<long unsigned>(val.size()), static_cast<long unsigned>(mask.size())); mask.adjustSize(nrows, nrows); } ibis::bitvector::indexSet iset = mask.firstIndexSet(); uint32_t nind = iset.nIndices(); const ibis::bitvector::word_t *iix = iset.indices(); while (nind) { if (iset.isRange()) { // a range uint32_t k = (iix[1] < nrows ? iix[1] : nrows); for (uint32_t i = *iix; i < k; ++i) setBit(i, val[i]); } else if (*iix+ibis::bitvector::bitsPerLiteral() < nrows) { // a list of indices for (uint32_t i = 0; i < nind; ++i) { uint32_t k = iix[i]; setBit(k, val[k]); } } else { for (uint32_t i = 0; i < nind; ++i) { uint32_t k = iix[i]; if (k < nrows) setBit(k, val[k]); } } ++iset; nind = iset.nIndices(); if (*iix >= nrows) nind = 0; } // while (nind) break;} case ibis::DOUBLE: {// (8-byte) floating-point values array_t<double> val; if (! fnm.empty()) ierr = ibis::fileManager::instance().getFile(fnm.c_str(), val); else ierr = col->getValuesArray(&val); if (ierr < 0 || val.empty()) { LOGGER(ibis::gVerbose > 0) << "Warning -- sbiad::construct2 failed to retrieve any value"; break; } if (val.size() > mask.size()) { col->logWarning("sbiad::construct", "the data file \"%s\" " "contains more elements (%lu) then expected " "(%lu)", fnm.c_str(), static_cast<long unsigned>(val.size()), static_cast<long unsigned>(mask.size())); mask.adjustSize(nrows, nrows); } ibis::bitvector::indexSet iset = mask.firstIndexSet(); uint32_t nind = iset.nIndices(); const ibis::bitvector::word_t *iix = iset.indices(); while (nind) { if (iset.isRange()) { // a range uint32_t k = (iix[1] < nrows ? iix[1] : nrows); for (uint32_t i = *iix; i < k; ++i) setBit(i, val[i]); } else if (*iix+ibis::bitvector::bitsPerLiteral() < nrows) { // a list of indices for (uint32_t i = 0; i < nind; ++i) { uint32_t k = iix[i]; setBit(k, val[k]); } } else { for (uint32_t i = 0; i < nind; ++i) { uint32_t k = iix[i]; if (k < nrows) setBit(k, val[k]); } } ++iset; nind = iset.nIndices(); if (*iix >= nrows) nind = 0; } // while (nind) break;} case ibis::CATEGORY: // no need for a separate index col->logWarning("sbiad::ctor", "no need for another index"); return; default: col->logWarning("sbiad::ctor", "unable to create bit sbiad index " "for column type %s", ibis::TYPESTRING[(int)col->type()]); return; } // make sure all bit vectors are the same size for (uint32_t i = 0; i < nobs; ++i) { bits[i]->adjustSize(0, nrows); } // sum up the bitvectors according to interval-encoding array_t<bitvector*> beq; beq.swap(bits); try { uint32_t ke = 0; bits.clear(); for (uint32_t i = 0; i < nb; ++i) { if (bases[i] > 2) { nobs = (bases[i] - 1) / 2; bits.push_back(new ibis::bitvector); bits.back()->copy(*(beq[ke])); if (nobs > 64) bits.back()->decompress(); for (uint32_t j = ke+1; j <= ke+nobs; ++j) *(bits.back()) |= *(beq[j]); bits.back()->compress(); for (uint32_t j = 1; j < bases[i]-nobs; ++j) { bits.push_back(*(bits.back()) - *(beq[ke+j-1])); *(bits.back()) |= *(beq[ke+j+nobs]); bits.back()->compress(); } for (uint32_t j = ke; j < ke+bases[i]; ++j) { delete beq[j]; beq[j] = 0; } } else { bits.push_back(beq[ke]); if (bases[i] > 1) { delete beq[ke+1]; beq[ke+1] = 0; } } ke += bases[i]; } } catch (...) { LOGGER(ibis::gVerbose > 1) << "Warning -- column::[" << col->name() << "]::construct2 encountered an exception while converting " "to inverval encoding, cleaning up ..."; for (uint32_t i = 0; i < beq.size(); ++ i) delete beq[i]; throw; } beq.clear(); optionalUnpack(bits, col->indexSpec()); // write out the current content if (ibis::gVerbose > 8) { ibis::util::logger lg; print(lg()); } } // ibis::sbiad::construct2
/// This version of the constructor take one pass throught the data. It /// constructs a ibis::index::VMap first, then construct the sbiad from the /// VMap. It uses more computer memory than the two-pass version, but will /// probably run a little faster. void ibis::sbiad::construct1(const char* f, const uint32_t nbase) { VMap bmap; // a map between values and their position try { mapValues(f, bmap); } catch (...) { // need to clean up bmap LOGGER(ibis::gVerbose >= 0) << "sbiad::construct reclaiming storage " "allocated to bitvectors (" << bmap.size() << ")"; for (VMap::iterator it = bmap.begin(); it != bmap.end(); ++ it) delete (*it).second; bmap.clear(); ibis::fileManager::instance().signalMemoryAvailable(); throw; } if (bmap.empty()) return; nrows = (*(bmap.begin())).second->size(); if (nrows != col->partition()->nRows()) { for (VMap::iterator it = bmap.begin(); it != bmap.end(); ++ it) delete (*it).second; bmap.clear(); ibis::fileManager::instance().signalMemoryAvailable(); LOGGER(ibis::gVerbose >= 0) << "Warning -- sbiad::construct1 the bitvectors " "do not have the expected size(" << col->partition()->nRows() << "). stopping.."; throw ibis::bad_alloc("incorrect bitvector sizes"); } // convert bmap into the current data structure // fill the arrays vals and cnts const uint32_t card = bmap.size(); vals.reserve(card); cnts.reserve(card); for (VMap::const_iterator it = bmap.begin(); it != bmap.end(); ++it) { vals.push_back((*it).first); cnts.push_back((*it).second->cnt()); } // fill the array bases setBases(bases, card, nbase); // count the number of bitvectors to genreate const uint32_t nb = bases.size(); uint32_t nobs = 0; uint32_t i; for (i = 0; i < nb; ++i) nobs += bases[i]; // allocate enough bitvectors in bits bits.resize(nobs); for (i = 0; i < nobs; ++i) bits[i] = 0; if (ibis::gVerbose > 5) { col->logMessage("sbiad::construct", "initialized the array of " "bitvectors, start converting %lu bitmaps into %lu-" "component range code (with %lu bitvectors)", static_cast<long unsigned>(vals.size()), static_cast<long unsigned>(nb), static_cast<long unsigned>(nobs)); } // converting to multi-level equality encoding first i = 0; for (VMap::const_iterator it = bmap.begin(); it != bmap.end(); ++it, ++i) { uint32_t offset = 0; uint32_t ii = i; for (uint32_t j = 0; j < nb; ++j) { uint32_t k = ii % bases[j]; if (bits[offset+k]) { *(bits[offset+k]) |= *((*it).second); } else { bits[offset+k] = new ibis::bitvector(); bits[offset+k]->copy(*((*it).second)); // expected to be operated on more than 64 times if (vals.size() > 64*bases[j]) bits[offset+k]->decompress(); } ii /= bases[j]; offset += bases[j]; } delete (*it).second; // no longer need the bitmap } for (i = 0; i < nobs; ++i) { if (bits[i] == 0) { bits[i] = new ibis::bitvector(); bits[i]->set(0, nrows); } } #if DEBUG+0 > 0 || _DEBUG+0 > 0 if (ibis::gVerbose > 11) { LOGGER(ibis::gVerbose >= 0) << "DEBUG -- sbiad::construct1 converted" << bmap.size() << " bitmaps for each distinct value into " << bits.size() << bases.size() << "-component equality encoded bitmaps"; } #endif // sum up the bitvectors according to the interval-encoding array_t<bitvector*> beq; beq.swap(bits); try { // use a try block to ensure the bitvectors in beq are freed uint32_t ke = 0; bits.clear(); for (i = 0; i < nb; ++i) { if (bases[i] > 2) { nobs = (bases[i] - 1) / 2; bits.push_back(new ibis::bitvector); bits.back()->copy(*(beq[ke])); if (nobs > 64) bits.back()->decompress(); for (uint32_t j = ke+1; j <= ke+nobs; ++j) *(bits.back()) |= *(beq[j]); bits.back()->compress(); for (uint32_t j = 1; j < bases[i]-nobs; ++j) { bits.push_back(*(bits.back()) - *(beq[ke+j-1])); *(bits.back()) |= *(beq[ke+j+nobs]); bits.back()->compress(); } for (uint32_t j = ke; j < ke+bases[i]; ++j) { delete beq[j]; beq[j] = 0; } } else { bits.push_back(beq[ke]); if (bases[i] > 1) { delete beq[ke+1]; beq[ke+1] = 0; } } ke += bases[i]; } } catch (...) { LOGGER(ibis::gVerbose > 1) << "Warning -- column::[" << col->name() << "]::construct1 encountered an exception while converting " "to inverval encoding, cleaning up ..."; for (uint32_t i = 0; i < beq.size(); ++ i) delete beq[i]; throw; } beq.clear(); #if DEBUG+0 > 0 || _DEBUG+0 > 0 if (ibis::gVerbose > 11) { LOGGER(ibis::gVerbose >= 0) << "DEBUG -- sbiad::construct1 completed " << "converting equality encoding to interval encoding"; } #endif optionalUnpack(bits, col->indexSpec()); // write out the current content if (ibis::gVerbose > 8) { ibis::util::logger lg; print(lg()); } } // ibis::sbiad::construct1