void MockReadonlyStore::build(const Schema& schema, SortableStrVec& data) {
	size_t fixlen = schema.getFixedRowLen();
	if (0 == fixlen) {
		if (data.str_size() >= UINT32_MAX) {
			THROW_STD(length_error,
				"data.str_size=%lld is too large", llong(data.str_size()));
		}
		// reuse memory of data.m_index
		auto offsets = (uint32_t*)data.m_index.data();
		size_t rows = data.m_index.size();
		for (size_t i = 0; i < rows; ++i) {
			uint32_t offset = uint32_t(data.m_index[i].offset);
			offsets[i] = offset;
		}
		offsets[rows] = uint32_t(data.str_size());
		BOOST_STATIC_ASSERT(sizeof(SortableStrVec::SEntry) == 4*3);
		m_rows.offsets.risk_set_data(offsets);
		m_rows.offsets.risk_set_size(rows + 1);
		m_rows.offsets.risk_set_capacity(3 * rows);
		m_rows.offsets.shrink_to_fit();
		data.m_index.risk_release_ownership();
#if !defined(NDEBUG)
		assert(data.m_strpool.size() == m_rows.offsets.back());
		for (size_t i = 0; i < rows; ++i) {
			assert(m_rows.offsets[i] < m_rows.offsets[i+1]);
		}
#endif
	}
	m_rows.strpool.swap((valvec<char>&)data.m_strpool);
	m_fixedLen = fixlen;
}
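// Note on the build() above: since sizeof(SEntry) == 12 (the static assert),
// the m_index buffer of `rows` entries holds exactly 3*rows uint32 slots,
// which is enough for the rows+1 cumulative offsets (the last slot is a
// str_size() sentinel), so the in-place conversion needs no extra allocation.
// A minimal retrieval sketch, assuming only the m_rows members touched above;
// the helper getRow() is hypothetical, for illustration only:
//
//   fstring getRow(const MockReadonlyStore& store, size_t i) {
//       size_t beg = store.m_rows.offsets[i];
//       size_t end = store.m_rows.offsets[i+1];
//       return fstring(store.m_rows.strpool.data() + beg, end - beg);
//   }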
static void patchStrVec(SortableStrVec& strVec, size_t fixlen) {
	const size_t rows = strVec.str_size() / fixlen;
	assert(strVec.str_size() % fixlen == 0);
	strVec.m_index.resize_no_init(rows);
	for (size_t i = 0; i < rows; ++i) {
		strVec.m_index[i].seq_id = i;
		strVec.m_index[i].length = fixlen;
		strVec.m_index[i].offset = fixlen * i;
	}
}
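// Usage sketch for patchStrVec() (the sizes below are made up for
// illustration): it synthesizes the per-row (seq_id, length, offset) entries
// that variable-length code paths expect, so a fixed-length string pool can
// flow through the same SortableStrVec interface.
//
//   SortableStrVec vec;
//   // suppose vec.m_strpool already holds 3 rows of 4 bytes each
//   patchStrVec(vec, 4);
//   assert(vec.m_index.size() == 3);
//   assert(vec.m_index[2].offset == 8);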
void MockReadonlyIndex::build(SortableStrVec& keys) {
	const Schema* schema = m_schema;
	const byte* base = keys.m_strpool.data();
	size_t fixlen = schema->getFixedRowLen();
	if (fixlen) {
		assert(keys.m_index.size() == 0);
		assert(keys.str_size() % fixlen == 0);
		m_ids.resize_no_init(keys.str_size() / fixlen);
		for (size_t i = 0; i < m_ids.size(); ++i) m_ids[i] = i;
		std::sort(m_ids.begin(), m_ids.end(), [=](size_t x, size_t y) {
			fstring xs(base + fixlen * x, fixlen);
			fstring ys(base + fixlen * y, fixlen);
			int r = schema->compareData(xs, ys);
			if (r) return r < 0;
			else   return x < y;
		});
	}
	else {
		if (keys.str_size() >= UINT32_MAX) {
			THROW_STD(length_error,
				"keys.str_size=%lld is too large", llong(keys.str_size()));
		}
		// reuse memory of keys.m_index
		auto offsets = (uint32_t*)keys.m_index.data();
		size_t rows = keys.m_index.size();
		m_ids.resize_no_init(rows);
		for (size_t i = 0; i < rows; ++i) m_ids[i] = i;
		for (size_t i = 0; i < rows; ++i) {
			uint32_t offset = uint32_t(keys.m_index[i].offset);
			offsets[i] = offset;
		}
		offsets[rows] = uint32_t(keys.str_size());
		std::sort(m_ids.begin(), m_ids.end(), [=](size_t x, size_t y) {
			size_t xoff0 = offsets[x], xoff1 = offsets[x+1];
			size_t yoff0 = offsets[y], yoff1 = offsets[y+1];
			fstring xs(base + xoff0, xoff1 - xoff0);
			fstring ys(base + yoff0, yoff1 - yoff0);
			int r = schema->compareData(xs, ys);
			if (r) return r < 0;
			else   return x < y;
		});
		BOOST_STATIC_ASSERT(sizeof(SortableStrVec::SEntry) == 4*3);
		m_keys.offsets.risk_set_data(offsets);
		m_keys.offsets.risk_set_size(rows + 1);
		m_keys.offsets.risk_set_capacity(3 * rows);
		m_keys.offsets.shrink_to_fit();
		keys.m_index.risk_release_ownership();
	}
	m_keys.strpool.swap((valvec<char>&)keys.m_strpool);
	m_fixedLen = fixlen;
}
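// After build(), m_ids is a permutation of row ids sorted by key order; ties
// are broken by row id (the `else return x < y;` branch), which makes the
// comparator a strict total order. A lookup sketch, assuming the members
// built above; the helper name and getKeyOfRow() are hypothetical:
//
//   // find the first position in m_ids whose key is >= probe
//   size_t lowerBoundPos(const MockReadonlyIndex& idx, fstring probe) {
//       size_t lo = 0, hi = idx.m_ids.size();
//       while (lo < hi) {
//           size_t mid = (lo + hi) / 2;
//           fstring key = getKeyOfRow(idx, idx.m_ids[mid]); // hypothetical
//           if (idx.m_schema->compareData(key, probe) < 0) lo = mid + 1;
//           else hi = mid;
//       }
//       return lo;
//   }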
void DfaDbReadonlySegment::compressSingleColgroup(ReadableSegment* input, DbContext* ctx) {
	llong prevId = -1, id = -1;
	llong logicRowNum = input->m_isDel.size(), newRowNum = 0;
	assert(logicRowNum > 0);
	auto tmpDir = m_segDir + ".tmp";
	valvec<byte> val;
	StoreIteratorPtr iter(input->createStoreIterForward(ctx));
	SortableStrVec valueVec;
	const Schema& valueSchema = m_schema->getColgroupSchema(0);
	std::unique_ptr<DictZipBlobStore::ZipBuilder> builder;
	FixedLenStorePtr store;
	if (valueSchema.should_use_FixedLenStore()) {
		store = new FixedLenStore(tmpDir, valueSchema);
		store->unneedsLock();
	}
	else if (valueSchema.m_dictZipSampleRatio >= 0.0) {
		double sRatio = valueSchema.m_dictZipSampleRatio;
		double avgLen = double(input->dataInflateSize()) / logicRowNum;
		if ((sRatio > FLT_EPSILON) || (sRatio >= 0 && avgLen > 100)) {
			builder = createDictZipBlobStoreBuilder(valueSchema);
		}
	}
	std::mt19937_64 random;
	// do not +1: (random.max() - random.min()) + 1 may overflow
	uint64_t sampleUpperBound = random.min()
		+ (random.max() - random.min()) * valueSchema.m_dictZipSampleRatio;
	size_t sampleLenSum = 0;
	while (iter->increment(&id, &val) && id < logicRowNum) {
		assert(id >= 0);
		assert(id < logicRowNum);
		assert(prevId < id);
		if (!m_isDel[id]) {
			if (builder) {
				if (random() < sampleUpperBound) {
					builder->addSample(val);
					sampleLenSum += val.size();
				}
			}
			else {
				if (store)
					store->append(val, NULL);
				else
					valueVec.push_back(val);
			}
			newRowNum++;
			m_isDel.beg_end_set1(prevId+1, id);
			prevId = id;
		}
	}
	if (prevId != id) {
		assert(prevId < id);
		assert(m_isDel[id]);
		m_isDel.beg_end_set1(prevId+1, id);
	}
	llong inputRowNum = id + 1;
	assert(inputRowNum <= logicRowNum);
	if (inputRowNum < logicRowNum) {
		fprintf(stderr,
			"WARN: DfaDbReadonlySegment::compressSingleColgroup(): realrows=%lld, m_isDel.size()=%lld, some data have been lost\n",
			inputRowNum, logicRowNum);
		input->m_isDel.beg_end_set1(inputRowNum, logicRowNum);
		this->m_isDel.beg_end_set1(inputRowNum, logicRowNum);
	}
	m_delcnt = m_isDel.popcnt(); // recompute delcnt
	assert(newRowNum <= inputRowNum);
	assert(size_t(logicRowNum - newRowNum) == m_delcnt);
	if (builder) {
		assert(valueVec.m_index.size() == 0);
		assert(valueVec.m_strpool.size() == 0);
		iter->reset(); // free resources and seek to begin
		std::lock_guard<std::mutex> lock(DictZip_reduceMemMutex());
		auto fpath = tmpDir / ("colgroup-" + valueSchema.m_name + ".nlt");
		emptyCheckProtect(sampleLenSum, val, *builder);
		builder->prepare(newRowNum, fpath.string());
		prevId = -1;
		while (iter->increment(&id, &val) && id < inputRowNum) {
			for (llong j = prevId+1; j < id; ++j) {
				if (!m_isDel[j]) { // j was deleted during compressing
					builder->addRecord(fstring()); // add an empty record
				}
			}
			prevId = id;
			if (!m_isDel[id])
				builder->addRecord(val);
		}
		iter = nullptr;
		m_colgroups[0] = new NestLoudsTrieStore(valueSchema, builder->finish(
			  DictZipBlobStore::ZipBuilder::FinishFreeDict
			| DictZipBlobStore::ZipBuilder::FinishWriteDictFile));
	}
	else if (store) {
		m_colgroups[0] = std::move(store);
	}
	else {
		iter = nullptr;
		m_colgroups[0] = this->buildStore(valueSchema, valueVec);
	}
}
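// The sampling in the first pass above avoids per-record floating point work:
// the ratio is folded once into sampleUpperBound, after which each record
// costs one 64-bit draw and an integer compare. A standalone sketch of the
// same idea (all names below are local to this comment, not part of the file):
//
//   std::mt19937_64 rng;
//   double ratio = 0.05; // sample ~5% of records
//   uint64_t bound = rng.min() + uint64_t((rng.max() - rng.min()) * ratio);
//   bool take = rng() < bound; // true with probability ~= ratio
//
// The second pass then replays the iterator and pads rows that disappeared
// between the passes with empty records, so the record count promised to
// builder->prepare(newRowNum, ...) matches the number of addRecord() calls.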