void DfaDbReadonlySegment::compressSingleColgroup(ReadableSegment* input, DbContext* ctx) { llong prevId = -1, id = -1; llong logicRowNum = input->m_isDel.size(), newRowNum = 0; assert(logicRowNum > 0); auto tmpDir = m_segDir + ".tmp"; valvec<byte> val; StoreIteratorPtr iter(input->createStoreIterForward(ctx)); SortableStrVec valueVec; const Schema& valueSchema = m_schema->getColgroupSchema(0); std::unique_ptr<DictZipBlobStore::ZipBuilder> builder; FixedLenStorePtr store; if (valueSchema.should_use_FixedLenStore()) { store = new FixedLenStore(tmpDir, valueSchema); store->unneedsLock(); } else if (valueSchema.m_dictZipSampleRatio >= 0.0) { double sRatio = valueSchema.m_dictZipSampleRatio; double avgLen = double(input->dataInflateSize()) / logicRowNum; if ((sRatio > FLT_EPSILON) || (sRatio >= 0 && avgLen > 100)) { builder = createDictZipBlobStoreBuilder(valueSchema); } } std::mt19937_64 random; // (random.max() - random.min()) + 1 may overflow // do not +1 to avoid overflow uint64_t sampleUpperBound = random.min() + (random.max() - random.min()) * valueSchema.m_dictZipSampleRatio; size_t sampleLenSum = 0; while (iter->increment(&id, &val) && id < logicRowNum) { assert(id >= 0); assert(id < logicRowNum); assert(prevId < id); if (!m_isDel[id]) { if (builder) { if (random() < sampleUpperBound) { builder->addSample(val); sampleLenSum += val.size(); } } else { if (store) store->append(val, NULL); else valueVec.push_back(val); } newRowNum++; m_isDel.beg_end_set1(prevId+1, id); prevId = id; } } if (prevId != id) { assert(prevId < id); assert(m_isDel[id]); m_isDel.beg_end_set1(prevId+1, id); } llong inputRowNum = id + 1; assert(inputRowNum <= logicRowNum); if (inputRowNum < logicRowNum) { fprintf(stderr , "WARN: DfaDbReadonlySegment::compressSingleKeyValue(): realrows=%lld, m_isDel=%lld, some data have lost\n" , inputRowNum, logicRowNum); input->m_isDel.beg_end_set1(inputRowNum, logicRowNum); this->m_isDel.beg_end_set1(inputRowNum, logicRowNum); } m_delcnt = m_isDel.popcnt(); // recompute delcnt assert(newRowNum <= inputRowNum); assert(size_t(logicRowNum - newRowNum) == m_delcnt); if (builder) { assert(valueVec.m_index.size() == 0); assert(valueVec.m_strpool.size() == 0); iter->reset(); // free resources and seek to begin std::lock_guard<std::mutex> lock(DictZip_reduceMemMutex()); auto fpath = tmpDir / ("colgroup-" + valueSchema.m_name + ".nlt"); emptyCheckProtect(sampleLenSum, val, *builder); builder->prepare(newRowNum, fpath.string()); prevId = -1; while (iter->increment(&id, &val) && id < inputRowNum) { for (llong j = prevId+1; j < id; ++j) { if (!m_isDel[j]) { // j was deleted during compressing builder->addRecord(fstring()); // add an empty record } } prevId = id; if (!m_isDel[id]) builder->addRecord(val); } iter = nullptr; m_colgroups[0] = new NestLoudsTrieStore(valueSchema, builder->finish( DictZipBlobStore::ZipBuilder::FinishFreeDict | DictZipBlobStore::ZipBuilder::FinishWriteDictFile )); } else if (store) { m_colgroups[0] = std::move(store); } else { iter = nullptr; m_colgroups[0] = this->buildStore(valueSchema, valueVec); } }