Exemplo n.º 1
0
/// Builds the store for column group 0 of this segment from the rows of `input`.
///
/// The rows are read through a forward store iterator created on `input`.
/// Depending on the column group schema the values go to one of three sinks:
///   - a FixedLenStore, when the schema says a fixed length store should be used;
///   - a DictZipBlobStore builder (two passes: sample first, then add records);
///   - an in-memory SortableStrVec that is handed to buildStore().
///
/// While scanning, every row id that the iterator does not yield for a live
/// row is marked as deleted in `m_isDel`, and `m_delcnt` is recomputed at the
/// end of the first pass.
///
/// @param input  segment whose rows are read; its `m_isDel` may be updated
///               when the iterator yields fewer rows than `m_isDel.size()`.
/// @param ctx    context passed to `input->createStoreIterForward()`.
void
DfaDbReadonlySegment::compressSingleColgroup(ReadableSegment* input, DbContext* ctx) {
	llong  prevId = -1, id = -1;
	llong  logicRowNum = input->m_isDel.size(), newRowNum = 0;
	assert(logicRowNum > 0);
	auto tmpDir = m_segDir + ".tmp";
	valvec<byte> val;
	StoreIteratorPtr iter(input->createStoreIterForward(ctx));
	SortableStrVec valueVec;
	const Schema& valueSchema = m_schema->getColgroupSchema(0);
	std::unique_ptr<DictZipBlobStore::ZipBuilder> builder;
	FixedLenStorePtr store;
	if (valueSchema.should_use_FixedLenStore()) {
		store = new FixedLenStore(tmpDir, valueSchema);
		store->unneedsLock();
	}
	else if (valueSchema.m_dictZipSampleRatio >= 0.0) {
		double sRatio = valueSchema.m_dictZipSampleRatio;
		double avgLen = double(input->dataInflateSize()) / logicRowNum;
		if ((sRatio > FLT_EPSILON) || (sRatio >= 0 && avgLen > 100)) {
			builder = createDictZipBlobStoreBuilder(valueSchema);
		}
	}
	std::mt19937_64 random;
	// A value is sampled when random() < sampleUpperBound.
	// The bound is only needed when a builder exists. It is computed with
	// saturation because converting a negative double, or a double >= 2^64,
	// to uint64_t is undefined behaviour (a ratio of exactly 1.0 would
	// otherwise hit that case, since double(max - min) rounds up to 2^64).
	// (random.max() - random.min()) + 1 may overflow
	// do not +1 to avoid overflow
	uint64_t sampleUpperBound = 0;
	if (builder) {
		const double ratio = valueSchema.m_dictZipSampleRatio;
		if (ratio >= 1.0) {
			sampleUpperBound = random.max();
		}
		else if (ratio > 0.0) {
			const double range = double(random.max() - random.min());
			sampleUpperBound = random.min() + uint64_t(range * ratio);
		}
		// ratio == 0 keeps the bound at 0: no value is sampled in the loop
		// below; emptyCheckProtect() is then called with sampleLenSum == 0.
	}
	size_t sampleLenSum = 0;
	// Set when the iterator yields an id outside [0, logicRowNum); in that
	// case `id` must not be used as an index into m_isDel.
	bool idOverran = false;
	while (iter->increment(&id, &val)) {
		if (id >= logicRowNum) {
			idOverran = true;
			break;
		}
		assert(id >= 0);
		assert(id < logicRowNum);
		assert(prevId < id);
		if (!m_isDel[id]) {
			if (builder) {
				// first pass of the dict-zip path: only collect samples
				if (random() < sampleUpperBound) {
					builder->addSample(val);
					sampleLenSum += val.size();
				}
			}
			else {
				if (store)
					store->append(val, nullptr);
				else
					valueVec.push_back(val);
			}
			newRowNum++;
			// ids in (prevId, id) did not produce a live row: mark them deleted
			m_isDel.beg_end_set1(prevId+1, id);
			prevId = id;
		}
	}
	llong  inputRowNum;
	if (idOverran) {
		// The iterator went past the logical row range: every row after the
		// last live one up to logicRowNum produced no live row.
		m_isDel.beg_end_set1(prevId+1, logicRowNum);
		inputRowNum = logicRowNum;
	}
	else {
		if (prevId != id) {
			// the last yielded row was a deleted one; close the trailing gap
			assert(prevId < id);
			assert(m_isDel[id]);
			m_isDel.beg_end_set1(prevId+1, id);
		}
		inputRowNum = id + 1;
	}
	assert(inputRowNum <= logicRowNum);
	if (inputRowNum < logicRowNum) {
		fprintf(stderr
			, "WARN: DfaDbReadonlySegment::compressSingleColgroup(): realrows=%lld, m_isDel=%lld, some data have lost\n"
			, inputRowNum, logicRowNum);
		// rows that the iterator never reached are treated as deleted in
		// both the input segment and this segment
		input->m_isDel.beg_end_set1(inputRowNum, logicRowNum);
		this->m_isDel.beg_end_set1(inputRowNum, logicRowNum);
	}
	m_delcnt = m_isDel.popcnt(); // recompute delcnt
	assert(newRowNum <= inputRowNum);
	assert(size_t(logicRowNum - newRowNum) == m_delcnt);
	if (builder) {
		assert(valueVec.m_index.size() == 0);
		assert(valueVec.m_strpool.size() == 0);
		iter->reset(); // free resources and seek to begin
		std::lock_guard<std::mutex> lock(DictZip_reduceMemMutex());
		auto fpath = tmpDir / ("colgroup-" + valueSchema.m_name + ".nlt");
		// NOTE(review): presumably guards against an empty sample set before
		// prepare() — confirm against emptyCheckProtect's definition.
		emptyCheckProtect(sampleLenSum, val, *builder);
		builder->prepare(newRowNum, fpath.string());
		prevId = -1;
		// second pass: feed the records themselves to the builder
		while (iter->increment(&id, &val) && id < inputRowNum) {
			for (llong j = prevId+1; j < id; ++j) {
				if (!m_isDel[j]) {
					// j was deleted during compressing
					builder->addRecord(fstring()); // add an empty record
				}
			}
			prevId = id;
			if (!m_isDel[id])
				builder->addRecord(val);
		}
		iter = nullptr;
		m_colgroups[0] = new NestLoudsTrieStore(valueSchema, builder->finish(
            DictZipBlobStore::ZipBuilder::FinishFreeDict | DictZipBlobStore::ZipBuilder::FinishWriteDictFile
        ));
	}
	else if (store) {
		m_colgroups[0] = std::move(store);
	}
	else {
		iter = nullptr;
		m_colgroups[0] = this->buildStore(valueSchema, valueVec);
	}
}