Пример #1
0
/// Builds this read-only store from `data`, taking over its buffers.
///
/// When the schema is variable-length (`schema.getFixedRowLen() == 0`) an
/// array of `rows + 1` 32-bit offsets into the string pool is produced; for
/// non-empty input the storage of `data.m_index` is recycled in place for that
/// array and then detached from `data`. When the schema is fixed-length no
/// offsets are built. In both cases the string pool of `data` is swapped into
/// this store and the fixed row length is recorded.
///
/// @param schema  supplies the fixed row length (0 selects the variable-length path).
/// @param data    source rows; its string pool is swapped out, and on the
///                non-empty variable-length path its index storage is consumed.
/// Raises length_error through THROW_STD when the string pool is too large to
/// be addressed with 32-bit offsets.
void MockReadonlyStore::build(const Schema& schema, SortableStrVec& data) {
	size_t fixlen = schema.getFixedRowLen();
	if (0 == fixlen) {
		if (data.str_size() >= UINT32_MAX) {
			THROW_STD(length_error,
				"data.str_size=%lld is too large", llong(data.str_size()));
		}
		size_t rows = data.m_index.size();
		if (0 == rows) {
			// An empty index may own no storage at all, so there is nothing to
			// recycle and data.m_index.data() must not be written through.
			// Allocate the single terminating offset via the vector itself.
			m_rows.offsets.resize_no_init(1);
			m_rows.offsets[0] = uint32_t(data.str_size());
		}
		else {
			// Each SEntry occupies three 32-bit words, so `rows` entries have
			// room for the `rows + 1` offsets written below (rows >= 1 here).
			BOOST_STATIC_ASSERT(sizeof(SortableStrVec::SEntry) == 4*3);
			// reuse memory of data.m_index
			auto offsets = (uint32_t*)data.m_index.data();
			for (size_t i = 0; i < rows; ++i) {
				// Read the entry before storing: offsets[i] lands at byte 4*i,
				// which never lies beyond entry i (byte 12*i), so entries that
				// are still unread are never clobbered.
				uint32_t offset = uint32_t(data.m_index[i].offset);
				offsets[i] = offset;
			}
			// Terminating offset; the size check above guarantees it fits.
			offsets[rows] = uint32_t(data.str_size());
			m_rows.offsets.risk_set_data(offsets);
			m_rows.offsets.risk_set_size(rows + 1);
			m_rows.offsets.risk_set_capacity(3 * rows);
			m_rows.offsets.shrink_to_fit();
			// The buffer now belongs to m_rows.offsets.
			data.m_index.risk_release_ownership();
		}
	#if !defined(NDEBUG)
		assert(data.m_strpool.size() == m_rows.offsets.back());
		for (size_t i = 0; i < rows; ++i) {
			assert(m_rows.offsets[i] < m_rows.offsets[i+1]);
		}
	#endif
	}
	m_rows.strpool.swap((valvec<char>&)data.m_strpool);
	m_fixedLen = fixlen;
}
Пример #2
0
/// Rebuilds `strVec.m_index` so that the string pool is described as
/// consecutive records of exactly `fixlen` bytes each.
///
/// Entry `i` receives seq_id `i`, length `fixlen` and offset `fixlen * i`.
///
/// @param strVec  vector whose index is overwritten; its pool size must be a
///                whole multiple of `fixlen`.
/// @param fixlen  record length in bytes; must be non-zero.
static void patchStrVec(SortableStrVec& strVec, size_t fixlen) {
	// Check the preconditions before dividing: the pool must split evenly into
	// fixlen-sized records. (Testing against the row count instead would
	// divide by zero for an empty pool and would accept uneven pools.)
	assert(fixlen > 0);
	assert(strVec.str_size() % fixlen == 0);
	const size_t rows = strVec.str_size() / fixlen;
	strVec.m_index.resize_no_init(rows);
	for (size_t i = 0; i < rows; ++i) {
		strVec.m_index[i].seq_id = i;
		strVec.m_index[i].length = fixlen;
		strVec.m_index[i].offset = fixlen * i;
	}
}
Пример #3
0
/// Builds this read-only index from `keys`, taking over its buffers.
///
/// `m_ids` is filled with the row ids 0..rows-1 and sorted by key using
/// `m_schema->compareData`, with equal keys ordered by ascending id.
///
/// Fixed-length schema: the pool is treated as back-to-back records of
/// `fixlen` bytes and `keys.m_index` is expected to be empty.
/// Variable-length schema: an array of `rows + 1` 32-bit offsets is produced;
/// for non-empty input the storage of `keys.m_index` is recycled in place for
/// it and then detached from `keys`.
///
/// In both cases the string pool of `keys` is swapped into this index.
///
/// @param keys  source keys; consumed as described above.
/// Raises length_error through THROW_STD when a variable-length pool is too
/// large to be addressed with 32-bit offsets.
void
MockReadonlyIndex::build(SortableStrVec& keys) {
	const Schema* schema = m_schema;
	const byte* base = keys.m_strpool.data();
	size_t fixlen = schema->getFixedRowLen();
	if (fixlen) {
		assert(keys.m_index.size() == 0);
		assert(keys.str_size() % fixlen == 0);
		m_ids.resize_no_init(keys.str_size() / fixlen);
		for (size_t i = 0; i < m_ids.size(); ++i) m_ids[i] = i;
		std::sort(m_ids.begin(), m_ids.end(), [=](size_t x, size_t y) {
			fstring xs(base + fixlen * x, fixlen);
			fstring ys(base + fixlen * y, fixlen);
			int r = schema->compareData(xs, ys);
			// Equal keys fall back to id order, making the result deterministic.
			if (r) return r < 0;
			else   return x < y;
		});
	}
	else {
		if (keys.str_size() >= UINT32_MAX) {
			THROW_STD(length_error,
				"keys.str_size=%lld is too large", llong(keys.str_size()));
		}
		size_t rows = keys.m_index.size();
		m_ids.resize_no_init(rows);
		for (size_t i = 0; i < rows; ++i) m_ids[i] = i;
		if (0 == rows) {
			// An empty index may own no storage at all, so there is nothing to
			// recycle and keys.m_index.data() must not be written through.
			// Allocate the single terminating offset via the vector itself.
			m_keys.offsets.resize_no_init(1);
			m_keys.offsets[0] = uint32_t(keys.str_size());
		}
		else {
			// Each SEntry occupies three 32-bit words, so `rows` entries have
			// room for the `rows + 1` offsets written below (rows >= 1 here).
			BOOST_STATIC_ASSERT(sizeof(SortableStrVec::SEntry) == 4*3);
			// reuse memory of keys.m_index
			auto offsets = (uint32_t*)keys.m_index.data();
			for (size_t i = 0; i < rows; ++i) {
				// Read the entry before storing: offsets[i] lands at byte 4*i,
				// which never lies beyond entry i (byte 12*i), so entries that
				// are still unread are never clobbered.
				uint32_t offset = uint32_t(keys.m_index[i].offset);
				offsets[i] = offset;
			}
			// Terminating offset; the size check above guarantees it fits.
			offsets[rows] = uint32_t(keys.str_size());
			std::sort(m_ids.begin(), m_ids.end(), [=](size_t x, size_t y) {
				size_t xoff0 = offsets[x], xoff1 = offsets[x+1];
				size_t yoff0 = offsets[y], yoff1 = offsets[y+1];
				fstring xs(base + xoff0, xoff1 - xoff0);
				fstring ys(base + yoff0, yoff1 - yoff0);
				int r = schema->compareData(xs, ys);
				// Equal keys fall back to id order, making the result deterministic.
				if (r) return r < 0;
				else   return x < y;
			});
			m_keys.offsets.risk_set_data(offsets);
			m_keys.offsets.risk_set_size(rows + 1);
			m_keys.offsets.risk_set_capacity(3 * rows);
			m_keys.offsets.shrink_to_fit();
			// The buffer now belongs to m_keys.offsets.
			keys.m_index.risk_release_ownership();
		}
	}
	m_keys.strpool.swap((valvec<char>&)keys.m_strpool);
	m_fixedLen = fixlen;
}
Пример #4
0
/// Compresses column group 0 of `input` into `m_colgroups[0]` of this segment.
///
/// Rows are read through a forward store iterator. Rows flagged in this
/// segment's `m_isDel` are skipped, and id gaps in the iteration are flagged
/// as deleted. Depending on the column group schema the values go to one of:
///  - a FixedLenStore, appended row by row;
///  - a DictZipBlobStore builder, which needs two passes over the input
///    (first collecting samples, then adding the records);
///  - a SortableStrVec that is handed to buildStore().
/// `m_delcnt` is recomputed from `m_isDel` along the way.
///
/// @param input  segment to read; its `m_isDel` is extended when the iterator
///               yields fewer rows than `m_isDel.size()`.
/// @param ctx    context passed to `createStoreIterForward`.
void
DfaDbReadonlySegment::compressSingleColgroup(ReadableSegment* input, DbContext* ctx) {
	llong  prevId = -1, id = -1;
	llong  logicRowNum = input->m_isDel.size(), newRowNum = 0;
	assert(logicRowNum > 0);
	auto tmpDir = m_segDir + ".tmp";
	valvec<byte> val;
	StoreIteratorPtr iter(input->createStoreIterForward(ctx));
	SortableStrVec valueVec;
	const Schema& valueSchema = m_schema->getColgroupSchema(0);
	std::unique_ptr<DictZipBlobStore::ZipBuilder> builder;
	FixedLenStorePtr store;
	if (valueSchema.should_use_FixedLenStore()) {
		store = new FixedLenStore(tmpDir, valueSchema);
		store->unneedsLock();
	}
	else if (valueSchema.m_dictZipSampleRatio >= 0.0) {
		double sRatio = valueSchema.m_dictZipSampleRatio;
		double avgLen = double(input->dataInflateSize()) / logicRowNum;
		if ((sRatio > FLT_EPSILON) || (sRatio >= 0 && avgLen > 100)) {
			builder = createDictZipBlobStoreBuilder(valueSchema);
		}
	}
	std::mt19937_64 random;
	// (random.max() - random.min()) + 1 may overflow
	// do not +1 to avoid overflow
	//
	// The scaled bound is computed in floating point and clamped before it is
	// converted back: a ratio >= 1 scales to 2^64 or more and a negative ratio
	// scales below zero, and converting such a value to uint64_t is undefined.
	const double scaledBound = double(random.min()) +
		double(random.max() - random.min()) * valueSchema.m_dictZipSampleRatio;
	uint64_t sampleUpperBound = 0; // also the result for a NaN ratio
	if (scaledBound >= double(random.max())) {
		sampleUpperBound = random.max();
	}
	else if (scaledBound > 0) {
		sampleUpperBound = uint64_t(scaledBound);
	}
	size_t sampleLenSum = 0;
	// First pass: count live rows, flag id gaps as deleted, and either collect
	// samples (builder) or store the values right away (store / valueVec).
	while (iter->increment(&id, &val) && id < logicRowNum) {
		assert(id >= 0);
		assert(id < logicRowNum);
		assert(prevId < id);
		if (!m_isDel[id]) {
			if (builder) {
				if (random() < sampleUpperBound) {
					builder->addSample(val);
					sampleLenSum += val.size();
				}
			}
			else {
				if (store)
					store->append(val, NULL);
				else
					valueVec.push_back(val);
			}
			newRowNum++;
			m_isDel.beg_end_set1(prevId+1, id);
			prevId = id;
		}
	}
	// NOTE(review): if the loop above stopped because id >= logicRowNum, id is
	// out of range for m_isDel below — confirm iterators never yield such ids.
	if (prevId != id) {
		assert(prevId < id);
		assert(m_isDel[id]);
		m_isDel.beg_end_set1(prevId+1, id);
	}
	llong  inputRowNum = id + 1;
	assert(inputRowNum <= logicRowNum);
	if (inputRowNum < logicRowNum) {
		fprintf(stderr
			, "WARN: DfaDbReadonlySegment::compressSingleColgroup(): realrows=%lld, m_isDel=%lld, some data have lost\n"
			, inputRowNum, logicRowNum);
		input->m_isDel.beg_end_set1(inputRowNum, logicRowNum);
		this->m_isDel.beg_end_set1(inputRowNum, logicRowNum);
	}
	m_delcnt = m_isDel.popcnt(); // recompute delcnt
	assert(newRowNum <= inputRowNum);
	assert(size_t(logicRowNum - newRowNum) == m_delcnt);
	if (builder) {
		assert(valueVec.m_index.size() == 0);
		assert(valueVec.m_strpool.size() == 0);
		iter->reset(); // free resources and seek to begin
		// Held until the end of this branch, i.e. across prepare/add/finish.
		std::lock_guard<std::mutex> lock(DictZip_reduceMemMutex());
		auto fpath = tmpDir / ("colgroup-" + valueSchema.m_name + ".nlt");
		emptyCheckProtect(sampleLenSum, val, *builder);
		builder->prepare(newRowNum, fpath.string());
		prevId = -1;
		// Second pass: feed the records themselves to the builder.
		while (iter->increment(&id, &val) && id < inputRowNum) {
			for (llong j = prevId+1; j < id; ++j) {
				if (!m_isDel[j]) {
					// j was deleted during compressing
					builder->addRecord(fstring()); // add an empty record
				}
			}
			prevId = id;
			if (!m_isDel[id])
				builder->addRecord(val);
		}
		iter = nullptr;
		m_colgroups[0] = new NestLoudsTrieStore(valueSchema, builder->finish(
            DictZipBlobStore::ZipBuilder::FinishFreeDict | DictZipBlobStore::ZipBuilder::FinishWriteDictFile
        ));
	}
	else if (store) {
		m_colgroups[0] = std::move(store);
	}
	else {
		iter = nullptr;
		m_colgroups[0] = this->buildStore(valueSchema, valueVec);
	}
}