/*
 * Create an LZ4 high-compression (HC) streaming context and hand it to Python.
 *
 * Python-level arguments:
 *   block_size        (int, required)  Size in bytes of each of the two input
 *                                      buffers. Must satisfy
 *                                      0 < block_size <= LZ4_MAX_INPUT_SIZE.
 *   compression_level (int, optional)  Passed to LZ4_resetStreamHC(); defaults
 *                                      to 9.
 *
 * Returns a new, unnamed PyCapsule wrapping a heap-allocated
 * struct compression_stream. On failure returns NULL with a Python exception
 * set: whatever PyArg_ParseTupleAndKeywords raised, ValueError for an
 * out-of-range block size, or MemoryError when an allocation fails.
 *
 * NOTE(review): the capsule is created without a destructor, so this function
 * does not arrange for the stream to be released when the capsule is
 * collected. Presumably a companion function elsewhere frees it; confirm.
 */
static PyObject *create_hc_stream(PyObject *self, PyObject *args, PyObject *keywds)
{
    (void)self;

    int block_size = 0;
    int compression_level = 9;
    static char *kwlist[] = {"block_size", "compression_level", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "i|i", kwlist, &block_size, &compression_level)) {
        return NULL;
    }

    if (block_size > LZ4_MAX_INPUT_SIZE) {
        PyErr_Format(PyExc_ValueError,
                     "block size is %d bytes, which is larger than the maximum supported size of %d bytes",
                     block_size, LZ4_MAX_INPUT_SIZE);
        return NULL;
    }
    if (block_size <= 0) {
        PyErr_Format(PyExc_ValueError, "block size is %d bytes, which is invalid", block_size);
        return NULL;
    }

    struct compression_stream *stream =
        (struct compression_stream *)PyMem_Malloc(sizeof(struct compression_stream));
    if (!stream) {
        return PyErr_NoMemory();
    }

    /* Acquire every sub-resource first; success is checked once below so that
     * a single cleanup path can release whatever was obtained. */
    stream->block_size = block_size;
    stream->input_buffer_index = 0;
    stream->input_buffer[0] = (char *)PyMem_Malloc(block_size);
    stream->input_buffer[1] = (char *)PyMem_Malloc(block_size);
    stream->compressed_buffer_max_size = LZ4_COMPRESSBOUND(block_size);
    stream->compressed_buffer = (char *)PyMem_Malloc(stream->compressed_buffer_max_size);
    stream->stream = LZ4_createStreamHC();

    if (stream->input_buffer[0] && stream->input_buffer[1] && stream->compressed_buffer && stream->stream) {
        LZ4_resetStreamHC(stream->stream, compression_level);

        PyObject *capsule = PyCapsule_New(stream, NULL, NULL);
        if (capsule) {
            return capsule;
        }
        /* PyCapsule_New failed and has already set an exception. Ownership was
         * never transferred, so fall through and release the stream instead of
         * leaking it. */
    } else {
        PyErr_NoMemory();
    }

    /* Shared failure path. PyMem_Free accepts NULL, so only the LZ4 context
     * needs an explicit guard. */
    PyMem_Free(stream->input_buffer[0]);
    PyMem_Free(stream->input_buffer[1]);
    PyMem_Free(stream->compressed_buffer);
    if (stream->stream) {
        LZ4_freeStreamHC(stream->stream);
    }
    PyMem_Free(stream);
    return NULL;
}
void _compress(pmr_vector<T>& values, pmr_vector<pmr_vector<char>>& lz4_blocks, const pmr_vector<char>& dictionary) { /** * Here begins the LZ4 compression. The library provides a function to create a stream. The stream is used with * every new block that is to be compressed, but the stream returns a raw pointer to an internal structure. * The stream memory is freed with another call to a library function after compression is done. */ auto lz4_stream = LZ4_createStreamHC(); // We use the maximum high compression level available in LZ4 for best compression ratios. LZ4_resetStreamHC(lz4_stream, LZ4HC_CLEVEL_MAX); const auto input_size = values.size() * sizeof(T); auto num_blocks = input_size / _block_size; // Only add the last not-full block if the data doesn't perfectly fit into the block size. if (input_size % _block_size != 0) { num_blocks++; } lz4_blocks.reserve(num_blocks); for (auto block_index = size_t{0u}; block_index < num_blocks; ++block_index) { auto decompressed_block_size = _block_size; // The last block's uncompressed size varies. if (block_index + 1 == num_blocks) { decompressed_block_size = input_size - (block_index * _block_size); } // LZ4_compressBound returns an upper bound for the size of the compressed data const auto block_bound = static_cast<size_t>(LZ4_compressBound(static_cast<int>(decompressed_block_size))); auto compressed_block = pmr_vector<char>{values.get_allocator()}; compressed_block.resize(block_bound); /** * If we previously learned a dictionary, we use it to initialize LZ4. Otherwise LZ4 uses the previously * compressed block instead, which would cause the blocks to depend on one another. * If we have no dictionary present and compress at least a second block (i.e., block_index > 0), then we reset * the LZ4 stream to maintain the independence of the blocks. This only happens when the column does not contain * enough data to produce a zstd dictionary (i.e., a column of single character strings). 
*/ if (!dictionary.empty()) { LZ4_loadDictHC(lz4_stream, dictionary.data(), static_cast<int>(dictionary.size())); } else if (block_index) { LZ4_resetStreamHC(lz4_stream, LZ4HC_CLEVEL_MAX); } // The offset in the source data where the current block starts. const auto value_offset = block_index * _block_size; // move pointer to start position and pass to the actual compression method const int compression_result = LZ4_compress_HC_continue( lz4_stream, reinterpret_cast<char*>(values.data()) + value_offset, compressed_block.data(), static_cast<int>(decompressed_block_size), static_cast<int>(block_bound)); Assert(compression_result > 0, "LZ4 stream compression failed"); // shrink the block vector to the actual size of the compressed result compressed_block.resize(static_cast<size_t>(compression_result)); compressed_block.shrink_to_fit(); lz4_blocks.emplace_back(std::move(compressed_block)); } // Finally, release the LZ4 stream memory. LZ4_freeStreamHC(lz4_stream); }