/**
 * Create an LZ4 high-compression stream and hand it to Python as a capsule.
 *
 * Python-level arguments:
 *   block_size         (int, required) size in bytes of each of the two input
 *                      buffers; must satisfy 0 < block_size <= LZ4_MAX_INPUT_SIZE.
 *   compression_level  (int, optional, default 9) forwarded unchanged to
 *                      LZ4_resetStreamHC().
 *
 * Returns a new capsule wrapping a heap-allocated struct compression_stream,
 * or NULL with a Python exception set:
 *   - whatever PyArg_ParseTupleAndKeywords raised for bad arguments,
 *   - ValueError when block_size is out of range,
 *   - MemoryError when any buffer or the LZ4 state cannot be allocated,
 *   - whatever PyCapsule_New raised if the capsule itself cannot be created.
 *
 * NOTE(review): the capsule is created with no name and no destructor, so
 * nothing here releases the stream when the capsule is garbage collected;
 * presumably a separate function frees it -- confirm against the rest of
 * the module.
 */
static PyObject *create_hc_stream(PyObject *self, PyObject *args, PyObject *keywds)
{
    (void)self;

    int block_size = 0;
    int compression_level = 9;

    static char *kwlist[] = {"block_size", "compression_level", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "i|i", kwlist,
        &block_size,
        &compression_level)) {
        return NULL;
    }

    if (block_size > LZ4_MAX_INPUT_SIZE) {
        PyErr_Format(PyExc_ValueError, "block size is %d bytes, which is larger than the maximum supported size of %d bytes", block_size, LZ4_MAX_INPUT_SIZE);
        return NULL;
    }
    if (block_size <= 0) {
        PyErr_Format(PyExc_ValueError, "block size is %d bytes, which is invalid", block_size);
        return NULL;
    }

    struct compression_stream *stream = (struct compression_stream *)PyMem_Malloc(sizeof(struct compression_stream));
    if (!stream) {
        return PyErr_NoMemory();
    }

    /* Attempt every allocation first and check them all at once below, so a
     * single cleanup path can handle any combination of failures. */
    stream->block_size = block_size;
    stream->input_buffer_index = 0;
    stream->input_buffer[0] = (char *)PyMem_Malloc(block_size);
    stream->input_buffer[1] = (char *)PyMem_Malloc(block_size);
    stream->compressed_buffer_max_size = LZ4_COMPRESSBOUND(block_size);
    stream->compressed_buffer = (char *)PyMem_Malloc(stream->compressed_buffer_max_size);
    stream->stream = LZ4_createStreamHC();

    const int allocated = stream->input_buffer[0] &&
                          stream->input_buffer[1] &&
                          stream->compressed_buffer &&
                          stream->stream;

    if (allocated) {
        LZ4_resetStreamHC(stream->stream, compression_level);

        PyObject *capsule = PyCapsule_New(stream, NULL, NULL);
        if (capsule) {
            return capsule;
        }
        /* PyCapsule_New failed and already set an exception; fall through so
         * the stream and its buffers are released instead of leaked. */
    }

    /* Failure path. PyMem_Free accepts NULL, so the buffers need no guard. */
    PyMem_Free(stream->input_buffer[0]);
    PyMem_Free(stream->input_buffer[1]);
    PyMem_Free(stream->compressed_buffer);
    if (stream->stream) {
        LZ4_freeStreamHC(stream->stream);
    }
    PyMem_Free(stream);

    if (!allocated) {
        return PyErr_NoMemory();
    }
    /* Keep the exception raised by PyCapsule_New. */
    return NULL;
}
Example #2
0
  /**
   * Compresses the raw bytes of `values` into a sequence of LZ4 HC blocks.
   *
   * The input is split into chunks of `_block_size` bytes (the last chunk may be shorter). Each chunk is compressed
   * separately at the maximum HC level and appended to `lz4_blocks` as its own, exactly-sized vector. An empty input
   * produces no blocks.
   *
   * @param values      The values to compress; only read, and treated as a flat byte buffer of
   *                    values.size() * sizeof(T) bytes. Its allocator is reused for the compressed blocks.
   * @param lz4_blocks  Output: one compressed block is appended per input chunk.
   * @param dictionary  Optional dictionary. If non-empty it is loaded into the stream before every block; if empty,
   *                    the stream is reset before every block after the first.
   */
  void _compress(pmr_vector<T>& values, pmr_vector<pmr_vector<char>>& lz4_blocks, const pmr_vector<char>& dictionary) {
    /**
     * Here begins the LZ4 compression. The library provides a function to create a stream. The stream is used with
     * every new block that is to be compressed, but the library returns a raw pointer to an internal structure that
     * has to be released with another library call.
     */
    auto* const lz4_stream = LZ4_createStreamHC();
    Assert(lz4_stream != nullptr, "LZ4 stream allocation failed");

    // Ties the release of the stream to scope exit, so it is also freed when an allocation or assertion below throws.
    struct StreamGuard {
      explicit StreamGuard(LZ4_streamHC_t* init_stream) : stream{init_stream} {}
      ~StreamGuard() { LZ4_freeStreamHC(stream); }
      StreamGuard(const StreamGuard&) = delete;
      StreamGuard& operator=(const StreamGuard&) = delete;

      LZ4_streamHC_t* stream;
    };
    const StreamGuard stream_guard{lz4_stream};

    // We use the maximum high compression level available in LZ4 for best compression ratios.
    LZ4_resetStreamHC(lz4_stream, LZ4HC_CLEVEL_MAX);

    const auto input_size = values.size() * sizeof(T);
    auto num_blocks = input_size / _block_size;
    // Only add the last not-full block if the data doesn't perfectly fit into the block size.
    if (input_size % _block_size != 0) {
      num_blocks++;
    }
    lz4_blocks.reserve(num_blocks);

    // The input is only read, so view it as constant bytes.
    const auto* const input_bytes = reinterpret_cast<const char*>(values.data());

    for (auto block_index = size_t{0u}; block_index < num_blocks; ++block_index) {
      // The offset in the source data where the current block starts.
      const auto value_offset = block_index * _block_size;

      auto decompressed_block_size = _block_size;
      // The last block's uncompressed size varies.
      if (block_index + 1 == num_blocks) {
        decompressed_block_size = input_size - value_offset;
      }
      // LZ4_compressBound returns an upper bound for the size of the compressed data
      const auto block_bound = static_cast<size_t>(LZ4_compressBound(static_cast<int>(decompressed_block_size)));
      auto compressed_block = pmr_vector<char>{values.get_allocator()};
      compressed_block.resize(block_bound);

      /**
       * If we previously learned a dictionary, we use it to initialize LZ4. Otherwise LZ4 uses the previously
       * compressed block instead, which would cause the blocks to depend on one another.
       * If we have no dictionary present and compress at least a second block (i.e., block_index > 0), then we reset
       * the LZ4 stream to maintain the independence of the blocks. This only happens when the column does not contain
       * enough data to produce a zstd dictionary (i.e., a column of single character strings).
       */
      if (!dictionary.empty()) {
        LZ4_loadDictHC(lz4_stream, dictionary.data(), static_cast<int>(dictionary.size()));
      } else if (block_index) {
        LZ4_resetStreamHC(lz4_stream, LZ4HC_CLEVEL_MAX);
      }

      // Move the pointer to the start position and pass it to the actual compression method.
      const int compression_result =
          LZ4_compress_HC_continue(lz4_stream, input_bytes + value_offset, compressed_block.data(),
                                   static_cast<int>(decompressed_block_size), static_cast<int>(block_bound));

      Assert(compression_result > 0, "LZ4 stream compression failed");

      // Shrink the block vector to the actual size of the compressed result.
      compressed_block.resize(static_cast<size_t>(compression_result));
      compressed_block.shrink_to_fit();

      lz4_blocks.emplace_back(std::move(compressed_block));
    }

    // The LZ4 stream memory is released by stream_guard when this scope ends.
  }