Пример #1
0
/*
 * compress_hc_continue(stream, source) -> bytes
 *
 * Compresses one block of data with LZ4 HC on an existing streaming context
 * and returns the compressed block as a new bytes object.
 *
 * Arguments (positional or keyword):
 *   stream - unnamed PyCapsule wrapping a struct compression_stream.
 *   source - str or bytes-like object holding at most stream->block_size
 *            bytes.
 *
 * The source is first copied into the stream's currently selected input
 * buffer; after a successful compression the selection toggles between the
 * two input buffers, so the block just compressed is not overwritten by the
 * next call (presumably so LZ4 can keep referencing it as history - confirm
 * against the stream setup code).
 *
 * Returns NULL with a Python exception set on failure:
 *   ValueError   - the capsule holds no stream, or source exceeds the
 *                  stream's block size.
 *   RuntimeError - LZ4_compress_HC_continue reported a failure.
 */
static PyObject *compress_hc_continue(PyObject *self, PyObject *args, PyObject *keywds)
{
    (void)self;

    /* All locals are declared up front so the `goto cleanup` paths never
     * jump over an initialisation. */
    PyObject *py_stream = NULL;
    PyObject *result = NULL;
    Py_buffer source;
    struct compression_stream *stream = NULL;
    char *input_pointer = NULL;
    int source_size = 0;
    int compressed_size = 0;

    static char *kwlist[] = {"stream", "source", NULL};

    /* "s*" fills a Py_buffer, which is correct regardless of whether
     * PY_SSIZE_T_CLEAN is defined (unlike "s#" with an int length). The
     * format must describe exactly the two arguments named in kwlist. */
    if (!PyArg_ParseTupleAndKeywords(args, keywds, "Os*", kwlist,
            &py_stream,
            &source))
        return NULL;

    /* From here on the buffer is held and must be released via `cleanup`. */
    stream = (struct compression_stream *)PyCapsule_GetPointer(py_stream, NULL);
    if (!stream) {
        PyErr_Format(PyExc_ValueError, "No stream supplied");
        goto cleanup;
    }

    if (source.len > stream->block_size) {
        PyErr_Format(PyExc_ValueError, "Source data is %zd bytes. It must be less than or equal to the stream's block size, which is %d bytes", source.len, stream->block_size);
        goto cleanup;
    }
    /* Safe narrowing: the length was just checked against block_size. */
    source_size = (int)source.len;

    input_pointer = stream->input_buffer[stream->input_buffer_index];
    /* An empty buffer may expose a NULL pointer; skip the copy in that case. */
    if (source_size > 0)
        memcpy(input_pointer, source.buf, (size_t)source_size);

    compressed_size = LZ4_compress_HC_continue(
            stream->stream,
            input_pointer,
            stream->compressed_buffer,
            source_size,
            stream->compressed_buffer_max_size);
    if (compressed_size <= 0) {
        PyErr_Format(PyExc_RuntimeError, "LZ4_compress_HC_continue failed with code: %d", compressed_size);
        goto cleanup;
    }

    /* Switch to the other input buffer for the next call. */
    stream->input_buffer_index = (stream->input_buffer_index + 1) % 2;

    result = PyBytes_FromStringAndSize(stream->compressed_buffer, compressed_size);

cleanup:
    PyBuffer_Release(&source);
    return result;
}
Пример #2
0
  /**
   * Compresses the raw bytes of `values` with LZ4 HC in chunks of at most `_block_size` uncompressed bytes and appends
   * one compressed block per chunk to `lz4_blocks`.
   *
   * @param values      Source data; its underlying memory is read as a contiguous byte sequence.
   * @param lz4_blocks  Output; receives the compressed blocks in order. Each block is shrunk to its compressed size.
   * @param dictionary  Optional LZ4 dictionary. If non-empty it is loaded before every block; if empty, the stream is
   *                    reset before every block after the first so that blocks do not depend on one another.
   *
   * If `values` is empty, no block is produced.
   */
  void _compress(pmr_vector<T>& values, pmr_vector<pmr_vector<char>>& lz4_blocks, const pmr_vector<char>& dictionary) {
    /**
     * Here begins the LZ4 compression. The library provides a function to create a stream. The stream is used with
     * every new block that is to be compressed, but the stream returns a raw pointer to an internal structure.
     * The stream memory has to be freed with another call to a library function after compression is done.
     */
    auto lz4_stream = LZ4_createStreamHC();

    // Scope guard: releases the LZ4 stream on every exit path, including when an allocation or assertion below throws.
    // LZ4_freeStreamHC accepts a null pointer, so the guard is safe even if stream creation failed.
    using Lz4StreamHandle = decltype(lz4_stream);
    struct Lz4StreamGuard {
      explicit Lz4StreamGuard(Lz4StreamHandle init_stream) : stream{init_stream} {}
      ~Lz4StreamGuard() { LZ4_freeStreamHC(stream); }
      Lz4StreamGuard(const Lz4StreamGuard&) = delete;
      Lz4StreamGuard& operator=(const Lz4StreamGuard&) = delete;
      Lz4StreamHandle stream;
    };
    const Lz4StreamGuard lz4_stream_guard{lz4_stream};

    Assert(lz4_stream != nullptr, "LZ4 stream creation failed");

    // We use the maximum high compression level available in LZ4 for best compression ratios.
    LZ4_resetStreamHC(lz4_stream, LZ4HC_CLEVEL_MAX);

    const auto input_size = values.size() * sizeof(T);
    auto num_blocks = input_size / _block_size;
    // Only add the last not-full block if the data doesn't perfectly fit into the block size.
    if (input_size % _block_size != 0) {
      num_blocks++;
    }
    lz4_blocks.reserve(num_blocks);

    for (auto block_index = size_t{0u}; block_index < num_blocks; ++block_index) {
      auto decompressed_block_size = _block_size;
      // The last block's uncompressed size varies.
      if (block_index + 1 == num_blocks) {
        decompressed_block_size = input_size - (block_index * _block_size);
      }
      // LZ4_compressBound returns an upper bound for the size of the compressed data
      const auto block_bound = static_cast<size_t>(LZ4_compressBound(static_cast<int>(decompressed_block_size)));
      auto compressed_block = pmr_vector<char>{values.get_allocator()};
      compressed_block.resize(block_bound);

      /**
       * If we previously learned a dictionary, we use it to initialize LZ4. Otherwise LZ4 uses the previously
       * compressed block instead, which would cause the blocks to depend on one another.
       * If we have no dictionary present and compress at least a second block (i.e., block_index > 0), then we reset
       * the LZ4 stream to maintain the independence of the blocks. This only happens when the column does not contain
       * enough data to produce a zstd dictionary (i.e., a column of single character strings).
       */
      if (!dictionary.empty()) {
        LZ4_loadDictHC(lz4_stream, dictionary.data(), static_cast<int>(dictionary.size()));
      } else if (block_index) {
        LZ4_resetStreamHC(lz4_stream, LZ4HC_CLEVEL_MAX);
      }

      // The offset in the source data where the current block starts.
      const auto value_offset = block_index * _block_size;
      // move pointer to start position and pass to the actual compression method
      const int compression_result = LZ4_compress_HC_continue(
          lz4_stream, reinterpret_cast<char*>(values.data()) + value_offset, compressed_block.data(),
          static_cast<int>(decompressed_block_size), static_cast<int>(block_bound));

      Assert(compression_result > 0, "LZ4 stream compression failed");

      // shrink the block vector to the actual size of the compressed result
      compressed_block.resize(static_cast<size_t>(compression_result));
      compressed_block.shrink_to_fit();

      lz4_blocks.emplace_back(std::move(compressed_block));
    }

    // The LZ4 stream memory is released by lz4_stream_guard when it goes out of scope.
  }