Exemplo n.º 1
0
  // Compresses `vector` using the compression scheme supplied by the test
  // parameter and sanity-checks that the encoded result reports the same
  // element count as the input before handing it back to the caller.
  std::unique_ptr<const BaseCompressedVector> encode(const pmr_vector<uint32_t>& vector) {
    const auto parameterized_type = GetParam();

    auto compressed = compress_vector(vector, parameterized_type, {}, {max()});
    EXPECT_EQ(compressed->size(), vector.size());

    return compressed;
  }
Exemplo n.º 2
0
/*
 * Benchmark: categorize 100 random documents against 1000 random topics
 * (280-dimensional, 8-bit compressed vectors) and compare two iteration
 * orders over the same 100x1000 dot-product workload:
 *   - "convectis style": outer loop over documents (by vector)
 *   - "matrix style":    outer loop over topics    (by matrix)
 * Prints the wall-clock time for each ordering. Returns 0 on success,
 * 1 if an allocation fails.
 */
int main(int argc, char *argv[])
{
    int i, j;
    unsigned char *z;
    unsigned char r[100*1000];

    double drand48();

    unsigned char **threaded_topics, **threaded_docs;

    float t0, t1;
    float tmp[280];

    (void)argc;  /* command-line arguments are unused */
    (void)argv;

    init_compand();

    /* Build 100 random unit documents: fill with Gaussian noise, normalize,
       then compress to one byte per component. Each new row is swapped with
       a random earlier row (inside-out Fisher-Yates) so memory order does
       not correlate with creation order. */
    threaded_docs = calloc(100, sizeof(threaded_docs[0]));
    if (threaded_docs == NULL) {
	fprintf(stderr, "out of memory\n");
	return 1;
    }
    for (i=0;i<100;i++) {
	threaded_docs[i] = calloc(280, sizeof(threaded_docs[0][0]));
	if (threaded_docs[i] == NULL) {
	    fprintf(stderr, "out of memory\n");
	    return 1;
	}
	for (j=0;j<280;j++) {
	    tmp[j] = gauss_dev(0, 1);
	}
	scv_normalize(tmp);
	compress_vector(threaded_docs[i], tmp, 280);
	j = drand48() * (i+1);
	z = threaded_docs[i];
	threaded_docs[i] = threaded_docs[j];
	threaded_docs[j] = z;
    }

    /* Same construction for the 1000 random topics. */
    threaded_topics = calloc(1000, sizeof(threaded_topics[0]));
    if (threaded_topics == NULL) {
	fprintf(stderr, "out of memory\n");
	return 1;
    }
    for (i=0;i<1000;i++) {
	threaded_topics[i] = calloc(280, sizeof(threaded_topics[0][0]));
	if (threaded_topics[i] == NULL) {
	    fprintf(stderr, "out of memory\n");
	    return 1;
	}
	for (j=0;j<280;j++) {
	    tmp[j] = gauss_dev(0, 1);
	}
	scv_normalize(tmp);
	compress_vector(threaded_topics[i], tmp, 280);
	j = drand48() * (i+1);
	z = threaded_topics[i];
	threaded_topics[i] = threaded_topics[j];
	threaded_topics[j] = z;
    }

    /* convectis style threaded matrix 
	by vector
	by matrix
	*/
    t0 = millitime();
    for (i=0;i<100;i++) {
	for (j=0;j<1000;j++) {
	    r[i*1000+j] = dot8(threaded_topics[j], threaded_docs[i], 280);
	}
    }
    t1 = millitime();
    printf("%.3f seconds for %d convectis style categorizations\n",
	   t1-t0, 100);

    t0 = millitime();
    for (j=0;j<1000;j++) {
	for (i=0;i<100;i++) {
	    r[i*1000+j] = dot8(threaded_topics[j], threaded_docs[i], 280);
	}
    }
    t1 = millitime();
    printf("%.3f seconds for %d convectis matrix categorizations\n",
	   t1-t0, 100);

    /* Release the benchmark fixtures (not strictly required before exit,
       but keeps leak checkers quiet). */
    for (i=0;i<100;i++) {
	free(threaded_docs[i]);
    }
    free(threaded_docs);
    for (i=0;i<1000;i++) {
	free(threaded_topics[i]);
    }
    free(threaded_topics);

    return 0;
}
Exemplo n.º 3
0
  /**
   * Encodes a string segment with LZ4: concatenates all non-null row values
   * into one char buffer, block-compresses it (optionally with a pre-trained
   * zstd dictionary), and stores per-row character offsets (vector-compressed)
   * plus an optional null-value vector.
   */
  std::shared_ptr<BaseEncodedSegment> _on_encode(const AnySegmentIterable<pmr_string> segment_iterable,
                                                 const PolymorphicAllocator<pmr_string>& allocator) {
    /**
     * First iterate over the values for two reasons.
     * 1) If all the strings are empty LZ4 will try to compress an empty vector which will cause a segmentation fault.
     *    In this case we can and need to do an early exit.
     * 2) Sum the length of the strings to improve the performance when copying the data to the char vector.
     */
    auto num_chars = size_t{0u};
    segment_iterable.with_iterators([&](auto it, auto end) {
      for (; it != end; ++it) {
        if (!it->is_null()) {
          num_chars += it->value().size();
        }
      }
    });

    // copy values and null flags from value segment
    auto values = pmr_vector<char>{allocator};
    values.reserve(num_chars);
    auto null_values = pmr_vector<bool>{allocator};

    /**
     * If the null value vector only contains the value false, then the value segment does not have any row value that
     * is null. In that case, we don't store the null value vector to reduce the LZ4 segment's memory footprint.
     */
    auto segment_contains_null = false;

    /**
     * These offsets mark the beginning of strings (and therefore end of the previous string) in the data vector.
     * These offsets are character offsets. The string at position 0 starts at the offset stored at position 0, which
     * will always be 0.
     * Its exclusive end is the offset stored at position 1 (i.e., offsets[1] - 1 is the last character of the string
     * at position 0).
     * In case of the last string its end is determined by the end of the data vector.
     *
     * The offsets are stored as 32 bit unsigned integer as opposed to 64 bit (size_t) so that they can later be
     * compressed via vector compression.
     */
    auto offsets = pmr_vector<uint32_t>{allocator};

    /**
     * These are the lengths of each string. They are needed to train the zstd dictionary.
     */
    auto string_samples_lengths = pmr_vector<size_t>{allocator};

    segment_iterable.with_iterators([&](auto it, auto end) {
      const auto segment_size = std::distance(it, end);

      null_values.resize(segment_size);
      offsets.resize(segment_size);
      string_samples_lengths.resize(segment_size);

      auto offset = uint32_t{0u};
      // iterate over the iterator to access the values and increment the row index to write to the values and null
      // values vectors
      auto row_index = size_t{0};
      for (; it != end; ++it) {
        const auto segment_element = *it;
        const auto contains_null = segment_element.is_null();
        null_values[row_index] = contains_null;
        segment_contains_null = segment_contains_null || contains_null;
        offsets[row_index] = offset;
        auto sample_size = size_t{0u};
        if (!contains_null) {
          const auto value = segment_element.value();
          const auto string_length = value.size();
          values.insert(values.cend(), value.begin(), value.end());
          Assert(string_length <= std::numeric_limits<uint32_t>::max(),
                 "The size of string row value exceeds the maximum of uint32 in LZ4 encoding.");
          offset += static_cast<uint32_t>(string_length);
          sample_size = string_length;
        }

        string_samples_lengths[row_index] = sample_size;
        ++row_index;
      }
    });

    // Capture the row count before null_values is (potentially) moved below; both return paths need it.
    const auto num_rows = null_values.size();

    // Move instead of copy: null_values is not used afterwards (only num_rows is), so avoid duplicating the vector.
    auto optional_null_values =
        segment_contains_null ? std::optional<pmr_vector<bool>>{std::move(null_values)} : std::nullopt;

    /**
     * If the input only contained null values and/or empty strings we don't need to compress anything (and LZ4 will
     * cause an error). We can also throw away the offsets, since they won't be used for decompression.
     * We can do an early exit and return the (not encoded) segment.
     */
    if (num_chars == 0) {
      auto empty_blocks = pmr_vector<pmr_vector<char>>{allocator};
      auto empty_dictionary = pmr_vector<char>{};
      return std::allocate_shared<LZ4Segment<pmr_string>>(allocator, std::move(empty_blocks),
                                                          std::move(optional_null_values), std::move(empty_dictionary),
                                                          nullptr, _block_size, 0u, 0u, num_rows);
    }

    // Compress the offsets with a vector compression method to reduce the memory footprint of the LZ4 segment.
    auto compressed_offsets = compress_vector(offsets, vector_compression_type(), allocator, {offsets.back()});

    /**
     * Pre-compute a zstd dictionary if the input data is split among multiple blocks. This dictionary allows
     * independent compression of the blocks, while maintaining a good compression ratio.
     * If the input data fits into a single block, training of a dictionary is skipped.
     */
    const auto input_size = values.size();
    auto dictionary = pmr_vector<char>{allocator};
    if (input_size > _block_size) {
      dictionary = _train_dictionary(values, string_samples_lengths);
    }

    /**
     * Compress the data and calculate the last block size (which may vary from the block size of the previous blocks)
     * and the total compressed size. The size of the last block is needed for decompression. The total compressed size
     * is pre-calculated instead of iterating over all blocks when the memory consumption of the LZ4 segment is
     * estimated.
     */
    auto lz4_blocks = pmr_vector<pmr_vector<char>>{allocator};
    _compress(values, lz4_blocks, dictionary);

    auto last_block_size = input_size % _block_size != 0 ? input_size % _block_size : _block_size;

    auto total_compressed_size = size_t{0u};
    for (const auto& compressed_block : lz4_blocks) {
      total_compressed_size += compressed_block.size();
    }

    return std::allocate_shared<LZ4Segment<pmr_string>>(
        allocator, std::move(lz4_blocks), std::move(optional_null_values), std::move(dictionary),
        std::move(compressed_offsets), _block_size, last_block_size, total_compressed_size, num_rows);
  }