void IDataType::updateAvgValueSizeHint(const IColumn & column, double & avg_value_size_hint) { /// Update the average value size hint if amount of read rows isn't too small size_t column_size = column.size(); if (column_size > 10) { double current_avg_value_size = static_cast<double>(column.byteSize()) / column_size; /// Heuristic is chosen so that avg_value_size_hint increases rapidly but decreases slowly. if (current_avg_value_size > avg_value_size_hint) avg_value_size_hint = std::min(1024., current_avg_value_size); /// avoid overestimation else if (current_avg_value_size * 2 < avg_value_size_hint) avg_value_size_hint = (current_avg_value_size + avg_value_size_hint * 3) / 4; } }
void NativeBlockInputStream::readData(const IDataType & type, IColumn & column, ReadBuffer & istr, size_t rows) { /** Для массивов требуется сначала десериализовать смещения, а потом значения. */ if (const DataTypeArray * type_arr = typeid_cast<const DataTypeArray *>(&type)) { IColumn & offsets_column = *typeid_cast<ColumnArray &>(column).getOffsetsColumn(); type_arr->getOffsetsType()->deserializeBinary(offsets_column, istr, rows, 0); if (offsets_column.size() != rows) throw Exception("Cannot read all data in NativeBlockInputStream.", ErrorCodes::CANNOT_READ_ALL_DATA); if (rows) readData( *type_arr->getNestedType(), typeid_cast<ColumnArray &>(column).getData(), istr, typeid_cast<const ColumnArray &>(column).getOffsets()[rows - 1]); } else type.deserializeBinary(column, istr, rows, 0); /// TODO Использовать avg_value_size_hint. if (column.size() != rows) throw Exception("Cannot read all data in NativeBlockInputStream.", ErrorCodes::CANNOT_READ_ALL_DATA); }
void IMergedBlockOutputStream::writeData( const String & name, const IDataType & type, const IColumn & column, OffsetColumns & offset_columns, bool skip_offsets) { size_t size = column.size(); size_t prev_mark = 0; while (prev_mark < size) { size_t limit = 0; /// If there is `index_offset`, then the first mark goes not immediately, but after this number of rows. if (prev_mark == 0 && index_offset != 0) limit = index_offset; else { limit = storage.index_granularity; /// Write marks. type.enumerateStreams([&] (const IDataType::SubstreamPath & substream_path) { bool is_offsets = !substream_path.empty() && substream_path.back().type == IDataType::Substream::ArraySizes; if (is_offsets && skip_offsets) return; String stream_name = IDataType::getFileNameForStream(name, substream_path); /// Don't write offsets more than one time for Nested type. if (is_offsets && offset_columns.count(stream_name)) return; ColumnStream & stream = *column_streams[stream_name]; /// There could already be enough data to compress into the new block. if (stream.compressed.offset() >= min_compress_block_size) stream.compressed.next(); writeIntBinary(stream.plain_hashing.count(), stream.marks); writeIntBinary(stream.compressed.offset(), stream.marks); }, {}); } IDataType::OutputStreamGetter stream_getter = [&] (const IDataType::SubstreamPath & substream_path) -> WriteBuffer * { bool is_offsets = !substream_path.empty() && substream_path.back().type == IDataType::Substream::ArraySizes; if (is_offsets && skip_offsets) return nullptr; String stream_name = IDataType::getFileNameForStream(name, substream_path); /// Don't write offsets more than one time for Nested type. 
if (is_offsets && offset_columns.count(stream_name)) return nullptr; return &column_streams[stream_name]->compressed; }; type.serializeBinaryBulkWithMultipleStreams(column, stream_getter, prev_mark, limit, true, {}); /// So that instead of the marks pointing to the end of the compressed block, there were marks pointing to the beginning of the next one. type.enumerateStreams([&] (const IDataType::SubstreamPath & substream_path) { bool is_offsets = !substream_path.empty() && substream_path.back().type == IDataType::Substream::ArraySizes; if (is_offsets && skip_offsets) return; String stream_name = IDataType::getFileNameForStream(name, substream_path); /// Don't write offsets more than one time for Nested type. if (is_offsets && offset_columns.count(stream_name)) return; column_streams[stream_name]->compressed.nextIfAtEnd(); }, {}); prev_mark += limit; } /// Memoize offsets for Nested types, that are already written. They will not be written again for next columns of Nested structure. type.enumerateStreams([&] (const IDataType::SubstreamPath & substream_path) { bool is_offsets = !substream_path.empty() && substream_path.back().type == IDataType::Substream::ArraySizes; if (is_offsets) { String stream_name = IDataType::getFileNameForStream(name, substream_path); offset_columns.insert(stream_name); } }, {}); }
/// Appends one default value to `column` by delegating to `IColumn::insertDefault`.
void IDataType::insertDefaultInto(IColumn & column) const
{
    column.insertDefault();
}