MergeTreeSequentialBlockInputStream::MergeTreeSequentialBlockInputStream(
    const MergeTreeData & storage_,
    const MergeTreeData::DataPartPtr & data_part_,
    Names columns_to_read_,
    bool read_with_direct_io_,
    bool take_column_types_from_storage,
    bool quiet)
    : storage(storage_)
    , data_part(data_part_)
    , part_columns_lock(data_part->columns_lock)
    , columns_to_read(columns_to_read_)
    , read_with_direct_io(read_with_direct_io_)
    , mark_cache(storage.global_context.getMarkCache())
{
    if (!quiet)
    {
        std::stringstream message;
        message << "Reading " << data_part->marks_count << " marks from part " << data_part->name
            << ", total " << data_part->rows_count
            << " rows starting from the beginning of the part, columns: ";
        for (size_t i = 0, size = columns_to_read.size(); i < size; ++i)
            message << (i == 0 ? "" : ", ") << columns_to_read[i];

        LOG_TRACE(log, message.rdbuf());
    }

    addTotalRowsApprox(data_part->rows_count);

    header = storage.getSampleBlockForColumns(columns_to_read);
    fixHeader(header);

    /// Add columns because we don't want to read empty blocks
    injectRequiredColumns(storage, data_part, columns_to_read);
    NamesAndTypesList columns_for_reader;
    if (take_column_types_from_storage)
    {
        const NamesAndTypesList & physical_columns = storage.getColumns().getAllPhysical();
        columns_for_reader = physical_columns.addTypes(columns_to_read);
    }
    else
    {
        /// take columns from data_part
        columns_for_reader = data_part->columns.addTypes(columns_to_read);
    }

    reader = std::make_unique<MergeTreeReader>(
        data_part->getFullPath(), data_part, columns_for_reader, /* uncompressed_cache = */ nullptr,
        mark_cache.get(), /* save_marks_in_cache = */ false, storage,
        MarkRanges{MarkRange(0, data_part->marks_count)},
        /* bytes to use AIO (this is hack) */
        read_with_direct_io ? 1UL : std::numeric_limits<size_t>::max(),
        DBMS_DEFAULT_BUFFER_SIZE);
}
Пример #2
0
std::vector<std::size_t> MergeTreeReadPool::fillPerPartInfo(
	RangesInDataParts & parts, const ExpressionActionsPtr & prewhere_actions, const String & prewhere_column_name,
	const bool check_columns)
{
	std::vector<std::size_t> per_part_sum_marks;

	for (const auto i : ext::range(0, parts.size()))
	{
		auto & part = parts[i];

		/// Read marks for every data part.
		size_t sum_marks = 0;
		/// Ranges are in right-to-left order, due to 'reverse' in MergeTreeDataSelectExecutor.
		for (const auto & range : part.ranges)
			sum_marks += range.end - range.begin;

		per_part_sum_marks.push_back(sum_marks);

		per_part_columns_lock.push_back(std::make_unique<Poco::ScopedReadRWLock>(
			part.data_part->columns_lock));

		/// inject column names required for DEFAULT evaluation in current part
		auto required_column_names = column_names;

		const auto injected_columns = injectRequiredColumns(part.data_part, required_column_names);
		auto should_reoder = !injected_columns.empty();

		Names required_pre_column_names;

		if (prewhere_actions)
		{
			/// collect columns required for PREWHERE evaluation
			required_pre_column_names = prewhere_actions->getRequiredColumns();

			/// there must be at least one column required for PREWHERE
			if (required_pre_column_names.empty())
				required_pre_column_names.push_back(required_column_names[0]);

			/// PREWHERE columns may require some additional columns for DEFAULT evaluation
			const auto injected_pre_columns = injectRequiredColumns(part.data_part, required_pre_column_names);
			if (!injected_pre_columns.empty())
				should_reoder = true;

			/// will be used to distinguish between PREWHERE and WHERE columns when applying filter
			const NameSet pre_name_set{
				std::begin(required_pre_column_names), std::end(required_pre_column_names)
			};
			/** If expression in PREWHERE is not table column, then no need to return column with it to caller
				*	(because storage is expected only to read table columns).
				*/
			per_part_remove_prewhere_column.push_back(0 == pre_name_set.count(prewhere_column_name));

			Names post_column_names;
			for (const auto & name : required_column_names)
				if (!pre_name_set.count(name))
					post_column_names.push_back(name);

			required_column_names = post_column_names;
		}
		else
			per_part_remove_prewhere_column.push_back(false);

		per_part_column_name_set.emplace_back(std::begin(required_column_names), std::end(required_column_names));

		if (check_columns)
		{
			/** Under part->columns_lock check that all requested columns in part are of same type that in table.
				*	This could be violated during ALTER MODIFY.
				*/
			if (!required_pre_column_names.empty())
				data.check(part.data_part->columns, required_pre_column_names);
			if (!required_column_names.empty())
				data.check(part.data_part->columns, required_column_names);

			per_part_pre_columns.push_back(data.getColumnsList().addTypes(required_pre_column_names));
			per_part_columns.push_back(data.getColumnsList().addTypes(required_column_names));
		}
		else
		{
			per_part_pre_columns.push_back(part.data_part->columns.addTypes(required_pre_column_names));
			per_part_columns.push_back(part.data_part->columns.addTypes(required_column_names));
		}

		per_part_should_reorder.push_back(should_reoder);

		this->parts.push_back({ part.data_part, part.part_index_in_query });
	}

	return per_part_sum_marks;
}
bool MergeTreeBlockInputStream::getNewTask()
try
{
    /// Produce only one task
    if (!is_first_task)
    {
        finish();
        return false;
    }
    is_first_task = false;

    Names pre_column_names, column_names = ordered_names;
    bool remove_prewhere_column = false;

    /// inject columns required for defaults evaluation
    bool should_reorder = !injectRequiredColumns(storage, data_part, column_names).empty();

    if (prewhere_actions)
    {
        pre_column_names = prewhere_actions->getRequiredColumns();

        if (pre_column_names.empty())
            pre_column_names.push_back(column_names[0]);

        const auto injected_pre_columns = injectRequiredColumns(storage, data_part, pre_column_names);
        if (!injected_pre_columns.empty())
            should_reorder = true;

        const NameSet pre_name_set(pre_column_names.begin(), pre_column_names.end());
        /// If the expression in PREWHERE is not a column of the table, you do not need to output a column with it
        ///  (from storage expect to receive only the columns of the table).
        remove_prewhere_column = !pre_name_set.count(prewhere_column);

        Names post_column_names;
        for (const auto & name : column_names)
            if (!pre_name_set.count(name))
                post_column_names.push_back(name);

        column_names = post_column_names;
    }

    /// will be used to distinguish between PREWHERE and WHERE columns when applying filter
    column_name_set = NameSet{column_names.begin(), column_names.end()};

    if (check_columns)
    {
        /// Under owned_data_part->columns_lock we check that all requested columns are of the same type as in the table.
        /// This may be not true in case of ALTER MODIFY.
        if (!pre_column_names.empty())
            storage.check(data_part->columns, pre_column_names);
        if (!column_names.empty())
            storage.check(data_part->columns, column_names);

        pre_columns = storage.getColumnsList().addTypes(pre_column_names);
        columns = storage.getColumnsList().addTypes(column_names);
    }
    else
    {
        pre_columns = data_part->columns.addTypes(pre_column_names);
        columns = data_part->columns.addTypes(column_names);
    }

    /** @note you could simply swap `reverse` in if and else branches of MergeTreeDataSelectExecutor,
     * and remove this reverse. */
    MarkRanges remaining_mark_ranges = all_mark_ranges;
    std::reverse(remaining_mark_ranges.begin(), remaining_mark_ranges.end());

    auto size_predictor = (preferred_block_size_bytes == 0) ? nullptr
                          : std::make_unique<MergeTreeBlockSizePredictor>(data_part, ordered_names, data_part->storage.getSampleBlock());

    task = std::make_unique<MergeTreeReadTask>(data_part, remaining_mark_ranges, part_index_in_query, ordered_names,
                                               column_name_set, columns, pre_columns, remove_prewhere_column, should_reorder,
                                               std::move(size_predictor));

    if (!reader)
    {
        if (use_uncompressed_cache)
            owned_uncompressed_cache = storage.context.getUncompressedCache();

        owned_mark_cache = storage.context.getMarkCache();

        reader = std::make_unique<MergeTreeReader>(
            path, data_part, columns, owned_uncompressed_cache.get(),
            owned_mark_cache.get(), save_marks_in_cache, storage,
            all_mark_ranges, min_bytes_to_use_direct_io, max_read_buffer_size);

        if (prewhere_actions)
            pre_reader = std::make_unique<MergeTreeReader>(
                path, data_part, pre_columns, owned_uncompressed_cache.get(),
                owned_mark_cache.get(), save_marks_in_cache, storage,
                all_mark_ranges, min_bytes_to_use_direct_io, max_read_buffer_size);
    }

    return true;
}
catch (...)
{
    /// Suspicion of the broken part. A part is added to the queue for verification.
    if (getCurrentExceptionCode() != ErrorCodes::MEMORY_LIMIT_EXCEEDED)
        storage.reportBrokenPart(data_part->name);
    throw;
}