Example #1
Paving Paving::projection(const Names &names) {

  Paving result;
  NamedBox varbox;   // varbox for result, varbox_ for this paving
  Tuple varlist;   // list of variables to be retained
  varlist.resize(names.size());
  result.set_type(type_);

  // push the relevant names and intervals
  for (nat i = 0; i < names.size(); ++i) {
    nat v = varbox_.var(names[i]);
    if (v < varbox_.size()) {
      varbox.push(names[i], varbox_.val(v));
      varlist[i] = v;
    } else {
      std::ostringstream os;
      os << "Kodiak (projection): name \"" << names[i] << "\" doesn't exist in the paving.";
      throw Growl(os.str());
    }
  }
  result.set_varbox(varbox);

  // add boxes
  Box x;
  x.resize(names.size());
  if (boxes_.size() == 0) return result;
  for (nat i = 0; i < boxes_.size(); ++i) { // iterate over types
    if (boxes_[i].size() == 0) continue;
    // add the first box
    for (nat k = 0; k < names.size(); ++k) x[k] = boxes_[i][0][varlist[k]];
    result.push_box(i, x);
    if (boxes_[i].size() == 1) continue;
    // add remaining boxes where necessary
    for (nat j = 1; j < boxes_[i].size(); ++j) {
      for (nat k = 0; k < names.size(); ++k) x[k] = boxes_[i][j][varlist[k]];
      // Is the jth box a subset of any box already pushed into the result paving?
      bool subset = false;
      for (nat jj = 0; jj < result.boxes(i).size(); ++jj) {
        if (box_subset(result.boxes(i)[jj], x)) {
          subset = true;
          break;
        }
      }
      if (!subset) result.push_box(i, x);
    }
  }

  for (nat i = 0; i < boxes_.size(); ++i) { // iterate over types
    encluster(result.boxes(i));
  }

  return result;

}
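The filtering step above only pushes a projected box when the subset test against every box already collected for that type fails, which keeps the result paving small when projection collapses many boxes onto the same region. Below is a minimal standalone sketch of that idea using plain std::pair intervals; Interval, Box, box_contains and push_if_not_subsumed are illustrative stand-ins, not Kodiak's API.

#include <cstddef>
#include <utility>
#include <vector>

using Interval = std::pair<double, double>;   // [inf, sup]
using Box = std::vector<Interval>;

// True if every interval of `inner` lies inside the corresponding interval of `outer`.
static bool box_contains(const Box &outer, const Box &inner) {
  for (std::size_t k = 0; k < outer.size(); ++k)
    if (inner[k].first < outer[k].first || inner[k].second > outer[k].second)
      return false;
  return true;
}

// Push a projected box only if no previously kept box already contains it.
static void push_if_not_subsumed(std::vector<Box> &kept, const Box &candidate) {
  for (const Box &b : kept)
    if (box_contains(b, candidate))
      return;   // the candidate adds nothing new to the paving
  kept.push_back(candidate);
}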
Example #2
void Path::set( size_t begin, size_t end, const Names &names )
{
	if( begin > m_names.size() )
	{
		throw IECore::Exception( "Index out of range" );
	}

	if( end > m_names.size() )
	{
		throw IECore::Exception( "Index out of range" );
	}

	Names::difference_type sizeDifference = static_cast<Names::difference_type>( names.size() ) - static_cast<Names::difference_type>( end - begin );

	if( sizeDifference == 0 )
	{
		if( equal( m_names.begin() + begin, m_names.begin() + end, names.begin() ) )
		{
			return;
		}
	}
	else if( sizeDifference > 0 )
	{
		m_names.resize( m_names.size() + sizeDifference );
		std::copy_backward( m_names.begin() + end, m_names.end() - sizeDifference, m_names.end() );
	}
	else
	{
		std::copy( m_names.begin() + end, m_names.end(), m_names.begin() + end + sizeDifference );
		m_names.resize( m_names.size() + sizeDifference );
	}

	std::copy( names.begin(), names.end(), m_names.begin() + begin );
	emitPathChanged();
}
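The splice above has three cases: an equal-length replacement overwrites in place, a longer one grows the vector and shifts the old tail right with std::copy_backward, and a shorter one shifts the tail left with std::copy before shrinking; in every case the new names are then copied in starting at begin. Below is a minimal standalone sketch of the same splice on a plain std::vector<std::string>; replaceRange is a hypothetical helper, not part of the Path API, and the equal-contents early-out is omitted.

#include <algorithm>
#include <cstddef>
#include <stdexcept>
#include <string>
#include <vector>

// Replace elements [begin, end) of `v` with the contents of `names`, shifting the tail as needed.
void replaceRange( std::vector<std::string> &v, std::size_t begin, std::size_t end, const std::vector<std::string> &names )
{
	if( begin > end || end > v.size() )
	{
		throw std::out_of_range( "Index out of range" );
	}

	const std::ptrdiff_t sizeDifference = static_cast<std::ptrdiff_t>( names.size() ) - static_cast<std::ptrdiff_t>( end - begin );

	if( sizeDifference > 0 )
	{
		// Grow first, then shift the old tail right so nothing is overwritten.
		v.resize( v.size() + sizeDifference );
		std::copy_backward( v.begin() + end, v.end() - sizeDifference, v.end() );
	}
	else if( sizeDifference < 0 )
	{
		// Shift the tail left over the shrinking range, then drop the unused slots.
		std::copy( v.begin() + end, v.end(), v.begin() + end + sizeDifference );
		v.resize( v.size() + sizeDifference );
	}

	std::copy( names.begin(), names.end(), v.begin() + begin );
}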
Example #3
void Paving::save(const std::string filename, const Names &titles, const Names &names) const {
  if (empty()) return;
  Tuple vs;
  for (nat v = 0; v < names.size(); ++v) {
    nat n = varbox_.var(names[v]);
    if (n < nvars())
      vs.push_back(n);
  }
  std::ostringstream os;
  os << filename;
  for (nat i = 0; i < vs.size(); ++i)
    os << "_" << varbox_.name(vs[i]);
  os << ".dat";
  if (vs.empty()) {
    vs.resize(varbox_.size());
    for (nat v = 0; v < varbox_.size(); ++v)
      vs[v] = v;
  }
  std::ofstream f;
  f.open(os.str().c_str(), std::ofstream::out);
  f << "## File: " << os.str() << std::endl;
  f << "## Type: " << type_ << std::endl;
  f << "## Vars:" << std::endl;
  nat width = 2 * Kodiak::precision();
  for (nat i = 0; i < vs.size(); ++i)
    f << std::setw(width) << varbox_.name(vs[i]);
  f << std::endl;
  for (nat i = 0; i < vs.size(); ++i)
    f << std::setw(width) << varbox_.box()[vs[i]].inf();
  f << std::endl;
  for (nat i = 0; i < vs.size(); ++i)
    f << std::setw(width) << varbox_.box()[vs[i]].sup();
  f << std::endl;
  f << std::endl;
  for (nat i = 0; i < titles.size(); ++i) {
    f << "## " << titles[i] << ": " << size(i) << " boxes " << std::endl;
    if (i < boxes_.size() && boxes_[i].size() > 0)
      save_boxes(f, boxes_[i], vs, width);
    else
      f << std::endl;
  }
  f.close();
  std::cout << "Kodiak (save): Boxes were saved in file " << os.str() << std::endl;
}
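The save routine above is plain fixed-width text output: a header row of variable names followed by one row per bound, each cell padded with std::setw to a width derived from the numeric precision. Below is a minimal sketch of that output pattern using only the standard library; save_table and its parameters are illustrative, not Kodiak's interface.

#include <cstddef>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>

// Write a header row of names and one fixed-width row per entry of `rows`.
static void save_table(const std::string &filename,
                       const std::vector<std::string> &names,
                       const std::vector<std::vector<double>> &rows,
                       std::size_t precision = 6) {
  const int width = static_cast<int>(2 * precision);   // mirrors the 2 * Kodiak::precision() choice above
  std::ofstream f(filename);
  f << "## File: " << filename << std::endl;
  for (const std::string &n : names)
    f << std::setw(width) << n;
  f << std::endl;
  f << std::setprecision(static_cast<int>(precision));
  for (const std::vector<double> &row : rows) {
    for (double v : row)
      f << std::setw(width) << v;
    f << std::endl;
  }
  std::cout << "Boxes were saved in file " << filename << std::endl;
}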
Example #4
NameSet MergeTreeReadPool::injectRequiredColumns(const MergeTreeData::DataPartPtr & part, Names & columns) const
{
	NameSet required_columns{std::begin(columns), std::end(columns)};
	NameSet injected_columns;

	auto all_column_files_missing = true;

	for (size_t i = 0; i < columns.size(); ++i)
	{
		const auto & column_name = columns[i];

		/// column has files and hence does not require evaluation
		if (part->hasColumnFiles(column_name))
		{
			all_column_files_missing = false;
			continue;
		}

		const auto default_it = data.column_defaults.find(column_name);
		/// column has no explicit default expression
		if (default_it == std::end(data.column_defaults))
			continue;

		/// collect identifiers required for evaluation
		IdentifierNameSet identifiers;
		default_it->second.expression->collectIdentifierNames(identifiers);

		for (const auto & identifier : identifiers)
		{
			if (data.hasColumn(identifier))
			{
				/// ensure each column is added only once
				if (required_columns.count(identifier) == 0)
				{
					columns.emplace_back(identifier);
					required_columns.emplace(identifier);
					injected_columns.emplace(identifier);
				}
			}
		}
	}

	/** Add a column of minimal size.
		* Used when not a single column is needed or all column files are missing, but we still need to know at least the number of rows.
		* Adds the column to `columns`.
		*/
	if (all_column_files_missing)
	{
		const auto minimum_size_column_name = part->getColumnNameWithMinumumCompressedSize();
		columns.push_back(minimum_size_column_name);
		/// correctly report added column
		injected_columns.insert(columns.back());
	}

	return injected_columns;
}
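The injection logic above boils down to: for every requested column whose data file is missing, pull in the identifiers its default expression depends on, deduplicating against what was already requested; and if nothing at all has a file, request one cheap column so that the row count is still known. Below is a standalone sketch of that dedup-and-append pattern with plain standard containers; hasFile, defaultDependencies and cheapestColumn are hypothetical stand-ins for the part and metadata queries used above.

#include <cstddef>
#include <functional>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

using Names = std::vector<std::string>;
using NameSet = std::unordered_set<std::string>;

static NameSet injectRequiredColumns(
    Names & columns,
    const std::function<bool(const std::string &)> & hasFile,            /// does a data file exist for this column?
    const std::unordered_map<std::string, Names> & defaultDependencies,  /// column -> identifiers used by its DEFAULT expression
    const std::string & cheapestColumn)                                  /// smallest column, used only for row counts
{
    NameSet required(columns.begin(), columns.end());
    NameSet injected;
    bool all_column_files_missing = true;

    /// Note: `columns` may grow while we iterate, so index it rather than hold references.
    for (std::size_t i = 0; i < columns.size(); ++i)
    {
        if (hasFile(columns[i]))
        {
            all_column_files_missing = false;
            continue;
        }

        const auto default_it = defaultDependencies.find(columns[i]);
        if (default_it == defaultDependencies.end())
            continue;

        for (const auto & identifier : default_it->second)
        {
            /// ensure each column is added only once
            if (required.insert(identifier).second)
            {
                columns.push_back(identifier);
                injected.insert(identifier);
            }
        }
    }

    if (all_column_files_missing)
    {
        columns.push_back(cheapestColumn);
        injected.insert(cheapestColumn);
    }

    return injected;
}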
Example #5
    NonJoinedBlockInputStream(const Join & parent_, const Block & left_sample_block, const Names & key_names_left, size_t max_block_size_)
        : parent(parent_), max_block_size(max_block_size_)
    {
        /** left_sample_block contains keys and "left" columns.
          * result_sample_block - keys, "left" columns, and "right" columns.
          */

        size_t num_keys = key_names_left.size();
        size_t num_columns_left = left_sample_block.columns() - num_keys;
        size_t num_columns_right = parent.sample_block_with_columns_to_add.columns();

        result_sample_block = materializeBlock(left_sample_block);

        /// Add columns from the right-side table to the block.
        for (size_t i = 0; i < num_columns_right; ++i)
        {
            const ColumnWithTypeAndName & src_column = parent.sample_block_with_columns_to_add.getByPosition(i);
            result_sample_block.insert(src_column.cloneEmpty());
        }

        column_indices_left.reserve(num_columns_left);
        column_indices_keys_and_right.reserve(num_keys + num_columns_right);
        std::vector<bool> is_key_column_in_left_block(num_keys + num_columns_left, false);

        for (const std::string & key : key_names_left)
        {
            size_t key_pos = left_sample_block.getPositionByName(key);
            is_key_column_in_left_block[key_pos] = true;
            /// Here we establish the mapping between key columns of the left- and right-side tables.
            /// key_pos index is inserted in the position corresponding to key column in parent.blocks
            /// (saved blocks of the right-side table) and points to the same key column
            /// in the left_sample_block and thus in the result_sample_block.
            column_indices_keys_and_right.push_back(key_pos);
        }

        for (size_t i = 0; i < num_keys + num_columns_left; ++i)
        {
            if (!is_key_column_in_left_block[i])
                column_indices_left.push_back(i);
        }

        for (size_t i = 0; i < num_columns_right; ++i)
            column_indices_keys_and_right.push_back(num_keys + num_columns_left + i);

        /// If use_nulls, convert left columns to Nullable.
        if (parent.use_nulls)
        {
            for (size_t i = 0; i < num_columns_left; ++i)
            {
                convertColumnToNullable(result_sample_block.getByPosition(column_indices_left[i]));
            }
        }

        columns_left.resize(num_columns_left);
        columns_keys_and_right.resize(num_keys + num_columns_right);
    }
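The constructor above is mostly index bookkeeping: key positions in the left block (found by name) go into column_indices_keys_and_right, the remaining left positions into column_indices_left, and the right-side columns are appended after the left block's width. Below is a standalone sketch of that partitioning over plain vectors of column names; positionOf, ColumnIndices and partitionColumns are illustrative, and no Block type is involved.

#include <algorithm>
#include <cstddef>
#include <stdexcept>
#include <string>
#include <vector>

struct ColumnIndices
{
    std::vector<std::size_t> left;             /// non-key columns of the left block
    std::vector<std::size_t> keys_and_right;   /// key columns, then the appended right columns
};

static std::size_t positionOf(const std::vector<std::string> & columns, const std::string & name)
{
    auto it = std::find(columns.begin(), columns.end(), name);
    if (it == columns.end())
        throw std::runtime_error("No column " + name + " in block");
    return static_cast<std::size_t>(it - columns.begin());
}

static ColumnIndices partitionColumns(const std::vector<std::string> & left_columns,
                                      const std::vector<std::string> & key_names,
                                      std::size_t num_columns_right)
{
    ColumnIndices result;
    std::vector<bool> is_key(left_columns.size(), false);

    for (const std::string & key : key_names)
    {
        std::size_t pos = positionOf(left_columns, key);
        is_key[pos] = true;
        result.keys_and_right.push_back(pos);   /// key columns keep their left-block positions
    }

    for (std::size_t i = 0; i < left_columns.size(); ++i)
        if (!is_key[i])
            result.left.push_back(i);           /// everything else is a plain left column

    for (std::size_t i = 0; i < num_columns_right; ++i)
        result.keys_and_right.push_back(left_columns.size() + i);   /// right columns sit after the left block

    return result;
}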
Example #6
Names Macros::expand(const Names & source_names, size_t level) const
{
    Names result_names;
    result_names.reserve(source_names.size());

    for (const String & name : source_names)
        result_names.push_back(expand(name, level));

    return result_names;
}
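Macros::expand over a list is just the single-name expansion mapped across the vector; the per-name overload substitutes placeholders and recurses with an incremented level so that self-referential macros cannot loop forever. Below is a standalone sketch of that idea; the std::map of macros, the {name} placeholder syntax and the depth limit of 10 are assumptions for illustration, not ClickHouse's exact rules.

#include <cstddef>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

/// Expand {name} placeholders from `macros`, recursing because a substituted value may itself contain placeholders.
static std::string expand(const std::map<std::string, std::string> & macros, const std::string & source, std::size_t level = 0)
{
    if (level > 10)
        throw std::runtime_error("Too deep recursion while expanding macros in " + source);

    std::string result;
    std::size_t pos = 0;
    while (pos < source.size())
    {
        std::size_t open = source.find('{', pos);
        if (open == std::string::npos)
        {
            result += source.substr(pos);
            break;
        }

        std::size_t close = source.find('}', open);
        if (close == std::string::npos)
            throw std::runtime_error("Unbalanced '{' in " + source);

        result += source.substr(pos, open - pos);

        const std::string macro_name = source.substr(open + 1, close - open - 1);
        auto it = macros.find(macro_name);
        if (it == macros.end())
            throw std::runtime_error("No macro " + macro_name);
        result += it->second;

        pos = close + 1;
    }

    /// A substituted value may itself contain placeholders; stop once nothing changed.
    return result == source ? result : expand(macros, result, level + 1);
}

static std::vector<std::string> expand(const std::map<std::string, std::string> & macros,
                                       const std::vector<std::string> & source_names, std::size_t level = 0)
{
    std::vector<std::string> result_names;
    result_names.reserve(source_names.size());
    for (const std::string & name : source_names)
        result_names.push_back(expand(macros, name, level));
    return result_names;
}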
Example #7
void Join::checkTypesOfKeys(const Block & block_left, const Names & key_names_left, const Block & block_right) const
{
    size_t keys_size = key_names_left.size();

    for (size_t i = 0; i < keys_size; ++i)
    {
        /// Compare up to Nullability.

        DataTypePtr left_type = removeNullable(recursiveRemoveLowCardinality(block_left.getByName(key_names_left[i]).type));
        DataTypePtr right_type = removeNullable(recursiveRemoveLowCardinality(block_right.getByName(key_names_right[i]).type));

        if (!left_type->equals(*right_type))
            throw Exception("Type mismatch of columns to JOIN by: "
                + key_names_left[i] + " " + left_type->getName() + " at left, "
                + key_names_right[i] + " " + right_type->getName() + " at right",
                ErrorCodes::TYPE_MISMATCH);
    }
}
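Comparing "up to Nullability" means both sides are normalized (the Nullable and LowCardinality wrappers stripped) before the equality check, so, for example, Nullable(UInt64) joins against UInt64 without complaint. Below is a tiny standalone sketch of the same normalize-then-compare pattern on type names represented as strings; removeWrapper is an illustrative helper, not the DataType API.

#include <cstddef>
#include <stdexcept>
#include <string>
#include <vector>

/// Strip a single wrapper, e.g. "Nullable(UInt64)" -> "UInt64"; anything else is returned unchanged.
static std::string removeWrapper(const std::string & type, const std::string & wrapper)
{
    const std::string prefix = wrapper + "(";
    if (type.size() > prefix.size() + 1 && type.compare(0, prefix.size(), prefix) == 0 && type.back() == ')')
        return type.substr(prefix.size(), type.size() - prefix.size() - 1);
    return type;
}

/// All vectors are assumed to have the same length: one entry per JOIN key.
static void checkTypesOfKeys(const std::vector<std::string> & left_types, const std::vector<std::string> & right_types,
                             const std::vector<std::string> & key_names_left, const std::vector<std::string> & key_names_right)
{
    for (std::size_t i = 0; i < left_types.size(); ++i)
    {
        const std::string left = removeWrapper(left_types[i], "Nullable");
        const std::string right = removeWrapper(right_types[i], "Nullable");

        if (left != right)
            throw std::runtime_error("Type mismatch of columns to JOIN by: "
                + key_names_left[i] + " " + left + " at left, "
                + key_names_right[i] + " " + right + " at right");
    }
}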
Example #8
InterpreterSelectWithUnionQuery::InterpreterSelectWithUnionQuery(
    const ASTPtr & query_ptr_,
    const Context & context_,
    const Names & required_result_column_names,
    QueryProcessingStage::Enum to_stage_,
    size_t subquery_depth_,
    bool only_analyze,
    bool modify_inplace)
    : query_ptr(query_ptr_),
    context(context_),
    to_stage(to_stage_),
    subquery_depth(subquery_depth_)
{
    const ASTSelectWithUnionQuery & ast = typeid_cast<const ASTSelectWithUnionQuery &>(*query_ptr);

    size_t num_selects = ast.list_of_selects->children.size();

    if (!num_selects)
        throw Exception("Logical error: no children in ASTSelectWithUnionQuery", ErrorCodes::LOGICAL_ERROR);

    /// Initialize interpreters for each SELECT query.
    /// Note that we pass 'required_result_column_names' to the first SELECT.
    /// For the rest, we pass the names found at the corresponding positions of 'required_result_column_names' in the result of the first SELECT,
    ///  because the names could be different.

    nested_interpreters.reserve(num_selects);

    std::vector<Names> required_result_column_names_for_other_selects(num_selects);
    if (!required_result_column_names.empty() && num_selects > 1)
    {
        /// Result header if there is no filtering by 'required_result_column_names'.
        /// We use it to determine the positions of 'required_result_column_names' in the SELECT clause.

        Block full_result_header = InterpreterSelectQuery(
            ast.list_of_selects->children.at(0), context, Names(), to_stage, subquery_depth, true).getSampleBlock();

        std::vector<size_t> positions_of_required_result_columns(required_result_column_names.size());
        for (size_t required_result_num = 0, size = required_result_column_names.size(); required_result_num < size; ++required_result_num)
            positions_of_required_result_columns[required_result_num] = full_result_header.getPositionByName(required_result_column_names[required_result_num]);

        for (size_t query_num = 1; query_num < num_selects; ++query_num)
        {
            Block full_result_header_for_current_select = InterpreterSelectQuery(
                ast.list_of_selects->children.at(query_num), context, Names(), to_stage, subquery_depth, true).getSampleBlock();

            if (full_result_header_for_current_select.columns() != full_result_header.columns())
                throw Exception("Different number of columns in UNION ALL elements:\n"
                    + full_result_header.dumpNames()
                    + "\nand\n"
                    + full_result_header_for_current_select.dumpNames() + "\n",
                    ErrorCodes::UNION_ALL_RESULT_STRUCTURES_MISMATCH);

            required_result_column_names_for_other_selects[query_num].reserve(required_result_column_names.size());
            for (const auto & pos : positions_of_required_result_columns)
                required_result_column_names_for_other_selects[query_num].push_back(full_result_header_for_current_select.getByPosition(pos).name);
        }
    }

    for (size_t query_num = 0; query_num < num_selects; ++query_num)
    {
        const Names & current_required_result_column_names
            = query_num == 0 ? required_result_column_names : required_result_column_names_for_other_selects[query_num];

        nested_interpreters.emplace_back(std::make_unique<InterpreterSelectQuery>(
            ast.list_of_selects->children.at(query_num),
            context,
            current_required_result_column_names,
            to_stage,
            subquery_depth,
            only_analyze,
            modify_inplace));
    }

    /// Determine structure of the result.

    if (num_selects == 1)
    {
        result_header = nested_interpreters.front()->getSampleBlock();
    }
    else
    {
        Blocks headers(num_selects);
        for (size_t query_num = 0; query_num < num_selects; ++query_num)
            headers[query_num] = nested_interpreters[query_num]->getSampleBlock();

        result_header = headers.front();
        size_t num_columns = result_header.columns();

        for (size_t query_num = 1; query_num < num_selects; ++query_num)
            if (headers[query_num].columns() != num_columns)
                throw Exception("Different number of columns in UNION ALL elements:\n"
                    + result_header.dumpNames()
                    + "\nand\n"
                    + headers[query_num].dumpNames() + "\n",
                    ErrorCodes::UNION_ALL_RESULT_STRUCTURES_MISMATCH);

        for (size_t column_num = 0; column_num < num_columns; ++column_num)
        {
            ColumnWithTypeAndName & result_elem = result_header.getByPosition(column_num);

            /// Determine common type.

            DataTypes types(num_selects);
            for (size_t query_num = 0; query_num < num_selects; ++query_num)
                types[query_num] = headers[query_num].getByPosition(column_num).type;

            result_elem.type = getLeastSupertype(types);

            /// If there are different constness or different values of constants, the result must be non-constant.

            if (result_elem.column->isColumnConst())
            {
                bool need_materialize = false;
                for (size_t query_num = 1; query_num < num_selects; ++query_num)
                {
                    const ColumnWithTypeAndName & source_elem = headers[query_num].getByPosition(column_num);

                    if (!source_elem.column->isColumnConst()
                        || (static_cast<const ColumnConst &>(*result_elem.column).getField()
                            != static_cast<const ColumnConst &>(*source_elem.column).getField()))
                    {
                        need_materialize = true;
                        break;
                    }
                }

                if (need_materialize)
                    result_elem.column = result_elem.type->createColumn();
            }

            /// BTW, result column names are from first SELECT.
        }
    }
}
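The name translation above relies on UNION ALL branches aligning by position rather than by name: the required names are resolved to positions in the first branch's full header, and those positions are then read back as names from every other branch. Below is a standalone sketch of that positional mapping over plain vectors of column names; full_headers stands in for the per-branch sample blocks.

#include <algorithm>
#include <cstddef>
#include <stdexcept>
#include <string>
#include <vector>

using Names = std::vector<std::string>;

/// For every branch, return the names sitting at the positions the required names occupy in branch 0.
static std::vector<Names> requiredNamesPerBranch(const std::vector<Names> & full_headers, const Names & required_result_column_names)
{
    if (full_headers.empty())
        return {};

    std::vector<Names> result(full_headers.size());
    result[0] = required_result_column_names;   /// the first branch keeps the names as given

    std::vector<std::size_t> positions;
    for (const std::string & name : required_result_column_names)
    {
        auto it = std::find(full_headers[0].begin(), full_headers[0].end(), name);
        if (it == full_headers[0].end())
            throw std::runtime_error("No column " + name + " in the first SELECT");
        positions.push_back(static_cast<std::size_t>(it - full_headers[0].begin()));
    }

    for (std::size_t branch = 1; branch < full_headers.size(); ++branch)
    {
        if (full_headers[branch].size() != full_headers[0].size())
            throw std::runtime_error("Different number of columns in UNION ALL elements");

        for (std::size_t pos : positions)
            result[branch].push_back(full_headers[branch][pos]);   /// same position, possibly a different name
    }

    return result;
}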
Example #9
int main(int argc, char ** argv)
{
    using namespace DB;

    try
    {

    if (argc < 2)
    {
        std::cerr << "at least 1 argument expected" << std::endl;
        return 1;
    }

    Context context;

    NamesAndTypesList columns;
    for (int i = 2; i + 1 < argc; i += 2)
    {
        NameAndTypePair col;
        col.name = argv[i];
        col.type = DataTypeFactory::instance().get(argv[i + 1]);
        columns.push_back(col);
    }

    ASTPtr root;
    ParserPtr parsers[] = {std::make_unique<ParserSelectQuery>(), std::make_unique<ParserExpressionList>(false)};
    for (size_t i = 0; i < sizeof(parsers)/sizeof(parsers[0]); ++i)
    {
        IParser & parser = *parsers[i];
        const char * pos = argv[1];
        const char * end = argv[1] + strlen(argv[1]);
        const char * max_parsed_pos = pos;
        Expected expected = "";
        if (parser.parse(pos, end, root, max_parsed_pos, expected))
            break;
        else
            root = nullptr;
    }
    if (!root)
    {
        std::cerr << "invalid expression (should be select query or expression list)" << std::endl;
        return 2;
    }

    formatAST(*root, std::cout);
    std::cout << std::endl;

    ExpressionAnalyzer analyzer(root, context, {}, columns);

    Names required = analyzer.getRequiredColumns();
    std::cout << "required columns:\n";
    for (size_t i = 0; i < required.size(); ++i)
    {
        std::cout << required[i] << "\n";
    }
    std::cout << "\n";

    std::cout << "only consts:\n\n" << analyzer.getConstActions()->dumpActions() << "\n";

    if (analyzer.hasAggregation())
    {
        Names key_names;
        AggregateDescriptions aggregates;
        analyzer.getAggregateInfo(key_names, aggregates);

        std::cout << "keys:\n";
        for (size_t i = 0; i < key_names.size(); ++i)
            std::cout << key_names[i] << "\n";
        std::cout << "\n";

        std::cout << "aggregates:\n";
        for (size_t i = 0; i < aggregates.size(); ++i)
        {
            AggregateDescription desc = aggregates[i];

            std::cout << desc.column_name << " = " << desc.function->getName() << " ( ";
            for (size_t j = 0; j < desc.argument_names.size(); ++j)
                std::cout << desc.argument_names[j] << " ";
            std::cout << ")\n";
        }
        std::cout << "\n";

        ExpressionActionsChain before;
        if (analyzer.appendWhere(before, false))
            before.addStep();
        analyzer.appendAggregateFunctionsArguments(before, false);
        analyzer.appendGroupBy(before, false);
        before.finalize();

        ExpressionActionsChain after;
        if (analyzer.appendHaving(after, false))
            after.addStep();
        analyzer.appendSelect(after, false);
        analyzer.appendOrderBy(after, false);
        after.addStep();
        analyzer.appendProjectResult(after, false);
        after.finalize();

        std::cout << "before aggregation:\n\n";
        for (size_t i = 0; i < before.steps.size(); ++i)
        {
            std::cout << before.steps[i].actions->dumpActions();
            std::cout << std::endl;
        }

        std::cout << "\nafter aggregation:\n\n";
        for (size_t i = 0; i < after.steps.size(); ++i)
        {
            std::cout << after.steps[i].actions->dumpActions();
            std::cout << std::endl;
        }
    }
    else
    {
        if (typeid_cast<ASTSelectQuery *>(&*root))
        {
            ExpressionActionsChain chain;
            if (analyzer.appendWhere(chain, false))
                chain.addStep();
            analyzer.appendSelect(chain, false);
            analyzer.appendOrderBy(chain, false);
            chain.addStep();
            analyzer.appendProjectResult(chain, false);
            chain.finalize();

            for (size_t i = 0; i < chain.steps.size(); ++i)
            {
                std::cout << chain.steps[i].actions->dumpActions();
                std::cout << std::endl;
            }
        }
        else
        {
            std::cout << "unprojected actions:\n\n" << analyzer.getActions(false)->dumpActions() << "\n";
            std::cout << "projected actions:\n\n" << analyzer.getActions(true)->dumpActions() << "\n";
        }
    }

    }
    catch (Exception & e)
    {
        std::cerr << "Exception " << e.what() << ": " << e.displayText() << "\n" << e.getStackTrace().toString();
        return 3;
    }

    return 0;
}
Example #10
BlockInputStreams StorageMerge::createSourceStreams(const SelectQueryInfo & query_info, const QueryProcessingStage::Enum & processed_stage,
                                                    const size_t max_block_size, const Block & header, const StoragePtr & storage,
                                                    const TableStructureReadLockPtr & struct_lock, Names & real_column_names,
                                                    Context & modified_context, size_t streams_num, bool has_table_virtual_column,
                                                    bool concat_streams)
{
    SelectQueryInfo modified_query_info = query_info;
    modified_query_info.query = query_info.query->clone();

    VirtualColumnUtils::rewriteEntityInAst(modified_query_info.query, "_table", storage ? storage->getTableName() : "");

    if (!storage)
        return BlockInputStreams{
            InterpreterSelectQuery(modified_query_info.query, modified_context, std::make_shared<OneBlockInputStream>(header),
                                   processed_stage, true).execute().in};

    BlockInputStreams source_streams;

    if (processed_stage <= storage->getQueryProcessingStage(modified_context))
    {
        /// If there are only virtual columns in the query, you must request at least one other column.
        if (real_column_names.empty())
            real_column_names.push_back(ExpressionActions::getSmallestColumn(storage->getColumns().getAllPhysical()));

        source_streams = storage->read(real_column_names, modified_query_info, modified_context, processed_stage, max_block_size,
                                       UInt32(streams_num));
    }
    else if (processed_stage > storage->getQueryProcessingStage(modified_context))
    {
        typeid_cast<ASTSelectQuery *>(modified_query_info.query.get())->replaceDatabaseAndTable(source_database, storage->getTableName());

        /// Maximum permissible parallelism is streams_num
        modified_context.getSettingsRef().max_threads = UInt64(streams_num);
        modified_context.getSettingsRef().max_streams_to_max_threads_ratio = 1;

        InterpreterSelectQuery interpreter{modified_query_info.query, modified_context, Names{}, processed_stage};
        BlockInputStreamPtr interpreter_stream = interpreter.execute().in;

        /** Materialization is needed, since from distributed storage the constants come materialized.
          * If you do not do this, columns of different kinds (Const and non-Const) will be produced in different threads,
          * and this is not allowed, since all code is based on the assumption that in a block stream all types are the same.
          */
        source_streams.emplace_back(std::make_shared<MaterializingBlockInputStream>(interpreter_stream));
    }

    if (!source_streams.empty())
    {
        if (concat_streams)
        {
            BlockInputStreamPtr stream =
                source_streams.size() > 1 ? std::make_shared<ConcatBlockInputStream>(source_streams) : source_streams[0];

            source_streams.resize(1);
            source_streams[0] = stream;
        }

        for (BlockInputStreamPtr & source_stream : source_streams)
        {
            if (has_table_virtual_column)
                source_stream = std::make_shared<AddingConstColumnBlockInputStream<String>>(
                    source_stream, std::make_shared<DataTypeString>(), storage->getTableName(), "_table");

            /// Subordinate tables could have different but convertible types, like numeric types of different width.
            /// We must return streams with a structure equal to the structure of the Merge table.
            convertingSourceStream(header, modified_context, modified_query_info.query, source_stream, processed_stage);

            source_stream->addTableLock(struct_lock);
        }
    }

    return source_streams;
}
Example #11
BlockInputStreams StorageMerge::read(
    const Names & column_names,
    const SelectQueryInfo & query_info,
    const Context & context,
    QueryProcessingStage::Enum processed_stage,
    const size_t max_block_size,
    const unsigned num_streams)
{
    BlockInputStreams res;

    bool has_table_virtual_column = false;
    Names real_column_names;
    real_column_names.reserve(column_names.size());

    for (const auto & column_name : column_names)
    {
        if (column_name == "_table")
            has_table_virtual_column = true;
        else
            real_column_names.push_back(column_name);
    }

    /** Just in case, turn off optimization "transfer to PREWHERE",
      * since there is no certainty that it works when one of the tables is a MergeTree and another is not.
      */
    Context modified_context = context;
    modified_context.getSettingsRef().optimize_move_to_prewhere = false;

    /// What will the result structure be, depending on the query processing stage in the source tables?
    Block header = getQueryHeader(column_names, query_info, context, processed_stage);

    /** First, we make a list of the selected tables to find out its size.
      * This is necessary to correctly pass the recommended number of threads to each table.
      */
    StorageListWithLocks selected_tables = getSelectedTables(query_info.query, has_table_virtual_column, true);

    if (selected_tables.empty())
        return createSourceStreams(
            query_info, processed_stage, max_block_size, header, {}, {}, real_column_names, modified_context, 0, has_table_virtual_column);

    size_t remaining_streams = num_streams;
    size_t tables_count = selected_tables.size();

    for (auto it = selected_tables.begin(); it != selected_tables.end(); ++it)
    {
        size_t current_need_streams = tables_count >= num_streams ? 1 : (num_streams / tables_count);
        size_t current_streams = std::min(current_need_streams, remaining_streams);
        remaining_streams -= current_streams;
        current_streams = std::max(size_t(1), current_streams);

        StoragePtr storage = it->first;
        TableStructureReadLockPtr struct_lock = it->second;

        BlockInputStreams source_streams;

        if (current_streams)
        {
            source_streams = createSourceStreams(
                query_info, processed_stage, max_block_size, header, storage,
                struct_lock, real_column_names, modified_context, current_streams, has_table_virtual_column);
        }
        else
        {
            source_streams.emplace_back(std::make_shared<LazyBlockInputStream>(
                header, [=]() mutable -> BlockInputStreamPtr
                {
                    BlockInputStreams streams = createSourceStreams(query_info, processed_stage, max_block_size,
                                                                    header, storage, struct_lock, real_column_names,
                                                                    modified_context, current_streams, has_table_virtual_column, true);

                    if (!streams.empty() && streams.size() != 1)
                        throw Exception("LogicalError: the lazy stream size must to be one or empty.", ErrorCodes::LOGICAL_ERROR);

                    return streams.empty() ? std::make_shared<NullBlockInputStream>(header) : streams[0];
                }));
        }

        res.insert(res.end(), source_streams.begin(), source_streams.end());
    }

    if (res.empty())
        return res;

    res = narrowBlockInputStreams(res, num_streams);
    return res;
}
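The per-table loop above budgets streams with simple integer arithmetic: each table asks for roughly num_streams / tables_count streams (just one when there are at least as many tables as streams), is clamped to whatever remains of the budget, and never drops below one. Below is a minimal sketch of just that arithmetic, returning the per-table stream counts; distributeStreams is an illustrative helper.

#include <algorithm>
#include <cstddef>
#include <vector>

/// Decide how many input streams each of `tables_count` tables may create, given a total budget of `num_streams`.
static std::vector<std::size_t> distributeStreams(std::size_t num_streams, std::size_t tables_count)
{
    std::vector<std::size_t> per_table(tables_count, 0);
    std::size_t remaining_streams = num_streams;

    for (std::size_t i = 0; i < tables_count; ++i)
    {
        std::size_t current_need_streams = tables_count >= num_streams ? 1 : (num_streams / tables_count);
        std::size_t current_streams = std::min(current_need_streams, remaining_streams);
        remaining_streams -= current_streams;
        per_table[i] = std::max<std::size_t>(1, current_streams);   /// every table still gets at least one stream
    }

    return per_table;
}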
Example #12
template <ASTTableJoin::Kind KIND, ASTTableJoin::Strictness STRICTNESS, typename Maps>
void Join::joinBlockImpl(
    Block & block,
    const Names & key_names_left,
    const NameSet & needed_key_names_right,
    const Block & block_with_columns_to_add,
    const Maps & maps_) const
{
    size_t keys_size = key_names_left.size();
    ColumnRawPtrs key_columns(keys_size);

    /// Rare case, when keys are constant. To avoid code bloat, simply materialize them.
    Columns materialized_columns;
    materialized_columns.reserve(keys_size);

    /// Memoize key columns to work with.
    for (size_t i = 0; i < keys_size; ++i)
    {
        materialized_columns.emplace_back(recursiveRemoveLowCardinality(block.getByName(key_names_left[i]).column->convertToFullColumnIfConst()));
        key_columns[i] = materialized_columns.back().get();
    }

    /// Keys with NULL value in any column won't join to anything.
    ColumnPtr null_map_holder;
    ConstNullMapPtr null_map{};
    extractNestedColumnsAndNullMap(key_columns, null_map_holder, null_map);

    size_t existing_columns = block.columns();

    /** If you use FULL or RIGHT JOIN, then the columns from the "left" table must be materialized.
      * Because if they are constants, then in the "not joined" rows, they may have different values
      *  - default values, which can differ from the values of these constants.
      */
    if (getFullness(kind))
    {
        for (size_t i = 0; i < existing_columns; ++i)
        {
            block.getByPosition(i).column = block.getByPosition(i).column->convertToFullColumnIfConst();

            /// If use_nulls, convert left columns (except keys) to Nullable.
            if (use_nulls)
            {
                if (std::end(key_names_left) == std::find(key_names_left.begin(), key_names_left.end(), block.getByPosition(i).name))
                    convertColumnToNullable(block.getByPosition(i));
            }
        }
    }

    /** For LEFT/INNER JOIN, the saved blocks do not contain keys.
      * For FULL/RIGHT JOIN, the saved blocks contain keys;
      *  but they will not be used at this stage of joining (and will be in `AdderNonJoined`), and they need to be skipped.
      */
    size_t num_columns_to_skip = 0;
    if (getFullness(kind))
        num_columns_to_skip = keys_size;

    /// Add new columns to the block.
    size_t num_columns_to_add = sample_block_with_columns_to_add.columns();
    MutableColumns added_columns;
    added_columns.reserve(num_columns_to_add);

    std::vector<std::pair<decltype(ColumnWithTypeAndName::type), decltype(ColumnWithTypeAndName::name)>> added_type_name;
    added_type_name.reserve(num_columns_to_add);

    std::vector<size_t> right_indexes;
    right_indexes.reserve(num_columns_to_add);

    for (size_t i = 0; i < num_columns_to_add; ++i)
    {
        const ColumnWithTypeAndName & src_column = sample_block_with_columns_to_add.safeGetByPosition(i);

        /// Don't insert column if it's in left block or not explicitly required.
        if (!block.has(src_column.name) && block_with_columns_to_add.has(src_column.name))
        {
            added_columns.push_back(src_column.column->cloneEmpty());
            added_columns.back()->reserve(src_column.column->size());
            added_type_name.emplace_back(src_column.type, src_column.name);
            right_indexes.push_back(num_columns_to_skip + i);
        }
    }

    size_t rows = block.rows();

    std::unique_ptr<IColumn::Filter> filter;

    bool filter_left_keys = (kind == ASTTableJoin::Kind::Inner || kind == ASTTableJoin::Kind::Right) && strictness == ASTTableJoin::Strictness::Any;
    filter = std::make_unique<IColumn::Filter>(rows);

    /// Used with ALL ... JOIN
    IColumn::Offset current_offset = 0;
    std::unique_ptr<IColumn::Offsets> offsets_to_replicate;

    if (strictness == ASTTableJoin::Strictness::All)
        offsets_to_replicate = std::make_unique<IColumn::Offsets>(rows);

    switch (type)
    {
    #define M(TYPE) \
        case Join::Type::TYPE: \
            joinBlockImplType<KIND, STRICTNESS, typename KeyGetterForType<Join::Type::TYPE>::Type>(\
                *maps_.TYPE, rows, key_columns, key_sizes, added_columns, null_map, \
                filter, current_offset, offsets_to_replicate, right_indexes); \
            break;
        APPLY_FOR_JOIN_VARIANTS(M)
    #undef M

        default:
            throw Exception("Unknown JOIN keys variant.", ErrorCodes::UNKNOWN_SET_DATA_VARIANT);
    }

    const auto added_columns_size = added_columns.size();
    for (size_t i = 0; i < added_columns_size; ++i)
        block.insert(ColumnWithTypeAndName(std::move(added_columns[i]), added_type_name[i].first, added_type_name[i].second));

    /// If ANY INNER | RIGHT JOIN - filter all the columns except the new ones.
    if (filter_left_keys)
        for (size_t i = 0; i < existing_columns; ++i)
            block.safeGetByPosition(i).column = block.safeGetByPosition(i).column->filter(*filter, -1);

    ColumnUInt64::Ptr mapping;

    /// Add join key columns from the right block if they have different names.
    for (size_t i = 0; i < key_names_right.size(); ++i)
    {
        auto & right_name = key_names_right[i];
        auto & left_name = key_names_left[i];

        if (needed_key_names_right.count(right_name) && !block.has(right_name))
        {
            const auto & col = block.getByName(left_name);
            auto column = col.column;
            if (!filter_left_keys)
            {
                if (!mapping)
                {
                    auto mut_mapping = ColumnUInt64::create(column->size());
                    auto & data = mut_mapping->getData();
                    size_t size = column->size();
                    for (size_t j = 0; j < size; ++j)
                        data[j] = (*filter)[j] ? j : size;

                    mapping = std::move(mut_mapping);
                }

                auto mut_column = (*std::move(column)).mutate();
                mut_column->insertDefault();
                column = mut_column->index(*mapping, 0);
            }
            block.insert({column, col.type, right_name});
        }
    }

    /// If ALL ... JOIN - we replicate all the columns except the new ones.
    if (offsets_to_replicate)
    {
        for (size_t i = 0; i < existing_columns; ++i)
            block.safeGetByPosition(i).column = block.safeGetByPosition(i).column->replicate(*offsets_to_replicate);
    }
}
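Two row-level primitives drive the block rewrite above: filtering the pre-existing columns by a 0/1 mask (the ANY INNER/RIGHT case) and replicating each row according to a cumulative offsets array (the ALL ... JOIN case). Below is a standalone sketch of both on a plain vector column, using the same convention that offsets[i] is the total number of output rows produced after input row i; Column, Filter and Offsets are simplified stand-ins for IColumn and friends.

#include <cstddef>
#include <cstdint>
#include <vector>

using Column = std::vector<int64_t>;
using Filter = std::vector<uint8_t>;      /// 1 = keep the row, 0 = drop it
using Offsets = std::vector<uint64_t>;    /// cumulative output row count per input row

/// Keep only the rows whose filter entry is non-zero.
static Column filterColumn(const Column & column, const Filter & filter)
{
    Column result;
    for (std::size_t i = 0; i < column.size(); ++i)
        if (filter[i])
            result.push_back(column[i]);
    return result;
}

/// Repeat row i exactly (offsets[i] - offsets[i - 1]) times: rows matched several times are duplicated,
/// while a row whose cumulative offset equals its predecessor's is dropped.
static Column replicateColumn(const Column & column, const Offsets & offsets)
{
    Column result;
    uint64_t prev = 0;
    for (std::size_t i = 0; i < column.size(); ++i)
    {
        for (uint64_t j = prev; j < offsets[i]; ++j)
            result.push_back(column[i]);
        prev = offsets[i];
    }
    return result;
}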