void Join::setSampleBlock(const Block & block) { std::unique_lock lock(rwlock); if (!empty()) return; size_t keys_size = key_names_right.size(); ColumnRawPtrs key_columns(keys_size); Columns materialized_columns(keys_size); for (size_t i = 0; i < keys_size; ++i) { materialized_columns[i] = recursiveRemoveLowCardinality(block.getByName(key_names_right[i]).column); key_columns[i] = materialized_columns[i].get(); /// We will join only keys, where all components are not NULL. if (key_columns[i]->isColumnNullable()) key_columns[i] = &static_cast<const ColumnNullable &>(*key_columns[i]).getNestedColumn(); } /// Choose data structure to use for JOIN. init(chooseMethod(key_columns, key_sizes)); sample_block_with_columns_to_add = materializeBlock(block); /// Move from `sample_block_with_columns_to_add` key columns to `sample_block_with_keys`, keeping the order. size_t pos = 0; while (pos < sample_block_with_columns_to_add.columns()) { const auto & name = sample_block_with_columns_to_add.getByPosition(pos).name; if (key_names_right.end() != std::find(key_names_right.begin(), key_names_right.end(), name)) { auto & col = sample_block_with_columns_to_add.getByPosition(pos); col.column = recursiveRemoveLowCardinality(col.column); col.type = recursiveRemoveLowCardinality(col.type); sample_block_with_keys.insert(col); sample_block_with_columns_to_add.erase(pos); } else ++pos; } size_t num_columns_to_add = sample_block_with_columns_to_add.columns(); for (size_t i = 0; i < num_columns_to_add; ++i) { auto & column = sample_block_with_columns_to_add.getByPosition(i); if (!column.column) column.column = column.type->createColumn(); } /// In case of LEFT and FULL joins, if use_nulls, convert joined columns to Nullable. if (use_nulls && (kind == ASTTableJoin::Kind::Left || kind == ASTTableJoin::Kind::Full)) for (size_t i = 0; i < num_columns_to_add; ++i) convertColumnToNullable(sample_block_with_columns_to_add.getByPosition(i)); }
void Join::joinBlockImpl(Block & block, const Maps & maps) const { size_t keys_size = key_names_left.size(); ColumnRawPtrs key_columns(keys_size); /// Rare case, when keys are constant. To avoid code bloat, simply materialize them. Columns materialized_columns; /// Memoize key columns to work with. for (size_t i = 0; i < keys_size; ++i) { key_columns[i] = block.getByName(key_names_left[i]).column.get(); if (ColumnPtr converted = key_columns[i]->convertToFullColumnIfConst()) { materialized_columns.emplace_back(converted); key_columns[i] = materialized_columns.back().get(); } } /// Keys with NULL value in any column won't join to anything. ColumnPtr null_map_holder; ConstNullMapPtr null_map{}; extractNestedColumnsAndNullMap(key_columns, null_map_holder, null_map); size_t existing_columns = block.columns(); /** If you use FULL or RIGHT JOIN, then the columns from the "left" table must be materialized. * Because if they are constants, then in the "not joined" rows, they may have different values * - default values, which can differ from the values of these constants. */ if (getFullness(kind)) { for (size_t i = 0; i < existing_columns; ++i) { auto & col = block.getByPosition(i).column; if (ColumnPtr converted = col->convertToFullColumnIfConst()) col = converted; /// If use_nulls, convert left columns (except keys) to Nullable. if (use_nulls) { if (std::end(key_names_left) == std::find(key_names_left.begin(), key_names_left.end(), block.getByPosition(i).name)) convertColumnToNullable(block.getByPosition(i)); } } } /** For LEFT/INNER JOIN, the saved blocks do not contain keys. * For FULL/RIGHT JOIN, the saved blocks contain keys; * but they will not be used at this stage of joining (and will be in `AdderNonJoined`), and they need to be skipped. */ size_t num_columns_to_skip = 0; if (getFullness(kind)) num_columns_to_skip = keys_size; /// Add new columns to the block. size_t num_columns_to_add = sample_block_with_columns_to_add.columns(); MutableColumns added_columns; added_columns.reserve(num_columns_to_add); std::vector<std::pair<decltype(ColumnWithTypeAndName::type), decltype(ColumnWithTypeAndName::name)>> added_type_name; added_type_name.reserve(num_columns_to_add); std::vector<size_t> right_indexes; right_indexes.reserve(num_columns_to_add); for (size_t i = 0; i < num_columns_to_add; ++i) { const ColumnWithTypeAndName & src_column = sample_block_with_columns_to_add.safeGetByPosition(i); /// Don't insert column if it's in left block. if (!block.has(src_column.name)) { added_columns.push_back(src_column.column->cloneEmpty()); added_columns.back()->reserve(src_column.column->size()); added_type_name.emplace_back(src_column.type, src_column.name); right_indexes.push_back(num_columns_to_skip + i); } } size_t rows = block.rows(); /// Used with ANY INNER JOIN std::unique_ptr<IColumn::Filter> filter; bool filter_left_keys = (kind == ASTTableJoin::Kind::Inner || kind == ASTTableJoin::Kind::Right) && strictness == ASTTableJoin::Strictness::Any; filter = std::make_unique<IColumn::Filter>(rows); /// Used with ALL ... JOIN IColumn::Offset current_offset = 0; std::unique_ptr<IColumn::Offsets> offsets_to_replicate; if (strictness == ASTTableJoin::Strictness::All) offsets_to_replicate = std::make_unique<IColumn::Offsets>(rows); switch (type) { #define M(TYPE) \ case Join::Type::TYPE: \ joinBlockImplType<KIND, STRICTNESS, typename KeyGetterForType<Join::Type::TYPE>::Type>(\ *maps.TYPE, rows, key_columns, key_sizes, added_columns, null_map, \ filter, current_offset, offsets_to_replicate, right_indexes); \ break; APPLY_FOR_JOIN_VARIANTS(M) #undef M default: throw Exception("Unknown JOIN keys variant.", ErrorCodes::UNKNOWN_SET_DATA_VARIANT); } const auto added_columns_size = added_columns.size(); for (size_t i = 0; i < added_columns_size; ++i) block.insert(ColumnWithTypeAndName(std::move(added_columns[i]), added_type_name[i].first, added_type_name[i].second)); /// If ANY INNER | RIGHT JOIN - filter all the columns except the new ones. if (filter_left_keys) for (size_t i = 0; i < existing_columns; ++i) block.safeGetByPosition(i).column = block.safeGetByPosition(i).column->filter(*filter, -1); ColumnUInt64::Ptr mapping; /// Add join key columns from right block if they has different name. for (size_t i = 0; i < key_names_right.size(); ++i) { auto & right_name = key_names_right[i]; auto & left_name = key_names_left[i]; if (needed_key_names_right.count(right_name) && !block.has(right_name)) { const auto & col = block.getByName(left_name); auto column = col.column; if (!filter_left_keys) { if (!mapping) { auto mut_mapping = ColumnUInt64::create(column->size()); auto & data = mut_mapping->getData(); size_t size = column->size(); for (size_t j = 0; j < size; ++j) data[j] = (*filter)[j] ? j : size; mapping = std::move(mut_mapping); } auto mut_column = (*std::move(column)).mutate(); mut_column->insertDefault(); column = mut_column->index(*mapping, 0); } block.insert({column, col.type, right_name}); } } /// If ALL ... JOIN - we replicate all the columns except the new ones. if (offsets_to_replicate) { for (size_t i = 0; i < existing_columns; ++i) block.safeGetByPosition(i).column = block.safeGetByPosition(i).column->replicate(*offsets_to_replicate); } }
bool Join::insertFromBlock(const Block & block) { std::unique_lock lock(rwlock); if (empty()) throw Exception("Logical error: Join was not initialized", ErrorCodes::LOGICAL_ERROR); size_t keys_size = key_names_right.size(); ColumnRawPtrs key_columns(keys_size); /// Rare case, when keys are constant. To avoid code bloat, simply materialize them. Columns materialized_columns; /// Memoize key columns to work. for (size_t i = 0; i < keys_size; ++i) { key_columns[i] = block.getByName(key_names_right[i]).column.get(); if (ColumnPtr converted = key_columns[i]->convertToFullColumnIfConst()) { materialized_columns.emplace_back(converted); key_columns[i] = materialized_columns.back().get(); } } /// We will insert to the map only keys, where all components are not NULL. ColumnPtr null_map_holder; ConstNullMapPtr null_map{}; extractNestedColumnsAndNullMap(key_columns, null_map_holder, null_map); size_t rows = block.rows(); blocks.push_back(block); Block * stored_block = &blocks.back(); if (getFullness(kind)) { /** Move the key columns to the beginning of the block. * This is where NonJoinedBlockInputStream will expect. */ size_t key_num = 0; for (const auto & name : key_names_right) { size_t pos = stored_block->getPositionByName(name); ColumnWithTypeAndName col = stored_block->safeGetByPosition(pos); stored_block->erase(pos); stored_block->insert(key_num, std::move(col)); ++key_num; } } else { /// Remove the key columns from stored_block, as they are not needed. for (const auto & name : key_names_right) stored_block->erase(stored_block->getPositionByName(name)); } size_t size = stored_block->columns(); /// Rare case, when joined columns are constant. To avoid code bloat, simply materialize them. for (size_t i = 0; i < size; ++i) { ColumnPtr col = stored_block->safeGetByPosition(i).column; if (ColumnPtr converted = col->convertToFullColumnIfConst()) stored_block->safeGetByPosition(i).column = converted; } /// In case of LEFT and FULL joins, if use_nulls, convert joined columns to Nullable. if (use_nulls && (kind == ASTTableJoin::Kind::Left || kind == ASTTableJoin::Kind::Full)) { for (size_t i = getFullness(kind) ? keys_size : 0; i < size; ++i) { convertColumnToNullable(stored_block->getByPosition(i)); } } if (kind != ASTTableJoin::Kind::Cross) { /// Fill the hash table. if (!getFullness(kind)) { if (strictness == ASTTableJoin::Strictness::Any) insertFromBlockImpl<ASTTableJoin::Strictness::Any>(type, maps_any, rows, key_columns, keys_size, key_sizes, stored_block, null_map, pool); else insertFromBlockImpl<ASTTableJoin::Strictness::All>(type, maps_all, rows, key_columns, keys_size, key_sizes, stored_block, null_map, pool); } else { if (strictness == ASTTableJoin::Strictness::Any) insertFromBlockImpl<ASTTableJoin::Strictness::Any>(type, maps_any_full, rows, key_columns, keys_size, key_sizes, stored_block, null_map, pool); else insertFromBlockImpl<ASTTableJoin::Strictness::All>(type, maps_all_full, rows, key_columns, keys_size, key_sizes, stored_block, null_map, pool); } } return limits.check(getTotalRowCount(), getTotalByteCount(), "JOIN", ErrorCodes::SET_SIZE_LIMIT_EXCEEDED); }