/** Returns a data type with LowCardinality wrappers removed.
  *
  * - A null pointer is returned as is.
  * - Array: rebuilt around the processed nested type.
  * - Tuple: rebuilt from the processed element types; element names are kept
  *   only when the source tuple has explicit names.
  * - LowCardinality: replaced by its dictionary type (returned without further processing).
  * - Anything else is returned unchanged.
  */
DataTypePtr recursiveRemoveLowCardinality(const DataTypePtr & type)
{
    if (!type)
        return type;

    const auto * raw_type = type.get();

    if (const auto * as_array = typeid_cast<const DataTypeArray *>(raw_type))
    {
        return std::make_shared<DataTypeArray>(recursiveRemoveLowCardinality(as_array->getNestedType()));
    }

    if (const auto * as_tuple = typeid_cast<const DataTypeTuple *>(raw_type))
    {
        DataTypes stripped_elements = as_tuple->getElements();
        for (size_t idx = 0; idx < stripped_elements.size(); ++idx)
            stripped_elements[idx] = recursiveRemoveLowCardinality(stripped_elements[idx]);

        /// Unnamed tuples go through the single-argument constructor.
        if (!as_tuple->haveExplicitNames())
            return std::make_shared<DataTypeTuple>(stripped_elements);

        return std::make_shared<DataTypeTuple>(stripped_elements, as_tuple->getElementNames());
    }

    if (const auto * as_low_cardinality = typeid_cast<const DataTypeLowCardinality *>(raw_type))
    {
        return as_low_cardinality->getDictionaryType();
    }

    return type;
}
void Join::setSampleBlock(const Block & block) { std::unique_lock lock(rwlock); if (!empty()) return; size_t keys_size = key_names_right.size(); ColumnRawPtrs key_columns(keys_size); Columns materialized_columns(keys_size); for (size_t i = 0; i < keys_size; ++i) { materialized_columns[i] = recursiveRemoveLowCardinality(block.getByName(key_names_right[i]).column); key_columns[i] = materialized_columns[i].get(); /// We will join only keys, where all components are not NULL. if (key_columns[i]->isColumnNullable()) key_columns[i] = &static_cast<const ColumnNullable &>(*key_columns[i]).getNestedColumn(); } /// Choose data structure to use for JOIN. init(chooseMethod(key_columns, key_sizes)); sample_block_with_columns_to_add = materializeBlock(block); /// Move from `sample_block_with_columns_to_add` key columns to `sample_block_with_keys`, keeping the order. size_t pos = 0; while (pos < sample_block_with_columns_to_add.columns()) { const auto & name = sample_block_with_columns_to_add.getByPosition(pos).name; if (key_names_right.end() != std::find(key_names_right.begin(), key_names_right.end(), name)) { auto & col = sample_block_with_columns_to_add.getByPosition(pos); col.column = recursiveRemoveLowCardinality(col.column); col.type = recursiveRemoveLowCardinality(col.type); sample_block_with_keys.insert(col); sample_block_with_columns_to_add.erase(pos); } else ++pos; } size_t num_columns_to_add = sample_block_with_columns_to_add.columns(); for (size_t i = 0; i < num_columns_to_add; ++i) { auto & column = sample_block_with_columns_to_add.getByPosition(i); if (!column.column) column.column = column.type->createColumn(); } /// In case of LEFT and FULL joins, if use_nulls, convert joined columns to Nullable. if (use_nulls && (kind == ASTTableJoin::Kind::Left || kind == ASTTableJoin::Kind::Full)) for (size_t i = 0; i < num_columns_to_add; ++i) convertColumnToNullable(sample_block_with_columns_to_add.getByPosition(i)); }
void Join::checkTypesOfKeys(const Block & block_left, const Names & key_names_left, const Block & block_right) const { size_t keys_size = key_names_left.size(); for (size_t i = 0; i < keys_size; ++i) { /// Compare up to Nullability. DataTypePtr left_type = removeNullable(recursiveRemoveLowCardinality(block_left.getByName(key_names_left[i]).type)); DataTypePtr right_type = removeNullable(recursiveRemoveLowCardinality(block_right.getByName(key_names_right[i]).type)); if (!left_type->equals(*right_type)) throw Exception("Type mismatch of columns to JOIN by: " + key_names_left[i] + " " + left_type->getName() + " at left, " + key_names_right[i] + " " + right_type->getName() + " at right", ErrorCodes::TYPE_MISMATCH); } }
/** Returns a column with LowCardinality columns replaced by full columns.
  *
  * - A null pointer is returned as is.
  * - Array: rebuilt from the processed data column and the original offsets.
  * - Const: rebuilt from the processed data column with the same size.
  * - Tuple: rebuilt from the processed element columns.
  * - LowCardinality: replaced by the result of convertToFullColumn().
  * - Anything else is returned unchanged.
  */
ColumnPtr recursiveRemoveLowCardinality(const ColumnPtr & column)
{
    if (!column)
        return column;

    const auto * raw_column = column.get();

    if (const auto * as_array = typeid_cast<const ColumnArray *>(raw_column))
    {
        return ColumnArray::create(
            recursiveRemoveLowCardinality(as_array->getDataPtr()),
            as_array->getOffsetsPtr());
    }

    if (const auto * as_const = typeid_cast<const ColumnConst *>(raw_column))
    {
        return ColumnConst::create(
            recursiveRemoveLowCardinality(as_const->getDataColumnPtr()),
            as_const->size());
    }

    if (const auto * as_tuple = typeid_cast<const ColumnTuple *>(raw_column))
    {
        Columns stripped_elements = as_tuple->getColumns();
        for (size_t idx = 0; idx < stripped_elements.size(); ++idx)
            stripped_elements[idx] = recursiveRemoveLowCardinality(stripped_elements[idx]);

        return ColumnTuple::create(stripped_elements);
    }

    if (const auto * as_low_cardinality = typeid_cast<const ColumnLowCardinality *>(raw_column))
    {
        return as_low_cardinality->convertToFullColumn();
    }

    return column;
}
void NativeBlockOutputStream::write(const Block & block) { /// Additional information about the block. if (client_revision > 0) block.info.write(ostr); block.checkNumberOfRows(); /// Dimensions size_t columns = block.columns(); size_t rows = block.rows(); writeVarUInt(columns, ostr); writeVarUInt(rows, ostr); /** The index has the same structure as the data stream. * But instead of column values, it contains a mark that points to the location in the data file where this part of the column is located. */ if (index_ostr) { writeVarUInt(columns, *index_ostr); writeVarUInt(rows, *index_ostr); } for (size_t i = 0; i < columns; ++i) { /// For the index. MarkInCompressedFile mark; if (index_ostr) { ostr_concrete->next(); /// Finish compressed block. mark.offset_in_compressed_file = initial_size_of_file + ostr_concrete->getCompressedBytes(); mark.offset_in_decompressed_block = ostr_concrete->getRemainingBytes(); } ColumnWithTypeAndName column = block.safeGetByPosition(i); /// Send data to old clients without low cardinality type. if (remove_low_cardinality || (client_revision && client_revision < DBMS_MIN_REVISION_WITH_LOW_CARDINALITY_TYPE)) { column.column = recursiveRemoveLowCardinality(column.column); column.type = recursiveRemoveLowCardinality(column.type); } /// Name writeStringBinary(column.name, ostr); /// Type String type_name = column.type->getName(); /// For compatibility, we will not send explicit timezone parameter in DateTime data type /// to older clients, that cannot understand it. if (client_revision < DBMS_MIN_REVISION_WITH_TIME_ZONE_PARAMETER_IN_DATETIME_DATA_TYPE && startsWith(type_name, "DateTime(")) type_name = "DateTime"; writeStringBinary(type_name, ostr); /// Data if (rows) /// Zero items of data is always represented as zero number of bytes. 
writeData(*column.type, column.column, ostr, 0, 0); if (index_ostr) { writeStringBinary(column.name, *index_ostr); writeStringBinary(column.type->getName(), *index_ostr); writeBinary(mark.offset_in_compressed_file, *index_ostr); writeBinary(mark.offset_in_decompressed_block, *index_ostr); } } }
/** Joins `block` (the left-hand side) against the data in `maps_`, modifying `block` in place.
  *
  * Steps, in order:
  *  - materialize the left key columns (constants expanded, LowCardinality removed) and extract the NULL map;
  *  - when getFullness(kind) holds, materialize all left columns and, with use_nulls, make the non-key ones Nullable;
  *  - prepare an empty column for every column of `sample_block_with_columns_to_add` that is absent
  *    from `block` and present in `block_with_columns_to_add`;
  *  - call joinBlockImplType for the current key `type`, passing it the prepared columns, `filter`,
  *    `current_offset` and `offsets_to_replicate`;
  *  - append the prepared columns to `block`; for ANY INNER / ANY RIGHT joins filter the pre-existing columns;
  *    append the right key columns listed in `needed_key_names_right` that `block` does not have yet;
  *    for ALL joins replicate the pre-existing columns.
  *
  * Parameters:
  *  block                       - left block, modified in place.
  *  key_names_left              - names of the key columns in `block`; matched by index with `key_names_right`.
  *  needed_key_names_right      - right key names that must be present in the result.
  *  block_with_columns_to_add   - only columns present in this block are added from the right side.
  *  maps_                       - holder of the per-variant maps; the member matching `type` is used.
  *
  * Throws Exception with UNKNOWN_SET_DATA_VARIANT when `type` matches no known variant.
  *
  * NOTE(review): KIND and STRICTNESS used below are not declared in the visible signature;
  * presumably they are template parameters declared on a line not shown here - confirm.
  */
void Join::joinBlockImpl(
    Block & block,
    const Names & key_names_left,
    const NameSet & needed_key_names_right,
    const Block & block_with_columns_to_add,
    const Maps & maps_) const
{
    size_t keys_size = key_names_left.size();
    ColumnRawPtrs key_columns(keys_size);

    /// Rare case, when keys are constant. To avoid code bloat, simply materialize them.
    /// `materialized_columns` owns the columns that the raw pointers in `key_columns` refer to.
    Columns materialized_columns;
    materialized_columns.reserve(keys_size);

    /// Memoize key columns to work with.
    for (size_t i = 0; i < keys_size; ++i)
    {
        materialized_columns.emplace_back(recursiveRemoveLowCardinality(block.getByName(key_names_left[i]).column->convertToFullColumnIfConst()));
        key_columns[i] = materialized_columns.back().get();
    }

    /// Keys with NULL value in any column won't join to anything.
    ColumnPtr null_map_holder;
    ConstNullMapPtr null_map{};
    extractNestedColumnsAndNullMap(key_columns, null_map_holder, null_map);

    /// Number of columns in the block before anything is added; used below to address only the original columns.
    size_t existing_columns = block.columns();

    /** If you use FULL or RIGHT JOIN, then the columns from the "left" table must be materialized.
      * Because if they are constants, then in the "not joined" rows, they may have different values
      *  - default values, which can differ from the values of these constants.
      */
    if (getFullness(kind))
    {
        for (size_t i = 0; i < existing_columns; ++i)
        {
            block.getByPosition(i).column = block.getByPosition(i).column->convertToFullColumnIfConst();

            /// If use_nulls, convert left columns (except keys) to Nullable.
            if (use_nulls)
            {
                if (std::end(key_names_left) == std::find(key_names_left.begin(), key_names_left.end(), block.getByPosition(i).name))
                    convertColumnToNullable(block.getByPosition(i));
            }
        }
    }

    /** For LEFT/INNER JOIN, the saved blocks do not contain keys.
      * For FULL/RIGHT JOIN, the saved blocks contain keys;
      *  but they will not be used at this stage of joining (and will be in `AdderNonJoined`), and they need to be skipped.
      */
    size_t num_columns_to_skip = 0;
    if (getFullness(kind))
        num_columns_to_skip = keys_size;

    /// Add new columns to the block.
    size_t num_columns_to_add = sample_block_with_columns_to_add.columns();
    MutableColumns added_columns;
    added_columns.reserve(num_columns_to_add);

    /// Type and name of each entry in `added_columns`, kept in the same order.
    std::vector<std::pair<decltype(ColumnWithTypeAndName::type), decltype(ColumnWithTypeAndName::name)>> added_type_name;
    added_type_name.reserve(num_columns_to_add);

    /// For each added column: its position in the sample block shifted by `num_columns_to_skip`.
    std::vector<size_t> right_indexes;
    right_indexes.reserve(num_columns_to_add);

    for (size_t i = 0; i < num_columns_to_add; ++i)
    {
        const ColumnWithTypeAndName & src_column = sample_block_with_columns_to_add.safeGetByPosition(i);

        /// Don't insert column if it's in left block or not explicitly required.
        if (!block.has(src_column.name) && block_with_columns_to_add.has(src_column.name))
        {
            added_columns.push_back(src_column.column->cloneEmpty());
            added_columns.back()->reserve(src_column.column->size());
            added_type_name.emplace_back(src_column.type, src_column.name);
            right_indexes.push_back(num_columns_to_skip + i);
        }
    }

    size_t rows = block.rows();

    std::unique_ptr<IColumn::Filter> filter;

    /// The left columns are filtered only for ANY INNER and ANY RIGHT joins;
    /// the filter itself is always allocated because it is also read when building `mapping` below.
    bool filter_left_keys = (kind == ASTTableJoin::Kind::Inner || kind == ASTTableJoin::Kind::Right) && strictness == ASTTableJoin::Strictness::Any;
    filter = std::make_unique<IColumn::Filter>(rows);

    /// Used with ALL ... JOIN
    IColumn::Offset current_offset = 0;
    std::unique_ptr<IColumn::Offsets> offsets_to_replicate;

    if (strictness == ASTTableJoin::Strictness::All)
        offsets_to_replicate = std::make_unique<IColumn::Offsets>(rows);

    /// Dispatch on the key variant: M expands to one `case` per variant listed by APPLY_FOR_JOIN_VARIANTS.
    switch (type)
    {
    #define M(TYPE) \
        case Join::Type::TYPE: \
            joinBlockImplType<KIND, STRICTNESS, typename KeyGetterForType<Join::Type::TYPE>::Type>(\
                *maps_.TYPE, rows, key_columns, key_sizes, added_columns, null_map, \
                filter, current_offset, offsets_to_replicate, right_indexes); \
            break;
        APPLY_FOR_JOIN_VARIANTS(M)
    #undef M

        default:
            throw Exception("Unknown JOIN keys variant.", ErrorCodes::UNKNOWN_SET_DATA_VARIANT);
    }

    /// Append the columns taken from the right side after the original ones.
    const auto added_columns_size = added_columns.size();
    for (size_t i = 0; i < added_columns_size; ++i)
        block.insert(ColumnWithTypeAndName(std::move(added_columns[i]), added_type_name[i].first, added_type_name[i].second));

    /// If ANY INNER | RIGHT JOIN - filter all the columns except the new ones.
    if (filter_left_keys)
        for (size_t i = 0; i < existing_columns; ++i)
            block.safeGetByPosition(i).column = block.safeGetByPosition(i).column->filter(*filter, -1);

    /// Built lazily for the first key column that needs it and reused for the following ones.
    ColumnUInt64::Ptr mapping;

    /// Add join key columns from the right block if they have a different name.
    for (size_t i = 0; i < key_names_right.size(); ++i)
    {
        auto & right_name = key_names_right[i];
        auto & left_name = key_names_left[i];

        if (needed_key_names_right.count(right_name) && !block.has(right_name))
        {
            /// The right key column is produced from the left key column with the same index.
            const auto & col = block.getByName(left_name);
            auto column = col.column;
            if (!filter_left_keys)
            {
                if (!mapping)
                {
                    auto mut_mapping = ColumnUInt64::create(column->size());
                    auto & data = mut_mapping->getData();
                    size_t size = column->size();

                    /// Rows with a non-zero filter entry keep their own index;
                    /// the others point at index `size`, where a default value is appended below.
                    for (size_t j = 0; j < size; ++j)
                        data[j] = (*filter)[j] ? j : size;

                    mapping = std::move(mut_mapping);
                }

                /// `column` is moved from here and reassigned two lines below.
                auto mut_column = (*std::move(column)).mutate();
                mut_column->insertDefault();
                column = mut_column->index(*mapping, 0);
            }
            block.insert({column, col.type, right_name});
        }
    }

    /// If ALL ... JOIN - we replicate all the columns except the new ones.
    if (offsets_to_replicate)
    {
        for (size_t i = 0; i < existing_columns; ++i)
            block.safeGetByPosition(i).column = block.safeGetByPosition(i).column->replicate(*offsets_to_replicate);
    }
}
bool Join::insertFromBlock(const Block & block) { std::unique_lock lock(rwlock); if (empty()) throw Exception("Logical error: Join was not initialized", ErrorCodes::LOGICAL_ERROR); size_t keys_size = key_names_right.size(); ColumnRawPtrs key_columns(keys_size); /// Rare case, when keys are constant. To avoid code bloat, simply materialize them. Columns materialized_columns; materialized_columns.reserve(keys_size); /// Memoize key columns to work. for (size_t i = 0; i < keys_size; ++i) { materialized_columns.emplace_back(recursiveRemoveLowCardinality(block.getByName(key_names_right[i]).column->convertToFullColumnIfConst())); key_columns[i] = materialized_columns.back().get(); } /// We will insert to the map only keys, where all components are not NULL. ColumnPtr null_map_holder; ConstNullMapPtr null_map{}; extractNestedColumnsAndNullMap(key_columns, null_map_holder, null_map); size_t rows = block.rows(); blocks.push_back(block); Block * stored_block = &blocks.back(); if (getFullness(kind)) { /** Move the key columns to the beginning of the block. * This is where NonJoinedBlockInputStream will expect. */ size_t key_num = 0; for (const auto & name : key_names_right) { size_t pos = stored_block->getPositionByName(name); ColumnWithTypeAndName col = stored_block->safeGetByPosition(pos); stored_block->erase(pos); stored_block->insert(key_num, std::move(col)); ++key_num; } } else { /// Remove the key columns from stored_block, as they are not needed. for (const auto & name : key_names_right) stored_block->erase(stored_block->getPositionByName(name)); } size_t size = stored_block->columns(); /// Rare case, when joined columns are constant. To avoid code bloat, simply materialize them. for (size_t i = 0; i < size; ++i) stored_block->safeGetByPosition(i).column = stored_block->safeGetByPosition(i).column->convertToFullColumnIfConst(); /// In case of LEFT and FULL joins, if use_nulls, convert joined columns to Nullable. 
if (use_nulls && (kind == ASTTableJoin::Kind::Left || kind == ASTTableJoin::Kind::Full)) { for (size_t i = getFullness(kind) ? keys_size : 0; i < size; ++i) { convertColumnToNullable(stored_block->getByPosition(i)); } } if (kind != ASTTableJoin::Kind::Cross) { dispatch([&](auto, auto strictness_, auto & map) { insertFromBlockImpl<strictness_>(type, map, rows, key_columns, keys_size, key_sizes, stored_block, null_map, pool); }); } return limits.check(getTotalRowCount(), getTotalByteCount(), "JOIN", ErrorCodes::SET_SIZE_LIMIT_EXCEEDED); }