void DistributedBlockOutputStream::writeToShard(const Block & block, const std::vector<std::string> & dir_names)
{
    /** tmp directory is used to ensure atomicity of transactions
      * and keep monitor thread out from reading incomplete data
      */
    std::string first_file_tmp_path{};

    auto first = true;
    const auto & query_string = queryToString(query_ast);

    /// write first file, hardlink the others
    for (const auto & dir_name : dir_names)
    {
        const auto & path = storage.getPath() + dir_name + '/';

        /// ensure shard subdirectory creation and notify storage
        if (Poco::File(path).createDirectory())
            storage.requireDirectoryMonitor(dir_name);

        const auto & file_name = toString(storage.file_names_increment.get()) + ".bin";
        const auto & block_file_path = path + file_name;

        /** on first iteration write block to a temporary directory for subsequent hardlinking to ensure
          * the inode is not freed until we're done
          */
        if (first)
        {
            first = false;

            const auto & tmp_path = path + "tmp/";
            Poco::File(tmp_path).createDirectory();
            const auto & block_file_tmp_path = tmp_path + file_name;

            first_file_tmp_path = block_file_tmp_path;

            WriteBufferFromFile out{block_file_tmp_path};
            CompressedWriteBuffer compress{out};
            NativeBlockOutputStream stream{compress, ClickHouseRevision::get()};

            writeStringBinary(query_string, out);

            stream.writePrefix();
            stream.write(block);
            stream.writeSuffix();
        }

        if (link(first_file_tmp_path.data(), block_file_path.data()))
            throwFromErrno("Could not link " + block_file_path + " to " + first_file_tmp_path);
    }

    /** remove the temporary file, enabling the OS to reclaim inode after all threads
      * have removed their corresponding files
      */
    Poco::File(first_file_tmp_path).remove();
}
Block InterpreterSelectWithUnionQuery::getSampleBlock(
    const ASTPtr & query_ptr,
    const Context & context)
{
    auto & cache = context.getSampleBlockCache();

    /// Using query string because query_ptr changes for every internal SELECT
    auto key = queryToString(query_ptr);
    if (cache.find(key) != cache.end())
    {
        return cache[key];
    }

    return cache[key] = InterpreterSelectWithUnionQuery(query_ptr, context, {}, QueryProcessingStage::Complete, 0, true).getSampleBlock();
}
std::string ReshardingJob::toString() const
{
    std::string serialized_job;
    WriteBufferFromString buf{serialized_job};

    writeBinary(database_name, buf);
    writeBinary(table_name, buf);
    writeBinary(partition, buf);
    writeBinary(queryToString(sharding_key_expr), buf);
    writeBinary(coordinator_id, buf);
    writeVarUInt(block_number, buf);
    writeBinary(do_copy, buf);

    writeVarUInt(paths.size(), buf);
    for (const auto & path : paths)
    {
        writeBinary(path.first, buf);
        writeVarUInt(path.second, buf);
    }
    buf.next();

    return serialized_job;
}
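For context, a hypothetical sketch of the inverse operation, assuming a ReadBufferFromString and the same field order as the serialization above; the free-function name, the default-constructed job, and the omitted re-parsing of the sharding key expression are assumptions, not taken from the source:

/// Hypothetical sketch: read back the fields written by ReshardingJob::toString()
/// in the same order. Not part of the source listed above.
ReshardingJob parseReshardingJob(const std::string & serialized_job)
{
    ReshardingJob job;
    ReadBufferFromString buf{serialized_job};

    String sharding_key_expr_string;

    readBinary(job.database_name, buf);
    readBinary(job.table_name, buf);
    readBinary(job.partition, buf);
    readBinary(sharding_key_expr_string, buf);
    readBinary(job.coordinator_id, buf);
    readVarUInt(job.block_number, buf);
    readBinary(job.do_copy, buf);

    size_t num_paths = 0;
    readVarUInt(num_paths, buf);
    job.paths.resize(num_paths);
    for (auto & path : job.paths)
    {
        readBinary(path.first, buf);
        readVarUInt(path.second, buf);
    }

    /// The sharding key expression would still need to be re-parsed from
    /// sharding_key_expr_string into an AST (omitted here).
    return job;
}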
BlockInputStreams StorageSystemColumns::read(
    const Names & column_names,
    ASTPtr query,
    const Context & context,
    const Settings & settings,
    QueryProcessingStage::Enum & processed_stage,
    const size_t max_block_size,
    const unsigned threads)
{
    check(column_names);
    processed_stage = QueryProcessingStage::FetchColumns;

    Block block;

    std::map<std::pair<std::string, std::string>, StoragePtr> storages;

    {
        Databases databases = context.getDatabases();

        /// Add the "database" column.
        ColumnPtr database_column = std::make_shared<ColumnString>();
        for (const auto & database : databases)
            database_column->insert(database.first);
        block.insert(ColumnWithTypeAndName(database_column, std::make_shared<DataTypeString>(), "database"));

        /// Filter the block by the "database" column.
        VirtualColumnUtils::filterBlockWithQuery(query, block, context);
        if (!block.rows())
            return BlockInputStreams();

        database_column = block.getByName("database").column;
        size_t rows = database_column->size();

        /// Add the "table" column.
        ColumnPtr table_column = std::make_shared<ColumnString>();
        IColumn::Offsets_t offsets(rows);
        for (size_t i = 0; i < rows; ++i)
        {
            const std::string database_name = (*database_column)[i].get<std::string>();
            const DatabasePtr database = databases.at(database_name);
            offsets[i] = i ? offsets[i - 1] : 0;

            for (auto iterator = database->getIterator(); iterator->isValid(); iterator->next())
            {
                const String & table_name = iterator->name();
                storages.emplace(std::piecewise_construct,
                    std::forward_as_tuple(database_name, table_name),
                    std::forward_as_tuple(iterator->table()));
                table_column->insert(table_name);
                offsets[i] += 1;
            }
        }

        for (size_t i = 0; i < block.columns(); ++i)
        {
            ColumnPtr & column = block.getByPosition(i).column;
            column = column->replicate(offsets);
        }

        block.insert(ColumnWithTypeAndName(table_column, std::make_shared<DataTypeString>(), "table"));
    }

    /// Filter the block by the "database" and "table" columns.
    VirtualColumnUtils::filterBlockWithQuery(query, block, context);
    if (!block.rows())
        return BlockInputStreams();

    ColumnPtr filtered_database_column = block.getByName("database").column;
    ColumnPtr filtered_table_column = block.getByName("table").column;

    /// Build the result.
    ColumnPtr database_column = std::make_shared<ColumnString>();
    ColumnPtr table_column = std::make_shared<ColumnString>();
    ColumnPtr name_column = std::make_shared<ColumnString>();
    ColumnPtr type_column = std::make_shared<ColumnString>();
    ColumnPtr default_type_column = std::make_shared<ColumnString>();
    ColumnPtr default_expression_column = std::make_shared<ColumnString>();
    ColumnPtr bytes_column = std::make_shared<ColumnUInt64>();

    size_t rows = filtered_database_column->size();
    for (size_t i = 0; i < rows; ++i)
    {
        const std::string database_name = (*filtered_database_column)[i].get<std::string>();
        const std::string table_name = (*filtered_table_column)[i].get<std::string>();

        NamesAndTypesList columns;
        ColumnDefaults column_defaults;
        std::unordered_map<String, size_t> column_sizes;

        {
            StoragePtr storage = storages.at(std::make_pair(database_name, table_name));
            IStorage::TableStructureReadLockPtr table_lock;

            try
            {
                table_lock = storage->lockStructure(false);
            }
            catch (const Exception & e)
            {
                /** There are cases when IStorage::drop has been called,
                  * but we still own the object.
                  * Then the table will throw an exception on an attempt to lock it.
                  * Just skip the table.
                  */
                if (e.code() == ErrorCodes::TABLE_IS_DROPPED)
                    continue;
                else
                    throw;
            }

            columns = storage->getColumnsList();
            columns.insert(std::end(columns), std::begin(storage->alias_columns), std::end(storage->alias_columns));
            column_defaults = storage->column_defaults;

            /** Column size data for tables of the MergeTree family.
              * NOTE: Later, an interface could be added to IStorage to obtain column sizes.
              */
            if (auto storage_concrete = dynamic_cast<StorageMergeTree *>(storage.get()))
            {
                column_sizes = storage_concrete->getData().getColumnSizes();
            }
            else if (auto storage_concrete = dynamic_cast<StorageReplicatedMergeTree *>(storage.get()))
            {
                column_sizes = storage_concrete->getData().getColumnSizes();

                auto unreplicated_data = storage_concrete->getUnreplicatedData();
                if (unreplicated_data)
                {
                    auto unreplicated_column_sizes = unreplicated_data->getColumnSizes();
                    for (const auto & name_size : unreplicated_column_sizes)
                        column_sizes[name_size.first] += name_size.second;
                }
            }
        }

        for (const auto & column : columns)
        {
            database_column->insert(database_name);
            table_column->insert(table_name);
            name_column->insert(column.name);
            type_column->insert(column.type->getName());

            {
                const auto it = column_defaults.find(column.name);
                if (it == std::end(column_defaults))
                {
                    default_type_column->insertDefault();
                    default_expression_column->insertDefault();
                }
                else
                {
                    default_type_column->insert(toString(it->second.type));
                    default_expression_column->insert(queryToString(it->second.expression));
                }
            }

            {
                const auto it = column_sizes.find(column.name);
                if (it == std::end(column_sizes))
                    bytes_column->insertDefault();
                else
                    bytes_column->insert(it->second);
            }
        }
    }

    block.clear();

    block.insert(ColumnWithTypeAndName(database_column, std::make_shared<DataTypeString>(), "database"));
    block.insert(ColumnWithTypeAndName(table_column, std::make_shared<DataTypeString>(), "table"));
    block.insert(ColumnWithTypeAndName(name_column, std::make_shared<DataTypeString>(), "name"));
    block.insert(ColumnWithTypeAndName(type_column, std::make_shared<DataTypeString>(), "type"));
    block.insert(ColumnWithTypeAndName(default_type_column, std::make_shared<DataTypeString>(), "default_type"));
    block.insert(ColumnWithTypeAndName(default_expression_column, std::make_shared<DataTypeString>(), "default_expression"));
    block.insert(ColumnWithTypeAndName(bytes_column, std::make_shared<DataTypeUInt64>(), "bytes"));

    return BlockInputStreams{ 1, std::make_shared<OneBlockInputStream>(block) };
}
void StorageDistributed::reshardPartitions(
    ASTPtr query, const String & database_name,
    const Field & first_partition, const Field & last_partition,
    const WeightedZooKeeperPaths & weighted_zookeeper_paths,
    const ASTPtr & sharding_key_expr, bool do_copy, const Field & coordinator,
    const Settings & settings)
{
    auto & resharding_worker = context.getReshardingWorker();
    if (!resharding_worker.isStarted())
        throw Exception{"Resharding background thread is not running", ErrorCodes::RESHARDING_NO_WORKER};

    if (!coordinator.isNull())
        throw Exception{"Use of COORDINATE WITH is forbidden in ALTER TABLE ... RESHARD"
            " queries for distributed tables",
            ErrorCodes::RESHARDING_INVALID_PARAMETERS};

    std::string coordinator_id = resharding_worker.createCoordinator(cluster);

    std::atomic<bool> has_notified_error{false};

    std::string dumped_coordinator_state;

    auto handle_exception = [&](const std::string & msg = "")
    {
        try
        {
            if (!has_notified_error)
                resharding_worker.setStatus(coordinator_id, ReshardingWorker::STATUS_ERROR, msg);
            dumped_coordinator_state = resharding_worker.dumpCoordinatorState(coordinator_id);
            resharding_worker.deleteCoordinator(coordinator_id);
        }
        catch (...)
        {
            tryLogCurrentException(__PRETTY_FUNCTION__);
        }
    };

    try
    {
        /// Create the query ALTER TABLE ... RESHARD [COPY] PARTITION ... COORDINATE WITH ...

        ASTPtr alter_query_ptr = std::make_shared<ASTAlterQuery>();
        auto & alter_query = static_cast<ASTAlterQuery &>(*alter_query_ptr);

        alter_query.database = remote_database;
        alter_query.table = remote_table;

        alter_query.parameters.emplace_back();
        ASTAlterQuery::Parameters & parameters = alter_query.parameters.back();

        parameters.type = ASTAlterQuery::RESHARD_PARTITION;
        if (!first_partition.isNull())
            parameters.partition = std::make_shared<ASTLiteral>(StringRange(), first_partition);
        if (!last_partition.isNull())
            parameters.last_partition = std::make_shared<ASTLiteral>(StringRange(), last_partition);

        ASTPtr expr_list = std::make_shared<ASTExpressionList>();
        for (const auto & entry : weighted_zookeeper_paths)
        {
            ASTPtr weighted_path_ptr = std::make_shared<ASTWeightedZooKeeperPath>();
            auto & weighted_path = static_cast<ASTWeightedZooKeeperPath &>(*weighted_path_ptr);
            weighted_path.path = entry.first;
            weighted_path.weight = entry.second;
            expr_list->children.push_back(weighted_path_ptr);
        }

        parameters.weighted_zookeeper_paths = expr_list;
        parameters.sharding_key_expr = sharding_key_expr;
        parameters.do_copy = do_copy;
        parameters.coordinator = std::make_shared<ASTLiteral>(StringRange(), Field(coordinator_id));

        resharding_worker.registerQuery(coordinator_id, queryToString(alter_query_ptr));

        /** The shard_multiplexing functionality is not finished, so we disable it.
          * (Because connections to different shards within a single thread are not established in parallel.)
          * For details, see https://███████████.yandex-team.ru/METR-18300
          */
        bool enable_shard_multiplexing = false;

        ClusterProxy::AlterQueryConstructor alter_query_constructor;

        BlockInputStreams streams = ClusterProxy::Query{alter_query_constructor, cluster, alter_query_ptr,
            context, settings, enable_shard_multiplexing}.execute();

        /// This callback is called if an exception has occurred while attempting to read
        /// a block from a shard. This is to avoid a potential deadlock if other shards are
        /// waiting inside a barrier. Actually, even without this solution, we would avoid
        /// such a deadlock because we would eventually time out while trying to get remote
        /// blocks. Nevertheless this is not the ideal way of sorting out this issue since
        /// we would then not get to know the actual cause of the failure.
        auto exception_callback = [&resharding_worker, coordinator_id, &has_notified_error]()
        {
            try
            {
                resharding_worker.setStatus(coordinator_id, ReshardingWorker::STATUS_ERROR);
                has_notified_error = true;
            }
            catch (...)
            {
                tryLogCurrentException(__PRETTY_FUNCTION__);
            }
        };

        streams[0] = std::make_shared<UnionBlockInputStream<>>(
            streams, nullptr, settings.max_distributed_connections, exception_callback);
        streams.resize(1);

        auto stream_ptr = dynamic_cast<IProfilingBlockInputStream *>(&*streams[0]);
        if (stream_ptr == nullptr)
            throw Exception{"StorageDistributed: Internal error", ErrorCodes::LOGICAL_ERROR};
        auto & stream = *stream_ptr;

        stream.readPrefix();

        while (!stream.isCancelled() && stream.read())
            ;

        if (!stream.isCancelled())
            stream.readSuffix();
    }
    catch (const Exception & ex)
    {
        handle_exception(ex.message());
        LOG_ERROR(log, dumped_coordinator_state);
        throw;
    }
    catch (const std::exception & ex)
    {
        handle_exception(ex.what());
        LOG_ERROR(log, dumped_coordinator_state);
        throw;
    }
    catch (...)
    {
        handle_exception();
        LOG_ERROR(log, dumped_coordinator_state);
        throw;
    }
}
Block readImpl() override
{
    if (done)
        return {};

    Block res = header;
    MutableColumns res_columns = header.cloneEmptyColumns();
    size_t rows_count = 0;

    while (rows_count < max_block_size)
    {
        if (tables_it && !tables_it->isValid())
            ++database_idx;

        while (database_idx < databases->size() && (!tables_it || !tables_it->isValid()))
        {
            database_name = databases->getDataAt(database_idx).toString();
            database = context.tryGetDatabase(database_name);

            if (!database || !context.hasDatabaseAccessRights(database_name))
            {
                /// Database was deleted just now or the user has no access.
                ++database_idx;
                continue;
            }

            break;
        }

        /// This is for temporary tables. They are output in a single block regardless of max_block_size.
        if (database_idx >= databases->size())
        {
            if (context.hasSessionContext())
            {
                Tables external_tables = context.getSessionContext().getExternalTables();

                for (auto table : external_tables)
                {
                    size_t src_index = 0;
                    size_t res_index = 0;

                    if (columns_mask[src_index++])
                        res_columns[res_index++]->insertDefault();

                    if (columns_mask[src_index++])
                        res_columns[res_index++]->insert(table.first);

                    if (columns_mask[src_index++])
                        res_columns[res_index++]->insert(table.second->getName());

                    if (columns_mask[src_index++])
                        res_columns[res_index++]->insert(1u);

                    if (columns_mask[src_index++])
                        res_columns[res_index++]->insertDefault();

                    if (columns_mask[src_index++])
                        res_columns[res_index++]->insertDefault();

                    if (columns_mask[src_index++])
                        res_columns[res_index++]->insertDefault();

                    if (columns_mask[src_index++])
                        res_columns[res_index++]->insertDefault();

                    if (columns_mask[src_index++])
                        res_columns[res_index++]->insertDefault();

                    if (columns_mask[src_index++])
                        res_columns[res_index++]->insertDefault();

                    if (columns_mask[src_index++])
                        res_columns[res_index++]->insert(table.second->getName());

                    if (columns_mask[src_index++])
                        res_columns[res_index++]->insertDefault();

                    if (columns_mask[src_index++])
                        res_columns[res_index++]->insertDefault();

                    if (columns_mask[src_index++])
                        res_columns[res_index++]->insertDefault();

                    if (columns_mask[src_index++])
                        res_columns[res_index++]->insertDefault();
                }
            }

            res.setColumns(std::move(res_columns));
            done = true;
            return res;
        }

        if (!tables_it || !tables_it->isValid())
            tables_it = database->getIterator(context);

        for (; rows_count < max_block_size && tables_it->isValid(); tables_it->next())
        {
            ++rows_count;
            auto table_name = tables_it->name();

            size_t src_index = 0;
            size_t res_index = 0;

            if (columns_mask[src_index++])
                res_columns[res_index++]->insert(database_name);

            if (columns_mask[src_index++])
                res_columns[res_index++]->insert(table_name);

            if (columns_mask[src_index++])
                res_columns[res_index++]->insert(tables_it->table()->getName());

            if (columns_mask[src_index++])
                res_columns[res_index++]->insert(0u);  // is_temporary

            if (columns_mask[src_index++])
                res_columns[res_index++]->insert(tables_it->table()->getDataPath());

            if (columns_mask[src_index++])
                res_columns[res_index++]->insert(database->getTableMetadataPath(table_name));

            if (columns_mask[src_index++])
                res_columns[res_index++]->insert(static_cast<UInt64>(database->getTableMetadataModificationTime(context, table_name)));

            {
                Array dependencies_table_name_array;
                Array dependencies_database_name_array;
                if (columns_mask[src_index] || columns_mask[src_index + 1])
                {
                    const auto dependencies = context.getDependencies(database_name, table_name);

                    dependencies_table_name_array.reserve(dependencies.size());
                    dependencies_database_name_array.reserve(dependencies.size());
                    for (const auto & dependency : dependencies)
                    {
                        dependencies_table_name_array.push_back(dependency.second);
                        dependencies_database_name_array.push_back(dependency.first);
                    }
                }

                if (columns_mask[src_index++])
                    res_columns[res_index++]->insert(dependencies_database_name_array);

                if (columns_mask[src_index++])
                    res_columns[res_index++]->insert(dependencies_table_name_array);
            }

            if (columns_mask[src_index] || columns_mask[src_index + 1])
            {
                ASTPtr ast = database->tryGetCreateTableQuery(context, table_name);

                if (columns_mask[src_index++])
                    res_columns[res_index++]->insert(ast ? queryToString(ast) : "");

                if (columns_mask[src_index++])
                {
                    String engine_full;

                    if (ast)
                    {
                        const ASTCreateQuery & ast_create = typeid_cast<const ASTCreateQuery &>(*ast);
                        if (ast_create.storage)
                        {
                            engine_full = queryToString(*ast_create.storage);

                            static const char * const extra_head = " ENGINE = ";
                            if (startsWith(engine_full, extra_head))
                                engine_full = engine_full.substr(strlen(extra_head));
                        }
                    }

                    res_columns[res_index++]->insert(engine_full);
                }
            }
            else
                src_index += 2;

            const auto table_it = context.getTable(database_name, table_name);
            ASTPtr expression_ptr;

            if (columns_mask[src_index++])
            {
                if ((expression_ptr = table_it->getPartitionKeyAST()))
                    res_columns[res_index++]->insert(queryToString(expression_ptr));
                else
                    res_columns[res_index++]->insertDefault();
            }

            if (columns_mask[src_index++])
            {
                if ((expression_ptr = table_it->getSortingKeyAST()))
                    res_columns[res_index++]->insert(queryToString(expression_ptr));
                else
                    res_columns[res_index++]->insertDefault();
            }

            if (columns_mask[src_index++])
            {
                if ((expression_ptr = table_it->getPrimaryKeyAST()))
                    res_columns[res_index++]->insert(queryToString(expression_ptr));
                else
                    res_columns[res_index++]->insertDefault();
            }

            if (columns_mask[src_index++])
            {
                if ((expression_ptr = table_it->getSamplingKeyAST()))
                    res_columns[res_index++]->insert(queryToString(expression_ptr));
                else
                    res_columns[res_index++]->insertDefault();
            }
        }
    }

    res.setColumns(std::move(res_columns));
    return res;
}
std::string ASTQueryWithOnCluster::getRewrittenQueryWithoutOnCluster(const std::string & new_database) const
{
    return queryToString(getRewrittenASTWithoutOnCluster(new_database));
}
String queryToString(const ASTPtr & query)
{
    return queryToString(*query);
}
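The ASTPtr overload above delegates to an overload taking IAST &, which is not shown here. A minimal sketch of what it presumably does, assuming a formatAST helper with (ast, stream, hilite, one_line) parameters; the exact signature is an assumption:

/// Minimal sketch (assumption, not from the source above): serialize the AST
/// in one-line mode without terminal highlighting and return the result as a string.
String queryToString(const IAST & query)
{
    std::ostringstream out;
    formatAST(query, out, /* hilite = */ false, /* one_line = */ true);
    return out.str();
}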