static void onExceptionBeforeStart(const String & query, Context & context, time_t current_time) { /// Exception before the query execution. context.getQuota().addError(); bool log_queries = context.getSettingsRef().log_queries; /// Log the start of query execution into the table if necessary. if (log_queries) { QueryLogElement elem; elem.type = QueryLogElement::EXCEPTION_BEFORE_START; elem.event_time = current_time; elem.query_start_time = current_time; elem.query = query.substr(0, context.getSettingsRef().log_queries_cut_to_length); elem.exception = getCurrentExceptionMessage(false); elem.client_info = context.getClientInfo(); setExceptionStackTrace(elem); logException(context, elem); context.getQueryLog().add(elem); } }
static void onExceptionBeforeStart(const String & query, Context & context, time_t current_time) { /// Эксепшен до начала выполнения запроса. context.getQuota().addError(current_time); bool log_queries = context.getSettingsRef().log_queries; /// Логгируем в таблицу начало выполнения запроса, если нужно. if (log_queries) { QueryLogElement elem; elem.type = QueryLogElement::EXCEPTION_BEFORE_START; elem.event_time = current_time; elem.query_start_time = current_time; elem.query = query.substr(0, context.getSettingsRef().log_queries_cut_to_length); elem.exception = getCurrentExceptionMessage(false); setClientInfo(elem, context); setExceptionStackTrace(elem); logException(context, elem); context.getQueryLog().add(elem); } }
/// Returns the text description of the exception stored in `e`.
///
/// The exception is rethrown and immediately caught, so that getCurrentExceptionMessage()
/// can format it as the "current" exception.
///
/// @param e                exception to describe; may be null.
/// @param with_stacktrace  forwarded to getCurrentExceptionMessage().
/// @return the formatted message, or an empty string when `e` holds no exception.
std::string getExceptionMessage(std::exception_ptr e, bool with_stacktrace)
{
    /// Calling std::rethrow_exception with a null pointer is undefined behavior,
    /// so "no exception" is reported as an empty message instead.
    if (!e)
        return {};

    try
    {
        std::rethrow_exception(std::move(e));
    }
    catch (...)
    {
        return getCurrentExceptionMessage(with_stacktrace);
    }
}
/// Produces one row per entry of the external models map taken from `context`.
/// Columns are filled by index:
///   0 - key of the map entry, 1 - origin, 2 - type name, 3 - creation time, 4 - last exception message.
/// Columns 2 and 3 get default values when the model object is absent,
/// column 4 gets a default value when no exception is stored for the entry.
void StorageSystemModels::fillData(MutableColumns & res_columns, const Context & context, const SelectQueryInfo &) const
{
    /// `objects_holder` is kept in a local for the whole loop: `models` is obtained from it.
    auto objects_holder = context.getExternalModels().getObjectsMap();
    const auto & models = objects_holder.get();

    for (const auto & entry : models)
    {
        const auto & info = entry.second;

        res_columns[0]->insert(entry.first);
        res_columns[1]->insert(info.origin);

        if (!info.loadable)
        {
            res_columns[2]->insertDefault();
            res_columns[3]->insertDefault();
        }
        else
        {
            const auto model = std::static_pointer_cast<IModel>(info.loadable);
            const auto created_at = std::chrono::system_clock::to_time_t(model->getCreationTime());

            res_columns[2]->insert(model->getTypeName());
            res_columns[3]->insert(static_cast<UInt64>(created_at));
        }

        if (!info.exception)
        {
            res_columns[4]->insertDefault();
            continue;
        }

        /// Rethrow the stored exception so that it becomes the current one and can be formatted.
        try
        {
            std::rethrow_exception(info.exception);
        }
        catch (...)
        {
            res_columns[4]->insert(getCurrentExceptionMessage(false));
        }
    }
}
/// Tries to obtain a connection from `pool` and to classify it.
///
/// The returned TryResult has:
///   - `is_usable` set when a connection was obtained and (if `table_to_check` is given and the server
///     revision supports table status requests) the table is present on the server;
///   - `is_up_to_date` set when no delay check is needed or the replica delay is below the allowed maximum;
///   - `staleness` set to the reported delay when the replica is considered stale.
///
/// Exceptions with codes NETWORK_ERROR, SOCKET_TIMEOUT and ATTEMPT_TO_READ_AFTER_EOF are not propagated:
/// their message is stored into `fail_message`, the connection is dropped and the result is reset.
/// All other exceptions are rethrown.
ConnectionPoolWithFailover::TryResult ConnectionPoolWithFailover::tryGetEntry(
    IConnectionPool & pool,
    std::string & fail_message,
    const Settings * settings,
    const QualifiedTableName * table_to_check)
{
    TryResult result;

    try
    {
        result.entry = pool.get(settings, /* force_connected = */ false);

        /// The table status can be requested only when a table is given
        /// and the server revision is recent enough.
        bool can_request_table_status = false;
        if (table_to_check)
        {
            String server_name;
            UInt64 server_version_major;
            UInt64 server_version_minor;
            UInt64 server_revision;

            result.entry->getServerVersion(server_name, server_version_major, server_version_minor, server_revision);
            can_request_table_status = server_revision >= DBMS_MIN_REVISION_WITH_TABLES_STATUS;
        }

        if (!can_request_table_status)
        {
            result.entry->forceConnected();
            result.is_usable = true;
            result.is_up_to_date = true;
            return result;
        }

        /// Only status of the remote table corresponding to the Distributed table is taken into account.
        /// TODO: request status for joined tables also.
        TablesStatusRequest status_request;
        status_request.tables.emplace(*table_to_check);

        TablesStatusResponse status_response = result.entry->getTablesStatus(status_request);

        auto found_status = status_response.table_states_by_id.find(*table_to_check);
        if (found_status == status_response.table_states_by_id.end())
        {
            fail_message = "There is no table " + table_to_check->database + "." + table_to_check->table
                + " on server: " + result.entry->getDescription();
            LOG_WARNING(log, fail_message);
            ProfileEvents::increment(ProfileEvents::DistributedConnectionMissingTable);

            return result;
        }

        result.is_usable = true;

        /// Zero (or absent settings) disables the staleness check.
        const UInt64 max_allowed_delay = settings ? UInt64(settings->max_replica_delay_for_distributed_queries) : 0;
        if (!max_allowed_delay)
        {
            result.is_up_to_date = true;
            return result;
        }

        UInt32 delay = found_status->second.absolute_delay;

        if (delay >= max_allowed_delay)
        {
            result.is_up_to_date = false;
            result.staleness = delay;

            LOG_TRACE(
                log, "Server " << result.entry->getDescription() << " has unacceptable replica delay "
                << "for table " << table_to_check->database << "." << table_to_check->table
                << ": " << delay);
            ProfileEvents::increment(ProfileEvents::DistributedConnectionStaleReplica);
        }
        else
        {
            result.is_up_to_date = true;
        }
    }
    catch (const Exception & e)
    {
        const auto code = e.code();
        const bool is_connection_failure = code == ErrorCodes::NETWORK_ERROR
            || code == ErrorCodes::SOCKET_TIMEOUT
            || code == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF;

        /// Anything that is not a connection-level failure is not ours to handle.
        if (!is_connection_failure)
            throw;

        fail_message = getCurrentExceptionMessage(/* with_stacktrace = */ false);

        if (!result.entry.isNull())
        {
            result.entry->disconnect();
            result.reset();
        }
    }

    return result;
}
/// Builds a single block describing every external dictionary known to `context`
/// and returns it wrapped into one OneBlockInputStream.
///
/// @param column_names     requested columns; validated with check().
/// @param processed_stage  set to QueryProcessingStage::FetchColumns.
/// The `query`, `max_block_size` and the trailing unnamed parameter are not used.
///
/// For each dictionary the name and origin are always filled. The descriptive columns are filled
/// from the dictionary object when it exists and get default values otherwise. `last_exception`
/// holds the message of the stored exception, or a default value when there is none.
/// The dictionaries container is traversed under `dictionaries_mutex`.
BlockInputStreams StorageSystemDictionaries::read(
    const Names & column_names,
    const ASTPtr & query,
    const Context & context,
    QueryProcessingStage::Enum & processed_stage,
    const size_t max_block_size,
    const unsigned)
{
    check(column_names);
    processed_stage = QueryProcessingStage::FetchColumns;

    ColumnWithTypeAndName col_name{std::make_shared<ColumnString>(), std::make_shared<DataTypeString>(), "name"};
    ColumnWithTypeAndName col_origin{std::make_shared<ColumnString>(), std::make_shared<DataTypeString>(), "origin"};
    ColumnWithTypeAndName col_type{std::make_shared<ColumnString>(), std::make_shared<DataTypeString>(), "type"};
    ColumnWithTypeAndName col_key{std::make_shared<ColumnString>(), std::make_shared<DataTypeString>(), "key"};
    ColumnWithTypeAndName col_attribute_names{
        std::make_shared<ColumnArray>(std::make_shared<ColumnString>()),
        std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>()),
        "attribute.names"
    };
    ColumnWithTypeAndName col_attribute_types{
        std::make_shared<ColumnArray>(std::make_shared<ColumnString>()),
        std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>()),
        "attribute.types"
    };
    /// NOTE(review): this column is declared but never filled nor added to the resulting block.
    /// Confirm against the table's column list whether `has_hierarchy` should be produced here.
    ColumnWithTypeAndName col_has_hierarchy{std::make_shared<ColumnUInt8>(), std::make_shared<DataTypeUInt8>(), "has_hierarchy"};
    ColumnWithTypeAndName col_bytes_allocated{std::make_shared<ColumnUInt64>(), std::make_shared<DataTypeUInt64>(), "bytes_allocated"};
    ColumnWithTypeAndName col_query_count{std::make_shared<ColumnUInt64>(), std::make_shared<DataTypeUInt64>(), "query_count"};
    ColumnWithTypeAndName col_hit_rate{std::make_shared<ColumnFloat64>(), std::make_shared<DataTypeFloat64>(), "hit_rate"};
    ColumnWithTypeAndName col_element_count{std::make_shared<ColumnUInt64>(), std::make_shared<DataTypeUInt64>(), "element_count"};
    ColumnWithTypeAndName col_load_factor{std::make_shared<ColumnFloat64>(), std::make_shared<DataTypeFloat64>(), "load_factor"};
    ColumnWithTypeAndName col_creation_time{std::make_shared<ColumnUInt32>(), std::make_shared<DataTypeDateTime>(), "creation_time"};
    ColumnWithTypeAndName col_last_exception{std::make_shared<ColumnString>(), std::make_shared<DataTypeString>(), "last_exception"};
    ColumnWithTypeAndName col_source{std::make_shared<ColumnString>(), std::make_shared<DataTypeString>(), "source"};

    const auto & external_dictionaries = context.getExternalDictionaries();

    /// The lock is held for the whole traversal of the dictionaries container.
    const std::lock_guard<std::mutex> lock{external_dictionaries.dictionaries_mutex};

    for (const auto & dict_info : external_dictionaries.dictionaries)
    {
        col_name.column->insert(dict_info.first);
        col_origin.column->insert(dict_info.second.origin);

        if (dict_info.second.dict)
        {
            const auto dict_ptr = dict_info.second.dict->get();

            col_type.column->insert(dict_ptr->getTypeName());

            const auto & dict_struct = dict_ptr->getStructure();
            col_key.column->insert(dict_struct.getKeyDescription());

            col_attribute_names.column->insert(ext::map<Array>(dict_struct.attributes, [] (auto & attr) -> decltype(auto) {
                return attr.name;
            }));
            col_attribute_types.column->insert(ext::map<Array>(dict_struct.attributes, [] (auto & attr) -> decltype(auto) {
                return attr.type->getName();
            }));
            col_bytes_allocated.column->insert(dict_ptr->getBytesAllocated());
            col_query_count.column->insert(dict_ptr->getQueryCount());
            col_hit_rate.column->insert(dict_ptr->getHitRate());
            col_element_count.column->insert(dict_ptr->getElementCount());
            col_load_factor.column->insert(dict_ptr->getLoadFactor());

            /// time_t is a signed type, while the column is unsigned: convert explicitly so that
            /// an unsigned value is inserted.
            col_creation_time.column->insert(
                static_cast<UInt64>(std::chrono::system_clock::to_time_t(dict_ptr->getCreationTime())));
            col_source.column->insert(dict_ptr->getSource()->toString());
        }
        else
        {
            /// No dictionary object: every descriptive column gets its default value.
            col_type.column->insertDefault();
            col_key.column->insertDefault();
            col_attribute_names.column->insertDefault();
            col_attribute_types.column->insertDefault();
            col_bytes_allocated.column->insertDefault();
            col_query_count.column->insertDefault();
            col_hit_rate.column->insertDefault();
            col_element_count.column->insertDefault();
            col_load_factor.column->insertDefault();
            col_creation_time.column->insertDefault();
            col_source.column->insertDefault();
        }

        if (dict_info.second.exception)
        {
            /// Rethrow the stored exception so that it becomes the current one and can be formatted.
            try
            {
                std::rethrow_exception(dict_info.second.exception);
            }
            catch (...)
            {
                col_last_exception.column->insert(getCurrentExceptionMessage(false));
            }
        }
        else
            col_last_exception.column->insertDefault();
    }

    Block block{
        col_name,
        col_origin,
        col_type,
        col_key,
        col_attribute_names,
        col_attribute_types,
        col_bytes_allocated,
        col_query_count,
        col_hit_rate,
        col_element_count,
        col_load_factor,
        col_creation_time,
        col_last_exception,
        col_source
    };

    return BlockInputStreams{1, std::make_shared<OneBlockInputStream>(block)};
}
void ODBCHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) { Poco::Net::HTMLForm params(request, request.stream()); LOG_TRACE(log, "Request URI: " + request.getURI()); auto process_error = [&response, this](const std::string & message) { response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR); if (!response.sent()) response.send() << message << std::endl; LOG_WARNING(log, message); }; if (!params.has("query")) { process_error("No 'query' in request body"); return; } if (!params.has("columns")) { process_error("No 'columns' in request URL"); return; } if (!params.has("connection_string")) { process_error("No 'connection_string' in request URL"); return; } UInt64 max_block_size = DEFAULT_BLOCK_SIZE; if (params.has("max_block_size")) { std::string max_block_size_str = params.get("max_block_size", ""); if (max_block_size_str.empty()) { process_error("Empty max_block_size specified"); return; } max_block_size = parse<size_t>(max_block_size_str); } std::string columns = params.get("columns"); std::unique_ptr<Block> sample_block; try { sample_block = parseColumns(std::move(columns)); } catch (const Exception & ex) { process_error("Invalid 'columns' parameter in request body '" + ex.message() + "'"); LOG_WARNING(log, ex.getStackTrace().toString()); return; } std::string format = params.get("format", "RowBinary"); std::string query = params.get("query"); LOG_TRACE(log, "Query: " << query); std::string connection_string = params.get("connection_string"); LOG_TRACE(log, "Connection string: '" << connection_string << "'"); WriteBufferFromHTTPServerResponse out(request, response, keep_alive_timeout); try { BlockOutputStreamPtr writer = FormatFactory::instance().getOutput(format, out, *sample_block, *context); auto pool = getPool(connection_string); ODBCBlockInputStream inp(pool->get(), query, *sample_block, max_block_size); copyData(inp, *writer); } catch (...) 
{ auto message = getCurrentExceptionMessage(true); response.setStatusAndReason( Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR); // can't call process_error, bacause of too soon response sending writeStringBinary(message, out); tryLogCurrentException(log); } }
void ReplicasStatusHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) { try { HTMLForm params(request); /// Даже в случае, когда отставание небольшое, выводить подробную информацию об отставании. bool verbose = params.get("verbose", "") == "1"; const MergeTreeSettings & settings = context.getMergeTreeSettings(); bool ok = true; std::stringstream message; auto databases = context.getDatabases(); /// Перебираем все реплицируемые таблицы. for (const auto & db : databases) { for (auto iterator = db.second->getIterator(); iterator->isValid(); iterator->next()) { auto & table = iterator->table(); StorageReplicatedMergeTree * table_replicated = typeid_cast<StorageReplicatedMergeTree *>(table.get()); if (!table_replicated) continue; time_t absolute_delay = 0; time_t relative_delay = 0; table_replicated->getReplicaDelays(absolute_delay, relative_delay); if ((settings.min_absolute_delay_to_close && absolute_delay >= static_cast<time_t>(settings.min_absolute_delay_to_close)) || (settings.min_relative_delay_to_close && relative_delay >= static_cast<time_t>(settings.min_relative_delay_to_close))) ok = false; message << backQuoteIfNeed(db.first) << "." << backQuoteIfNeed(iterator->name()) << ":\tAbsolute delay: " << absolute_delay << ". Relative delay: " << relative_delay << ".\n"; } } setResponseDefaultHeaders(response); if (ok && !verbose) { const char * data = "Ok.\n"; response.sendBuffer(data, strlen(data)); } else { response.send() << message.rdbuf(); } } catch (...) { tryLogCurrentException("ReplicasStatusHandler"); try { response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR); if (!response.sent()) { /// Ещё ничего не отправляли, и даже не знаем, нужно ли сжимать ответ. response.send() << getCurrentExceptionMessage(false) << std::endl; } } catch (...) { LOG_ERROR((&Logger::get("ReplicasStatusHandler")), "Cannot send exception to client"); } } }
static std::tuple<ASTPtr, BlockIO> executeQueryImpl( IParser::Pos begin, IParser::Pos end, Context & context, bool internal, QueryProcessingStage::Enum stage) { ProfileEvents::increment(ProfileEvents::Query); time_t current_time = time(0); const Settings & settings = context.getSettingsRef(); ParserQuery parser; ASTPtr ast; size_t query_size; size_t max_query_size = settings.max_query_size; try { ast = parseQuery(parser, begin, end, ""); /// Copy query into string. It will be written to log and presented in processlist. If an INSERT query, string will not include data to insertion. query_size = ast->range.second - ast->range.first; if (max_query_size && query_size > max_query_size) throw Exception("Query is too large (" + toString(query_size) + ")." " max_query_size = " + toString(max_query_size), ErrorCodes::QUERY_IS_TOO_LARGE); } catch (...) { /// Anyway log query. if (!internal) { String query = String(begin, begin + std::min(end - begin, static_cast<ptrdiff_t>(max_query_size))); logQuery(query.substr(0, settings.log_queries_cut_to_length), context); onExceptionBeforeStart(query, context, current_time); } throw; } String query(begin, query_size); BlockIO res; try { if (!internal) logQuery(query.substr(0, settings.log_queries_cut_to_length), context); /// Check the limits. checkLimits(*ast, settings.limits); QuotaForIntervals & quota = context.getQuota(); quota.addQuery(current_time); quota.checkExceeded(current_time); /// Put query to process list. But don't put SHOW PROCESSLIST query itself. 
ProcessList::EntryPtr process_list_entry; if (!internal && nullptr == typeid_cast<const ASTShowProcesslistQuery *>(&*ast)) { process_list_entry = context.getProcessList().insert( query, context.getUser(), context.getCurrentQueryId(), context.getIPAddress(), settings); context.setProcessListElement(&process_list_entry->get()); } auto interpreter = InterpreterFactory::get(ast, context, stage); res = interpreter->execute(); /// Hold element of process list till end of query execution. res.process_list_entry = process_list_entry; if (res.in) { if (IProfilingBlockInputStream * stream = dynamic_cast<IProfilingBlockInputStream *>(res.in.get())) { stream->setProgressCallback(context.getProgressCallback()); stream->setProcessListElement(context.getProcessListElement()); } } /// Everything related to query log. { QueryLogElement elem; elem.type = QueryLogElement::QUERY_START; elem.event_time = current_time; elem.query_start_time = current_time; elem.query = query.substr(0, settings.log_queries_cut_to_length); setClientInfo(elem, context); bool log_queries = settings.log_queries && !internal; /// Log into system table start of query execution, if need. if (log_queries) context.getQueryLog().add(elem); /// Also make possible for caller to log successful query finish and exception during execution. res.finish_callback = [elem, &context, log_queries] (IBlockInputStream * stream) mutable { ProcessListElement * process_list_elem = context.getProcessListElement(); if (!process_list_elem) return; double elapsed_seconds = process_list_elem->watch.elapsedSeconds(); elem.type = QueryLogElement::QUERY_FINISH; elem.event_time = time(0); elem.query_duration_ms = elapsed_seconds * 1000; elem.read_rows = process_list_elem->progress.rows; elem.read_bytes = process_list_elem->progress.bytes; auto memory_usage = process_list_elem->memory_tracker.getPeak(); elem.memory_usage = memory_usage > 0 ? 
memory_usage : 0; if (stream) { if (IProfilingBlockInputStream * profiling_stream = dynamic_cast<IProfilingBlockInputStream *>(stream)) { const BlockStreamProfileInfo & info = profiling_stream->getProfileInfo(); elem.result_rows = info.rows; elem.result_bytes = info.bytes; } } if (elem.read_rows != 0) { LOG_INFO(&Logger::get("executeQuery"), std::fixed << std::setprecision(3) << "Read " << elem.read_rows << " rows, " << formatReadableSizeWithBinarySuffix(elem.read_bytes) << " in " << elapsed_seconds << " sec., " << static_cast<size_t>(elem.read_rows / elapsed_seconds) << " rows/sec., " << formatReadableSizeWithBinarySuffix(elem.read_bytes / elapsed_seconds) << "/sec."); } if (log_queries) context.getQueryLog().add(elem); }; res.exception_callback = [elem, &context, log_queries, current_time] () mutable { context.getQuota().addError(current_time); elem.type = QueryLogElement::EXCEPTION_WHILE_PROCESSING; elem.event_time = time(0); elem.query_duration_ms = 1000 * (elem.event_time - elem.query_start_time); elem.exception = getCurrentExceptionMessage(false); ProcessListElement * process_list_elem = context.getProcessListElement(); if (process_list_elem) { double elapsed_seconds = process_list_elem->watch.elapsedSeconds(); elem.query_duration_ms = elapsed_seconds * 1000; elem.read_rows = process_list_elem->progress.rows; elem.read_bytes = process_list_elem->progress.bytes; auto memory_usage = process_list_elem->memory_tracker.getPeak(); elem.memory_usage = memory_usage > 0 ? memory_usage : 0; } setExceptionStackTrace(elem); logException(context, elem); if (log_queries) context.getQueryLog().add(elem); }; if (!internal && res.in) { std::stringstream log_str; log_str << "Query pipeline:\n"; res.in->dumpTree(log_str); LOG_DEBUG(&Logger::get("executeQuery"), log_str.str()); } } } catch (...) { if (!internal) onExceptionBeforeStart(query, context, current_time); throw; } return std::make_tuple(ast, res); }
/// Commits a freshly written temporary part: allocates a block number, adds the part to the local
/// working set and registers it in ZooKeeper with a single multi-request.
///
/// @param zookeeper  ZooKeeper session used for all requests.
/// @param part       the part to commit; its block numbers, level and name are assigned here.
/// @param block_id   deduplication ID of the block; when empty, no deduplication node is used.
///
/// If the block turns out to be a duplicate, the part is marked with `is_duplicate`,
/// `last_block_is_duplicate` is set and the function returns without throwing.
/// Throws Exception with UNKNOWN_STATUS_OF_INSERT when it is not known whether the insert was applied,
/// UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE when the quorum status node already exists,
/// and UNEXPECTED_ZOOKEEPER_ERROR for the remaining ZooKeeper failures.
/// When `quorum` is non-zero, additionally waits until the quorum status node disappears or refers to another part.
void ReplicatedMergeTreeBlockOutputStream::commitPart(zkutil::ZooKeeperPtr & zookeeper, MergeTreeData::MutableDataPartPtr & part, const String & block_id)
{
    storage.check(part->columns);
    assertSessionIsNotExpired(zookeeper);

    /// Obtain incremental block number and lock it. The lock holds our intention to add the block to the filesystem.
    /// We remove the lock just after renaming the part. In case of exception, block number will be marked as abandoned.
    /// Also, make deduplication check. If a duplicate is detected, no nodes are created.

    /// Allocate new block number and check for duplicates
    bool deduplicate_block = !block_id.empty();
    String block_id_path = deduplicate_block ? storage.zookeeper_path + "/blocks/" + block_id : "";
    auto block_number_lock = storage.allocateBlockNumber(part->info.partition_id, zookeeper, block_id_path);

    /// No lock was returned: the block is treated as a duplicate and nothing else is done.
    if (!block_number_lock)
    {
        LOG_INFO(log, "Block with ID " << block_id << " already exists; ignoring it.");
        part->is_duplicate = true;
        last_block_is_duplicate = true;
        ProfileEvents::increment(ProfileEvents::DuplicatedInsertedBlocks);
        return;
    }

    Int64 block_number = block_number_lock->getNumber();

    /// Set part attributes according to part_number. Prepare an entry for log.

    part->info.min_block = block_number;
    part->info.max_block = block_number;
    part->info.level = 0;

    String part_name = part->getNewName(part->info);
    part->name = part_name;

    StorageReplicatedMergeTree::LogEntry log_entry;
    log_entry.type = StorageReplicatedMergeTree::LogEntry::GET_PART;
    log_entry.create_time = time(nullptr);
    log_entry.source_replica = storage.replica_name;
    log_entry.new_part_name = part_name;
    log_entry.quorum = quorum;
    log_entry.block_id = block_id;

    /// Simultaneously add information about the part to all the necessary places in ZooKeeper and remove block_number_lock.

    /// Information about the part.
    Coordination::Requests ops;

    storage.getCommitPartOps(ops, part, block_id_path);

    /// Replication log.
    ops.emplace_back(zkutil::makeCreateRequest(
        storage.zookeeper_path + "/log/log-",
        log_entry.toString(),
        zkutil::CreateMode::PersistentSequential));

    /// Deletes the information that the block number is used for writing.
    block_number_lock->getUnlockOps(ops);

    /** If you need a quorum - create a node in which the quorum is monitored.
      * (If such a node already exists, then someone has managed to make another quorum record at the same time, but for it the quorum has not yet been reached.
      *  You can not do the next quorum record at this time.)
      */
    if (quorum)
    {
        ReplicatedMergeTreeQuorumEntry quorum_entry;
        quorum_entry.part_name = part_name;
        quorum_entry.required_number_of_replicas = quorum;
        quorum_entry.replicas.insert(storage.replica_name);

        /** At this point, this node will contain information that the current replica received a part.
          * When other replicas will receive this part (in the usual way, processing the replication log),
          *  they will add themselves to the contents of this node.
          * When it contains information about `quorum` number of replicas, this node is deleted,
          *  which indicates that the quorum has been reached.
          */

        ops.emplace_back(
            zkutil::makeCreateRequest(
                quorum_info.status_path,
                quorum_entry.toString(),
                zkutil::CreateMode::Persistent));

        /// Make sure that during the insertion time, the replica was not reinitialized or disabled (when the server is finished).
        ops.emplace_back(
            zkutil::makeCheckRequest(
                storage.replica_path + "/is_active",
                quorum_info.is_active_node_version));

        /// Unfortunately, just checking the above is not enough, because `is_active` node can be deleted and reappear with the same version.
        /// But then the `host` value will change. We will check this.
        /// It's great that these two nodes change in the same transaction (see MergeTreeRestartingThread).
        ops.emplace_back(
            zkutil::makeCheckRequest(
                storage.replica_path + "/host",
                quorum_info.host_node_version));
    }

    MergeTreeData::Transaction transaction(storage.data); /// If you can not add a part to ZK, we'll remove it back from the working set.
    storage.data.renameTempPartAndAdd(part, nullptr, &transaction);

    Coordination::Responses responses;
    int32_t multi_code = zookeeper->tryMultiNoThrow(ops, responses); /// 1 RTT

    /// Every branch below either commits or rolls back the local transaction, depending on the result code.
    if (multi_code == Coordination::ZOK)
    {
        transaction.commit();
        storage.merge_selecting_task->schedule();

        /// Lock nodes have been already deleted, do not delete them in destructor
        block_number_lock->assumeUnlocked();
    }
    else if (multi_code == Coordination::ZCONNECTIONLOSS
        || multi_code == Coordination::ZOPERATIONTIMEOUT)
    {
        /** If the connection is lost, and we do not know if the changes were applied, we can not delete the local part
          *  if the changes were applied, the inserted block appeared in `/blocks/`, and it can not be inserted again.
          */
        transaction.commit();
        storage.enqueuePartForCheck(part->name, MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER);

        /// We do not know whether or not data has been inserted.
        throw Exception("Unknown status, client must retry. Reason: " + String(Coordination::errorMessage(multi_code)),
            ErrorCodes::UNKNOWN_STATUS_OF_INSERT);
    }
    else if (Coordination::isUserError(multi_code))
    {
        /// Path of the first request that failed; used to tell which of the created nodes caused the error.
        String failed_op_path = zkutil::KeeperMultiException(multi_code, ops, responses).getPathForFirstFailedOp();

        if (multi_code == Coordination::ZNODEEXISTS && deduplicate_block && failed_op_path == block_id_path)
        {
            /// Block with the same ID has just appeared in the table (or on another replica), roll back the insertion.
            LOG_INFO(log, "Block with ID " << block_id << " already exists; ignoring it (removing part " << part->name << ")");

            part->is_duplicate = true;
            transaction.rollback();
            last_block_is_duplicate = true;
            ProfileEvents::increment(ProfileEvents::DuplicatedInsertedBlocks);
        }
        else if (multi_code == Coordination::ZNODEEXISTS && failed_op_path == quorum_info.status_path)
        {
            transaction.rollback();

            throw Exception("Another quorum insert has been already started", ErrorCodes::UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE);
        }
        else
        {
            /// NOTE: We could be here if the node with the quorum existed, but was quickly removed.
            transaction.rollback();
            throw Exception("Unexpected logical error while adding block " + toString(block_number) + " with ID '" + block_id + "': "
                + zkutil::ZooKeeper::error2string(multi_code) + ", path " + failed_op_path,
                ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
        }
    }
    else if (Coordination::isHardwareError(multi_code))
    {
        transaction.rollback();
        throw Exception("Unrecoverable network error while adding block " + toString(block_number) + " with ID '" + block_id + "': "
            + zkutil::ZooKeeper::error2string(multi_code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
    }
    else
    {
        transaction.rollback();
        throw Exception("Unexpected ZooKeeper error while adding block " + toString(block_number) + " with ID '" + block_id + "': "
            + zkutil::ZooKeeper::error2string(multi_code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
    }

    if (quorum)
    {
        /// We are waiting for quorum to be satisfied.
        LOG_TRACE(log, "Waiting for quorum");

        String quorum_status_path = storage.zookeeper_path + "/quorum/status";

        try
        {
            while (true)
            {
                zkutil::EventPtr event = std::make_shared<Poco::Event>();

                std::string value;
                /// `get` instead of `exists` so that `watch` does not leak if the node is no longer there.
                if (!zookeeper->tryGet(quorum_status_path, value, nullptr, event))
                    break;

                ReplicatedMergeTreeQuorumEntry quorum_entry(value);

                /// If the node has time to disappear, and then appear again for the next insert.
                if (quorum_entry.part_name != part_name)
                    break;

                if (!event->tryWait(quorum_timeout_ms))
                    throw Exception("Timeout while waiting for quorum", ErrorCodes::TIMEOUT_EXCEEDED);
            }

            /// And what if it is possible that the current replica at this time has ceased to be active and the quorum is marked as failed and deleted?
            String value;
            if (!zookeeper->tryGet(storage.replica_path + "/is_active", value, nullptr)
                || value != quorum_info.is_active_node_value)
                throw Exception("Replica become inactive while waiting for quorum", ErrorCodes::NO_ACTIVE_REPLICAS);
        }
        catch (...)
        {
            /// We do not know whether or not data has been inserted
            /// - whether other replicas have time to download the part and mark the quorum as done.
            throw Exception("Unknown status, client must retry. Reason: " + getCurrentExceptionMessage(false),
                ErrorCodes::UNKNOWN_STATUS_OF_INSERT);
        }

        LOG_TRACE(log, "Quorum satisfied");
    }
}
/// Writes the current exception (with stack trace) to `logger` at error level,
/// prefixed with `start_of_message` and ": " when the prefix is not empty.
/// Never throws: a failure while logging is ignored.
void tryLogCurrentException(Poco::Logger * logger, const std::string & start_of_message)
{
    try
    {
        /// The separator is needed only when there is a prefix to separate from.
        const char * separator = start_of_message.empty() ? "" : ": ";

        LOG_ERROR(logger, start_of_message << separator << getCurrentExceptionMessage(true));
    }
    catch (...)
    {
        /// Logging is best-effort here; nothing more can be done if it fails.
    }
}
/// Builds an ExecutionStatus from the exception that is currently being handled.
/// The message is `start_of_message` + ": " + exception text (without stack trace);
/// the prefix and separator are omitted when `start_of_message` is empty.
ExecutionStatus ExecutionStatus::fromCurrentException(const std::string & start_of_message)
{
    String msg;

    if (!start_of_message.empty())
    {
        msg += start_of_message;
        msg += ": ";
    }

    msg += getCurrentExceptionMessage(false, true);

    return ExecutionStatus(getCurrentExceptionCode(), msg);
}
/** Writes one block into the replicated table.
  *
  * Steps, in order:
  *  1. Possibly delays the insert (delayInsertIfNeeded) and checks that the ZooKeeper session is not expired.
  *  2. If `quorum` is non-zero: checks that enough replicas are alive, that no previous quorum write is pending,
  *     and remembers the value/version of the `is_active` node and the version of the `host` node of this replica.
  *  3. Splits the block into parts; for every part:
  *      - allocates a block number, writes a temporary part and computes the checksum of its data;
  *      - derives the block ID (from `insert_id` and the block counter, or from the data hash);
  *      - builds one ZooKeeper multi-op that registers the block, the part, the replication log entry,
  *        releases the block number lock and (with quorum) creates the quorum status node;
  *      - adds the part to the local working set inside a transaction and executes the multi-op;
  *      - with quorum, waits until the quorum status node disappears or refers to another part.
  *
  * Throws Exception with, among others, the codes TOO_LESS_LIVE_REPLICAS, UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE,
  * READONLY, CHECKSUM_DOESNT_MATCH, LOGICAL_ERROR, UNEXPECTED_ZOOKEEPER_ERROR and UNKNOWN_STATUS_OF_INSERT
  * (the last one means the client cannot tell whether the data was inserted and must retry).
  */
void ReplicatedMergeTreeBlockOutputStream::write(const Block & block)
{
    /// TODO Can I not lock the table structure here?
    storage.data.delayInsertIfNeeded(&storage.restarting_thread->getWakeupEvent());

    auto zookeeper = storage.getZooKeeper();

    assertSessionIsNotExpired(zookeeper);

    /** If the write is with quorum, then we check that the required number of replicas is now live,
      *  and also that for all previous pieces for which quorum is required, this quorum is reached.
      * And also check that during the insertion, the replica was not reinitialized or disabled (by the value of `is_active` node).
      * TODO Too complex logic, you can do better.
      */
    String quorum_status_path = storage.zookeeper_path + "/quorum/status";
    String is_active_node_value;
    /// -1 is the initial value; both versions are overwritten below when quorum is enabled.
    int is_active_node_version = -1;
    int host_node_version = -1;
    if (quorum)
    {
        /// The three requests are started asynchronously; their results are collected below.
        zkutil::ZooKeeper::TryGetFuture quorum_status_future = zookeeper->asyncTryGet(quorum_status_path);
        zkutil::ZooKeeper::TryGetFuture is_active_future = zookeeper->asyncTryGet(storage.replica_path + "/is_active");
        zkutil::ZooKeeper::TryGetFuture host_future = zookeeper->asyncTryGet(storage.replica_path + "/host");

        /// List of live replicas. All of them register an ephemeral node for leader_election.
        zkutil::Stat leader_election_stat;
        zookeeper->get(storage.zookeeper_path + "/leader_election", &leader_election_stat);

        if (leader_election_stat.numChildren < static_cast<int32_t>(quorum))
            throw Exception("Number of alive replicas ("
                + toString(leader_election_stat.numChildren) + ") is less than requested quorum (" + toString(quorum) + ").",
                ErrorCodes::TOO_LESS_LIVE_REPLICAS);

        /** Is there a quorum for the last piece for which a quorum is needed?
          * Write of all the pieces with the included quorum is linearly ordered.
          * This means that at any time there can be only one piece,
          *  for which you need, but not yet reach the quorum.
          * Information about this piece will be located in `/quorum/status` node.
          * If the quorum is reached, then the node is deleted.
          */

        auto quorum_status = quorum_status_future.get();
        if (quorum_status.exists)
            throw Exception("Quorum for previous write has not been satisfied yet. Status: " + quorum_status.value, ErrorCodes::UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE);

        /// Both checks are implicitly made also later (otherwise there would be a race condition).

        auto is_active = is_active_future.get();
        auto host = host_future.get();

        if (!is_active.exists || !host.exists)
            throw Exception("Replica is not active right now", ErrorCodes::READONLY);

        /// Remembered so that the multi-op and the post-quorum check below can detect a replica restart.
        is_active_node_value = is_active.value;
        is_active_node_version = is_active.stat.version;
        host_node_version = host.stat.version;
    }

    auto part_blocks = storage.writer.splitBlockIntoParts(block);

    for (auto & current_block : part_blocks)
    {
        assertSessionIsNotExpired(zookeeper);

        ++block_index;
        /// When `insert_id` is set, the block ID is "<insert_id>__<block_index>"; otherwise it is filled from the data hash below.
        String block_id = insert_id.empty() ? "" : insert_id + "__" + toString(block_index);
        /// YYYYMMDD divided by 100 leaves YYYYMM.
        String month_name = toString(DateLUT::instance().toNumYYYYMMDD(DayNum_t(current_block.min_date)) / 100);

        AbandonableLockInZooKeeper block_number_lock = storage.allocateBlockNumber(month_name);    /// 2 RTT

        Int64 part_number = block_number_lock.getNumber();

        MergeTreeData::MutableDataPartPtr part = storage.writer.writeTempPart(current_block, part_number);
        String part_name = ActiveDataPartSet::getPartName(part->left_date, part->right_date, part->left, part->right, part->level);

        /// Hash from the data.
        SipHash hash;
        part->checksums.summaryDataChecksum(hash);
        /// The same 16 bytes are used both as raw bytes (checksum) and as two 64-bit words (block ID).
        union
        {
            char bytes[16];
            UInt64 words[2];
        } hash_value;
        hash.get128(hash_value.bytes);

        String checksum(hash_value.bytes, 16);

        /// If no ID is specified in query, we take the hash from the data as ID. That is, do not insert the same data twice.
        /// NOTE: If you do not need this deduplication, you can leave `block_id` empty instead.
        ///       Setting or syntax in the query (for example, `ID = null`) could be done for this.
        if (block_id.empty())
        {
            block_id = toString(hash_value.words[0]) + "_" + toString(hash_value.words[1]);

            if (block_id.empty())
                throw Exception("Logical error: block_id is empty.", ErrorCodes::LOGICAL_ERROR);
        }

        LOG_DEBUG(log, "Wrote block " << part_number << " with ID " << block_id << ", " << current_block.block.rows() << " rows");

        /// Replication log entry telling replicas to get the new part.
        StorageReplicatedMergeTree::LogEntry log_entry;
        log_entry.type = StorageReplicatedMergeTree::LogEntry::GET_PART;
        log_entry.create_time = time(0);
        log_entry.source_replica = storage.replica_name;
        log_entry.new_part_name = part_name;
        log_entry.quorum = quorum;
        log_entry.block_id = block_id;

        /// Simultaneously add information about the part to all the necessary places in ZooKeeper and remove block_number_lock.

        /// Information about the block.
        zkutil::Ops ops;
        auto acl = zookeeper->getDefaultACL();

        ops.emplace_back(
            std::make_unique<zkutil::Op::Create>(
                storage.zookeeper_path + "/blocks/" + block_id,
                "",
                acl,
                zkutil::CreateMode::Persistent));
        ops.emplace_back(
            std::make_unique<zkutil::Op::Create>(
                storage.zookeeper_path + "/blocks/" + block_id + "/checksum",
                checksum,
                acl,
                zkutil::CreateMode::Persistent));
        ops.emplace_back(
            std::make_unique<zkutil::Op::Create>(
                storage.zookeeper_path + "/blocks/" + block_id + "/number",
                toString(part_number),
                acl,
                zkutil::CreateMode::Persistent));

        /// Information about the part, in the replica data.
        storage.addNewPartToZooKeeper(part, ops, part_name);

        /// Replication log.
        ops.emplace_back(std::make_unique<zkutil::Op::Create>(
            storage.zookeeper_path + "/log/log-",
            log_entry.toString(),
            acl,
            zkutil::CreateMode::PersistentSequential));

        /// Deletes the information that the block number is used for writing.
        block_number_lock.getUnlockOps(ops);

        /** If you need a quorum - create a node in which the quorum is monitored.
          * (If such a node already exists, then someone has managed to make another quorum record at the same time, but for it the quorum has not yet been reached.
          *  You can not do the next quorum record at this time.)
          */
        if (quorum)
        {
            ReplicatedMergeTreeQuorumEntry quorum_entry;
            quorum_entry.part_name = part_name;
            quorum_entry.required_number_of_replicas = quorum;
            quorum_entry.replicas.insert(storage.replica_name);

            /** At this point, this node will contain information that the current replica received a piece.
              * When other replicas will receive this piece (in the usual way, processing the replication log),
              *  they will add themselves to the contents of this node.
              * When it contains information about `quorum` number of replicas, this node is deleted,
              *  which indicates that the quorum has been reached.
              */

            ops.emplace_back(
                std::make_unique<zkutil::Op::Create>(
                    quorum_status_path,
                    quorum_entry.toString(),
                    acl,
                    zkutil::CreateMode::Persistent));

            /// Make sure that during the insertion time, the replica was not reinitialized or disabled (when the server is finished).
            ops.emplace_back(
                std::make_unique<zkutil::Op::Check>(
                    storage.replica_path + "/is_active",
                    is_active_node_version));

            /// Unfortunately, just checking the above is not enough, because `is_active` node can be deleted and reappear with the same version.
            /// But then the `host` value will change. We will check this.
            /// It's great that these two nodes change in the same transaction (see MergeTreeRestartingThread).
            ops.emplace_back(
                std::make_unique<zkutil::Op::Check>(
                    storage.replica_path + "/host",
                    host_node_version));
        }

        MergeTreeData::Transaction transaction; /// If you can not add a piece to ZK, we'll remove it again from the working set.
        storage.data.renameTempPartAndAdd(part, nullptr, &transaction);

        try
        {
            auto code = zookeeper->tryMulti(ops);
            if (code == ZOK)
            {
                /// Everything was registered in ZooKeeper: keep the part locally.
                transaction.commit();
                storage.merge_selecting_event.set();
            }
            else if (code == ZNODEEXISTS)
            {
                /// If the block with such ID already exists in the table, roll back its insertion.
                String expected_checksum;
                if (!block_id.empty() && zookeeper->tryGet(
                    storage.zookeeper_path + "/blocks/" + block_id + "/checksum", expected_checksum))
                {
                    LOG_INFO(log, "Block with ID " << block_id << " already exists; ignoring it (removing part " << part->name << ")");

                    /// If the data is different from the ones that were inserted earlier with the same ID, throw an exception.
                    /// NOTE(review): these throws do not call rollback() explicitly; presumably the Transaction destructor handles it - confirm.
                    if (expected_checksum != checksum)
                    {
                        if (!insert_id.empty())
                            throw Exception("Attempt to insert block with same ID but different checksum", ErrorCodes::CHECKSUM_DOESNT_MATCH);
                        else
                            throw Exception("Logical error: got ZNODEEXISTS while inserting data, block ID is derived from checksum but checksum doesn't match", ErrorCodes::LOGICAL_ERROR);
                    }

                    transaction.rollback();
                }
                else if (zookeeper->exists(quorum_status_path))
                {
                    /// The quorum status node exists, so it is taken to be the node that caused ZNODEEXISTS.
                    transaction.rollback();

                    throw Exception("Another quorum insert has been already started", ErrorCodes::UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE);
                }
                else
                {
                    /// We could be here if the node with the quorum existed, but was quickly removed.

                    throw Exception("Unexpected ZNODEEXISTS while adding block " + toString(part_number) + " with ID " + block_id + ": "
                        + zkutil::ZooKeeper::error2string(code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
                }
            }
            else
            {
                throw Exception("Unexpected error while adding block " + toString(part_number) + " with ID " + block_id + ": "
                    + zkutil::ZooKeeper::error2string(code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
            }
        }
        catch (const zkutil::KeeperException & e)
        {
            /** If the connection is lost, and we do not know if the changes were applied, you can not delete the local chunk
              *  if the changes were applied, the inserted block appeared in `/blocks/`, and it can not be inserted again.
              */
            if (e.code == ZOPERATIONTIMEOUT ||
                e.code == ZCONNECTIONLOSS)
            {
                /// Keep the part locally and schedule it for a later check instead of removing it.
                transaction.commit();
                storage.enqueuePartForCheck(part->name, MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER);

                /// We do not know whether or not data has been inserted.
                throw Exception("Unknown status, client must retry. Reason: " + e.displayText(), ErrorCodes::UNKNOWN_STATUS_OF_INSERT);
            }

            /// Any other ZooKeeper error is passed on unchanged.
            throw;
        }

        if (quorum)
        {
            /// We are waiting for the quorum to be reached.
            LOG_TRACE(log, "Waiting for quorum");

            try
            {
                while (true)
                {
                    /// A fresh event is created for every iteration and passed to tryGet as the watch.
                    zkutil::EventPtr event = std::make_shared<Poco::Event>();

                    std::string value;
                    /// `get` instead of `exists` so that `watch` does not leak if the node is no longer there.
                    if (!zookeeper->tryGet(quorum_status_path, value, nullptr, event))
                        break;

                    ReplicatedMergeTreeQuorumEntry quorum_entry(value);

                    /// If the node has time to disappear, and then appear again for the next insert.
                    if (quorum_entry.part_name != part_name)
                        break;

                    if (!event->tryWait(quorum_timeout_ms))
                        throw Exception("Timeout while waiting for quorum");
                }

                /// And what if it is possible that the current replica at this time has ceased to be active and the quorum is marked as failed and deleted?
                String value;
                if (!zookeeper->tryGet(storage.replica_path + "/is_active", value, nullptr)
                    || value != is_active_node_value)
                    throw Exception("Replica become inactive while waiting for quorum");
            }
            catch (...)
            {
                /// We do not know whether or not data has been inserted
                /// - whether other replicas have time to download the part and mark the quorum as done.
                /// Every failure of the wait, including the two exceptions thrown above, is reported with this single code.
                throw Exception("Unknown status, client must retry. Reason: " + getCurrentExceptionMessage(false),
                    ErrorCodes::UNKNOWN_STATUS_OF_INSERT);
            }

            LOG_TRACE(log, "Quorum satisfied");
        }
    }
}