Example #1
static void onExceptionBeforeStart(const String & query, Context & context, time_t current_time)
{
    /// Exception before the query execution.
    context.getQuota().addError();

    bool log_queries = context.getSettingsRef().log_queries;

    /// Log the start of query execution into the table if necessary.
    if (log_queries)
    {
        QueryLogElement elem;

        elem.type = QueryLogElement::EXCEPTION_BEFORE_START;

        elem.event_time = current_time;
        elem.query_start_time = current_time;

        elem.query = query.substr(0, context.getSettingsRef().log_queries_cut_to_length);
        elem.exception = getCurrentExceptionMessage(false);

        elem.client_info = context.getClientInfo();

        setExceptionStackTrace(elem);
        logException(context, elem);

        context.getQueryLog().add(elem);
    }
}
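
getCurrentExceptionMessage only has something to report when it runs inside a catch block, which is why onExceptionBeforeStart is always invoked from exception handlers. Below is a minimal self-contained sketch of the same pattern using only the standard library; currentExceptionText and onFailureBeforeStart are hypothetical stand-ins, not ClickHouse APIs:

#include <exception>
#include <iostream>
#include <stdexcept>
#include <string>

/// Hypothetical stand-in for getCurrentExceptionMessage: it must be called
/// from inside a catch block, where std::current_exception() is non-null.
static std::string currentExceptionText()
{
    try
    {
        if (auto e = std::current_exception())
            std::rethrow_exception(e);
        return "no exception in flight";
    }
    catch (const std::exception & e)
    {
        return e.what();
    }
    catch (...)
    {
        return "unknown exception";
    }
}

/// Mirrors onExceptionBeforeStart: record the query text and the message of
/// the in-flight exception before the query ever started executing.
static void onFailureBeforeStart(const std::string & query)
{
    std::cerr << "query failed before start: " << query
              << ", reason: " << currentExceptionText() << '\n';
}

int main()
{
    try
    {
        throw std::runtime_error("syntax error at position 1");
    }
    catch (...)
    {
        onFailureBeforeStart("SELECT 1");
    }
}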
Example #2
static void onExceptionBeforeStart(const String & query, Context & context, time_t current_time)
{
	/// Exception before the query execution.
	context.getQuota().addError(current_time);

	bool log_queries = context.getSettingsRef().log_queries;

	/// Log the start of query execution into the table if necessary.
	if (log_queries)
	{
		QueryLogElement elem;

		elem.type = QueryLogElement::EXCEPTION_BEFORE_START;

		elem.event_time = current_time;
		elem.query_start_time = current_time;

		elem.query = query.substr(0, context.getSettingsRef().log_queries_cut_to_length);
		elem.exception = getCurrentExceptionMessage(false);

		setClientInfo(elem, context);
		setExceptionStackTrace(elem);
		logException(context, elem);

		context.getQueryLog().add(elem);
	}
}
Example #3
std::string getExceptionMessage(std::exception_ptr e, bool with_stacktrace)
{
    try
    {
        std::rethrow_exception(std::move(e));
    }
    catch (...)
    {
        return getCurrentExceptionMessage(with_stacktrace);
    }
}
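
Example 3 shows the only portable way to turn a stored std::exception_ptr into text: rethrow it and catch the result. A standalone sketch of the same trick (messageOf is a hypothetical name; like the original, it requires a non-null pointer):

#include <exception>
#include <iostream>
#include <stdexcept>
#include <string>

/// Same shape as Example 3: rethrow the pointer and catch whatever comes out.
static std::string messageOf(std::exception_ptr e)
{
    try
    {
        std::rethrow_exception(std::move(e));
    }
    catch (const std::exception & ex)
    {
        return ex.what();
    }
    catch (...)
    {
        return "unknown exception";
    }
}

int main()
{
    std::exception_ptr p;
    try { throw std::logic_error("boom"); }
    catch (...) { p = std::current_exception(); }

    std::cout << messageOf(p) << '\n';   /// prints "boom"
}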
Example #4
void StorageSystemModels::fillData(MutableColumns & res_columns, const Context & context, const SelectQueryInfo &) const
{
    const auto & external_models = context.getExternalModels();
    auto objects_map = external_models.getObjectsMap();
    const auto & models = objects_map.get();

    for (const auto & model_info : models)
    {
        res_columns[0]->insert(model_info.first);
        res_columns[1]->insert(model_info.second.origin);

        if (model_info.second.loadable)
        {
            const auto model_ptr = std::static_pointer_cast<IModel>(model_info.second.loadable);

            res_columns[2]->insert(model_ptr->getTypeName());
            res_columns[3]->insert(static_cast<UInt64>(std::chrono::system_clock::to_time_t(model_ptr->getCreationTime())));
        }
        else
        {
            res_columns[2]->insertDefault();
            res_columns[3]->insertDefault();
        }

        if (model_info.second.exception)
        {
            try
            {
                std::rethrow_exception(model_info.second.exception);
            }
            catch (...)
            {
                res_columns[4]->insert(getCurrentExceptionMessage(false));
            }
        }
        else
            res_columns[4]->insertDefault();
    }
}
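
The system-table pattern in Example 4 stores a load failure as an std::exception_ptr and renders it to a string only when the table is actually queried. A hedged sketch of the same idea with a toy ModelInfo struct (all names here are invented for illustration):

#include <exception>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

/// Invented registry entry, loosely following Example 4: a failed load keeps
/// the exception alive as std::exception_ptr so its message can be rendered later.
struct ModelInfo
{
    std::string name;
    std::exception_ptr exception;   /// null if the load succeeded
};

static std::string renderException(std::exception_ptr e)
{
    try { std::rethrow_exception(e); }
    catch (const std::exception & ex) { return ex.what(); }
    catch (...) { return "unknown exception"; }
}

int main()
{
    std::vector<ModelInfo> models;
    models.push_back({"ok_model", nullptr});

    try { throw std::runtime_error("model file not found"); }
    catch (...) { models.push_back({"bad_model", std::current_exception()}); }

    /// Analogue of fillData: one row per model, empty cell when there is no error.
    for (const auto & m : models)
        std::cout << m.name << '\t' << (m.exception ? renderException(m.exception) : "") << '\n';
}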
Example #5
ConnectionPoolWithFailover::TryResult
ConnectionPoolWithFailover::tryGetEntry(
        IConnectionPool & pool,
        std::string & fail_message,
        const Settings * settings,
        const QualifiedTableName * table_to_check)
{
    TryResult result;
    try
    {
        result.entry = pool.get(settings, /* force_connected = */ false);

        String server_name;
        UInt64 server_version_major;
        UInt64 server_version_minor;
        UInt64 server_revision;
        if (table_to_check)
            result.entry->getServerVersion(server_name, server_version_major, server_version_minor, server_revision);

        if (!table_to_check || server_revision < DBMS_MIN_REVISION_WITH_TABLES_STATUS)
        {
            result.entry->forceConnected();
            result.is_usable = true;
            result.is_up_to_date = true;
            return result;
        }

        /// Only status of the remote table corresponding to the Distributed table is taken into account.
        /// TODO: request status for joined tables also.
        TablesStatusRequest status_request;
        status_request.tables.emplace(*table_to_check);

        TablesStatusResponse status_response = result.entry->getTablesStatus(status_request);
        auto table_status_it = status_response.table_states_by_id.find(*table_to_check);
        if (table_status_it == status_response.table_states_by_id.end())
        {
            fail_message = "There is no table " + table_to_check->database + "." + table_to_check->table
                + " on server: " + result.entry->getDescription();
            LOG_WARNING(log, fail_message);
            ProfileEvents::increment(ProfileEvents::DistributedConnectionMissingTable);

            return result;
        }

        result.is_usable = true;

        UInt64 max_allowed_delay = settings ? UInt64(settings->max_replica_delay_for_distributed_queries) : 0;
        if (!max_allowed_delay)
        {
            result.is_up_to_date = true;
            return result;
        }

        UInt32 delay = table_status_it->second.absolute_delay;

        if (delay < max_allowed_delay)
            result.is_up_to_date = true;
        else
        {
            result.is_up_to_date = false;
            result.staleness = delay;

            LOG_TRACE(
                    log, "Server " << result.entry->getDescription() << " has unacceptable replica delay "
                    << "for table " << table_to_check->database << "." << table_to_check->table
                    << ": "  << delay);
            ProfileEvents::increment(ProfileEvents::DistributedConnectionStaleReplica);
        }
    }
    catch (const Exception & e)
    {
        if (e.code() != ErrorCodes::NETWORK_ERROR && e.code() != ErrorCodes::SOCKET_TIMEOUT
            && e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
            throw;

        fail_message = getCurrentExceptionMessage(/* with_stacktrace = */ false);

        if (!result.entry.isNull())
        {
            result.entry->disconnect();
            result.reset();
        }
    }
    return result;
}
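
The catch clause in tryGetEntry separates retriable network errors, which become a fail_message so the failover pool can try the next replica, from everything else, which is rethrown. A minimal sketch of that selective-rethrow pattern, with an invented CodedError type and made-up error codes standing in for DB::Exception and ErrorCodes:

#include <iostream>
#include <stdexcept>
#include <string>

/// Invented stand-in for DB::Exception: an exception carrying a numeric code.
struct CodedError : std::runtime_error
{
    int code;
    CodedError(int code_, const std::string & msg) : std::runtime_error(msg), code(code_) {}
};

/// Made-up codes standing in for ErrorCodes::NETWORK_ERROR etc.
enum { NETWORK_ERROR = 1, SOCKET_TIMEOUT = 2, LOGICAL_ERROR = 3 };

/// Mirrors the catch clause of tryGetEntry: retriable errors become a
/// fail_message and a negative result; anything else is rethrown.
static bool tryConnect(bool simulate_network_failure, std::string & fail_message)
{
    try
    {
        if (simulate_network_failure)
            throw CodedError(NETWORK_ERROR, "connection refused");
        return true;
    }
    catch (const CodedError & e)
    {
        if (e.code != NETWORK_ERROR && e.code != SOCKET_TIMEOUT)
            throw;                /// not retriable: let the caller see it
        fail_message = e.what();  /// retriable: report it, the pool tries the next replica
        return false;
    }
}

int main()
{
    std::string fail_message;
    if (!tryConnect(true, fail_message))
        std::cout << "replica failed: " << fail_message << '\n';
}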
Example #6
BlockInputStreams StorageSystemDictionaries::read(
    const Names & column_names,
    const ASTPtr & query,
    const Context & context,
    QueryProcessingStage::Enum & processed_stage,
    const size_t max_block_size,
    const unsigned)
{
    check(column_names);
    processed_stage = QueryProcessingStage::FetchColumns;

    ColumnWithTypeAndName col_name{std::make_shared<ColumnString>(), std::make_shared<DataTypeString>(), "name"};
    ColumnWithTypeAndName col_origin{std::make_shared<ColumnString>(), std::make_shared<DataTypeString>(), "origin"};
    ColumnWithTypeAndName col_type{std::make_shared<ColumnString>(), std::make_shared<DataTypeString>(), "type"};
    ColumnWithTypeAndName col_key{std::make_shared<ColumnString>(), std::make_shared<DataTypeString>(), "key"};
    ColumnWithTypeAndName col_attribute_names{
        std::make_shared<ColumnArray>(std::make_shared<ColumnString>()),
        std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>()),
        "attribute.names"
    };
    ColumnWithTypeAndName col_attribute_types{
        std::make_shared<ColumnArray>(std::make_shared<ColumnString>()),
        std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>()),
        "attribute.types"
    };
    ColumnWithTypeAndName col_has_hierarchy{std::make_shared<ColumnUInt8>(), std::make_shared<DataTypeUInt8>(), "has_hierarchy"};
    ColumnWithTypeAndName col_bytes_allocated{std::make_shared<ColumnUInt64>(), std::make_shared<DataTypeUInt64>(), "bytes_allocated"};
    ColumnWithTypeAndName col_query_count{std::make_shared<ColumnUInt64>(), std::make_shared<DataTypeUInt64>(), "query_count"};
    ColumnWithTypeAndName col_hit_rate{std::make_shared<ColumnFloat64>(), std::make_shared<DataTypeFloat64>(), "hit_rate"};
    ColumnWithTypeAndName col_element_count{std::make_shared<ColumnUInt64>(), std::make_shared<DataTypeUInt64>(), "element_count"};
    ColumnWithTypeAndName col_load_factor{std::make_shared<ColumnFloat64>(), std::make_shared<DataTypeFloat64>(), "load_factor"};
    ColumnWithTypeAndName col_creation_time{std::make_shared<ColumnUInt32>(), std::make_shared<DataTypeDateTime>(), "creation_time"};
    ColumnWithTypeAndName col_last_exception{std::make_shared<ColumnString>(), std::make_shared<DataTypeString>(), "last_exception"};
    ColumnWithTypeAndName col_source{std::make_shared<ColumnString>(), std::make_shared<DataTypeString>(), "source"};

    const auto & external_dictionaries = context.getExternalDictionaries();
    const std::lock_guard<std::mutex> lock{external_dictionaries.dictionaries_mutex};

    for (const auto & dict_info : external_dictionaries.dictionaries)
    {
        col_name.column->insert(dict_info.first);
        col_origin.column->insert(dict_info.second.origin);

        if (dict_info.second.dict)
        {
            const auto dict_ptr = dict_info.second.dict->get();

            col_type.column->insert(dict_ptr->getTypeName());

            const auto & dict_struct = dict_ptr->getStructure();
            col_key.column->insert(dict_struct.getKeyDescription());

            col_attribute_names.column->insert(ext::map<Array>(dict_struct.attributes, [] (auto & attr) -> decltype(auto) {
                return attr.name;
            }));
            col_attribute_types.column->insert(ext::map<Array>(dict_struct.attributes, [] (auto & attr) -> decltype(auto) {
                return attr.type->getName();
            }));
            col_bytes_allocated.column->insert(dict_ptr->getBytesAllocated());
            col_query_count.column->insert(dict_ptr->getQueryCount());
            col_hit_rate.column->insert(dict_ptr->getHitRate());
            col_element_count.column->insert(dict_ptr->getElementCount());
            col_load_factor.column->insert(dict_ptr->getLoadFactor());
            col_creation_time.column->insert(std::chrono::system_clock::to_time_t(dict_ptr->getCreationTime()));
            col_source.column->insert(dict_ptr->getSource()->toString());
        }
        else
        {
            col_type.column->insertDefault();
            col_key.column->insertDefault();
            col_attribute_names.column->insertDefault();
            col_attribute_types.column->insertDefault();
            col_bytes_allocated.column->insertDefault();
            col_query_count.column->insertDefault();
            col_hit_rate.column->insertDefault();
            col_element_count.column->insertDefault();
            col_load_factor.column->insertDefault();
            col_creation_time.column->insertDefault();
            col_source.column->insertDefault();
        }

        if (dict_info.second.exception)
        {
            try
            {
                std::rethrow_exception(dict_info.second.exception);
            }
            catch (...)
            {
                col_last_exception.column->insert(getCurrentExceptionMessage(false));
            }
        }
        else
            col_last_exception.column->insertDefault();
    }

    Block block{
        col_name,
        col_origin,
        col_type,
        col_key,
        col_attribute_names,
        col_attribute_types,
        col_bytes_allocated,
        col_query_count,
        col_hit_rate,
        col_element_count,
        col_load_factor,
        col_creation_time,
        col_last_exception,
        col_source
    };

    return BlockInputStreams{1, std::make_shared<OneBlockInputStream>(block)};
}
Example #7
void ODBCHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response)
{
    Poco::Net::HTMLForm params(request, request.stream());
    LOG_TRACE(log, "Request URI: " + request.getURI());

    auto process_error = [&response, this](const std::string & message)
    {
        response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR);
        if (!response.sent())
            response.send() << message << std::endl;
        LOG_WARNING(log, message);
    };

    if (!params.has("query"))
    {
        process_error("No 'query' in request body");
        return;
    }

    if (!params.has("columns"))
    {
        process_error("No 'columns' in request URL");
        return;
    }

    if (!params.has("connection_string"))
    {
        process_error("No 'connection_string' in request URL");
        return;
    }

    UInt64 max_block_size = DEFAULT_BLOCK_SIZE;
    if (params.has("max_block_size"))
    {
        std::string max_block_size_str = params.get("max_block_size", "");
        if (max_block_size_str.empty())
        {
            process_error("Empty max_block_size specified");
            return;
        }
        max_block_size = parse<size_t>(max_block_size_str);
    }

    std::string columns = params.get("columns");
    std::unique_ptr<Block> sample_block;
    try
    {
        sample_block = parseColumns(std::move(columns));
    }
    catch (const Exception & ex)
    {
        process_error("Invalid 'columns' parameter in request body '" + ex.message() + "'");
        LOG_WARNING(log, ex.getStackTrace().toString());
        return;
    }

    std::string format = params.get("format", "RowBinary");
    std::string query = params.get("query");
    LOG_TRACE(log, "Query: " << query);

    std::string connection_string = params.get("connection_string");
    LOG_TRACE(log, "Connection string: '" << connection_string << "'");

    WriteBufferFromHTTPServerResponse out(request, response, keep_alive_timeout);
    try
    {
        BlockOutputStreamPtr writer = FormatFactory::instance().getOutput(format, out, *sample_block, *context);
        auto pool = getPool(connection_string);
        ODBCBlockInputStream inp(pool->get(), query, *sample_block, max_block_size);
        copyData(inp, *writer);
    }
    catch (...)
    {
        auto message = getCurrentExceptionMessage(true);
        response.setStatusAndReason(
            Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR); // cannot call process_error here: the response may already be partially sent
        writeStringBinary(message, out);
        tryLogCurrentException(log);
    }
}
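
The comment in the catch block above points at an HTTP constraint: once body bytes have been streamed, the status line is already on the wire, so the handler can only append the error text to the output. A toy illustration of that constraint (FakeResponse is hypothetical; Poco's HTTPServerResponse exposes the analogous check via response.sent()):

#include <iostream>
#include <sstream>
#include <string>

struct FakeResponse
{
    bool headers_sent = false;
    int status = 200;
    std::ostringstream body;

    void setStatus(int s)
    {
        if (!headers_sent)
            status = s;       /// otherwise the status line is already on the wire
    }

    std::ostream & send()
    {
        headers_sent = true;  /// the first body byte flushes the headers
        return body;
    }
};

int main()
{
    FakeResponse response;
    response.send() << "partial result...";   /// streaming has started

    /// An exception fires mid-stream: the status can no longer change,
    /// so the error message can only be appended to the body.
    response.setStatus(500);
    response.body << "\nerror: query failed";

    std::cout << "status=" << response.status << '\n' << response.body.str() << '\n';
}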
Example #8
void ReplicasStatusHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response)
{
	try
	{
		HTMLForm params(request);

		/// Even when the delay is small, print detailed information about it.
		bool verbose = params.get("verbose", "") == "1";

		const MergeTreeSettings & settings = context.getMergeTreeSettings();

		bool ok = true;
		std::stringstream message;

		auto databases = context.getDatabases();

		/// Iterate over all replicated tables.
		for (const auto & db : databases)
		{
			for (auto iterator = db.second->getIterator(); iterator->isValid(); iterator->next())
			{
				auto & table = iterator->table();
				StorageReplicatedMergeTree * table_replicated = typeid_cast<StorageReplicatedMergeTree *>(table.get());

				if (!table_replicated)
					continue;

				time_t absolute_delay = 0;
				time_t relative_delay = 0;

				table_replicated->getReplicaDelays(absolute_delay, relative_delay);

				if ((settings.min_absolute_delay_to_close && absolute_delay >= static_cast<time_t>(settings.min_absolute_delay_to_close))
					|| (settings.min_relative_delay_to_close && relative_delay >= static_cast<time_t>(settings.min_relative_delay_to_close)))
					ok = false;

				message << backQuoteIfNeed(db.first) << "." << backQuoteIfNeed(iterator->name())
					<< ":\tAbsolute delay: " << absolute_delay << ". Relative delay: " << relative_delay << ".\n";
			}
		}

		setResponseDefaultHeaders(response);

		if (ok && !verbose)
		{
			const char * data = "Ok.\n";
			response.sendBuffer(data, strlen(data));
		}
		else
		{
			response.send() << message.rdbuf();
		}
	}
	catch (...)
	{
		tryLogCurrentException("ReplicasStatusHandler");

		try
		{
			response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR);

			if (!response.sent())
			{
				/// Nothing has been sent yet, and we don't even know whether the response needs to be compressed.
				response.send() << getCurrentExceptionMessage(false) << std::endl;
			}
		}
		catch (...)
		{
			LOG_ERROR((&Logger::get("ReplicasStatusHandler")), "Cannot send exception to client");
		}
	}
}
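
Example 8 wraps the error report itself in a second try/catch so the handler can never throw while telling the client about a failure. The same double-try pattern, reduced to a standalone sketch with invented flags:

#include <iostream>
#include <stdexcept>

/// Outer catch handles the request failure; the inner one guards the act of
/// reporting it, so the handler itself can never throw.
static void handleRequest(bool fail_request, bool fail_reporting)
{
    try
    {
        if (fail_request)
            throw std::runtime_error("tables are unavailable");
        std::cout << "Ok.\n";
    }
    catch (const std::exception & e)
    {
        try
        {
            if (fail_reporting)
                throw std::runtime_error("client disconnected");
            std::cout << "error sent to client: " << e.what() << '\n';
        }
        catch (...)
        {
            /// Last resort: log locally, never propagate.
            std::cerr << "Cannot send exception to client\n";
        }
    }
}

int main()
{
    handleRequest(true, false);   /// error is reported to the client
    handleRequest(true, true);    /// reporting fails too; only the local log fires
}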
Example #9
static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
	IParser::Pos begin,
	IParser::Pos end,
	Context & context,
	bool internal,
	QueryProcessingStage::Enum stage)
{
	ProfileEvents::increment(ProfileEvents::Query);
	time_t current_time = time(0);

	const Settings & settings = context.getSettingsRef();

	ParserQuery parser;
	ASTPtr ast;
	size_t query_size;
	size_t max_query_size = settings.max_query_size;

	try
	{
		ast = parseQuery(parser, begin, end, "");

		/// Copy the query into a string. It will be written to the log and shown in the process list. For an INSERT query, the string will not include the data to be inserted.
		query_size = ast->range.second - ast->range.first;

		if (max_query_size && query_size > max_query_size)
			throw Exception("Query is too large (" + toString(query_size) + ")."
				" max_query_size = " + toString(max_query_size), ErrorCodes::QUERY_IS_TOO_LARGE);
	}
	catch (...)
	{
		/// Log the query anyway.
		if (!internal)
		{
			String query = String(begin, begin + std::min(end - begin, static_cast<ptrdiff_t>(max_query_size)));
			logQuery(query.substr(0, settings.log_queries_cut_to_length), context);
			onExceptionBeforeStart(query, context, current_time);
		}

		throw;
	}

	String query(begin, query_size);
	BlockIO res;

	try
	{
		if (!internal)
			logQuery(query.substr(0, settings.log_queries_cut_to_length), context);

		/// Check the limits.
		checkLimits(*ast, settings.limits);

		QuotaForIntervals & quota = context.getQuota();

		quota.addQuery(current_time);
		quota.checkExceeded(current_time);

		/// Put query to process list. But don't put SHOW PROCESSLIST query itself.
		ProcessList::EntryPtr process_list_entry;
		if (!internal && nullptr == typeid_cast<const ASTShowProcesslistQuery *>(&*ast))
		{
			process_list_entry = context.getProcessList().insert(
				query,
				context.getUser(),
				context.getCurrentQueryId(),
				context.getIPAddress(),
				settings);

			context.setProcessListElement(&process_list_entry->get());
		}

		auto interpreter = InterpreterFactory::get(ast, context, stage);
		res = interpreter->execute();

		/// Hold element of process list till end of query execution.
		res.process_list_entry = process_list_entry;

		if (res.in)
		{
			if (IProfilingBlockInputStream * stream = dynamic_cast<IProfilingBlockInputStream *>(res.in.get()))
			{
				stream->setProgressCallback(context.getProgressCallback());
				stream->setProcessListElement(context.getProcessListElement());
			}
		}

		/// Everything related to query log.
		{
			QueryLogElement elem;

			elem.type = QueryLogElement::QUERY_START;

			elem.event_time = current_time;
			elem.query_start_time = current_time;

			elem.query = query.substr(0, settings.log_queries_cut_to_length);

			setClientInfo(elem, context);

			bool log_queries = settings.log_queries && !internal;

			/// Log the start of query execution into the system table, if needed.
			if (log_queries)
				context.getQueryLog().add(elem);

			/// Also make it possible for the caller to log a successful query finish and any exception during execution.
			res.finish_callback = [elem, &context, log_queries] (IBlockInputStream * stream) mutable
			{
				ProcessListElement * process_list_elem = context.getProcessListElement();

				if (!process_list_elem)
					return;

				double elapsed_seconds = process_list_elem->watch.elapsedSeconds();

				elem.type = QueryLogElement::QUERY_FINISH;

				elem.event_time = time(0);
				elem.query_duration_ms = elapsed_seconds * 1000;

				elem.read_rows = process_list_elem->progress.rows;
				elem.read_bytes = process_list_elem->progress.bytes;

				auto memory_usage = process_list_elem->memory_tracker.getPeak();
				elem.memory_usage = memory_usage > 0 ? memory_usage : 0;

				if (stream)
				{
					if (IProfilingBlockInputStream * profiling_stream = dynamic_cast<IProfilingBlockInputStream *>(stream))
					{
						const BlockStreamProfileInfo & info = profiling_stream->getProfileInfo();

						elem.result_rows = info.rows;
						elem.result_bytes = info.bytes;
					}
				}

				if (elem.read_rows != 0)
				{
					LOG_INFO(&Logger::get("executeQuery"), std::fixed << std::setprecision(3)
						<< "Read " << elem.read_rows << " rows, "
						<< formatReadableSizeWithBinarySuffix(elem.read_bytes) << " in " << elapsed_seconds << " sec., "
						<< static_cast<size_t>(elem.read_rows / elapsed_seconds) << " rows/sec., "
						<< formatReadableSizeWithBinarySuffix(elem.read_bytes / elapsed_seconds) << "/sec.");
				}

				if (log_queries)
					context.getQueryLog().add(elem);
			};

			res.exception_callback = [elem, &context, log_queries, current_time] () mutable
			{
				context.getQuota().addError(current_time);

				elem.type = QueryLogElement::EXCEPTION_WHILE_PROCESSING;

				elem.event_time = time(0);
				elem.query_duration_ms = 1000 * (elem.event_time - elem.query_start_time);
				elem.exception = getCurrentExceptionMessage(false);

				ProcessListElement * process_list_elem = context.getProcessListElement();

				if (process_list_elem)
				{
					double elapsed_seconds = process_list_elem->watch.elapsedSeconds();

					elem.query_duration_ms = elapsed_seconds * 1000;

					elem.read_rows = process_list_elem->progress.rows;
					elem.read_bytes = process_list_elem->progress.bytes;

					auto memory_usage = process_list_elem->memory_tracker.getPeak();
					elem.memory_usage = memory_usage > 0 ? memory_usage : 0;
				}

				setExceptionStackTrace(elem);
				logException(context, elem);

				if (log_queries)
					context.getQueryLog().add(elem);
			};

			if (!internal && res.in)
			{
				std::stringstream log_str;
				log_str << "Query pipeline:\n";
				res.in->dumpTree(log_str);
				LOG_DEBUG(&Logger::get("executeQuery"), log_str.str());
			}
		}
	}
	catch (...)
	{
		if (!internal)
			onExceptionBeforeStart(query, context, current_time);

		throw;
	}

	return std::make_tuple(ast, res);
}
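
A detail worth noting in Example 9: the QueryLogElement is captured by value in `mutable` lambdas, so finish_callback and exception_callback each own a copy of the prepared element that they can keep filling in later. A small sketch of that capture pattern with a toy LogElement:

#include <functional>
#include <iostream>
#include <string>

/// Toy version of QueryLogElement.
struct LogElement
{
    std::string query;
    std::string type = "QUERY_START";
    std::string exception;
};

int main()
{
    LogElement elem;
    elem.query = "SELECT 1";

    /// `mutable` is required: without it the by-value captures are const and
    /// the assignments below would not compile. Each lambda owns its own copy.
    std::function<void()> finish_callback = [elem] () mutable
    {
        elem.type = "QUERY_FINISH";
        std::cout << elem.query << " -> " << elem.type << '\n';
    };

    std::function<void()> exception_callback = [elem] () mutable
    {
        elem.type = "EXCEPTION_WHILE_PROCESSING";
        elem.exception = "something broke";
        std::cout << elem.query << " -> " << elem.type << ": " << elem.exception << '\n';
    };

    finish_callback();
    exception_callback();   /// unaffected by what finish_callback did to its copy
}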
Example #10
void ReplicatedMergeTreeBlockOutputStream::commitPart(zkutil::ZooKeeperPtr & zookeeper, MergeTreeData::MutableDataPartPtr & part, const String & block_id)
{
    storage.check(part->columns);
    assertSessionIsNotExpired(zookeeper);

    /// Obtain incremental block number and lock it. The lock holds our intention to add the block to the filesystem.
    /// We remove the lock just after renaming the part. In case of exception, block number will be marked as abandoned.
    /// Also, make deduplication check. If a duplicate is detected, no nodes are created.

    /// Allocate new block number and check for duplicates
    bool deduplicate_block = !block_id.empty();
    String block_id_path = deduplicate_block ? storage.zookeeper_path + "/blocks/" + block_id : "";
    auto block_number_lock = storage.allocateBlockNumber(part->info.partition_id, zookeeper, block_id_path);

    if (!block_number_lock)
    {
        LOG_INFO(log, "Block with ID " << block_id << " already exists; ignoring it.");
        part->is_duplicate = true;
        last_block_is_duplicate = true;
        ProfileEvents::increment(ProfileEvents::DuplicatedInsertedBlocks);
        return;
    }

    Int64 block_number = block_number_lock->getNumber();

    /// Set part attributes according to part_number. Prepare an entry for log.

    part->info.min_block = block_number;
    part->info.max_block = block_number;
    part->info.level = 0;

    String part_name = part->getNewName(part->info);
    part->name = part_name;

    StorageReplicatedMergeTree::LogEntry log_entry;
    log_entry.type = StorageReplicatedMergeTree::LogEntry::GET_PART;
    log_entry.create_time = time(nullptr);
    log_entry.source_replica = storage.replica_name;
    log_entry.new_part_name = part_name;
    log_entry.quorum = quorum;
    log_entry.block_id = block_id;

    /// Simultaneously add information about the part to all the necessary places in ZooKeeper and remove block_number_lock.

    /// Information about the part.
    Coordination::Requests ops;

    storage.getCommitPartOps(ops, part, block_id_path);

    /// Replication log.
    ops.emplace_back(zkutil::makeCreateRequest(
        storage.zookeeper_path + "/log/log-",
        log_entry.toString(),
        zkutil::CreateMode::PersistentSequential));

    /// Deletes the information that the block number is used for writing.
    block_number_lock->getUnlockOps(ops);

    /** If a quorum is needed, create a node in which the quorum is monitored.
        * (If such a node already exists, someone has managed to start another quorum insert at the same time,
        *  and its quorum has not yet been reached; a new quorum insert cannot be started until then.)
        */
    if (quorum)
    {
        ReplicatedMergeTreeQuorumEntry quorum_entry;
        quorum_entry.part_name = part_name;
        quorum_entry.required_number_of_replicas = quorum;
        quorum_entry.replicas.insert(storage.replica_name);

        /** At this point, this node will contain information that the current replica received a part.
            * When other replicas will receive this part (in the usual way, processing the replication log),
            *  they will add themselves to the contents of this node.
            * When it contains information about `quorum` number of replicas, this node is deleted,
            *  which indicates that the quorum has been reached.
            */

        ops.emplace_back(
            zkutil::makeCreateRequest(
                quorum_info.status_path,
                quorum_entry.toString(),
                zkutil::CreateMode::Persistent));

        /// Make sure that during the insertion time, the replica was not reinitialized or disabled (when the server is finished).
        ops.emplace_back(
            zkutil::makeCheckRequest(
                storage.replica_path + "/is_active",
                quorum_info.is_active_node_version));

        /// Unfortunately, just checking the above is not enough, because `is_active` node can be deleted and reappear with the same version.
        /// But then the `host` value will change. We will check this.
        /// It's great that these two nodes change in the same transaction (see MergeTreeRestartingThread).
        ops.emplace_back(
            zkutil::makeCheckRequest(
                storage.replica_path + "/host",
                quorum_info.host_node_version));
    }

    MergeTreeData::Transaction transaction(storage.data); /// If the part cannot be added to ZK, we'll remove it back from the working set.
    storage.data.renameTempPartAndAdd(part, nullptr, &transaction);

    Coordination::Responses responses;
    int32_t multi_code = zookeeper->tryMultiNoThrow(ops, responses); /// 1 RTT

    if (multi_code == Coordination::ZOK)
    {
        transaction.commit();
        storage.merge_selecting_task->schedule();

        /// Lock nodes have been already deleted, do not delete them in destructor
        block_number_lock->assumeUnlocked();
    }
    else if (multi_code == Coordination::ZCONNECTIONLOSS
        || multi_code == Coordination::ZOPERATIONTIMEOUT)
    {
        /** If the connection is lost and we do not know whether the changes were applied, we cannot delete the local part:
          *  if the changes were applied, the inserted block has appeared in `/blocks/` and cannot be inserted again.
          */
        transaction.commit();
        storage.enqueuePartForCheck(part->name, MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER);

        /// We do not know whether or not data has been inserted.
        throw Exception("Unknown status, client must retry. Reason: " + String(Coordination::errorMessage(multi_code)),
            ErrorCodes::UNKNOWN_STATUS_OF_INSERT);
    }
    else if (Coordination::isUserError(multi_code))
    {
        String failed_op_path = zkutil::KeeperMultiException(multi_code, ops, responses).getPathForFirstFailedOp();

        if (multi_code == Coordination::ZNODEEXISTS && deduplicate_block && failed_op_path == block_id_path)
        {
            /// A block with the same ID has just appeared in the table (or on another replica); roll back the insertion.
            LOG_INFO(log, "Block with ID " << block_id << " already exists; ignoring it (removing part " << part->name << ")");

            part->is_duplicate = true;
            transaction.rollback();
            last_block_is_duplicate = true;
            ProfileEvents::increment(ProfileEvents::DuplicatedInsertedBlocks);
        }
        else if (multi_code == Coordination::ZNODEEXISTS && failed_op_path == quorum_info.status_path)
        {
            transaction.rollback();

            throw Exception("Another quorum insert has been already started", ErrorCodes::UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE);
        }
        else
        {
            /// NOTE: We could be here if the node with the quorum existed, but was quickly removed.
            transaction.rollback();
            throw Exception("Unexpected logical error while adding block " + toString(block_number) + " with ID '" + block_id + "': "
                            + zkutil::ZooKeeper::error2string(multi_code) + ", path " + failed_op_path,
                            ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
        }
    }
    else if (Coordination::isHardwareError(multi_code))
    {
        transaction.rollback();
        throw Exception("Unrecoverable network error while adding block " + toString(block_number) + " with ID '" + block_id + "': "
                        + zkutil::ZooKeeper::error2string(multi_code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
    }
    else
    {
        transaction.rollback();
        throw Exception("Unexpected ZooKeeper error while adding block " + toString(block_number) + " with ID '" + block_id + "': "
                        + zkutil::ZooKeeper::error2string(multi_code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
    }

    if (quorum)
    {
        /// We are waiting for quorum to be satisfied.
        LOG_TRACE(log, "Waiting for quorum");

        String quorum_status_path = storage.zookeeper_path + "/quorum/status";

        try
        {
            while (true)
            {
                zkutil::EventPtr event = std::make_shared<Poco::Event>();

                std::string value;
                /// `get` instead of `exists` so that `watch` does not leak if the node is no longer there.
                if (!zookeeper->tryGet(quorum_status_path, value, nullptr, event))
                    break;

                ReplicatedMergeTreeQuorumEntry quorum_entry(value);

                /// The node may have disappeared and then reappeared for the next insert.
                if (quorum_entry.part_name != part_name)
                    break;

                if (!event->tryWait(quorum_timeout_ms))
                    throw Exception("Timeout while waiting for quorum", ErrorCodes::TIMEOUT_EXCEEDED);
            }

            /// What if the current replica has meanwhile ceased to be active, and the quorum has been marked as failed and deleted?
            String value;
            if (!zookeeper->tryGet(storage.replica_path + "/is_active", value, nullptr)
                || value != quorum_info.is_active_node_value)
                throw Exception("Replica become inactive while waiting for quorum", ErrorCodes::NO_ACTIVE_REPLICAS);
        }
        catch (...)
        {
            /// We do not know whether the data has been inserted:
            /// other replicas may have had time to download the part and mark the quorum as done.
            throw Exception("Unknown status, client must retry. Reason: " + getCurrentExceptionMessage(false),
                ErrorCodes::UNKNOWN_STATUS_OF_INSERT);
        }

        LOG_TRACE(log, "Quorum satisfied");
    }
}
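
When commitPart loses the ZooKeeper connection mid-transaction, or cannot confirm the quorum, it deliberately reports UNKNOWN_STATUS_OF_INSERT instead of rolling back, because the insert may in fact have succeeded. The core of that pattern as a standalone sketch (UnknownStatusError and waitForQuorum are invented names):

#include <iostream>
#include <stdexcept>
#include <string>

/// Invented exception type; ClickHouse uses ErrorCodes::UNKNOWN_STATUS_OF_INSERT.
struct UnknownStatusError : std::runtime_error
{
    explicit UnknownStatusError(const std::string & reason)
        : std::runtime_error("Unknown status, client must retry. Reason: " + reason) {}
};

/// If we cannot tell whether the commit landed, neither roll back nor report
/// success: surface an explicit "retry" error built from whatever interrupted us.
static void waitForQuorum(bool simulate_timeout)
{
    try
    {
        if (simulate_timeout)
            throw std::runtime_error("Timeout while waiting for quorum");
        /// ... otherwise the quorum was satisfied ...
    }
    catch (const std::exception & e)
    {
        throw UnknownStatusError(e.what());   /// the part may or may not have replicated
    }
}

int main()
{
    try { waitForQuorum(true); }
    catch (const std::exception & e) { std::cout << e.what() << '\n'; }
}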
Example #11
void tryLogCurrentException(Poco::Logger * logger, const std::string & start_of_message)
{
    try
    {
        LOG_ERROR(logger, start_of_message << (start_of_message.empty() ? "" : ": ") << getCurrentExceptionMessage(true));
    }
    catch (...)
    {
    }
}
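
tryLogCurrentException exists so that logging an in-flight exception can never raise a second one: both the formatting and the log write sit inside try/catch. A self-contained approximation (tryLogCurrent is a hypothetical stand-in; it must be called from within a catch block, since the bare `throw;` re-raises the exception being handled):

#include <exception>
#include <iostream>
#include <stdexcept>
#include <string>

static void tryLogCurrent(const std::string & start_of_message)
{
    try
    {
        std::string text;
        try { throw; }   /// re-raise the in-flight exception to inspect it
        catch (const std::exception & e) { text = e.what(); }
        catch (...) { text = "unknown exception"; }

        std::cerr << start_of_message << (start_of_message.empty() ? "" : ": ") << text << '\n';
    }
    catch (...)
    {
        /// Never propagate from a logging helper.
    }
}

int main()
{
    try { throw std::runtime_error("disk full"); }
    catch (...) { tryLogCurrent("background task failed"); }
}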
Example #12
ExecutionStatus ExecutionStatus::fromCurrentException(const std::string & start_of_message)
{
    String msg = (start_of_message.empty() ? "" : (start_of_message + ": ")) + getCurrentExceptionMessage(false, true);
    return ExecutionStatus(getCurrentExceptionCode(), msg);
}
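
ExecutionStatus::fromCurrentException packages the handled exception into a (code, message) pair that can be stored or sent over the wire. A rough standalone equivalent (Status and its codes are invented; ClickHouse derives the real code via getCurrentExceptionCode()):

#include <iostream>
#include <stdexcept>
#include <string>

struct Status
{
    int code = 0;
    std::string message;

    /// Must be called from a catch block, like the original.
    static Status fromCurrentException(const std::string & start_of_message)
    {
        std::string prefix = start_of_message.empty() ? "" : start_of_message + ": ";
        Status s;
        try { throw; }
        catch (const std::exception & e) { s.code = 1; s.message = prefix + e.what(); }
        catch (...) { s.code = 2; s.message = prefix + "unknown exception"; }
        return s;
    }
};

int main()
{
    Status st;
    try { throw std::runtime_error("no such table"); }
    catch (...) { st = Status::fromCurrentException("DDL worker"); }

    std::cout << st.code << ' ' << st.message << '\n';
}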
Example #13
void ReplicatedMergeTreeBlockOutputStream::write(const Block & block)
{
    /// TODO Is it possible to avoid locking the table structure here?
    storage.data.delayInsertIfNeeded(&storage.restarting_thread->getWakeupEvent());

    auto zookeeper = storage.getZooKeeper();

    assertSessionIsNotExpired(zookeeper);

    /** If the write is with quorum, check that the required number of replicas is currently alive,
      *  and that the quorum has been reached for all previous parts that required one.
      * Also check that during the insertion the replica was not reinitialized or disabled (via the value of the `is_active` node).
      * TODO The logic is too complex; it could be simplified.
      */
    String quorum_status_path = storage.zookeeper_path + "/quorum/status";
    String is_active_node_value;
    int is_active_node_version = -1;
    int host_node_version = -1;
    if (quorum)
    {
        zkutil::ZooKeeper::TryGetFuture quorum_status_future = zookeeper->asyncTryGet(quorum_status_path);
        zkutil::ZooKeeper::TryGetFuture is_active_future = zookeeper->asyncTryGet(storage.replica_path + "/is_active");
        zkutil::ZooKeeper::TryGetFuture host_future = zookeeper->asyncTryGet(storage.replica_path + "/host");

        /// List of live replicas. All of them register an ephemeral node for leader_election.

        zkutil::Stat leader_election_stat;
        zookeeper->get(storage.zookeeper_path + "/leader_election", &leader_election_stat);

        if (leader_election_stat.numChildren < static_cast<int32_t>(quorum))
            throw Exception("Number of alive replicas ("
                + toString(leader_election_stat.numChildren) + ") is less than requested quorum (" + toString(quorum) + ").",
                ErrorCodes::TOO_LESS_LIVE_REPLICAS);

        /** Is there a quorum for the last part that required one?
            * Writes of all parts with quorum enabled are linearly ordered.
            * This means that at any moment there can be only one part
            *  that still needs, but has not yet reached, its quorum.
            * Information about this part is kept in the `/quorum/status` node.
            * Once the quorum is reached, the node is deleted.
            */

        auto quorum_status = quorum_status_future.get();
        if (quorum_status.exists)
            throw Exception("Quorum for previous write has not been satisfied yet. Status: " + quorum_status.value, ErrorCodes::UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE);

        /// Both checks are implicitly made also later (otherwise there would be a race condition).

        auto is_active = is_active_future.get();
        auto host = host_future.get();

        if (!is_active.exists || !host.exists)
            throw Exception("Replica is not active right now", ErrorCodes::READONLY);

        is_active_node_value = is_active.value;
        is_active_node_version = is_active.stat.version;
        host_node_version = host.stat.version;
    }

    auto part_blocks = storage.writer.splitBlockIntoParts(block);

    for (auto & current_block : part_blocks)
    {
        assertSessionIsNotExpired(zookeeper);

        ++block_index;
        String block_id = insert_id.empty() ? "" : insert_id + "__" + toString(block_index);
        String month_name = toString(DateLUT::instance().toNumYYYYMMDD(DayNum_t(current_block.min_date)) / 100);

        AbandonableLockInZooKeeper block_number_lock = storage.allocateBlockNumber(month_name);    /// 2 RTT

        Int64 part_number = block_number_lock.getNumber();

        MergeTreeData::MutableDataPartPtr part = storage.writer.writeTempPart(current_block, part_number);
        String part_name = ActiveDataPartSet::getPartName(part->left_date, part->right_date, part->left, part->right, part->level);

        /// Hash from the data.
        SipHash hash;
        part->checksums.summaryDataChecksum(hash);
        union
        {
            char bytes[16];
            UInt64 words[2];
        } hash_value;
        hash.get128(hash_value.bytes);

        String checksum(hash_value.bytes, 16);

        /// If no ID is specified in the query, take the hash of the data as the ID, so the same data is not inserted twice.
        /// NOTE: If this deduplication is not needed, `block_id` can be left empty instead.
        ///       A setting or query syntax (for example, `ID = null`) could be added for this.
        if (block_id.empty())
        {
            block_id = toString(hash_value.words[0]) + "_" + toString(hash_value.words[1]);

            if (block_id.empty())
                throw Exception("Logical error: block_id is empty.", ErrorCodes::LOGICAL_ERROR);
        }

        LOG_DEBUG(log, "Wrote block " << part_number << " with ID " << block_id << ", " << current_block.block.rows() << " rows");

        StorageReplicatedMergeTree::LogEntry log_entry;
        log_entry.type = StorageReplicatedMergeTree::LogEntry::GET_PART;
        log_entry.create_time = time(0);
        log_entry.source_replica = storage.replica_name;
        log_entry.new_part_name = part_name;
        log_entry.quorum = quorum;
        log_entry.block_id = block_id;

        /// Simultaneously add information about the part to all the necessary places in ZooKeeper and remove block_number_lock.

        /// Information about the block.
        zkutil::Ops ops;
        auto acl = zookeeper->getDefaultACL();

        ops.emplace_back(
            std::make_unique<zkutil::Op::Create>(
                storage.zookeeper_path + "/blocks/" + block_id,
                "",
                acl,
                zkutil::CreateMode::Persistent));
        ops.emplace_back(
            std::make_unique<zkutil::Op::Create>(
                storage.zookeeper_path + "/blocks/" + block_id + "/checksum",
                checksum,
                acl,
                zkutil::CreateMode::Persistent));
        ops.emplace_back(
            std::make_unique<zkutil::Op::Create>(
                storage.zookeeper_path + "/blocks/" + block_id + "/number",
                toString(part_number),
                acl,
                zkutil::CreateMode::Persistent));

        /// Information about the part, in the replica data.
        storage.addNewPartToZooKeeper(part, ops, part_name);

        /// Replication log.
        ops.emplace_back(std::make_unique<zkutil::Op::Create>(
            storage.zookeeper_path + "/log/log-",
            log_entry.toString(),
            acl,
            zkutil::CreateMode::PersistentSequential));

        /// Deletes the information that the block number is used for writing.
        block_number_lock.getUnlockOps(ops);

        /** If a quorum is needed, create a node in which the quorum is monitored.
            * (If such a node already exists, someone has managed to start another quorum insert at the same time,
            *  and its quorum has not yet been reached; a new quorum insert cannot be started until then.)
            */
        if (quorum)
        {
            ReplicatedMergeTreeQuorumEntry quorum_entry;
            quorum_entry.part_name = part_name;
            quorum_entry.required_number_of_replicas = quorum;
            quorum_entry.replicas.insert(storage.replica_name);

            /** At this point, this node will contain information that the current replica received the part.
                * When other replicas receive this part (in the usual way, by processing the replication log),
                *  they will add themselves to the contents of this node.
                * When it contains information about `quorum` replicas, the node is deleted,
                *  which indicates that the quorum has been reached.
                */

            ops.emplace_back(
                std::make_unique<zkutil::Op::Create>(
                    quorum_status_path,
                    quorum_entry.toString(),
                    acl,
                    zkutil::CreateMode::Persistent));

            /// Make sure that during the insertion time, the replica was not reinitialized or disabled (when the server is finished).
            ops.emplace_back(
                std::make_unique<zkutil::Op::Check>(
                    storage.replica_path + "/is_active",
                    is_active_node_version));

            /// Unfortunately, just checking the above is not enough, because `is_active` node can be deleted and reappear with the same version.
            /// But then the `host` value will change. We will check this.
            /// It's great that these two nodes change in the same transaction (see MergeTreeRestartingThread).
            ops.emplace_back(
                std::make_unique<zkutil::Op::Check>(
                    storage.replica_path + "/host",
                    host_node_version));
        }

        MergeTreeData::Transaction transaction; /// If the part cannot be added to ZK, we'll remove it again from the working set.
        storage.data.renameTempPartAndAdd(part, nullptr, &transaction);

        try
        {
            auto code = zookeeper->tryMulti(ops);
            if (code == ZOK)
            {
                transaction.commit();
                storage.merge_selecting_event.set();
            }
            else if (code == ZNODEEXISTS)
            {
                /// If the block with such ID already exists in the table, roll back its insertion.
                String expected_checksum;
                if (!block_id.empty() && zookeeper->tryGet(
                    storage.zookeeper_path + "/blocks/" + block_id + "/checksum", expected_checksum))
                {
                    LOG_INFO(log, "Block with ID " << block_id << " already exists; ignoring it (removing part " << part->name << ")");

                    /// If the data is different from the ones that were inserted earlier with the same ID, throw an exception.
                    if (expected_checksum != checksum)
                    {
                        if (!insert_id.empty())
                            throw Exception("Attempt to insert block with same ID but different checksum", ErrorCodes::CHECKSUM_DOESNT_MATCH);
                        else
                            throw Exception("Logical error: got ZNODEEXISTS while inserting data, block ID is derived from checksum but checksum doesn't match", ErrorCodes::LOGICAL_ERROR);
                    }

                    transaction.rollback();
                }
                else if (zookeeper->exists(quorum_status_path))
                {
                    transaction.rollback();

                    throw Exception("Another quorum insert has been already started", ErrorCodes::UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE);
                }
                else
                {
                    /// The node with the quorum may have existed but was quickly removed.

                    throw Exception("Unexpected ZNODEEXISTS while adding block " + toString(part_number) + " with ID " + block_id + ": "
                        + zkutil::ZooKeeper::error2string(code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
                }
            }
            else
            {
                throw Exception("Unexpected error while adding block " + toString(part_number) + " with ID " + block_id + ": "
                    + zkutil::ZooKeeper::error2string(code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
            }
        }
        catch (const zkutil::KeeperException & e)
        {
            /** If the connection is lost and we do not know whether the changes were applied, we cannot delete the local part:
                *  if the changes were applied, the inserted block has appeared in `/blocks/` and cannot be inserted again.
                */
            if (e.code == ZOPERATIONTIMEOUT ||
                e.code == ZCONNECTIONLOSS)
            {
                transaction.commit();
                storage.enqueuePartForCheck(part->name, MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER);

                /// We do not know whether or not data has been inserted.
                throw Exception("Unknown status, client must retry. Reason: " + e.displayText(), ErrorCodes::UNKNOWN_STATUS_OF_INSERT);
            }

            throw;
        }

        if (quorum)
        {
            /// We are waiting for the quorum to be reached.
            LOG_TRACE(log, "Waiting for quorum");

            try
            {
                while (true)
                {
                    zkutil::EventPtr event = std::make_shared<Poco::Event>();

                    std::string value;
                    /// `get` instead of `exists` so that `watch` does not leak if the node is no longer there.
                    if (!zookeeper->tryGet(quorum_status_path, value, nullptr, event))
                        break;

                    ReplicatedMergeTreeQuorumEntry quorum_entry(value);

                    /// The node may have disappeared and then reappeared for the next insert.
                    if (quorum_entry.part_name != part_name)
                        break;

                    if (!event->tryWait(quorum_timeout_ms))
                        throw Exception("Timeout while waiting for quorum");
                }

                /// What if the current replica has meanwhile ceased to be active, and the quorum has been marked as failed and deleted?
                String value;
                if (!zookeeper->tryGet(storage.replica_path + "/is_active", value, nullptr)
                    || value != is_active_node_value)
                    throw Exception("Replica become inactive while waiting for quorum");
            }
            catch (...)
            {
                /// We do not know whether the data has been inserted:
                /// other replicas may have had time to download the part and mark the quorum as done.
                throw Exception("Unknown status, client must retry. Reason: " + getCurrentExceptionMessage(false),
                    ErrorCodes::UNKNOWN_STATUS_OF_INSERT);
            }

            LOG_TRACE(log, "Quorum satisfied");
        }
    }
}