Esempio n. 1
0
/// Executes the statement carried in the "query" form field and reports the outcome.
/// Exactly one boolean is written to `out` and the buffer is flushed with next():
/// true when executeQuery() returned normally, false when it threw (the exception is
/// logged here and not propagated).
/// Throws ABORTED when the service has already been cancelled.
/// `body` and `response` are not used by this implementation.
void Service::processQuery(const Poco::Net::HTMLForm & params, ReadBuffer & body, WriteBuffer & out, Poco::Net::HTTPServerResponse & response)
{
    if (is_cancelled)
        throw Exception{"RemoteQueryExecutor service terminated", ErrorCodes::ABORTED};

    std::string query_text = params.get("query");

    /// Stays false unless the query runs to completion.
    bool succeeded = false;
    try
    {
        (void) executeQuery(query_text, context, true);
        succeeded = true;
    }
    catch (...)
    {
        tryLogCurrentException(__PRETTY_FUNCTION__);
    }

    writeBinary(succeeded, out);
    out.next();
}
Esempio n. 2
0
/// Takes the last snapshot of this thread's performance counters and, if the query
/// settings ask for it, writes a record to the query thread log.
/// The work is done at most once: subsequent calls return immediately.
/// Failures while logging are reported through tryLogCurrentException and swallowed.
void ThreadStatus::finalizePerformanceCounters()
{
    if (performance_counters_finalized)
        return;

    performance_counters_finalized = true;
    updatePerformanceCounters();

    try
    {
        /// Settings are read from the query context, the log is obtained from the global one.
        const bool have_contexts = global_context && query_context;
        if (!have_contexts)
            return;

        auto & settings = query_context->getSettingsRef();
        const bool logging_enabled = settings.log_queries && settings.log_query_threads;
        if (!logging_enabled)
            return;

        if (auto thread_log = global_context->getQueryThreadLog())
            logToQueryThreadLog(*thread_log);
    }
    catch (...)
    {
        tryLogCurrentException(log);
    }
}
Esempio n. 3
0
void StorageDistributedDirectoryMonitor::run()
{
	setThreadName("DistrDirMonitor");

	std::unique_lock<std::mutex> lock{mutex};

	const auto quit_requested = [this] { return quit; };

	while (!quit_requested())
	{
		auto do_sleep = true;

		try
		{
			do_sleep = !findFiles();
		}
		catch (...)
		{
			do_sleep = true;
			++error_count;
			sleep_time = std::min(
				std::chrono::milliseconds{Int64(default_sleep_time.count() * std::exp2(error_count))},
				std::chrono::milliseconds{max_sleep_time});
			tryLogCurrentException(getLoggerName().data());
		};

		if (do_sleep)
			cond.wait_for(lock, sleep_time, quit_requested);

		const auto now = std::chrono::system_clock::now();
		if (now - last_decrease_time > decrease_error_count_period)
		{
			error_count /= 2;
			last_decrease_time = now;
		}
	}
}
/// Runs ALTER TABLE ... RESHARD [COPY] PARTITION on the remote tables of the cluster.
///
/// Steps, as visible in this function:
///  1. Refuse to run if the resharding worker is not started, or if the caller supplied
///     a coordinator (COORDINATE WITH is created internally for distributed tables).
///  2. Create a coordinator through the resharding worker.
///  3. Build an ASTAlterQuery for `remote_database`.`remote_table` carrying the partition
///     range, the weighted ZooKeeper paths, the sharding key, the copy flag and the new
///     coordinator id, and register its text with the worker.
///  4. Execute it through ClusterProxy and drain the resulting union stream.
///
/// On any exception the coordinator status is set to STATUS_ERROR (unless the stream
/// callback already did so), its state is dumped and logged, the coordinator is deleted,
/// and the exception is rethrown.
///
/// `query` and `database_name` are not used in this body.
/// NOTE(review): on the success path the coordinator is not deleted here - presumably it is
/// cleaned up elsewhere; confirm.
void StorageDistributed::reshardPartitions(ASTPtr query, const String & database_name,
	const Field & first_partition, const Field & last_partition,
	const WeightedZooKeeperPaths & weighted_zookeeper_paths,
	const ASTPtr & sharding_key_expr, bool do_copy, const Field & coordinator,
	const Settings & settings)
{
	auto & resharding_worker = context.getReshardingWorker();
	if (!resharding_worker.isStarted())
		throw Exception{"Resharding background thread is not running", ErrorCodes::RESHARDING_NO_WORKER};

	if (!coordinator.isNull())
		throw Exception{"Use of COORDINATE WITH is forbidden in ALTER TABLE ... RESHARD"
			" queries for distributed tables",
			ErrorCodes::RESHARDING_INVALID_PARAMETERS};

	std::string coordinator_id = resharding_worker.createCoordinator(cluster);

	/// Set by the stream's exception callback once it has reported STATUS_ERROR,
	/// so that handle_exception does not overwrite that status.
	std::atomic<bool> has_notified_error{false};

	/// Filled by handle_exception; logged by the catch blocks below.
	std::string dumped_coordinator_state;

	/// Common failure path: mark the coordinator as failed (if not already done), capture its
	/// state for the log and delete it. Errors raised here are only logged.
	auto handle_exception = [&](const std::string & msg = "")
	{
		try
		{
			if (!has_notified_error)
				resharding_worker.setStatus(coordinator_id, ReshardingWorker::STATUS_ERROR, msg);
			dumped_coordinator_state = resharding_worker.dumpCoordinatorState(coordinator_id);
			resharding_worker.deleteCoordinator(coordinator_id);
		}
		catch (...)
		{
			tryLogCurrentException(__PRETTY_FUNCTION__);
		}
	};

	try
	{
		/// Build the query ALTER TABLE ... RESHARD [COPY] PARTITION ... COORDINATE WITH ...

		ASTPtr alter_query_ptr = std::make_shared<ASTAlterQuery>();
		auto & alter_query = static_cast<ASTAlterQuery &>(*alter_query_ptr);

		alter_query.database = remote_database;
		alter_query.table = remote_table;

		alter_query.parameters.emplace_back();
		ASTAlterQuery::Parameters & parameters = alter_query.parameters.back();

		parameters.type = ASTAlterQuery::RESHARD_PARTITION;
		if (!first_partition.isNull())
			parameters.partition = std::make_shared<ASTLiteral>(StringRange(), first_partition);
		if (!last_partition.isNull())
			parameters.last_partition = std::make_shared<ASTLiteral>(StringRange(), last_partition);

		/// One ASTWeightedZooKeeperPath child per (path, weight) entry.
		ASTPtr expr_list = std::make_shared<ASTExpressionList>();
		for (const auto & entry : weighted_zookeeper_paths)
		{
			ASTPtr weighted_path_ptr = std::make_shared<ASTWeightedZooKeeperPath>();
			auto & weighted_path = static_cast<ASTWeightedZooKeeperPath &>(*weighted_path_ptr);
			weighted_path.path = entry.first;
			weighted_path.weight = entry.second;
			expr_list->children.push_back(weighted_path_ptr);
		}

		parameters.weighted_zookeeper_paths = expr_list;
		parameters.sharding_key_expr = sharding_key_expr;
		parameters.do_copy = do_copy;
		parameters.coordinator = std::make_shared<ASTLiteral>(StringRange(), Field(coordinator_id));

		resharding_worker.registerQuery(coordinator_id, queryToString(alter_query_ptr));

		/** The shard_multiplexing functionality is not finished - we turn it off.
		* (Because establishing connections to different shards within a single thread is not done in parallel.)
		* For details see https://███████████.yandex-team.ru/METR-18300
		*/
		bool enable_shard_multiplexing = false;

		ClusterProxy::AlterQueryConstructor alter_query_constructor;

		BlockInputStreams streams = ClusterProxy::Query{alter_query_constructor, cluster, alter_query_ptr,
			context, settings, enable_shard_multiplexing}.execute();

		/// This callback is called if an exception has occurred while attempting to read
		/// a block from a shard. This is to avoid a potential deadlock if other shards are
		/// waiting inside a barrier. Actually, even without this solution, we would avoid
		/// such a deadlock because we would eventually time out while trying to get remote
		/// blocks. Nevertheless this is not the ideal way of sorting out this issue since
		/// we would then not get to know the actual cause of the failure.
		auto exception_callback = [&resharding_worker, coordinator_id, &has_notified_error]()
		{
			try
			{
				resharding_worker.setStatus(coordinator_id, ReshardingWorker::STATUS_ERROR);
				has_notified_error = true;
			}
			catch (...)
			{
				tryLogCurrentException(__PRETTY_FUNCTION__);
			}
		};

		/// Replace the per-shard streams with a single union stream stored in slot 0.
		/// NOTE(review): streams[0] is accessed without checking that `streams` is non-empty - confirm
		/// that execute() always returns at least one stream.
		streams[0] = std::make_shared<UnionBlockInputStream<>>(
			streams, nullptr, settings.max_distributed_connections, exception_callback);
		streams.resize(1);

		auto stream_ptr = dynamic_cast<IProfilingBlockInputStream *>(&*streams[0]);
		if (stream_ptr == nullptr)
			throw Exception{"StorageDistributed: Internal error", ErrorCodes::LOGICAL_ERROR};
		auto & stream = *stream_ptr;

		stream.readPrefix();

		/// Drain the stream; the returned blocks themselves are discarded.
		while (!stream.isCancelled() && stream.read())
			;

		if (!stream.isCancelled())
			stream.readSuffix();
	}
	catch (const Exception & ex)
	{
		handle_exception(ex.message());
		LOG_ERROR(log, dumped_coordinator_state);
		throw;
	}
	catch (const std::exception & ex)
	{
		handle_exception(ex.what());
		LOG_ERROR(log, dumped_coordinator_state);
		throw;
	}
	catch (...)
	{
		handle_exception();
		LOG_ERROR(log, dumped_coordinator_state);
		throw;
	}
}
Esempio n. 5
0
/// Recomputes all asynchronous metrics and publishes each one through set():
///  - size and element count of the mark cache and the uncompressed cache, when present;
///  - replication queue sizes (max and sum over all replicated tables), maximum replica
///    delays, and the maximum part count reported by getMaxPartsCountForMonth();
///  - a fixed list of tcmalloc numeric properties when built with USE_TCMALLOC.
void AsynchronousMetrics::update()
{
    if (auto mark_cache = context.getMarkCache())
    {
        set("MarkCacheBytes", mark_cache->weight());
        set("MarkCacheFiles", mark_cache->count());
    }

    if (auto uncompressed_cache = context.getUncompressedCache())
    {
        set("UncompressedCacheBytes", uncompressed_cache->weight());
        set("UncompressedCacheCells", uncompressed_cache->count());
    }

    {
        auto databases = context.getDatabases();

        /// Replication queue statistics: maximum over tables ...
        size_t queue_size_max = 0;
        size_t inserts_in_queue_max = 0;
        size_t merges_in_queue_max = 0;

        /// ... and the totals over all tables.
        size_t queue_size_sum = 0;
        size_t inserts_in_queue_sum = 0;
        size_t merges_in_queue_sum = 0;

        size_t absolute_delay_max = 0;
        size_t relative_delay_max = 0;

        size_t part_count_max = 0;

        for (const auto & db : databases)
        {
            for (auto iterator = db.second->getIterator(); iterator->isValid(); iterator->next())
            {
                auto & table = iterator->table();

                if (auto * replicated = typeid_cast<StorageReplicatedMergeTree *>(table.get()))
                {
                    StorageReplicatedMergeTree::Status status;
                    replicated->getStatus(status, false);

                    calculateMaxAndSum(queue_size_max, queue_size_sum, status.queue.queue_size);
                    calculateMaxAndSum(inserts_in_queue_max, inserts_in_queue_sum, status.queue.inserts_in_queue);
                    calculateMaxAndSum(merges_in_queue_max, merges_in_queue_sum, status.queue.merges_in_queue);

                    /// A failure to obtain delays for one table must not abort the whole update.
                    try
                    {
                        time_t absolute_delay = 0;
                        time_t relative_delay = 0;
                        replicated->getReplicaDelays(absolute_delay, relative_delay);

                        calculateMax(absolute_delay_max, absolute_delay);
                        calculateMax(relative_delay_max, relative_delay);
                    }
                    catch (...)
                    {
                        tryLogCurrentException(__PRETTY_FUNCTION__,
                            "Cannot get replica delay for table: " + backQuoteIfNeed(db.first) + "." + backQuoteIfNeed(iterator->name()));
                    }

                    calculateMax(part_count_max, replicated->getData().getMaxPartsCountForMonth());
                }

                if (auto * merge_tree = typeid_cast<StorageMergeTree *>(table.get()))
                    calculateMax(part_count_max, merge_tree->getData().getMaxPartsCountForMonth());
            }
        }

        set("ReplicasMaxQueueSize", queue_size_max);
        set("ReplicasMaxInsertsInQueue", inserts_in_queue_max);
        set("ReplicasMaxMergesInQueue", merges_in_queue_max);

        set("ReplicasSumQueueSize", queue_size_sum);
        set("ReplicasSumInsertsInQueue", inserts_in_queue_sum);
        set("ReplicasSumMergesInQueue", merges_in_queue_sum);

        set("ReplicasMaxAbsoluteDelay", absolute_delay_max);
        set("ReplicasMaxRelativeDelay", relative_delay_max);

        set("MaxPartCountForPartition", part_count_max);
    }

#if USE_TCMALLOC
    {
        /// tcmalloc related metrics. Remove if you switch to different allocator.

        MallocExtension & malloc_extension = *MallocExtension::instance();

        /// Each property is published under its own tcmalloc name.
        const char * const tcmalloc_properties[] =
        {
            "generic.current_allocated_bytes",
            "generic.heap_size",
            "tcmalloc.current_total_thread_cache_bytes",
            "tcmalloc.central_cache_free_bytes",
            "tcmalloc.transfer_cache_free_bytes",
            "tcmalloc.thread_cache_free_bytes",
            "tcmalloc.pageheap_free_bytes",
            "tcmalloc.pageheap_unmapped_bytes",
        };

        for (const char * property : tcmalloc_properties)
        {
            size_t value = 0;
            if (malloc_extension.GetNumericProperty(property, &value))
                set(property, value);
        }
    }
#endif

    /// Add more metrics as you wish.
}
Esempio n. 6
0
/// Returns the compiled shared library for `key`, or counts one more request for it.
///
/// Under `mutex`, the per-key counter is incremented and then, in order:
///  - if the key is already in `libraries`, its entry is returned (nullptr while it is
///    compiling or after a failed compilation);
///  - if a file for the key is listed in `files`, the .so is loaded, cached and returned;
///  - if the counter has reached `min_count_to_compile`, compilation is started:
///    synchronously (with the mutex released) when `min_count_to_compile` is zero,
///    otherwise on the thread pool if it has a free thread.
/// Returns nullptr whenever no ready library is available at the time of the call.
SharedLibraryPtr Compiler::getOrCount(
	const std::string & key,
	UInt32 min_count_to_compile,
	const std::string & additional_compiler_flags,
	CodeGenerator get_code,
	ReadyCallback on_ready)
{
	HashedKey hashed_key = getHash(key);

	std::lock_guard<std::mutex> lock(mutex);

	UInt32 count = ++counts[hashed_key];

	/// Is there a ready open library? Or, if the library is in the process of compiling, there will be nullptr.
	Libraries::iterator it = libraries.find(hashed_key);
	if (libraries.end() != it)
	{
		if (!it->second)
			LOG_INFO(log, "Library " << hashedKeyToFileName(hashed_key) << " is already compiling or compilation was failed.");

		/// TODO In this case, after the compilation is finished, the callback will not be called.

		return it->second;
	}

	/// Is there a file with the library left over from the previous launch?
	std::string file_name = hashedKeyToFileName(hashed_key);
	if (files.count(file_name))
	{
		std::string so_file_path = path + '/' + file_name + ".so";
		LOG_INFO(log, "Loading existing library " << so_file_path);

		SharedLibraryPtr lib(new SharedLibrary(so_file_path));
		libraries[hashed_key] = lib;
		return lib;
	}

	/// Has min_count_to_compile been reached?
	if (count >= min_count_to_compile)
	{
		/// The min_count_to_compile value of zero indicates the need for synchronous compilation.

		/// Are there any free threads?
		if (min_count_to_compile == 0 || pool.active() < pool.size())
		{
			/// Indicates that the library is in the process of compiling.
			libraries[hashed_key] = nullptr;

			LOG_INFO(log, "Compiling code " << file_name << ", key: " << key);

			if (min_count_to_compile == 0)
			{
				{
					/// Release the mutex for the duration of the synchronous compilation.
					ext::unlock_guard<std::mutex> unlock(mutex);
					compile(hashed_key, file_name, additional_compiler_flags, get_code, on_ready);
				}

				/// Re-read the entry under the lock: it still holds nullptr unless compile() replaced it.
				return libraries[hashed_key];
			}
			else
			{
				/// The lambda captures everything by copy, so it does not refer to this call's locals.
				pool.schedule([=]
				{
					try
					{
						compile(hashed_key, file_name, additional_compiler_flags, get_code, on_ready);
					}
					catch (...)
					{
						tryLogCurrentException("Compiler");
					}
				});
			}
		}
		else
			LOG_INFO(log, "All threads are busy.");
	}

	return nullptr;
}
/// HTTP handler reporting the replication delay of every replicated table.
/// Replies "Ok.\n" when no table reaches the configured `min_absolute_delay_to_close` /
/// `min_relative_delay_to_close` thresholds and the request does not carry verbose=1;
/// otherwise replies with one line per replicated table listing its delays.
/// Any exception is logged and, if nothing has been sent yet, returned as an HTTP 500 body.
void ReplicasStatusHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response)
{
	try
	{
		HTMLForm params(request);

		/// Print detailed delay information even when the delay is small.
		bool verbose = params.get("verbose", "") == "1";

		const MergeTreeSettings & settings = context.getMergeTreeSettings();

		bool ok = true;
		std::stringstream report;

		auto databases = context.getDatabases();

		/// Walk over all replicated tables.
		for (const auto & db : databases)
		{
			for (auto iterator = db.second->getIterator(); iterator->isValid(); iterator->next())
			{
				auto & table = iterator->table();
				auto * replicated = typeid_cast<StorageReplicatedMergeTree *>(table.get());
				if (!replicated)
					continue;

				time_t absolute_delay = 0;
				time_t relative_delay = 0;
				replicated->getReplicaDelays(absolute_delay, relative_delay);

				/// A threshold equal to zero disables the corresponding check.
				const bool absolute_exceeded = settings.min_absolute_delay_to_close
					&& absolute_delay >= static_cast<time_t>(settings.min_absolute_delay_to_close);
				const bool relative_exceeded = settings.min_relative_delay_to_close
					&& relative_delay >= static_cast<time_t>(settings.min_relative_delay_to_close);

				if (absolute_exceeded || relative_exceeded)
					ok = false;

				report << backQuoteIfNeed(db.first) << "." << backQuoteIfNeed(iterator->name())
					<< ":\tAbsolute delay: " << absolute_delay << ". Relative delay: " << relative_delay << ".\n";
			}
		}

		setResponseDefaultHeaders(response);

		if (!ok || verbose)
		{
			response.send() << report.rdbuf();
		}
		else
		{
			const char * data = "Ok.\n";
			response.sendBuffer(data, strlen(data));
		}
	}
	catch (...)
	{
		tryLogCurrentException("ReplicasStatusHandler");

		try
		{
			response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR);

			if (!response.sent())
			{
				/// Nothing has been sent yet, and we do not even know whether the response should be compressed.
				response.send() << getCurrentExceptionMessage(false) << std::endl;
			}
		}
		catch (...)
		{
			LOG_ERROR((&Logger::get("ReplicasStatusHandler")), "Cannot send exception to client");
		}
	}
}
Esempio n. 8
0
/// Handles one HTTP request: runs `query` over the ODBC connection described by
/// `connection_string` and streams the result back in the requested `format`
/// (default "RowBinary").
///
/// Required form parameters: `query`, `columns`, `connection_string`.
/// Optional: `max_block_size` (defaults to DEFAULT_BLOCK_SIZE), `format`.
///
/// Parameter validation failures are answered with HTTP 500 and a text message through
/// process_error. Failures during query execution set status 500 and write the exception
/// message into the already-open output buffer.
void ODBCHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response)
{
    Poco::Net::HTMLForm params(request, request.stream());
    LOG_TRACE(log, "Request URI: " + request.getURI());

    /// Reports a request-level failure: status 500, `message` as the body if nothing was sent yet,
    /// and a warning in the log.
    auto process_error = [&response, this](const std::string & message)
    {
        response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR);
        if (!response.sent())
            response.send() << message << std::endl;
        LOG_WARNING(log, message);
    };

    if (!params.has("query"))
    {
        process_error("No 'query' in request body");
        return;
    }

    if (!params.has("columns"))
    {
        process_error("No 'columns' in request URL");
        return;
    }

    if (!params.has("connection_string"))
    {
        process_error("No 'connection_string' in request URL");
        return;
    }

    UInt64 max_block_size = DEFAULT_BLOCK_SIZE;
    if (params.has("max_block_size"))
    {
        std::string max_block_size_str = params.get("max_block_size", "");
        if (max_block_size_str.empty())
        {
            process_error("Empty max_block_size specified");
            return;
        }

        /// A malformed value must be reported like any other bad parameter
        /// instead of escaping the handler as an exception.
        try
        {
            max_block_size = parse<size_t>(max_block_size_str);
        }
        catch (const Exception & ex)
        {
            process_error("Invalid 'max_block_size' parameter in request '" + ex.message() + "'");
            return;
        }
    }

    std::string columns = params.get("columns");
    std::unique_ptr<Block> sample_block;
    try
    {
        sample_block = parseColumns(std::move(columns));
    }
    catch (const Exception & ex)
    {
        process_error("Invalid 'columns' parameter in request body '" + ex.message() + "'");
        LOG_WARNING(log, ex.getStackTrace().toString());
        return;
    }

    std::string format = params.get("format", "RowBinary");
    std::string query = params.get("query");
    LOG_TRACE(log, "Query: " << query);

    /// NOTE(review): the connection string is written to the trace log verbatim; it may contain credentials.
    std::string connection_string = params.get("connection_string");
    LOG_TRACE(log, "Connection string: '" << connection_string << "'");

    WriteBufferFromHTTPServerResponse out(request, response, keep_alive_timeout);
    try
    {
        BlockOutputStreamPtr writer = FormatFactory::instance().getOutput(format, out, *sample_block, *context);
        auto pool = getPool(connection_string);
        ODBCBlockInputStream inp(pool->get(), query, *sample_block, max_block_size);
        copyData(inp, *writer);
    }
    catch (...)
    {
        auto message = getCurrentExceptionMessage(true);
        response.setStatusAndReason(
            Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR); // can't call process_error, because the response may already have started being sent through `out`
        writeStringBinary(message, out);
        tryLogCurrentException(log);
    }
}
Esempio n. 9
0
void ConfigReloader::reloadIfNewer(bool force, bool throw_on_error, bool fallback_to_preprocessed)
{
    std::lock_guard lock(reload_mutex);

    FilesChangesTracker new_files = getNewFileList();
    if (force || need_reload_from_zk || new_files.isDifferOrNewerThan(files))
    {
        ConfigProcessor config_processor(path);
        ConfigProcessor::LoadedConfig loaded_config;
        try
        {
            LOG_DEBUG(log, "Loading config `" << path << "'");

            loaded_config = config_processor.loadConfig(/* allow_zk_includes = */ true);
            if (loaded_config.has_zk_includes)
                loaded_config = config_processor.loadConfigWithZooKeeperIncludes(
                    zk_node_cache, zk_changed_event, fallback_to_preprocessed);
        }
        catch (const Coordination::Exception & e)
        {
            if (Coordination::isHardwareError(e.code))
                need_reload_from_zk = true;

            if (throw_on_error)
                throw;

            tryLogCurrentException(log, "ZooKeeper error when loading config from `" + path + "'");
            return;
        }
        catch (...)
        {
            if (throw_on_error)
                throw;

            tryLogCurrentException(log, "Error loading config from `" + path + "'");
            return;
        }
        config_processor.savePreprocessedConfig(loaded_config, preprocessed_dir);

        /** We should remember last modification time if and only if config was sucessfully loaded
         * Otherwise a race condition could occur during config files update:
         *  File is contain raw (and non-valid) data, therefore config is not applied.
         *  When file has been written (and contain valid data), we don't load new data since modification time remains the same.
         */
        if (!loaded_config.loaded_from_preprocessed)
        {
            files = std::move(new_files);
            need_reload_from_zk = false;
        }

        try
        {
            updater(loaded_config.configuration);
        }
        catch (...)
        {
            if (throw_on_error)
                throw;
            tryLogCurrentException(log, "Error updating configuration from `" + path + "' config.");
        }
    }
}
/// Body of one worker thread of the background processing pool.
/// Until `shutdown` is set, the thread repeatedly picks the first non-removed task from
/// `tasks` (entries are (time, handle) pairs), waits until that time if it lies in the
/// future, runs the task's function and re-inserts the task with its next execution time:
/// immediately if the function reported that it did work, after `sleep_seconds` otherwise.
/// Exceptions thrown while selecting or running a task are logged and do not stop the thread.
void BackgroundProcessingPool::threadFunction()
{
	setThreadName("BackgrProcPool");

	/// Per-thread memory tracker, installed for the lifetime of this function.
	MemoryTracker memory_tracker;
	memory_tracker.setMetric(CurrentMetrics::MemoryTrackingInBackgroundProcessingPool);
	current_memory_tracker = &memory_tracker;

	/// The generator is seeded with its own address; the initial random sleep presumably
	/// spreads the start of the pool's threads over time - TODO confirm intent.
	std::mt19937 rng(reinterpret_cast<intptr_t>(&rng));
	std::this_thread::sleep_for(std::chrono::duration<double>(std::uniform_real_distribution<double>(0, sleep_seconds_random_part)(rng)));

	while (!shutdown)
	{
		bool done_work = false;
		TaskHandle task;

		try
		{
			Poco::Timestamp min_time;

			{
				std::unique_lock<std::mutex> lock(tasks_mutex);

				/// Take the first task in container order that has not been removed.
				if (!tasks.empty())
				{
					for (const auto & time_handle : tasks)
					{
						if (!time_handle.second->removed)
						{
							min_time = time_handle.first;
							task = time_handle.second;
							break;
						}
					}
				}
			}

			if (shutdown)
				break;

			/// Nothing to run: wait for a wake-up or for the timeout, then look again.
			if (!task)
			{
				std::unique_lock<std::mutex> lock(tasks_mutex);
				wake_event.wait_for(lock,
					std::chrono::duration<double>(sleep_seconds
						+ std::uniform_real_distribution<double>(0, sleep_seconds_random_part)(rng)));
				continue;
			}

			/// No tasks ready for execution.
			Poco::Timestamp current_time;
			if (min_time > current_time)
			{
				/// Wait until the task's time plus a random addition, or until woken up.
				/// The task is executed after the wait either way.
				std::unique_lock<std::mutex> lock(tasks_mutex);
				wake_event.wait_for(lock, std::chrono::microseconds(
					min_time - current_time + std::uniform_int_distribution<uint64_t>(0, sleep_seconds_random_part * 1000000)(rng)));
			}

			/// Hold the task's lock in read mode while its function runs.
			Poco::ScopedReadRWLock rlock(task->rwlock);

			if (task->removed)
				continue;

			{
				CurrentMetrics::Increment metric_increment{CurrentMetrics::BackgroundPoolTask};
				done_work = task->function();
			}
		}
		catch (...)
		{
			tryLogCurrentException(__PRETTY_FUNCTION__);
		}

		if (shutdown)
			break;

		/// If task has done work, it could be executed again immediately.
		/// If not, add delay before next run.
		Poco::Timestamp next_time_to_execute = Poco::Timestamp() + (done_work ? 0 : sleep_seconds * 1000000);

		{
			std::unique_lock<std::mutex> lock(tasks_mutex);

			/// NOTE(review): if an exception was thrown above before `task` was assigned, `task` would
			/// presumably be empty here and this dereference would be invalid - confirm whether that can happen.
			if (task->removed)
				continue;

			/// Move the task to its new position, keyed by the next execution time.
			tasks.erase(task->iterator);
			task->iterator = tasks.emplace(next_time_to_execute, task);
		}
	}

	current_memory_tracker = nullptr;
}
Esempio n. 11
0
/// Returns the compiled shared library for `key`, or registers one more request for it.
///
/// With `mutex` held, the request counter of the key is incremented and then:
///  - an existing entry in `libraries` is returned as is (nullptr while the library is
///    compiling or after a failed compilation);
///  - a library file listed in `files` is loaded, cached and returned;
///  - once the counter reaches `min_count_to_compile`, compilation is started, either
///    synchronously with the mutex released (`min_count_to_compile == 0`) or on the thread
///    pool when it has a free thread.
/// nullptr is returned whenever no ready library exists at the time of the call.
SharedLibraryPtr Compiler::getOrCount(
    const std::string & key,
    UInt32 min_count_to_compile,
    const std::string & additional_compiler_flags,
    CodeGenerator get_code,
    ReadyCallback on_ready)
{
    HashedKey key_hash = getHash(key);

    std::lock_guard<std::mutex> lock(mutex);

    UInt32 times_requested = ++counts[key_hash];

    /// A known key maps either to an open library or to nullptr (compilation in progress or failed).
    const auto known = libraries.find(key_hash);
    if (known != libraries.end())
    {
        if (!known->second)
            LOG_INFO(log, "Library " << hashedKeyToFileName(key_hash) << " is already compiling or compilation was failed.");

        /// TODO In this case, after the compilation is finished, the callback will not be called.

        return known->second;
    }

    /// A library file may have been left over from a previous launch.
    std::string file_name = hashedKeyToFileName(key_hash);
    if (files.count(file_name))
    {
        std::string so_file_path = path + '/' + file_name + ".so";
        LOG_INFO(log, "Loading existing library " << so_file_path);

        SharedLibraryPtr lib(new SharedLibrary(so_file_path));
        libraries[key_hash] = lib;
        return lib;
    }

    /// Not requested often enough yet.
    if (times_requested < min_count_to_compile)
        return nullptr;

    /// A min_count_to_compile of zero requests synchronous compilation, which needs no pool thread.
    const bool synchronous = (min_count_to_compile == 0);
    if (!synchronous && !(pool.active() < pool.size()))
    {
        LOG_INFO(log, "All threads are busy.");
        return nullptr;
    }

    /// nullptr marks the library as being compiled.
    libraries[key_hash] = nullptr;

    LOG_INFO(log, "Compiling code " << file_name << ", key: " << key);

    if (synchronous)
    {
        {
            /// The mutex is released while the compilation runs.
            ext::unlock_guard<std::mutex> unlock(mutex);
            compile(key_hash, file_name, additional_compiler_flags, get_code, on_ready);
        }

        return libraries[key_hash];
    }

    /// Everything is captured by copy: the task may run after this call has returned.
    pool.schedule([=]
    {
        try
        {
            compile(key_hash, file_name, additional_compiler_flags, get_code, on_ready);
        }
        catch (...)
        {
            tryLogCurrentException("Compiler");
        }
    });

    return nullptr;
}
void ReplicatedMergeTreePartCheckThread::checkPart(const String & part_name)
{
    LOG_WARNING(log, "Checking part " << part_name);
    ProfileEvents::increment(ProfileEvents::ReplicatedPartChecks);

    /// If the part is still in the PreCommitted -> Committed transition, it is not lost
    /// and there is no need to go searching for it on other replicas. To definitely find the needed part
    /// if it exists (or a part containing it) we first search among the PreCommitted parts.
    auto part = storage.data.getPartIfExists(part_name, {MergeTreeDataPartState::PreCommitted});
    if (!part)
        part = storage.data.getActiveContainingPart(part_name);

    /// We do not have this or a covering part.
    if (!part)
    {
        searchForMissingPart(part_name);
    }
    /// We have this part, and it's active. We will check whether we need this part and whether it has the right data.
    else if (part->name == part_name)
    {
        auto zookeeper = storage.getZooKeeper();
        auto table_lock = storage.lockStructure(false, __PRETTY_FUNCTION__);

        /// If the part is in ZooKeeper, check its data with its checksums, and them with ZooKeeper.
        if (zookeeper->exists(storage.replica_path + "/parts/" + part_name))
        {
            LOG_WARNING(log, "Checking data of part " << part_name << ".");

            try
            {
                auto zk_checksums = MinimalisticDataPartChecksums::deserializeFrom(
                    zookeeper->get(storage.replica_path + "/parts/" + part_name + "/checksums"));
                zk_checksums.checkEqual(part->checksums, true);

                auto zk_columns = NamesAndTypesList::parse(
                    zookeeper->get(storage.replica_path + "/parts/" + part_name + "/columns"));
                if (part->columns != zk_columns)
                    throw Exception("Columns of local part " + part_name + " are different from ZooKeeper");

                checkDataPart(
                    storage.data.getFullPath() + part_name,
                    storage.data.index_granularity,
                    true,
                    storage.data.primary_key_data_types,
                    [this] { return need_stop.load(); });

                if (need_stop)
                {
                    LOG_INFO(log, "Checking part was cancelled.");
                    return;
                }

                LOG_INFO(log, "Part " << part_name << " looks good.");
            }
            catch (const Exception & e)
            {
                /// TODO Better to check error code.

                tryLogCurrentException(log, __PRETTY_FUNCTION__);

                LOG_ERROR(log, "Part " << part_name << " looks broken. Removing it and queueing a fetch.");
                ProfileEvents::increment(ProfileEvents::ReplicatedPartChecksFailed);

                storage.removePartAndEnqueueFetch(part_name);

                /// Delete part locally.
                storage.data.forgetPartAndMoveToDetached(part, "broken_");
            }
        }
        else if (part->modification_time + MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER < time(nullptr))
        {
            /// If the part is not in ZooKeeper, delete it locally.
            /// Probably, someone just wrote down the part, and has not yet added to ZK.
            /// Therefore, delete only if the part is old (not very reliable).
            ProfileEvents::increment(ProfileEvents::ReplicatedPartChecksFailed);

            LOG_ERROR(log, "Unexpected part " << part_name << " in filesystem. Removing.");
            storage.data.forgetPartAndMoveToDetached(part, "unexpected_");
        }
        else
        {
            /// TODO You need to make sure that the part is still checked after a while.
            /// Otherwise, it's possible that the part was not added to ZK,
            ///  but remained in the filesystem and in a number of active parts.
            /// And then for a long time (before restarting), the data on the replicas will be different.

            LOG_TRACE(log, "Young part " << part_name
                << " with age " << (time(nullptr) - part->modification_time)
                << " seconds hasn't been added to ZooKeeper yet. It's ok.");
        }
    }
    else
    {
        /// If we have a covering part, ignore all the problems with this part.
        /// In the worst case, errors will still appear `old_parts_lifetime` seconds in error log until the part is removed as the old one.
        LOG_WARNING(log, "We have part " << part->name << " covering part " << part_name);
    }
}
void ReplicatedMergeTreeAlterThread::run()
{
    try
    {
        /** We have a description of columns in ZooKeeper, common for all replicas (Example: /clickhouse/tables/02-06/visits/columns),
          *  as well as a description of columns in local file with metadata (storage.data.getColumnsList()).
          *
          * If these descriptions are different - you need to do ALTER.
          *
          * If stored version of the node (columns_version) differs from the version in ZK,
          *  then the description of the columns in ZK does not necessarily differ from the local
          *  - this can happen with a loop from ALTER-s, which as a whole, does not change anything.
          * In this case, you need to update the stored version number,
          *  and also check the structure of parts, and, if necessary, make ALTER.
          *
          * Recorded version number needs to be updated after updating the metadata, under lock.
          * This version number is checked against the current one for INSERT.
          * That is, we make sure to insert blocks with the correct structure.
          *
          * When the server starts, previous ALTER might not have been completed.
          * Therefore, for the first time, regardless of the changes, we check the structure of all parts,
          *  (Example: /clickhouse/tables/02-06/visits/replicas/example02-06-1.yandex.ru/parts/20140806_20140831_131664_134988_3296/columns)
          *  and do ALTER if necessary.
          *
          * TODO: Too complicated, rewrite everything.
          */

        auto zookeeper = storage.getZooKeeper();

        String columns_path = storage.zookeeper_path + "/columns";
        auto columns_znode = zk_node_cache.get(columns_path, task->getWatchCallback());
        if (!columns_znode.exists)
            throw Exception(columns_path + " doesn't exist", ErrorCodes::NOT_FOUND_NODE);
        int32_t columns_version = columns_znode.stat.version;

        String metadata_path = storage.zookeeper_path + "/metadata";
        auto metadata_znode = zk_node_cache.get(metadata_path, task->getWatchCallback());
        if (!metadata_znode.exists)
            throw Exception(metadata_path + " doesn't exist", ErrorCodes::NOT_FOUND_NODE);
        int32_t metadata_version = metadata_znode.stat.version;

        const bool changed_columns_version = (columns_version != storage.columns_version);
        const bool changed_metadata_version = (metadata_version != storage.metadata_version);

        if (!(changed_columns_version || changed_metadata_version || force_recheck_parts))
            return;

        const String & columns_str = columns_znode.contents;
        auto columns_in_zk = ColumnsDescription::parse(columns_str);

        const String & metadata_str = metadata_znode.contents;
        auto metadata_in_zk = ReplicatedMergeTreeTableMetadata::parse(metadata_str);
        auto metadata_diff = ReplicatedMergeTreeTableMetadata(storage.data).checkAndFindDiff(metadata_in_zk, /* allow_alter = */ true);

        /// If you need to lock table structure, then suspend merges.
        ActionLock merge_blocker = storage.merger_mutator.actions_blocker.cancel();

        MergeTreeData::DataParts parts;

        /// If metadata nodes have changed, we will update table structure locally.
        if (changed_columns_version || changed_metadata_version)
        {
            /// Temporarily cancel part checks to avoid locking for long time.
            auto temporarily_stop_part_checks = storage.part_check_thread.temporarilyStop();

            /// Temporarily cancel parts sending
            ActionLock data_parts_exchange_blocker;
            if (storage.data_parts_exchange_endpoint_holder)
                data_parts_exchange_blocker = storage.data_parts_exchange_endpoint_holder->getBlocker().cancel();

            /// Temporarily cancel part fetches
            auto fetches_blocker = storage.fetcher.blocker.cancel();

            LOG_INFO(log, "Version of metadata nodes in ZooKeeper changed. Waiting for structure write lock.");

            auto table_lock = storage.lockExclusively(RWLockImpl::NO_QUERY);

            if (columns_in_zk == storage.getColumns() && metadata_diff.empty())
            {
                LOG_INFO(log, "Metadata nodes changed in ZooKeeper, but their contents didn't change. "
                    "Most probably it is a cyclic ALTER.");
            }
            else
            {
                LOG_INFO(log, "Metadata changed in ZooKeeper. Applying changes locally.");

                storage.setTableStructure(std::move(columns_in_zk), metadata_diff);

                LOG_INFO(log, "Applied changes to the metadata of the table.");
            }

            /// You need to get a list of parts under table lock to avoid race condition with merge.
            parts = storage.data.getDataParts();

            storage.columns_version = columns_version;
            storage.metadata_version = metadata_version;
        }

        /// Update parts.
        if (changed_columns_version || force_recheck_parts)
        {
            auto table_lock = storage.lockStructureForShare(false, RWLockImpl::NO_QUERY);

            if (changed_columns_version)
                LOG_INFO(log, "ALTER-ing parts");

            int changed_parts = 0;

            if (!changed_columns_version)
                parts = storage.data.getDataParts();

            const auto columns_for_parts = storage.getColumns().getAllPhysical();
            const auto indices_for_parts = storage.getIndicesDescription();

            for (const MergeTreeData::DataPartPtr & part : parts)
            {
                /// Update the part and write result to temporary files.
                /// TODO: You can skip checking for too large changes if ZooKeeper has, for example,
                /// node /flags/force_alter.
                auto transaction = storage.data.alterDataPart(part, columns_for_parts, indices_for_parts.indices, false);
                if (!transaction)
                    continue;

                storage.updatePartHeaderInZooKeeperAndCommit(zookeeper, *transaction);

                ++changed_parts;
            }

            /// Columns sizes could be quietly changed in case of MODIFY/ADD COLUMN
            storage.data.recalculateColumnSizes();

            if (changed_columns_version)
            {
                if (changed_parts != 0)
                    LOG_INFO(log, "ALTER-ed " << changed_parts << " parts");
                else
                    LOG_INFO(log, "No parts ALTER-ed");
            }
        }

        /// Update metadata ZK nodes for a specific replica.
        if (changed_columns_version || force_recheck_parts)
            zookeeper->set(storage.replica_path + "/columns", columns_str);
        if (changed_metadata_version || force_recheck_parts)
            zookeeper->set(storage.replica_path + "/metadata", metadata_str);

        force_recheck_parts = false;
    }
    catch (const Coordination::Exception & e)
    {
        tryLogCurrentException(log, __PRETTY_FUNCTION__);

        if (e.code == Coordination::ZSESSIONEXPIRED)
            return;

        force_recheck_parts = true;
        task->scheduleAfter(ALTER_ERROR_SLEEP_MS);
    }
    catch (...)
    {
        tryLogCurrentException(log, __PRETTY_FUNCTION__);

        force_recheck_parts = true;
        task->scheduleAfter(ALTER_ERROR_SLEEP_MS);
    }
}
Esempio n. 14
0
void ExternalLoader::reloadAndUpdate(bool throw_on_error)
{
    reloadFromConfigFiles(throw_on_error);

    /// list of recreated loadable objects to perform delayed removal from unordered_map
    std::list<std::string> recreated_failed_loadable_objects;

    std::unique_lock<std::mutex> all_lock(all_mutex);

    /// retry loading failed loadable objects
    for (auto & failed_loadable_object : failed_loadable_objects)
    {
        if (std::chrono::system_clock::now() < failed_loadable_object.second.next_attempt_time)
            continue;

        const auto & name = failed_loadable_object.first;

        try
        {
            auto loadable_ptr = failed_loadable_object.second.loadable->clone();
            if (const auto exception_ptr = loadable_ptr->getCreationException())
            {
                /// recalculate next attempt time
                std::uniform_int_distribution<UInt64> distribution(
                        0, static_cast<UInt64>(std::exp2(failed_loadable_object.second.error_count)));

                std::chrono::seconds delay(std::min<UInt64>(
                        update_settings.backoff_max_sec,
                        update_settings.backoff_initial_sec + distribution(rnd_engine)));
                failed_loadable_object.second.next_attempt_time = std::chrono::system_clock::now() + delay;

                ++failed_loadable_object.second.error_count;

                std::rethrow_exception(exception_ptr);
            }
            else
            {
                const std::lock_guard<std::mutex> lock{map_mutex};

                const auto & lifetime = loadable_ptr->getLifetime();
                std::uniform_int_distribution<UInt64> distribution{lifetime.min_sec, lifetime.max_sec};
                update_times[name] = std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)};

                const auto dict_it = loadable_objects.find(name);

                dict_it->second.loadable.reset();
                dict_it->second.loadable = std::move(loadable_ptr);

                /// clear stored exception on success
                dict_it->second.exception = std::exception_ptr{};

                recreated_failed_loadable_objects.push_back(name);
            }
        }
        catch (...)
        {
            tryLogCurrentException(log, "Failed reloading '" + name + "' " + object_name);

            if (throw_on_error)
                throw;
        }
    }

    /// do not undertake further attempts to recreate these loadable objects
    for (const auto & name : recreated_failed_loadable_objects)
        failed_loadable_objects.erase(name);

    /// periodic update
    for (auto & loadable_object : loadable_objects)
    {
        const auto & name = loadable_object.first;

        try
        {
            /// If the loadable objects failed to load or even failed to initialize from the config.
            if (!loadable_object.second.loadable)
                continue;

            auto current = loadable_object.second.loadable;
            const auto & lifetime = current->getLifetime();

            /// do not update loadable objects with zero as lifetime
            if (lifetime.min_sec == 0 || lifetime.max_sec == 0)
                continue;

            if (current->supportUpdates())
            {
                auto & update_time = update_times[current->getName()];

                /// check that timeout has passed
                if (std::chrono::system_clock::now() < update_time)
                    continue;

                SCOPE_EXIT({
                        /// calculate next update time
                        std::uniform_int_distribution<UInt64> distribution{lifetime.min_sec, lifetime.max_sec};
                        update_time = std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)};
                           });

                /// check source modified
                if (current->isModified())
                {
                    /// create new version of loadable object
                    auto new_version = current->clone();

                    if (const auto exception_ptr = new_version->getCreationException())
                        std::rethrow_exception(exception_ptr);

                    loadable_object.second.loadable.reset();
                    loadable_object.second.loadable = std::move(new_version);
                }
            }

            /// erase stored exception on success
            loadable_object.second.exception = std::exception_ptr{};
        }
Esempio n. 15
0
/// Overload taking a logger name: obtains the logger via Logger::get(log_name)
/// and forwards to the overload that accepts a logger pointer.
void tryLogCurrentException(const char * log_name, const std::string & start_of_message)
{
    auto * named_logger = &Logger::get(log_name);
    tryLogCurrentException(named_logger, start_of_message);
}
Esempio n. 16
0
void ExternalDictionaries::reloadImpl(const bool throw_on_error)
{
    const auto config_paths = getDictionariesConfigPaths(Poco::Util::Application::instance().config());

    for (const auto & config_path : config_paths)
    {
        try
        {
            reloadFromFile(config_path, throw_on_error);
        }
        catch (...)
        {
            tryLogCurrentException(log, "reloadFromFile has thrown while reading from " + config_path);

            if (throw_on_error)
                throw;
        }
    }

    /// list of recreated dictionaries to perform delayed removal from unordered_map
    std::list<std::string> recreated_failed_dictionaries;

    /// retry loading failed dictionaries
    for (auto & failed_dictionary : failed_dictionaries)
    {
        if (std::chrono::system_clock::now() < failed_dictionary.second.next_attempt_time)
            continue;

        const auto & name = failed_dictionary.first;

        try
        {
            auto dict_ptr = failed_dictionary.second.dict->clone();
            if (const auto exception_ptr = dict_ptr->getCreationException())
            {
                /// recalculate next attempt time
                std::uniform_int_distribution<UInt64> distribution(
                    0, std::exp2(failed_dictionary.second.error_count));

                failed_dictionary.second.next_attempt_time = std::chrono::system_clock::now() +
                    std::chrono::seconds{
                        std::min<UInt64>(backoff_max_sec, backoff_initial_sec + distribution(rnd_engine))};

                ++failed_dictionary.second.error_count;

                std::rethrow_exception(exception_ptr);
            }
            else
            {
                const std::lock_guard<std::mutex> lock{dictionaries_mutex};

                const auto & lifetime = dict_ptr->getLifetime();
                std::uniform_int_distribution<UInt64> distribution{lifetime.min_sec, lifetime.max_sec};
                update_times[name] = std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)};

                const auto dict_it = dictionaries.find(name);
                if (dict_it->second.dict)
                    dict_it->second.dict->set(dict_ptr.release());
                else
                    dict_it->second.dict = std::make_shared<MultiVersion<IDictionaryBase>>(dict_ptr.release());

                /// erase stored exception on success
                dict_it->second.exception = std::exception_ptr{};

                recreated_failed_dictionaries.push_back(name);
            }
        }
        catch (...)
        {
            tryLogCurrentException(log, "Failed reloading '" + name + "' dictionary");

            if (throw_on_error)
                throw;
        }
    }

    /// do not undertake further attempts to recreate these dictionaries
    for (const auto & name : recreated_failed_dictionaries)
        failed_dictionaries.erase(name);

    /// periodic update
    for (auto & dictionary : dictionaries)
    {
        const auto & name = dictionary.first;

        try
        {
            /// If the dictionary failed to load or even failed to initialize from the config.
            if (!dictionary.second.dict)
                continue;

            auto current = dictionary.second.dict->get();
            const auto & lifetime = current->getLifetime();

            /// do not update dictionaries with zero as lifetime
            if (lifetime.min_sec == 0 || lifetime.max_sec == 0)
                continue;

            /// update only non-cached dictionaries
            if (!current->isCached())
            {
                auto & update_time = update_times[current->getName()];

                /// check that timeout has passed
                if (std::chrono::system_clock::now() < update_time)
                    continue;

                SCOPE_EXIT({
                    /// calculate next update time
                    std::uniform_int_distribution<UInt64> distribution{lifetime.min_sec, lifetime.max_sec};
                    update_time = std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)};
                });

                /// check source modified
                if (current->getSource()->isModified())
                {
                    /// create new version of dictionary
                    auto new_version = current->clone();

                    if (const auto exception_ptr = new_version->getCreationException())
                        std::rethrow_exception(exception_ptr);

                    dictionary.second.dict->set(new_version.release());
                }
            }

            /// erase stored exception on success
            dictionary.second.exception = std::exception_ptr{};
        }
/** One iteration of the part-check task.
  *
  * Picks the first queue entry whose scheduled check time has already come, runs checkPart() on it,
  *  removes it from parts_queue/parts_set and schedules the task again to process the next entry.
  * If no entry is due yet, the task is re-armed for the earliest pending check time.
  *
  * On a ZooKeeper error the iteration is retried after PART_CHECK_ERROR_SLEEP_MS,
  *  except for session expiration, after which nothing is rescheduled from here.
  * Any other exception is logged and the iteration is retried after the same delay.
  */
void ReplicatedMergeTreePartCheckThread::run()
{
    if (need_stop)
        return;

    try
    {
        time_t current_time = time(nullptr);

        /// Take part from the queue for verification.
        PartsToCheckQueue::iterator selected = parts_queue.end();    /// the end iterator of std::list does not get invalidated
        time_t min_check_time = std::numeric_limits<time_t>::max();

        {
            std::lock_guard<std::mutex> lock(parts_mutex);

            if (parts_queue.empty())
            {
                if (!parts_set.empty())
                {
                    LOG_ERROR(log, "Non-empty parts_set with empty parts_queue. This is a bug.");
                    parts_set.clear();
                }
            }
            else
            {
                for (auto it = parts_queue.begin(); it != parts_queue.end(); ++it)
                {
                    if (it->second <= current_time)
                    {
                        selected = it;
                        break;
                    }

                    /// Track the earliest future check time; it is only used when nothing is due now.
                    if (it->second < min_check_time)
                        min_check_time = it->second;
                }
            }
        }

        if (selected == parts_queue.end())
        {
            /// Nothing is due yet. If some parts are waiting for their check time, re-arm the task for the
            /// earliest of them; otherwise they would sit in the queue until something else schedules the task.
            /// Every entry seen above has check time > current_time, so the delay is positive.
            if (min_check_time != std::numeric_limits<time_t>::max())
                task->scheduleAfter(static_cast<size_t>(min_check_time - current_time) * 1000);

            return;
        }

        checkPart(selected->first);

        if (need_stop)
            return;

        /// Remove the part from check queue.
        {
            std::lock_guard<std::mutex> lock(parts_mutex);

            if (parts_queue.empty())
            {
                LOG_ERROR(log, "Someone erased cheking part from parts_queue. This is a bug.");
            }
            else
            {
                parts_set.erase(selected->first);
                parts_queue.erase(selected);
            }
        }

        /// Run again right away: there may be more parts that are already due.
        task->schedule();
    }
    catch (const zkutil::KeeperException & e)
    {
        tryLogCurrentException(log, __PRETTY_FUNCTION__);

        if (e.code == ZooKeeperImpl::ZooKeeper::ZSESSIONEXPIRED)
            return;

        task->scheduleAfter(PART_CHECK_ERROR_SLEEP_MS);
    }
    catch (...)
    {
        tryLogCurrentException(log, __PRETTY_FUNCTION__);
        task->scheduleAfter(PART_CHECK_ERROR_SLEEP_MS);
    }
}
void ReplicatedMergeTreeRestartingThread::run()
{
	constexpr auto retry_period_ms = 10 * 1000;

	/// Периодичность проверки истечения сессии в ZK.
	time_t check_period_ms = 60 * 1000;

	/// Периодичность проверки величины отставания реплики.
	if (check_period_ms > static_cast<time_t>(storage.data.settings.check_delay_period) * 1000)
		check_period_ms = storage.data.settings.check_delay_period * 1000;

	setThreadName("ReplMTRestart");

	try
	{
		bool first_time = true;					/// Активация реплики в первый раз.
		bool need_restart = false;				/// Перезапуск по собственной инициативе, чтобы отдать лидерство.
		time_t prev_time_of_check_delay = 0;

		/// Запуск реплики при старте сервера/создании таблицы. Перезапуск реплики при истечении сессии с ZK.
		while (!need_stop)
		{
			if (first_time || need_restart || storage.getZooKeeper()->expired())
			{
				if (first_time)
				{
					LOG_DEBUG(log, "Activating replica.");
				}
				else
				{
					if (need_restart)
						LOG_WARNING(log, "Will reactivate replica.");
					else
						LOG_WARNING(log, "ZooKeeper session has expired. Switching to a new session.");

					if (!storage.is_readonly)
						CurrentMetrics::add(CurrentMetrics::ReadonlyReplica);
					storage.is_readonly = true;
					partialShutdown();
				}

				while (true)
				{
					try
					{
						storage.setZooKeeper(storage.context.getZooKeeper());
					}
					catch (const zkutil::KeeperException & e)
					{
						/// Исключение при попытке zookeeper_init обычно бывает, если не работает DNS. Будем пытаться сделать это заново.
						tryLogCurrentException(__PRETTY_FUNCTION__);

						wakeup_event.tryWait(retry_period_ms);
						continue;
					}

					if (!need_stop && !tryStartup())
					{
						wakeup_event.tryWait(retry_period_ms);
						continue;
					}

					break;
				}

				if (storage.is_readonly)
					CurrentMetrics::sub(CurrentMetrics::ReadonlyReplica);
				storage.is_readonly = false;
				first_time = false;
				need_restart = false;
			}

			time_t current_time = time(0);
			if (current_time >= prev_time_of_check_delay + static_cast<time_t>(storage.data.settings.check_delay_period))
			{
				/// Выясняем отставания реплик.
				time_t absolute_delay = 0;
				time_t relative_delay = 0;

				bool error = false;
				try
				{
					storage.getReplicaDelays(absolute_delay, relative_delay);
					LOG_TRACE(log, "Absolute delay: " << absolute_delay << ". Relative delay: " << relative_delay << ".");
				}
				catch (...)
				{
					tryLogCurrentException(__PRETTY_FUNCTION__, "Cannot get replica delays");
					error = true;
				}

				prev_time_of_check_delay = current_time;

				/// Уступаем лидерство, если относительное отставание больше порога.
				if (storage.is_leader_node
					&& (error || relative_delay > static_cast<time_t>(storage.data.settings.min_relative_delay_to_yield_leadership)))
				{
					if (error)
						LOG_INFO(log, "Will yield leadership.");
					else
						LOG_INFO(log, "Relative replica delay (" << relative_delay << " seconds) is bigger than threshold ("
							<< storage.data.settings.min_relative_delay_to_yield_leadership << "). Will yield leadership.");

					ProfileEvents::increment(ProfileEvents::ReplicaYieldLeadership);

					need_restart = true;
					continue;
				}
			}

			wakeup_event.tryWait(check_period_ms);
		}
	}
	catch (...)
	{
		tryLogCurrentException("StorageReplicatedMergeTree::restartingThread");
		LOG_ERROR(log, "Unexpected exception in restartingThread. The storage will be readonly until server restart.");
		goReadOnlyPermanently();
		LOG_DEBUG(log, "Restarting thread finished");
		return;
	}

	try
	{
		storage.endpoint_holder->cancel();
		storage.endpoint_holder = nullptr;

		storage.disk_space_monitor_endpoint_holder->cancel();
		storage.disk_space_monitor_endpoint_holder = nullptr;

		storage.sharded_partition_uploader_endpoint_holder->cancel();
		storage.sharded_partition_uploader_endpoint_holder = nullptr;

		storage.remote_query_executor_endpoint_holder->cancel();
		storage.remote_query_executor_endpoint_holder = nullptr;

		storage.remote_part_checker_endpoint_holder->cancel();
		storage.remote_part_checker_endpoint_holder = nullptr;

		partialShutdown();
	}
	catch (...)
	{
		tryLogCurrentException(__PRETTY_FUNCTION__);
	}

	LOG_DEBUG(log, "Restarting thread finished");
}
/** Body of the ALTER thread (loop-based variant).
  *
  * Until need_stop is set, repeatedly reads the shared "columns" node from ZooKeeper, applies a changed
  *  column description to the local table, ALTER-s the data parts accordingly and writes the column list
  *  to the node of this replica. After a successful iteration it waits on wakeup_event, which is also
  *  passed to the ZooKeeper get() call for the "columns" node.
  * After any exception the parts are re-checked on the next iteration, which starts
  *  after at most ALTER_ERROR_SLEEP_MS.
  */
void ReplicatedMergeTreeAlterThread::run()
{
	setThreadName("ReplMTAlter");

	/// Starts as true, so the very first iteration checks all parts regardless of version changes.
	bool force_recheck_parts = true;

	while (!need_stop)
	{
		try
		{
			/** We have a description of columns in ZooKeeper, common for all replicas (Example: /clickhouse/tables/02-06/visits/columns),
			  *  as well as a description of columns in the local file with metadata (storage.data.getColumnsList()).
			  *
			  * If these descriptions differ - an ALTER has to be done.
			  *
			  * If the remembered version of the node (columns_version) differs from the version in ZK,
			  *  then the description of columns in ZK does not necessarily differ from the local one
			  *  - this can happen with a cycle of ALTER-s which, as a whole, changes nothing.
			  * In this case, the remembered version number has to be updated,
			  *  and the structure of parts still has to be checked and, if necessary, ALTER-ed.
			  *
			  * The remembered version number has to be updated after updating the metadata, under the lock.
			  * This version number is checked against the current one on INSERT.
			  * That is, this is how we make sure that blocks with the correct structure are inserted.
			  *
			  * When the server starts, a previous ALTER might not have been completed.
			  * Therefore, the first time, regardless of changes, we check the structure of all parts,
			  *  (Example: /clickhouse/tables/02-06/visits/replicas/example02-06-1.yandex.ru/parts/20140806_20140831_131664_134988_3296/columns)
			  *  and do an ALTER if necessary.
			  *
			  * TODO: Too complicated, rewrite everything.
			  */

			auto zookeeper = storage.getZooKeeper();

			zkutil::Stat stat;
			const String columns_str = zookeeper->get(storage.zookeeper_path + "/columns", &stat, wakeup_event);
			auto columns_desc = ColumnsDescription<true>::parse(columns_str);

			auto & columns = columns_desc.columns;
			auto & materialized_columns = columns_desc.materialized;
			auto & alias_columns = columns_desc.alias;
			auto & column_defaults = columns_desc.defaults;

			bool changed_version = (stat.version != storage.columns_version);

			{
				/// If the table structure has to be locked, suspend merges.
				std::unique_ptr<MergeTreeMergeBlocker> merge_blocker;
				std::unique_ptr<MergeTreeMergeBlocker> unreplicated_merge_blocker;

				if (changed_version || force_recheck_parts)
				{
					merge_blocker = std::make_unique<MergeTreeMergeBlocker>(storage.merger);
					if (storage.unreplicated_merger)
						unreplicated_merge_blocker = std::make_unique<MergeTreeMergeBlocker>(*storage.unreplicated_merger);
				}

				MergeTreeData::DataParts parts;

				/// If the description of columns has changed, update the table structure locally.
				if (changed_version)
				{
					LOG_INFO(log, "Changed version of 'columns' node in ZooKeeper. Waiting for structure write lock.");

					auto table_lock = storage.lockStructureForAlter();

					const auto columns_changed = columns != storage.data.getColumnsListNonMaterialized();
					const auto materialized_columns_changed = materialized_columns != storage.data.materialized_columns;
					const auto alias_columns_changed = alias_columns != storage.data.alias_columns;
					const auto column_defaults_changed = column_defaults != storage.data.column_defaults;

					if (columns_changed || materialized_columns_changed || alias_columns_changed ||
						column_defaults_changed)
					{
						LOG_INFO(log, "Columns list changed in ZooKeeper. Applying changes locally.");

						storage.context.getDatabase(storage.database_name)->alterTable(
							storage.context, storage.table_name,
							columns, materialized_columns, alias_columns, column_defaults, {});

						if (columns_changed)
						{
							storage.data.setColumnsList(columns);

							if (storage.unreplicated_data)
								storage.unreplicated_data->setColumnsList(columns);
						}

						/// Each of the following groups is copied into the storage first and then moved into
						/// storage.data, so the local variable must not be used after its block.
						if (materialized_columns_changed)
						{
							storage.materialized_columns = materialized_columns;
							storage.data.materialized_columns = std::move(materialized_columns);
						}

						if (alias_columns_changed)
						{
							storage.alias_columns = alias_columns;
							storage.data.alias_columns = std::move(alias_columns);
						}

						if (column_defaults_changed)
						{
							storage.column_defaults = column_defaults;
							storage.data.column_defaults = std::move(column_defaults);
						}

						LOG_INFO(log, "Applied changes to table.");
					}
					else
					{
						LOG_INFO(log, "Columns version changed in ZooKeeper, but data wasn't changed. It's like cyclic ALTERs.");
					}

					/// The list of parts has to be obtained under the table lock to avoid a race condition with merge.
					parts = storage.data.getDataParts();

					storage.columns_version = stat.version;
				}

				/// Update the parts.
				if (changed_version || force_recheck_parts)
				{
					auto table_lock = storage.lockStructure(false);

					if (changed_version)
						LOG_INFO(log, "ALTER-ing parts");

					int changed_parts = 0;

					/// When the version changed, the list was already collected above under the alter lock.
					if (!changed_version)
						parts = storage.data.getDataParts();

					const auto columns_plus_materialized = storage.data.getColumnsList();

					for (const MergeTreeData::DataPartPtr & part : parts)
					{
						/// Update the part and write the result to temporary files.
						/// TODO: The check for too large changes could be skipped if ZooKeeper has, for example,
						///  a /flags/force_alter node.
						auto transaction = storage.data.alterDataPart(
							part, columns_plus_materialized, storage.data.primary_expr_ast, false);

						if (!transaction)
							continue;

						++changed_parts;

						/// Update the metadata of the part in ZooKeeper.
						zkutil::Ops ops;
						ops.push_back(new zkutil::Op::SetData(
							storage.replica_path + "/parts/" + part->name + "/columns", transaction->getNewColumns().toString(), -1));
						ops.push_back(new zkutil::Op::SetData(
							storage.replica_path + "/parts/" + part->name + "/checksums", transaction->getNewChecksums().toString(), -1));

						try
						{
							zookeeper->multi(ops);
						}
						catch (const zkutil::KeeperException & e)
						{
							/// The part does not exist in ZK. Enqueue it for a check - maybe the part is superfluous and has to be removed locally.
							if (e.code == ZNONODE)
								storage.enqueuePartForCheck(part->name);

							throw;
						}

						/// Apply the changes of files.
						transaction->commit();
					}

					/// The same for unreplicated data.
					if (storage.unreplicated_data)
					{
						parts = storage.unreplicated_data->getDataParts();

						for (const MergeTreeData::DataPartPtr & part : parts)
						{
							auto transaction = storage.unreplicated_data->alterDataPart(
								part, columns_plus_materialized, storage.data.primary_expr_ast, false);

							if (!transaction)
								continue;

							++changed_parts;

							transaction->commit();
						}
					}

					/// The list of columns for this particular replica.
					zookeeper->set(storage.replica_path + "/columns", columns_str);

					if (changed_version)
					{
						if (changed_parts != 0)
							LOG_INFO(log, "ALTER-ed " << changed_parts << " parts");
						else
							LOG_INFO(log, "No parts ALTER-ed");
					}

					force_recheck_parts = false;
				}

				/// It is important that parts and merge_blocker are destroyed before the wait.
			}

			wakeup_event->wait();
		}
		catch (...)
		{
			tryLogCurrentException(__PRETTY_FUNCTION__);

			/// The ALTER may have been applied only partially: re-check all parts on the next iteration.
			force_recheck_parts = true;

			wakeup_event->tryWait(ALTER_ERROR_SLEEP_MS);
		}
	}

	LOG_DEBUG(log, "Alter thread finished");
}