/** Opens the file `file_name_` and stores the resulting descriptor in `fd`.
  *
  * The base ReadBufferFromFileDescriptor is first initialised with descriptor -1.
  *
  * flags - flags for open(); -1 means "use O_RDONLY".
  * buf_size, existing_memory, alignment - passed through to the base class.
  *
  * Throws (via throwFromErrno) FILE_DOESNT_EXIST if open() failed with ENOENT
  * and CANNOT_OPEN_FILE for any other failure.
  */
ReadBufferFromFile::ReadBufferFromFile(
    const std::string & file_name_,
    size_t buf_size,
    int flags,
    char * existing_memory,
    size_t alignment)
    : ReadBufferFromFileDescriptor(-1, buf_size, existing_memory, alignment), file_name(file_name_)
{
    ProfileEvents::increment(ProfileEvents::FileOpen);

#ifdef __APPLE__
    /// On this platform O_DIRECT is stripped from the open() flags
    /// and F_NOCACHE is requested through fcntl() after the file is opened.
    bool o_direct = (flags != -1) && (flags & O_DIRECT);
    if (o_direct)
        flags = flags & ~O_DIRECT;
#endif

    fd = open(file_name.c_str(), flags == -1 ? O_RDONLY : flags);

    if (-1 == fd)
    {
        ProfileEvents::increment(ProfileEvents::FileOpenFailed);
        throwFromErrno("Cannot open file " + file_name, errno == ENOENT ? ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE);
    }

#ifdef __APPLE__
    if (o_direct && fcntl(fd, F_NOCACHE, 1) == -1)
    {
        /// The constructor is about to throw, so the destructor of this object will not run.
        /// Close the descriptor here, otherwise it would leak.
        /// errno is saved because close() may overwrite it.
        const int fcntl_errno = errno;

        ProfileEvents::increment(ProfileEvents::FileOpenFailed);
        ::close(fd);
        fd = -1;

        errno = fcntl_errno;
        throwFromErrno("Cannot set F_NOCACHE on file " + file_name, ErrorCodes::CANNOT_OPEN_FILE);
    }
#endif
}
/** Opens (creating if necessary) the status file at `path_`, takes an exclusive
  * non-blocking flock on it and writes PID, start time and revision into it.
  *
  * If the file already exists, up to 1024 bytes of its previous contents are logged first.
  * Throws if the file cannot be opened, locked, truncated, or positioned;
  * in that case the descriptor opened here is closed before rethrowing.
  */
StatusFile::StatusFile(const std::string & path_)
    : path(path_)
{
    /// The file may already exist. NOTE: there is a minor race condition here.
    if (Poco::File(path).exists())
    {
        std::string previous_contents;
        {
            ReadBufferFromFile file_in(path, 1024);
            LimitReadBuffer limited_in(file_in, 1024);
            WriteBufferFromString contents_out(previous_contents);
            copyData(limited_in, contents_out);
        }

        auto * status_log = &Logger::get("StatusFile");

        if (previous_contents.empty())
        {
            LOG_INFO(status_log, "Status file " << path << " already exists and is empty - probably unclean hardware restart.");
        }
        else
        {
            LOG_INFO(status_log, "Status file " << path << " already exists - unclean restart. Contents:\n" << previous_contents);
        }
    }

    fd = open(path.c_str(), O_WRONLY | O_CREAT, 0666);
    if (fd == -1)
        throwFromErrno("Cannot open file " + path);

    try
    {
        if (flock(fd, LOCK_EX | LOCK_NB) == -1)
        {
            if (errno != EWOULDBLOCK)
                throwFromErrno("Cannot lock file " + path);
            else
                throw Exception("Cannot lock file " + path + ". Another server instance in same directory is already running.");
        }

        if (ftruncate(fd, 0) != 0)
            throwFromErrno("Cannot ftruncate " + path);

        if (lseek(fd, 0, SEEK_SET) != 0)
            throwFromErrno("Cannot lseek " + path);

        /// Write information about the current server instance into the file.
        {
            WriteBufferFromFileDescriptor status_out(fd, 1024);
            status_out << "PID: " << getpid() << "\n";
            status_out << "Started at: " << LocalDateTime(time(0)) << "\n";
            status_out << "Revision: " << ClickHouseRevision::get() << "\n";
        }
    }
    catch (...)
    {
        close(fd);
        throw;
    }
}
/** Executes a SYSTEM query by dispatching on its type.
  *
  * Always returns an empty BlockIO.
  * Throws CANNOT_KILL if kill() fails, NOT_IMPLEMENTED for the query types listed
  * as unsupported below, and BAD_ARGUMENTS for any type not handled by the switch.
  */
BlockIO InterpreterSystemQuery::execute()
{
    using Type = ASTSystemQuery::Type;

    auto & system_query = typeid_cast<ASTSystemQuery &>(*query_ptr);

    /// Sends `sig` with kill(0, sig); reports a failure with the given message.
    auto send_signal = [](int sig, const char * failure_message)
    {
        if (kill(0, sig))
            throwFromErrno(failure_message, ErrorCodes::CANNOT_KILL);
    };

    switch (system_query.type)
    {
        case Type::SHUTDOWN:
        {
            send_signal(SIGTERM, "System call kill(0, SIGTERM) failed");
            break;
        }
        case Type::KILL:
        {
            send_signal(SIGKILL, "System call kill(0, SIGKILL) failed");
            break;
        }
        case Type::DROP_DNS_CACHE:
        {
            DNSCache::instance().drop();
            /// Reinitialize clusters to update their resolved_addresses
            context.reloadClusterConfig();
            break;
        }
        case Type::DROP_MARK_CACHE:
        {
            context.dropMarkCache();
            break;
        }
        case Type::DROP_UNCOMPRESSED_CACHE:
        {
            context.dropUncompressedCache();
            break;
        }
        case Type::RELOAD_DICTIONARY:
        {
            context.getExternalDictionaries().reloadDictionary(system_query.target_dictionary);
            break;
        }
        case Type::RELOAD_DICTIONARIES:
        {
            /// Both reloads are handed to the helper; a non-zero overall status is rethrown.
            auto reload_status = getOverallExecutionStatusOfCommands(
                [&] { context.getExternalDictionaries().reload(); },
                [&] { context.getEmbeddedDictionaries().reload(); }
            );
            if (reload_status.code != 0)
                throw Exception(reload_status.message, reload_status.code);
            break;
        }
        case Type::STOP_LISTEN_QUERIES:
        case Type::START_LISTEN_QUERIES:
        case Type::RESTART_REPLICAS:
        case Type::SYNC_REPLICA:
        case Type::STOP_MERGES:
        case Type::START_MERGES:
        case Type::STOP_REPLICATION_QUEUES:
        case Type::START_REPLICATION_QUEUES:
            throw Exception(String(ASTSystemQuery::typeToString(system_query.type)) + " is not supported yet", ErrorCodes::NOT_IMPLEMENTED);
        default:
            throw Exception("Unknown type of SYSTEM query", ErrorCodes::BAD_ARGUMENTS);
    }

    return BlockIO();
}
/** Starts the program `filename` with arguments `argv` in a child process created with vfork
  * and returns a ShellCommand built from the child's pid and the parent's ends of three pipes.
  *
  * In the child, the pipe ends are dup2'ed onto stdin/stdout/stderr before execv;
  * any failure there ends the child with _exit and the corresponding ReturnCodes value.
  *
  * Throws (via throwFromErrno) CANNOT_DLSYM if the vfork symbol is not found
  * and CANNOT_FORK if vfork returns -1.
  */
std::unique_ptr<ShellCommand> ShellCommand::executeImpl(const char * filename, char * const argv[])
{
    /** The article below says that with an ordinary call of vfork there is a chance of deadlock
      *  in multithreaded programs, because of symbol resolving in the shared library:
      *  http://www.oracle.com/technetwork/server-storage/solaris10/subprocess-136439.html
      * Therefore, we separate the resolving of the symbol from the call.
      */
    /// NOTE(review): dlsym reports errors through dlerror(), so errno may not be meaningful here - confirm.
    static void * real_vfork = dlsym(RTLD_DEFAULT, "vfork");

    if (!real_vfork)
        throwFromErrno("Cannot find symbol vfork in myself", ErrorCodes::CANNOT_DLSYM);

    Pipe pipe_stdin;
    Pipe pipe_stdout;
    Pipe pipe_stderr;

    pid_t pid = reinterpret_cast<pid_t(*)()>(real_vfork)();

    if (-1 == pid)
        throwFromErrno("Cannot vfork", ErrorCodes::CANNOT_FORK);

    if (0 == pid)
    {
        /// We are in the freshly created process.

        /// Why `_exit` and not `exit`? Because `exit` calls `atexit` and destructors of thread local storage.
        /// And there is a lot of garbage there (including, for example, a mutex being locked).
        /// And this can't be done after `vfork` - a deadlock happens.

        /// Replace the file descriptors with the ends of our pipes.
        if (STDIN_FILENO != dup2(pipe_stdin.read_fd, STDIN_FILENO))
            _exit(int(ReturnCodes::CANNOT_DUP_STDIN));

        if (STDOUT_FILENO != dup2(pipe_stdout.write_fd, STDOUT_FILENO))
            _exit(int(ReturnCodes::CANNOT_DUP_STDOUT));

        if (STDERR_FILENO != dup2(pipe_stderr.write_fd, STDERR_FILENO))
            _exit(int(ReturnCodes::CANNOT_DUP_STDERR));

        execv(filename, argv);
        /// If the process is running, then `execv` does not return here.

        _exit(int(ReturnCodes::CANNOT_EXEC));
    }

    std::unique_ptr<ShellCommand> res(new ShellCommand(pid, pipe_stdin.write_fd, pipe_stdout.read_fd, pipe_stderr.read_fd));

    /// Now the ownership of the file descriptors is passed to the result.
    /// These ends are reset to -1 in the local Pipe objects
    /// (presumably so that the Pipe objects do not close them - confirm against Pipe).
    pipe_stdin.write_fd = -1;
    pipe_stdout.read_fd = -1;
    pipe_stderr.read_fd = -1;

    return res;
}
/// Используется для проверки, выставили ли ноду is_active мы, или нет. static String generateActiveNodeIdentifier() { struct timespec times; if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, ×)) throwFromErrno("Cannot clock_gettime.", ErrorCodes::CANNOT_CLOCK_GETTIME); return "pid: " + toString(getpid()) + ", random: " + toString(times.tv_nsec + times.tv_sec + getpid()); }
/** Constructs the storage and, unless `attach` is set, creates the table directory
  * `path_ + escapeForFileName(name_) + '/'`. An already existing directory is not an error.
  *
  * Throws EMPTY_LIST_OF_COLUMNS_PASSED if `columns_` is empty
  * and CANNOT_CREATE_DIRECTORY if mkdir fails for any reason other than EEXIST.
  */
StorageStripeLog::StorageStripeLog(
    const std::string & path_,
    const std::string & name_,
    const NamesAndTypesList & columns_,
    const NamesAndTypesList & materialized_columns_,
    const NamesAndTypesList & alias_columns_,
    const ColumnDefaults & column_defaults_,
    bool attach,
    size_t max_compress_block_size_)
    : IStorage{materialized_columns_, alias_columns_, column_defaults_},
    path(path_),
    name(name_),
    columns(columns_),
    max_compress_block_size(max_compress_block_size_),
    file_checker(path + escapeForFileName(name) + '/' + "sizes.json"),
    log(&Logger::get("StorageStripeLog"))
{
    if (columns.empty())
        throw Exception("Empty list of columns passed to StorageStripeLog constructor", ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED);

    const String table_directory = path + escapeForFileName(name) + '/';

    /// When attaching, nothing is created on disk.
    if (attach)
        return;

    /// Create the directory if it does not exist yet.
    const bool directory_created = (0 == mkdir(table_directory.c_str(), S_IRWXU | S_IRWXG | S_IRWXO));
    if (!directory_created && errno != EEXIST)
        throwFromErrno("Cannot create directory " + table_directory, ErrorCodes::CANNOT_CREATE_DIRECTORY);
}
/** Worker loop: blocks SIGINT for the calling thread, then repeatedly pops a query
  * from `queue` and executes it on `connection` until `shutdown` is set.
  *
  * On any exception, sets `shutdown`, prints the query being processed to stderr and rethrows.
  */
void thread(ConnectionPool::Entry connection)
{
    Query query;

    try
    {
        /// In these threads we do not accept INT signal.
        sigset_t blocked_signals;
        const bool block_failed = sigemptyset(&blocked_signals)
            || sigaddset(&blocked_signals, SIGINT)
            || pthread_sigmask(SIG_BLOCK, &blocked_signals, nullptr);

        if (block_failed)
            throwFromErrno("Cannot block signal.", ErrorCodes::CANNOT_BLOCK_SIGNAL);

        for (;;)
        {
            /// Poll the queue (second argument 100 is passed to tryPop),
            /// checking the shutdown flag after every attempt.
            bool got_query = false;
            do
            {
                got_query = queue.tryPop(query, 100);

                if (shutdown)
                    return;
            } while (!got_query);

            execute(connection, query);
        }
    }
    catch (...)
    {
        shutdown = true;
        std::cerr << "An error occurred while processing query:\n" << query << "\n";
        throw;
    }
}
/** Constructs the storage, creates the table directory `path_ + escapeForFileName(name_) + '/'`
  * unless `attach` is set, and calls addFiles for every physical column.
  * An already existing directory is not an error.
  *
  * Throws INCORRECT_FILE_NAME if `path_` is empty
  * and CANNOT_CREATE_DIRECTORY if mkdir fails for any reason other than EEXIST.
  */
StorageTinyLog::StorageTinyLog(
    const std::string & path_,
    const std::string & name_,
    const ColumnsDescription & columns_,
    bool attach,
    size_t max_compress_block_size_)
    : IStorage{columns_},
    path(path_),
    name(name_),
    max_compress_block_size(max_compress_block_size_),
    file_checker(path + escapeForFileName(name) + '/' + "sizes.json"),
    log(&Logger::get("StorageTinyLog"))
{
    if (path.empty())
        throw Exception("Storage " + getName() + " requires data path", ErrorCodes::INCORRECT_FILE_NAME);

    const String table_directory = path + escapeForFileName(name) + '/';

    /// Create the directory if it does not exist yet (skipped when attaching).
    if (!attach
        && 0 != mkdir(table_directory.c_str(), S_IRWXU | S_IRWXG | S_IRWXO)
        && errno != EEXIST)
    {
        throwFromErrno("Cannot create directory " + table_directory, ErrorCodes::CANNOT_CREATE_DIRECTORY);
    }

    const auto physical_columns = getColumns().getAllPhysical();
    for (const auto & column : physical_columns)
        addFiles(column.name, *column.type);
}
/** Takes a lock on `storage.rwlock` (exclusive when the table descriptor is used, shared otherwise),
  * creates the read buffer and the format reader.
  *
  * When the table descriptor was already used before, it is first seeked back to
  * `storage.table_fd_init_offset`.
  *
  * Throws CANNOT_SEEK_THROUGH_FILE if that offset is negative or lseek fails.
  * If anything throws after the lock has been taken, the lock is released here before rethrowing:
  * the destructor does not run for an object whose constructor threw,
  * so nothing else would release it.
  */
StorageFileBlockInputStream(StorageFile & storage_, const Context & context, size_t max_block_size)
    : storage(storage_)
{
    const bool exclusive = storage.use_table_fd;

    if (exclusive)
        storage.rwlock.lock();
    else
        storage.rwlock.lock_shared();

    try
    {
        if (exclusive)
        {
            /// We could use common ReadBuffer and WriteBuffer in storage to leverage cache
            ///  and add ability to seek unseekable files, but cache sync isn't supported.

            if (storage.table_fd_was_used) /// We need seek to initial position
            {
                if (storage.table_fd_init_offset < 0)
                    throw Exception("File descriptor isn't seekable, inside " + storage.getName(), ErrorCodes::CANNOT_SEEK_THROUGH_FILE);

                /// ReadBuffer's seek() doesn't make sense, since cache is empty
                if (lseek(storage.table_fd, storage.table_fd_init_offset, SEEK_SET) < 0)
                    throwFromErrno("Cannot seek file descriptor, inside " + storage.getName(), ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
            }

            storage.table_fd_was_used = true;
            read_buf = std::make_unique<ReadBufferFromFileDescriptor>(storage.table_fd);
        }
        else
        {
            read_buf = std::make_unique<ReadBufferFromFile>(storage.path);
        }

        reader = FormatFactory().getInput(storage.format_name, *read_buf, storage.getSampleBlock(), context, max_block_size);
    }
    catch (...)
    {
        /// Release exactly the kind of lock that was taken above.
        if (exclusive)
            storage.rwlock.unlock();
        else
            storage.rwlock.unlock_shared();

        throw;
    }
}
/** Calls hdfsSync on the output file.
  * Throws (via throwFromErrno) CANNOT_FSYNC with the URI and the last HDFS error text
  * if hdfsSync returns a negative value.
  */
void sync()
{
    int result = hdfsSync(fs.get(), fout);
    if (result < 0)
    {
        /// Note the space after "sync": without it the URI was glued to the word in the message.
        throwFromErrno("Cannot HDFS sync " + hdfs_uri.toString() + " " + std::string(hdfsGetLastError()),
            ErrorCodes::CANNOT_FSYNC);
    }
}
/** Opens an iconv conversion descriptor that converts from `charsets.first` to `charsets.second`
  * and stores it in `impl`.
  * Throws (via throwFromErrno) BAD_ARGUMENTS if iconv_open fails.
  */
IConv(const CharsetsFromTo & charsets)
{
    const auto & source_charset = charsets.first;
    const auto & target_charset = charsets.second;

    /// iconv_open takes the target charset first, then the source charset.
    impl = iconv_open(target_charset.data(), source_charset.data());

    const bool open_failed = (impl == reinterpret_cast<iconv_t>(-1));
    if (open_failed)
        throwFromErrno("Cannot iconv_open with charsets " + source_charset + " and " + target_charset,
            ErrorCodes::BAD_ARGUMENTS);
}
void DistributedBlockOutputStream::writeToShard(const Block & block, const std::vector<std::string> & dir_names) { /** tmp directory is used to ensure atomicity of transactions * and keep monitor thread out from reading incomplete data */ std::string first_file_tmp_path{}; auto first = true; const auto & query_string = queryToString(query_ast); /// write first file, hardlink the others for (const auto & dir_name : dir_names) { const auto & path = storage.getPath() + dir_name + '/'; /// ensure shard subdirectory creation and notify storage if (Poco::File(path).createDirectory()) storage.requireDirectoryMonitor(dir_name); const auto & file_name = toString(storage.file_names_increment.get()) + ".bin"; const auto & block_file_path = path + file_name; /** on first iteration write block to a temporary directory for subsequent hardlinking to ensure * the inode is not freed until we're done */ if (first) { first = false; const auto & tmp_path = path + "tmp/"; Poco::File(tmp_path).createDirectory(); const auto & block_file_tmp_path = tmp_path + file_name; first_file_tmp_path = block_file_tmp_path; WriteBufferFromFile out{block_file_tmp_path}; CompressedWriteBuffer compress{out}; NativeBlockOutputStream stream{compress, ClickHouseRevision::get()}; writeStringBinary(query_string, out); stream.writePrefix(); stream.write(block); stream.writeSuffix(); } if (link(first_file_tmp_path.data(), block_file_path.data())) throwFromErrno("Could not link " + block_file_path + " to " + first_file_tmp_path); } /** remove the temporary file, enabling the OS to reclaim inode after all threads * have removed their corresponding files */ Poco::File(first_file_tmp_path).remove(); }
void createHardLink(const String & source_path, const String & destination_path) { if (0 != link(source_path.c_str(), destination_path.c_str())) { if (errno == EEXIST) { auto link_errno = errno; struct stat source_descr; struct stat destination_descr; if (0 != lstat(source_path.c_str(), &source_descr)) throwFromErrno("Cannot stat " + source_path, ErrorCodes::CANNOT_STAT); if (0 != lstat(destination_path.c_str(), &destination_descr)) throwFromErrno("Cannot stat " + destination_path, ErrorCodes::CANNOT_STAT); if (source_descr.st_ino != destination_descr.st_ino) throwFromErrno("Destination file " + destination_path + " is already exist and have different inode.", ErrorCodes::CANNOT_LINK, link_errno); } else throwFromErrno("Cannot link " + source_path + " to " + destination_path, ErrorCodes::CANNOT_LINK); } }
/** Waits for the child process `pid` and returns its exit status if it exited normally.
  *
  * Throws CANNOT_WAITPID if waitpid fails and CHILD_WAS_NOT_EXITED_NORMALLY
  * if the child was terminated or stopped by a signal, or ended in any other way.
  */
int ShellCommand::tryWait()
{
    int wait_status = 0;

    const auto waited = waitpid(pid, &wait_status, 0);
    if (waited == -1)
        throwFromErrno("Cannot waitpid", ErrorCodes::CANNOT_WAITPID);

    if (WIFEXITED(wait_status))
        return WEXITSTATUS(wait_status);

    /// Every remaining outcome is reported with the same error code; only the message differs.
    std::string failure;
    if (WIFSIGNALED(wait_status))
        failure = "Child process was terminated by signal " + toString(WTERMSIG(wait_status));
    else if (WIFSTOPPED(wait_status))
        failure = "Child process was stopped by signal " + toString(WSTOPSIG(wait_status));
    else
        failure = "Child process was not exited normally by unknown reason";

    throw Exception(failure, ErrorCodes::CHILD_WAS_NOT_EXITED_NORMALLY);
}
/// Note: an additional page is allocated that will contain the data that /// does not fit into the main buffer. ReadBufferAIO::ReadBufferAIO(const std::string & filename_, size_t buffer_size_, int flags_, char * existing_memory_) : ReadBufferFromFileBase(buffer_size_ + DEFAULT_AIO_FILE_BLOCK_SIZE, existing_memory_, DEFAULT_AIO_FILE_BLOCK_SIZE), fill_buffer(BufferWithOwnMemory<ReadBuffer>(internalBuffer().size(), nullptr, DEFAULT_AIO_FILE_BLOCK_SIZE)), filename(filename_) { ProfileEvents::increment(ProfileEvents::FileOpen); int open_flags = (flags_ == -1) ? O_RDONLY : flags_; open_flags |= O_DIRECT; fd = ::open(filename.c_str(), open_flags); if (fd == -1) { auto error_code = (errno == ENOENT) ? ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE; throwFromErrno("Cannot open file " + filename, error_code); } }
static std::string getUserName(uid_t user_id) { /// Try to convert user id into user name. auto buffer_size = sysconf(_SC_GETPW_R_SIZE_MAX); if (buffer_size <= 0) buffer_size = 1024; std::string buffer; buffer.reserve(buffer_size); struct passwd passwd_entry; struct passwd * result = nullptr; const auto error = getpwuid_r(user_id, &passwd_entry, buffer.data(), buffer_size, &result); if (error) throwFromErrno("Failed to find user name for " + toString(user_id), ErrorCodes::FAILED_TO_GETPWUID, error); else if (result) return result->pw_name; return toString(user_id); }
void convert(const String & from_charset, const String & to_charset, const ColumnString::Chars_t & from_chars, const ColumnString::Offsets_t & from_offsets, ColumnString::Chars_t & to_chars, ColumnString::Offsets_t & to_offsets) { auto converter = getConverter(CharsetsFromTo(from_charset, to_charset)); iconv_t iconv_state = converter->impl; to_chars.resize(from_chars.size()); to_offsets.resize(from_offsets.size()); ColumnString::Offset_t current_from_offset = 0; ColumnString::Offset_t current_to_offset = 0; size_t size = from_offsets.size(); for (size_t i = 0; i < size; ++i) { size_t from_string_size = from_offsets[i] - current_from_offset - 1; /// We assume that empty string is empty in every charset. if (0 != from_string_size) { /// reset state of iconv size_t res = iconv(iconv_state, nullptr, nullptr, nullptr, nullptr); if (static_cast<size_t>(-1) == res) throwFromErrno("Cannot reset iconv", ErrorCodes::CANNOT_ICONV); /// perform conversion; resize output buffer and continue if required char * in_buf = const_cast<char *>(reinterpret_cast<const char *>(&from_chars[current_from_offset])); size_t in_bytes_left = from_string_size; char * out_buf = reinterpret_cast<char *>(&to_chars[current_to_offset]); size_t out_bytes_left = to_chars.size() - current_to_offset; while (in_bytes_left) { size_t res = iconv(iconv_state, &in_buf, &in_bytes_left, &out_buf, &out_bytes_left); current_to_offset = to_chars.size() - out_bytes_left; if (static_cast<size_t>(-1) == res) { if (E2BIG == errno) { to_chars.resize(to_chars.size() * 2); out_buf = reinterpret_cast<char *>(&to_chars[current_to_offset]); out_bytes_left = to_chars.size() - current_to_offset; continue; } throwFromErrno("Cannot convert charset", ErrorCodes::CANNOT_ICONV); } } } if (to_chars.size() < current_to_offset + 1) to_chars.resize(current_to_offset + 1); to_chars[current_to_offset] = 0; ++current_to_offset; to_offsets[i] = current_to_offset; current_from_offset = from_offsets[i]; } to_chars.resize(current_to_offset); 
}
/** Executes a SYSTEM query by dispatching on its type.
  *
  * Cache, dictionary, config and replica-restart commands run on a copy of the global context
  * whose "profile" setting is set to the system profile name.
  * Start/stop commands are passed to startStopAction with the original context:
  * STOP_* passes `false`, START_* passes `true`.
  *
  * Always returns an empty BlockIO.
  * Throws CANNOT_KILL if kill() fails, BAD_ARGUMENTS if the table named in RESTART REPLICA
  * is not a replicated table or the query type is unknown, and NOT_IMPLEMENTED for the
  * listen-queries commands.
  */
BlockIO InterpreterSystemQuery::execute()
{
    auto & query = typeid_cast<ASTSystemQuery &>(*query_ptr);

    using Type = ASTSystemQuery::Type;

    /// Use global context with fresh system profile settings
    Context system_context = context.getGlobalContext();
    system_context.setSetting("profile", context.getSystemProfileName());

    /// Make canonical query for simpler processing
    if (!query.target_table.empty() && query.target_database.empty())
        query.target_database = context.getCurrentDatabase();

    switch (query.type)
    {
        case Type::SHUTDOWN:
            if (kill(0, SIGTERM))
                throwFromErrno("System call kill(0, SIGTERM) failed", ErrorCodes::CANNOT_KILL);
            break;
        case Type::KILL:
            if (kill(0, SIGKILL))
                throwFromErrno("System call kill(0, SIGKILL) failed", ErrorCodes::CANNOT_KILL);
            break;
        case Type::DROP_DNS_CACHE:
            DNSResolver::instance().dropCache();
            /// Reinitialize clusters to update their resolved_addresses
            system_context.reloadClusterConfig();
            break;
        case Type::DROP_MARK_CACHE:
            system_context.dropMarkCache();
            break;
        case Type::DROP_UNCOMPRESSED_CACHE:
            system_context.dropUncompressedCache();
            break;
        case Type::RELOAD_DICTIONARY:
            system_context.getExternalDictionaries().reloadDictionary(query.target_dictionary);
            break;
        case Type::RELOAD_DICTIONARIES:
        {
            auto status = getOverallExecutionStatusOfCommands(
                    [&] { system_context.getExternalDictionaries().reload(); },
                    [&] { system_context.getEmbeddedDictionaries().reload(); }
            );
            if (status.code != 0)
                throw Exception(status.message, status.code);
            break;
        }
        case Type::RELOAD_EMBEDDED_DICTIONARIES:
            system_context.getEmbeddedDictionaries().reload();
            break;
        case Type::RELOAD_CONFIG:
            system_context.reloadConfig();
            break;
        case Type::STOP_MERGES:
            startStopAction(context, query, ActionLocks::PartsMerge, false);
            break;
        case Type::START_MERGES:
            startStopAction(context, query, ActionLocks::PartsMerge, true);
            break;
        case Type::STOP_FETCHES:
            startStopAction(context, query, ActionLocks::PartsFetch, false);
            break;
        case Type::START_FETCHES:
            startStopAction(context, query, ActionLocks::PartsFetch, true);
            break;
        case Type::STOP_REPLICATED_SENDS:
            startStopAction(context, query, ActionLocks::PartsSend, false);
            break;
        case Type::START_REPLICATEDS_SENDS:
            /// Must pass `true`: with `false` this command behaved exactly like STOP REPLICATED SENDS.
            startStopAction(context, query, ActionLocks::PartsSend, true);
            break;
        case Type::STOP_REPLICATION_QUEUES:
            startStopAction(context, query, ActionLocks::ReplicationQueue, false);
            break;
        case Type::START_REPLICATION_QUEUES:
            startStopAction(context, query, ActionLocks::ReplicationQueue, true);
            break;
        case Type::SYNC_REPLICA:
            syncReplica(query);
            break;
        case Type::RESTART_REPLICAS:
            restartReplicas(system_context);
            break;
        case Type::RESTART_REPLICA:
            if (!tryRestartReplica(query.target_database, query.target_table, system_context))
                throw Exception("There is no " + query.target_database + "." + query.target_table + " replicated table",
                                ErrorCodes::BAD_ARGUMENTS);
            break;
        case Type::STOP_LISTEN_QUERIES:
        case Type::START_LISTEN_QUERIES:
            throw Exception(String(ASTSystemQuery::typeToString(query.type)) + " is not supported yet", ErrorCodes::NOT_IMPLEMENTED);
        default:
            throw Exception("Unknown type of SYSTEM query", ErrorCodes::BAD_ARGUMENTS);
    }

    return BlockIO();
}