TEST_F(AutoRestarterTest, RestartsOnException) {
  // Verifies that a restarter configured with restartOnException (but not
  // restartOnShutdown) tears down its subcontext on shutdown without
  // spinning up a replacement.
  AutoCurrentContext ctxt;
  ctxt->Initiate();

  // Use a bolt facility to attach one of our text fixtures:
  ctxt->BoltTo<ThrowsAnExceptionFirstTime, RestartingSigil>();

  // Create our bolt and the restarter
  AutoRequired<CreationDetectionBolt> bolt;
  AutoRestarterConfig cfg;
  cfg.restartOnException = true;
  cfg.restartOnShutdown = false;
  cfg.startWhenCreated = true;
  AutoConstruct<AutoRestarter<RestartingSigil>> restarter(cfg);

  // Verify the bolt got called:
  ASSERT_TRUE(bolt->called) << "Bolt was not called even though a context restarter was present in the current context";

  // Verify subcontext properties:
  auto initialCtxt = restarter->GetContext();
  ASSERT_TRUE(initialCtxt != nullptr) << "Restarter did not correctly create a subcontext";
  ASSERT_TRUE(initialCtxt->Is<RestartingSigil>()) << "Generated subcontext was not marked with the right sigil";
  ASSERT_TRUE(initialCtxt->IsInitiated()) << "Generated subcontext should have been prospectively started, but was not";

  // Terminate the subcontext directly:
  initialCtxt->SignalShutdown();

  // Verify that this causes the restarter to release its hold on the subcontext:
  auto replacementCtxt = restarter->GetContext();
  ASSERT_NE(initialCtxt, replacementCtxt) << "Restarter did not release a terminated subcontext";
  ASSERT_FALSE(replacementCtxt) << "Restarter should not have attempted to create any new contexts on teardown";
}
/**
 * Walk every cluster atrt knows about and ask its management server to
 * perform a rolling restart of the data nodes.
 *
 * Returns NDBT_OK on success, NDBT_FAILED on any lookup or restart error.
 */
int rollingRestart(NDBT_Context* ctx, NDBT_Step* step)
{
  // Assuming 2 replicas
  AtrtClient atrt;

  SqlResultSet clusters;
  if (!atrt.getClusters(clusters))
    return NDBT_FAILED;

  while (clusters.next())
  {
    const uint clusterId = clusters.columnAsInt("id");

    // Resolve the cluster id to a management-server connect string.
    SqlResultSet connInfo;
    if (!atrt.getConnectString(clusterId, connInfo))
      return NDBT_FAILED;

    NdbRestarter restarter(connInfo.column("connectstring"));
    if (restarter.rollingRestart())
      return NDBT_FAILED;
  }

  return NDBT_OK;
}
TEST_F(AutoRestarterTest, RestartsOnShutdown) {
  // With restartOnShutdown set, terminating the restarter's subcontext must
  // immediately yield a fresh replacement context.
  AutoCurrentContext()->Initiate();

  // Create the restarter
  AutoRestarterConfig config;
  config.restartOnShutdown = true;
  config.startWhenCreated = true;
  AutoConstruct<AutoRestarter<RestartingSigil>> restarter(config);

  // Terminate the restarter's subcontext:
  auto original = restarter->GetContext();
  original->SignalShutdown();

  // New context should be created immediately
  ASSERT_NE(original, restarter->GetContext()) << "Restarter incorrectly held original context beyond shutdown";
  ASSERT_TRUE(restarter->GetContext() != nullptr) << "Restarter did not correctly generate a new context after termination";
}
/**
 * Run one restart scenario against the cluster.
 *
 * Skips (returning NDBT_OK) when the cluster has fewer data nodes than the
 * scenario requires, fails when the cluster is not started beforehand, then
 * invokes the scenario's restart function and waits for the cluster to come
 * back. A _timeout of 0 means "wait forever" for the cluster to restart.
 */
int NdbRestarts::executeRestart(NDBT_Context* ctx,
                                const NdbRestarts::NdbRestart* _restart,
                                unsigned int _timeout,
                                int safety)
{
  NdbRestarter restarter(0, &ctx->m_cluster_connection);

  // Check that there are enough nodes in the cluster for this test;
  // report OK (skip) rather than failing when the cluster is too small.
  if (_restart->m_numRequiredNodes > restarter.getNumDbNodes())
  {
    g_err << "This test requires " << _restart->m_numRequiredNodes << " nodes "
          << "there are only " << restarter.getNumDbNodes()
          << " nodes in cluster" << endl;
    return NDBT_OK;
  }

  // If cluster is not started when we shall perform the restart,
  // the restart can not be executed and the test fails.
  if (restarter.waitClusterStarted(120) != 0)
    return NDBT_FAILED;

  int result = _restart->m_restartFunc(ctx, restarter, _restart, safety);

  // Sleep a little waiting for nodes to react to command
  NdbSleep_SecSleep(2);

  if (_timeout == 0)
  {
    // If timeout == 0 wait for ever
    while (restarter.waitClusterStarted(60) != 0)
      g_err << "Cluster is not started after restart. Waiting 60s more..."
            << endl;
  }
  else if (restarter.waitClusterStarted(_timeout) != 0)
  {
    g_err << "Cluster failed to start" << endl;
    result = NDBT_FAILED;
  }

  return result;
}
TEST_F(AutoRestarterTest, RestarterCanExistInStoppedContext) {
  // The enclosing context is deliberately never initiated; constructing the
  // restarter in this state must be safe.
  AutoRestarterConfig config;
  config.startWhenCreated = true;
  AutoConstruct<AutoRestarter<RestartingSigil>> restarter(config);
}
int main(int argc, char **argv) { krb5_error_code ret; krb5_context context; void *kadm_handle; kadm5_server_context *server_context; kadm5_config_params conf; krb5_socket_t signal_fd, listen_fd; int log_fd; slave *slaves = NULL; uint32_t current_version = 0, old_version = 0; uint32_t current_tstamp = 0; krb5_keytab keytab; char **files; int aret; int optidx = 0; int restarter_fd = -1; struct stat st; setprogname(argv[0]); if (getarg(args, num_args, argc, argv, &optidx)) krb5_std_usage(1, args, num_args); if (help_flag) krb5_std_usage(0, args, num_args); if (version_flag) { print_version(NULL); exit(0); } if (detach_from_console && daemon_child == -1) roken_detach_prep(argc, argv, "--daemon-child"); rk_pidfile(NULL); ret = krb5_init_context(&context); if (ret) errx(1, "krb5_init_context failed: %d", ret); setup_signal(); if (config_file == NULL) { aret = asprintf(&config_file, "%s/kdc.conf", hdb_db_dir(context)); if (aret == -1 || config_file == NULL) errx(1, "out of memory"); } ret = krb5_prepend_config_files_default(config_file, &files); if (ret) krb5_err(context, 1, ret, "getting configuration files"); ret = krb5_set_config_files(context, files); krb5_free_config_files(files); if (ret) krb5_err(context, 1, ret, "reading configuration files"); time_before_gone = parse_time (slave_time_gone, "s"); if (time_before_gone < 0) krb5_errx (context, 1, "couldn't parse time: %s", slave_time_gone); time_before_missing = parse_time (slave_time_missing, "s"); if (time_before_missing < 0) krb5_errx (context, 1, "couldn't parse time: %s", slave_time_missing); krb5_openlog(context, "ipropd-master", &log_facility); krb5_set_warn_dest(context, log_facility); ret = krb5_kt_register(context, &hdb_get_kt_ops); if(ret) krb5_err(context, 1, ret, "krb5_kt_register"); ret = krb5_kt_resolve(context, keytab_str, &keytab); if(ret) krb5_err(context, 1, ret, "krb5_kt_resolve: %s", keytab_str); memset(&conf, 0, sizeof(conf)); if(realm) { conf.mask |= KADM5_CONFIG_REALM; conf.realm = realm; } 
ret = kadm5_init_with_skey_ctx (context, KADM5_ADMIN_SERVICE, NULL, KADM5_ADMIN_SERVICE, &conf, 0, 0, &kadm_handle); if (ret) krb5_err (context, 1, ret, "kadm5_init_with_password_ctx"); server_context = (kadm5_server_context *)kadm_handle; log_fd = open (server_context->log_context.log_file, O_RDONLY, 0); if (log_fd < 0) krb5_err (context, 1, errno, "open %s", server_context->log_context.log_file); if (fstat(log_fd, &st) == -1) krb5_err(context, 1, errno, "stat %s", server_context->log_context.log_file); if (flock(log_fd, LOCK_SH) == -1) krb5_err(context, 1, errno, "shared flock %s", server_context->log_context.log_file); kadm5_log_get_version_fd(server_context, log_fd, LOG_VERSION_LAST, ¤t_version, ¤t_tstamp); flock(log_fd, LOCK_UN); signal_fd = make_signal_socket (context); listen_fd = make_listen_socket (context, port_str); krb5_warnx(context, "ipropd-master started at version: %lu", (unsigned long)current_version); roken_detach_finish(NULL, daemon_child); restarter_fd = restarter(context, NULL); while (exit_flag == 0){ slave *p; fd_set readset; int max_fd = 0; struct timeval to = {30, 0}; uint32_t vers; struct stat st2;; #ifndef NO_LIMIT_FD_SETSIZE if (signal_fd >= FD_SETSIZE || listen_fd >= FD_SETSIZE || restarter_fd >= FD_SETSIZE) krb5_errx (context, IPROPD_RESTART, "fd too large"); #endif FD_ZERO(&readset); FD_SET(signal_fd, &readset); max_fd = max(max_fd, signal_fd); FD_SET(listen_fd, &readset); max_fd = max(max_fd, listen_fd); if (restarter_fd > -1) { FD_SET(restarter_fd, &readset); max_fd = max(max_fd, restarter_fd); } for (p = slaves; p != NULL; p = p->next) { if (p->flags & SLAVE_F_DEAD) continue; FD_SET(p->fd, &readset); max_fd = max(max_fd, p->fd); } ret = select (max_fd + 1, &readset, NULL, NULL, &to); if (ret < 0) { if (errno == EINTR) continue; else krb5_err (context, IPROPD_RESTART, errno, "select"); } if (stat(server_context->log_context.log_file, &st2) == -1) { krb5_warn(context, errno, "could not stat log file by path"); st2 = st; } if 
(st2.st_dev != st.st_dev || st2.st_ino != st.st_ino) { (void) close(log_fd); log_fd = open(server_context->log_context.log_file, O_RDONLY, 0); if (log_fd < 0) krb5_err(context, 1, IPROPD_RESTART_SLOW, "open %s", server_context->log_context.log_file); if (fstat(log_fd, &st) == -1) krb5_err(context, IPROPD_RESTART_SLOW, errno, "stat %s", server_context->log_context.log_file); if (flock(log_fd, LOCK_SH) == -1) krb5_err(context, IPROPD_RESTART, errno, "shared flock %s", server_context->log_context.log_file); kadm5_log_get_version_fd(server_context, log_fd, LOG_VERSION_LAST, ¤t_version, ¤t_tstamp); flock(log_fd, LOCK_UN); } if (ret == 0) { /* Recover from failed transactions */ if (kadm5_log_init_nb(server_context) == 0) kadm5_log_end(server_context); if (flock(log_fd, LOCK_SH) == -1) krb5_err(context, IPROPD_RESTART, errno, "could not lock log file"); kadm5_log_get_version_fd(server_context, log_fd, LOG_VERSION_LAST, ¤t_version, ¤t_tstamp); flock(log_fd, LOCK_UN); if (current_version > old_version) { krb5_warnx(context, "Missed a signal, updating slaves %lu to %lu", (unsigned long)old_version, (unsigned long)current_version); for (p = slaves; p != NULL; p = p->next) { if (p->flags & SLAVE_F_DEAD) continue; send_diffs (server_context, p, log_fd, database, current_version, current_tstamp); } old_version = current_version; } } if (ret && FD_ISSET(restarter_fd, &readset)) { exit_flag = SIGTERM; break; } if (ret && FD_ISSET(signal_fd, &readset)) { #ifndef NO_UNIX_SOCKETS struct sockaddr_un peer_addr; #else struct sockaddr_storage peer_addr; #endif socklen_t peer_len = sizeof(peer_addr); if(recvfrom(signal_fd, (void *)&vers, sizeof(vers), 0, (struct sockaddr *)&peer_addr, &peer_len) < 0) { krb5_warn (context, errno, "recvfrom"); continue; } --ret; assert(ret >= 0); old_version = current_version; if (flock(log_fd, LOCK_SH) == -1) krb5_err(context, IPROPD_RESTART, errno, "shared flock %s", server_context->log_context.log_file); kadm5_log_get_version_fd(server_context, log_fd, 
LOG_VERSION_LAST, ¤t_version, ¤t_tstamp); flock(log_fd, LOCK_UN); if (current_version != old_version) { /* * If current_version < old_version then the log got * truncated and we'll end up doing full propagations. * * Truncating the log when the current version is * numerically small can lead to race conditions. * Ideally we should identify log versions as * {init_or_trunc_time, vno}, then we could not have any * such race conditions, but this would either require * breaking backwards compatibility for the protocol or * adding new messages to it. */ krb5_warnx(context, "Got a signal, updating slaves %lu to %lu", (unsigned long)old_version, (unsigned long)current_version); for (p = slaves; p != NULL; p = p->next) { if (p->flags & SLAVE_F_DEAD) continue; send_diffs (server_context, p, log_fd, database, current_version, current_tstamp); } } else { krb5_warnx(context, "Got a signal, but no update in log version %lu", (unsigned long)current_version); } } for(p = slaves; p != NULL; p = p->next) { if (p->flags & SLAVE_F_DEAD) continue; if (ret && FD_ISSET(p->fd, &readset)) { --ret; assert(ret >= 0); if(process_msg (server_context, p, log_fd, database, current_version, current_tstamp)) slave_dead(context, p); } else if (slave_gone_p (p)) slave_dead(context, p); else if (slave_missing_p (p)) send_are_you_there (context, p); } if (ret && FD_ISSET(listen_fd, &readset)) { add_slave (context, keytab, &slaves, listen_fd); --ret; assert(ret >= 0); } write_stats(context, slaves, current_version); } if(exit_flag == SIGINT || exit_flag == SIGTERM) krb5_warnx(context, "%s terminated", getprogname()); #ifdef SIGXCPU else if(exit_flag == SIGXCPU) krb5_warnx(context, "%s CPU time limit exceeded", getprogname()); #endif else krb5_warnx(context, "%s unexpected exit reason: %ld", getprogname(), (long)exit_flag); write_master_down(context); return 0; }
int main(int argc, char **argv) { krb5_error_code ret, ret2; krb5_context context; krb5_auth_context auth_context; void *kadm_handle; kadm5_server_context *server_context; kadm5_config_params conf; int master_fd; krb5_ccache ccache; krb5_principal server; char **files; int optidx = 0; time_t reconnect_min; time_t backoff; time_t reconnect_max; time_t reconnect; time_t before = 0; int restarter_fd = -1; const char *master; setprogname(argv[0]); if (getarg(args, num_args, argc, argv, &optidx)) usage(1); if (help_flag) usage(0); if (version_flag) { print_version(NULL); exit(0); } if (detach_from_console && daemon_child == -1) roken_detach_prep(argc, argv, "--daemon-child"); rk_pidfile(NULL); ret = krb5_init_context(&context); if (ret) errx (1, "krb5_init_context failed: %d", ret); setup_signal(); if (config_file == NULL) { if (asprintf(&config_file, "%s/kdc.conf", hdb_db_dir(context)) == -1 || config_file == NULL) errx(1, "out of memory"); } ret = krb5_prepend_config_files_default(config_file, &files); if (ret) krb5_err(context, 1, ret, "getting configuration files"); ret = krb5_set_config_files(context, files); krb5_free_config_files(files); if (ret) krb5_err(context, 1, ret, "reading configuration files"); argc -= optidx; argv += optidx; if (argc != 1) usage(1); master = argv[0]; if (status_file == NULL) { if (asprintf(&status_file, "%s/ipropd-slave-status", hdb_db_dir(context)) < 0 || status_file == NULL) krb5_errx(context, 1, "can't allocate status file buffer"); } krb5_openlog(context, "ipropd-slave", &log_facility); krb5_set_warn_dest(context, log_facility); slave_status(context, status_file, "bootstrapping"); ret = krb5_kt_register(context, &hdb_get_kt_ops); if(ret) krb5_err(context, 1, ret, "krb5_kt_register"); time_before_lost = parse_time (server_time_lost, "s"); if (time_before_lost < 0) krb5_errx (context, 1, "couldn't parse time: %s", server_time_lost); slave_status(context, status_file, "getting credentials from keytab/database"); memset(&conf, 0, 
sizeof(conf)); if(realm) { conf.mask |= KADM5_CONFIG_REALM; conf.realm = realm; } ret = kadm5_init_with_password_ctx (context, KADM5_ADMIN_SERVICE, NULL, KADM5_ADMIN_SERVICE, &conf, 0, 0, &kadm_handle); if (ret) krb5_err (context, 1, ret, "kadm5_init_with_password_ctx"); server_context = (kadm5_server_context *)kadm_handle; slave_status(context, status_file, "creating log file"); ret = server_context->db->hdb_open(context, server_context->db, O_RDWR | O_CREAT, 0600); if (ret) krb5_err (context, 1, ret, "db->open"); ret = kadm5_log_init (server_context); if (ret) krb5_err (context, 1, ret, "kadm5_log_init"); ret = server_context->db->hdb_close (context, server_context->db); if (ret) krb5_err (context, 1, ret, "db->close"); get_creds(context, keytab_str, &ccache, master); ret = krb5_sname_to_principal (context, master, IPROP_NAME, KRB5_NT_SRV_HST, &server); if (ret) krb5_err (context, 1, ret, "krb5_sname_to_principal"); auth_context = NULL; master_fd = -1; krb5_appdefault_time(context, config_name, NULL, "reconnect-min", 10, &reconnect_min); krb5_appdefault_time(context, config_name, NULL, "reconnect-max", 300, &reconnect_max); krb5_appdefault_time(context, config_name, NULL, "reconnect-backoff", 10, &backoff); reconnect = reconnect_min; slave_status(context, status_file, "ipropd-slave started"); roken_detach_finish(NULL, daemon_child); restarter_fd = restarter(context, NULL); while (!exit_flag) { struct timeval to; time_t now, elapsed; fd_set readset; int connected = FALSE; #ifndef NO_LIMIT_FD_SETSIZE if (restarter_fd >= FD_SETSIZE) krb5_errx(context, IPROPD_RESTART, "fd too large"); #endif FD_ZERO(&readset); if (restarter_fd > -1) FD_SET(restarter_fd, &readset); now = time(NULL); elapsed = now - before; if (elapsed < reconnect) { time_t left = reconnect - elapsed; krb5_warnx(context, "sleeping %d seconds before " "retrying to connect", (int)left); to.tv_sec = left; to.tv_usec = 0; if (select(restarter_fd + 1, &readset, NULL, NULL, &to) == 1) { exit_flag = SIGTERM; 
continue; } } before = now; slave_status(context, status_file, "connecting to master: %s\n", master); master_fd = connect_to_master (context, master, port_str); if (master_fd < 0) goto retry; reconnect = reconnect_min; if (auth_context) { krb5_auth_con_free(context, auth_context); auth_context = NULL; krb5_cc_destroy(context, ccache); get_creds(context, keytab_str, &ccache, master); } if (verbose) krb5_warnx(context, "authenticating to master"); ret = krb5_sendauth (context, &auth_context, &master_fd, IPROP_VERSION, NULL, server, AP_OPTS_MUTUAL_REQUIRED, NULL, NULL, ccache, NULL, NULL, NULL); if (ret) { krb5_warn (context, ret, "krb5_sendauth"); goto retry; } krb5_warnx(context, "ipropd-slave started at version: %ld", (long)server_context->log_context.version); ret = ihave(context, auth_context, master_fd, server_context->log_context.version); if (ret) goto retry; connected = TRUE; if (verbose) krb5_warnx(context, "connected to master"); slave_status(context, status_file, "connected to master, waiting instructions"); while (connected && !exit_flag) { krb5_data out; krb5_storage *sp; uint32_t tmp; int max_fd; #ifndef NO_LIMIT_FD_SETSIZE if (master_fd >= FD_SETSIZE) krb5_errx(context, IPROPD_RESTART, "fd too large"); if (restarter_fd >= FD_SETSIZE) krb5_errx(context, IPROPD_RESTART, "fd too large"); max_fd = max(restarter_fd, master_fd); #endif FD_ZERO(&readset); FD_SET(master_fd, &readset); if (restarter_fd != -1) FD_SET(restarter_fd, &readset); to.tv_sec = time_before_lost; to.tv_usec = 0; ret = select (max_fd + 1, &readset, NULL, NULL, &to); if (ret < 0) { if (errno == EINTR) continue; else krb5_err (context, 1, errno, "select"); } if (ret == 0) { krb5_warnx(context, "server didn't send a message " "in %d seconds", time_before_lost); connected = FALSE; continue; } if (restarter_fd > -1 && FD_ISSET(restarter_fd, &readset)) { if (verbose) krb5_warnx(context, "slave restarter exited"); exit_flag = SIGTERM; } if (!FD_ISSET(master_fd, &readset)) continue; if (verbose) 
krb5_warnx(context, "message from master"); ret = krb5_read_priv_message(context, auth_context, &master_fd, &out); if (ret) { krb5_warn(context, ret, "krb5_read_priv_message"); connected = FALSE; continue; } sp = krb5_storage_from_mem (out.data, out.length); if (sp == NULL) krb5_err(context, IPROPD_RESTART, errno, "krb5_storage_from_mem"); ret = krb5_ret_uint32(sp, &tmp); if (ret == HEIM_ERR_EOF) { krb5_warn(context, ret, "master sent zero-length message"); connected = FALSE; continue; } if (ret != 0) { krb5_warn(context, ret, "couldn't read master's message"); connected = FALSE; continue; } ret = server_context->db->hdb_open(context, server_context->db, O_RDWR | O_CREAT, 0600); if (ret) krb5_err (context, 1, ret, "db->open while handling a " "message from the master"); ret = kadm5_log_init(server_context); if (ret) { krb5_err(context, IPROPD_RESTART, ret, "kadm5_log_init while " "handling a message from the master"); } ret = server_context->db->hdb_close (context, server_context->db); if (ret) krb5_err (context, 1, ret, "db->close while handling a " "message from the master"); switch (tmp) { case FOR_YOU : if (verbose) krb5_warnx(context, "master sent us diffs"); ret2 = receive(context, sp, server_context); if (ret2) krb5_warn(context, ret2, "receive from ipropd-master had errors"); ret = ihave(context, auth_context, master_fd, server_context->log_context.version); if (ret || ret2) connected = FALSE; /* * If it returns an error, receive() may nonetheless * have committed some entries successfully, so we must * update the slave_status even if there were errors. 
*/ is_up_to_date(context, status_file, server_context); break; case TELL_YOU_EVERYTHING : if (verbose) krb5_warnx(context, "master sent us a full dump"); ret = receive_everything(context, master_fd, server_context, auth_context); if (ret == 0) { ret = ihave(context, auth_context, master_fd, server_context->log_context.version); } if (ret) connected = FALSE; else is_up_to_date(context, status_file, server_context); break; case ARE_YOU_THERE : if (verbose) krb5_warnx(context, "master sent us a ping"); is_up_to_date(context, status_file, server_context); ret = ihave(context, auth_context, master_fd, server_context->log_context.version); if (ret) connected = FALSE; send_im_here(context, master_fd, auth_context); break; case YOU_HAVE_LAST_VERSION: if (verbose) krb5_warnx(context, "master tells us we are up to date"); is_up_to_date(context, status_file, server_context); break; case NOW_YOU_HAVE : case I_HAVE : case ONE_PRINC : case I_AM_HERE : default : krb5_warnx (context, "Ignoring command %d", tmp); break; } krb5_storage_free (sp); krb5_data_free (&out); } slave_status(context, status_file, "disconnected from master"); retry: if (connected == FALSE) krb5_warnx (context, "disconnected for server"); if (exit_flag) krb5_warnx (context, "got an exit signal"); if (master_fd >= 0) close(master_fd); reconnect += backoff; if (reconnect > reconnect_max) { slave_status(context, status_file, "disconnected from master for a long time"); reconnect = reconnect_max; } } if (status_file) { /* XXX It'd be better to leave it saying we're not here */ unlink(status_file); } if (0); #ifndef NO_SIGXCPU else if(exit_flag == SIGXCPU) krb5_warnx(context, "%s CPU time limit exceeded", getprogname()); #endif else if(exit_flag == SIGINT || exit_flag == SIGTERM) krb5_warnx(context, "%s terminated", getprogname()); else krb5_warnx(context, "%s unexpected exit reason: %ld", getprogname(), (long)exit_flag); return 0; }
static int runUpgrade_Half(NDBT_Context* ctx, NDBT_Step* step) { // Assuming 2 replicas AtrtClient atrt; const bool waitNode = ctx->getProperty("WaitNode", Uint32(0)) != 0; const bool event = ctx->getProperty("CreateDropEvent", Uint32(0)) != 0; const char * args = ""; if (ctx->getProperty("KeepFS", Uint32(0)) != 0) { args = "--initial=0"; } NodeSet mgmdNodeSet = (NodeSet) ctx->getProperty("MgmdNodeSet", Uint32(0)); NodeSet ndbdNodeSet = (NodeSet) ctx->getProperty("NdbdNodeSet", Uint32(0)); SqlResultSet clusters; if (!atrt.getClusters(clusters)) return NDBT_FAILED; while (clusters.next()) { uint clusterId= clusters.columnAsInt("id"); SqlResultSet tmp_result; if (!atrt.getConnectString(clusterId, tmp_result)) return NDBT_FAILED; NdbRestarter restarter(tmp_result.column("connectstring")); restarter.setReconnect(true); // Restarting mgmd g_err << "Cluster '" << clusters.column("name") << "@" << tmp_result.column("connectstring") << "'" << endl; if(restarter.waitClusterStarted()) return NDBT_FAILED; // Restart ndb_mgmd(s) SqlResultSet mgmds; if (!atrt.getMgmds(clusterId, mgmds)) return NDBT_FAILED; uint mgmdCount = mgmds.numRows(); uint restartCount = getNodeCount(mgmdNodeSet, mgmdCount); ndbout << "Restarting " << restartCount << " of " << mgmdCount << " mgmds" << endl; while (mgmds.next() && restartCount --) { ndbout << "Restart mgmd" << mgmds.columnAsInt("node_id") << endl; if (!atrt.changeVersion(mgmds.columnAsInt("id"), "")) return NDBT_FAILED; if(restarter.waitConnected()) return NDBT_FAILED; } NdbSleep_SecSleep(5); // TODO, handle arbitration // Restart one ndbd in each node group SqlResultSet ndbds; if (!atrt.getNdbds(clusterId, ndbds)) return NDBT_FAILED; Vector<NodeInfo> nodes; while (ndbds.next()) { struct NodeInfo n; n.nodeId = ndbds.columnAsInt("node_id"); n.processId = ndbds.columnAsInt("id"); n.nodeGroup = restarter.getNodeGroup(n.nodeId); nodes.push_back(n); } uint ndbdCount = ndbds.numRows(); restartCount = getNodeCount(ndbdNodeSet, ndbdCount); ndbout 
<< "Restarting " << restartCount << " of " << ndbdCount << " ndbds" << endl; int nodesarray[256]; int cnt= 0; Bitmask<4> seen_groups; Bitmask<4> restarted_nodes; for (Uint32 i = 0; (i<nodes.size() && restartCount); i++) { int nodeId = nodes[i].nodeId; int processId = nodes[i].processId; int nodeGroup= nodes[i].nodeGroup; if (seen_groups.get(nodeGroup)) { // One node in this node group already down continue; } seen_groups.set(nodeGroup); restarted_nodes.set(nodeId); ndbout << "Restart node " << nodeId << endl; if (!atrt.changeVersion(processId, args)) return NDBT_FAILED; if (waitNode) { restarter.waitNodesNoStart(&nodeId, 1); } nodesarray[cnt++]= nodeId; restartCount--; } if (!waitNode) { if (restarter.waitNodesNoStart(nodesarray, cnt)) return NDBT_FAILED; } ndbout << "Starting and wait for started..." << endl; if (restarter.startAll()) return NDBT_FAILED; if (restarter.waitClusterStarted()) return NDBT_FAILED; if (event && createDropEvent(ctx, step)) { return NDBT_FAILED; } ndbout << "Half started" << endl; if (ctx->getProperty("HalfStartedHold", (Uint32)0) != 0) { while (ctx->getProperty("HalfStartedHold", (Uint32)0) != 0) { ndbout << "Half started holding..." << endl; ctx->setProperty("HalfStartedDone", (Uint32)1); NdbSleep_SecSleep(30); } ndbout << "Got half started continue..." << endl; } // Restart the remaining nodes cnt= 0; for (Uint32 i = 0; (i<nodes.size() && restartCount); i++) { int nodeId = nodes[i].nodeId; int processId = nodes[i].processId; if (restarted_nodes.get(nodeId)) continue; ndbout << "Restart node " << nodeId << endl; if (!atrt.changeVersion(processId, args)) return NDBT_FAILED; if (waitNode) { restarter.waitNodesNoStart(&nodeId, 1); } nodesarray[cnt++]= nodeId; restartCount --; } if (!waitNode) { if (restarter.waitNodesNoStart(nodesarray, cnt)) return NDBT_FAILED; } ndbout << "Starting and wait for started..." 
<< endl; if (restarter.startAll()) return NDBT_FAILED; if (restarter.waitClusterStarted()) return NDBT_FAILED; if (event && createDropEvent(ctx, step)) { return NDBT_FAILED; } } return NDBT_OK; }
int runUpgrade_NR1(NDBT_Context* ctx, NDBT_Step* step) { AtrtClient atrt; NodeSet mgmdNodeSet = (NodeSet) ctx->getProperty("MgmdNodeSet", Uint32(0)); NodeSet ndbdNodeSet = (NodeSet) ctx->getProperty("NdbdNodeSet", Uint32(0)); SqlResultSet clusters; if (!atrt.getClusters(clusters)) return NDBT_FAILED; while (clusters.next()) { uint clusterId= clusters.columnAsInt("id"); SqlResultSet tmp_result; if (!atrt.getConnectString(clusterId, tmp_result)) return NDBT_FAILED; NdbRestarter restarter(tmp_result.column("connectstring")); restarter.setReconnect(true); // Restarting mgmd g_err << "Cluster '" << clusters.column("name") << "@" << tmp_result.column("connectstring") << "'" << endl; if (restarter.waitClusterStarted()) return NDBT_FAILED; // Restart ndb_mgmd(s) SqlResultSet mgmds; if (!atrt.getMgmds(clusterId, mgmds)) return NDBT_FAILED; uint mgmdCount = mgmds.numRows(); uint restartCount = getNodeCount(mgmdNodeSet, mgmdCount); ndbout << "Restarting " << restartCount << " of " << mgmdCount << " mgmds" << endl; while (mgmds.next() && restartCount --) { ndbout << "Restart mgmd " << mgmds.columnAsInt("node_id") << endl; if (!atrt.changeVersion(mgmds.columnAsInt("id"), "")) return NDBT_FAILED; if (restarter.waitConnected()) return NDBT_FAILED; ndbout << "Connected to mgmd"<< endl; } ndbout << "Waiting for started"<< endl; if (restarter.waitClusterStarted()) return NDBT_FAILED; ndbout << "Started"<< endl; // Restart ndbd(s) SqlResultSet ndbds; if (!atrt.getNdbds(clusterId, ndbds)) return NDBT_FAILED; uint ndbdCount = ndbds.numRows(); restartCount = getNodeCount(ndbdNodeSet, ndbdCount); ndbout << "Restarting " << restartCount << " of " << ndbdCount << " ndbds" << endl; while(ndbds.next() && restartCount --) { int nodeId = ndbds.columnAsInt("node_id"); int processId = ndbds.columnAsInt("id"); ndbout << "Restart node " << nodeId << endl; if (!atrt.changeVersion(processId, "")) return NDBT_FAILED; if (restarter.waitNodesNoStart(&nodeId, 1)) return NDBT_FAILED; if 
(restarter.startNodes(&nodeId, 1)) return NDBT_FAILED; if (restarter.waitNodesStarted(&nodeId, 1)) return NDBT_FAILED; if (createDropEvent(ctx, step)) return NDBT_FAILED; } } ctx->stopTest(); return NDBT_OK; }
static int runUpgrade_SR(NDBT_Context* ctx, NDBT_Step* step) { /* System restart upgrade. * Stop all data nodes * Change versions * Restart em together. */ AtrtClient atrt; NodeSet mgmdNodeSet = All; const char * args = ""; bool skipMgmds = (ctx->getProperty("SkipMgmds", Uint32(0)) != 0); SqlResultSet clusters; if (!atrt.getClusters(clusters)) return NDBT_FAILED; while (clusters.next()) { uint clusterId= clusters.columnAsInt("id"); SqlResultSet tmp_result; if (!atrt.getConnectString(clusterId, tmp_result)) return NDBT_FAILED; NdbRestarter restarter(tmp_result.column("connectstring")); restarter.setReconnect(true); // Restarting mgmd g_err << "Cluster '" << clusters.column("name") << "@" << tmp_result.column("connectstring") << "'" << endl; if(restarter.waitClusterStarted()) return NDBT_FAILED; /* Now restart to nostart state, prior to SR */ g_err << "Restarting all data nodes-nostart" << endl; if (restarter.restartAll2(NdbRestarter::NRRF_NOSTART) != 0) { g_err << "Failed to restart all" << endl; return NDBT_FAILED; } ndbout << "Waiting for no-start state" << endl; if (restarter.waitClusterNoStart() != 0) { g_err << "Failed waiting for NoStart state" << endl; return NDBT_FAILED; } // Restart ndb_mgmd(s) SqlResultSet mgmds; if (!atrt.getMgmds(clusterId, mgmds)) return NDBT_FAILED; uint mgmdCount = mgmds.numRows(); uint restartCount = getNodeCount(mgmdNodeSet, mgmdCount); if (!skipMgmds) { ndbout << "Restarting " << restartCount << " of " << mgmdCount << " mgmds" << endl; while (mgmds.next() && restartCount --) { ndbout << "Restart mgmd" << mgmds.columnAsInt("node_id") << endl; if (!atrt.changeVersion(mgmds.columnAsInt("id"), "")) return NDBT_FAILED; if(restarter.waitConnected()) return NDBT_FAILED; } NdbSleep_SecSleep(5); // TODO, handle arbitration } else { ndbout << "Skipping MGMD upgrade" << endl; } // Restart all ndbds SqlResultSet ndbds; if (!atrt.getNdbds(clusterId, ndbds)) return NDBT_FAILED; uint ndbdCount = ndbds.numRows(); restartCount = ndbdCount; ndbout 
<< "Upgrading " << restartCount << " of " << ndbdCount << " ndbds" << endl; while (ndbds.next()) { uint nodeId = ndbds.columnAsInt("node_id"); uint processId = ndbds.columnAsInt("id"); ndbout << "Upgrading node " << nodeId << endl; if (!atrt.changeVersion(processId, args)) return NDBT_FAILED; } ndbout << "Waiting for no-start state" << endl; if (restarter.waitClusterNoStart() != 0) { g_err << "Failed waiting for NoStart state" << endl; return NDBT_FAILED; } ndbout << "Starting cluster (SR)" << endl; if (restarter.restartAll2(0) != 0) { g_err << "Error restarting all nodes" << endl; return NDBT_FAILED; } ndbout << "Waiting for cluster to start" << endl; if (restarter.waitClusterStarted() != 0) { g_err << "Failed waiting for Cluster start" << endl; return NDBT_FAILED; } ndbout << "Cluster started." << endl; } return NDBT_OK; }
int main(int argc, const char** argv){ ndb_init(); const char* _hostName = NULL; int _loops = 10; int _wait = 15; int _help = 0; #if 0 int _crash = 0; int _abort = 0; #endif struct getargs args[] = { { "seconds", 's', arg_integer, &_wait, "Seconds to wait between each restart(0=random)", "secs" }, { "loops", 'l', arg_integer, &_loops, "Number of loops", "loops 0=forever"}, #if 0 // Not yet! { "abort", 'a', arg_flag, &_abort, "Restart abort"}, { "crash", 'c', arg_flag, &_crash, "Crash instead of restart"}, #endif { "usage", '?', arg_flag, &_help, "Print help", "" } }; int num_args = sizeof(args) / sizeof(args[0]); int optind = 0; char desc[] = "hostname:port\n"\ "This program will connect to the mgmsrv of a NDB cluster.\n"\ "It will wait for all nodes to be started, then restart all nodes\n"\ "into nostart state. Then after a random delay it will tell all nodes\n"\ "to start. It will do this loop number of times\n"; if(getarg(args, num_args, argc, argv, &optind) || _help) { arg_printusage(args, num_args, argv[0], desc); return NDBT_ProgramExit(NDBT_WRONGARGS); } _hostName = argv[optind]; NdbRestarter restarter(_hostName); #if 0 if(_abort && _crash){ g_err << "You can't specify both abort and crash" << endl; arg_printusage(args, num_args, argv[0], desc); return NDBT_ProgramExit(NDBT_WRONGARGS); } if(_abort){ restarter.setRestartType(NdbRestarter::AbortRestart); } if(_crash){ restarter.setRestartType(NdbRestarter::Crash); } #endif int l = 0; while (_loops == 0 || l<_loops){ g_info << "Waiting for cluster to start" << endl; while(restarter.waitClusterStarted(120) != 0){ g_warning << "Ndb failed to start in 2 minutes" << endl; } int seconds = _wait; if(seconds==0) seconds = (rand() % 120) + 1; // Create random value max 120 secs g_info << "Waiting for "<<seconds<<" secs" << endl; NdbSleep_SecSleep(seconds); g_info << l << ": restarting all nodes with nostart" << endl; const bool b = (restarter.restartAll(false, true, false) == 0); assert(b); g_info << "Waiting for 
cluster to enter nostart" << endl; while(restarter.waitClusterNoStart(120) != 0){ g_warning << "Ndb failed to enter no start in 2 minutes" << endl; } seconds = _wait; if(seconds==0) seconds = (rand() % 120) + 1; // Create random value max 120 secs g_info << "Waiting for " <<seconds<<" secs" << endl; NdbSleep_SecSleep(seconds); g_info << l << ": Telling all nodes to start" << endl; const bool b2 = (restarter.startAll() == 0); assert(b2); l++; } return NDBT_ProgramExit(NDBT_OK); }