/* called when the startup event script finishes */ static void ctdb_startup_callback(struct ctdb_context *ctdb, int status, void *p) { if (status != 0) { DEBUG(DEBUG_ERR,("startup event failed\n")); } else if (status == 0) { DEBUG(DEBUG_NOTICE,("startup event OK - enabling monitoring\n")); ctdb->done_startup = true; ctdb->monitor->next_interval = 5; ctdb_run_notification_script(ctdb, "startup"); } event_add_timed(ctdb->ev, ctdb->monitor->monitor_context, timeval_current_ofs(ctdb->monitor->next_interval, 0), ctdb_check_health, ctdb); }
/* called when the startup event script finishes */ static void ctdb_startup_callback(struct ctdb_context *ctdb, int status, void *p) { if (status != 0) { DEBUG(DEBUG_ERR,("startup event failed\n")); tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context, timeval_current_ofs(5, 0), ctdb_run_startup, ctdb); return; } DEBUG(DEBUG_NOTICE,("startup event OK - enabling monitoring\n")); ctdb_set_runstate(ctdb, CTDB_RUNSTATE_RUNNING); ctdb->monitor->next_interval = 2; ctdb_run_notification_script(ctdb, "startup"); ctdb->monitor->monitoring_mode = CTDB_MONITORING_ACTIVE; tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context, timeval_current_ofs(ctdb->monitor->next_interval, 0), ctdb_check_health, ctdb); }
/* called when a health monitoring event script finishes */ static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p) { struct ctdb_node *node = ctdb->nodes[ctdb->pnn]; TDB_DATA data; struct ctdb_node_flag_change c; uint32_t next_interval; int ret; TDB_DATA rddata; struct ctdb_srvid_message rd; const char *state_str = NULL; c.pnn = ctdb->pnn; c.old_flags = node->flags; rd.pnn = ctdb->pnn; rd.srvid = CTDB_SRVID_TAKEOVER_RUN_RESPONSE; rddata.dptr = (uint8_t *)&rd; rddata.dsize = sizeof(rd); if (status == -ECANCELED) { DEBUG(DEBUG_ERR,("Monitoring event was cancelled\n")); goto after_change_status; } if (status == -ETIME) { ctdb->monitor->event_script_timeouts++; if (ctdb->monitor->event_script_timeouts >= ctdb->tunable.monitor_timeout_count) { DEBUG(DEBUG_ERR, ("Maximum monitor timeout count %u reached." " Making node unhealthy\n", ctdb->tunable.monitor_timeout_count)); } else { /* We pretend this is OK. */ goto after_change_status; } } else { ctdb->monitor->event_script_timeouts = 0; } if (status != 0 && !(node->flags & NODE_FLAGS_UNHEALTHY)) { DEBUG(DEBUG_NOTICE,("monitor event failed - disabling node\n")); node->flags |= NODE_FLAGS_UNHEALTHY; ctdb->monitor->next_interval = 5; ctdb_run_notification_script(ctdb, "unhealthy"); } else if (status == 0 && (node->flags & NODE_FLAGS_UNHEALTHY)) { DEBUG(DEBUG_NOTICE,("monitor event OK - node re-enabled\n")); node->flags &= ~NODE_FLAGS_UNHEALTHY; ctdb->monitor->next_interval = 5; ctdb_run_notification_script(ctdb, "healthy"); } after_change_status: next_interval = ctdb->monitor->next_interval; ctdb->monitor->next_interval *= 2; if (ctdb->monitor->next_interval > ctdb->tunable.monitor_interval) { ctdb->monitor->next_interval = ctdb->tunable.monitor_interval; } tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context, timeval_current_ofs(next_interval, 0), ctdb_check_health, ctdb); if (c.old_flags == node->flags) { return; } c.new_flags = node->flags; data.dptr = (uint8_t *)&c; data.dsize = sizeof(c); /* ask the recovery daemon to push these changes out to all nodes */ ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_PUSH_NODE_FLAGS, data); if (c.new_flags & NODE_FLAGS_UNHEALTHY) { state_str = "UNHEALTHY"; } else { state_str = "HEALTHY"; } /* ask the recmaster to reallocate all addresses */ DEBUG(DEBUG_ERR, ("Node became %s. Ask recovery master to reallocate IPs\n", state_str)); ret = ctdb_daemon_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_TAKEOVER_RUN, rddata); if (ret != 0) { DEBUG(DEBUG_ERR, (__location__ " Failed to send IP takeover run request\n")); } }