/*
 * Bind a sysevent handle (with fmd_dr_event as its callback) and subscribe it
 * to the ESC_DR_AP_STATE_CHANGE subclass of the EC_DR event class.  The handle
 * is stored in fmd.d_dr_hdl.  Nothing is done unless our effective uid is 0.
 */
void
fmd_dr_init(void)
{
	const char *dr_subclass = ESC_DR_AP_STATE_CHANGE;

	/* legacy sysevent mechanism is still root-only */
	if (geteuid() == 0) {
		fmd.d_dr_hdl = sysevent_bind_handle(fmd_dr_event);

		if (fmd.d_dr_hdl == NULL) {
			fmd_error(EFMD_EXIT,
			    "failed to bind handle for DR sysevent");
		}

		if (sysevent_subscribe_event(fmd.d_dr_hdl, EC_DR,
		    &dr_subclass, 1) == -1) {
			fmd_error(EFMD_EXIT,
			    "failed to subscribe for DR sysevent");
		}
	}
}
/*
 * Restore the checkpoint of module 'mp', unless the module's ms_ckpt_restore
 * statistic is FMD_B_FALSE, in which case nothing is done.  A checkpoint that
 * cannot be opened is reported via fmd_error() unless the reason is ENOENT.
 * Errors that occur during the restore itself call fmd_ckpt_error() or
 * trigger an fmd_api_error(); either one unlocks the module and aborts.
 * On success the checkpoint is destroyed and the module's dirty state cleared.
 */
void
fmd_ckpt_restore(fmd_module_t *mp)
{
	fmd_ckpt_t ckpt;

	/* never restore checkpoints for this module */
	if (mp->mod_stats->ms_ckpt_restore.fmds_value.b == FMD_B_FALSE)
		return;

	TRACE((FMD_DBG_CKPT, "ckpt restore begin %s", mp->mod_name));

	if (fmd_ckpt_open(&ckpt, mp) != -1) {
		/* the restore path expects the module to be unlocked */
		ASSERT(!fmd_module_locked(mp));

		fmd_ckpt_restore_module(&ckpt, mp);
		fmd_ckpt_destroy(&ckpt);
		fmd_module_clrdirty(mp);

		TRACE((FMD_DBG_CKPT, "ckpt restore end %s", mp->mod_name));
		fmd_dprintf(FMD_DBG_CKPT, "restored checkpoint of %s\n",
		    mp->mod_name);
		return;
	}

	/* a missing checkpoint file is not worth reporting */
	if (errno != ENOENT)
		fmd_error(EFMD_CKPT_OPEN, "can't open %s", ckpt.ckp_src);

	TRACE((FMD_DBG_CKPT, "ckpt restore end %s", mp->mod_name));
}
/*
 * Called when checkpoint restore has failed: rename the module's checkpoint
 * file (<mod_ckpt>/<mod_name>) to the same path with a trailing '-'.  The
 * file is kept rather than removed in the hope that someone will send it to
 * us for post-mortem analysis of a possible checkpoint bug.  A rename that
 * fails for any reason other than ENOENT is reported through fmd_error().
 */
void
fmd_ckpt_rename(fmd_module_t *mp)
{
	char from[PATH_MAX];
	char to[PATH_MAX];

	(void) snprintf(from, sizeof (from), "%s/%s",
	    mp->mod_ckpt, mp->mod_name);
	(void) snprintf(to, sizeof (to), "%s-", from);

	TRACE((FMD_DBG_CKPT, "rename %s ckpt", mp->mod_name));

	/* success, or nothing there to move aside: we are done */
	if (rename(from, to) == 0 || errno == ENOENT)
		return;

	fmd_error(EFMD_CKPT_DELETE, "failed to rename %s", from);
}
/*
 * Remove the module's checkpoint file (<mod_ckpt>/<mod_name>).  The ckpt.zero
 * property code and the fmadm reset RPC service path use this to force a
 * checkpoint delete.  An unlink that fails for any reason other than ENOENT
 * is reported through fmd_error().
 */
void
fmd_ckpt_delete(fmd_module_t *mp)
{
	char ckpt_file[PATH_MAX];

	(void) snprintf(ckpt_file, sizeof (ckpt_file), "%s/%s",
	    mp->mod_ckpt, mp->mod_name);

	TRACE((FMD_DBG_CKPT, "delete %s ckpt", mp->mod_name));

	/* success, or no file to begin with: we are done */
	if (unlink(ckpt_file) == 0 || errno == ENOENT)
		return;

	fmd_error(EFMD_CKPT_DELETE, "failed to delete %s", ckpt_file);
}
void fmd_rpc_init(void) { int err, prog, mode = RPC_SVC_MT_USER; uint64_t sndsize = 0, rcvsize = 0; const char *s; if (rpc_control(RPC_SVC_MTMODE_SET, &mode) == FALSE) fmd_panic("failed to enable user-MT rpc mode"); (void) fmd_conf_getprop(fmd.d_conf, "rpc.sndsize", &sndsize); (void) fmd_conf_getprop(fmd.d_conf, "rpc.rcvsize", &rcvsize); /* * Infer whether we are the "default" fault manager or an alternate one * based on whether the initial setting of rpc.adm.prog is non-zero. */ (void) fmd_conf_getprop(fmd.d_conf, "rpc.adm.prog", &prog); (void) fmd_conf_getprop(fmd.d_conf, "rpc.adm.path", &s); if (prog != 0) { err = fmd_rpc_svc_init(fmd_adm_1, "FMD_ADM", s, "rpc.adm.prog", FMD_ADM, FMD_ADM, FMD_ADM_VERSION_1, (uint_t)sndsize, (uint_t)rcvsize, TRUE); } else { err = fmd_rpc_svc_init(fmd_adm_1, "FMD_ADM", s, "rpc.adm.prog", RPC_TRANS_MIN, RPC_TRANS_MAX, FMD_ADM_VERSION_1, (uint_t)sndsize, (uint_t)rcvsize, FALSE); } if (err != 0) fmd_error(EFMD_EXIT, "failed to create rpc server bindings"); if (fmd_thread_create(fmd.d_rmod, (fmd_thread_f *)svc_run, 0) == NULL) fmd_error(EFMD_EXIT, "failed to create rpc server thread"); }
/*
 * Tear down the fmd_thread_t 'tp'.  When 'flag' is FMD_THREAD_JOIN and tp is
 * not the calling thread, the thread is joined first; a failed join is
 * reported through fmd_error() but does not stop the teardown.  The structure
 * is then removed from fmd.d_thr_list (under fmd.d_thr_lock), its trace data
 * is destroyed, and the structure itself is freed.
 */
void
fmd_thread_destroy(fmd_thread_t *tp, int flag)
{
	/* only join when asked to, and never try to join ourselves */
	if (flag == FMD_THREAD_JOIN && tp->thr_tid != pthread_self()) {
		if (pthread_join(tp->thr_tid, NULL) != 0) {
			fmd_error(EFMD_MOD_JOIN,
			    "failed to join thread for module "
			    "%s (tid %u)\n",
			    tp->thr_mod->mod_name, tp->thr_tid);
		}
	}

	(void) pthread_mutex_lock(&fmd.d_thr_lock);
	fmd_list_delete(&fmd.d_thr_list, tp);
	(void) pthread_mutex_unlock(&fmd.d_thr_lock);

	fmd_trace_destroy(tp->thr_trdata);
	fmd_free(tp, sizeof (fmd_thread_t));
}
/*
 * Second-stage daemon start-up, run after fmd_create() so that properties may
 * still be modified in between (see the comment on subsystem initialization
 * below).  In order, this function:
 *
 *  - caches the debug output settings in 'dp';
 *  - opens the topology library and creates the clock, id spaces, timer
 *    queue, dispatch queue, case hash and the root module's event queue;
 *  - installs fmd_signal() as the handler for the client.thrsig signal;
 *  - creates the scheme and ASRU hashes and opens the error and fault logs;
 *  - creates the control event used as a global module barrier;
 *  - loads the built-in modules, starts the RPC service, marks the daemon
 *    running and, if pfd >= 0, writes an exit status to pfd;
 *  - refreshes the ASRU cache, loads plugin and agent modules, replays the
 *    ASRU cache and error log, and releases the module barrier;
 *  - resumes all transports, calls fmd_gc() and sets d_booted.
 *
 * Parameters:
 *	dp	daemon state to finish initializing
 *	pfd	file descriptor to which the start-up status is written, or a
 *		negative value if there is none
 *
 * NOTE(review): the results of the fmd_conf_getprop() calls are discarded and
 * the outputs 'dbout', 'name' and 'pap' have no initializers; this presumably
 * relies on every property named here always being defined -- TODO confirm.
 */
void
fmd_run(fmd_t *dp, int pfd)
{
	char *nodc_key[] = { FMD_FLT_NODC, NULL };
	char nodc_str[128];
	struct sigaction act;

	int status = FMD_EXIT_SUCCESS;
	const char *name;
	fmd_conf_path_t *pap;
	fmd_event_t *e;
	int dbout, err;

	/*
	 * Cache the current debug property settings in d_fmd_dbout,
	 * d_hdl_debug, and d_hdl_dbout (d_fmd_debug is only read here).  If a
	 * given debug mask is non-zero and the corresponding dbout mask is
	 * zero, set dbout to a sensible default value based on whether we are
	 * running in the foreground (d_fg).
	 */
	(void) fmd_conf_getprop(dp->d_conf, "dbout", &dbout);

	if (dp->d_fmd_debug != 0 && dbout == 0)
		dp->d_fmd_dbout = dp->d_fg? FMD_DBOUT_STDERR : FMD_DBOUT_SYSLOG;
	else
		dp->d_fmd_dbout = dbout;

	(void) fmd_conf_getprop(dp->d_conf, "client.debug", &dp->d_hdl_debug);
	(void) fmd_conf_getprop(dp->d_conf, "client.dbout", &dbout);

	if (dp->d_hdl_debug != 0 && dbout == 0)
		dp->d_hdl_dbout = dp->d_fg? FMD_DBOUT_STDERR : FMD_DBOUT_SYSLOG;
	else
		dp->d_hdl_dbout = dbout;

	/*
	 * Initialize remaining major program data structures such as the
	 * clock, dispatch queues, log files, module hash collections, etc.
	 * This work is done here rather than in fmd_create() to permit the -o
	 * command-line option to modify properties after fmd_create() is done.
	 */

	/* an unset or empty root directory is passed to topo_open() as NULL */
	name = dp->d_rootdir != NULL &&
	    *dp->d_rootdir != '\0' ? dp->d_rootdir : NULL;

	if ((dp->d_topo = topo_open(TOPO_VERSION, name, &err)) == NULL) {
		fmd_error(EFMD_EXIT, "failed to initialize "
		    "topology library: %s\n", topo_strerror(err));
	}

	dp->d_clockptr = dp->d_clockops->fto_init();
	dp->d_xprt_ids = fmd_idspace_create("xprt_ids", 1, INT_MAX);

	/* transports stay suspended until fmd_xprt_resume_all() at the end */
	fmd_xprt_suspend_all();

	(void) door_server_create(fmd_door);
	fmd_dr_init();

	dp->d_rmod->mod_timerids = fmd_idspace_create(dp->d_pname, 1, 16);
	dp->d_timers = fmd_timerq_create();
	dp->d_disp = fmd_dispq_create();
	dp->d_cases = fmd_case_hash_create();

	/*
	 * The root module's mod_queue is created with limit zero, making it
	 * act like /dev/null; anything inserted here is simply ignored.
	 */
	dp->d_rmod->mod_queue = fmd_eventq_create(dp->d_rmod,
	    &dp->d_rmod->mod_stats->ms_evqstat, &dp->d_rmod->mod_stats_lock, 0);

	/*
	 * Once our subsystems that use signals have been set up, install the
	 * signal handler for the fmd_thr_signal() API.  Verify that the signal
	 * being used for this purpose doesn't conflict with something else.
	 */
	(void) fmd_conf_getprop(dp->d_conf, "client.thrsig", &dp->d_thr_sig);

	/* query the current disposition first; this also validates the signo */
	if (sigaction(dp->d_thr_sig, NULL, &act) != 0) {
		fmd_error(EFMD_EXIT, "invalid signal selected for "
		    "client.thrsig property: %d\n", dp->d_thr_sig);
	}

	/* any handler other than ignore/default means the signal is taken */
	if (act.sa_handler != SIG_IGN && act.sa_handler != SIG_DFL) {
		fmd_error(EFMD_EXIT, "signal selected for client.thrsig "
		    "property is already in use: %d\n", dp->d_thr_sig);
	}

	act.sa_handler = fmd_signal;
	act.sa_flags = 0;

	(void) sigemptyset(&act.sa_mask);
	(void) sigaction(dp->d_thr_sig, &act, NULL);

	(void) fmd_conf_getprop(dp->d_conf, "schemedir", &name);
	dp->d_schemes = fmd_scheme_hash_create(dp->d_rootdir, name);

	(void) fmd_conf_getprop(dp->d_conf, "log.rsrc", &name);
	dp->d_asrus = fmd_asru_hash_create(dp->d_rootdir, name);

	(void) fmd_conf_getprop(dp->d_conf, "log.error", &name);
	dp->d_errlog = fmd_log_open(dp->d_rootdir, name, FMD_LOG_ERROR);

	(void) fmd_conf_getprop(dp->d_conf, "log.fault", &name);
	dp->d_fltlog = fmd_log_open(dp->d_rootdir, name, FMD_LOG_FAULT);

	if (dp->d_asrus == NULL || dp->d_errlog == NULL || dp->d_fltlog == NULL)
		fmd_error(EFMD_EXIT, "failed to initialize log files\n");

	/*
	 * Before loading modules, create an empty control event which will act
	 * as a global barrier for module event processing.  Each module we
	 * load successfully will insert it at the head of its event queue,
	 * and then pause inside of fmd_ctl_rele() after dequeuing the event.
	 * This module barrier is required for two reasons:
	 *
	 * (a) During module loading, the restoration of case checkpoints may
	 *	result in a list.* event being recreated for which the intended
	 *	subscriber has not yet loaded depending on the load order.  Such
	 *	events could then result in spurious "no subscriber" errors.
	 *
	 * (b) During errlog replay, a sequence of errors from a long time ago
	 *	may be replayed, and the module may attempt to install relative
	 *	timers associated with one or more of these events.  If errlog
	 *	replay were "racing" with active module threads, an event E1
	 *	that resulted in a relative timer T at time E1 + N nsec could
	 *	fire prior to an event E2 being enqueued, even if the relative
	 *	time ordering was E1 < E2 < E1 + N, causing mis-diagnosis.
	 */
	dp->d_mod_event = e = fmd_event_create(FMD_EVT_CTL,
	    FMD_HRT_NOW, NULL, fmd_ctl_init(NULL));

	/* this hold is dropped by the fmd_event_rele(e) after log replay */
	fmd_event_hold(e);

	/*
	 * Once all data structures are initialized, we load all of our modules
	 * in order according to class in order to load up any subscriptions.
	 * Once built-in modules are loaded, we detach from our waiting parent.
	 */
	dp->d_mod_hash = fmd_modhash_create();

	/* a built-in load failure is only treated as fatal when !d_fg */
	if (fmd_builtin_loadall(dp->d_mod_hash) != 0 && !dp->d_fg)
		fmd_error(EFMD_EXIT, "failed to initialize fault manager\n");

	(void) fmd_conf_getprop(dp->d_conf, "self.name", &name);
	dp->d_self = fmd_modhash_lookup(dp->d_mod_hash, name);

	/* if the FMD_FLT_NODC key maps to a code, publish it as "nodiagcode" */
	if (dp->d_self != NULL && fmd_module_dc_key2code(dp->d_self,
	    nodc_key, nodc_str, sizeof (nodc_str)) == 0)
		(void) fmd_conf_setprop(dp->d_conf, "nodiagcode", nodc_str);

	fmd_rpc_init();
	dp->d_running = 1; /* we are now officially an active fmd */

	/*
	 * Now that we're running, if a pipe fd was specified, write an exit
	 * status to it to indicate that our parent process can safely detach.
	 * Then proceed to loading the remaining non-built-in modules.
	 */
	if (pfd >= 0)
		(void) write(pfd, &status, sizeof (status));

	/*
	 * Before loading all modules, repopulate the ASRU cache from its
	 * persistent repository on disk.  Then during module loading, the
	 * restoration of checkpoint files will reparent any active cases.
	 */
	fmd_asru_hash_refresh(dp->d_asrus);

	(void) fmd_conf_getprop(dp->d_conf, "plugin.path", &pap);
	fmd_modhash_loadall(dp->d_mod_hash, pap, &fmd_rtld_ops, ".so");

	(void) fmd_conf_getprop(dp->d_conf, "agent.path", &pap);
	fmd_modhash_loadall(dp->d_mod_hash, pap, &fmd_proc_ops, NULL);

	/*
	 * With all modules loaded, replay fault events from the ASRU cache for
	 * any ASRUs that must be retired, replay error events from the errlog
	 * that did not finish processing the last time fmd ran, and then
	 * release the global module barrier by executing a final rele on
	 * d_mod_event.
	 */
	fmd_asru_hash_replay(dp->d_asrus);

	(void) pthread_rwlock_rdlock(&dp->d_log_lock);
	fmd_log_replay(dp->d_errlog, (fmd_log_f *)fmd_err_replay, dp);
	fmd_log_update(dp->d_errlog);
	(void) pthread_rwlock_unlock(&dp->d_log_lock);

	dp->d_mod_event = NULL;
	fmd_event_rele(e);

	/*
	 * Finally, awaken any threads associated with receiving events from
	 * open transports and tell them to proceed with fmd_xprt_recv().
	 */
	fmd_xprt_resume_all();
	fmd_gc(dp, 0, 0);

	dp->d_booted = 1;
}
/*
 * First-stage daemon initialization.  Gathers platform identification into
 * the _fmd_* globals, zeroes 'dp', and then fills in the process identity,
 * locks, configuration, nvlist allocator, root module and statistics.
 *
 * Parameters:
 *	dp	daemon state structure; zeroed with bzero() and then populated
 *	arg0	program path; its basename becomes d_pname
 *	root	root directory override, or NULL to take "rootdir" from the
 *		configuration
 *	conf	passed as the first argument to fmd_conf_open()
 *
 * Failures to create the pthread key, open the configuration, attach the main
 * thread key or insert the statistics are reported via fmd_error(EFMD_EXIT).
 */
void
fmd_create(fmd_t *dp, const char *arg0, const char *root, const char *conf)
{
	fmd_conf_path_t *pap;
	char file[PATH_MAX];
	const char *name;
	fmd_stat_t *sp;
	int i;

	smbios_hdl_t *shp;
	smbios_system_t s1;
	smbios_info_t s2;
	id_t id;

	di_prom_handle_t promh = DI_PROM_HANDLE_NIL;
	di_node_t rooth = DI_NODE_NIL;
	char *bufp;

	(void) sysinfo(SI_PLATFORM, _fmd_plat, sizeof (_fmd_plat));
	(void) sysinfo(SI_ARCHITECTURE, _fmd_isa, sizeof (_fmd_isa));
	(void) uname(&_fmd_uts);

	/*
	 * Take the product name and serial number from SMBIOS when it can be
	 * opened; otherwise fall back to the "chassis-sn" PROM property, which
	 * supplies the serial number only.
	 */
	if ((shp = smbios_open(NULL, SMB_VERSION, 0, NULL)) != NULL) {
		if ((id = smbios_info_system(shp, &s1)) != SMB_ERR &&
		    smbios_info_common(shp, id, &s2) != SMB_ERR) {
			(void) strlcpy(_fmd_prod, s2.smbi_product, MAXNAMELEN);
			(void) strlcpy(_fmd_csn, s2.smbi_serial, MAXNAMELEN);
		}
		smbios_close(shp);
	} else if ((rooth = di_init("/", DINFOPROP)) != DI_NODE_NIL &&
	    (promh = di_prom_init()) != DI_PROM_HANDLE_NIL) {
		if (di_prom_prop_lookup_bytes(promh, rooth, "chassis-sn",
		    (unsigned char **)&bufp) != -1) {
			(void) strlcpy(_fmd_csn, bufp, MAXNAMELEN);
		}
	}

	/*
	 * Release whichever devinfo handles were obtained above; rooth may be
	 * valid even when di_prom_init() failed.
	 */
	if (promh != DI_PROM_HANDLE_NIL)
		di_prom_fini(promh);
	if (rooth != DI_NODE_NIL)
		di_fini(rooth);

	/* everything below starts from an all-zero fmd_t */
	bzero(dp, sizeof (fmd_t));

	dp->d_version = _fmd_version;
	dp->d_pname = fmd_strbasename(arg0);
	dp->d_pid = getpid();

	if (pthread_key_create(&dp->d_key, NULL) != 0)
		fmd_error(EFMD_EXIT, "failed to create pthread key");

	(void) pthread_mutex_init(&dp->d_xprt_lock, NULL);
	(void) pthread_mutex_init(&dp->d_err_lock, NULL);
	(void) pthread_mutex_init(&dp->d_thr_lock, NULL);
	(void) pthread_mutex_init(&dp->d_mod_lock, NULL);
	(void) pthread_mutex_init(&dp->d_stats_lock, NULL);
	(void) pthread_rwlock_init(&dp->d_log_lock, NULL);

	/*
	 * A small number of properties must be set manually before we open
	 * the root configuration file.  These include any settings for our
	 * memory allocator and path expansion token values, because these
	 * values are needed by the routines in fmd_conf.c itself.  After
	 * the root configuration file is processed, we reset these properties
	 * based upon the latest values from the configuration file.
	 */
	dp->d_alloc_msecs = 10;
	dp->d_alloc_tries = 3;
	dp->d_str_buckets = 211;

	dp->d_rootdir = root ? root : "";
	dp->d_platform = _fmd_plat;
	dp->d_machine = _fmd_uts.machine;
	dp->d_isaname = _fmd_isa;

	dp->d_conf = fmd_conf_open(conf, sizeof (_fmd_conf) /
	    sizeof (_fmd_conf[0]), _fmd_conf, FMD_CONF_DEFER);

	if (dp->d_conf == NULL) {
		fmd_error(EFMD_EXIT,
		    "failed to load required configuration properties\n");
	}

	(void) fmd_conf_getprop(dp->d_conf, "alloc.msecs", &dp->d_alloc_msecs);
	(void) fmd_conf_getprop(dp->d_conf, "alloc.tries", &dp->d_alloc_tries);
	(void) fmd_conf_getprop(dp->d_conf, "strbuckets", &dp->d_str_buckets);

	(void) fmd_conf_getprop(dp->d_conf, "platform", &dp->d_platform);
	(void) fmd_conf_getprop(dp->d_conf, "machine", &dp->d_machine);
	(void) fmd_conf_getprop(dp->d_conf, "isaname", &dp->d_isaname);

	/*
	 * Manually specified rootdirs override config files, so only update
	 * d_rootdir based on the config files we parsed if no 'root' was set.
	 */
	if (root == NULL)
		(void) fmd_conf_getprop(dp->d_conf, "rootdir", &dp->d_rootdir);
	else
		(void) fmd_conf_setprop(dp->d_conf, "rootdir", dp->d_rootdir);

	/*
	 * Once the base conf file properties are loaded, lookup the values
	 * of $conf_path and $conf_file and merge in any other conf files.
	 * Only files that exist (per access(2) with F_OK) are merged.
	 */
	(void) fmd_conf_getprop(dp->d_conf, "conf_path", &pap);
	(void) fmd_conf_getprop(dp->d_conf, "conf_file", &name);

	for (i = 0; i < pap->cpa_argc; i++) {
		(void) snprintf(file, sizeof (file),
		    "%s/%s", pap->cpa_argv[i], name);
		if (access(file, F_OK) == 0)
			fmd_conf_merge(dp->d_conf, file);
	}

	/*
	 * Update the value of fmd.d_fg based on "fg".  We cache this property
	 * because it must be accessed deep within fmd at fmd_verror() time.
	 * Update any other properties that must be cached for performance.
	 *
	 * NOTE(review): these two lookups go through the global 'fmd' rather
	 * than 'dp'; presumably callers always pass &fmd -- confirm.
	 */
	(void) fmd_conf_getprop(fmd.d_conf, "fg", &fmd.d_fg);
	(void) fmd_conf_getprop(fmd.d_conf, "xprt.ttl", &fmd.d_xprt_ttl);

	/*
	 * Initialize our custom libnvpair allocator and create an nvlist for
	 * authority elements corresponding to this instance of the daemon.
	 */
	(void) nv_alloc_init(&dp->d_nva, &fmd_nv_alloc_ops);
	dp->d_auth = fmd_protocol_authority();

	/*
	 * The fmd_module_t for the root module must be created manually.  Most
	 * of it remains unused and zero, except for the few things we fill in.
	 */
	dp->d_rmod = fmd_zalloc(sizeof (fmd_module_t), FMD_SLEEP);
	dp->d_rmod->mod_name = fmd_strdup(dp->d_pname, FMD_SLEEP);
	dp->d_rmod->mod_fmri = fmd_protocol_fmri_module(dp->d_rmod);

	fmd_list_append(&dp->d_mod_list, dp->d_rmod);
	fmd_module_hold(dp->d_rmod);

	(void) pthread_mutex_init(&dp->d_rmod->mod_lock, NULL);
	(void) pthread_cond_init(&dp->d_rmod->mod_cv, NULL);
	(void) pthread_mutex_init(&dp->d_rmod->mod_stats_lock, NULL);

	/* the calling thread becomes the root module's thread */
	dp->d_rmod->mod_thread = fmd_thread_xcreate(dp->d_rmod, pthread_self());
	dp->d_rmod->mod_stats = fmd_zalloc(sizeof (fmd_modstat_t), FMD_SLEEP);
	dp->d_rmod->mod_ustat = fmd_ustat_create();

	if (pthread_setspecific(dp->d_key, dp->d_rmod->mod_thread) != 0)
		fmd_error(EFMD_EXIT, "failed to attach main thread key");

	if ((dp->d_stats = (fmd_statistics_t *)fmd_ustat_insert(
	    dp->d_rmod->mod_ustat, FMD_USTAT_NOALLOC, sizeof (_fmd_stats) /
	    sizeof (fmd_stat_t), (fmd_stat_t *)&_fmd_stats, NULL)) == NULL)
		fmd_error(EFMD_EXIT, "failed to initialize statistics");

	(void) pthread_mutex_lock(&dp->d_rmod->mod_lock);
	dp->d_rmod->mod_flags |= FMD_MOD_INIT;
	(void) pthread_mutex_unlock(&dp->d_rmod->mod_lock);

	/*
	 * In addition to inserting the _fmd_stats collection of program-wide
	 * statistics, we also insert a statistic named after each of our
	 * errors and update these counts in fmd_verror() (see fmd_subr.c).
	 *
	 * Each statistic is named "err." followed by the text after the last
	 * '.' in the error's class string.
	 *
	 * NOTE(review): the strrchr() result is used unchecked; this assumes
	 * every string returned by fmd_errclass() contains a '.' -- confirm.
	 */
	dp->d_errstats = sp = fmd_zalloc(sizeof (fmd_stat_t) *
	    (EFMD_END - EFMD_UNKNOWN), FMD_SLEEP);

	for (i = 0; i < EFMD_END - EFMD_UNKNOWN; i++, sp++) {
		(void) snprintf(sp->fmds_name, sizeof (sp->fmds_name), "err.%s",
		    strrchr(fmd_errclass(EFMD_UNKNOWN + i), '.') + 1);
		sp->fmds_type = FMD_TYPE_UINT64;
	}

	(void) fmd_ustat_insert(dp->d_rmod->mod_ustat, FMD_USTAT_NOALLOC,
	    EFMD_END - EFMD_UNKNOWN, dp->d_errstats, NULL);
}
/*
 * We use our own private version of svc_create() which registers our services
 * only on loopback transports and enables an option whereby Solaris ucreds
 * are associated with each connection, permitting us to check privilege bits.
 *
 * Parameters:
 *	disp	RPC dispatch routine registered on every transport
 *	prog	RPC program number
 *	vers	RPC program version
 *	ssz	send size handed to svc_tli_create() and svc_door_create()
 *	rsz	receive size handed to svc_tli_create(); also applied to the
 *		door transport through SVCSET_CONNMAXREC
 *	force	if non-zero, unregister prog/vers first and skip the checks
 *		for an existing binding
 *
 * Returns the number of transports (loopback netconfig entries plus the door)
 * on which the service was registered, which may be zero.  If the netconfig
 * database cannot be opened, or (when !force) the program appears to be bound
 * already, the result of fmd_set_errno() for EFMD_RPC_REG or EFMD_RPC_BOUND
 * respectively is returned instead.
 */
static int
fmd_rpc_svc_create_local(void (*disp)(struct svc_req *, SVCXPRT *),
    rpcprog_t prog, rpcvers_t vers, uint_t ssz, uint_t rsz, int force)
{
	struct netconfig *ncp;
	struct netbuf buf;
	SVCXPRT *xprt;
	void *hdl;
	int fd, n = 0;

	char door[PATH_MAX];
	time_t tm;

	if ((hdl = setnetconfig()) == NULL) {
		fmd_error(EFMD_RPC_REG, "failed to iterate over "
		    "netconfig database: %s\n", nc_sperror());
		return (fmd_set_errno(EFMD_RPC_REG));
	}

	if (force)
		svc_unreg(prog, vers); /* clear stale rpcbind registrations */

	/* scratch address buffer for the rpcb_getaddr() probe below */
	buf.buf = alloca(_SS_MAXSIZE);
	buf.maxlen = _SS_MAXSIZE;
	buf.len = 0;

	while ((ncp = getnetconfig(hdl)) != NULL) {
		/* only loopback transports are of interest */
		if (strcmp(ncp->nc_protofmly, NC_LOOPBACK) != 0)
			continue;

		/* unless forced, refuse to take over an existing binding */
		if (!force && rpcb_getaddr(prog, vers, ncp, &buf, HOST_SELF)) {
			(void) endnetconfig(hdl);
			return (fmd_set_errno(EFMD_RPC_BOUND));
		}

		if ((fd = t_open(ncp->nc_device, O_RDWR, NULL)) == -1) {
			fmd_error(EFMD_RPC_REG, "failed to open %s: %s\n",
			    ncp->nc_device, t_strerror(t_errno));
			continue;
		}

		svc_fd_negotiate_ucred(fd); /* enable ucred option on xprt */

		if ((xprt = svc_tli_create(fd, ncp, NULL, ssz, rsz)) == NULL) {
			(void) t_close(fd);
			continue;
		}

		if (svc_reg(xprt, prog, vers, disp, ncp) == FALSE) {
			fmd_error(EFMD_RPC_REG, "failed to register "
			    "rpc service on %s\n", ncp->nc_netid);
			svc_destroy(xprt);
			continue;
		}

		n++;
	}

	(void) endnetconfig(hdl);

	/*
	 * If we failed to register services (n == 0) because rpcbind is down,
	 * then check to see if the RPC door file exists before attempting an
	 * svc_door_create(), which cleverly destroys any existing door file.
	 * The RPC APIs have no stable errnos, so we use rpcb_gettime() as a
	 * hack to determine if rpcbind itself is down.
	 */
	if (!force && n == 0 && rpcb_gettime(HOST_SELF, &tm) == FALSE &&
	    snprintf(door, sizeof (door), RPC_DOOR_RENDEZVOUS,
	    prog, vers) > 0 && access(door, F_OK) == 0)
		return (fmd_set_errno(EFMD_RPC_BOUND));

	/*
	 * Attempt to create a door server for the RPC program as well.  Limit
	 * the maximum request size for the door transport to the receive size.
	 */
	if ((xprt = svc_door_create(disp, prog, vers, ssz)) == NULL) {
		/*
		 * The program and version types are not necessarily as wide
		 * as unsigned long, so convert explicitly to match "%lx".
		 */
		fmd_error(EFMD_RPC_REG, "failed to create door for "
		    "rpc service 0x%lx/0x%lx\n",
		    (unsigned long)prog, (unsigned long)vers);
	} else {
		(void) svc_control(xprt, SVCSET_CONNMAXREC, &rsz);
		n++;
	}

	return (n);
}
/*
 * Write a checkpoint for module 'mp'; the module must be locked on entry.
 *
 * If the module's ms_ckpt_save statistic is FMD_B_FALSE the module state is
 * committed and nothing is written.  If neither FMD_MOD_MDIRTY nor
 * FMD_MOD_CDIRTY is set there is nothing to do.  Otherwise the checkpoint is
 * built, committed, and on success the module state is committed and the
 * checkpoint count and time statistics are updated.  Every failure is
 * reported with fmd_error() and returns without committing the module.
 */
void
fmd_ckpt_save(fmd_module_t *mp)
{
	struct stat64 sb;
	char aside[PATH_MAX];
	mode_t dirmode;
	hrtime_t start = gethrtime();
	fmd_ckpt_t ckpt;
	int have_dir, rv;

	ASSERT(fmd_module_locked(mp));

	/*
	 * With checkpointing disabled for the module there is nothing to
	 * write, but the module state must still be committed so that
	 * pending log events make their transition.
	 */
	if (mp->mod_stats->ms_ckpt_save.fmds_value.b == FMD_B_FALSE) {
		fmd_module_commit(mp);
		return;
	}

	/* no checkpoint is necessary for this module */
	if ((mp->mod_flags & (FMD_MOD_MDIRTY | FMD_MOD_CDIRTY)) == 0)
		return;

	TRACE((FMD_DBG_CKPT, "ckpt save begin %s %llu",
	    mp->mod_name, mp->mod_gen + 1));

	/*
	 * The per-module checkpoint directory has to exist and really be a
	 * directory.  If it is not, whatever occupies the name (if anything)
	 * is renamed out of the way and we mkdir(2) a fresh directory.  When
	 * that fails the checkpoint is abandoned until a later attempt.
	 */
	have_dir = stat64(mp->mod_ckpt, &sb) == 0 && S_ISDIR(sb.st_mode);

	if (!have_dir) {
		(void) snprintf(aside, sizeof (aside), "%s-", mp->mod_ckpt);
		(void) rename(mp->mod_ckpt, aside);

		(void) fmd_conf_getprop(fmd.d_conf, "ckpt.dirmode", &dirmode);

		if (mkdir(mp->mod_ckpt, dirmode) != 0) {
			fmd_error(EFMD_CKPT_MKDIR,
			    "failed to mkdir %s", mp->mod_ckpt);
			return; /* return without clearing dirty bits */
		}
	}

	/*
	 * Set up the fmd_ckpt_t that manages construction of the checkpoint
	 * (along with its temporary output file), work out how much space the
	 * module needs, and then allocate that space.
	 */
	if (fmd_ckpt_create(&ckpt, mp) == -1) {
		fmd_error(EFMD_CKPT_CREATE,
		    "failed to create %s", ckpt.ckp_src);
		return;
	}

	fmd_ckpt_resv_module(&ckpt, mp);

	if (fmd_ckpt_alloc(&ckpt, mp->mod_gen + 1) != 0) {
		fmd_error(EFMD_CKPT_NOMEM, "failed to build %s", ckpt.ckp_src);
		fmd_ckpt_destroy(&ckpt);
		return;
	}

	/*
	 * Populate the checkpoint, then write, sync and atomically rename it
	 * into place.  On failure all of the dirty bits are left set.
	 */
	fmd_ckpt_save_module(&ckpt, mp);
	rv = fmd_ckpt_commit(&ckpt);
	fmd_ckpt_destroy(&ckpt);

	if (rv != 0) {
		fmd_error(EFMD_CKPT_COMMIT,
		    "failed to commit %s", ckpt.ckp_dst);
		return; /* return without clearing dirty bits */
	}

	fmd_module_commit(mp);
	TRACE((FMD_DBG_CKPT, "ckpt save end %s", mp->mod_name));

	/* account for this checkpoint and the time it took */
	mp->mod_stats->ms_ckpt_cnt.fmds_value.ui64 += 1;
	mp->mod_stats->ms_ckpt_time.fmds_value.ui64 += gethrtime() - start;

	fmd_dprintf(FMD_DBG_CKPT, "saved checkpoint of %s (%llu)\n",
	    mp->mod_name, mp->mod_gen);
}