/*ARGSUSED*/
static void
sp_timeout(fmd_hdl_t *hdl, id_t id, void *data)
{
	sp_monitor_t *smp = fmd_hdl_getspecific(hdl);
	uint32_t uptime, gen;

	if (ipmi_sunoem_uptime(smp->sm_hdl, &uptime, &gen) != 0) {
		/*
		 * An uptime failure is ignored here; the appropriate event
		 * is generated once the SP comes back online.
		 */
		fmd_hdl_debug(hdl, "failed to get uptime: %s",
		    ipmi_errmsg(smp->sm_hdl));
	} else {
		/*
		 * Post a sysevent when the generation number is explicitly
		 * reset, when the uptime moves backwards (SP configuration
		 * reset after a reboot, generation 0), or on the very first
		 * poll after module load (stored uptime still 0), since we
		 * can't tell whether an SP reset was missed.
		 */
		if (uptime < smp->sm_seconds ||
		    gen != smp->sm_generation ||
		    smp->sm_seconds == 0)
			sp_post_sysevent(hdl);

		smp->sm_seconds = uptime;
		smp->sm_generation = gen;
	}

	/* Re-arm the poll timer for the next interval. */
	(void) fmd_timer_install(hdl, NULL, NULL, smp->sm_interval);
}
/*
 * Read back the persistent representation of an active case.
 *
 * Allocates a zfs_case_t, fills zc_data from the CASE_DATA buffer attached
 * to the fmd case, links the new record onto the global zfs_cases list and
 * attaches it to the case as case-specific data.  Returns NULL (freeing the
 * allocation) if the serialized version is newer than this module
 * understands.
 */
static zfs_case_t *
zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
{
	zfs_case_t *zcp;

	zcp = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP);
	zcp->zc_case = cp;

	fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data,
	    sizeof (zcp->zc_data));

	/* Reject records written by a future, unknown on-disk version. */
	if (zcp->zc_data.zc_version > CASE_DATA_VERSION_SERD) {
		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
		return (NULL);
	}

	/*
	 * fmd_buf_read() will have already zeroed out the remainder of the
	 * buffer, so we don't have to do anything special if the version
	 * doesn't include the SERD engine name.
	 */

	/* Re-arm the device-removal timer if it was pending at serialize. */
	if (zcp->zc_data.zc_has_remove_timer)
		zcp->zc_remove_timer = fmd_timer_install(hdl, zcp,
		    NULL, zfs_remove_timeout);

	uu_list_node_init(zcp, &zcp->zc_node, zfs_case_pool);
	(void) uu_list_insert_before(zfs_cases, NULL, zcp);

	fmd_case_setspecific(hdl, cp, zcp);

	return (zcp);
}
/*
 * Module entry point: register with fmd, open an IPMI connection to the
 * service processor, prime the uptime/generation state, and arm the first
 * poll timer.  Unloads itself cleanly when no IPMI connection exists or
 * the platform doesn't support the Sun OEM uptime command.
 */
void
_fmd_init(fmd_hdl_t *hdl)
{
	sp_monitor_t *smp;
	int error;
	char *msg;

	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0)
		return;

	smp = fmd_hdl_zalloc(hdl, sizeof (sp_monitor_t), FMD_SLEEP);
	fmd_hdl_setspecific(hdl, smp);

	if ((smp->sm_hdl = ipmi_open(&error, &msg, IPMI_TRANSPORT_BMC, NULL))
	    == NULL) {
		/*
		 * If /dev/ipmi0 doesn't exist on the system, then unload the
		 * module without doing anything.  Any other open failure is
		 * unexpected and aborts the module.
		 */
		if (error != EIPMI_BMC_OPEN_FAILED)
			fmd_hdl_abort(hdl, "failed to initialize IPMI "
			    "connection: %s\n", msg);
		fmd_hdl_debug(hdl, "failed to load: no IPMI connection "
		    "present");
		fmd_hdl_free(hdl, smp, sizeof (sp_monitor_t));
		fmd_hdl_unregister(hdl);
		return;
	}

	/*
	 * Attempt an initial uptime() call.  If the IPMI command is
	 * unrecognized, then this is an unsupported platform and the module
	 * should be unloaded.  Any other error is treated is transient
	 * failure.
	 */
	if ((error = ipmi_sunoem_uptime(smp->sm_hdl, &smp->sm_seconds,
	    &smp->sm_generation)) != 0 &&
	    ipmi_errno(smp->sm_hdl) == EIPMI_INVALID_COMMAND) {
		fmd_hdl_debug(hdl, "failed to load: uptime command "
		    "not supported");
		ipmi_close(smp->sm_hdl);
		fmd_hdl_free(hdl, smp, sizeof (sp_monitor_t));
		fmd_hdl_unregister(hdl);
		return;
	}

	smp->sm_interval = fmd_prop_get_int64(hdl, "interval");

	if (error == 0)
		fmd_hdl_debug(hdl, "successfully loaded, uptime = %u seconds "
		    "(generation %u)", smp->sm_seconds, smp->sm_generation);
	else
		fmd_hdl_debug(hdl, "successfully loaded, but uptime call "
		    "failed: %s", ipmi_errmsg(smp->sm_hdl));

	/*
	 * Setup the recurring timer.  The zero delta fires the first timeout
	 * immediately; sp_timeout() then re-arms itself with sm_interval.
	 */
	(void) fmd_timer_install(hdl, NULL, NULL, 0);
}
/*
 * Attempt to change the state of the CPU named by 'asru' to 'cpustate'.
 * Because cpu offlining via ldom_fmri_retire() may complete asynchronously,
 * the request is always queued on the global retry list and the retry timer
 * restarted from the minimum delay; CMA_RA_FAILURE is returned to indicate
 * the retirement has not yet been confirmed (the retry path reports final
 * success).
 */
int
cpu_offline(fmd_hdl_t *hdl, nvlist_t *asru, const char *uuid, int cpustate)
{
	int i;
	uint_t cpuid;
	cma_cpu_t *cpu;

	if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
		fmd_hdl_debug(hdl, "missing '%s'\n", FM_FMRI_CPU_ID);
		cma_stats.bad_flts.fmds_value.ui64++;
		return (CMA_RA_FAILURE);
	}

	/*
	 * cpu offlining using ldom_fmri_retire() may be asynchronous, so we
	 * have to set the timer and check the cpu status later.
	 * Note the comma operator: nanosleep() runs between attempts.
	 */
	for (i = 0; i < cma.cma_cpu_tries;
	    i++, (void) nanosleep(&cma.cma_cpu_delay, NULL)) {
		if (cpu_cmd(hdl, asru, cpustate) != -1) {
			cma_stats.cpu_flts.fmds_value.ui64++;
			break;
		}
	}

	/* All attempts returned -1: count the failure, but still queue. */
	if (i >= cma.cma_cpu_tries) {
		cma_stats.cpu_fails.fmds_value.ui64++;
	}

	/*
	 * check to see if the cpu has been offline.
	 */
	fmd_hdl_debug(hdl, "cpu is not offline yet - sleeping\n");

	/*
	 * Create a cpu node and add to the head of the cpu list
	 */
	cpu = fmd_hdl_zalloc(hdl, sizeof (cma_cpu_t), FMD_SLEEP);
	(void) nvlist_dup(asru, &cpu->cpu_fmri, 0);
	if (uuid != NULL)
		cpu->cpu_uuid = fmd_hdl_strdup(hdl, uuid, FMD_SLEEP);

	cpu->cpuid = cpuid;
	cpu->cpu_next = cma.cma_cpus;
	cma.cma_cpus = cpu;

	/* Restart the retry timer from the minimum delay. */
	if (cma.cma_cpu_timerid != 0)
		fmd_timer_remove(hdl, cma.cma_cpu_timerid);

	cma.cma_cpu_curdelay = cma.cma_cpu_mindelay;

	cma.cma_cpu_timerid = fmd_timer_install(hdl, NULL, NULL,
	    cma.cma_cpu_curdelay);

	return (CMA_RA_FAILURE);
}
/*
 * Queue a cpu for offline-status retry.  The cpuid is taken directly from
 * the FMRI for 'cpu'-scheme FMRIs, or looked up via the topology ASRU for
 * 'hc'-scheme FMRIs; any other scheme (or a repair request) is ignored.
 * The new record is pushed onto the global cpu list and the retry timer is
 * restarted from the minimum delay.
 *
 * Fix: the ASRU nvlist returned by topo_fmri_asru() was previously freed
 * only when the cpuid lookup failed, leaking it on every successful
 * hc-scheme lookup.  It is now freed on both paths once cpuid has been
 * extracted (only 'fmri' — not 'asru' — is retained in the record).
 */
void
cma_cpu_start_retry(fmd_hdl_t *hdl, nvlist_t *fmri, const char *uuid,
    boolean_t repair)
{
	cma_cpu_t *cpu;
	char *scheme;
	uint_t cpuid;
	nvlist_t *asru = NULL;
	topo_hdl_t *thp;
	int err;

	if (repair || nvlist_lookup_string(fmri, FM_FMRI_SCHEME, &scheme) != 0)
		return;
	if (strcmp(scheme, FM_FMRI_SCHEME_CPU) == 0) {
		if (nvlist_lookup_uint32(fmri, FM_FMRI_CPU_ID, &cpuid) != 0)
			return;
	} else if (strcmp(scheme, FM_FMRI_SCHEME_HC) != 0) {
		return;
	} else {
		/* lookup cpuid from ASRU */
		thp = fmd_fmri_topo_hold(TOPO_VERSION);
		if (thp != NULL) {
			(void) topo_fmri_asru(thp, fmri, &asru, &err);
			fmd_fmri_topo_rele(thp);
		}
		/* nvlist_lookup_uint32() safely rejects a NULL nvlist. */
		if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
			nvlist_free(asru);
			return;
		}
		/*
		 * cpuid has been copied out and 'asru' is not referenced
		 * again; free it here so the success path doesn't leak it.
		 */
		nvlist_free(asru);
	}

	/*
	 * check to see if the cpu has been offline.
	 */
	fmd_hdl_debug(hdl, "cpu %u is not offline yet - sleeping\n", cpuid);

	/*
	 * Create a cpu node and add to the head of the cpu list
	 */
	cpu = fmd_hdl_zalloc(hdl, sizeof (cma_cpu_t), FMD_SLEEP);
	(void) nvlist_dup(fmri, &cpu->cpu_fmri, 0);
	if (uuid != NULL)
		cpu->cpu_uuid = fmd_hdl_strdup(hdl, uuid, FMD_SLEEP);

	cpu->cpuid = cpuid;
	cpu->cpu_next = cma.cma_cpus;
	cma.cma_cpus = cpu;

	/* Restart the retry timer from the minimum delay. */
	if (cma.cma_cpu_timerid != 0)
		fmd_timer_remove(hdl, cma.cma_cpu_timerid);

	cma.cma_cpu_curdelay = cma.cma_cpu_mindelay;

	cma.cma_cpu_timerid = fmd_timer_install(hdl, NULL, NULL,
	    cma.cma_cpu_curdelay);
}
/*ARGSUSED*/
static void
disklights_timeout(fmd_hdl_t *hdl, id_t id, void *data)
{
	disk_lights_t *dl = fmd_hdl_getspecific(hdl);

	/* Any pending coalesce trigger has now fired; clear it first. */
	dl->dl_triggered = B_FALSE;

	dl_examine_topo(dl);

	/* Re-arm the long-interval timer for the next regular poll. */
	dl->dl_timer = fmd_timer_install(hdl, NULL, NULL,
	    dl->dl_poll_interval);
}
/*
 * Timer callback: walk the list of pending page retirements, dropping the
 * entries that have completed (or been abandoned) and re-arming the timer
 * with an exponentially backed-off delay if any remain.
 */
void
cma_page_retry(fmd_hdl_t *hdl)
{
	cma_page_t **walk;

	cma.cma_page_timerid = 0;

	fmd_hdl_debug(hdl, "page_retry: timer fired\n");

	walk = &cma.cma_pages;
	while (*walk != NULL) {
		cma_page_t *pp = *walk;

		if (!page_retry(hdl, pp)) {
			/* Still pending: keep the record for the next pass. */
			pp->pg_nretries++;
			walk = &pp->pg_next;
			continue;
		}

		/* Retirement completed or abandoned: unlink and destroy. */
		*walk = pp->pg_next;
		if (pp->pg_uuid != NULL)
			fmd_hdl_strfree(hdl, pp->pg_uuid);
		cma_page_free(hdl, pp);
	}

	if (cma.cma_pages == NULL)
		return;	/* no more retirements */

	/*
	 * We still have retirements that haven't completed.  Back the delay
	 * off (doubling, capped at the maximum), and schedule a retry.
	 */
	cma.cma_page_curdelay = MIN(cma.cma_page_curdelay * 2,
	    cma.cma_page_maxdelay);

	fmd_hdl_debug(hdl, "scheduled page retirement retry for %llu secs\n",
	    (u_longlong_t)(cma.cma_page_curdelay / NANOSEC));

	cma.cma_page_timerid = fmd_timer_install(hdl, NULL, NULL,
	    cma.cma_page_curdelay);
}
/*
 * Request a near-term re-examination of the topology by switching from the
 * long poll timer to the short coalesce timer.  Multiple triggers while the
 * coalesce timer is pending collapse into a single timeout.
 */
static void
dl_trigger_enum(disk_lights_t *dl)
{
	/* Already coalescing: nothing to do. */
	if (dl->dl_triggered)
		return;

	dl->dl_triggered = B_TRUE;

	/* Replace any existing poll timer with the coalesce timer. */
	if (dl->dl_timer != 0)
		fmd_timer_remove(dl->dl_fmd, dl->dl_timer);

	dl->dl_timer = fmd_timer_install(dl->dl_fmd, NULL, NULL,
	    dl->dl_coalesce_interval);
}
/*
 * Timer callback: walk the list of cpus with unconfirmed state changes,
 * dropping entries that have completed (or been abandoned) and re-arming
 * the timer with an exponentially backed-off delay if any remain.
 */
void
cma_cpu_retry(fmd_hdl_t *hdl)
{
	cma_cpu_t **walk;

	fmd_hdl_debug(hdl, "cma_cpu_retry: timer fired\n");

	cma.cma_cpu_timerid = 0;

	walk = &cma.cma_cpus;
	while (*walk != NULL) {
		cma_cpu_t *cp = *walk;

		if (!cpu_retry(hdl, cp)) {
			/* Still pending: keep the record for the next pass. */
			cp->cpu_nretries++;
			walk = &cp->cpu_next;
			continue;
		}

		/* Retry succeeded or we gave up: unlink and free. */
		*walk = cp->cpu_next;
		cma_cpu_free(hdl, cp);
	}

	if (cma.cma_cpus == NULL)
		return;	/* no more cpus */

	/*
	 * We still have cpus to check.  Back the delay off (doubling,
	 * capped at the maximum), and schedule a retry.
	 */
	cma.cma_cpu_curdelay = MIN(cma.cma_cpu_curdelay * 2,
	    cma.cma_cpu_maxdelay);

	fmd_hdl_debug(hdl, "scheduled cpu offline retry for %llu secs\n",
	    (u_longlong_t)(cma.cma_cpu_curdelay / NANOSEC));

	cma.cma_cpu_timerid = fmd_timer_install(hdl, NULL, NULL,
	    cma.cma_cpu_curdelay);
}
/*
 * Module entry point.  Registers with fmd and arms the periodic timer,
 * but only on i386 platforms for now.
 */
void
_fmd_init(fmd_hdl_t *hdl)
{
	char isa[8];

	/*
	 * For now the module only sends message to ILOM on i386 platforms
	 * till CR 6933053 is fixed.  Module unregister may cause etm module
	 * core dump due to 6933053, so non-i386 platforms simply bail out
	 * before registering.
	 */
	if (sysinfo(SI_ARCHITECTURE, isa, sizeof (isa)) == -1 ||
	    strncmp(isa, "i386", 4) != 0)
		return;

	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0)
		return;

	/*
	 * Setup the timer (2 seconds, expressed in nanoseconds).
	 */
	(void) fmd_timer_install(hdl, NULL, NULL, 2000000000ULL);
}
/*
 * Retire (or, on repair, unretire) the memory page named by the fault's
 * resource/ASRU.  Returns CMA_RA_SUCCESS when the action completed (or was
 * deliberately suppressed on unretire), CMA_RA_FAILURE on error or when the
 * retirement is still pending asynchronously — in which case the page is
 * queued on the global retry list and the retry timer restarted.
 */
/*ARGSUSED*/
int
cma_page_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
    const char *uuid, boolean_t repair)
{
	cma_page_t *page;
	uint64_t pageaddr;
	const char *action = repair ? "unretire" : "retire";
	int rc;
	nvlist_t *rsrc = NULL, *asrucp = NULL, *hcsp;

	/* The resource is optional; rsrc stays NULL if it's absent. */
	(void) nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc);

	if (nvlist_dup(asru, &asrucp, 0) != 0) {
		fmd_hdl_debug(hdl, "page retire nvlist dup failed\n");
		return (CMA_RA_FAILURE);
	}

	/* It should already be expanded, but we'll do it again anyway */
	if (fmd_nvl_fmri_expand(hdl, asrucp) < 0) {
		fmd_hdl_debug(hdl, "failed to expand page asru\n");
		cma_stats.bad_flts.fmds_value.ui64++;
		nvlist_free(asrucp);
		return (CMA_RA_FAILURE);
	}

	/* A page that is no longer present needs no retirement. */
	if (!repair && !fmd_nvl_fmri_present(hdl, asrucp)) {
		fmd_hdl_debug(hdl, "page retire overtaken by events\n");
		cma_stats.page_nonent.fmds_value.ui64++;
		nvlist_free(asrucp);
		return (CMA_RA_SUCCESS);
	}

	/*
	 * Figure out physaddr from resource or asru: try the resource's
	 * hc-specific member first (the "asru-" prefixed key, then the
	 * plain one), falling back to the ASRU's physaddr.
	 */
	if (rsrc == NULL ||
	    nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcsp) != 0 ||
	    (nvlist_lookup_uint64(hcsp, "asru-" FM_FMRI_HC_SPECIFIC_PHYSADDR,
	    &pageaddr) != 0 && nvlist_lookup_uint64(hcsp,
	    FM_FMRI_HC_SPECIFIC_PHYSADDR, &pageaddr) != 0)) {
		if (nvlist_lookup_uint64(asrucp, FM_FMRI_MEM_PHYSADDR,
		    &pageaddr) != 0) {
			fmd_hdl_debug(hdl, "mem fault missing 'physaddr'\n");
			cma_stats.bad_flts.fmds_value.ui64++;
			nvlist_free(asrucp);
			return (CMA_RA_FAILURE);
		}
	}

	if (repair) {
		/* Honor the configuration knob suppressing unretires. */
		if (!cma.cma_page_dounretire) {
			fmd_hdl_debug(hdl, "suppressed unretire of page %llx\n",
			    (u_longlong_t)pageaddr);
			cma_stats.page_supp.fmds_value.ui64++;
			nvlist_free(asrucp);
			return (CMA_RA_SUCCESS);
		}
		/* If unretire via topo fails, we fall back to legacy way */
		if (rsrc == NULL ||
		    (rc = fmd_nvl_fmri_unretire(hdl, rsrc)) < 0)
			rc = cma_fmri_page_unretire(hdl, asrucp);
	} else {
		/* Honor the configuration knob suppressing retires. */
		if (!cma.cma_page_doretire) {
			fmd_hdl_debug(hdl, "suppressed retire of page %llx\n",
			    (u_longlong_t)pageaddr);
			cma_stats.page_supp.fmds_value.ui64++;
			nvlist_free(asrucp);
			return (CMA_RA_FAILURE);
		}
		/* If retire via topo fails, we fall back to legacy way */
		if (rsrc == NULL || (rc = fmd_nvl_fmri_retire(hdl, rsrc)) < 0)
			rc = cma_fmri_page_retire(hdl, asrucp);
	}

	if (rc == FMD_AGENT_RETIRE_DONE) {
		fmd_hdl_debug(hdl, "%sd page 0x%llx\n", action,
		    (u_longlong_t)pageaddr);
		if (repair)
			cma_stats.page_repairs.fmds_value.ui64++;
		else
			cma_stats.page_flts.fmds_value.ui64++;
		nvlist_free(asrucp);
		return (CMA_RA_SUCCESS);
	} else if (repair || rc != FMD_AGENT_RETIRE_ASYNC) {
		/* Hard failure (or any repair failure): no retry is queued. */
		fmd_hdl_debug(hdl, "%s of page 0x%llx failed, will not "
		    "retry: %s\n", action, (u_longlong_t)pageaddr,
		    strerror(errno));
		cma_stats.page_fails.fmds_value.ui64++;
		nvlist_free(asrucp);
		return (CMA_RA_FAILURE);
	}

	/*
	 * The page didn't immediately retire.  We'll need to periodically
	 * check to see if it has been retired.
	 */
	fmd_hdl_debug(hdl, "page didn't retire - sleeping\n");

	page = fmd_hdl_zalloc(hdl, sizeof (cma_page_t), FMD_SLEEP);
	page->pg_addr = pageaddr;
	if (rsrc != NULL)
		(void) nvlist_dup(rsrc, &page->pg_rsrc, 0);
	page->pg_asru = asrucp;
	if (uuid != NULL)
		page->pg_uuid = fmd_hdl_strdup(hdl, uuid, FMD_SLEEP);

	/* Push the record on the head of the retry list. */
	page->pg_next = cma.cma_pages;
	cma.cma_pages = page;

	/* Restart the retry timer from the minimum delay. */
	if (cma.cma_page_timerid != 0)
		fmd_timer_remove(hdl, cma.cma_page_timerid);

	cma.cma_page_curdelay = cma.cma_page_mindelay;

	cma.cma_page_timerid = fmd_timer_install(hdl, NULL, NULL,
	    cma.cma_page_curdelay);

	/*
	 * Don't free asrucp here.  Ownership transferred to page->pg_asru;
	 * this FMRI will be needed for retry.
	 */
	return (CMA_RA_FAILURE);
}