/*
 * Check whether a CPU that we previously asked to be taken offline has
 * actually gone offline.  Returns 1 when no further retries are needed
 * (the CPU is gone, offlined, or the request failed outright), or 0 to
 * schedule another retry.
 */
static int
cpu_retry(fmd_hdl_t *hdl, cma_cpu_t *cpu)
{
	int rc = 0;

	fmd_hdl_debug(hdl, "cpu_retry()\n");

	if (cpu->cpu_fmri == NULL) {
		return (1);
	}

	if (!fmd_nvl_fmri_present(hdl, cpu->cpu_fmri)) {
		fmd_hdl_debug(hdl, "cpu %u is not present", cpu->cpuid);
		return (1);
	}

	rc = cpu_cmd(hdl, cpu->cpu_fmri, P_STATUS);
	if (rc == P_FAULTED || rc == P_OFFLINE) {
		fmd_hdl_debug(hdl, "cpu %u is offlined on retry %u\n",
		    cpu->cpuid, cpu->cpu_nretries);
		cma_stats.cpu_flts.fmds_value.ui64++;

		if (cpu->cpu_uuid != NULL)
			fmd_case_uuclose(hdl, cpu->cpu_uuid);

		return (1);	/* success */
	}

	if (rc == -1) {
		fmd_hdl_debug(hdl, "failed to retry cpu %u\n", cpu->cpuid);
		cma_stats.page_fails.fmds_value.ui64++;
		return (1);	/* give up */
	}

	return (0);
}

/*
 * Starting at *cpuid, walk the CPU IDs belonging to the same chip looking
 * for one that is present and usable.  On success, *cpuid is updated to
 * the ID that was found and 0 is returned; otherwise -1 is returned.
 */
static int
cpu_present(fmd_hdl_t *hdl, nvlist_t *asru, uint32_t *cpuid)
{
	nvlist_t *cp_asru;
	uint32_t i;

	if (nvlist_dup(asru, &cp_asru, 0) != 0) {
		fmd_hdl_debug(hdl, "unable to alloc asru for thread\n");
		return (-1);
	}

	for (i = *cpuid; i < *cpuid + UTS2_CPUS_PER_CHIP; i++) {
		(void) nvlist_remove_all(cp_asru, FM_FMRI_CPU_ID);
		if (nvlist_add_uint32(cp_asru, FM_FMRI_CPU_ID, i) == 0) {
			if (fmd_nvl_fmri_present(hdl, cp_asru) &&
			    !fmd_nvl_fmri_unusable(hdl, cp_asru)) {
				nvlist_free(cp_asru);
				*cpuid = i;
				return (0);
			}
		}
	}

	nvlist_free(cp_asru);
	return (-1);
}

cmd_dimm_t *
cmd_dimm_lookup(fmd_hdl_t *hdl, nvlist_t *asru)
{
	cmd_dimm_t *dimm;
	const char *unum;

	if ((unum = cmd_fmri_get_unum(asru)) == NULL) {
		CMD_STAT_BUMP(bad_mem_asru);
		return (NULL);
	}

	dimm = dimm_lookup_by_unum(unum);

	if (dimm != NULL && !fmd_nvl_fmri_present(hdl, dimm->dimm_asru_nvl)) {
		/*
		 * The DIMM doesn't exist anymore, so we need to delete the
		 * state structure, which is now out of date.  The containing
		 * bank (if any) is also out of date, so blow it away too.
		 */
		fmd_hdl_debug(hdl, "dimm_lookup: discarding old dimm\n");

		if (dimm->dimm_bank != NULL)
			cmd_bank_destroy(hdl, dimm->dimm_bank);
		cmd_dimm_destroy(hdl, dimm);

		return (NULL);
	}

	return (dimm);
}

static int
page_retry(fmd_hdl_t *hdl, cma_page_t *page)
{
	int rc;

	if (page->pg_asru != NULL &&
	    !fmd_nvl_fmri_present(hdl, page->pg_asru)) {
		fmd_hdl_debug(hdl, "page retire overtaken by events");
		cma_stats.page_nonent.fmds_value.ui64++;

		if (page->pg_uuid != NULL)
			fmd_case_uuclose(hdl, page->pg_uuid);
		return (1);	/* no longer a page to retire */
	}

	if (page->pg_rsrc == NULL ||
	    (rc = fmd_nvl_fmri_service_state(hdl, page->pg_rsrc)) < 0)
		rc = cma_fmri_page_service_state(hdl, page->pg_asru);

	if (rc == FMD_SERVICE_STATE_UNUSABLE) {
		fmd_hdl_debug(hdl, "retired page 0x%llx on retry %u\n",
		    page->pg_addr, page->pg_nretries);
		cma_stats.page_flts.fmds_value.ui64++;

		if (page->pg_uuid != NULL)
			fmd_case_uuclose(hdl, page->pg_uuid);
		return (1);	/* page retired */
	}

	if (rc == FMD_SERVICE_STATE_ISOLATE_PENDING) {
		fmd_hdl_debug(hdl, "scheduling another retry for 0x%llx\n",
		    page->pg_addr);
		return (0);	/* schedule another retry */
	} else {
		fmd_hdl_debug(hdl, "failed to retry page 0x%llx "
		    "retirement: %s\n", page->pg_addr, strerror(errno));
		cma_stats.page_fails.fmds_value.ui64++;
		return (1);	/* give up */
	}
}

cmd_dimm_t *
cmd_dimm_create(fmd_hdl_t *hdl, nvlist_t *asru)
{
	cmd_dimm_t *dimm;
	const char *unum;
	nvlist_t *fmri;
	size_t nserids = 0;
	char **serids = NULL;

	if (!fmd_nvl_fmri_present(hdl, asru)) {
		fmd_hdl_debug(hdl, "dimm_lookup: discarding old ereport\n");
		return (NULL);
	}

	if ((unum = cmd_fmri_get_unum(asru)) == NULL) {
		CMD_STAT_BUMP(bad_mem_asru);
		return (NULL);
	}

#ifdef sun4v
	if (nvlist_lookup_string_array(asru, FM_FMRI_HC_SERIAL_ID, &serids,
	    &nserids) != 0) {
		fmd_hdl_debug(hdl, "sun4v mem: FMRI does not"
		    " have serial_ids\n");
		CMD_STAT_BUMP(bad_mem_asru);
		return (NULL);
	}
#endif

	fmri = cmd_mem_fmri_create(unum, serids, nserids);
	if (fmd_nvl_fmri_expand(hdl, fmri) < 0) {
		CMD_STAT_BUMP(bad_mem_asru);
		nvlist_free(fmri);
		return (NULL);
	}

	fmd_hdl_debug(hdl, "dimm_create: creating new DIMM %s\n", unum);
	CMD_STAT_BUMP(dimm_creat);

	dimm = fmd_hdl_zalloc(hdl, sizeof (cmd_dimm_t), FMD_SLEEP);
	dimm->dimm_nodetype = CMD_NT_DIMM;
	dimm->dimm_version = CMD_DIMM_VERSION;

	cmd_bufname(dimm->dimm_bufname, sizeof (dimm->dimm_bufname),
	    "dimm_%s", unum);
	cmd_fmri_init(hdl, &dimm->dimm_asru, fmri, "dimm_asru_%s", unum);

	nvlist_free(fmri);

	(void) nvlist_lookup_string(dimm->dimm_asru_nvl, FM_FMRI_MEM_UNUM,
	    (char **)&dimm->dimm_unum);

	dimm_attach_to_bank(hdl, dimm);

	cmd_mem_retirestat_create(hdl, &dimm->dimm_retstat, dimm->dimm_unum,
	    0, CMD_DIMM_STAT_PREFIX);

	cmd_list_append(&cmd.cmd_dimms, dimm);
	cmd_dimm_dirty(hdl, dimm);

	return (dimm);
}

/*
 * Solve a given ZFS case.  This first checks to make sure the diagnosis is
 * still valid, as well as cleaning up any pending timer associated with the
 * case.
 */
static void
zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname,
    boolean_t checkunusable)
{
	libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
	nvlist_t *detector, *fault;
	boolean_t serialize;
	nvlist_t *fmri, *fru;
	topo_hdl_t *thp;
	int err;

	/*
	 * Construct the detector from the case data.  The detector is in the
	 * ZFS scheme, and is either the pool or the vdev, depending on
	 * whether this is a vdev or pool fault.
	 */
	detector = fmd_nvl_alloc(hdl, FMD_SLEEP);

	(void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0);
	(void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
	(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL,
	    zcp->zc_data.zc_pool_guid);
	if (zcp->zc_data.zc_vdev_guid != 0) {
		(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV,
		    zcp->zc_data.zc_vdev_guid);
	}

	/*
	 * We also want to make sure that the detector (pool or vdev) properly
	 * reflects the diagnosed state, when the fault corresponds to internal
	 * ZFS state (i.e. not checksum or I/O error-induced).  Otherwise, a
	 * device which was unavailable early in boot (because the driver/file
	 * wasn't available) and is now healthy will be mis-diagnosed.
	 */
	if (!fmd_nvl_fmri_present(hdl, detector) ||
	    (checkunusable && !fmd_nvl_fmri_unusable(hdl, detector))) {
		fmd_case_close(hdl, zcp->zc_case);
		nvlist_free(detector);
		return;
	}

	fru = NULL;
	if (zcp->zc_fru != NULL &&
	    (thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION)) != NULL) {
		/*
		 * If the vdev had an associated FRU, then get the FRU nvlist
		 * from the topo handle and use that in the suspect list.  We
		 * explicitly lookup the FRU because the fmri reported from the
		 * kernel may not have up to date details about the disk itself
		 * (serial, part, etc).
		 */
		if (topo_fmri_str2nvl(thp, zcp->zc_fru, &fmri, &err) == 0) {
			/*
			 * If the disk is part of the system chassis, but the
			 * FRU indicates a different chassis ID than our
			 * current system, then ignore the error.  This
			 * indicates that the device was part of another
			 * cluster head, and for obvious reasons cannot be
			 * imported on this system.
			 */
			if (libzfs_fru_notself(zhdl, zcp->zc_fru)) {
				fmd_case_close(hdl, zcp->zc_case);
				nvlist_free(fmri);
				fmd_hdl_topo_rele(hdl, thp);
				nvlist_free(detector);
				return;
			}

			/*
			 * If the device is no longer present on the system, or
			 * topo_fmri_fru() fails for other reasons, then fall
			 * back to the fmri specified in the vdev.
			 */
			if (topo_fmri_fru(thp, fmri, &fru, &err) != 0)
				fru = fmd_nvl_dup(hdl, fmri, FMD_SLEEP);
			nvlist_free(fmri);
		}

		fmd_hdl_topo_rele(hdl, thp);
	}

	fault = fmd_nvl_create_fault(hdl, faultname, 100, detector,
	    fru, detector);
	fmd_case_add_suspect(hdl, zcp->zc_case, fault);

	nvlist_free(fru);

	fmd_case_solve(hdl, zcp->zc_case);

	serialize = B_FALSE;
	if (zcp->zc_data.zc_has_remove_timer) {
		fmd_timer_remove(hdl, zcp->zc_remove_timer);
		zcp->zc_data.zc_has_remove_timer = 0;
		serialize = B_TRUE;
	}
	if (serialize)
		zfs_case_serialize(hdl, zcp);

	nvlist_free(detector);
}

/*ARGSUSED*/
int
cma_page_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
    const char *uuid, boolean_t repair)
{
	cma_page_t *page;
	uint64_t pageaddr;
	const char *action = repair ? "unretire" : "retire";
	int rc;
	nvlist_t *rsrc = NULL, *asrucp = NULL, *hcsp;

	(void) nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc);

	if (nvlist_dup(asru, &asrucp, 0) != 0) {
		fmd_hdl_debug(hdl, "page retire nvlist dup failed\n");
		return (CMA_RA_FAILURE);
	}

	/* It should already be expanded, but we'll do it again anyway */
	if (fmd_nvl_fmri_expand(hdl, asrucp) < 0) {
		fmd_hdl_debug(hdl, "failed to expand page asru\n");
		cma_stats.bad_flts.fmds_value.ui64++;
		nvlist_free(asrucp);
		return (CMA_RA_FAILURE);
	}

	if (!repair && !fmd_nvl_fmri_present(hdl, asrucp)) {
		fmd_hdl_debug(hdl, "page retire overtaken by events\n");
		cma_stats.page_nonent.fmds_value.ui64++;
		nvlist_free(asrucp);
		return (CMA_RA_SUCCESS);
	}

	/* Figure out physaddr from resource or asru */
	if (rsrc == NULL ||
	    nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcsp) != 0 ||
	    (nvlist_lookup_uint64(hcsp, "asru-" FM_FMRI_HC_SPECIFIC_PHYSADDR,
	    &pageaddr) != 0 && nvlist_lookup_uint64(hcsp,
	    FM_FMRI_HC_SPECIFIC_PHYSADDR, &pageaddr) != 0)) {
		if (nvlist_lookup_uint64(asrucp, FM_FMRI_MEM_PHYSADDR,
		    &pageaddr) != 0) {
			fmd_hdl_debug(hdl, "mem fault missing 'physaddr'\n");
			cma_stats.bad_flts.fmds_value.ui64++;
			nvlist_free(asrucp);
			return (CMA_RA_FAILURE);
		}
	}

	if (repair) {
		if (!cma.cma_page_dounretire) {
			fmd_hdl_debug(hdl, "suppressed unretire of page %llx\n",
			    (u_longlong_t)pageaddr);
			cma_stats.page_supp.fmds_value.ui64++;
			nvlist_free(asrucp);
			return (CMA_RA_SUCCESS);
		}
		/* If unretire via topo fails, we fall back to legacy way */
		if (rsrc == NULL ||
		    (rc = fmd_nvl_fmri_unretire(hdl, rsrc)) < 0)
			rc = cma_fmri_page_unretire(hdl, asrucp);
	} else {
		if (!cma.cma_page_doretire) {
			fmd_hdl_debug(hdl, "suppressed retire of page %llx\n",
			    (u_longlong_t)pageaddr);
			cma_stats.page_supp.fmds_value.ui64++;
			nvlist_free(asrucp);
			return (CMA_RA_FAILURE);
		}
		/* If retire via topo fails, we fall back to legacy way */
		if (rsrc == NULL || (rc = fmd_nvl_fmri_retire(hdl, rsrc)) < 0)
			rc = cma_fmri_page_retire(hdl, asrucp);
	}

	if (rc == FMD_AGENT_RETIRE_DONE) {
		fmd_hdl_debug(hdl, "%sd page 0x%llx\n", action,
		    (u_longlong_t)pageaddr);
		if (repair)
			cma_stats.page_repairs.fmds_value.ui64++;
		else
			cma_stats.page_flts.fmds_value.ui64++;

		nvlist_free(asrucp);
		return (CMA_RA_SUCCESS);
	} else if (repair || rc != FMD_AGENT_RETIRE_ASYNC) {
		fmd_hdl_debug(hdl, "%s of page 0x%llx failed, will not "
		    "retry: %s\n", action, (u_longlong_t)pageaddr,
		    strerror(errno));
		cma_stats.page_fails.fmds_value.ui64++;
		nvlist_free(asrucp);
		return (CMA_RA_FAILURE);
	}

	/*
	 * The page didn't immediately retire.  We'll need to periodically
	 * check to see if it has been retired.
	 */
	fmd_hdl_debug(hdl, "page didn't retire - sleeping\n");

	page = fmd_hdl_zalloc(hdl, sizeof (cma_page_t), FMD_SLEEP);
	page->pg_addr = pageaddr;
	if (rsrc != NULL)
		(void) nvlist_dup(rsrc, &page->pg_rsrc, 0);
	page->pg_asru = asrucp;
	if (uuid != NULL)
		page->pg_uuid = fmd_hdl_strdup(hdl, uuid, FMD_SLEEP);

	page->pg_next = cma.cma_pages;
	cma.cma_pages = page;

	if (cma.cma_page_timerid != 0)
		fmd_timer_remove(hdl, cma.cma_page_timerid);

	cma.cma_page_curdelay = cma.cma_page_mindelay;

	cma.cma_page_timerid =
	    fmd_timer_install(hdl, NULL, NULL, cma.cma_page_curdelay);

	/* Don't free asrucp here.  This FMRI will be needed for retry. */
	return (CMA_RA_FAILURE);
}