/*
 * Look up the FRU for the board identified by 'label' and, if found,
 * construct a fault of class 'fltnm' against it (with the given board
 * certainty) and add it as a suspect to case 'cp'.  Silently does
 * nothing when no FRU matches the label.
 */
static void
add_bdflt_to_case(fmd_hdl_t *hdl, char *label, const char *fltnm,
    uint8_t board_cert, fmd_case_t *cp)
{
	nvlist_t *board_fru, *suspect;

	board_fru = fru_by_label(hdl, label);
	if (board_fru == NULL)
		return;

	/* The board FRU doubles as both resource and FRU in the fault. */
	suspect = cmd_nvl_create_fault(hdl, fltnm, board_cert,
	    board_fru, board_fru, NULL);
	suspect = cmd_fault_add_location(hdl, suspect, label);
	if (suspect != NULL)
		fmd_case_add_suspect(hdl, cp, suspect);

	nvlist_free(board_fru);
}
/*
 * Solve a given ZFS case.  This first checks to make sure the diagnosis is
 * still valid, as well as cleaning up any pending timer associated with the
 * case.
 *
 * A suspect of class 'faultname' is constructed against a ZFS-scheme
 * detector built from the case data, added to the case, and the case is
 * solved.  Any pending retire/remove timer for the case is cancelled and
 * the case state re-serialized.
 */
static void
zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname,
    boolean_t checkunusable)
{
	nvlist_t *detector, *fault;
	boolean_t serialize;

	/*
	 * 'checkunusable' is retained for interface compatibility with
	 * callers but is not consulted here; cast to void to make that
	 * explicit (and silence unused-parameter warnings).
	 */
	(void) checkunusable;

	fmd_hdl_debug(hdl, "solving fault '%s'", faultname);

	/*
	 * Construct the detector from the case data.  The detector is in the
	 * ZFS scheme, and is either the pool or the vdev, depending on
	 * whether this is a vdev or pool fault.
	 */
	detector = fmd_nvl_alloc(hdl, FMD_SLEEP);

	(void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0);
	(void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
	(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL,
	    zcp->zc_data.zc_pool_guid);
	if (zcp->zc_data.zc_vdev_guid != 0) {
		(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV,
		    zcp->zc_data.zc_vdev_guid);
	}

	/* No FRU is available in this variant; pass NULL explicitly. */
	fault = fmd_nvl_create_fault(hdl, faultname, 100, detector,
	    NULL, detector);
	fmd_case_add_suspect(hdl, zcp->zc_case, fault);

	fmd_case_solve(hdl, zcp->zc_case);

	/*
	 * If a remove timer was pending, cancel it and re-serialize the
	 * case so the cleared timer flag is persisted.
	 */
	serialize = B_FALSE;
	if (zcp->zc_data.zc_has_remove_timer) {
		fmd_timer_remove(hdl, zcp->zc_remove_timer);
		zcp->zc_data.zc_has_remove_timer = 0;
		serialize = B_TRUE;
	}
	if (serialize)
		zfs_case_serialize(hdl, zcp);

	nvlist_free(detector);
}
/*
 * For t5440, the memory channel goes like this:
 * VF -> cpuboard -> D0 -> motherboard -> memboard -> D[1..3]
 * If there is a dimm on the memory board, the memory board,
 * motherboard, cpuboard, and dimms are in the suspect list.
 * If there is no dimm on the memory board, the cpu board and
 * the dimms are in the suspect list
 * The board certainty = total board certainty / number of
 * the faulty boards in the suspect list.
 */
void
cmd_branch_create_fault(fmd_hdl_t *hdl, cmd_branch_t *branch,
    const char *fltnm, nvlist_t *asru)
{
	nvlist_t *flt;
	cmd_branch_memb_t *bm;
	cmd_dimm_t *dimm;
	int dimm_count = 0;
	uint_t cert = 0;
	uint_t board_cert = 0;
	char *fruloc = NULL, *membd_label;

	/* attach the dimms to the branch */
	dimm_count = branch_dimmlist_create(hdl, branch);

	if ((membd_label = mbd_label(hdl, branch, "MEM")) != NULL) {
		board_cert = CMD_BOARDS_CERT / 3;	/* CPU, MEM, MB */

		/*
		 * Batoka with memory expansion.  CPU expansion board will
		 * be added below.  Add memory expansion board and
		 * motherboard FRUs here.
		 */
		add_bdflt_to_case(hdl, membd_label, fltnm, board_cert,
		    branch->branch_case.cc_cp);
		fmd_hdl_strfree(hdl, membd_label);
		add_bdflt_to_case(hdl, "MB", fltnm, board_cert,
		    branch->branch_case.cc_cp);
	} else if ((membd_label = mbd_label(hdl, branch, "MR")) != NULL) {
		board_cert = CMD_BOARDS_CERT / 2;	/* MB, MR */

		/*
		 * Maramba or similar platform with mezzanine board.
		 * Motherboard FRU will be added below.  Add the mezzanine
		 * board here.
		 */
		add_bdflt_to_case(hdl, membd_label, fltnm, board_cert,
		    branch->branch_case.cc_cp);
		fmd_hdl_strfree(hdl, membd_label);
	} else {
		board_cert = CMD_BOARDS_CERT;	/* only MB or CPU */
	}

	/*
	 * The code which follows adds to the suspect list the FRU which
	 * contains the ereport 'detector'.  This can be either a CPU
	 * expansion board (Batoka), or motherboard (Huron, Maramba, or
	 * derivative).
	 */
	fruloc = cmd_getfru_loc(hdl, asru);
	flt = cmd_boardfru_create_fault(hdl, asru, fltnm, board_cert, fruloc);
	if (flt != NULL)
		fmd_case_add_suspect(hdl, branch->branch_case.cc_cp, flt);

	/*
	 * Split the remaining certainty evenly across the DIMMs attached
	 * to the branch (guarding against division by zero when no DIMMs
	 * were found).
	 */
	if (dimm_count != 0)
		cert = (100 - CMD_BOARDS_CERT) / dimm_count;

	/* create dimm faults */
	for (bm = cmd_list_next(&branch->branch_dimms); bm != NULL;
	    bm = cmd_list_next(bm)) {
		dimm = bm->dimm;
		if (dimm != NULL) {
			dimm->dimm_flags |= CMD_MEM_F_FAULTING;
			cmd_dimm_dirty(hdl, dimm);
			flt = cmd_dimm_create_fault(hdl, dimm, fltnm, cert);
			/*
			 * Guard against a NULL fault, consistent with the
			 * board-fault path above; the original added the
			 * suspect unconditionally.
			 */
			if (flt != NULL)
				fmd_case_add_suspect(hdl,
				    branch->branch_case.cc_cp, flt);
		}
	}

	if (fruloc != NULL)
		fmd_hdl_strfree(hdl, fruloc);
}
/*
 * Solve a given ZFS case.  This first checks to make sure the diagnosis is
 * still valid, as well as cleaning up any pending timer associated with the
 * case.
 *
 * A suspect of class 'faultname' is built against a ZFS-scheme detector
 * (pool or vdev) constructed from the case data.  When the vdev has an
 * associated FRU string, the up-to-date FRU nvlist is resolved through
 * libtopo and attached to the suspect.  If 'checkunusable' is set, the
 * case is only solved when the detector is currently unusable.
 */
static void
zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname,
    boolean_t checkunusable)
{
	libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
	nvlist_t *detector, *fault;
	boolean_t serialize;
	nvlist_t *fmri, *fru;
	topo_hdl_t *thp;
	int err;

	/*
	 * Construct the detector from the case data.  The detector is in the
	 * ZFS scheme, and is either the pool or the vdev, depending on
	 * whether this is a vdev or pool fault.
	 */
	detector = fmd_nvl_alloc(hdl, FMD_SLEEP);

	(void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0);
	(void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
	(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL,
	    zcp->zc_data.zc_pool_guid);
	if (zcp->zc_data.zc_vdev_guid != 0) {
		(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV,
		    zcp->zc_data.zc_vdev_guid);
	}

	/*
	 * We also want to make sure that the detector (pool or vdev)
	 * properly reflects the diagnosed state, when the fault corresponds
	 * to internal ZFS state (i.e. not checksum or I/O error-induced).
	 * Otherwise, a device which was unavailable early in boot (because
	 * the driver/file wasn't available) and is now healthy will be
	 * mis-diagnosed.
	 */
	if (!fmd_nvl_fmri_present(hdl, detector) ||
	    (checkunusable && !fmd_nvl_fmri_unusable(hdl, detector))) {
		/* Diagnosis no longer valid: close the case, free detector. */
		fmd_case_close(hdl, zcp->zc_case);
		nvlist_free(detector);
		return;
	}

	fru = NULL;
	if (zcp->zc_fru != NULL &&
	    (thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION)) != NULL) {
		/*
		 * If the vdev had an associated FRU, then get the FRU nvlist
		 * from the topo handle and use that in the suspect list.  We
		 * explicitly lookup the FRU because the fmri reported from
		 * the kernel may not have up to date details about the disk
		 * itself (serial, part, etc).
		 */
		if (topo_fmri_str2nvl(thp, zcp->zc_fru, &fmri, &err) == 0) {
			/*
			 * If the disk is part of the system chassis, but the
			 * FRU indicates a different chassis ID than our
			 * current system, then ignore the error.  This
			 * indicates that the device was part of another
			 * cluster head, and for obvious reasons cannot be
			 * imported on this system.
			 */
			if (libzfs_fru_notself(zhdl, zcp->zc_fru)) {
				/*
				 * Abandon the case; release everything
				 * acquired so far (fmri, topo handle,
				 * detector).
				 */
				fmd_case_close(hdl, zcp->zc_case);
				nvlist_free(fmri);
				fmd_hdl_topo_rele(hdl, thp);
				nvlist_free(detector);
				return;
			}

			/*
			 * If the device is no longer present on the system,
			 * or topo_fmri_fru() fails for other reasons, then
			 * fall back to the fmri specified in the vdev.
			 */
			if (topo_fmri_fru(thp, fmri, &fru, &err) != 0)
				fru = fmd_nvl_dup(hdl, fmri, FMD_SLEEP);
			nvlist_free(fmri);
		}

		fmd_hdl_topo_rele(hdl, thp);
	}

	/*
	 * 'fru' may still be NULL here (no zc_fru, topo hold failed, or
	 * str2nvl failed); fmd_nvl_create_fault is presumably tolerant of a
	 * NULL fru -- NOTE(review): confirm against fmd module API.
	 */
	fault = fmd_nvl_create_fault(hdl, faultname, 100, detector,
	    fru, detector);
	fmd_case_add_suspect(hdl, zcp->zc_case, fault);

	nvlist_free(fru);

	fmd_case_solve(hdl, zcp->zc_case);

	/*
	 * Cancel any pending remove timer and re-serialize the case so the
	 * cleared timer flag is persisted across fmd restarts.
	 */
	serialize = B_FALSE;
	if (zcp->zc_data.zc_has_remove_timer) {
		fmd_timer_remove(hdl, zcp->zc_remove_timer);
		zcp->zc_data.zc_has_remove_timer = 0;
		serialize = B_TRUE;
	}
	if (serialize)
		zfs_case_serialize(hdl, zcp);

	nvlist_free(detector);
}