Example #1
0
static void
add_bdflt_to_case(fmd_hdl_t *hdl, char *label, const char *fltnm,
    uint8_t board_cert, fmd_case_t *cp)
{
	nvlist_t *memb_nvl, *flt;

	memb_nvl = fru_by_label(hdl, label);
	if (memb_nvl != NULL) {
		flt = cmd_nvl_create_fault(hdl, fltnm, board_cert,
		    memb_nvl, memb_nvl, NULL);
		flt = cmd_fault_add_location(hdl, flt, label);
		if (flt != NULL) {
			fmd_case_add_suspect(hdl, cp, flt);
		}
		nvlist_free(memb_nvl);
	}
}
Example #2
0
/*
 * Solve a given ZFS case.  This first checks to make sure the diagnosis is
 * still valid, as well as cleaning up any pending timer associated with the
 * case.
 */
static void
zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname,
    boolean_t checkunusable)
{
	nvlist_t *detector, *fault;
	boolean_t serialize;
	nvlist_t *fru = NULL;
	fmd_hdl_debug(hdl, "solving fault '%s'", faultname);

	/*
	 * Construct the detector from the case data.  The detector is in the
	 * ZFS scheme, and is either the pool or the vdev, depending on whether
	 * this is a vdev or pool fault.
	 */
	detector = fmd_nvl_alloc(hdl, FMD_SLEEP);

	(void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0);
	(void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
	(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL,
	    zcp->zc_data.zc_pool_guid);
	if (zcp->zc_data.zc_vdev_guid != 0) {
		(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV,
		    zcp->zc_data.zc_vdev_guid);
	}

	fault = fmd_nvl_create_fault(hdl, faultname, 100, detector,
	    fru, detector);
	fmd_case_add_suspect(hdl, zcp->zc_case, fault);

	nvlist_free(fru);

	fmd_case_solve(hdl, zcp->zc_case);

	serialize = B_FALSE;
	if (zcp->zc_data.zc_has_remove_timer) {
		fmd_timer_remove(hdl, zcp->zc_remove_timer);
		zcp->zc_data.zc_has_remove_timer = 0;
		serialize = B_TRUE;
	}
	if (serialize)
		zfs_case_serialize(hdl, zcp);

	nvlist_free(detector);
}
Example #3
0
/*
 * For t5440, the memory channel goes like this:
 * VF -> cpuboard -> D0 -> motherboard -> memboard -> D[1..3]
 * If there is a dimm on the memory board, the memory board,
 * motherboard, cpuboard, and dimms are in the suspect list.
 * If there is no dimm on the memory board, the cpu board and
 * the dimms are in the suspect list
 * The board certainty = total board certainty / number of
 * the faulty boards in the suspect list.
 */
void
cmd_branch_create_fault(fmd_hdl_t *hdl, cmd_branch_t *branch,
    const char *fltnm, nvlist_t *asru)
{
	nvlist_t *flt;
	cmd_branch_memb_t *bm;
	cmd_dimm_t *dimm;
	int dimm_count = 0;
	uint_t cert = 0;
	uint_t board_cert = 0;
	char *fruloc = NULL, *membd_label;

	/* attach the dimms to the branch */
	dimm_count = branch_dimmlist_create(hdl, branch);

	if ((membd_label = mbd_label(hdl, branch, "MEM")) != NULL) {
		board_cert = CMD_BOARDS_CERT / 3; /* CPU, MEM, MB */

		/*
		 * Batoka with memory expansion.  CPU expansion board will
		 * be added below.  Add memory expansion board and motherboard
		 * FRUs here.
		 */

		add_bdflt_to_case(hdl, membd_label, fltnm, board_cert,
		    branch->branch_case.cc_cp);
		fmd_hdl_strfree(hdl, membd_label);
		add_bdflt_to_case(hdl, "MB", fltnm, board_cert,
		    branch->branch_case.cc_cp);

	} else if ((membd_label = mbd_label(hdl, branch, "MR")) != NULL) {

		board_cert = CMD_BOARDS_CERT / 2; /* MB, MR */

		/*
		 * Maramba or similar platform with mezzanine board.
		 * Motherboard FRU will be added below.  Add the mezzanine
		 * board here.
		 */

		add_bdflt_to_case(hdl, membd_label, fltnm, board_cert,
		    branch->branch_case.cc_cp);
		fmd_hdl_strfree(hdl, membd_label);
	} else {
		board_cert = CMD_BOARDS_CERT; /* only MB or CPU */
	}

	/*
	 * The code which follows adds to the suspect list the FRU which
	 * contains the ereport 'detector'.  This can be either a CPU
	 * expansion board (Batoka), or motherboard (Huron, Maramba, or
	 * derivative).
	 */

	fruloc = cmd_getfru_loc(hdl, asru);
	flt = cmd_boardfru_create_fault(hdl, asru, fltnm, board_cert, fruloc);
	if (flt != NULL)
		fmd_case_add_suspect(hdl, branch->branch_case.cc_cp, flt);

	if (dimm_count != 0)
		cert = (100 - CMD_BOARDS_CERT) / dimm_count;

	/* create dimm faults */
	for (bm = cmd_list_next(&branch->branch_dimms); bm != NULL;
	    bm = cmd_list_next(bm)) {
		dimm = bm->dimm;
		if (dimm != NULL) {
			dimm->dimm_flags |= CMD_MEM_F_FAULTING;
			cmd_dimm_dirty(hdl, dimm);
			flt = cmd_dimm_create_fault(hdl, dimm, fltnm, cert);
			fmd_case_add_suspect(hdl, branch->branch_case.cc_cp,
			    flt);
		}
	}
	if (fruloc != NULL)
		fmd_hdl_strfree(hdl, fruloc);
}
Example #4
0
/*
 * Solve a given ZFS case.  This first checks to make sure the diagnosis is
 * still valid, as well as cleaning up any pending timer associated with the
 * case.
 */
static void
zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname,
    boolean_t checkunusable)
{
	libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
	nvlist_t *detector, *fault;
	boolean_t serialize;
	nvlist_t *fmri, *fru;
	topo_hdl_t *thp;
	int err;

	/*
	 * Construct the detector from the case data.  The detector is in the
	 * ZFS scheme, and is either the pool or the vdev, depending on whether
	 * this is a vdev or pool fault.
	 */
	detector = fmd_nvl_alloc(hdl, FMD_SLEEP);

	(void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0);
	(void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
	(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL,
	    zcp->zc_data.zc_pool_guid);
	if (zcp->zc_data.zc_vdev_guid != 0) {
		(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV,
		    zcp->zc_data.zc_vdev_guid);
	}

	/*
	 * We also want to make sure that the detector (pool or vdev) properly
	 * reflects the diagnosed state, when the fault corresponds to internal
	 * ZFS state (i.e. not checksum or I/O error-induced).  Otherwise, a
	 * device which was unavailable early in boot (because the driver/file
	 * wasn't available) and is now healthy will be mis-diagnosed.
	 */
	if (!fmd_nvl_fmri_present(hdl, detector) ||
	    (checkunusable && !fmd_nvl_fmri_unusable(hdl, detector))) {
		fmd_case_close(hdl, zcp->zc_case);
		nvlist_free(detector);
		return;
	}


	fru = NULL;
	if (zcp->zc_fru != NULL &&
	    (thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION)) != NULL) {
		/*
		 * If the vdev had an associated FRU, then get the FRU nvlist
		 * from the topo handle and use that in the suspect list.  We
		 * explicitly lookup the FRU because the fmri reported from the
		 * kernel may not have up to date details about the disk itself
		 * (serial, part, etc).
		 */
		if (topo_fmri_str2nvl(thp, zcp->zc_fru, &fmri, &err) == 0) {
			/*
			 * If the disk is part of the system chassis, but the
			 * FRU indicates a different chassis ID than our
			 * current system, then ignore the error.  This
			 * indicates that the device was part of another
			 * cluster head, and for obvious reasons cannot be
			 * imported on this system.
			 */
			if (libzfs_fru_notself(zhdl, zcp->zc_fru)) {
				fmd_case_close(hdl, zcp->zc_case);
				nvlist_free(fmri);
				fmd_hdl_topo_rele(hdl, thp);
				nvlist_free(detector);
				return;
			}

			/*
			 * If the device is no longer present on the system, or
			 * topo_fmri_fru() fails for other reasons, then fall
			 * back to the fmri specified in the vdev.
			 */
			if (topo_fmri_fru(thp, fmri, &fru, &err) != 0)
				fru = fmd_nvl_dup(hdl, fmri, FMD_SLEEP);
			nvlist_free(fmri);
		}

		fmd_hdl_topo_rele(hdl, thp);
	}

	fault = fmd_nvl_create_fault(hdl, faultname, 100, detector,
	    fru, detector);
	fmd_case_add_suspect(hdl, zcp->zc_case, fault);

	nvlist_free(fru);

	fmd_case_solve(hdl, zcp->zc_case);

	serialize = B_FALSE;
	if (zcp->zc_data.zc_has_remove_timer) {
		fmd_timer_remove(hdl, zcp->zc_remove_timer);
		zcp->zc_data.zc_has_remove_timer = 0;
		serialize = B_TRUE;
	}
	if (serialize)
		zfs_case_serialize(hdl, zcp);

	nvlist_free(detector);
}