static int
cpu_retry(fmd_hdl_t *hdl, cma_cpu_t *cpu)
{
	int rc = 0;

	fmd_hdl_debug(hdl, "cpu_retry()\n");

	if (cpu->cpu_fmri == NULL) {
		return (1);
	}

	if (!fmd_nvl_fmri_present(hdl, cpu->cpu_fmri)) {
		fmd_hdl_debug(hdl, "cpu %u is not present", cpu->cpuid);
		return (1);
	}

	rc = cpu_cmd(hdl, cpu->cpu_fmri, P_STATUS);
	if (rc == P_FAULTED || rc == P_OFFLINE) {
		fmd_hdl_debug(hdl, "cpu %u is offlined on retry %u\n",
		    cpu->cpuid, cpu->cpu_nretries);
		cma_stats.cpu_flts.fmds_value.ui64++;

		if (cpu->cpu_uuid != NULL)
			fmd_case_uuclose(hdl, cpu->cpu_uuid);
		return (1); /* success */
	}

	if (rc == -1) {
		fmd_hdl_debug(hdl, "failed to retry cpu %u\n", cpu->cpuid);
		cma_stats.page_fails.fmds_value.ui64++;
		return (1); /* give up */
	}

	return (0);
}
Exemple #2
0
static int
cpu_present(fmd_hdl_t *hdl, nvlist_t *asru, uint32_t *cpuid)
{
    nvlist_t *cp_asru;
    uint32_t i;

    if (nvlist_dup(asru, &cp_asru, 0) != 0) {
        fmd_hdl_debug(hdl, "unable to alloc asru for thread\n");
        return (-1);
    }

    for (i = *cpuid; i < *cpuid + UTS2_CPUS_PER_CHIP; i++) {

        (void) nvlist_remove_all(cp_asru, FM_FMRI_CPU_ID);

        if (nvlist_add_uint32(cp_asru, FM_FMRI_CPU_ID, i) == 0) {
            if (fmd_nvl_fmri_present(hdl, cp_asru) &&
                    !fmd_nvl_fmri_unusable(hdl, cp_asru)) {
                nvlist_free(cp_asru);
                *cpuid = i;
                return (0);
            }
        }
    }
    nvlist_free(cp_asru);
    return (-1);
}
Exemple #3
0
cmd_dimm_t *
cmd_dimm_lookup(fmd_hdl_t *hdl, nvlist_t *asru)
{
	cmd_dimm_t *dimm;
	const char *unum;

	if ((unum = cmd_fmri_get_unum(asru)) == NULL) {
		CMD_STAT_BUMP(bad_mem_asru);
		return (NULL);
	}

	dimm = dimm_lookup_by_unum(unum);

	if (dimm != NULL && !fmd_nvl_fmri_present(hdl, dimm->dimm_asru_nvl)) {
		/*
		 * The DIMM doesn't exist anymore, so we need to delete the
		 * state structure, which is now out of date.  The containing
		 * bank (if any) is also out of date, so blow it away too.
		 */
		fmd_hdl_debug(hdl, "dimm_lookup: discarding old dimm\n");

		if (dimm->dimm_bank != NULL)
			cmd_bank_destroy(hdl, dimm->dimm_bank);
		cmd_dimm_destroy(hdl, dimm);

		return (NULL);
	}

	return (dimm);
}
Exemple #4
0
static int
page_retry(fmd_hdl_t *hdl, cma_page_t *page)
{
    int rc;

    if (page->pg_asru != NULL &&
            !fmd_nvl_fmri_present(hdl, page->pg_asru)) {
        fmd_hdl_debug(hdl, "page retire overtaken by events");
        cma_stats.page_nonent.fmds_value.ui64++;

        if (page->pg_uuid != NULL)
            fmd_case_uuclose(hdl, page->pg_uuid);
        return (1); /* no longer a page to retire */
    }

    if (page->pg_rsrc == NULL ||
            (rc = fmd_nvl_fmri_service_state(hdl, page->pg_rsrc)) < 0)
        rc = cma_fmri_page_service_state(hdl, page->pg_asru);

    if (rc == FMD_SERVICE_STATE_UNUSABLE) {
        fmd_hdl_debug(hdl, "retired page 0x%llx on retry %u\n",
                      page->pg_addr, page->pg_nretries);
        cma_stats.page_flts.fmds_value.ui64++;

        if (page->pg_uuid != NULL)
            fmd_case_uuclose(hdl, page->pg_uuid);
        return (1); /* page retired */
    }

    if (rc == FMD_SERVICE_STATE_ISOLATE_PENDING) {
        fmd_hdl_debug(hdl, "scheduling another retry for 0x%llx\n",
                      page->pg_addr);
        return (0); /* schedule another retry */
    } else {
        fmd_hdl_debug(hdl, "failed to retry page 0x%llx "
                      "retirement: %s\n", page->pg_addr,
                      strerror(errno));

        cma_stats.page_fails.fmds_value.ui64++;
        return (1); /* give up */
    }
}
Exemple #5
0
cmd_dimm_t *
cmd_dimm_create(fmd_hdl_t *hdl, nvlist_t *asru)
{
	cmd_dimm_t *dimm;
	const char *unum;
	nvlist_t *fmri;
	size_t nserids = 0;
	char **serids = NULL;

	if (!fmd_nvl_fmri_present(hdl, asru)) {
		fmd_hdl_debug(hdl, "dimm_lookup: discarding old ereport\n");
		return (NULL);
	}

	if ((unum = cmd_fmri_get_unum(asru)) == NULL) {
		CMD_STAT_BUMP(bad_mem_asru);
		return (NULL);
	}

#ifdef sun4v
	if (nvlist_lookup_string_array(asru, FM_FMRI_HC_SERIAL_ID, &serids,
	    &nserids) != 0) {
		fmd_hdl_debug(hdl, "sun4v mem: FMRI does not"
		    " have serial_ids\n");
		CMD_STAT_BUMP(bad_mem_asru);
		return (NULL);
	}
#endif
	fmri = cmd_mem_fmri_create(unum, serids, nserids);
	if (fmd_nvl_fmri_expand(hdl, fmri) < 0) {
		CMD_STAT_BUMP(bad_mem_asru);
		nvlist_free(fmri);
		return (NULL);
	}

	fmd_hdl_debug(hdl, "dimm_create: creating new DIMM %s\n", unum);
	CMD_STAT_BUMP(dimm_creat);

	dimm = fmd_hdl_zalloc(hdl, sizeof (cmd_dimm_t), FMD_SLEEP);
	dimm->dimm_nodetype = CMD_NT_DIMM;
	dimm->dimm_version = CMD_DIMM_VERSION;

	cmd_bufname(dimm->dimm_bufname, sizeof (dimm->dimm_bufname), "dimm_%s",
	    unum);
	cmd_fmri_init(hdl, &dimm->dimm_asru, fmri, "dimm_asru_%s", unum);

	nvlist_free(fmri);

	(void) nvlist_lookup_string(dimm->dimm_asru_nvl, FM_FMRI_MEM_UNUM,
	    (char **)&dimm->dimm_unum);

	dimm_attach_to_bank(hdl, dimm);

	cmd_mem_retirestat_create(hdl, &dimm->dimm_retstat, dimm->dimm_unum, 0,
	    CMD_DIMM_STAT_PREFIX);

	cmd_list_append(&cmd.cmd_dimms, dimm);
	cmd_dimm_dirty(hdl, dimm);

	return (dimm);
}
Exemple #6
0
/*
 * Solve a given ZFS case.  This first checks to make sure the diagnosis is
 * still valid, as well as cleaning up any pending timer associated with the
 * case.
 */
static void
zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname,
    boolean_t checkunusable)
{
	libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
	nvlist_t *detector, *fault;
	boolean_t serialize;
	nvlist_t *fmri, *fru;
	topo_hdl_t *thp;
	int err;

	/*
	 * Construct the detector from the case data.  The detector is in the
	 * ZFS scheme, and is either the pool or the vdev, depending on whether
	 * this is a vdev or pool fault.
	 */
	detector = fmd_nvl_alloc(hdl, FMD_SLEEP);

	(void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0);
	(void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
	(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL,
	    zcp->zc_data.zc_pool_guid);
	if (zcp->zc_data.zc_vdev_guid != 0) {
		(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV,
		    zcp->zc_data.zc_vdev_guid);
	}

	/*
	 * We also want to make sure that the detector (pool or vdev) properly
	 * reflects the diagnosed state, when the fault corresponds to internal
	 * ZFS state (i.e. not checksum or I/O error-induced).  Otherwise, a
	 * device which was unavailable early in boot (because the driver/file
	 * wasn't available) and is now healthy will be mis-diagnosed.
	 */
	if (!fmd_nvl_fmri_present(hdl, detector) ||
	    (checkunusable && !fmd_nvl_fmri_unusable(hdl, detector))) {
		fmd_case_close(hdl, zcp->zc_case);
		nvlist_free(detector);
		return;
	}


	fru = NULL;
	if (zcp->zc_fru != NULL &&
	    (thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION)) != NULL) {
		/*
		 * If the vdev had an associated FRU, then get the FRU nvlist
		 * from the topo handle and use that in the suspect list.  We
		 * explicitly lookup the FRU because the fmri reported from the
		 * kernel may not have up to date details about the disk itself
		 * (serial, part, etc).
		 */
		if (topo_fmri_str2nvl(thp, zcp->zc_fru, &fmri, &err) == 0) {
			/*
			 * If the disk is part of the system chassis, but the
			 * FRU indicates a different chassis ID than our
			 * current system, then ignore the error.  This
			 * indicates that the device was part of another
			 * cluster head, and for obvious reasons cannot be
			 * imported on this system.
			 */
			if (libzfs_fru_notself(zhdl, zcp->zc_fru)) {
				fmd_case_close(hdl, zcp->zc_case);
				nvlist_free(fmri);
				fmd_hdl_topo_rele(hdl, thp);
				nvlist_free(detector);
				return;
			}

			/*
			 * If the device is no longer present on the system, or
			 * topo_fmri_fru() fails for other reasons, then fall
			 * back to the fmri specified in the vdev.
			 */
			if (topo_fmri_fru(thp, fmri, &fru, &err) != 0)
				fru = fmd_nvl_dup(hdl, fmri, FMD_SLEEP);
			nvlist_free(fmri);
		}

		fmd_hdl_topo_rele(hdl, thp);
	}

	fault = fmd_nvl_create_fault(hdl, faultname, 100, detector,
	    fru, detector);
	fmd_case_add_suspect(hdl, zcp->zc_case, fault);

	nvlist_free(fru);

	fmd_case_solve(hdl, zcp->zc_case);

	serialize = B_FALSE;
	if (zcp->zc_data.zc_has_remove_timer) {
		fmd_timer_remove(hdl, zcp->zc_remove_timer);
		zcp->zc_data.zc_has_remove_timer = 0;
		serialize = B_TRUE;
	}
	if (serialize)
		zfs_case_serialize(hdl, zcp);

	nvlist_free(detector);
}
Exemple #7
0
/*ARGSUSED*/
int
cma_page_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
                const char *uuid, boolean_t repair)
{
    cma_page_t *page;
    uint64_t pageaddr;
    const char *action = repair ? "unretire" : "retire";
    int rc;
    nvlist_t *rsrc = NULL, *asrucp = NULL, *hcsp;

    (void) nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc);

    if (nvlist_dup(asru, &asrucp, 0) != 0) {
        fmd_hdl_debug(hdl, "page retire nvlist dup failed\n");
        return (CMA_RA_FAILURE);
    }

    /* It should already be expanded, but we'll do it again anyway */
    if (fmd_nvl_fmri_expand(hdl, asrucp) < 0) {
        fmd_hdl_debug(hdl, "failed to expand page asru\n");
        cma_stats.bad_flts.fmds_value.ui64++;
        nvlist_free(asrucp);
        return (CMA_RA_FAILURE);
    }

    if (!repair && !fmd_nvl_fmri_present(hdl, asrucp)) {
        fmd_hdl_debug(hdl, "page retire overtaken by events\n");
        cma_stats.page_nonent.fmds_value.ui64++;
        nvlist_free(asrucp);
        return (CMA_RA_SUCCESS);
    }

    /* Figure out physaddr from resource or asru */
    if (rsrc == NULL ||
            nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcsp) != 0 ||
            (nvlist_lookup_uint64(hcsp, "asru-" FM_FMRI_HC_SPECIFIC_PHYSADDR,
                                  &pageaddr) != 0 && nvlist_lookup_uint64(hcsp,
                                          FM_FMRI_HC_SPECIFIC_PHYSADDR, &pageaddr) != 0)) {
        if (nvlist_lookup_uint64(asrucp, FM_FMRI_MEM_PHYSADDR,
                                 &pageaddr) != 0) {
            fmd_hdl_debug(hdl, "mem fault missing 'physaddr'\n");
            cma_stats.bad_flts.fmds_value.ui64++;
            nvlist_free(asrucp);
            return (CMA_RA_FAILURE);
        }
    }

    if (repair) {
        if (!cma.cma_page_dounretire) {
            fmd_hdl_debug(hdl, "suppressed unretire of page %llx\n",
                          (u_longlong_t)pageaddr);
            cma_stats.page_supp.fmds_value.ui64++;
            nvlist_free(asrucp);
            return (CMA_RA_SUCCESS);
        }
        /* If unretire via topo fails, we fall back to legacy way */
        if (rsrc == NULL || (rc = fmd_nvl_fmri_unretire(hdl, rsrc)) < 0)
            rc = cma_fmri_page_unretire(hdl, asrucp);
    } else {
        if (!cma.cma_page_doretire) {
            fmd_hdl_debug(hdl, "suppressed retire of page %llx\n",
                          (u_longlong_t)pageaddr);
            cma_stats.page_supp.fmds_value.ui64++;
            nvlist_free(asrucp);
            return (CMA_RA_FAILURE);
        }
        /* If retire via topo fails, we fall back to legacy way */
        if (rsrc == NULL || (rc = fmd_nvl_fmri_retire(hdl, rsrc)) < 0)
            rc = cma_fmri_page_retire(hdl, asrucp);
    }

    if (rc == FMD_AGENT_RETIRE_DONE) {
        fmd_hdl_debug(hdl, "%sd page 0x%llx\n",
                      action, (u_longlong_t)pageaddr);
        if (repair)
            cma_stats.page_repairs.fmds_value.ui64++;
        else
            cma_stats.page_flts.fmds_value.ui64++;
        nvlist_free(asrucp);
        return (CMA_RA_SUCCESS);
    } else if (repair || rc != FMD_AGENT_RETIRE_ASYNC) {
        fmd_hdl_debug(hdl, "%s of page 0x%llx failed, will not "
                      "retry: %s\n", action, (u_longlong_t)pageaddr,
                      strerror(errno));

        cma_stats.page_fails.fmds_value.ui64++;
        nvlist_free(asrucp);
        return (CMA_RA_FAILURE);
    }

    /*
     * The page didn't immediately retire.  We'll need to periodically
     * check to see if it has been retired.
     */
    fmd_hdl_debug(hdl, "page didn't retire - sleeping\n");

    page = fmd_hdl_zalloc(hdl, sizeof (cma_page_t), FMD_SLEEP);
    page->pg_addr = pageaddr;
    if (rsrc != NULL)
        (void) nvlist_dup(rsrc, &page->pg_rsrc, 0);
    page->pg_asru = asrucp;
    if (uuid != NULL)
        page->pg_uuid = fmd_hdl_strdup(hdl, uuid, FMD_SLEEP);

    page->pg_next = cma.cma_pages;
    cma.cma_pages = page;

    if (cma.cma_page_timerid != 0)
        fmd_timer_remove(hdl, cma.cma_page_timerid);

    cma.cma_page_curdelay = cma.cma_page_mindelay;

    cma.cma_page_timerid =
        fmd_timer_install(hdl, NULL, NULL, cma.cma_page_curdelay);

    /* Don't free asrucp here.  This FMRI will be needed for retry. */
    return (CMA_RA_FAILURE);
}