Esempio n. 1
0
/*
 * Timer callback for the SP monitor.  Queries the service processor uptime
 * and generation number, posts a sysevent when those values indicate that
 * the SP was reset (or when no uptime has been recorded yet), and finally
 * installs the timer again using the configured interval.
 */
/*ARGSUSED*/
static void
sp_timeout(fmd_hdl_t *hdl, id_t id, void *data)
{
	sp_monitor_t *smp = fmd_hdl_getspecific(hdl);
	uint32_t uptime, gen;

	if (ipmi_sunoem_uptime(smp->sm_hdl, &uptime, &gen) == 0) {
		/*
		 * Post a sysevent when:
		 *  - no uptime has been recorded so far (sm_seconds is 0,
		 *    e.g. right after the module loads), since a reset may
		 *    have been missed;
		 *  - the generation number differs from the recorded one;
		 *  - the uptime went backwards, i.e. the SP rebooted.
		 */
		if (smp->sm_seconds == 0 ||
		    gen != smp->sm_generation ||
		    uptime < smp->sm_seconds) {
			sp_post_sysevent(hdl);
		}

		/* Remember what we saw for the next comparison. */
		smp->sm_seconds = uptime;
		smp->sm_generation = gen;
	} else {
		/*
		 * Uptime failures are only logged; the appropriate event is
		 * generated once the SP answers again.
		 */
		fmd_hdl_debug(hdl, "failed to get uptime: %s",
		    ipmi_errmsg(smp->sm_hdl));
	}

	(void) fmd_timer_install(hdl, NULL, NULL, smp->sm_interval);
}
Esempio n. 2
0
/*
 * Rebuild the in-memory state of an active case from its persistent buffer.
 *
 * Allocates a zfs_case_t, fills zc_data from the CASE_DATA buffer attached to
 * 'cp', re-installs the remove timer if the saved data says one was pending,
 * links the case into the zfs_cases list and attaches it to 'cp' as the
 * case-specific data.
 *
 * Returns the new case, or NULL (after freeing the allocation) when the
 * stored version is newer than CASE_DATA_VERSION_SERD.
 */
static zfs_case_t *
zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
{
	zfs_case_t *zc;

	zc = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP);
	zc->zc_case = cp;

	fmd_buf_read(hdl, cp, CASE_DATA, &zc->zc_data, sizeof (zc->zc_data));

	/* Refuse data written with a version we do not know about. */
	if (zc->zc_data.zc_version > CASE_DATA_VERSION_SERD) {
		fmd_hdl_free(hdl, zc, sizeof (zfs_case_t));
		return (NULL);
	}

	/*
	 * Older versions that lack the SERD engine name need no special
	 * treatment: fmd_buf_read() has already zeroed the tail of the
	 * buffer.
	 */

	if (zc->zc_data.zc_has_remove_timer) {
		zc->zc_remove_timer = fmd_timer_install(hdl, zc, NULL,
		    zfs_remove_timeout);
	}

	/* Append to the global case list. */
	uu_list_node_init(zc, &zc->zc_node, zfs_case_pool);
	(void) uu_list_insert_before(zfs_cases, NULL, zc);

	fmd_case_setspecific(hdl, cp, zc);

	return (zc);
}
Esempio n. 3
0
/*
 * Module entry point for the SP monitor.
 *
 * Registers with fmd, allocates the per-module sp_monitor_t, opens an IPMI
 * connection over the BMC transport and makes a first uptime query.  The
 * module unregisters itself again when no BMC connection can be opened or
 * when the uptime command is not recognized; otherwise the polling timer is
 * installed.
 */
void
_fmd_init(fmd_hdl_t *hdl)
{
	sp_monitor_t *smp;
	char *errmsg;
	int rv;

	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0)
		return;

	smp = fmd_hdl_zalloc(hdl, sizeof (sp_monitor_t), FMD_SLEEP);
	fmd_hdl_setspecific(hdl, smp);

	smp->sm_hdl = ipmi_open(&rv, &errmsg, IPMI_TRANSPORT_BMC, NULL);
	if (smp->sm_hdl == NULL) {
		/*
		 * A failure to open the BMC device (/dev/ipmi0 missing) means
		 * there is nothing to monitor: unload quietly.  Any other
		 * failure is reported through fmd_hdl_abort().
		 */
		if (rv != EIPMI_BMC_OPEN_FAILED) {
			fmd_hdl_abort(hdl, "failed to initialize IPMI "
			    "connection: %s\n", errmsg);
		}
		fmd_hdl_debug(hdl, "failed to load: no IPMI connection "
		    "present");
		fmd_hdl_free(hdl, smp, sizeof (sp_monitor_t));
		fmd_hdl_unregister(hdl);
		return;
	}

	/*
	 * Try uptime() once up front.  An unrecognized command means the
	 * platform is unsupported and the module unloads; every other error
	 * is considered transient and does not prevent loading.
	 */
	rv = ipmi_sunoem_uptime(smp->sm_hdl, &smp->sm_seconds,
	    &smp->sm_generation);
	if (rv != 0 && ipmi_errno(smp->sm_hdl) == EIPMI_INVALID_COMMAND) {
		fmd_hdl_debug(hdl, "failed to load: uptime command "
		    "not supported");
		ipmi_close(smp->sm_hdl);
		fmd_hdl_free(hdl, smp, sizeof (sp_monitor_t));
		fmd_hdl_unregister(hdl);
		return;
	}

	smp->sm_interval = fmd_prop_get_int64(hdl, "interval");

	if (rv != 0) {
		fmd_hdl_debug(hdl, "successfully loaded, but uptime call "
		    "failed: %s", ipmi_errmsg(smp->sm_hdl));
	} else {
		fmd_hdl_debug(hdl, "successfully loaded, uptime = %u seconds "
		    "(generation %u)", smp->sm_seconds, smp->sm_generation);
	}

	/*
	 * Kick off the recurring timer with a zero delay.
	 */
	(void) fmd_timer_install(hdl, NULL, NULL, 0);
}
Esempio n. 4
0
/*
 * Issue the state-change command for the cpu named by 'asru' and queue the
 * cpu on the cma cpu list so that its status is checked later from the cpu
 * timer.
 *
 * hdl      - fmd module handle
 * asru     - cpu FMRI; must carry FM_FMRI_CPU_ID
 * uuid     - optional case uuid, copied into the queued entry when non-NULL
 * cpustate - requested state, passed through to cpu_cmd()
 *
 * Every path returns CMA_RA_FAILURE: either the FMRI carries no cpu id, or
 * the cpu has been queued for a later status check.
 */
int
cpu_offline(fmd_hdl_t *hdl, nvlist_t *asru, const char *uuid, int cpustate)
{
	cma_cpu_t *node;
	uint_t cpuid;
	int attempt;

	if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
		fmd_hdl_debug(hdl, "missing '%s'\n", FM_FMRI_CPU_ID);
		cma_stats.bad_flts.fmds_value.ui64++;
		return (CMA_RA_FAILURE);
	}

	/*
	 * Offlining through ldom_fmri_retire() may complete asynchronously,
	 * so the command is attempted up to cma_cpu_tries times (sleeping
	 * cma_cpu_delay after each failed attempt) and the actual cpu status
	 * is verified later from the timer.
	 */
	attempt = 0;
	while (attempt < cma.cma_cpu_tries) {
		if (cpu_cmd(hdl, asru, cpustate) != -1) {
			cma_stats.cpu_flts.fmds_value.ui64++;
			break;
		}
		attempt++;
		(void) nanosleep(&cma.cma_cpu_delay, NULL);
	}

	/* All attempts were used up without the command succeeding. */
	if (attempt >= cma.cma_cpu_tries)
		cma_stats.cpu_fails.fmds_value.ui64++;

	/*
	 * The cpu state is not checked here; that is left to the timer.
	 */
	fmd_hdl_debug(hdl, "cpu is not offline yet - sleeping\n");

	/*
	 * Build a list entry for this cpu and push it on the list head.
	 */
	node = fmd_hdl_zalloc(hdl, sizeof (cma_cpu_t), FMD_SLEEP);
	(void) nvlist_dup(asru, &node->cpu_fmri, 0);
	if (uuid != NULL)
		node->cpu_uuid = fmd_hdl_strdup(hdl, uuid, FMD_SLEEP);
	node->cpuid = cpuid;

	node->cpu_next = cma.cma_cpus;
	cma.cma_cpus = node;

	/*
	 * Replace any pending cpu timer with one that fires after the
	 * minimum delay.
	 */
	if (cma.cma_cpu_timerid != 0)
		fmd_timer_remove(hdl, cma.cma_cpu_timerid);

	cma.cma_cpu_curdelay = cma.cma_cpu_mindelay;
	cma.cma_cpu_timerid =
	    fmd_timer_install(hdl, NULL, NULL, cma.cma_cpu_curdelay);

	return (CMA_RA_FAILURE);
}
Esempio n. 5
0
/*
 * Queue a cpu for a later offline-status check driven by the cpu timer.
 *
 * hdl    - fmd module handle
 * fmri   - FMRI of the cpu; only the "cpu" and "hc" schemes are handled
 * uuid   - optional case uuid, copied into the queued entry when non-NULL
 * repair - when set, nothing is queued
 *
 * For a cpu-scheme FMRI the cpu id is read from the FMRI itself.  For an
 * hc-scheme FMRI the ASRU is looked up through topo and the cpu id is read
 * from it.  The function returns without queueing anything if the scheme or
 * the cpu id cannot be determined.
 */
void
cma_cpu_start_retry(fmd_hdl_t *hdl, nvlist_t *fmri, const char *uuid,
    boolean_t repair)
{
	cma_cpu_t *cpu;
	char *scheme;
	uint_t cpuid;
	nvlist_t *asru = NULL;
	topo_hdl_t *thp;
	int err;
	int rc;

	if (repair || nvlist_lookup_string(fmri, FM_FMRI_SCHEME, &scheme) != 0)
		return;
	if (strcmp(scheme, FM_FMRI_SCHEME_CPU) == 0) {
		if (nvlist_lookup_uint32(fmri, FM_FMRI_CPU_ID, &cpuid) != 0)
			return;
	} else if (strcmp(scheme, FM_FMRI_SCHEME_HC) != 0) {
		return;
	} else {
		/* lookup cpuid from ASRU */
		thp = fmd_fmri_topo_hold(TOPO_VERSION);
		if (thp != NULL) {
			(void) topo_fmri_asru(thp, fmri, &asru, &err);
			fmd_fmri_topo_rele(thp);
		}

		/*
		 * asru is still NULL if the topo handle or the ASRU lookup
		 * was unavailable; as before, the lookup below is relied on
		 * to fail in that case.
		 */
		rc = nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid);

		/*
		 * The ASRU was only needed to obtain the cpu id (copied by
		 * value above), so release it on the success path as well as
		 * on failure.  Previously it was leaked whenever the lookup
		 * succeeded.
		 */
		nvlist_free(asru);

		if (rc != 0)
			return;
	}

	/*
	 * The cpu state is not checked here; that is left to the timer.
	 */
	fmd_hdl_debug(hdl, "cpu %u is not offline yet - sleeping\n", cpuid);

	/*
	 * Create a cpu node and add to the head of the cpu list
	 *
	 * NOTE(review): the nvlist_dup() result is ignored, so cpu_fmri stays
	 * NULL (from the zeroed allocation) if the copy fails; presumably the
	 * retry path copes with that - confirm.
	 */
	cpu = fmd_hdl_zalloc(hdl, sizeof (cma_cpu_t), FMD_SLEEP);
	(void) nvlist_dup(fmri, &cpu->cpu_fmri, 0);
	if (uuid != NULL)
		cpu->cpu_uuid = fmd_hdl_strdup(hdl, uuid, FMD_SLEEP);

	cpu->cpuid = cpuid;
	cpu->cpu_next = cma.cma_cpus;
	cma.cma_cpus = cpu;

	/*
	 * Replace any pending cpu timer with one that fires after the
	 * minimum delay.
	 */
	if (cma.cma_cpu_timerid != 0)
		fmd_timer_remove(hdl, cma.cma_cpu_timerid);

	cma.cma_cpu_curdelay = cma.cma_cpu_mindelay;

	cma.cma_cpu_timerid =
	    fmd_timer_install(hdl, NULL, NULL, cma.cma_cpu_curdelay);
}
Esempio n. 6
0
/*
 * Timer callback for the disk lights module: clears the "triggered" flag,
 * examines the topology and installs the timer for the next regular poll.
 */
/*ARGSUSED*/
static void
disklights_timeout(fmd_hdl_t *hdl, id_t id, void *data)
{
	disk_lights_t *lights = fmd_hdl_getspecific(hdl);

	/* The timer has fired, so no triggered enumeration is pending. */
	lights->dl_triggered = B_FALSE;

	dl_examine_topo(lights);

	/*
	 * Schedule the next poll using the long poll interval.
	 */
	lights->dl_timer = fmd_timer_install(hdl, NULL, NULL,
	    lights->dl_poll_interval);
}
Esempio n. 7
0
/*
 * Page timer handler: retries every queued page retirement, drops the
 * entries that are finished, and reschedules the timer with a doubled
 * (capped) delay while entries remain on the list.
 */
void
cma_page_retry(fmd_hdl_t *hdl)
{
    cma_page_t **linkp;
    cma_page_t *cur;

    /* The timer that brought us here is no longer pending. */
    cma.cma_page_timerid = 0;

    fmd_hdl_debug(hdl, "page_retry: timer fired\n");

    linkp = &cma.cma_pages;
    while ((cur = *linkp) != NULL) {
        if (!page_retry(hdl, cur)) {
            /* Still outstanding: count the attempt and move on. */
            cur->pg_nretries++;
            linkp = &cur->pg_next;
            continue;
        }

        /*
         * The retry succeeded or we are giving up on this page:
         * unlink the entry and release it.
         */
        *linkp = cur->pg_next;

        if (cur->pg_uuid != NULL)
            fmd_hdl_strfree(hdl, cur->pg_uuid);

        cma_page_free(hdl, cur);
    }

    if (cma.cma_pages != NULL) {
        /*
         * Some retirements have not completed yet.  Double the delay,
         * bounded by the maximum, and schedule another retry.
         */
        cma.cma_page_curdelay = MIN(cma.cma_page_curdelay * 2,
            cma.cma_page_maxdelay);

        fmd_hdl_debug(hdl, "scheduled page retirement retry for %llu secs\n",
            (u_longlong_t)(cma.cma_page_curdelay / NANOSEC));

        cma.cma_page_timerid =
            fmd_timer_install(hdl, NULL, NULL, cma.cma_page_curdelay);
    }
}
Esempio n. 8
0
/*
 * Request an enumeration after the short coalesce interval.  Does nothing
 * when such a request is already pending (dl_triggered is set).
 */
static void
dl_trigger_enum(disk_lights_t *dl)
{
	if (dl->dl_triggered != B_TRUE) {
		dl->dl_triggered = B_TRUE;

		/*
		 * Swap whatever poll timer is installed for the coalesce
		 * timer.
		 */
		if (dl->dl_timer != 0) {
			fmd_timer_remove(dl->dl_fmd, dl->dl_timer);
		}
		dl->dl_timer = fmd_timer_install(dl->dl_fmd, NULL, NULL,
		    dl->dl_coalesce_interval);
	}
}
Esempio n. 9
0
/*
 * Cpu timer handler: re-checks every queued cpu, drops the entries that are
 * finished, and reschedules the timer with a doubled (capped) delay while
 * entries remain on the list.
 */
void
cma_cpu_retry(fmd_hdl_t *hdl)
{
	cma_cpu_t **linkp;
	cma_cpu_t *cur;

	fmd_hdl_debug(hdl, "cma_cpu_retry: timer fired\n");

	/* The timer that brought us here is no longer pending. */
	cma.cma_cpu_timerid = 0;

	linkp = &cma.cma_cpus;
	while ((cur = *linkp) != NULL) {
		if (!cpu_retry(hdl, cur)) {
			/* Still outstanding: count the attempt, move on. */
			cur->cpu_nretries++;
			linkp = &cur->cpu_next;
			continue;
		}

		/*
		 * The retry succeeded or we are giving up on this cpu:
		 * unlink the entry and release it.
		 */
		*linkp = cur->cpu_next;
		cma_cpu_free(hdl, cur);
	}

	if (cma.cma_cpus != NULL) {
		/*
		 * Some cpus still need checking.  Double the delay, bounded
		 * by the maximum, and schedule another retry.
		 */
		cma.cma_cpu_curdelay = MIN(cma.cma_cpu_curdelay * 2,
		    cma.cma_cpu_maxdelay);

		fmd_hdl_debug(hdl, "scheduled cpu offline retry for %llu secs\n",
		    (u_longlong_t)(cma.cma_cpu_curdelay / NANOSEC));

		cma.cma_cpu_timerid =
		    fmd_timer_install(hdl, NULL, NULL, cma.cma_cpu_curdelay);
	}
}
Esempio n. 10
0
/*
 * Module entry point.  Registers the module and installs its timer, but
 * only when the reported architecture starts with "i386"; on any other
 * architecture (or if the architecture cannot be read) the module returns
 * without registering.
 */
void
_fmd_init(fmd_hdl_t *hdl)
{
    char arch[8];

    /*
     * Until CR 6933053 is fixed the module only sends messages to ILOM on
     * i386 platforms.  Unregistering the module may make the etm module
     * dump core because of 6933053, so we bail out before registering.
     */
    if (sysinfo(SI_ARCHITECTURE, arch, sizeof (arch)) == -1)
        return;
    if (strncmp(arch, "i386", 4) != 0)
        return;

    if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0)
        return;

    /*
     * Install the timer.
     */
    (void) fmd_timer_install(hdl, NULL, NULL, 2000000000ULL);
}
Esempio n. 11
0
/*
 * Retire the memory page named by a fault or, when 'repair' is set,
 * unretire it.
 *
 * hdl    - fmd module handle
 * nvl    - fault nvlist; its FM_FAULT_RESOURCE member, if present, is used
 *          for the topo-based retire/unretire and as the preferred source
 *          of the physical address
 * asru   - ASRU FMRI of the page; a private copy is made and expanded
 * uuid   - optional case uuid, copied into the queued retry entry
 * repair - B_TRUE to unretire, B_FALSE to retire
 *
 * Returns CMA_RA_SUCCESS when the operation completed, when a retire was
 * requested for an FMRI that is no longer present, or when an unretire was
 * suppressed by configuration.  Returns CMA_RA_FAILURE in every other case,
 * including a retire suppressed by configuration and a retire that is still
 * in progress and has been queued for periodic re-checking.
 */
/*ARGSUSED*/
int
cma_page_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
                const char *uuid, boolean_t repair)
{
    cma_page_t *page;
    uint64_t pageaddr;
    const char *action = repair ? "unretire" : "retire";
    int rc;
    nvlist_t *rsrc = NULL, *asrucp = NULL, *hcsp;

    /*
     * The resource is optional: the lookup result is ignored and the code
     * below treats a NULL rsrc as "no resource available".
     */
    (void) nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc);

    /*
     * Work on a private copy of the ASRU.  Every return path below frees
     * asrucp, except the final one where it is handed to the retry entry.
     */
    if (nvlist_dup(asru, &asrucp, 0) != 0) {
        fmd_hdl_debug(hdl, "page retire nvlist dup failed\n");
        return (CMA_RA_FAILURE);
    }

    /* It should already be expanded, but we'll do it again anyway */
    if (fmd_nvl_fmri_expand(hdl, asrucp) < 0) {
        fmd_hdl_debug(hdl, "failed to expand page asru\n");
        cma_stats.bad_flts.fmds_value.ui64++;
        nvlist_free(asrucp);
        return (CMA_RA_FAILURE);
    }

    /*
     * For a retire request only: if the FMRI is no longer present there is
     * nothing left to retire, which is reported as success.
     */
    if (!repair && !fmd_nvl_fmri_present(hdl, asrucp)) {
        fmd_hdl_debug(hdl, "page retire overtaken by events\n");
        cma_stats.page_nonent.fmds_value.ui64++;
        nvlist_free(asrucp);
        return (CMA_RA_SUCCESS);
    }

    /*
     * Figure out physaddr from resource or asru.  The hc-specific part of
     * the resource is tried first ("asru-" prefixed physaddr, then the
     * plain physaddr); if the resource is missing or has neither, fall
     * back to FM_FMRI_MEM_PHYSADDR in the ASRU copy.
     */
    if (rsrc == NULL ||
            nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcsp) != 0 ||
            (nvlist_lookup_uint64(hcsp, "asru-" FM_FMRI_HC_SPECIFIC_PHYSADDR,
                                  &pageaddr) != 0 && nvlist_lookup_uint64(hcsp,
                                          FM_FMRI_HC_SPECIFIC_PHYSADDR, &pageaddr) != 0)) {
        if (nvlist_lookup_uint64(asrucp, FM_FMRI_MEM_PHYSADDR,
                                 &pageaddr) != 0) {
            fmd_hdl_debug(hdl, "mem fault missing 'physaddr'\n");
            cma_stats.bad_flts.fmds_value.ui64++;
            nvlist_free(asrucp);
            return (CMA_RA_FAILURE);
        }
    }

    if (repair) {
        /* Unretire disabled by configuration: counted, reported as success */
        if (!cma.cma_page_dounretire) {
            fmd_hdl_debug(hdl, "suppressed unretire of page %llx\n",
                          (u_longlong_t)pageaddr);
            cma_stats.page_supp.fmds_value.ui64++;
            nvlist_free(asrucp);
            return (CMA_RA_SUCCESS);
        }
        /* If unretire via topo fails, we fall back to legacy way */
        if (rsrc == NULL || (rc = fmd_nvl_fmri_unretire(hdl, rsrc)) < 0)
            rc = cma_fmri_page_unretire(hdl, asrucp);
    } else {
        /*
         * Retire disabled by configuration: counted, but unlike the
         * unretire case above this is reported as a failure.
         */
        if (!cma.cma_page_doretire) {
            fmd_hdl_debug(hdl, "suppressed retire of page %llx\n",
                          (u_longlong_t)pageaddr);
            cma_stats.page_supp.fmds_value.ui64++;
            nvlist_free(asrucp);
            return (CMA_RA_FAILURE);
        }
        /* If retire via topo fails, we fall back to legacy way */
        if (rsrc == NULL || (rc = fmd_nvl_fmri_retire(hdl, rsrc)) < 0)
            rc = cma_fmri_page_retire(hdl, asrucp);
    }

    if (rc == FMD_AGENT_RETIRE_DONE) {
        /* "%sd" turns the action name into "retired" / "unretired" */
        fmd_hdl_debug(hdl, "%sd page 0x%llx\n",
                      action, (u_longlong_t)pageaddr);
        if (repair)
            cma_stats.page_repairs.fmds_value.ui64++;
        else
            cma_stats.page_flts.fmds_value.ui64++;
        nvlist_free(asrucp);
        return (CMA_RA_SUCCESS);
    } else if (repair || rc != FMD_AGENT_RETIRE_ASYNC) {
        /*
         * An unretire is never retried, and a retire is retried only when
         * the result is FMD_AGENT_RETIRE_ASYNC.
         * NOTE(review): the message assumes errno was left set by the
         * failing retire/unretire call above - confirm.
         */
        fmd_hdl_debug(hdl, "%s of page 0x%llx failed, will not "
                      "retry: %s\n", action, (u_longlong_t)pageaddr,
                      strerror(errno));

        cma_stats.page_fails.fmds_value.ui64++;
        nvlist_free(asrucp);
        return (CMA_RA_FAILURE);
    }

    /*
     * The page didn't immediately retire.  We'll need to periodically
     * check to see if it has been retired.
     */
    fmd_hdl_debug(hdl, "page didn't retire - sleeping\n");

    /*
     * Build the retry entry.  It takes over asrucp and gets its own copies
     * of the resource and uuid when those are available.  The result of
     * duplicating the resource is ignored here.
     */
    page = fmd_hdl_zalloc(hdl, sizeof (cma_page_t), FMD_SLEEP);
    page->pg_addr = pageaddr;
    if (rsrc != NULL)
        (void) nvlist_dup(rsrc, &page->pg_rsrc, 0);
    page->pg_asru = asrucp;
    if (uuid != NULL)
        page->pg_uuid = fmd_hdl_strdup(hdl, uuid, FMD_SLEEP);

    /* Push the entry on the head of the pending page list */
    page->pg_next = cma.cma_pages;
    cma.cma_pages = page;

    /*
     * Replace any pending page timer with one that fires after the
     * minimum delay.
     */
    if (cma.cma_page_timerid != 0)
        fmd_timer_remove(hdl, cma.cma_page_timerid);

    cma.cma_page_curdelay = cma.cma_page_mindelay;

    cma.cma_page_timerid =
        fmd_timer_install(hdl, NULL, NULL, cma.cma_page_curdelay);

    /* Don't free asrucp here.  This FMRI will be needed for retry. */
    return (CMA_RA_FAILURE);
}