/* * Function Specification * * Name: homer_log_access_error * * Description: Utility function to log an error that occurred while accessing * the HOMER. * * End Function Specification */ void homer_log_access_error(const homer_rc_t i_homer_rc, const int i_ssx_rc, const uint32_t i_usr_data2) { // Catch and log the homer error if (HOMER_SUCCESS != i_homer_rc) { // We could potentially have both an internal error dealing with the // homer and an SSX error, for example we could find an unsupported // version number in the homer and then have an ssx error trying to // unmap the homer address space. This check catches all those cases. if (SSX_OK != i_ssx_rc) { /* @ * @errortype * @moduleid MAIN_MID * @reasoncode SSX_GENERIC_FAILURE * @userdata1 HOMER and SSX return codes * @userdata2 Host interrupt type used * @userdata4 ERC_HOMER_MAIN_SSX_ERROR * @devdesc An SSX error occurred mapping the HOMER host data * into the OCC address space. User word 1 contains * both the internal and SSX return codes returned * by the method used to access the HOMER data. */ errlHndl_t l_err = createErrl(MAIN_MID, //modId SSX_GENERIC_FAILURE, //reasoncode ERC_HOMER_MAIN_SSX_ERROR, //Extended reason code ERRL_SEV_PREDICTIVE, //Severity NULL, //Trace Buf DEFAULT_TRACE_SIZE, //Trace Size (i_homer_rc << 16) | (0xFFFF & (uint32_t)i_ssx_rc), //userdata1 i_usr_data2); //userdata2 commitErrl(&l_err); } else { /* @ * @errortype * @moduleid MAIN_MID * @reasoncode INTERNAL_FAILURE * @userdata1 HOMER return code * @userdata2 Default host interrupt type used. * @userdata4 ERC_HOMER_MAIN_ACCESS_ERROR * @devdesc Error accessing initialization data */ errlHndl_t l_err = createErrl(MAIN_MID, //modId INTERNAL_FAILURE, //reasoncode ERC_HOMER_MAIN_ACCESS_ERROR,//Extended reason code ERRL_SEV_INFORMATIONAL, //Severity NULL, //Trace Buf DEFAULT_TRACE_SIZE, //Trace Size i_homer_rc, //userdata1 i_usr_data2); //userdata2 commitErrl(&l_err); } } }
// Function Specification // // Name: errlTestSetErrlSevToInfo // // Description: errlTestSetErrlSevToInfo // // End Function Specification uint32_t errlTestSetErrlSevToInfo() { uint32_t l_rc = 0; ERRL_DBG("START"); do { errlHndl_t l_handle = NULL; /****************************************************/ // Check setErrlSevToInfo // Create ERRL_SEV_PREDICTIVE log l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC, ERRL_SEV_PREDICTIVE,g_trac_inf, 128, 0x1, 0x2); CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc); // Add callout addCalloutToErrl(l_handle,ERRL_CALLOUT_TYPE_HUID,0x00,ERRL_CALLOUT_PRIORITY_LOW); CHECK_CONDITION( l_handle->iv_numCallouts == 1, l_rc); // Call setErrlSevToInfo. Callouts within log should be cleared and // iv_severity should be set to ERRL_SEV_INFORMATIONAL setErrlSevToInfo(l_handle); CHECK_CONDITION( (l_handle->iv_numCallouts == 0) && (l_handle->iv_severity == ERRL_SEV_INFORMATIONAL), l_rc); deleteErrl( &l_handle ); ppdumpslot(); /****************************************************/ // Check setErrlSevToInfo after errl is committed // Create log l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC, ERRL_SEV_PREDICTIVE,g_trac_inf, 128, 0x1, 0x2); CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc); errlHndl_t l_log = l_handle; // Add callout addCalloutToErrl(l_handle,ERRL_CALLOUT_TYPE_HUID,0x00,ERRL_CALLOUT_PRIORITY_LOW); CHECK_CONDITION( l_handle->iv_numCallouts == 1, l_rc); // Commit log and call setErrlSevToInfo. But setErrlSevToInfo will do nothing commitErrl( &l_handle ); setErrlSevToInfo(l_handle); CHECK_CONDITION( (l_log->iv_numCallouts == ERRL_MAX_CALLOUTS) && (l_log->iv_severity == ERRL_SEV_PREDICTIVE), l_rc); deleteErrl(&l_log); ERRL_DBG("END \n"); }while(0); return l_rc; }
// Function Specification // // Name: errlTestTime // // Description: errlTestTime // // End Function Specification uint32_t errlTestTime() { uint32_t l_rc = 0; do { ERRL_DBG("START"); errlHndl_t l_handle = NULL; uint64_t l_start = 0; uint64_t l_end = 0; /****************************************************/ // Check timeStamp // Create one log l_start = ssx_timebase_get(); l_handle = createErrl( 0x1716, 0x08, OCC_NO_EXTENDED_RC, ERRL_SEV_CALLHOME_DATA, g_trac_inf, 128, 0x1, 0x2); CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc); // check time stamp errlHndl_t l_handle2 = l_handle; commitErrl( &l_handle ); l_end = ssx_timebase_get(); CHECK_CONDITION( (l_handle2->iv_userDetails.iv_timeStamp >= l_start) && (l_handle2->iv_userDetails.iv_timeStamp <= l_end ), l_rc); deleteErrl(&l_handle2); ERRL_DBG("END \n"); }while(0); return l_rc; }
// Initialize the memory task data void memory_init() { if(G_mem_monitoring_allowed) { // Check if memory task is running (default task is for NIMBUS) const task_id_t mem_task = TASK_ID_DIMM_SM; if(!rtl_task_is_runnable(mem_task)) { if (MEM_TYPE_NIMBUS == G_sysConfigData.mem_type) { // Init DIMM state manager IPC request memory_nimbus_init(); } else { // TODO CUMULUS NOT SUPPORTED YET IN PHASE1 #if 0 TRAC_INFO("memory_init: calling centaur_init()"); centaur_init(); //no rc, handles errors internally #endif TRAC_ERR("memory_init: invalid memory type 0x%02X", G_sysConfigData.mem_type); /* * @errortype * @moduleid DIMM_MID_MEMORY_INIT * @reasoncode MEMORY_INIT_FAILED * @userdata1 memory type * @userdata2 0 * @devdesc Invalid memory type detected */ errlHndl_t err = createErrl(DIMM_MID_MEMORY_INIT, MEMORY_INIT_FAILED, OCC_NO_EXTENDED_RC, ERRL_SEV_PREDICTIVE, NULL, DEFAULT_TRACE_SIZE, G_sysConfigData.mem_type, 0); REQUEST_RESET(err); } // check if the init resulted in a reset if(isSafeStateRequested()) { TRAC_ERR("memory_init: OCC is being reset, memory init failed (type=0x%02X)", G_sysConfigData.mem_type); } else { // Initialization was successful. Set task flags to allow memory // tasks to run and also prevent from doing initialization again. G_task_table[mem_task].flags = MEMORY_DATA_RTL_FLAGS; //G_task_table[TASK_ID_CENTAUR_CONTROL].flags = MEMORY_CONTROL_RTL_FLAGS; } } } } // end memory_init()
uint32_t errlTestCreate2InfoCallhomeLog() { ERRL_DBG("START" ); uint32_t l_rc = 0; do { /****************************************************/ // Check creating Info logs twice // Create first Info log errlHndl_t l_handle = NULL; errlHndl_t l_handle2= NULL; l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC, ERRL_SEV_INFORMATIONAL,g_trac_inf, 32, 0x1, 0x2); CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc); // Create second Info log and it should fail l_handle2 = createErrl( 0x2727, 0x19, OCC_NO_EXTENDED_RC, ERRL_SEV_INFORMATIONAL, g_trac_inf, 32, 0x2, 0x3); CHECK_CONDITION( l_handle2 == INVALID_ERR_HNDL, l_rc); deleteErrl(&l_handle); /****************************************************/ // Check creating Callhome logs twice // Create first Callhome log l_handle = NULL; l_handle2= NULL; l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC, ERRL_SEV_CALLHOME_DATA,g_trac_inf, 32, 0x1, 0x2); CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc); // Create second Callhome log and it should fail l_handle2 = createErrl( 0x2727, 0x19, OCC_NO_EXTENDED_RC, ERRL_SEV_CALLHOME_DATA, g_trac_inf, 32, 0x2, 0x3); CHECK_CONDITION( l_handle2 == INVALID_ERR_HNDL, l_rc); deleteErrl(&l_handle); ERRL_DBG("END \n"); }while(0); return l_rc; }
// Called after a failure to read a DIMM temperature. The error will // be counted and if threshold is reached, and error will be created with // the DIMM as a callout and then set flag to trigger I2C reset void mark_dimm_failed() { const uint8_t port = G_dimm_sm_args.i2cPort; const uint8_t dimm = G_dimm_sm_args.dimm; INTR_TRAC_ERR("mark_dimm_failed: DIMM%04X failed in state/rc/count=0x%06X " "(ffdc 0x%08X%08X, completion_state 0x%02X)", DIMM_AND_PORT, (G_dimm_sm_args.state << 16) | (G_dimm_sm_args.error.rc << 8) | G_dimm[port][dimm].errorCount, WORD_HIGH(G_dimm_sm_args.error.ffdc), WORD_LOW(G_dimm_sm_args.error.ffdc), G_dimm_sm_request.request.completion_state); if (++G_dimm[port][dimm].errorCount > MAX_CONSECUTIVE_DIMM_RESETS) { // Disable collection on this DIMM, collect FFDC and log error G_dimm[port][dimm].disabled = true; INTR_TRAC_ERR("mark_dimm_failed: disabling DIMM%04X due to %d consecutive errors (state=%d)", DIMM_AND_PORT, G_dimm[port][dimm].errorCount, G_dimm_sm_args.state); errlHndl_t l_err = NULL; /* * @errortype * @moduleid DIMM_MID_MARK_DIMM_FAILED * @reasoncode DIMM_GPE_FAILURE * @userdata1 GPE returned rc code * @userdata4 ERC_DIMM_COMPLETE_FAILURE * @devdesc Failure writing dimm i2c mode register */ l_err = createErrl(DIMM_MID_MARK_DIMM_FAILED, DIMM_GPE_FAILURE, ERC_DIMM_COMPLETE_FAILURE, ERRL_SEV_INFORMATIONAL, NULL, DEFAULT_TRACE_SIZE, G_dimm_sm_args.error.rc, 0); addUsrDtlsToErrl(l_err, (uint8_t*)&G_dimm_sm_request.ffdc, sizeof(G_dimm_sm_request.ffdc), ERRL_STRUCT_VERSION_1, ERRL_USR_DTL_BINARY_DATA); addCalloutToErrl(l_err, ERRL_CALLOUT_TYPE_HUID, G_sysConfigData.dimm_huids[port][dimm], ERRL_CALLOUT_PRIORITY_HIGH); commitErrl(&l_err); } // Reset DIMM I2C engine G_dimm_i2c_reset_required = true; G_dimm_i2c_reset_cause = port<<24 | dimm<<16 | (G_dimm_sm_args.error.rc & 0xFFFF); G_dimm_state = DIMM_STATE_RESET_MASTER; } // end mark_dimm_failed()
// Function Specification // // Name: errlTestWordAlign // // Description: errlTestWordAlign // // End Function Specification uint32_t errlTestWordAlign() { uint32_t l_rc = 0; uint16_t l_entrySizeBefore = 0; uint16_t l_entrySizeAfter = 0; ERRL_DBG("START"); do { /****************************************************/ // Test word align for addUsrDtlsToErrl // Create log errlHndl_t l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC, ERRL_SEV_PREDICTIVE, NULL, 0, 0x1, 0x2); CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc); // l_handle will set to NULL after calling the commitErrl, so we need to store it errlHndl_t l_handleX = l_handle; ppdumpslot(); // add 13 bytes of "user details" l_entrySizeBefore = l_handle->iv_userDetails.iv_entrySize; memset( G_data, 0xAA, sizeof( G_data ) ); addUsrDtlsToErrl( l_handle, G_data, 13, ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA ); l_entrySizeAfter = l_handle->iv_userDetails.iv_entrySize; ERRL_DBG("Slots after create + 13 bytes" ); ppdumpslot(); // (header + WORDALIGN(13)) is the size that add to entry CHECK_CONDITION( l_entrySizeAfter == (l_entrySizeBefore+sizeof(ErrlUserDetailsEntry_t)+16), l_rc); /****************************************************/ // Test word align for addTraceToErrl // add 21 bytes of trace l_entrySizeBefore = l_handle->iv_userDetails.iv_entrySize; addTraceToErrl(g_trac_inf, 21, l_handle); // @at012c l_entrySizeAfter = l_handle->iv_userDetails.iv_entrySize; ERRL_DBG("Slots after create + 21 bytes" ); ppdumpslot(); // (header + WORDALIGN(21)) is the size that add to entry CHECK_CONDITION( l_entrySizeAfter <= (l_entrySizeBefore+sizeof(ErrlUserDetailsEntry_t)+24), l_rc); commitErrl( &l_handle ); deleteErrl(&l_handleX); ERRL_DBG("Slots should now be empty"); ppdumpslot(); ERRL_DBG("END \n"); }while(0); return l_rc; }
// Function Specification // // Name: errlTestCreateCommitDeleteLog // // Description: errlTestCreateCommitDeleteLog // // End Function Specification uint32_t errlTestCreateCommitDeleteLog() { ERRL_DBG("START"); uint32_t l_rc = 0; do { /****************************************************/ // Test create log errlHndl_t l_handle = NULL; l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC, ERRL_SEV_CALLHOME_DATA, g_trac_inf, 512, 0x1, 0x2); CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc); ERRL_DBG("Slots after Creating call home log" ); ppdumpslot(); /****************************************************/ // Test commit log errlHndl_t l_handle2 = l_handle; commitErrl( &l_handle ); CHECK_CONDITION( (l_handle == NULL) && (l_handle2->iv_userDetails.iv_committed == 1), l_rc); ERRL_DBG("Slots after Commiting call home log" ); dumpLog( l_handle2, l_handle2->iv_userDetails.iv_entrySize ); ppdumpslot(); /****************************************************/ // Test delete log deleteErrl(&l_handle2); CHECK_CONDITION( l_handle2 == NULL, l_rc); ERRL_DBG("Slots after delete Log" ); ppdumpslot(); ERRL_DBG("END \n"); }while(0); return l_rc; }
// Check and update lock ownership for the specified i2c engine.
// Returns true if OCC owns the lock, or false if host owns lock
//
// If host has requested the i2c lock, it will be released and an external interrupt
// will be generated/queued and function will return false.
// If the host has not released the lock, function will return false.
// If the host cleared its lock bit, OCC will take back ownership and return true.
//
// Only PIB engines C, D and E are valid; any other engine traces an error
// and returns true (the initial value of occ_owns_lock).
bool check_and_update_i2c_lock(const uint8_t i_engine)
{
    bool occ_owns_lock = true;

    if ((PIB_I2C_ENGINE_E == i_engine) ||
        (PIB_I2C_ENGINE_D == i_engine) ||
        (PIB_I2C_ENGINE_C == i_engine))
    {
        bool needRetry = false;
        do
        {
            // Snapshot of the OCC flags register for this pass
            ocb_occflg_t original_occflags;
            original_occflags.value = in32(OCB_OCCFLG);

            LOCK_DBG("check_and_update_i2c_lock: I2C engine %d - host=%d, occ=%d (dimmTick=%d)",
                     i_engine, original_occflags.fields.i2c_engine3_lock_host,
                     original_occflags.fields.i2c_engine3_lock_occ, DIMM_TICK);

            if (occ_owns_i2c_lock(original_occflags, i_engine))
            {
                if (host_wants_i2c_lock(original_occflags, i_engine))
                {
                    // Host requested lock, clear the OCC lock and notify host
                    update_i2c_lock(LOCK_RELEASE, i_engine);
                    occ_owns_lock = false;
                }
                // else OCC already owns the lock
            }
            else
            {
                // OCC does not own the lock
                occ_owns_lock = false;
                if (false == host_wants_i2c_lock(original_occflags, i_engine))
                {
                    // Host is not requesting the lock, acquire lock for OCC
                    update_i2c_lock(LOCK_ACQUIRE, i_engine);
                    occ_owns_lock = true;
                }
                // else Host still holds the lock
            }

            // NOTE(review): this verify gate reads the engine1 lock bits even
            // though the function accepts engines C/D/E and the debug trace
            // above reads the engine3 bits — confirm the intended fields.
            if ((occ_owns_lock) &&
                (original_occflags.fields.i2c_engine1_lock_host == 0) &&
                (original_occflags.fields.i2c_engine1_lock_occ == 0))
            {
                // If neither lock bit is set, we must read back the register to make
                // sure the host did not set at same time (lock conflict)
                ocb_occflg_t verify_occflags;
                verify_occflags.value = in32(OCB_OCCFLG);
                if (host_wants_i2c_lock(verify_occflags, i_engine))
                {
                    // Host wrote their lock bit at same time, clear OCC lock and notify host
                    update_i2c_lock(LOCK_RELEASE, i_engine);
                    occ_owns_lock = false;
                }
                else
                {
                    if (false == occ_owns_i2c_lock(verify_occflags, i_engine))
                    {
                        // ERROR - OCC OWNERSHIP BIT DID NOT GET SET
                        INTR_TRAC_ERR("check_and_update_i2c_lock: I2C lock bit did not get set (OCCFLAGS reg: 0x%08X)",
                                      verify_occflags.value);
                        if (needRetry)
                        {
                            // After one retry, log error and goto safe
                            /*
                             * @errortype
                             * @moduleid    I2C_LOCK_UPDATE
                             * @reasoncode  OCI_WRITE_FAILURE
                             * @userdata1   I2C engine number
                             * @userdata2   OCC Flags register
                             * @devdesc     OCI write failure setting I2C ownership bit
                             */
                            errlHndl_t err = createErrl(I2C_LOCK_UPDATE,
                                                        OCI_WRITE_FAILURE,
                                                        OCC_NO_EXTENDED_RC,
                                                        ERRL_SEV_PREDICTIVE,
                                                        NULL,
                                                        DEFAULT_TRACE_SIZE,
                                                        i_engine,
                                                        verify_occflags.value);
                            //Callout firmware
                            addCalloutToErrl(err,
                                             ERRL_CALLOUT_TYPE_COMPONENT_ID,
                                             ERRL_COMPONENT_ID_FIRMWARE,
                                             ERRL_CALLOUT_PRIORITY_MED);
                            //Callout processor
                            addCalloutToErrl(err,
                                             ERRL_CALLOUT_TYPE_HUID,
                                             G_sysConfigData.proc_huid,
                                             ERRL_CALLOUT_PRIORITY_LOW);
                            REQUEST_RESET(err);
                            occ_owns_lock = false;
                            break;
                        }
                        // First failure: retry the whole sequence once.
                        // NOTE(review): needRetry is never cleared on a later
                        // successful pass, so if the retry succeeds this
                        // do/while condition stays true — verify the loop
                        // cannot spin here.
                        needRetry = true;
                    }
                    // else verify succeeded (OCC owns lock)
                }
            }
        } while (needRetry);
    }
    else
    {
        // Invalid engine
        INTR_TRAC_ERR("check_and_update_i2c_lock: Invalid engine specified: 0x%02X", i_engine);
    }

    return occ_owns_lock;
} // end check_and_update_i2c_lock()
// Function Specification
//
// Name: apss_initialize
//
// Description: Completes all APSS initialization including GPIOs, altitude and
//              mode
//
// Sequence: (1) blocking GPE program to configure APSS GPIOs, (2) blocking
// GPE program to set composite mode, (3) creation (not scheduling) of the
// three power-measurement GPE requests used by apss.c.  Returns NULL on
// success, or an UNRECOVERABLE error log (caller goes to safe state).
//
// End Function Specification
errlHndl_t apss_initialize()
{
    errlHndl_t l_err = NULL;
    PoreFlex request;

    // Setup the GPIO init structure to pass to the GPE program
    G_gpe_apss_initialize_gpio_args.error.error = 0;
    G_gpe_apss_initialize_gpio_args.error.ffdc = 0;
    G_gpe_apss_initialize_gpio_args.config0.direction = G_gpio_config[0].direction;
    G_gpe_apss_initialize_gpio_args.config0.drive = G_gpio_config[0].drive;
    G_gpe_apss_initialize_gpio_args.config0.interrupt = G_gpio_config[0].interrupt;
    G_gpe_apss_initialize_gpio_args.config1.direction = G_gpio_config[1].direction;
    G_gpe_apss_initialize_gpio_args.config1.drive = G_gpio_config[1].drive;
    G_gpe_apss_initialize_gpio_args.config1.interrupt = G_gpio_config[1].interrupt;

    // Create/schedule GPE_apss_initialize_gpio and wait for it to complete (BLOCKING)
    // NOTE(review): return codes of pore_flex_create/pore_flex_schedule are not
    // checked here; completion_state below is the only failure indicator.
    TRAC_INFO("Creating request for GPE_apss_initialize_gpio");
    pore_flex_create(&request,
                     &G_pore_gpe0_queue,                        // queue
                     (void*)GPE_apss_initialize_gpio,           // GPE entry_point
                     (uint32_t)&G_gpe_apss_initialize_gpio_args,// GPE argument_ptr
                     SSX_SECONDS(5),                            // timeout
                     NULL,                                      // callback
                     NULL,                                      // callback arg
                     ASYNC_REQUEST_BLOCKING);                   // options

    // Schedule the request to be executed
    pore_flex_schedule(&request);

    // Check for a timeout; the error log is created in the else branch below
    // NOTE: As of 2013/07/16, simics will still fail here on a OCC reset
    if(ASYNC_REQUEST_STATE_TIMED_OUT == request.request.completion_state)
    {
        // For whatever reason, we hit a timeout.  It could be either
        // that the HW did not work, or the request didn't ever make
        // it to the front of the queue.
        TRAC_ERR("Timeout communicating with PORE-GPE for APSS Init");
    }
    TRAC_INFO("GPE_apss_initialize_gpio completed w/rc=0x%08x\n",
              request.request.completion_state);

    // Only continue if completed without errors...
    if (ASYNC_REQUEST_STATE_COMPLETE == request.request.completion_state)
    {
        // Setup the composite mode structure to pass to the GPE program
        G_gpe_apss_set_composite_mode_args.error.error = 0;
        G_gpe_apss_set_composite_mode_args.error.ffdc = 0;
        G_gpe_apss_set_composite_mode_args.config.numAdcChannelsToRead =
            G_apss_composite_config.numAdcChannelsToRead;
        G_gpe_apss_set_composite_mode_args.config.numGpioPortsToRead =
            G_apss_composite_config.numGpioPortsToRead;

        // Create/schedule GPE_apss_set_composite_mode and wait for it to complete (BLOCKING)
        TRAC_INFO("Creating request for GPE_apss_set_composite_mode");
        pore_flex_create(&request,
                         &G_pore_gpe0_queue,                           // queue
                         (void*)GPE_apss_set_composite_mode,           // GPE entry_point
                         (uint32_t)&G_gpe_apss_set_composite_mode_args,// GPE argument_ptr
                         SSX_SECONDS(5),                               // timeout
                         NULL,                                         // callback
                         NULL,                                         // callback arg
                         ASYNC_REQUEST_BLOCKING);                      // options
        pore_flex_schedule(&request);

        // Check for a timeout; the error log is created just below
        if(ASYNC_REQUEST_STATE_TIMED_OUT == request.request.completion_state)
        {
            // Either the HW did not work, or the request never made it to the
            // front of the queue.
            TRAC_ERR("Timeout communicating with PORE-GPE for APSS Init");
        }
        TRAC_INFO("GPE_apss_set_composite_mode completed w/rc=0x%08x",
                  request.request.completion_state);

        if (ASYNC_REQUEST_STATE_COMPLETE != request.request.completion_state)
        {
            /*
             * @errortype
             * @moduleid    PSS_MID_APSS_INIT
             * @reasoncode  INTERNAL_FAILURE
             * @userdata1   GPE returned rc code
             * @userdata2   GPE returned abort code
             * @userdata4   ERC_PSS_COMPOSITE_MODE_FAIL
             * @devdesc     Failure from GPE for setting composite mode on
             *              APSS
             */
            l_err = createErrl(PSS_MID_APSS_INIT,                // i_modId,
                               INTERNAL_FAILURE,                 // i_reasonCode,
                               ERC_PSS_COMPOSITE_MODE_FAIL,      // extended reason code
                               ERRL_SEV_UNRECOVERABLE,           // i_severity
                               NULL,                             // i_trace,
                               0x0000,                           // i_traceSz,
                               request.request.completion_state, // i_userData1,
                               request.request.abort_state);     // i_userData2
            addUsrDtlsToErrl(l_err,
                             (uint8_t*)&G_gpe_apss_set_composite_mode_args,
                             sizeof(G_gpe_apss_set_composite_mode_args),
                             ERRL_STRUCT_VERSION_1,
                             ERRL_USR_DTL_TRACE_DATA);
            // Returning an error log will cause us to go to safe
            // state so we can report error to FSP.  Note: the measurement
            // requests below are still created even on this failure path.
        }

        TRAC_INFO("apss_initialize: Creating request G_meas_start_request.");
        //Create the request for measure start. Scheduling will happen in apss.c
        pore_flex_create(&G_meas_start_request,
                         &G_pore_gpe0_queue,                        // queue
                         (void*)GPE_apss_start_pwr_meas_read,       // entry_point
                         (uint32_t)&G_gpe_start_pwr_meas_read_args, // entry_point arg
                         SSX_WAIT_FOREVER,                          // no timeout
                         NULL,                                      // callback
                         NULL,                                      // callback arg
                         ASYNC_CALLBACK_IMMEDIATE);                 // options

        TRAC_INFO("apss_initialize: Creating request G_meas_cont_request.");
        //Create the request for measure continue. Scheduling will happen in apss.c
        pore_flex_create(&G_meas_cont_request,
                         &G_pore_gpe0_queue,                           // queue
                         (void*)GPE_apss_continue_pwr_meas_read,       // entry_point
                         (uint32_t)&G_gpe_continue_pwr_meas_read_args, // entry_point arg
                         SSX_WAIT_FOREVER,                             // no timeout
                         NULL,                                         // callback
                         NULL,                                         // callback arg
                         ASYNC_CALLBACK_IMMEDIATE);                    // options

        TRAC_INFO("apss_initialize: Creating request G_meas_complete_request.");
        //Create the request for measure complete. Scheduling will happen in apss.c
        pore_flex_create(&G_meas_complete_request,
                         &G_pore_gpe0_queue,                           // queue
                         (void*)GPE_apss_complete_pwr_meas_read,       // entry_point
                         (uint32_t)&G_gpe_complete_pwr_meas_read_args, // entry_point arg
                         SSX_WAIT_FOREVER,                             // no timeout
                         (AsyncRequestCallback)reformat_meas_data,     // callback,
                         (void*)NULL,                                  // callback arg
                         ASYNC_CALLBACK_IMMEDIATE);                    // options
    }
    else
    {
        /*
         * @errortype
         * @moduleid    PSS_MID_APSS_INIT
         * @reasoncode  INTERNAL_FAILURE
         * @userdata1   GPE returned rc code
         * @userdata2   GPE returned abort code
         * @userdata4   ERC_PSS_GPIO_INIT_FAIL
         * @devdesc     Failure from GPE for gpio initialization on APSS
         */
        l_err = createErrl(PSS_MID_APSS_INIT,                // i_modId,
                           INTERNAL_FAILURE,                 // i_reasonCode,
                           ERC_PSS_GPIO_INIT_FAIL,           // extended reason code
                           ERRL_SEV_UNRECOVERABLE,           // i_severity
                           NULL,                             // tracDesc_t i_trace,
                           0x0000,                           // i_traceSz,
                           request.request.completion_state, // i_userData1,
                           request.request.abort_state);     // i_userData2
        addUsrDtlsToErrl(l_err,
                         (uint8_t*)&G_gpe_apss_initialize_gpio_args,
                         sizeof(G_gpe_apss_initialize_gpio_args),
                         ERRL_STRUCT_VERSION_1,
                         ERRL_USR_DTL_TRACE_DATA);
        // Returning an error log will cause us to go to safe
        // state so we can report error to FSP
    }

    return l_err;
}
// Verifies that each core is at the correct frequency after they have had
// time to stabilize
//
// Runs only when the periodic counter G_time_until_freq_check expires, the
// OCC is not in a dynamic power-save mode, and this is not a KVM system.
// Each present core's actual local pstate (from bulk core data PMSR) is
// compared against the pstate derived from fmax; the first mismatching core
// creates an unrecoverable log with the PMSR FFDC attached, and a reset is
// requested.
void amec_verify_pstate()
{
    uint8_t l_core = 0;
    int8_t l_pstate_from_fmax = 0;
    gpe_bulk_core_data_t * l_core_data_ptr;
    pmc_pmsr_ffcdc_data_t l_pmc_pmsr_ffdc;
    errlHndl_t l_err = NULL;

    if ( (G_time_until_freq_check == 0) &&
         ( CURRENT_MODE() != OCC_MODE_DYN_POWER_SAVE ) &&
         ( CURRENT_MODE() != OCC_MODE_DYN_POWER_SAVE_FP ) &&
         (!G_sysConfigData.system_type.kvm))
    {
        // Reset the counter
        G_time_until_freq_check = FREQ_CHG_CHECK_TIME;

        // Convert fmax to the corresponding pstate
        l_pstate_from_fmax = proc_freq2pstate(g_amec->sys.fmax);

        for( l_core = 0; l_core < MAX_NUM_CORES; l_core++ )
        {
            // If the core isn't present, skip it (zero its FFDC slot so the
            // dump stays well-defined)
            if(!CORE_PRESENT(l_core))
            {
                l_pmc_pmsr_ffdc.pmsr_ffdc_data.data[l_core].value = 0;
                continue;
            }

            // Get pointer to core data
            l_core_data_ptr = proc_get_bulk_core_data_ptr(l_core);

            // Get the core's pmsr data
            l_pmc_pmsr_ffdc.pmsr_ffdc_data.data[l_core] = l_core_data_ptr->pcb_slave.pmsr;

            // Verify that the core is running at the correct frequency.
            // Only the first mismatch logs (guarded by l_err == NULL); the
            // remaining cores are still sampled into the FFDC buffer.
            if( (l_pstate_from_fmax != l_pmc_pmsr_ffdc.pmsr_ffdc_data.data[l_core].fields.local_pstate_actual) &&
                (l_pstate_from_fmax > l_pmc_pmsr_ffdc.pmsr_ffdc_data.data[l_core].fields.pv_min) &&
                (l_err == NULL) )
            {
                TRAC_ERR("Frequency mismatch in core %d: actual_ps[%d] req_ps[%d] fmax[%d] mode[%d].",
                         l_core,
                         l_pmc_pmsr_ffdc.pmsr_ffdc_data.data[l_core].fields.local_pstate_actual,
                         l_pstate_from_fmax, g_amec->sys.fmax, CURRENT_MODE());

                fill_pmc_ffdc_buffer(&l_pmc_pmsr_ffdc.pmc_ffcdc_data);

                /* @
                 * @moduleid    AMEC_VERIFY_FREQ_MID
                 * @reasonCode  TARGET_FREQ_FAILURE
                 * @severity    ERRL_SEV_UNRECOVERABLE
                 * @userdata1   0
                 * @userdata2   0
                 * @userdata4   OCC_NO_EXTENDED_RC
                 * @devdesc     A core is not running at the expected frequency
                 */
                l_err = createErrl( AMEC_VERIFY_FREQ_MID,     // i_modId,
                                    TARGET_FREQ_FAILURE,      // i_reasonCode,
                                    OCC_NO_EXTENDED_RC,
                                    ERRL_SEV_UNRECOVERABLE,
                                    NULL,                     // i_trace,
                                    DEFAULT_TRACE_SIZE,       // i_traceSz,
                                    0,                        // i_userData1,
                                    0);                       // i_userData2

                //Add firmware callout
                addCalloutToErrl(l_err,
                                 ERRL_CALLOUT_TYPE_COMPONENT_ID,
                                 ERRL_COMPONENT_ID_FIRMWARE,
                                 ERRL_CALLOUT_PRIORITY_HIGH);

                //Add processor callout
                addCalloutToErrl(l_err,
                                 ERRL_CALLOUT_TYPE_HUID,
                                 G_sysConfigData.proc_huid,
                                 ERRL_CALLOUT_PRIORITY_MED);
            }
        }

        if( l_err != NULL)
        {
            //Add our register dump to the error log
            addUsrDtlsToErrl(l_err,
                             (uint8_t*) &l_pmc_pmsr_ffdc,
                             sizeof(l_pmc_pmsr_ffdc),
                             ERRL_USR_DTL_STRUCT_VERSION_1,
                             ERRL_USR_DTL_BINARY_DATA);

            REQUEST_RESET(l_err);
        }
    }
}
// Function Specification
//
// Name: amec_slv_check_perf
//
// Description: Slave OCC's Detect and log degraded performance errors
//              This function will run every tick.
//
// Each degraded-performance cause (failsafe, oversubscription, power cap)
// is logged only ONCE per IPL via a static latch; the pcap log is first
// PREDICTIVE, then downgraded to INFORMATIONAL for later occurrences.
//
// Thread: RealTime Loop
//
// Task Flags:
//
// End Function Specification
void amec_slv_check_perf(void)
{
    /*------------------------------------------------------------------------*/
    /*  Local Variables                                                       */
    /*------------------------------------------------------------------------*/
    // Once-per-IPL latches for each throttle cause
    static BOOLEAN l_prev_failsafe_state = FALSE;
    static BOOLEAN l_prev_ovs_state = FALSE;
    static BOOLEAN l_prev_pcap_state = FALSE;
    // Severity of the next pcap log; downgraded after the first one
    static ERRL_SEVERITY l_pcap_sev =  ERRL_SEV_PREDICTIVE;
    static BOOLEAN l_throttle_traced = FALSE;
    static uint64_t l_time = 0;

    /*------------------------------------------------------------------------*/
    /*  Code                                                                  */
    /*------------------------------------------------------------------------*/

    // Verify that cores are at proper frequency
    amec_verify_pstate();

    do
    {
        // was frequency limited by power ?
        if ( G_non_dps_power_limited != TRUE )
        {
            if(l_throttle_traced)
            {
                TRAC_INFO("Frequency not limited by power algorithms anymore");
                l_throttle_traced = FALSE;
            }
            // we are done break and return
            break;
        }

        // frequency limited due to failsafe condition ?
        if ( AMEC_INTF_GET_FAILSAFE() == TRUE )
        {
            if ( l_prev_failsafe_state == TRUE)
            {
                // we are done break and return
                break;
            }
            else
            {
                // log this error ONLY ONCE per IPL
                l_prev_failsafe_state = TRUE;

                TRAC_ERR("Frequency limited due to failsafe condition(mode:%d, state:%d)",
                         CURRENT_MODE(), CURRENT_STATE());
                l_throttle_traced = TRUE;
                l_time = ssx_timebase_get();

                // log error that calls out OVS procedure
                // set error severity to ERRL_SEV_PREDICTIVE
                /* @
                 * @errortype
                 * @moduleid    AMEC_SLAVE_CHECK_PERFORMANCE
                 * @reasoncode  INTERNAL_FAILURE
                 * @userdata1   Previous FailSafe State
                 * @userdata4   ERC_AMEC_SLAVE_FAILSAFE_STATE
                 * @devdesc     Frequency limited due to failsafe condition
                 */
                errlHndl_t l_errl = createErrl(AMEC_SLAVE_CHECK_PERFORMANCE,  //modId
                                               INTERNAL_FAILURE,              //reasoncode
                                               ERC_AMEC_SLAVE_FAILSAFE_STATE, //Extended reason code
                                               ERRL_SEV_PREDICTIVE,           //Severity
                                               NULL,                          //Trace Buf
                                               DEFAULT_TRACE_SIZE,            //Trace Size
                                               l_prev_failsafe_state,         //userdata1
                                               0);                            //userdata2

                addCalloutToErrl(   l_errl,
                                    ERRL_CALLOUT_TYPE_COMPONENT_ID,
                                    ERRL_COMPONENT_ID_OVERSUBSCRIPTION,
                                    ERRL_CALLOUT_PRIORITY_HIGH
                                );

                // and sets the consolidate action flag
                setErrlActions( l_errl, ERRL_ACTIONS_CONSOLIDATE_ERRORS );

                // Commit Error
                commitErrl(&l_errl);

                // we are done lets break
                break;
            }
        }

        // frequency limited due to oversubscription condition ?
        if ( AMEC_INTF_GET_OVERSUBSCRIPTION() == TRUE )
        {
            if ( l_prev_ovs_state == TRUE)
            {
                // we are done break and return
                break;
            }
            else
            {
                // log this error ONLY ONCE per IPL
                l_prev_ovs_state = TRUE;

                TRAC_ERR("Frequency limited due to oversubscription condition(mode:%d, state:%d)",
                         CURRENT_MODE(), CURRENT_STATE());
                l_throttle_traced = TRUE;
                l_time = ssx_timebase_get();

                // log error that calls out OVS procedure
                // set error severity to ERRL_SEV_PREDICTIVE
                // Updated the RC to match the actual RC passed to createErrl()
                /* @
                 * @errortype
                 * @moduleid    AMEC_SLAVE_CHECK_PERFORMANCE
                 * @reasoncode  OVERSUB_LIMIT_ALERT
                 * @userdata1   Previous OVS State
                 * @userdata4   ERC_AMEC_SLAVE_OVS_STATE
                 * @devdesc     Frequency limited due to oversubscription condition
                 */
                errlHndl_t l_errl = createErrl(AMEC_SLAVE_CHECK_PERFORMANCE,  //modId
                                               OVERSUB_LIMIT_ALERT,           //reasoncode
                                               ERC_AMEC_SLAVE_OVS_STATE,      //Extended reason code
                                               ERRL_SEV_PREDICTIVE,           //Severity
                                               NULL,                          //Trace Buf
                                               DEFAULT_TRACE_SIZE,            //Trace Size
                                               l_prev_ovs_state,              //userdata1
                                               0);                            //userdata2

                // Callout to Oversubscription
                addCalloutToErrl(   l_errl,
                                    ERRL_CALLOUT_TYPE_COMPONENT_ID,
                                    ERRL_COMPONENT_ID_OVERSUBSCRIPTION,
                                    ERRL_CALLOUT_PRIORITY_HIGH
                                );

                // Callout to APSS
                addCalloutToErrl(   l_errl,
                                    ERRL_CALLOUT_TYPE_HUID,
                                    G_sysConfigData.apss_huid,
                                    ERRL_CALLOUT_PRIORITY_MED
                                );

                // Callout to Firmware
                addCalloutToErrl(   l_errl,
                                    ERRL_CALLOUT_TYPE_COMPONENT_ID,
                                    ERRL_COMPONENT_ID_FIRMWARE,
                                    ERRL_CALLOUT_PRIORITY_LOW
                                );

                // and sets the consolidate action flag
                setErrlActions( l_errl, ERRL_ACTIONS_CONSOLIDATE_ERRORS );

                // Commit Error
                commitErrl(&l_errl);

                // we are done lets break
                break;
            }
        }

        uint16_t l_snrBulkPwr = AMECSENSOR_PTR(PWR250US)->sample;

        // frequency limited due to system power cap condition ?
        if (( l_snrBulkPwr > (G_sysConfigData.pcap.system_pcap - PDROP_THRESH) )
            && ( G_sysConfigData.pcap.current_pcap == 0 ))
        {
            if ( l_prev_pcap_state == TRUE)
            {
                // we are done break and return
                break;
            }
            else
            {
                //log this error ONLY ONCE per IPL
                l_prev_pcap_state = TRUE;

                TRAC_ERR("Frequency limited due to power cap condition(mode:%d, state:%d)",
                         CURRENT_MODE(), CURRENT_STATE());
                TRAC_ERR("SnrBulkPwr %d > Sys Pcap %d ",l_snrBulkPwr,
                         G_sysConfigData.pcap.system_pcap );
                TRAC_ERR("SnrFanPwr %d, SnrIOPwr %d, SnrStoragePwr %d, SnrGpuPrw %d ",
                         AMECSENSOR_PTR(PWR250USFAN)->sample,
                         AMECSENSOR_PTR(PWR250USIO)->sample,
                         AMECSENSOR_PTR(PWR250USSTORE)->sample,
                         AMECSENSOR_PTR(PWR250USGPU)->sample );
                TRAC_ERR("SnrProcPwr 0 %d, SnrProcPwr 1 %d, SnrProcPwr 2 %d, SnrProcPwr 3 %d",
                         g_amec->proc_snr_pwr[0],
                         g_amec->proc_snr_pwr[1],
                         g_amec->proc_snr_pwr[2],
                         g_amec->proc_snr_pwr[3] );
                TRAC_ERR("SnrMemPwr 0 %d, SnrMemPwr 1 %d, SnrMemPwr 2 %d, SnrMemPwr 3 %d",
                         g_amec->mem_snr_pwr[0],
                         g_amec->mem_snr_pwr[1],
                         g_amec->mem_snr_pwr[2],
                         g_amec->mem_snr_pwr[3] );
                l_throttle_traced = TRUE;
                l_time = ssx_timebase_get();

                // log error that calls out firmware and APSS procedure
                // set error severity to l_pcap_sev
                /* @
                 * @errortype
                 * @moduleid    AMEC_SLAVE_CHECK_PERFORMANCE
                 * @reasoncode  PCAP_THROTTLE_POWER_LIMIT
                 * @userdata1   Current Sensor Bulk Power
                 * @userdata2   System PCAP
                 * @userdata4   ERC_AMEC_SLAVE_POWERCAP
                 * @devdesc     Frequency limited due to PowerCap condition
                 */
                errlHndl_t l_errl = createErrl(AMEC_SLAVE_CHECK_PERFORMANCE,   //modId
                                               PCAP_THROTTLE_POWER_LIMIT,      //reasoncode
                                               ERC_AMEC_SLAVE_POWERCAP,        //Extended reason code
                                               l_pcap_sev,                     //Severity
                                               NULL,                           //Trace Buf
                                               DEFAULT_TRACE_SIZE,             //Trace Size
                                               l_snrBulkPwr,                   //userdata1
                                               G_sysConfigData.pcap.system_pcap);//userdata2

                addCalloutToErrl(   l_errl,
                                    ERRL_CALLOUT_TYPE_COMPONENT_ID,
                                    ERRL_COMPONENT_ID_FIRMWARE,
                                    ERRL_CALLOUT_PRIORITY_HIGH
                                );

                addCalloutToErrl(   l_errl,
                                    ERRL_CALLOUT_TYPE_HUID,
                                    G_sysConfigData.apss_huid,
                                    ERRL_CALLOUT_PRIORITY_HIGH
                                );

                // and sets the consolidate action flag
                setErrlActions( l_errl, ERRL_ACTIONS_CONSOLIDATE_ERRORS );

                // then l_pcap_sev to informational (subsequent pcap logs)
                l_pcap_sev = ERRL_SEV_INFORMATIONAL;

                // Commit Error
                commitErrl(&l_errl);

                // we are done lets break
                break;
            }
        }

        // throttle trace to every 3600 seconds (1hr = 3600000)
        if(!l_throttle_traced && ( DURATION_IN_MS_UNTIL_NOW_FROM(l_time) > 3600000 ) )
        {
            TRAC_INFO("Frequency power limited due to transient condition: PowerLimited=%x, FailSafe=%x, OverSubScription=%x CurrentBulkPwr=%x",
                      G_non_dps_power_limited, AMEC_INTF_GET_FAILSAFE(),
                      AMEC_INTF_GET_OVERSUBSCRIPTION(), l_snrBulkPwr );
            l_throttle_traced = TRUE;
            l_time = ssx_timebase_get();
        }
    } while( 0 );

    return;
}
// Function Specification // // Name: dbug_err_inject // // Description: Injects an error // // End Function Specification void dbug_err_inject(const cmdh_fsp_cmd_t * i_cmd_ptr, cmdh_fsp_rsp_t * i_rsp_ptr) { errlHndl_t l_err; cmdh_dbug_inject_errl_query_t *l_cmd_ptr = (cmdh_dbug_inject_errl_query_t*) i_cmd_ptr; i_rsp_ptr->data_length[0] = 0; i_rsp_ptr->data_length[1] = 0; G_rsp_status = ERRL_RC_SUCCESS; if(!strncmp(l_cmd_ptr->comp, "RST", OCC_TRACE_NAME_SIZE)) { l_err = createErrl(CMDH_DBUG_MID, //modId INTERNAL_FAILURE, //reasoncode OCC_NO_EXTENDED_RC, //Extended reason code ERRL_SEV_PREDICTIVE, //Severity NULL, //Trace Buf DEFAULT_TRACE_SIZE, //Trace Size 0xff, //userdata1 0); //userdata2 if (INVALID_ERR_HNDL == l_err) { G_rsp_status = ERRL_RC_INTERNAL_FAIL; } addCalloutToErrl(l_err, ERRL_CALLOUT_TYPE_HUID, //callout type (HUID/CompID) G_sysConfigData.proc_huid, //callout data ERRL_CALLOUT_PRIORITY_HIGH); //priority REQUEST_RESET(l_err); } else { l_err = createErrl(CMDH_DBUG_MID, //modId INTERNAL_FAILURE, //reasoncode OCC_NO_EXTENDED_RC, //Extended reason code ERRL_SEV_UNRECOVERABLE, //Severity TRAC_get_td(l_cmd_ptr->comp), //Trace Buf DEFAULT_TRACE_SIZE, //Trace Size 0xff, //userdata1 0); //userdata2 if (INVALID_ERR_HNDL == l_err) { G_rsp_status = ERRL_RC_INTERNAL_FAIL; } // Commit Error log commitErrl(&l_err); } if (G_rsp_status == ERRL_RC_INTERNAL_FAIL) { TRAC_ERR("cmdh_dbug_inject_errl: Fail creating ERR Log\n"); } else { TRAC_INFO("cmdh_dbug_inject_errl: inject errl for COMP : %s\n", l_cmd_ptr->comp); } return; }
// Function Specification
//
// Name: amec_slave_init
//
// Description: Perform initialization of any/all AMEC Slave Functions.
//              Creates the two GPE nop requests used to measure worst-case
//              GPE engine timings; on any create failure a predictive error
//              is logged and an OCC reset is requested.
//
// End Function Specification
void amec_slave_init()
{
    errlHndl_t l_err = NULL;   // Error handler
    int rc = 0;                // Return code for GPE0 request creation
    int rc2 = 0;               // Return code for GPE1 request creation

    // Set the GPE Request Pointers to NULL in case the create fails.
    G_fw_timing.gpe0_timing_request = NULL;
    G_fw_timing.gpe1_timing_request = NULL;

    // Initializes the GPE routine that will be used to measure the worst case
    // timings for GPE0
    rc = pore_flex_create( &G_gpe_nop_request[0],    //gpe_req for the task
                           &G_pore_gpe0_queue,       //queue
                           (void *) GPE_pore_nop,    //entry point
                           (uint32_t) NULL,          //parm for the task
                           SSX_WAIT_FOREVER,         //no timeout
                           (AsyncRequestCallback) amec_slv_update_gpe_sensors, //callback
                           (void *) GPE_ENGINE_0,    //callback argument
                           ASYNC_CALLBACK_IMMEDIATE ); //options

    // Initializes the GPE routine that will be used to measure the worst case
    // timings for GPE1
    rc2 = pore_flex_create( &G_gpe_nop_request[1],   //gpe_req for the task
                            &G_pore_gpe1_queue,      //queue
                            (void *)GPE_pore_nop,    //entry point
                            (uint32_t) NULL,         //parm for the task
                            SSX_WAIT_FOREVER,        //no timeout
                            (AsyncRequestCallback) amec_slv_update_gpe_sensors, //callback
                            (void *) GPE_ENGINE_1,   //callback argument
                            ASYNC_CALLBACK_IMMEDIATE ); //options

    // If we couldn't create the poreFlex objects, there must be a major problem
    // so we will log an error and halt OCC.
    if( rc || rc2 )
    {
        // If either poreFlex create failed there is a problem.
        TRAC_ERR("Failed to create GPE duration poreFlex object[0x%x, 0x%x]", rc, rc2 );

        /* @
         * @errortype
         * @moduleid    AMEC_INITIALIZE_FW_SENSORS
         * @reasoncode  SSX_GENERIC_FAILURE
         * @userdata1   return code - gpe0
         * @userdata2   return code - gpe1
         * @userdata4   OCC_NO_EXTENDED_RC
         * @devdesc     Failure to create PORE-GPE poreFlex object for FW timing
         *              analysis.
         */
        l_err = createErrl( AMEC_INITIALIZE_FW_SENSORS, //modId
                            SSX_GENERIC_FAILURE,        //reasoncode
                            OCC_NO_EXTENDED_RC,         //Extended reason code
                            ERRL_SEV_PREDICTIVE,        //Severity
                            NULL,                       //TODO: create trace //Trace Buf
                            DEFAULT_TRACE_SIZE,         //Trace Size
                            rc,                         //userdata1
                            rc2                         //userdata2
                            );

        // Failure here is fatal for FW timing analysis: request an OCC reset.
        REQUEST_RESET(l_err);
    }
    else
    {
        // Everything was successful, so set FW timing pointers to these
        // GPE Request objects
        G_fw_timing.gpe0_timing_request = &G_gpe_nop_request[0];
        G_fw_timing.gpe1_timing_request = &G_gpe_nop_request[1];
    }

    // Initialize Vector Sensors for AMEC use
    amec_init_vector_sensors();

    // Initialize AMEC internal parameters
    amec_init_gamec_struct();
}
// RTL task that collects bulk core data via a GPE job.  Double-buffers the
// per-core data: when the previous GPE request has completed, the private
// buffer is swapped with the global one so AMEC never reads a buffer the GPE
// engine is writing.  Also handles the HW280375 empath-error workaround.
// NOTE(review): the remainder of this function continues beyond this excerpt.
void task_core_data( task_t * i_task )
{
    errlHndl_t l_err = NULL;   //Error handler
    tracDesc_t l_trace = NULL; //Temporary trace descriptor
    int rc = 0;                //return code
    bulk_core_data_task_t * l_bulk_core_data_ptr = (bulk_core_data_task_t *)i_task->data_ptr;
    GpeGetCoreDataParms * l_parms = (GpeGetCoreDataParms *)(l_bulk_core_data_ptr->gpe_req.parameter);
    gpe_bulk_core_data_t * l_temp = NULL;  // scratch pointer for buffer swap

    do
    {
        //First, check to see if the previous GPE request still running
        //A request is considered idle if it is not attached to any of the
        //asynchronous request queues
        if( !(async_request_is_idle(&l_bulk_core_data_ptr->gpe_req.request)) )
        {
            //This should not happen unless there's a problem
            //Trace 1 time
            if( !G_queue_not_idle_traced )
            {
                TRAC_ERR("Core data GPE is still running \n");
                G_queue_not_idle_traced = TRUE;
            }
            break;
        }

        //Need to complete collecting data for all assigned cores from previous interval
        //and tick 0 is the current tick before collect data again.
        if( (l_bulk_core_data_ptr->current_core == l_bulk_core_data_ptr->end_core)
            && ((CURRENT_TICK & (MAX_NUM_TICKS - 1)) != 0) )
        {
            PROC_DBG("Not collect data. Need to wait for tick.\n");
            break;
        }

        //Check to see if the previously GPE request has successfully completed
        //A request is not considered complete until both the engine job
        //has finished without error and any callback has run to completion.
        if( async_request_completed(&l_bulk_core_data_ptr->gpe_req.request)
            && CORE_PRESENT(l_bulk_core_data_ptr->current_core) )
        {
            //If the previous GPE request succeeded then swap core_data_ptr
            //with the global one. The gpe routine will write new data into
            //a buffer that is not being accessed by the RTLoop code.
            PROC_DBG( "Swap core_data_ptr [%x] with the global one\n",
                      l_bulk_core_data_ptr->current_core );

            //debug only
#ifdef PROC_DEBUG
            print_core_status(l_bulk_core_data_ptr->current_core);
            print_core_data_sensors(l_bulk_core_data_ptr->current_core);
#endif

            l_temp = l_bulk_core_data_ptr->core_data_ptr;
            l_bulk_core_data_ptr->core_data_ptr = G_core_data_ptrs[l_bulk_core_data_ptr->current_core];
            G_core_data_ptrs[l_bulk_core_data_ptr->current_core] = l_temp;

            //Core data has been collected so set the bit in global mask.
            //AMEC code will know which cores to update sensors for. AMEC is
            //responsible for clearing the bit later on.
            G_updated_core_mask |= CORE0_PRESENT_MASK >> (l_bulk_core_data_ptr->current_core);

            // Presumptively clear the empath error mask
            G_empath_error_core_mask &=
                ~(CORE0_PRESENT_MASK >> (l_bulk_core_data_ptr->current_core));

            // The gpe_data collection code has to handle the workaround for
            // HW280375. Two new flags have been added to the OHA_RO_STATUS_REG
            // image to indicate whether the EMPATH collection failed, and
            // whether it was due to an "expected" error that we can ignore
            // (we can ignore the data as well), or an "unexpected" error that
            // we will create an informational log one time.
            //
            // The "expected" errors are very rare in practice, in fact we may
            // never even see them unless running a specific type of workload.
            // If you want to test the handling of expected errors compile the
            // GPE code with -DINJECT_HW280375_ERRORS which will inject an error
            // approximately every 1024 samples
            //
            // To determine if the expected error has occurred inspect the
            // CoreDataOha element of the CoreData structure written by the GPE
            // core data job.  The OHA element contains the oha_ro_status_reg.
            // Inside the OHA status register is a 16 bit reserved field.
            // gpe_data.h defines two masks that can be applied against the
            // reserved field to check for these errors:
            //     CORE_DATA_EXPECTED_EMPATH_ERROR
            //     CORE_DATA_UNEXPECTED_EMPATH_ERROR
            // Also, a 4-bit PCB parity + error code is saved at bit position:
            // CORE_DATA_EMPATH_ERROR_LOCATION, formally the length is
            // specified by: CORE_DATA_EMPATH_ERROR_BITS
            gpe_bulk_core_data_t *l_core_data =
                G_core_data_ptrs[l_bulk_core_data_ptr->current_core];

            // We will trace the errors, but only a certain number of
            // times, we will only log the unexpected error once.
#define OCC_EMPATH_ERROR_THRESH 10
            static uint32_t L_expected_emp_err_cnt = 0;
            static uint32_t L_unexpected_emp_err_cnt = 0;

            // Check the reserved field for the expected or the unexpected error flag
            if ((l_core_data->oha.oha_ro_status_reg.fields._reserved0 & CORE_DATA_EXPECTED_EMPATH_ERROR) ||
                (l_core_data->oha.oha_ro_status_reg.fields._reserved0 & CORE_DATA_UNEXPECTED_EMPATH_ERROR))
            {
                // Indicate empath error on current core
                G_empath_error_core_mask |=
                    CORE0_PRESENT_MASK >> (l_bulk_core_data_ptr->current_core);

                // Save the high and low order words of the OHA status reg
                uint32_t l_oha_reg_high = l_core_data->oha.oha_ro_status_reg.words.high_order;
                uint32_t l_oha_reg_low = l_core_data->oha.oha_ro_status_reg.words.low_order;

                // Handle each error case
                if ((l_core_data->oha.oha_ro_status_reg.fields._reserved0 & CORE_DATA_EXPECTED_EMPATH_ERROR)
                    && (L_expected_emp_err_cnt < OCC_EMPATH_ERROR_THRESH))
                {
                    L_expected_emp_err_cnt++;
                    TRAC_IMP("Expected empath collection error occurred %d time(s)! Core = %d",
                             L_expected_emp_err_cnt, l_bulk_core_data_ptr->current_core);
                    TRAC_IMP("OHA status register: 0x%4.4x%4.4x",
                             l_oha_reg_high, l_oha_reg_low);
                }

                if ((l_core_data->oha.oha_ro_status_reg.fields._reserved0 & CORE_DATA_UNEXPECTED_EMPATH_ERROR)
                    && (L_unexpected_emp_err_cnt < OCC_EMPATH_ERROR_THRESH))
                {
                    L_unexpected_emp_err_cnt++;
                    TRAC_ERR("Unexpected empath collection error occurred %d time(s)! Core = %d",
                             L_unexpected_emp_err_cnt, l_bulk_core_data_ptr->current_core);
                    TRAC_ERR("OHA status register: 0x%4.4x%4.4x",
                             l_oha_reg_high, l_oha_reg_low);

                    // Create and commit an informational error the first
                    // time this occurs.
                    if (L_unexpected_emp_err_cnt == 1)
                    {
                        TRAC_IMP("Logging unexpected empath collection error 1 time only.");
                        /*
                         * @errortype
                         * @moduleid    PROC_TASK_CORE_DATA_MOD
                         * @reasoncode  INTERNAL_HW_FAILURE
                         * @userdata1   OHA status reg high
                         * @userdata2   OHA status reg low
                         * @userdata4   ERC_PROC_CORE_DATA_EMPATH_ERROR
                         * @devdesc     An unexpected error occurred while
                         *              collecting core empath data.
                         */
                        l_err = createErrl(
                            PROC_TASK_CORE_DATA_MOD,          //modId
                            INTERNAL_HW_FAILURE,              //reason code
                            ERC_PROC_CORE_DATA_EMPATH_ERROR,  //Extended reason code
                            ERRL_SEV_INFORMATIONAL,           //Severity
                            NULL,                             //Trace
                            DEFAULT_TRACE_SIZE,               //Trace Size
                            l_oha_reg_high,                   //userdata1
                            l_oha_reg_low);                   //userdata2

                        commitErrl(&l_err);
                    }
                }
            }
        }
// Function Specification
//
// Name: SMGR_set_mode
//
// Description: Change the OCC power mode to i_mode.  The whole operation is
//              serialized by G_smgrModeChangeSem.  The matching transition
//              function is looked up in G_smgr_mode_trans[] and run; on
//              success the per-mode thermal thresholds are reloaded.
//              NOTE(review): i_sms_type is not used in this excerpt —
//              confirm whether it is still required by callers.
//
// End Function Specification
errlHndl_t SMGR_set_mode(const OCC_MODE i_mode,
                         const uint8_t i_sms_type)
{
    errlHndl_t l_errlHndl = NULL;
    int jj=0;
    OCC_MODE l_mode = i_mode;

    do
    {
        // Get lock for critical section
        if(ssx_semaphore_pend(&G_smgrModeChangeSem,SSX_WAIT_FOREVER))
        {
            /* @
             * @errortype
             * @moduleid    MAIN_MODE_TRANSITION_MID
             * @reasoncode  SSX_GENERIC_FAILURE
             * @userdata1   none
             * @userdata4   ERC_RUNNING_SEM_PENDING_FAILURE
             * @devdesc     SSX semaphore related failure
             */
            l_errlHndl = createErrl(MAIN_MODE_TRANSITION_MID,        //modId
                                    SSX_GENERIC_FAILURE,             //reasoncode
                                    ERC_RUNNING_SEM_PENDING_FAILURE, //Extended reason code
                                    ERRL_SEV_UNRECOVERABLE,          //Severity
                                    NULL,                            //Trace Buf
                                    DEFAULT_TRACE_SIZE,              //Trace Size
                                    0,                               //userdata1
                                    0);                              //userdata2

            // Callout firmware
            addCalloutToErrl(l_errlHndl,
                             ERRL_CALLOUT_TYPE_COMPONENT_ID,
                             ERRL_COMPONENT_ID_FIRMWARE,
                             ERRL_CALLOUT_PRIORITY_HIGH);
            break;
        }

        //Check to see if we need to make a change
        if(l_mode == OCC_MODE_NOCHANGE)
        {
            break;
        }

        // SAPPHIRE only accepts DPS-FE mode. In case OCC gets other modes, it should accept the request
        // and keep reporting back that it is in that mode. However, internally we should not
        // initiate any mode transition, i.e., OCC should remain internally in DPS-FE mode.
        if(G_sysConfigData.system_type.kvm)
        {
            G_occ_external_req_mode_kvm = l_mode;
            if (l_mode != OCC_MODE_DYN_POWER_SAVE)
            {
                TRAC_ERR("SAPPHIRE only accepts DPS-FE mode(6) but requested mode is : %d", l_mode);
                l_mode = OCC_MODE_DYN_POWER_SAVE;
            }
        }

        switch (l_mode)
        {
            case OCC_MODE_NOMINAL:           // FALL THROUGH
            case OCC_MODE_PWRSAVE:           // FALL THROUGH
            case OCC_MODE_DYN_POWER_SAVE:    // FALL THROUGH
            case OCC_MODE_DYN_POWER_SAVE_FP: // FALL THROUGH
            case OCC_MODE_TURBO:             // FALL THROUGH
            case OCC_MODE_STURBO:            // FALL THROUGH
            case OCC_MODE_FFO:               // FALL THROUGH
                // Notify AMEC of mode change
                // Change Mode via Transition Function
                do
                {
                    // Loop through mode transition table, and find the state
                    // transition function that matches the transition we need to do.
                    for(jj=0; jj<G_smgr_mode_trans_count; jj++)
                    {
                        if( ((G_smgr_mode_trans[jj].old_state == G_occ_internal_mode) ||
                             (G_smgr_mode_trans[jj].old_state == OCC_MODE_ALL) )
                            && (G_smgr_mode_trans[jj].new_state == l_mode) )
                        {
                            // We found the transtion that matches, now run the function
                            // that is associated with that state transition.
                            if(NULL != G_smgr_mode_trans[jj].trans_func_ptr)
                            {
                                // Signal that we are now in a mode transition
                                G_mode_transition_occuring = TRUE;
                                // Run transition function
                                l_errlHndl = (G_smgr_mode_trans[jj].trans_func_ptr)();
                                // Signal that we are done with the transition
                                G_mode_transition_occuring = FALSE;
                                break;
                            }
                        }
                    }

                    // Check if we hit the end of the table without finding a valid
                    // mode transition.  If we did, log an internal error.
                    if(G_smgr_mode_trans_count == jj)
                    {
                        TRAC_ERR("No transition (or NULL) found for the mode change\n");
                        l_errlHndl = NULL; //TODO: Create Error
                        break;
                    }

                    // Update the power mode for all core groups that are following system mode
                    AMEC_part_update_sysmode_policy(CURRENT_MODE());
                } while(0);
                break;

            default:
                //unsupported mode
                break;
        }

        if(l_errlHndl)
        {
            // Mode transition failed; skip the remaining mode-change work
            // and return the error to the caller.
            break;
        }

        // Load correct thermal thresholds based on the current mode
        l_errlHndl = AMEC_data_write_thrm_thresholds(CURRENT_MODE());

        // Update the CPU speed in AME?
        // Register the New Mode?
        // Update Power Policy Requirements?
        // Update CPM Calibration
    }while(0);

    // If we have a mode change failure, the mode change flag needs to be set;
    // otherwise, it needs to be cleared/unset.
    if(l_errlHndl)
    {
        // NOTE(review): intentionally empty in current code — the flag
        // handling described above is not implemented yet.
    }

    // Unlock critical section
    ssx_semaphore_post(&G_smgrModeChangeSem);

    return l_errlHndl;
}
//************************************************************************* // Functions //************************************************************************* // Function errlTestMain // // Name: sensorTestMain // // Description: Entry point function // // End Function Specification errlHndl_t errlTestMain(void * i_arg) { errlHndl_t l_err = NULL; uint16_t l_modId = 0; uint32_t l_rc = ERRL_RC_SUCCESS; ERRL_DBG("Enter errlTestMain\n"); do { l_rc = errlTestErrorHandling(); l_modId = TEST_ERROR_HANDLING; if( l_rc != ERRL_RC_SUCCESS) { TRAC_INFO("Failure on error handling test"); break; }; l_rc = errlTestCreateCommitDeleteLog(); l_modId = TEST_CREATE_COMMIT_DELETE_LOG ; if( l_rc != ERRL_RC_SUCCESS) { TRAC_INFO("Failure on Log test"); break; } l_rc = errlTestAddUsrDtlsToErrl(); l_modId = TEST_ADD_USRDTLS_TO_ERRL ; if( l_rc != ERRL_RC_SUCCESS) { TRAC_INFO("Failure on add user detail test"); break; } l_rc = errlTestAddTraceToErrl(); l_modId = TEST_ADD_TRACE_TO_ERRL ; if( l_rc != ERRL_RC_SUCCESS) { TRAC_INFO("Failure on add trace test"); break; } l_rc = errlTestTime(); l_modId = TEST_TIME ; if( l_rc != ERRL_RC_SUCCESS) { TRAC_INFO("Failure on time test"); break; } l_rc = errlTestCreate2InfoCallhomeLog(); l_modId = TEST_CREATE2INFO_CALLHOMELOG ; if( l_rc != ERRL_RC_SUCCESS) { TRAC_INFO("Failure on create call home log test"); break; } l_rc = errlTestCreateMaxLogs(); l_modId = TEST_CREATE_MAX_LOGS ; if( l_rc != ERRL_RC_SUCCESS) { TRAC_INFO("Failure on create max logs test"); break; } l_rc = errlTestCallouts(); l_modId = TEST_CALLOUTS ; if( l_rc != ERRL_RC_SUCCESS) { TRAC_INFO("Failure on callouts test"); break; } l_rc = errlTestSetErrlSevToInfo(); l_modId = TEST_SET_ERRLSEV_TO_INFO ; if( l_rc != ERRL_RC_SUCCESS) { TRAC_INFO("Failure on SetErrlSevToInfo test"); break; } } while (0); if( l_rc != ERRL_RC_SUCCESS) { ERRL_DBG("**********************************************"); ERRL_DBG("* errl Test Failed (errlTest.c): line: %d",l_rc); 
ERRL_DBG("**********************************************"); /* @ * @errortype * @moduleid TEST_APLT_MODID_ERRLTEST * @reasoncode INTERNAL_FAILURE * @userdata1 Test Applet ID * @userdata2 Return Code * @userdata4 OCC_NO_EXTENDED_RC * @devdesc Failure executing test applet */ l_err = createErrl(TEST_APLT_MODID_ERRLTEST, INTERNAL_FAILURE, OCC_NO_EXTENDED_RC, ERRL_SEV_INFORMATIONAL, NULL, 0, ERRL_TEST_APLT, l_rc); } else { ERRL_DBG("**********************************************"); ERRL_DBG("* errl Test Passed (errlTest.c)"); ERRL_DBG("**********************************************"); } ERRL_DBG("Exit errlTestMain\n"); return l_err; }
// Function Specification
//
// Name: amec_update_vrm_sensors
//
// Description: Updates sensors that use data from the VRMs
//              (e.g., VR_FAN, FANS_FULL_SPEED, VR_HOT).
//
// Thread: RealTime Loop
//
// End Function Specification
void amec_update_vrm_sensors(void)
{
    /*------------------------------------------------------------------------*/
    /*  Local Variables                                                        */
    /*------------------------------------------------------------------------*/
    int                         l_rc = 0;
    int                         l_vrfan = 0;
    int                         l_softoc = 0;
    int                         l_minus_np1_regmode = 0;
    int                         l_minus_n_regmode = 0;
    static uint8_t              L_error_count = 0;  // consecutive VR_FAN read failures
    uint8_t                     l_pin = 0;
    uint8_t                     l_pin_value = 1;    // active low, so set default to high
    uint8_t                     l_vrhot_count = 0;
    errlHndl_t                  l_err = NULL;

    /*------------------------------------------------------------------------*/
    /*  Code                                                                   */
    /*------------------------------------------------------------------------*/

    // Check if we have access to SPIVID. In DCMs only Master OCC has access to
    // the SPIVID.
    if (G_dcm_occ_role == OCC_DCM_MASTER)
    {
        // VR_FAN and SOFT_OC come from SPIVID
        l_rc = vrm_read_state(SPIVRM_PORT(0),
                              &l_minus_np1_regmode,
                              &l_minus_n_regmode,
                              &l_vrfan,
                              &l_softoc);
        if (l_rc == 0)
        {
            // Update the VR_FAN sensor
            sensor_update( AMECSENSOR_PTR(VRFAN250USPROC), (uint16_t)l_vrfan );

            // Clear our error count and the 'read failure' flag (since we can
            // read VR_FAN signal)
            L_error_count = 0;
            G_thrm_fru_data[DATA_FRU_VRM].read_failure = 0;

            // Obtain the 'fan_full_speed' GPIO from APSS
            l_pin = G_sysConfigData.apss_gpio_map.fans_full_speed;

            // No longer reading gpio from APSS in GA1 due to instability in
            // APSS composite mode; l_pin_value keeps its default (de-asserted).
            //apss_gpio_get(l_pin, &l_pin_value);

            // VR_HOT sensor is a counter of number of times the VRHOT signal
            // has been asserted
            l_vrhot_count = AMECSENSOR_PTR(VRHOT250USPROC)->sample;

            // Check if VR_FAN is asserted AND if 'fans_full_speed' GPIO is ON.
            // Note that this GPIO is active low.
            if (AMECSENSOR_PTR(VRFAN250USPROC)->sample && !(l_pin_value))
            {
                // VR_FAN is asserted and 'fans_full_speed' GPIO is ON,
                // then increment our VR_HOT counter (saturating at the setpoint)
                if (l_vrhot_count < g_amec->vrhotproc.setpoint)
                {
                    l_vrhot_count++;
                }
            }
            else
            {
                // Reset our VR_HOT counter
                l_vrhot_count = 0;
            }
            sensor_update(AMECSENSOR_PTR(VRHOT250USPROC), l_vrhot_count);
        }
        else
        {
            // Increment our error count
            L_error_count++;

            // Don't allow the error count to wrap (uint8_t wraps to 0; pin at 0xFF)
            if (L_error_count == 0)
            {
                L_error_count = 0xFF;
            }

            // Log an error if we exceeded our number of fail-to-read sensor
            // (a threshold of 0xFF disables logging)
            if ((L_error_count == g_amec->proc[0].vrfan_error_count) &&
                (g_amec->proc[0].vrfan_error_count != 0xFF))
            {
                TRAC_ERR("amec_update_vrm_sensors: Failed to read VR_FAN for %u consecutive times!",
                         L_error_count);

                // Also, inform the thermal thread to send a cooling request
                G_thrm_fru_data[DATA_FRU_VRM].read_failure = 1;

                /* @
                 * @errortype
                 * @moduleid    AMEC_HEALTH_CHECK_VRFAN_TIMEOUT
                 * @reasoncode  VRM_VRFAN_TIMEOUT
                 * @userdata1   timeout value
                 * @userdata2   0
                 * @userdata4   OCC_NO_EXTENDED_RC
                 * @devdesc     Failed to read VR_FAN signal from regulator.
                 */
                // NOTE(review): userdata1 is the DIMM temperature timeout
                // (g_amec->thermaldimm.temp_timeout) although the comment says
                // "timeout value" for a VRM failure — confirm intended field.
                l_err = createErrl(AMEC_HEALTH_CHECK_VRFAN_TIMEOUT,  //modId
                                   VRM_VRFAN_TIMEOUT,                //reasoncode
                                   OCC_NO_EXTENDED_RC,               //Extended reason code
                                   ERRL_SEV_PREDICTIVE,              //Severity
                                   NULL,                             //Trace Buf
                                   DEFAULT_TRACE_SIZE,               //Trace Size
                                   g_amec->thermaldimm.temp_timeout, //userdata1
                                   0);                               //userdata2

                // Callout backplane for this VRM error
                addCalloutToErrl(l_err,
                                 ERRL_CALLOUT_TYPE_HUID,
                                 G_sysConfigData.backplane_huid,
                                 ERRL_CALLOUT_PRIORITY_MED);

                // Commit the error
                commitErrl(&l_err);
            }
        }
    }

    // Memory VR sensors are unconditionally reported as 0 (no HW source yet).
    if( 1 )
    {
        sensor_update( AMECSENSOR_PTR(VRFAN250USMEM), 0 );
        sensor_update( AMECSENSOR_PTR(VRHOT250USMEM), 0 );
    }
}
//************************************************************************* // Functions //************************************************************************* void amec_vectorize_core_sensor(sensor_t * l_sensor, vectorSensor_t * l_vector, const VECTOR_SENSOR_OP l_op, uint16_t l_sensor_elem_array_gsid) { #define VECTOR_CREATE_FAILURE 1 #define VECTOR_ADD_ELEM_FAILURE 2 int l_idx = 0; // Used to index the for loops for vector create int l_rc = 0; // Indicates failure to add a sensor to vector uint16_t l_gsid = 0xFFFF; errlHndl_t l_err = NULL; do { // Grab GSID for errl in case of failure l_gsid = l_sensor->gsid; // Vectorize the sensor sensor_vectorize(l_sensor, l_vector, l_op); // If vectorize worked, add elements to the vector sensor if(NULL != l_sensor->vector) { // Loop through cores for(l_idx = 0; l_idx < MAX_NUM_CORES; l_idx++) { // Add elements to the vector sensor sensor_vector_elem_add(l_sensor->vector, l_idx, AMECSENSOR_ARRAY_PTR(l_sensor_elem_array_gsid, l_idx)); // If core is not present, disable this vector element if(!CORE_PRESENT(l_idx)) { sensor_vector_elem_enable(l_sensor->vector, l_idx, 0 /* Disable */); } } // Sanity check, we should have MAX_NUM_CORES entries in // vector sensor if(l_sensor->vector->size != MAX_NUM_CORES) { // Set l_rc and break out so that we can create an errl l_rc = VECTOR_ADD_ELEM_FAILURE; break; } } else { // Set l_rc and break out so that we can create an errl l_rc = VECTOR_CREATE_FAILURE; break; } }while(0); if(l_rc) { //If fail to create pore flex object then there is a problem. 
TRAC_ERR("Failed to vectorize sensor[0x%x, 0x%x]", l_gsid, l_rc ); /* @ * @errortype * @moduleid AMEC_VECTORIZE_FW_SENSORS * @reasoncode SSX_GENERIC_FAILURE * @userdata1 return code * @userdata2 gsid of failed sensor * @userdata4 OCC_NO_EXTENDED_RC * @devdesc Firmware failure in call to vectorize sensor */ l_err = createErrl( AMEC_VECTORIZE_FW_SENSORS, //modId SSX_GENERIC_FAILURE, //reasoncode OCC_NO_EXTENDED_RC, //Extended reason code ERRL_SEV_UNRECOVERABLE, //Severity NULL,//TODO: create trace //Trace Buf DEFAULT_TRACE_SIZE, //Trace Size l_rc, //userdata1 l_gsid //userdata2 ); REQUEST_RESET(l_err); } }
// Function Specification
//
// Name: errlTestAddTraceToErrl
//
// Description: Verifies addTraceToErrl size limiting: a single oversized
//              trace must be clamped to MAX_ERRL_ENTRY_SZ, and repeated
//              trace additions must each grow the entry by at most
//              (header + requested size) without exceeding the max.
//              Returns 0 on success, non-zero (failing line) otherwise.
//
// End Function Specification
uint32_t errlTestAddTraceToErrl()
{
    uint32_t l_rc = 0;
    uint16_t l_entrySizeBefore = 0;   // entry size before each addTraceToErrl call
    uint16_t l_entrySizeAfter = 0;    // entry size after the call
    ERRL_DBG("START");

    do
    {
        // Create one err log
        errlHndl_t l_handle = NULL;
        l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                               ERRL_SEV_PREDICTIVE, NULL, 512, 0x1, 0x2);
        CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc);

        // l_handle will set to NULL after calling the commitErrl, so we need to store it
        errlHndl_t l_handleX = l_handle;

        ERRL_DBG("Slots after Create - 1 slots should be used (one of each");
        ppdumpslot();

        /****************************************************/
        // Test size limit for addTraceToErrl
        // Add "trace" data that exceeds the max size
        l_entrySizeBefore = l_handle->iv_userDetails.iv_entrySize;
        addTraceToErrl(g_trac_inf, MAX_BUFFER_SIZE, l_handle);
        l_entrySizeAfter = l_handle->iv_userDetails.iv_entrySize;
        // The entry must be clamped to the max entry size
        CHECK_CONDITION( l_entrySizeAfter <= MAX_ERRL_ENTRY_SZ, l_rc);

        dumpLog( l_handle, l_handle->iv_userDetails.iv_entrySize );

        commitErrl( &l_handle );
        ERRL_DBG("Slots after Commit - 1 slots should be used/committed");
        ppdumpslot();

        deleteErrl(&l_handleX);
        ERRL_DBG("Slots after delete Log - All slots should be empty");
        ppdumpslot();

        /****************************************************/
        // Test size limit for addTraceToErrl with continuous calls
        // Create log with 512 bytes trace
        l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                               ERRL_SEV_PREDICTIVE, g_trac_inf, 512, 0x1, 0x2);
        CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc);

        // l_handle will set to NULL after calling the commitErrl, so we need to store it
        l_handleX = l_handle;
        ppdumpslot();

        // Add 256 bytes of trace (512+256)
        l_entrySizeBefore = l_handle->iv_userDetails.iv_entrySize;
        addTraceToErrl(g_trac_inf, 256, l_handle);     // @at012c
        l_entrySizeAfter = l_handle->iv_userDetails.iv_entrySize;
        ERRL_DBG("Slots after create + 256 bytes" );
        ppdumpslot();
        // (header + 256) is the size that add to entry
        CHECK_CONDITION( l_entrySizeAfter <= (l_entrySizeBefore+sizeof(ErrlUserDetailsEntry_t)+256), l_rc);

        // Add 512 bytes of trace (512+256+512)
        l_entrySizeBefore = l_handle->iv_userDetails.iv_entrySize;
        addTraceToErrl(g_trac_inf, 512, l_handle);     // @at012c
        l_entrySizeAfter = l_handle->iv_userDetails.iv_entrySize;
        ERRL_DBG("Slots after create + 256 + 512 bytes");
        ppdumpslot();
        // (header + 512) is the size that add to entry
        CHECK_CONDITION( l_entrySizeAfter <= (l_entrySizeBefore+sizeof(ErrlUserDetailsEntry_t)+512), l_rc);

        // Add 1024 bytes of trace (512+256+512+1024), the entry size is more than 2048 now
        l_entrySizeBefore = l_handle->iv_userDetails.iv_entrySize;
        addTraceToErrl(g_trac_inf, 1024, l_handle);    // @at012c
        l_entrySizeAfter = l_handle->iv_userDetails.iv_entrySize;
        ERRL_DBG("Slots after create + 256 + 512 bytes");
        ppdumpslot();
        // Total must still be clamped to the max entry size
        CHECK_CONDITION( l_entrySizeAfter <= MAX_ERRL_ENTRY_SZ, l_rc);

        commitErrl( &l_handle );
        deleteErrl(&l_handleX);
        ERRL_DBG("Slots should now be empty");
        ppdumpslot();

        ERRL_DBG("END \n");
    }while(0);

    return l_rc;
}
// Function Specification
//
// Name: errlTestAddUsrDtlsToErrl
//
// Description: Verifies addUsrDtlsToErrl size limiting: oversized user
//              detail data must be clamped to MAX_ERRL_ENTRY_SZ (trace data)
//              or MAX_ERRL_CALL_HOME_SZ (call home data), and each normal
//              addition must grow the entry by exactly (header + data size).
//              Returns 0 on success, non-zero (failing line) otherwise.
//
// End Function Specification
uint32_t errlTestAddUsrDtlsToErrl()
{
    uint32_t l_rc = 0;
    ERRL_DBG("START");
    uint16_t l_entrySizeBefore = 0;   // entry size before each addUsrDtlsToErrl call
    uint16_t l_entrySizeAfter = 0;    // entry size after the call

    do
    {
        // Create three err logs, one of each severity used below
        errlHndl_t l_handle = NULL;
        errlHndl_t l_handle2 = NULL;
        errlHndl_t l_handle3 = NULL;

        l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                               ERRL_SEV_UNRECOVERABLE, NULL, 512, 0x1, 0x2);
        l_handle2 = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                                ERRL_SEV_CALLHOME_DATA, NULL, 512, 0x1, 0x2);
        l_handle3 = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                                ERRL_SEV_INFORMATIONAL, NULL, 512, 0x1, 0x2);

        // l_handle will set to NULL after calling the commitErrl, so we need to store it
        errlHndl_t l_handleX = l_handle;
        errlHndl_t l_handle2X = l_handle2;
        errlHndl_t l_handle3X = l_handle3;

        ERRL_DBG("Slots after Create - 3 slots should be used (one of each");
        ppdumpslot();
        CHECK_CONDITION( (l_handle != INVALID_ERR_HNDL) &&
                         (l_handle2 != INVALID_ERR_HNDL) &&
                         (l_handle3 != INVALID_ERR_HNDL), l_rc);

        /****************************************************/
        // Test size limit for addUsrDtlsToErrl
        // Add "user details" data that exceeds the max size for l_handle
        l_entrySizeBefore = l_handle->iv_userDetails.iv_entrySize;
        memset( G_data, 0xCC, sizeof( G_data ) );
        addUsrDtlsToErrl( l_handle, G_data, sizeof( G_data ),
                          ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA );
        l_entrySizeAfter = l_handle->iv_userDetails.iv_entrySize;
        // Trace data is clamped to the max entry size
        CHECK_CONDITION( l_entrySizeAfter == MAX_ERRL_ENTRY_SZ, l_rc);

        // Add "user details" data that exceeds the max size for l_handle2
        l_entrySizeBefore = l_handle2->iv_userDetails.iv_entrySize;
        memset( G_data, 0xDD, sizeof( G_data ) );
        addUsrDtlsToErrl( l_handle2, G_data, sizeof( G_data ),
                          ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_CALLHOME_DATA );
        l_entrySizeAfter = l_handle2->iv_userDetails.iv_entrySize;
        // Call home data is clamped to the (larger) call home max size
        CHECK_CONDITION( l_entrySizeAfter == MAX_ERRL_CALL_HOME_SZ, l_rc);

        // Add "user details" with size 76 for l_handle3
        l_entrySizeBefore = l_handle3->iv_userDetails.iv_entrySize;
        memset( G_data, 0xEE, sizeof( G_data ) );
        addUsrDtlsToErrl( l_handle3, G_data, 76,
                          ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA );
        l_entrySizeAfter = l_handle3->iv_userDetails.iv_entrySize;
        // (header + 76) is the size that add to entry
        CHECK_CONDITION( l_entrySizeAfter == (l_entrySizeBefore+sizeof(ErrlUserDetailsEntry_t)+76), l_rc);

        dumpLog( l_handle, l_handle->iv_userDetails.iv_entrySize );
        dumpLog( l_handle2, l_handle2->iv_userDetails.iv_entrySize );
        dumpLog( l_handle3, l_handle3->iv_userDetails.iv_entrySize );

        commitErrl( &l_handle );
        commitErrl( &l_handle2 );
        commitErrl( &l_handle3 );
        ERRL_DBG("Slots after Commit - 3 slots should be used/committed");
        ppdumpslot();

        deleteErrl(&l_handleX);
        deleteErrl(&l_handle2X);
        deleteErrl(&l_handle3X);
        // deleteErrl must NULL out each handle it frees
        CHECK_CONDITION( (l_handleX == NULL) &&
                         (l_handle2X == NULL) &&
                         (l_handle3X == NULL), l_rc);
        ERRL_DBG("Slots after delete Log - All slots should be empty");
        ppdumpslot();

        /****************************************************/
        // Test size limit for addUsrDtlsToErrl with continuous calls
        // Create log with 512 bytes trace
        l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                               ERRL_SEV_PREDICTIVE, g_trac_inf, 512, 0x1, 0x2);
        CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc);

        // l_handle will set to NULL after calling the commitErrl, so we need to store it
        l_handleX = l_handle;
        ppdumpslot();

        // add 256 bytes of "user details" (512+256)
        l_entrySizeBefore = l_handle->iv_userDetails.iv_entrySize;
        memset( G_data, 0xAA, sizeof( G_data ) );
        addUsrDtlsToErrl( l_handle, G_data, 256,
                          ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA );
        l_entrySizeAfter = l_handle->iv_userDetails.iv_entrySize;
        ERRL_DBG("Slots after create + 256 bytes" );
        ppdumpslot();
        // (header + 256) is the size that add to entry
        CHECK_CONDITION( l_entrySizeAfter == (l_entrySizeBefore+sizeof(ErrlUserDetailsEntry_t)+256), l_rc);

        // add 512 bytes of "user details" (512+256+512)
        l_entrySizeBefore = l_handle->iv_userDetails.iv_entrySize;
        memset( G_data, 0xBB, sizeof( G_data ) );
        addUsrDtlsToErrl( l_handle, G_data, 512,
                          ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA );
        l_entrySizeAfter = l_handle->iv_userDetails.iv_entrySize;
        ERRL_DBG("Slots after create + 256 + 512 bytes");
        ppdumpslot();
        // (header + 512) is the size that add to entry
        CHECK_CONDITION( l_entrySizeAfter == (l_entrySizeBefore+sizeof(ErrlUserDetailsEntry_t)+512), l_rc);

        // add 1024 bytes of "user details" (512+256+512+1024), the entry size is more than 2048 now
        l_entrySizeBefore = l_handle->iv_userDetails.iv_entrySize;
        memset( G_data, 0xCC, sizeof( G_data ) );
        addUsrDtlsToErrl( l_handle, G_data, 1024,
                          ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA );
        l_entrySizeAfter = l_handle->iv_userDetails.iv_entrySize;
        ERRL_DBG("Slots after create + 256 + 512 +1024 bytes");
        ppdumpslot();
        // (header + 1024) is the size that add to entry; total clamped to max
        CHECK_CONDITION( l_entrySizeAfter <= MAX_ERRL_ENTRY_SZ, l_rc); // @at012c

        commitErrl( &l_handle );
        deleteErrl(&l_handleX);
        ERRL_DBG("Slots should now be empty");
        ppdumpslot();

        ERRL_DBG("END \n");
    }while(0);

    return l_rc;
}
// Function Specification
//
// Name: errlTestErrorHandling
//
// Description: Negative-path tests for the error-log API.  Feeds each
//              interface (createErrl, addTraceToErrl, commitErrl,
//              deleteErrl, addCalloutToErrl, addUsrDtlsToErrl,
//              setErrlSevToInfo) invalid arguments (out-of-range severity,
//              NULL / INVALID_ERR_HNDL handles, NULL buffers, zero sizes,
//              already-committed logs) and verifies each call is rejected
//              without modifying the log entry.  Returns 0 if every
//              CHECK_CONDITION passed, non-zero otherwise.
//
// End Function Specification
uint32_t errlTestErrorHandling()
{
    uint32_t l_rc = 0;                 // 0 = all checks passed so far
    errlHndl_t l_errlHnd = NULL;
    uint8_t l_dataPtr[10];             // scratch payload for user-details calls
    uint16_t l_entrySizeBefore = 0;    // entry size sampled before each call
    uint16_t l_entrySizeAfter = 0;     // entry size sampled after each call
    ERRL_DBG(" START");

    do
    {
        /****************************************************/
        // Test createErrl with incorrect parameter
        // Set ERRL_SEVERITY to 0x04, out of range so log won't be created
        l_errlHnd = createErrl(TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                               0x04, NULL, 0, 0x01, 0x02);
        CHECK_CONDITION( l_errlHnd == INVALID_ERR_HNDL, l_rc);

        /****************************************************/
        // Test addTraceToErrl with incorrect parameter
        // Create a log
        l_errlHnd = createErrl(TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                               ERRL_SEV_PREDICTIVE, NULL, 0, 0x01, 0x02);
        CHECK_CONDITION( l_errlHnd != INVALID_ERR_HNDL, l_rc);

        // i_trace = NULL, so entry size doesn't change
        l_entrySizeBefore = l_errlHnd->iv_userDetails.iv_entrySize;
        addTraceToErrl(NULL, 5, l_errlHnd);
        l_entrySizeAfter = l_errlHnd->iv_userDetails.iv_entrySize;
        CHECK_CONDITION(l_entrySizeBefore == l_entrySizeAfter, l_rc);

        // i_traceSz = 0, entry size doesn't change
        l_entrySizeBefore = l_errlHnd->iv_userDetails.iv_entrySize;
        addTraceToErrl(g_trac_inf, 0, l_errlHnd);           // @at012c
        l_entrySizeAfter = l_errlHnd->iv_userDetails.iv_entrySize;
        CHECK_CONDITION( l_entrySizeBefore == l_entrySizeAfter, l_rc);

        // io_err = NULL, entry size doesn't change
        l_entrySizeBefore = l_errlHnd->iv_userDetails.iv_entrySize;
        addTraceToErrl(g_trac_inf, 32, NULL);               // @at012c
        l_entrySizeAfter = l_errlHnd->iv_userDetails.iv_entrySize;
        CHECK_CONDITION( l_entrySizeBefore == l_entrySizeAfter, l_rc);

        // test addTraceToErrl after log is committed so entry size doesn't change
        // (commitErrl NULLs the caller's handle, so keep a second copy)
        errlHndl_t l_errlHndx = l_errlHnd;
        commitErrl(&l_errlHnd);
        l_entrySizeBefore = l_errlHndx->iv_userDetails.iv_entrySize;
        addTraceToErrl(g_trac_inf, 32, l_errlHndx);         // @at012c
        l_entrySizeAfter = l_errlHndx->iv_userDetails.iv_entrySize;
        CHECK_CONDITION( l_entrySizeBefore == l_entrySizeAfter, l_rc);
        deleteErrl(&l_errlHndx);
        CHECK_CONDITION( l_errlHndx == NULL, l_rc);

        // io_err = INVALID_ERR_HNDL
        // We are making sure that this function handles a INVALID_ERR_HNDL
        // being passed, and that we can't verify if an error occurred by
        // checking anything.  (It would just cause a TLB exception)
        l_errlHnd = INVALID_ERR_HNDL;
        addTraceToErrl(g_trac_inf, 32, l_errlHnd);

        /****************************************************/
        // Test commitErrl with incorrect parameter
        // io_err = NULL
        // We are making sure that this function handles a NULL being passed,
        // and that we can't verify if an error occurred by checking anything.
        // (It would just cause a TLB exception)
        commitErrl( NULL);

        // l_errlHnd should be set to NULL
        l_errlHnd = INVALID_ERR_HNDL;
        commitErrl(&l_errlHnd);
        CHECK_CONDITION( l_errlHnd == NULL, l_rc);

        /****************************************************/
        // Test deleteErrl with incorrect parameter
        // io_err = NULL
        // We are making sure that this function handles a NULL being passed,
        // and that we can't verify if an error occurred by checking anything.
        // (It would just cause a TLB exception)
        deleteErrl( NULL);

        // l_errlHnd should be set to NULL
        l_errlHnd = INVALID_ERR_HNDL;
        deleteErrl(&l_errlHnd);
        CHECK_CONDITION( l_errlHnd == NULL, l_rc);

        /****************************************************/
        // Test addCalloutToErrl with incorrect parameter
        // Set io_err to NULL — must be tolerated without crashing
        addCalloutToErrl(NULL, ERRL_CALLOUT_TYPE_HUID, 0, ERRL_CALLOUT_PRIORITY_LOW);

        // Set io_err to INVALID_ERR_HNDL — must be tolerated without crashing
        addCalloutToErrl(INVALID_ERR_HNDL, ERRL_CALLOUT_TYPE_HUID, 0,
                         ERRL_CALLOUT_PRIORITY_LOW);

        /****************************************************/
        // Test addUsrDtlsToErrl with incorrect parameter
        // Create a log
        l_errlHnd = createErrl(TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                               ERRL_SEV_PREDICTIVE, NULL, 0, 0x01, 0x02);
        CHECK_CONDITION( l_errlHnd != INVALID_ERR_HNDL, l_rc);

        // io_err = NULL — must be tolerated without crashing
        addUsrDtlsToErrl(NULL, l_dataPtr, 10,
                         ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA);

        // io_err = INVALID_ERR_HNDL — must be tolerated without crashing
        addUsrDtlsToErrl(INVALID_ERR_HNDL, l_dataPtr, 10,
                         ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA);

        // i_dataPtr = NULL so entry size doesn't change
        l_entrySizeBefore = l_errlHnd->iv_userDetails.iv_entrySize;
        addUsrDtlsToErrl(l_errlHnd, NULL, 10,
                         ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA);
        l_entrySizeAfter = l_errlHnd->iv_userDetails.iv_entrySize;
        CHECK_CONDITION( l_entrySizeBefore == l_entrySizeAfter, l_rc);

        // i_size = 0 so entry size doesn't change
        l_entrySizeBefore = l_errlHnd->iv_userDetails.iv_entrySize;
        addUsrDtlsToErrl(l_errlHnd, l_dataPtr, 0,
                         ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA);
        l_entrySizeAfter = l_errlHnd->iv_userDetails.iv_entrySize;
        CHECK_CONDITION( l_entrySizeBefore == l_entrySizeAfter, l_rc);

        // test addUsrDtlsToErrl after log is committed so entry size doesn't change
        l_errlHndx = l_errlHnd;
        commitErrl(&l_errlHnd);
        l_entrySizeBefore = l_errlHndx->iv_userDetails.iv_entrySize;
        addUsrDtlsToErrl(l_errlHndx, l_dataPtr, 10,
                         ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA);
        l_entrySizeAfter = l_errlHndx->iv_userDetails.iv_entrySize;
        CHECK_CONDITION( l_entrySizeBefore == l_entrySizeAfter, l_rc);
        deleteErrl(&l_errlHndx);
        CHECK_CONDITION( l_errlHndx == NULL, l_rc);

        /****************************************************/
        // Test setErrlSevToInfo with incorrect parameter
        // Set io_err to NULL — must be tolerated without crashing
        setErrlSevToInfo(NULL);

        // Set io_err to INVALID_ERR_HNDL — must be tolerated without crashing
        setErrlSevToInfo(INVALID_ERR_HNDL);
    }while(0);

    return l_rc;
}
// Function Specification // // Name: errlTestCreateMaxLogs // // Description: errlTestCreateMaxLogs // // End Function Specification uint32_t errlTestCreateMaxLogs() { uint32_t l_rc = 0; ERRL_DBG("START"); do { /****************************************************/ // Check max logs ERRL_SEVERITY l_sev = 0; errlHndl_t l_backupHandle[ERRL_MAX_SLOTS-2]; errlHndl_t l_handle = NULL; uint32_t l_index = 0; // Create 7 ERRL_SEV_PREDICTIVE or ERRL_SEV_UNRECOVERABLE slots randomly for(l_index =0; l_index < ERRL_MAX_SLOTS-2; l_index++) { uint64_t l_time = ssx_timebase_get(); l_sev = l_time%2 ? ERRL_SEV_PREDICTIVE : ERRL_SEV_UNRECOVERABLE; l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC, l_sev, g_trac_inf, 512, 0x1, l_index); CHECK_CONDITION( (l_handle != INVALID_ERR_HNDL) && (l_handle != NULL), l_rc); // backup handle l_backupHandle[l_index] = l_handle; ERRL_DBG("Log Created @ %p with Sev: %d\n",l_handle, l_sev ); // addUsrDtlsToErrl memset( G_data, l_index, sizeof( G_data ) ); addUsrDtlsToErrl( l_handle, G_data, sizeof(G_data), ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA ); // commitErrl( &l_handle ); } // check if something wrong in for loop if(l_rc != 0) break; // Create one more and it should fail l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC, l_sev, g_trac_inf, 512, 0x1, l_index); CHECK_CONDITION( l_handle == INVALID_ERR_HNDL, l_rc); // delete errl for(l_index = 0; l_index < ERRL_MAX_SLOTS-2; l_index++) { deleteErrl(&l_backupHandle[l_index]); } ppdumpslot(); /****************************************************/ // Check log id overflow for(l_index = 0; l_index < 256; l_index++) { l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC, l_sev, g_trac_inf, 512, 0x1, l_index); CHECK_CONDITION( (l_handle != INVALID_ERR_HNDL) && (l_handle != NULL), l_rc); deleteErrl(&l_handle); } ERRL_DBG("END \n"); }while(0); return l_rc; }
// Function Specification // // Name: errlTestCallouts // // Description: errlTestCallouts // // End Function Specification uint32_t errlTestCallouts() { uint32_t l_rc = 0; ERRL_DBG("START"); do { errlHndl_t l_handle = NULL; ERRL_DBG("--------------------------------\n"); /****************************************************/ // Check max callouts l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC, ERRL_SEV_PREDICTIVE,g_trac_inf, 128, 0x1, 0x2); CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc); ERRL_CALLOUT_PRIORITY l_array[8] = { ERRL_CALLOUT_PRIORITY_HIGH, ERRL_CALLOUT_PRIORITY_MED, ERRL_CALLOUT_PRIORITY_LOW, ERRL_CALLOUT_PRIORITY_HIGH, ERRL_CALLOUT_PRIORITY_MED, ERRL_CALLOUT_PRIORITY_MED, ERRL_CALLOUT_PRIORITY_LOW, ERRL_CALLOUT_PRIORITY_LOW, }; ERRL_CALLOUT_TYPE l_type[8] = { ERRL_CALLOUT_TYPE_HUID, ERRL_CALLOUT_TYPE_COMPONENT_ID, ERRL_CALLOUT_TYPE_HUID, ERRL_CALLOUT_TYPE_COMPONENT_ID, ERRL_CALLOUT_TYPE_HUID, ERRL_CALLOUT_TYPE_COMPONENT_ID, ERRL_CALLOUT_TYPE_HUID, ERRL_CALLOUT_TYPE_COMPONENT_ID, }; // add 6 (ERRL_MAX_CALLOUTS) callouts uint8_t l_index = 0; for(l_index = 0; l_index < ERRL_MAX_CALLOUTS; l_index++) { ERRL_DBG("current callouts %d attempting to add callout # %d with type %d ,priority %d", l_handle->iv_numCallouts, l_index, l_type[l_index], l_array[l_index] ); addCalloutToErrl(l_handle,l_type[l_index],l_index,l_array[l_index]); } CHECK_CONDITION( l_handle->iv_numCallouts == ERRL_MAX_CALLOUTS, l_rc); // add one more callout and it should fail addCalloutToErrl(l_handle,l_type[0],l_index,l_array[0]); CHECK_CONDITION( l_handle->iv_numCallouts == ERRL_MAX_CALLOUTS, l_rc); dumpLog( l_handle, l_handle->iv_userDetails.iv_entrySize ); deleteErrl( &l_handle ); ppdumpslot(); /****************************************************/ // Check callouts after errl is committed // Create log l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC, ERRL_SEV_PREDICTIVE,g_trac_inf, 32, 0x1, 0x2); errlHndl_t l_log = l_handle; CHECK_CONDITION( 
l_handle != INVALID_ERR_HNDL, l_rc); // Commit log and add callout. But adding callout should fail commitErrl( &l_handle ); addCalloutToErrl(l_handle,l_type[0],0,l_array[0]); CHECK_CONDITION( l_log->iv_numCallouts == ERRL_MAX_CALLOUTS, l_rc); deleteErrl(&l_log); /****************************************************/ // Check addCalloutToErrl for ERRL_SEV_INFORMATIONAL log // Create ERRL_SEV_INFORMATIONAL log l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC, ERRL_SEV_INFORMATIONAL,g_trac_inf, 128, 0x1, 0x2); CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc); if(l_handle == INVALID_ERR_HNDL) // add one callout and it should fail addCalloutToErrl(l_handle,l_type[0],l_index,l_array[0]); CHECK_CONDITION( l_handle->iv_numCallouts == 0, l_rc); dumpLog( l_handle, l_handle->iv_userDetails.iv_entrySize ); deleteErrl( &l_handle ); ppdumpslot(); ERRL_DBG("END \n"); }while(0); return l_rc; }
// Schedule a GPE request for the specified DIMM state.
//
// i_state  - DIMM_STATE_* value the GPE1 DIMM I2C state machine should run.
// Returns  - true if the GPE request was scheduled, false otherwise
//            (previous request still running, invalid state, or schedule
//            failure; the last two also request an OCC reset).
bool schedule_dimm_req(uint8_t i_state)
{
    bool l_scheduled = false;
    bool scheduleRequest = true;    // cleared if i_state is not recognized

    DIMM_DBG("dimm_sm called with state 0x%02X (tick=%d)", i_state, DIMM_TICK);

    // Only one GPE request may be outstanding at a time
    if (!async_request_is_idle(&G_dimm_sm_request.request))
    {
        INTR_TRAC_ERR("dimm_sm: request is not idle.");
    }
    else
    {
        // Validate the requested state; the cases carry no per-state work
        // here, they only whitelist the known states.
        switch(i_state)
        {
            // Init
            case DIMM_STATE_INIT:
                break;

            // Read DIMM temp
            case DIMM_STATE_WRITE_MODE:
            case DIMM_STATE_WRITE_ADDR:
            case DIMM_STATE_INITIATE_READ:
            case DIMM_STATE_READ_TEMP:
                break;

            // I2C reset
            case DIMM_STATE_RESET_MASTER:
            case DIMM_STATE_RESET_SLAVE_P0:
            case DIMM_STATE_RESET_SLAVE_P0_COMPLETE:
            case DIMM_STATE_RESET_SLAVE_P1:
            case DIMM_STATE_RESET_SLAVE_P1_COMPLETE:
                break;

            default:
                INTR_TRAC_ERR("dimm_sm: Invalid state (0x%02X)", i_state);
                errlHndl_t err = NULL;
                /*
                 * @errortype
                 * @moduleid    DIMM_MID_DIMM_SM
                 * @reasoncode  DIMM_INVALID_STATE
                 * @userdata1   DIMM state
                 * @userdata2   0
                 * @devdesc     Invalid DIMM I2C state requested
                 */
                err = createErrl(DIMM_MID_DIMM_SM,
                                 DIMM_INVALID_STATE,
                                 OCC_NO_EXTENDED_RC,
                                 ERRL_SEV_PREDICTIVE,
                                 NULL,
                                 DEFAULT_TRACE_SIZE,
                                 i_state,
                                 0);
                // Request reset since this should never happen.
                REQUEST_RESET(err);
                scheduleRequest = false;
                break;
        }

        if (scheduleRequest)
        {
            // Clear errors and init common arguments for GPE
            G_dimm_sm_args.error.error = 0;
            G_dimm_sm_args.state = i_state;

            DIMM_DBG("dimm_sm: Scheduling GPE1 DIMM I2C state 0x%02X (tick %d)",
                     i_state, DIMM_TICK);
            int l_rc = gpe_request_schedule(&G_dimm_sm_request);
            if (0 == l_rc)
            {
                l_scheduled = true;
            }
            else
            {
                errlHndl_t l_err = NULL;
                INTR_TRAC_ERR("dimm_sm: schedule failed w/rc=0x%08X (%d us)",
                              l_rc,
                              (int) ((ssx_timebase_get())/(SSX_TIMEBASE_FREQUENCY_HZ/1000000)));
                /*
                 * @errortype
                 * @moduleid    DIMM_MID_DIMM_SM
                 * @reasoncode  SSX_GENERIC_FAILURE
                 * @userdata1   GPE schedule returned rc code
                 * @userdata2   state
                 * @devdesc     dimm_sm schedule failed
                 */
                l_err = createErrl(DIMM_MID_DIMM_SM,
                                   SSX_GENERIC_FAILURE,
                                   ERC_DIMM_SCHEDULE_FAILURE,
                                   ERRL_SEV_PREDICTIVE,
                                   NULL,
                                   DEFAULT_TRACE_SIZE,
                                   l_rc,
                                   i_state);
                // Request reset since this should never happen.
                REQUEST_RESET(l_err);
            }
        }
    }

    return l_scheduled;
} // end schedule_dimm_req()
////////////////////////// // Function Specification // // Name: amec_gpu_pcap // // Description: Determine power cap for GPUs // // Thread: Real Time Loop // // End Function Specification void amec_gpu_pcap(bool i_oversubscription, bool i_active_pcap_changed, int32_t i_avail_power) { /*------------------------------------------------------------------------*/ /* Local Variables */ /*------------------------------------------------------------------------*/ uint8_t i = 0; uint32_t l_gpu_cap_mw = 0; uint16_t l_system_gpu_total_pcap = 0; // total GPU pcap required by system based on if currently in oversub or not static uint16_t L_total_gpu_pcap = 0; // Current total GPU pcap in effect static uint16_t L_n_plus_1_mode_gpu_total_pcap = 0; // Total GPU pcap required for N+1 (not in oversubscription) static uint16_t L_n_mode_gpu_total_pcap = 0; // Total GPU pcap required for oversubscription static uint16_t L_active_psr_gpu_total_pcap = 0; // Total GPU pcap for the currently set pcap and PSR static uint16_t L_per_gpu_pcap = 0; // Amount of L_total_gpu_pcap for each GPU static uint8_t L_psr = 100; // PSR value used in L_active_psr_gpu_total_pcap calculation static bool L_first_run = TRUE; // for calculations done only 1 time static uint32_t L_last_pcap_traced[MAX_NUM_GPU_PER_DOMAIN] = {0}; /*------------------------------------------------------------------------*/ /* Code */ /*------------------------------------------------------------------------*/ // If this is the first time running calculate the total GPU power cap for system power caps (N and N+1) if(L_first_run) { // calculate total GPU power cap for oversubscription if(g_amec->pcap.ovs_node_pcap > G_sysConfigData.total_non_gpu_max_pwr_watts) { // Take all non-GPU power away from the oversubscription power cap L_n_mode_gpu_total_pcap = g_amec->pcap.ovs_node_pcap - G_sysConfigData.total_non_gpu_max_pwr_watts; // Add back in the power that will be dropped by processor DVFS and memory throttling and give to GPUs 
L_n_mode_gpu_total_pcap += G_sysConfigData.total_proc_mem_pwr_drop_watts; } else { // This should not happen, the total non GPU power should never be higher than the N mode cap // Log error and set GPUs to minimum power cap L_n_mode_gpu_total_pcap = 0; // this will set minimum GPU power cap TRAC_ERR("amec_gpu_pcap: non GPU max power %dW is more than N mode pwr limit %dW", G_sysConfigData.total_non_gpu_max_pwr_watts, g_amec->pcap.ovs_node_pcap); /* @ * @errortype * @moduleid AMEC_GPU_PCAP_MID * @reasoncode GPU_FAILURE * @userdata1 N mode Power Cap watts * @userdata2 Total non-GPU power watts * @userdata4 ERC_GPU_N_MODE_PCAP_CALC_FAILURE * @devdesc Total non-GPU power more than N mode power cap * */ errlHndl_t l_err = createErrl(AMEC_GPU_PCAP_MID, GPU_FAILURE, ERC_GPU_N_MODE_PCAP_CALC_FAILURE, ERRL_SEV_PREDICTIVE, NULL, DEFAULT_TRACE_SIZE, g_amec->pcap.ovs_node_pcap, G_sysConfigData.total_non_gpu_max_pwr_watts); //Callout firmware addCalloutToErrl(l_err, ERRL_CALLOUT_TYPE_COMPONENT_ID, ERRL_COMPONENT_ID_FIRMWARE, ERRL_CALLOUT_PRIORITY_HIGH); commitErrl(&l_err); } // calculate total GPU power cap for N+1 (not in oversubscription) if(G_sysConfigData.pcap.system_pcap > G_sysConfigData.total_non_gpu_max_pwr_watts) { // Take all non-GPU power away from the N+1 power cap L_n_plus_1_mode_gpu_total_pcap = G_sysConfigData.pcap.system_pcap - G_sysConfigData.total_non_gpu_max_pwr_watts; // Add back in the power that will be dropped by processor DVFS and memory throttling and give to GPUs L_n_plus_1_mode_gpu_total_pcap += G_sysConfigData.total_proc_mem_pwr_drop_watts; } else { // This should not happen, the total non GPU power should never be higher than the N+1 mode cap // Log error and set GPUs to minimum power cap L_n_plus_1_mode_gpu_total_pcap = 0; // this will set minimum GPU power cap TRAC_ERR("amec_gpu_pcap: non GPU max power %dW is more than N+1 mode pwr limit %dW", G_sysConfigData.total_non_gpu_max_pwr_watts, G_sysConfigData.pcap.system_pcap); /* @ * @errortype * 
@moduleid AMEC_GPU_PCAP_MID * @reasoncode GPU_FAILURE * @userdata1 N+1 mode Power Cap watts * @userdata2 Total non-GPU power watts * @userdata4 ERC_GPU_N_PLUS_1_MODE_PCAP_CALC_FAILURE * @devdesc Total non-GPU power more than N+1 mode power cap * */ errlHndl_t l_err = createErrl(AMEC_GPU_PCAP_MID, GPU_FAILURE, ERC_GPU_N_PLUS_1_MODE_PCAP_CALC_FAILURE, ERRL_SEV_PREDICTIVE, NULL, DEFAULT_TRACE_SIZE, G_sysConfigData.pcap.system_pcap, G_sysConfigData.total_non_gpu_max_pwr_watts); //Callout firmware addCalloutToErrl(l_err, ERRL_CALLOUT_TYPE_COMPONENT_ID, ERRL_COMPONENT_ID_FIRMWARE, ERRL_CALLOUT_PRIORITY_HIGH); commitErrl(&l_err); } } // if first run // Calculate the total GPU power cap for the current active limit and PSR // this only needs to be calculated if either the active limit or PSR changed if( (L_first_run) || (i_active_pcap_changed) || (L_psr != G_sysConfigData.psr) ) { L_psr = G_sysConfigData.psr; if(g_amec->pcap.active_node_pcap > G_sysConfigData.total_non_gpu_max_pwr_watts) { // Take all non-GPU power away from the active power cap L_active_psr_gpu_total_pcap = g_amec->pcap.active_node_pcap - G_sysConfigData.total_non_gpu_max_pwr_watts; // Add back in the power that will be dropped by processor DVFS and memory throttling based on the PSR // to give to GPUs L_active_psr_gpu_total_pcap += ( (L_psr / 100) * G_sysConfigData.total_proc_mem_pwr_drop_watts ); } else { // Set GPUs to minimum power cap L_active_psr_gpu_total_pcap = 0; TRAC_IMP("amec_gpu_pcap: non GPU max power %dW is more than active pwr limit %dW", G_sysConfigData.total_non_gpu_max_pwr_watts, g_amec->pcap.active_node_pcap); } // Total GPU power cap is the lower of system (N+1 or oversubscription depending on if in oversub) // and the active power limit. 
We do not need to always account for oversubscription since // the automatic hw power brake will assert to the GPUs if there is a problem when oversub is // entered from the time OCC can set and GPUs react to a new power limit if(i_oversubscription) { // system in oversubscription use N mode cap l_system_gpu_total_pcap = L_n_mode_gpu_total_pcap; } else { // system is not in oversubscription use N+1 mode cap l_system_gpu_total_pcap = L_n_plus_1_mode_gpu_total_pcap; } L_total_gpu_pcap = (l_system_gpu_total_pcap < L_active_psr_gpu_total_pcap) ? l_system_gpu_total_pcap : L_active_psr_gpu_total_pcap; // Divide the total equally across all GPUs in the system if(G_first_num_gpus_sys) { L_per_gpu_pcap = L_total_gpu_pcap / G_first_num_gpus_sys; } else { L_per_gpu_pcap = 0; TRAC_ERR("amec_gpu_pcap: Called with no GPUs present!"); } } // Setup to send new power limit to GPUs. The actual sending of GPU power limit will be handled by task_gpu_sm() for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++) { // Before sending a GPU a power limit the power limits must be read from the GPU to know min/max GPU allows if( GPU_PRESENT(i) && g_amec->gpu[i].pcap.pwr_limits_read ) { l_gpu_cap_mw = L_per_gpu_pcap * 1000; // convert W to mW // GPU is present and have min/max power limits from GPU // clip the GPU power limit to min/max GPU limit if needed if(l_gpu_cap_mw < g_amec->gpu[i].pcap.gpu_min_pcap_mw) // clip to min? { l_gpu_cap_mw = g_amec->gpu[i].pcap.gpu_min_pcap_mw; } else if(l_gpu_cap_mw > g_amec->gpu[i].pcap.gpu_max_pcap_mw) // clip to max? 
{ l_gpu_cap_mw = g_amec->gpu[i].pcap.gpu_max_pcap_mw; } // check if this is a new power limit if(g_amec->gpu[i].pcap.gpu_desired_pcap_mw != l_gpu_cap_mw) { if( (g_amec->gpu[i].pcap.gpu_desired_pcap_mw != 0) || (L_last_pcap_traced[i] != l_gpu_cap_mw) ) { L_last_pcap_traced[i] = l_gpu_cap_mw; TRAC_IMP("amec_gpu_pcap: Updating GPU%d desired pcap %dmW to %dmW", i, g_amec->gpu[i].pcap.gpu_desired_pcap_mw, l_gpu_cap_mw); } g_amec->gpu[i].pcap.gpu_desired_pcap_mw = l_gpu_cap_mw; } } } // for each GPU L_first_run = FALSE; }
void task_centaur_control( task_t * i_task ) { errlHndl_t l_err = NULL; // Error handler int rc = 0; // Return code uint32_t l_cent; amec_centaur_t *l_cent_ptr = NULL; static uint8_t L_scom_timeout[MAX_NUM_CENTAURS] = {0}; //track # of consecutive failures static bool L_gpe_scheduled = FALSE; static uint8_t L_gpe_fail_logged = 0; static bool L_gpe_idle_traced = FALSE; static bool L_gpe_had_1_tick = FALSE; // Pointer to the task data structure centaur_control_task_t * l_centControlTask = (centaur_control_task_t *) i_task->data_ptr; // Pointer to parameter field for GPE request GpeScomParms * l_parms = (GpeScomParms *)(l_centControlTask->gpe_req.parameter); do { l_cent = l_centControlTask->curCentaur; l_cent_ptr = &g_amec->proc[0].memctl[l_cent].centaur; //First, check to see if the previous GPE request still running //A request is considered idle if it is not attached to any of the //asynchronous request queues if( !(async_request_is_idle(&l_centControlTask->gpe_req.request)) ) { L_scom_timeout[l_cent]++; //This can happen due to variability in when the task runs if(!L_gpe_idle_traced && L_gpe_had_1_tick) { TRAC_INFO("task_centaur_control: GPE is still running. cent[%d]", l_cent); l_centControlTask->traceThresholdFlags |= CENTAUR_CONTROL_GPE_STILL_RUNNING; L_gpe_idle_traced = TRUE; } L_gpe_had_1_tick = TRUE; break; } else { //Request is idle L_gpe_had_1_tick = FALSE; if(L_gpe_idle_traced) { TRAC_INFO("task_centaur_control: GPE completed. cent[%d]", l_cent); L_gpe_idle_traced = FALSE; } } //check scom status if(L_gpe_scheduled) { if(!async_request_completed(&l_centControlTask->gpe_req.request) || l_parms->rc) { if(!(L_gpe_fail_logged & (CENTAUR0_PRESENT_MASK >> l_cent))) { // Check if the centaur has a channel checkstop. If it does, // then do not log any errors. We also don't want to throttle // a centaur that is in this condition. 
if(!(cent_chan_checkstop(l_cent))) { L_gpe_fail_logged |= CENTAUR0_PRESENT_MASK >> l_cent; TRAC_ERR("task_centaur_control: gpe_scom_centaur failed. l_cent=%d rc=%x, index=0x%08x", l_cent, l_parms->rc, l_parms->errorIndex); /* @ * @errortype * @moduleid CENT_TASK_CONTROL_MOD * @reasoncode CENT_SCOM_ERROR * @userdata1 rc - Return code of scom operation * @userdata2 index of scom operation that failed * @userdata4 OCC_NO_EXTENDED_RC * @devdesc OCC access to centaur failed */ l_err = createErrl( CENT_TASK_CONTROL_MOD, // modId CENT_SCOM_ERROR, // reasoncode OCC_NO_EXTENDED_RC, // Extended reason code ERRL_SEV_PREDICTIVE, // Severity NULL, // Trace Buf DEFAULT_TRACE_SIZE, // Trace Size l_parms->rc, // userdata1 l_parms->errorIndex // userdata2 ); addUsrDtlsToErrl(l_err, //io_err (uint8_t *) &(l_centControlTask->gpe_req.ffdc), //i_dataPtr, sizeof(PoreFfdc), //i_size ERRL_USR_DTL_STRUCT_VERSION_1, //version ERRL_USR_DTL_BINARY_DATA); //type //callout the centaur addCalloutToErrl(l_err, ERRL_CALLOUT_TYPE_HUID, G_sysConfigData.centaur_huids[l_cent], ERRL_CALLOUT_PRIORITY_MED); //callout the processor addCalloutToErrl(l_err, ERRL_CALLOUT_TYPE_HUID, G_sysConfigData.proc_huid, ERRL_CALLOUT_PRIORITY_MED); commitErrl(&l_err); } }//if(l_gpe_fail_logged & (CENTAUR0_PRESENT_MASK >> l_cent)) //Request failed. Keep count of failures and request a reset if we reach a //max retry count L_scom_timeout[l_cent]++; if(L_scom_timeout[l_cent] == CENTAUR_CONTROL_SCOM_TIMEOUT) { break; } }//if(!async_request_completed(&l_centControlTask->gpe_req.request) || l_parms->rc) else { //request completed successfully. reset the timeout. L_scom_timeout[l_cent] = 0; } }//if(L_gpe_scheduled)
// Function Specification // // Name: task_check_for_checkstop // // Description: Check for checkstop // // End Function Specification void task_check_for_checkstop(task_t *i_self) { pore_status_t l_gpe0_status; ocb_oisr0_t l_oisr0_status; static bool L_checkstop_traced = FALSE; uint8_t l_reason_code = 0; do { // This check is disabled once a checkstop or frozen GPE is detected if(L_checkstop_traced) { break; } // Looked for a frozen GPE, a sign that the chip has stopped working or // check-stopped. This check also looks for an interrupt status flag that // indicates if the system has check-stopped. l_gpe0_status.value = in64(PORE_GPE0_STATUS); l_oisr0_status.value = in32(OCB_OISR0); if (l_gpe0_status.fields.freeze_action || l_oisr0_status.fields.check_stop) { errlHndl_t l_err = NULL; if (l_gpe0_status.fields.freeze_action) { TRAC_IMP("Frozen GPE0 detected by RTL"); l_reason_code = OCC_GPE_HALTED; } if (l_oisr0_status.fields.check_stop) { TRAC_IMP("System checkstop detected by RTL"); l_reason_code = OCC_SYSTEM_HALTED; } L_checkstop_traced = TRUE; /* * @errortype * @moduleid MAIN_SYSTEM_HALTED_MID * @reasoncode OCC_GPE_HALTED * @userdata1 High order word of PORE_GPE0_STATUS * @userdata2 OCB_OISR0 * @devdesc OCC detected frozen GPE0 */ /* * @errortype * @moduleid MAIN_SYSTEM_HALTED_MID * @reasoncode OCC_SYSTEM_HALTED * @userdata1 High order word of PORE_GPE0_STATUS * @userdata2 OCB_OISR0 * @devdesc OCC detected system checkstop */ l_err = createErrl(MAIN_SYSTEM_HALTED_MID, l_reason_code, OCC_NO_EXTENDED_RC, ERRL_SEV_INFORMATIONAL, NULL, DEFAULT_TRACE_SIZE, l_gpe0_status.words.high_order, l_oisr0_status.value); // The commit code will check for the frozen GPE0 and system // checkstop conditions and take appropriate actions. commitErrl(&l_err); } } while(0); }
TRAC_ERR("PMC Failure detected through OISR0[9]!!!"); /* @ * @moduleid PMC_HW_ERROR_ISR * @reasonCode PMC_FAILURE * @severity ERRL_SEV_PREDICTIVE * @userdata1 0 * @userdata2 0 * @userdata4 OCC_NO_EXTENDED_RC * @devdesc Failure detected in processor * power management controller (PMC) */ l_err = createErrl( PMC_HW_ERROR_ISR, // i_modId, PMC_FAILURE, // i_reasonCode, OCC_NO_EXTENDED_RC, ERRL_SEV_PREDICTIVE, NULL, // tracDesc_t i_trace, DEFAULT_TRACE_SIZE, // i_traceSz, 0, // i_userData1, 0); // i_userData2 //Add our register dump to the error log addUsrDtlsToErrl(l_err, (uint8_t*) &l_pmc_ffdc, sizeof(l_pmc_ffdc), ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_BINARY_DATA); //Add firmware callout addCalloutToErrl(l_err, ERRL_CALLOUT_TYPE_COMPONENT_ID, ERRL_COMPONENT_ID_FIRMWARE,
// Function Specification
//
// Name: querySensorList
//
// Description: Query sensor list.  NOTE: the entire implementation is
//              currently compiled out with "#if 0" (pending AMEC/DCOM
//              support), so this function always returns NULL.  The
//              disabled body filters the global sensor list by start GSID,
//              presence, type and location masks, filling the caller's
//              output buffers up to the requested count.
//
// End Function Specification
errlHndl_t querySensorList(const querySensorListArg_t * i_argPtr)
{
    errlHndl_t l_err = NULL;

    /* TEMP -- NOT SUPPORTED ( NEED AMEC/DCOM ) */
#if 0
    if (i_argPtr != NULL)
    {
        // Unpack the argument structure
        uint16_t i_startGsid = i_argPtr->i_startGsid;
        uint8_t i_present = i_argPtr->i_present;
        uint16_t i_type = i_argPtr->i_type;
        uint16_t i_loc = i_argPtr->i_loc;
        uint16_t * io_numOfSensors = i_argPtr->io_numOfSensors;   // in: capacity, out: count filled
        sensorQueryList_t * o_sensors = i_argPtr->o_sensors;
        sensor_info_t * o_sensorInfoPtrs= i_argPtr->o_sensorInfoPtrs;

        // Validate input parameters
        if( (i_startGsid >= NUMBER_OF_SENSORS_IN_LIST) ||
            ((o_sensors == NULL) && (o_sensorInfoPtrs ==NULL)) ||
            (io_numOfSensors == NULL))
        {
            TRAC_ERR("querySensorList: Invalid input pointers OR start GSID is out of range: "
                     "i_startGsid: 0x%x, G_amec_sensor_count: 0x%x",
                     i_startGsid,G_amec_sensor_count);
            /* @
             * @errortype
             * @moduleid    SENSOR_QUERY_LIST
             * @reasoncode  INTERNAL_INVALID_INPUT_DATA
             * @userdata1   i_startGsid -- passed in Global Sensor ID
             * @userdata2   G_amec_sensor_count -- number of OCC sensors
             * @userdata4   OCC_NO_EXTENDED_RC
             * @devdesc     Firmware failure caused due to invalid GSID passed
             */
            /* @
             * @errortype
             * @moduleid    SENSOR_QUERY_LIST
             * @reasoncode  INTERNAL_FAILURE
             * @userdata1   i_startGsid -- passed in Global Sensor ID
             * @userdata2   G_amec_sensor_count -- number of OCC sensors
             * @userdata4   OCC_NO_EXTENDED_RC
             * @devdesc     NULL pointer passed for querySensorList output args
             */
            l_err = createErrl(SENSOR_QUERY_LIST,                   //modId
                               ((i_startGsid >= NUMBER_OF_SENSORS_IN_LIST) ?
                                INTERNAL_INVALID_INPUT_DATA :
                                INTERNAL_FAILURE),                  //reasoncode
                               OCC_NO_EXTENDED_RC,                  //Extended reason code
                               ERRL_SEV_PREDICTIVE,                 //Severity
                               NULL,                                //Trace Buf
                               0,                                   //Trace Size
                               i_startGsid,                         //userdata1
                               G_amec_sensor_count                  //userdata2
                               );
        }
        else
        {
            uint32_t l_cnt = i_startGsid;
            uint32_t l_num = *io_numOfSensors;   // caller-supplied capacity
            *io_numOfSensors = 0;

            // Traverse through sensor list starting at i_startGsid to find
            // matching sensor. Return it in the output variable
            for (; (l_cnt < NUMBER_OF_SENSORS_IN_LIST &&
                    ((*io_numOfSensors) < l_num)); l_cnt++)
            {
                // If sample value is not zero then it means sensor is present.
                // This is currently only used by debug/mfg purpose
                // If user is looking for present sensors and sample is zero,
                // then don't include current sensor in the query list
                if ((i_present) && (G_amec_sensor_list[l_cnt]->sample == 0))
                {
                    continue;
                }

                // If user is NOT looking for any sensor type and input type,
                // does not match the current sensor type, then don't include
                // current sensor in the query list
                if ((i_type & G_sensor_info[l_cnt].sensor.type) == 0)
                {
                    continue;
                }

                // If user is NOT looking for any sensor location and input loc,
                // does not match the current sensor location, then don't include
                // current sensor in the query list
                if ((i_loc & G_sensor_info[l_cnt].sensor.location) == 0)
                {
                    continue;
                }

                if (o_sensors != NULL)
                {
                    // All conditions match.  Include current sensor in the query list
                    // Copy gsid, name and sample
                    o_sensors->gsid = l_cnt;
                    strncpy(o_sensors->name, G_sensor_info[l_cnt].name, MAX_SENSOR_NAME_SZ);
                    o_sensors->sample = G_amec_sensor_list[l_cnt]->sample;
                    o_sensors++;
                }

                if (o_sensorInfoPtrs != NULL)
                {
                    memcpy(o_sensorInfoPtrs, &G_sensor_info[l_cnt], sizeof(sensor_info_t));
                    o_sensorInfoPtrs++;
                }

                (*io_numOfSensors)++;
            }
        }
    }
    else
    {
        TRAC_ERR("querySensorList: Invalid argument pointer = NULL");
        /* @
         * @errortype
         * @moduleid    SENSOR_QUERY_LIST
         * @reasoncode  INTERNAL_INVALID_INPUT_DATA
         * @userdata1   NULL
         * @userdata2   NULL
         * @userdata4   ERC_ARG_POINTER_FAILURE
         * @devdesc     NULL pointer passed to querySensorList applet
         */
        l_err = createErrl( SENSOR_QUERY_LIST,              // Module ID
                            INTERNAL_INVALID_INPUT_DATA,    // Reason Code
                            ERC_ARG_POINTER_FAILURE,        // Extended reason code
                            ERRL_SEV_PREDICTIVE,            // Severity
                            NULL,                           // Trace
                            0,                              // Trace Size
                            0,                              // UserData 1
                            0                               // UserData 2
                            );
    }
#endif
    return l_err;
}