// Create/Build an Error log and add HTMGT component trace void bldErrLog(errlHndl_t & io_err, const uint8_t i_modid, const uint16_t i_rc, const uint32_t i_data1, const uint32_t i_data2, const uint32_t i_data3, const uint32_t i_data4, const ERRORLOG::errlSeverity_t i_sev, const bool i_addFwCallout) { TMGT_INF("bldErrLog(mod: 0x%02X, rc: 0x%02X, data: 0x%08X %08X %08X" " %08X, sev: 0x%02X, fw:%c", i_modid, i_rc, i_data1, i_data2, i_data3, i_data4, i_sev, i_addFwCallout?'y':'n'); // TODO RTC 124739 - RAS review what logs need fw callout if (NULL == io_err) { io_err = new ERRORLOG::ErrlEntry(i_sev, i_modid, i_rc, ((uint64_t)i_data1 << 32) | i_data2, ((uint64_t)i_data3 << 32) | i_data4, i_addFwCallout); io_err->collectTrace("HTMGT"); } else { // TODO RTC 124739 // - collectTrace will not filter dup traces and no way to clear // - no way to add secondary SRC to elog io_err->collectTrace("HTMGT"); uint32_t additionalSrc[] = { uint32_t(HTMGT_COMP_ID | i_rc), uint32_t(i_modid), uint32_t(i_sev), uint32_t(i_addFwCallout?1:0), i_data1, i_data2, i_data3, i_data4 }; io_err->addFFDC(HTMGT_COMP_ID, additionalSrc, sizeof(additionalSrc), 1, // version SUBSEC_ADDITIONAL_SRC); } }
// Move the OCCs to active state or log unrecoverable error and // stay in safe mode void processOccStartStatus(const bool i_startCompleted, TARGETING::Target * i_failedOccTarget) { TMGT_INF(">>processOccStartStatus(%d,0x%p)", i_startCompleted, i_failedOccTarget); errlHndl_t l_err = NULL; uint32_t l_huid = 0; if (i_failedOccTarget) { l_huid = TARGETING::get_huid(i_failedOccTarget); } TMGT_INF("processOccStartStatus(Start Success=%c, failedOcc=0x%08X)", i_startCompleted?'y':'n', l_huid); if (i_startCompleted) { // Query functional OCCs l_err = OccManager::buildOccs(); if (NULL == l_err) { if (NULL != OccManager::getMasterOcc()) { do { #ifndef __HOSTBOOT_RUNTIME // Build pstate tables (once per IPL) l_err = genPstateTables(); if(l_err) { break; } // Calc memory throttles (once per IPL) calcMemThrottles(); #endif // Make sure OCCs are ready for communication OccManager::waitForOccCheckpoint(); #ifdef __HOSTBOOT_RUNTIME // TODO RTC 124738 Final solution TBD // Perhapse POLL scom 0x6a214 until bit 31 is set? nanosleep(1,0); #endif // Send poll to establish comm TMGT_INF("Send initial poll to all OCCs to" " establish comm"); l_err = OccManager::sendOccPoll(); if (l_err) { // Continue even if failed (poll will be retried) ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); } // Send ALL config data sendOccConfigData(); // Set the User PCAP l_err = sendOccUserPowerCap(); if (l_err) { break; } // Wait for all OCCs to go to the target state l_err = waitForOccState(); if ( l_err ) { break; } // Set active sensors for all OCCs, // so BMC can start communication with OCCs l_err = setOccActiveSensors(true); if (l_err) { // Continue even if failed to update sensor ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); } } while(0); } else { TMGT_ERR("Unable to find any Master capable OCCs"); /*@ * @errortype * @reasoncode HTMGT_RC_OCC_MASTER_NOT_FOUND * @moduleid HTMGT_MOD_LOAD_START_STATUS * @userdata1[0:7] number of OCCs * @devdesc No OCC master was found */ bldErrLog(l_err, HTMGT_MOD_LOAD_START_STATUS, HTMGT_RC_OCC_MASTER_NOT_FOUND, OccManager::getNumOccs(), 0, 0, 0, ERRORLOG::ERRL_SEV_INFORMATIONAL); } } else { // Failed to find functional OCCs, no need to try again // Set original error log as unrecoverable and commit l_err->setSev(ERRORLOG::ERRL_SEV_UNRECOVERABLE); ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); } } else { TMGT_ERR("All OCCs were not loaded/started successfully"); /*@ * @errortype * @reasoncode HTMGT_RC_OCC_START_FAIL * @moduleid HTMGT_MOD_LOAD_START_STATUS * @userdata1 Failing OCC HUID * @devdesc OCCs were not loaded/started successfully */ bldErrLog(l_err, HTMGT_MOD_LOAD_START_STATUS, HTMGT_RC_OCC_START_FAIL, l_huid, 0, 0, 0, ERRORLOG::ERRL_SEV_INFORMATIONAL); } if (NULL != l_err) { TMGT_ERR("OCCs not all active. Attempting OCC Reset"); TMGT_CONSOLE("OCCs are not active (rc=0x%04X). " "Attempting OCC Reset", l_err->reasonCode()); TMGT_INF("Calling resetOccs"); errlHndl_t err2 = OccManager::resetOccs(NULL); if(err2) { TMGT_ERR("OccManager::resetOccs failed with 0x%04X", err2->reasonCode()); // Set original error log as unrecoverable and commit l_err->setSev(ERRORLOG::ERRL_SEV_UNRECOVERABLE); ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); // Commit occReset error ERRORLOG::errlCommit(err2, HTMGT_COMP_ID); } else { // retry worked - commit original error as informational l_err->setSev(ERRORLOG::ERRL_SEV_INFORMATIONAL); ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); } } TMGT_INF("<<processOccStartStatus()"); } // end processOccStartStatus()
// Set the OCC state errlHndl_t enableOccActuation(bool i_occActivation) { TMGT_INF(">>enableOccActuation(%c)", i_occActivation?'Y':'N'); errlHndl_t l_err = NULL; TARGETING::Target* sys = NULL; // If the system is already in safemode then can't talk to OCCs TARGETING::targetService().getTopLevelTarget(sys); uint8_t safeMode = 0; if(sys) { sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode); } if (0 == safeMode) { occStateId targetState = OCC_STATE_ACTIVE; if (false == i_occActivation) { targetState = OCC_STATE_OBSERVATION; } // Set state for all OCCs l_err = OccManager::setOccState(targetState); if (NULL == l_err) { TMGT_INF("enableOccActuation: OCC states updated to 0x%02X", targetState); } if (OccManager::occNeedsReset()) { if (l_err) { // Commit setOccState elog since OCCs will be reset // and recovery attempted. ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); } TMGT_ERR("enableOccActuation(): OCCs need to be reset"); // Don't pass failed target as OCC should have already // been marked as failed during the poll. l_err = OccManager::resetOccs(NULL); // NOTE: If the system exceeded its reset count and ended up // in safe mode an error may not be returned here (if a // failure happened after the first reset attempt). // This is because the resets are recursive: // HTMGT calls back into HBRT to initiate the reset, then // HBRT calls into HTMGT when reset completed // To detected this condition we need to check for safe mode // after the recovery attempts and return error if in safe. if(sys) { sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode); } } } if ((NULL == l_err) && safeMode) { // Create an elog so the user knows the cmd failed. TMGT_ERR("enableOccActuation(): System is in safe mode"); /*@ * @errortype * @reasoncode HTMGT_RC_OCC_CRIT_FAILURE * @moduleid HTMGT_MOD_ENABLE_OCC_ACTUATION * @userdata1 OCC activate [1==true][0==false] * @devdesc Operation not allowed, system is in safe mode */ bldErrLog(l_err, HTMGT_MOD_ENABLE_OCC_ACTUATION, HTMGT_RC_OCC_CRIT_FAILURE, 0, i_occActivation, 0, safeMode, ERRORLOG::ERRL_SEV_UNRECOVERABLE); } TMGT_INF("<<enableOccActuation() returning 0x%04X", (l_err==NULL) ? 0 : l_err->reasonCode()); return l_err; } // end enableOccActuation()
// Notify HTMGT that an OCC has failed and needs to be reset void processOccReset(TARGETING::Target * i_proc) { TMGT_INF(">>processOccReset(0x%p)", i_proc); errlHndl_t errl = NULL; TARGETING::Target * failedOccTarget = NULL; TARGETING::Target* sys = NULL; TARGETING::targetService().getTopLevelTarget(sys); uint8_t safeMode = 0; // If the system is in safemode then ignore request to reset OCCs if(sys && sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode) && safeMode) { return; } // Get functional OCC (one per proc) TARGETING::TargetHandleList pOccs; getChildChiplets(pOccs, i_proc, TARGETING::TYPE_OCC); if (pOccs.size() > 0) { failedOccTarget = pOccs[0]; } if(NULL != failedOccTarget) { uint32_t huid = failedOccTarget->getAttr<TARGETING::ATTR_HUID>(); TMGT_INF("processOccReset(HUID=0x%08X) called", huid); } else { uint32_t huid = i_proc->getAttr<TARGETING::ATTR_HUID>(); TMGT_INF("processOccReset: Invalid OCC target (proc huid=0x08X)" "resetting OCCs anyway", huid); /*@ * @errortype * @reasoncode HTMGT_RC_INVALID_PARAMETER * @moduleid HTMGT_MOD_PROCESS_OCC_RESET * @userdata1[0:7] Processor HUID * @devdesc No OCC target found for proc Target, */ bldErrLog(errl, HTMGT_MOD_PROCESS_OCC_RESET, HTMGT_RC_INVALID_PARAMETER, huid, 0, 0, 1, ERRORLOG::ERRL_SEV_INFORMATIONAL); // Add HB firmware callout errl->addProcedureCallout(HWAS::EPUB_PRC_HB_CODE, HWAS::SRCI_PRIORITY_MED); ERRORLOG::errlCommit(errl, HTMGT_COMP_ID); // sets errl to NULL } errl = OccManager::resetOccs(failedOccTarget); if(errl) { ERRORLOG::errlCommit(errl, HTMGT_COMP_ID); // sets errl to NULL } TMGT_INF("<<processOccReset()"); } // end processOccReset()
// Notify HTMGT that an OCC has an error to report void processOccError(TARGETING::Target * i_procTarget) { TMGT_INF(">>processOccError(0x%p)", i_procTarget); TARGETING::Target* sys = NULL; TARGETING::targetService().getTopLevelTarget(sys); uint8_t safeMode = 0; // If the system is in safemode then can't talk to OCCs - // ignore call to processOccError if(sys && sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode) && safeMode) { return; } bool polledOneOcc = false; errlHndl_t err = OccManager::buildOccs(); if (NULL == err) { if (i_procTarget != NULL) { const uint32_t l_huid = i_procTarget->getAttr<TARGETING::ATTR_HUID>(); TMGT_INF("processOccError(HUID=0x%08X) called", l_huid); TARGETING::TargetHandleList pOccs; getChildChiplets(pOccs, i_procTarget, TARGETING::TYPE_OCC); if (pOccs.size() > 0) { // Poll specified OCC flushing any errors errlHndl_t err = OccManager::sendOccPoll(true, pOccs[0]); if (err) { ERRORLOG::errlCommit(err, HTMGT_COMP_ID); } polledOneOcc = true; } } if ((OccManager::getNumOccs() > 1) || (false == polledOneOcc)) { // Send POLL command to all OCCs to flush any other errors errlHndl_t err = OccManager::sendOccPoll(true); if (err) { ERRORLOG::errlCommit(err, HTMGT_COMP_ID); } } if (OccManager::occNeedsReset()) { TMGT_ERR("processOccError(): OCCs need to be reset"); // Don't pass failed target as OCC should have already // been marked as failed during the poll. errlHndl_t err = OccManager::resetOccs(NULL); if(err) { ERRORLOG::errlCommit(err, HTMGT_COMP_ID); } } } else { // OCC build failed... TMGT_ERR("processOccError() called, but unable to find OCCs"); ERRORLOG::errlCommit(err, HTMGT_COMP_ID); } TMGT_INF("<<processOccError()"); } // end processOccError()
// Send pass-thru command to HTMGT errlHndl_t passThruCommand(uint16_t i_cmdLength, uint8_t * i_cmdData, uint16_t & o_rspLength, uint8_t * o_rspData) { errlHndl_t err = NULL; htmgtReasonCode failingSrc = HTMGT_RC_NO_ERROR; o_rspLength = 0; if ((i_cmdLength > 0) && (NULL != i_cmdData)) { switch (i_cmdData[0]) { case PASSTHRU_OCC_STATUS: TMGT_INF("passThruCommand: OCC Status"); OccManager::getOccData(o_rspLength, o_rspData); break; case PASSTHRU_GENERATE_MFG_PSTATE: if (i_cmdLength == 1) { TMGT_INF("passThruCommand: Generate MFG pstate tables", i_cmdData[1]); err = genPstateTables(false); } else { TMGT_ERR("passThruCommand: invalid generate pstate " "command length %d", i_cmdLength); /*@ * @errortype * @reasoncode HTMGT_RC_INVALID_LENGTH * @moduleid HTMGT_MOD_PASS_THRU * @userdata1 command data[0-7] * @userdata2 command data length * @devdesc Invalid pass thru command data length */ failingSrc = HTMGT_RC_INVALID_LENGTH; } break; case PASSTHRU_LOAD_PSTATE: if (i_cmdLength == 2) { const uint8_t pstateType = i_cmdData[1]; if ((0 == pstateType) || (1 == pstateType)) { TMGT_INF("passThruCommand: Load pstate tables " "(type: %d)", pstateType); // 0 = Normal Pstate Tables err = OccManager::loadPstates(0 == pstateType); } else { TMGT_ERR("passThruCommand: invalid pstate type " "specified: %d", pstateType); /*@ * @errortype * @reasoncode HTMGT_RC_INVALID_PARAMETER * @moduleid HTMGT_MOD_PASS_THRU * @userdata1 command data[0-7] * @userdata2 command data length * @devdesc Invalid load pstate table type */ failingSrc = HTMGT_RC_INVALID_PARAMETER; } } else { TMGT_ERR("passThruCommand: invalid load pstate " "command length %d", i_cmdLength); failingSrc = HTMGT_RC_INVALID_LENGTH; } break; default: TMGT_ERR("passThruCommand: Invalid command 0x%08X " "(%d bytes)", UINT32_GET(i_cmdData), i_cmdLength); /*@ * @errortype * @reasoncode HTMGT_RC_INVALID_DATA * @moduleid HTMGT_MOD_PASS_THRU * @userdata1 command data[0-7] * @userdata2 command data length * @devdesc Invalid pass thru command */ failingSrc = HTMGT_RC_INVALID_DATA; break; } if ((HTMGT_RC_NO_ERROR != failingSrc) && (NULL == err)) { bldErrLog(err, HTMGT_MOD_PASS_THRU, failingSrc, UINT32_GET(i_cmdData), UINT32_GET(&i_cmdData[4]), 0, i_cmdLength, ERRORLOG::ERRL_SEV_INFORMATIONAL); } } return err; } // end passThruCommand()
// Handle OCC poll response void Occ::pollRspHandler(const uint8_t * i_pollResponse, const uint16_t i_pollResponseSize) { static uint32_t L_elog_retry_count = 0; TMGT_DBG("OCC Poll Response", i_pollResponse, i_pollResponseSize); const occPollRspStruct_t *pollRsp = (occPollRspStruct_t *) i_pollResponse; const occPollRspStruct_t *lastPollRsp = (occPollRspStruct_t *) iv_lastPollResponse; // Trace if any data changed if ((false == iv_lastPollValid) || (memcmp(pollRsp, lastPollRsp, OCC_POLL_DATA_MIN_SIZE) != 0)) { TMGT_INF("OCC%d Poll change: Status:%04X Occs:%02X Cfg:%02X " "State:%02X Error:%06X/%08X", iv_instance, (pollRsp->status << 8) | pollRsp->extStatus, pollRsp->occsPresent, pollRsp->requestedCfg, pollRsp->state, (pollRsp->errorId<<16) | pollRsp->errorLength, pollRsp->errorAddress); } do { if (false == iv_commEstablished) { // 1st poll response, so comm has been established for this OCC iv_commEstablished = true; TMGT_INF("pollRspHandler: FW Level for OCC%d: %.16s", iv_instance, pollRsp->codeLevel); } // Check for Error Logs if (pollRsp->errorId != 0) { if ((pollRsp->errorId != lastPollRsp->errorId) || (L_elog_retry_count < 3)) { if (pollRsp->errorId == lastPollRsp->errorId) { // Only retry same errorId a few times... L_elog_retry_count++; TMGT_ERR("pollRspHandler: Requesting elog 0x%02X" " (retry %d)", pollRsp->errorId, L_elog_retry_count); } else { L_elog_retry_count = 0; } // Handle a new error log from the OCC occProcessElog(this, pollRsp->errorId, pollRsp->errorAddress, pollRsp->errorLength); if (iv_needsReset) { // Update state if changed... // (since dropping out of poll rsp handler) if (iv_state != pollRsp->state) { iv_state = (occStateId)pollRsp->state; TMGT_INF("pollRspHandler: updating OCC%d state" " to %s", iv_instance, state_string(iv_state)); } break; } } } if ((OCC_STATE_ACTIVE == pollRsp->state) || (OCC_STATE_OBSERVATION == pollRsp->state)) { // Check role status if (((OCC_ROLE_SLAVE == iv_role) && ((pollRsp->status & OCC_STATUS_MASTER) != 0)) || ((OCC_ROLE_MASTER == iv_role) && ((pollRsp->status & OCC_STATUS_MASTER) == 0))) { TMGT_ERR("pollRspHandler: OCC%d Status role mismatch" " (role:0x%02X, status:0x%02X 0x%02X)", iv_instance, iv_role, pollRsp->status, pollRsp->extStatus); iv_needsReset = true; // TODO RTC 109224 //iv_resetReason = OCC_RESET_REASON_ERROR; break; } } //iv_requestedFormat = (occCfgDataFormat)pollRsp->requestedCfg; if (pollRsp->requestedCfg != 0x00) { TMGT_INF("pollRspHandler: OCC%d is requesting cfg format" " 0x%02X", iv_instance, pollRsp->requestedCfg); } // Check for state change if (iv_state != pollRsp->state) { iv_state = (occStateId)pollRsp->state; TMGT_INF("pollRspHandler: updating OCC%d state to %s", iv_instance, state_string(iv_state)); } // Copy rspData to lastPollResponse memcpy(iv_lastPollResponse, pollRsp, OCC_POLL_DATA_MIN_SIZE); iv_lastPollValid = true; } while(0); // NOTE: When breaking out of the above while loop, the new poll // response is NOT copied to lastPollResponse (should only // break when reset required) if (true == iv_needsReset) { // Save full poll response memcpy(iv_lastPollResponse, pollRsp, OCC_POLL_DATA_MIN_SIZE); iv_lastPollValid = true; iv_state = (occStateId)pollRsp->state; } } // end Occ::pollRspHandler()