//Analyze a subset of chips in a Domain... //This is a mini analysis of some of the chips in the Fabric Domain. int32_t FabricDomain::AnalyzeTheseChips(STEP_CODE_DATA_STRUCT & serviceData, ATTENTION_TYPE attentionType, TARGETING::TargetHandleList & i_chips) { using namespace TARGETING ; PRDF_DENTER( "FabricDomain::AnalyzeTheseChips" ); int32_t l_rc = ~SUCCESS; PRDF_DTRAC( "FabricDomain::AnalyzeTheseChips:: Domain ID = 0x%X", GetId() ); if(i_chips.size() != 0) { for (TargetHandleList::iterator i = i_chips.begin(); i != i_chips.end(); ++i) { PRDF_DTRAC( "FabricDomain::AnalyzeTheseChips::Before--chip=0x%X", PlatServices::getHuid(*i)); } OrderTheseChips(attentionType, i_chips); for (TargetHandleList::iterator i = i_chips.begin(); i != i_chips.end(); ++i) { PRDF_DTRAC( "FabricDomain::AnalyzeTheseChips::After--chip=0x%X", PlatServices::getHuid(*i) ); } //After the Order function is called the first chip should contain the chip to look at. //Look here for the correct LookUp function. I don't think this is working. RuleChip * l_fabChip = FindChipInTheseChips(i_chips[0], i_chips); PRDF_DTRAC( "FabricDomain::AnalyzeTheseChips::Analyzing this one: 0x%X", l_fabChip->GetId() ); if(NULL != l_fabChip) { l_rc = l_fabChip->Analyze(serviceData, attentionType); } else { PRDF_DTRAC( "FabricDomain::AnalyzeTheseChips::l_fabChip is NULL" ); l_rc = ~SUCCESS; } } else { PRDF_DTRAC( "FabricDomain::AnalyzeTheseChips::i_chips = %d", i_chips.size() ); } //Get P7 chip Global FIR data for FFDC for (TargetHandleList::iterator i = i_chips.begin(); i != i_chips.end(); ++i) { RuleChip * l_fabChip = FindChipInTheseChips(*i, i_chips); l_fabChip->CaptureErrorData( serviceData.service_data->GetCaptureData(), Util::hashString("GlobalFIRs")); } PRDF_DEXIT( "FabricDomain::AnalyzeTheseChips" ); return l_rc; }
//****************************************************************************** // host_gard function //****************************************************************************** void* host_gard( void *io_pArgs ) { TRACDCOMP( ISTEPS_TRACE::g_trac_isteps_trace, "host_gard entry" ); errlHndl_t errl; do { // Check whether we're in MPIPL mode TARGETING::Target* l_pTopLevel = NULL; targetService().getTopLevelTarget( l_pTopLevel ); HWAS_ASSERT(l_pTopLevel, "HWAS host_gard: no TopLevelTarget"); if (l_pTopLevel->getAttr<ATTR_IS_MPIPL_HB>()) { TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, "MPIPL mode"); // we only want EX units to be processed TARGETING::PredicateCTM l_exFilter(TARGETING::CLASS_UNIT, TARGETING::TYPE_EX); errl = collectGard(&l_exFilter); if (errl) { TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, "collectGard returned error; breaking out"); break; } } else { TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, "Normal IPL mode"); errl = collectGard(); if(errl) { TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, "collectGard returned error; breaking out"); break; } if (errl == NULL) { // check and see if we still have enough hardware to continue errl = checkMinimumHardware(); if(errl) { TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, "check minimum hardware returned error; breaking out"); break; } } // If targets are deconfigured as a result of host_gard, they are // done so using the PLID as the reason for deconfiguration. 
This // triggers the reconfigure loop attribute to be set, which causes // undesirable behavior, so we need to reset it here: // Read current value TARGETING::ATTR_RECONFIGURE_LOOP_type l_reconfigAttr = l_pTopLevel->getAttr<TARGETING::ATTR_RECONFIGURE_LOOP>(); // Turn off deconfigure bit l_reconfigAttr &= ~TARGETING::RECONFIGURE_LOOP_DECONFIGURE; // Write back to attribute l_pTopLevel->setAttr<TARGETING::ATTR_RECONFIGURE_LOOP> (l_reconfigAttr); } // Send message to FSP sending HUID of EX chip associated with // master core msg_t * core_msg = msg_allocate(); core_msg->type = SBE::MSG_IPL_MASTER_CORE; const TARGETING::Target* l_masterCore = TARGETING::getMasterCore( ); /*@ errorlog tag * @errortype ERRL_SEV_CRITICAL_SYS_TERM * @moduleid MOD_HOST_GARD * @reasoncode RC_MASTER_CORE_NULL * @userdata1 0 * @userdata2 0 * @devdesc HWAS host_gard: no masterCore found */ if (l_masterCore == NULL) { TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, "No masterCore Found" ); const bool hbSwError = true; errl = new ERRORLOG::ErrlEntry (ERRORLOG::ERRL_SEV_CRITICAL_SYS_TERM, HWAS::MOD_HOST_GARD, HWAS::RC_MASTER_CORE_NULL, 0, 0, hbSwError); break; } // Get the EX chip associated with the master core as that is the // chip that // has the IS_MASTER_EX attribute associated with it TARGETING::TargetHandleList targetList; getParentAffinityTargets(targetList, l_masterCore, TARGETING::CLASS_UNIT, TARGETING::TYPE_EX); HWAS_ASSERT(targetList.size() == 1, "HWAS host_gard: Incorrect EX chip(s) associated with masterCore"); core_msg->data[0] = 0; core_msg->data[1] = TARGETING::get_huid( targetList[0] ); core_msg->extra_data = NULL; TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, "Sending MSG_MASTER_CORE message with HUID %08x", core_msg->data[1]); errl = MBOX::send(MBOX::IPL_SERVICE_QUEUE,core_msg); if (errl) { TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, ERR_MRK"MBOX::send failed sending Master Core message"); msg_free(core_msg); break; } } while (0); TRACDCOMP( ISTEPS_TRACE::g_trac_isteps_trace, 
"host_gard exit" ); return errl; }
//***************************************************************************** // resetBackupTopology //***************************************************************************** errlHndl_t resetBackupTopology( uint32_t i_oscPos, const TARGETING::TargetHandle_t& i_procOscTgt, const TARGETING::TargetHandleList& i_badChipList, bool i_informPhyp) { TOD_ENTER("resetBackupTopology"); errlHndl_t l_err = nullptr; // Put the handle to the firmware_request request struct // out here so it is easier to free later hostInterfaces::hbrt_fw_msg *l_req_fw_msg = nullptr; hostInterfaces::hbrt_fw_msg *l_resp_fw_msg = nullptr; do { if ((nullptr == g_hostInterfaces) || (nullptr == g_hostInterfaces->firmware_request)) { TOD_ERR("resetBackupTopology: " "Hypervisor firmware_request interface not linked"); /*@ * @errortype * @severity ERRL_SEV_UNRECOVERABLE * @moduleid TOD_RT_TOPOLOGY_RESET_BACKUP * @reasoncode TOD_RT_NULL_FIRMWARE_REQUEST_PTR * @userdata1 None * @userdata2 None * @devdesc Host interfaces are not initialized * @custdesc An internal error occurred. This will * force the Time of Day function to run * with complete redundancy. 
*/ l_err = new ErrlEntry( ERRL_SEV_UNRECOVERABLE, TOD_RT_TOPOLOGY_RESET_BACKUP, TOD_RT_NULL_FIRMWARE_REQUEST_PTR, 0, 0, true); break; } // The format of the data to be sent, according to the document // "Handle PRD Request for resetting backup TOD topology" is as follows // All data members below are 4 bytes long (32 bits) // Ordinal ID - 0xFFFFFFFF means no OSC to be avoided // HUID of the node - This field should be considered only if Ordinal // ID is NOT set to 0xFFFFFFFF otherwise it is set // to 0 // HUID of the first processor // HUID of the second processor, etc // Check if we get conflicting data, if so send a Trace out if ((0xFFFFFFFF == i_oscPos) && (nullptr != i_procOscTgt)) { TOD_ERR("Conflicting input data, input oscillator position " "(i_oscPos) has value 0xFFFFFFFF, meaning no oscillator " "to be avoided but input oscillator target (i_procOscTgt) " "has a valid value" ); } else if ((0xFFFFFFFF != i_oscPos) && (nullptr == i_procOscTgt)) { TOD_ERR("Conflicting input data, input oscillator position " "(i_oscPos) has value 0x%X, meaning avoid oscillator " "but input oscillator target (i_procOscTgt) " "has a NULL value", i_oscPos); } // Flag to determine if the OSC data will be added to the data bool l_addOscData = (0xFFFFFFFF != i_oscPos) && (nullptr != i_procOscTgt); // Default the request data size to the size of the // GenericFspMboxMessage_t minus the size of the // GenericFspMboxMessage_t's data. The size of the // GenericFspMboxMessage_t's data will be added later uint32_t l_req_data_size = sizeof(GenericFspMboxMessage_t) - sizeof(GenericFspMboxMessage_t::data); // Add to the request data size iff there is data needing to be passed if (i_badChipList.size() > 0) { // if the bad chip list has any items then increase size to // accommodate for an ordinal ID and a HUID, regardless if // they have relevant data or not, because they are expected // before the HUID list. 
l_req_data_size += (MSG_OSC_SIZE_OF_DETAILS * sizeof(uint32_t)) + (i_badChipList.size() * sizeof(uint32_t)); } else if (l_addOscData) { // if there is a valid OSC then accommodate for an ordinal ID // and HUID of node, but don't need space for HUID list because, // if we are here, the list is empty l_req_data_size += (MSG_OSC_SIZE_OF_DETAILS * sizeof(uint32_t)); } // The request data size must be at a minimum the size of the // FSP generic message (sizeof(GenericFspMboxMessage_t)) if (l_req_data_size < sizeof(GenericFspMboxMessage_t)) { l_req_data_size = sizeof(GenericFspMboxMessage_t); } // Calculate the TOTAL size of hostInterfaces::hbrt_fw_msg which // means only adding hostInterfaces::HBRT_FW_MSG_BASE_SIZE to // the previous calculated request data size uint64_t l_req_fw_msg_size = hostInterfaces::HBRT_FW_MSG_BASE_SIZE + l_req_data_size; // Create the firmware_request request struct to send data l_req_fw_msg = (hostInterfaces::hbrt_fw_msg *)malloc(l_req_fw_msg_size); // Initialize the firmware_request request struct l_req_fw_msg->generic_msg.initialize(); // Populate the firmware_request request struct with given data l_req_fw_msg->io_type = hostInterfaces::HBRT_FW_MSG_HBRT_FSP_REQ; l_req_fw_msg->generic_msg.dataSize = l_req_data_size; l_req_fw_msg->generic_msg.msgq = MBOX::FSP_TOD_MSGQ; l_req_fw_msg->generic_msg.msgType = (false == i_informPhyp ? 
GenericFspMboxMessage_t::MSG_TOD_BACKUP_RESET: GenericFspMboxMessage_t::MSG_TOD_BACKUP_RESET_INFORM_PHYP); l_req_fw_msg->generic_msg.__req = GenericFspMboxMessage_t::REQUEST; // A convenient way to populate the data uint32_t* l_dataPtr = reinterpret_cast<uint32_t*>(&(l_req_fw_msg->generic_msg.data)); if (i_badChipList.size() > 0) { // set the ordinal ID l_dataPtr[MSG_OSC_ORDINAL_ID_LOC] = i_oscPos; // attach the HUIDs from bad chip list to end of struct size_t i = MSG_OSC_HUIDS_LOC; for (auto l_target : i_badChipList) { l_dataPtr[i++] = GETHUID(l_target); } } // Set the HUID of the ordinal node if need be if (l_addOscData) { // set the ordinal ID l_dataPtr[MSG_OSC_ORDINAL_ID_LOC] = i_oscPos; // Get the parent node target TARGETING::TargetHandleList l_list; TARGETING::targetService().getAssociated(l_list, i_procOscTgt, TARGETING::TargetService::PARENT, TARGETING::TargetService::IMMEDIATE); if (l_list.size() == 1) { l_dataPtr[MSG_OSC_ORDINAL_NODE_HUID_LOC] = GETHUID(l_list[0]); } else { /*@ * @errortype * @severity ERRL_SEV_UNRECOVERABLE * @moduleid TOD_RT_TOPOLOGY_RESET_BACKUP * @reasoncode TOD_INVALID_TARGET * @userdata1 The number of parents found osc target * @userdata2 None * @devdesc No/Multiple parent(s) found for * processor osc target * @custdesc An internal error occurred. This will * force the Time of Day function to run * with complete redundancy. 
*/ l_err = new ErrlEntry(ERRL_SEV_UNRECOVERABLE, TOD_RT_TOPOLOGY_RESET_BACKUP, TOD_INVALID_TARGET, l_list.size(), 0, true); break; } } // Create the firmware_request response struct to receive data // NOTE: For messages to the FSP the response size must match // the request size uint64_t l_resp_fw_msg_size = l_req_fw_msg_size; l_resp_fw_msg = (hostInterfaces::hbrt_fw_msg *)malloc(l_resp_fw_msg_size); memset(l_resp_fw_msg, 0, l_resp_fw_msg_size); // Trace out the request structure TRACFBIN(ISTEPS_TRACE::g_trac_isteps_trace, INFO_MRK"TOD::Sending firmware_request", l_req_fw_msg, l_req_fw_msg_size); // Make the firmware_request call l_err = firmware_request_helper(l_req_fw_msg_size, l_req_fw_msg, &l_resp_fw_msg_size, l_resp_fw_msg); if (l_err) { break; } } while (0); // Release the firmware messages free(l_req_fw_msg); free(l_resp_fw_msg); l_req_fw_msg = l_resp_fw_msg = nullptr; TOD_EXIT("resetBackupTopology"); return l_err; } // end resetBackupTopology
int32_t FabricDomain::OrderTheseChips(ATTENTION_TYPE attentionType, TARGETING::TargetHandleList & i_chips) { using namespace PluginDef; using namespace TARGETING; PRDF_DENTER( "FabricDomain::OrderTheseChips" ); uint32_t l_internalOnlyCount = 0; uint64_t l_externalDrivers[i_chips.size()]; uint64_t l_wofValues[i_chips.size()]; bool l_internalCS[i_chips.size()]; union { uint64_t * u; CPU_WORD * c; } ptr; uint32_t l_chip = 0; uint32_t l_chipToFront = 0; // Get internal setting and external driver list for each chip. for (TargetHandleList::iterator i = i_chips.begin(); i != i_chips.end(); ++i) { RuleChip * l_fabChip = FindChipInTheseChips(*i, i_chips); ptr.u = &l_externalDrivers[l_chip]; BitString l_externalChips(i_chips.size(), ptr.c); TargetHandleList l_tmpList; if(l_fabChip != NULL) { // Call "GetCheckstopInfo" plugin. ExtensibleChipFunction * l_extFunc = l_fabChip->getExtensibleFunction("GetCheckstopInfo"); (*l_extFunc)(l_fabChip, bindParm<bool &, TargetHandleList &, uint64_t &> (l_internalCS[l_chip], l_tmpList, l_wofValues[l_chip] ) ); } else { l_internalCS[l_chip] = false; PRDF_DTRAC( "FabricDomain::OrderTheseChips: l_fabChip is NULL" ); } //If we are just checking for internal errors then there is no need for //a list of what chips sent checkstops where. // Update bit buffer. for (TargetHandleList::iterator j = l_tmpList.begin(); j != l_tmpList.end(); ++j) { for (uint32_t k = 0; k < i_chips.size(); k++) if ((*j) == LookUp(k)->GetChipHandle()) l_externalChips.Set(k); }; // Check if is internal. if (l_internalCS[l_chip]) { l_internalOnlyCount++; l_chipToFront = l_chip; } l_chip++; //Move to next chip in the list. } // Check if we are done... only one with an internal error. if (1 == l_internalOnlyCount) { MoveToFrontInTheseChips(l_chipToFront, i_chips); return(SUCCESS); } PRDF_DEXIT( "FabricDomain::OrderTheseChips" ); return(SUCCESS); }
// Notify HTMGT that an OCC has failed and needs to be reset void processOccReset(TARGETING::Target * i_proc) { TMGT_INF(">>processOccReset(0x%p)", i_proc); errlHndl_t errl = NULL; TARGETING::Target * failedOccTarget = NULL; TARGETING::Target* sys = NULL; TARGETING::targetService().getTopLevelTarget(sys); uint8_t safeMode = 0; // If the system is in safemode then ignore request to reset OCCs if(sys && sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode) && safeMode) { return; } // Get functional OCC (one per proc) TARGETING::TargetHandleList pOccs; getChildChiplets(pOccs, i_proc, TARGETING::TYPE_OCC); if (pOccs.size() > 0) { failedOccTarget = pOccs[0]; } if(NULL != failedOccTarget) { uint32_t huid = failedOccTarget->getAttr<TARGETING::ATTR_HUID>(); TMGT_INF("processOccReset(HUID=0x%08X) called", huid); } else { uint32_t huid = i_proc->getAttr<TARGETING::ATTR_HUID>(); TMGT_INF("processOccReset: Invalid OCC target (proc huid=0x08X)" "resetting OCCs anyway", huid); /*@ * @errortype * @reasoncode HTMGT_RC_INVALID_PARAMETER * @moduleid HTMGT_MOD_PROCESS_OCC_RESET * @userdata1[0:7] Processor HUID * @devdesc No OCC target found for proc Target, */ bldErrLog(errl, HTMGT_MOD_PROCESS_OCC_RESET, HTMGT_RC_INVALID_PARAMETER, huid, 0, 0, 1, ERRORLOG::ERRL_SEV_INFORMATIONAL); // Add HB firmware callout errl->addProcedureCallout(HWAS::EPUB_PRC_HB_CODE, HWAS::SRCI_PRIORITY_MED); ERRORLOG::errlCommit(errl, HTMGT_COMP_ID); // sets errl to NULL } errl = OccManager::resetOccs(failedOccTarget); if(errl) { ERRORLOG::errlCommit(errl, HTMGT_COMP_ID); // sets errl to NULL } TMGT_INF("<<processOccReset()"); } // end processOccReset()
// Notify HTMGT that an OCC has an error to report void processOccError(TARGETING::Target * i_procTarget) { TMGT_INF(">>processOccError(0x%p)", i_procTarget); TARGETING::Target* sys = NULL; TARGETING::targetService().getTopLevelTarget(sys); uint8_t safeMode = 0; // If the system is in safemode then can't talk to OCCs - // ignore call to processOccError if(sys && sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode) && safeMode) { return; } bool polledOneOcc = false; errlHndl_t err = OccManager::buildOccs(); if (NULL == err) { if (i_procTarget != NULL) { const uint32_t l_huid = i_procTarget->getAttr<TARGETING::ATTR_HUID>(); TMGT_INF("processOccError(HUID=0x%08X) called", l_huid); TARGETING::TargetHandleList pOccs; getChildChiplets(pOccs, i_procTarget, TARGETING::TYPE_OCC); if (pOccs.size() > 0) { // Poll specified OCC flushing any errors errlHndl_t err = OccManager::sendOccPoll(true, pOccs[0]); if (err) { ERRORLOG::errlCommit(err, HTMGT_COMP_ID); } polledOneOcc = true; } } if ((OccManager::getNumOccs() > 1) || (false == polledOneOcc)) { // Send POLL command to all OCCs to flush any other errors errlHndl_t err = OccManager::sendOccPoll(true); if (err) { ERRORLOG::errlCommit(err, HTMGT_COMP_ID); } } if (OccManager::occNeedsReset()) { TMGT_ERR("processOccError(): OCCs need to be reset"); // Don't pass failed target as OCC should have already // been marked as failed during the poll. errlHndl_t err = OccManager::resetOccs(NULL); if(err) { ERRORLOG::errlCommit(err, HTMGT_COMP_ID); } } } else { // OCC build failed... TMGT_ERR("processOccError() called, but unable to find OCCs"); ERRORLOG::errlCommit(err, HTMGT_COMP_ID); } TMGT_INF("<<processOccError()"); } // end processOccError()
/**
 * @brief Fill io_request with the memory voltage domain records for the
 *        given request type.
 *
 * io_request is cleared first. Each memory buffer target then contributes
 * its voltage domain records: only the VDDR domain for most request types,
 * plus the VDD, AVDD, VCS and VPP domains for HB_VDDR_ENABLE. Duplicate
 * records are removed when more than one memory buffer was processed, and
 * for the enable request types unused voltage domains are dropped.
 *
 * @param i_requestType Kind of VDDR request being built.
 * @param io_request    Container that receives the resulting records.
 */
void HBVddrMsg::createVddrData(
        VDDR_MSG_TYPE i_requestType,
        RequestContainer& io_request) const
{
    TRACFCOMP( g_trac_volt, ENTER_MRK "HBVddrMsg::createVddrData" );

    // Go through all the memory buffers and gather their domains, domain
    // specific IDs, and domain specific voltages
    io_request.clear();

    TARGETING::TargetHandleList l_membufs;
    if (HB_VDDR_DISABLE == i_requestType)
    {
        // A disable command covers all present Centaurs in case we go
        // through a reconfigure loop.
        getChipResources( l_membufs, TYPE_MEMBUF, UTIL_FILTER_PRESENT );
    }
    else
    {
        // Any other command covers only the chips returned by getAllChips()
        // (functional Centaurs, per the original comment).
        getAllChips(l_membufs, TYPE_MEMBUF);
    }

    const bool l_isEnableRequest = (HB_VDDR_ENABLE == i_requestType);

    TARGETING::TargetHandleList::const_iterator l_cursor = l_membufs.begin();
    while (l_cursor != l_membufs.end())
    {
        TARGETING::Target* l_membuf = *l_cursor;

        if (l_isEnableRequest)
        {
            (void)addMemoryVoltageDomains<
                TARGETING::ATTR_MSS_CENT_VDD_OFFSET_DISABLE,
                TARGETING::ATTR_MEM_VDD_OFFSET_MILLIVOLTS,
                TARGETING::ATTR_MEM_VDD_OFFSET_MILLIVOLTS,
                TARGETING::ATTR_VDD_ID>(
                    l_membuf,
                    io_request);

            (void)addMemoryVoltageDomains<
                TARGETING::ATTR_MSS_CENT_AVDD_OFFSET_DISABLE,
                TARGETING::ATTR_MEM_AVDD_OFFSET_MILLIVOLTS,
                TARGETING::ATTR_MEM_AVDD_OFFSET_MILLIVOLTS,
                TARGETING::ATTR_AVDD_ID>(
                    l_membuf,
                    io_request);

            (void)addMemoryVoltageDomains<
                TARGETING::ATTR_MSS_CENT_VCS_OFFSET_DISABLE,
                TARGETING::ATTR_MEM_VCS_OFFSET_MILLIVOLTS,
                TARGETING::ATTR_MEM_VCS_OFFSET_MILLIVOLTS,
                TARGETING::ATTR_VCS_ID>(
                    l_membuf,
                    io_request);

            (void)addMemoryVoltageDomains<
                TARGETING::ATTR_MSS_VOLT_VPP_OFFSET_DISABLE,
                TARGETING::ATTR_MEM_VPP_OFFSET_MILLIVOLTS,
                TARGETING::ATTR_VPP_BASE,
                TARGETING::ATTR_VPP_ID>(
                    l_membuf,
                    io_request);
        }

        // The VDDR domain is gathered for every request type.
        (void)addMemoryVoltageDomains<
            TARGETING::ATTR_MSS_VOLT_VDDR_OFFSET_DISABLE,
            TARGETING::ATTR_MEM_VDDR_OFFSET_MILLIVOLTS,
            TARGETING::ATTR_MSS_VOLT,
            TARGETING::ATTR_VMEM_ID>(
                l_membuf,
                io_request);

        ++l_cursor;
    }

    if (l_membufs.size() > 1)
    {
        // Several memory buffers can report the same record: sort so equal
        // records are adjacent, then drop the repeats.
        std::sort(io_request.begin(), io_request.end(), compareVids);
        io_request.erase(
            std::unique(io_request.begin(), io_request.end(), areVidsEqual),
            io_request.end());
    }

    const bool l_isAnyEnable =
        (HB_VDDR_ENABLE == i_requestType) ||
        (HB_VDDR_POST_DRAM_INIT_ENABLE == i_requestType);

    if (l_isAnyEnable && !l_membufs.empty())
    {
        // Inhibit sending any request to turn on a domain with no voltage.
        // When disabling we don't need to do this because the voltage is
        // ignored.
        io_request.erase(
            std::remove_if(io_request.begin(), io_request.end(),
                           isUnusedVoltageDomain),
            io_request.end());
    }

    TRACFCOMP( g_trac_volt, EXIT_MRK "HBVddrMsg::createVddrData" );
}