/**
 * @brief Adds the chip/signature pair carried by an ErrorSignature to the
 *        signature list.
 *
 * The chip ID stored in the signature is translated to a target handle via
 * PlatServices::getTarget(). If the translation yields NULL, an error trace is
 * written and nothing is added.
 *
 * @param i_sig Signature supplying the chip ID and the signature ID.
 */
void ServiceDataCollector::AddSignatureList( ErrorSignature & i_sig )
{
    #define PRDF_FUNC "[ServiceDataCollector::AddSignatureList] "

    TARGETING::TargetHandle_t chipTrgt =
                            PlatServices::getTarget( i_sig.getChipId() );

    if ( NULL == chipTrgt )
    {
        // No target for this chip ID: trace the failure and give up.
        PRDF_ERR( PRDF_FUNC "Failed to get target Handle for "
                  "chip:0x%08X", i_sig.getChipId() );
        return;
    }

    // Forward to the (target, signature ID) overload.
    AddSignatureList( chipTrgt, i_sig.getSigId() );

    #undef PRDF_FUNC
}
/**
 * @brief Simulation wrapper around ErrDataService::GenerateSrcPfa().
 *
 * Runs the base-class implementation and then reports the chip ID and
 * signature ID held in the service data collector to the simulation services.
 *
 * @param attn_type Attention type, forwarded to the base implementation.
 * @param i_sdc     Service data collector, forwarded to the base
 *                  implementation and queried for the error signature.
 * @return The error log handle returned by the base implementation.
 */
errlHndl_t SimErrDataService::GenerateSrcPfa( ATTENTION_TYPE attn_type,
                                              ServiceDataCollector & i_sdc )
{
    using namespace TARGETING;
    using namespace PlatServices;

    PRDF_ENTER("SimErrDataService::GenerateSrcPfa()");

    // Let the real RAS services code do the actual work first.
    errlHndl_t o_errl = ErrDataService::GenerateSrcPfa(attn_type, i_sdc);

    // Hand the resulting signature to the simulator.
    ErrorSignature * sig = i_sdc.GetErrorSignature();
    getSimServices().reportSig(sig->getChipId(), sig->getSigId());

    PRDF_EXIT("SimErrDataService::GenerateSrcPfa()");

    return o_errl;
}
/**
 * @brief Analyzes this error register: reads it, sets the error signature,
 *        runs the resolutions for the active bits and resets the analyzed
 *        bits.
 *
 * Return code precedence: a non-SUCCESS code from SetErrorSignature() wins
 * over the code from Lookup(), which wins over the code from Reset(). A read
 * failure (scr_rc != SUCCESS) overrides everything and is returned as-is.
 *
 * @param io_sdc Step code data; its service data supplies the error
 *               signature and the secondary attention type.
 * @return SUCCESS or a non-SUCCESS return code as described above.
 */
int32_t ErrorRegister::Analyze( STEP_CODE_DATA_STRUCT & io_sdc )
{
    int32_t rc = SUCCESS;
    uint32_t sigBeforeResolution = 0;

    ErrorSignature * esig = io_sdc.service_data->GetErrorSignature();

    // Record the register ID in the signature. When xScrId holds 0x0fff the
    // register address is used instead.
    if ( 0x0fff != xScrId )
    {
        esig->setRegId( xScrId );
    }
    else
    {
        esig->setRegId( scr.GetAddress() );
    }

    // Get data from hardware.
    // NOTE(review): scr_rc is presumably updated by Read() -- confirm.
    const BIT_STRING_CLASS & hwData =
                        Read( io_sdc.service_data->getSecondaryAttnType() );

    BitKey activeBits; // null bit list has length 0

    if ( SUCCESS == scr_rc )
    {
        activeBits = Filter( hwData );
        rc = SetErrorSignature( io_sdc, activeBits );

        // Remember the signature so a change made while the resolutions
        // execute can be detected afterwards.
        sigBeforeResolution = esig->getSigId();
    }

    // Walk through the bit list until an active attention is found. This
    // helps when a global or chiplet level FIR has multiple bits set, but the
    // associated local FIRs may not have an active attention because of a
    // filter or hardware bug.
    uint32_t res_rc = SUCCESS;
    BitKey handledBits;              // Bits that have been analyzed.
    BitKey pendingBits = activeBits; // Bits that still need to be analyzed.

    for ( ;; )
    {
        BitKey resolvedBits  = pendingBits;
        BitKey pendingBefore = pendingBits;

        // Look up and execute the resolutions.
        res_rc = Lookup( io_sdc, resolvedBits );

        // Add the resolved bits to the analyzed list.
        // TODO: RTC 126267 Should modify BitKey to have a more efficient way
        //       of adding bits to a list.
        for ( uint32_t idx = 0; idx < resolvedBits.size(); ++idx )
        {
            handledBits.setBit( resolvedBits.getListValue(idx) );
        }

        // Remove the resolved bits from the remaining list.
        pendingBits.removeBits( resolvedBits );

        // Stop if no forward progress was made.
        if ( pendingBefore == pendingBits ) break;

        // Only keep going while the lookup reports a zero register and there
        // are still bits left to analyze.
        if ( PRD_SCAN_COMM_REGISTER_ZERO != res_rc ) break;
        if ( 0 == pendingBits.size() ) break;
    }

    // The earlier rc has priority over res_rc.
    if ( SUCCESS == rc )
    {
        rc = res_rc;
    }

    // If we had a DD02 and the signature changed, a better answer was found
    // during the DD02 analysis, so ignore the DD02.
    if ( (rc == PRD_SCAN_COMM_REGISTER_ZERO) &&
         (esig->getSigId() != sigBeforeResolution) )
    {
        rc = res_rc;
    }

    // The register read failed: flag it in the signature and bail out.
    if ( SUCCESS != scr_rc )
    {
        esig->setErrCode( PRD_SCANCOM_FAILURE );
        return scr_rc;
    }

    FilterUndo( handledBits );

    // NOTE: This is an unusual work-around for NOT clearing particular FIR
    //       bits in a register because they are cleared in another part of
    //       the plugin code. Return success to indicate that we understand
    //       the DDFF.
    if ( PRD_NO_CLEAR_FIR_BITS == rc )
    {
        return SUCCESS;
    }

    const int32_t reset_rc = Reset( handledBits, io_sdc );

    return ( SUCCESS == rc ) ? reset_rc : rc;
}
/**
 * @brief Builds the SRC and PFA data for the current error log (iv_errl),
 *        adds the callouts and FFDC, and commits the log when the condition
 *        is not a terminating one.
 *
 * Main steps, in order:
 *  - Returns early (deleting iv_errl) if the SDC asks not to commit the log
 *    and this is neither a MACHINE_CHECK nor a unit checkstop.
 *  - Scans the MRU list to derive the PRD reason code and SRC word 9.
 *  - Sets the log severity/actions and the gard and deconfig policies.
 *  - Adds every MRU as a callout to both the PFA data and the error log.
 *  - Evaluates the (manufacturing) terminate conditions.
 *  - Adds PFA data, capture data and traces to the error log.
 *  - Commits the log, or hands it back through o_dumpErrl when a hwudump is
 *    to be initiated.
 *
 * @param i_attnType        Attention type being analyzed.
 * @param io_sdc            Service data collector for this analysis.
 * @param o_initiateHwudump Reset to false on entry; passed to handleUnitCS()
 *                          for unit checkstops.
 * @param o_dumpTrgt        Reset to NULL on entry; passed to initPfaData(),
 *                          checkForceTerm() and handleUnitCS().
 * @param o_dumpErrl        Reset to NULL on entry; receives iv_errl when
 *                          o_initiateHwudump is true.
 * @param o_dumpErrlActions Reset to 0 on entry; receives the error log
 *                          action flags when o_initiateHwudump is true.
 * @return NULL if the log was not to be committed (early exit); otherwise the
 *         value of iv_errl at the end of the function. iv_errl is always
 *         NULL on return.
 *         NOTE(review): commitErrLog() presumably clears iv_errl when the log
 *         is committed -- confirm against its implementation.
 */
errlHndl_t ErrDataService::GenerateSrcPfa( ATTENTION_TYPE i_attnType,
                                           ServiceDataCollector & io_sdc,
                                           bool & o_initiateHwudump,
                                           TargetHandle_t & o_dumpTrgt,
                                           errlHndl_t & o_dumpErrl,
                                           uint32_t & o_dumpErrlActions)
{
    #define PRDF_FUNC "[ErrDataService::GenerateSrcPfa] "

    o_initiateHwudump = false;
    o_dumpTrgt        = NULL;
    o_dumpErrl        = NULL;
    o_dumpErrlActions = 0;

    // First, check if an error log should be committed. Note that there should
    // always be an error log if there was a system or unit checkstop.
    if ( io_sdc.queryDontCommitErrl() &&
         MACHINE_CHECK != i_attnType && !io_sdc.IsUnitCS() )
    {
        // User did not want this error log committed. No need to continue. So
        // delete it and exit.
        delete iv_errl; iv_errl = NULL;
        return NULL;
    }

#ifdef __HOSTBOOT_MODULE
    using namespace ERRORLOG;
    using namespace HWAS;
#else
    uint8_t sdcSaveFlags = SDC_NO_SAVE_FLAGS;
    size_t sz_uint8 = sizeof(uint8_t);
#endif

    epubProcedureID thisProcedureID;

    bool ForceTerminate = false;
    bool iplDiagMode = false;

    ++iv_serviceActionCounter;

    uint16_t PRD_Reason_Code = 0;

    //**************************************************************
    // Callout loop to set up Reason code and SRC word 9
    //**************************************************************

    // Must go thru callout list to look for RIOPORT procedure callouts,
    // since they require the port info to be in SRC Word 9
    bool HW = false;
    bool SW = false;
    bool SW_High = false;
    bool SecondLevel = false;
    uint32_t SrcWord7 = 0;
    uint32_t SrcWord9 = 0;

    // Should not gard hardware if there is a hardware callout at LOW priority
    // and a symbolic FRU indicating a possibility of a software error at MED or
    // HIGH priority.
    bool sappSwNoGardReq = false, sappHwNoGardReq = false;

    const SDC_MRU_LIST & mruList = io_sdc.getMruList();
    int32_t calloutsPlusDimms = mruList.size();

    for ( SDC_MRU_LIST::const_iterator it = mruList.begin();
          it != mruList.end(); ++it )
    {
        PRDcallout thiscallout = it->callout;

        if ( PRDcalloutData::TYPE_SYMFRU == thiscallout.getType() )
        {
            if ( (SP_CODE     == thiscallout.flatten()) ||
                 (SYS_SW_CODE == thiscallout.flatten()) )
            {
                SW = true;

                if ( MRU_LOW != it->priority )
                {
                    sappSwNoGardReq = true;
                }

                if ( MRU_MED == it->priority )
                {
                    SW_High = true;
                }
            }
            else if ( LEVEL2_SUPPORT == thiscallout.flatten())
            {
                SecondLevel = true;

                if ( MRU_LOW != it->priority )
                {
                    sappSwNoGardReq = true;
                }
            }
        }
        else if ( PRDcalloutData::TYPE_MEMMRU == thiscallout.getType() )
        {
            MemoryMru memMru (thiscallout.flatten());
            SrcWord9 = memMru.toUint32(); // Get MemMru value

            // Each part of the MemoryMru counts as a callout; the MRU entry
            // itself was already counted by mruList.size(), hence the -1.
            TargetHandleList partList = memMru.getCalloutList();
            uint32_t partCount = partList.size();

            calloutsPlusDimms = calloutsPlusDimms + partCount -1;
            HW = true; //hardware callout

            if ( MRU_LOW == it->priority )
            {
                sappHwNoGardReq = true;
            }
        }
        else // PRDcalloutData::TYPE_TARGET
        {
            HW = true; // Hardware callout

            // Record that a hardware callout has low priority.
            if ( MRU_LOW == it->priority )
            {
                sappHwNoGardReq = true;
            }
        }
    }

    ////////////////////////////////////////////////////////////////
    //Set the PRD Reason Code based on the flags set in the above callout loop.
    ////////////////////////////////////////////////////////////////

    if (HW == true && SW == true)
    {
        if (SW_High == true)
            PRD_Reason_Code = PRDF_DETECTED_FAIL_SOFTWARE_PROBABLE;
        else
            PRD_Reason_Code = PRDF_DETECTED_FAIL_HARDWARE_PROBABLE;
    }
    else if (HW == true && SW == false && SecondLevel == true)
        PRD_Reason_Code = PRDF_DETECTED_FAIL_HARDWARE_PROBABLE;
    else if (HW == true && SW == false && SecondLevel == false)
        PRD_Reason_Code = PRDF_DETECTED_FAIL_HARDWARE;
    else if (HW == false && SW == true)
        PRD_Reason_Code = PRDF_DETECTED_FAIL_SOFTWARE;
    else
    {
        // If we get here both HW and SW flags were false. Callout may be
        // Second Level Support only, or a procedure not checked in the SW
        // flag code.
        PRD_Reason_Code = PRDF_DETECTED_FAIL_HARDWARE_PROBABLE;
    }

    // SRC word 7: primary attention type shifted left by 8 bits, OR'ed with
    // the secondary attention type.
    SrcWord7  = io_sdc.getPrimaryAttnType() << 8;
    SrcWord7 |= io_sdc.getSecondaryAttnType();

    //--------------------------------------------------------------------------
    // Check for IPL Diag Mode
    //--------------------------------------------------------------------------

    #if defined(__HOSTBOOT_MODULE) && !defined(__HOSTBOOT_RUNTIME)

    iplDiagMode = PlatServices::isInMdiaMode();

    #endif

    //**************************************************************
    // Update Error Log with SRC
    //**************************************************************
    ErrorSignature * esig = io_sdc.GetErrorSignature();

    updateSrc( esig->getChipId(), SrcWord7,
               esig->getSigId(), SrcWord9, PRD_Reason_Code);

    //**************************************************************
    //  Add SDC Capture data to Error Log User Data here only if
    //    there are 4 or more callouts,
    //    (including Dimm callouts in the MemoryMru).
    //**************************************************************
    bool capDataAdded = false;
    if (calloutsPlusDimms > 3)
    {
        AddCapData( io_sdc.GetCaptureData(),    iv_errl );
        AddCapData( io_sdc.getTraceArrayData(), iv_errl );
        capDataAdded = true;
    }

    //--------------------------------------------------------------------------
    // Set the error log severity and get the error log action flags.
    //--------------------------------------------------------------------------

    // Let's assume the default is the action for a system checkstop.

    #ifdef __HOSTBOOT_MODULE
    errlSeverity_t errlSev = ERRL_SEV_UNRECOVERABLE;
    #else
    errlSeverity errlSev = ERRL_SEV_UNRECOVERABLE;
    #endif

    uint32_t errlAct = ERRL_ACTION_SA        | // Service action required.
                       ERRL_ACTION_REPORT    | // Report to HMC and hypervisor.
                       ERRL_ACTION_CALL_HOME;  // Call home.

    if ( MACHINE_CHECK != i_attnType ) // Anything other than a system checkstop
    {
        if ( io_sdc.queryServiceCall() ) // still a serviceable event
        {
            errlSev = ERRL_SEV_PREDICTIVE;
        }
        else // not a serviceable event
        {
            errlSev = io_sdc.queryLogging()
                            ? ERRL_SEV_RECOVERED      // should still be logged
                            : ERRL_SEV_INFORMATIONAL; // can be ignored
            errlAct = ERRL_ACTION_HIDDEN;
        }
    }

    // This needs to be done after setting the SRCs otherwise it will be
    // overridden.
    iv_errl->setSev( errlSev );

    // Add procedure callout for SUE attentions. The intent is to make sure the
    // customer looks for other service actions before replacing parts for this
    // attention.
    if ( io_sdc.IsSUE() )
    {
        PRDF_HW_ADD_PROC_CALLOUT( SUE_PREV_ERR, MRU_HIGH, iv_errl, errlSev );
    }

    //--------------------------------------------------------------------------
    // Get the global gard policy.
    //--------------------------------------------------------------------------

    HWAS::GARD_ErrorType gardPolicy = HWAS::GARD_NULL;

    // Gard only if the error is a serviceable event.
    if ( io_sdc.queryServiceCall() )
    {
        // We will not Resource Recover on a checkstop attention.
        gardPolicy = ( MACHINE_CHECK == i_attnType ) ? HWAS::GARD_Fatal
                                                     : HWAS::GARD_Predictive;
    }

    if ( io_sdc.IsSUE() && ( MACHINE_CHECK == i_attnType ) )
    {
        // If we are logging an error for an SUE consumed, we should not
        // perform any GARD here. Appropriate resources should have already
        // been GARDed for the original UE.
        gardPolicy = HWAS::GARD_NULL;
    }

    // Apply special policies for OPAL.
    if ( isHyprConfigOpal() &&                          // OPAL is used
         !isMfgAvpEnabled() && !isMfgHdatAvpEnabled() ) // No AVPs running
    {
        // OPAL has requested that we disable garding for predictive errors
        // found at runtime.
        if ( HWAS::GARD_Predictive == gardPolicy )
        {
            #if !defined(__HOSTBOOT_MODULE) // FSP only

            if ( isHyprRunning() ) gardPolicy = HWAS::GARD_NULL;

            #elif defined(__HOSTBOOT_RUNTIME) // HBRT only

            gardPolicy = HWAS::GARD_NULL;

            #endif
        }
        // OPAL has requested that we disable garding for fatal errors (system
        // checkstops) that could have been caused by a software generated
        // attention at runtime. This will be determined if there is a software
        // callout with higher priority than a hardware callout.
        else if ( HWAS::GARD_Fatal == gardPolicy &&
                  sappSwNoGardReq && sappHwNoGardReq ) // Gard requirements met
        {
            #if !defined(__HOSTBOOT_MODULE) // FSP only

            if ( isHyprRunning() ) gardPolicy = HWAS::GARD_NULL;

            #elif !defined(__HOSTBOOT_RUNTIME) // Hostboot only

            #ifdef CONFIG_ENABLE_CHECKSTOP_ANALYSIS

            // Checkstop analysis is only done at the beginning of the IPL,
            // regardless if the checkstop actually occurred during the IPL
            // or at runtime. We will need to check the IPL state in FIR
            // data to determine when the checkstop occurred.

            // Get access to IPL state info from the FIR data in the PNOR.
            if ( !(PnorFirDataReader::getPnorFirDataReader().isIplState()) )
                gardPolicy = HWAS::GARD_NULL;

            #endif

            #endif
        }
    }

    //--------------------------------------------------------------------------
    // Get the global deconfig policy (must be done after setting gard policy).
    //--------------------------------------------------------------------------

    HWAS::DeconfigEnum deconfigPolicy = HWAS::NO_DECONFIG;
    bool deferDeconfig = false;

    if ( HWAS::GARD_NULL != gardPolicy )
    {
        #if !defined(__HOSTBOOT_MODULE) // FSP only

        // Change the deconfig state based the gard type. This is only required
        // to control what the FSP does during the reconfig loop.
        deconfigPolicy = HWSV::SvrError::isInHwReconfLoop() ? HWAS::DECONFIG
                                                            : HWAS::NO_DECONFIG;

        #elif !defined(__HOSTBOOT_RUNTIME) // Hostboot only

        // Deferred Deconfig should be used throughout all of Hostboot (both
        // checkForIplAttns() and MDIA).
        deconfigPolicy = HWAS::DECONFIG;
        deferDeconfig  = true;

        #endif
    }

    //--------------------------------------------------------------------------
    // Get the HCDB diagnostics policy.
    //--------------------------------------------------------------------------

    HWSV::hwsvDiagUpdate l_diagUpdate = HWSV::HWSV_DIAG_NEEDED;
    if ( ERRL_ACTION_HIDDEN == errlAct )
    {
        // Diagnostics is not needed in the next IPL cycle for non-visible logs.
        l_diagUpdate = HWSV::HWSV_DIAG_NOT_NEEDED;
    }

    //--------------------------------------------------------------------------
    // Initialize the PFA data
    //--------------------------------------------------------------------------

    PfaData pfaData;

    initPfaData( io_sdc, i_attnType, deferDeconfig, errlAct, errlSev,
                 gardPolicy, pfaData, o_dumpTrgt );

    //--------------------------------------------------------------------------
    // Add each mru/callout to the error log.
    //--------------------------------------------------------------------------

    for ( SDC_MRU_LIST::const_iterator it = mruList.begin();
          it != mruList.end(); ++it )
    {
        PRDcallout  thiscallout  = it->callout;
        PRDpriority thispriority = it->priority;

        // Use the global gard/deconfig policies as default.
        HWAS::GARD_ErrorType thisGard     = gardPolicy;
        HWAS::DeconfigEnum   thisDeconfig = deconfigPolicy;

        // Change the gard/deconfig actions if this MRU should not be garded.
        if ( NO_GARD == it->gardState )
        {
            thisGard     = HWAS::GARD_NULL;
            thisDeconfig = HWAS::NO_DECONFIG;
        }

        // Add the callout to the PFA data
        addCalloutToPfaData( pfaData, thiscallout, thispriority, thisGard );

        // Add callout based on callout type.
        if( PRDcalloutData::TYPE_TARGET == thiscallout.getType() )
        {
            PRDF_HW_ADD_CALLOUT(thiscallout.getTarget(),
                                thispriority,
                                thisDeconfig,
                                iv_errl,
                                thisGard,
                                errlSev,
                                l_diagUpdate);
        }
        else if(PRDcalloutData::TYPE_PROCCLK == thiscallout.getType() ||
                PRDcalloutData::TYPE_PCICLK  == thiscallout.getType())
        {
            PRDF_ADD_CLOCK_CALLOUT(iv_errl,
                                   thiscallout.getTarget(),
                                   thiscallout.getType(),
                                   thispriority,
                                   thisDeconfig,
                                   thisGard);
        }
        else if ( PRDcalloutData::TYPE_MEMMRU == thiscallout.getType() )
        {
            MemoryMru memMru (thiscallout.flatten());

            // Call out every part of the MemoryMru. The iterator has its own
            // name so it does not shadow the MRU list iterator of the
            // enclosing loop.
            TargetHandleList partList = memMru.getCalloutList();
            for ( TargetHandleList::iterator partIt = partList.begin();
                  partIt != partList.end(); ++partIt )
            {
                PRDF_HW_ADD_CALLOUT( *partIt,
                                     thispriority,
                                     thisDeconfig,
                                     iv_errl,
                                     thisGard,
                                     errlSev,
                                     l_diagUpdate );
            }
        }
        else if ( PRDcalloutData::TYPE_SYMFRU == thiscallout.getType() )
        {
            thisProcedureID = epubProcedureID(thiscallout.flatten());

            PRDF_DTRAC( PRDF_FUNC "thisProcedureID: %x, thispriority: %x, "
                        "errlSev: %x", thisProcedureID, thispriority,errlSev );

            PRDF_HW_ADD_PROC_CALLOUT(thisProcedureID,
                                     thispriority,
                                     iv_errl,
                                     errlSev);

            // Use the flags set earlier to determine if the callout is just
            // Software (SP code or Phyp Code). Add a Second Level Support
            // procedure callout Low, for this case.
            if (HW == false && SW == true && SecondLevel == false)
            {
                PRDF_DTRAC( PRDF_FUNC "thisProcedureID= %x, thispriority=%x, "
                            "errlSev=%x", LEVEL2_SUPPORT, MRU_LOW, errlSev );

                PRDF_HW_ADD_PROC_CALLOUT( LEVEL2_SUPPORT,
                                          MRU_LOW,
                                          iv_errl,
                                          errlSev );

                // Make sure this extra callout is added only once.
                SecondLevel = true;
            }
        }
    }

    // Send the dynamic memory Dealloc message for DIMMS for Predictive
    // callouts.
    // We can not check for ERRL severity here as there are some cases
    // e.g. DD02 where we create a Predictive error log but callouts
    // are not predictive.
    if ( HWAS::GARD_Predictive == gardPolicy )
    {
        deallocateDimms( mruList );
    }

    //**************************************************************
    // Check for Terminating the system for non mnfg conditions.
    //**************************************************************

    ForceTerminate = checkForceTerm( io_sdc, o_dumpTrgt, pfaData );

    //*************************************************************
    // Check for Manufacturing Mode terminate here and then do
    // the needed overrides on ForceTerminate flag.
    //*************************************************************
    if ( PlatServices::mnfgTerminate() && !ForceTerminate )
    {
        ForceTerminate = true;
        if ( !((errlSev == ERRL_SEV_RECOVERED) ||
               (errlSev == ERRL_SEV_INFORMATIONAL)) &&
             iplDiagMode  &&
             !HW )
        {
            //Terminate in Manufacturing Mode, in IPL mode, for visible log, with no HW callouts.
            PRDF_SRC_WRITE_TERM_STATE_ON(iv_errl, SRCI_TERM_STATE_MNFG);
        }
        // Do not terminate if recoverable or informational.
        // Do not terminate if deferred deconfig.
        else if ( deferDeconfig                        ||
                  (errlSev == ERRL_SEV_RECOVERED  )    ||
                  (errlSev == ERRL_SEV_INFORMATIONAL) )
        {
            ForceTerminate = false;
            errlAct |= ERRL_ACTION_DONT_TERMINATE;
        }
        else
        {
            PRDF_SRC_WRITE_TERM_STATE_ON(iv_errl, SRCI_TERM_STATE_MNFG);
        }

        pfaData.errlActions = errlAct;
    }

    // Needed to move the errl add user data sections here because of some updates
    // of the data required in the Async section for the SMA dual reporting fix.

    //**************************************************************
    // Add the PFA data to Error Log User Data
    //**************************************************************
    UtilMem l_membuf;
    l_membuf << pfaData;
    PRDF_ADD_FFDC( iv_errl, (const char*)l_membuf.base(), l_membuf.size(),
                   ErrlVer1, ErrlSectPFA5_1 );

    //**************************************************************
    // Add SDC Capture data to Error Log User Data
    //**************************************************************
    // Pulled some code out to incorporate into AddCapData
    // Check to make sure Capture Data wasn't added earlier.
    if (!capDataAdded)
    {
        AddCapData( io_sdc.GetCaptureData(),    iv_errl );
        AddCapData( io_sdc.getTraceArrayData(), iv_errl );
    }

    //**************************************************************************
    // Add extended MemoryMru error log sections (if needed).
    //**************************************************************************

    // NOTE: Until the TODO below is resolved, this loop has no effect.
    for ( SDC_MRU_LIST::const_iterator it = mruList.begin();
          it != mruList.end(); ++it )
    {
        // Operate only on MemoryMru callouts.
        if ( PRDcalloutData::TYPE_MEMMRU != it->callout.getType() ) continue;

/* TODO RTC 136125
        // Only add single DIMM callouts. Otherwise, the parsed data is
        // redundant.
        MemoryMru memMru ( it->callout.flatten() );
        if ( !memMru.getSymbol().isValid() ) continue;

        // Add the MemoryMru to the capture data.
        CenMbaCaptureData::addExtMemMruData( memMru, iv_errl );
*/
    }

    //**************************************************************************
    // Additional FFDC
    //**************************************************************************

    // For OP checkstop analysis, add a string indicating a system checkstop
    // occurred and when. This will be printed out in the console traces along
    // with the error log.
    #if defined(__HOSTBOOT_MODULE) && !defined(__HOSTBOOT_RUNTIME) // IPL only
    #ifdef CONFIG_ENABLE_CHECKSTOP_ANALYSIS

    if ( MACHINE_CHECK == i_attnType )
    {
        const char * const str =
            PnorFirDataReader::getPnorFirDataReader().isIplState()
                ? "System checkstop occurred during IPL on previous boot"
                : "System checkstop occurred during runtime on previous boot";

        ErrlUserDetailsString(str).addToLog(iv_errl);
    }

    #endif
    #endif

    // Collect PRD traces.
    // NOTE: Each line of a trace is on average 36 bytes so 768 bytes should get
    //       us around 21 lines of trace output.
    PRDF_COLLECT_TRACE(iv_errl, 768);

    //**************************************************************
    // Commit the error log.
    // This will also perform Gard and Deconfig actions.
    // Do the Unit Dumps if needed.
    //**************************************************************

    // Add the MNFG trace information.
    MnfgTrace( io_sdc.GetErrorSignature(), pfaData );

    // If this is not a terminating condition, commit the error log. If the
    // error log is not committed, the error log will be passed back to
    // PRDF::main() and eventually ATTN.
    if ( MACHINE_CHECK != pfaData.priAttnType && !ForceTerminate &&
         !pfaData.TERMINATE )
    {
        // Handle any unit checkstop conditions, if needed (i.e. runtime
        // deconfiguration, dump/FFDC collection, etc.)
        if ( io_sdc.IsUnitCS() && !io_sdc.IsUsingSavedSdc() )
        {
            handleUnitCS( io_sdc, o_dumpTrgt, o_initiateHwudump );
        }

        if ( true == o_initiateHwudump )
        {
            // the dump log will be deleted later in PRDF::main
            // after the hwudump is initiated there.
            o_dumpErrl = iv_errl;
            iv_errl = NULL;
            o_dumpErrlActions = errlAct;
            PRDF_TRAC( PRDF_FUNC "for target: 0x%08x, i_errl: 0x%08x, "
                       "i_errlActions: 0x%08x", getHuid(o_dumpTrgt),
                       ERRL_GETRC_SAFE(o_dumpErrl), o_dumpErrlActions );
        }
        else
        {
            // Commit the error log.
            commitErrLog( iv_errl, pfaData );
        }
    }

#ifndef __HOSTBOOT_MODULE
    errlHndl_t reg_errl = UtilReg::read ("prdf/RasServices",
                                         &sdcSaveFlags, sz_uint8);
    if (reg_errl)
    {
        PRDF_ERR( PRDF_FUNC "Failure in SDC Sync flag Registry read" );
        PRDF_COMMIT_ERRL(reg_errl, ERRL_ACTION_REPORT);
    }
    else
    {
        //Turn off indicator that there is saved Sdc Analysis info
        sdcSaveFlags &= ( ~SDC_ANALYSIS_SAVE_FLAG );
        reg_errl = UtilReg::write ("prdf/RasServices",
                                   &sdcSaveFlags, sz_uint8);
        if (reg_errl)
        {
            PRDF_ERR( PRDF_FUNC "Failure in SDC Sync flag Registry write" );
            PRDF_COMMIT_ERRL(reg_errl, ERRL_ACTION_REPORT);
        }
    }
#endif

    PRDF_INF( PRDF_FUNC "PRD called to analyze an error: 0x%08x 0x%08x",
              esig->getChipId(), esig->getSigId() );

    // Reset iv_errl to NULL. This is done to catch logical bug in our code.
    // It enables us to assert in createInitialErrl function if iv_errl is
    // not NULL which should catch any logical bug in initial stages of testing.
    errlHndl_t o_errl = iv_errl;
    iv_errl = NULL;

    return o_errl;

    #undef PRDF_FUNC
}