/**
 * @brief  Runs the IPL CE statistics analysis for an MBA and reports any
 *         failure of that analysis in an error log.
 *
 * The PRD system scope lock is taken for the duration of the call. On any
 * failure (chip lookup or analyzeStats()) a predictive error log with a
 * high-priority level-2 support procedure callout is created and committed.
 *
 * @param  i_mba          Target MBA.
 * @param  o_calloutMade  Cleared to false on entry, then handed to
 *                        analyzeStats(), which may update it.
 * @return SUCCESS if the analysis ran cleanly; FAIL if no chip object exists
 *         for i_mba; otherwise the non-SUCCESS code from analyzeStats().
 */
int32_t analyzeIplCEStats( TargetHandle_t i_mba, bool &o_calloutMade )
{
    #define PRDF_FUNC "PRDF::analyzeIplCEStats"

    PRDF_ENTER( PRDF_FUNC "(0x%08x)", getHuid(i_mba) );

    // will unlock when going out of scope
    PRDF_SYSTEM_SCOPELOCK;

    int32_t o_rc = SUCCESS;
    o_calloutMade = false;

    ExtensibleChip * mbaChip = (ExtensibleChip *)systemPtr->GetChip( i_mba );
    if ( NULL == mbaChip )
    {
        // Without a chip object there is no data bundle to query. Treat this
        // as a failure so that it is logged below instead of dereferencing a
        // NULL pointer.
        PRDF_ERR( "[" PRDF_FUNC "] GetChip() failed: MBA=0x%08x",
                  getHuid(i_mba) );
        o_rc = FAIL;
    }
    else
    {
        CenMbaDataBundle * mbadb = getMbaDataBundle( mbaChip );

        o_rc = mbadb->getIplCeStats()->analyzeStats( o_calloutMade );
        if ( SUCCESS != o_rc )
        {
            // NOTE: A space is required between a string literal and a macro
            //       name; otherwise C++11 and later parse the macro name as a
            //       user-defined-literal suffix.
            PRDF_ERR( "[" PRDF_FUNC "] analyzeStats() failed" );
        }
    }

    if ( SUCCESS != o_rc )
    {
        // Get user data
        uint64_t ud12 = PRDF_GET_UINT64_FROM_UINT32( getHuid(i_mba), 0 );
        uint64_t ud34 = PRDF_GET_UINT64_FROM_UINT32( PRDFSIG_MnfgIplFail, 0 );

        // Create error log
        errlHndl_t errl = new ERRORLOG::ErrlEntry(
                                ERRORLOG::ERRL_SEV_PREDICTIVE, // severity
                                PRDF_MNFG_IPL_CE_ANALYSIS,     // module ID
                                PRDF_DETECTED_FAIL_SOFTWARE,   // reason code
                                ud12,
                                ud34 );                        // user data 1-4

        // Add 2nd level support
        errl->addProcedureCallout( EPUB_PRC_LVL_SUPP, SRCI_PRIORITY_HIGH );

        // Add traces
        errl->collectTrace( PRDF_COMP_NAME, 512 );

        // Commit the error log
        ERRORLOG::errlCommit( errl, PRDF_COMP_ID );
    }

    PRDF_EXIT( PRDF_FUNC "(0x%08x), o_calloutMade:%u",
               getHuid(i_mba), o_calloutMade );

    return o_rc;

    #undef PRDF_FUNC
}
int32_t CenMbaTdCtlr::analyzeTpsPhase2( STEP_CODE_DATA_STRUCT & io_sc, const CenAddr & i_stopAddr, const CenAddr & i_endAddr ) { #define PRDF_FUNC "[CenMbaTdCtlr::analyzeTpsPhase2] " int32_t o_rc = SUCCESS; do { if ( TPS_PHASE_2 != iv_tdState ) { PRDF_ERR( PRDF_FUNC"Invalid state machine configuration" ); o_rc = FAIL; break; } CenMbaDataBundle * mbadb = getMbaDataBundle( iv_mbaChip ); o_rc = mbadb->getIplCeStats()->calloutHardCes( iv_rank ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC"calloutHardCes() failed"); break; } // Get error condition which caused command to stop uint16_t eccErrorMask = NO_ERROR; o_rc = checkEccErrors( eccErrorMask, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC"checkEccErrors() failed" ); break; } if ( ( eccErrorMask & UE ) || ( eccErrorMask & RETRY_CTE )) { // Handle UE. Highest priority o_rc = handleUE( io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC"handleUE() failed" ); break; } } else if ( eccErrorMask & MPE ) { o_rc = handleMPE( io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC"handleMPE() failed"); break; } } else { // No error found so add rank to callout list, just in case. MemoryMru memmru (iv_mbaTrgt, iv_rank, MemoryMruData::CALLOUT_RANK); io_sc.service_data->SetCallout( memmru ); io_sc.service_data->AddSignatureList( iv_mbaTrgt, PRDFSIG_EndTpsPhase2 ); iv_tdState = NO_OP; } } while (0); return o_rc; #undef PRDF_FUNC }
/**
 * @brief  Handles a UE found during a TD procedure.
 *
 * Steps, in order:
 *  - Aborts the TD procedure (state machine goes to NO_OP), sets the
 *    PRDFSIG_MaintUE signature and flags a service call.
 *  - Cleans up the previous maintenance command.
 *  - Runs mssIplUeIsolation() on iv_rank and adds the resulting DQ bitmap to
 *    the capture data.
 *  - For each port select with bad DQs, collects the connected DIMMs and, when
 *    MFG CE checking is enabled, bans CE analysis for this rank on that port.
 *  - If no DIMM was collected, falls back to all DIMMs connected to the rank.
 *  - Calls out every collected DIMM at MRU_HIGH.
 *
 * @param  io_sc  Service data collector.
 * @return SUCCESS, FAIL if a DIMM lookup returns an empty list, or the first
 *         non-SUCCESS code returned by a called helper.
 */
int32_t CenMbaTdCtlr::handleUE( STEP_CODE_DATA_STRUCT & io_sc )
{
    #define PRDF_FUNC "[CenMbaTdCtlr::handleUE] "

    using namespace CalloutUtil;

    iv_tdState = NO_OP; // Abort the TD procedure.

    setTdSignature( io_sc, PRDFSIG_MaintUE );
    io_sc.service_data->SetServiceCall();

    CenMbaDataBundle * bundle = getMbaDataBundle( iv_mbaChip );

    // Clean up the maintenance command. This is needed just in case the UE
    // isolation procedure is modified to use maintenance commands.
    int32_t o_rc = cleanupPrevCmd();
    if ( SUCCESS != o_rc )
    {
        PRDF_ERR( PRDF_FUNC "cleanupPrevCmd() failed" );
        return o_rc;
    }

    // Look for all failing bits on this rank.
    CenDqBitmap dqBitmap;
    o_rc = mssIplUeIsolation( iv_mbaTrgt, iv_rank, dqBitmap );
    if ( SUCCESS != o_rc )
    {
        PRDF_ERR( PRDF_FUNC "mssIplUeIsolation() failed" );
        return o_rc;
    }

    // Add UE data to capture data.
    dqBitmap.getCaptureData( io_sc.service_data->GetCaptureData() );

    // Build the list of failing DIMMs, one port select at a time.
    TargetHandleList failingDimms;
    for ( int32_t port = 0; port < PORT_SLCT_PER_MBA; ++port )
    {
        bool portHasBadDqs = false;
        o_rc = dqBitmap.badDqs( port, portHasBadDqs );
        if ( SUCCESS != o_rc )
        {
            PRDF_ERR( PRDF_FUNC "badDqs(%d) failed", port );
            return o_rc;
        }

        if ( portHasBadDqs )
        {
            TargetHandleList portDimms =
                            getConnectedDimms( iv_mbaTrgt, iv_rank, port );
            if ( portDimms.empty() )
            {
                PRDF_ERR( PRDF_FUNC "getConnectedDimms(%d) failed", port );
                return FAIL;
            }

            failingDimms.insert( failingDimms.end(),
                                 portDimms.begin(), portDimms.end() );

            if ( isMfgCeCheckingEnabled() )
            {
                // This rank/port is already being called out for the UE, so
                // there is no need to call it out again during CE analysis.
                bundle->getIplCeStats()->banAnalysis( iv_rank, port );
            }
        }
    }

    if ( failingDimms.empty() )
    {
        // It is possible the scrub counters have rolled over to zero due to
        // a known DD1.0 hardware bug. In this case, the best we can do is
        // callout both DIMMs, because at minimum we know there was a UE, we
        // just don't know where.
        // NOTE: If this condition happens because of a DD2.0+ bug, the
        //       mssIplUeIsolation procedure will callout the Centaur.
        failingDimms = getConnectedDimms( iv_mbaTrgt, iv_rank );
        if ( failingDimms.empty() )
        {
            PRDF_ERR( PRDF_FUNC "getConnectedDimms() failed" );
            return FAIL;
        }

        if ( isMfgCeCheckingEnabled() )
        {
            // The whole rank is being called out for the UE, so skip the CE
            // callout for this rank on both ports.
            bundle->getIplCeStats()->banAnalysis( iv_rank );
        }
    }

    // Callout all DIMMs in the list.
    for ( TargetHandleList::iterator dimm = failingDimms.begin();
          dimm != failingDimms.end(); ++dimm )
    {
        io_sc.service_data->SetCallout( *dimm, MRU_HIGH );
    }

    return o_rc;

    #undef PRDF_FUNC
}