// Handles the DSD phase 2 outcome in which the spare DRAM is found to be bad.
//
// Sets the PRDFSIG_DsdBadSpare signature, flags the error log for service,
// calls out the memory MRU built from the current chip mark, and then marks
// the spare (ECC spare or DRAM spare, depending on iv_isEccSteer) as bad in
// the bad DQ bitmap before writing that bitmap back.
//
// io_sc   Step code data; receives the signature, service call, and callout.
//
// Returns SUCCESS if everything completed; FAIL if the state machine is not
// in DSD_PHASE_2; otherwise the non-SUCCESS code of the bitmap operation that
// failed.
int32_t CenMbaTdCtlrCommon::handleMCE_DSD2( STEP_CODE_DATA_STRUCT & io_sc )
{
    #define PRDF_FUNC "[CenMbaTdCtlrCommon::handleMCE_DSD2] "

    // This handler is only valid while DSD phase 2 is the active state.
    if ( DSD_PHASE_2 != iv_tdState )
    {
        PRDF_ERR( PRDF_FUNC "Invalid state machine configuration" );
        return FAIL;
    }

    setTdSignature( io_sc, PRDFSIG_DsdBadSpare );
    io_sc.service_data->SetServiceCall();

    // Call out the spare DRAM.
    MemoryMru spareMru ( iv_mbaTrgt, iv_rank, iv_mark.getCM() );
    io_sc.service_data->SetCallout( spareMru );

    // The spare DRAM is bad, so record it in VPD. The chip mark itself should
    // already be in VPD at this point because it was recently verified.
    CenDqBitmap dqBitmap;
    int32_t l_rc = getBadDqBitmap( iv_mbaTrgt, iv_rank, dqBitmap );
    if ( SUCCESS != l_rc )
    {
        PRDF_ERR( PRDF_FUNC "getBadDqBitmap() failed" );
        return l_rc;
    }

    if ( !iv_isEccSteer )
    {
        // DRAM spare: mark the spare on the chip mark's port select.
        l_rc = dqBitmap.setDramSpare( iv_mark.getCM().getPortSlct() );
        if ( SUCCESS != l_rc )
        {
            PRDF_ERR( PRDF_FUNC "setDramSpare() failed" );
            return l_rc;
        }
    }
    else
    {
        dqBitmap.setEccSpare();
    }

    l_rc = setBadDqBitmap( iv_mbaTrgt, iv_rank, dqBitmap );
    if ( SUCCESS != l_rc )
    {
        PRDF_ERR( PRDF_FUNC "setBadDqBitmap() failed" );
        return l_rc;
    }

    return SUCCESS;

    #undef PRDF_FUNC
}
// Handles an uncorrectable error (UE) reported on the rank under test.
//
// The TD procedure is aborted (iv_tdState is set to NO_OP), the error log is
// given the PRDFSIG_MaintUE signature and flagged for service, and then the
// failing DIMMs are isolated and called out at MRU_HIGH priority. The bad DQ
// bitmap produced by the isolation procedure is added to the capture data.
//
// io_sc   Step code data; receives the signature, service call, capture data
//         and callouts.
//
// Returns SUCCESS if the callouts were made; FAIL if no connected DIMMs could
// be found; otherwise the non-SUCCESS code of the helper that failed.
int32_t CenMbaTdCtlr::handleUE( STEP_CODE_DATA_STRUCT & io_sc )
{
    #define PRDF_FUNC "[CenMbaTdCtlr::handleUE] "

    using namespace CalloutUtil;

    iv_tdState = NO_OP; // Abort the TD procedure.

    setTdSignature( io_sc, PRDFSIG_MaintUE );
    io_sc.service_data->SetServiceCall();

    CenMbaDataBundle * mbadb = getMbaDataBundle( iv_mbaChip );

    // Clean up the maintenance command. This is needed just in case the UE
    // isolation procedure is modified to use maintenance commands.
    int32_t l_rc = cleanupPrevCmd();
    if ( SUCCESS != l_rc )
    {
        PRDF_ERR( PRDF_FUNC "cleanupPrevCmd() failed" );
        return l_rc;
    }

    // Look for all failing bits on this rank.
    CenDqBitmap ueBitmap;
    l_rc = mssIplUeIsolation( iv_mbaTrgt, iv_rank, ueBitmap );
    if ( SUCCESS != l_rc )
    {
        PRDF_ERR( PRDF_FUNC "mssIplUeIsolation() failed" );
        return l_rc;
    }

    // Add UE data to capture data.
    ueBitmap.getCaptureData( io_sc.service_data->GetCaptureData() );

    // Collect the failing DIMMs, one port select at a time.
    TargetHandleList dimmList;
    for ( int32_t port = 0; port < PORT_SLCT_PER_MBA; ++port )
    {
        bool hasBadDqs = false;
        l_rc = ueBitmap.badDqs( port, hasBadDqs );
        if ( SUCCESS != l_rc )
        {
            PRDF_ERR( PRDF_FUNC "badDqs(%d) failed", port );
            return l_rc;
        }

        if ( hasBadDqs )
        {
            TargetHandleList portDimms =
                            getConnectedDimms( iv_mbaTrgt, iv_rank, port );
            if ( portDimms.empty() )
            {
                PRDF_ERR( PRDF_FUNC "getConnectedDimms(%d) failed", port );
                return FAIL;
            }

            dimmList.insert( dimmList.end(),
                             portDimms.begin(), portDimms.end() );

            if ( isMfgCeCheckingEnabled() )
            {
                // This rank/port is already being called out for the UE, so
                // there is no need to also call it out during CE analysis.
                mbadb->getIplCeStats()->banAnalysis( iv_rank, port );
            }
        }
    }

    if ( dimmList.empty() )
    {
        // It is possible the scrub counters have rolled over to zero due to
        // a known DD1.0 hardware bug. In this case, the best we can do is
        // callout both DIMMs, because at minimum we know there was a UE, we
        // just don't know where.
        // NOTE: If this condition happens because of a DD2.0+ bug, the
        //       mssIplUeIsolation procedure will callout the Centaur.
        dimmList = getConnectedDimms( iv_mbaTrgt, iv_rank );
        if ( dimmList.empty() )
        {
            PRDF_ERR( PRDF_FUNC "getConnectedDimms() failed" );
            return FAIL;
        }

        if ( isMfgCeCheckingEnabled() )
        {
            // Both ports are being called out for the UE, so CE analysis is
            // banned for the whole rank.
            mbadb->getIplCeStats()->banAnalysis( iv_rank );
        }
    }

    // Callout all DIMMs in the list.
    for ( TargetHandleList::iterator dimmIt = dimmList.begin();
          dimmIt != dimmList.end(); ++dimmIt )
    {
        io_sc.service_data->SetCallout( *dimmIt, MRU_HIGH );
    }

    return SUCCESS;

    #undef PRDF_FUNC
}
// Handles the VCM phase 2 outcome in which the chip mark has been verified.
//
// Sequence:
//  - Requires iv_tdState == VCM_PHASE_2 and sets the PRDFSIG_VcmVerified
//    signature.
//  - If DRAM repairs are disabled, ends the TD procedure with a service call
//    and does nothing else.
//  - Removes a symbol mark that sits on the same DRAM as the chip mark.
//  - Records the chip mark's DRAM in the bad DQ bitmap and writes it to VPD.
//  - Decides whether a DRAM or ECC spare can be used. If so, DSD phase 1 is
//    started (iv_isEccSteer is set when the ECC spare is chosen); otherwise
//    the TD procedure is ended by setting iv_tdState to NO_OP.
//
// io_sc   Step code data; receives signatures and service call settings.
//
// Returns SUCCESS on completion; FAIL if the state machine is not in
// VCM_PHASE_2; otherwise the non-SUCCESS code of the helper that failed. On
// any failure iv_tdState is left unchanged.
int32_t CenMbaTdCtlrCommon::handleMCE_VCM2( STEP_CODE_DATA_STRUCT & io_sc )
{
    #define PRDF_FUNC "[CenMbaTdCtlrCommon::handleMCE_VCM2] "

    using namespace fapi; // For spare config macros.

    int32_t o_rc = SUCCESS;

    iv_isEccSteer = false;

    do
    {
        if ( VCM_PHASE_2 != iv_tdState )
        {
            PRDF_ERR( PRDF_FUNC "Invalid state machine configuration" );
            o_rc = FAIL; break;
        }

        setTdSignature( io_sc, PRDFSIG_VcmVerified );

        if ( areDramRepairsDisabled() )
        {
            iv_tdState = NO_OP; // The TD procedure is complete.

            io_sc.service_data->SetServiceCall();

            break; // nothing else to do.
        }

        // If there is a symbol mark on the same DRAM as the newly verified
        // chip mark, remove the symbol mark.
        const uint8_t cmDram = iv_mark.getCM().getDram();
        if ( cmDram == iv_mark.getSM().getDram() )
        {
            iv_mark.clearSM();

            // Output of mssSetMarkStore(). The write is not expected to be
            // blocked because the chip mark is in place. Initialized so the
            // variable never holds an indeterminate value.
            bool blocked = false;
            o_rc = mssSetMarkStore( iv_mbaTrgt, iv_rank, iv_mark, blocked );
            if ( SUCCESS != o_rc )
            {
                PRDF_ERR( PRDF_FUNC "mssSetMarkStore() failed" );
                break;
            }
        }

        bool startDsdProcedure = false;

        // Read VPD.
        CenDqBitmap bitmap;
        o_rc = getBadDqBitmap( iv_mbaTrgt, iv_rank, bitmap );
        if ( SUCCESS != o_rc )
        {
            PRDF_ERR( PRDF_FUNC "getBadDqBitmap() failed" );
            break;
        }

        // The chip mark is considered verified, so set it in VPD.
        o_rc = bitmap.setDram( iv_mark.getCM() );
        if ( SUCCESS != o_rc )
        {
            PRDF_ERR( PRDF_FUNC "setDram() failed" );
            break;
        }

        uint8_t ps = iv_mark.getCM().getPortSlct();
        uint8_t spareConfig = ENUM_ATTR_VPD_DIMM_SPARE_NO_SPARE;
        o_rc = getDimmSpareConfig( iv_mbaTrgt, iv_rank, ps, spareConfig );
        if ( SUCCESS != o_rc )
        {
            PRDF_ERR( PRDF_FUNC "getDimmSpareConfig() failed" );
            break;
        }

        // Check if DRAM spare is present. Also, ECC spares are available on
        // all x4 DIMMS.
        if ( ( ENUM_ATTR_VPD_DIMM_SPARE_NO_SPARE != spareConfig ) || iv_x4Dimm )
        {
            // Get the current spares in hardware.
            CenSymbol sp0, sp1, ecc;
            o_rc = mssGetSteerMux( iv_mbaTrgt, iv_rank, sp0, sp1, ecc );
            if ( SUCCESS != o_rc )
            {
                PRDF_ERR( PRDF_FUNC "mssGetSteerMux() failed" );
                break;
            }

            // If the verified chip mark is on a spare then the spare is bad
            // and hardware can not steer it to another DRAM even if one is
            // available (e.g. ECC spare). In this case, make the error log
            // predictive (remember that the chip mark has already been added
            // to the callout list).
            if ( ( cmDram == (0 == ps ? sp0.getDram() : sp1.getDram()) ) ||
                 ( cmDram == ecc.getDram() ) )
            {
                setTdSignature( io_sc, PRDFSIG_VcmBadSpare );
                io_sc.service_data->SetServiceCall();
            }
            else
            {
                // Certain DIMMs may have had spares intentionally made
                // unavailable by the manufacturer. Check the VPD for
                // available spares. Note that a x4 DIMM has DRAM spares and
                // ECC spares, so check for availability on both.
                bool dramSparePossible = false;
                bool eccSparePossible  = false;
                o_rc = bitmap.isSpareAvailable( ps, dramSparePossible,
                                                eccSparePossible );
                if ( SUCCESS != o_rc )
                {
                    // The trace names the function that was actually called.
                    PRDF_ERR( PRDF_FUNC "isSpareAvailable() failed" );
                    break;
                }

                if ( dramSparePossible &&
                     (0 == ps ? !sp0.isValid() : !sp1.isValid()) )
                {
                    // A spare DRAM is available.
                    startDsdProcedure = true;
                }
                else if ( eccSparePossible && !ecc.isValid() )
                {
                    // The ECC spare is available.
                    startDsdProcedure = true;
                    iv_isEccSteer = true;
                }
                else
                {
                    // Chip mark is in place and sparing is not possible.
                    setTdSignature( io_sc, PRDFSIG_VcmCmAndSpare );
                    io_sc.service_data->SetServiceCall();
                }
            }
        }
        else // DRAM spare not supported.
        {
            // Not able to do dram sparing. If there is a symbol mark, there
            // are no repairs available so call it out and set the error log
            // to predictive.
            if ( iv_mark.getSM().isValid() )
            {
                setTdSignature( io_sc, PRDFSIG_VcmCmAndSm );
                io_sc.service_data->SetServiceCall();
            }
        }

        // Write VPD.
        o_rc = setBadDqBitmap( iv_mbaTrgt, iv_rank, bitmap );
        if ( SUCCESS != o_rc )
        {
            PRDF_ERR( PRDF_FUNC "setBadDqBitmap() failed" );
            break;
        }

        // Start DSD Phase 1, if possible.
        if ( startDsdProcedure )
        {
            o_rc = startDsdPhase1( io_sc );
            if ( SUCCESS != o_rc )
            {
                PRDF_ERR( PRDF_FUNC "startDsdPhase1() failed" );
                break;
            }
        }
        else
        {
            iv_tdState = NO_OP; // The TD procedure is complete.
        }

    } while(0);

    return o_rc;

    #undef PRDF_FUNC
}
void captureDramRepairsVpd( TargetHandle_t i_mbaTrgt, CaptureData & io_cd ) { #define PRDF_FUNC "[captureDramRepairsVpd] " // Get the maximum capture data size. static const size_t sz_rank = sizeof(uint8_t); static const size_t sz_entry = PORT_SLCT_PER_MBA * DIMM_DQ_RANK_BITMAP_SIZE; static const size_t sz_word = sizeof(CPU_WORD); int32_t rc = SUCCESS; do { std::vector<CenRank> masterRanks; rc = getMasterRanks( i_mbaTrgt, masterRanks ); if ( SUCCESS != rc ) { PRDF_ERR( PRDF_FUNC "getMasterRanks() failed" ); break; } if( masterRanks.empty() ) { PRDF_ERR( PRDF_FUNC "Master Rank list size is 0"); break; } // Get the maximum capture data size. size_t sz_maxData = masterRanks.size() * (sz_rank + sz_entry); // Adjust the size for endianness. sz_maxData = ((sz_maxData + sz_word-1) / sz_word) * sz_word; // Initialize to 0. uint8_t capData[sz_maxData]; memset( capData, 0x00, sz_maxData ); // Iterate all ranks to get VPD data uint32_t idx = 0; for ( std::vector<CenRank>::iterator it = masterRanks.begin(); it != masterRanks.end(); it++ ) { CenDqBitmap bitmap; uint8_t rank = it->getMaster(); if ( SUCCESS != getBadDqBitmap(i_mbaTrgt, *it, bitmap, true) ) { PRDF_ERR( PRDF_FUNC "getBadDqBitmap() failed: MBA=0x%08x" " rank=%d", getHuid(i_mbaTrgt), rank ); continue; // skip this rank } if ( bitmap.badDqs() ) // make sure the data is non-zero { // Add the rank, then the entry data. capData[idx] = rank; idx += sz_rank; memcpy(&capData[idx], bitmap.getData(), sz_entry); idx += sz_entry; } } if( 0 == idx ) break; // Nothing to capture // Fix endianness issues with non PPC machines. size_t sz_capData = idx; sz_capData = ((sz_capData + sz_word-1) / sz_word) * sz_word; for ( uint32_t i = 0; i < (sz_capData/sz_word); i++ ) ((CPU_WORD*)capData)[i] = htonl(((CPU_WORD*)capData)[i]); // Add data to capture data. 
BIT_STRING_ADDRESS_CLASS bs ( 0, sz_capData*8, (CPU_WORD *) &capData ); io_cd.Add( i_mbaTrgt, Util::hashString("DRAM_REPAIRS_VPD"), bs ); }while(0); if( FAIL == rc ) PRDF_ERR( PRDF_FUNC "Failed for MBA 0x%08X", getHuid( i_mbaTrgt ) ); #undef PRDF_FUNC }