/** * @brief Handles MCS Channel fail bits, if they exist. * * @param i_membChip The Centaur chip. * @param i_sc ServiceDataColector. * * @return SUCCESS if MCS channel fail is present and properly * handled, FAIL otherwise. */ int32_t handleMcsChnlCs( ExtensibleChip * i_membChip, STEP_CODE_DATA_STRUCT & i_sc ) { #define PRDF_FUNC "[handleMcsChnlCs] " // We will return FAIL from this function if MCS channel fail bits // are not set. If MCS channel fail bits are set, we will try to analyze // Mcs. If MCS is not analyzed properly, we will return FAIL. // This will trigger rule code to execute alternate resolution. int32_t l_rc = SUCCESS; do { CenMembufDataBundle * mbdb = getMembufDataBundle( i_membChip ); ExtensibleChip * mcsChip = mbdb->getMcsChip(); if( NULL == mcsChip ) { l_rc = FAIL; break; } SCAN_COMM_REGISTER_CLASS * mciFir = mcsChip->getRegister("MCIFIR"); SCAN_COMM_REGISTER_CLASS * mciFirMask = mcsChip->getRegister("MCIFIR_MASK"); l_rc = mciFir->Read(); l_rc |= mciFirMask->Read(); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"MCIFIR/MCIFIR_MASK read failed for 0x%08x", mcsChip->GetId()); break; } // If any of MCS channel fail bit is set, we will analyze // MCS. It is safe to do hard coded check as channel fail // bits are hard wired and and they can not change without HW // change. // bits 0,1, 6, 8, 9, 22, 23, 40 are channel fail bits. uint64_t chnlCsBitsMask = 0xC2C0030000800000ull; uint64_t mciFirBits = mciFir->GetBitFieldJustified(0, 64); uint64_t mciFirMaskBits = mciFirMask->GetBitFieldJustified(0, 64); if ( mciFirBits & ~mciFirMaskBits & chnlCsBitsMask ) { l_rc = mcsChip->Analyze( i_sc, i_sc.service_data->GetCauseAttentionType() ); if( SUCCESS == l_rc ) break; } l_rc = FAIL; }while( 0 ); return l_rc; #undef PRDF_FUNC } PRDF_PLUGIN_DEFINE( Membuf, handleMcsChnlCs );
/**
 * @brief  Reads the DRAM size field for an MBA from the MBAXCR register of
 *         its connected membuf chip.
 *
 * @param  i_mbaChip An MBA chip.
 * @param  o_size    Set to SIZE_2GB on entry. On success it is overwritten
 *                   with the 2-bit field starting at bit 6 of MBA0_MBAXCR
 *                   (MBA position 0) or MBA1_MBAXCR (any other position).
 *
 * @return SUCCESS if the register was read, non-SUCCESS otherwise.
 */
int32_t getDramSize( ExtensibleChip *i_mbaChip, uint8_t & o_size )
{
    #define PRDF_FUNC "[MemUtils::getDramSize] "

    // Default value, kept if anything below fails.
    o_size = SIZE_2GB;

    TargetHandle_t mbaTrgt = i_mbaChip->GetChipHandle();

    // The register lives on the membuf chip, not on the MBA itself.
    CenMbaDataBundle * bundle = getMbaDataBundle( i_mbaChip );
    ExtensibleChip * membuf = bundle->getMembChip();
    if ( NULL == membuf )
    {
        PRDF_ERR( PRDF_FUNC "getMembChip() failed: MBA=0x%08x",
                  getHuid(mbaTrgt) );
        return FAIL;
    }

    // Pick the register instance that matches this MBA's position.
    uint32_t mbaPos = getTargetPosition(mbaTrgt);
    const char * regName = "MBA1_MBAXCR";
    if ( 0 == mbaPos )
    {
        regName = "MBA0_MBAXCR";
    }

    SCAN_COMM_REGISTER_CLASS * xcr = membuf->getRegister( regName );
    int32_t readRc = xcr->Read();
    if ( SUCCESS != readRc )
    {
        PRDF_ERR( PRDF_FUNC "Read() failed on %s. Target=0x%08x",
                  regName, getHuid(mbaTrgt) );
        return readRc;
    }

    o_size = xcr->GetBitFieldJustified( 6, 2 );

    return SUCCESS;

    #undef PRDF_FUNC
}
/** * @brief Calls out the EX chiplet (MRU_LOW), if possible. Otherwise, calls * out the PROC (MRU_LOW) * @param i_chip P8 chip * @param io_sc service data collector * @returns SUCCESS */ int32_t combinedResponseCallout( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[Proc::combinedResponseCallout] " int32_t l_rc = SUCCESS; TargetHandle_t procTrgt = i_chip->GetChipHandle(); SCAN_COMM_REGISTER_CLASS * reg = i_chip->getRegister("PB_CENT_CR_ERROR"); do { l_rc = reg->Read(); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"Read() failed on PB_CENT_CR_ERROR" ); break; } uint32_t tmp = reg->GetBitFieldJustified(0,3); if ( 0x02 != tmp ) // Must be 0b010 to continue { PRDF_ERR( PRDF_FUNC"Unsupported reason code: 0x%02x", tmp ); l_rc = FAIL; break; } tmp = reg->GetBitFieldJustified(38,5); if ( 0x00 != tmp ) // Must be 0b00000 to continue { PRDF_ERR( PRDF_FUNC"Unsupported combined response encoding: 0x%02x", tmp ); l_rc = FAIL; break; } if ( reg->IsBitSet(22) ) // Must be 0b0 to continue { PRDF_ERR( PRDF_FUNC"Operation not sourced by an EX chiplet" ); l_rc = FAIL; break; } // Get the EX target tmp = reg->GetBitFieldJustified(23,4); TargetHandle_t exTrgt = getConnectedChild( procTrgt, TYPE_EX, tmp ); if ( NULL == exTrgt ) { PRDF_ERR( PRDF_FUNC"No connected EX chiplet at position %d", tmp ); l_rc = FAIL; break; } // Callout the EX target io_sc.service_data->SetCallout( exTrgt, MRU_LOW ); } while (0); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"Unable to isolate to an EX chiplet. Calling out " "PROC 0x%08x instead.", i_chip->GetId() ); io_sc.service_data->SetCallout( procTrgt, MRU_LOW ); } return SUCCESS; #undef PRDF_FUNC }
int32_t collectCeStats( ExtensibleChip * i_mbaChip, const CenRank & i_rank, MaintSymbols & o_maintStats, CenSymbol & o_chipMark, uint8_t i_thr ) { #define PRDF_FUNC "[MemUtils::collectCeStats] " int32_t o_rc = SUCCESS; o_chipMark = CenSymbol(); // Initially invalid. do { if ( 0 == i_thr ) // Must be non-zero { PRDF_ERR( PRDF_FUNC "i_thr %d is invalid", i_thr ); o_rc = FAIL; break; } TargetHandle_t mbaTrgt = i_mbaChip->GetChipHandle(); CenMbaDataBundle * mbadb = getMbaDataBundle( i_mbaChip ); ExtensibleChip * membufChip = mbadb->getMembChip(); if ( NULL == membufChip ) { PRDF_ERR( PRDF_FUNC "getMembChip() failed" ); o_rc = FAIL; break; } uint8_t mbaPos = getTargetPosition( mbaTrgt ); if ( MAX_MBA_PER_MEMBUF <= mbaPos ) { PRDF_ERR( PRDF_FUNC "mbaPos %d is invalid", mbaPos ); o_rc = FAIL; break; } const bool isX4 = isDramWidthX4(mbaTrgt); // Get the current spares on this rank. CenSymbol sp0, sp1, ecc; o_rc = mssGetSteerMux( mbaTrgt, i_rank, sp0, sp1, ecc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "mssGetSteerMux() failed." ); break; } // Use this map to keep track of the total counts per DRAM. DramCountMap dramCounts; const char * reg_str = NULL; SCAN_COMM_REGISTER_CLASS * reg = NULL; for ( uint8_t regIdx = 0; regIdx < CE_REGS_PER_MBA; regIdx++ ) { reg_str = mbsCeStatReg[mbaPos][regIdx]; reg = membufChip->getRegister( reg_str ); o_rc = reg->Read(); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "Read() failed on %s", reg_str ); break; } uint8_t baseSymbol = SYMBOLS_PER_CE_REG * regIdx; for ( uint8_t i = 0; i < SYMBOLS_PER_CE_REG; i++ ) { uint8_t count = reg->GetBitFieldJustified( (i*8), 8 ); if ( 0 == count ) continue; // nothing to do uint8_t sym = baseSymbol + i; uint8_t dram = symbol2Dram( sym, isX4 ); // Keep track of the total DRAM counts. dramCounts[dram].totalCount += count; // Add any symbols that have exceeded threshold to the list. if ( i_thr <= count ) { // Keep track of the total number of symbols per DRAM that // have exceeded threshold. 
dramCounts[dram].symbolCount++; SymbolData symData; symData.symbol = CenSymbol::fromSymbol( mbaTrgt, i_rank, sym, CEN_SYMBOL::BOTH_SYMBOL_DQS ); if ( !symData.symbol.isValid() ) { PRDF_ERR( PRDF_FUNC "CenSymbol() failed: symbol=%d", sym ); o_rc = FAIL; break; } else { // Check if this symbol is on any of the spares. if ( ( sp0.isValid() && (sp0.getDram() == symData.symbol.getDram()) ) || ( sp1.isValid() && (sp1.getDram() == symData.symbol.getDram()) ) ) { symData.symbol.setDramSpared(); } if ( ecc.isValid() && (ecc.getDram() == symData.symbol.getDram()) ) { symData.symbol.setEccSpared(); } // Add the symbol to the list. symData.count = count; o_maintStats.push_back( symData ); } } } if ( SUCCESS != o_rc ) break; } if ( SUCCESS != o_rc ) break; if ( o_maintStats.empty() ) break; // no need to continue // Sort the list of symbols. std::sort( o_maintStats.begin(), o_maintStats.end(), sortSymDataCount ); // Get the DRAM with the highest count. uint32_t highestDram = 0; uint32_t highestCount = 0; const uint32_t symbolTH = isX4 ? 1 : 2; for ( DramCountMap::iterator it = dramCounts.begin(); it != dramCounts.end(); ++it ) { if ( (symbolTH <= it->second.symbolCount) && (highestCount < it->second.totalCount ) ) { highestDram = it->first; highestCount = it->second.totalCount; } } if ( 0 != highestCount ) { uint8_t sym = dram2Symbol( highestDram, isX4 ); o_chipMark = CenSymbol::fromSymbol( mbaTrgt, i_rank, sym ); // Check if this symbol is on any of the spares. if ( ( sp0.isValid() && (sp0.getDram() == o_chipMark.getDram()) ) || ( sp1.isValid() && (sp1.getDram() == o_chipMark.getDram()) ) ) { o_chipMark.setDramSpared(); } if ( ecc.isValid() && (ecc.getDram() == o_chipMark.getDram()) ) { o_chipMark.setEccSpared(); } } } while(0); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "Failed: i_mbaChip=0x%08x i_rank=m%ds%d i_thr=%d", i_mbaChip->GetId(), i_rank.getMaster(), i_rank.getSlave(), i_thr ); } return o_rc; #undef PRDF_FUNC }
/** * @brief MBSECCFIR[16] - Fetch New CE (NCE). * @param i_membChip A Centaur chip. * @param i_sc The step code data struct. * @param i_mbaPos The MBA position. * @return SUCCESS */ int32_t AnalyzeFetchNce( ExtensibleChip * i_membChip, STEP_CODE_DATA_STRUCT & i_sc, uint32_t i_mbaPos ) { #define PRDF_FUNC "[AnalyzeFetchNce] " int32_t l_rc = SUCCESS; ExtensibleChip * mbaChip = NULL; do { CenMembufDataBundle * membdb = getMembufDataBundle( i_membChip ); mbaChip = membdb->getMbaChip( i_mbaPos ); if ( NULL == mbaChip ) { PRDF_ERR( PRDF_FUNC"getMbaChip() returned NULL" ); l_rc = FAIL; break; } TargetHandle_t mbaTrgt = mbaChip->GetChipHandle(); CenAddr addr; l_rc = getCenReadAddr( i_membChip, i_mbaPos, READ_NCE_ADDR, addr ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"getCenReadAddr() failed" ); break; } CenRank rank = addr.getRank(); if ( 0x20 > getChipLevel(i_membChip->GetChipHandle()) ) { // There is a bug in DD1.x where the value of MBSEVR cannot be // trusted. The workaround is too complicated for its value so // callout the rank instead. MemoryMru memmru ( mbaTrgt, rank, MemoryMruData::CALLOUT_RANK ); i_sc.service_data->SetCallout( memmru ); } else // DD2.0+ { // Get the failing symbol const char * reg_str = (0 == i_mbaPos) ? "MBA0_MBSEVR" : "MBA1_MBSEVR"; SCAN_COMM_REGISTER_CLASS * reg = i_membChip->getRegister(reg_str); l_rc = reg->Read(); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"Read() failed on %s", reg_str ); break; } uint8_t galois = reg->GetBitFieldJustified( 40, 8 ); uint8_t mask = reg->GetBitFieldJustified( 32, 8 ); CenSymbol symbol = CenSymbol::fromGalois( mbaTrgt, rank, galois, mask ); if ( !symbol.isValid() ) { PRDF_ERR( PRDF_FUNC"Failed to create symbol: galois=0x%02x " "mask=0x%02x", galois, mask ); break; } // Check if this symbol is on any of the spares. CenSymbol sp0, sp1, ecc; l_rc = mssGetSteerMux( mbaTrgt, rank, sp0, sp1, ecc ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"mssGetSteerMux() failed. 
HUID: 0x%08x " "rank: %d", getHuid(mbaTrgt), rank.getMaster() ); break; } if ( (sp0.isValid() && (sp0.getDram() == symbol.getDram())) || (sp1.isValid() && (sp1.getDram() == symbol.getDram())) ) { symbol.setDramSpared(); } if ( ecc.isValid() && (ecc.getDram() == symbol.getDram()) ) { symbol.setEccSpared(); } // Add the DIMM to the callout list MemoryMru memmru ( mbaTrgt, rank, symbol ); i_sc.service_data->SetCallout( memmru, MRU_MEDA ); // Add to CE table CenMbaDataBundle * mbadb = getMbaDataBundle( mbaChip ); uint32_t ceTableRc = mbadb->iv_ceTable.addEntry( addr, symbol ); bool doTps = ( CenMbaCeTable::NO_TH_REACHED != ceTableRc ); // Check MNFG thresholds, if needed. if ( mfgMode() ) { // Get the MNFG CE thresholds. uint16_t dramTh, hrTh, dimmTh; l_rc = getMnfgMemCeTh( mbaChip, rank, dramTh, hrTh, dimmTh ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"getMnfgMemCeTh() failed: rank=m%ds%d", rank.getMaster(), rank.getSlave() ); break; } // Get counts from CE table. uint32_t dramCount, hrCount, dimmCount; mbadb->iv_ceTable.getMnfgCounts( addr.getRank(), symbol, dramCount, hrCount, dimmCount ); if ( dramTh < dramCount ) { i_sc.service_data->AddSignatureList( mbaTrgt, PRDFSIG_MnfgDramCte ); i_sc.service_data->SetServiceCall(); doTps = true; } else if ( hrTh < hrCount ) { i_sc.service_data->AddSignatureList( mbaTrgt, PRDFSIG_MnfgHrCte ); i_sc.service_data->SetServiceCall(); doTps = true; } else if ( dimmTh < dimmCount ) { i_sc.service_data->AddSignatureList( mbaTrgt, PRDFSIG_MnfgDimmCte ); i_sc.service_data->SetServiceCall(); doTps = true; } else if ( 0 != (CenMbaCeTable::TABLE_FULL & ceTableRc) ) { i_sc.service_data->AddSignatureList( mbaTrgt, PRDFSIG_MnfgTableFull); // The table is full and no other threshold has been met. // We are in a state where we may never hit a MNFG // threshold. Callout all memory behind the MBA. Also, since // the counts are all over the place, there may be a problem // with the MBA. So call it out as well. 
MemoryMru all_mm ( mbaTrgt, rank, MemoryMruData::CALLOUT_ALL_MEM ); i_sc.service_data->SetCallout( all_mm, MRU_MEDA ); i_sc.service_data->SetCallout( mbaTrgt, MRU_MEDA ); i_sc.service_data->SetServiceCall(); } else if ( 0 != (CenMbaCeTable::ENTRY_TH_REACHED & ceTableRc) ) { i_sc.service_data->AddSignatureList( mbaTrgt, PRDFSIG_MnfgEntryCte ); // There is a single entry threshold and no other threshold // has been met. This is a potential flooding issue, so make // the DIMM callout predictive. i_sc.service_data->SetServiceCall(); } } // Initiate a TPS procedure, if needed. if ( doTps ) { // If a MNFG threshold has been reached (predictive callout), we // will still try to start TPS just in case MNFG disables the // termination policy. // Will not be able to do TPS during hostboot. Note that we will // still call handleTdEvent() so we can get the trace statement // indicating TPS was requested during Hostboot. l_rc = mbadb->iv_tdCtlr.handleTdEvent( i_sc, rank, CenMbaTdCtlrCommon::TPS_EVENT ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"handleTdEvent() failed: rank=m%ds%d", rank.getMaster(), rank.getSlave() ); break; } } } } while (0); // Add ECC capture data for FFDC. if ( NULL != mbaChip ) CenMbaCaptureData::addMemEccData( mbaChip, i_sc ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"Failed: i_membChip=0x%08x i_mbaPos=%d", i_membChip->GetId(), i_mbaPos ); CalloutUtil::defaultError( i_sc ); } return SUCCESS; // Intentionally return SUCCESS for this plugin #undef PRDF_FUNC }