int32_t chnlCsCleanup( ExtensibleChip *i_mbChip, STEP_CODE_DATA_STRUCT & i_sc ) { #define PRDF_FUNC "[MemUtils::chnlCsCleanup] " int32_t o_rc = SUCCESS; do { if( ( NULL == i_mbChip ) || ( TYPE_MEMBUF != getTargetType( i_mbChip->GetChipHandle() ))) { PRDF_ERR( PRDF_FUNC "Invalid parameters" ); o_rc = FAIL; break; } if (( ! i_sc.service_data->IsUnitCS() ) || (CHECK_STOP == i_sc.service_data->getPrimaryAttnType()) ) break; CenMembufDataBundle * mbdb = getMembufDataBundle( i_mbChip ); if ( !mbdb->iv_doChnlFailCleanup ) break; // Cleanup has already been done. // Set it as SUE generation point. i_sc.service_data->SetFlag( ServiceDataCollector::UERE ); ExtensibleChip * mcsChip = mbdb->getMcsChip(); if ( NULL == mcsChip ) { PRDF_ERR( PRDF_FUNC "MCS chip is NULL for Membuf:0x%08X", i_mbChip->GetId() ); o_rc = FAIL; break; } TargetHandle_t mcs = mcsChip->GetChipHandle(); ExtensibleChip * procChip = NULL; uint8_t pos = getTargetPosition( mcs ); TargetHandle_t proc = getParentChip ( mcs ); if ( NULL == proc ) { PRDF_ERR( PRDF_FUNC "Proc is NULL for Mcs:0x%08X", getHuid( mcs ) ); o_rc = FAIL; break; } procChip = (ExtensibleChip *)systemPtr->GetChip( proc ); if( NULL == procChip ) { PRDF_ERR( PRDF_FUNC "Can not find Proc chip for HUID:0x%08X", getHuid( proc) ); o_rc = FAIL; break; } // This is a cleanup function. If we get any error from scom // operations, we will still continue with cleanup. SCAN_COMM_REGISTER_CLASS * l_tpMask = procChip->getRegister("TP_CHIPLET_FIR_MASK"); o_rc |= l_tpMask->Read(); if ( SUCCESS == o_rc ) { // Bits 5-12 maps to attentions from MCS0-MCS7. l_tpMask->SetBit( 5 + pos ); o_rc |= l_tpMask->Write(); } // Mask attentions from the Centaur const char *iomcFirMask = ( pos < 4 )? "IOMCFIR_0_MASK_OR":"IOMCFIR_1_MASK_OR"; SCAN_COMM_REGISTER_CLASS * iomcMask = procChip->getRegister( iomcFirMask); if ( pos >= 4 ) pos -= 4; // 8 bits are reserved for each Centaur in IOMCFIR. // There are total 4 ( for P system ) centaur supported // in MCS. 
Bits for first centaur starts from bit 8. iomcMask->SetBitFieldJustified( 8+ ( pos*8 ), 8, 0xff); o_rc |= iomcMask->Write(); SCAN_COMM_REGISTER_CLASS * l_tpfirmask = NULL; SCAN_COMM_REGISTER_CLASS * l_nestfirmask = NULL; SCAN_COMM_REGISTER_CLASS * l_memfirmask = NULL; SCAN_COMM_REGISTER_CLASS * l_memspamask = NULL; l_tpfirmask = i_mbChip->getRegister("TP_CHIPLET_FIR_MASK"); l_nestfirmask = i_mbChip->getRegister("NEST_CHIPLET_FIR_MASK"); l_memfirmask = i_mbChip->getRegister("MEM_CHIPLET_FIR_MASK"); l_memspamask = i_mbChip->getRegister("MEM_CHIPLET_SPA_MASK"); l_tpfirmask->setAllBits(); o_rc |= l_tpfirmask->Write(); l_nestfirmask->setAllBits(); o_rc |= l_nestfirmask->Write(); l_memfirmask->setAllBits(); o_rc |= l_memfirmask->Write(); l_memspamask->setAllBits(); o_rc |= l_memspamask->Write(); for ( uint32_t i = 0; i < MAX_MBA_PER_MEMBUF; i++ ) { ExtensibleChip * mbaChip = mbdb->getMbaChip( i ); if( NULL != mbaChip ) { TargetHandle_t mba = mbaChip->GetChipHandle(); if ( NULL != mba ) { #if defined(__HOSTBOOT_MODULE) && \ !defined(__HOSTBOOT_RUNTIME) // This is very small platform specific code. So not // creating a separate file for this. int32_t l_rc = mdiaSendEventMsg( mba, MDIA::SKIP_MBA ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC "mdiaSendEventMsg(0x%08x, SKIP_MBA) " "failed", getHuid( mba ) ); o_rc |= l_rc; } #else int32_t l_rc = DEALLOC::mbaGard( mbaChip ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC "mbaGard failed. HUID: 0x%08x", getHuid( mba ) ); o_rc |= l_rc; } #endif // __HOSTBOOT_MODULE } } } // Clean up complete an is no longer required. mbdb->iv_doChnlFailCleanup = false; } while(0); return o_rc; #undef PRDF_FUNC }
/** * @brief MBSECCFIR[19] - Fetch UE. * @param i_membChip A Centaur chip. * @param i_sc The step code data struct. * @param i_mbaPos The MBA position. * @return SUCCESS */ int32_t AnalyzeFetchUe( ExtensibleChip * i_membChip, STEP_CODE_DATA_STRUCT & i_sc, uint32_t i_mbaPos ) { #define PRDF_FUNC "[AnalyzeFetchUe] " int32_t l_rc = SUCCESS; ExtensibleChip * mbaChip = NULL; do { // All memory UEs should be customer viewable. Normally, this would be // done by setting the threshold to 1, but we do not want to mask UEs // on the first occurrence. i_sc.service_data->SetServiceCall(); CenMembufDataBundle * membdb = getMembufDataBundle( i_membChip ); mbaChip = membdb->getMbaChip( i_mbaPos ); if ( NULL == mbaChip ) { PRDF_ERR( PRDF_FUNC"getMbaChip() returned NULL" ); l_rc = FAIL; break; } CenAddr addr; l_rc = getCenReadAddr( i_membChip, i_mbaPos, READ_UE_ADDR, addr ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"getCenReadAddr() failed" ); break; } CenRank rank = addr.getRank(); // Add address to UE table. CenMbaDataBundle * mbadb = getMbaDataBundle( mbaChip ); mbadb->iv_ueTable.addEntry( UE_TABLE::FETCH_UE, addr ); // Callout the rank. MemoryMru memmru ( mbaChip->GetChipHandle(), rank, MemoryMruData::CALLOUT_RANK ); i_sc.service_data->SetCallout( memmru ); // Add a TPS request to the TD queue and ban any further TPS requests // for this rank. l_rc = mbadb->iv_tdCtlr.handleTdEvent( i_sc, rank, CenMbaTdCtlrCommon::TPS_EVENT, true ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"handleTdEvent() failed: rank=m%ds%d", rank.getMaster(), rank.getSlave() ); // We are not adding break here as we still want to do lmbGard // If you want to add any code after this which depends on result // of handleTdEvent result, add the code judicially. } #ifndef __HOSTBOOT_MODULE // Send lmb gard message to PHYP. 
int32_t lmbRc = DEALLOC::lmbGard( mbaChip, addr ); if ( SUCCESS != lmbRc ) { PRDF_ERR( PRDF_FUNC"lmbGard() failed" ); l_rc = lmbRc; break; } #endif } while (0); // Add ECC capture data for FFDC. if ( NULL != mbaChip ) CenMbaCaptureData::addMemEccData( mbaChip, i_sc ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"Failed: i_membChip=0x%08x i_mbaPos=%d", i_membChip->GetId(), i_mbaPos ); CalloutUtil::defaultError( i_sc ); } return SUCCESS; // Intentionally return SUCCESS for this plugin #undef PRDF_FUNC }
/** * @brief Fetch Retry CE / Prefetch UE Errors. * @param i_membChip A Centaur chip. * @param i_sc The step code data struct. * @param i_mbaPos The MBA position. * @param i_isRceError True for RCE error false otherwise. * @return SUCCESS */ int32_t AnalyzeFetchRcePue( ExtensibleChip * i_membChip, STEP_CODE_DATA_STRUCT & i_sc, uint32_t i_mbaPos, bool i_isRceError ) { #define PRDF_FUNC "[AnalyzeFetchRcePue] " int32_t l_rc = SUCCESS; ExtensibleChip * mbaChip = NULL; do { CenMembufDataBundle * membdb = getMembufDataBundle( i_membChip ); mbaChip = membdb->getMbaChip( i_mbaPos ); if ( NULL == mbaChip ) { PRDF_ERR( PRDF_FUNC"getMbaChip() returned NULL" ); l_rc = FAIL; break; } CenMbaDataBundle * mbadb = getMbaDataBundle( mbaChip ); CenAddr addr; if ( i_isRceError ) l_rc = getCenReadAddr( i_membChip, i_mbaPos, READ_RCE_ADDR, addr ); else l_rc = getCenReadAddr( i_membChip, i_mbaPos, READ_UE_ADDR, addr ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"getCenReadAddr() failed" ); break; } CenRank rank = addr.getRank(); // Callout the rank. MemoryMru memmru ( mbaChip->GetChipHandle(), rank, MemoryMruData::CALLOUT_RANK ); i_sc.service_data->SetCallout( memmru ); // Add an entry to the RCE table. if ( mbadb->iv_rceTable.addEntry(rank, i_sc) ) { // Add a TPS request to the queue TD queue. l_rc = mbadb->iv_tdCtlr.handleTdEvent( i_sc, rank, CenMbaTdCtlrCommon::TPS_EVENT ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"handleTdEvent() failed." ); break; } } } while (0); // Add ECC capture data for FFDC. if ( NULL != mbaChip ) CenMbaCaptureData::addMemEccData( mbaChip, i_sc ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"Failed: i_membChip=0x%08x i_mbaPos=%d " "i_isRceError=%c", i_membChip->GetId(), i_mbaPos, i_isRceError ? 'T' : 'F' ); CalloutUtil::defaultError( i_sc ); } return SUCCESS; // Intentionally return SUCCESS for this plugin #undef PRDF_FUNC }
/** * @brief MBSECCFIR[16] - Fetch New CE (NCE). * @param i_membChip A Centaur chip. * @param i_sc The step code data struct. * @param i_mbaPos The MBA position. * @return SUCCESS */ int32_t AnalyzeFetchNce( ExtensibleChip * i_membChip, STEP_CODE_DATA_STRUCT & i_sc, uint32_t i_mbaPos ) { #define PRDF_FUNC "[AnalyzeFetchNce] " int32_t l_rc = SUCCESS; ExtensibleChip * mbaChip = NULL; do { CenMembufDataBundle * membdb = getMembufDataBundle( i_membChip ); mbaChip = membdb->getMbaChip( i_mbaPos ); if ( NULL == mbaChip ) { PRDF_ERR( PRDF_FUNC"getMbaChip() returned NULL" ); l_rc = FAIL; break; } TargetHandle_t mbaTrgt = mbaChip->GetChipHandle(); CenAddr addr; l_rc = getCenReadAddr( i_membChip, i_mbaPos, READ_NCE_ADDR, addr ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"getCenReadAddr() failed" ); break; } CenRank rank = addr.getRank(); if ( 0x20 > getChipLevel(i_membChip->GetChipHandle()) ) { // There is a bug in DD1.x where the value of MBSEVR cannot be // trusted. The workaround is too complicated for its value so // callout the rank instead. MemoryMru memmru ( mbaTrgt, rank, MemoryMruData::CALLOUT_RANK ); i_sc.service_data->SetCallout( memmru ); } else // DD2.0+ { // Get the failing symbol const char * reg_str = (0 == i_mbaPos) ? "MBA0_MBSEVR" : "MBA1_MBSEVR"; SCAN_COMM_REGISTER_CLASS * reg = i_membChip->getRegister(reg_str); l_rc = reg->Read(); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"Read() failed on %s", reg_str ); break; } uint8_t galois = reg->GetBitFieldJustified( 40, 8 ); uint8_t mask = reg->GetBitFieldJustified( 32, 8 ); CenSymbol symbol = CenSymbol::fromGalois( mbaTrgt, rank, galois, mask ); if ( !symbol.isValid() ) { PRDF_ERR( PRDF_FUNC"Failed to create symbol: galois=0x%02x " "mask=0x%02x", galois, mask ); break; } // Check if this symbol is on any of the spares. CenSymbol sp0, sp1, ecc; l_rc = mssGetSteerMux( mbaTrgt, rank, sp0, sp1, ecc ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"mssGetSteerMux() failed. 
HUID: 0x%08x " "rank: %d", getHuid(mbaTrgt), rank.getMaster() ); break; } if ( (sp0.isValid() && (sp0.getDram() == symbol.getDram())) || (sp1.isValid() && (sp1.getDram() == symbol.getDram())) ) { symbol.setDramSpared(); } if ( ecc.isValid() && (ecc.getDram() == symbol.getDram()) ) { symbol.setEccSpared(); } // Add the DIMM to the callout list MemoryMru memmru ( mbaTrgt, rank, symbol ); i_sc.service_data->SetCallout( memmru, MRU_MEDA ); // Add to CE table CenMbaDataBundle * mbadb = getMbaDataBundle( mbaChip ); uint32_t ceTableRc = mbadb->iv_ceTable.addEntry( addr, symbol ); bool doTps = ( CenMbaCeTable::NO_TH_REACHED != ceTableRc ); // Check MNFG thresholds, if needed. if ( mfgMode() ) { // Get the MNFG CE thresholds. uint16_t dramTh, hrTh, dimmTh; l_rc = getMnfgMemCeTh( mbaChip, rank, dramTh, hrTh, dimmTh ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"getMnfgMemCeTh() failed: rank=m%ds%d", rank.getMaster(), rank.getSlave() ); break; } // Get counts from CE table. uint32_t dramCount, hrCount, dimmCount; mbadb->iv_ceTable.getMnfgCounts( addr.getRank(), symbol, dramCount, hrCount, dimmCount ); if ( dramTh < dramCount ) { i_sc.service_data->AddSignatureList( mbaTrgt, PRDFSIG_MnfgDramCte ); i_sc.service_data->SetServiceCall(); doTps = true; } else if ( hrTh < hrCount ) { i_sc.service_data->AddSignatureList( mbaTrgt, PRDFSIG_MnfgHrCte ); i_sc.service_data->SetServiceCall(); doTps = true; } else if ( dimmTh < dimmCount ) { i_sc.service_data->AddSignatureList( mbaTrgt, PRDFSIG_MnfgDimmCte ); i_sc.service_data->SetServiceCall(); doTps = true; } else if ( 0 != (CenMbaCeTable::TABLE_FULL & ceTableRc) ) { i_sc.service_data->AddSignatureList( mbaTrgt, PRDFSIG_MnfgTableFull); // The table is full and no other threshold has been met. // We are in a state where we may never hit a MNFG // threshold. Callout all memory behind the MBA. Also, since // the counts are all over the place, there may be a problem // with the MBA. So call it out as well. 
MemoryMru all_mm ( mbaTrgt, rank, MemoryMruData::CALLOUT_ALL_MEM ); i_sc.service_data->SetCallout( all_mm, MRU_MEDA ); i_sc.service_data->SetCallout( mbaTrgt, MRU_MEDA ); i_sc.service_data->SetServiceCall(); } else if ( 0 != (CenMbaCeTable::ENTRY_TH_REACHED & ceTableRc) ) { i_sc.service_data->AddSignatureList( mbaTrgt, PRDFSIG_MnfgEntryCte ); // There is a single entry threshold and no other threshold // has been met. This is a potential flooding issue, so make // the DIMM callout predictive. i_sc.service_data->SetServiceCall(); } } // Initiate a TPS procedure, if needed. if ( doTps ) { // If a MNFG threshold has been reached (predictive callout), we // will still try to start TPS just in case MNFG disables the // termination policy. // Will not be able to do TPS during hostboot. Note that we will // still call handleTdEvent() so we can get the trace statement // indicating TPS was requested during Hostboot. l_rc = mbadb->iv_tdCtlr.handleTdEvent( i_sc, rank, CenMbaTdCtlrCommon::TPS_EVENT ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"handleTdEvent() failed: rank=m%ds%d", rank.getMaster(), rank.getSlave() ); break; } } } } while (0); // Add ECC capture data for FFDC. if ( NULL != mbaChip ) CenMbaCaptureData::addMemEccData( mbaChip, i_sc ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"Failed: i_membChip=0x%08x i_mbaPos=%d", i_membChip->GetId(), i_mbaPos ); CalloutUtil::defaultError( i_sc ); } return SUCCESS; // Intentionally return SUCCESS for this plugin #undef PRDF_FUNC }
/** * @brief MBSECCFIR[0:7] - Fetch Mark Placed Event (MPE). * @param i_membChip A Centaur chip. * @param i_sc The step code data struct. * @param i_mbaPos The MBA position. * @param i_rank The target rank. * @return SUCCESS */ int32_t AnalyzeFetchMpe( ExtensibleChip * i_membChip, STEP_CODE_DATA_STRUCT & i_sc, uint32_t i_mbaPos, uint8_t i_rank ) { #define PRDF_FUNC "[AnalyzeFetchMpe] " int32_t l_rc = SUCCESS; ExtensibleChip * mbaChip = NULL; do { CenMembufDataBundle * membdb = getMembufDataBundle( i_membChip ); mbaChip = membdb->getMbaChip( i_mbaPos ); if ( NULL == mbaChip ) { PRDF_ERR( PRDF_FUNC"getMbaChip() returned NULL" ); l_rc = FAIL; break; } CenMbaDataBundle * mbadb = getMbaDataBundle( mbaChip ); TargetHandle_t mbaTrgt = mbaChip->GetChipHandle(); CenAddr addr; l_rc = getCenReadAddr( i_membChip, i_mbaPos, READ_MPE_ADDR, addr ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"getCenReadAddr() failed" ); break; } // If the address does not match the rank that reported the attention, // there are multiple MPE attentions and the address was overwritten. // In this case, add an invalid dummy address to the UE table. if ( addr.getRank().getMaster() != i_rank ) { addr = CenAddr( i_rank, 0, 0xffffffff, 0xffffffff, 0xffffffff ); } mbadb->iv_ueTable.addEntry( UE_TABLE::FETCH_MPE, addr ); // Get the current mark in hardware. CenRank rank ( addr.getRank() ); CenMark mark; l_rc = mssGetMarkStore( mbaTrgt, rank, mark ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"mssGetMarkStore() failed"); break; } if ( !mark.getCM().isValid() ) { PRDF_ERR( PRDF_FUNC"FIR bit set but no valid chip mark" ); l_rc = FAIL; break; } // Callout the mark. CalloutUtil::calloutMark( mbaTrgt, rank, mark, i_sc ); // Tell TD controller to handle VCM event. l_rc = mbadb->iv_tdCtlr.handleTdEvent( i_sc, rank, CenMbaTdCtlrCommon::VCM_EVENT ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"handleTdEvent() failed." ); break; } } while (0); // Add ECC capture data for FFDC. 
if ( NULL != mbaChip ) CenMbaCaptureData::addMemEccData( mbaChip, i_sc ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"Failed: i_membChip=0x%08x i_mbaPos=%d i_rank=%d", i_membChip->GetId(), i_mbaPos, i_rank ); CalloutUtil::defaultError( i_sc ); } return SUCCESS; // Intentionally return SUCCESS for this plugin #undef PRDF_FUNC }
/**
 * @brief  Clears the secondary MCIFIR[10] and MBIFIR[10] bits for a DMI bus.
 *
 * In case of a spare deployed attention for a DMI bus, the secondary
 * MBIFIR[10] and MCIFIR[10] bits need to be cleared. Any other bus type is
 * ignored. If either end of the bus cannot be resolved, nothing is written
 * and SUCCESS is returned.
 *
 * @param  i_chip    For TYPE_MCS, the chip whose connected MCS child is
 *                   looked up at i_busPos. For TYPE_MEMBUF, the membuf chip
 *                   itself.
 * @param  i_busType The bus type (TYPE_MCS or TYPE_MEMBUF are acted on).
 * @param  i_busPos  The bus position (only used for TYPE_MCS).
 * @return Non-SUCCESS if a register write failed, SUCCESS otherwise.
 */
int32_t cleanupSecondaryFirBits( ExtensibleChip * i_chip,
                                 TYPE i_busType,
                                 uint32_t i_busPos )
{
    ExtensibleChip * mcsEnd  = NULL;
    ExtensibleChip * membEnd = NULL;

    if ( TYPE_MCS == i_busType )
    {
        TargetHandle_t mcsTrgt = getConnectedChild( i_chip->GetChipHandle(),
                                                    TYPE_MCS, i_busPos );
        if ( NULL == mcsTrgt ) return SUCCESS;

        mcsEnd = ( ExtensibleChip * )systemPtr->GetChip( mcsTrgt );
        if ( NULL == mcsEnd ) return SUCCESS;

        membEnd = getMcsDataBundle( mcsEnd )->getMembChip();
        if ( NULL == membEnd ) return SUCCESS;

        if ( NULL == membEnd->GetChipHandle() ) return SUCCESS;
    }
    else if ( TYPE_MEMBUF == i_busType )
    {
        if ( NULL == i_chip->GetChipHandle() ) return SUCCESS;

        mcsEnd = getMembufDataBundle( i_chip )->getMcsChip();
        if ( NULL == mcsEnd ) return SUCCESS;

        if ( NULL == mcsEnd->GetChipHandle() ) return SUCCESS;

        membEnd = i_chip;
    }
    else
    {
        // We only need to clean secondary FIR bits for DMI bus
        return SUCCESS;
    }

    SCAN_COMM_REGISTER_CLASS * mciFirAnd = mcsEnd->getRegister( "MCIFIR_AND" );
    SCAN_COMM_REGISTER_CLASS * mbiFirAnd = membEnd->getRegister( "MBIFIR_AND" );

    // Build the values with every bit set except bit 10.
    mciFirAnd->setAllBits();
    mciFirAnd->ClearBit( 10 );
    mbiFirAnd->setAllBits();
    mbiFirAnd->ClearBit( 10 );

    // Both writes are always attempted; the results are combined.
    int32_t writeRc = mciFirAnd->Write();
    writeRc |= mbiFirAnd->Write();
    if ( SUCCESS != writeRc )
    {
        PRDF_ERR( "[cleanupSecondaryFirBits] Write() failed on "
                  "MCIFIR/MBIFIR: MCS=0x%08x MEMB=0x%08x",
                  mcsEnd->GetId(), membEnd->GetId() );
    }

    return writeRc;
}