Example #1
0
/**
 * @brief  Runs the IPL CE statistics analysis for an MBA and commits a
 *         predictive error log if the analysis itself fails.
 * @param  i_mba         Target handle of the MBA to analyze.
 * @param  o_calloutMade Reset to false on entry, then handed to analyzeStats(),
 *                       which may update it.
 * @return The return code from analyzeStats() (SUCCESS or non-SUCCESS).
 */
int32_t analyzeIplCEStats( TargetHandle_t i_mba, bool &o_calloutMade )
{
    // NOTE: keep a space between string literals and PRDF_FUNC. Without it,
    // C++11 and later parse the macro name as a user-defined-literal suffix.
    #define PRDF_FUNC "PRDF::analyzeIplCEStats"

    PRDF_ENTER( PRDF_FUNC "(0x%08x)", getHuid(i_mba) );

    // will unlock when going out of scope
    PRDF_SYSTEM_SCOPELOCK;

    int32_t o_rc = SUCCESS;
    o_calloutMade = false;

    // NOTE(review): neither the chip returned by GetChip() nor the data bundle
    // is checked for NULL before use -- confirm callers always pass a valid MBA.
    ExtensibleChip * mbaChip = (ExtensibleChip *)systemPtr->GetChip( i_mba );
    CenMbaDataBundle * mbadb = getMbaDataBundle( mbaChip );

    o_rc = mbadb->getIplCeStats()->analyzeStats( o_calloutMade );

    if ( SUCCESS != o_rc )
    {
        PRDF_ERR( "[" PRDF_FUNC "] analyzeStats() failed");

        // Get user data
        uint64_t ud12 = PRDF_GET_UINT64_FROM_UINT32( getHuid(i_mba),      0 );
        uint64_t ud34 = PRDF_GET_UINT64_FROM_UINT32( PRDFSIG_MnfgIplFail, 0 );

        // Create error log
        errlHndl_t errl = new ERRORLOG::ErrlEntry(
                                ERRORLOG::ERRL_SEV_PREDICTIVE, // severity
                                PRDF_MNFG_IPL_CE_ANALYSIS,     // module ID
                                PRDF_DETECTED_FAIL_SOFTWARE,   // reason code
                                ud12, ud34 );                  // user data 1-4

        // Add 2nd level support
        errl->addProcedureCallout( EPUB_PRC_LVL_SUPP, SRCI_PRIORITY_HIGH );

        // Add traces
        errl->collectTrace( PRDF_COMP_NAME, 512 );

        // Commit the error log. The handle is passed to errlCommit() and is not
        // used again in this function.
        ERRORLOG::errlCommit( errl, PRDF_COMP_ID );
    }

    PRDF_EXIT( PRDF_FUNC "(0x%08x), o_calloutMade:%u",
               getHuid(i_mba), o_calloutMade );

    return o_rc;

    #undef PRDF_FUNC
}
Example #2
0
/**
 * @brief  Plugin that masks the side effects of an RCD parity error.
 *
 * Sets bit 2 of MBAFIR_MASK_OR and bits 2 and 17 of MBACALFIR_MASK_OR on the
 * MBA chip, and bit 4 of MBSFIR_MASK_OR on the membuf chip obtained from the
 * MBA's data bundle, then writes all three registers. Failures are only
 * traced; they are never reported to the caller.
 *
 * @param  i_mbaChip A Centaur MBA chip.
 * @param  i_sc      The step code data struct (not referenced here).
 * @return Always SUCCESS.
 */
int32_t maskRcdParitySideEffects( ExtensibleChip * i_mbaChip,
                                    STEP_CODE_DATA_STRUCT & i_sc )
{
    #define PRDF_FUNC "[maskRcdParitySideEffects] "

    // The membuf chip is reached through the MBA's data bundle.
    ExtensibleChip * const membuf =
                            getMbaDataBundle( i_mbaChip )->getMembChip();

    if ( NULL != membuf )
    {
        // Look up the three mask registers.
        SCAN_COMM_REGISTER_CLASS * const mbsMask =
                            membuf->getRegister("MBSFIR_MASK_OR");
        SCAN_COMM_REGISTER_CLASS * const calMask =
                            i_mbaChip->getRegister("MBACALFIR_MASK_OR");
        SCAN_COMM_REGISTER_CLASS * const mbaMask =
                            i_mbaChip->getRegister("MBAFIR_MASK_OR");

        // Select the bits to mask.
        mbaMask->SetBit(2);
        calMask->SetBit(2);
        calMask->SetBit(17);
        mbsMask->SetBit(4);

        // All three writes are always attempted; their results are OR'd so a
        // single check covers them.
        int32_t writeRc = mbaMask->Write();
        writeRc        |= calMask->Write();
        writeRc        |= mbsMask->Write();

        if ( SUCCESS != writeRc )
        {
            PRDF_ERR(PRDF_FUNC "MBAFIR_MASK_OR/MBACALFIR_MASK_OR/MBSFIR_MASK_OR"
                    " write failed for 0x%08x", i_mbaChip->GetId());
        }
    }
    else
    {
        PRDF_ERR(PRDF_FUNC "getMembChip() failed");
    }

    return SUCCESS;
    #undef PRDF_FUNC
}
/**
 * @brief  Populates the controller's cached MBA information from iv_mbaChip.
 *
 * In order: sets iv_mbaTrgt (and checks it is TYPE_MBA), iv_membChip,
 * iv_mbaPos (and checks it is below MAX_MBA_PER_MEMBUF) and iv_x4Dimm.
 * Initialization stops at the first failed step; members assigned before that
 * point keep their new values.
 *
 * @return SUCCESS if every step passed, FAIL otherwise.
 */
int32_t CenMbaTdCtlrCommon::initialize()
{
    #define PRDF_FUNC "[CenMbaTdCtlrCommon::initialize] "

    // MBA target handle; it must really be an MBA.
    iv_mbaTrgt = iv_mbaChip->GetChipHandle();
    if ( getTargetType(iv_mbaTrgt) != TYPE_MBA )
    {
        PRDF_ERR( PRDF_FUNC "iv_mbaChip is not TYPE_MBA" );
        return FAIL;
    }

    // Membuf chip, taken from the MBA's data bundle.
    iv_membChip = getMbaDataBundle( iv_mbaChip )->getMembChip();
    if ( NULL == iv_membChip )
    {
        PRDF_ERR( PRDF_FUNC "getMembChip() failed" );
        return FAIL;
    }

    // Position of the MBA; must be below MAX_MBA_PER_MEMBUF.
    iv_mbaPos = getTargetPosition( iv_mbaTrgt );
    if ( iv_mbaPos >= MAX_MBA_PER_MEMBUF )
    {
        PRDF_ERR( PRDF_FUNC "iv_mbaPos=%d is invalid", iv_mbaPos );
        return FAIL;
    }

    // DRAM width flag.
    iv_x4Dimm = isDramWidthX4(iv_mbaTrgt);

    return SUCCESS;

    #undef PRDF_FUNC
}
Example #4
0
/**
 * @brief  Reads the DRAM size field for an MBA from the membuf's MBAXCR
 *         register.
 * @param  i_mbaChip An MBA chip.
 * @param  o_size    Set to SIZE_2GB on entry; overwritten with bits 6:7 of the
 *                   register (a 2-bit field) when the read succeeds.
 * @return SUCCESS, FAIL if the membuf chip is unavailable, or the non-SUCCESS
 *         code returned by the register read.
 */
int32_t getDramSize( ExtensibleChip *i_mbaChip, uint8_t & o_size )
{
    #define PRDF_FUNC "[MemUtils::getDramSize] "

    // Value reported to the caller on any failure path.
    o_size = SIZE_2GB;

    TargetHandle_t trgt = i_mbaChip->GetChipHandle();

    ExtensibleChip * membuf = getMbaDataBundle( i_mbaChip )->getMembChip();
    if ( NULL == membuf )
    {
        PRDF_ERR( PRDF_FUNC "getMembChip() failed: MBA=0x%08x",
                  getHuid(trgt) );
        return FAIL;
    }

    // Position 0 uses the MBA0 register; any other position uses MBA1.
    const uint32_t mbaPos = getTargetPosition(trgt);
    const char * regName = "MBA1_MBAXCR";
    if ( 0 == mbaPos )
    {
        regName = "MBA0_MBAXCR";
    }

    SCAN_COMM_REGISTER_CLASS * xcr = membuf->getRegister( regName );

    const int32_t readRc = xcr->Read();
    if ( SUCCESS != readRc )
    {
        PRDF_ERR( PRDF_FUNC "Read() failed on %s. Target=0x%08x",
                  regName, getHuid(trgt) );
        return readRc;
    }

    o_size = xcr->GetBitFieldJustified( 6, 2 );

    return readRc;

    #undef PRDF_FUNC
}
Example #5
0
/**
 * @brief  Collects per-symbol CE counts for a rank from the membuf's CE
 *         statistics registers and picks a candidate chip mark.
 *
 * For every symbol with a non-zero count, the count is added to a per-DRAM
 * total. Symbols whose count is at or above i_thr are appended to
 * o_maintStats, flagged as DRAM-spared or ECC-spared when their DRAM matches a
 * valid spare reported by mssGetSteerMux(). The list is then sorted with
 * sortSymDataCount. Finally, among DRAMs with at least one (x4) or two
 * (otherwise) symbols at or above threshold, the DRAM with the highest total
 * count is used to build o_chipMark.
 *
 * @param  i_mbaChip    An MBA chip.
 * @param  i_rank       The rank to examine.
 * @param  o_maintStats Receives the symbols at or above threshold. Entries are
 *                      appended; this function does not clear the list first.
 * @param  o_chipMark   Reset to a default-constructed CenSymbol on entry; set
 *                      only if a qualifying DRAM is found.
 * @param  i_thr        Per-symbol count threshold. Must be non-zero.
 * @return SUCCESS, FAIL on invalid input or lookup failure, or the non-SUCCESS
 *         code from mssGetSteerMux() or a register read.
 */
int32_t collectCeStats( ExtensibleChip * i_mbaChip, const CenRank & i_rank,
                        MaintSymbols & o_maintStats, CenSymbol & o_chipMark,
                        uint8_t i_thr )
{
    #define PRDF_FUNC "[MemUtils::collectCeStats] "

    int32_t o_rc = SUCCESS;

    o_chipMark = CenSymbol(); // Initially invalid.

    do
    {
        if ( 0 == i_thr ) // Must be non-zero
        {
            PRDF_ERR( PRDF_FUNC "i_thr %d is invalid", i_thr );
            o_rc = FAIL; break;
        }

        // The CE statistics registers are read from the membuf chip, which is
        // obtained from the MBA's data bundle.
        TargetHandle_t mbaTrgt = i_mbaChip->GetChipHandle();
        CenMbaDataBundle * mbadb = getMbaDataBundle( i_mbaChip );
        ExtensibleChip * membufChip = mbadb->getMembChip();
        if ( NULL == membufChip )
        {
            PRDF_ERR( PRDF_FUNC "getMembChip() failed" );
            o_rc = FAIL; break;
        }

        // mbaPos is used below as the first index into mbsCeStatReg, so it is
        // range checked here.
        uint8_t mbaPos = getTargetPosition( mbaTrgt );
        if ( MAX_MBA_PER_MEMBUF <= mbaPos )
        {
            PRDF_ERR( PRDF_FUNC "mbaPos %d is invalid", mbaPos );
            o_rc = FAIL; break;
        }

        const bool isX4 = isDramWidthX4(mbaTrgt);

        // Get the current spares on this rank.
        CenSymbol sp0, sp1, ecc;
        o_rc = mssGetSteerMux( mbaTrgt, i_rank, sp0, sp1, ecc );
        if ( SUCCESS != o_rc )
        {
            PRDF_ERR( PRDF_FUNC "mssGetSteerMux() failed." );
            break;
        }

        // Use this map to keep track of the total counts per DRAM.
        DramCountMap dramCounts;

        const char * reg_str = NULL;
        SCAN_COMM_REGISTER_CLASS * reg = NULL;

        // Walk every CE statistics register for this MBA position.
        for ( uint8_t regIdx = 0; regIdx < CE_REGS_PER_MBA; regIdx++ )
        {
            reg_str = mbsCeStatReg[mbaPos][regIdx];
            reg     = membufChip->getRegister( reg_str );

            o_rc = reg->Read();
            if ( SUCCESS != o_rc )
            {
                PRDF_ERR( PRDF_FUNC "Read() failed on %s", reg_str );
                break;
            }

            // First symbol number covered by this register.
            uint8_t baseSymbol = SYMBOLS_PER_CE_REG * regIdx;

            // Each register is read as SYMBOLS_PER_CE_REG consecutive 8-bit
            // count fields, one per symbol.
            for ( uint8_t i = 0; i < SYMBOLS_PER_CE_REG; i++ )
            {
                uint8_t count = reg->GetBitFieldJustified( (i*8), 8 );

                if ( 0 == count ) continue; // nothing to do

                uint8_t sym  = baseSymbol + i;
                uint8_t dram = symbol2Dram( sym, isX4 );

                // Keep track of the total DRAM counts.
                dramCounts[dram].totalCount += count;

                // Add any symbols that have exceeded threshold to the list.
                if ( i_thr <= count )
                {
                    // Keep track of the total number of symbols per DRAM that
                    // have exceeded threshold.
                    // NOTE: this is incremented before the symbol is validated
                    // below.
                    dramCounts[dram].symbolCount++;

                    SymbolData symData;
                    symData.symbol = CenSymbol::fromSymbol( mbaTrgt, i_rank,
                                            sym, CEN_SYMBOL::BOTH_SYMBOL_DQS );
                    if ( !symData.symbol.isValid() )
                    {
                        PRDF_ERR( PRDF_FUNC "CenSymbol() failed: symbol=%d",
                                  sym );
                        o_rc = FAIL;
                        break;
                    }
                    else
                    {
                        // Check if this symbol is on any of the spares.
                        if ( ( sp0.isValid() &&
                               (sp0.getDram() == symData.symbol.getDram()) ) ||
                             ( sp1.isValid() &&
                               (sp1.getDram() == symData.symbol.getDram()) ) )
                        {
                            symData.symbol.setDramSpared();
                        }
                        if ( ecc.isValid() &&
                             (ecc.getDram() == symData.symbol.getDram()) )
                        {
                            symData.symbol.setEccSpared();
                        }

                        // Add the symbol to the list.
                        symData.count = count;
                        o_maintStats.push_back( symData );
                    }
                }
            }
            // Propagate a failure from the inner (per-symbol) loop.
            if ( SUCCESS != o_rc ) break;
        }
        // Propagate a failure from the outer (per-register) loop.
        if ( SUCCESS != o_rc ) break;

        if ( o_maintStats.empty() ) break; // no need to continue

        // Sort the list of symbols.
        std::sort( o_maintStats.begin(), o_maintStats.end(), sortSymDataCount );

        // Get the DRAM with the highest count.
        uint32_t highestDram  = 0;
        uint32_t highestCount = 0;
        // Minimum number of over-threshold symbols a DRAM needs in order to be
        // considered: one for x4, two otherwise.
        const uint32_t symbolTH = isX4 ? 1 : 2;
        for ( DramCountMap::iterator it = dramCounts.begin();
              it != dramCounts.end(); ++it )
        {
            if ( (symbolTH     <= it->second.symbolCount) &&
                 (highestCount <  it->second.totalCount ) )
            {
                highestDram  = it->first;
                highestCount = it->second.totalCount;
            }
        }

        // A non-zero highestCount means at least one DRAM qualified.
        if ( 0 != highestCount )
        {
            uint8_t sym = dram2Symbol( highestDram, isX4 );
            o_chipMark  = CenSymbol::fromSymbol( mbaTrgt, i_rank, sym );

            // Check if this symbol is on any of the spares.
            if ( ( sp0.isValid() && (sp0.getDram() == o_chipMark.getDram()) ) ||
                 ( sp1.isValid() && (sp1.getDram() == o_chipMark.getDram()) ) )
            {
                o_chipMark.setDramSpared();
            }
            if ( ecc.isValid() && (ecc.getDram() == o_chipMark.getDram()) )
            {
                o_chipMark.setEccSpared();
            }
        }

    } while(0);

    // Single summary trace for every failure path above.
    if ( SUCCESS != o_rc )
    {
        PRDF_ERR( PRDF_FUNC "Failed: i_mbaChip=0x%08x i_rank=m%ds%d i_thr=%d",
                  i_mbaChip->GetId(), i_rank.getMaster(), i_rank.getSlave(),
                  i_thr );
    }

    return o_rc;

    #undef PRDF_FUNC
}
Example #6
0
/**
 * @brief  Analyzes the result of TPS phase 2 for the rank in iv_rank.
 *
 * Requires iv_tdState to be TPS_PHASE_2. Calls calloutHardCes() on the IPL CE
 * statistics for iv_rank, then queries the ECC error mask and dispatches:
 *  - UE or RETRY_CTE set: handleUE()
 *  - otherwise MPE set:   handleMPE()
 *  - otherwise:           the rank is called out, the PRDFSIG_EndTpsPhase2
 *                         signature is added, and iv_tdState becomes NO_OP.
 *
 * @param  io_sc      The step code data struct.
 * @param  i_stopAddr Not referenced by this function.
 * @param  i_endAddr  Not referenced by this function.
 * @return SUCCESS, FAIL if the state machine is not in TPS_PHASE_2, or the
 *         non-SUCCESS code of the first helper that failed.
 */
int32_t CenMbaTdCtlr::analyzeTpsPhase2( STEP_CODE_DATA_STRUCT & io_sc,
                                        const CenAddr & i_stopAddr,
                                        const CenAddr & i_endAddr )
{
    // NOTE: keep a space between PRDF_FUNC and the following string literal.
    // Without it, C++11 and later parse the literal/identifier pair as a
    // user-defined literal.
    #define PRDF_FUNC "[CenMbaTdCtlr::analyzeTpsPhase2] "

    int32_t o_rc = SUCCESS;

    do
    {
        if ( TPS_PHASE_2 != iv_tdState )
        {
            PRDF_ERR( PRDF_FUNC "Invalid state machine configuration" );
            o_rc = FAIL; break;
        }

        CenMbaDataBundle * mbadb = getMbaDataBundle( iv_mbaChip );

        o_rc = mbadb->getIplCeStats()->calloutHardCes( iv_rank );
        if ( SUCCESS != o_rc )
        {
            PRDF_ERR( PRDF_FUNC "calloutHardCes() failed");
            break;
        }

        // Get error condition which caused command to stop
        uint16_t eccErrorMask = NO_ERROR;
        o_rc = checkEccErrors( eccErrorMask, io_sc );
        if ( SUCCESS != o_rc )
        {
            PRDF_ERR( PRDF_FUNC "checkEccErrors() failed" );
            break;
        }

        if ( ( eccErrorMask & UE ) || ( eccErrorMask & RETRY_CTE ))
        {
            // Handle UE. Highest priority
            o_rc = handleUE( io_sc );
            if ( SUCCESS != o_rc )
            {
                PRDF_ERR( PRDF_FUNC "handleUE() failed" );
                break;
            }
        }
        else if ( eccErrorMask & MPE )
        {
            o_rc = handleMPE( io_sc );
            if ( SUCCESS != o_rc )
            {
                PRDF_ERR( PRDF_FUNC "handleMPE() failed");
                break;
            }
        }
        else
        {
            // No error found so add rank to callout list, just in case.
            MemoryMru memmru (iv_mbaTrgt, iv_rank, MemoryMruData::CALLOUT_RANK);
            io_sc.service_data->SetCallout( memmru );

            io_sc.service_data->AddSignatureList( iv_mbaTrgt,
                                                  PRDFSIG_EndTpsPhase2 );
            iv_tdState = NO_OP;
        }

    } while (0);

    return o_rc;

    #undef PRDF_FUNC
}
Example #7
0
/**
 * @brief  Handles a maintenance UE on the rank in iv_rank.
 *
 * Aborts the TD procedure (iv_tdState = NO_OP), sets the PRDFSIG_MaintUE
 * signature and the service call flag, cleans up the previous maintenance
 * command, runs mssIplUeIsolation() to find failing bits, adds the resulting
 * bitmap to the capture data, and calls out the DIMMs on every port select
 * that reported bad DQs. If no port reported bad DQs, all DIMMs connected to
 * the rank are called out instead. When MFG CE checking is enabled, CE
 * analysis is banned for each rank/port that is called out here.
 *
 * @param  io_sc The step code data struct.
 * @return SUCCESS, FAIL if no connected DIMMs could be found, or the
 *         non-SUCCESS code of the first helper that failed.
 */
int32_t CenMbaTdCtlr::handleUE( STEP_CODE_DATA_STRUCT & io_sc )
{
    // NOTE: keep a space between PRDF_FUNC and the following string literal.
    // Without it, C++11 and later parse the literal/identifier pair as a
    // user-defined literal.
    #define PRDF_FUNC "[CenMbaTdCtlr::handleUE] "

    using namespace CalloutUtil;

    int32_t o_rc = SUCCESS;

    iv_tdState = NO_OP; // Abort the TD procedure.

    setTdSignature( io_sc, PRDFSIG_MaintUE );
    io_sc.service_data->SetServiceCall();

    CenMbaDataBundle * mbadb = getMbaDataBundle( iv_mbaChip );

    do
    {
        // Clean up the maintenance command. This is needed just in case the UE
        // isolation procedure is modified to use maintenance commands.
        o_rc = cleanupPrevCmd();
        if ( SUCCESS != o_rc )
        {
            PRDF_ERR( PRDF_FUNC "cleanupPrevCmd() failed" );
            break;
        }

        // Look for all failing bits on this rank.
        CenDqBitmap bitmap;
        o_rc = mssIplUeIsolation( iv_mbaTrgt, iv_rank, bitmap );
        if ( SUCCESS != o_rc )
        {
            PRDF_ERR( PRDF_FUNC "mssIplUeIsolation() failed" );
            break;
        }

        // Add UE data to capture data.
        bitmap.getCaptureData( io_sc.service_data->GetCaptureData() );

        // Callout the failing DIMMs.
        TargetHandleList callouts;
        for ( int32_t ps = 0; ps < PORT_SLCT_PER_MBA; ps++ )
        {
            bool badDqs = false;
            o_rc = bitmap.badDqs( ps, badDqs );
            if ( SUCCESS != o_rc )
            {
                PRDF_ERR( PRDF_FUNC "badDqs(%d) failed", ps );
                break;
            }

            if ( !badDqs ) continue; // nothing to do.

            TargetHandleList dimms = getConnectedDimms(iv_mbaTrgt, iv_rank, ps);
            if ( 0 == dimms.size() )
            {
                PRDF_ERR( PRDF_FUNC "getConnectedDimms(%d) failed", ps );
                o_rc = FAIL; break;
            }

            callouts.insert( callouts.end(), dimms.begin(), dimms.end() );

            if ( isMfgCeCheckingEnabled() )
            {
                // As we are doing a callout for the UE, we don't need to do a
                // callout during CE analysis for this rank on the given port.
                mbadb->getIplCeStats()->banAnalysis( iv_rank, ps );
            }
        }
        // Propagate a failure from the port select loop.
        if ( SUCCESS != o_rc ) break;

        if ( 0 == callouts.size() )
        {
            // It is possible the scrub counters have rolled over to zero due to
            // a known DD1.0 hardware bug. In this case, the best we can do is
            // callout both DIMMs, because at minimum we know there was a UE, we
            // just don't know where.
            // NOTE: If this condition happens because of a DD2.0+ bug, the
            //       mssIplUeIsolation procedure will callout the Centaur.
            callouts = getConnectedDimms( iv_mbaTrgt, iv_rank );
            if ( 0 == callouts.size() )
            {
                PRDF_ERR( PRDF_FUNC "getConnectedDimms() failed" );
                o_rc = FAIL; break;
            }

            if ( isMfgCeCheckingEnabled() )
            {
                // As we are doing a callout for the UE, we don't need to do a
                // callout during CE analysis for this rank on both ports.
                mbadb->getIplCeStats()->banAnalysis( iv_rank);
            }
        }

        // Callout all DIMMs in the list.
        for ( TargetHandleList::iterator i = callouts.begin();
              i != callouts.end(); ++i )
        {
            io_sc.service_data->SetCallout( *i, MRU_HIGH );
        }

    } while(0);

    return o_rc;

    #undef PRDF_FUNC
}