Example #1
0
/**
 * @brief  Runs the IPL CE statistics analysis for the given MBA and commits
 *         a predictive error log if that analysis reports a failure.
 * @param  i_mba         Target handle of the MBA to analyze.
 * @param  o_calloutMade Reset to false on entry, then handed to
 *                       analyzeStats(), which may update it.
 * @return The return code from analyzeStats(): SUCCESS, or a non-SUCCESS
 *         value (in which case an error log has been committed here).
 */
int32_t analyzeIplCEStats( TargetHandle_t i_mba, bool &o_calloutMade )
{
    // NOTE: always keep whitespace between a string literal and PRDF_FUNC.
    // In C++11 and later, "..."PRDF_FUNC is lexed as a user-defined-literal
    // suffix instead of a macro expansion.
    #define PRDF_FUNC "PRDF::analyzeIplCEStats"

    PRDF_ENTER( PRDF_FUNC "(0x%08x)", getHuid(i_mba) );

    // will unlock when going out of scope
    PRDF_SYSTEM_SCOPELOCK;

    int32_t o_rc = SUCCESS;
    o_calloutMade = false;

    // NOTE(review): mbaChip is used without a NULL check -- confirm that
    // GetChip() cannot fail for a target passed to this function.
    ExtensibleChip * mbaChip = (ExtensibleChip *)systemPtr->GetChip( i_mba );
    CenMbaDataBundle * mbadb = getMbaDataBundle( mbaChip );

    o_rc = mbadb->getIplCeStats()->analyzeStats( o_calloutMade );

    if ( SUCCESS != o_rc )
    {
        PRDF_ERR( "[" PRDF_FUNC "] analyzeStats() failed" );

        // Get user data: the MBA HUID and the failure signature each occupy
        // the first argument of the macro, with 0 as the second.
        uint64_t ud12 = PRDF_GET_UINT64_FROM_UINT32( getHuid(i_mba),      0 );
        uint64_t ud34 = PRDF_GET_UINT64_FROM_UINT32( PRDFSIG_MnfgIplFail, 0 );

        // Create error log
        errlHndl_t errl = new ERRORLOG::ErrlEntry(
                                ERRORLOG::ERRL_SEV_PREDICTIVE, // severity
                                PRDF_MNFG_IPL_CE_ANALYSIS,     // module ID
                                PRDF_DETECTED_FAIL_SOFTWARE,   // reason code
                                ud12, ud34 );                  // user data 1-4

        // Add 2nd level support
        errl->addProcedureCallout( EPUB_PRC_LVL_SUPP, SRCI_PRIORITY_HIGH );

        // Add traces
        errl->collectTrace( PRDF_COMP_NAME, 512 );

        // Commit the error log. The handle is passed to errlCommit() and is
        // not touched again in this function.
        ERRORLOG::errlCommit( errl, PRDF_COMP_ID );
    }

    PRDF_EXIT( PRDF_FUNC "(0x%08x), o_calloutMade:%u",
               getHuid(i_mba), o_calloutMade );

    return o_rc;

    #undef PRDF_FUNC
}
Example #2
0
/**
 * @brief  Analysis step for TPS phase 2 of the TD state machine.
 *
 * Calls out hard CEs for the current rank, queries the ECC error mask and
 * then dispatches to the UE or MPE handler. When neither is flagged, the
 * rank itself is called out and the state machine is returned to NO_OP.
 *
 * @param  io_sc      Step code data struct (service data is updated).
 * @param  i_stopAddr Not referenced in this function body.
 * @param  i_endAddr  Not referenced in this function body.
 * @return SUCCESS, FAIL if the state machine is not in TPS_PHASE_2, or the
 *         non-SUCCESS code returned by the first helper that failed.
 */
int32_t CenMbaTdCtlr::analyzeTpsPhase2( STEP_CODE_DATA_STRUCT & io_sc,
                                        const CenAddr & i_stopAddr,
                                        const CenAddr & i_endAddr )
{
    #define PRDF_FUNC "[CenMbaTdCtlr::analyzeTpsPhase2] "

    // Only valid while the controller is in TPS phase 2.
    if ( iv_tdState != TPS_PHASE_2 )
    {
        PRDF_ERR( PRDF_FUNC "Invalid state machine configuration" );
        return FAIL;
    }

    CenMbaDataBundle * dataBundle = getMbaDataBundle( iv_mbaChip );

    int32_t rc = dataBundle->getIplCeStats()->calloutHardCes( iv_rank );
    if ( rc != SUCCESS )
    {
        PRDF_ERR( PRDF_FUNC "calloutHardCes() failed" );
        return rc;
    }

    // Find out which error condition stopped the command.
    uint16_t eccErrors = NO_ERROR;
    rc = checkEccErrors( eccErrors, io_sc );
    if ( rc != SUCCESS )
    {
        PRDF_ERR( PRDF_FUNC "checkEccErrors() failed" );
        return rc;
    }

    const bool ueOrRetryCte = ( eccErrors & UE ) || ( eccErrors & RETRY_CTE );

    if ( ueOrRetryCte )
    {
        // UE (or retry CTE) takes precedence over everything else.
        rc = handleUE( io_sc );
        if ( rc != SUCCESS )
        {
            PRDF_ERR( PRDF_FUNC "handleUE() failed" );
        }
    }
    else if ( eccErrors & MPE )
    {
        rc = handleMPE( io_sc );
        if ( rc != SUCCESS )
        {
            PRDF_ERR( PRDF_FUNC "handleMPE() failed" );
        }
    }
    else
    {
        // Nothing was flagged; call out the rank anyway as a precaution,
        // record the end-of-phase signature and stop the procedure.
        MemoryMru rankMru( iv_mbaTrgt, iv_rank, MemoryMruData::CALLOUT_RANK );
        io_sc.service_data->SetCallout( rankMru );

        io_sc.service_data->AddSignatureList( iv_mbaTrgt,
                                              PRDFSIG_EndTpsPhase2 );
        iv_tdState = NO_OP;
    }

    return rc;

    #undef PRDF_FUNC
}
Example #3
0
/**
 * @brief  Handles a maintenance UE on the current rank.
 *
 * Aborts the TD procedure, sets the MaintUE signature, requests a service
 * call, runs the UE isolation procedure and calls out every DIMM on a port
 * that reports bad DQs. If no port reports bad DQs, all DIMMs connected to
 * the rank are called out instead.
 *
 * @param  io_sc Step code data struct (signature, capture data and callouts
 *               are updated through its service data).
 * @return SUCCESS, FAIL if no connected DIMMs could be found, or the
 *         non-SUCCESS code returned by the first helper that failed.
 */
int32_t CenMbaTdCtlr::handleUE( STEP_CODE_DATA_STRUCT & io_sc )
{
    #define PRDF_FUNC "[CenMbaTdCtlr::handleUE] "

    using namespace CalloutUtil;

    // The TD procedure is aborted up front, before any of the steps below.
    iv_tdState = NO_OP;

    setTdSignature( io_sc, PRDFSIG_MaintUE );
    io_sc.service_data->SetServiceCall();

    CenMbaDataBundle * dataBundle = getMbaDataBundle( iv_mbaChip );

    // Clean up the maintenance command, in case the UE isolation procedure
    // is ever changed to use maintenance commands itself.
    int32_t rc = cleanupPrevCmd();
    if ( rc != SUCCESS )
    {
        PRDF_ERR( PRDF_FUNC "cleanupPrevCmd() failed" );
        return rc;
    }

    // Collect every failing bit on this rank.
    CenDqBitmap dqBitmap;
    rc = mssIplUeIsolation( iv_mbaTrgt, iv_rank, dqBitmap );
    if ( rc != SUCCESS )
    {
        PRDF_ERR( PRDF_FUNC "mssIplUeIsolation() failed" );
        return rc;
    }

    // Record the UE data in the capture data.
    dqBitmap.getCaptureData( io_sc.service_data->GetCaptureData() );

    // Build the list of DIMMs to call out, one port select at a time.
    TargetHandleList dimmList;
    for ( int32_t port = 0; port < PORT_SLCT_PER_MBA; ++port )
    {
        bool hasBadDqs = false;
        rc = dqBitmap.badDqs( port, hasBadDqs );
        if ( rc != SUCCESS )
        {
            PRDF_ERR( PRDF_FUNC "badDqs(%d) failed", port );
            return rc;
        }

        if ( hasBadDqs )
        {
            TargetHandleList portDimms =
                            getConnectedDimms( iv_mbaTrgt, iv_rank, port );
            if ( portDimms.size() == 0 )
            {
                PRDF_ERR( PRDF_FUNC "getConnectedDimms(%d) failed", port );
                return FAIL;
            }

            dimmList.insert( dimmList.end(),
                             portDimms.begin(), portDimms.end() );

            if ( isMfgCeCheckingEnabled() )
            {
                // This rank/port is already being called out for the UE, so
                // a later CE callout for it would be redundant.
                dataBundle->getIplCeStats()->banAnalysis( iv_rank, port );
            }
        }
    }

    if ( dimmList.size() == 0 )
    {
        // The scrub counters may have rolled over to zero because of a known
        // DD1.0 hardware bug. All that is known then is that a UE occurred,
        // not where, so the best option is to call out both DIMMs.
        // NOTE: If this happens because of a DD2.0+ bug, the
        //       mssIplUeIsolation procedure will call out the Centaur.
        dimmList = getConnectedDimms( iv_mbaTrgt, iv_rank );
        if ( dimmList.size() == 0 )
        {
            PRDF_ERR( PRDF_FUNC "getConnectedDimms() failed" );
            return FAIL;
        }

        if ( isMfgCeCheckingEnabled() )
        {
            // The whole rank is being called out for the UE, so skip the CE
            // callout for this rank on both ports.
            dataBundle->getIplCeStats()->banAnalysis( iv_rank );
        }
    }

    // Call out every DIMM that was collected.
    TargetHandleList::iterator dimm = dimmList.begin();
    while ( dimm != dimmList.end() )
    {
        io_sc.service_data->SetCallout( *dimm, MRU_HIGH );
        ++dimm;
    }

    return rc;

    #undef PRDF_FUNC
}