// Function Specification
//
// Name: errlTestSetErrlSevToInfo
//
// Description: Unit test for setErrlSevToInfo()
//
// End Function Specification
uint32_t errlTestSetErrlSevToInfo()
{
    uint32_t l_rc = 0;
    ERRL_DBG("START");

    do
    {
        errlHndl_t l_handle = NULL;

        /****************************************************/
        // Check setErrlSevToInfo
        // Create ERRL_SEV_PREDICTIVE log
        l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                               ERRL_SEV_PREDICTIVE, g_trac_inf, 128, 0x1, 0x2);
        CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc);

        // Add callout
        addCalloutToErrl(l_handle, ERRL_CALLOUT_TYPE_HUID, 0x00, ERRL_CALLOUT_PRIORITY_LOW);
        CHECK_CONDITION( l_handle->iv_numCallouts == 1, l_rc);

        // Call setErrlSevToInfo. Callouts within log should be cleared and
        // iv_severity should be set to ERRL_SEV_INFORMATIONAL
        setErrlSevToInfo(l_handle);
        CHECK_CONDITION( (l_handle->iv_numCallouts == 0) &&
                         (l_handle->iv_severity == ERRL_SEV_INFORMATIONAL), l_rc);

        deleteErrl( &l_handle );
        ppdumpslot();

        /****************************************************/
        // Check setErrlSevToInfo after errl is committed
        // Create log
        l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                               ERRL_SEV_PREDICTIVE, g_trac_inf, 128, 0x1, 0x2);
        CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc);
        errlHndl_t l_log = l_handle;

        // Add callout
        addCalloutToErrl(l_handle, ERRL_CALLOUT_TYPE_HUID, 0x00, ERRL_CALLOUT_PRIORITY_LOW);
        CHECK_CONDITION( l_handle->iv_numCallouts == 1, l_rc);

        // Commit log and call setErrlSevToInfo. setErrlSevToInfo will do
        // nothing on a committed log, so the single callout and PREDICTIVE
        // severity should remain
        commitErrl( &l_handle );
        setErrlSevToInfo(l_handle);
        CHECK_CONDITION( (l_log->iv_numCallouts == 1) &&
                         (l_log->iv_severity == ERRL_SEV_PREDICTIVE), l_rc);

        deleteErrl(&l_log);
        ERRL_DBG("END \n");
    } while(0);

    return l_rc;
}
// Called after a failure to read a DIMM temperature.  The failure will be
// counted, and if the threshold is reached, an error will be created with
// the DIMM as a callout and a flag will be set to trigger an I2C reset
void mark_dimm_failed()
{
    const uint8_t port = G_dimm_sm_args.i2cPort;
    const uint8_t dimm = G_dimm_sm_args.dimm;
    INTR_TRAC_ERR("mark_dimm_failed: DIMM%04X failed in state/rc/count=0x%06X "
                  "(ffdc 0x%08X%08X, completion_state 0x%02X)",
                  DIMM_AND_PORT,
                  (G_dimm_sm_args.state << 16) | (G_dimm_sm_args.error.rc << 8) | G_dimm[port][dimm].errorCount,
                  WORD_HIGH(G_dimm_sm_args.error.ffdc),
                  WORD_LOW(G_dimm_sm_args.error.ffdc),
                  G_dimm_sm_request.request.completion_state);

    if (++G_dimm[port][dimm].errorCount > MAX_CONSECUTIVE_DIMM_RESETS)
    {
        // Disable collection on this DIMM, collect FFDC and log error
        G_dimm[port][dimm].disabled = true;
        INTR_TRAC_ERR("mark_dimm_failed: disabling DIMM%04X due to %d consecutive errors (state=%d)",
                      DIMM_AND_PORT, G_dimm[port][dimm].errorCount, G_dimm_sm_args.state);
        errlHndl_t l_err = NULL;
        /*
         * @errortype
         * @moduleid    DIMM_MID_MARK_DIMM_FAILED
         * @reasoncode  DIMM_GPE_FAILURE
         * @userdata1   GPE returned rc code
         * @userdata4   ERC_DIMM_COMPLETE_FAILURE
         * @devdesc     Repeated failures reading DIMM temperature
         */
        l_err = createErrl(DIMM_MID_MARK_DIMM_FAILED,
                           DIMM_GPE_FAILURE,
                           ERC_DIMM_COMPLETE_FAILURE,
                           ERRL_SEV_INFORMATIONAL,
                           NULL,
                           DEFAULT_TRACE_SIZE,
                           G_dimm_sm_args.error.rc,
                           0);
        addUsrDtlsToErrl(l_err,
                         (uint8_t*)&G_dimm_sm_request.ffdc,
                         sizeof(G_dimm_sm_request.ffdc),
                         ERRL_STRUCT_VERSION_1,
                         ERRL_USR_DTL_BINARY_DATA);
        addCalloutToErrl(l_err,
                         ERRL_CALLOUT_TYPE_HUID,
                         G_sysConfigData.dimm_huids[port][dimm],
                         ERRL_CALLOUT_PRIORITY_HIGH);
        commitErrl(&l_err);
    }

    // Reset DIMM I2C engine
    G_dimm_i2c_reset_required = true;
    G_dimm_i2c_reset_cause = port<<24 | dimm<<16 | (G_dimm_sm_args.error.rc & 0xFFFF);
    G_dimm_state = DIMM_STATE_RESET_MASTER;
} // end mark_dimm_failed()
// Function Specification
//
// Name: amec_update_vrm_sensors
//
// Description: Updates sensors that use data from the VRMs
//              (e.g., VR_FAN, FANS_FULL_SPEED, VR_HOT).
//
// Thread: RealTime Loop
//
// End Function Specification
void amec_update_vrm_sensors(void)
{
    /*------------------------------------------------------------------------*/
    /*  Local Variables                                                        */
    /*------------------------------------------------------------------------*/
    int l_rc = 0;
    int l_vrfan = 0;
    int l_softoc = 0;
    int l_minus_np1_regmode = 0;
    int l_minus_n_regmode = 0;
    static uint8_t L_error_count = 0;
    uint8_t l_pin = 0;
    uint8_t l_pin_value = 1; // active low, so set default to high
    uint8_t l_vrhot_count = 0;
    errlHndl_t l_err = NULL;

    /*------------------------------------------------------------------------*/
    /*  Code                                                                   */
    /*------------------------------------------------------------------------*/

    // Check if we have access to SPIVID. In DCMs, only the master OCC has
    // access to the SPIVID.
    if (G_dcm_occ_role == OCC_DCM_MASTER)
    {
        // VR_FAN and SOFT_OC come from SPIVID
        l_rc = vrm_read_state(SPIVRM_PORT(0),
                              &l_minus_np1_regmode,
                              &l_minus_n_regmode,
                              &l_vrfan,
                              &l_softoc);
        if (l_rc == 0)
        {
            // Update the VR_FAN sensor
            sensor_update( AMECSENSOR_PTR(VRFAN250USPROC), (uint16_t)l_vrfan );

            // Clear our error count and the 'read failure' flag (since we can
            // read the VR_FAN signal)
            L_error_count = 0;
            G_thrm_fru_data[DATA_FRU_VRM].read_failure = 0;

            // Obtain the 'fan_full_speed' GPIO from APSS
            l_pin = G_sysConfigData.apss_gpio_map.fans_full_speed;

            // No longer reading gpio from APSS in GA1 due to instability in
            // APSS composite mode
            //apss_gpio_get(l_pin, &l_pin_value);

            // VR_HOT sensor is a counter of number of times the VRHOT signal
            // has been asserted
            l_vrhot_count = AMECSENSOR_PTR(VRHOT250USPROC)->sample;

            // Check if VR_FAN is asserted AND if 'fans_full_speed' GPIO is ON.
            // Note that this GPIO is active low.
            if (AMECSENSOR_PTR(VRFAN250USPROC)->sample && !(l_pin_value))
            {
                // VR_FAN is asserted and 'fans_full_speed' GPIO is ON,
                // then increment our VR_HOT counter
                if (l_vrhot_count < g_amec->vrhotproc.setpoint)
                {
                    l_vrhot_count++;
                }
            }
            else
            {
                // Reset our VR_HOT counter
                l_vrhot_count = 0;
            }
            sensor_update(AMECSENSOR_PTR(VRHOT250USPROC), l_vrhot_count);
        }
        else
        {
            // Increment our error count
            L_error_count++;

            // Don't allow the error count to wrap
            if (L_error_count == 0)
            {
                L_error_count = 0xFF;
            }

            // Log an error if we have reached the maximum number of
            // consecutive VR_FAN read failures
            if ((L_error_count == g_amec->proc[0].vrfan_error_count) &&
                (g_amec->proc[0].vrfan_error_count != 0xFF))
            {
                TRAC_ERR("amec_update_vrm_sensors: Failed to read VR_FAN for %u consecutive times!",
                         L_error_count);

                // Also, inform the thermal thread to send a cooling request
                G_thrm_fru_data[DATA_FRU_VRM].read_failure = 1;

                /* @
                 * @errortype
                 * @moduleid    AMEC_HEALTH_CHECK_VRFAN_TIMEOUT
                 * @reasoncode  VRM_VRFAN_TIMEOUT
                 * @userdata1   timeout value
                 * @userdata2   0
                 * @userdata4   OCC_NO_EXTENDED_RC
                 * @devdesc     Failed to read VR_FAN signal from regulator.
* */ l_err = createErrl(AMEC_HEALTH_CHECK_VRFAN_TIMEOUT, //modId VRM_VRFAN_TIMEOUT, //reasoncode OCC_NO_EXTENDED_RC, //Extended reason code ERRL_SEV_PREDICTIVE, //Severity NULL, //Trace Buf DEFAULT_TRACE_SIZE, //Trace Size g_amec->thermaldimm.temp_timeout, //userdata1 0); //userdata2 // Callout backplane for this VRM error addCalloutToErrl(l_err, ERRL_CALLOUT_TYPE_HUID, G_sysConfigData.backplane_huid, ERRL_CALLOUT_PRIORITY_MED); // Commit the error commitErrl(&l_err); } } } if( 1 ) { sensor_update( AMECSENSOR_PTR(VRFAN250USMEM), 0 ); sensor_update( AMECSENSOR_PTR(VRHOT250USMEM), 0 ); } }
// Check and update lock ownership for the specified i2c engine.
// Returns true if OCC owns the lock, or false if the host owns it.
//
// If the host has requested the i2c lock, the lock will be released, an
// external interrupt will be generated/queued, and the function will
// return false.
// If the host has not released the lock, the function will return false.
// If the host cleared its lock bit, OCC will take back ownership and
// return true.
//
bool check_and_update_i2c_lock(const uint8_t i_engine)
{
    bool occ_owns_lock = true;

    if ((PIB_I2C_ENGINE_E == i_engine) ||
        (PIB_I2C_ENGINE_D == i_engine) ||
        (PIB_I2C_ENGINE_C == i_engine))
    {
        bool needRetry = false;
        do
        {
            // Set when a fresh verify failure occurs on this pass; any pass
            // without one clears needRetry so the loop cannot spin forever
            bool retryFlaggedThisPass = false;

            ocb_occflg_t original_occflags;
            original_occflags.value = in32(OCB_OCCFLG);
            LOCK_DBG("check_and_update_i2c_lock: I2C engine %d - host=%d, occ=%d (dimmTick=%d)",
                     i_engine, original_occflags.fields.i2c_engine3_lock_host,
                     original_occflags.fields.i2c_engine3_lock_occ, DIMM_TICK);

            if (occ_owns_i2c_lock(original_occflags, i_engine))
            {
                if (host_wants_i2c_lock(original_occflags, i_engine))
                {
                    // Host requested lock, clear the OCC lock and notify host
                    update_i2c_lock(LOCK_RELEASE, i_engine);
                    occ_owns_lock = false;
                }
                // else OCC already owns the lock
            }
            else
            {
                // OCC does not own the lock
                occ_owns_lock = false;
                if (false == host_wants_i2c_lock(original_occflags, i_engine))
                {
                    // Host is not requesting the lock, acquire lock for OCC
                    update_i2c_lock(LOCK_ACQUIRE, i_engine);
                    occ_owns_lock = true;
                }
                // else Host still holds the lock
            }

            // If neither lock bit was set for this engine (checked via the
            // engine-specific helpers), we must read back the register to
            // make sure the host did not set its bit at the same time
            // (lock conflict)
            if ((occ_owns_lock) &&
                (false == occ_owns_i2c_lock(original_occflags, i_engine)) &&
                (false == host_wants_i2c_lock(original_occflags, i_engine)))
            {
                ocb_occflg_t verify_occflags;
                verify_occflags.value = in32(OCB_OCCFLG);
                if (host_wants_i2c_lock(verify_occflags, i_engine))
                {
                    // Host wrote their lock bit at same time, clear OCC lock and notify host
                    update_i2c_lock(LOCK_RELEASE, i_engine);
                    occ_owns_lock = false;
                }
                else
                {
                    if (false == occ_owns_i2c_lock(verify_occflags, i_engine))
                    {
                        // ERROR - OCC OWNERSHIP BIT DID NOT GET SET
                        INTR_TRAC_ERR("check_and_update_i2c_lock: I2C lock bit did not get set (OCCFLAGS reg: 0x%08X)",
                                      verify_occflags.value);
                        if (needRetry)
                        {
                            // After one retry, log error and go to safe
                            /*
                             * @errortype
                             * @moduleid    I2C_LOCK_UPDATE
                             * @reasoncode  OCI_WRITE_FAILURE
                             * @userdata1   I2C engine number
                             * @userdata2   OCC Flags register
                             * @devdesc     OCI write failure setting I2C ownership bit
                             */
                            errlHndl_t err = createErrl(I2C_LOCK_UPDATE,
                                                        OCI_WRITE_FAILURE,
                                                        OCC_NO_EXTENDED_RC,
                                                        ERRL_SEV_PREDICTIVE,
                                                        NULL,
                                                        DEFAULT_TRACE_SIZE,
                                                        i_engine,
                                                        verify_occflags.value);
                            //Callout firmware
                            addCalloutToErrl(err,
                                             ERRL_CALLOUT_TYPE_COMPONENT_ID,
                                             ERRL_COMPONENT_ID_FIRMWARE,
                                             ERRL_CALLOUT_PRIORITY_MED);
                            //Callout processor
                            addCalloutToErrl(err,
                                             ERRL_CALLOUT_TYPE_HUID,
                                             G_sysConfigData.proc_huid,
                                             ERRL_CALLOUT_PRIORITY_LOW);
                            REQUEST_RESET(err);
                            occ_owns_lock = false;
                            break;
                        }
                        needRetry = true;
                        retryFlaggedThisPass = true;
                    }
                    // else verify succeeded (OCC owns lock)
                }
            }

            if (!retryFlaggedThisPass)
            {
                needRetry = false;
            }
        } while (needRetry);
    }
    else
    {
        // Invalid engine
        INTR_TRAC_ERR("check_and_update_i2c_lock: Invalid engine specified: 0x%02X", i_engine);
    }

    return occ_owns_lock;
} // end check_and_update_i2c_lock()
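// Illustrative caller-side sketch (not OCC source) of how
// check_and_update_i2c_lock() is meant to gate an I2C transaction; the
// task function name below is hypothetical.
void dimm_i2c_tick_example(void)
{
    if (!check_and_update_i2c_lock(PIB_I2C_ENGINE_E))
    {
        // Host owns (or just reclaimed) the engine: do not start an I2C
        // transaction this tick; the state machine retries after the host
        // releases the lock.
        return;
    }
    // ... OCC owns the lock; safe to start/continue the transaction on engine E ...
}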
// Verifies that each core is at the correct frequency after they have had
// time to stabilize
void amec_verify_pstate()
{
    uint8_t l_core = 0;
    int8_t l_pstate_from_fmax = 0;
    gpe_bulk_core_data_t * l_core_data_ptr;
    pmc_pmsr_ffcdc_data_t l_pmc_pmsr_ffdc;
    errlHndl_t l_err = NULL;

    if ( (G_time_until_freq_check == 0)
         && ( CURRENT_MODE() != OCC_MODE_DYN_POWER_SAVE )
         && ( CURRENT_MODE() != OCC_MODE_DYN_POWER_SAVE_FP )
         && (!G_sysConfigData.system_type.kvm))
    {
        // Reset the counter
        G_time_until_freq_check = FREQ_CHG_CHECK_TIME;

        // Convert fmax to the corresponding pstate
        l_pstate_from_fmax = proc_freq2pstate(g_amec->sys.fmax);

        for( l_core = 0; l_core < MAX_NUM_CORES; l_core++ )
        {
            // If the core isn't present, skip it
            if(!CORE_PRESENT(l_core))
            {
                l_pmc_pmsr_ffdc.pmsr_ffdc_data.data[l_core].value = 0;
                continue;
            }

            // Get pointer to core data
            l_core_data_ptr = proc_get_bulk_core_data_ptr(l_core);

            // Get the core's pmsr data
            l_pmc_pmsr_ffdc.pmsr_ffdc_data.data[l_core] = l_core_data_ptr->pcb_slave.pmsr;

            // Verify that the core is running at the correct frequency
            // If not, log an error
            if( (l_pstate_from_fmax != l_pmc_pmsr_ffdc.pmsr_ffdc_data.data[l_core].fields.local_pstate_actual)
                && (l_pstate_from_fmax > l_pmc_pmsr_ffdc.pmsr_ffdc_data.data[l_core].fields.pv_min)
                && (l_err == NULL) )
            {
                TRAC_ERR("Frequency mismatch in core %d: actual_ps[%d] req_ps[%d] fmax[%d] mode[%d].",
                         l_core,
                         l_pmc_pmsr_ffdc.pmsr_ffdc_data.data[l_core].fields.local_pstate_actual,
                         l_pstate_from_fmax,
                         g_amec->sys.fmax,
                         CURRENT_MODE());

                fill_pmc_ffdc_buffer(&l_pmc_pmsr_ffdc.pmc_ffcdc_data);

                /* @
                 * @errortype
                 * @moduleid    AMEC_VERIFY_FREQ_MID
                 * @reasoncode  TARGET_FREQ_FAILURE
                 * @severity    ERRL_SEV_UNRECOVERABLE
                 * @userdata1   0
                 * @userdata2   0
                 * @userdata4   OCC_NO_EXTENDED_RC
                 * @devdesc     A core is not running at the expected frequency
                 */
                l_err = createErrl( AMEC_VERIFY_FREQ_MID,      // i_modId,
                                    TARGET_FREQ_FAILURE,       // i_reasonCode,
                                    OCC_NO_EXTENDED_RC,
                                    ERRL_SEV_UNRECOVERABLE,
                                    NULL,                      // i_trace,
                                    DEFAULT_TRACE_SIZE,        // i_traceSz,
                                    0,                         // i_userData1,
                                    0);                        // i_userData2

                //Add firmware callout
                addCalloutToErrl(l_err,
                                 ERRL_CALLOUT_TYPE_COMPONENT_ID,
                                 ERRL_COMPONENT_ID_FIRMWARE,
                                 ERRL_CALLOUT_PRIORITY_HIGH);

                //Add processor callout
                addCalloutToErrl(l_err,
                                 ERRL_CALLOUT_TYPE_HUID,
                                 G_sysConfigData.proc_huid,
                                 ERRL_CALLOUT_PRIORITY_MED);
            }
        }

        if( l_err != NULL)
        {
            //Add our register dump to the error log
            addUsrDtlsToErrl(l_err,
                             (uint8_t*) &l_pmc_pmsr_ffdc,
                             sizeof(l_pmc_pmsr_ffdc),
                             ERRL_USR_DTL_STRUCT_VERSION_1,
                             ERRL_USR_DTL_BINARY_DATA);

            REQUEST_RESET(l_err);
        }
    }
}
// Function Specification
//
// Name: amec_slv_check_perf
//
// Description: Detect and log degraded performance errors on the slave OCC.
//              This function runs every tick.
//
// Thread: RealTime Loop
//
// Task Flags:
//
// End Function Specification
void amec_slv_check_perf(void)
{
    /*------------------------------------------------------------------------*/
    /*  Local Variables                                                        */
    /*------------------------------------------------------------------------*/
    static BOOLEAN l_prev_failsafe_state = FALSE;
    static BOOLEAN l_prev_ovs_state = FALSE;
    static BOOLEAN l_prev_pcap_state = FALSE;
    static ERRL_SEVERITY l_pcap_sev = ERRL_SEV_PREDICTIVE;
    static BOOLEAN l_throttle_traced = FALSE;
    static uint64_t l_time = 0;

    /*------------------------------------------------------------------------*/
    /*  Code                                                                   */
    /*------------------------------------------------------------------------*/

    // Verify that cores are at proper frequency
    amec_verify_pstate();

    do
    {
        // was frequency limited by power ?
        if ( G_non_dps_power_limited != TRUE )
        {
            if(l_throttle_traced)
            {
                TRAC_INFO("Frequency not limited by power algorithms anymore");
                l_throttle_traced = FALSE;
            }
            // we are done, break and return
            break;
        }

        // frequency limited due to failsafe condition ?
        if ( AMEC_INTF_GET_FAILSAFE() == TRUE )
        {
            if ( l_prev_failsafe_state == TRUE)
            {
                // we are done, break and return
                break;
            }
            else
            {
                // log this error ONLY ONCE per IPL
                l_prev_failsafe_state = TRUE;

                TRAC_ERR("Frequency limited due to failsafe condition(mode:%d, state:%d)",
                         CURRENT_MODE(), CURRENT_STATE());
                l_throttle_traced = TRUE;
                l_time = ssx_timebase_get();

                // log error that calls out OVS procedure
                // set error severity to ERRL_SEV_PREDICTIVE
                /* @
                 * @errortype
                 * @moduleid    AMEC_SLAVE_CHECK_PERFORMANCE
                 * @reasoncode  INTERNAL_FAILURE
                 * @userdata1   Previous FailSafe State
                 * @userdata4   ERC_AMEC_SLAVE_FAILSAFE_STATE
                 * @devdesc     Frequency limited due to failsafe condition
                 */
                errlHndl_t l_errl = createErrl(AMEC_SLAVE_CHECK_PERFORMANCE,  //modId
                                               INTERNAL_FAILURE,              //reasoncode
                                               ERC_AMEC_SLAVE_FAILSAFE_STATE, //Extended reason code
                                               ERRL_SEV_PREDICTIVE,           //Severity
                                               NULL,                          //Trace Buf
                                               DEFAULT_TRACE_SIZE,            //Trace Size
                                               l_prev_failsafe_state,         //userdata1
                                               0);                            //userdata2

                addCalloutToErrl( l_errl,
                                  ERRL_CALLOUT_TYPE_COMPONENT_ID,
                                  ERRL_COMPONENT_ID_OVERSUBSCRIPTION,
                                  ERRL_CALLOUT_PRIORITY_HIGH );

                // and sets the consolidate action flag
                setErrlActions( l_errl, ERRL_ACTIONS_CONSOLIDATE_ERRORS );

                // Commit Error
                commitErrl(&l_errl);

                // we are done, break
                break;
            }
        }

        // frequency limited due to oversubscription condition ?
        if ( AMEC_INTF_GET_OVERSUBSCRIPTION() == TRUE )
        {
            if ( l_prev_ovs_state == TRUE)
            {
                // we are done, break and return
                break;
            }
            else
            {
                // log this error ONLY ONCE per IPL
                l_prev_ovs_state = TRUE;

                TRAC_ERR("Frequency limited due to oversubscription condition(mode:%d, state:%d)",
                         CURRENT_MODE(), CURRENT_STATE());
                l_throttle_traced = TRUE;
                l_time = ssx_timebase_get();

                // log error that calls out OVS procedure
                // set error severity to ERRL_SEV_PREDICTIVE
                // Updated the RC to match the actual RC passed to createErrl()
                /* @
                 * @errortype
                 * @moduleid    AMEC_SLAVE_CHECK_PERFORMANCE
                 * @reasoncode  OVERSUB_LIMIT_ALERT
                 * @userdata1   Previous OVS State
                 * @userdata4   ERC_AMEC_SLAVE_OVS_STATE
                 * @devdesc     Frequency limited due to oversubscription condition
                 */
                errlHndl_t l_errl = createErrl(AMEC_SLAVE_CHECK_PERFORMANCE, //modId
                                               OVERSUB_LIMIT_ALERT,          //reasoncode
                                               ERC_AMEC_SLAVE_OVS_STATE,     //Extended reason code
                                               ERRL_SEV_PREDICTIVE,          //Severity
                                               NULL,                         //Trace Buf
                                               DEFAULT_TRACE_SIZE,           //Trace Size
                                               l_prev_ovs_state,             //userdata1
                                               0);                           //userdata2

                // Callout to Oversubscription
                addCalloutToErrl( l_errl,
                                  ERRL_CALLOUT_TYPE_COMPONENT_ID,
                                  ERRL_COMPONENT_ID_OVERSUBSCRIPTION,
                                  ERRL_CALLOUT_PRIORITY_HIGH );

                // Callout to APSS
                addCalloutToErrl( l_errl,
                                  ERRL_CALLOUT_TYPE_HUID,
                                  G_sysConfigData.apss_huid,
                                  ERRL_CALLOUT_PRIORITY_MED );

                // Callout to Firmware
                addCalloutToErrl( l_errl,
                                  ERRL_CALLOUT_TYPE_COMPONENT_ID,
                                  ERRL_COMPONENT_ID_FIRMWARE,
                                  ERRL_CALLOUT_PRIORITY_LOW );

                // and sets the consolidate action flag
                setErrlActions( l_errl, ERRL_ACTIONS_CONSOLIDATE_ERRORS );

                // Commit Error
                commitErrl(&l_errl);

                // we are done, break
                break;
            }
        }

        uint16_t l_snrBulkPwr = AMECSENSOR_PTR(PWR250US)->sample;

        // frequency limited due to system power cap condition ?
        if (( l_snrBulkPwr > (G_sysConfigData.pcap.system_pcap - PDROP_THRESH) )
            && ( G_sysConfigData.pcap.current_pcap == 0 ))
        {
            if ( l_prev_pcap_state == TRUE)
            {
                // we are done, break and return
                break;
            }
            else
            {
                //log this error ONLY ONCE per IPL
                l_prev_pcap_state = TRUE;

                TRAC_ERR("Frequency limited due to power cap condition(mode:%d, state:%d)",
                         CURRENT_MODE(), CURRENT_STATE());
                TRAC_ERR("SnrBulkPwr %d > Sys Pcap %d ",
                         l_snrBulkPwr, G_sysConfigData.pcap.system_pcap );
                TRAC_ERR("SnrFanPwr %d, SnrIOPwr %d, SnrStoragePwr %d, SnrGpuPwr %d ",
                         AMECSENSOR_PTR(PWR250USFAN)->sample,
                         AMECSENSOR_PTR(PWR250USIO)->sample,
                         AMECSENSOR_PTR(PWR250USSTORE)->sample,
                         AMECSENSOR_PTR(PWR250USGPU)->sample );
                TRAC_ERR("SnrProcPwr 0 %d, SnrProcPwr 1 %d, SnrProcPwr 2 %d, SnrProcPwr 3 %d",
                         g_amec->proc_snr_pwr[0],
                         g_amec->proc_snr_pwr[1],
                         g_amec->proc_snr_pwr[2],
                         g_amec->proc_snr_pwr[3] );
                TRAC_ERR("SnrMemPwr 0 %d, SnrMemPwr 1 %d, SnrMemPwr 2 %d, SnrMemPwr 3 %d",
                         g_amec->mem_snr_pwr[0],
                         g_amec->mem_snr_pwr[1],
                         g_amec->mem_snr_pwr[2],
                         g_amec->mem_snr_pwr[3] );
                l_throttle_traced = TRUE;
                l_time = ssx_timebase_get();

                // log error that calls out firmware and APSS procedure
                // set error severity to l_pcap_sev
                /* @
                 * @errortype
                 * @moduleid    AMEC_SLAVE_CHECK_PERFORMANCE
                 * @reasoncode  PCAP_THROTTLE_POWER_LIMIT
                 * @userdata1   Current Sensor Bulk Power
                 * @userdata2   System PCAP
                 * @userdata4   ERC_AMEC_SLAVE_POWERCAP
                 * @devdesc     Frequency limited due to PowerCap condition
                 */
                errlHndl_t l_errl = createErrl(AMEC_SLAVE_CHECK_PERFORMANCE,      //modId
                                               PCAP_THROTTLE_POWER_LIMIT,         //reasoncode
                                               ERC_AMEC_SLAVE_POWERCAP,           //Extended reason code
                                               l_pcap_sev,                        //Severity
                                               NULL,                              //Trace Buf
                                               DEFAULT_TRACE_SIZE,                //Trace Size
                                               l_snrBulkPwr,                      //userdata1
                                               G_sysConfigData.pcap.system_pcap); //userdata2

                addCalloutToErrl( l_errl,
                                  ERRL_CALLOUT_TYPE_COMPONENT_ID,
                                  ERRL_COMPONENT_ID_FIRMWARE,
                                  ERRL_CALLOUT_PRIORITY_HIGH );

                addCalloutToErrl( l_errl,
                                  ERRL_CALLOUT_TYPE_HUID,
                                  G_sysConfigData.apss_huid,
                                  ERRL_CALLOUT_PRIORITY_HIGH );

                // and sets the consolidate action flag
                setErrlActions( l_errl, ERRL_ACTIONS_CONSOLIDATE_ERRORS );

                // then set l_pcap_sev to informational so any later power-cap
                // log this IPL is committed as informational
                l_pcap_sev = ERRL_SEV_INFORMATIONAL;

                // Commit Error
                commitErrl(&l_errl);

                // we are done, break
                break;
            }
        }

        // throttle trace to once every 3600 seconds (1 hr = 3600000 ms)
        if(!l_throttle_traced && ( DURATION_IN_MS_UNTIL_NOW_FROM(l_time) > 3600000 ) )
        {
            TRAC_INFO("Frequency power limited due to transient condition: PowerLimited=%x, FailSafe=%x, OverSubScription=%x CurrentBulkPwr=%x",
                      G_non_dps_power_limited, AMEC_INTF_GET_FAILSAFE(),
                      AMEC_INTF_GET_OVERSUBSCRIPTION(), l_snrBulkPwr );
            l_throttle_traced = TRUE;
            l_time = ssx_timebase_get();
        }
    } while( 0 );

    return;
}
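// Illustrative sketch (not OCC source) of the two trace-throttling idioms
// used in amec_slv_check_perf() above: a static flag so a condition is
// logged only once per IPL, and a timestamp so the transient-condition
// trace repeats at most once per hour.  The function and variable names
// here are hypothetical.
void trace_once_per_ipl_example(const bool i_condition_asserted)
{
    static BOOLEAN L_condition_logged = FALSE;  // once-per-IPL latch
    static uint64_t L_last_trace_time = 0;      // for hourly rate limiting

    if (i_condition_asserted && !L_condition_logged)
    {
        // First occurrence this IPL: trace it (and create/commit the error log)
        TRAC_ERR("condition asserted");
        L_condition_logged = TRUE;
        L_last_trace_time = ssx_timebase_get();
    }
    else if (!i_condition_asserted &&
             (DURATION_IN_MS_UNTIL_NOW_FROM(L_last_trace_time) > 3600000))
    {
        // Transient conditions are traced at most once per hour (3600000 ms)
        TRAC_INFO("transient condition");
        L_last_trace_time = ssx_timebase_get();
    }
}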
// Function Specification
//
// Name: dbug_err_inject
//
// Description: Injects an error
//
// End Function Specification
void dbug_err_inject(const cmdh_fsp_cmd_t * i_cmd_ptr,
                     cmdh_fsp_rsp_t * i_rsp_ptr)
{
    errlHndl_t l_err;
    cmdh_dbug_inject_errl_query_t *l_cmd_ptr = (cmdh_dbug_inject_errl_query_t*) i_cmd_ptr;

    i_rsp_ptr->data_length[0] = 0;
    i_rsp_ptr->data_length[1] = 0;
    G_rsp_status = ERRL_RC_SUCCESS;

    if(!strncmp(l_cmd_ptr->comp, "RST", OCC_TRACE_NAME_SIZE))
    {
        l_err = createErrl(CMDH_DBUG_MID,            //modId
                           INTERNAL_FAILURE,         //reasoncode
                           OCC_NO_EXTENDED_RC,       //Extended reason code
                           ERRL_SEV_PREDICTIVE,      //Severity
                           NULL,                     //Trace Buf
                           DEFAULT_TRACE_SIZE,       //Trace Size
                           0xff,                     //userdata1
                           0);                       //userdata2

        if (INVALID_ERR_HNDL == l_err)
        {
            G_rsp_status = ERRL_RC_INTERNAL_FAIL;
        }

        addCalloutToErrl(l_err,
                         ERRL_CALLOUT_TYPE_HUID,      //callout type (HUID/CompID)
                         G_sysConfigData.proc_huid,   //callout data
                         ERRL_CALLOUT_PRIORITY_HIGH); //priority

        REQUEST_RESET(l_err);
    }
    else
    {
        l_err = createErrl(CMDH_DBUG_MID,                //modId
                           INTERNAL_FAILURE,             //reasoncode
                           OCC_NO_EXTENDED_RC,           //Extended reason code
                           ERRL_SEV_UNRECOVERABLE,       //Severity
                           TRAC_get_td(l_cmd_ptr->comp), //Trace Buf
                           DEFAULT_TRACE_SIZE,           //Trace Size
                           0xff,                         //userdata1
                           0);                           //userdata2

        if (INVALID_ERR_HNDL == l_err)
        {
            G_rsp_status = ERRL_RC_INTERNAL_FAIL;
        }

        // Commit Error log
        commitErrl(&l_err);
    }

    if (G_rsp_status == ERRL_RC_INTERNAL_FAIL)
    {
        TRAC_ERR("cmdh_dbug_inject_errl: Failed creating ERR Log\n");
    }
    else
    {
        TRAC_INFO("cmdh_dbug_inject_errl: inject errl for COMP : %s\n", l_cmd_ptr->comp);
    }

    return;
}
// Function Specification
//
// Name: dcom_initialize_roles
//
// Description: Initialize roles so we know if we are master or slave
//
// End Function Specification
void dcom_initialize_roles(void)
{
    G_occ_role = OCC_SLAVE;

    // Locals
    pba_xcfg_t pbax_cfg_reg;

    // Used as a debug tool to correlate time between OCCs & System Time
    // getscom_ffdc(OCB_OTBR, &G_dcomTime.tod, NULL); // Commits errors internally
    G_dcomTime.tod = in64(OCB_OTBR) >> 4;
    G_dcomTime.base = ssx_timebase_get();

    pbax_cfg_reg.value = in64(PBA_XCFG);

    if(pbax_cfg_reg.fields.rcv_groupid < MAX_NUM_NODES &&
       pbax_cfg_reg.fields.rcv_chipid < MAX_NUM_OCC)
    {
        TRAC_IMP("Proc ChipId (%d) NodeId (%d)",
                 pbax_cfg_reg.fields.rcv_chipid,
                 pbax_cfg_reg.fields.rcv_groupid);

        G_pbax_id.valid     = 1;
        G_pbax_id.node_id   = pbax_cfg_reg.fields.rcv_groupid;
        G_pbax_id.chip_id   = pbax_cfg_reg.fields.rcv_chipid;
        G_pbax_id.module_id = G_pbax_id.chip_id;

        // Always start as OCC Slave
        G_occ_role = OCC_SLAVE;
        rtl_set_run_mask(RTL_FLAG_NOTMSTR);

        // Set the initial presence mask, and count the number of OCCs present
        G_sysConfigData.is_occ_present |= (0x01 << G_pbax_id.chip_id);
        G_occ_num_present = __builtin_popcount(G_sysConfigData.is_occ_present);
    }
    else // Invalid chip/node ID(s)
    {
        TRAC_ERR("Proc ChipId (%d) and/or NodeId (%d) too high: request reset",
                 pbax_cfg_reg.fields.rcv_chipid,
                 pbax_cfg_reg.fields.rcv_groupid);
        /* @
         * @errortype
         * @moduleid    DCOM_MID_INIT_ROLES
         * @reasoncode  INVALID_CONFIG_DATA
         * @userdata1   PBAXCFG (upper)
         * @userdata2   PBAXCFG (lower)
         * @userdata4   ERC_CHIP_IDS_INVALID
         * @devdesc     Failure determining OCC role
         */
        errlHndl_t l_errl = createErrl(
            DCOM_MID_INIT_ROLES,            //ModId
            INVALID_CONFIG_DATA,            //Reasoncode
            ERC_CHIP_IDS_INVALID,           //Extended reasoncode
            ERRL_SEV_UNRECOVERABLE,         //Severity
            NULL,                           //Trace Buf
            DEFAULT_TRACE_SIZE,             //Trace Size
            pbax_cfg_reg.words.high_order,  //Userdata1
            pbax_cfg_reg.words.low_order    //Userdata2
            );

        // Callout firmware
        addCalloutToErrl(l_errl,
                         ERRL_CALLOUT_TYPE_COMPONENT_ID,
                         ERRL_COMPONENT_ID_FIRMWARE,
                         ERRL_CALLOUT_PRIORITY_HIGH);

        //Add processor callout
        addCalloutToErrl(l_errl,
                         ERRL_CALLOUT_TYPE_HUID,
                         G_sysConfigData.proc_huid,
                         ERRL_CALLOUT_PRIORITY_LOW);

        // Request the reset announced in the trace above, so the error
        // handle is not silently dropped
        REQUEST_RESET(l_errl);

        G_pbax_id.valid = 0; // Invalid Chip/Node ID
    }

    // Initialize DCOM Thread Sem
    ssx_semaphore_create( &G_dcomThreadWakeupSem, // Semaphore
                          1,                      // Initial Count
                          0);                     // No Max Count
}
// Function Specification
//
// Name: errlTestCallouts
//
// Description: Unit test for errl callout handling
//
// End Function Specification
uint32_t errlTestCallouts()
{
    uint32_t l_rc = 0;
    ERRL_DBG("START");

    do
    {
        errlHndl_t l_handle = NULL;
        ERRL_DBG("--------------------------------\n");

        /****************************************************/
        // Check max callouts
        l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                               ERRL_SEV_PREDICTIVE, g_trac_inf, 128, 0x1, 0x2);
        CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc);

        ERRL_CALLOUT_PRIORITY l_array[8] = {
            ERRL_CALLOUT_PRIORITY_HIGH,
            ERRL_CALLOUT_PRIORITY_MED,
            ERRL_CALLOUT_PRIORITY_LOW,
            ERRL_CALLOUT_PRIORITY_HIGH,
            ERRL_CALLOUT_PRIORITY_MED,
            ERRL_CALLOUT_PRIORITY_MED,
            ERRL_CALLOUT_PRIORITY_LOW,
            ERRL_CALLOUT_PRIORITY_LOW,
        };

        ERRL_CALLOUT_TYPE l_type[8] = {
            ERRL_CALLOUT_TYPE_HUID,
            ERRL_CALLOUT_TYPE_COMPONENT_ID,
            ERRL_CALLOUT_TYPE_HUID,
            ERRL_CALLOUT_TYPE_COMPONENT_ID,
            ERRL_CALLOUT_TYPE_HUID,
            ERRL_CALLOUT_TYPE_COMPONENT_ID,
            ERRL_CALLOUT_TYPE_HUID,
            ERRL_CALLOUT_TYPE_COMPONENT_ID,
        };

        // add 6 (ERRL_MAX_CALLOUTS) callouts
        uint8_t l_index = 0;
        for(l_index = 0; l_index < ERRL_MAX_CALLOUTS; l_index++)
        {
            ERRL_DBG("current callouts %d attempting to add callout # %d with type %d ,priority %d",
                     l_handle->iv_numCallouts, l_index, l_type[l_index], l_array[l_index] );
            addCalloutToErrl(l_handle, l_type[l_index], l_index, l_array[l_index]);
        }
        CHECK_CONDITION( l_handle->iv_numCallouts == ERRL_MAX_CALLOUTS, l_rc);

        // add one more callout and it should fail
        addCalloutToErrl(l_handle, l_type[0], l_index, l_array[0]);
        CHECK_CONDITION( l_handle->iv_numCallouts == ERRL_MAX_CALLOUTS, l_rc);
        dumpLog( l_handle, l_handle->iv_userDetails.iv_entrySize );
        deleteErrl( &l_handle );
        ppdumpslot();

        /****************************************************/
        // Check callouts after errl is committed
        // Create log
        l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                               ERRL_SEV_PREDICTIVE, g_trac_inf, 32, 0x1, 0x2);
        errlHndl_t l_log = l_handle;
        CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc);

        // Commit log and add callout. But adding the callout should fail,
        // so the committed log still has no callouts
        commitErrl( &l_handle );
        addCalloutToErrl(l_handle, l_type[0], 0, l_array[0]);
        CHECK_CONDITION( l_log->iv_numCallouts == 0, l_rc);
        deleteErrl(&l_log);

        /****************************************************/
        // Check addCalloutToErrl for ERRL_SEV_INFORMATIONAL log
        // Create ERRL_SEV_INFORMATIONAL log
        l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                               ERRL_SEV_INFORMATIONAL, g_trac_inf, 128, 0x1, 0x2);
        CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc);

        // add one callout and it should fail (INFORMATIONAL logs get no callouts)
        addCalloutToErrl(l_handle, l_type[0], l_index, l_array[0]);
        CHECK_CONDITION( l_handle->iv_numCallouts == 0, l_rc);
        dumpLog( l_handle, l_handle->iv_userDetails.iv_entrySize );
        deleteErrl( &l_handle );
        ppdumpslot();

        ERRL_DBG("END \n");
    } while(0);

    return l_rc;
}
// Function Specification
//
// Name: errlTestErrorHandling
//
// Description: Unit test for errl interface error handling
//
// End Function Specification
uint32_t errlTestErrorHandling()
{
    uint32_t l_rc = 0;
    errlHndl_t l_errlHnd = NULL;
    uint8_t l_dataPtr[10];
    uint16_t l_entrySizeBefore = 0;
    uint16_t l_entrySizeAfter = 0;
    ERRL_DBG(" START");

    do
    {
        /****************************************************/
        // Test createErrl with incorrect parameter
        // Set ERRL_SEVERITY to 0x04, out of range so log won't be created
        l_errlHnd = createErrl(TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC, 0x04,
                               NULL, 0, 0x01, 0x02);
        CHECK_CONDITION( l_errlHnd == INVALID_ERR_HNDL, l_rc);

        /****************************************************/
        // Test addTraceToErrl with incorrect parameter
        // Create a log
        l_errlHnd = createErrl(TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                               ERRL_SEV_PREDICTIVE, NULL, 0, 0x01, 0x02);
        CHECK_CONDITION( l_errlHnd != INVALID_ERR_HNDL, l_rc);

        // i_trace = NULL, so entry size doesn't change
        l_entrySizeBefore = l_errlHnd->iv_userDetails.iv_entrySize;
        addTraceToErrl(NULL, 5, l_errlHnd);
        l_entrySizeAfter = l_errlHnd->iv_userDetails.iv_entrySize;
        CHECK_CONDITION(l_entrySizeBefore == l_entrySizeAfter, l_rc);

        // i_traceSz = 0, entry size doesn't change
        l_entrySizeBefore = l_errlHnd->iv_userDetails.iv_entrySize;
        addTraceToErrl(g_trac_inf, 0, l_errlHnd); // @at012c
        l_entrySizeAfter = l_errlHnd->iv_userDetails.iv_entrySize;
        CHECK_CONDITION( l_entrySizeBefore == l_entrySizeAfter, l_rc);

        // io_err = NULL, entry size doesn't change
        l_entrySizeBefore = l_errlHnd->iv_userDetails.iv_entrySize;
        addTraceToErrl(g_trac_inf, 32, NULL); // @at012c
        l_entrySizeAfter = l_errlHnd->iv_userDetails.iv_entrySize;
        CHECK_CONDITION( l_entrySizeBefore == l_entrySizeAfter, l_rc);

        // test addTraceToErrl after log is committed, so entry size doesn't change
        errlHndl_t l_errlHndx = l_errlHnd;
        commitErrl(&l_errlHnd);
        l_entrySizeBefore = l_errlHndx->iv_userDetails.iv_entrySize;
        addTraceToErrl(g_trac_inf, 32, l_errlHndx); // @at012c
        l_entrySizeAfter = l_errlHndx->iv_userDetails.iv_entrySize;
        CHECK_CONDITION( l_entrySizeBefore == l_entrySizeAfter, l_rc);

        deleteErrl(&l_errlHndx);
        CHECK_CONDITION( l_errlHndx == NULL, l_rc);

        // io_err = INVALID_ERR_HNDL
        // We are making sure that this function handles an INVALID_ERR_HNDL
        // being passed.  There is nothing we can check to verify it was
        // handled; a failure would cause a TLB exception.
        l_errlHnd = INVALID_ERR_HNDL;
        addTraceToErrl(g_trac_inf, 32, l_errlHnd);

        /****************************************************/
        // Test commitErrl with incorrect parameter
        // io_err = NULL
        // We are making sure that this function handles a NULL being passed.
        // There is nothing we can check to verify it was handled; a failure
        // would cause a TLB exception.
        commitErrl( NULL);

        // l_errlHnd should be set to NULL
        l_errlHnd = INVALID_ERR_HNDL;
        commitErrl(&l_errlHnd);
        CHECK_CONDITION( l_errlHnd == NULL, l_rc);

        /****************************************************/
        // Test deleteErrl with incorrect parameter
        // io_err = NULL
        // We are making sure that this function handles a NULL being passed.
        // There is nothing we can check to verify it was handled; a failure
        // would cause a TLB exception.
        deleteErrl( NULL);

        // l_errlHnd should be set to NULL
        l_errlHnd = INVALID_ERR_HNDL;
        deleteErrl(&l_errlHnd);
        CHECK_CONDITION( l_errlHnd == NULL, l_rc);

        /****************************************************/
        // Test addCalloutToErrl with incorrect parameter
        // Set io_err to NULL
        // We are making sure that this function handles a NULL being passed.
        // There is nothing we can check to verify it was handled; a failure
        // would cause a TLB exception.
        addCalloutToErrl(NULL, ERRL_CALLOUT_TYPE_HUID, 0, ERRL_CALLOUT_PRIORITY_LOW);

        // Set io_err to INVALID_ERR_HNDL
        // We are making sure that this function handles an INVALID_ERR_HNDL
        // being passed.  There is nothing we can check to verify it was
        // handled; a failure would cause a TLB exception.
        addCalloutToErrl(INVALID_ERR_HNDL, ERRL_CALLOUT_TYPE_HUID, 0, ERRL_CALLOUT_PRIORITY_LOW);

        /****************************************************/
        // Test addUsrDtlsToErrl with incorrect parameter
        // Create a log
        l_errlHnd = createErrl(TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                               ERRL_SEV_PREDICTIVE, NULL, 0, 0x01, 0x02);
        CHECK_CONDITION( l_errlHnd != INVALID_ERR_HNDL, l_rc);

        // io_err = NULL
        // We are making sure that this function handles a NULL being passed.
        // There is nothing we can check to verify it was handled; a failure
        // would cause a TLB exception.
        addUsrDtlsToErrl(NULL, l_dataPtr, 10, ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA);

        // io_err = INVALID_ERR_HNDL
        // We are making sure that this function handles an INVALID_ERR_HNDL
        // being passed.  There is nothing we can check to verify it was
        // handled; a failure would cause a TLB exception.
        addUsrDtlsToErrl(INVALID_ERR_HNDL, l_dataPtr, 10, ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA);

        // i_dataPtr = NULL, so entry size doesn't change
        l_entrySizeBefore = l_errlHnd->iv_userDetails.iv_entrySize;
        addUsrDtlsToErrl(l_errlHnd, NULL, 10, ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA);
        l_entrySizeAfter = l_errlHnd->iv_userDetails.iv_entrySize;
        CHECK_CONDITION( l_entrySizeBefore == l_entrySizeAfter, l_rc);

        // i_size = 0, so entry size doesn't change
        l_entrySizeBefore = l_errlHnd->iv_userDetails.iv_entrySize;
        addUsrDtlsToErrl(l_errlHnd, l_dataPtr, 0, ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA);
        l_entrySizeAfter = l_errlHnd->iv_userDetails.iv_entrySize;
        CHECK_CONDITION( l_entrySizeBefore == l_entrySizeAfter, l_rc);

        // test addUsrDtlsToErrl after log is committed, so entry size doesn't change
        l_errlHndx = l_errlHnd;
        commitErrl(&l_errlHnd);
        l_entrySizeBefore = l_errlHndx->iv_userDetails.iv_entrySize;
        addUsrDtlsToErrl(l_errlHndx, l_dataPtr, 10, ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA);
        l_entrySizeAfter = l_errlHndx->iv_userDetails.iv_entrySize;
        CHECK_CONDITION( l_entrySizeBefore == l_entrySizeAfter, l_rc);

        deleteErrl(&l_errlHndx);
        CHECK_CONDITION( l_errlHndx == NULL, l_rc);

        /****************************************************/
        // Test setErrlSevToInfo with incorrect parameter
        // Set io_err to NULL.
        // We are making sure that this function handles a NULL being passed.
        // There is nothing we can check to verify it was handled; a failure
        // would cause a TLB exception.
        setErrlSevToInfo(NULL);

        // Set io_err to INVALID_ERR_HNDL
        // We are making sure that this function handles an INVALID_ERR_HNDL
        // being passed.  There is nothing we can check to verify it was
        // handled; a failure would cause a TLB exception.
        setErrlSevToInfo(INVALID_ERR_HNDL);

    } while(0);

    return l_rc;
}
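// The tests above rely on every errl interface rejecting NULL and
// INVALID_ERR_HNDL without dereferencing them, and ignoring updates to a
// committed log.  A minimal sketch of that guard-clause pattern follows
// (illustrative only; the real addTraceToErrl has additional packing logic,
// and the exact guards shown are an assumption based on the tests above).
void addTraceToErrl_sketch(const trace_descriptor_array_t* i_trace,
                           const uint16_t i_traceSz,
                           errlHndl_t io_err)
{
    // Reject bad inputs up front so callers never take a TLB exception
    if ((NULL == io_err) || (INVALID_ERR_HNDL == io_err) ||
        (NULL == i_trace) || (0 == i_traceSz))
    {
        return;
    }
    // ... append the trace buffer as a user-details section, updating
    // io_err->iv_userDetails.iv_entrySize ...
}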
////////////////////////// // Function Specification // // Name: amec_gpu_pcap // // Description: Determine power cap for GPUs // // Thread: Real Time Loop // // End Function Specification void amec_gpu_pcap(bool i_oversubscription, bool i_active_pcap_changed, int32_t i_avail_power) { /*------------------------------------------------------------------------*/ /* Local Variables */ /*------------------------------------------------------------------------*/ uint8_t i = 0; uint32_t l_gpu_cap_mw = 0; uint16_t l_system_gpu_total_pcap = 0; // total GPU pcap required by system based on if currently in oversub or not static uint16_t L_total_gpu_pcap = 0; // Current total GPU pcap in effect static uint16_t L_n_plus_1_mode_gpu_total_pcap = 0; // Total GPU pcap required for N+1 (not in oversubscription) static uint16_t L_n_mode_gpu_total_pcap = 0; // Total GPU pcap required for oversubscription static uint16_t L_active_psr_gpu_total_pcap = 0; // Total GPU pcap for the currently set pcap and PSR static uint16_t L_per_gpu_pcap = 0; // Amount of L_total_gpu_pcap for each GPU static uint8_t L_psr = 100; // PSR value used in L_active_psr_gpu_total_pcap calculation static bool L_first_run = TRUE; // for calculations done only 1 time static uint32_t L_last_pcap_traced[MAX_NUM_GPU_PER_DOMAIN] = {0}; /*------------------------------------------------------------------------*/ /* Code */ /*------------------------------------------------------------------------*/ // If this is the first time running calculate the total GPU power cap for system power caps (N and N+1) if(L_first_run) { // calculate total GPU power cap for oversubscription if(g_amec->pcap.ovs_node_pcap > G_sysConfigData.total_non_gpu_max_pwr_watts) { // Take all non-GPU power away from the oversubscription power cap L_n_mode_gpu_total_pcap = g_amec->pcap.ovs_node_pcap - G_sysConfigData.total_non_gpu_max_pwr_watts; // Add back in the power that will be dropped by processor DVFS and memory throttling and give to GPUs L_n_mode_gpu_total_pcap += G_sysConfigData.total_proc_mem_pwr_drop_watts; } else { // This should not happen, the total non GPU power should never be higher than the N mode cap // Log error and set GPUs to minimum power cap L_n_mode_gpu_total_pcap = 0; // this will set minimum GPU power cap TRAC_ERR("amec_gpu_pcap: non GPU max power %dW is more than N mode pwr limit %dW", G_sysConfigData.total_non_gpu_max_pwr_watts, g_amec->pcap.ovs_node_pcap); /* @ * @errortype * @moduleid AMEC_GPU_PCAP_MID * @reasoncode GPU_FAILURE * @userdata1 N mode Power Cap watts * @userdata2 Total non-GPU power watts * @userdata4 ERC_GPU_N_MODE_PCAP_CALC_FAILURE * @devdesc Total non-GPU power more than N mode power cap * */ errlHndl_t l_err = createErrl(AMEC_GPU_PCAP_MID, GPU_FAILURE, ERC_GPU_N_MODE_PCAP_CALC_FAILURE, ERRL_SEV_PREDICTIVE, NULL, DEFAULT_TRACE_SIZE, g_amec->pcap.ovs_node_pcap, G_sysConfigData.total_non_gpu_max_pwr_watts); //Callout firmware addCalloutToErrl(l_err, ERRL_CALLOUT_TYPE_COMPONENT_ID, ERRL_COMPONENT_ID_FIRMWARE, ERRL_CALLOUT_PRIORITY_HIGH); commitErrl(&l_err); } // calculate total GPU power cap for N+1 (not in oversubscription) if(G_sysConfigData.pcap.system_pcap > G_sysConfigData.total_non_gpu_max_pwr_watts) { // Take all non-GPU power away from the N+1 power cap L_n_plus_1_mode_gpu_total_pcap = G_sysConfigData.pcap.system_pcap - G_sysConfigData.total_non_gpu_max_pwr_watts; // Add back in the power that will be dropped by processor DVFS and memory throttling and give to GPUs L_n_plus_1_mode_gpu_total_pcap += 
            G_sysConfigData.total_proc_mem_pwr_drop_watts;
        }
        else
        {
            // This should not happen, the total non GPU power should never be higher than the N+1 mode cap
            // Log error and set GPUs to minimum power cap
            L_n_plus_1_mode_gpu_total_pcap = 0; // this will set minimum GPU power cap

            TRAC_ERR("amec_gpu_pcap: non GPU max power %dW is more than N+1 mode pwr limit %dW",
                     G_sysConfigData.total_non_gpu_max_pwr_watts, G_sysConfigData.pcap.system_pcap);
            /* @
             * @errortype
             * @moduleid    AMEC_GPU_PCAP_MID
             * @reasoncode  GPU_FAILURE
             * @userdata1   N+1 mode Power Cap watts
             * @userdata2   Total non-GPU power watts
             * @userdata4   ERC_GPU_N_PLUS_1_MODE_PCAP_CALC_FAILURE
             * @devdesc     Total non-GPU power more than N+1 mode power cap
             */
            errlHndl_t l_err = createErrl(AMEC_GPU_PCAP_MID,
                                          GPU_FAILURE,
                                          ERC_GPU_N_PLUS_1_MODE_PCAP_CALC_FAILURE,
                                          ERRL_SEV_PREDICTIVE,
                                          NULL,
                                          DEFAULT_TRACE_SIZE,
                                          G_sysConfigData.pcap.system_pcap,
                                          G_sysConfigData.total_non_gpu_max_pwr_watts);
            //Callout firmware
            addCalloutToErrl(l_err,
                             ERRL_CALLOUT_TYPE_COMPONENT_ID,
                             ERRL_COMPONENT_ID_FIRMWARE,
                             ERRL_CALLOUT_PRIORITY_HIGH);
            commitErrl(&l_err);
        }
    } // if first run

    // Calculate the total GPU power cap for the current active limit and PSR;
    // this only needs to be calculated if either the active limit or PSR changed
    if( (L_first_run) || (i_active_pcap_changed) || (L_psr != G_sysConfigData.psr) )
    {
        L_psr = G_sysConfigData.psr;

        if(g_amec->pcap.active_node_pcap > G_sysConfigData.total_non_gpu_max_pwr_watts)
        {
            // Take all non-GPU power away from the active power cap
            L_active_psr_gpu_total_pcap = g_amec->pcap.active_node_pcap - G_sysConfigData.total_non_gpu_max_pwr_watts;

            // Add back in the power that will be dropped by processor DVFS and memory throttling based on the PSR
            // to give to GPUs (multiply before dividing so the integer math does
            // not truncate the PSR scaling to zero)
            L_active_psr_gpu_total_pcap += ( (L_psr * G_sysConfigData.total_proc_mem_pwr_drop_watts) / 100 );
        }
        else
        {
            // Set GPUs to minimum power cap
            L_active_psr_gpu_total_pcap = 0;
            TRAC_IMP("amec_gpu_pcap: non GPU max power %dW is more than active pwr limit %dW",
                     G_sysConfigData.total_non_gpu_max_pwr_watts, g_amec->pcap.active_node_pcap);
        }

        // Total GPU power cap is the lower of system (N+1 or oversubscription depending on if in oversub)
        // and the active power limit. We do not need to always account for oversubscription since
        // the automatic hw power brake will assert to the GPUs if there is a problem when oversub is
        // entered from the time OCC can set and GPUs react to a new power limit
        if(i_oversubscription)
        {
            // system in oversubscription, use N mode cap
            l_system_gpu_total_pcap = L_n_mode_gpu_total_pcap;
        }
        else
        {
            // system is not in oversubscription, use N+1 mode cap
            l_system_gpu_total_pcap = L_n_plus_1_mode_gpu_total_pcap;
        }

        L_total_gpu_pcap = (l_system_gpu_total_pcap < L_active_psr_gpu_total_pcap) ?
                            l_system_gpu_total_pcap : L_active_psr_gpu_total_pcap;

        // Divide the total equally across all GPUs in the system
        if(G_first_num_gpus_sys)
        {
            L_per_gpu_pcap = L_total_gpu_pcap / G_first_num_gpus_sys;
        }
        else
        {
            L_per_gpu_pcap = 0;
            TRAC_ERR("amec_gpu_pcap: Called with no GPUs present!");
        }
    }

    // Setup to send new power limit to GPUs. The actual sending of the GPU
    // power limit is handled by task_gpu_sm()
    for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++)
    {
        // Before sending a power limit to a GPU, the power limits must first
        // be read from the GPU to know the min/max the GPU allows
        if( GPU_PRESENT(i) && g_amec->gpu[i].pcap.pwr_limits_read )
        {
            l_gpu_cap_mw = L_per_gpu_pcap * 1000; // convert W to mW

            // GPU is present and we have min/max power limits from the GPU;
            // clip the GPU power limit to the min/max GPU limit if needed
            if(l_gpu_cap_mw < g_amec->gpu[i].pcap.gpu_min_pcap_mw) // clip to min?
            {
                l_gpu_cap_mw = g_amec->gpu[i].pcap.gpu_min_pcap_mw;
            }
            else if(l_gpu_cap_mw > g_amec->gpu[i].pcap.gpu_max_pcap_mw) // clip to max?
            {
                l_gpu_cap_mw = g_amec->gpu[i].pcap.gpu_max_pcap_mw;
            }

            // check if this is a new power limit
            if(g_amec->gpu[i].pcap.gpu_desired_pcap_mw != l_gpu_cap_mw)
            {
                if( (g_amec->gpu[i].pcap.gpu_desired_pcap_mw != 0) ||
                    (L_last_pcap_traced[i] != l_gpu_cap_mw) )
                {
                    L_last_pcap_traced[i] = l_gpu_cap_mw;
                    TRAC_IMP("amec_gpu_pcap: Updating GPU%d desired pcap %dmW to %dmW",
                             i, g_amec->gpu[i].pcap.gpu_desired_pcap_mw, l_gpu_cap_mw);
                }

                g_amec->gpu[i].pcap.gpu_desired_pcap_mw = l_gpu_cap_mw;
            }
        }
    } // for each GPU

    L_first_run = FALSE;
}
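// Worked example (hypothetical numbers) of the power-cap math above: with an
// N+1 system pcap of 3000W, total_non_gpu_max_pwr_watts = 2200W, and
// total_proc_mem_pwr_drop_watts = 400W, the N+1 GPU total is
// 3000 - 2200 + 400 = 1200W.  With 4 GPUs present, L_per_gpu_pcap is
// 1200 / 4 = 300W, which is sent to each GPU as 300000 mW after being
// clipped to that GPU's advertised min/max power limits.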
// Function Specification
//
// Name: proc_gpsm_dcm_sync_enable_pstates_smh
//
// Description: Step through all the states & sync needed to enable
//              Pstates on both master & slave on a DCM. This also
//              works for an SCM, which will act as DCM master (as far
//              as this function is concerned.)
//
// End Function Specification
void proc_gpsm_dcm_sync_enable_pstates_smh(void)
{
    // Static Locals
    static GpsmEnablePstatesMasterInfo l_master_info;
    static Pstate l_voltage_pstate, l_freq_pstate;

    // Local Variables
    int l_rc = 0;
    errlHndl_t l_errlHndl = NULL;

    if(!gpsm_dcm_slave_p())
    {
        // ---------------------------------------
        // SCM or DCM Master
        // ---------------------------------------
        switch( G_proc_dcm_sync_state.sync_state_master )
        {
            case PROC_GPSM_SYNC_NO_PSTATE_TABLE:
                // Waiting for Pstate Table from TMGT
                break;

            case PROC_GPSM_SYNC_PSTATE_TABLE_INSTALLED:
                PROC_DBG("GPST DCM Master State %d\n",G_proc_dcm_sync_state.sync_state_master);
                // DCM SYNC (MasterWaitForSlave): Wait for slave to install Pstate table
                if(gpsm_dcm_mode_p())
                {
                    if( G_proc_dcm_sync_state.sync_state_slave == PROC_GPSM_SYNC_PSTATE_TABLE_INSTALLED)
                    {
                        // Move to next state in state machine
                        G_proc_dcm_sync_state.sync_state_master = PROC_GPSM_SYNC_READY_TO_ENABLE_MASTER;
                    }
                }
                else
                {
                    // Move to next state in state machine
                    G_proc_dcm_sync_state.sync_state_master = PROC_GPSM_SYNC_READY_TO_ENABLE_MASTER;
                }
                break;

            case PROC_GPSM_SYNC_READY_TO_ENABLE_MASTER:
                PROC_DBG("GPST DCM Master State %d\n",G_proc_dcm_sync_state.sync_state_master);
                // Pstate table has been installed, so now Master can start to enable Pstates
                l_rc = gpsm_enable_pstates_master(&l_master_info, &l_voltage_pstate, &l_freq_pstate);
                if(l_rc)
                {
                    // Error
                    TRAC_ERR("MSTR: gpsm_enable_pstates_master failed with rc=0x%08x", l_rc);
                    G_proc_dcm_sync_state.sync_state_master = PROC_GPSM_SYNC_PSTATE_ERROR;
                    break;
                }
                TRAC_IMP("MSTR: Initial Pstates: V: %d, F: %d\n",l_voltage_pstate, l_freq_pstate);

                // DCM SYNC (Master2Slave): Send V & F Pstate to slave
                G_proc_dcm_sync_state.dcm_pair_id = G_pob_id.chip_id;
                G_proc_dcm_sync_state.pstate_v = l_voltage_pstate;
                G_proc_dcm_sync_state.pstate_f = l_freq_pstate;

                // Move to next state in state machine
                G_proc_dcm_sync_state.sync_state_master = PROC_GPSM_SYNC_PSTATE_MASTER_ENABLED;
                break;

            case PROC_GPSM_SYNC_PSTATE_MASTER_ENABLED:
                PROC_DBG("GPST DCM Master State %d\n",G_proc_dcm_sync_state.sync_state_master);
                // DCM SYNC (MasterWaitForSlave): Wait for slave to complete gpsm_enable_pstates_slave()
                if(gpsm_dcm_mode_p())
                {
                    if( G_proc_dcm_sync_state.sync_state_slave == PROC_GPSM_SYNC_PSTATE_SLAVE_ENABLED)
                    {
                        // Move to next state in state machine
                        G_proc_dcm_sync_state.sync_state_master = PROC_GPSM_SYNC_READY_TO_ENABLE_SLAVE;
                    }
                }
                else
                {
                    G_proc_dcm_sync_state.sync_state_master = PROC_GPSM_SYNC_READY_TO_ENABLE_SLAVE;
                }
                break;

            case PROC_GPSM_SYNC_READY_TO_ENABLE_SLAVE:
                PROC_DBG("GPST DCM Master State %d\n",G_proc_dcm_sync_state.sync_state_master);
                // Master does next step of enabling Pstates, now that slave has done its enable
                l_rc = gpsm_enable_pstates_slave(&l_master_info, l_voltage_pstate, l_freq_pstate);
                if(l_rc)
                {
                    // Error
                    TRAC_ERR("MSTR: gpsm_enable_pstates_slave failed with rc=0x%08x", l_rc);
                    G_proc_dcm_sync_state.sync_state_master = PROC_GPSM_SYNC_PSTATE_ERROR;
                    break;
                }
                TRAC_INFO("MSTR: Completed DCM Pstate Slave Init\n");
                G_proc_dcm_sync_state.sync_state_master = PROC_GPSM_SYNC_PSTATE_SLAVE_ENABLED;
                break;

            case PROC_GPSM_SYNC_PSTATE_SLAVE_ENABLED:
                PROC_DBG("GPST DCM Master State %d\n",G_proc_dcm_sync_state.sync_state_master);
                // Master puts this chip in Pstate HW mode
                l_rc = gpsm_hw_mode();
                if(l_rc)
                {
                    // Error
                    TRAC_ERR("MSTR: gpsm_hw_mode failed with rc=0x%08x", l_rc);
                    G_proc_dcm_sync_state.sync_state_master = PROC_GPSM_SYNC_PSTATE_ERROR;
                    break;
                }
                // DCM SYNC (Master2Slave): Tell Slave that Master has entered HW mode
                G_proc_dcm_sync_state.sync_state_master = PROC_GPSM_SYNC_PSTATE_HW_MODE;
                break;

            case PROC_GPSM_SYNC_PSTATE_HW_MODE:
                PROC_DBG("GPST DCM Master State %d\n",G_proc_dcm_sync_state.sync_state_master);
                // DCM SYNC (MasterWaitForSlave): Wait for Slave to Enter HW Mode
                if(gpsm_dcm_mode_p())
                {
                    if( G_proc_dcm_sync_state.sync_state_slave == PROC_GPSM_SYNC_PSTATE_HW_MODE)
                    {
                        TRAC_INFO("MSTR: Completed DCM Pstate Enable");
                        G_proc_dcm_sync_state.sync_state_master = PROC_GPSM_SYNC_PSTATE_HW_MODE_ENABLED;

                        //do additional setup if in kvm mode
                        proc_pstate_kvm_setup();
                    }
                }
                else
                {
                    G_proc_dcm_sync_state.sync_state_master = PROC_GPSM_SYNC_PSTATE_HW_MODE_ENABLED;
                    TRAC_INFO("MSTR: Completed SCM Pstate Enable");

                    //do additional setup if in kvm mode
                    proc_pstate_kvm_setup();
                }
                break;

            case PROC_GPSM_SYNC_PSTATE_HW_MODE_ENABLED:
                // Final State
                // Pstates Enabled on both modules in DCM
                break;

            case PROC_GPSM_SYNC_PSTATE_ERROR:
                // Do nothing, something will have to come and kick us out of this state
                break;

            default:
                G_proc_dcm_sync_state.sync_state_master = PROC_GPSM_SYNC_NO_PSTATE_TABLE;
                break;
        }
    }
    else if (gpsm_dcm_slave_p())
    {
        // ---------------------------------------
        // DCM Slave
        //  - Don't need to check if DCM, since we can't come in here unless DCM
        // ---------------------------------------
        switch( G_proc_dcm_sync_state.sync_state_slave)
        {
            case PROC_GPSM_SYNC_NO_PSTATE_TABLE:
                // Waiting for Pstate Table from TMGT
                break;

            case PROC_GPSM_SYNC_PSTATE_TABLE_INSTALLED:
                // Pstate table has been installed, but slave needs to wait
                // for master before it can do anything else.
                // DCM SYNC (SlaveWaitForMaster): Wait for master to send the
                // V & F Pstates; i.e., wait for Master to complete
                // gpsm_enable_pstates_master() before running
                // gpsm_enable_pstates_slave()
                if( G_proc_dcm_sync_state.sync_state_master == PROC_GPSM_SYNC_PSTATE_MASTER_ENABLED)
                {
                    // Go to next state
                    G_proc_dcm_sync_state.sync_state_slave = PROC_GPSM_SYNC_PSTATE_MASTER_ENABLED;
                }
                break;

            case PROC_GPSM_SYNC_PSTATE_MASTER_ENABLED:
                PROC_DBG("GPST DCM Slave State %d\n",G_proc_dcm_sync_state.sync_state_slave);
                // Read the initial Pstates from the data DCM master sent
                l_voltage_pstate = G_proc_dcm_sync_state.pstate_v;
                l_freq_pstate = G_proc_dcm_sync_state.pstate_f;

                // NULL is passed to this function when run on dcm slave
                l_rc = gpsm_enable_pstates_slave(NULL, l_voltage_pstate, l_freq_pstate);
                if(l_rc)
                {
                    // Error
                    TRAC_ERR("SLV: gpsm_enable_pstates_slave failed with rc=0x%08x", l_rc);
                    G_proc_dcm_sync_state.sync_state_slave = PROC_GPSM_SYNC_PSTATE_ERROR;
                    break;
                }
                TRAC_INFO("SLV: Completed DCM Pstate Slave Init\n");

                // DCM SYNC (Slave2Master):
                // Tell Master that slave has run gpsm_enable_pstates_slave()
                // Go to next state
                G_proc_dcm_sync_state.sync_state_slave = PROC_GPSM_SYNC_PSTATE_SLAVE_ENABLED;
                break;

            case PROC_GPSM_SYNC_PSTATE_SLAVE_ENABLED:
                // DCM SYNC (SlaveWaitForMaster): Wait for Master to run gpsm_hw_mode
                if( G_proc_dcm_sync_state.sync_state_master == PROC_GPSM_SYNC_PSTATE_HW_MODE)
                {
                    // Enter Pstate HW mode
                    l_rc = gpsm_hw_mode();
                    if(l_rc)
                    {
                        // Error
                        TRAC_ERR("SLV: gpsm_hw_mode failed with rc=0x%08x", l_rc);
                        G_proc_dcm_sync_state.sync_state_slave = PROC_GPSM_SYNC_PSTATE_ERROR;
                        break;
                    }
                    // DCM SYNC (Slave2Master): Tell master that DCM slave made it to HW mode
                    // Go to next state
                    G_proc_dcm_sync_state.sync_state_slave = PROC_GPSM_SYNC_PSTATE_HW_MODE;
                }
                break;

            case PROC_GPSM_SYNC_PSTATE_HW_MODE:
                // Slave & Master now both know each other has HW mode enabled
                if( G_proc_dcm_sync_state.sync_state_master == PROC_GPSM_SYNC_PSTATE_HW_MODE_ENABLED)
                {
                    G_proc_dcm_sync_state.sync_state_slave = PROC_GPSM_SYNC_PSTATE_HW_MODE_ENABLED;
                    TRAC_INFO("SLV: Completed DCM Pstate Enable");

                    //do additional setup if in kvm mode
                    proc_pstate_kvm_setup();
                }
                break;

            case PROC_GPSM_SYNC_PSTATE_HW_MODE_ENABLED:
                // Final State
                // Pstates Enabled on both modules in DCM
                break;

            case PROC_GPSM_SYNC_PSTATE_ERROR:
                // Do nothing, something will have to come and kick us out of this state
                break;

            default:
                G_proc_dcm_sync_state.sync_state_slave = PROC_GPSM_SYNC_NO_PSTATE_TABLE;
                break;
        }
    }

    // If we are in the process of running through the state machine,
    // we will do a sem_post to speed up the DCOM Thread and step us
    // through faster.
    if( PROC_GPSM_SYNC_NO_PSTATE_TABLE != proc_gpsm_dcm_sync_get_my_state()
        && !proc_is_hwpstate_enabled() )
    {
        ssx_semaphore_post(&G_dcomThreadWakeupSem);
    }

    // If we broke out of the loops above because of an error, create an
    // error log and request a reset.
if(l_rc) { /* @ * @errortype * @moduleid PROC_ENABLE_PSTATES_SMH_MOD * @reasoncode SSX_GENERIC_FAILURE * @userdata1 SRAM Address of the Pstate Table * @userdata2 Return Code of call that failed * @userdata4 OCC_NO_EXTENDED_RC * @devdesc Failed to install Pstate Table */ l_errlHndl = createErrl( PROC_ENABLE_PSTATES_SMH_MOD, //modId SSX_GENERIC_FAILURE, //reasoncode OCC_NO_EXTENDED_RC, //Extended reason code ERRL_SEV_PREDICTIVE, //Severity NULL, //TODO: create trace //Trace Buf DEFAULT_TRACE_SIZE, //Trace Size (uint32_t) &G_global_pstate_table, //userdata1 l_rc); //userdata2 addCalloutToErrl(l_errlHndl, ERRL_CALLOUT_TYPE_COMPONENT_ID, ERRL_COMPONENT_ID_FIRMWARE, ERRL_CALLOUT_PRIORITY_HIGH); addCalloutToErrl(l_errlHndl, ERRL_CALLOUT_TYPE_HUID, G_sysConfigData.proc_huid, ERRL_CALLOUT_PRIORITY_LOW); REQUEST_RESET(l_errlHndl); } return; }
// Function Specification
//
// Name: proc_pstate_kvm_setup
//
// Description: Get everything set up for KVM mode
//
// End Function Specification
void proc_pstate_kvm_setup()
{
    int l_core;
    int l_rc = 0;
    uint32_t l_configured_cores;
    pcbs_pcbspm_mode_reg_t l_ppmr;
    pcbs_pmgp1_reg_t l_pmgp1;
    pcbs_power_management_bounds_reg_t l_pmbr;
    errlHndl_t l_errlHndl = NULL;

    do
    {
        //only run this in KVM mode
        if(!G_sysConfigData.system_type.kvm)
        {
            break;
        }

        l_configured_cores = ~in32(PMC_CORE_DECONFIGURATION_REG);

        // Do per-core configuration
        for(l_core = 0; l_core < PGP_NCORES; l_core++, l_configured_cores <<= 1)
        {
            if(!(l_configured_cores & 0x80000000))
                continue;

            //do read-modify-write to allow pmax clip to also clip voltage (not just frequency)
            l_rc = getscom_ffdc(CORE_CHIPLET_ADDRESS(PCBS_PCBSPM_MODE_REG, l_core),
                                &(l_ppmr.value), NULL); //commit errors internally
            if(l_rc)
            {
                TRAC_ERR("proc_pstate_kvm_setup: getscom(PCBS_PCBSPM_MODE_REG) failed. rc=%d, hw_core=%d",
                         l_rc, l_core);
                break;
            }
            l_ppmr.fields.enable_clipping_of_global_pstate_req = 1;
            l_rc = putscom_ffdc(CORE_CHIPLET_ADDRESS(PCBS_PCBSPM_MODE_REG, l_core),
                                l_ppmr.value, NULL); //commit errors internally
            if(l_rc)
            {
                TRAC_ERR("proc_pstate_kvm_setup: putscom(PCBS_PCBSPM_MODE_REG) failed. rc=%d, hw_core=%d",
                         l_rc, l_core);
                break;
            }

            //per Vaidy Srinivasan, clear bit 11 in the Power Management GP1 register
            l_pmgp1.value = 0;
            l_pmgp1.fields.pm_spr_override_en = 1;
            l_rc = putscom_ffdc(CORE_CHIPLET_ADDRESS(PCBS_PMGP1_REG_AND, l_core),
                                ~l_pmgp1.value, NULL); //commit errors internally
            if(l_rc)
            {
                TRAC_ERR("proc_pstate_kvm_setup: putscom(PCBS_PMGP1_REG_AND) failed. rc=0x%08x, hw_core=%d",
                         l_rc, l_core);
                break;
            }

            //set pmax/pmin clip initial settings
            l_pmbr.value = 0;
            l_pmbr.fields.pmin_clip = gpst_pmin(&G_global_pstate_table)+1; //Per David Du, we must use pmin+1 to avoid gpsa hang
            l_pmbr.fields.pmax_clip = gpst_pmax(&G_global_pstate_table);
            l_rc = putscom_ffdc(CORE_CHIPLET_ADDRESS(PCBS_POWER_MANAGEMENT_BOUNDS_REG, l_core),
                                l_pmbr.value, NULL); //commit errors internally
            if(l_rc)
            {
                TRAC_ERR("proc_pstate_kvm_setup: putscom(PCBS_POWER_MANAGEMENT_BOUNDS_REG) failed. rc=0x%08x, hw_core=%d",
                         l_rc, l_core);
                break;
            }
        }// end of per-core config

        if(l_rc)
        {
            break;
        }

        // Set the voltage clipping register to match the pmax/pmin clip values set above.
pmc_rail_bounds_register_t prbr; prbr.value = in32(PMC_RAIL_BOUNDS_REGISTER); prbr.fields.pmin_rail = gpst_pmin(&G_global_pstate_table); prbr.fields.pmax_rail = gpst_pmax(&G_global_pstate_table); TRAC_IMP("pmin clip pstate = %d, pmax clip pstate = %d", prbr.fields.pmin_rail, prbr.fields.pmax_rail); out32(PMC_RAIL_BOUNDS_REGISTER, prbr.value); // Initialize the sapphire table in SRAM (sets valid bit) populate_pstate_to_sapphire_tbl(); // copy sram image into mainstore HOMER populate_sapphire_tbl_to_mem(); TRAC_IMP("proc_pstate_kvm_setup: RUNNING IN KVM MODE"); }while(0); if(l_rc) { // Create Error Log and request reset /* @ * @errortype * @moduleid PROC_PSTATE_KVM_SETUP_MOD * @reasoncode PROC_SCOM_ERROR * @userdata1 l_configured_cores * @userdata2 Return Code of call that failed * @userdata4 OCC_NO_EXTENDED_RC * @devdesc OCC failed to scom a core register */ l_errlHndl = createErrl( PROC_PSTATE_KVM_SETUP_MOD, //modId PROC_SCOM_ERROR, //reasoncode OCC_NO_EXTENDED_RC, //Extended reason code ERRL_SEV_PREDICTIVE, //Severity NULL, //Trace Buf DEFAULT_TRACE_SIZE, //Trace Size l_configured_cores, //userdata1 l_rc //userdata2 ); addCalloutToErrl(l_errlHndl, ERRL_CALLOUT_TYPE_HUID, G_sysConfigData.proc_huid, ERRL_CALLOUT_PRIORITY_HIGH); addCalloutToErrl(l_errlHndl, ERRL_CALLOUT_TYPE_COMPONENT_ID, ERRL_COMPONENT_ID_FIRMWARE, ERRL_CALLOUT_PRIORITY_MED); REQUEST_RESET(l_errlHndl); } }
// Function Specification
//
// Name: proc_gpsm_pstate_initialize
//
// Description: Initialize Pstate Table (and the rest of the Pstate
//              SuperStructure). Also, initialize Global variables
//              that will speed up the proc_freq2pstate function.
//
// End Function Specification
errlHndl_t proc_gpsm_pstate_initialize(const PstateSuperStructure* i_pss)
{
    errlHndl_t l_errlHndl = NULL;
    GlobalPstateTable * l_gpst_ptr = NULL;
    int l_rc = 0;

    do
    {
        /// Because early ECs of the Murano chip did not have valid #V data,
        /// we need to exclude them from loading a pstate table created by a
        /// hardware procedure. If we run a table created from a #V on these
        /// chips, we could crash the box (or worse, burn something up!)
        if ( (cfam_id() == CFAM_CHIP_ID_MURANO_10) ||
             (cfam_id() == CFAM_CHIP_ID_MURANO_11) )
        {
            TRAC_ERR("OCC not supported on murano dd10 or dd11 due to bad #V data. chip id = 0x%08x",
                     cfam_id());

            // Create Error Log and return to caller
            /* @
             * @errortype
             * @moduleid    PROC_GPST_INIT_FAILURE_MOD
             * @reasoncode  INTERNAL_FAILURE
             * @userdata1   chip id
             * @userdata2   0
             * @userdata4   OCC_NO_EXTENDED_RC
             * @devdesc     OCC not supported on Murano DD10 or DD11
             */
            l_errlHndl = createErrl( PROC_GPST_INIT_FAILURE_MOD,  //modId
                                     INTERNAL_FAILURE,            //reasoncode
                                     OCC_NO_EXTENDED_RC,          //Extended reason code
                                     ERRL_SEV_UNRECOVERABLE,      //Severity
                                     NULL,                        //Trace Buf
                                     DEFAULT_TRACE_SIZE,          //Trace Size
                                     cfam_id(),                   //userdata1
                                     0                            //userdata2
                                     );

            //callout the processor
            addCalloutToErrl(l_errlHndl,
                             ERRL_CALLOUT_TYPE_HUID,
                             G_sysConfigData.proc_huid,
                             ERRL_CALLOUT_PRIORITY_HIGH);
            break;
        }

        l_rc = gpsm_initialize(i_pss, &G_global_pstate_table);

        // Print key elements of table for debug
        proc_trace_pstate_table_quick();

        // Get Pstate Table Ptr
        l_gpst_ptr = gpsm_gpst();

        if(l_rc || (l_gpst_ptr != &G_global_pstate_table))
        {
            TRAC_ERR("gpsm_initialize failed with rc=0x%08x or l_gpstr_ptr=0x%08x", l_rc, l_gpst_ptr);

            // Create Error Log and return to caller
            /* @
             * @errortype
             * @moduleid    PROC_GPST_INIT_FAILURE_MOD
             * @reasoncode  INTERNAL_FAILURE
             * @userdata1   SRAM Address of the Pstate Table
             * @userdata2   Return Code of call that failed
             * @userdata4   ERC_PROC_PSTATE_INSTALL_FAILURE
             * @devdesc     Failed to install Pstate Table
             */
            l_errlHndl = createErrl( PROC_GPST_INIT_FAILURE_MOD,        //modId
                                     INTERNAL_FAILURE,                  //reasoncode
                                     ERC_PROC_PSTATE_INSTALL_FAILURE,   //Extended reason code
                                     ERRL_SEV_UNRECOVERABLE,            //Severity
                                     NULL,                              //Trace Buf
                                     DEFAULT_TRACE_SIZE,                //Trace Size
                                     (uint32_t) &G_global_pstate_table, //userdata1
                                     l_rc                               //userdata2
                                     );

            addCalloutToErrl(l_errlHndl,
                             ERRL_CALLOUT_TYPE_HUID,
                             G_sysConfigData.proc_huid,
                             ERRL_CALLOUT_PRIORITY_HIGH);

            addCalloutToErrl(l_errlHndl,
                             ERRL_CALLOUT_TYPE_COMPONENT_ID,
                             ERRL_COMPONENT_ID_FIRMWARE,
                             ERRL_CALLOUT_PRIORITY_MED);
            break;
        }

        // set up key globals based on the pstate table.

        // Set the pstate state (state machine will start enabling pstates
        // when it sees this)
        proc_gpsm_dcm_sync_set_state(PROC_GPSM_SYNC_PSTATE_TABLE_INSTALLED);

        // Set up Key Globals for use by proc_freq2pstate functions
        G_proc_gpst_fmax = l_gpst_ptr->pstate0_frequency_khz
            + (((int8_t) l_gpst_ptr->pmin + l_gpst_ptr->entries - 1) * l_gpst_ptr->frequency_step_khz);
        G_proc_gpst_fmin = l_gpst_ptr->pstate0_frequency_khz
            + (((int8_t) l_gpst_ptr->pmin) * l_gpst_ptr->frequency_step_khz);
        G_proc_gpst_pmax = l_gpst_ptr->pmin + l_gpst_ptr->entries - 1;

        // Dcom uses this to know whether to pass DCM msgs or not.
        G_isDcm = gpsm_dcm_mode_p();

        // Set globals used by amec for pcap calculation
        G_mhz_per_pstate = (l_gpst_ptr->frequency_step_khz)/1000;

    }while(0);

    return l_errlHndl;
}
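// Hedged sketch (not the actual proc_freq2pstate implementation) of how the
// globals computed above map a frequency to a pstate: pstates step linearly
// from pstate 0 at pstate0_frequency_khz in steps of frequency_step_khz, so
// after clipping to [G_proc_gpst_fmin, G_proc_gpst_fmax] the pstate is the
// signed step count from pstate 0 (the real code's rounding handling is
// omitted here).
int8_t proc_freq2pstate_sketch(const uint32_t i_freq_khz)
{
    const GlobalPstateTable* l_gpst_ptr = &G_global_pstate_table;
    uint32_t l_freq_khz = i_freq_khz;

    // Clip to the frequency range the installed table supports
    if (l_freq_khz > G_proc_gpst_fmax) { l_freq_khz = G_proc_gpst_fmax; }
    if (l_freq_khz < G_proc_gpst_fmin) { l_freq_khz = G_proc_gpst_fmin; }

    // Signed distance from pstate 0 in whole frequency steps
    return (int8_t)(((int32_t)l_freq_khz - (int32_t)l_gpst_ptr->pstate0_frequency_khz)
                    / (int32_t)l_gpst_ptr->frequency_step_khz);
}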
// Function Specification
//
// Name: populate_sapphire_tbl_to_mem
//
// Description: Copy the sapphire pstate table from SRAM to its reserved
//              location in mainstore HOMER using the PBA block-copy
//              engine. Commits an unrecoverable error if the copy cannot
//              be created or scheduled.
//
// End Function Specification
void populate_sapphire_tbl_to_mem()
{
    int l_ssxrc = SSX_OK;
    uint32_t l_reasonCode = 0;
    uint32_t l_extReasonCode = 0;

    do
    {
#define SAPPHIRE_OFFSET_IN_HOMER 0x001F8000
        BceRequest pba_copy;

        // Set up copy request
        l_ssxrc = bce_request_create(&pba_copy,                 // block copy object
                                     &G_pba_bcue_queue,         // sram to mainstore copy engine
                                     SAPPHIRE_OFFSET_IN_HOMER,  // mainstore address
                                     (uint32_t) &G_sapphire_table, // sram starting address
                                     (size_t) sizeof(G_sapphire_table), // size of copy
                                     SSX_WAIT_FOREVER,          // no timeout
                                     NULL,                      // call back
                                     NULL,                      // call back arguments
                                     ASYNC_REQUEST_BLOCKING     // callback mask
                                     );

        if(l_ssxrc != SSX_OK)
        {
            TRAC_ERR("populate_sapphire_tbl_to_mem: PBA request create failure rc=[%08X]", -l_ssxrc);

            /*
             * @errortype
             * @moduleid    MAIN_STATE_TRANSITION_MID
             * @reasoncode  SSX_GENERIC_FAILURE
             * @userdata1   RC for PBA block-copy engine
             * @userdata4   ERC_BCE_REQUEST_CREATE_FAILURE
             * @devdesc     SSX BCE related failure
             */
            l_reasonCode = SSX_GENERIC_FAILURE;
            l_extReasonCode = ERC_BCE_REQUEST_CREATE_FAILURE;
            break;
        }

        // Do actual copying
        l_ssxrc = bce_request_schedule(&pba_copy);

        if(l_ssxrc != SSX_OK)
        {
            TRAC_ERR("populate_sapphire_tbl_to_mem: PBA request schedule failure rc=[%08X]", -l_ssxrc);

            /*
             * @errortype
             * @moduleid    MAIN_STATE_TRANSITION_MID
             * @reasoncode  SSX_GENERIC_FAILURE
             * @userdata1   RC for PBA block-copy engine
             * @userdata4   ERC_BCE_REQUEST_SCHEDULE_FAILURE
             * @devdesc     Failed to copy data via DMA
             */
            l_reasonCode = SSX_GENERIC_FAILURE;
            l_extReasonCode = ERC_BCE_REQUEST_SCHEDULE_FAILURE;
            break;
        }
    } while(0);

    if ( l_ssxrc != SSX_OK )
    {
        errlHndl_t l_errl = createErrl(MAIN_STATE_TRANSITION_MID, //modId
                                       l_reasonCode,              //reasoncode
                                       l_extReasonCode,           //Extended reason code
                                       ERRL_SEV_UNRECOVERABLE,    //Severity
                                       NULL,                      //Trace Buf
                                       0,                         //Trace Size
                                       -l_ssxrc,                  //userdata1
                                       0);                        //userdata2

        // Callout firmware
        addCalloutToErrl(l_errl,
                         ERRL_CALLOUT_TYPE_COMPONENT_ID,
                         ERRL_COMPONENT_ID_FIRMWARE,
                         ERRL_CALLOUT_PRIORITY_HIGH);

        commitErrl(&l_errl);
    }
}
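// ----------------------------------------------------------------------
// Illustrative sketch only: the create-then-schedule pair above is the
// generic BCE pattern. A hypothetical wrapper for SRAM-to-mainstore
// copies is shown below; the alignment caveat is an assumption (PBA
// block copies generally want cacheline-aligned addresses and sizes) and
// should be checked against the BCE documentation.
// ----------------------------------------------------------------------
static int example_bce_sram_to_mem(const uint32_t i_main_addr,
                                   const uint32_t i_sram_addr,
                                   const size_t   i_size)
{
    BceRequest l_req;

    // Build a blocking upload request (same argument order as above)
    int l_rc = bce_request_create(&l_req,
                                  &G_pba_bcue_queue,   // sram to mainstore engine
                                  i_main_addr,         // mainstore address
                                  i_sram_addr,         // sram starting address
                                  i_size,              // size of copy
                                  SSX_WAIT_FOREVER,    // no timeout
                                  NULL,                // call back
                                  NULL,                // call back arguments
                                  ASYNC_REQUEST_BLOCKING);
    if(l_rc == SSX_OK)
    {
        // Blocks until the copy completes (or fails)
        l_rc = bce_request_schedule(&l_req);
    }
    return l_rc;
}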
// Function Specification
//
// Name: SMGR_set_mode
//
// Description: Set the OCC mode. Validates the requested mode, runs the
//              matching transition function from the mode transition
//              table, and reloads the mode-dependent thermal thresholds.
//
// End Function Specification
errlHndl_t SMGR_set_mode(const OCC_MODE i_mode, const uint8_t i_sms_type)
{
    errlHndl_t l_errlHndl = NULL;
    int jj = 0;
    OCC_MODE l_mode = i_mode;

    do
    {
        // Get lock for critical section
        if(ssx_semaphore_pend(&G_smgrModeChangeSem, SSX_WAIT_FOREVER))
        {
            /* @
             * @errortype
             * @moduleid    MAIN_MODE_TRANSITION_MID
             * @reasoncode  SSX_GENERIC_FAILURE
             * @userdata1   none
             * @userdata4   ERC_RUNNING_SEM_PENDING_FAILURE
             * @devdesc     SSX semaphore related failure
             */
            l_errlHndl = createErrl(MAIN_MODE_TRANSITION_MID,        //modId
                                    SSX_GENERIC_FAILURE,             //reasoncode
                                    ERC_RUNNING_SEM_PENDING_FAILURE, //Extended reason code
                                    ERRL_SEV_UNRECOVERABLE,          //Severity
                                    NULL,                            //Trace Buf
                                    DEFAULT_TRACE_SIZE,              //Trace Size
                                    0,                               //userdata1
                                    0);                              //userdata2

            // Callout firmware
            addCalloutToErrl(l_errlHndl,
                             ERRL_CALLOUT_TYPE_COMPONENT_ID,
                             ERRL_COMPONENT_ID_FIRMWARE,
                             ERRL_CALLOUT_PRIORITY_HIGH);
            break;
        }

        // Check to see if we need to make a change
        if(l_mode == OCC_MODE_NOCHANGE)
        {
            break;
        }

        // SAPPHIRE only accepts DPS-FE mode. In case OCC gets other modes, it
        // should accept the request and keep reporting back that it is in
        // that mode. However, internally we should not initiate any mode
        // transition, i.e., OCC should remain internally in DPS-FE mode.
        if(G_sysConfigData.system_type.kvm)
        {
            G_occ_external_req_mode_kvm = l_mode;
            if (l_mode != OCC_MODE_DYN_POWER_SAVE)
            {
                TRAC_ERR("SAPPHIRE only accepts DPS-FE mode(6) but requested mode is : %d", l_mode);
                l_mode = OCC_MODE_DYN_POWER_SAVE;
            }
        }

        switch (l_mode)
        {
            case OCC_MODE_NOMINAL:           // FALL THROUGH
            case OCC_MODE_PWRSAVE:           // FALL THROUGH
            case OCC_MODE_DYN_POWER_SAVE:    // FALL THROUGH
            case OCC_MODE_DYN_POWER_SAVE_FP: // FALL THROUGH
            case OCC_MODE_TURBO:             // FALL THROUGH
            case OCC_MODE_STURBO:            // FALL THROUGH
            case OCC_MODE_FFO:               // FALL THROUGH

                // Notify AMEC of mode change
                // Change Mode via Transition Function
                do
                {
                    // Loop through mode transition table, and find the state
                    // transition function that matches the transition we need
                    // to do.
                    for(jj=0; jj<G_smgr_mode_trans_count; jj++)
                    {
                        if( ((G_smgr_mode_trans[jj].old_state == G_occ_internal_mode)
                             || (G_smgr_mode_trans[jj].old_state == OCC_MODE_ALL))
                            && (G_smgr_mode_trans[jj].new_state == l_mode) )
                        {
                            // We found the transition that matches, now run
                            // the function that is associated with that state
                            // transition.
                            if(NULL != G_smgr_mode_trans[jj].trans_func_ptr)
                            {
                                // Signal that we are now in a mode transition
                                G_mode_transition_occuring = TRUE;
                                // Run transition function
                                l_errlHndl = (G_smgr_mode_trans[jj].trans_func_ptr)();
                                // Signal that we are done with the transition
                                G_mode_transition_occuring = FALSE;
                                break;
                            }
                        }
                    }

                    // Check if we hit the end of the table without finding a
                    // valid mode transition.  If we did, log an internal error.
                    if(G_smgr_mode_trans_count == jj)
                    {
                        TRAC_ERR("No transition (or NULL) found for the mode change\n");
                        l_errlHndl = NULL; //TODO: Create Error
                        break;
                    }

                    // Update the power mode for all core groups that are
                    // following system mode
                    AMEC_part_update_sysmode_policy(CURRENT_MODE());
                } while(0);

                break;

            default:
                // Unsupported mode
                break;
        }

        if(l_errlHndl)
        {
            // Punt !!! :-)
            break;
        }

        // Load correct thermal thresholds based on the current mode
        l_errlHndl = AMEC_data_write_thrm_thresholds(CURRENT_MODE());

        // Update the CPU speed in AME?
        // Register the New Mode?
        // Update Power Policy Requirements?
        // Update CPM Calibration

    }while(0);

    // If we have a mode change failure, the mode change flag needs to be set;
    // otherwise, it needs to be cleared/unset.
    if(l_errlHndl)
    {
        //TODO: set/clear the mode change failure flag per the note above
    }

    // Unlock critical section
    ssx_semaphore_post(&G_smgrModeChangeSem);

    return l_errlHndl;
}
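// ----------------------------------------------------------------------
// Illustrative sketch only: the lookup loop in SMGR_set_mode() implies a
// transition-table entry of roughly this shape. The field names come
// from the code above; the typedef name and the sample entry (including
// the worker function) are hypothetical.
// ----------------------------------------------------------------------
typedef struct
{
    OCC_MODE     old_state;              // mode being left, or OCC_MODE_ALL (wildcard)
    OCC_MODE     new_state;              // mode being entered
    errlHndl_t (*trans_func_ptr)(void);  // transition worker; may be NULL
} smgr_mode_trans_example_t;

// e.g. a wildcard entry that would match any-mode -> nominal:
//   { OCC_MODE_ALL, OCC_MODE_NOMINAL, &example_trans_to_nominal }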
void task_centaur_control( task_t * i_task )
{
    errlHndl_t            l_err = NULL;     // Error log handle
    int                   rc = 0;           // Return code
    uint32_t              l_cent;
    amec_centaur_t        *l_cent_ptr = NULL;
    static uint8_t        L_scom_timeout[MAX_NUM_CENTAURS] = {0}; //track # of consecutive failures
    static bool           L_gpe_scheduled = FALSE;
    static uint8_t        L_gpe_fail_logged = 0;
    static bool           L_gpe_idle_traced = FALSE;
    static bool           L_gpe_had_1_tick = FALSE;

    // Pointer to the task data structure
    centaur_control_task_t * l_centControlTask =
        (centaur_control_task_t *) i_task->data_ptr;

    // Pointer to parameter field for GPE request
    GpeScomParms * l_parms =
        (GpeScomParms *)(l_centControlTask->gpe_req.parameter);

    do
    {
        l_cent = l_centControlTask->curCentaur;
        l_cent_ptr = &g_amec->proc[0].memctl[l_cent].centaur;

        // First, check to see if the previous GPE request is still running.
        // A request is considered idle if it is not attached to any of the
        // asynchronous request queues.
        if( !(async_request_is_idle(&l_centControlTask->gpe_req.request)) )
        {
            L_scom_timeout[l_cent]++;

            // This can happen due to variability in when the task runs
            if(!L_gpe_idle_traced && L_gpe_had_1_tick)
            {
                TRAC_INFO("task_centaur_control: GPE is still running. cent[%d]", l_cent);
                l_centControlTask->traceThresholdFlags |= CENTAUR_CONTROL_GPE_STILL_RUNNING;
                L_gpe_idle_traced = TRUE;
            }
            L_gpe_had_1_tick = TRUE;
            break;
        }
        else
        {
            // Request is idle
            L_gpe_had_1_tick = FALSE;
            if(L_gpe_idle_traced)
            {
                TRAC_INFO("task_centaur_control: GPE completed. cent[%d]", l_cent);
                L_gpe_idle_traced = FALSE;
            }
        }

        // Check scom status
        if(L_gpe_scheduled)
        {
            if(!async_request_completed(&l_centControlTask->gpe_req.request) || l_parms->rc)
            {
                if(!(L_gpe_fail_logged & (CENTAUR0_PRESENT_MASK >> l_cent)))
                {
                    // Check if the centaur has a channel checkstop.  If it
                    // does, then do not log any errors.  We also don't want
                    // to throttle a centaur that is in this condition.
                    if(!(cent_chan_checkstop(l_cent)))
                    {
                        L_gpe_fail_logged |= CENTAUR0_PRESENT_MASK >> l_cent;

                        TRAC_ERR("task_centaur_control: gpe_scom_centaur failed. l_cent=%d rc=%x, index=0x%08x",
                                 l_cent, l_parms->rc, l_parms->errorIndex);

                        /* @
                         * @errortype
                         * @moduleid    CENT_TASK_CONTROL_MOD
                         * @reasoncode  CENT_SCOM_ERROR
                         * @userdata1   rc - Return code of scom operation
                         * @userdata2   index of scom operation that failed
                         * @userdata4   OCC_NO_EXTENDED_RC
                         * @devdesc     OCC access to centaur failed
                         */
                        l_err = createErrl(
                            CENT_TASK_CONTROL_MOD,              // modId
                            CENT_SCOM_ERROR,                    // reasoncode
                            OCC_NO_EXTENDED_RC,                 // Extended reason code
                            ERRL_SEV_PREDICTIVE,                // Severity
                            NULL,                               // Trace Buf
                            DEFAULT_TRACE_SIZE,                 // Trace Size
                            l_parms->rc,                        // userdata1
                            l_parms->errorIndex                 // userdata2
                            );

                        addUsrDtlsToErrl(l_err,                                          //io_err
                                         (uint8_t *) &(l_centControlTask->gpe_req.ffdc), //i_dataPtr,
                                         sizeof(PoreFfdc),                               //i_size
                                         ERRL_USR_DTL_STRUCT_VERSION_1,                  //version
                                         ERRL_USR_DTL_BINARY_DATA);                      //type

                        //callout the centaur
                        addCalloutToErrl(l_err,
                                         ERRL_CALLOUT_TYPE_HUID,
                                         G_sysConfigData.centaur_huids[l_cent],
                                         ERRL_CALLOUT_PRIORITY_MED);

                        //callout the processor
                        addCalloutToErrl(l_err,
                                         ERRL_CALLOUT_TYPE_HUID,
                                         G_sysConfigData.proc_huid,
                                         ERRL_CALLOUT_PRIORITY_MED);

                        commitErrl(&l_err);
                    }
                }//if(!(L_gpe_fail_logged & (CENTAUR0_PRESENT_MASK >> l_cent)))

                // Request failed.  Keep count of failures and request a reset
                // if we reach the max retry count.
                L_scom_timeout[l_cent]++;
                if(L_scom_timeout[l_cent] == CENTAUR_CONTROL_SCOM_TIMEOUT)
                {
                    break;
                }
            }//if(!async_request_completed(&l_centControlTask->gpe_req.request) || l_parms->rc)
            else
            {
                // Request completed successfully; reset the timeout.
                L_scom_timeout[l_cent] = 0;
            }
        }//if(L_gpe_scheduled)
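// ----------------------------------------------------------------------
// Note (illustrative, based on the fragment above): the task drives the
// shared GPE request through a simple lifecycle each tick:
//
//   1. async_request_is_idle(&req)   -- previous request detached from
//                                       all queues? If not, count a
//                                       timeout tick and retry next time.
//   2. async_request_completed(&req) -- finished? Check both the async
//                                       completion and l_parms->rc, and
//                                       log one error per centaur on
//                                       failure (unless channel checkstop).
//   3. (re)schedule the next scom program -- this step occurs after the
//      point where the fragment above ends; the scheduling call itself
//      is not shown here.
// ----------------------------------------------------------------------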
// Function Specification
//
// Name: amec_set_freq_range
//
// Description: Set the frequency range for AMEC based on mode only.
//              NOTE: Any other clipping of frequency should be done in
//                    amec_slv_proc_voting_box() (called every tick), so
//                    the CLIP history in the poll response will accurately
//                    show all clipping for the given mode the system is in.
//              This function will run on mode changes and cnfg_data changes.
//
// Thread: RealTime Loop
//
// Task Flags:
//
// End Function Specification
errlHndl_t amec_set_freq_range(const OCC_MODE i_mode)
{
    /*------------------------------------------------------------------------*/
    /*  Local Variables                                                        */
    /*------------------------------------------------------------------------*/
    errlHndl_t l_err = NULL;
    uint16_t l_freq_min = 0;
    uint16_t l_freq_max = 0;
    uint32_t l_temp = 0;
    amec_mode_freq_t l_ppm_freq[OCC_INTERNAL_MODE_MAX_NUM] = {{0}};

    /*------------------------------------------------------------------------*/
    /*  Code                                                                   */
    /*------------------------------------------------------------------------*/

    // First set to max frequency range for this mode;
    // if no mode is set yet, default to the full range.
    if(i_mode == OCC_MODE_NOCHANGE)
    {
        l_freq_min = G_sysConfigData.sys_mode_freq.table[OCC_MODE_MIN_FREQUENCY];

        // Set max frequency (turbo if WOF is off, otherwise max possible (ultra turbo))
        if( g_amec->wof.wof_disabled || (g_amec->wof.wof_init_state != WOF_ENABLED))
        {
            l_freq_max = G_sysConfigData.sys_mode_freq.table[OCC_MODE_TURBO];
        }
        else
        {
            l_freq_max = G_proc_fmax_mhz;
        }
    }
    else if( VALID_MODE(i_mode) )  // Set to max frequency range for this mode
    {
        l_freq_min = G_sysConfigData.sys_mode_freq.table[OCC_MODE_MIN_FREQUENCY];

        // Use max frequency for performance modes and FMF
        if( (i_mode == OCC_MODE_NOM_PERFORMANCE) || (i_mode == OCC_MODE_MAX_PERFORMANCE) ||
            (i_mode == OCC_MODE_FMF) || (i_mode == OCC_MODE_DYN_POWER_SAVE) ||
            (i_mode == OCC_MODE_DYN_POWER_SAVE_FP) )
        {
            // Clip to turbo if WOF is disabled
            if( g_amec->wof.wof_disabled || (g_amec->wof.wof_init_state != WOF_ENABLED))
            {
                l_freq_max = G_sysConfigData.sys_mode_freq.table[OCC_MODE_TURBO];
            }
            else
            {
                l_freq_max = G_proc_fmax_mhz;
            }
        }
        else
        {
            l_freq_max = G_sysConfigData.sys_mode_freq.table[i_mode];
        }
    }

    if( (l_freq_min == 0) || (l_freq_max == 0) )
    {
        // Do not update amec vars with a 0 frequency.
        // The frequency limit for each mode should have been set prior to
        // calling, or the mode passed was invalid.
        TRAC_ERR("amec_set_freq_range: Freq of 0 found! mode[0x%02x] Fmin[%u] Fmax[%u]",
                 i_mode, l_freq_min, l_freq_max);

        // Log an error if this is PowerVM, as this should never happen when
        // OCC supports modes.
        if(!G_sysConfigData.system_type.kvm)
        {
            /* @
             * @errortype
             * @moduleid    AMEC_SET_FREQ_RANGE
             * @reasoncode  INTERNAL_FW_FAILURE
             * @userdata1   Mode
             * @userdata2   0
             * @userdata4   ERC_FW_ZERO_FREQ_LIMIT
             * @devdesc     Fmin or Fmax of 0 found for mode
             */
            // NOTE: assign to the function-scope l_err (a local redeclaration
            // here would shadow it, and the log would never reach the caller)
            l_err = createErrl(AMEC_SET_FREQ_RANGE,      //modId
                               INTERNAL_FW_FAILURE,      //reasoncode
                               ERC_FW_ZERO_FREQ_LIMIT,   //Extended reason code
                               ERRL_SEV_PREDICTIVE,      //Severity
                               NULL,                     //Trace Buf
                               DEFAULT_TRACE_SIZE,       //Trace Size
                               i_mode,                   //userdata1
                               0);                       //userdata2

            // Callout Firmware
            addCalloutToErrl(l_err,
                             ERRL_CALLOUT_TYPE_COMPONENT_ID,
                             ERRL_COMPONENT_ID_FIRMWARE,
                             ERRL_CALLOUT_PRIORITY_LOW );
        }
    }
    else
    {
        g_amec->sys.fmin = l_freq_min;
        g_amec->sys.fmax = l_freq_max;

        TRAC_INFO("amec_set_freq_range: Mode[0x%02x] Fmin[%u] (Pmin 0x%02x) Fmax[%u] (Pmax 0x%02x)",
                  i_mode, l_freq_min, proc_freq2pstate(g_amec->sys.fmin),
                  l_freq_max, proc_freq2pstate(g_amec->sys.fmax));

        // Now determine the max frequency for the PPM structure
        l_ppm_freq[OCC_INTERNAL_MODE_NOM].fmax    = G_sysConfigData.sys_mode_freq.table[OCC_MODE_NOMINAL];
        l_ppm_freq[OCC_INTERNAL_MODE_DPS].fmax    = G_sysConfigData.sys_mode_freq.table[OCC_MODE_DYN_POWER_SAVE];
        l_ppm_freq[OCC_INTERNAL_MODE_DPS_MP].fmax = G_sysConfigData.sys_mode_freq.table[OCC_MODE_DYN_POWER_SAVE_FP];

        // Determine the min frequency for the PPM structure.  This Fmin
        // should always be set to the system Fmin.
        l_ppm_freq[OCC_INTERNAL_MODE_NOM].fmin    = G_sysConfigData.sys_mode_freq.table[OCC_MODE_MIN_FREQUENCY];
        l_ppm_freq[OCC_INTERNAL_MODE_DPS].fmin    = G_sysConfigData.sys_mode_freq.table[OCC_MODE_MIN_FREQUENCY];
        l_ppm_freq[OCC_INTERNAL_MODE_DPS_MP].fmin = G_sysConfigData.sys_mode_freq.table[OCC_MODE_MIN_FREQUENCY];

        // Determine the min speed allowed for DPS power policies (this is
        // needed by the DPS algorithms)
        l_temp = (l_ppm_freq[OCC_INTERNAL_MODE_DPS].fmin * 1000) / l_ppm_freq[OCC_INTERNAL_MODE_DPS].fmax;
        l_ppm_freq[OCC_INTERNAL_MODE_DPS].min_speed = l_temp;

        l_temp = (l_ppm_freq[OCC_INTERNAL_MODE_DPS_MP].fmin * 1000) / l_ppm_freq[OCC_INTERNAL_MODE_DPS_MP].fmax;
        l_ppm_freq[OCC_INTERNAL_MODE_DPS_MP].min_speed = l_temp;

        // Copy the PPM frequency information into g_amec
        memcpy(g_amec->part_mode_freq, l_ppm_freq, sizeof(l_ppm_freq));
    }

    return l_err;
}
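// ----------------------------------------------------------------------
// Worked example (illustrative numbers): min_speed is computed as
// (fmin * 1000) / fmax, i.e. in units of 0.1%. With Fmin = 2000 MHz and
// Fmax = 4000 MHz:
//   min_speed = (2000 * 1000) / 4000 = 500  -> DPS may slow to 50.0% speed
// ----------------------------------------------------------------------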