/*
 * Function Specification
 *
 * Name: homer_log_access_error
 *
 * Description: Utility function to log an error that occurred while accessing
 *              the HOMER host data area.  Does nothing when i_homer_rc is
 *              HOMER_SUCCESS.  When an SSX error accompanied the HOMER error,
 *              both return codes are packed into userdata1 and the log is
 *              predictive; otherwise only the HOMER return code is logged as
 *              an informational entry.
 *
 * Parameters:  i_homer_rc  - internal HOMER access return code
 *              i_ssx_rc    - SSX kernel return code from the access attempt
 *              i_usr_data2 - caller-supplied value recorded as userdata2
 *                            (the host interrupt type, per the error tags)
 *
 * End Function Specification
 */
void homer_log_access_error(const homer_rc_t i_homer_rc,
                            const int i_ssx_rc,
                            const uint32_t i_usr_data2)
{
    // Catch and log the homer error
    if (HOMER_SUCCESS != i_homer_rc)
    {
        // We could potentially have both an internal error dealing with the
        // homer and an SSX error, for example we could find an unsupported
        // version number in the homer and then have an ssx error trying to
        // unmap the homer address space. This check catches all those cases.
        if (SSX_OK != i_ssx_rc)
        {
            /* @
             * @errortype
             * @moduleid    MAIN_MID
             * @reasoncode  SSX_GENERIC_FAILURE
             * @userdata1   HOMER and SSX return codes
             * @userdata2   Host interrupt type used
             * @userdata4   ERC_HOMER_MAIN_SSX_ERROR
             * @devdesc     An SSX error occurred mapping the HOMER host data
             *              into the OCC address space.  User word 1 contains
             *              both the internal and SSX return codes returned
             *              by the method used to access the HOMER data.
             */
            errlHndl_t l_err = createErrl(MAIN_MID,                  //modId
                                          SSX_GENERIC_FAILURE,       //reasoncode
                                          ERC_HOMER_MAIN_SSX_ERROR,  //Extended reason code
                                          ERRL_SEV_PREDICTIVE,       //Severity
                                          NULL,                      //Trace Buf
                                          DEFAULT_TRACE_SIZE,        //Trace Size
                                          // HOMER rc in the high half-word,
                                          // SSX rc (low 16 bits) in the low half
                                          (i_homer_rc << 16) | (0xFFFF & (uint32_t)i_ssx_rc), //userdata1
                                          i_usr_data2);              //userdata2

            commitErrl(&l_err);
        }
        else
        {
            /* @
             * @errortype
             * @moduleid    MAIN_MID
             * @reasoncode  INTERNAL_FAILURE
             * @userdata1   HOMER return code
             * @userdata2   Default host interrupt type used.
             * @userdata4   ERC_HOMER_MAIN_ACCESS_ERROR
             * @devdesc     Error accessing initialization data
             */
            errlHndl_t l_err = createErrl(MAIN_MID,                    //modId
                                          INTERNAL_FAILURE,            //reasoncode
                                          ERC_HOMER_MAIN_ACCESS_ERROR, //Extended reason code
                                          ERRL_SEV_INFORMATIONAL,      //Severity
                                          NULL,                        //Trace Buf
                                          DEFAULT_TRACE_SIZE,          //Trace Size
                                          i_homer_rc,                  //userdata1
                                          i_usr_data2);                //userdata2

            commitErrl(&l_err);
        }
    }
}
// Function Specification // // Name: apssInitApplet // // Description: Entry point function // // End Function Specification errlHndl_t apssInitApplet(void * i_arg) { errlHndl_t l_err = NULL; // Initialize APSS l_err = apss_initialize(); if(NULL != l_err) { TRAC_ERR("APSS Init failed! (retrying) ErrLog[%p]", l_err); setErrlSevToInfo(l_err); // commit & delete commitErrl(&l_err); // Retry one more time l_err = apss_initialize(); if(NULL != l_err) { TRAC_ERR("APSS Init failed again! ErrLog[%p]",l_err); } } return l_err; }
// Function Specification // // Name: errlTestTime // // Description: errlTestTime // // End Function Specification uint32_t errlTestTime() { uint32_t l_rc = 0; do { ERRL_DBG("START"); errlHndl_t l_handle = NULL; uint64_t l_start = 0; uint64_t l_end = 0; /****************************************************/ // Check timeStamp // Create one log l_start = ssx_timebase_get(); l_handle = createErrl( 0x1716, 0x08, OCC_NO_EXTENDED_RC, ERRL_SEV_CALLHOME_DATA, g_trac_inf, 128, 0x1, 0x2); CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc); // check time stamp errlHndl_t l_handle2 = l_handle; commitErrl( &l_handle ); l_end = ssx_timebase_get(); CHECK_CONDITION( (l_handle2->iv_userDetails.iv_timeStamp >= l_start) && (l_handle2->iv_userDetails.iv_timeStamp <= l_end ), l_rc); deleteErrl(&l_handle2); ERRL_DBG("END \n"); }while(0); return l_rc; }
// Function Specification // // Name: errlTestSetErrlSevToInfo // // Description: errlTestSetErrlSevToInfo // // End Function Specification uint32_t errlTestSetErrlSevToInfo() { uint32_t l_rc = 0; ERRL_DBG("START"); do { errlHndl_t l_handle = NULL; /****************************************************/ // Check setErrlSevToInfo // Create ERRL_SEV_PREDICTIVE log l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC, ERRL_SEV_PREDICTIVE,g_trac_inf, 128, 0x1, 0x2); CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc); // Add callout addCalloutToErrl(l_handle,ERRL_CALLOUT_TYPE_HUID,0x00,ERRL_CALLOUT_PRIORITY_LOW); CHECK_CONDITION( l_handle->iv_numCallouts == 1, l_rc); // Call setErrlSevToInfo. Callouts within log should be cleared and // iv_severity should be set to ERRL_SEV_INFORMATIONAL setErrlSevToInfo(l_handle); CHECK_CONDITION( (l_handle->iv_numCallouts == 0) && (l_handle->iv_severity == ERRL_SEV_INFORMATIONAL), l_rc); deleteErrl( &l_handle ); ppdumpslot(); /****************************************************/ // Check setErrlSevToInfo after errl is committed // Create log l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC, ERRL_SEV_PREDICTIVE,g_trac_inf, 128, 0x1, 0x2); CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc); errlHndl_t l_log = l_handle; // Add callout addCalloutToErrl(l_handle,ERRL_CALLOUT_TYPE_HUID,0x00,ERRL_CALLOUT_PRIORITY_LOW); CHECK_CONDITION( l_handle->iv_numCallouts == 1, l_rc); // Commit log and call setErrlSevToInfo. But setErrlSevToInfo will do nothing commitErrl( &l_handle ); setErrlSevToInfo(l_handle); CHECK_CONDITION( (l_log->iv_numCallouts == ERRL_MAX_CALLOUTS) && (l_log->iv_severity == ERRL_SEV_PREDICTIVE), l_rc); deleteErrl(&l_log); ERRL_DBG("END \n"); }while(0); return l_rc; }
// Called after a failure to read a DIMM temperature. The error will // be counted and if threshold is reached, and error will be created with // the DIMM as a callout and then set flag to trigger I2C reset void mark_dimm_failed() { const uint8_t port = G_dimm_sm_args.i2cPort; const uint8_t dimm = G_dimm_sm_args.dimm; INTR_TRAC_ERR("mark_dimm_failed: DIMM%04X failed in state/rc/count=0x%06X " "(ffdc 0x%08X%08X, completion_state 0x%02X)", DIMM_AND_PORT, (G_dimm_sm_args.state << 16) | (G_dimm_sm_args.error.rc << 8) | G_dimm[port][dimm].errorCount, WORD_HIGH(G_dimm_sm_args.error.ffdc), WORD_LOW(G_dimm_sm_args.error.ffdc), G_dimm_sm_request.request.completion_state); if (++G_dimm[port][dimm].errorCount > MAX_CONSECUTIVE_DIMM_RESETS) { // Disable collection on this DIMM, collect FFDC and log error G_dimm[port][dimm].disabled = true; INTR_TRAC_ERR("mark_dimm_failed: disabling DIMM%04X due to %d consecutive errors (state=%d)", DIMM_AND_PORT, G_dimm[port][dimm].errorCount, G_dimm_sm_args.state); errlHndl_t l_err = NULL; /* * @errortype * @moduleid DIMM_MID_MARK_DIMM_FAILED * @reasoncode DIMM_GPE_FAILURE * @userdata1 GPE returned rc code * @userdata4 ERC_DIMM_COMPLETE_FAILURE * @devdesc Failure writing dimm i2c mode register */ l_err = createErrl(DIMM_MID_MARK_DIMM_FAILED, DIMM_GPE_FAILURE, ERC_DIMM_COMPLETE_FAILURE, ERRL_SEV_INFORMATIONAL, NULL, DEFAULT_TRACE_SIZE, G_dimm_sm_args.error.rc, 0); addUsrDtlsToErrl(l_err, (uint8_t*)&G_dimm_sm_request.ffdc, sizeof(G_dimm_sm_request.ffdc), ERRL_STRUCT_VERSION_1, ERRL_USR_DTL_BINARY_DATA); addCalloutToErrl(l_err, ERRL_CALLOUT_TYPE_HUID, G_sysConfigData.dimm_huids[port][dimm], ERRL_CALLOUT_PRIORITY_HIGH); commitErrl(&l_err); } // Reset DIMM I2C engine G_dimm_i2c_reset_required = true; G_dimm_i2c_reset_cause = port<<24 | dimm<<16 | (G_dimm_sm_args.error.rc & 0xFFFF); G_dimm_state = DIMM_STATE_RESET_MASTER; } // end mark_dimm_failed()
// Function Specification // // Name: errlTestWordAlign // // Description: errlTestWordAlign // // End Function Specification uint32_t errlTestWordAlign() { uint32_t l_rc = 0; uint16_t l_entrySizeBefore = 0; uint16_t l_entrySizeAfter = 0; ERRL_DBG("START"); do { /****************************************************/ // Test word align for addUsrDtlsToErrl // Create log errlHndl_t l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC, ERRL_SEV_PREDICTIVE, NULL, 0, 0x1, 0x2); CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc); // l_handle will set to NULL after calling the commitErrl, so we need to store it errlHndl_t l_handleX = l_handle; ppdumpslot(); // add 13 bytes of "user details" l_entrySizeBefore = l_handle->iv_userDetails.iv_entrySize; memset( G_data, 0xAA, sizeof( G_data ) ); addUsrDtlsToErrl( l_handle, G_data, 13, ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA ); l_entrySizeAfter = l_handle->iv_userDetails.iv_entrySize; ERRL_DBG("Slots after create + 13 bytes" ); ppdumpslot(); // (header + WORDALIGN(13)) is the size that add to entry CHECK_CONDITION( l_entrySizeAfter == (l_entrySizeBefore+sizeof(ErrlUserDetailsEntry_t)+16), l_rc); /****************************************************/ // Test word align for addTraceToErrl // add 21 bytes of trace l_entrySizeBefore = l_handle->iv_userDetails.iv_entrySize; addTraceToErrl(g_trac_inf, 21, l_handle); // @at012c l_entrySizeAfter = l_handle->iv_userDetails.iv_entrySize; ERRL_DBG("Slots after create + 21 bytes" ); ppdumpslot(); // (header + WORDALIGN(21)) is the size that add to entry CHECK_CONDITION( l_entrySizeAfter <= (l_entrySizeBefore+sizeof(ErrlUserDetailsEntry_t)+24), l_rc); commitErrl( &l_handle ); deleteErrl(&l_handleX); ERRL_DBG("Slots should now be empty"); ppdumpslot(); ERRL_DBG("END \n"); }while(0); return l_rc; }
// Function Specification // // Name: errlTestCreateCommitDeleteLog // // Description: errlTestCreateCommitDeleteLog // // End Function Specification uint32_t errlTestCreateCommitDeleteLog() { ERRL_DBG("START"); uint32_t l_rc = 0; do { /****************************************************/ // Test create log errlHndl_t l_handle = NULL; l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC, ERRL_SEV_CALLHOME_DATA, g_trac_inf, 512, 0x1, 0x2); CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc); ERRL_DBG("Slots after Creating call home log" ); ppdumpslot(); /****************************************************/ // Test commit log errlHndl_t l_handle2 = l_handle; commitErrl( &l_handle ); CHECK_CONDITION( (l_handle == NULL) && (l_handle2->iv_userDetails.iv_committed == 1), l_rc); ERRL_DBG("Slots after Commiting call home log" ); dumpLog( l_handle2, l_handle2->iv_userDetails.iv_entrySize ); ppdumpslot(); /****************************************************/ // Test delete log deleteErrl(&l_handle2); CHECK_CONDITION( l_handle2 == NULL, l_rc); ERRL_DBG("Slots after delete Log" ); ppdumpslot(); ERRL_DBG("END \n"); }while(0); return l_rc; }
// Function Specification // // Name: cmdh_mnfg_get_sensor // // Description: Returns a list of selected sensors // // End Function Specification uint8_t cmdh_mnfg_get_sensor(const cmdh_fsp_cmd_t * i_cmd_ptr, cmdh_fsp_rsp_t * o_rsp_ptr) { uint8_t l_rc = ERRL_RC_SUCCESS; uint16_t l_gsid; uint16_t l_resp_data_length = 0; uint16_t l_datalength; uint16_t l_num_of_sensors = 1; cmdh_mfg_get_sensor_query_t *l_cmd_ptr = (cmdh_mfg_get_sensor_query_t*) i_cmd_ptr; cmdh_mfg_get_sensor_resp_t *l_resp_ptr = (cmdh_mfg_get_sensor_resp_t*) o_rsp_ptr; sensor_info_t l_sensor_info; errlHndl_t l_err = NULL; sensor_t* l_sensor_ptr; do { // Do sanity check on the function inputs if ((NULL == i_cmd_ptr) || (NULL == o_rsp_ptr)) { TRAC_ERR("cmdh_mnfg_get_sensor: invalid pointers. cmd[0x%08x] rsp[0x%08x]", (uint32_t) i_cmd_ptr, (uint32_t) o_rsp_ptr); l_rc = ERRL_RC_INTERNAL_FAIL; break; } // Check packet data length l_datalength = CMDH_DATALEN_FIELD_UINT16(i_cmd_ptr); if(l_datalength < (sizeof(cmdh_mfg_get_sensor_query_t) - sizeof(cmdh_fsp_cmd_header_t))) { TRAC_ERR("cmdh_mnfg_get_sensor: incorrect data length. exp[%d] act[%d]", (sizeof(cmdh_mfg_get_sensor_query_t) - sizeof(cmdh_fsp_cmd_header_t)), l_datalength); l_rc = ERRL_RC_INVALID_CMD_LEN; break; } // Check version if(l_cmd_ptr->version != MFG_LIST_SENSOR_VERSION) { TRAC_ERR("cmdh_mnfg_get_sensor: incorrect version. 
exp[%d] act[%d]", MFG_GET_SENSOR_VERSION, l_cmd_ptr->version); l_rc = ERRL_RC_INVALID_DATA; break; } // Capture user inputs l_gsid = l_cmd_ptr->gsid; TRAC_INFO("cmdh_mnfg_get_sensor: gsid[0x%04x]", l_gsid); // Initialize the sensor query arguments querySensorListArg_t l_qsl_arg = { l_gsid, // i_startGsid - passed by the caller 0, // i_present - passed by the caller AMEC_SENSOR_TYPE_ALL, // i_type AMEC_SENSOR_LOC_ALL, // i_loc &l_num_of_sensors, // io_numOfSensors NULL, // o_sensors - not needed &l_sensor_info // o_sensorInfoPtr }; // Get the sensor list l_err = querySensorList(&l_qsl_arg); if (NULL != l_err) { // Query failure TRAC_ERR("cmdh_mnfg_get_sensor: Failed to get sensor list. Error status is: 0x%x", l_err->iv_reasonCode); // Commit error log commitErrl(&l_err); l_rc = ERRL_RC_INTERNAL_FAIL; break; } else { l_resp_ptr->gsid = l_gsid; // Some of the response comes from the sensor l_sensor_ptr = getSensorByGsid(l_gsid); if (l_sensor_ptr == NULL) { TRAC_INFO("cmdh_mnfg_get_sensor: Didn't find sensor with gsid[0x%.4X]. 
Min/Max values won't be accurate.", l_gsid); l_resp_ptr->sample = 0; l_resp_ptr->min = 0xFFFF; l_resp_ptr->max = 0; l_resp_ptr->accumulator = 0; l_resp_ptr->status = 0; } else { l_resp_ptr->sample = l_sensor_ptr->sample; l_resp_ptr->min = l_sensor_ptr->sample_min; l_resp_ptr->max = l_sensor_ptr->sample_max; // Truncate accumulator to 4 bytes (should not be used) l_resp_ptr->accumulator = (uint32_t)l_sensor_ptr->accumulator; l_resp_ptr->status = *(uint8_t*)(&l_sensor_ptr->status); } // The rest of the response comes from the sensor info memcpy(l_resp_ptr->name, l_sensor_info.name, sizeof(l_resp_ptr->name)); memcpy(l_resp_ptr->units, l_sensor_info.sensor.units, sizeof(l_resp_ptr->units)); l_resp_ptr->freq = l_sensor_info.sensor.freq; l_resp_ptr->scalefactor = l_sensor_info.sensor.scalefactor; l_resp_ptr->location = l_sensor_info.sensor.location; l_resp_ptr->type = l_sensor_info.sensor.type; } }while(0); // Populate the response data header l_resp_data_length = sizeof(cmdh_mfg_get_sensor_resp_t) - sizeof(cmdh_fsp_rsp_header_t); G_rsp_status = l_rc; o_rsp_ptr->data_length[0] = ((uint8_t *)&l_resp_data_length)[0]; o_rsp_ptr->data_length[1] = ((uint8_t *)&l_resp_data_length)[1]; return l_rc; }
// Function Specification // // Name: cmdh_mnfg_list_sensors // // Description: Returns a list of selected sensors // // End Function Specification uint8_t cmdh_mnfg_list_sensors(const cmdh_fsp_cmd_t * i_cmd_ptr, cmdh_fsp_rsp_t * o_rsp_ptr) { uint8_t l_rc = ERRL_RC_SUCCESS; uint16_t l_type = 0; uint16_t l_location = 0; uint16_t l_start_gsid; uint16_t i = 0; uint16_t l_resp_data_length = 0; uint16_t l_datalength; uint16_t l_num_of_sensors = MFG_MAX_NUM_SENSORS + 1; cmdh_mfg_list_sensors_query_t *l_cmd_ptr = (cmdh_mfg_list_sensors_query_t*) i_cmd_ptr; cmdh_mfg_list_sensors_resp_t *l_resp_ptr = (cmdh_mfg_list_sensors_resp_t*) o_rsp_ptr; sensorQueryList_t l_sensor_list[MFG_MAX_NUM_SENSORS + 1]; errlHndl_t l_err = NULL; do { // Do sanity check on the function inputs if ((NULL == i_cmd_ptr) || (NULL == o_rsp_ptr)) { TRAC_ERR("cmdh_mnfg_list_sensors: invalid pointers. cmd[0x%08x] rsp[0x%08x]", (uint32_t) i_cmd_ptr, (uint32_t) o_rsp_ptr); l_rc = ERRL_RC_INTERNAL_FAIL; break; } // Check packet data length l_datalength = CMDH_DATALEN_FIELD_UINT16(i_cmd_ptr); if(l_datalength < (sizeof(cmdh_mfg_list_sensors_query_t) - sizeof(cmdh_fsp_cmd_header_t))) { TRAC_ERR("cmdh_mnfg_list_sensors: incorrect data length. exp[%d] act[%d]", (sizeof(cmdh_mfg_list_sensors_query_t) - sizeof(cmdh_fsp_cmd_header_t)), l_datalength); l_rc = ERRL_RC_INVALID_CMD_LEN; break; } // Check version if(l_cmd_ptr->version != MFG_LIST_SENSOR_VERSION) { TRAC_ERR("cmdh_mnfg_list_sensors: incorrect version. 
exp[%d] act[%d]", MFG_LIST_SENSOR_VERSION, l_cmd_ptr->version); l_rc = ERRL_RC_INVALID_DATA; break; } // Capture user inputs l_type = l_cmd_ptr->type; l_location = l_cmd_ptr->location; l_start_gsid = l_cmd_ptr->start_gsid; TRAC_INFO("cmdh_mnfg_list_sensors: Type[0x%04x] Location[0x%04x]", l_type, l_location); // Initialize the sensor query arguments const querySensorListArg_t l_qsl_arg = { l_start_gsid, // i_startGsid - passed by the caller l_cmd_ptr->present, // i_present - passed by the caller l_type, // i_type - passed by the caller l_location, // i_loc - passed by the caller &l_num_of_sensors, // io_numOfSensors l_sensor_list, // o_sensors NULL // o_sensorInfoPtr - not needed }; // Get the list of sensors l_err = querySensorList(&l_qsl_arg); if (NULL != l_err) { // Query failure TRAC_ERR("cmdh_mnfg_list_sensors: Failed to query sensor list. Error status is: 0x%x", l_err->iv_reasonCode); // Commit error log commitErrl(&l_err); l_rc = ERRL_RC_INTERNAL_FAIL; break; } else { TRAC_INFO("cmdh_mnfg_list_sensors: Numbers of sensors found[%u]", l_num_of_sensors); if (l_num_of_sensors > MFG_MAX_NUM_SENSORS) { // Got too many sensors back, need to truncate the list TRAC_INFO("cmdh_mnfg_list_sensors: Got too many sensors back[%u]. 
Truncating number of sensors to %u", l_num_of_sensors, MFG_MAX_NUM_SENSORS); l_num_of_sensors = MFG_MAX_NUM_SENSORS; l_resp_ptr->truncated = 1; } else { l_resp_ptr->truncated = 0; } // Clear out the sensor fields memset((void*) &(l_resp_ptr->sensor[0]), 0, (sizeof(cmdh_dbug_sensor_list_t)*l_num_of_sensors) ); // Populate the response data packet l_resp_ptr->num_sensors = l_num_of_sensors; for (i=0; i<l_num_of_sensors; i++) { l_resp_ptr->sensor[i].gsid = l_sensor_list[i].gsid; l_resp_ptr->sensor[i].sample = l_sensor_list[i].sample; strcpy(l_resp_ptr->sensor[i].name, l_sensor_list[i].name); } } }while(0); // Populate the response data header l_resp_data_length = 2 + l_num_of_sensors * sizeof(cmdh_mfg_sensor_rec_t); G_rsp_status = l_rc; o_rsp_ptr->data_length[0] = ((uint8_t *)&l_resp_data_length)[0]; o_rsp_ptr->data_length[1] = ((uint8_t *)&l_resp_data_length)[1]; return l_rc; }
// Function Specification
//
// Name: populate_sapphire_tbl_to_mem
//
// Description: Copies G_sapphire_table from OCC SRAM into main memory (the
//              sapphire/OPAL area of the HOMER image) using the PBA
//              block-copy (BCUE) engine.  If either creating or scheduling
//              the copy request fails, an unrecoverable error with a
//              firmware callout is committed.
//
// End Function Specification
void populate_sapphire_tbl_to_mem()
{
    int l_ssxrc = SSX_OK;
    // Reason/extended-reason codes for the single error commit at the end;
    // set by whichever step fails
    uint32_t l_reasonCode = 0;
    uint32_t l_extReasonCode = 0;

    do
    {
        // Offset of the sapphire table within the HOMER image
        #define SAPPHIRE_OFFSET_IN_HOMER 0x001F8000
        BceRequest pba_copy;

        // Set up copy request
        l_ssxrc = bce_request_create(&pba_copy,                    // block copy object
                                     &G_pba_bcue_queue,            // sram to mainstore copy engine
                                     SAPPHIRE_OFFSET_IN_HOMER,     // mainstore address
                                     (uint32_t) &G_sapphire_table, // sram starting address
                                     (size_t) sizeof(G_sapphire_table), // size of copy
                                     SSX_WAIT_FOREVER,             // no timeout
                                     NULL,                         // call back
                                     NULL,                         // call back arguments
                                     ASYNC_REQUEST_BLOCKING        // callback mask
                                     );

        if(l_ssxrc != SSX_OK)
        {
            // Trace the negated rc (SSX return codes are negative)
            TRAC_ERR("populate_sapphire_tbl_to_mem: PBA request create failure rc=[%08X]",
                     -l_ssxrc);
            /*
             * @errortype
             * @moduleid    MAIN_STATE_TRANSITION_MID
             * @reasoncode  SSX_GENERIC_FAILURE
             * @userdata1   RC for PBA block-copy engine
             * @userdata4   ERC_BCE_REQUEST_CREATE_FAILURE
             * @devdesc     SSX BCE related failure
             */
            l_reasonCode = SSX_GENERIC_FAILURE;
            l_extReasonCode = ERC_BCE_REQUEST_CREATE_FAILURE;
            break;
        }

        // Do actual copying
        l_ssxrc = bce_request_schedule(&pba_copy);

        if(l_ssxrc != SSX_OK)
        {
            TRAC_ERR("populate_sapphire_tbl_to_mem: PBA request schedule failure rc=[%08X]",
                     -l_ssxrc);
            /*
             * @errortype
             * @moduleid    MAIN_STATE_TRANSITION_MID
             * @reasoncode  SSX_GENERIC_FAILURE
             * @userdata1   RC for PBA block-copy engine
             * @userdata4   ERC_BCE_REQUEST_SCHEDULE_FAILURE
             * @devdesc     Failed to copy data by using DMA
             */
            l_reasonCode = SSX_GENERIC_FAILURE;
            l_extReasonCode = ERC_BCE_REQUEST_SCHEDULE_FAILURE;
            break;
        }
    } while(0);

    // Single commit point for either failure above
    if ( l_ssxrc != SSX_OK )
    {
        errlHndl_t l_errl = createErrl(MAIN_STATE_TRANSITION_MID,  //modId
                                       l_reasonCode,               //reasoncode
                                       l_extReasonCode,            //Extended reason code
                                       ERRL_SEV_UNRECOVERABLE,     //Severity
                                       NULL,                       //Trace Buf
                                       0,                          //Trace Size
                                       -l_ssxrc,                   //userdata1
                                       0);                         //userdata2

        // Callout firmware
        addCalloutToErrl(l_errl,
                         ERRL_CALLOUT_TYPE_COMPONENT_ID,
                         ERRL_COMPONENT_ID_FIRMWARE,
                         ERRL_CALLOUT_PRIORITY_HIGH);

        commitErrl(&l_errl);
    }
}
// Function Specification
//
// Name: amec_slv_check_perf
//
// Description: Slave OCC's Detect and log degraded performance errors
//              This function will run every tick.
//
//              Checks, in priority order, whether frequency is being held
//              down by: failsafe mode, oversubscription, or the system
//              power cap.  Each condition is logged at most once per IPL
//              (tracked by the static l_prev_* flags); otherwise a
//              rate-limited transient trace is emitted at most hourly.
//
// Thread: RealTime Loop
//
// Task Flags:
//
// End Function Specification
void amec_slv_check_perf(void)
{
    /*------------------------------------------------------------------------*/
    /*  Local Variables                                                       */
    /*------------------------------------------------------------------------*/
    // Statics persist across ticks: each condition is logged only once per IPL
    static BOOLEAN l_prev_failsafe_state = FALSE;
    static BOOLEAN l_prev_ovs_state = FALSE;
    static BOOLEAN l_prev_pcap_state = FALSE;
    // Power-cap log severity: predictive the first time, demoted to
    // informational for any subsequent pcap log
    static ERRL_SEVERITY l_pcap_sev = ERRL_SEV_PREDICTIVE;
    static BOOLEAN l_throttle_traced = FALSE;
    static uint64_t l_time = 0;

    /*------------------------------------------------------------------------*/
    /*  Code                                                                  */
    /*------------------------------------------------------------------------*/

    // Verify that cores are at proper frequency
    amec_verify_pstate();

    do
    {
        // was frequency limited by power ?
        if ( G_non_dps_power_limited != TRUE )
        {
            if(l_throttle_traced)
            {
                TRAC_INFO("Frequency not limited by power algorithms anymore");
                l_throttle_traced = FALSE;
            }
            // we are done break and return
            break;
        }

        // frequency limited due to failsafe condition ?
        if ( AMEC_INTF_GET_FAILSAFE() == TRUE )
        {
            if ( l_prev_failsafe_state == TRUE)
            {
                // already logged this IPL -- we are done break and return
                break;
            }
            else
            {
                // log this error ONLY ONCE per IPL
                l_prev_failsafe_state = TRUE;

                TRAC_ERR("Frequency limited due to failsafe condition(mode:%d, state:%d)",
                         CURRENT_MODE(), CURRENT_STATE());
                l_throttle_traced = TRUE;
                l_time = ssx_timebase_get();

                // log error that calls out OVS procedure
                // set error severity to RRL_SEV_PREDICTIVE
                /* @
                 * @errortype
                 * @moduleid    AMEC_SLAVE_CHECK_PERFORMANCE
                 * @reasoncode  INTERNAL_FAILURE
                 * @userdata1   Previous FailSafe State
                 * @userdata4   ERC_AMEC_SLAVE_FAILSAFE_STATE
                 * @devdesc     Frequency limited due to failsafe condition
                 */
                errlHndl_t l_errl = createErrl(AMEC_SLAVE_CHECK_PERFORMANCE,  //modId
                                               INTERNAL_FAILURE,              //reasoncode
                                               ERC_AMEC_SLAVE_FAILSAFE_STATE, //Extended reason code
                                               ERRL_SEV_PREDICTIVE,           //Severity
                                               NULL,                          //Trace Buf
                                               DEFAULT_TRACE_SIZE,            //Trace Size
                                               l_prev_failsafe_state,         //userdata1
                                               0);                            //userdata2

                addCalloutToErrl(   l_errl,
                                    ERRL_CALLOUT_TYPE_COMPONENT_ID,
                                    ERRL_COMPONENT_ID_OVERSUBSCRIPTION,
                                    ERRL_CALLOUT_PRIORITY_HIGH
                                );

                // and sets the consolidate action flag
                setErrlActions( l_errl, ERRL_ACTIONS_CONSOLIDATE_ERRORS );

                // Commit Error
                commitErrl(&l_errl);

                // we are done lets break
                break;
            }
        }

        // frequency limited due to oversubscription condition ?
        if ( AMEC_INTF_GET_OVERSUBSCRIPTION() == TRUE )
        {
            if ( l_prev_ovs_state == TRUE)
            {
                // already logged this IPL -- we are done break and return
                break;
            }
            else
            {
                // log this error ONLY ONCE per IPL
                l_prev_ovs_state = TRUE;

                TRAC_ERR("Frequency limited due to oversubscription condition(mode:%d, state:%d)",
                         CURRENT_MODE(), CURRENT_STATE());
                l_throttle_traced = TRUE;
                l_time = ssx_timebase_get();

                // log error that calls out OVS procedure
                // set error severity to RRL_SEV_PREDICTIVE
                // Updated the RC to match the actual RC passed to createErrl()
                /* @
                 * @errortype
                 * @moduleid    AMEC_SLAVE_CHECK_PERFORMANCE
                 * @reasoncode  OVERSUB_LIMIT_ALERT
                 * @userdata1   Previous OVS State
                 * @userdata4   ERC_AMEC_SLAVE_OVS_STATE
                 * @devdesc     Frequency limited due to oversubscription condition
                 */
                errlHndl_t l_errl = createErrl(AMEC_SLAVE_CHECK_PERFORMANCE,  //modId
                                               OVERSUB_LIMIT_ALERT,           //reasoncode
                                               ERC_AMEC_SLAVE_OVS_STATE,      //Extended reason code
                                               ERRL_SEV_PREDICTIVE,           //Severity
                                               NULL,                          //Trace Buf
                                               DEFAULT_TRACE_SIZE,            //Trace Size
                                               l_prev_ovs_state,              //userdata1
                                               0);                            //userdata2

                // Callout to Oversubscription
                addCalloutToErrl(   l_errl,
                                    ERRL_CALLOUT_TYPE_COMPONENT_ID,
                                    ERRL_COMPONENT_ID_OVERSUBSCRIPTION,
                                    ERRL_CALLOUT_PRIORITY_HIGH
                                );

                // Callout to APSS
                addCalloutToErrl(   l_errl,
                                    ERRL_CALLOUT_TYPE_HUID,
                                    G_sysConfigData.apss_huid,
                                    ERRL_CALLOUT_PRIORITY_MED
                                );

                // Callout to Firmware
                addCalloutToErrl(   l_errl,
                                    ERRL_CALLOUT_TYPE_COMPONENT_ID,
                                    ERRL_COMPONENT_ID_FIRMWARE,
                                    ERRL_CALLOUT_PRIORITY_LOW
                                );

                // and sets the consolidate action flag
                setErrlActions( l_errl, ERRL_ACTIONS_CONSOLIDATE_ERRORS );

                // Commit Error
                commitErrl(&l_errl);

                // we are done lets break
                break;
            }
        }

        uint16_t l_snrBulkPwr = AMECSENSOR_PTR(PWR250US)->sample;

        // frequency limited due to system power cap condition ?
        if (( l_snrBulkPwr > (G_sysConfigData.pcap.system_pcap - PDROP_THRESH) )
            && ( G_sysConfigData.pcap.current_pcap == 0 ))
        {
            if ( l_prev_pcap_state == TRUE)
            {
                // already logged this IPL -- we are done break and return
                break;
            }
            else
            {
                //log this error ONLY ONCE per IPL
                l_prev_pcap_state = TRUE;

                TRAC_ERR("Frequency limited due to power cap condition(mode:%d, state:%d)",
                         CURRENT_MODE(), CURRENT_STATE());

                TRAC_ERR("SnrBulkPwr %d > Sys Pcap %d ",l_snrBulkPwr,
                         G_sysConfigData.pcap.system_pcap );

                TRAC_ERR("SnrFanPwr %d, SnrIOPwr %d, SnrStoragePwr %d, SnrGpuPrw %d ",
                         AMECSENSOR_PTR(PWR250USFAN)->sample,
                         AMECSENSOR_PTR(PWR250USIO)->sample,
                         AMECSENSOR_PTR(PWR250USSTORE)->sample,
                         AMECSENSOR_PTR(PWR250USGPU)->sample );

                TRAC_ERR("SnrProcPwr 0 %d, SnrProcPwr 1 %d, SnrProcPwr 2 %d, SnrProcPwr 3 %d",
                         g_amec->proc_snr_pwr[0],
                         g_amec->proc_snr_pwr[1],
                         g_amec->proc_snr_pwr[2],
                         g_amec->proc_snr_pwr[3] );

                TRAC_ERR("SnrMemPwr 0 %d, SnrMemPwr 1 %d, SnrMemPwr 2 %d, SnrMemPwr 3 %d",
                         g_amec->mem_snr_pwr[0],
                         g_amec->mem_snr_pwr[1],
                         g_amec->mem_snr_pwr[2],
                         g_amec->mem_snr_pwr[3] );

                l_throttle_traced = TRUE;
                l_time = ssx_timebase_get();

                // log error that calls out firmware and APSS procedure
                // set error severity to l_pcap_sev
                /* @
                 * @errortype
                 * @moduleid    AMEC_SLAVE_CHECK_PERFORMANCE
                 * @reasoncode  PCAP_THROTTLE_POWER_LIMIT
                 * @userdata1   Current Sensor Bulk Power
                 * @userdata2   System PCAP
                 * @userdata4   ERC_AMEC_SLAVE_POWERCAP
                 * @devdesc     Frequency limited due to PowerCap condition
                 */
                errlHndl_t l_errl = createErrl(AMEC_SLAVE_CHECK_PERFORMANCE,    //modId
                                               PCAP_THROTTLE_POWER_LIMIT,       //reasoncode
                                               ERC_AMEC_SLAVE_POWERCAP,         //Extended reason code
                                               l_pcap_sev,                      //Severity
                                               NULL,                            //Trace Buf
                                               DEFAULT_TRACE_SIZE,              //Trace Size
                                               l_snrBulkPwr,                    //userdata1
                                               G_sysConfigData.pcap.system_pcap);//userdata2

                addCalloutToErrl(   l_errl,
                                    ERRL_CALLOUT_TYPE_COMPONENT_ID,
                                    ERRL_COMPONENT_ID_FIRMWARE,
                                    ERRL_CALLOUT_PRIORITY_HIGH
                                );

                addCalloutToErrl(   l_errl,
                                    ERRL_CALLOUT_TYPE_HUID,
                                    G_sysConfigData.apss_huid,
                                    ERRL_CALLOUT_PRIORITY_HIGH
                                );

                // and sets the consolidate action flag
                setErrlActions( l_errl, ERRL_ACTIONS_CONSOLIDATE_ERRORS );

                // then l_pcap_sev to informational
                l_pcap_sev = ERRL_SEV_INFORMATIONAL;

                // Commit Error
                commitErrl(&l_errl);

                // we are done lets break
                break;
            }
        }

        // trottle trace to every 3600 seconds (1hr = 3600000)
        if(!l_throttle_traced && ( DURATION_IN_MS_UNTIL_NOW_FROM(l_time) > 3600000 ) )
        {
            TRAC_INFO("Frequency power limited due to transient condition: PowerLimited=%x, FailSafe=%x, OverSubScription=%x CurrentBulkPwr=%x",
                      G_non_dps_power_limited, AMEC_INTF_GET_FAILSAFE(),
                      AMEC_INTF_GET_OVERSUBSCRIPTION(), l_snrBulkPwr );
            l_throttle_traced = TRUE;
            l_time = ssx_timebase_get();
        }
    } while( 0 );

    return;
}
// Function Specification
//
// Name: Dcom_thread_routine
//
// Description: Purpose of this task is to handle messages passed from
//              Master to Slave and vice versa.
//
//              Nothing in this thread should be time-critical, but should
//              happen more often than the 1-second that other threads run
//              at.
//
//              This thread currently runs ~1ms, based on the RTL loop of
//              250us.
//
//              FWIW -- It is pointless to set this thread to run any more
//              often than the length of the RTL loop, since it is acting
//              on data passed back and forth via that loop.
//
// End Function Specification
void Dcom_thread_routine(void *arg)
{
    OCC_STATE l_newOccState = 0;
    OCC_MODE l_newOccMode = 0;
    SsxTimer l_timeout_timer;
    errlHndl_t l_errlHndl = NULL;

    // --------------------------------------------------
    // Create a timer that pops every 10 seconds to wake up
    // this thread, in case a semaphore never gets posted.
    // TODO: Is this really needed?
    // --------------------------------------------------
    ssx_timer_create(&l_timeout_timer,
                     (SsxTimerCallback) ssx_semaphore_post,
                     (void *) &G_dcomThreadWakeupSem);
    ssx_timer_schedule(&l_timeout_timer,
                       SSX_SECONDS(10),
                       SSX_SECONDS(10));

    for(;;)
    {
        // --------------------------------------------------
        // Wait on Semaphore until we get new data over DCOM
        // (signalled by sem_post() or timeout occurs.
        // Sem timeout is designed to be the slowest
        // interval we will attempt to run this thread at.
        // --------------------------------------------------

        // Wait for sem_post before we run through this thread.
        ssx_semaphore_pend(&G_dcomThreadWakeupSem, SSX_WAIT_FOREVER);

        // --------------------------------------------------
        // Counter to ensure thread is running (can wrap)
        // --------------------------------------------------
        G_dcom_thread_counter++;

        // --------------------------------------------------
        // Check if we need to update the sapphire table
        // --------------------------------------------------
        if(G_sysConfigData.system_type.kvm)
        {
            proc_check_for_sapphire_updates();
        }

        // --------------------------------------------------
        // Set Mode and State Based on Master
        // --------------------------------------------------
        // NOCHANGE when the master already matches our current state/mode
        l_newOccState = (G_occ_master_state == CURRENT_STATE()) ?
                        OCC_STATE_NOCHANGE : G_occ_master_state;

        // On KVM systems the externally-requested mode is the reference,
        // otherwise compare against our current mode
        if(G_sysConfigData.system_type.kvm)
        {
            l_newOccMode = (G_occ_master_mode == G_occ_external_req_mode_kvm ) ?
                           OCC_MODE_NOCHANGE : G_occ_master_mode;
        }
        else
        {
            l_newOccMode = (G_occ_master_mode == CURRENT_MODE() ) ?
                           OCC_MODE_NOCHANGE : G_occ_master_mode;
        }

        // Override State if SAFE state is requested
        l_newOccState = ( isSafeStateRequested() ) ?
                        OCC_STATE_SAFE : l_newOccState;

        // Override State if we are in SAFE state already
        l_newOccState = ( OCC_STATE_SAFE == CURRENT_STATE() ) ?
                        OCC_STATE_NOCHANGE : l_newOccState;

        if( (OCC_STATE_NOCHANGE != l_newOccState)
            || (OCC_MODE_NOCHANGE != l_newOccMode) )
        {
            // If we're active, then we should always process the mode change first
            // If we're not active, then we should always process the state change first
            if(OCC_STATE_ACTIVE == CURRENT_STATE())
            {
                // Set the new mode
                l_errlHndl = SMGR_set_mode(l_newOccMode, 0 /* TODO V/F */ );
                if(l_errlHndl)
                {
                    commitErrl(&l_errlHndl);
                }

                // Set the new state
                l_errlHndl = SMGR_set_state(l_newOccState);
                if(l_errlHndl)
                {
                    commitErrl(&l_errlHndl);
                }
            }
            else
            {
                // Set the new state
                l_errlHndl = SMGR_set_state(l_newOccState);
                if(l_errlHndl)
                {
                    commitErrl(&l_errlHndl);
                }

                // Set the new mode
                l_errlHndl = SMGR_set_mode(l_newOccMode, 0 /* TODO V/F */ );
                if(l_errlHndl)
                {
                    commitErrl(&l_errlHndl);
                }
            }
        }

        // --------------------------------------------------
        // DCM PStates
        // \_ can do sem_post to increment through state machine
        // --------------------------------------------------
        if(OCC_STATE_SAFE != CURRENT_STATE())
        {
            proc_gpsm_dcm_sync_enable_pstates_smh();
        }

        // --------------------------------------------------
        // SSX Sleep
        // --------------------------------------------------
        // Even if semaphores are continually posted, there is no reason
        // for us to run this thread any more often than once every 250us
        // so we don't starve any other thread
        ssx_sleep(SSX_MICROSECONDS(250));
    }
}
// Function Specification // // Name: dbug_err_inject // // Description: Injects an error // // End Function Specification void dbug_err_inject(const cmdh_fsp_cmd_t * i_cmd_ptr, cmdh_fsp_rsp_t * i_rsp_ptr) { errlHndl_t l_err; cmdh_dbug_inject_errl_query_t *l_cmd_ptr = (cmdh_dbug_inject_errl_query_t*) i_cmd_ptr; i_rsp_ptr->data_length[0] = 0; i_rsp_ptr->data_length[1] = 0; G_rsp_status = ERRL_RC_SUCCESS; if(!strncmp(l_cmd_ptr->comp, "RST", OCC_TRACE_NAME_SIZE)) { l_err = createErrl(CMDH_DBUG_MID, //modId INTERNAL_FAILURE, //reasoncode OCC_NO_EXTENDED_RC, //Extended reason code ERRL_SEV_PREDICTIVE, //Severity NULL, //Trace Buf DEFAULT_TRACE_SIZE, //Trace Size 0xff, //userdata1 0); //userdata2 if (INVALID_ERR_HNDL == l_err) { G_rsp_status = ERRL_RC_INTERNAL_FAIL; } addCalloutToErrl(l_err, ERRL_CALLOUT_TYPE_HUID, //callout type (HUID/CompID) G_sysConfigData.proc_huid, //callout data ERRL_CALLOUT_PRIORITY_HIGH); //priority REQUEST_RESET(l_err); } else { l_err = createErrl(CMDH_DBUG_MID, //modId INTERNAL_FAILURE, //reasoncode OCC_NO_EXTENDED_RC, //Extended reason code ERRL_SEV_UNRECOVERABLE, //Severity TRAC_get_td(l_cmd_ptr->comp), //Trace Buf DEFAULT_TRACE_SIZE, //Trace Size 0xff, //userdata1 0); //userdata2 if (INVALID_ERR_HNDL == l_err) { G_rsp_status = ERRL_RC_INTERNAL_FAIL; } // Commit Error log commitErrl(&l_err); } if (G_rsp_status == ERRL_RC_INTERNAL_FAIL) { TRAC_ERR("cmdh_dbug_inject_errl: Fail creating ERR Log\n"); } else { TRAC_INFO("cmdh_dbug_inject_errl: inject errl for COMP : %s\n", l_cmd_ptr->comp); } return; }
// Function Specification
//
// Name: errlTestErrorHandling
//
// Description: Exercises the error-log API with invalid and boundary
//              arguments (out-of-range severity, NULL pointers,
//              INVALID_ERR_HNDL, already-committed logs) and verifies each
//              call either rejects the input or leaves the log entry size
//              unchanged.  Calls that cannot be verified by inspection are
//              made anyway to prove they do not crash (e.g. TLB exception).
//
//              Returns the failure count accumulated by CHECK_CONDITION
//              (0 == all checks passed).
//
// End Function Specification
uint32_t errlTestErrorHandling()
{
    uint32_t l_rc = 0;                  // failure count returned to caller
    errlHndl_t l_errlHnd = NULL;
    uint8_t l_dataPtr[10];              // scratch payload for user-details calls
    uint16_t l_entrySizeBefore = 0;     // entry size sampled before each API call
    uint16_t l_entrySizeAfter = 0;      // entry size sampled after each API call
    ERRL_DBG(" START");
    do
    {
        /****************************************************/
        // Test createErrl with incorrect parameter
        // Set ERRL_SEVERITY to 0x04, out of range so log won't be created
        l_errlHnd = createErrl(TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                               0x04, NULL, 0, 0x01, 0x02);
        CHECK_CONDITION( l_errlHnd == INVALID_ERR_HNDL, l_rc);

        /****************************************************/
        // Test addTraceToErrl with incorrect parameter

        // Create a valid log to add traces to
        l_errlHnd = createErrl(TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                               ERRL_SEV_PREDICTIVE, NULL, 0, 0x01, 0x02);
        CHECK_CONDITION( l_errlHnd != INVALID_ERR_HNDL, l_rc);

        // i_trace = NULL, so entry size doesn't change
        l_entrySizeBefore = l_errlHnd->iv_userDetails.iv_entrySize;
        addTraceToErrl(NULL, 5, l_errlHnd);
        l_entrySizeAfter = l_errlHnd->iv_userDetails.iv_entrySize;
        CHECK_CONDITION(l_entrySizeBefore == l_entrySizeAfter, l_rc);

        // i_traceSz = 0, entry size doesn't change
        l_entrySizeBefore = l_errlHnd->iv_userDetails.iv_entrySize;
        addTraceToErrl(g_trac_inf, 0, l_errlHnd);
        l_entrySizeAfter = l_errlHnd->iv_userDetails.iv_entrySize;
        CHECK_CONDITION( l_entrySizeBefore == l_entrySizeAfter, l_rc);

        // io_err = NULL, entry size doesn't change
        l_entrySizeBefore = l_errlHnd->iv_userDetails.iv_entrySize;
        addTraceToErrl(g_trac_inf, 32, NULL);
        l_entrySizeAfter = l_errlHnd->iv_userDetails.iv_entrySize;
        CHECK_CONDITION( l_entrySizeBefore == l_entrySizeAfter, l_rc);

        // test addTraceToErrl after log is comitted so entry size doesn't change
        // (keep a second handle because commitErrl NULLs the one passed in)
        errlHndl_t l_errlHndx = l_errlHnd;
        commitErrl(&l_errlHnd);
        l_entrySizeBefore = l_errlHndx->iv_userDetails.iv_entrySize;
        addTraceToErrl(g_trac_inf, 32, l_errlHndx);
        l_entrySizeAfter = l_errlHndx->iv_userDetails.iv_entrySize;
        CHECK_CONDITION( l_entrySizeBefore == l_entrySizeAfter, l_rc);

        deleteErrl(&l_errlHndx);
        CHECK_CONDITION( l_errlHndx == NULL, l_rc);

        // io_err = INVALID_ERR_HNDL
        // We are making sure that this function handles a INVALID_ERR_HNDL
        // being passed, and that we can't verify if an error occurred by
        // checking anything.  (It will just cause a TLB exception)
        l_errlHnd = INVALID_ERR_HNDL;
        addTraceToErrl(g_trac_inf, 32, l_errlHnd);

        /****************************************************/
        // Test commitErrl with incorrect parameter

        // io_err = NULL
        // We are making sure that this function handles a NULL being passed;
        // a failure would manifest as a TLB exception, not a checkable value.
        commitErrl( NULL);

        // l_errlHnd should be set to NULL
        l_errlHnd = INVALID_ERR_HNDL;
        commitErrl(&l_errlHnd);
        CHECK_CONDITION( l_errlHnd == NULL, l_rc);

        /****************************************************/
        // Test deleteErrl with incorrect parameter

        // io_err = NULL
        // We are making sure that this function handles a NULL being passed;
        // a failure would manifest as a TLB exception, not a checkable value.
        deleteErrl( NULL);

        // l_errlHnd should be set to NULL
        l_errlHnd = INVALID_ERR_HNDL;
        deleteErrl(&l_errlHnd);
        CHECK_CONDITION( l_errlHnd == NULL, l_rc);

        /****************************************************/
        // Test addCalloutToErrl with incorrect parameter

        // Set io_err to NULL — must be tolerated without a TLB exception
        addCalloutToErrl(NULL, ERRL_CALLOUT_TYPE_HUID, 0,
                         ERRL_CALLOUT_PRIORITY_LOW);

        // Set io_err to INVALID_ERR_HNDL — must be tolerated as well
        addCalloutToErrl(INVALID_ERR_HNDL, ERRL_CALLOUT_TYPE_HUID, 0,
                         ERRL_CALLOUT_PRIORITY_LOW);

        /****************************************************/
        // Test addUsrDtlsToErrl with incorrect parameter

        // Create a valid log to add user details to
        l_errlHnd = createErrl(TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                               ERRL_SEV_PREDICTIVE, NULL, 0, 0x01, 0x02);
        CHECK_CONDITION( l_errlHnd != INVALID_ERR_HNDL, l_rc);

        // io_err = NULL — must be tolerated without a TLB exception
        addUsrDtlsToErrl(NULL, l_dataPtr, 10,
                         ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA);

        // io_err = INVALID_ERR_HNDL — must be tolerated as well
        addUsrDtlsToErrl(INVALID_ERR_HNDL, l_dataPtr, 10,
                         ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA);

        // i_dataPtr = NULL so entry size doesn't change
        l_entrySizeBefore = l_errlHnd->iv_userDetails.iv_entrySize;
        addUsrDtlsToErrl(l_errlHnd, NULL, 10,
                         ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA);
        l_entrySizeAfter = l_errlHnd->iv_userDetails.iv_entrySize;
        CHECK_CONDITION( l_entrySizeBefore == l_entrySizeAfter, l_rc);

        // i_size = 0 so entry size doesn't change
        l_entrySizeBefore = l_errlHnd->iv_userDetails.iv_entrySize;
        addUsrDtlsToErrl(l_errlHnd, l_dataPtr, 0,
                         ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA);
        l_entrySizeAfter = l_errlHnd->iv_userDetails.iv_entrySize;
        CHECK_CONDITION( l_entrySizeBefore == l_entrySizeAfter, l_rc);

        // test addUsrDtlsToErrl after log is committed so entry size doesn't change
        l_errlHndx = l_errlHnd;
        commitErrl(&l_errlHnd);
        l_entrySizeBefore = l_errlHndx->iv_userDetails.iv_entrySize;
        addUsrDtlsToErrl(l_errlHndx, l_dataPtr, 10,
                         ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA);
        l_entrySizeAfter = l_errlHndx->iv_userDetails.iv_entrySize;
        CHECK_CONDITION( l_entrySizeBefore == l_entrySizeAfter, l_rc);

        deleteErrl(&l_errlHndx);
        CHECK_CONDITION( l_errlHndx == NULL, l_rc);

        /****************************************************/
        // Test setErrlSevToInfo with incorrect parameter

        // io_err = NULL — must be tolerated without a TLB exception
        setErrlSevToInfo(NULL);

        // io_err = INVALID_ERR_HNDL — must be tolerated as well
        setErrlSevToInfo(INVALID_ERR_HNDL);

    }while(0);

    return l_rc;
}
// Function Specification // // Name: dcom_error_check // // Description: keep track of failure counts // // End Function Specification void dcom_error_check( const dcom_error_type_t i_error_type, const bool i_clear_error, const uint32_t i_orc, const uint32_t i_orc_ext) { static uint16_t L_rx_slv_outbox_fail_count = 0; uint16_t l_modId = 0; uint16_t *l_count_ptr = NULL; if ( i_error_type == SLAVE_INBOX ) { l_count_ptr = &G_dcomSlvInboxCounter.currentFailCount; l_modId = DCOM_MID_TASK_RX_SLV_INBOX; } // if the i_error_type == SLAVE_OUTBOX then set the outbox count else { l_count_ptr = &L_rx_slv_outbox_fail_count; l_modId = DCOM_MID_TASK_RX_SLV_OUTBOX; } if ( i_clear_error ) { *l_count_ptr = 0; } else { (*l_count_ptr)++; if ( *l_count_ptr == DCOM_250us_GAP ) { // Trace an imp trace log TRAC_IMP("l_count_ptr[%d], L_outbox[%d], L_inbox[%d]", *l_count_ptr, L_rx_slv_outbox_fail_count, G_dcomSlvInboxCounter.currentFailCount ); } else if ( *l_count_ptr == DCOM_4MS_GAP ) { // Create and commit error log // NOTE: SRC tags are NOT needed here, they are // taken care of by the caller errlHndl_t l_errl = createErrl( l_modId, //ModId i_orc, //Reasoncode i_orc_ext, //Extended reasoncode ERRL_SEV_UNRECOVERABLE, //Severity NULL, //Trace Buf DEFAULT_TRACE_SIZE, //Trace Size *l_count_ptr, //Userdata1 0 //Userdata2 ); // Commit log commitErrl( &l_errl ); // Call request nominal macro to change state REQUEST_NOMINAL(); } else if ( *l_count_ptr == DCOM_1S_GAP ) { // Create and commit error log // NOTE: SRC tags are NOT needed here, they are // taken care of by the caller errlHndl_t l_errl = createErrl( l_modId, //ModId i_orc, //Reasoncode i_orc_ext, //Extended reasoncode ERRL_SEV_UNRECOVERABLE, //Severity NULL, //Trace Buf DEFAULT_TRACE_SIZE, //Trace Size *l_count_ptr, //Userdata1 0 //Userdata2 ); // Commit log // Call request reset macro REQUEST_RESET(l_errl); } } }
void task_centaur_control( task_t * i_task ) { errlHndl_t l_err = NULL; // Error handler int rc = 0; // Return code uint32_t l_cent; amec_centaur_t *l_cent_ptr = NULL; static uint8_t L_scom_timeout[MAX_NUM_CENTAURS] = {0}; //track # of consecutive failures static bool L_gpe_scheduled = FALSE; static uint8_t L_gpe_fail_logged = 0; static bool L_gpe_idle_traced = FALSE; static bool L_gpe_had_1_tick = FALSE; // Pointer to the task data structure centaur_control_task_t * l_centControlTask = (centaur_control_task_t *) i_task->data_ptr; // Pointer to parameter field for GPE request GpeScomParms * l_parms = (GpeScomParms *)(l_centControlTask->gpe_req.parameter); do { l_cent = l_centControlTask->curCentaur; l_cent_ptr = &g_amec->proc[0].memctl[l_cent].centaur; //First, check to see if the previous GPE request still running //A request is considered idle if it is not attached to any of the //asynchronous request queues if( !(async_request_is_idle(&l_centControlTask->gpe_req.request)) ) { L_scom_timeout[l_cent]++; //This can happen due to variability in when the task runs if(!L_gpe_idle_traced && L_gpe_had_1_tick) { TRAC_INFO("task_centaur_control: GPE is still running. cent[%d]", l_cent); l_centControlTask->traceThresholdFlags |= CENTAUR_CONTROL_GPE_STILL_RUNNING; L_gpe_idle_traced = TRUE; } L_gpe_had_1_tick = TRUE; break; } else { //Request is idle L_gpe_had_1_tick = FALSE; if(L_gpe_idle_traced) { TRAC_INFO("task_centaur_control: GPE completed. cent[%d]", l_cent); L_gpe_idle_traced = FALSE; } } //check scom status if(L_gpe_scheduled) { if(!async_request_completed(&l_centControlTask->gpe_req.request) || l_parms->rc) { if(!(L_gpe_fail_logged & (CENTAUR0_PRESENT_MASK >> l_cent))) { // Check if the centaur has a channel checkstop. If it does, // then do not log any errors. We also don't want to throttle // a centaur that is in this condition. 
if(!(cent_chan_checkstop(l_cent))) { L_gpe_fail_logged |= CENTAUR0_PRESENT_MASK >> l_cent; TRAC_ERR("task_centaur_control: gpe_scom_centaur failed. l_cent=%d rc=%x, index=0x%08x", l_cent, l_parms->rc, l_parms->errorIndex); /* @ * @errortype * @moduleid CENT_TASK_CONTROL_MOD * @reasoncode CENT_SCOM_ERROR * @userdata1 rc - Return code of scom operation * @userdata2 index of scom operation that failed * @userdata4 OCC_NO_EXTENDED_RC * @devdesc OCC access to centaur failed */ l_err = createErrl( CENT_TASK_CONTROL_MOD, // modId CENT_SCOM_ERROR, // reasoncode OCC_NO_EXTENDED_RC, // Extended reason code ERRL_SEV_PREDICTIVE, // Severity NULL, // Trace Buf DEFAULT_TRACE_SIZE, // Trace Size l_parms->rc, // userdata1 l_parms->errorIndex // userdata2 ); addUsrDtlsToErrl(l_err, //io_err (uint8_t *) &(l_centControlTask->gpe_req.ffdc), //i_dataPtr, sizeof(PoreFfdc), //i_size ERRL_USR_DTL_STRUCT_VERSION_1, //version ERRL_USR_DTL_BINARY_DATA); //type //callout the centaur addCalloutToErrl(l_err, ERRL_CALLOUT_TYPE_HUID, G_sysConfigData.centaur_huids[l_cent], ERRL_CALLOUT_PRIORITY_MED); //callout the processor addCalloutToErrl(l_err, ERRL_CALLOUT_TYPE_HUID, G_sysConfigData.proc_huid, ERRL_CALLOUT_PRIORITY_MED); commitErrl(&l_err); } }//if(l_gpe_fail_logged & (CENTAUR0_PRESENT_MASK >> l_cent)) //Request failed. Keep count of failures and request a reset if we reach a //max retry count L_scom_timeout[l_cent]++; if(L_scom_timeout[l_cent] == CENTAUR_CONTROL_SCOM_TIMEOUT) { break; } }//if(!async_request_completed(&l_centControlTask->gpe_req.request) || l_parms->rc) else { //request completed successfully. reset the timeout. L_scom_timeout[l_cent] = 0; } }//if(L_gpe_scheduled)
////////////////////////// // Function Specification // // Name: amec_gpu_pcap // // Description: Determine power cap for GPUs // // Thread: Real Time Loop // // End Function Specification void amec_gpu_pcap(bool i_oversubscription, bool i_active_pcap_changed, int32_t i_avail_power) { /*------------------------------------------------------------------------*/ /* Local Variables */ /*------------------------------------------------------------------------*/ uint8_t i = 0; uint32_t l_gpu_cap_mw = 0; uint16_t l_system_gpu_total_pcap = 0; // total GPU pcap required by system based on if currently in oversub or not static uint16_t L_total_gpu_pcap = 0; // Current total GPU pcap in effect static uint16_t L_n_plus_1_mode_gpu_total_pcap = 0; // Total GPU pcap required for N+1 (not in oversubscription) static uint16_t L_n_mode_gpu_total_pcap = 0; // Total GPU pcap required for oversubscription static uint16_t L_active_psr_gpu_total_pcap = 0; // Total GPU pcap for the currently set pcap and PSR static uint16_t L_per_gpu_pcap = 0; // Amount of L_total_gpu_pcap for each GPU static uint8_t L_psr = 100; // PSR value used in L_active_psr_gpu_total_pcap calculation static bool L_first_run = TRUE; // for calculations done only 1 time static uint32_t L_last_pcap_traced[MAX_NUM_GPU_PER_DOMAIN] = {0}; /*------------------------------------------------------------------------*/ /* Code */ /*------------------------------------------------------------------------*/ // If this is the first time running calculate the total GPU power cap for system power caps (N and N+1) if(L_first_run) { // calculate total GPU power cap for oversubscription if(g_amec->pcap.ovs_node_pcap > G_sysConfigData.total_non_gpu_max_pwr_watts) { // Take all non-GPU power away from the oversubscription power cap L_n_mode_gpu_total_pcap = g_amec->pcap.ovs_node_pcap - G_sysConfigData.total_non_gpu_max_pwr_watts; // Add back in the power that will be dropped by processor DVFS and memory throttling and give to GPUs 
L_n_mode_gpu_total_pcap += G_sysConfigData.total_proc_mem_pwr_drop_watts; } else { // This should not happen, the total non GPU power should never be higher than the N mode cap // Log error and set GPUs to minimum power cap L_n_mode_gpu_total_pcap = 0; // this will set minimum GPU power cap TRAC_ERR("amec_gpu_pcap: non GPU max power %dW is more than N mode pwr limit %dW", G_sysConfigData.total_non_gpu_max_pwr_watts, g_amec->pcap.ovs_node_pcap); /* @ * @errortype * @moduleid AMEC_GPU_PCAP_MID * @reasoncode GPU_FAILURE * @userdata1 N mode Power Cap watts * @userdata2 Total non-GPU power watts * @userdata4 ERC_GPU_N_MODE_PCAP_CALC_FAILURE * @devdesc Total non-GPU power more than N mode power cap * */ errlHndl_t l_err = createErrl(AMEC_GPU_PCAP_MID, GPU_FAILURE, ERC_GPU_N_MODE_PCAP_CALC_FAILURE, ERRL_SEV_PREDICTIVE, NULL, DEFAULT_TRACE_SIZE, g_amec->pcap.ovs_node_pcap, G_sysConfigData.total_non_gpu_max_pwr_watts); //Callout firmware addCalloutToErrl(l_err, ERRL_CALLOUT_TYPE_COMPONENT_ID, ERRL_COMPONENT_ID_FIRMWARE, ERRL_CALLOUT_PRIORITY_HIGH); commitErrl(&l_err); } // calculate total GPU power cap for N+1 (not in oversubscription) if(G_sysConfigData.pcap.system_pcap > G_sysConfigData.total_non_gpu_max_pwr_watts) { // Take all non-GPU power away from the N+1 power cap L_n_plus_1_mode_gpu_total_pcap = G_sysConfigData.pcap.system_pcap - G_sysConfigData.total_non_gpu_max_pwr_watts; // Add back in the power that will be dropped by processor DVFS and memory throttling and give to GPUs L_n_plus_1_mode_gpu_total_pcap += G_sysConfigData.total_proc_mem_pwr_drop_watts; } else { // This should not happen, the total non GPU power should never be higher than the N+1 mode cap // Log error and set GPUs to minimum power cap L_n_plus_1_mode_gpu_total_pcap = 0; // this will set minimum GPU power cap TRAC_ERR("amec_gpu_pcap: non GPU max power %dW is more than N+1 mode pwr limit %dW", G_sysConfigData.total_non_gpu_max_pwr_watts, G_sysConfigData.pcap.system_pcap); /* @ * @errortype * 
@moduleid AMEC_GPU_PCAP_MID * @reasoncode GPU_FAILURE * @userdata1 N+1 mode Power Cap watts * @userdata2 Total non-GPU power watts * @userdata4 ERC_GPU_N_PLUS_1_MODE_PCAP_CALC_FAILURE * @devdesc Total non-GPU power more than N+1 mode power cap * */ errlHndl_t l_err = createErrl(AMEC_GPU_PCAP_MID, GPU_FAILURE, ERC_GPU_N_PLUS_1_MODE_PCAP_CALC_FAILURE, ERRL_SEV_PREDICTIVE, NULL, DEFAULT_TRACE_SIZE, G_sysConfigData.pcap.system_pcap, G_sysConfigData.total_non_gpu_max_pwr_watts); //Callout firmware addCalloutToErrl(l_err, ERRL_CALLOUT_TYPE_COMPONENT_ID, ERRL_COMPONENT_ID_FIRMWARE, ERRL_CALLOUT_PRIORITY_HIGH); commitErrl(&l_err); } } // if first run // Calculate the total GPU power cap for the current active limit and PSR // this only needs to be calculated if either the active limit or PSR changed if( (L_first_run) || (i_active_pcap_changed) || (L_psr != G_sysConfigData.psr) ) { L_psr = G_sysConfigData.psr; if(g_amec->pcap.active_node_pcap > G_sysConfigData.total_non_gpu_max_pwr_watts) { // Take all non-GPU power away from the active power cap L_active_psr_gpu_total_pcap = g_amec->pcap.active_node_pcap - G_sysConfigData.total_non_gpu_max_pwr_watts; // Add back in the power that will be dropped by processor DVFS and memory throttling based on the PSR // to give to GPUs L_active_psr_gpu_total_pcap += ( (L_psr / 100) * G_sysConfigData.total_proc_mem_pwr_drop_watts ); } else { // Set GPUs to minimum power cap L_active_psr_gpu_total_pcap = 0; TRAC_IMP("amec_gpu_pcap: non GPU max power %dW is more than active pwr limit %dW", G_sysConfigData.total_non_gpu_max_pwr_watts, g_amec->pcap.active_node_pcap); } // Total GPU power cap is the lower of system (N+1 or oversubscription depending on if in oversub) // and the active power limit. 
We do not need to always account for oversubscription since // the automatic hw power brake will assert to the GPUs if there is a problem when oversub is // entered from the time OCC can set and GPUs react to a new power limit if(i_oversubscription) { // system in oversubscription use N mode cap l_system_gpu_total_pcap = L_n_mode_gpu_total_pcap; } else { // system is not in oversubscription use N+1 mode cap l_system_gpu_total_pcap = L_n_plus_1_mode_gpu_total_pcap; } L_total_gpu_pcap = (l_system_gpu_total_pcap < L_active_psr_gpu_total_pcap) ? l_system_gpu_total_pcap : L_active_psr_gpu_total_pcap; // Divide the total equally across all GPUs in the system if(G_first_num_gpus_sys) { L_per_gpu_pcap = L_total_gpu_pcap / G_first_num_gpus_sys; } else { L_per_gpu_pcap = 0; TRAC_ERR("amec_gpu_pcap: Called with no GPUs present!"); } } // Setup to send new power limit to GPUs. The actual sending of GPU power limit will be handled by task_gpu_sm() for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++) { // Before sending a GPU a power limit the power limits must be read from the GPU to know min/max GPU allows if( GPU_PRESENT(i) && g_amec->gpu[i].pcap.pwr_limits_read ) { l_gpu_cap_mw = L_per_gpu_pcap * 1000; // convert W to mW // GPU is present and have min/max power limits from GPU // clip the GPU power limit to min/max GPU limit if needed if(l_gpu_cap_mw < g_amec->gpu[i].pcap.gpu_min_pcap_mw) // clip to min? { l_gpu_cap_mw = g_amec->gpu[i].pcap.gpu_min_pcap_mw; } else if(l_gpu_cap_mw > g_amec->gpu[i].pcap.gpu_max_pcap_mw) // clip to max? 
{ l_gpu_cap_mw = g_amec->gpu[i].pcap.gpu_max_pcap_mw; } // check if this is a new power limit if(g_amec->gpu[i].pcap.gpu_desired_pcap_mw != l_gpu_cap_mw) { if( (g_amec->gpu[i].pcap.gpu_desired_pcap_mw != 0) || (L_last_pcap_traced[i] != l_gpu_cap_mw) ) { L_last_pcap_traced[i] = l_gpu_cap_mw; TRAC_IMP("amec_gpu_pcap: Updating GPU%d desired pcap %dmW to %dmW", i, g_amec->gpu[i].pcap.gpu_desired_pcap_mw, l_gpu_cap_mw); } g_amec->gpu[i].pcap.gpu_desired_pcap_mw = l_gpu_cap_mw; } } } // for each GPU L_first_run = FALSE; }
// Function Specification
//
// Name: errlTestAddTraceToErrl
//
// Description: Verifies that addTraceToErrl enforces the maximum error-log
//              entry size (MAX_ERRL_ENTRY_SZ), both for a single oversized
//              trace and for repeated calls whose cumulative size exceeds
//              the limit.  Returns the failure count accumulated by
//              CHECK_CONDITION (0 == all checks passed).
//
// End Function Specification
uint32_t errlTestAddTraceToErrl()
{
    uint32_t l_rc = 0;                  // failure count returned to caller
    uint16_t l_entrySizeBefore = 0;     // entry size sampled before each call
    uint16_t l_entrySizeAfter = 0;      // entry size sampled after each call
    ERRL_DBG("START");
    do
    {
        // Create one err log
        errlHndl_t l_handle = NULL;
        l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                               ERRL_SEV_PREDICTIVE, NULL, 512, 0x1, 0x2);
        CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc);

        // l_handle will set to NULL after calling the commitErrl, so we need to store it
        errlHndl_t l_handleX = l_handle;

        ERRL_DBG("Slots after Create - 1 slots should be used (one of each");
        ppdumpslot();

        /****************************************************/
        // Test size limit for addTraceToErrl

        // Add "trace" data that exceeds the max size; the entry must be
        // clamped at MAX_ERRL_ENTRY_SZ rather than grow unbounded
        l_entrySizeBefore = l_handle->iv_userDetails.iv_entrySize;
        addTraceToErrl(g_trac_inf, MAX_BUFFER_SIZE, l_handle);
        l_entrySizeAfter = l_handle->iv_userDetails.iv_entrySize;
        CHECK_CONDITION( l_entrySizeAfter <= MAX_ERRL_ENTRY_SZ, l_rc);

        dumpLog( l_handle, l_handle->iv_userDetails.iv_entrySize );
        commitErrl( &l_handle );
        ERRL_DBG("Slots after Commit - 1 slots should be used/committed");
        ppdumpslot();
        deleteErrl(&l_handleX);
        ERRL_DBG("Slots after delete Log - All slots should be empty");
        ppdumpslot();

        /****************************************************/
        // Test size limit for addTraceToErrl with continuous calls

        // Create log with 512 bytes trace
        l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                               ERRL_SEV_PREDICTIVE, g_trac_inf, 512, 0x1, 0x2);
        CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc);

        // l_handle will set to NULL after calling the commitErrl, so we need to store it
        l_handleX = l_handle;
        ppdumpslot();

        // Add 256 bytes of trace (512+256)
        l_entrySizeBefore = l_handle->iv_userDetails.iv_entrySize;
        addTraceToErrl(g_trac_inf, 256, l_handle);
        l_entrySizeAfter = l_handle->iv_userDetails.iv_entrySize;
        ERRL_DBG("Slots after create + 256 bytes" );
        ppdumpslot();
        // (header + 256) is the size that add to entry
        CHECK_CONDITION( l_entrySizeAfter <= (l_entrySizeBefore+sizeof(ErrlUserDetailsEntry_t)+256), l_rc);

        // Add 512 bytes of trace (512+256+512)
        l_entrySizeBefore = l_handle->iv_userDetails.iv_entrySize;
        addTraceToErrl(g_trac_inf, 512, l_handle);
        l_entrySizeAfter = l_handle->iv_userDetails.iv_entrySize;
        ERRL_DBG("Slots after create + 256 + 512 bytes");
        ppdumpslot();
        CHECK_CONDITION( l_entrySizeAfter <= (l_entrySizeBefore+sizeof(ErrlUserDetailsEntry_t)+512), l_rc);

        // Add 1024 bytes of trace (512+256+512+1024), the entry size is more
        // than 2048 now so the entry must be clamped at MAX_ERRL_ENTRY_SZ
        // (the ERRL_DBG text below repeats the previous message; debug-only)
        l_entrySizeBefore = l_handle->iv_userDetails.iv_entrySize;
        addTraceToErrl(g_trac_inf, 1024, l_handle);
        l_entrySizeAfter = l_handle->iv_userDetails.iv_entrySize;
        ERRL_DBG("Slots after create + 256 + 512 bytes");
        ppdumpslot();
        CHECK_CONDITION( l_entrySizeAfter <= MAX_ERRL_ENTRY_SZ, l_rc);

        commitErrl( &l_handle );
        deleteErrl(&l_handleX);
        ERRL_DBG("Slots should now be empty");
        ppdumpslot();

        ERRL_DBG("END \n");
    }while(0);

    return l_rc;
}
// Function Specification
//
// Name: errlTestAddUsrDtlsToErrl
//
// Description: Verifies addUsrDtlsToErrl size handling: oversized user
//              details are clamped to MAX_ERRL_ENTRY_SZ (trace data) or
//              MAX_ERRL_CALL_HOME_SZ (call-home data), a normal-sized add
//              grows the entry by exactly header + payload, and repeated
//              adds never exceed the maximum entry size.  Returns the
//              failure count accumulated by CHECK_CONDITION (0 == pass).
//
// End Function Specification
uint32_t errlTestAddUsrDtlsToErrl()
{
    uint32_t l_rc = 0;                  // failure count returned to caller
    ERRL_DBG("START");
    uint16_t l_entrySizeBefore = 0;     // entry size sampled before each call
    uint16_t l_entrySizeAfter = 0;      // entry size sampled after each call
    do
    {
        // Create three err logs, one per severity of interest
        errlHndl_t l_handle = NULL;
        errlHndl_t l_handle2 = NULL;
        errlHndl_t l_handle3 = NULL;
        l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                               ERRL_SEV_UNRECOVERABLE, NULL, 512, 0x1, 0x2);
        l_handle2 = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                                ERRL_SEV_CALLHOME_DATA, NULL, 512, 0x1, 0x2);
        l_handle3 = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                                ERRL_SEV_INFORMATIONAL, NULL, 512, 0x1, 0x2);

        // l_handle will set to NULL after calling the commitErrl, so we need to store it
        errlHndl_t l_handleX = l_handle;
        errlHndl_t l_handle2X = l_handle2;
        errlHndl_t l_handle3X = l_handle3;

        ERRL_DBG("Slots after Create - 3 slots should be used (one of each");
        ppdumpslot();
        CHECK_CONDITION( (l_handle != INVALID_ERR_HNDL) &&
                         (l_handle2 != INVALID_ERR_HNDL) &&
                         (l_handle3 != INVALID_ERR_HNDL), l_rc);

        /****************************************************/
        // Test size limit for addUsrDtlsToErrl

        // Add "user details" data that exceeds the max size for l_handle;
        // the entry must be clamped to MAX_ERRL_ENTRY_SZ
        l_entrySizeBefore = l_handle->iv_userDetails.iv_entrySize;
        memset( G_data, 0xCC, sizeof( G_data ) );
        addUsrDtlsToErrl( l_handle, G_data, sizeof( G_data ),
                          ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA );
        l_entrySizeAfter = l_handle->iv_userDetails.iv_entrySize;
        CHECK_CONDITION( l_entrySizeAfter == MAX_ERRL_ENTRY_SZ, l_rc);

        // Add "user details" data that exceeds the max size for l_handle2;
        // call-home data uses the larger MAX_ERRL_CALL_HOME_SZ limit
        l_entrySizeBefore = l_handle2->iv_userDetails.iv_entrySize;
        memset( G_data, 0xDD, sizeof( G_data ) );
        addUsrDtlsToErrl( l_handle2, G_data, sizeof( G_data ),
                          ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_CALLHOME_DATA );
        l_entrySizeAfter = l_handle2->iv_userDetails.iv_entrySize;
        CHECK_CONDITION( l_entrySizeAfter == MAX_ERRL_CALL_HOME_SZ, l_rc);

        // Add "user details" with size 76 for l_handle3
        l_entrySizeBefore = l_handle3->iv_userDetails.iv_entrySize;
        memset( G_data, 0xEE, sizeof( G_data ) );
        addUsrDtlsToErrl( l_handle3, G_data, 76,
                          ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA );
        l_entrySizeAfter = l_handle3->iv_userDetails.iv_entrySize;
        // (header + 76) is the size that add to entry
        CHECK_CONDITION( l_entrySizeAfter == (l_entrySizeBefore+sizeof(ErrlUserDetailsEntry_t)+76), l_rc);

        dumpLog( l_handle, l_handle->iv_userDetails.iv_entrySize );
        dumpLog( l_handle2, l_handle2->iv_userDetails.iv_entrySize );
        dumpLog( l_handle3, l_handle3->iv_userDetails.iv_entrySize );

        commitErrl( &l_handle );
        commitErrl( &l_handle2 );
        commitErrl( &l_handle3 );
        ERRL_DBG("Slots after Commit - 3 slots should be used/committed");
        ppdumpslot();

        deleteErrl(&l_handleX);
        deleteErrl(&l_handle2X);
        deleteErrl(&l_handle3X);
        CHECK_CONDITION( (l_handleX == NULL) &&
                         (l_handle2X == NULL) &&
                         (l_handle3X == NULL), l_rc);
        ERRL_DBG("Slots after delete Log - All slots should be empty");
        ppdumpslot();

        /****************************************************/
        // Test size limit for addUsrDtlsToErrl with continuous calls

        // Create log with 512 bytes trace
        l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC,
                               ERRL_SEV_PREDICTIVE, g_trac_inf, 512, 0x1, 0x2);
        CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc);

        // l_handle will set to NULL after calling the commitErrl, so we need to store it
        l_handleX = l_handle;
        ppdumpslot();

        // add 256 bytes of "user details" (512+256)
        l_entrySizeBefore = l_handle->iv_userDetails.iv_entrySize;
        memset( G_data, 0xAA, sizeof( G_data ) );
        addUsrDtlsToErrl( l_handle, G_data, 256,
                          ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA );
        l_entrySizeAfter = l_handle->iv_userDetails.iv_entrySize;
        ERRL_DBG("Slots after create + 256 bytes" );
        ppdumpslot();
        // (header + 256) is the size that add to entry
        CHECK_CONDITION( l_entrySizeAfter == (l_entrySizeBefore+sizeof(ErrlUserDetailsEntry_t)+256), l_rc);

        // add 512 bytes of "user details" (512+256+512)
        l_entrySizeBefore = l_handle->iv_userDetails.iv_entrySize;
        memset( G_data, 0xBB, sizeof( G_data ) );
        addUsrDtlsToErrl( l_handle, G_data, 512,
                          ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA );
        l_entrySizeAfter = l_handle->iv_userDetails.iv_entrySize;
        ERRL_DBG("Slots after create + 256 + 512 bytes");
        ppdumpslot();
        // (header + 512) is the size that add to entry
        CHECK_CONDITION( l_entrySizeAfter == (l_entrySizeBefore+sizeof(ErrlUserDetailsEntry_t)+512), l_rc);

        // add 1024 bytes of "user details" (512+256+512+1024), the entry size
        // is more than 2048 now so the entry must be clamped at the maximum
        l_entrySizeBefore = l_handle->iv_userDetails.iv_entrySize;
        memset( G_data, 0xCC, sizeof( G_data ) );
        addUsrDtlsToErrl( l_handle, G_data, 1024,
                          ERRL_USR_DTL_STRUCT_VERSION_1, ERRL_USR_DTL_TRACE_DATA );
        l_entrySizeAfter = l_handle->iv_userDetails.iv_entrySize;
        ERRL_DBG("Slots after create + 256 + 512 +1024 bytes");
        ppdumpslot();
        CHECK_CONDITION( l_entrySizeAfter <= MAX_ERRL_ENTRY_SZ, l_rc);

        commitErrl( &l_handle );
        deleteErrl(&l_handleX);
        ERRL_DBG("Slots should now be empty");
        ppdumpslot();

        ERRL_DBG("END \n");
    }while(0);

    return l_rc;
}
// Function Specification
//
// Name: amec_update_vrm_sensors
//
// Description: Updates sensors that use data from the VRMs
//              (e.g., VR_FAN, FANS_FULL_SPEED, VR_HOT).
//
// Thread: RealTime Loop
//
// End Function Specification
void amec_update_vrm_sensors(void)
{
    /*------------------------------------------------------------------------*/
    /*  Local Variables                                                       */
    /*------------------------------------------------------------------------*/
    int l_rc = 0;
    int l_vrfan = 0;
    int l_softoc = 0;
    int l_minus_np1_regmode = 0;
    int l_minus_n_regmode = 0;
    static uint8_t L_error_count = 0;   // consecutive vrm_read_state failures
    uint8_t l_pin = 0;
    uint8_t l_pin_value = 1; // active low, so set default to high
    uint8_t l_vrhot_count = 0;
    errlHndl_t l_err = NULL;

    /*------------------------------------------------------------------------*/
    /*  Code                                                                  */
    /*------------------------------------------------------------------------*/

    // Check if we have access to SPIVID. In DCMs only Master OCC has access to
    // the SPIVID.
    if (G_dcm_occ_role == OCC_DCM_MASTER)
    {
        // VR_FAN and SOFT_OC come from SPIVID
        l_rc = vrm_read_state(SPIVRM_PORT(0),
                              &l_minus_np1_regmode,
                              &l_minus_n_regmode,
                              &l_vrfan,
                              &l_softoc);
        if (l_rc == 0)
        {
            // Update the VR_FAN sensor
            sensor_update( AMECSENSOR_PTR(VRFAN250USPROC), (uint16_t)l_vrfan );

            // Clear our error count and the 'read failure' flag (since we can
            // read VR_FAN signal)
            L_error_count = 0;
            G_thrm_fru_data[DATA_FRU_VRM].read_failure = 0;

            // Obtain the 'fan_full_speed' GPIO from APSS
            l_pin = G_sysConfigData.apss_gpio_map.fans_full_speed;

            // No longer reading gpio from APSS in GA1 due to instability in
            // APSS composite mode.  l_pin_value therefore keeps its default
            // of 1 (inactive, since the GPIO is active low).
            //apss_gpio_get(l_pin, &l_pin_value);

            // VR_HOT sensor is a counter of number of times the VRHOT signal
            // has been asserted
            l_vrhot_count = AMECSENSOR_PTR(VRHOT250USPROC)->sample;

            // Check if VR_FAN is asserted AND if 'fans_full_speed' GPIO is ON.
            // Note that this GPIO is active low.
            // (With the APSS read disabled above, l_pin_value is always 1,
            // so this branch currently never increments the counter.)
            if (AMECSENSOR_PTR(VRFAN250USPROC)->sample && !(l_pin_value))
            {
                // VR_FAN is asserted and 'fans_full_speed' GPIO is ON,
                // then increment our VR_HOT counter (saturating at setpoint)
                if (l_vrhot_count < g_amec->vrhotproc.setpoint)
                {
                    l_vrhot_count++;
                }
            }
            else
            {
                // Reset our VR_HOT counter
                l_vrhot_count = 0;
            }
            sensor_update(AMECSENSOR_PTR(VRHOT250USPROC), l_vrhot_count);
        }
        else
        {
            // Increment our error count
            L_error_count++;

            // Don't allow the error count to wrap; saturate at 0xFF instead
            if (L_error_count == 0)
            {
                L_error_count = 0xFF;
            }

            // Log an error if we exceeded our number of fail-to-read sensor
            // (a threshold of 0xFF means "never log")
            if ((L_error_count == g_amec->proc[0].vrfan_error_count) &&
                (g_amec->proc[0].vrfan_error_count != 0xFF))
            {
                TRAC_ERR("amec_update_vrm_sensors: Failed to read VR_FAN for %u consecutive times!",
                         L_error_count);

                // Also, inform the thermal thread to send a cooling request
                G_thrm_fru_data[DATA_FRU_VRM].read_failure = 1;

                /* @
                 * @errortype
                 * @moduleid    AMEC_HEALTH_CHECK_VRFAN_TIMEOUT
                 * @reasoncode  VRM_VRFAN_TIMEOUT
                 * @userdata1   timeout value
                 * @userdata2   0
                 * @userdata4   OCC_NO_EXTENDED_RC
                 * @devdesc     Failed to read VR_FAN signal from regulator.
                 *
                 */
                // NOTE(review): userdata1 is the DIMM-thermal timeout
                // (thermaldimm.temp_timeout) for a VRFAN failure log —
                // confirm this field is the intended timeout source.
                l_err = createErrl(AMEC_HEALTH_CHECK_VRFAN_TIMEOUT, //modId
                                   VRM_VRFAN_TIMEOUT,               //reasoncode
                                   OCC_NO_EXTENDED_RC,              //Extended reason code
                                   ERRL_SEV_PREDICTIVE,             //Severity
                                   NULL,                            //Trace Buf
                                   DEFAULT_TRACE_SIZE,              //Trace Size
                                   g_amec->thermaldimm.temp_timeout, //userdata1
                                   0);                              //userdata2

                // Callout backplane for this VRM error
                addCalloutToErrl(l_err,
                                 ERRL_CALLOUT_TYPE_HUID,
                                 G_sysConfigData.backplane_huid,
                                 ERRL_CALLOUT_PRIORITY_MED);

                // Commit the error
                commitErrl(&l_err);
            }
        }
    }

    // Memory VR sensors are not read; unconditionally report them as 0
    if( 1 )
    {
        sensor_update( AMECSENSOR_PTR(VRFAN250USMEM), 0 );
        sensor_update( AMECSENSOR_PTR(VRHOT250USMEM), 0 );
    }
}
// Function Specification // // Name: errlTestCallouts // // Description: errlTestCallouts // // End Function Specification uint32_t errlTestCallouts() { uint32_t l_rc = 0; ERRL_DBG("START"); do { errlHndl_t l_handle = NULL; ERRL_DBG("--------------------------------\n"); /****************************************************/ // Check max callouts l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC, ERRL_SEV_PREDICTIVE,g_trac_inf, 128, 0x1, 0x2); CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc); ERRL_CALLOUT_PRIORITY l_array[8] = { ERRL_CALLOUT_PRIORITY_HIGH, ERRL_CALLOUT_PRIORITY_MED, ERRL_CALLOUT_PRIORITY_LOW, ERRL_CALLOUT_PRIORITY_HIGH, ERRL_CALLOUT_PRIORITY_MED, ERRL_CALLOUT_PRIORITY_MED, ERRL_CALLOUT_PRIORITY_LOW, ERRL_CALLOUT_PRIORITY_LOW, }; ERRL_CALLOUT_TYPE l_type[8] = { ERRL_CALLOUT_TYPE_HUID, ERRL_CALLOUT_TYPE_COMPONENT_ID, ERRL_CALLOUT_TYPE_HUID, ERRL_CALLOUT_TYPE_COMPONENT_ID, ERRL_CALLOUT_TYPE_HUID, ERRL_CALLOUT_TYPE_COMPONENT_ID, ERRL_CALLOUT_TYPE_HUID, ERRL_CALLOUT_TYPE_COMPONENT_ID, }; // add 6 (ERRL_MAX_CALLOUTS) callouts uint8_t l_index = 0; for(l_index = 0; l_index < ERRL_MAX_CALLOUTS; l_index++) { ERRL_DBG("current callouts %d attempting to add callout # %d with type %d ,priority %d", l_handle->iv_numCallouts, l_index, l_type[l_index], l_array[l_index] ); addCalloutToErrl(l_handle,l_type[l_index],l_index,l_array[l_index]); } CHECK_CONDITION( l_handle->iv_numCallouts == ERRL_MAX_CALLOUTS, l_rc); // add one more callout and it should fail addCalloutToErrl(l_handle,l_type[0],l_index,l_array[0]); CHECK_CONDITION( l_handle->iv_numCallouts == ERRL_MAX_CALLOUTS, l_rc); dumpLog( l_handle, l_handle->iv_userDetails.iv_entrySize ); deleteErrl( &l_handle ); ppdumpslot(); /****************************************************/ // Check callouts after errl is committed // Create log l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC, ERRL_SEV_PREDICTIVE,g_trac_inf, 32, 0x1, 0x2); errlHndl_t l_log = l_handle; CHECK_CONDITION( 
l_handle != INVALID_ERR_HNDL, l_rc); // Commit log and add callout. But adding callout should fail commitErrl( &l_handle ); addCalloutToErrl(l_handle,l_type[0],0,l_array[0]); CHECK_CONDITION( l_log->iv_numCallouts == ERRL_MAX_CALLOUTS, l_rc); deleteErrl(&l_log); /****************************************************/ // Check addCalloutToErrl for ERRL_SEV_INFORMATIONAL log // Create ERRL_SEV_INFORMATIONAL log l_handle = createErrl( TEST_MODULE_ID, 0x08, OCC_NO_EXTENDED_RC, ERRL_SEV_INFORMATIONAL,g_trac_inf, 128, 0x1, 0x2); CHECK_CONDITION( l_handle != INVALID_ERR_HNDL, l_rc); if(l_handle == INVALID_ERR_HNDL) // add one callout and it should fail addCalloutToErrl(l_handle,l_type[0],l_index,l_array[0]); CHECK_CONDITION( l_handle->iv_numCallouts == 0, l_rc); dumpLog( l_handle, l_handle->iv_userDetails.iv_entrySize ); deleteErrl( &l_handle ); ppdumpslot(); ERRL_DBG("END \n"); }while(0); return l_rc; }
// Function Specification // // Name: task_check_for_checkstop // // Description: Check for checkstop // // End Function Specification void task_check_for_checkstop(task_t *i_self) { pore_status_t l_gpe0_status; ocb_oisr0_t l_oisr0_status; static bool L_checkstop_traced = FALSE; uint8_t l_reason_code = 0; do { // This check is disabled once a checkstop or frozen GPE is detected if(L_checkstop_traced) { break; } // Looked for a frozen GPE, a sign that the chip has stopped working or // check-stopped. This check also looks for an interrupt status flag that // indicates if the system has check-stopped. l_gpe0_status.value = in64(PORE_GPE0_STATUS); l_oisr0_status.value = in32(OCB_OISR0); if (l_gpe0_status.fields.freeze_action || l_oisr0_status.fields.check_stop) { errlHndl_t l_err = NULL; if (l_gpe0_status.fields.freeze_action) { TRAC_IMP("Frozen GPE0 detected by RTL"); l_reason_code = OCC_GPE_HALTED; } if (l_oisr0_status.fields.check_stop) { TRAC_IMP("System checkstop detected by RTL"); l_reason_code = OCC_SYSTEM_HALTED; } L_checkstop_traced = TRUE; /* * @errortype * @moduleid MAIN_SYSTEM_HALTED_MID * @reasoncode OCC_GPE_HALTED * @userdata1 High order word of PORE_GPE0_STATUS * @userdata2 OCB_OISR0 * @devdesc OCC detected frozen GPE0 */ /* * @errortype * @moduleid MAIN_SYSTEM_HALTED_MID * @reasoncode OCC_SYSTEM_HALTED * @userdata1 High order word of PORE_GPE0_STATUS * @userdata2 OCB_OISR0 * @devdesc OCC detected system checkstop */ l_err = createErrl(MAIN_SYSTEM_HALTED_MID, l_reason_code, OCC_NO_EXTENDED_RC, ERRL_SEV_INFORMATIONAL, NULL, DEFAULT_TRACE_SIZE, l_gpe0_status.words.high_order, l_oisr0_status.value); // The commit code will check for the frozen GPE0 and system // checkstop conditions and take appropriate actions. commitErrl(&l_err); } } while(0); }
void task_core_data( task_t * i_task ) { errlHndl_t l_err = NULL; //Error handler tracDesc_t l_trace = NULL; //Temporary trace descriptor int rc = 0; //return code bulk_core_data_task_t * l_bulk_core_data_ptr = (bulk_core_data_task_t *)i_task->data_ptr; GpeGetCoreDataParms * l_parms = (GpeGetCoreDataParms *)(l_bulk_core_data_ptr->gpe_req.parameter); gpe_bulk_core_data_t * l_temp = NULL; do { //First, check to see if the previous GPE request still running //A request is considered idle if it is not attached to any of the //asynchronous request queues if( !(async_request_is_idle(&l_bulk_core_data_ptr->gpe_req.request)) ) { //This should not happen unless there's a problem //Trace 1 time if( !G_queue_not_idle_traced ) { TRAC_ERR("Core data GPE is still running \n"); G_queue_not_idle_traced = TRUE; } break; } //Need to complete collecting data for all assigned cores from previous interval //and tick 0 is the current tick before collect data again. if( (l_bulk_core_data_ptr->current_core == l_bulk_core_data_ptr->end_core) && ((CURRENT_TICK & (MAX_NUM_TICKS - 1)) != 0) ) { PROC_DBG("Not collect data. Need to wait for tick.\n"); break; } //Check to see if the previously GPE request has successfully completed //A request is not considered complete until both the engine job //has finished without error and any callback has run to completion. if( async_request_completed(&l_bulk_core_data_ptr->gpe_req.request) && CORE_PRESENT(l_bulk_core_data_ptr->current_core) ) { //If the previous GPE request succeeded then swap core_data_ptr //with the global one. The gpe routine will write new data into //a buffer that is not being accessed by the RTLoop code. 
PROC_DBG( "Swap core_data_ptr [%x] with the global one\n", l_bulk_core_data_ptr->current_core ); //debug only #ifdef PROC_DEBUG print_core_status(l_bulk_core_data_ptr->current_core); print_core_data_sensors(l_bulk_core_data_ptr->current_core); #endif l_temp = l_bulk_core_data_ptr->core_data_ptr; l_bulk_core_data_ptr->core_data_ptr = G_core_data_ptrs[l_bulk_core_data_ptr->current_core]; G_core_data_ptrs[l_bulk_core_data_ptr->current_core] = l_temp; //Core data has been collected so set the bit in global mask. //AMEC code will know which cores to update sensors for. AMEC is //responsible for clearing the bit later on. G_updated_core_mask |= CORE0_PRESENT_MASK >> (l_bulk_core_data_ptr->current_core); // Presumptively clear the empath error mask G_empath_error_core_mask &= ~(CORE0_PRESENT_MASK >> (l_bulk_core_data_ptr->current_core)); // The gpe_data collection code has to handle the workaround for // HW280375. Two new flags have been added to the OHA_RO_STATUS_REG // image to indicate whether the EMPATH collection failed, and // whether it was due to an "expected" error that we can ignore // (we can ignore the data as well), or an "unexpected" error that // we will create an informational log one time. // // The "expected" errors are very rare in practice, in fact we may // never even see them unless running a specific type of workload. // If you want to test the handling of expected errors compile the // GPE code with -DINJECT_HW280375_ERRORS which will inject an error // approximately every 1024 samples // // To determine if the expected error has occurred inspect the // CoreDataOha element of the CoreData structure written by the GPE // core data job. The OHA element contains the oha_ro_status_reg. // Inside the OHA status register is a 16 bit reserved field. 
// gpe_data.h defines two masks that can be applied against the // reserved field to check for these errors: // CORE_DATA_EXPECTED_EMPATH_ERROR // CORE_DATA_UNEXPECTED_EMPATH_ERROR // Also, a 4-bit PCB parity + error code is saved at bit position: // CORE_DATA_EMPATH_ERROR_LOCATION, formally the length is // specified by: CORE_DATA_EMPATH_ERROR_BITS gpe_bulk_core_data_t *l_core_data = G_core_data_ptrs[l_bulk_core_data_ptr->current_core]; // We will trace the errors, but only a certain number of // times, we will only log the unexpected error once. #define OCC_EMPATH_ERROR_THRESH 10 static uint32_t L_expected_emp_err_cnt = 0; static uint32_t L_unexpected_emp_err_cnt = 0; // Check the reserved field for the expected or the unexpected error flag if ((l_core_data->oha.oha_ro_status_reg.fields._reserved0 & CORE_DATA_EXPECTED_EMPATH_ERROR) || (l_core_data->oha.oha_ro_status_reg.fields._reserved0 & CORE_DATA_UNEXPECTED_EMPATH_ERROR)) { // Indicate empath error on current core G_empath_error_core_mask |= CORE0_PRESENT_MASK >> (l_bulk_core_data_ptr->current_core); // Save the high and low order words of the OHA status reg uint32_t l_oha_reg_high = l_core_data->oha.oha_ro_status_reg.words.high_order; uint32_t l_oha_reg_low = l_core_data->oha.oha_ro_status_reg.words.low_order; // Handle each error case if ((l_core_data->oha.oha_ro_status_reg.fields._reserved0 & CORE_DATA_EXPECTED_EMPATH_ERROR) && (L_expected_emp_err_cnt < OCC_EMPATH_ERROR_THRESH)) { L_expected_emp_err_cnt++; TRAC_IMP("Expected empath collection error occurred %d time(s)! Core = %d", L_expected_emp_err_cnt, l_bulk_core_data_ptr->current_core); TRAC_IMP("OHA status register: 0x%4.4x%4.4x", l_oha_reg_high, l_oha_reg_low); } if ((l_core_data->oha.oha_ro_status_reg.fields._reserved0 & CORE_DATA_UNEXPECTED_EMPATH_ERROR) && (L_unexpected_emp_err_cnt < OCC_EMPATH_ERROR_THRESH)) { L_unexpected_emp_err_cnt++; TRAC_ERR("Unexpected empath collection error occurred %d time(s)! 
Core = %d", L_unexpected_emp_err_cnt, l_bulk_core_data_ptr->current_core); TRAC_ERR("OHA status register: 0x%4.4x%4.4x", l_oha_reg_high, l_oha_reg_low); // Create and commit an informational error the first // time this occurs. if (L_unexpected_emp_err_cnt == 1) { TRAC_IMP("Logging unexpected empath collection error 1 time only."); /* * @errortype * @moduleid PROC_TASK_CORE_DATA_MOD * @reasoncode INTERNAL_HW_FAILURE * @userdata1 OHA status reg high * @userdata2 OHA status reg low * @userdata4 ERC_PROC_CORE_DATA_EMPATH_ERROR * @devdesc An unexpected error occurred while * collecting core empath data. */ l_err = createErrl( PROC_TASK_CORE_DATA_MOD, //modId INTERNAL_HW_FAILURE, //reason code ERC_PROC_CORE_DATA_EMPATH_ERROR, //Extended reason code ERRL_SEV_INFORMATIONAL, //Severity NULL, //Trace DEFAULT_TRACE_SIZE, //Trace Size l_oha_reg_high, //userdata1 l_oha_reg_low); //userdata2 commitErrl(&l_err); } } } }
void amec_update_fw_sensors(void) { errlHndl_t l_err = NULL; int rc = 0; int rc2 = 0; static bool l_first_call = TRUE; bool l_gpe0_idle, l_gpe1_idle; static int L_consec_trace_count = 0; // ------------------------------------------------------ // Update OCC Firmware Sensors from last tick // ------------------------------------------------------ int l_last_state = G_fw_timing.amess_state; // RTLtickdur = duration of last tick's RTL ISR (max = 250us) sensor_update( AMECSENSOR_PTR(RTLtickdur), G_fw_timing.rtl_dur); // AMEintdur = duration of last tick's AMEC portion of RTL ISR sensor_update( AMECSENSOR_PTR(AMEintdur), G_fw_timing.ameint_dur); // AMESSdurX = duration of last tick's AMEC state if(l_last_state >= NUM_AMEC_SMH_STATES) { // Sanity check. Trace this out, even though it should never happen. TRAC_INFO("AMEC State Invalid, Sensor Not Updated"); } else { // AMESSdurX = duration of last tick's AMEC state sensor_update( AMECSENSOR_ARRAY_PTR(AMESSdur0, l_last_state), G_fw_timing.amess_dur); } // ------------------------------------------------------ // Kick off GPE programs to track WorstCase time in GPE // and update the sensors. // ------------------------------------------------------ if( (NULL != G_fw_timing.gpe0_timing_request) && (NULL != G_fw_timing.gpe1_timing_request) ) { //Check if both GPE engines were able to complete the last GPE job on //the queue within 1 tick. l_gpe0_idle = async_request_is_idle(&G_fw_timing.gpe0_timing_request->request); l_gpe1_idle = async_request_is_idle(&G_fw_timing.gpe1_timing_request->request); if(l_gpe0_idle && l_gpe1_idle) { //reset the consecutive trace count L_consec_trace_count = 0; //Both GPE engines finished on time. Now check if they were //successful too. 
if( async_request_completed(&(G_fw_timing.gpe0_timing_request->request)) && async_request_completed(&(G_fw_timing.gpe1_timing_request->request)) ) { // GPEtickdur0 = duration of last tick's PORE-GPE0 duration sensor_update( AMECSENSOR_PTR(GPEtickdur0), G_fw_timing.gpe_dur[0]); // GPEtickdur1 = duration of last tick's PORE-GPE1 duration sensor_update( AMECSENSOR_PTR(GPEtickdur1), G_fw_timing.gpe_dur[1]); } else { //This case is expected on the first call of the function. //After that, this should not happen. if(!l_first_call) { //Note: FFDC for this case is gathered by each task //responsible for a GPE job. TRAC_INFO("GPE task idle but GPE task did not complete"); } l_first_call = FALSE; } // Update Time used to measure GPE duration. G_fw_timing.rtl_start_gpe = G_fw_timing.rtl_start; // Schedule the GPE Routines that will run and update the worst // case timings (via callback) after they complete. These GPE // routines are the last GPE routines added to the queue // during the RTL tick. rc = pore_flex_schedule(G_fw_timing.gpe0_timing_request); rc2 = pore_flex_schedule(G_fw_timing.gpe1_timing_request); if(rc || rc2) { /* @ * @errortype * @moduleid AMEC_UPDATE_FW_SENSORS * @reasoncode SSX_GENERIC_FAILURE * @userdata1 return code - gpe0 * @userdata2 return code - gpe1 * @userdata4 OCC_NO_EXTENDED_RC * @devdesc Failure to schedule PORE-GPE poreFlex object for FW timing * analysis. */ l_err = createErrl( AMEC_UPDATE_FW_SENSORS, //modId SSX_GENERIC_FAILURE, //reasoncode OCC_NO_EXTENDED_RC, //Extended reason code ERRL_SEV_INFORMATIONAL, //Severity NULL, //Trace Buf DEFAULT_TRACE_SIZE, //Trace Size rc, //userdata1 rc2); //userdata2 // commit error log commitErrl( &l_err ); } } else if(L_consec_trace_count < MAX_CONSEC_TRACE) { uint64_t l_dbg1; // Reset will eventually be requested due to not having power measurement // data after X ticks, but add some additional FFDC to the trace that // will tell us what GPE job is currently executing. 
if(!l_gpe0_idle) { l_dbg1 = in64(PORE_GPE0_DBG1); TRAC_ERR("GPE0 programs did not complete within one tick. DBG1[0x%08x%08x]", l_dbg1 >> 32, l_dbg1 & 0x00000000ffffffffull); }