/**
 * Open the BML framework.
 *
 * Opens the BML framework components with the caller's flags.  When built
 * with OPAL_ENABLE_DEBUG_RELIABILITY, it also seeds the error-injection
 * random number generator and draws the initial error countdown.  Finally
 * it opens the BTL framework (always with flags 0).
 *
 * @param flags  open flags forwarded to mca_base_framework_components_open()
 *
 * @return the error code from opening the BML components if that fails,
 *         otherwise the result of mca_base_framework_open() on the BTL
 *         framework.
 */
static int mca_bml_base_open(mca_base_open_flag_t flags)
{
    int ret;

    if(OMPI_SUCCESS !=
       (ret = mca_base_framework_components_open(&ompi_bml_base_framework, flags))) {
        return ret;
    }

#if OPAL_ENABLE_DEBUG_RELIABILITY
    {
        /* seed random number generator
         *
         * The product is formed in uint32_t so that it wraps modulo 2^32
         * instead of overflowing a signed type (undefined behavior where
         * long is 32 bits).  Where the signed product did not overflow,
         * the resulting seed is the same value as before. */
        struct timeval tv;
        gettimeofday(&tv, NULL);
        opal_srand(&mca_bml_base_rand_buff,
                   (uint32_t) getpid() * (uint32_t) tv.tv_usec);
    }

    /* initialize count: scale a 32-bit random draw into [0, ceiling) */
    if(mca_bml_base_error_rate_ceiling > 0
       && mca_bml_base_error_rate_floor <= mca_bml_base_error_rate_ceiling) {
        mca_bml_base_error_count = (int)
            (((double) mca_bml_base_error_rate_ceiling *
              opal_rand(&mca_bml_base_rand_buff))/(UINT32_MAX+1.0));
    }
#endif

    return mca_base_framework_open(&ompi_btl_base_framework, 0);
}
/**
 * Build a backing-file path of the form
 *   "<base_path>/open_mpi_shmem_mmap.<pid>_<hash of hash_key>_<random>"
 *
 * the file name is only guaranteed to be unique on the local host. if there
 * was a failure that left backing files behind, then no such guarantees can be
 * made. we use the pid + file_name hash + random number to help avoid issues.
 *
 * @param base_path  directory prefix placed in front of the generated name
 * @param hash_key   string fed to sdbm_hash() to form the hash component
 *
 * @return a calloc'd buffer of OPAL_PATH_MAX bytes holding the
 *         NUL-terminated path; the caller is responsible for freeing it.
 *         NULL if either argument is NULL, if the allocation fails, or if
 *         the generated path does not fit in OPAL_PATH_MAX bytes.
 */
static char *
get_uniq_file_name(const char *base_path, const char *hash_key)
{
    char *uniq_name_buf = NULL;
    unsigned long str_hash = 0;
    pid_t my_pid;
    opal_rng_buff_t rand_buff;
    uint32_t rand_num;
    int name_len;

    /* invalid argument (a NULL base_path must not reach the %s below) */
    if (NULL == base_path || NULL == hash_key) {
        return NULL;
    }
    if (NULL == (uniq_name_buf = calloc(OPAL_PATH_MAX, sizeof(char)))) {
        /* out of resources */
        return NULL;
    }

    my_pid = getpid();
    opal_srand(&rand_buff, ((uint32_t)(time(NULL) + my_pid)));
    rand_num = opal_rand(&rand_buff) % 1024;
    str_hash = sdbm_hash((unsigned char *)hash_key);

    /* build the name; rand_num is < 1024 so the cast to int is lossless */
    name_len = snprintf(uniq_name_buf, OPAL_PATH_MAX,
                        "%s/open_mpi_shmem_mmap.%d_%lu_%d",
                        base_path, (int)my_pid, str_hash, (int)rand_num);

    /* a truncated name would silently point at a different file than
     * intended, so treat an encoding error or truncation as a failure */
    if (name_len < 0 || (size_t)name_len >= (size_t)OPAL_PATH_MAX) {
        free(uniq_name_buf);
        return NULL;
    }

    return uniq_name_buf;
}
/**
 * Send a descriptor through a BML BTL, occasionally injecting a fault.
 *
 * The descriptor's des_context is always set to bml_btl.  While the error
 * countdown is positive (or the ceiling is not positive) the descriptor is
 * simply handed to the BTL's btl_send.  When the countdown has expired, a
 * new countdown is drawn (raised to the floor if it falls below it) and its
 * parity selects the fault:
 *   - odd:  the fragment is "dropped" - the completion callback is invoked
 *           with OMPI_SUCCESS and nothing is sent;
 *   - even: one randomly chosen byte of the first source segment has all of
 *           its bits inverted, the original callback and callback data are
 *           stashed in a freshly allocated context, and
 *           mca_bml_base_completion is installed as the callback before the
 *           send proceeds.  If the context cannot be allocated the fragment
 *           is sent unmodified.
 *
 * @param bml_btl  BML BTL whose btl_send performs the send
 * @param des      descriptor to send
 * @param tag      tag forwarded to btl_send
 *
 * @return OMPI_SUCCESS when the fragment is dropped, otherwise the return
 *         value of btl_send.
 */
int mca_bml_base_send( mca_bml_base_btl_t* bml_btl,
                       mca_btl_base_descriptor_t* des,
                       mca_btl_base_tag_t tag )
{
    des->des_context = (void*) bml_btl;

    if (mca_bml_base_error_count <= 0 && mca_bml_base_error_rate_ceiling > 0) {
        mca_bml_base_context_t *saved;

        /* draw the next countdown in [0, ceiling), then apply the floor */
        mca_bml_base_error_count = (int)
            (((double) mca_bml_base_error_rate_ceiling *
              opal_rand(&mca_bml_base_rand_buff)) / (UINT32_MAX + 1.0));
        if (mca_bml_base_error_count < (double) mca_bml_base_error_rate_floor) {
            mca_bml_base_error_count = (double) mca_bml_base_error_rate_floor;
        }

        if (0 != (mca_bml_base_error_count % 2)) {
            /* local completion - network "drops" packet */
            opal_output(0, "%s:%d: dropping data, with local completion\n", __FILE__, __LINE__);
            des->des_cbfunc(bml_btl->btl, bml_btl->btl_endpoint, des, OMPI_SUCCESS);
            return OMPI_SUCCESS;
        }

        /* corrupt data */
        saved = (mca_bml_base_context_t*) malloc(sizeof(*saved));
        if (NULL != saved) {
            unsigned char *payload;

            opal_output(0, "%s:%d: corrupting data\n", __FILE__, __LINE__);

            /* pick the victim byte within the first source segment */
            saved->index = (size_t)
                ((des->des_src[0].seg_len * opal_rand(&mca_bml_base_rand_buff) * 1.0) /
                 (UINT32_MAX + 1.0));

            /* remember the caller's completion hook before replacing it */
            saved->cbfunc = des->des_cbfunc;
            saved->cbdata = des->des_cbdata;

            payload = (unsigned char*) des->des_src[0].seg_addr.pval;
            payload[saved->index] ^= ~0;

            des->des_cbdata = saved;
            des->des_cbfunc = mca_bml_base_completion;
        }
    }

    mca_bml_base_error_count--;
    return bml_btl->btl_send(bml_btl->btl, bml_btl->btl_endpoint, des, tag);
}
static void sample(void) { float prob; orte_proc_t *child; int i; OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output, "%s sample:ft_tester considering killing something", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* are we including ourselves? */ if ((ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_CMSLAVE) && 0 < mca_sensor_ft_tester_component.daemon_fail_prob) { OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output, "%s sample:ft_tester considering killing me!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* roll the dice */ prob = (double)opal_rand(&orte_sensor_ft_rng_buff) / (double)UINT32_MAX; if (prob < mca_sensor_ft_tester_component.daemon_fail_prob) { /* commit suicide */ OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output, "%s sample:ft_tester committing suicide", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); orte_errmgr.abort(1, NULL); return; } } if (0 < mca_sensor_ft_tester_component.fail_prob) { /* see if we should kill a child */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } if (!child->alive || 0 == child->pid || ORTE_PROC_STATE_UNTERMINATED < child->state) { OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output, "%s sample:ft_tester ignoring child: %s alive %s pid %lu state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&child->name), child->alive ? "TRUE" : "FALSE", (unsigned long)child->pid, orte_proc_state_to_str(child->state))); continue; } /* roll the dice */ prob = (double)opal_rand(&orte_sensor_ft_rng_buff) / (double)UINT32_MAX; OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output, "%s sample:ft_tester child: %s dice: %f prob %f", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&child->name), prob, mca_sensor_ft_tester_component.fail_prob)); if (prob < mca_sensor_ft_tester_component.fail_prob) { /* you shall die... 
*/ OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output, "%s sample:ft_tester killing %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&child->name))); kill(child->pid, SIGTERM); /* are we allowing multiple deaths */ if (!mca_sensor_ft_tester_component.multi_fail) { break; } } } } }
/*
 * Fill a two-element key with values from the OPAL random number generator.
 *
 * A local generator is seeded with the current time and one draw is stored
 * in each of unique_key[0] and unique_key[1].
 *
 * unique_key must point to at least two uint64_t elements.
 */
static inline void orte_pre_condition_transports_use_rand(uint64_t* unique_key)
{
    opal_rng_buff_t rng_state;
    int slot;

    opal_srand(&rng_state, (unsigned int)time(NULL));

    for (slot = 0; slot < 2; ++slot) {
        unique_key[slot] = opal_rand(&rng_state);
    }
}
/*
 * One sampling pass of the evinj sensor: with the configured probability,
 * build a RAS event and inject it via ORCM_RAS_EVENT().
 *
 * If a vector file is open (fp), the next line is read from it; when the
 * file is exhausted it is reopened to start over.  A line has the form
 *
 *   reporter;type;severity[;description[;data]]
 *
 * where
 *   - reporter and description are comma-delimited key:value pairs,
 *   - type is EXCEPTION, TRANSITION, SENSOR or COUNTER,
 *   - severity is EMERGENCY, FATAL, ALERT, CRITICAL, ERROR, WARNING,
 *     NOTICE, INFO, TRACE or DEBUG,
 *   - data is a comma-delimited set of colon-separated triples.
 * Only string values are supported when read from a file.
 *
 * With no vector file, a fixed bogus reporter/description/data set is used
 * and the type and severity are chosen at random.
 *
 * A malformed vector is logged with ORCM_ERR_BAD_PARAM and dropped.  Every
 * path that does not inject the event releases it and frees the split
 * argv arrays.
 *
 * The sampler argument is not used.
 */
static void sample(orcm_sensor_sampler_t *sampler)
{
    float prob, division, check;
    char *vector;
    /* kept NULL whenever not owned so the shared cleanup below is safe;
     * relies on opal_argv_free()/opal_argv_count() accepting NULL */
    char **elements = NULL, **parts = NULL, **pieces = NULL;
    orcm_ras_event_t *rev;
    int i, j;

    OPAL_OUTPUT_VERBOSE((1, orcm_sensor_base_framework.framework_output,
                         "%s sample:evinj considering injecting something",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* roll the dice */
    prob = (double)opal_rand(&mca_sensor_evinj_component.rng_buff) / (double)UINT32_MAX;
    if (!(prob < mca_sensor_evinj_component.prob)) {
        /* nothing to inject this time */
        return;
    }

    rev = OBJ_NEW(orcm_ras_event_t);

    /* if we were given a vector file, read
     * the next vector from the file */
    if (NULL != fp) {
        vector = orcm_getline();
        if (NULL == vector) {
            /* reopen the file to start over */
            fclose(fp);
            fp = fopen(mca_sensor_evinj_component.vector_file, "r");
            if (NULL == fp) {
                /* nothing we can do - but do not leak the event */
                OBJ_RELEASE(rev);
                return;
            }
            vector = orcm_getline();
            if (NULL == vector) {
                /* give up - but do not leak the event */
                OBJ_RELEASE(rev);
                return;
            }
        }
        elements = opal_argv_split(vector, ';');
        free(vector);

        /* reporter, type and severity are mandatory; without this check a
         * short line would hand NULL to strcmp() below */
        if (opal_argv_count(elements) < 3) {
            ORTE_ERROR_LOG(ORCM_ERR_BAD_PARAM);
            goto cleanup;
        }

        i=0;
        /* first field must contain a comma-delimited set of descriptors
         * of the location reporting this event, each descriptor given
         * as a colon-separated key:value pair (only string values are
         * supported when read from a file) */
        parts = opal_argv_split(elements[i], ',');
        for (j=0; NULL != parts && NULL != parts[j]; j++) {
            pieces = opal_argv_split(parts[j], ':');
            if (2 != opal_argv_count(pieces)) {
                ORTE_ERROR_LOG(ORCM_ERR_BAD_PARAM);
                goto cleanup;
            }
            ORCM_RAS_REPORTER(rev, pieces[0], pieces[1], OPAL_STRING);
            opal_argv_free(pieces);
            pieces = NULL;
        }
        opal_argv_free(parts);
        parts = NULL;

        /* next field must be the event type */
        ++i;
        if (0 == strcmp("EXCEPTION", elements[i])) {
            rev->type = ORCM_RAS_EVENT_EXCEPTION;
        } else if (0 == strcmp("TRANSITION", elements[i])) {
            rev->type = ORCM_RAS_EVENT_STATE_TRANSITION;
        } else if (0 == strcmp("SENSOR", elements[i])) {
            rev->type = ORCM_RAS_EVENT_SENSOR;
        } else if (0 == strcmp("COUNTER", elements[i])) {
            rev->type = ORCM_RAS_EVENT_COUNTER;
        } else {
            ORTE_ERROR_LOG(ORCM_ERR_BAD_PARAM);
            goto cleanup;
        }

        /* next field must be the severity */
        ++i;
        if (0 == strcmp("EMERGENCY", elements[i])) {
            rev->severity = ORCM_RAS_EMERG;
        } else if (0 == strcmp("FATAL", elements[i])) {
            rev->severity = ORCM_RAS_FATAL;
        } else if (0 == strcmp("ALERT", elements[i])) {
            rev->severity = ORCM_RAS_ALERT;
        } else if (0 == strcmp("CRITICAL", elements[i])) {
            rev->severity = ORCM_RAS_CRIT;
        } else if (0 == strcmp("ERROR", elements[i])) {
            rev->severity = ORCM_RAS_ERROR;
        } else if (0 == strcmp("WARNING", elements[i])) {
            rev->severity = ORCM_RAS_WARNING;
        } else if (0 == strcmp("NOTICE", elements[i])) {
            rev->severity = ORCM_RAS_NOTICE;
        } else if (0 == strcmp("INFO", elements[i])) {
            rev->severity = ORCM_RAS_INFO;
        } else if (0 == strcmp("TRACE", elements[i])) {
            rev->severity = ORCM_RAS_TRACE;
        } else if (0 == strcmp("DEBUG", elements[i])) {
            rev->severity = ORCM_RAS_DEBUG;
        } else {
            ORTE_ERROR_LOG(ORCM_ERR_BAD_PARAM);
            goto cleanup;
        }

        /* next field is optional - if provided, it will consist
         * of a comma-delimited set of descriptors for this
         * event, each given as a colon-separated key:value pair
         * (only string values are supported when read from a file) */
        ++i;
        if (NULL != elements[i]) {
            parts = opal_argv_split(elements[i], ',');
            for (j=0; NULL != parts && NULL != parts[j]; j++) {
                pieces = opal_argv_split(parts[j], ':');
                if (2 != opal_argv_count(pieces)) {
                    ORTE_ERROR_LOG(ORCM_ERR_BAD_PARAM);
                    goto cleanup;
                }
                ORCM_RAS_DESCRIPTION(rev, pieces[0], pieces[1], OPAL_STRING);
                opal_argv_free(pieces);
                pieces = NULL;
            }
            opal_argv_free(parts);
            parts = NULL;

            /* the final field is also optional - if provided it
             * will consist of a comma-delimited set of data elements for
             * this event (only string values are supported when read
             * from a file) */
            ++i;
            if (NULL != elements[i]) {
                parts = opal_argv_split(elements[i], ',');
                for (j=0; NULL != parts && NULL != parts[j]; j++) {
                    pieces = opal_argv_split(parts[j], ':');
                    /* NOTE(review): three colon-separated pieces are
                     * required here but only the first two are passed
                     * on - confirm what the third is meant to carry */
                    if (3 != opal_argv_count(pieces)) {
                        ORTE_ERROR_LOG(ORCM_ERR_BAD_PARAM);
                        goto cleanup;
                    }
                    ORCM_RAS_DATA(rev, pieces[0], pieces[1], OPAL_STRING);
                    opal_argv_free(pieces);
                    pieces = NULL;
                }
                opal_argv_free(parts);
                parts = NULL;
            }
        }

        /* done with the parsed vector */
        opal_argv_free(elements);
        elements = NULL;
    } else {
        /* just use some bogus location for test purposes */
        ORCM_RAS_REPORTER(rev, ORCM_LOC_CLUSTER, "GRAND-SLAM", OPAL_STRING);
        ORCM_RAS_REPORTER(rev, ORCM_LOC_ROW, "a", OPAL_STRING);
        i = 3;
        ORCM_RAS_REPORTER(rev, ORCM_LOC_RACK, &i, OPAL_INT);
        ORCM_RAS_REPORTER(rev, ORCM_LOC_NODE, "a305", OPAL_STRING);
        ORCM_RAS_REPORTER(rev, ORCM_COMPONENT_OVLYNET, ORCM_SUBCOMPONENT_PROC, OPAL_STRING);

        /* randomly generate the event type: split [0,1] into equal
         * slices and count how many slice boundaries lie below the draw */
        prob = (double)opal_rand(&mca_sensor_evinj_component.rng_buff) / (double)UINT32_MAX;
        division = 1.0 / (float)(ORCM_RAS_EVENT_UNKNOWN_TYPE+1);
        rev->type = 0;
        for (check=division; check < prob; check += division) {
            ++rev->type;
        }

        /* randomly generate the severity the same way */
        prob = (double)opal_rand(&mca_sensor_evinj_component.rng_buff) / (double)UINT32_MAX;
        division = 1.0 / (float)(ORCM_RAS_UNKNOWN+1);
        rev->severity = 0;
        for (check=division; check < prob; check += division) {
            ++rev->severity;
        }

        /* provide some description */
        check = 198.75;
        ORCM_RAS_DESCRIPTION(rev, ORCM_DESC_TEMP_HI, &check, OPAL_FLOAT);
        i = 13789;
        ORCM_RAS_DESCRIPTION(rev, ORCM_DESC_SESSION_ID, &i, OPAL_INT);

        /* provide some data */
        check = 134.8;
        ORCM_RAS_DATA(rev, "outlet avg temp", &check, OPAL_FLOAT);
    }

    opal_output_verbose(1, orcm_sensor_base_framework.framework_output,
                        "%s sample:evinj injecting RAS event",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* inject it into the event generator thread */
    ORCM_RAS_EVENT(rev);
    return;

cleanup:
    /* malformed vector: drop everything we still own */
    opal_argv_free(pieces);
    opal_argv_free(parts);
    opal_argv_free(elements);
    OBJ_RELEASE(rev);
}