Esempio n. 1
0
/*
 * Framework open hook for the BML base.
 *
 * Opens the BML framework's components and, on success, opens the BTL
 * framework.  When OPAL_ENABLE_DEBUG_RELIABILITY is set, it additionally
 * seeds the error-injection random number generator and picks the initial
 * error countdown.
 *
 * Returns the error from mca_base_framework_components_open() if that call
 * does not return OMPI_SUCCESS; otherwise returns whatever
 * mca_base_framework_open() reports for the BTL framework.
 */
static int mca_bml_base_open(mca_base_open_flag_t flags) 
{
    int ret;

    if(OMPI_SUCCESS !=
       (ret = mca_base_framework_components_open(&ompi_bml_base_framework, flags))) {
        return ret;
    }

#if OPAL_ENABLE_DEBUG_RELIABILITY
    /* seed random number generator */
    {
        struct timeval tv;
        gettimeofday(&tv, NULL);
        /* Multiply in uint32_t: unsigned arithmetic wraps modulo 2^32,
         * whereas the signed pid * tv_usec product can overflow (undefined
         * behavior) on platforms where long is 32 bits.  The resulting seed
         * is identical whenever the signed product was well-defined. */
        opal_srand(&mca_bml_base_rand_buff,
                   (uint32_t) getpid() * (uint32_t) tv.tv_usec);
    }

    /* initialize count: a random value in [0, ceiling), only when the
     * configured floor/ceiling pair is usable */
    if(mca_bml_base_error_rate_ceiling > 0 
       && mca_bml_base_error_rate_floor <= mca_bml_base_error_rate_ceiling) {
        mca_bml_base_error_count = (int) (((double) mca_bml_base_error_rate_ceiling * 
                    opal_rand(&mca_bml_base_rand_buff))/(UINT32_MAX+1.0));
    }
#endif

    return mca_base_framework_open(&ompi_btl_base_framework, 0);
}
Esempio n. 2
0
/* Build a backing-file name of the form
 *
 *     <base_path>/open_mpi_shmem_mmap.<pid>_<hash>_<rand>
 *
 * the file name is only guaranteed to be unique on the local host.  if there
 * was a failure that left backing files behind, then no such guarantees can be
 * made.  we use the pid + file_name hash + random number to help avoid issues.
 *
 * base_path  directory the file will live in (must not be NULL).
 * hash_key   string hashed with sdbm_hash() to form part of the name
 *            (must not be NULL).
 *
 * returns a calloc'd buffer of OPAL_PATH_MAX bytes holding the name, or NULL
 * if an argument is NULL, the allocation fails, or the name does not fit in
 * OPAL_PATH_MAX bytes (a truncated name would lose the pid/hash/random suffix
 * that makes it unique).  caller is responsible for freeing the returned
 * buffer.
 */
static char *
get_uniq_file_name(const char *base_path, const char *hash_key)
{
    char *uniq_name_buf = NULL;
    unsigned long str_hash = 0;
    pid_t my_pid;
    opal_rng_buff_t rand_buff;
    uint32_t rand_num;
    int name_len;

    /* invalid argument */
    if (NULL == base_path || NULL == hash_key) {
        return NULL;
    }
    if (NULL == (uniq_name_buf = calloc(OPAL_PATH_MAX, sizeof(char)))) {
        /* out of resources */
        return NULL;
    }

    my_pid = getpid();
    opal_srand(&rand_buff,((uint32_t)(time(NULL) + my_pid)));
    rand_num = opal_rand(&rand_buff) % 1024;
    str_hash = sdbm_hash((unsigned char *)hash_key);
    /* build the name; rand_num is unsigned, so print it with %u */
    name_len = snprintf(uniq_name_buf, OPAL_PATH_MAX,
                        "%s/open_mpi_shmem_mmap.%d_%lu_%u",
                        base_path, (int)my_pid, str_hash,
                        (unsigned int)rand_num);
    if (name_len < 0 || (size_t)name_len >= (size_t)OPAL_PATH_MAX) {
        /* encoding error or the name was truncated */
        free(uniq_name_buf);
        return NULL;
    }

    return uniq_name_buf;
}
/*
 * Send a descriptor through the given BML BTL, occasionally injecting an
 * error first.
 *
 * Each call decrements mca_bml_base_error_count.  When the counter has run
 * out (<= 0) and mca_bml_base_error_rate_ceiling is positive, a new countdown
 * is drawn at random in [0, ceiling) and raised to at least
 * mca_bml_base_error_rate_floor.  The parity of that new value selects the
 * injected fault:
 *   - odd:  the send is "dropped" -- the descriptor's completion callback is
 *           invoked with OMPI_SUCCESS and nothing is handed to the BTL;
 *   - even: one randomly chosen byte of the first source segment is inverted
 *           and the descriptor's callback is replaced by
 *           mca_bml_base_completion, with the original callback, callback
 *           data and corrupted index saved in a heap-allocated context.
 *           NOTE(review): presumably mca_bml_base_completion undoes the
 *           corruption and frees the context -- it is not visible here.
 *
 * Returns OMPI_SUCCESS for a dropped send, otherwise the result of btl_send.
 */
int mca_bml_base_send( mca_bml_base_btl_t* bml_btl, 
                       mca_btl_base_descriptor_t* des, 
                       mca_btl_base_tag_t tag ) 
{ 
    des->des_context = (void*)bml_btl; 
    if(mca_bml_base_error_count <= 0 && mca_bml_base_error_rate_ceiling > 0) {
        mca_bml_base_error_count = (int) (((double) mca_bml_base_error_rate_ceiling * 
                    opal_rand(&mca_bml_base_rand_buff))/(UINT32_MAX+1.0));
        if(mca_bml_base_error_count < (double) mca_bml_base_error_rate_floor) { 
            mca_bml_base_error_count = (double) mca_bml_base_error_rate_floor;
        }
        if(mca_bml_base_error_count % 2) {
            /* local completion - network "drops" packet */
            opal_output(0, "%s:%d: dropping data, with local completion\n", __FILE__, __LINE__);
            des->des_cbfunc(bml_btl->btl, bml_btl->btl_endpoint, des, OMPI_SUCCESS);
            return OMPI_SUCCESS;
        } else if(des->des_src[0].seg_len > 0) {
            /* corrupt data -- only when the first segment actually holds a
             * byte to flip; touching index 0 of an empty segment would write
             * out of bounds */
            mca_bml_base_context_t* ctx = malloc(sizeof(*ctx));
            if(NULL != ctx) {
                opal_output(0, "%s:%d: corrupting data\n", __FILE__, __LINE__);
                /* scale the random draw into [0, seg_len); convert to double
                 * before multiplying so the product cannot wrap in integer
                 * arithmetic */
                ctx->index = (size_t) (((double) des->des_src[0].seg_len * 
                            (double) opal_rand(&mca_bml_base_rand_buff)) / (UINT32_MAX + 1.0));
                ctx->cbfunc = des->des_cbfunc;
                ctx->cbdata = des->des_cbdata;
                /* invert every bit of the chosen byte */
                ((unsigned char*)des->des_src[0].seg_addr.pval)[ctx->index] ^= ~0;
                des->des_cbdata = ctx;
                des->des_cbfunc = mca_bml_base_completion;
            }
            /* if the context could not be allocated the data is sent intact */
        }
    }
    mca_bml_base_error_count--;
    return bml_btl->btl_send( bml_btl->btl,
                              bml_btl->btl_endpoint, 
                              des, tag );
}
Esempio n. 4
0
/*
 * Fault-tester sampling callback.
 *
 * First, if this process is a daemon (or CM slave) and a daemon failure
 * probability is configured, draws a random number and aborts this process
 * through the error manager when the draw falls below that probability.
 *
 * Then, if a child failure probability is configured, walks the local
 * children; for every child that is alive, has a pid and has not passed
 * ORTE_PROC_STATE_UNTERMINATED, draws a random number and sends SIGTERM to
 * the child when the draw falls below the probability.  Unless multi_fail is
 * set, the walk stops after the first child that is signalled.
 */
static void sample(void)
{
    orte_proc_t *victim;
    float roll;
    int idx;

    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                         "%s sample:ft_tester considering killing something",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* should this process itself be a candidate? */
    if ((ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_CMSLAVE) &&
        0 < mca_sensor_ft_tester_component.daemon_fail_prob) {
        OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                             "%s sample:ft_tester considering killing me!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* draw a value in [0, 1] */
        roll = (double)opal_rand(&orte_sensor_ft_rng_buff) / (double)UINT32_MAX;
        if (roll < mca_sensor_ft_tester_component.daemon_fail_prob) {
            /* take ourselves down */
            OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                                 "%s sample:ft_tester committing suicide",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            orte_errmgr.abort(1, NULL);
            return;
        }
    }

    /* nothing more to do unless child failures are enabled */
    if (!(0 < mca_sensor_ft_tester_component.fail_prob)) {
        return;
    }

    /* consider each local child in turn */
    for (idx = 0; idx < orte_local_children->size; ++idx) {
        victim = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx);
        if (NULL == victim) {
            continue;
        }

        /* skip children that are not running */
        if (!victim->alive || 0 == victim->pid ||
            ORTE_PROC_STATE_UNTERMINATED < victim->state) {
            OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                                 "%s sample:ft_tester ignoring child: %s alive %s pid %lu state %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&victim->name),
                                 victim->alive ? "TRUE" : "FALSE",
                                 (unsigned long)victim->pid, orte_proc_state_to_str(victim->state)));
            continue;
        }

        /* draw a value in [0, 1] for this child */
        roll = (double)opal_rand(&orte_sensor_ft_rng_buff) / (double)UINT32_MAX;
        OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                             "%s sample:ft_tester child: %s dice: %f prob %f",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&victim->name),
                             roll, mca_sensor_ft_tester_component.fail_prob));
        if (!(roll < mca_sensor_ft_tester_component.fail_prob)) {
            /* this child survives the round */
            continue;
        }

        /* terminate the selected child */
        OPAL_OUTPUT_VERBOSE((1, orte_sensor_base_framework.framework_output,
                             "%s sample:ft_tester killing %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&victim->name)));
        kill(victim->pid, SIGTERM);

        /* stop after one victim unless multiple failures are allowed */
        if (!mca_sensor_ft_tester_component.multi_fail) {
            break;
        }
    }
}
Esempio n. 5
0
/*
 * Fill unique_key[0] and unique_key[1] with two successive values drawn from
 * a local generator seeded with the current time.  The caller must supply
 * storage for at least two uint64_t words.
 */
static inline void orte_pre_condition_transports_use_rand(uint64_t* unique_key) { 
    opal_rng_buff_t generator;
    int word;

    opal_srand(&generator, (unsigned int)time(NULL));
    for (word = 0; word < 2; ++word) {
        unique_key[word] = opal_rand(&generator);
    }
}
Esempio n. 6
0
/*
 * Event-injection sampling callback.
 *
 * Draws a random number and, when it falls below the component's configured
 * probability, builds a RAS event and hands it to ORCM_RAS_EVENT().
 *
 * If a vector file is open (fp != NULL) the next line is read from it; when
 * the file is exhausted it is reopened and read from the start.  A line is a
 * ';'-separated list of fields:
 *   1. reporter location: ','-separated key:value pairs        (required)
 *   2. event type: EXCEPTION | TRANSITION | SENSOR | COUNTER    (required)
 *   3. severity: EMERGENCY | FATAL | ALERT | CRITICAL | ERROR |
 *                WARNING | NOTICE | INFO | TRACE | DEBUG        (required)
 *   4. description: ','-separated key:value pairs               (optional)
 *   5. data: ','-separated entries                              (optional)
 * A malformed line is logged as ORCM_ERR_BAD_PARAM and no event is injected.
 *
 * Without a vector file, a fixed test location is used and the event type
 * and severity are chosen at random.
 *
 * Every exit that does not inject the event releases it, along with any
 * argv arrays still held.
 *
 * The sampler argument is not used.
 */
static void sample(orcm_sensor_sampler_t *sampler)
{
    float prob, division, check;
    char *vector;
    char **elements = NULL, **parts = NULL, **pieces = NULL;
    orcm_ras_event_t *rev;
    int i, j;

    OPAL_OUTPUT_VERBOSE((1, orcm_sensor_base_framework.framework_output,
                         "%s sample:evinj considering injecting something",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* roll the dice */
    prob = (double)opal_rand(&mca_sensor_evinj_component.rng_buff) / (double)UINT32_MAX;
    if (!(prob < mca_sensor_evinj_component.prob)) {
        /* not this time */
        return;
    }

    rev = OBJ_NEW(orcm_ras_event_t);
    /* if we were given a vector file, read
     * the next vector from the file */
    if (NULL != fp) {
        vector = orcm_getline();
        if (NULL == vector) {
            /* reopen the file to start over */
            fclose(fp);
            fp = fopen(mca_sensor_evinj_component.vector_file, "r");
            if (NULL == fp) {
                /* nothing we can do */
                goto cleanup;
            }
            vector = orcm_getline();
            if (NULL == vector) {
                /* give up */
                goto cleanup;
            }
        }
        elements = opal_argv_split(vector, ';');
        free(vector);
        /* location, type and severity are mandatory: make sure they are
         * present before indexing into the array */
        if (NULL == elements || opal_argv_count(elements) < 3) {
            ORTE_ERROR_LOG(ORCM_ERR_BAD_PARAM);
            goto cleanup;
        }
        i=0;
        /* first field must contain a comma-delimited set of descriptors
         * of the location reporting this event, each descriptor given
         * as a colon-separated key:value pair (only string values are
         * supported when read from a file) */
        parts = opal_argv_split(elements[i], ',');
        if (NULL == parts) {
            ORTE_ERROR_LOG(ORCM_ERR_BAD_PARAM);
            goto cleanup;
        }
        for (j=0; NULL != parts[j]; j++) {
            pieces = opal_argv_split(parts[j], ':');
            if (2 != opal_argv_count(pieces)) {
                ORTE_ERROR_LOG(ORCM_ERR_BAD_PARAM);
                goto cleanup;
            }
            ORCM_RAS_REPORTER(rev, pieces[0], pieces[1], OPAL_STRING);
            opal_argv_free(pieces);
            pieces = NULL;
        }
        opal_argv_free(parts);
        parts = NULL;
        /* next field must be the event type */
        ++i;
        if (0 == strcmp("EXCEPTION", elements[i])) {
            rev->type = ORCM_RAS_EVENT_EXCEPTION;
        } else if (0 == strcmp("TRANSITION", elements[i])) {
            rev->type = ORCM_RAS_EVENT_STATE_TRANSITION;
        } else if (0 == strcmp("SENSOR", elements[i])) {
            rev->type = ORCM_RAS_EVENT_SENSOR;
        } else if (0 == strcmp("COUNTER", elements[i])) {
            rev->type = ORCM_RAS_EVENT_COUNTER;
        } else {
            ORTE_ERROR_LOG(ORCM_ERR_BAD_PARAM);
            goto cleanup;
        }
        /* next field must be the severity */
        ++i;
        if (0 == strcmp("EMERGENCY", elements[i])) {
            rev->severity = ORCM_RAS_EMERG;
        } else if (0 == strcmp("FATAL", elements[i])) {
            rev->severity = ORCM_RAS_FATAL;
        } else if (0 == strcmp("ALERT", elements[i])) {
            rev->severity = ORCM_RAS_ALERT;
        } else if (0 == strcmp("CRITICAL", elements[i])) {
            rev->severity = ORCM_RAS_CRIT;
        } else if (0 == strcmp("ERROR", elements[i])) {
            rev->severity = ORCM_RAS_ERROR;
        } else if (0 == strcmp("WARNING", elements[i])) {
            rev->severity = ORCM_RAS_WARNING;
        } else if (0 == strcmp("NOTICE", elements[i])) {
            rev->severity = ORCM_RAS_NOTICE;
        } else if (0 == strcmp("INFO", elements[i])) {
            rev->severity = ORCM_RAS_INFO;
        } else if (0 == strcmp("TRACE", elements[i])) {
            rev->severity = ORCM_RAS_TRACE;
        } else if (0 == strcmp("DEBUG", elements[i])) {
            rev->severity = ORCM_RAS_DEBUG;
        } else {
            ORTE_ERROR_LOG(ORCM_ERR_BAD_PARAM);
            goto cleanup;
        }
        /* next field is optional - if provided, it will consist
         * of a comma-delimited set of descriptors for this
         * event, each given as a colon-separated key:value pair
         * (only string values are supported when read from a file) */
        ++i;
        if (NULL == elements[i]) {
            /* we are done */
            opal_argv_free(elements);
            goto execute;
        }
        parts = opal_argv_split(elements[i], ',');
        if (NULL == parts) {
            ORTE_ERROR_LOG(ORCM_ERR_BAD_PARAM);
            goto cleanup;
        }
        for (j=0; NULL != parts[j]; j++) {
            pieces = opal_argv_split(parts[j], ':');
            if (2 != opal_argv_count(pieces)) {
                ORTE_ERROR_LOG(ORCM_ERR_BAD_PARAM);
                goto cleanup;
            }
            ORCM_RAS_DESCRIPTION(rev, pieces[0], pieces[1], OPAL_STRING);
            opal_argv_free(pieces);
            pieces = NULL;
        }
        opal_argv_free(parts);
        parts = NULL;
        /* the final field is also optional - if provided it
         * will consist of a comma-delimited set of data elements for this
         * event (only string values are supported when read from a file) */
        ++i;
        if (NULL == elements[i]) {
            /* we are done */
            opal_argv_free(elements);
            goto execute;
        }
        parts = opal_argv_split(elements[i], ',');
        if (NULL == parts) {
            ORTE_ERROR_LOG(ORCM_ERR_BAD_PARAM);
            goto cleanup;
        }
        for (j=0; NULL != parts[j]; j++) {
            pieces = opal_argv_split(parts[j], ':');
            /* NOTE(review): three colon-separated pieces are required here
             * although only the first two are consumed, and the other lists
             * use key:value pairs -- confirm whether 3 is intended */
            if (3 != opal_argv_count(pieces)) {
                ORTE_ERROR_LOG(ORCM_ERR_BAD_PARAM);
                goto cleanup;
            }
            ORCM_RAS_DATA(rev, pieces[0], pieces[1], OPAL_STRING);
            opal_argv_free(pieces);
            pieces = NULL;
        }
        opal_argv_free(parts);
        parts = NULL;
        opal_argv_free(elements);
    } else {
        /* just use some bogus location for test purposes */
        ORCM_RAS_REPORTER(rev, ORCM_LOC_CLUSTER, "GRAND-SLAM", OPAL_STRING);
        ORCM_RAS_REPORTER(rev, ORCM_LOC_ROW, "a", OPAL_STRING);
        i = 3;
        ORCM_RAS_REPORTER(rev, ORCM_LOC_RACK, &i, OPAL_INT);
        ORCM_RAS_REPORTER(rev, ORCM_LOC_NODE, "a305", OPAL_STRING);
        ORCM_RAS_REPORTER(rev, ORCM_COMPONENT_OVLYNET, ORCM_SUBCOMPONENT_PROC, OPAL_STRING);
        /* randomly generate the event type: split [0,1] into equal slices,
         * one per type value, and count how many slices lie below the draw */
        prob = (double)opal_rand(&mca_sensor_evinj_component.rng_buff) / (double)UINT32_MAX;
        division = 1.0 / (float)(ORCM_RAS_EVENT_UNKNOWN_TYPE+1);
        rev->type = 0;
        for (check=division; check < prob; check += division) {
            ++rev->type;
        }
        /* randomly generate the severity the same way */
        prob = (double)opal_rand(&mca_sensor_evinj_component.rng_buff) / (double)UINT32_MAX;
        division = 1.0 / (float)(ORCM_RAS_UNKNOWN+1);
        rev->severity = 0;
        for (check=division; check < prob; check += division) {
            ++rev->severity;
        }
        /* provide some description */
        check = 198.75;
        ORCM_RAS_DESCRIPTION(rev, ORCM_DESC_TEMP_HI, &check, OPAL_FLOAT);
        i = 13789;
        ORCM_RAS_DESCRIPTION(rev, ORCM_DESC_SESSION_ID, &i, OPAL_INT);
        /* provide some data */
        check = 134.8;
        ORCM_RAS_DATA(rev, "outlet avg temp", &check, OPAL_FLOAT);
    }

  execute:
    opal_output_verbose(1, orcm_sensor_base_framework.framework_output,
                         "%s sample:evinj injecting RAS event",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* inject it into the event generator thread */
    ORCM_RAS_EVENT(rev);
    return;

  cleanup:
    /* abandon the event: free whatever argv arrays are still held and
     * drop our reference to the event so it is not leaked */
    if (NULL != pieces) {
        opal_argv_free(pieces);
    }
    if (NULL != parts) {
        opal_argv_free(parts);
    }
    if (NULL != elements) {
        opal_argv_free(elements);
    }
    OBJ_RELEASE(rev);
}