Example #1
static int finalize(void)
{
    opal_list_item_t* item;
    orte_iof_write_output_t *output;
    orte_iof_write_event_t *wev;
    int num_written;
    bool dump;
    int i;
    orte_job_t *jdata;

    /* check if anything is still trying to be written out */
    wev = orte_iof_base.iof_write_stdout->wev;
    if (!opal_list_is_empty(&wev->outputs)) {
        dump = false;
        /* make one last attempt to write this out */
        while (NULL != (item = opal_list_remove_first(&wev->outputs))) {
            output = (orte_iof_write_output_t*)item;
            if (!dump) {
                num_written = write(wev->fd, output->data, output->numbytes);
                if (num_written < output->numbytes) {
                    /* don't retry - just clean out the list and dump it */
                    dump = true;
                }
            }
            OBJ_RELEASE(output);
        }
    }
    if (!orte_xml_output) {
        /* we only opened stderr channel if we are NOT doing xml output */
        wev = orte_iof_base.iof_write_stderr->wev;
        if (!opal_list_is_empty(&wev->outputs)) {
            dump = false;
            /* make one last attempt to write this out */
            while (NULL != (item = opal_list_remove_first(&wev->outputs))) {
                output = (orte_iof_write_output_t*)item;
                if (!dump) {
                    num_written = write(wev->fd, output->data, output->numbytes);
                    if (num_written < output->numbytes) {
                        /* don't retry - just clean out the list and dump it */
                        dump = true;
                    }
                }
                OBJ_RELEASE(output);
            }
        }
    }
    
    orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_IOF_HNP);

    /* clear our stdin job array */
    for (i=0; i < mca_iof_mr_hnp_component.stdin_jobs.size; i++) {
        if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(&mca_iof_mr_hnp_component.stdin_jobs, i))) {
            continue;
        }
        OBJ_RELEASE(jdata);
    }
    OBJ_DESTRUCT(&mca_iof_mr_hnp_component.stdin_jobs);

    return ORTE_SUCCESS;
}
Example #2
/*
 * See description in iof_base_endpoint.h
 */
bool orte_iof_base_endpoint_have_pending_frags(
    orte_iof_base_endpoint_t* endpoint)
{
    if (ORTE_IOF_SOURCE == endpoint->ep_mode) {
        return !opal_list_is_empty(&endpoint->ep_source_frags);
    } else {
        return !opal_list_is_empty(&endpoint->ep_sink_frags);
    }
}
Example #3
static int finalize(void)
{
    opal_list_item_t* item;
    orte_iof_write_output_t *output;
    orte_iof_write_event_t *wev;
    int num_written;
    bool dump;

    /* check if anything is still trying to be written out */
    wev = orte_iof_base.iof_write_stdout->wev;
    if (!opal_list_is_empty(&wev->outputs)) {
        dump = false;
        /* make one last attempt to write this out */
        while (NULL != (item = opal_list_remove_first(&wev->outputs))) {
            output = (orte_iof_write_output_t*)item;
            if (!dump) {
                num_written = write(wev->fd, output->data, output->numbytes);
                if (num_written < output->numbytes) {
                    /* don't retry - just clean out the list and dump it */
                    dump = true;
                }
            }
            OBJ_RELEASE(output);
        }
    }
    OBJ_RELEASE(orte_iof_base.iof_write_stdout);
    if (!orte_xml_output) {
        /* we only opened stderr channel if we are NOT doing xml output */
        wev = orte_iof_base.iof_write_stderr->wev;
        if (!opal_list_is_empty(&wev->outputs)) {
            dump = false;
            /* make one last attempt to write this out */
            while (NULL != (item = opal_list_remove_first(&wev->outputs))) {
                output = (orte_iof_write_output_t*)item;
                if (!dump) {
                    num_written = write(wev->fd, output->data, output->numbytes);
                    if (num_written < output->numbytes) {
                        /* don't retry - just clean out the list and dump it */
                        dump = true;
                    }
                }
                OBJ_RELEASE(output);
            }
        }
        OBJ_RELEASE(orte_iof_base.iof_write_stderr);
    }

    /* Cancel the RML receive */
    orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_IOF_PROXY);

    return ORTE_SUCCESS;
}
Example #4
int orte_ras_base_node_segment_empty(bool *empty)
{
    int ret;
    opal_list_t nodes;
    opal_list_item_t *item;

    /* See what's already on the node segment */

    OBJ_CONSTRUCT(&nodes, opal_list_t);
    if (ORTE_SUCCESS != (ret = orte_ras_base_node_query(&nodes))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&nodes);
        return ret;
    }

    *empty = opal_list_is_empty(&nodes);

    /* Free the list */

    while (NULL != (item = opal_list_remove_first(&nodes))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&nodes);

    /* All done */

    return ORTE_SUCCESS;
}
Example #5
static void stop(orte_jobid_t jobid)
{
    opal_list_item_t *item;
    file_tracker_t *ft;
    
    /* cannot monitor my own job */
    if (jobid == ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) {
        return;
    }
    
    for (item = opal_list_get_first(&jobs);
         item != opal_list_get_end(&jobs);
         item = opal_list_get_next(item)) {
        ft = (file_tracker_t*)item;
        if (jobid == ft->jobid || ORTE_JOBID_WILDCARD == jobid) {
            opal_list_remove_item(&jobs, item);
            OBJ_RELEASE(item);
        }
    }
    /* if no jobs remain, stop the sampling */
    if (opal_list_is_empty(&jobs) && NULL != sample_ev) {
        opal_event_del(sample_ev);
        free(sample_ev);
        sample_ev = NULL;
    }
    return;
}
Example #6
/**
 * Function to remove previously registered memory from the tree without freeing it
 *
 * @param base pointer to the memory to free
 *
 * @retval OMPI_SUCCESS
 * @retval OMPI_ERR_BAD_PARAM if the passed base pointer was invalid
 */
int mca_rcache_vma_tree_delete(mca_rcache_vma_module_t* vma_rcache, 
                          mca_mpool_base_registration_t* reg)
{
    mca_rcache_vma_t *vma;

    vma = (mca_rcache_vma_t*)ompi_rb_tree_find_with(&vma_rcache->rb_tree, reg->base, 
            mca_rcache_vma_tree_node_compare_search);

    if(!vma)
        return OMPI_ERROR;

    while(vma != (mca_rcache_vma_t*)opal_list_get_end(&vma_rcache->vma_list)
            && vma->start <= (uintptr_t)reg->bound) {
        mca_rcache_vma_remove_reg(vma, reg);
        
        if(opal_list_is_empty(&vma->reg_list)) {
            mca_rcache_vma_t *next = (mca_rcache_vma_t*)opal_list_get_next(&vma->super);
            ompi_rb_tree_delete(&vma_rcache->rb_tree, vma);
            mca_rcache_vma_update_byte_count(vma_rcache,
                    vma->start - vma->end - 1);
            opal_list_remove_item(&vma_rcache->vma_list, &vma->super);
            opal_list_append(&vma_rcache->vma_delete_list, &vma->super);
            vma = next;
        } else {
            int merged;

            do {
                mca_rcache_vma_t *prev = NULL, *next = NULL;
                if(opal_list_get_begin(&vma_rcache->vma_list) != 
                        opal_list_get_prev(vma))
                    prev = (mca_rcache_vma_t*)opal_list_get_prev(vma);
                merged = 0;

                if(prev && vma->start == prev->end + 1 &&
                        mca_rcache_vma_compare_reg_lists(vma, prev)) {
                    prev->end = vma->end;
                    opal_list_remove_item(&vma_rcache->vma_list, &vma->super);
                    ompi_rb_tree_delete(&vma_rcache->rb_tree, vma);
                    opal_list_append(&vma_rcache->vma_delete_list, &vma->super);
                    vma = prev;
                    merged = 1;
                }
                if(opal_list_get_end(&vma_rcache->vma_list) != 
                        opal_list_get_next(vma))
                    next = (mca_rcache_vma_t*)opal_list_get_next(vma);

                if(next && vma->end + 1 == next->start &&
                        mca_rcache_vma_compare_reg_lists(vma, next)) {
                    vma->end = next->end;
                    opal_list_remove_item(&vma_rcache->vma_list, &next->super);
                    ompi_rb_tree_delete(&vma_rcache->rb_tree, next);
                    opal_list_append(&vma_rcache->vma_delete_list, &next->super);
                    merged = 1;
                }
            } while(merged);
            vma = (mca_rcache_vma_t*)opal_list_get_next(vma);
        }
    }
    return 0;
}
Example #7
/*
 * called when the connect module has completed setup of an endpoint
 */
void mca_btl_wv_endpoint_connected(mca_btl_wv_endpoint_t *endpoint)
{
    opal_list_item_t *frag_item;
    mca_btl_wv_send_frag_t *frag;
    bool master = false;

    opal_output(-1, "Now we are CONNECTED");
    endpoint->endpoint_state = MCA_BTL_IB_CONNECTED;
    endpoint->endpoint_btl->device->non_eager_rdma_endpoints++;

    /* The connection is correctly setup. Now we can decrease the
       event trigger. */
    opal_progress_event_users_decrement();

    /* Process pending packet on the endpoint */
    /* While there are frags in the list, process them */
    while (!opal_list_is_empty(&(endpoint->pending_lazy_frags))) {
        frag_item = opal_list_remove_first(&(endpoint->pending_lazy_frags));
        frag = to_send_frag(frag_item);
        /* We need to post this one */

        if(OMPI_SUCCESS != mca_btl_wv_endpoint_post_send(endpoint, frag))
            BTL_ERROR(("Error posting send"));
    }
    OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);

    /* if upper layer called put or get before connection moved to connected
     * state then we restart them here */
    mca_btl_wv_frag_progress_pending_put_get(endpoint,
            mca_btl_wv_component.rdma_qp);
}
Example #8
int
ompi_osc_portals4_free(struct ompi_win_t *win)
{
    ompi_osc_portals4_module_t *module =
        (ompi_osc_portals4_module_t*) win->w_osc_module;
    int ret = OMPI_SUCCESS;

    /* synchronize */
    module->comm->c_coll.coll_barrier(module->comm,
                                      module->comm->c_coll.coll_barrier_module);

    /* cleanup */
    PtlMEUnlink(module->data_me_h);
    PtlMDRelease(module->md_h);
    PtlMDRelease(module->req_md_h);
    PtlCTFree(module->ct_h);
    if (NULL != module->disp_units) free(module->disp_units);
    ompi_comm_free(&module->comm);
    if (NULL != module->free_after) free(module->free_after);

    if (!opal_list_is_empty(&module->outstanding_locks)) {
        ret = OMPI_ERR_RMA_SYNC;
    }
    OBJ_DESTRUCT(&module->outstanding_locks);

    free(module);

    return ret;
}
Example #9
int mca_io_ompio_file_sync (ompi_file_t *fh)
{
    int ret = OMPI_SUCCESS;
    mca_common_ompio_data_t *data;

    data = (mca_common_ompio_data_t *) fh->f_io_selected_data;

    OPAL_THREAD_LOCK(&fh->f_lock);
    if ( !opal_list_is_empty (&mca_common_ompio_pending_requests) ) {
        OPAL_THREAD_UNLOCK(&fh->f_lock);
        return MPI_ERR_OTHER;
    }

    if ( data->ompio_fh.f_amode & MPI_MODE_RDONLY ) {
        OPAL_THREAD_UNLOCK(&fh->f_lock);
        return MPI_ERR_ACCESS;
    }        
    // Make sure all processes reach this point before syncing the file.
    ret = data->ompio_fh.f_comm->c_coll->coll_barrier (data->ompio_fh.f_comm,
                                                       data->ompio_fh.f_comm->c_coll->coll_barrier_module);
    if ( MPI_SUCCESS != ret ) {
        OPAL_THREAD_UNLOCK(&fh->f_lock);
        return ret;
    }
    ret = data->ompio_fh.f_fs->fs_file_sync (&data->ompio_fh);
    OPAL_THREAD_UNLOCK(&fh->f_lock);

    return ret;
}
Example #10
/**
 * Discover available (pre-allocated) nodes and report
 * them back to the caller.
 *  
 */
static int allocate(opal_list_t *nodes)
{
    int ret;
    char *pbs_jobid;

    /* get our PBS jobid from the environment */
    if (NULL == (pbs_jobid = getenv("PBS_JOBID"))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    
    /* save that value in the global job ident string for
     * later use in any error reporting
     */
    orte_job_ident = strdup(pbs_jobid);
    
    if (ORTE_SUCCESS != (ret = discover(nodes, pbs_jobid))) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }
    
    /* in the TM world, if we didn't find anything, then this
     * is an unrecoverable error - report it
     */
    if (opal_list_is_empty(nodes)) {
        orte_show_help("help-ras-tm.txt", "no-nodes-found", true, filename);
        return ORTE_ERR_NOT_FOUND;
    }
    
    /* All done */
    return ORTE_SUCCESS;
}
Example #11
static void udsensors_send_log_to_analytics(opal_list_t *key, opal_list_t *non_compute, opal_list_t *compute)
{
    orcm_analytics_value_t *analytics_vals = NULL;

    if (!opal_list_is_empty(compute)){
        /* send data to analytics */
        analytics_vals = orcm_util_load_orcm_analytics_value(key, non_compute, compute);
        orcm_analytics.send_data(analytics_vals);
    }
    SAFE_RELEASE(analytics_vals);
}
Example #12
/*
 * Close the component
 */
static int basesmuma_close(void)
{
    int ret;
    bcol_basesmuma_registration_data_t *net_ctx;
    bcol_base_network_context_t *net_reg;
    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;

    /* gvm Leak FIX */
    while (!opal_list_is_empty(&(cs->ctl_structures))) {
        opal_list_item_t *item;
        item = opal_list_remove_first(&(cs->ctl_structures));
        OBJ_DESTRUCT(item);
    }
    OBJ_DESTRUCT(&(cs->ctl_structures));


    /* deregister the progress function */
    ret=opal_progress_unregister(bcol_basesmuma_progress);
    if (MPI_SUCCESS != ret) {
        opal_output(0, "failed to unregister the progress function\n");
    }

    /* remove the control structure backing file */
    ret=mca_bcol_basesmuma_deregister_ctl_sm(&mca_bcol_basesmuma_component);
    if (MPI_SUCCESS != ret) {
        opal_output(0, "failed to remove control structure backing file\n");
    }

    /* remove the network contexts - only one network context defined for
     * this component.
     */
    /* file_name is returned by asprintf, so we need to free the resource */
    if(mca_bcol_basesmuma_component.super.network_contexts ) {
        net_reg=(bcol_base_network_context_t *)
            mca_bcol_basesmuma_component.super.network_contexts[0];
        if(net_reg) {
            net_ctx=(bcol_basesmuma_registration_data_t *)net_reg->context_data;
            if( net_ctx) {
                if(net_ctx->file_name) {
                    free(net_ctx->file_name);
                }
                free(net_ctx);
            }
            free(net_reg);
        }
        free(mca_bcol_basesmuma_component.super.network_contexts);
        mca_bcol_basesmuma_component.super.network_contexts=NULL;
    }

    /* normal return */
    return OMPI_SUCCESS;
}
Example #13
/* In case the XRC recv qp was closed and the sender still doesn't know about it,
 * we need to close the qp, reset the ib_addr status to CLOSED, and start everything
 * from scratch.
 */
static void xoob_restart_connect(mca_btl_base_endpoint_t *endpoint)
{
    BTL_VERBOSE(("Restarting the connection for the endpoint"));
    OPAL_THREAD_LOCK(&endpoint->ib_addr->addr_lock);
    switch (endpoint->ib_addr->status) {
        case MCA_BTL_IB_ADDR_CONNECTED:
            /* so we have the send qp, we just need the receive side.
             * Send a request for the SRQ numbers */
            BTL_VERBOSE(("Restart The IB addr: sid %" PRIx64 " lid %d"
                         "in MCA_BTL_IB_ADDR_CONNECTED status,"
                         " Changing to MCA_BTL_IB_ADDR_CLOSED and starting from scratch\n",
                         endpoint->ib_addr->subnet_id,endpoint->ib_addr->lid));
            /* Switching back to closed and starting from scratch */
            endpoint->ib_addr->status = MCA_BTL_IB_ADDR_CLOSED;
            /* destroy the qp */
            /* the receiver side was already closed, so all pending lists must be clean! */
            assert (opal_list_is_empty(&endpoint->qps->no_wqe_pending_frags[0]));
            assert (opal_list_is_empty(&endpoint->qps->no_wqe_pending_frags[1]));
            if(ibv_destroy_qp(endpoint->qps[0].qp->lcl_qp))
                BTL_ERROR(("Failed to destroy QP"));
        case MCA_BTL_IB_ADDR_CLOSED:
        case MCA_BTL_IB_ADDR_CONNECTING:
            BTL_VERBOSE(("Restart The IB addr: sid %" PRIx64 " lid %d"
                         "in MCA_BTL_IB_ADDR_CONNECTING or MCA_BTL_IB_ADDR_CLOSED status,"
                         " starting from scratch\n",
                         endpoint->ib_addr->subnet_id,endpoint->ib_addr->lid));
            OPAL_THREAD_UNLOCK(&endpoint->ib_addr->addr_lock);
            /* xoob_module_start_connect() should automatically handle all other cases */
            if (OMPI_SUCCESS != xoob_module_start_connect(NULL, endpoint))
                BTL_ERROR(("Failed to restart connection from MCA_BTL_IB_ADDR_CONNECTING/CLOSED"));
            break;
        default :
            BTL_ERROR(("Invalid endpoint status %d", endpoint->ib_addr->status));
            OPAL_THREAD_UNLOCK(&endpoint->ib_addr->addr_lock);
    }
}
Example #14
int
orte_sds_base_close(void)
{
    /* finalize running component */
    if (NULL != orte_sds_base_module) {
        orte_sds_base_module->finalize();
    }

    /* shutdown any remaining opened components */
    if (! opal_list_is_empty(&orte_sds_base_components_available)) {
        mca_base_components_close(0, 
                                  &orte_sds_base_components_available, NULL);
    }
    OBJ_DESTRUCT(&orte_sds_base_components_available);
    return ORTE_SUCCESS;
}
Example #15
int ompi_osc_ucx_free(struct ompi_win_t *win) {
    ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
    int i, ret = OMPI_SUCCESS;

    if ((module->epoch_type.access != NONE_EPOCH && module->epoch_type.access != FENCE_EPOCH)
        || module->epoch_type.exposure != NONE_EPOCH) {
        ret = OMPI_ERR_RMA_SYNC;
    }

    if (module->start_group != NULL || module->post_group != NULL) {
        ret = OMPI_ERR_RMA_SYNC;
    }

    assert(module->global_ops_num == 0);
    assert(module->lock_count == 0);
    assert(opal_list_is_empty(&module->pending_posts) == true);
    OBJ_DESTRUCT(&module->outstanding_locks);
    OBJ_DESTRUCT(&module->pending_posts);

    while (module->state.lock != TARGET_LOCK_UNLOCKED) {
        /* not sure if this is required */
        ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
    }

    ret = module->comm->c_coll->coll_barrier(module->comm,
                                             module->comm->c_coll->coll_barrier_module);

    for (i = 0; i < ompi_comm_size(module->comm); i++) {
        ucp_rkey_destroy((module->win_info_array[i]).rkey);
        ucp_rkey_destroy((module->state_info_array[i]).rkey);
    }
    free(module->win_info_array);
    free(module->state_info_array);

    free(module->per_target_ops_nums);

    ucp_mem_unmap(mca_osc_ucx_component.ucp_context, module->memh);
    ucp_mem_unmap(mca_osc_ucx_component.ucp_context, module->state_memh);

    if (module->disp_units) free(module->disp_units);
    ompi_comm_free(&module->comm);

    free(module);

    return ret;
}
Example #16
static int orcm_octl_logical_group_print_list(opal_hash_table_t *groups)
{
    char *key = NULL;
    size_t key_size = 0;
    opal_list_t *value = NULL;
    opal_list_t *new_value = NULL;
    void *in_member = NULL;
    void *o_member = NULL;
    orcm_logical_group_member_t *member_item = NULL;

    while (ORCM_SUCCESS == opal_hash_table_get_next_key_ptr(groups, (void**)&key,
                                         &key_size, (void**)&value, in_member, &o_member)) {
        new_value = orcm_logical_group_convert_members_list(value, MAX_LINE_LENGTH);
        if (NULL != new_value && !opal_list_is_empty(new_value)) {
            ORCM_UTIL_MSG_WITH_ARG("\ngroup name=%s", key);
            OPAL_LIST_FOREACH(member_item, new_value, orcm_logical_group_member_t) {
                ORCM_UTIL_MSG_WITH_ARG("member list=%s", member_item->member);
            }
        }
Example #17
oshmem_group_t* find_group_in_cache(int PE_start, int logPE_stride, int PE_size)
{
    int cache_look_up_id[3] = { PE_start, logPE_stride, PE_size };
    opal_list_item_t *item;
    if (opal_list_is_empty(&oshmem_group_cache_list)) {
        return NULL ;
    }

    for (item = opal_list_get_first(&oshmem_group_cache_list);
            item && (item != opal_list_get_end(&oshmem_group_cache_list));
            item = opal_list_get_next(item)) {
        if (!memcmp(((oshmem_group_cache_t *) item)->cache_id,
                    cache_look_up_id,
                    3 * sizeof(int))) {
            return ((oshmem_group_cache_t *) item)->group;
        }
    }
    return NULL ;
}
Example #18
/*
 * Discover available (pre-allocated) nodes.  Allocate the
 * requested number of nodes/process slots to the job.
 */
static int orte_ras_loadleveler_allocate(orte_job_t *jdata, opal_list_t *nodes)
{
    int ret = ORTE_SUCCESS;

    if (ORTE_SUCCESS != (ret = orte_ras_loadleveler_discover(nodes))) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

     /* If we didn't find anything, then this
      * is an unrecoverable error - report it
      */
    if (opal_list_is_empty(nodes)) {
        opal_output(orte_ras_base.ras_output,
                "ras:loadleveler:allocate: No nodes were found in the LOADL_HOSTFILE - %s",
		getenv("LOADL_HOSTFILE"));
        return ORTE_ERR_NOT_FOUND;
    }
    
    return ret;
}
Example #19
/*
 * Forcibly drain all pending output on an endpoint, without waiting for
 * actual completion.
 */
void
ompi_btl_usnic_flush_endpoint(
    ompi_btl_usnic_endpoint_t *endpoint)
{
    ompi_btl_usnic_send_frag_t *frag;

    /* First, free all pending fragments */
    while (!opal_list_is_empty(&endpoint->endpoint_frag_send_queue)) {
        frag = (ompi_btl_usnic_send_frag_t *)opal_list_remove_first(
                &endpoint->endpoint_frag_send_queue);

        /* _cond still needs to check ownership, but make sure the 
         * fragment is marked as done.
         */
        frag->sf_ack_bytes_left = 0;
        frag->sf_seg_post_cnt = 0;
        ompi_btl_usnic_send_frag_return_cond(endpoint->endpoint_module, frag);
    }

    /* Now, ACK everything that is pending */
    ompi_btl_usnic_handle_ack(endpoint, endpoint->endpoint_next_seq_to_send-1);
}
Example #20
/* This function must be called with the rcache lock held */
static void do_unregistration_gc(struct mca_mpool_base_module_t *mpool)
{
    mca_mpool_rdma_module_t *mpool_rdma = (mca_mpool_rdma_module_t*)mpool;
    mca_mpool_base_registration_t *reg;

    do {
        /* Remove registration from garbage collection list
           before deregistering it */
        reg = (mca_mpool_base_registration_t *)
            opal_list_remove_first(&mpool_rdma->gc_list);
        mpool->rcache->rcache_delete(mpool->rcache, reg);

        /* Drop the rcache lock before calling dereg_mem as there
           may be memory allocations */
        OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
        dereg_mem(mpool, reg);
        OPAL_THREAD_LOCK(&mpool->rcache->lock);

        OMPI_FREE_LIST_RETURN(&mpool_rdma->reg_list,
                (ompi_free_list_item_t*)reg);
    } while(!opal_list_is_empty(&mpool_rdma->gc_list));
}
Example #21
static int ompi_comm_register_cid (uint32_t cid )
{
    opal_list_item_t *item;
    ompi_comm_reg_t *regcom;
    ompi_comm_reg_t *newentry = OBJ_NEW(ompi_comm_reg_t);

    newentry->cid = cid;
    if ( !(opal_list_is_empty (&ompi_registered_comms)) ) {
        for (item = opal_list_get_first(&ompi_registered_comms);
             item != opal_list_get_end(&ompi_registered_comms);
             item = opal_list_get_next(item)) {
            regcom = (ompi_comm_reg_t *)item;
            if ( regcom->cid > cid ) {
                break;
            }
#if OMPI_ENABLE_THREAD_MULTIPLE
            if( regcom->cid == cid ) {
                /**
                 * The MPI standard states that it is the user's responsibility to
                 * schedule the global communications in order to avoid any
                 * kind of trouble. As managing communicators involves several
                 * collective communications, we should enforce a sequential
                 * execution order. This test only allows one communicator
                 * creation function based on the same communicator.
                 */
                OBJ_RELEASE(newentry);
                return OMPI_ERROR;
            }
#endif  /* OMPI_ENABLE_THREAD_MULTIPLE */
        }
        opal_list_insert_pos (&ompi_registered_comms, item, 
                              (opal_list_item_t *)newentry);
    }
    else {
        opal_list_append (&ompi_registered_comms, (opal_list_item_t *)newentry);
    }

    return OMPI_SUCCESS;
}
Example #22
mca_bcol_base_lmngr_block_t* mca_coll_ml_lmngr_alloc (
        mca_coll_ml_lmngr_t *lmngr)
{
    int rc;
    opal_list_t *list = &lmngr->blocks_list;

    /* Check if the list manager was initialized */
    if(OPAL_UNLIKELY(NULL == lmngr->base_addr)) {
        ML_VERBOSE(7 ,("Starting memory initialization"));
        rc = mca_coll_ml_lmngr_init(lmngr);
        if (OMPI_SUCCESS != rc) {
            ML_ERROR(("Failed to init memory"));
            return NULL;
        }
    }

    if(OPAL_UNLIKELY(opal_list_is_empty(list))) {
        /* Upper layer needs to handle the NULL */
        ML_VERBOSE(1, ("List manager is empty."));
        return NULL;
    }

    return (mca_bcol_base_lmngr_block_t *)opal_list_remove_first(list);
}
Example #23
static void mca_btl_mvapi_endpoint_connected(mca_btl_mvapi_endpoint_t *endpoint)
{
    opal_list_item_t *frag_item;
    mca_btl_mvapi_frag_t *frag;
    mca_btl_mvapi_module_t* mvapi_btl; 

    /* While there are frags in the list, process them */
    endpoint->endpoint_state = MCA_BTL_IB_CONNECTED;

    /**
     * The connection is correctly setup. Now we can decrease the event trigger.
     */
    opal_progress_event_decrement();

    while(!opal_list_is_empty(&(endpoint->pending_send_frags))) {
        frag_item = opal_list_remove_first(&(endpoint->pending_send_frags));
        frag = (mca_btl_mvapi_frag_t *) frag_item;
        mvapi_btl = endpoint->endpoint_btl;
        /* We need to post this one */
        
        if(OMPI_SUCCESS !=  mca_btl_mvapi_endpoint_post_send(mvapi_btl, endpoint, frag))
            BTL_ERROR(("error in mca_btl_mvapi_endpoint_send"));
    }
}
Example #24
/*
 * Function for selecting one component from all those that are
 * available.
 */
void orte_ras_base_allocate(int fd, short args, void *cbdata)
{
    int rc;
    orte_job_t *jdata;
    opal_list_t nodes;
    orte_node_t *node;
    orte_std_cntr_t i;
    orte_app_context_t *app;
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;

    OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
                         "%s ras:base:allocate",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* convenience */
    jdata = caddy->jdata;

    /* if we already did this, don't do it again - the pool of
     * global resources is set. 
     */
    if (orte_ras_base.allocation_read) {
        
        OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
                             "%s ras:base:allocate allocation already read",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto next_state;
    }
    orte_ras_base.allocation_read = true;

    /* Otherwise, we have to create
     * the initial set of resources that will delineate all
     * further operations serviced by this HNP. This list will
     * contain ALL nodes that can be used by any subsequent job.
     *
     * In other words, if a node isn't found in this step, then
     * no job launched by this HNP will be able to utilize it.
     */
    
    /* construct a list to hold the results */
    OBJ_CONSTRUCT(&nodes, opal_list_t);

    /* if a component was selected, then we know we are in a managed
     * environment.  - the active module will return a list of what it found
     */
    if (NULL != orte_ras_base.active_module)  {
        /* read the allocation */
        if (ORTE_SUCCESS != (rc = orte_ras_base.active_module->allocate(jdata, &nodes))) {
            if (ORTE_ERR_ALLOCATION_PENDING == rc) {
                /* an allocation request is underway, so just do nothing */
                OBJ_DESTRUCT(&nodes);
                OBJ_RELEASE(caddy);
                return;
            }
            if (ORTE_ERR_SYSTEM_WILL_BOOTSTRAP == rc) {
                /* this module indicates that nodes will be discovered
                 * on a bootstrap basis, so all we do here is add our
                 * own node to the list
                 */
                goto addlocal;
            }
            if (ORTE_ERR_TAKE_NEXT_OPTION == rc) {
                /* we have an active module, but it is unable to
                 * allocate anything for this job - this indicates
                 * that it isn't a fatal error, but could be if
                 * an allocation is required
                 */
                if (orte_allocation_required) {
                    /* an allocation is required, so this is fatal */
                    OBJ_DESTRUCT(&nodes);
                    orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true);
                    ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                    OBJ_RELEASE(caddy);
                    return;
                } else {
                    /* an allocation is not required, so we can just
                     * run on the local node - go add it
                     */
                    goto addlocal;
                }
            }
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&nodes);
            ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
    } 
    /* If something came back, save it and we are done */
    if (!opal_list_is_empty(&nodes)) {
        /* store the results in the global resource pool - this removes the
         * list items
         */
        if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&nodes);
            ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
        OBJ_DESTRUCT(&nodes);
        /* default to no-oversubscribe-allowed for managed systems */
        if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
            ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
        }
        /* flag that the allocation is managed */
        orte_managed_allocation = true;
        goto DISPLAY;
    } else if (orte_allocation_required) {
        /* if nothing was found, and an allocation is
         * required, then error out
         */
        OBJ_DESTRUCT(&nodes);
        orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }
    
    OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
                         "%s ras:base:allocate nothing found in module - proceeding to hostfile",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* nothing was found, or no active module was alive. Our next
     * option is to look for a hostfile and assign our global
     * pool from there.
     *
     * Individual hostfile names, if given, are included
     * in the app_contexts for this job. We therefore need to
     * retrieve the app_contexts for the job, and then cycle
     * through them to see if anything is there. The parser will
     * add the nodes found in each hostfile to our list - i.e.,
     * the resulting list contains the UNION of all nodes specified
     * in hostfiles from across all app_contexts
     *
     * We then continue to add any hosts provided by dash-host and
     * the default hostfile, if we have it. We will then filter out
     * all the non-desired hosts (i.e., those not specified by
     * -host and/or -hostfile) when we start the mapping process
     *
     * Note that any relative node syntax found in the hostfiles will
     * generate an error in this scenario, so only non-relative syntax
     * can be present
     */
    if (NULL != orte_default_hostfile) {
        OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
                             "%s ras:base:allocate parsing default hostfile %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             orte_default_hostfile));
        
        /* a default hostfile was provided - parse it */
        if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
                                                               orte_default_hostfile))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&nodes);
            ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
    }
    for (i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        if (NULL != app->hostfile) {
            OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
                                 "%s ras:base:allocate adding hostfile %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 app->hostfile));
            
            /* hostfile was specified - parse it and add it to the list */
            if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
                                                                   app->hostfile))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&nodes);
                /* set an error event */
                ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                OBJ_RELEASE(caddy);
                return;
            }
        } else if (!orte_soft_locations && NULL != app->dash_host) {
            /* if we are using soft locations, then any dash-host would
             * just include desired nodes and not required. We don't want
             * to pick them up here as this would mean the request was
             * always satisfied - instead, we want to allow the request
             * to fail later on and use whatever nodes are actually
             * available
             */
            OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
                                 "%s ras:base:allocate adding dash_hosts",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes,
                                                                    app->dash_host))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&nodes);
                ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                OBJ_RELEASE(caddy);
                return;
            }
        }
    }

    /* if something was found in the hostfile(s), we use that as our global
     * pool - set it and we are done
     */
    if (!opal_list_is_empty(&nodes)) {
        /* store the results in the global resource pool - this removes the
         * list items
         */
        if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
            ORTE_ERROR_LOG(rc);
            ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
        /* cleanup */
        OBJ_DESTRUCT(&nodes);
        goto DISPLAY;
    }
    
    OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
                         "%s ras:base:allocate nothing found in hostfiles - checking for rankfile",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* Our next option is to look for a rankfile - if one was provided, we
     * will use its nodes to create a default allocation pool
     */
    if (NULL != orte_rankfile) {
        /* check the rankfile for node information */
        if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
                                                               orte_rankfile))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&nodes);
            ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return ;
        }
    }
    /* if something was found in rankfile, we use that as our global
     * pool - set it and we are done
     */
    if (!opal_list_is_empty(&nodes)) {
        /* store the results in the global resource pool - this removes the
         * list items
         */
        if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
            ORTE_ERROR_LOG(rc);
            ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
        /* rankfile is considered equivalent to an RM allocation */
        if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
            ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
        }
        /* cleanup */
        OBJ_DESTRUCT(&nodes);
        goto DISPLAY;
    }
    
    
    OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
                         "%s ras:base:allocate nothing found in rankfile - inserting current node",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
 addlocal:
    /* if nothing was found by any of the above methods, then we have no
     * earthly idea what to do - so just add the local host
     */
    node = OBJ_NEW(orte_node_t);
    if (NULL == node) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OBJ_DESTRUCT(&nodes);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }
    /* use the same name we got in orte_process_info so we avoid confusion in
     * the session directories
     */
    node->name = strdup(orte_process_info.nodename);
    node->state = ORTE_NODE_STATE_UP;
    node->slots_inuse = 0;
    node->slots_max = 0;
    node->slots = 1;
    opal_list_append(&nodes, &node->super);
    
    /* store the results in the global resource pool - this removes the
     * list items
     */
    if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&nodes);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }
    OBJ_DESTRUCT(&nodes);

 DISPLAY:
    /* shall we display the results? */
    if (4 < opal_output_get_verbosity(orte_ras_base_framework.framework_output)) {
        orte_ras_base_display_alloc();
    }

 next_state:
    /* are we to report this event? */
    if (orte_report_events) {
        if (ORTE_SUCCESS != (rc = orte_util_comm_report_event(ORTE_COMM_EVENT_ALLOCATE))) {
            ORTE_ERROR_LOG(rc);
            ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
        }
    }
    
    /* set total slots alloc */
    jdata->total_slots_alloc = orte_ras_base.total_slots_alloc;

    /* set the job state to the next position */
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOCATION_COMPLETE);

    /* cleanup */
    OBJ_RELEASE(caddy);
}
Example #25
int orte_ras_base_add_hosts(orte_job_t *jdata)
{
    int rc;
    opal_list_t nodes;
    int i;
    orte_app_context_t *app;

    /* construct a list to hold the results */
    OBJ_CONSTRUCT(&nodes, opal_list_t);
    
    /* Individual add-hostfile names, if given, are included
     * in the app_contexts for this job. We therefore need to
     * retrieve the app_contexts for the job, and then cycle
     * through them to see if anything is there. The parser will
     * add the nodes found in each add-hostfile to our list - i.e.,
     * the resulting list contains the UNION of all nodes specified
     * in add-hostfiles from across all app_contexts
     *
     * Note that any relative node syntax found in the add-hostfiles will
     * generate an error in this scenario, so only non-relative syntax
     * can be present
     */
    
    for (i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        if (NULL != app->add_hostfile) {
            OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
                                 "%s ras:base:add_hosts checking add-hostfile %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 app->add_hostfile));
            
            /* hostfile was specified - parse it and add it to the list */
            if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
                                                                   app->add_hostfile))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&nodes);
                return rc;
            }
            /* now indicate that this app is to run across it */
            app->hostfile = app->add_hostfile;
            app->add_hostfile = NULL;
        }
    }

    /* We next check for and add any add-host options. Note this is
     * a -little- different than dash-host in that (a) we add these
     * nodes to the global pool regardless of what may already be there,
     * and (b) as a result, any job and/or app_context can access them.
     *
     * Note that any relative node syntax found in the add-host lists will
     * generate an error in this scenario, so only non-relative syntax
     * can be present
     */
    for (i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        if (NULL != app->add_host) {
            if (4 < opal_output_get_verbosity(orte_ras_base_framework.framework_output)) {
                char *fff = opal_argv_join(app->add_host, ',');
                opal_output(0, "%s ras:base:add_hosts checking add-host %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), fff);
                free(fff);
            }
            if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes,
                                                                    app->add_host))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&nodes);
                return rc;
            }
            /* now indicate that this app is to run across them */
            app->dash_host = app->add_host;
            app->add_host = NULL;
        }
    }
    
    /* if something was found, we add that to our global pool */
    if (!opal_list_is_empty(&nodes)) {
        /* store the results in the global resource pool - this removes the
         * list items
         */
        if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
            ORTE_ERROR_LOG(rc);
        }
        /* cleanup */
        OBJ_DESTRUCT(&nodes);
    }
    
    /* shall we display the results? */
    if (0 < opal_output_get_verbosity(orte_ras_base_framework.framework_output)) {
        orte_ras_base_display_alloc();
    }
    
    return ORTE_SUCCESS;
}
Example #26
static void xcast_recv(int status, orte_process_name_t* sender,
                       opal_buffer_t* buffer, orte_rml_tag_t tg,
                       void* cbdata)
{
    opal_list_item_t *item;
    orte_namelist_t *nm;
    int ret, cnt;
    opal_buffer_t *relay, *rly;
    orte_daemon_cmd_flag_t command = ORTE_DAEMON_NULL_CMD;
    opal_buffer_t wireup;
    opal_byte_object_t *bo;
    int8_t flag;
    orte_job_t *jdata;
    orte_proc_t *rec;
    opal_list_t coll;
    orte_grpcomm_signature_t *sig;
    orte_rml_tag_t tag;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:direct:xcast:recv: with %d bytes",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (int)buffer->bytes_used));

    /* we need a passthru buffer to send to our children */
    rly = OBJ_NEW(opal_buffer_t);
    opal_dss.copy_payload(rly, buffer);

    /* get the signature that we do not need */
    cnt=1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &sig, &cnt, ORTE_SIGNATURE))) {
        ORTE_ERROR_LOG(ret);
        ORTE_FORCED_TERMINATE(ret);
        return;
    }
    OBJ_RELEASE(sig);

    /* get the target tag */
    cnt=1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &tag, &cnt, ORTE_RML_TAG))) {
        ORTE_ERROR_LOG(ret);
        ORTE_FORCED_TERMINATE(ret);
        return;
    }

    /* setup a buffer we can pass to ourselves - this just contains
     * the initial message, minus the headers inserted by xcast itself */
    relay = OBJ_NEW(opal_buffer_t);
    opal_dss.copy_payload(relay, buffer);
    /* setup the relay list */
    OBJ_CONSTRUCT(&coll, opal_list_t);

    /* if this is headed for the daemon command processor,
     * then we first need to check for add_local_procs
     * as that command includes some needed wireup info */
    if (ORTE_RML_TAG_DAEMON == tag) {
        /* peek at the command */
        cnt=1;
        if (ORTE_SUCCESS == (ret = opal_dss.unpack(buffer, &command, &cnt, ORTE_DAEMON_CMD))) {
            /* if it is add_procs, then... */
            if (ORTE_DAEMON_ADD_LOCAL_PROCS == command ||
                ORTE_DAEMON_DVM_NIDMAP_CMD == command) {
                /* extract the byte object holding the daemonmap */
                cnt=1;
                if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &bo, &cnt, OPAL_BYTE_OBJECT))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }

                /* update our local nidmap, if required - the decode function
                 * knows what to do - it will also free the bytes in the byte object
                 */
                if (ORTE_PROC_IS_HNP) {
                    /* no need - already have the info */
                    if (NULL != bo) {
                        if (NULL != bo->bytes) {
                            free(bo->bytes);
                        }
                        free(bo);
                    }
                } else {
                    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                         "%s grpcomm:direct:xcast updating daemon nidmap",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

                    if (ORTE_SUCCESS != (ret = orte_util_decode_daemon_nodemap(bo))) {
                        ORTE_ERROR_LOG(ret);
                        goto relay;
                    }
                }

                /* update the routing plan */
                orte_routed.update_routing_plan();

                /* see if we have wiring info as well */
                cnt=1;
                if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &flag, &cnt, OPAL_INT8))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }

                if (ORTE_DAEMON_ADD_LOCAL_PROCS == command) {
                    OBJ_RELEASE(relay);
                    relay = OBJ_NEW(opal_buffer_t);
                    /* repack the command */
                    if (OPAL_SUCCESS != (ret = opal_dss.pack(relay, &command, 1, ORTE_DAEMON_CMD))) {
                        ORTE_ERROR_LOG(ret);
                        goto relay;
                    }
                    if (0 == flag) {
                        /* copy the remainder of the payload */
                        opal_dss.copy_payload(relay, buffer);
                        /* no - just return */
                        goto relay;
                    }
                }

                /* unpack the byte object */
                cnt=1;
                if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &bo, &cnt, OPAL_BYTE_OBJECT))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                if (0 < bo->size) {
                    /* load it into a buffer */
                    OBJ_CONSTRUCT(&wireup, opal_buffer_t);
                    opal_dss.load(&wireup, bo->bytes, bo->size);
                    /* pass it for processing */
                    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, &wireup))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_DESTRUCT(&wireup);
                        goto relay;
                    }
                    /* done with the wireup buffer - dump it */
                    OBJ_DESTRUCT(&wireup);
                }
                free(bo);
                if (ORTE_DAEMON_ADD_LOCAL_PROCS == command) {
                    /* copy the remainder of the payload */
                    opal_dss.copy_payload(relay, buffer);
                }
            }
        } else {
            ORTE_ERROR_LOG(ret);
            goto CLEANUP;
        }
    }

 relay:

    /* get the list of next recipients from the routed module */
    orte_routed.get_routing_list(&coll);

    /* if list is empty, no relay is required */
    if (opal_list_is_empty(&coll)) {
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:direct:send_relay - recipient list is empty!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        OBJ_RELEASE(rly);
        goto CLEANUP;
    }

    /* send the message to each recipient on list, deconstructing it as we go */
    while (NULL != (item = opal_list_remove_first(&coll))) {
        nm = (orte_namelist_t*)item;

        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:direct:send_relay sending relay msg of %d bytes to %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)rly->bytes_used,
                             ORTE_NAME_PRINT(&nm->name)));
        OBJ_RETAIN(rly);
        /* check the state of the recipient - no point
         * sending to someone not alive
         */
        jdata = orte_get_job_data_object(nm->name.jobid);
        if (NULL == (rec = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, nm->name.vpid))) {
            opal_output(0, "%s grpcomm:direct:send_relay proc %s not found - cannot relay",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));
            OBJ_RELEASE(rly);
            OBJ_RELEASE(item);
            continue;
        }
        if (ORTE_PROC_STATE_RUNNING < rec->state) {
            opal_output(0, "%s grpcomm:direct:send_relay proc %s not running - cannot relay",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));
            OBJ_RELEASE(rly);
            OBJ_RELEASE(item);
            continue;
        }
        if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(&nm->name, rly, ORTE_RML_TAG_XCAST,
                                                           orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(rly);
            OBJ_RELEASE(item);
            continue;
        }
        OBJ_RELEASE(item);
    }
    OBJ_RELEASE(rly);  // retain accounting

 CLEANUP:
    /* cleanup */
    OBJ_DESTRUCT(&coll);

    /* now send the relay buffer to myself for processing */
    if (ORTE_DAEMON_DVM_NIDMAP_CMD != command) {
        if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, relay, tag,
                                                           orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(relay);
        }
    }
}
Example #27
/**
 * Discover available (pre-allocated) nodes.  Allocate the
 * requested number of nodes/process slots to the job.
 */
static int orte_ras_ccp_allocate(opal_list_t *nodes)
{
    int ret, i;
    size_t len;
    char *cluster_head = NULL;
    HRESULT hr = S_OK;
    ICluster* pCluster = NULL;

    /* CCP is not thread safe. Use the apartment model. */
    CoInitializeEx(NULL, COINIT_APARTMENTTHREADED);

    /* Create the Cluster object. */
    hr = CoCreateInstance( __uuidof(Cluster),
                           NULL,
                           CLSCTX_INPROC_SERVER,
                           __uuidof(ICluster),
                           reinterpret_cast<void **> (&pCluster) );

    if (FAILED(hr)) {
        OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output,
                            "ras:ccp:allocate: failed to create cluster object!"));
        return ORTE_ERROR;
    }
    
    if(NULL == orte_ccp_headnode) {
        /* Get the cluster head nodes name */
        _dupenv_s(&cluster_head, &len, "LOGONSERVER");

        if(cluster_head == NULL) {
            OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output,
                                "ras:ccp:allocate: connot find cluster head node!"));
            return ORTE_ERROR;
        }

        /* Get rid of the beginning '//'. */
        for( i = 0; i < len - 2; i++){
            cluster_head[i] = cluster_head[i+2];
            cluster_head[i+2] = '\0';
        }
    } else {
        cluster_head = orte_ccp_headnode;
    }

    /* Connect to the cluster's head node */
    hr = pCluster->Connect(_bstr_t(cluster_head));
    if (FAILED(hr)) {
        ras_get_cluster_message(pCluster);
        OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output,
                            "ras:ccp:allocate: connection failed!"));
        return ORTE_ERROR;
    }
   
    if (ORTE_SUCCESS != (ret = discover(nodes, pCluster))) {
        OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output,
                            "ras:ccp:allocate: discover failed!"));
        return ret;
    }
        
    /* in the CCP world, if we didn't find anything, then this
     * is an unrecoverable error - report it
     */
    if (opal_list_is_empty(nodes)) {
        orte_show_help("help-ras-ccp.txt", "no-nodes-found", true);
        return ORTE_ERR_NOT_FOUND;
    }

    /* All finished, release cluster object*/
    pCluster->Release();
    CoUninitialize();

    return ret;
}
Example #28
static int orte_rds_hostfile_query(orte_jobid_t job)
{
    opal_list_t existing;
    opal_list_t updates, rds_updates;
    opal_list_item_t *item;
    orte_rds_cell_desc_t *rds_item;
    orte_rds_cell_attr_t *new_attr;
    orte_ras_node_t *ras_item;
    int rc;

    if (orte_rds_hostfile_queried) {
        /* if we have already been queried, then
         * our info is on the registry, so just
         * return. Note that this restriction
         * may eventually be lifted - ideally, 
         * we might check to see if this is a
         * new file name and go ahead with the
         * query if so.
         */
        return ORTE_SUCCESS;
    }
    orte_rds_hostfile_queried = true;
    
    OBJ_CONSTRUCT(&existing, opal_list_t);
    OBJ_CONSTRUCT(&updates, opal_list_t);
    OBJ_CONSTRUCT(&rds_updates, opal_list_t);
    rc = orte_ras_base_node_query(&existing);
    if(ORTE_SUCCESS != rc) {
        goto cleanup;
    }

    rc = mca_base_param_find("rds", "hostfile", "path");
    mca_base_param_lookup_string(rc, &mca_rds_hostfile_component.path);

    rc = orte_rds_hostfile_parse(mca_rds_hostfile_component.path, &existing, &updates);
    if (ORTE_ERR_NOT_FOUND == rc) {
        if(mca_rds_hostfile_component.default_hostfile) {
            rc = ORTE_SUCCESS;
        } else {
            opal_show_help("help-rds-hostfile.txt", "rds:no-hostfile",
                           true,
                           mca_rds_hostfile_component.path);
        }
        goto cleanup;
    } else if (ORTE_SUCCESS != rc) {
        goto cleanup;
    }

    if ( !opal_list_is_empty(&updates) ) {

        /* Convert RAS update list to RDS update list */
        for ( ras_item  = (orte_ras_node_t*)opal_list_get_first(&updates);
              ras_item != (orte_ras_node_t*)opal_list_get_end(&updates);
              ras_item  = (orte_ras_node_t*)opal_list_get_next(ras_item)) {

            rds_item = OBJ_NEW(orte_rds_cell_desc_t);
            if (NULL == rds_item) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }

            rds_item->site  = strdup("Hostfile");
            rds_item->name  = strdup(ras_item->node_name);
            if (need_cellid) {
#if 0 /* JJH Repair when cellid's are fixed */
                /* Create a new cellid for this hostfile */
                rc = orte_ns.create_cellid(&local_cellid, rds_item->site, rds_item->name);
                if (ORTE_SUCCESS != rc) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
#endif
                local_cellid = 0;
                need_cellid = false;
            }

            rds_item->cellid      = local_cellid;
            ras_item->node_cellid = local_cellid;

            new_attr = OBJ_NEW(orte_rds_cell_attr_t);
            if (NULL == new_attr) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            new_attr->keyval.key          = strdup(ORTE_RDS_NAME);
            new_attr->keyval.value = OBJ_NEW(orte_data_value_t);
            if (NULL == new_attr->keyval.value) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            new_attr->keyval.value->type   = ORTE_STRING;
            new_attr->keyval.value->data   = strdup(ras_item->node_name);
            opal_list_append(&(rds_item->attributes), &new_attr->super);

            new_attr = OBJ_NEW(orte_rds_cell_attr_t);
            if (NULL == new_attr) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            new_attr->keyval.key          = strdup(ORTE_CELLID_KEY);
            new_attr->keyval.value = OBJ_NEW(orte_data_value_t);
            if (NULL == new_attr->keyval.value) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            new_attr->keyval.value->type   = ORTE_CELLID;
            if (ORTE_SUCCESS != (rc = orte_dss.copy(&(new_attr->keyval.value->data), &(rds_item->cellid), ORTE_CELLID))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            opal_list_append(&(rds_item->attributes), &new_attr->super);

            opal_list_append(&rds_updates, &rds_item->super);
        }

        /* Insert the new node into the RDS */
        rc = orte_rds.store_resource(&rds_updates);
        if (ORTE_SUCCESS != rc) {
            goto cleanup;
        }

        /* Then the RAS, since we can assume that any
         * resources listed in the hostfile have been
         * already allocated for our use.
         */
        rc = orte_ras_base_node_insert(&updates);
        if (ORTE_SUCCESS != rc) {
            goto cleanup;
        }
        
        /* and now, indicate that ORTE should override any oversubscribed conditions
         * based on local hardware limits since the user (a) might not have
         * provided us any info on the #slots for a node, and (b) the user
         * might have been wrong! If we don't check the number of local physical
         * processors, then we could be too aggressive on our sched_yield setting
         * and cause performance problems.
         */
        rc = orte_ras_base_set_oversubscribe_override(job);
        if (ORTE_SUCCESS != rc) {
            goto cleanup;
        }
    }

cleanup:
    if (NULL != mca_rds_hostfile_component.path) {
        free(mca_rds_hostfile_component.path);
        mca_rds_hostfile_component.path = NULL;
    }

    while(NULL != (item = opal_list_remove_first(&existing))) {
        OBJ_RELEASE(item);
    }

    while(NULL != (item = opal_list_remove_first(&updates))) {
        OBJ_RELEASE(item);
    }

    while (NULL != (rds_item = (orte_rds_cell_desc_t*)opal_list_remove_first(&rds_updates))) {
        while (NULL != (new_attr = (orte_rds_cell_attr_t*)opal_list_remove_first(&(rds_item->attributes)))) {
            OBJ_RELEASE(new_attr);
        }
        OBJ_RELEASE(rds_item);
    }

    OBJ_DESTRUCT(&existing);
    OBJ_DESTRUCT(&updates);
    OBJ_DESTRUCT(&rds_updates);

    return rc;
}
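The cleanup label above drains three lists and then destructs them; a condensed sketch of that OPAL teardown idiom (list and variable names invented for illustration, using only the calls already shown in this example) is:

/* Illustrative teardown of a stack-allocated opal_list_t: pop and
 * release every item, then destruct the list container itself. */
opal_list_t work;
opal_list_item_t *it;

OBJ_CONSTRUCT(&work, opal_list_t);
/* ... items appended elsewhere ... */
while (NULL != (it = opal_list_remove_first(&work))) {
    OBJ_RELEASE(it);
}
OBJ_DESTRUCT(&work);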
Example #29
/* the -host option can always be used in both absolute
 * and relative mode, so we have to check for pre-existing
 * allocations if we are to use relative node syntax
 */
int orte_util_filter_dash_host_nodes(opal_list_t *nodes,
                                     char** host_argv)
{
    opal_list_item_t* item;
    bool found;
    opal_list_item_t *next;
    orte_std_cntr_t i, j, k, len_mapped_node=0;
    int rc;
    char **mapped_nodes = NULL, **mini_map, *cptr;
    orte_node_t *node, **nodepool;
    int nodeidx;
    int num_empty=0;
    opal_list_t keep;
    bool want_all_empty = false;
    
    /* if the incoming node list is empty, then there
     * is nothing to filter!
     */
    if (opal_list_is_empty(nodes)) {
        return ORTE_SUCCESS;
    }

    /* setup for relative node syntax */
    nodepool = (orte_node_t**)orte_node_pool->addr;
    
    /* Accumulate all of the host name mappings */
    for (j = 0; j < opal_argv_count(host_argv); ++j) {
        mini_map = opal_argv_split(host_argv[j], ',');
        
        for (k = 0; NULL != mini_map[k]; ++k) {
            if ('+' == mini_map[k][0]) {
                /* see if we specified empty nodes */
                if ('e' == mini_map[k][1] ||
                    'E' == mini_map[k][1]) {
                    /* request for empty nodes - do they want
                     * all of them?
                     */
                    if (NULL != (cptr = strchr(mini_map[k], ':'))) {
                        /* the colon indicates a specific # are requested -
                         * overwrite the colon with a '*' marker so the
                         * entry becomes "*<count>" and add it to the list */
                        *cptr = '*';
                        opal_argv_append_nosize(&mapped_nodes, cptr);
                    } else {
                        /* add a marker to the list */
                        opal_argv_append_nosize(&mapped_nodes, "*");
                        want_all_empty = true;
                    }
                } else if ('n' == mini_map[k][1] ||
                           'N' == mini_map[k][1]) {
                    /* they want a specific relative node #, so
                     * look it up on global pool
                     */
                    nodeidx = strtol(&mini_map[k][2], NULL, 10);
                    if (nodeidx < 0 ||
                        nodeidx > (int)orte_node_pool->size) {
                        /* this is an error */
                        orte_show_help("help-dash-host.txt", "dash-host:relative-node-out-of-bounds",
                                       true, nodeidx, mini_map[k]);
                        rc = ORTE_ERR_SILENT;
                        goto cleanup;
                    }
                    /* if the HNP is not allocated, then we need to
                     * adjust the index as the node pool is offset
                     * by one
                     */
                    if (!orte_hnp_is_allocated) {
                        nodeidx++;
                    }
                    /* see if that location is filled */
                    
                    if (NULL == nodepool[nodeidx]) {
                        /* this is an error */
                        orte_show_help("help-dash-host.txt", "dash-host:relative-node-not-found",
                                       true, nodeidx, mini_map[k]);
                        rc = ORTE_ERR_SILENT;
                        goto cleanup;
                    }
                    /* add this node to the list */
                    opal_argv_append_nosize(&mapped_nodes, nodepool[nodeidx]->name);
                } else {
                    /* invalid relative node syntax */
                    orte_show_help("help-dash-host.txt", "dash-host:invalid-relative-node-syntax",
                                   true, mini_map[k]);
                    rc = ORTE_ERR_SILENT;
                    goto cleanup;
                }
            } else { /* non-relative syntax - add to list */
                if (OPAL_SUCCESS != (rc = opal_argv_append_nosize(&mapped_nodes, 
                                                                  mini_map[k]))) {
                    goto cleanup;
                }
            }
        }
        opal_argv_free(mini_map);
    }
    
    /* Did we find anything? If not, then do nothing */
    if (NULL == mapped_nodes && 0 == num_empty) {
        return ORTE_SUCCESS;
    }
    
    /* we found some info - filter what is on the list...
     * i.e., go through the list and remove any nodes that
     * were -not- included on the -host list.
     *
     * NOTE: The following logic is based on knowing that
     * any node can only be included on the incoming
     * nodes list ONCE.
     */
    
    len_mapped_node = opal_argv_count(mapped_nodes);
    /* setup a working list so we can put the final list
     * of nodes in order. This way, if the user specifies a
     * set of nodes, we will use them in the order in which
     * they were specified. Note that empty node requests
     * will always be appended to the end
     */
    OBJ_CONSTRUCT(&keep, opal_list_t);
    
    for (i = 0; i < len_mapped_node; ++i) {
        /* check if we are supposed to add some number of empty
         * nodes here
         */
        if ('*' == mapped_nodes[i][0]) {
            /* if there is a number after the '*', then we are
             * to insert a specific # of nodes
             */
            if ('\0' == mapped_nodes[i][1]) {
                /* take all empty nodes from the list */
                num_empty = INT_MAX;
            } else {
                /* extract number of nodes to take */
                num_empty = strtol(&mapped_nodes[i][1], NULL, 10);
            }
            /* search for empty nodes and take them */
            item = opal_list_get_first(nodes);
            while (0 < num_empty && item != opal_list_get_end(nodes)) {
                next = opal_list_get_next(item);  /* save this position */
                node = (orte_node_t*)item;
                /* see if this node is empty */
                if (0 == node->slots_inuse) {
                    /* check to see if it is specified later */
                    for (j=i+1; j < len_mapped_node; j++) {
                        if (0 == strcmp(mapped_nodes[j], node->name)) {
                            /* specified later - skip this one */
                            goto skipnode;
                        }
                    }
                    /* remove item from list */
                    opal_list_remove_item(nodes, item);
                    /* xfer to keep list */
                    opal_list_append(&keep, item);
                    --num_empty;
                }
            skipnode:
                item = next;
            }
        } else {
            /* we are looking for a specific node on the list
             * we have a match if one of two conditions is met:
             * 1. the node_name and mapped_nodes directly match
             * 2. the node_name is the local system name AND
             *    either the mapped_node is "localhost" OR it
             *    is a local interface as found by opal_ifislocal
             */
            item = opal_list_get_first(nodes);
            while (item != opal_list_get_end(nodes)) {
                next = opal_list_get_next(item);  /* save this position */
                node = (orte_node_t*)item;
                /* search -host list to see if this one is found */
                found = false;
                if ((0 == strcmp(node->name, mapped_nodes[i]) ||
                    (0 == strcmp(node->name, orte_process_info.nodename) &&
                    (0 == strcmp(mapped_nodes[i], "localhost") || opal_ifislocal(mapped_nodes[i]))))) {
                    /* remove item from list */
                    opal_list_remove_item(nodes, item);
                    /* xfer to keep list */
                    opal_list_append(&keep, item);
                    break;
                }
                item = next;
            }
        }
        /* done with the mapped entry */
        free(mapped_nodes[i]);
        mapped_nodes[i] = NULL;
    }

    /* was something specified that was -not- found? */
    for (i=0; i < len_mapped_node; i++) {
        if (NULL != mapped_nodes[i]) {
            orte_show_help("help-dash-host.txt", "not-all-mapped-alloc",
                           true, mapped_nodes[i]);
            rc = ORTE_ERR_SILENT;
            goto cleanup;
        }
    }
    
    /* clear the rest of the nodes list */
    while (NULL != (item = opal_list_remove_first(nodes))) {
        OBJ_RELEASE(item);
    }
    
    /* the nodes list has been cleared - rebuild it in order */
    while (NULL != (item = opal_list_remove_first(&keep))) {
        opal_list_append(nodes, item);
    }
    
    /* did they ask for more than we could provide */
    if (!want_all_empty && 0 < num_empty) {
        orte_show_help("help-dash-host.txt", "dash-host:not-enough-empty",
                       true, num_empty);
        rc = ORTE_ERR_SILENT;
        goto cleanup;
    }
    
    rc = ORTE_SUCCESS;
    /* done filtering existing list */
    
cleanup:
    for (i=0; i < len_mapped_node; i++) {
        if (NULL != mapped_nodes[i]) {
            free(mapped_nodes[i]);
            mapped_nodes[i] = NULL;
        }
    }
    if (NULL != mapped_nodes) {
        free(mapped_nodes);
    }
    
    return rc;
}
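The first half of this example recognizes three token forms on the -host list: a literal host name, "+e[:N]" for empty nodes, and "+nX" for a relative node index. A rough standalone classifier of a single token (the function name and printf reporting are illustrative assumptions, not ORTE code) could look like:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative only: report which -host token form was given. */
static void classify_dash_host_token(const char *tok)
{
    const char *colon;

    if ('+' != tok[0]) {
        printf("literal host name: %s\n", tok);
    } else if ('e' == tok[1] || 'E' == tok[1]) {
        if (NULL != (colon = strchr(tok, ':'))) {
            /* a colon means a specific number of empty nodes */
            printf("request for %ld empty nodes\n", strtol(colon + 1, NULL, 10));
        } else {
            printf("request for all empty nodes\n");
        }
    } else if ('n' == tok[1] || 'N' == tok[1]) {
        printf("relative node index %ld\n", strtol(&tok[2], NULL, 10));
    } else {
        printf("invalid relative-node syntax: %s\n", tok);
    }
}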
Example #30
/*
 * Start monitoring of local processes
 */
static void start(orte_jobid_t jobid)
{
    mca_base_component_t *c = &mca_sensor_file_component.super.base_version;
    opal_list_item_t *item;
    orte_odls_job_t *jobdat;
    orte_app_context_t *app, *aptr;
    int rc, tmp;
    char *filename;
    file_tracker_t *ft;

    /* cannot monitor my own job */
    if (jobid == ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) {
        return;
    }
    
    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                         "%s starting file monitoring for job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jobid)));
    
    /* get the local jobdat for this job */
    for (item = opal_list_get_first(&orte_local_jobdata);
         item != opal_list_get_end(&orte_local_jobdata);
         item = opal_list_get_next(item)) {
        jobdat = (orte_odls_job_t*)item;
        if (jobid == jobdat->jobid || ORTE_JOBID_WILDCARD == jobid) {
            /* must be at least one app_context, so use the first one found */
            app = NULL;
            for (tmp=0; tmp < jobdat->apps.size; tmp++) {
                if (NULL != (aptr = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, tmp))) {
                    app = aptr;
                    break;
                }
            }
            if (NULL == app) {
                /* got a problem */
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                continue;
            }
            
            /* search the environ to get the filename */
            if (ORTE_SUCCESS != (rc = mca_base_param_find_string(c, "filename", app->env, &filename))) {
                /* was a default file given */
                if (NULL == mca_sensor_file_component.file) {
                    /* can't do anything without a file */
                    OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                                         "%s sensor:file no file for job %s",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         ORTE_JOBID_PRINT(jobid)));
                    continue;
                }
                filename = mca_sensor_file_component.file;
            }
            
            /* create the tracking object */
            ft = OBJ_NEW(file_tracker_t);
            ft->jobid = jobid;
            ft->file = strdup(filename);
            
            /* search the environ to see what we are checking */
            tmp = 0;
            if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "check_size", app->env, &tmp))) {
                /* was a default value given */
                if (0 < mca_sensor_file_component.check_size) {
                    ft->check_size = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_size);
                }
            } else {
                ft->check_size = OPAL_INT_TO_BOOL(tmp);
            }
            tmp = 0;
            if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "check_access", app->env, &tmp))) {
                /* was a default value given */
                if (0 < mca_sensor_file_component.check_access) {
                    ft->check_access = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_access);
                }
            } else {
                ft->check_access = OPAL_INT_TO_BOOL(tmp);
            }
            tmp = 0;
            if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "check_mod", app->env, &tmp))) {
                /* was a default value given */
                if (0 < mca_sensor_file_component.check_mod) {
                    ft->check_mod = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_mod);
                }
            } else {
                ft->check_mod = OPAL_INT_TO_BOOL(tmp);
            }
            tmp = 0;
            if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "limit", app->env, &tmp))) {
                ft->limit = mca_sensor_file_component.limit;
            } else {
                ft->limit = tmp;
            }
            opal_list_append(&jobs, &ft->super);
            OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output,
                                 "%s file %s monitored for %s%s%s with limit %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ft->file, ft->check_size ? "SIZE:" : " ",
                                 ft->check_access ? "ACCESS TIME:" : " ",
                                 ft->check_mod ? "MOD TIME" : " ", ft->limit));
        }
    }
    
    /* start sampling */
    if (NULL == sample_ev && !opal_list_is_empty(&jobs)) {
        /* startup a timer to wake us up periodically
         * for a data sample
         */
        sample_ev =  (opal_event_t *) malloc(sizeof(opal_event_t));
        opal_event_evtimer_set(opal_event_base, sample_ev, sample, sample_ev);
        sample_time.tv_sec = mca_sensor_file_component.sample_rate;
        sample_time.tv_usec = 0;
        opal_event_evtimer_add(sample_ev, &sample_time);
    }
    return;
}
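The sample() callback registered with the timer is not shown in this listing. Based only on the tracker fields set above (check_size, check_access, check_mod, limit), a hypothetical per-file check might stat the file and compare it against the previous snapshot; the struct and helper below are assumptions for illustration, not the component's real code.

#include <stdbool.h>
#include <sys/stat.h>

/* Hypothetical snapshot of the last-observed file metadata. */
typedef struct {
    off_t  last_size;
    time_t last_access;
    time_t last_mod;
} file_snapshot_t;

/* Returns true if any monitored attribute changed since the last call. */
static bool file_changed(const char *path, file_snapshot_t *snap,
                         bool check_size, bool check_access, bool check_mod)
{
    struct stat st;
    bool changed = false;

    if (0 != stat(path, &st)) {
        return false;  /* cannot stat the file - treat as unchanged here */
    }
    if (check_size && st.st_size != snap->last_size) {
        changed = true;
    }
    if (check_access && st.st_atime != snap->last_access) {
        changed = true;
    }
    if (check_mod && st.st_mtime != snap->last_mod) {
        changed = true;
    }
    snap->last_size   = st.st_size;
    snap->last_access = st.st_atime;
    snap->last_mod    = st.st_mtime;
    return changed;
}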