Example 1
int ompi_attr_free_keyval(ompi_attribute_type_t type, int *key,
                          bool predefined)
{
    int ret;
    ompi_attribute_keyval_t *keyval;

    /* Find the key-value pair */
    OPAL_THREAD_LOCK(&attribute_lock);
    ret = opal_hash_table_get_value_uint32(keyval_hash, *key,
                                           (void **) &keyval);
    if ((OMPI_SUCCESS != ret) || (NULL == keyval) ||
        (keyval->attr_type != type) ||
        ((!predefined) && (keyval->attr_flag & OMPI_KEYVAL_PREDEFINED))) {
        OPAL_THREAD_UNLOCK(&attribute_lock);
        return OMPI_ERR_BAD_PARAM;
    }

    /* MPI says to set the returned value to MPI_KEYVAL_INVALID */
    *key = MPI_KEYVAL_INVALID;

    /* This will delete the key only when no attributes are associated
       with it, else it will just decrement the reference count, so that when
       the last attribute is deleted, this object gets deleted too */
    OBJ_RELEASE(keyval);

    opal_atomic_wmb();
    OPAL_THREAD_UNLOCK(&attribute_lock);

    return MPI_SUCCESS;
}
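
A minimal sketch of how the MPI layer is assumed to drive this routine (COMM_ATTR is the attribute-type enumerator for communicators; the wrapper below is hypothetical, the real binding also validates arguments and invokes error handlers):

/* Hypothetical caller sketch for ompi_attr_free_keyval().  Passing
 * 'false' for predefined means predefined keyvals are rejected. */
static int comm_free_keyval_sketch(int *comm_keyval)
{
    return ompi_attr_free_keyval(COMM_ATTR, comm_keyval, false);
}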
Example 2
/*
 * Front-end function called by the Fortran MPI-2 API functions to set
 * an attribute.
 */
int ompi_attr_set_aint(ompi_attribute_type_t type, void *object,
                       opal_hash_table_t **attr_hash,
                       int key, MPI_Aint attribute,
                       bool predefined)
{
    int ret;
    attribute_value_t *new_attr = OBJ_NEW(attribute_value_t);
    if (NULL == new_attr) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    OPAL_THREAD_LOCK(&attribute_lock);

    new_attr->av_value = (void *) attribute;
    new_attr->av_set_from = OMPI_ATTRIBUTE_AINT;
    ret = set_value(type, object, attr_hash, key, new_attr, predefined);
    if (OMPI_SUCCESS != ret) {
        OBJ_RELEASE(new_attr);
    }

    opal_atomic_wmb();
    OPAL_THREAD_UNLOCK(&attribute_lock);

    return ret;
}
Example 3
/*
 * Front-end function to delete all the attributes on an MPI object
 */
int ompi_attr_delete_all(ompi_attribute_type_t type, void *object,
                         opal_hash_table_t *attr_hash)
{
    int ret, i, num_attrs;
    uint32_t key;
    void *node, *in_node, *attr;
    attribute_value_t **attrs;

    /* Ensure that the table is not empty */

    if (NULL == attr_hash) {
        return MPI_SUCCESS;
    }

    OPAL_THREAD_LOCK(&attribute_lock);

    /* Make an array that contains all attributes in local object's hash */
    num_attrs = opal_hash_table_get_size(attr_hash);
    if (0 == num_attrs) {
        OPAL_THREAD_UNLOCK(&attribute_lock);
        return MPI_SUCCESS;
    }

    attrs = malloc(sizeof(attribute_value_t *) * num_attrs);
    if (NULL == attrs) {
        OPAL_THREAD_UNLOCK(&attribute_lock);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    ret = opal_hash_table_get_first_key_uint32(attr_hash, &key, &attr, &node);
    for (i = 0; OMPI_SUCCESS == ret; i++) {
        attrs[i] = attr;
        in_node = node;
        ret = opal_hash_table_get_next_key_uint32(attr_hash, &key, &attr,
                                                  in_node, &node);
    }

    /* Sort attributes in the order that they were set */
    qsort(attrs, num_attrs, sizeof(attribute_value_t *), compare_attr_sequence);

    /* Delete attributes in the reverse order that they were set.
       Actually this ordering is required only for MPI_COMM_SELF, as
       specified in MPI-2.2: 8.7.1 Allowing User Functions at Process
       Termination, but we do it for everything -- what the heck.
       :-) */
    for (i = num_attrs - 1; i >= 0; i--) {
        ret = ompi_attr_delete_impl(type, object, attr_hash,
                                    attrs[i]->av_key, true);
        if (OMPI_SUCCESS != ret) {
            break;
        }
    }

    /* All done */

    free(attrs);
    opal_atomic_wmb();
    OPAL_THREAD_UNLOCK(&attribute_lock);
    return ret;
}
Example 4
static void fence_release(int status, void *cbdata)
{
    struct fence_result *res = (struct fence_result*)cbdata;
    res->status = status;
    opal_atomic_wmb();
    res->flag = 0;
}
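
The write barrier before clearing res->flag pairs with a read barrier on the waiting side: status must be globally visible before the flag flips. A minimal sketch of the assumed waiter (it presumes res->flag was initialized to 1 and is declared volatile; fence_wait itself is hypothetical):

/* Hypothetical waiter matching fence_release(): spin until the callback
 * clears the flag, then read the status it published before the wmb(). */
static int fence_wait(struct fence_result *res)
{
    while (res->flag) {
        opal_progress();      /* keep driving progress while waiting */
    }
    opal_atomic_rmb();        /* pairs with the wmb() in fence_release() */
    return res->status;
}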
Example 5
static int _opal_progress_register (opal_progress_callback_t cb, volatile opal_progress_callback_t **cbs,
                                    size_t *cbs_size, size_t *cbs_len)
{
    int ret = OPAL_SUCCESS;

    if (OPAL_ERR_NOT_FOUND != opal_progress_find_cb (cb, *cbs, *cbs_len)) {
        return OPAL_SUCCESS;
    }

    /* see if we need to allocate more space */
    if (*cbs_len + 1 > *cbs_size) {
        opal_progress_callback_t *tmp, *old;

        tmp = (opal_progress_callback_t *) malloc (sizeof (tmp[0]) * 2 * *cbs_size);
        if (tmp == NULL) {
            return OPAL_ERR_TEMP_OUT_OF_RESOURCE;
        }

        if (*cbs) {
            /* copy old callbacks */
            memcpy (tmp, (void *) *cbs, sizeof(tmp[0]) * *cbs_size);
        }

        for (size_t i = *cbs_len ; i < 2 * *cbs_size ; ++i) {
            tmp[i] = fake_cb;
        }

        opal_atomic_wmb ();

        /* swap out callback array */
        old = (opal_progress_callback_t *) opal_atomic_swap_ptr ((opal_atomic_intptr_t *) cbs, (intptr_t) tmp);

        opal_atomic_wmb ();

        free (old);
        *cbs_size *= 2;
    }

    cbs[0][*cbs_len] = cb;
    ++*cbs_len;

    opal_atomic_wmb ();

    return ret;
}
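
The array is published for lock-free readers: unused slots always hold fake_cb, the new array is fully initialized before the atomic pointer swap, and a final write barrier publishes the slot write and the length bump together. A sketch of the assumed reader loop (hypothetical; the real consumer is opal_progress() itself):

/* Hypothetical reader of the published callback array.  Racing with a
 * concurrent register at worst invokes fake_cb, a harmless no-op. */
static int progress_invoke_all(volatile opal_progress_callback_t *cbs, size_t cbs_len)
{
    int events = 0;
    for (size_t i = 0; i < cbs_len; ++i) {
        events += cbs[i]();   /* every slot is callable at all times */
    }
    return events;
}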
Example 6
int ompi_osc_rdma_lock_atomic (int lock_type, int target, int assert, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *peer = ompi_osc_rdma_module_peer (module, target);
    ompi_osc_rdma_sync_t *lock;
    int ret = OMPI_SUCCESS;

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock: %d, %d, %d, %s", lock_type, target, assert, win->w_name);

    if (module->no_locks) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "attempted to lock with no_locks set");
        return OMPI_ERR_RMA_SYNC;
    }

    if (module->all_sync.epoch_active && (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != module->all_sync.type || MPI_LOCK_EXCLUSIVE == lock_type)) {
        /* impossible to get an exclusive lock while holding a global shared lock or in an active
         * target access epoch */
        return OMPI_ERR_RMA_SYNC;
    }

    /* clear the global sync object (in case MPI_Win_fence was called) */
    module->all_sync.type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;

    /* create lock item */
    lock = ompi_osc_rdma_sync_allocate (module);
    if (OPAL_UNLIKELY(NULL == lock)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    lock->type = OMPI_OSC_RDMA_SYNC_TYPE_LOCK;
    lock->sync.lock.target = target;
    lock->sync.lock.type = lock_type;
    lock->sync.lock.assert = assert;

    lock->peer_list.peer = peer;
    lock->num_peers = 1;
    OBJ_RETAIN(peer);

    if (0 == (assert & MPI_MODE_NOCHECK)) {
        ret = ompi_osc_rdma_lock_atomic_internal (module, peer, lock);
    }

    if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
        ++module->passive_target_access_epoch;

        opal_atomic_wmb ();

        OPAL_THREAD_SCOPED_LOCK(&module->lock, ompi_osc_rdma_module_lock_insert (module, lock));
    } else {
        OBJ_RELEASE(lock);
    }

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock %d complete", target);

    return ret;
}
Example 7
static inline void _ctx_add(mca_spml_ucx_ctx_array_t *array, mca_spml_ucx_ctx_t *ctx)
{
    int i;

    if (array->ctxs_count < array->ctxs_num) {
        array->ctxs[array->ctxs_count] = ctx;
    } else {
        array->ctxs = realloc(array->ctxs, (array->ctxs_num + MCA_SPML_UCX_CTXS_ARRAY_INC) * sizeof(mca_spml_ucx_ctx_t *));
        opal_atomic_wmb ();
        for (i = array->ctxs_num; i < array->ctxs_num + MCA_SPML_UCX_CTXS_ARRAY_INC; i++) {
            array->ctxs[i] = NULL;
        }
        array->ctxs[array->ctxs_num] = ctx;
        array->ctxs_num += MCA_SPML_UCX_CTXS_ARRAY_INC;
    }

    opal_atomic_wmb ();
    array->ctxs_count++;
}
Example 8
/*
 * Front end function to delete a single attribute.
 */
int ompi_attr_delete(ompi_attribute_type_t type, void *object,
                     opal_hash_table_t *attr_hash, int key,
                     bool predefined)
{
    int ret;

    OPAL_THREAD_LOCK(&attribute_lock);
    ret = ompi_attr_delete_impl(type, object, attr_hash, key, predefined);
    opal_atomic_wmb();
    OPAL_THREAD_UNLOCK(&attribute_lock);
    return ret;
}
Example 9
static inline void _ctx_remove(mca_spml_ucx_ctx_array_t *array, mca_spml_ucx_ctx_t *ctx)
{
    int i;

    for (i = 0; i < array->ctxs_count; i++) {
        if (array->ctxs[i] == ctx) {
            array->ctxs[i] = array->ctxs[array->ctxs_count-1];
            array->ctxs[array->ctxs_count-1] = NULL;
            break;
        }
    }

    array->ctxs_count--;
    opal_atomic_wmb ();
}
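
The _ctx_add (Example 7) and _ctx_remove pair publishes array changes to lock-free readers: in _ctx_add the write barriers order the freshly written slots before the ctxs_count increment. A sketch of the assumed reader (a hypothetical helper; note that _ctx_add's realloc() can move array->ctxs, so this relies on the SPML's single-writer discipline):

/* Hypothetical reader pairing with the wmb() in _ctx_add/_ctx_remove:
 * snapshot the count first, then walk only the published slots. */
static void ctx_foreach(mca_spml_ucx_ctx_array_t *array,
                        void (*fn)(mca_spml_ucx_ctx_t *ctx))
{
    int count = array->ctxs_count;
    opal_atomic_rmb();
    for (int i = 0; i < count; ++i) {
        if (NULL != array->ctxs[i]) {
            fn(array->ctxs[i]);
        }
    }
}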
Example 10
/*
 * Front-end function called by the Fortran MPI-2 API functions to get
 * attributes.
 */
int ompi_attr_get_aint(opal_hash_table_t *attr_hash, int key,
                       MPI_Aint *attribute, int *flag)
{
    attribute_value_t *val = NULL;
    int ret;

    OPAL_THREAD_LOCK(&attribute_lock);

    ret = get_value(attr_hash, key, &val, flag);
    if (MPI_SUCCESS == ret && 1 == *flag) {
        *attribute = translate_to_aint(val);
    }

    opal_atomic_wmb();
    OPAL_THREAD_UNLOCK(&attribute_lock);
    return ret;
}
Example 11
static int ompi_attr_create_keyval_impl(ompi_attribute_type_t type,
                                        ompi_attribute_fn_ptr_union_t copy_attr_fn,
                                        ompi_attribute_fn_ptr_union_t delete_attr_fn,
                                        int *key,
                                        ompi_attribute_fortran_ptr_t *extra_state,
                                        int flags,
                                        void *bindings_extra_state)
{
    ompi_attribute_keyval_t *keyval;
    int ret;

    /* Allocate space for the list item */
    keyval = OBJ_NEW(ompi_attribute_keyval_t);
    if (NULL == keyval) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Fill in the list item (must be done before we set the keyval
       on the keyval_hash in case some other thread immediately reads
       it from the keyval_hash) */
    keyval->copy_attr_fn = copy_attr_fn;
    keyval->delete_attr_fn = delete_attr_fn;
    keyval->extra_state = *extra_state;
    keyval->attr_type = type;
    keyval->attr_flag = flags;
    keyval->key = -1;
    keyval->bindings_extra_state = bindings_extra_state;

    /* Create a new unique key and fill the hash */
    OPAL_THREAD_LOCK(&attribute_lock);
    ret = CREATE_KEY(key);
    if (OMPI_SUCCESS == ret) {
        keyval->key = *key;
        ret = opal_hash_table_set_value_uint32(keyval_hash, *key, keyval);
    }

    if (OMPI_SUCCESS != ret) {
        OBJ_RELEASE(keyval);
    } else {
        ret = MPI_SUCCESS;
    }

    opal_atomic_wmb();
    OPAL_THREAD_UNLOCK(&attribute_lock);
    return ret;
}
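
CREATE_KEY() is a macro defined elsewhere in attribute.c; only its contract is visible here: on success it stores a process-unique key through the pointer and returns OMPI_SUCCESS. A simplified stand-in (hypothetical; the real macro allocates from a pool of free keys so that freed keys can be reused):

/* Hypothetical stand-in for CREATE_KEY(): a monotonic counter starting
 * past the predefined keys (64 is an arbitrary placeholder).  Like the
 * real macro, it is only ever invoked under attribute_lock. */
static int create_key_sketch(int *key)
{
    static int next_key = 64;
    if (next_key < 0) {               /* wrapped around: out of keys */
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    *key = next_key++;
    return OMPI_SUCCESS;
}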
Example 12
static int
ompi_mtl_portals4_send_callback(ptl_event_t *ev,
                                ompi_mtl_portals4_base_request_t* ptl_base_request)
{
    bool complete = false;
    int ret;
    ompi_mtl_portals4_send_request_t* ptl_request =
        (ompi_mtl_portals4_send_request_t*) ptl_base_request;

    ret = ompi_mtl_portals4_callback(ev, ptl_base_request, &complete);
    if (complete) {
        ptl_request->retval = ret;
        opal_atomic_wmb();
        ptl_request->complete = true;
    }

    return OMPI_SUCCESS;
}
Example 13
int ompi_osc_rdma_unlock_atomic (int target, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *peer;
    ompi_osc_rdma_sync_t *lock;
    int ret = OMPI_SUCCESS;

    OPAL_THREAD_LOCK(&module->lock);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "unlock: %d, %s", target, win->w_name);

    lock = ompi_osc_rdma_module_lock_find (module, target, &peer);
    if (OPAL_UNLIKELY(NULL == lock)) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "target %d is not locked in window %s",
                         target, win->w_name);
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    ompi_osc_rdma_module_lock_remove (module, lock);

    /* finish all outstanding fragments */
    ompi_osc_rdma_sync_rdma_complete (lock);

    if (!(lock->sync.lock.assert & MPI_MODE_NOCHECK)) {
        ret = ompi_osc_rdma_unlock_atomic_internal (module, peer, lock);
    }

    /* release our reference to this peer */
    OBJ_RELEASE(peer);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "unlock %d complete", target);

    --module->passive_target_access_epoch;

    opal_atomic_wmb ();

    OPAL_THREAD_UNLOCK(&module->lock);

    /* delete the lock */
    ompi_osc_rdma_sync_return (lock);

    return ret;
}
Example 14
int ompi_osc_rdma_unlock_all_atomic (struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_sync_t *lock;

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "unlock_all: %s", win->w_name);

    OPAL_THREAD_LOCK(&module->lock);

    lock = &module->all_sync;
    if (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != lock->type) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "not locked in window %s", win->w_name);
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    /* finish all outstanding fragments */
    ompi_osc_rdma_sync_rdma_complete (lock);

    if (0 == (lock->sync.lock.assert & MPI_MODE_NOCHECK)) {
        /* decrement the master lock shared count */
        (void) ompi_osc_rdma_lock_release_shared (module, module->leader, -0x0000000100000000UL, offsetof (ompi_osc_rdma_state_t, global_lock));
    }

    lock->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
    lock->num_peers = 0;
    lock->epoch_active = false;

    --module->passive_target_access_epoch;

    opal_atomic_wmb ();

    OPAL_THREAD_UNLOCK(&module->lock);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "unlock_all complete");

    return OMPI_SUCCESS;
}
Example 15
/* ////////////////////////////////////////////////////////////////////////// */
static int
segment_create(opal_shmem_ds_t *ds_buf,
               const char *file_name,
               size_t size)
{
    int rc = OPAL_SUCCESS;
    pid_t my_pid = getpid();
    char *temp1 = NULL, *temp2 = NULL;
    bool space_available = false;
    uint64_t amount_space_avail = 0;

    /* the real size of the shared memory segment.  this includes enough space
     * to store our segment header.
     */
    size_t real_size = size + sizeof(opal_shmem_seg_hdr_t);
    opal_shmem_seg_hdr_t *seg_hdrp = NULL;
    HANDLE hMapObject = INVALID_HANDLE_VALUE;
    LPVOID lpvMem = NULL;

    /* init the contents of opal_shmem_ds_t */
    shmem_ds_reset(ds_buf);

    /* On Windows the shared file will be created by the OS directly from
     * system resources; no file is involved in the operation.  However, a
     * unique key should be used as the name of the shared memory object so
     * that all processes can access the same unique shared memory region.
     * The key is obtained from the original file_name by replacing every
     * path-separator occurrence with '/' (as '\' is not allowed in the
     * object name).
     */
    temp1 = strdup(file_name);
    if (NULL == temp1) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    temp2 = temp1;
    while (NULL != (temp2 = strchr(temp2, OPAL_PATH_SEP[0])) ) {
        *temp2 = '/';
    }
    /* let's make sure we have enough space for the backing file */
    if (OPAL_SUCCESS != (rc = enough_space(temp1,
                                           real_size,
                                           &amount_space_avail,
                                           &space_available))) {
        opal_output(0, "shmem: windows: an error occurred while determining "
                    "whether or not %s could be created.", temp1);
        /* rc is set */
        free(temp1);
        goto out;
    }
    if (!space_available) {
        char hn[MAXHOSTNAMELEN];
        gethostname(hn, MAXHOSTNAMELEN - 1);
        hn[MAXHOSTNAMELEN - 1] = '\0';
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        opal_show_help("help-opal-shmem-windows.txt", "target full", 1,
                       temp1, hn, (unsigned long)real_size,
                       (unsigned long long)amount_space_avail);
        free(temp1);
        goto out;
    }
    /* enough space is available, so create the segment */
                                   /* use paging file */
    hMapObject = CreateFileMapping(INVALID_HANDLE_VALUE,
                                   /* no security attributes */
                                   NULL,
                                   /* read/write access */
                                   PAGE_READWRITE,
                                   /* size: high 32-bits */
                                   0,
                                   /* size: low 32-bits */
                                   (DWORD)real_size,
                                   /* name of map object */
                                   temp1);
    if (NULL == hMapObject) {
        rc = OPAL_ERROR;
        goto out;
    }
    /* Get a pointer to the file-mapped shared memory. */
    lpvMem = MapViewOfFile(hMapObject,          /* object to map view of */
                           FILE_MAP_WRITE,      /* read/write access */
                           0,                   /* high offset:  map from */
                           0,                   /* low offset:   beginning */
                           0);                  /* default: map entire file */
    if (NULL == lpvMem) {
        rc = OPAL_ERROR;
        goto out;
    }

    seg_hdrp = (opal_shmem_seg_hdr_t *)lpvMem;

    /* all is well */
    /* -- initialize the shared memory segment -- */
    opal_atomic_rmb();

    /* init segment lock */
    opal_atomic_init(&seg_hdrp->lock, OPAL_ATOMIC_UNLOCKED);
    /* i was the creator of this segment, so note that fact */
    seg_hdrp->cpid = my_pid;

    opal_atomic_wmb();

    /* -- initialize the contents of opal_shmem_ds_t -- */
    ds_buf->seg_cpid = my_pid;
    ds_buf->seg_size = real_size;
    ds_buf->seg_base_addr = (unsigned char *)seg_hdrp;
    /* update path change in ds_buf */
    memcpy(ds_buf->seg_name, temp1, OPAL_PATH_MAX);
    /* release the temporary file name */
    free(temp1);

    /* set "valid" bit because segment creation was successful */
    OPAL_SHMEM_DS_SET_VALID(ds_buf);

    OPAL_OUTPUT_VERBOSE(
        (70, opal_shmem_base_output,
         "%s: %s: create successful "
         "(id: %d, size: %"PRIsize_t", name: %s)\n",
         mca_shmem_windows_component.super.base_version.mca_type_name,
         mca_shmem_windows_component.super.base_version.mca_component_name,
         ds_buf->seg_id, ds_buf->seg_size, ds_buf->seg_name)
    );

out:
    /* an error occurred, so invalidate the shmem object and unmap if needed */
    if (OPAL_SUCCESS != rc) {
        if (NULL != seg_hdrp) {
            UnmapViewOfFile((LPVOID)seg_hdrp);
        }
        shmem_ds_reset(ds_buf);
    }
    return rc;
}
Example 16
int ompi_datatype_get_pack_description( ompi_datatype_t* datatype,
                                        const void** packed_buffer )
{
    ompi_datatype_args_t* args = (ompi_datatype_args_t*)datatype->args;
    int next_index = OMPI_DATATYPE_MAX_PREDEFINED;
    void *packed_description = (void *) datatype->packed_description;
    void* recursive_buffer;

    if (NULL == packed_description) {
        void *_tmp_ptr = NULL;
        if (opal_atomic_compare_exchange_strong_ptr (&datatype->packed_description, (intptr_t *) &_tmp_ptr, 1)) {
            if( ompi_datatype_is_predefined(datatype) ) {
                packed_description = malloc(2 * sizeof(int));
            } else if( NULL == args ) {
                return OMPI_ERROR;
            } else {
                packed_description = malloc(args->total_pack_size);
            }
            recursive_buffer = packed_description;
            __ompi_datatype_pack_description( datatype, &recursive_buffer, &next_index );

            if (!ompi_datatype_is_predefined(datatype)) {
                /* If the precomputed size is not large enough we are already in
                 * trouble: we have written past the end of the allocated buffer.
                 * Raise the alarm!  Otherwise, tighten total_pack_size to the
                 * number of bytes actually needed to hold the datatype
                 * description.
                 */
                assert(args->total_pack_size >= (uintptr_t)((char*)recursive_buffer - (char *) packed_description));
                args->total_pack_size = (uintptr_t)((char*)recursive_buffer - (char *) packed_description);
            }

            opal_atomic_wmb ();
            datatype->packed_description = (intptr_t) packed_description;
        } else {
            /* another thread beat us to it */
            packed_description = (void *) datatype->packed_description;
        }
    }

    if ((void *) 1 == packed_description) {
        struct timespec interval = {.tv_sec = 0, .tv_nsec = 1000};

        /* wait until the packed description is updated */
        while (1 == datatype->packed_description) {
            nanosleep (&interval, NULL);
        }

        packed_description = (void *) datatype->packed_description;
    }

    *packed_buffer = (const void *) packed_description;
    return OMPI_SUCCESS;
}
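
The code above is a lock-free once-initialization: the winning thread installs the sentinel value 1 ("in progress") with a compare-and-swap, builds the description privately, then publishes the real pointer behind a write barrier; losers spin until the sentinel disappears. The same pattern, distilled (a generic sketch, not an Open MPI API):

/* Generic distillation of the publish-once pattern used above.  'slot'
 * starts at 0; exactly one thread runs build(), every caller gets the
 * result. */
static void *publish_once(opal_atomic_intptr_t *slot, void *(*build)(void))
{
    intptr_t expected = 0;
    if (opal_atomic_compare_exchange_strong_ptr(slot, &expected, 1)) {
        void *obj = build();          /* we won the CAS: build privately */
        opal_atomic_wmb();            /* order contents before the pointer */
        *slot = (intptr_t) obj;       /* publish */
        return obj;
    }
    while (1 == *slot) {
        ;                             /* builder still holds the sentinel */
    }
    opal_atomic_rmb();
    return (void *) *slot;
}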

size_t ompi_datatype_pack_description_length( ompi_datatype_t* datatype )
{
    void *packed_description = (void *) datatype->packed_description;

    if( ompi_datatype_is_predefined(datatype) ) {
        return 2 * sizeof(int);
    }
    if( NULL == packed_description || (void *) 1 == packed_description) {
        const void* buf;
        int rc;

        rc = ompi_datatype_get_pack_description(datatype, &buf);
        if( OMPI_SUCCESS != rc ) {
            return 0;
        }
    }
    assert( NULL != (ompi_datatype_args_t*)datatype->args );
    assert( NULL != (ompi_datatype_args_t*)datatype->packed_description );
    return ((ompi_datatype_args_t*)datatype->args)->total_pack_size;
}
Example 17
int ompi_osc_rdma_lock_all_atomic (int assert, struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_sync_t *lock;
    int ret = OMPI_SUCCESS;

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock_all: %d, %s", assert, win->w_name);

    if (module->no_locks) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "attempted to lock with no_locks set");
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_THREAD_LOCK(&module->lock);
    if (module->all_sync.epoch_active) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "attempted lock_all when active target epoch is %s "
                         "and lock all epoch is %s",
                         (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != module->all_sync.type && module->all_sync.epoch_active) ?
                         "active" : "inactive",
                         (OMPI_OSC_RDMA_SYNC_TYPE_LOCK == module->all_sync.type) ? "active" : "inactive");
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    /* set up lock */
    lock = &module->all_sync;

    lock->type = OMPI_OSC_RDMA_SYNC_TYPE_LOCK;
    lock->sync.lock.target = -1;
    lock->sync.lock.type   = MPI_LOCK_SHARED;
    lock->sync.lock.assert = assert;
    lock->num_peers = ompi_comm_size (module->comm);

    lock->epoch_active = true;
    /* NTH: TODO -- like fence it might be a good idea to create an array to access all peers
     * without having to access the hash table. Such a change would likely increase performance
     * at the expense of memory usage. Ex. if a window has 1M peers then 8MB per process would
     * be needed for this array. */

    if (0 == (assert & MPI_MODE_NOCHECK)) {
        /* increment the global shared lock */
        ret = ompi_osc_rdma_lock_acquire_shared (module, module->leader, 0x0000000100000000UL,
                                                 offsetof(ompi_osc_rdma_state_t, global_lock),
                                                 0x00000000ffffffffUL);
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        lock->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
        lock->num_peers = 0;
        lock->epoch_active = false;
    } else {
        ++module->passive_target_access_epoch;
    }

    opal_atomic_wmb ();

    OPAL_THREAD_UNLOCK(&module->lock);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock_all complete");

    return ret;
}
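
The magic constants describe a packed 64-bit lock word. The layout below is an assumption, inferred from the arithmetic here and from the matching -0x0000000100000000UL decrement in Example 14's unlock path, not taken from an Open MPI header:

/* Assumed layout of the 64-bit global_lock word:
 *
 *   bits 63..32: count of shared-lock holders
 *   bits 31..0 : exclusive-lock state (must be 0 for a shared acquire)
 */
#define SHARED_LOCK_ONE     0x0000000100000000UL  /* one shared holder     */
#define EXCLUSIVE_LOCK_MASK 0x00000000ffffffffUL  /* check mask on acquire */

/* lock_all   : atomically add SHARED_LOCK_ONE; the acquire succeeds only
 *              while (word & EXCLUSIVE_LOCK_MASK) == 0.
 * unlock_all : atomically add -SHARED_LOCK_ONE to drop one shared holder. */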
Example 18
/*
 * Main thread logic
 */
static void *service_thread_start(void *context)
{
    int rc, flags;
    fd_set read_fds_copy, write_fds_copy;
    opal_list_item_t *item;
    registered_item_t *ri;

    /* Make an fd set that we can select() on */
    FD_ZERO(&write_fds);
    FD_ZERO(&read_fds);
    FD_SET(pipe_to_service_thread[0], &read_fds);
    max_fd = pipe_to_service_thread[0] + 1;

    OPAL_OUTPUT((-1, "fd service thread running"));

    /* Main loop waiting for commands over the fd's */
    while (1) {
        memcpy(&read_fds_copy, &read_fds, sizeof(read_fds));
        memcpy(&write_fds_copy, &write_fds, sizeof(write_fds));
        OPAL_OUTPUT((-1, "fd service thread blocking on select..."));
        rc = select(max_fd, &read_fds_copy, &write_fds_copy, NULL, NULL);
        if (0 != rc && EAGAIN == errno) {
            continue;
        }
    
        OPAL_OUTPUT((-1, "fd service thread woke up!"));

        if (0 > rc) {
            if (EBADF == errno) {
                /* We are assuming we lost a socket so set rc to 1 so we'll
                 * try to read a command off the service pipe to receive an
                 * rm command (corresponding to the socket that went away).
                 * If the EBADF is from the service pipe then the error
                 * condition will be handled by service_pipe_cmd().
                 */
                OPAL_OUTPUT((-1,"fd service thread: non-EAGAIN from select %d", errno));
                rc = 1;
            }
        }
        if (rc > 0) {
            if (FD_ISSET(pipe_to_service_thread[0], &read_fds_copy)) {
                OPAL_OUTPUT((-1, "fd service thread: pipe command"));
                if (service_pipe_cmd()) {
                    break;
                }
                OPAL_OUTPUT((-1, "fd service thread: back from pipe command"));
                /* Continue to the top of the loop to see if there are more
                 * commands on the pipe.  This is done to reset the fds
                 * list just in case the last select incurred an EBADF.
                 * Please do not remove this continue thinking one is trying
                 * to enforce a fairness of reading the sockets or we'll
                 * end up with segv's below when select incurs an EBADF.
                 */
                continue;
            }

            /* Go through all the registered events and see who had
               activity */
            if (!opal_list_is_empty(&registered_items)) {
                for (item = opal_list_get_first(&registered_items);
                     item != opal_list_get_end(&registered_items);
                     item = opal_list_get_next(item)) {
                    ri = (registered_item_t*) item;
                    flags = 0;

                    /* See if this fd was ready for reading or writing
                       (fd's will only be in the read_fds or write_fds
                       set depending on what they registered for) */
                    if (FD_ISSET(ri->ri_fd, &read_fds_copy)) {
                        flags |= OPAL_EV_READ;
                    }
                    if (FD_ISSET(ri->ri_fd, &write_fds_copy)) {
                        flags |= OPAL_EV_WRITE;
                    }

                    /* If either was ready, invoke the callback */
                    if (0 != flags) {
                        OPAL_OUTPUT((-1, "fd service thread: invoking callback for registered fd %d", ri->ri_fd));
                        ri->ri_callback.event(ri->ri_fd, flags,
                                              ri->ri_context);
                        OPAL_OUTPUT((-1, "fd service thread: back from callback for registered fd %d", ri->ri_fd));
                    }
                }
            }
        }
    }

    /* All done */
    OPAL_OUTPUT((-1, "fd service thread: exiting"));
    opal_atomic_wmb();
    return NULL;
}
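
Clients are assumed to wake the service thread by writing a command down pipe_to_service_thread[1], the one descriptor the select() set always contains; service_pipe_cmd() then reads and dispatches it. A sketch of that wakeup (hypothetical; the real command encoding belongs to service_pipe_cmd and is not shown here):

/* Hypothetical client-side wakeup: push one command byte into the pipe
 * that service_thread_start() always select()s on. */
static int service_thread_wake_sketch(char cmd)
{
    ssize_t n = write(pipe_to_service_thread[1], &cmd, 1);
    return (1 == n) ? OPAL_SUCCESS : OPAL_ERROR;
}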
Example 19
int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    const int my_rank = ompi_comm_rank (module->comm);
    ompi_osc_rdma_peer_t *my_peer = ompi_osc_rdma_module_peer (module, my_rank);
    ompi_osc_rdma_region_t *region;
    osc_rdma_counter_t region_count;
    osc_rdma_counter_t region_id;
    void *bound;
    intptr_t page_size = getpagesize ();
    int region_index;
    int ret;

    if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) {
        return OMPI_ERR_RMA_FLAVOR;
    }

    if (0 == len) {
        /* short-circuit the 0-byte case */
        return OMPI_SUCCESS;
    }

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attach: %s, %p, %lu", win->w_name, base, (unsigned long) len);

    OPAL_THREAD_LOCK(&module->lock);

    region_count = module->state->region_count & 0xffffffffL;
    region_id    = module->state->region_count >> 32;

    if (region_count == mca_osc_rdma_component.max_attach) {
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_ATTACH;
    }

    /* it is wasteful to register less than a page. this may allow the remote side to access more
     * memory but the MPI standard covers this by declaring such access erroneous */
    bound = (void *)OPAL_ALIGN((intptr_t) base + len, page_size, intptr_t);
    base = (void *)((intptr_t) base & ~(page_size - 1));
    len = (size_t)((intptr_t) bound - (intptr_t) base);

    /* see if a matching region already exists */
    region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base,
                                                   (intptr_t) bound, module->region_size, &region_index);
    if (NULL != region) {
        ++module->dynamic_handles[region_index].refcnt;
        OPAL_THREAD_UNLOCK(&module->lock);
        /* no need to invalidate remote caches */
        return OMPI_SUCCESS;
    }

    /* region is in flux */
    module->state->region_count = -1;
    opal_atomic_wmb ();

    ompi_osc_rdma_lock_acquire_exclusive (module, my_peer, offsetof (ompi_osc_rdma_state_t, regions_lock));

    /* do a binary search for where the region should be inserted */
    if (region_count) {
        region = find_insertion_point ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base,
                                       module->region_size, &region_index);

        if (region_index < region_count) {
            memmove ((void *) ((intptr_t) region + module->region_size), region, (region_count - region_index) * module->region_size);

            if (module->selected_btl->btl_register_mem) {
                memmove (module->dynamic_handles + region_index + 1, module->dynamic_handles + region_index,
                         (region_count - region_index) * sizeof (module->dynamic_handles[0]));
            }
        }
    } else {
        region_index = 0;
        region = (ompi_osc_rdma_region_t *) module->state->regions;
    }

    region->base = (intptr_t) base;
    region->len  = len;

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "attaching dynamic memory region {%p, %p} at index %d",
                     base, (void *)((intptr_t) base + len), region_index);

    if (module->selected_btl->btl_register_mem) {
        mca_btl_base_registration_handle_t *handle;

        ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, (void *) region->base, region->len, MCA_BTL_REG_FLAG_ACCESS_ANY,
                                      &handle);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            OPAL_THREAD_UNLOCK(&module->lock);
            return OMPI_ERR_RMA_ATTACH;
        }

        memcpy (region->btl_handle_data, handle, module->selected_btl->btl_registration_handle_size);
        module->dynamic_handles[region_index].btl_handle = handle;
    } else {
        module->dynamic_handles[region_index].btl_handle = NULL;
    }

    module->dynamic_handles[region_index].refcnt = 1;

#if OPAL_ENABLE_DEBUG
    for (int i = 0 ; i < region_count + 1 ; ++i) {
        region = (ompi_osc_rdma_region_t *) ((intptr_t) module->state->regions + i * module->region_size);

        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, " dynamic region %d: {%p, %lu}", i,
                         (void *) region->base, (unsigned long) region->len);
    }
#endif

    opal_atomic_mb ();
    /* the region state has changed */
    module->state->region_count = ((region_id + 1) << 32) | (region_count + 1);

    ompi_osc_rdma_lock_release_exclusive (module, my_peer, offsetof (ompi_osc_rdma_state_t, regions_lock));
    OPAL_THREAD_UNLOCK(&module->lock);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attach complete");

    return OMPI_SUCCESS;
}
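
module->state->region_count packs two values, as the shift and mask at the top of the function show; the layout below is inferred from that code, and the transient -1 is the "in flux" marker remote readers must retry on:

/* Assumed encoding of state->region_count: low 32 bits hold the number
 * of attached regions, high 32 bits a generation id bumped per change;
 * (osc_rdma_counter_t) -1 means an update is in progress. */
static inline void region_count_unpack(osc_rdma_counter_t packed,
                                       osc_rdma_counter_t *region_id,
                                       osc_rdma_counter_t *region_count)
{
    *region_count = packed & 0xffffffffL;
    *region_id    = packed >> 32;
}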
Example 20
/*
 * Initialize nonblocking barrier.  This code is specific to handling
 * the recycling of data, and uses only a single set of control buffers.
 * It also assumes that, for a given process, only a single outstanding
 * barrier operation will occur for a given control structure,
 * with the sequence number being used for potential overlap in time
 * between successive barrier calls on different processes.
 */
int bcol_basesmuma_rd_nb_barrier_init_admin(sm_nbbar_desc_t *sm_desc)
{
    /* local variables */
    int ret=OMPI_SUCCESS, idx, leading_dim, loop_cnt, exchange;
    int pair_rank;
    mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
    netpatterns_pair_exchange_node_t *my_exchange_node;
    int extra_rank, my_rank;
    mca_bcol_basesmuma_ctl_struct_t volatile *partner_ctl;
    mca_bcol_basesmuma_ctl_struct_t volatile *my_ctl;
    int64_t bank_genaration;
    bool found;
    int pool_index=sm_desc->pool_index;
    mca_bcol_basesmuma_module_t *bcol_module=sm_desc->sm_module;

    /* get the pointer to the segment of control structures */
    idx = sm_desc->coll_buff->number_of_buffs + pool_index;
    leading_dim = sm_desc->coll_buff->size_of_group;
    idx = SM_ARRAY_INDEX(leading_dim, idx, 0);
    ctl_structs = (mca_bcol_basesmuma_ctl_struct_t **)
        sm_desc->coll_buff->ctl_buffs + idx;
    bank_genaration = sm_desc->coll_buff->ctl_buffs_mgmt[pool_index].bank_gen_counter;

    my_exchange_node = &(bcol_module->recursive_doubling_tree);
    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    my_ctl = ctl_structs[my_rank];
    /* debug print */
    /*
    {
	    int ii;
	    for(ii = 0; ii < 6; ii++) {
		    fprintf(stderr,"UUU ctl_struct[%d] := %p\n",ii,
			    bcol_module->colls_no_user_data.ctl_buffs[ii]);
		    fflush(stderr);
	    }
    }
    */
    /* end debug */

    /* signal that I have arrived */
    my_ctl->flag = -1;

    opal_atomic_wmb ();

    /* don't need to set this flag anymore */
    my_ctl->sequence_number = bank_genaration;

    if(0 < my_exchange_node->n_extra_sources) {
        if (EXCHANGE_NODE == my_exchange_node->node_type) {
            volatile int64_t *partner_sn;
            /* I will participate in the exchange - wait for signal from
             * the extra process */
            extra_rank = my_exchange_node->rank_extra_source;
            partner_ctl=ctl_structs[extra_rank];
            partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number);

            /* spin n iterations until partner registers */
            loop_cnt = 0;
            found = false;
            while (loop_cnt < bcol_module->super.n_poll_loops) {
                if (*partner_sn >= bank_genaration) {
                    found = true;
                    break;
                }
                loop_cnt++;
            }
            if( !found ) {
                /* set restart parameters */
                sm_desc->collective_phase=NB_PRE_PHASE;
                return OMPI_SUCCESS;
            }

        } else {

            /* Nothing to do, already registered that I am here */
        }
    }

    for(exchange = 0; exchange < my_exchange_node->n_exchanges; exchange++) {

        volatile int64_t *partner_sn;
        volatile int *partner_flag;

        /* rank of exchange partner */
        pair_rank = my_rank ^ ( 1 SHIFT_UP exchange );
        partner_ctl=ctl_structs[pair_rank];
        partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number);
        partner_flag=(volatile int *)&(partner_ctl->flag);
		
        /* signal that I am at iteration exchange of the algorithm */
        my_ctl->flag = exchange;

        /* check to see if the partner has arrived */

        /* spin n iterations until partner registers */
        loop_cnt = 0;
        found = false;
        while (loop_cnt < bcol_module->super.n_poll_loops) {
            if ((*partner_sn > bank_genaration) ||
                (*partner_sn == bank_genaration &&
                 *partner_flag >= exchange)) {
                found = true;
                break;
            }
            loop_cnt++;
        }
        if( !found ) {
            /* set restart parameters */
            sm_desc->collective_phase=NB_RECURSIVE_DOUBLING;
            sm_desc->recursive_dbl_iteration=exchange;
            return OMPI_SUCCESS;
        }

    }

    if(0 < my_exchange_node->n_extra_sources)  {
        if ( EXTRA_NODE == my_exchange_node->node_type ) {
            volatile int64_t *partner_sn;
            volatile int *partner_flag;

            /* I will not participate in the exchange - 
             *   wait for signal from extra partner */
            extra_rank = my_exchange_node->rank_extra_source;
            partner_ctl=ctl_structs[extra_rank];
            partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number);
            partner_flag=(volatile int *)&(partner_ctl->flag);

            /* spin n iterations until partner registers */
            loop_cnt = 0;
            found = false;
            while (loop_cnt < bcol_module->super.n_poll_loops) {
                if ((*partner_sn > bank_genaration) ||
                    ((*partner_sn == bank_genaration) &&
                     (*partner_flag == my_exchange_node->log_2))) {
                    found = true;
                    break;
                }
                loop_cnt++;
            }
            if( !found ) {
                /* set restart parameters */
                sm_desc->collective_phase=NB_POST_PHASE;
                return OMPI_SUCCESS;
            }

        }  else {

            /* signal the extra rank that I am done with the recursive
             * doubling phase.
             */
            my_ctl->flag = my_exchange_node->n_exchanges;

        }
    }

    /* set the barrier as complete */
    sm_desc->collective_phase=NB_BARRIER_DONE;
    /* return */
    return ret;
}
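
The routine never blocks indefinitely: each polling site spins at most n_poll_loops times, records the phase reached (NB_PRE_PHASE, NB_RECURSIVE_DOUBLING, NB_POST_PHASE) in the descriptor, and returns, so a companion progress routine can resume from collective_phase and recursive_dbl_iteration. The assumed completion test is trivial:

/* Sketch of the completion test a caller is assumed to poll; the
 * companion progress routine (not shown here) does the actual resuming. */
static inline bool nb_barrier_is_done(const sm_nbbar_desc_t *sm_desc)
{
    return NB_BARRIER_DONE == sm_desc->collective_phase;
}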
Example 21
/* ////////////////////////////////////////////////////////////////////////// */
static int
segment_create(opal_shmem_ds_t *ds_buf,
               const char *file_name,
               size_t size)
{
    int rc = OPAL_SUCCESS;
    pid_t my_pid = getpid();
    /* the real size of the shared memory segment.  this includes enough space
     * to store our segment header.
     */
    size_t real_size = size + sizeof(opal_shmem_seg_hdr_t);
    opal_shmem_seg_hdr_t *seg_hdrp = MAP_FAILED;

    /* init the contents of opal_shmem_ds_t */
    shmem_ds_reset(ds_buf);

    /* for sysv shared memory we don't have to worry about the backing store
     * being located on a network file system... so no check is needed here.
     */

    /* create a new shared memory segment and save the shmid. note the use of
     * real_size here
     */
    if (-1 == (ds_buf->seg_id = shmget(IPC_PRIVATE, real_size,
                                       IPC_CREAT | IPC_EXCL | S_IRWXU))) {
        int err = errno;
        char hn[OPAL_MAXHOSTNAMELEN];
        gethostname(hn, sizeof(hn));
        opal_show_help("help-opal-shmem-sysv.txt", "sys call fail", 1, hn,
                       "shmget(2)", "", strerror(err), err);
        rc = OPAL_ERROR;
        goto out;
    }
    /* attach to the segment */
    else if ((void *)-1 == (seg_hdrp = shmat(ds_buf->seg_id, NULL, 0))) {
        int err = errno;
        char hn[OPAL_MAXHOSTNAMELEN];
        gethostname(hn, sizeof(hn));
        opal_show_help("help-opal-shmem-sysv.txt", "sys call fail", 1, hn,
                       "shmat(2)", "", strerror(err), err);
        shmctl(ds_buf->seg_id, IPC_RMID, NULL);
        rc = OPAL_ERROR;
        goto out;
    }
    /* mark the segment for destruction - if we are here, then the run-time
     * component selection test detected adequate support for this type of
     * thing.
     */
    else if (0 != shmctl(ds_buf->seg_id, IPC_RMID, NULL)) {
        int err = errno;
        char hn[OPAL_MAXHOSTNAMELEN];
        gethostname(hn, sizeof(hn));
        opal_show_help("help-opal-shmem-sysv.txt", "sys call fail", 1, hn,
                       "shmctl(2)", "", strerror(err), err);
        rc = OPAL_ERROR;
        goto out;
    }
    /* all is well */
    else {
        /* -- initialize the shared memory segment -- */
        opal_atomic_rmb();

        /* init segment lock */
        opal_atomic_init(&seg_hdrp->lock, OPAL_ATOMIC_UNLOCKED);
        /* i was the creator of this segment, so note that fact */
        seg_hdrp->cpid = my_pid;

        opal_atomic_wmb();

        /* -- initialize the contents of opal_shmem_ds_t -- */
        ds_buf->seg_cpid = my_pid;
        ds_buf->seg_size = real_size;
        ds_buf->seg_base_addr = (unsigned char *)seg_hdrp;

        /* notice that we are not setting ds_buf->name here. sysv doesn't use
         * it, so don't worry about it - shmem_ds_reset took care of
         * initialization, so we aren't passing garbage around.
         */

        /* set "valid" bit because segment creation was successful */
        OPAL_SHMEM_DS_SET_VALID(ds_buf);

        OPAL_OUTPUT_VERBOSE(
            (70, opal_shmem_base_framework.framework_output,
             "%s: %s: create successful "
             "(id: %d, size: %lu, name: %s)\n",
             mca_shmem_sysv_component.super.base_version.mca_type_name,
             mca_shmem_sysv_component.super.base_version.mca_component_name,
             ds_buf->seg_id, (unsigned long)ds_buf->seg_size, ds_buf->seg_name)
        );
    }

out:
    /* an error occurred, so invalidate the shmem object and release any
     * allocated resources.
     */
    if (OPAL_SUCCESS != rc) {
        /* best effort to delete the segment. */
        if ((void *)-1 != seg_hdrp) {
            shmdt((char*)seg_hdrp);
        }
        shmctl(ds_buf->seg_id, IPC_RMID, NULL);

        /* always invalidate in this error path */
        shmem_ds_reset(ds_buf);
    }
    return rc;
}
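
Because the creator marks the segment for destruction immediately (the early shmctl(IPC_RMID) above), peers rely on the platform allowing attaches to a segment so marked; that is exactly what the comment says the run-time component selection test verifies. A sketch of the assumed attach path in a peer process:

/* Hypothetical peer-side attach: the segment id travels in ds_buf and
 * remains attachable after IPC_RMID on supported platforms. */
static void *segment_attach_sketch(opal_shmem_ds_t *ds_buf)
{
    void *addr = shmat(ds_buf->seg_id, NULL, 0);
    return ((void *) -1 == addr) ? NULL : addr;
}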
Example 22
/* ////////////////////////////////////////////////////////////////////////// */
static int
segment_create(opal_shmem_ds_t *ds_buf,
               const char *file_name,
               size_t size)
{
    int rc = OPAL_SUCCESS;
    pid_t my_pid = getpid();
    /* the real size of the shared memory segment.  this includes enough space
     * to store our segment header.
     */
    size_t real_size = size + sizeof(opal_shmem_seg_hdr_t);
    opal_shmem_seg_hdr_t *seg_hdrp = MAP_FAILED;

    /* init the contents of opal_shmem_ds_t */
    shmem_ds_reset(ds_buf);

    /* for posix shared memory we don't have to worry about the backing store
     * being located on a network file system... so no check is needed here.
     */

    /* calling shmem_posix_shm_open searches for an available posix shared
     * memory object name and upon successful completion populates the name
     * buffer
     */
    if (-1 == (ds_buf->seg_id = shmem_posix_shm_open(
                                    ds_buf->seg_name,
                                    OPAL_SHMEM_POSIX_FILE_LEN_MAX - 1))) {
        /* snaps!  something happened in posix_shm_open.  don't report anything
         * here because posix_shm_open will display all the necessary info.
         */
        rc = OPAL_ERROR;
        goto out;
    }
    /* size backing file - note the use of real_size here */
    else if (0 != ftruncate(ds_buf->seg_id, real_size)) {
        int err = errno;
        char hn[MAXHOSTNAMELEN];
        gethostname(hn, MAXHOSTNAMELEN - 1);
        hn[MAXHOSTNAMELEN - 1] = '\0';
        opal_show_help("help-opal-shmem-posix.txt", "sys call fail", 1, hn,
                       "ftruncate(2)", "", strerror(err), err);
        rc = OPAL_ERROR;
        goto out;
    }
    else if (MAP_FAILED == (seg_hdrp = (opal_shmem_seg_hdr_t*)mmap(NULL, real_size,
                                                                   PROT_READ | PROT_WRITE, MAP_SHARED,
                                                                   ds_buf->seg_id, 0))) {
        int err = errno;
        char hn[MAXHOSTNAMELEN];
        gethostname(hn, MAXHOSTNAMELEN - 1);
        hn[MAXHOSTNAMELEN - 1] = '\0';
        opal_show_help("help-opal-shmem-posix.txt", "sys call fail", 1, hn,
                       "mmap(2)", "", strerror(err), err);
        rc = OPAL_ERROR;
        goto out;
    }
    /* all is well */
    else {
        /* -- initialize the shared memory segment -- */
        opal_atomic_rmb();

        /* init segment lock */
        opal_atomic_init(&seg_hdrp->lock, OPAL_ATOMIC_UNLOCKED);
        /* i was the creator of this segment, so note that fact */
        seg_hdrp->cpid = my_pid;

        opal_atomic_wmb();

        /* -- initialize the contents of opal_shmem_ds_t -- */
        ds_buf->seg_cpid = my_pid;
        ds_buf->seg_size = real_size;
        ds_buf->seg_base_addr = (unsigned char *)seg_hdrp;

        /* notice that we are not setting ds_buf->name here.  at this point,
         * posix_shm_open was successful, so the contents of ds_buf->name are
         * already set for us :-)
         */

        /* set "valid" bit because segment creation was successful */
        OPAL_SHMEM_DS_SET_VALID(ds_buf);

        OPAL_OUTPUT_VERBOSE(
            (70, opal_shmem_base_framework.framework_output,
             "%s: %s: create successful "
             "(id: %d, size: %lu, name: %s)\n",
             mca_shmem_posix_component.super.base_version.mca_type_name,
             mca_shmem_posix_component.super.base_version.mca_component_name,
             ds_buf->seg_id, (unsigned long)ds_buf->seg_size, ds_buf->seg_name)
        );
    }

out:
    /* in this component, the id is the file descriptor returned by open.  this
     * check is here to see if it is safe to call close on the file descriptor.
     * that is, we are making sure that our call to open was successful and
     * we are not in an error path.
     */
    if (-1 != ds_buf->seg_id) {
        if (0 != close(ds_buf->seg_id)) {
            int err = errno;
            char hn[MAXHOSTNAMELEN];
            gethostname(hn, MAXHOSTNAMELEN - 1);
            hn[MAXHOSTNAMELEN - 1] = '\0';
            opal_show_help("help-opal-shmem-mmap.txt", "sys call fail", 1, hn,
                           "close(2)", "", strerror(err), err);
            rc = OPAL_ERROR;
         }
     }
    /* an error occurred, so invalidate the shmem object and release any
     * allocated resources.
     */
    if (OPAL_SUCCESS != rc) {
        /* posix_shm_open was successful, but something else wasn't.
         * note: if the id is not equal to -1 and we are here, name will be
         * valid.  that is, we can safely call shm_unlink with ds_buf->name.
         */
        if (-1 != ds_buf->seg_id) {
            shm_unlink(ds_buf->seg_name);
        }
        if (MAP_FAILED != seg_hdrp) {
            munmap((void*)seg_hdrp, real_size);
        }
        /* always invalidate in this error path */
        shmem_ds_reset(ds_buf);
    }
    return rc;
}
Example 23
/* ////////////////////////////////////////////////////////////////////////// */
static int
segment_create(opal_shmem_ds_t *ds_buf,
               const char *file_name,
               size_t size)
{
    int rc = OPAL_SUCCESS;
    char *real_file_name = NULL;
    pid_t my_pid = getpid();
    bool space_available = false;
    uint64_t amount_space_avail = 0;

    /* the real size of the shared memory segment.  this includes enough space
     * to store our segment header.
     */
    size_t real_size = size + sizeof(opal_shmem_seg_hdr_t);
    opal_shmem_seg_hdr_t *seg_hdrp = MAP_FAILED;

    /* init the contents of opal_shmem_ds_t */
    shmem_ds_reset(ds_buf);

    /* change the path of shmem mmap's backing store? */
    if (0 != opal_shmem_mmap_relocate_backing_file) {
        int err;
        if (path_usable(opal_shmem_mmap_backing_file_base_dir, &err)) {
            if (NULL ==
                (real_file_name =
                     get_uniq_file_name(opal_shmem_mmap_backing_file_base_dir,
                                        file_name))) {
                /* out of resources */
                return OPAL_ERROR;
            }
        }
        /* a relocated backing store was requested, but the path specified
         * cannot be used :-(. if the flag is negative, then warn and continue
         * with the default path.  otherwise, fail.
         */
        else if (opal_shmem_mmap_relocate_backing_file < 0) {
            opal_output(0, "shmem: mmap: WARNING: could not relocate "
                        "backing store to \"%s\" (%s).  Continuing with "
                        "default path.\n",
                        opal_shmem_mmap_backing_file_base_dir, strerror(err));
        }
        /* must be positive, so fail */
        else {
            opal_output(0, "shmem: mmap: WARNING: could not relocate "
                        "backing store to \"%s\" (%s).  Cannot continue with "
                        "shmem mmap.\n", opal_shmem_mmap_backing_file_base_dir,
                        strerror(err));
            return OPAL_ERROR;
        }
    }
    /* are we using the default path? */
    if (NULL == real_file_name) {
        /* use the path specified by the caller of this function */
        if (NULL == (real_file_name = strdup(file_name))) {
            /* out of resources */
            return OPAL_ERROR;
        }
    }

    OPAL_OUTPUT_VERBOSE(
        (70, opal_shmem_base_framework.framework_output,
         "%s: %s: backing store base directory: %s\n",
         mca_shmem_mmap_component.super.base_version.mca_type_name,
         mca_shmem_mmap_component.super.base_version.mca_component_name,
         real_file_name)
    );

    /* determine whether the specified filename is on a network file system.
     * this is an important check because if the backing store is located on
     * a network filesystem, the user may see a shared memory performance hit.
     */
    if (opal_shmem_mmap_nfs_warning && opal_path_nfs(real_file_name)) {
        char hn[MAXHOSTNAMELEN];
        gethostname(hn, MAXHOSTNAMELEN - 1);
        hn[MAXHOSTNAMELEN - 1] = '\0';
        opal_show_help("help-opal-shmem-mmap.txt", "mmap on nfs", 1, hn,
                       real_file_name);
    }
    /* let's make sure we have enough space for the backing file */
    if (OPAL_SUCCESS != (rc = enough_space(real_file_name,
                                           real_size,
                                           &amount_space_avail,
                                           &space_available))) {
        opal_output(0, "shmem: mmap: an error occurred while determining "
                    "whether or not %s could be created.", real_file_name);
        /* rc is set */
        goto out;
    }
    if (!space_available) {
        char hn[MAXHOSTNAMELEN];
        gethostname(hn, MAXHOSTNAMELEN - 1);
        hn[MAXHOSTNAMELEN - 1] = '\0';
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        opal_show_help("help-opal-shmem-mmap.txt", "target full", 1,
                       real_file_name, hn, (unsigned long)real_size,
                       (unsigned long long)amount_space_avail);
        goto out;
    }
    /* enough space is available, so create the segment */
    if (-1 == (ds_buf->seg_id = open(real_file_name, O_CREAT | O_RDWR, 0600))) {
        int err = errno;
        char hn[MAXHOSTNAMELEN];
        gethostname(hn, MAXHOSTNAMELEN - 1);
        hn[MAXHOSTNAMELEN - 1] = '\0';
        opal_show_help("help-opal-shmem-mmap.txt", "sys call fail", 1, hn,
                       "open(2)", "", strerror(err), err);
        rc = OPAL_ERROR;
        goto out;
    }
    /* size backing file - note the use of real_size here */
    if (0 != ftruncate(ds_buf->seg_id, real_size)) {
        int err = errno;
        char hn[MAXHOSTNAMELEN];
        gethostname(hn, MAXHOSTNAMELEN - 1);
        hn[MAXHOSTNAMELEN - 1] = '\0';
        opal_show_help("help-opal-shmem-mmap.txt", "sys call fail", 1, hn,
                       "ftruncate(2)", "", strerror(err), err);
        rc = OPAL_ERROR;
        goto out;
    }
    if (MAP_FAILED == (seg_hdrp = (opal_shmem_seg_hdr_t *)
                                  mmap(NULL, real_size,
                                       PROT_READ | PROT_WRITE, MAP_SHARED,
                                       ds_buf->seg_id, 0))) {
        int err = errno;
        char hn[MAXHOSTNAMELEN];
        gethostname(hn, MAXHOSTNAMELEN - 1);
        hn[MAXHOSTNAMELEN - 1] = '\0';
        opal_show_help("help-opal-shmem-mmap.txt", "sys call fail", 1, hn,
                       "mmap(2)", "", strerror(err), err);
        rc = OPAL_ERROR;
        goto out;
    }
    /* all is well */
    else {
        /* -- initialize the shared memory segment -- */
        opal_atomic_rmb();

        /* init segment lock */
        opal_atomic_init(&seg_hdrp->lock, OPAL_ATOMIC_UNLOCKED);
        /* i was the creator of this segment, so note that fact */
        seg_hdrp->cpid = my_pid;

        opal_atomic_wmb();

        /* -- initialize the contents of opal_shmem_ds_t -- */
        ds_buf->seg_cpid = my_pid;
        ds_buf->seg_size = real_size;
        ds_buf->seg_base_addr = (unsigned char *)seg_hdrp;
        (void)strncpy(ds_buf->seg_name, real_file_name, OPAL_PATH_MAX - 1);

        /* set "valid" bit because segment creation was successful */
        OPAL_SHMEM_DS_SET_VALID(ds_buf);

        OPAL_OUTPUT_VERBOSE(
            (70, opal_shmem_base_framework.framework_output,
             "%s: %s: create successful "
             "(id: %d, size: %lu, name: %s)\n",
             mca_shmem_mmap_component.super.base_version.mca_type_name,
             mca_shmem_mmap_component.super.base_version.mca_component_name,
             ds_buf->seg_id, (unsigned long)ds_buf->seg_size, ds_buf->seg_name)
        );
    }

out:
    /* in this component, the id is the file descriptor returned by open.  this
     * check is here to see if it is safe to call close on the file descriptor.
     * that is, we are making sure that our call to open was successful and
     * we are not in an error path.
     */
    if (-1 != ds_buf->seg_id) {
        if (0 != close(ds_buf->seg_id)) {
            int err = errno;
            char hn[MAXHOSTNAMELEN];
            gethostname(hn, MAXHOSTNAMELEN - 1);
            hn[MAXHOSTNAMELEN - 1] = '\0';
            opal_show_help("help-opal-shmem-mmap.txt", "sys call fail", 1, hn,
                           "close(2)", "", strerror(err), err);
            rc = OPAL_ERROR;
         }
     }
    /* an error occurred, so invalidate the shmem object and munmap if needed */
    if (OPAL_SUCCESS != rc) {
        if (MAP_FAILED != seg_hdrp) {
            munmap((void *)seg_hdrp, real_size);
        }
        shmem_ds_reset(ds_buf);
    }
    /* safe to free now because its contents have already been copied */
    if (NULL != real_file_name) {
        free(real_file_name);
    }
    return rc;
}
Example 24
/* look up the remote pointer in the peer rcache and attach if
 * necessary */
mca_mpool_base_registration_t *vader_get_registation (struct mca_btl_base_endpoint_t *ep, void *rem_ptr,
                                                      size_t size, int flags, void **local_ptr)
{
    struct mca_rcache_base_module_t *rcache = ep->rcache;
    mca_mpool_base_registration_t *regs[10], *reg = NULL;
    xpmem_addr_t xpmem_addr;
    uintptr_t base, bound;
    int rc, i;

    /* protect rcache access */
    OPAL_THREAD_LOCK(&ep->lock);

    /* use btl/self for self communication */
    assert (ep->peer_smp_rank != MCA_BTL_VADER_LOCAL_RANK);

    base = (uintptr_t) down_align_addr(rem_ptr, mca_btl_vader_component.log_attach_align);
    bound = (uintptr_t) up_align_addr((void *)((uintptr_t) rem_ptr + size - 1),
                                      mca_btl_vader_component.log_attach_align) + 1;
    if (OPAL_UNLIKELY(bound > VADER_MAX_ADDRESS)) {
        bound = VADER_MAX_ADDRESS;
    }

    /* several segments may match the base pointer */
    rc = rcache->rcache_find_all (rcache, (void *) base, bound - base, regs, 10);
    for (i = 0 ; i < rc ; ++i) {
        if (bound <= (uintptr_t)regs[i]->bound && base  >= (uintptr_t)regs[i]->base) {
            opal_atomic_add (&regs[i]->ref_count, 1);
            reg = regs[i];
            goto reg_found;
        }

        if (regs[i]->flags & MCA_MPOOL_FLAGS_PERSIST) {
            continue;
        }

        /* remove this pointer from the rcache and decrement its reference count
           (so it is detached later) */
        rc = rcache->rcache_delete (rcache, regs[i]);
        if (OPAL_UNLIKELY(0 != rc)) {
            /* someone beat us to it? */
            break;
        }

        /* start the new segment from the lower of the two bases */
        base = (uintptr_t) regs[i]->base < base ? (uintptr_t) regs[i]->base : base;

        opal_atomic_add (&regs[i]->ref_count, -1);

        if (OPAL_LIKELY(0 == regs[i]->ref_count)) {
            /* this pointer is not in use */
            (void) xpmem_detach (regs[i]->alloc_base);
            OBJ_RELEASE(regs[i]);
        }

        break;
    }

    reg = OBJ_NEW(mca_mpool_base_registration_t);
    if (OPAL_LIKELY(NULL != reg)) {
        /* stick around for awhile */
        reg->ref_count = 2;
        reg->base  = (unsigned char *) base;
        reg->bound = (unsigned char *) bound;
        reg->flags = flags;

#if defined(HAVE_SN_XPMEM_H)
        xpmem_addr.id     = ep->apid;
#else
        xpmem_addr.apid   = ep->apid;
#endif
        xpmem_addr.offset = base;

        reg->alloc_base = xpmem_attach (xpmem_addr, bound - base, NULL);
        if (OPAL_UNLIKELY((void *)-1 == reg->alloc_base)) {
            OPAL_THREAD_UNLOCK(&ep->lock);
            OBJ_RELEASE(reg);
            return NULL;
        }

        opal_memchecker_base_mem_defined (reg->alloc_base, bound - base);

        rcache->rcache_insert (rcache, reg, 0);
    }

reg_found:
    opal_atomic_wmb ();
    *local_ptr = (void *) ((uintptr_t) reg->alloc_base +
                           (ptrdiff_t)((uintptr_t) rem_ptr - (uintptr_t) reg->base));

    OPAL_THREAD_UNLOCK(&ep->lock);

    return reg;
}
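The base/bound computation above covers [rem_ptr, rem_ptr + size) with a power-of-two aligned window before attaching. Here is a small standalone sketch of what alignment helpers of this shape typically do; down_align/up_align are illustrative stand-ins, not the actual down_align_addr/up_align_addr definitions.

#include <stdint.h>
#include <stdio.h>

/* round addr down to a 2^log_align boundary */
static inline uintptr_t down_align(uintptr_t addr, unsigned log_align)
{
    return addr & ~(((uintptr_t) 1 << log_align) - 1);
}

/* round addr up to a 2^log_align boundary */
static inline uintptr_t up_align(uintptr_t addr, unsigned log_align)
{
    uintptr_t mask = ((uintptr_t) 1 << log_align) - 1;
    return (addr + mask) & ~mask;
}

int main(void)
{
    uintptr_t rem_ptr = 0x12345;
    size_t size = 100;
    unsigned log_attach_align = 12;  /* 4 KiB pages, illustrative */

    uintptr_t base  = down_align(rem_ptr, log_attach_align);
    uintptr_t bound = up_align(rem_ptr + (uintptr_t) size, log_attach_align);
    /* prints: attach window [0x12000, 0x13000) */
    printf("attach window [%#lx, %#lx)\n",
           (unsigned long) base, (unsigned long) bound);
    return 0;
}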
Example no. 25
/*
 * Recursive k-ing (k-nomial) allgather
 * Example: k = 3, n = 9
 *
 * Number of exchange steps = log_k(n)
 * Peers contacted per exchange step = k - 1 (radix k)
 */
int bcol_basesmuma_k_nomial_allgather_init(bcol_function_args_t *input_args,
                                           struct mca_bcol_base_function_t *const_args)
{
    /* local variables */
    int8_t  flag_offset;
    volatile int8_t ready_flag;
    mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) const_args->bcol_module;
    netpatterns_k_exchange_node_t *exchange_node = &bcol_module->knomial_allgather_tree;
    int group_size = bcol_module->colls_no_user_data.size_of_group;
    int *list_connected = bcol_module->super.list_n_connected; /* critical for hierarchical colls */
    int bcol_id = (int) bcol_module->super.bcol_id;
    mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component;
    uint32_t buffer_index = input_args->buffer_index;
    int *active_requests =
        &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests);

    int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration;
    int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status;
    int leading_dim, buff_idx, idx;

    int i, j, probe;
    int knt;
    int src;
    int recv_offset, recv_len;

    int pow_k, tree_order;
    int max_requests = 0; /* important to initialize this */

    int matched = 0;
    int64_t sequence_number=input_args->sequence_num;
    int my_rank = bcol_module->super.sbgp_partner_module->my_index;
    int buff_offset = bcol_module->super.hier_scather_offset;


    int pack_len = input_args->count * input_args->dtype->super.size;

    void *data_addr = (void*)(
        (unsigned char *) input_args->sbuf +
        (size_t) input_args->sbuf_offset);
    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    volatile char *peer_data_pointer;

    /* control structures */
    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
    volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer;

#if 0
    fprintf(stderr,"entering p2p allgather pack_len %d\n",pack_len);
#endif
    /* initialize the iteration counter */
    buff_idx = input_args->src_desc->buffer_index;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;
    idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
    data_buffs=(volatile mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs+idx;

    /* Set pointer to current proc ctrl region */
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;
    /* NTH: copied from progress */
    flag_offset = my_ctl_pointer->starting_flag_value[bcol_id];

    /* initialize headers and ready flag */
    BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);

    /* initialize these */
    *iteration = 0;
    *active_requests = 0;
    *status = 0;

    /* k-nomial parameters */
    tree_order = exchange_node->tree_order;
    pow_k = exchange_node->log_tree_order;

    /* calculate the maximum number of requests:
     * at each level each rank communicates with
     * at most (k - 1) peers,
     * so if we set k - 1 bit fields in "max_requests", then
     * we have max_requests == 2^(k - 1) - 1
     */
    for(i = 0; i < (tree_order - 1); i++){
        max_requests ^=  (1<<i);
    }
    /* let's begin the collective, starting with extra ranks and their
     * respective proxies
     */

    if( EXTRA_NODE == exchange_node->node_type ) {

        /* then I will signal to my proxy rank*/
        my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag;
        ready_flag = flag_offset + 1 + pow_k + 2;
        /* now, poll for completion */
        src = exchange_node->rank_extra_sources_array[0];
        peer_data_pointer = data_buffs[src].payload;
        peer_ctl_pointer = data_buffs[src].ctl_struct;

        /* calculate the offset */
        knt = 0;
        for(i = 0; i < group_size; i++){
            knt += list_connected[i];
        }
        for( i = 0; i < cm->num_to_probe && (0 == matched); i++ ) {
            if(IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)){
                matched = 1;
                /* we receive the entire message */
                memcpy((void *)((unsigned char *) data_addr + buff_offset),
                       (void *) ((unsigned char *) peer_data_pointer + buff_offset),
                       knt * pack_len);

                goto FINISHED;
            }

        }

        /* save state and bail */
        *iteration = -1;
        return BCOL_FN_STARTED;

    } else if (0 < exchange_node->n_extra_sources) {

        /* I am a proxy for someone */
        src = exchange_node->rank_extra_sources_array[0];
        peer_data_pointer = data_buffs[src].payload;
        peer_ctl_pointer = data_buffs[src].ctl_struct;


        knt = 0;
        for(i = 0; i < src; i++){
            knt += list_connected[i];
        }

        /* probe for extra rank's arrival */
        for( i = 0; i < cm->num_to_probe && ( 0 == matched); i++) {
            if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)){
                matched = 1;
                /* copy it in */
                memcpy((void *)((unsigned char *) data_addr + knt*pack_len),
                       (void *) ((unsigned char *) peer_data_pointer + knt*pack_len),
                       pack_len * list_connected[src]);
                goto MAIN_PHASE;
            }
        }
        *status = ready_flag;
        *iteration = -1;
        return BCOL_FN_STARTED;


    }

MAIN_PHASE:
    /* bump the ready flag */
    ready_flag++;


    /* we start the recursive k - ing phase */
    for( *iteration = 0; *iteration < pow_k; (*iteration)++) {
        /* announce my arrival */
        opal_atomic_wmb ();
        my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag;
        /* calculate the number of active requests */
        CALC_ACTIVE_REQUESTS(active_requests,exchange_node->rank_exchanges[*iteration],tree_order);
        /* Now post the recv's */
        for( j = 0; j < (tree_order - 1); j++ ) {

            /* recv phase */
            src = exchange_node->rank_exchanges[*iteration][j];

            if( src < 0 ) {
                /* then not a valid rank, continue */

                continue;
            }

            peer_data_pointer = data_buffs[src].payload;
            peer_ctl_pointer = data_buffs[src].ctl_struct;
            if( !(*active_requests&(1<<j))) {
                /* then the bit hasn't been set, thus this peer
                 * hasn't been processed at this level
                 */
                recv_offset = exchange_node->payload_info[*iteration][j].r_offset * pack_len;
                recv_len = exchange_node->payload_info[*iteration][j].r_len * pack_len;
                /* post the receive */
                /* I am putting the probe loop as the inner most loop to achieve
                 * better temporal locality
                 */
                matched = 0;
                for( probe = 0; probe < cm->num_to_probe && (0 == matched); probe++){
                    if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)){
                        matched = 1;
                        /* set this request's bit */
                        *active_requests ^= (1<<j);
                        /* get the data */
                        memcpy((void *)((unsigned char *) data_addr + recv_offset),
                               (void *)((unsigned char *) peer_data_pointer + recv_offset),
                               recv_len);
                    }
                }
            }


        }
        if( max_requests == *active_requests ){
            /* bump the ready flag */
            ready_flag++;
            /*reset the active requests */
            *active_requests = 0;
        } else {
            /* save state and hop out
             * only the iteration needs to be tracked
             */
            *status = my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id];
            return BCOL_FN_STARTED;
        }
    }

    /* bump the flag one more time for the extra rank */
    ready_flag = flag_offset + 1 + pow_k + 2;

    /* finish off the last piece, send the data back to the extra  */
    if( 0 < exchange_node->n_extra_sources ) {
        /* simply announce my arrival */
        opal_atomic_wmb ();
        my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag;

    }

FINISHED:
    /* bump this up */
    my_ctl_pointer->starting_flag_value[bcol_id]++;
    return BCOL_FN_COMPLETE;
}
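The max_requests mask built near the top of this function sets bits 0 through k - 2, i.e. 2^(k-1) - 1, one bit per peer at a level; comparing *active_requests against it detects when a level is complete. A standalone sketch of that arithmetic (values are illustrative):

#include <stdio.h>

int main(void)
{
    int tree_order = 3;  /* k, illustrative */
    int max_requests = 0;

    /* set the k - 1 low bits, yielding 2^(k-1) - 1 */
    for (int i = 0; i < tree_order - 1; i++) {
        max_requests ^= (1 << i);
    }
    /* for k = 3 this prints 3 (binary 11): one bit per peer per level */
    printf("max_requests = %d\n", max_requests);
    return 0;
}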
Example no. 26
/* look up the remote pointer in the peer rcache and attach if
 * necessary */
mca_rcache_base_registration_t *vader_get_registation (struct mca_btl_base_endpoint_t *ep, void *rem_ptr,
                                                       size_t size, int flags, void **local_ptr)
{
    mca_rcache_base_vma_module_t *vma_module = mca_btl_vader_component.vma_module;
    uint64_t attach_align = 1 << mca_btl_vader_component.log_attach_align;
    mca_rcache_base_registration_t *reg = NULL;
    vader_check_reg_ctx_t check_ctx = {.ep = ep, .reg = &reg, .vma_module = vma_module};
    xpmem_addr_t xpmem_addr;
    uintptr_t base, bound;
    int rc;

    base = OPAL_DOWN_ALIGN((uintptr_t) rem_ptr, attach_align, uintptr_t);
    bound = OPAL_ALIGN((uintptr_t) rem_ptr + size - 1, attach_align, uintptr_t) + 1;
    if (OPAL_UNLIKELY(bound > VADER_MAX_ADDRESS)) {
        bound = VADER_MAX_ADDRESS;
    }

    check_ctx.base = base;
    check_ctx.bound = bound;

    /* several segments may match the base pointer */
    rc = mca_rcache_base_vma_iterate (vma_module, (void *) base, bound - base, vader_check_reg, &check_ctx);
    if (2 == rc) {
        /* start the new segment from the lower of the two bases */
        base = (uintptr_t) reg->base < base ? (uintptr_t) reg->base : base;

        if (OPAL_LIKELY(0 == opal_atomic_add_32 (&reg->ref_count, -1))) {
            /* this pointer is not in use */
            (void) xpmem_detach (reg->rcache_context);
            OBJ_RELEASE(reg);
        }

        reg = NULL;
    }

    if (NULL == reg) {
        reg = OBJ_NEW(mca_rcache_base_registration_t);
        if (OPAL_LIKELY(NULL != reg)) {
            /* stick around for awhile */
            reg->ref_count = 2;
            reg->base  = (unsigned char *) base;
            reg->bound = (unsigned char *) bound;
            reg->flags = flags;
            reg->alloc_base = (void *) (intptr_t) ep->peer_smp_rank;

#if defined(HAVE_SN_XPMEM_H)
            xpmem_addr.id     = ep->segment_data.xpmem.apid;
#else
            xpmem_addr.apid   = ep->segment_data.xpmem.apid;
#endif
            xpmem_addr.offset = base;

            reg->rcache_context = xpmem_attach (xpmem_addr, bound - base, NULL);
            if (OPAL_UNLIKELY((void *)-1 == reg->rcache_context)) {
                OBJ_RELEASE(reg);
                return NULL;
            }

            opal_memchecker_base_mem_defined (reg->rcache_context, bound - base);

            mca_rcache_base_vma_insert (vma_module, reg, 0);
        }
    }

    opal_atomic_wmb ();
    *local_ptr = (void *) ((uintptr_t) reg->rcache_context +
                           (ptrdiff_t)((uintptr_t) rem_ptr - (uintptr_t) reg->base));

    return reg;
}
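The reference drop in example 26 (opal_atomic_add_32 returning the new count, with detach and release only at zero) is the classic last-reference-cleans-up pattern. A hedged sketch of the same pattern with C11 atomics follows; the registration_t type and the detach hook are hypothetical stand-ins.

#include <stdatomic.h>
#include <stdlib.h>

typedef struct registration {
    atomic_int ref_count;
    void (*detach)(struct registration *);  /* hypothetical cleanup hook */
} registration_t;

/* drop one reference; the caller that takes the count to zero performs
 * the cleanup, so teardown runs exactly once.  note: atomic_fetch_sub
 * returns the *previous* value, while opal_atomic_add_32 above returns
 * the new one, hence the comparison against 1 here */
static void registration_release(registration_t *reg)
{
    if (1 == atomic_fetch_sub(&reg->ref_count, 1)) {
        reg->detach(reg);
        free(reg);
    }
}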
Example no. 27
/*
 * Copy all the attributes from one MPI object to another.  Called
 * when MPI objects are copied (e.g., back-end actions to
 * MPI_COMM_DUP).
 */
int ompi_attr_copy_all(ompi_attribute_type_t type, void *old_object,
                       void *new_object, opal_hash_table_t *oldattr_hash,
                       opal_hash_table_t *newattr_hash)
{
    int ret;
    int err;
    uint32_t key;
    int flag;
    void *node, *in_node;
    attribute_value_t *old_attr, *new_attr;
    ompi_attribute_keyval_t *hash_value;

    /* If there's nothing to do, just return */
    if (NULL == oldattr_hash) {
        return MPI_SUCCESS;
    }

    OPAL_THREAD_LOCK(&attribute_lock);

    /* Get the first attribute in the object's hash */
    ret = opal_hash_table_get_first_key_uint32(oldattr_hash, &key,
                                               (void **) &old_attr,
                                               &node);

    /* While we still have some attribute in the object's key hash */
    while (OMPI_SUCCESS == ret) {
        in_node = node;

        /* Get the keyval in the main keyval hash - so that we know
           what the copy_attr_fn is */
        err = opal_hash_table_get_value_uint32(keyval_hash, key,
                                               (void **) &hash_value);
        if (OMPI_SUCCESS != err) {
            /* This should not happen! */
            ret = MPI_ERR_INTERN;
            goto out;
        }

        err = 0;
        new_attr = OBJ_NEW(attribute_value_t);
        switch (type) {
        case COMM_ATTR:
            /* Now call the copy_attr_fn */
            COPY_ATTR_CALLBACKS(communicator, old_object, hash_value,
                                old_attr, new_object, new_attr, err);
            break;

        case TYPE_ATTR:
            /* Now call the copy_attr_fn */
            COPY_ATTR_CALLBACKS(datatype, old_object, hash_value,
                                old_attr, new_object, new_attr, err);
            break;

        case WIN_ATTR:
            /* Now call the copy_attr_fn */
            COPY_ATTR_CALLBACKS(win, old_object, hash_value,
                                old_attr, new_object, new_attr, err);
            break;

        default:
            /* This should not happen */
            assert(0);
            break;
        }
        /* Did the callback return non-MPI_SUCCESS? */
        if (0 != err) {
            ret = err;
            goto out;
        }

        /* Hang this off the object's hash */

        /* The COPY_ATTR_CALLBACKS macro will have converted the
           _flag_ callback output value from Fortran's .TRUE. value to
           0/1 (if necessary).  So we only need to check for 0/1 here
           -- not .TRUE. */
        if (1 == flag) {
            if (0 != (hash_value->attr_flag & OMPI_KEYVAL_F77)) {
                if (0 != (hash_value->attr_flag & OMPI_KEYVAL_F77_INT)) {
                    new_attr->av_set_from = OMPI_ATTRIBUTE_FINT;
                } else {
                    new_attr->av_set_from = OMPI_ATTRIBUTE_AINT;
                }
            } else {
                new_attr->av_set_from = OMPI_ATTRIBUTE_C;
            }
            ret = set_value(type, new_object, &newattr_hash, key,
                            new_attr, true);
            if (MPI_SUCCESS != ret) {
                goto out;
            }
        } else {
            OBJ_RELEASE(new_attr);
        }

        ret = opal_hash_table_get_next_key_uint32(oldattr_hash, &key,
                                                  (void **) &old_attr,
                                                  in_node, &node);
    }
    ret = MPI_SUCCESS;

 out:
    /* All done */
    opal_atomic_wmb();
    OPAL_THREAD_UNLOCK(&attribute_lock);
    return ret;
}
Example no. 28
int ompi_datatype_get_pack_description( ompi_datatype_t* datatype,
                                        const void** packed_buffer )
{
    ompi_datatype_args_t* args = (ompi_datatype_args_t*)datatype->args;
    int next_index = OMPI_DATATYPE_MAX_PREDEFINED;
    void *packed_description = datatype->packed_description;
    void* recursive_buffer;

    if (NULL == packed_description) {
        if (opal_atomic_cmpset (&datatype->packed_description, NULL, (void *) 1)) {
            if( ompi_datatype_is_predefined(datatype) ) {
                packed_description = malloc(2 * sizeof(int));
            } else if( NULL == args ) {
                return OMPI_ERROR;
            } else {
                packed_description = malloc(args->total_pack_size);
            }
            recursive_buffer = packed_description;
            __ompi_datatype_pack_description( datatype, &recursive_buffer, &next_index );

            if (!ompi_datatype_is_predefined(datatype)) {
                args->total_pack_size = (uintptr_t)((char*)recursive_buffer - (char *) packed_description);
            }

            opal_atomic_wmb ();
            datatype->packed_description = packed_description;
        } else {
            /* another thread beat us to it */
            packed_description = datatype->packed_description;
        }
    }

    if ((void *) 1 == packed_description) {
        struct timespec interval = {.tv_sec = 0, .tv_nsec = 1000};

        /* wait until the packed description is updated */
        while ((void *) 1 == datatype->packed_description) {
            nanosleep (&interval, NULL);
        }

        packed_description = datatype->packed_description;
    }

    *packed_buffer = (const void *) packed_description;
    return OMPI_SUCCESS;
}
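ompi_datatype_get_pack_description implements a publish-once protocol: the sentinel (void *) 1 marks the slot as being built, the compare-and-swap winner fills it in, and losers spin until the real pointer lands. A minimal sketch of that protocol with C11 atomics; get_or_build and the build callback are illustrative names.

#include <stdatomic.h>

#define IN_PROGRESS ((void *) 1)  /* sentinel: someone is building the value */

/* sketch: return the published value, building it at most once.  the
 * compare-and-swap winner installs the sentinel, builds, then publishes;
 * everyone else spins until the real pointer replaces the sentinel */
static void *get_or_build(_Atomic(void *) *slot, void *(*build)(void))
{
    void *expected = NULL;
    void *val = atomic_load(slot);

    if (NULL == val &&
        atomic_compare_exchange_strong(slot, &expected, IN_PROGRESS)) {
        val = build();
        atomic_store(slot, val);  /* publish; seq_cst store implies release */
        return val;
    }

    while (IN_PROGRESS == (val = atomic_load(slot))) {
        /* spin; the original nanosleep()s between probes */
    }
    return val;
}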

size_t ompi_datatype_pack_description_length( ompi_datatype_t* datatype )
{
    void *packed_description = datatype->packed_description;

    if( ompi_datatype_is_predefined(datatype) ) {
        return 2 * sizeof(int);
    }
    if( NULL == packed_description || (void *) 1 == packed_description) {
        const void* buf;
        int rc;

        rc = ompi_datatype_get_pack_description(datatype, &buf);
        if( OMPI_SUCCESS != rc ) {
            return 0;
        }
    }
    assert( NULL != (ompi_datatype_args_t*)datatype->args );
    assert( NULL != (ompi_datatype_args_t*)datatype->packed_description );
    return ((ompi_datatype_args_t*)datatype->args)->total_pack_size;
}
Example no. 29
int mca_btl_sm_add_procs(
    struct mca_btl_base_module_t* btl,
    size_t nprocs,
    struct ompi_proc_t **procs,
    struct mca_btl_base_endpoint_t **peers,
    opal_bitmap_t* reachability)
{
    int return_code = OMPI_SUCCESS;
    int32_t n_local_procs = 0, proc, j, my_smp_rank = -1;
    ompi_proc_t* my_proc; /* pointer to caller's proc structure */
    mca_btl_sm_t *sm_btl;
    bool have_connected_peer = false;
    char **bases;
    /* for easy access to the mpool_sm_module */
    mca_mpool_sm_module_t *sm_mpool_modp = NULL;

    /* initialization */

    sm_btl = (mca_btl_sm_t *)btl;

    /* get pointer to my proc structure */
    if(NULL == (my_proc = ompi_proc_local()))
        return OMPI_ERR_OUT_OF_RESOURCE;

    /* Get unique host identifier for each process in the list,
     * and identify procs that are on this host.  Add procs on this
     * host to the shared memory reachability list.  Also, get the
     * number of local procs in the procs list. */
    for (proc = 0; proc < (int32_t)nprocs; proc++) {
        /* check to see if this proc can be reached via shmem (i.e.,
           if they're on my local host and in my job) */
        if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid ||
            !OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) {
            peers[proc] = NULL;
            continue;
        }

        /* check to see if this is me */
        if(my_proc == procs[proc]) {
            my_smp_rank = mca_btl_sm_component.my_smp_rank = n_local_procs++;
            continue;
        }

        /* sm doesn't support heterogeneous yet... */
        if (procs[proc]->proc_arch != my_proc->proc_arch) {
            continue;
        }

        /* we have someone to talk to */
        have_connected_peer = true;

        if(!(peers[proc] = create_sm_endpoint(n_local_procs, procs[proc]))) {
            return_code = OMPI_ERROR;
            goto CLEANUP;
        }
        n_local_procs++;

        /* add this proc to shared memory accessibility list */
        return_code = opal_bitmap_set_bit(reachability, proc);
        if(OMPI_SUCCESS != return_code)
            goto CLEANUP;
    }

    /* jump out if there's not someone we can talk to */
    if (!have_connected_peer)
        goto CLEANUP;

    /* make sure that my_smp_rank has been defined */
    if (-1 == my_smp_rank) {
        return_code = OMPI_ERROR;
        goto CLEANUP;
    }

    if (!sm_btl->btl_inited) {
        return_code =
            sm_btl_first_time_init(sm_btl, my_smp_rank,
                                   mca_btl_sm_component.sm_max_procs);
        if (return_code != OMPI_SUCCESS) {
            goto CLEANUP;
        }
    }

    /* set local proc's smp rank in the peers structure for
     * rapid access and calculate reachability */
    for(proc = 0; proc < (int32_t)nprocs; proc++) {
        if(NULL == peers[proc])
            continue;
        mca_btl_sm_component.sm_peers[peers[proc]->peer_smp_rank] = peers[proc];
        peers[proc]->my_smp_rank = my_smp_rank;
    }

    bases = mca_btl_sm_component.shm_bases;
    sm_mpool_modp = (mca_mpool_sm_module_t *)mca_btl_sm_component.sm_mpool;

    /* initialize own FIFOs */
    /*
     * The receiver initializes all its FIFOs.  All components will
     * be allocated near the receiver.  Nothing will be local to
     * "the sender" since there will be many senders.
     */
    for(j = mca_btl_sm_component.num_smp_procs;
        j < mca_btl_sm_component.num_smp_procs + FIFO_MAP_NUM(n_local_procs); j++) {

        return_code = sm_fifo_init( mca_btl_sm_component.fifo_size,
                                    mca_btl_sm_component.sm_mpool,
                                   &mca_btl_sm_component.fifo[my_smp_rank][j],
                                    mca_btl_sm_component.fifo_lazy_free);
        if(return_code != OMPI_SUCCESS)
            goto CLEANUP;
    }

    opal_atomic_wmb();

    /* Sync with other local procs. Force the FIFO initialization to always
     * happen before the readers access it.
     */
    opal_atomic_add_32(&mca_btl_sm_component.sm_seg->module_seg->seg_inited, 1);
    while( n_local_procs >
           mca_btl_sm_component.sm_seg->module_seg->seg_inited) {
        opal_progress();
        opal_atomic_rmb();
    }

    /* it is now safe to unlink the shared memory segment. only one process
     * needs to do this, so just let smp rank zero take care of it. */
    if (0 == my_smp_rank) {
        if (OMPI_SUCCESS !=
            mca_common_sm_module_unlink(mca_btl_sm_component.sm_seg)) {
            /* it is "okay" if this fails at this point. we have gone this far,
             * so just warn about the failure and continue. this is probably
             * only triggered by a programming error. */
            opal_output(0, "WARNING: common_sm_module_unlink failed.\n");
        }
        /* SKG - another abstraction violation here, but I don't want to add
         * extra code in the sm mpool for further synchronization. */

        /* at this point, all processes have attached to the mpool segment. so
         * it is safe to unlink it here. */
        if (OMPI_SUCCESS !=
            mca_common_sm_module_unlink(sm_mpool_modp->sm_common_module)) {
            opal_output(0, "WARNING: common_sm_module_unlink failed.\n");
        }
        if (-1 == unlink(mca_btl_sm_component.sm_mpool_rndv_file_name)) {
            opal_output(0, "WARNING: %s unlink failed.\n",
                        mca_btl_sm_component.sm_mpool_rndv_file_name);
        }
        if (-1 == unlink(mca_btl_sm_component.sm_rndv_file_name)) {
            opal_output(0, "WARNING: %s unlink failed.\n",
                        mca_btl_sm_component.sm_rndv_file_name);
        }
    }

    /* free up some space used by the name buffers */
    free(mca_btl_sm_component.sm_mpool_ctl_file_name);
    free(mca_btl_sm_component.sm_mpool_rndv_file_name);
    free(mca_btl_sm_component.sm_ctl_file_name);
    free(mca_btl_sm_component.sm_rndv_file_name);

    /* coordinate with other processes */
    for(j = mca_btl_sm_component.num_smp_procs;
        j < mca_btl_sm_component.num_smp_procs + n_local_procs; j++) {
        ptrdiff_t diff;

        /* spin until this element is allocated */
        /* doesn't really wait for that process... FIFO might be allocated, but not initialized */
        opal_atomic_rmb();
        while(NULL == mca_btl_sm_component.shm_fifo[j]) {
            opal_progress();
            opal_atomic_rmb();
        }

        /* Calculate the difference as (my_base - their_base) */
        diff = ADDR2OFFSET(bases[my_smp_rank], bases[j]);

        /* store local address of remote fifos */
        mca_btl_sm_component.fifo[j] =
            (sm_fifo_t*)OFFSET2ADDR(diff, mca_btl_sm_component.shm_fifo[j]);

        /* cache local copy of peer memory node number */
        mca_btl_sm_component.mem_nodes[j] = mca_btl_sm_component.shm_mem_nodes[j];
    }

    /* update the local smp process count */
    mca_btl_sm_component.num_smp_procs += n_local_procs;

    /* make sure we have enough eager fragments for each process */
    return_code = ompi_free_list_resize_mt(&mca_btl_sm_component.sm_frags_eager,
                                           mca_btl_sm_component.num_smp_procs * 2);
    if (OMPI_SUCCESS != return_code)
        goto CLEANUP;

CLEANUP:
    return return_code;
}
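The seg_inited handshake above is an arrival counter in shared memory: each process atomically bumps it and then spins, with a read barrier, until all local peers have checked in. A hedged sketch of the same counter using C11 atomics, assuming the counter lives in a mapping shared by all n_procs processes:

#include <stdatomic.h>

/* sketch: block until all n_procs processes have called this on the
 * same shared counter.  the release increment orders prior writes
 * (e.g., FIFO initialization) before the acquire loads of the peers,
 * mirroring the wmb/rmb pair in the code above */
static void shared_arrival_barrier(_Atomic int *counter, int n_procs)
{
    atomic_fetch_add_explicit(counter, 1, memory_order_release);
    while (atomic_load_explicit(counter, memory_order_acquire) < n_procs) {
        /* spin; a real implementation would also drive opal_progress() */
    }
}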
Example no. 30
/**
 * Shared memory broadcast.
 *
 * For the root, the general algorithm is to wait for a set of
 * segments to become available.  Once it is, the root claims the set
 * by writing the current operation number and the number of processes
 * using the set to the flag.  The root then loops over the set of
 * segments; for each segment, it copies a fragment of the user's
 * buffer into the shared data segment and then writes the data size
 * into its children's control buffers.  The process is repeated until
 * all fragments have been written.
 *
 * For non-roots, for each set of buffers, they wait until the current
 * operation number appears in the in-use flag (i.e., written by the
 * root).  Then for each segment, they wait for a nonzero to appear
 * in their control buffers.  If they have children, they copy the
 * data from their parent's shared data segment into their shared data
 * segment, and write the data size into each of their children's
 * control buffers.  They then copy the data from their shared [local]
 * data segment into the user's output buffer.  The process is
 * repeated until all fragments have been received.  If they do not
 * have children, they copy the data directly from the parent's shared
 * data segment into the user's output buffer.
 */
int mca_coll_sm_bcast_intra(void *buff, int count,
                            struct ompi_datatype_t *datatype, int root,
                            struct ompi_communicator_t *comm,
                            mca_coll_base_module_t *module)
{
    struct iovec iov;
    mca_coll_sm_module_t *sm_module = (mca_coll_sm_module_t*) module;
    mca_coll_sm_comm_t *data;
    int i, ret, rank, size, num_children, src_rank;
    int flag_num, segment_num, max_segment_num;
    int parent_rank;
    size_t total_size, max_data, bytes;
    mca_coll_sm_in_use_flag_t *flag;
    opal_convertor_t convertor;
    mca_coll_sm_tree_node_t *me, *parent, **children;
    mca_coll_sm_data_index_t *index;

    /* Lazily enable the module the first time we invoke a collective
       on it */
    if (!sm_module->enabled) {
        if (OMPI_SUCCESS != (ret = ompi_coll_sm_lazy_enable(module, comm))) {
            return ret;
        }
    }
    data = sm_module->sm_comm_data;

    /* Setup some identities */

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    OBJ_CONSTRUCT(&convertor, opal_convertor_t);
    iov.iov_len = mca_coll_sm_component.sm_fragment_size;
    bytes = 0;

    me = &data->mcb_tree[(rank + size - root) % size];
    parent = me->mcstn_parent;
    children = me->mcstn_children;
    num_children = me->mcstn_num_children;

    /* Only have one top-level decision as to whether I'm the root or
       not.  Do this at the slight expense of repeating a little logic
       -- but it's better than a conditional branch in every loop
       iteration. */

    /*********************************************************************
     * Root
     *********************************************************************/

    if (root == rank) {

        /* The root needs a send convertor to pack from the user's
           buffer to shared memory */

        if (OMPI_SUCCESS !=
            (ret =
             opal_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor,
                                                      &(datatype->super),
                                                      count,
                                                      buff,
                                                      0,
                                                      &convertor))) {
            return ret;
        }
        opal_convertor_get_packed_size(&convertor, &total_size);

        /* Main loop over sending fragments */

        do {
            flag_num = (data->mcb_operation_count++ %
                        mca_coll_sm_component.sm_comm_num_in_use_flags);

            FLAG_SETUP(flag_num, flag, data);
            FLAG_WAIT_FOR_IDLE(flag, bcast_root_label);
            FLAG_RETAIN(flag, size - 1, data->mcb_operation_count - 1);

            /* Loop over all the segments in this set */

            segment_num =
                flag_num * mca_coll_sm_component.sm_segs_per_inuse_flag;
            max_segment_num =
                (flag_num + 1) * mca_coll_sm_component.sm_segs_per_inuse_flag;
            do {
                index = &(data->mcb_data_index[segment_num]);

                /* Copy the fragment from the user buffer to my fragment
                   in the current segment */
                max_data = mca_coll_sm_component.sm_fragment_size;
                COPY_FRAGMENT_IN(convertor, index, rank, iov, max_data);
                bytes += max_data;

                /* Wait for the write to absolutely complete */
                opal_atomic_wmb();

                /* Tell my children that this fragment is ready */
                PARENT_NOTIFY_CHILDREN(children, num_children, index,
                                       max_data);

                ++segment_num;
            } while (bytes < total_size && segment_num < max_segment_num);
        } while (bytes < total_size);
    }

    /*********************************************************************
     * Non-root
     *********************************************************************/

    else {

        /* Non-root processes need a receive convertor to unpack from
           shared memory to the user's buffer */

        if (OMPI_SUCCESS !=
            (ret =
             opal_convertor_copy_and_prepare_for_recv(ompi_mpi_local_convertor,
                                                      &(datatype->super),
                                                      count,
                                                      buff,
                                                      0,
                                                      &convertor))) {
            return ret;
        }
        opal_convertor_get_packed_size(&convertor, &total_size);

        /* Loop over receiving (and possibly re-sending) the
           fragments */

        do {
            flag_num = (data->mcb_operation_count %
                        mca_coll_sm_component.sm_comm_num_in_use_flags);

            /* Wait for the root to mark this set of segments as
               ours */
            FLAG_SETUP(flag_num, flag, data);
            FLAG_WAIT_FOR_OP(flag, data->mcb_operation_count, bcast_nonroot_label1);
            ++data->mcb_operation_count;

            /* Loop over all the segments in this set */

            segment_num =
                flag_num * mca_coll_sm_component.sm_segs_per_inuse_flag;
            max_segment_num =
                (flag_num + 1) * mca_coll_sm_component.sm_segs_per_inuse_flag;
            do {

                /* Pre-calculate some values */
                parent_rank = (parent->mcstn_id + root) % size;
                index = &(data->mcb_data_index[segment_num]);

                /* Wait for my parent to tell me that the segment is ready */
                CHILD_WAIT_FOR_NOTIFY(rank, index, max_data, bcast_nonroot_label2);

                /* If I have children, send the data to them */
                if (num_children > 0) {
                    /* Copy the fragment from the parent's portion in
                       the segment to my portion in the segment. */
                    COPY_FRAGMENT_BETWEEN(parent_rank, rank, index, max_data);

                    /* Wait for the write to absolutely complete */
                    opal_atomic_wmb();

                    /* Tell my children that this fragment is ready */
                    PARENT_NOTIFY_CHILDREN(children, num_children, index,
                                           max_data);

                    /* Set the "copy from buffer" to be my local
                       segment buffer so that we don't potentially
                       incur a non-local memory copy from the parent's
                       fan out data segment [again] when copying to
                       the user's buffer */
                    src_rank = rank;
                }

                /* If I don't have any children, set the "copy from
                   buffer" to be my parent's fan out segment to copy
                   directly from my parent */

                else {
                    src_rank = parent_rank;
                }

                /* Copy to my output buffer */
                COPY_FRAGMENT_OUT(convertor, src_rank, index, iov, max_data);

                bytes += max_data;
                ++segment_num;
            } while (bytes < total_size && segment_num < max_segment_num);

            /* Wait for all copy-out writes to complete before I say
               I'm done with the segments */
            opal_atomic_wmb();

            /* We're finished with this set of segments */
            FLAG_RELEASE(flag);
        } while (bytes < total_size);
    }

    /* Kill the convertor */

    OBJ_DESTRUCT(&convertor);

    /* All done */

    return OMPI_SUCCESS;
}
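Both branches above map the operation count onto an in-use flag and that flag's contiguous block of segments: flag_num = op_count % num_flags, and the flag owns segments [flag_num * segs_per_flag, (flag_num + 1) * segs_per_flag). A tiny standalone sketch of the indexing; the constants are illustrative, not the component defaults.

#include <stdio.h>

int main(void)
{
    int num_in_use_flags = 2;  /* stand-in for sm_comm_num_in_use_flags */
    int segs_per_flag    = 8;  /* stand-in for sm_segs_per_inuse_flag */

    for (int op_count = 0; op_count < 4; op_count++) {
        int flag_num  = op_count % num_in_use_flags;
        int seg_first = flag_num * segs_per_flag;
        int seg_limit = (flag_num + 1) * segs_per_flag;
        /* successive operations alternate between the two flag sets */
        printf("op %d -> flag %d, segments [%d, %d)\n",
               op_count, flag_num, seg_first, seg_limit);
    }
    return 0;
}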