/*
 * Allreduce
 *
 * Function:  - allreduce
 * Accepts:   - same as MPI_Allreduce()
 * Returns:   - OMPI_SUCCESS, OMPI_ERROR, or whatever the previously
 *              selected allreduce returns when the call is handed over
 *
 * The FCA path is attempted first.  The call is delegated to the saved
 * allreduce when the reduce spec cannot be filled for this
 * count/datatype/op, or when FCA answers with -EUSEMPI.
 */
int mca_coll_fca_allreduce(void *sbuf, void *rbuf, int count,
                           struct ompi_datatype_t *dtype,
                           struct ompi_op_t *op,
                           struct ompi_communicator_t *comm,
                           mca_coll_base_module_t *module)
{
    mca_coll_fca_module_t *self = (mca_coll_fca_module_t*)module;
    fca_reduce_spec_t reduce_spec;
    int status;

    reduce_spec.sbuf = sbuf;
    reduce_spec.rbuf = rbuf;

    if (OMPI_SUCCESS == mca_coll_fca_fill_reduce_spec(count, dtype, op, &reduce_spec,
                                                      self->fca_comm_caps.max_payload)) {
        FCA_VERBOSE(5,"Using FCA Allreduce");
        status = mca_coll_fca_component.fca_ops.do_all_reduce(self->fca_comm, &reduce_spec);
        if (status >= 0) {
            return OMPI_SUCCESS;
        }
        if (-EUSEMPI != status) {
            /* a genuine FCA failure, not a request to use the fallback */
            FCA_ERROR("Allreduce failed: %s", mca_coll_fca_component.fca_ops.strerror(status));
            return OMPI_ERROR;
        }
    } else {
        FCA_VERBOSE(5, "Unsupported allreduce operation %s, using fallback\n", op->o_name);
    }

    /* fallback: the allreduce that was installed before this module */
    return self->previous_allreduce(sbuf, rbuf, count, dtype, op, comm,
                                    self->previous_allreduce_module);
}
/*
 * Builds the FCA communicator for an scoll module in three steps:
 * obtain the communicator descriptor (_fca_comm_new), run COMM_INIT, and
 * read the communicator capabilities into the module.
 *
 * Returns OSHMEM_SUCCESS, the error reported by _fca_comm_new(), or
 * OSHMEM_ERROR when COMM_INIT or GET_COMM_CAPS fails.
 */
static int _create_fca_comm(mca_scoll_fca_module_t *fca_module)
{
    int status;
    int group_size;
    int my_group_id;

    status = _fca_comm_new(fca_module);
    if (OSHMEM_SUCCESS != status) {
        return status;
    }

    /* allocate comm_init_spec */
    FCA_MODULE_VERBOSE(fca_module, 1,
                       "Starting COMM_INIT comm_id %d proc_idx %d num_procs %d",
                       fca_module->fca_comm_desc.comm_id,
                       fca_module->local_proc_idx,
                       fca_module->num_local_procs);

    group_size = fca_module->comm->proc_count;
    my_group_id = oshmem_proc_group_find_id(fca_module->comm, fca_module->rank);

    status = mca_scoll_fca_comm_init(mca_scoll_fca_component.fca_context,
                                     my_group_id,
                                     group_size,
                                     fca_module->local_proc_idx,
                                     fca_module->num_local_procs,
                                     &fca_module->fca_comm_desc,
                                     &fca_module->fca_comm);
    if (status < 0) {
        FCA_ERROR("COMM_INIT failed: %s", fca_strerror(status));
        return OSHMEM_ERROR;
    }

    /* get communicator capabilities */
    status = fca_comm_get_caps(fca_module->fca_comm, &fca_module->fca_comm_caps);
    if (status < 0) {
        FCA_ERROR("GET_COMM_CAPS failed: %s", fca_strerror(status));
        return OSHMEM_ERROR;
    }

    /* by this point every rank in the communicator is set up */
    FCA_MODULE_VERBOSE(fca_module, 1,
                       "Initialized FCA communicator, comm_id %d",
                       fca_module->fca_comm_desc.comm_id);
    return OSHMEM_SUCCESS;
}
/*
 * Destroys the module's FCA communicator handle.  Rank 0 additionally
 * issues COMM_END for the communicator id; a COMM_END failure is logged
 * and otherwise ignored.
 */
static void __destroy_fca_comm(mca_coll_fca_module_t *fca_module)
{
    fca_comm_destroy(fca_module->fca_comm);

    if (0 == fca_module->rank) {
        const int end_status = fca_comm_end(mca_coll_fca_component.fca_context,
                                            fca_module->fca_comm_desc.comm_id);
        if (end_status < 0) {
            FCA_ERROR("COMM_END failed: %s", fca_strerror(end_status));
        }
    }

    FCA_MODULE_VERBOSE(fca_module, 1,
                       "Destroyed FCA communicator, comm_id %d",
                       fca_module->fca_comm_desc.comm_id);
}
/*
 * Destructor for an FCA communicator wrapper.  Does nothing when the
 * wrapper holds no communicator; otherwise destroys the handle and, on
 * rank 0, issues COMM_END for the wrapped communicator id (a failure is
 * only logged).
 */
static void mca_coll_fca_comm_wrap_destruct(mca_coll_fca_comm_wrap_t *item)
{
    int end_status;

    if (NULL == item->fca_comm) {
        return;
    }

    fca_comm_destroy(item->fca_comm);

    if (0 != item->rank) {
        return;
    }

    end_status = fca_comm_end(mca_coll_fca_component.fca_context, item->comm_id);
    if (end_status < 0) {
        FCA_ERROR("COMM_END failed: %s", fca_strerror(end_status));
    }
}
/*
 * Destroys the scoll module's FCA communicator handle.  The PE found at
 * index root_id of the group additionally issues COMM_END, provided the
 * component still has an FCA context; a COMM_END failure is only logged.
 */
static void _destroy_fca_comm(mca_scoll_fca_module_t *fca_module)
{
    struct oshmem_group_t *group = fca_module->comm;
    const int root_pe = oshmem_proc_pe(group->proc_array[root_id]);

    fca_comm_destroy(fca_module->fca_comm);

    if (group->my_pe == root_pe) {
        if (mca_scoll_fca_component.fca_context) {
            int end_status = fca_comm_end(mca_scoll_fca_component.fca_context,
                                          fca_module->fca_comm_desc.comm_id);
            if (end_status < 0) {
                FCA_ERROR("COMM_END failed: %s", fca_strerror(end_status));
            }
        }
    }

    FCA_MODULE_VERBOSE(fca_module, 1,
                       "Destroyed FCA communicator, comm_id %d",
                       fca_module->fca_comm_desc.comm_id);
}
/*
 * Initialize the module on the communicator.
 *
 * Records the group and this PE in the module, then runs the setup steps
 * in order: load the FCA library, save the current collective handlers,
 * compute the local ranks, create the FCA communicator.  The first step
 * that does not return OSHMEM_SUCCESS stops the sequence and the job is
 * aborted.
 */
static int mca_scoll_fca_module_enable(mca_scoll_base_module_t *module,
                                       struct oshmem_group_t *comm)
{
    mca_scoll_fca_module_t *self = (mca_scoll_fca_module_t*) module;
    int status;

    self->comm = comm;
    self->rank = comm->my_pe;

    status = mca_scoll_fca_get_fca_lib(comm);
    if (OSHMEM_SUCCESS == status) {
        status = _save_coll_handlers(self);
    }
    if (OSHMEM_SUCCESS == status) {
        status = _get_local_ranks(self);
    }
    if (OSHMEM_SUCCESS == status) {
        status = _create_fca_comm(self);
    }

    if (OSHMEM_SUCCESS == status) {
        FCA_MODULE_VERBOSE(self, 1, "FCA Module initialized");
        return OMPI_SUCCESS;
    }

    /*
     * Other PEs may have enabled FCA successfully, in which case different
     * frameworks would end up being used for collective operations; abort
     * instead of continuing in that state.
     */
    FCA_ERROR("FCA module enable failed - aborting to prevent inconsistent application state");
    /* There are no modules available */
    opal_show_help("help-oshmem-scoll-fca.txt",
                   "module_enable:fatal",
                   true,
                   "FCA module enable failed - aborting to prevent inconsistent application state");
    oshmem_shmem_abort(-1);
    return OMPI_ERROR;
}
/*
 * Fills the local-rank information of an scoll module:
 *   - num_local_procs: number of group members flagged as being on the
 *     local node,
 *   - local_proc_idx:  position of this PE among those members,
 *   - local_ranks:     calloc'ed array of the group indices of those members.
 *
 * Returns OSHMEM_SUCCESS, or OSHMEM_ERROR when the array cannot be allocated.
 */
static int _get_local_ranks(mca_scoll_fca_module_t *fca_module)
{
    struct oshmem_group_t *group = fca_module->comm;
    oshmem_proc_t *peer;
    int idx;
    int next_slot = 0;

    /* First pass: count the local ranks and find our own position */
    fca_module->num_local_procs = 0;
    for (idx = 0; idx < group->proc_count; ++idx) {
        peer = group->proc_array[idx];
        if (!OPAL_PROC_ON_LOCAL_NODE(peer->super.proc_flags)) {
            continue;
        }
        if ((uint32_t) fca_module->rank == peer->super.proc_name.vpid) {
            fca_module->local_proc_idx = fca_module->num_local_procs;
        }
        ++fca_module->num_local_procs;
    }

    /* Second pass: record the group index of every local rank */
    fca_module->local_ranks = calloc(fca_module->num_local_procs,
                                     sizeof *fca_module->local_ranks);
    if (NULL == fca_module->local_ranks) {
        FCA_ERROR("Failed to allocate memory for %d local ranks",
                  fca_module->num_local_procs);
        return OSHMEM_ERROR;
    }

    for (idx = 0; idx < group->proc_count; ++idx) {
        peer = group->proc_array[idx];
        if (OPAL_PROC_ON_LOCAL_NODE(peer->super.proc_flags)) {
            fca_module->local_ranks[next_slot] = idx;
            ++next_slot;
        }
    }

    FCA_MODULE_VERBOSE(fca_module, 3, "i am %d/%d",
                       fca_module->local_proc_idx,
                       fca_module->num_local_procs);
    return OSHMEM_SUCCESS;
}
/*
 * Function:  - barrier
 * Returns:   - OMPI_SUCCESS, OMPI_ERROR, or the result of the previously
 *              selected barrier when FCA answers with -EUSEMPI
 */
int mca_coll_fca_barrier(struct ompi_communicator_t *comm,
                         mca_coll_base_module_t *module)
{
    mca_coll_fca_module_t *self = (mca_coll_fca_module_t*)module;
    int status;

    FCA_VERBOSE(5,"Using FCA Barrier");
    status = mca_coll_fca_component.fca_ops.do_barrier(self->fca_comm);

    if (status >= 0) {
        return OMPI_SUCCESS;
    }

    if (-EUSEMPI == status) {
        /* FCA asked for the fallback implementation */
        return self->previous_barrier(comm, self->previous_barrier_module);
    }

    FCA_ERROR("Barrier failed: %s", mca_coll_fca_component.fca_ops.strerror(status));
    return OMPI_ERROR;
}
/**
 * Fills local rank information in fca_module.
 *
 * Sets num_local_procs (ranks of the communicator flagged as being on the
 * local node), local_proc_idx (this rank's position among them) and
 * local_ranks (calloc'ed array holding those communicator ranks).
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR when the array cannot be allocated.
 */
static int __get_local_ranks(mca_coll_fca_module_t *fca_module)
{
    ompi_communicator_t *comm = fca_module->comm;
    const int nranks = ompi_comm_size(comm);
    ompi_proc_t *peer;
    int r;
    int filled = 0;

    /* Count the local ranks, remembering where this rank falls among them */
    fca_module->num_local_procs = 0;
    for (r = 0; r < nranks; ++r) {
        peer = __local_rank_lookup(comm, r);
        if (!OPAL_PROC_ON_LOCAL_NODE(peer->super.proc_flags)) {
            continue;
        }
        if (fca_module->rank == r) {
            fca_module->local_proc_idx = fca_module->num_local_procs;
        }
        ++fca_module->num_local_procs;
    }

    /* Make a list of local ranks */
    fca_module->local_ranks = calloc(fca_module->num_local_procs,
                                     sizeof *fca_module->local_ranks);
    if (NULL == fca_module->local_ranks) {
        FCA_ERROR("Failed to allocate memory for %d local ranks",
                  fca_module->num_local_procs);
        return OMPI_ERROR;
    }

    for (r = 0; r < nranks; ++r) {
        peer = __local_rank_lookup(comm, r);
        if (OPAL_PROC_ON_LOCAL_NODE(peer->super.proc_flags)) {
            fca_module->local_ranks[filled] = r;
            ++filled;
        }
    }

    FCA_MODULE_VERBOSE(fca_module, 3, "i am %d/%d",
                       fca_module->local_proc_idx,
                       fca_module->num_local_procs);
    return OMPI_SUCCESS;
}
/*
 * Collectively obtains a new FCA communicator descriptor for
 * fca_module->comm and stores it in fca_module->fca_comm_desc.
 *
 * Steps (all ranks of the communicator take part):
 *   1. node managers (local_proc_idx == 0) fetch their rank info;
 *   2. info sizes are gathered on rank 0, then the info itself (gatherv);
 *   3. rank 0 calls fca_comm_new() and its return value is broadcast;
 *   4. (FCA >= 3.0) the comm-init data size and data are distributed;
 *   5. the communicator descriptor is broadcast from rank 0;
 *   6. (FCA >= 3.0) COMM_INIT is run on every rank.
 *
 * Returns OMPI_SUCCESS, the error code of a failed collective, or
 * OMPI_ERROR when an FCA call or an allocation of the comm-init buffer
 * fails.  All temporary buffers are released on every exit path taken
 * after the collectives have started.
 *
 * NOTE(review): the rcounts/displs/all_info allocations on rank 0 are not
 * checked.  Returning early on rank 0 would leave the other ranks blocked
 * in the gather, so a failure has to be handled collectively - TODO.
 */
static int __fca_comm_new(mca_coll_fca_module_t *fca_module)
{
    ompi_communicator_t *comm = fca_module->comm;
    fca_comm_new_spec_t spec = {0,};
    int info_size, all_info_size;
    void *all_info = NULL;
    void *my_info = NULL;
    int *rcounts = NULL;
    int *displs = NULL;
    int i, rc, comm_size = ompi_comm_size(fca_module->comm);
    int ret = 0; /* only meaningful on rank 0 until the broadcast below */

    /* call fca_get_rank_info() on node managers only */
    if (fca_module->local_proc_idx == 0) {
#if OMPI_FCA_VERSION >= 30
        my_info = fca_get_rank_info(mca_coll_fca_component.fca_context,
                                    fca_module->rank, &info_size);
#else
        my_info = fca_get_rank_info(mca_coll_fca_component.fca_context,
                                    &info_size);
#endif
        if (!my_info) {
            FCA_ERROR("fca_get_rank_info returned NULL");
            return OMPI_ERROR;
        }
    } else {
        info_size = 0;
    }
    FCA_MODULE_VERBOSE(fca_module, 1, "Info size: %d", info_size);

    /* Allocate gather buffer on the root rank */
    if (fca_module->rank == 0) {
        rcounts = calloc(comm_size, sizeof *rcounts);
    }

    /* Get all rank info sizes using MPI_Gather */
    rc = comm->c_coll.coll_gather(&info_size, 1, MPI_INT, rcounts, 1, MPI_INT, 0,
                                  comm, comm->c_coll.coll_gather_module);
    if (rc != OMPI_SUCCESS) {
        goto out;
    }

    /* Allocate buffer for gathering rank information on rank0 */
    if (fca_module->rank == 0) {
        all_info_size = 0;
        displs = calloc(comm_size, sizeof *displs);
        for (i = 0; i < comm_size; ++i) {
            displs[i] = all_info_size;
            all_info_size += rcounts[i];
            /* ranks that contributed no info are not node managers */
            if (rcounts[i] > 0) {
                ++spec.rank_count;
            }
            FCA_MODULE_VERBOSE(fca_module, 1, "gatherv: rcounts[%d]=%d displs[%d]=%d",
                               i, rcounts[i], i, displs[i]);
        }
        FCA_MODULE_VERBOSE(fca_module, 1, "Total rank_info size: %d", all_info_size);
        all_info = calloc(all_info_size, 1);
    }

    /*
     * Send all node managers information to rank0 using MPI_Gatherv.
     * The gatherv function must be paired with the gatherv module (the
     * gather module belongs to coll_gather and may be a different component).
     */
    rc = comm->c_coll.coll_gatherv(my_info, info_size, MPI_BYTE,
                                   all_info, rcounts, displs, MPI_BYTE, 0,
                                   comm, comm->c_coll.coll_gatherv_module);
    if (rc != OMPI_SUCCESS) {
        FCA_ERROR("Failed to gather rank information to rank0: %d", rc);
        goto out;
    }

    /* Rank0 calls fca_comm_new() and fills the fca_comm_spec fields */
    if (fca_module->rank == 0) {
        spec.rank_info = all_info;
        spec.is_comm_world = comm == MPI_COMM_WORLD;

        free(displs);
        displs = NULL;
        free(rcounts);
        rcounts = NULL;
#if OMPI_FCA_VERSION >= 30
        spec.comm_size = comm_size;
        spec.comm_init_data = NULL;
        spec.comm_init_data_size = 0;
#endif
        FCA_MODULE_VERBOSE(fca_module, 1, "starting fca_comm_new(), rank_count: %d",
                           spec.rank_count);

        ret = fca_comm_new(mca_coll_fca_component.fca_context,
                           &spec, &fca_module->fca_comm_desc);

        free(all_info);
        all_info = NULL;
    }

    /* Broadcast return value from rank0 to all other ranks */
    rc = fca_module->previous_bcast(&ret, 1, MPI_INT, 0, comm,
                                    fca_module->previous_bcast_module);
    if (rc != OMPI_SUCCESS) {
        FCA_ERROR("Failed to broadcast comm_new return value from rank0: %d", rc);
        goto out;
    }

    /* Examine comm_new return value */
    if (ret < 0) {
        FCA_ERROR("COMM_NEW failed: %s", fca_strerror(ret));
        rc = OMPI_ERROR;
        goto out;
    }

#if OMPI_FCA_VERSION >= 30
    /* Send comm_init_data_size to all ranks in comm */
    rc = fca_module->previous_bcast(&spec.comm_init_data_size, 1, MPI_INT,
                                    0, comm, fca_module->previous_bcast_module);
    if (OMPI_SUCCESS != rc) {
        FCA_ERROR("Failed to broadcast comm init data size value from rank0: %d", rc);
        goto out;
    }

    /* non-root ranks need their own receive buffer for the init data */
    if (0 != fca_module->rank &&
        NULL == (spec.comm_init_data = calloc(1, spec.comm_init_data_size))) {
        FCA_ERROR("Failed to allocate memory for comm init data.");
        rc = OMPI_ERROR;
        goto out;
    }

    /* Send comm_init_data to all ranks in comm */
    rc = fca_module->previous_scatter(spec.comm_init_data, spec.comm_init_data_size, MPI_BYTE,
                                      spec.comm_init_data, spec.comm_init_data_size, MPI_BYTE,
                                      0, comm, fca_module->previous_scatter_module);
    if (OMPI_SUCCESS != rc) {
        FCA_ERROR("Failed to scatter comm_init sizes return value from rank0: %d", rc);
        goto out;
    }
#endif

    /* Release allocated rank_info on node managers */
    if (fca_module->local_proc_idx == 0) {
        fca_free_rank_info(my_info);
        my_info = NULL;
    }

    /* Pass fca_comm_desc to all ranks using MPI_Bcast */
    rc = fca_module->previous_bcast(&fca_module->fca_comm_desc,
                                    sizeof(fca_module->fca_comm_desc), MPI_BYTE, 0,
                                    comm, fca_module->previous_bcast_module);
    if (rc != OMPI_SUCCESS) {
        FCA_ERROR("Failed to broadcast comm_desc from rank0: %d", rc);
        goto out;
    }

    FCA_MODULE_VERBOSE(fca_module, 1, "Received FCA communicator spec, comm_id %d",
                       fca_module->fca_comm_desc.comm_id);

#if OMPI_FCA_VERSION >= 30
    /* allocate comm_init_spec */
    FCA_MODULE_VERBOSE(fca_module, 1, "Starting COMM_INIT comm_id %d proc_idx %d num_procs %d",
                       fca_module->fca_comm_desc.comm_id, fca_module->local_proc_idx,
                       fca_module->num_local_procs);

    ret = mca_coll_fca_comm_init(mca_coll_fca_component.fca_context,
                                 fca_module->rank, comm_size,
                                 fca_module->local_proc_idx, fca_module->num_local_procs,
                                 &fca_module->fca_comm_desc, &fca_module->fca_comm,
                                 spec.comm_init_data);
    if (ret < 0) {
        FCA_ERROR("COMM_INIT failed: %s", fca_strerror(ret));
        rc = OMPI_ERROR;
        goto out;
    }
#endif

    rc = OMPI_SUCCESS;

out:
    /* Pointers already released above were reset to NULL, so this is safe
     * on the success path as well as on every error path. */
    free(displs);
    free(rcounts);
    free(all_info);
    if (my_info) {
        fca_free_rank_info(my_info);
    }
#if OMPI_FCA_VERSION >= 30
    /* only non-root ranks allocated comm_init_data themselves */
    if (0 != fca_module->rank) {
        free(spec.comm_init_data);
    }
#endif
    return rc;
}
/* Seconds elapsed between two gettimeofday() samples, as a double. */
static double __fca_elapsed_seconds(const struct timeval *from,
                                    const struct timeval *to)
{
    return to->tv_sec - from->tv_sec + 1e-6 * (to->tv_usec - from->tv_usec);
}

/*
 * Creates the FCA communicator for fca_module, or reuses one from the
 * component's communicator cache.
 *
 * When the cache is enabled, a cache list is selected (the component-wide
 * list, or a hash bucket when hashing is enabled) and scanned for an entry
 * of the same size whose communicator compares MPI_CONGRUENT; on a hit the
 * cached FCA communicator and its capabilities are installed in the module.
 * Otherwise a new FCA communicator is created, its capabilities are read,
 * and (cache enabled) a new entry is appended to the selected list.
 *
 * When fca_verbose == 10 the function also maintains timing and hit/miss
 * statistics in the component.  The timing fields are accumulated across
 * calls; the total work time is added exactly once per call, on the
 * cache-hit return or at the end of the cache-miss path.
 *
 * Returns OMPI_SUCCESS, the error of a failed collective or of
 * __fca_comm_new(), or OMPI_ERROR when an FCA call fails.
 */
static int __create_fca_comm(mca_coll_fca_module_t *fca_module)
{
    int rc, ret;
    int result = MPI_UNEQUAL;
    mca_coll_fca_c_cache_item_t *c_item = NULL, *c_item_new = NULL;
    mca_coll_fca_component_t *cm = &mca_coll_fca_component;
    int comm_size = ompi_comm_size(fca_module->comm);
    ompi_communicator_t *comm = fca_module->comm;
    opal_list_t *c_cache;
    struct timeval start, end, seq_start, seq_end, par_start, par_end;
    int act_deep;

    if (mca_coll_fca_component.fca_verbose == 10) {
        gettimeofday(&start, NULL);
    }

    if (mca_coll_fca_component.fca_enable_cache) {
        c_cache = &cm->c_cache;

        if (mca_coll_fca_component.fca_enable_hash) {
            int grank = ORTE_PROC_MY_NAME->vpid;
            int hash_index, part_of_hash_index;

            if (mca_coll_fca_component.fca_parallel_hash_calc == 1) {
                /* every rank contributes one term; the sum is the hash */
                if (mca_coll_fca_component.fca_verbose == 10) {
                    gettimeofday(&par_start, NULL);
                }

                part_of_hash_index = modular_pow(cm->fca_primes[grank % cm->fca_number_of_primes],
                                                 grank, cm->fca_hash_size);
                rc = comm->c_coll.coll_allreduce(&part_of_hash_index, &hash_index, 1,
                                                 MPI_INT, MPI_SUM, comm,
                                                 comm->c_coll.coll_allreduce_module);
                if (rc != OMPI_SUCCESS) {
                    FCA_ERROR("Failed to reduce hash_index: %d", rc);
                    return rc;
                }

                if (mca_coll_fca_component.fca_verbose == 10) {
                    gettimeofday(&par_end, NULL);
                    mca_coll_fca_component.fca_work_time_parallel +=
                        __fca_elapsed_seconds(&par_start, &par_end);
                }
            } else {
                /* every rank computes the whole sum locally */
                int q_counter;
                int q_comm_size;

                if (mca_coll_fca_component.fca_verbose == 10) {
                    gettimeofday(&seq_start, NULL);
                }

                hash_index = 0;
                q_comm_size = ompi_comm_size(comm);
                for (q_counter = 0; q_counter < q_comm_size; q_counter++) {
                    hash_index += modular_pow(cm->fca_primes[q_counter % cm->fca_number_of_primes],
                                              q_counter, cm->fca_hash_size);
                }

                if (mca_coll_fca_component.fca_verbose == 10) {
                    gettimeofday(&seq_end, NULL);
                    mca_coll_fca_component.fca_work_time_sequency +=
                        __fca_elapsed_seconds(&seq_start, &seq_end);
                }
            }

            if (cm->fca_hash[hash_index % cm->fca_hash_size] != NULL) {
                c_cache = cm->fca_hash[hash_index % cm->fca_hash_size];
                if (mca_coll_fca_component.fca_verbose == 10) {
                    /* total work time is accounted for at the exit points
                     * below; adding it here as well would count this
                     * interval twice */
                    mca_coll_fca_component.fca_hash_hit += 1;
                }
            } else {
                if (mca_coll_fca_component.fca_verbose == 10) {
                    mca_coll_fca_component.fca_hash_miss += 1;
                }
                c_cache = OBJ_NEW(opal_list_t);
                cm->fca_hash[hash_index % cm->fca_hash_size] = c_cache;
            }
        }

        act_deep = 0;
        for (c_item = (mca_coll_fca_c_cache_item_t *)opal_list_get_first(c_cache);
             c_item != (mca_coll_fca_c_cache_item_t *)opal_list_get_end(c_cache);
             c_item = (mca_coll_fca_c_cache_item_t *)opal_list_get_next((opal_list_item_t *) c_item)) {
            act_deep++;

            /* first check the size */
            if (c_item && (comm_size == c_item->size)) {
                /* then we have a potential cache hit */
                ompi_comm_compare(comm, c_item->comm, &result);
                if (MPI_CONGRUENT == result) {
                    /* cache hit: reuse the cached FCA communicator */
                    ret = fca_comm_get_caps(c_item->fca_comm_wrap->fca_comm,
                                            &fca_module->fca_comm_caps);
                    if (ret < 0) {
                        FCA_ERROR("GET_COMM_CAPS failed: %s", fca_strerror(ret));
                        return OMPI_ERROR;
                    }
                    fca_module->fca_comm = c_item->fca_comm_wrap->fca_comm;

                    if (mca_coll_fca_component.fca_verbose == 10) {
                        gettimeofday(&end, NULL);
                        mca_coll_fca_component.fca_total_work_time +=
                            __fca_elapsed_seconds(&start, &end);
                        mca_coll_fca_component.fca_cache_hit += 1;
                        if (act_deep > mca_coll_fca_component.fca_max_deep_in_cache) {
                            mca_coll_fca_component.fca_max_deep_in_cache = act_deep;
                        }
                    }
                    return OMPI_SUCCESS;
                }
            }
        }
    }

    rc = __fca_comm_new(fca_module);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

#if OMPI_FCA_VERSION < 30
    /* allocate comm_init_spec */
    FCA_MODULE_VERBOSE(fca_module, 1, "Starting COMM_INIT comm_id %d proc_idx %d num_procs %d",
                       fca_module->fca_comm_desc.comm_id, fca_module->local_proc_idx,
                       fca_module->num_local_procs);

    ret = mca_coll_fca_comm_init(mca_coll_fca_component.fca_context,
                                 fca_module->rank, ompi_comm_size(fca_module->comm),
                                 fca_module->local_proc_idx, fca_module->num_local_procs,
                                 &fca_module->fca_comm_desc, &fca_module->fca_comm);
    if (ret < 0) {
        FCA_ERROR("COMM_INIT failed: %s", fca_strerror(ret));
        return OMPI_ERROR;
    }
#endif

    /* get communicator capabilities */
    ret = fca_comm_get_caps(fca_module->fca_comm, &fca_module->fca_comm_caps);
    if (ret < 0) {
        FCA_ERROR("GET_COMM_CAPS failed: %s", fca_strerror(ret));
        return OMPI_ERROR;
    }

    /* by this point every rank in the communicator is set up */
    FCA_MODULE_VERBOSE(fca_module, 1, "Initialized FCA communicator, comm_id %d",
                       fca_module->fca_comm_desc.comm_id);

    if (mca_coll_fca_component.fca_enable_cache) {
        /* remember the new communicator in the list selected above */
        c_item_new = OBJ_NEW(mca_coll_fca_c_cache_item_t);
        c_item_new->fca_comm_wrap = OBJ_NEW(mca_coll_fca_comm_wrap_t);

        OBJ_RETAIN(comm);

        c_item_new->size = comm_size;
        c_item_new->comm = comm;
        c_item_new->fca_comm_wrap->fca_comm = fca_module->fca_comm;
        c_item_new->fca_comm_wrap->rank = fca_module->rank;
        c_item_new->fca_comm_wrap->comm_id = fca_module->fca_comm_desc.comm_id;

        opal_list_append(c_cache, (opal_list_item_t *) c_item_new);
    }

    if (mca_coll_fca_component.fca_verbose == 10) {
        gettimeofday(&end, NULL);
        mca_coll_fca_component.fca_total_work_time +=
            __fca_elapsed_seconds(&start, &end);
        mca_coll_fca_component.fca_cache_miss += 1;
    }
    return OMPI_SUCCESS;
}
/* * * Invoked when there's a new communicator that has been created. * * Look at the communicator and decide which set of functions and * * priority we want to return. * */ mca_scoll_base_module_t * mca_scoll_fca_comm_query(struct oshmem_group_t *comm, int *priority) { mca_scoll_base_module_t *module; int size = comm->proc_count; int local_peers = 0; mca_scoll_fca_module_t *fca_module; *priority = 0; module = NULL; if (!mca_scoll_fca_component.fca_enable) { FCA_VERBOSE(20, "FCA is disable on user request => exiting"); goto exit; } if (mca_memheap.memheap_component == NULL ) { FCA_VERBOSE(20, "No memheap => exiting"); goto exit; } if (NULL == mca_scoll_fca_component.ret) { MCA_MEMHEAP_CALL(private_alloc(sizeof(int),(void **)&mca_scoll_fca_component.ret)); MCA_MEMHEAP_CALL(private_alloc(oshmem_group_all->proc_count*sizeof(*mca_scoll_fca_component.rcounts), (void **)&mca_scoll_fca_component.rcounts )); MCA_MEMHEAP_CALL(private_alloc(/*info_size*/20,&mca_scoll_fca_component.my_info_exchangeable)); MCA_MEMHEAP_CALL(private_alloc(sizeof(fca_comm_desc_t), &mca_scoll_fca_component.fca_comm_desc_exchangeable)); } if (size < mca_scoll_fca_component.fca_np) { FCA_VERBOSE(20, "size(%d) < fca_np(%d)", size, mca_scoll_fca_component.fca_np); goto exit; } if (size < 2) { FCA_VERBOSE(20, "size(%d) < 2", size); goto exit; } if (!have_remote_peers(comm, size, &local_peers) /* || OMPI_COMM_IS_INTER(comm)*/) { FCA_VERBOSE(1, "all peers in group are on the same node, fca disabled\n"); goto exit; } fca_module = OBJ_NEW(mca_scoll_fca_module_t); if (!fca_module) { goto exit_fatal; } fca_module->super.scoll_module_enable = mca_scoll_fca_module_enable; fca_module->super.scoll_collect = mca_scoll_fca_component.fca_enable_allgather ? mca_scoll_fca_collect : NULL; fca_module->super.scoll_reduce = mca_scoll_fca_component.fca_enable_allreduce ? mca_scoll_fca_reduce : NULL; fca_module->super.scoll_barrier = mca_scoll_fca_component.fca_enable_barrier ? 
mca_scoll_fca_barrier : NULL; fca_module->super.scoll_broadcast = mca_scoll_fca_component.fca_enable_bcast ? mca_scoll_fca_broadcast : NULL; *priority = mca_scoll_fca_component.fca_priority; module = &fca_module->super; exit: FCA_VERBOSE(4, "Query FCA module for comm %p size %d rank %d local_peers=%d: priority=%d %s", (void *)comm, size, comm->my_pe, local_peers, *priority, module ? "enabled" : "disabled"); return module; exit_fatal: /* it is possible that other pe(s) succesfully initialized fca. * So differnt frameworks will be used for collective ops */ FCA_ERROR("FCA module query failed - aborting"); oshmem_shmem_abort(-1); return NULL ; }
/*
 * Collectively obtains a new FCA communicator descriptor for the group of
 * fca_module and stores it in fca_module->fca_comm_desc.
 *
 * The exchange uses the component's preallocated exchange buffers
 * (rcounts, my_info_exchangeable, ret, fca_comm_desc_exchangeable) with
 * put/get/wait calls and _internal_barrier():
 *   1. node managers (local_proc_idx == 0) fetch their rank info;
 *   2. every PE puts its info size into the root's rcounts[] slot;
 *   3. the root pulls the rank info of every PE that reported a size > 0;
 *   4. the root calls fca_comm_new(); its return value is pulled by all;
 *   5. the root publishes the communicator descriptor and all PEs pull it.
 *
 * Returns OSHMEM_SUCCESS, or OSHMEM_ERROR when the rank info cannot be
 * obtained or COMM_NEW fails.
 */
static int _fca_comm_new(mca_scoll_fca_module_t *fca_module)
{
    struct oshmem_group_t *comm = fca_module->comm;
    /* zero-initialised so that fields not set below are never passed to
     * fca_comm_new() with indeterminate values */
    fca_comm_new_spec_t spec = {0,};
    int info_size = 0, all_info_size = 0;
    void *all_info = NULL, *my_info = NULL;
    int *disps = NULL;
    int i;
    const int root_pe = oshmem_proc_pe(comm->proc_array[root_id]);
    const int my_id = oshmem_proc_group_find_id(comm, comm->my_pe);

    /* call fca_get_rank_info() on node managers only */
    if (fca_module->local_proc_idx == 0) {
        my_info = fca_get_rank_info(mca_scoll_fca_component.fca_context,
                                    &info_size);
        if (!my_info) {
            FCA_ERROR("fca_get_rank_info returned NULL");
            return OSHMEM_ERROR;
        }
    } else {
        info_size = 0;
    }

    FCA_MODULE_VERBOSE(fca_module, 1, "Info size: %d", info_size);

    /* -1 marks "size not received yet"; the root waits for it to change */
    for (i = 0; i < comm->proc_count; i++) {
        mca_scoll_fca_component.rcounts[i] = -1;
    }
    _internal_barrier(fca_module);
    MCA_SPML_CALL(put((void *)&mca_scoll_fca_component.rcounts[my_id],
                      (size_t)sizeof(info_size), (void *)&info_size, root_pe));

    if (root_pe == comm->my_pe) {
        int value = -1;

        for (i = 0; i < comm->proc_count; i++) {
            MCA_SPML_CALL(wait((void *)&mca_scoll_fca_component.rcounts[i],
                               SHMEM_CMP_NE, &value, SHMEM_INT));
        }

        /* Allocate buffer for gathering rank information on the root.
         * NOTE(review): these allocations are not checked; bailing out here
         * would leave the other PEs waiting in the barriers below - TODO. */
        all_info_size = 0;
        disps = calloc(comm->proc_count, sizeof *disps);
        for (i = 0; i < comm->proc_count; ++i) {
            disps[i] = all_info_size;
            all_info_size += mca_scoll_fca_component.rcounts[i];
        }
        FCA_MODULE_VERBOSE(fca_module, 1, "Total rank_info size: %d", all_info_size);
        all_info = calloc(all_info_size, 1);
    }

    /* NOTE(review): my_info_exchangeable is allocated with a fixed size of
     * 20 bytes in mca_scoll_fca_comm_query(); info_size is not checked
     * against it here - confirm that rank info never exceeds that size. */
    if (my_info) {
        memcpy(mca_scoll_fca_component.my_info_exchangeable, my_info, info_size);
    }
    _internal_barrier(fca_module);

    if (root_pe == comm->my_pe) {
        /* pull the rank info of every PE that reported one */
        for (i = 0; i < comm->proc_count; i++) {
            if (mca_scoll_fca_component.rcounts[i] > 0) {
                MCA_SPML_CALL(get((void *)mca_scoll_fca_component.my_info_exchangeable,
                                  mca_scoll_fca_component.rcounts[i],
                                  (void*)(((char*)all_info)+disps[i]),
                                  comm->proc_array[i]->super.proc_name.vpid));
            }
        }

        /* The root calls fca_comm_new() and fills the fca_comm_spec fields */
        spec.rank_info = all_info;
        spec.is_comm_world = comm == oshmem_group_all;
        spec.rank_count = 0;
        for (i = 0; i < comm->proc_count; ++i) {
            FCA_MODULE_VERBOSE(fca_module, 1, "rcounts[%d]=%d disps[%d]=%d",
                               i, mca_scoll_fca_component.rcounts[i], i, disps[i]);
            if (mca_scoll_fca_component.rcounts[i] > 0) {
                ++spec.rank_count;
            }
        }

        FCA_MODULE_VERBOSE(fca_module, 1, "starting fca_comm_new(), rank_count: %d",
                           spec.rank_count);

        *mca_scoll_fca_component.ret =
            fca_comm_new(mca_scoll_fca_component.fca_context,
                         &spec, &fca_module->fca_comm_desc);

        free(disps);
        free(all_info);
    }

    _internal_barrier(fca_module);

    if (root_pe != comm->my_pe) {
        MCA_SPML_CALL(get((void *)mca_scoll_fca_component.ret, sizeof(int),
                          (void *)mca_scoll_fca_component.ret, root_pe));
    }

    _internal_barrier(fca_module);

    /* Release the rank_info on node managers.  This is done before looking
     * at the COMM_NEW result so that it is not leaked on the failure path;
     * the root has finished pulling the exchanged copies by now. */
    if (fca_module->local_proc_idx == 0) {
        fca_free_rank_info(my_info);
    }

    /* Examine comm_new return value */
    if (*mca_scoll_fca_component.ret < 0) {
        FCA_ERROR("rank %i: COMM_NEW failed: %s",
                  fca_module->rank, fca_strerror(*mca_scoll_fca_component.ret));
        return OSHMEM_ERROR;
    }

    /* Distribute the communicator descriptor from the root to all PEs */
    if (root_pe == comm->my_pe) {
        memcpy(mca_scoll_fca_component.fca_comm_desc_exchangeable,
               &fca_module->fca_comm_desc, sizeof(fca_module->fca_comm_desc));
    }

    _internal_barrier(fca_module);
    if (root_pe != comm->my_pe) {
        MCA_SPML_CALL(get((void *)mca_scoll_fca_component.fca_comm_desc_exchangeable,
                          sizeof(fca_module->fca_comm_desc),
                          (void *)&fca_module->fca_comm_desc, root_pe));
    }

    _internal_barrier(fca_module);

    FCA_MODULE_VERBOSE(fca_module, 1, "Received FCA communicator spec, comm_id %d",
                       fca_module->fca_comm_desc.comm_id);
    return OSHMEM_SUCCESS;
}
/*
 * AllgatherV
 *
 * Function:  - allgatherv
 * Accepts:   - same as MPI_Allgatherv()
 * Returns:   - OMPI_SUCCESS, OMPI_ERROR, or the result of the previously
 *              selected allgatherv
 *
 * When OMPI_FCA_ALLGATHER is not 1 the call goes straight to the previous
 * allgatherv.  Otherwise receive counts/displacements are converted to
 * byte sizes for FCA.  If the receive datatype cannot be used in place, a
 * packed receive buffer is created through a convertor and unpacked block
 * by block after the operation.  -EUSEMPI from FCA selects the fallback.
 */
int mca_coll_fca_allgatherv(void *sbuf, int scount,
                            struct ompi_datatype_t *sdtype,
                            void *rbuf, int *rcounts, int *disps,
                            struct ompi_datatype_t *rdtype,
                            struct ompi_communicator_t *comm,
                            mca_coll_base_module_t *module)
{
    mca_coll_fca_module_t *fca_module = (mca_coll_fca_module_t*)module;
#if OMPI_FCA_ALLGATHER == 1
    MCA_COLL_FCA_DECLARE_CONVERTOR(sconv);
    MCA_COLL_FCA_DECLARE_CONVERTOR(rconv);
    fca_gatherv_spec_t spec;
    size_t rgap, rsize;
    int sum_rcounts;
    ptrdiff_t rdtype_extent;
    int comm_size;
    int relemsize;
    size_t displ;
    int i, ret;

    comm_size = ompi_comm_size(fca_module->comm);
    FCA_DT_EXTENT(rdtype, &rdtype_extent);

    /* Setup send buffer */
    spec.sendsize = __setup_gather_sendbuf(sbuf,
                                           (char *)rbuf + disps[fca_module->rank] * rdtype_extent,
                                           scount, sdtype, &sconv, &spec.sbuf);

    /* Allocate alternative recvsizes/displs on the stack, which will be in bytes */
    spec.recvsizes = alloca(sizeof *spec.recvsizes * comm_size);
    spec.displs = alloca(sizeof *spec.displs * comm_size);

    /* Calculate the size of receive buffer */
    sum_rcounts = 0;
    for (i = 0; i < comm_size; ++i) {
        sum_rcounts += rcounts[i];
    }

    /* convert MPI counts (which depend on dtype) to FCA sizes (which are in bytes) */
    if (mca_coll_fca_array_size(rdtype, sum_rcounts, &rgap, &rsize) && rgap == 0) {
        spec.rbuf = rbuf;
        for (i = 0; i < comm_size; ++i) {
            spec.recvsizes[i] = rcounts[i] * rdtype_extent;
            spec.displs[i] = disps[i] * rdtype_extent;
        }
    } else {
        /*
         * Reorder and remove gaps in displs - we want to allocate as little memory
         * as possible, and we should unpack one-by-one anyway.
         */
        FCA_VERBOSE(5, "Reordering AllgatherV displacements");
        mca_coll_fca_convertor_create(&rconv, rdtype, sum_rcounts, rbuf,
                                      MCA_COLL_FCA_CONV_RECV, &spec.rbuf, &rsize);

        /* Packed size of one element.  With no elements at all there is
         * nothing to divide; avoid the division by zero. */
        if (sum_rcounts != 0) {
            assert(rsize % sum_rcounts == 0);
            relemsize = rsize / sum_rcounts;
        } else {
            relemsize = 0;
        }

        displ = 0;
        for (i = 0; i < comm_size; ++i) {
            spec.recvsizes[i] = rcounts[i] * relemsize;
            spec.displs[i] = displ;
            displ += spec.recvsizes[i];
        }
        assert(displ == rsize);
    }

    /* Call FCA AllgatherV */
    FCA_VERBOSE(5,"Using FCA Allgatherv");
    ret = mca_coll_fca_component.fca_ops.do_allgatherv(fca_module->fca_comm, &spec);

    /* Destroy convertors if operation failed */
    if (ret < 0) {
        mca_coll_fca_convertor_destroy(&sconv);
        mca_coll_fca_convertor_destroy(&rconv);
        if (ret == -EUSEMPI) {
            goto orig_allgatherv;
        }
        FCA_ERROR("Allgatherv failed: %s", mca_coll_fca_component.fca_ops.strerror(ret));
        return OMPI_ERROR;
    }

    /* Unpack data and clean up convertor */
    mca_coll_fca_convertor_destroy(&sconv);
    if (mca_coll_fca_convertor_valid(&rconv)) {
        /* ptrdiff_t is not guaranteed to be long; cast to match %ld */
        FCA_VERBOSE(5, "Unpacking AllgatherV receive buffer rdtype_extent=%ld",
                    (long)rdtype_extent);
        for (i = 0; i < comm_size; ++i) {
            /* point the convertor at rank i's block in the user buffer and
             * unpack it from that rank's offset in the packed buffer */
            mca_coll_fca_convertor_set(&rconv, rdtype,
                                       (char*)rbuf + disps[i] * rdtype_extent,
                                       rcounts[i]);
            mca_coll_fca_convertor_process(&rconv, spec.displs[i]);
        }
        mca_coll_fca_convertor_destroy(&rconv);
    }
    return OMPI_SUCCESS;

orig_allgatherv:
#endif
    return fca_module->previous_allgatherv(sbuf, scount, sdtype, rbuf, rcounts,
                                           disps, rdtype, comm,
                                           fca_module->previous_allgatherv_module);
}
/*
 * Allgather
 *
 * Function:  - allgather
 * Accepts:   - same as MPI_Allgather()
 * Returns:   - OMPI_SUCCESS, OMPI_ERROR, or the result of the previously
 *              selected allgather
 *
 * When OMPI_FCA_ALLGATHER is not 1 the call goes straight to the previous
 * allgather.  Otherwise the receive buffer is used in place when the
 * datatype allows it, or a packed buffer is created through a convertor
 * and unpacked after the operation.  -EUSEMPI from FCA selects the
 * fallback.
 */
int mca_coll_fca_allgather(void *sbuf, int scount, struct ompi_datatype_t *sdtype,
                           void *rbuf, int rcount, struct ompi_datatype_t *rdtype,
                           struct ompi_communicator_t *comm,
                           mca_coll_base_module_t *module)
{
    mca_coll_fca_module_t *fca_module = (mca_coll_fca_module_t*)module;
#if OMPI_FCA_ALLGATHER == 1
    MCA_COLL_FCA_DECLARE_CONVERTOR(sconv);
    MCA_COLL_FCA_DECLARE_CONVERTOR(rconv);
    fca_gather_spec_t spec = {0,};
    size_t rgap, rsize;
    ptrdiff_t rdtype_extent;
    ssize_t total_rcount;
    int ret;

    FCA_DT_EXTENT(rdtype, &rdtype_extent);

    /* Setup send buffer.  The byte offset of this rank's block is computed
     * in ptrdiff_t: rcount * rank can exceed INT_MAX for large messages. */
    spec.size = __setup_gather_sendbuf(sbuf,
                                       (char *)rbuf + (ptrdiff_t)rcount * fca_module->rank * rdtype_extent,
                                       scount, sdtype, &sconv, &spec.sbuf);

    /* Setup recv buffer (widen before multiplying to avoid int overflow) */
    total_rcount = (ssize_t)ompi_comm_size(comm) * rcount;
    if (mca_coll_fca_array_size(rdtype, total_rcount, &rgap, &rsize) && rgap == 0) {
        spec.rbuf = rbuf;
    } else {
        mca_coll_fca_convertor_create(&rconv, rdtype, total_rcount, rbuf,
                                      MCA_COLL_FCA_CONV_RECV, &spec.rbuf, &rsize);
    }

    /* Call FCA Allgather */
    FCA_VERBOSE(5,"Using FCA Allgather size");
    ret = mca_coll_fca_component.fca_ops.do_allgather(fca_module->fca_comm, &spec);

    /* Destroy convertors if operation failed */
    if (ret < 0) {
        mca_coll_fca_convertor_destroy(&sconv);
        mca_coll_fca_convertor_destroy(&rconv);
        if (ret == -EUSEMPI) {
            goto orig_allgather;
        }
        FCA_ERROR("Allgather failed: %s", mca_coll_fca_component.fca_ops.strerror(ret));
        return OMPI_ERROR;
    }

    /* Unpack data and clean up convertor */
    mca_coll_fca_convertor_destroy(&sconv);
    if (mca_coll_fca_convertor_valid(&rconv)) {
        FCA_VERBOSE(5, "Unpacking Allgather receive buffer");
        mca_coll_fca_convertor_process(&rconv, 0);
        mca_coll_fca_convertor_destroy(&rconv);
    }
    return OMPI_SUCCESS;

orig_allgather:
#endif
    return fca_module->previous_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
                                          comm, fca_module->previous_allgather_module);
}
/*
 * Function:  - broadcast
 * Accepts:   - same arguments as MPI_Bcast()
 * Returns:   - OMPI_SUCCESS, OMPI_ERROR, or the result of the previously
 *              selected bcast
 *
 * The user buffer is used directly (offset by the datatype's gap) when
 * mca_coll_fca_array_size() accepts the datatype; otherwise a convertor
 * with its own exchange buffer is created, packed on the root before the
 * operation and unpacked on the other ranks afterwards.  The previous
 * bcast is used when the payload exceeds the communicator's max_payload
 * or when FCA answers with -EUSEMPI.
 */
int mca_coll_fca_bcast(void *buff, int count, struct ompi_datatype_t *datatype,
                       int root, struct ompi_communicator_t *comm,
                       mca_coll_base_module_t *module)
{
    mca_coll_fca_module_t *fca_module = (mca_coll_fca_module_t*)module;
    MCA_COLL_FCA_DECLARE_CONVERTOR(conv);
    fca_bcast_spec_t spec;
    size_t gap, size;
    int ret;

    FCA_VERBOSE(5, "[%d] Calling mca_coll_fca_bcast, root=%d, count=%d",
                ompi_comm_rank(comm), root, count);

    /* Setup exchange buffer */
    spec.root = root;
    if (mca_coll_fca_array_size(datatype, count, &gap, &size)) {
        /* byte offset; arithmetic on void * is not standard C */
        spec.buf = (char *)buff + gap;
    } else {
        mca_coll_fca_convertor_create(&conv, datatype, count, buff,
                                      (root == fca_module->rank)
                                          ? MCA_COLL_FCA_CONV_SEND
                                          : MCA_COLL_FCA_CONV_RECV,
                                      &spec.buf, &size);
    }

    /* Check that operation size does not exceed limit */
    spec.size = size;
    if (spec.size > fca_module->fca_comm_caps.max_payload) {
        FCA_VERBOSE(5, "Unsupported bcast operation size %d, using fallback",
                    spec.size);
        if (mca_coll_fca_convertor_valid(&conv)) {
            mca_coll_fca_convertor_destroy(&conv);
        }
        goto orig_bcast;
    }

    /*
     * Sender may pack data.  Whether a convertor is in use is decided by
     * the convertor itself: spec.buf also differs from buff on the direct
     * path whenever the datatype has a non-zero gap, and no convertor
     * exists in that case.
     */
    if (mca_coll_fca_convertor_valid(&conv) && root == fca_module->rank) {
        mca_coll_fca_convertor_process(&conv, 0);
    }

    /* Call FCA Bcast */
    FCA_VERBOSE(5, "Using FCA Bcast");
    ret = mca_coll_fca_component.fca_ops.do_bcast(fca_module->fca_comm, &spec);

    /* Destroy convertor if operation failed */
    if (ret < 0) {
        mca_coll_fca_convertor_destroy(&conv);
        if (ret == -EUSEMPI) {
            goto orig_bcast;
        }
        FCA_ERROR("Bcast failed: %s", mca_coll_fca_component.fca_ops.strerror(ret));
        return OMPI_ERROR;
    }

    /* Unpack data and clean up convertor */
    if (mca_coll_fca_convertor_valid(&conv)) {
        if (root != fca_module->rank) {
            mca_coll_fca_convertor_process(&conv, 0);
        }
        mca_coll_fca_convertor_destroy(&conv);
    }
    return OMPI_SUCCESS;

orig_bcast:
    return fca_module->previous_bcast(buff, count, datatype, root, comm,
                                      fca_module->previous_bcast_module);
}