/* FIXME: this is not a scalable algorithm because everyone is polling on
   the same cacheline */
int MPID_nem_barrier(void)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_BARRIER);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_BARRIER);

    if (MPID_nem_mem_region.num_local == 1)
        goto fn_exit;

    MPIR_ERR_CHKINTERNAL(!barrier_init, mpi_errno, "barrier not initialized");

    if (OPA_fetch_and_incr_int(&MPID_nem_mem_region.barrier->val) == MPID_nem_mem_region.num_local - 1) {
        OPA_store_int(&MPID_nem_mem_region.barrier->val, 0);
        OPA_store_int(&MPID_nem_mem_region.barrier->wait, 1 - sense);
        OPA_write_barrier();
    }
    else {
        /* wait */
        while (OPA_load_int(&MPID_nem_mem_region.barrier->wait) == sense)
            MPL_sched_yield(); /* skip */
    }

    sense = 1 - sense;

 fn_fail:
 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_BARRIER);
    return mpi_errno;
}
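/* Illustrative aside (not MPICH code): the routine above is a classic
 * sense-reversing centralized barrier.  The sketch below shows the same
 * algorithm with plain C11 atomics; the type and function names are made up
 * for illustration, and "my_sense" plays the role of the per-process static
 * "sense" variable used above.  The FIXME still applies: every participant
 * spins on the same shared flag. */
#include <stdatomic.h>
#include <sched.h>

typedef struct {
    atomic_int val;  /* number of participants that have arrived */
    atomic_int wait; /* release flag, compared against the local sense */
} toy_barrier_t;

static _Thread_local int my_sense = 0;

static void toy_barrier(toy_barrier_t *b, int num_local)
{
    if (num_local == 1)
        return;

    if (atomic_fetch_add(&b->val, 1) == num_local - 1) {
        /* last arrival: reset the counter, then flip the release flag so
         * everyone spinning on the old sense falls through */
        atomic_store(&b->val, 0);
        atomic_store(&b->wait, 1 - my_sense);
    }
    else {
        /* spin (yielding the CPU) until the flag no longer matches our sense */
        while (atomic_load(&b->wait) == my_sense)
            sched_yield();
    }
    my_sense = 1 - my_sense;
}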
/*
 * MPID_Get_universe_size - Get the universe size from the process manager
 *
 * Notes: The ch3 device requires that the PMI routines are used to
 * communicate with the process manager.  If a channel wishes to bypass
 * the standard PMI implementations, it is the responsibility of the
 * channel to provide an implementation of the PMI routines.
 */
int MPID_Get_universe_size(int *universe_size)
{
    int mpi_errno = MPI_SUCCESS;
#ifdef USE_PMI2_API
    char val[PMI2_MAX_VALLEN];
    int found = 0;
    char *endptr;

    mpi_errno = PMI2_Info_GetJobAttr("universeSize", val, sizeof(val), &found);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

    if (!found)
        *universe_size = MPIR_UNIVERSE_SIZE_NOT_AVAILABLE;
    else {
        *universe_size = strtol(val, &endptr, 0);
        MPIR_ERR_CHKINTERNAL(endptr - val != strlen(val), mpi_errno,
                             "can't parse universe size");
    }
#else
    int pmi_errno = PMI_SUCCESS;

    pmi_errno = PMI_Get_universe_size(universe_size);
    if (pmi_errno != PMI_SUCCESS) {
        MPIR_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**pmi_get_universe_size",
                             "**pmi_get_universe_size %d", pmi_errno);
    }
    if (*universe_size < 0) {
        *universe_size = MPIR_UNIVERSE_SIZE_NOT_AVAILABLE;
    }
#endif

 fn_exit:
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
 fn_fail:
    *universe_size = MPIR_UNIVERSE_SIZE_NOT_AVAILABLE;
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
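/* Illustrative aside (not MPICH code): the PMI2 branch above accepts the
 * attribute only if strtol consumed the entire string, i.e. it rejects empty
 * strings and trailing garbage such as "42abc".  A minimal standalone sketch
 * of that parse-and-validate pattern (the helper name is made up): */
#include <stdlib.h>

static int parse_int_attr(const char *val, int *out)
{
    char *endptr;
    long n = strtol(val, &endptr, 0);

    if (endptr == val || *endptr != '\0')
        return -1; /* nothing parsed, or trailing garbage */

    *out = (int) n;
    return 0;
}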
int MPIDI_CH3_Connect_to_root(const char *port_name, MPIDI_VC_t **new_vc)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_VC_t *vc;
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_CONNECT_TO_ROOT);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_CONNECT_TO_ROOT);

    *new_vc = NULL; /* so that the err handling knows to cleanup */

    MPIU_CHKPMEM_MALLOC(vc, MPIDI_VC_t *, sizeof(MPIDI_VC_t), mpi_errno, "vc");
    /* FIXME - where does this vc get freed?
       ANSWER (goodell@) - ch3u_port.c FreeNewVC
                           (but the VC_Destroy is in this file) */

    /* init ch3 portion of vc */
    MPIDI_VC_Init(vc, NULL, 0);

    /* init channel portion of vc */
    MPIR_ERR_CHKINTERNAL(!nemesis_initialized, mpi_errno, "Nemesis not initialized");
    vc->ch.recv_active = NULL;
    MPIDI_CHANGE_VC_STATE(vc, ACTIVE);

    *new_vc = vc; /* we now have a valid, disconnected, temp VC */

    mpi_errno = MPID_nem_connect_to_root(port_name, vc);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

    MPIU_CHKPMEM_COMMIT();
 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_CONNECT_TO_ROOT);
    return mpi_errno;
 fn_fail:
    /* freeing without giving the lower layer a chance to cleanup can lead to
       leaks on error */
    if (*new_vc)
        MPIDI_CH3_VC_Destroy(*new_vc);
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
}
int MPID_Create_intercomm_from_lpids(MPID_Comm *newcomm_ptr, int size, const int lpids[])
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Comm *commworld_ptr;
    int i;
    MPIDI_PG_iterator iter;

    commworld_ptr = MPIR_Process.comm_world;
    /* Setup the communicator's vc table: remote group */
    MPIDI_VCRT_Create(size, &newcomm_ptr->dev.vcrt);
    for (i = 0; i < size; i++) {
        MPIDI_VC_t *vc = 0;

        /* For rank i in the new communicator, find the corresponding
           virtual connection.  For lpids less than the size of comm_world,
           we can just take the corresponding entry from comm_world.
           Otherwise, we need to search through the process groups. */
        /* printf( "[%d] Remote rank %d has lpid %d\n",
           MPIR_Process.comm_world->rank, i, lpids[i] ); */
        if (lpids[i] < commworld_ptr->remote_size) {
            vc = commworld_ptr->dev.vcrt->vcr_table[lpids[i]];
        }
        else {
            /* We must find the corresponding vcr for a given lpid */
            /* For now, this means iterating through the process groups */
            MPIDI_PG_t *pg = 0;
            int j;

            MPIDI_PG_Get_iterator(&iter);
            /* Skip comm_world */
            MPIDI_PG_Get_next(&iter, &pg);
            do {
                MPIDI_PG_Get_next(&iter, &pg);
                MPIR_ERR_CHKINTERNAL(!pg, mpi_errno, "no pg");
                /* FIXME: a quick check on the min/max values of the lpid for
                   this process group could help speed this search */
                for (j = 0; j < pg->size; j++) {
                    /* printf( "Checking lpid %d against %d in pg %s\n",
                       lpids[i], pg->vct[j].lpid, (char *)pg->id ); fflush(stdout); */
                    if (pg->vct[j].lpid == lpids[i]) {
                        vc = &pg->vct[j];
                        /* printf( "found vc %x for lpid = %d in another pg\n",
                           (int)vc, lpids[i] ); */
                        break;
                    }
                }
            } while (!vc);
        }

        /* printf( "about to dup vc %x for lpid = %d in another pg\n",
           (int)vc, lpids[i] ); */
        /* Note that this will increment the ref count for the associated
           PG if necessary. */
        MPIDI_VCR_Dup(vc, &newcomm_ptr->dev.vcrt->vcr_table[i]);
    }
 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
int MPIDI_CH3U_Get_failed_group(int last_rank, MPIR_Group **failed_group)
{
    char *c;
    int i, mpi_errno = MPI_SUCCESS, rank;
    UT_array *failed_procs = NULL;
    MPIR_Group *world_group;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_GET_FAILED_GROUP);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_GET_FAILED_GROUP);

    MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER, VERBOSE,
                  "Getting failed group with %d as last acknowledged\n", last_rank);

    if (-1 == last_rank) {
        MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER, VERBOSE, "No failure acknowledged");
        *failed_group = MPIR_Group_empty;
        goto fn_exit;
    }

    if (*MPIDI_failed_procs_string == '\0') {
        MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER, VERBOSE, "Found no failed ranks");
        *failed_group = MPIR_Group_empty;
        goto fn_exit;
    }

    utarray_new(failed_procs, &ut_int_icd);

    /* parse list of failed processes.  This is a comma separated list
       of ranks or ranges of ranks (e.g., "1, 3-5, 11") */
    i = 0;
    c = MPIDI_failed_procs_string;
    while (1) {
        parse_rank(&rank);
        ++i;
        MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER, VERBOSE, "Found failed rank: %d", rank);
        utarray_push_back(failed_procs, &rank);
        MPIDI_last_known_failed = rank;
        MPIR_ERR_CHKINTERNAL(*c != ',' && *c != '\0', mpi_errno,
                             "error parsing failed process list");
        if (*c == '\0' || last_rank == rank)
            break;
        ++c; /* skip ',' */
    }

    /* Create group of failed processes for comm_world.  Failed groups for
       other communicators can be created from this one using
       group_intersection. */
    mpi_errno = MPIR_Comm_group_impl(MPIR_Process.comm_world, &world_group);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);
    mpi_errno = MPIR_Group_incl_impl(world_group, i, ut_int_array(failed_procs), failed_group);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);
    mpi_errno = MPIR_Group_release(world_group);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_GET_FAILED_GROUP);
    if (failed_procs)
        utarray_free(failed_procs);
    return mpi_errno;
 fn_oom: /* out-of-memory handler used by the utarray operations above */
    MPIR_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "utarray");
 fn_fail:
    goto fn_exit;
}
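/* Illustrative aside (not MPICH code): parse_rank above is not shown, but the
 * comment says the failed-process string is a comma separated list of ranks
 * or ranges, e.g. "1,3-5,11".  A standalone sketch of how such a list expands
 * into individual ranks (the function name is made up): */
#include <stdio.h>
#include <stdlib.h>

static void print_failed_ranks(const char *s)
{
    const char *c = s;

    while (*c != '\0') {
        char *end;
        long lo = strtol(c, &end, 10);
        long hi = lo;

        if (*end == '-')                 /* a range such as "3-5" */
            hi = strtol(end + 1, &end, 10);

        for (long r = lo; r <= hi; r++)
            printf("failed rank: %ld\n", r);

        if (*end == ',')
            end++;                       /* skip the separator */
        c = end;
    }
}
/* print_failed_ranks("1,3-5,11") prints ranks 1, 3, 4, 5, and 11. */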
/* Returns MPI_SUCCESS on success, an MPI error code on failure.  Code above
 * needs to call MPIO_Err_return_xxx. */
static int MPII_Type_cyclic(int *array_of_gsizes, int dim, int ndims, int nprocs,
                            int rank, int darg, int order, MPI_Aint orig_extent,
                            MPI_Datatype type_old, MPI_Datatype *type_new,
                            MPI_Aint *st_offset)
{
    /* nprocs = no. of processes in dimension dim of grid
       rank = coordinate of this process in dimension dim */
    int mpi_errno = MPI_SUCCESS;
    int blksize, i, blklens[3], st_index, end_index, local_size, rem, count;
    MPI_Aint stride, disps[3];
    MPI_Datatype type_tmp, types[3];

    if (darg == MPI_DISTRIBUTE_DFLT_DARG)
        blksize = 1;
    else
        blksize = darg;

    MPIR_ERR_CHKINTERNAL(blksize <= 0, mpi_errno, "blksize must be > 0");

    st_index = rank * blksize;
    end_index = array_of_gsizes[dim] - 1;

    if (end_index < st_index)
        local_size = 0;
    else {
        local_size = ((end_index - st_index + 1) / (nprocs * blksize)) * blksize;
        rem = (end_index - st_index + 1) % (nprocs * blksize);
        local_size += (rem < blksize) ? rem : blksize;
    }

    count = local_size / blksize;
    rem = local_size % blksize;

    stride = ((MPI_Aint) nprocs) * ((MPI_Aint) blksize) * orig_extent;
    if (order == MPI_ORDER_FORTRAN)
        for (i = 0; i < dim; i++)
            stride *= (MPI_Aint) (array_of_gsizes[i]);
    else
        for (i = ndims - 1; i > dim; i--)
            stride *= (MPI_Aint) (array_of_gsizes[i]);

    mpi_errno = MPIR_Type_hvector_impl(count, blksize, stride, type_old, type_new);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

    if (rem) {
        /* if the last block is of size less than blksize, include
         * it separately using MPI_Type_struct */
        types[0] = *type_new;
        types[1] = type_old;
        disps[0] = 0;
        disps[1] = ((MPI_Aint) count) * stride;
        blklens[0] = 1;
        blklens[1] = rem;

        mpi_errno = MPIR_Type_struct_impl(2, blklens, disps, types, &type_tmp);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        MPIR_Type_free_impl(type_new);
        *type_new = type_tmp;
    }

    /* In the first iteration, we need to set the displacement in that
     * dimension correctly. */
    if (((order == MPI_ORDER_FORTRAN) && (dim == 0)) ||
        ((order == MPI_ORDER_C) && (dim == ndims - 1))) {
        types[0] = MPI_LB;
        disps[0] = 0;
        types[1] = *type_new;
        disps[1] = ((MPI_Aint) rank) * ((MPI_Aint) blksize) * orig_extent;
        types[2] = MPI_UB;
        disps[2] = orig_extent * ((MPI_Aint) (array_of_gsizes[dim]));
        blklens[0] = blklens[1] = blklens[2] = 1;

        mpi_errno = MPIR_Type_struct_impl(3, blklens, disps, types, &type_tmp);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        MPIR_Type_free_impl(type_new);
        *type_new = type_tmp;

        *st_offset = 0; /* set it to 0 because it is taken care of in
                           the struct above */
    }
    else {
        *st_offset = ((MPI_Aint) rank) * ((MPI_Aint) blksize);
        /* st_offset is in terms of no. of elements of type oldtype in
         * this dimension */
    }

    if (local_size == 0)
        *st_offset = 0;

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
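/* Illustrative aside (not MPICH code): the local_size computation above counts
 * the elements a rank owns under a cyclic(blksize) distribution: every full
 * round of nprocs*blksize elements contributes blksize, and the final partial
 * round contributes min(rem, blksize).  A standalone sketch with a worked
 * example (the helper name is made up): */
#include <stdio.h>

static int cyclic_local_size(int gsize, int nprocs, int rank, int blksize)
{
    int st_index = rank * blksize;
    int end_index = gsize - 1;
    int local_size, rem;

    if (end_index < st_index)
        return 0;

    local_size = ((end_index - st_index + 1) / (nprocs * blksize)) * blksize;
    rem = (end_index - st_index + 1) % (nprocs * blksize);
    local_size += (rem < blksize) ? rem : blksize;
    return local_size;
}
/* With gsize=10, nprocs=3, blksize=2 the blocks 0-1 and 6-7 go to rank 0,
 * 2-3 and 8-9 to rank 1, and 4-5 to rank 2, so the ranks own 4, 4, and 2
 * elements; cyclic_local_size returns exactly those values. */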
static int MPII_Type_block(int *array_of_gsizes, int dim, int ndims, int nprocs,
                           int rank, int darg, int order, MPI_Aint orig_extent,
                           MPI_Datatype type_old, MPI_Datatype *type_new,
                           MPI_Aint *st_offset)
{
    /* nprocs = no. of processes in dimension dim of grid
       rank = coordinate of this process in dimension dim */
    int mpi_errno = MPI_SUCCESS;
    int blksize, global_size, mysize, i, j;
    MPI_Aint stride;

    global_size = array_of_gsizes[dim];

    if (darg == MPI_DISTRIBUTE_DFLT_DARG)
        blksize = (global_size + nprocs - 1) / nprocs;
    else {
        blksize = darg;

        MPIR_ERR_CHKINTERNAL(blksize <= 0, mpi_errno, "blksize must be > 0");
        MPIR_ERR_CHKINTERNAL(blksize * nprocs < global_size, mpi_errno,
                             "blksize * nprocs must be >= global size");
    }

    j = global_size - blksize * rank;
    mysize = (blksize < j) ? blksize : j;
    if (mysize < 0)
        mysize = 0;

    stride = orig_extent;
    if (order == MPI_ORDER_FORTRAN) {
        if (dim == 0) {
            mpi_errno = MPIR_Type_contiguous_impl(mysize, type_old, type_new);
            if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        }
        else {
            for (i = 0; i < dim; i++)
                stride *= (MPI_Aint) (array_of_gsizes[i]);
            mpi_errno = MPIR_Type_hvector_impl(mysize, 1, stride, type_old, type_new);
            if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        }
    }
    else {
        if (dim == ndims - 1) {
            mpi_errno = MPIR_Type_contiguous_impl(mysize, type_old, type_new);
            if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        }
        else {
            for (i = ndims - 1; i > dim; i--)
                stride *= (MPI_Aint) (array_of_gsizes[i]);
            mpi_errno = MPIR_Type_hvector_impl(mysize, 1, stride, type_old, type_new);
            if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        }
    }

    *st_offset = blksize * rank;
    /* in terms of no. of elements of type oldtype in this dimension */
    if (mysize == 0)
        *st_offset = 0;

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
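/* Illustrative aside (not MPICH code): for the block distribution above,
 * mysize is simply min(blksize, global_size - blksize*rank), clamped at 0 for
 * ranks past the end of the array.  A standalone sketch (the helper name is
 * made up; passing 0 stands in for MPI_DISTRIBUTE_DFLT_DARG): */
static int block_local_size(int gsize, int nprocs, int rank, int darg)
{
    int blksize = (darg > 0) ? darg : (gsize + nprocs - 1) / nprocs;
    int mysize = gsize - blksize * rank;

    if (mysize > blksize)
        mysize = blksize;
    if (mysize < 0)
        mysize = 0;
    return mysize;
}
/* Example: 10 elements over 4 processes with the default block size of
 * ceil(10/4) = 3 gives per-rank sizes 3, 3, 3, 1. */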