/* -------------------------------------------------------------------- *\
   DDI_Create(idim,jdim,handle)
   ============================
   [IN]  idim   - Number of rows in the array to be created.
   [IN]  jdim   - Number of columns in the array to be created.
   [OUT] handle - Handle given to the newly created array.

   Creates a distributed array with the columns evenly divided amongst
   the processors.
\* -------------------------------------------------------------------- */
   void DDI_Create(int idim,int jdim,int *handle) {

   /* --------------- *\
      Local Variables
   \* --------------- */
      int i,np,me;
      int icol,mincol,lftcol;
      int jcols[MAX_PROCESSORS];
      const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);

      DEBUG_ROOT(LVL1,(stdout," DDI: Entering DDI_Create.\n"))
      DEBUG_OUT(LVL3,(stdout,"%s: Entering DDI_Create.\n",DDI_Id()))

      np = comm->np;
      me = comm->me;

   /*
      if(jdim < np && me == 0) {
         fprintf(stdout," DDI Error: Trying to create an array with fewer columns than processors.\n");
         fprintf(stdout," DDI Error: Reduce the number of processors and try again.\n");
         Fatal_error(911);
      }
   */

      mincol = jdim / np;
      lftcol = jdim % np;

      for(i=0,icol=0; i<np; i++) {
         jcols[i] = icol;
         icol += mincol;
         if(i<lftcol) icol++;
      }

      DDI_Create_custom(idim,jdim,jcols,handle);

      DEBUG_ROOT(LVL2,(stdout," DDI: Array[%i] successfully created.\n",*handle))
   }
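/* -------------------------------------------------------------------- *\
   Illustrative usage sketch only (excluded from the build): create an
   evenly distributed nrows x ncols array, zero it, and release it.
   DDI_Zero and DDI_Destroy are assumed to be the companion calls
   defined elsewhere in this library.
\* -------------------------------------------------------------------- */
# if 0
   void example_ddi_create_usage(int nrows,int ncols) {
      int handle;
      DDI_Create(nrows,ncols,&handle);   /* columns split evenly over compute processes */
      DDI_Zero(handle);                  /* collective initialization to 0.0            */
      DDI_Destroy(handle);               /* release the distributed array               */
   }
# endif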
   void Comm_divide(int ngroups, int comm_id, int *new_comm_id) {
      int i,in,nt,npg,nr;

      const DDI_Comm *cur_comm = (const DDI_Comm *) Comm_find(comm_id);

      int *list = (int *) Malloc(ngroups*sizeof(int));

      if(ngroups <= 0 || ngroups > cur_comm->nn) {
         fprintf(stdout,"%s: ngroups=%i (arg #1 of DDI_Comm_divide) is invalid.\n",DDI_Id(),ngroups);
         Fatal_error(911);
      }

      nt  = cur_comm->nn;
      npg = nt / ngroups;
      nr  = nt % ngroups;

      for(i=0,in=0; i<ngroups; i++) {
         list[i] = npg;
         if(i < nr) list[i]++;
      }

      Comm_divide_custom(ngroups,list,comm_id,new_comm_id);

      free(list);
   }
  // figure out remote memory address of dynamic load balancer
  void DDI_ARMCI_DLB_addr() {
    armci_counter_t *cnt_ptr;
    const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);

    cnt_ptr = gv(armci_cnt_addr)[comm->global_pid[0]];
    gv(dlb_counter) = (size_t*)cnt_ptr;
  }
/** @see ddi_armci.h */
void DDI_ARMCI_GDLBNext(size_t *counter) {
  int tmp;
  const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);

  if (comm->me == 0)
    ARMCI_Rmw(ARMCI_FETCH_AND_ADD, &tmp, gv(armci_gdlb_counter)[0], 1, 0);
  MPI_Bcast(&tmp, sizeof(int), MPI_BYTE, 0, comm->compute_comm);

  *counter = (size_t)tmp;
}
  void DDI_ARMCI_Memory_init(size_t size) {
    int code;
    const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_COMM_WORLD);

    // malloc ARMCI memory
    code = ARMCI_Malloc((void*)gv(armci_mem_addr),size);
    if (code > 0) {
      ARMCI_Error("ARMCI_Malloc failed",code);
      Fatal_error(911);
    }
    gv(dda_index) = (DDA_Index*)gv(armci_mem_addr)[comm->me];

    // malloc ARMCI counter block and set addresses
    code = ARMCI_Malloc((void*)gv(armci_cnt_addr),sizeof(armci_counter_t)*2);
    if (code > 0) {
      ARMCI_Error("ARMCI_Malloc failed",code);
      Fatal_error(911);
    }
    ARMCI_PutValueLong(0, (void*)(gv(armci_cnt_addr)[comm->me]+0), comm->me);
    ARMCI_PutValueLong(0, (void*)(gv(armci_cnt_addr)[comm->me]+1), comm->me);
    DDI_ARMCI_DLB_addr();
    DDI_ARMCI_GDLB_addr();

    // create mutexes
    code = ARMCI_Create_mutexes(MAX_DD_ARRAYS+1);
    if (code > 0) {
      ARMCI_Error("ARMCI_Create_mutexes failed",code);
      Fatal_error(911);
    }
    gv(dlb_access) = MAX_DD_ARRAYS;
  }
   void Comm_divide(int ngroups, int comm_id, int *new_comm_id) {
      int i,in,nt,npg,nr;
      int err;

      const DDI_Comm *cur_comm = (const DDI_Comm *) Comm_find(comm_id);

      int *list = (int *) Malloc(ngroups*sizeof(int));

      if(ngroups <= 0 || ngroups > cur_comm->nn) {
         fprintf(stdout,"%s: ngroups=%i (arg #1 of DDI_Comm_divide) is invalid.\n",DDI_Id(),ngroups);
         Fatal_error(911);
      }

      nt  = cur_comm->nn;
      npg = nt / ngroups;
      nr  = nt % ngroups;

      for(i=0,in=0; i<ngroups; i++) {
         list[i] = npg;
         if(i < nr) list[i]++;
      }

    # if defined DDI_BGL
      if (DDI_BGL_Comm_divide_custom(ngroups, list, comm_id, new_comm_id) != 0) {
         if (cur_comm->me == 0) printf("%s: Groups are not aligned to BG/L psets.\n", DDI_Id());
         Comm_divide_custom(ngroups,list,comm_id,new_comm_id);
      }
    # else
      Comm_divide_custom(ngroups,list,comm_id,new_comm_id);
    # endif

      free(list);
   }
   void DDI_ISend(void *buffer,size_t size,int to,int *req_val) {

      int global_pid;
      DDI_Request *req = &gv(isend_req);
      const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);

    # if defined DDI_CHECK_ARGS
      if(to < 0 || to > comm->np) {
         fprintf(stdout,"%s: can not send to ddi process %i.\n",DDI_Id(),to);
         Fatal_error(911);
      }
    # endif

    # if defined DDI_SOC && !defined DDI_MPI
      pthread_attr_t thread_attr;
      pthread_attr_init(&thread_attr);
      pthread_attr_setscope(&thread_attr,PTHREAD_SCOPE_SYSTEM);

      req->to     = to;
      req->size   = size;
      req->buffer = buffer;

      if(pthread_create(&req->hnd_thread,&thread_attr,DDI_ISend_thread,req) != 0) {
         fprintf(stderr,"%s: pthread_create failed in DDI_ISend.\n",DDI_Id());
         Fatal_error(911);
      }
    # endif

    # if defined DDI_MPI
      MPI_Isend(buffer,size,MPI_BYTE,to,1,comm->compute_comm,req);
    # endif

      ULTRA_DEBUG((stdout,"%s: non-blocking send to %i issued.\n",DDI_Id(),to))

      *req_val = 1;
   }
   void DDI_ARR_select_server(DDI_Patch *dAPatch, int rank) {
      DDI_ARR_Element element;
      const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);

      DDI_ARR_select_local(dAPatch, &element);
      Comm_send(&element, sizeof(DDI_ARR_Element), rank, comm);
   }
   void Comm_divide_custom(int ngroups, int *list, int comm_id, int *new_comm_id) {
      int nt;
      int i,in,ip,mygroup,sp,ep,np;

      const DDI_Comm *cur_comm = (const DDI_Comm *) Comm_find(comm_id);

      int *my_ids      = NULL;
      int *sn_by_group = (int *) Malloc((ngroups+1)*sizeof(int));

      if(ngroups <= 0 || ngroups > cur_comm->nn) {
         fprintf(stdout,"%s: ngroups=%i (arg #1 of DDI_Comm_divide) is invalid.\n",DDI_Id(),ngroups);
         Fatal_error(911);
      }

      for(i=0,nt=0; i<ngroups; i++) nt += list[i];

      if(nt != cur_comm->nn) {
         fprintf(stdout," DDI: invalid list of group sizes in divide_custom.\n");
         Fatal_error(911);
      }

      for(i=0,in=0; i<ngroups; i++) {
         sn_by_group[i] = in;
         in += list[i];
      }
      sn_by_group[ngroups] = in;

      mygroup = 0;
      while(mygroup < ngroups && sn_by_group[mygroup+1] <= cur_comm->my) mygroup++;

      if(mygroup == ngroups) {
         fprintf(stdout,"%s: unable to find my spot in the new groups.\n",DDI_Id());
         Fatal_error(911);
      }

      DEBUG_OUT(LVL4,(stdout,"%s: mygroup=%i\n",DDI_Id(),mygroup))

      sp = cur_comm->node_master[sn_by_group[mygroup]];
      if(mygroup+1 == ngroups) ep = cur_comm->np;
      else                     ep = cur_comm->node_master[sn_by_group[mygroup+1]];
      np = ep - sp;

      my_ids = (int *) Malloc(np*sizeof(int));

      for(ip=0,i=0; ip<cur_comm->np; ip++) {
         if(cur_comm->global_pid[ip] >= sp && cur_comm->global_pid[ip] < ep) my_ids[i++] = ip;
      }

      if(i != np) {
         fprintf(stdout,"%s: could not find the %i processes expected for the new comm.\n",DDI_Id(),np);
         Fatal_error(911);
      }

      Comm_create(np,my_ids,ngroups,mygroup,comm_id,new_comm_id);

      free(my_ids);
      free(sn_by_group);
   }
  void DDI_ARMCI_DLBNext(size_t *counter) {
    long tmp;   /* ARMCI_FETCH_AND_ADD_LONG returns a long result */
    const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);

    // increment counter
    ARMCI_Rmw(ARMCI_FETCH_AND_ADD_LONG,&tmp,(void*)gv(dlb_counter),1,comm->global_pid[0]);
    *counter = (size_t)tmp;
  }
/** @see ddi_armci.h */
void DDI_ARMCI_DLBNext(size_t *counter) {
  long buf;
  const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);
  int armciPid = comm->global_pid[0];

  /* long is used in case signed int is too small */
  ARMCI_Rmw(ARMCI_FETCH_AND_ADD_LONG, (void*)(&buf), gv(armci_dlb_counter)[armciPid], 1, armciPid);
  *counter = (size_t)buf;
}
   void DDI_Comm_destroy(int commid) {
      const DDI_Comm *comm = (const DDI_Comm *) Comm_find(commid);

      DDI_Comm *curr_comm = &gv(ddi_base_comm);
      DDI_Comm *prev_comm = NULL;

   /*
      DDI_Data *curr_data = &gv(ddi_base_data);
      DDI_Data *prev_data = NULL;
   */

      Comm_sync(124,comm);

      if(commid == DDI_COMM_WORLD) {
         fprintf(stdout,"%s: Cannot destroy DDI_COMM_WORLD.\n",DDI_Id());
         Fatal_error(911);
      }

      while(curr_comm->next && curr_comm->id != commid) {
         prev_comm = curr_comm;
         curr_comm = (DDI_Comm *) curr_comm->next;
      }

      if(curr_comm->id != commid) {
         fprintf(stdout,"%s: Error in DDI_Comm_destroy - Comm not found.\n",DDI_Id());
         Fatal_error(911);
      }

   /*
      while(curr_data->next && curr_data->id != curr_comm->data_id) {
         prev_data = curr_data;
         curr_data = (DDI_Data *) curr_data->next;
      }

      if(curr_data->id != curr_comm->data_id) {
         fprintf(stdout,"%s: Error in DDI_Comm_destroy - Data not found.\n",DDI_Id());
         Fatal_error(911);
      }

    * ----------------------------------------------------------------------- *\
      Delete item from DDI_Data linked-list.
   \* ----------------------------------------------------------------------- *
      if(curr_comm->me_local == 0) shmdt((void *) curr_data->sync_array);
      prev_data->next = curr_data->next;
      free(curr_data);
   */

   /* ----------------------------------------------------------------------- *\
      Delete item from DDI_Comm linked-list.
   \* ----------------------------------------------------------------------- */
      free(curr_comm->smp_pid);
      free(curr_comm->local_nid);
      free(curr_comm->global_pid);
      free(curr_comm->global_nid);
      free(curr_comm->global_dsid);
      free(curr_comm->node_master);

      prev_comm->next = curr_comm->next;
      free(curr_comm);
   }
void DDI_ARMCI_Barrier(const DDI_Comm *comm) {
  if (comm == (const DDI_Comm *)Comm_find(DDI_COMM_WORLD)) {
    ARMCI_Barrier();
  }
  else {
    ARMCI_AllFence();
    MPI_Barrier(comm->compute_comm);
  }
}
/* ---------------------------------------------- *\
   FORTRAN wrapper to return MPI SMP communicator
\* ---------------------------------------------- */
   void F77_SMP_GetMPIComm(int_f77 *commid) {
    # ifdef DDI_MPI
      DDI_Comm *comm = (DDI_Comm *) Comm_find(DDI_WORKING_COMM);
      *commid = comm->smp_comm;
    # else
      printf("DDI_SMP_GetMPIComm is only supported with an MPI build\n");
      Fatal_error(911);
    # endif
   }
  void DDI_ARMCI_GDLBNext(size_t *counter) {
    int tmp;
    const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);

    // increment and broadcast global counter
    if (comm->me == 0) ARMCI_Rmw(ARMCI_FETCH_AND_ADD,&tmp,(void*)gv(gdlb_counter),1,0);
    MPI_Bcast(&tmp, sizeof(int), MPI_BYTE, 0, comm->compute_comm);
    *counter = (size_t)tmp;
  }
  void DDI_ARMCI_Release(const DDA_ARMCI_Index *index, int handle, int proc, int ltype) {
    int semid  = index[proc].semid;
    int commid = gv(dda_comm)[handle];
    const DDI_Comm *comm = (const DDI_Comm *) Comm_find(commid);
    int armci_proc = comm->global_pid[proc];

  #if defined DDI_ARMCI_LOCK
    DDI_ARMCI_Unlock(semid,armci_proc);
  #endif
  }
   int SMP_create(size_t size) {

      int semflg = 0600;

      DDI_Comm *comm = (DDI_Comm *) Comm_find(DDI_COMM_WORLD);   /* hardcoded atm */

      SMP_Data *new_data = (SMP_Data *) Malloc(sizeof(SMP_Data));
      SMP_Data *end_data = (SMP_Data *) SMP_find_end();

      STD_DEBUG((stdout,"%s: Entered DDI_SMP_Create.\n",DDI_Id()))

      if(end_data) end_data->next    = (void *) new_data;
      else         gv(smp_base_data) = (SMP_Data *) new_data;

    # if defined USE_SYSV
      Comm_sync_smp(comm);
      if(comm->me_local == 0) {
         new_data->handle = gv(smp_data_id)++;
         new_data->shmid  = gv(shmid) = Shmget(IPC_PRIVATE,size,SHM_R|SHM_W);
         new_data->semid  = Semget(IPC_PRIVATE,1,semflg);
         new_data->size   = size;
         new_data->next   = NULL;
      }
      Comm_bcast_smp(new_data,sizeof(SMP_Data),0,comm);
      new_data->addr = Shmat(new_data->shmid,0,0);

      MAX_DEBUG((stdout,"%s: SMP memory [%i] shmid=%i, semid=%i, addr=%x.\n",
                 DDI_Id(),new_data->handle,new_data->shmid,new_data->semid,new_data->addr))

      Comm_sync_smp(comm);
      if(comm->me_local == 0) { Shmctl(new_data->shmid,IPC_RMID,NULL); gv(shmid) = 0; }
      Comm_sync_smp(comm);
    # else
      new_data->handle = gv(smp_data_id)++;
      new_data->size   = size;
      new_data->next   = NULL;
      new_data->addr   = Malloc(size);

   /*   MWS: May 2010
        It appears above that systems without SysV memory are expected
        to allocate process-replicated memory instead of node-replicated
        memory, and get on with it.  If these are duplicated, at full
        size, as it appears, that's likely devastating for the system's
        total memory usage.  The parallel CCSD(T) on IBM Blue Gene/P got
        into a deadlock, but other systems with sockets or MPI seem to
        work if allowed to proceed.  At this time, we kill off just the
        BG here...
   */
    # ifdef IBMBG
      fprintf(stdout,"DDI compiled w/o SysV operating system support.\n");
      fprintf(stdout,"IBM/BG parallel CCSD(T) cannot run w/o SysV.\n");
      Fatal_error(911);
    # endif

    # endif

      return new_data->handle;
   }
/** @see ddi_armci.h */
void DDI_ARMCI_Index_create(DDA_Index *index, int handle) {
  DDA_Remote_Index *remoteIndex = gv(dda_remote_index)[handle];
  const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);

  /* remoteIndex[pid] is indexed by DDI_WORKING_COMM process id. */
  remoteIndex[comm->me].offset = index[handle].offset;
  remoteIndex[comm->me].mutex  = handle;

  /* remoteIndex[pid].offset may differ from process to process;
     gather in place so the send buffer does not alias the receive buffer. */
  MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                remoteIndex, sizeof(DDA_Remote_Index), MPI_BYTE,
                comm->compute_comm);
}
  void DDI_ARMCI_Index_create(DDA_Index *index, int handle) {
    DDA_ARMCI_Index *armci_index = gv(dda_armci_index)[handle];
    const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);

    armci_index[comm->me].offset = index[handle].offset;
    armci_index[comm->me].semid  = handle;
    gv(dda_comm)[handle] = DDI_WORKING_COMM;

    MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                  armci_index, sizeof(DDA_ARMCI_Index), MPI_BYTE,
                  comm->compute_comm);
  }
  void DDI_ARMCI_Zero_local(int handle) {
    double *da = NULL;
    const DDA_Index *Index = gv(dda_index);
    size_t i,size = Index[handle].size;
    const DDA_ARMCI_Index *armci_index = gv(dda_armci_index)[handle];
    int commid = gv(dda_comm)[handle];
    const DDI_Comm *comm = (const DDI_Comm *) Comm_find(commid);
    int armci_proc, proc = comm->me;

    DDI_ARMCI_Acquire(armci_index,handle,proc,DDI_WRITE_ACCESS,(void **) &da,&armci_proc);
    for (i=0; i<size; i++) da[i] = 0.0;
    DDI_ARMCI_Release(armci_index,handle,proc,DDI_WRITE_ACCESS);
  }
/* ------------------------------------------------------------------------- *\
   DDI_Recv_request(buff,from)
   ===========================
   [OUT] buff - address in which incoming data is passed.
   [IN]  from - rank of process sending data stream.

   Used to receive a data request sent by DDI_Send_request.  This
   subroutine is only called from a data server.
\* ------------------------------------------------------------------------- */
   void DDI_Recv_request(void *buff,int *from) {

    # if defined DDI_SOC
      char ack = 37;
      size_t size = sizeof(DDI_Patch);
      DDI_Recvany(buff,size,from);
      Send(gv(sockets)[*from],&ack,1,0);
    # endif

   /* ---------------------------------- *\
      The stand-alone MPI version of DDI
   \* ---------------------------------- */

   /*
      This routine is the one responsible for the poor performance of the
      100% MPI-1 model.  The "receive from anywhere" option implied by
      MPI_ANY_SOURCE is typically implemented by repeated checking
      (polling) on all open MPI processes.  This can sometimes be
      influenced by looking for options that control the polling
      mechanism at "mpirun" time.

      However, a more practical solution is to use the "mixed" model,
      which uses our TCP/IP code for handling the small control messages,
      and which does no polling at all.  If your adapter allows TCP/IP to
      coexist with MPI-1, by all means try "mixed" over "mpi" for DDI.

      Here's a short note from Ryan:
         Having talked to some MPI people at past SC conferences, it
         seems that the Irecv/wait combination has the best chance to be
         implemented in a blocking fashion.  However, I never worked out
         all the details to make it happen.

      What I think this means is that you set up an IRECV on every
      possible source, and then loop around "polling" with the wait
      yourself, perhaps with a delay from "sleep" added before the loop
      repeats itself.
   */
    # if defined DDI_MPI && !defined DDI_SOC && !defined DDI_LAPI
      const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);
      MPI_Status status;
      size_t size = sizeof(DDI_Patch);
      DEBUG_OUT(LVL4,(stdout,"%s: call mpi_recv from any source.\n",DDI_Id()))
      MPI_Recv(buff,size,MPI_BYTE,MPI_ANY_SOURCE,0,comm->world_comm,&status);
      *from = status.MPI_SOURCE;
      DEBUG_OUT(LVL4,(stdout,"%s: received request from %i.\n",DDI_Id(),*from))
    # endif
   }
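/* ------------------------------------------------------------------------- *\
   Illustrative sketch only (excluded from the build): the Irecv/wait idea
   described in the note above, written as an Irecv/Testany polling loop.
   The request and buffer arrays, the sleep interval, and the use of
   comm->np as the set of possible sources are assumptions made for
   illustration; the real bookkeeping (reusing outstanding requests on
   the next call, cancelling them at shutdown) is omitted.
\* ------------------------------------------------------------------------- */
# if 0
   void DDI_Recv_request_irecv_sketch(void *buff,int *from) {
      const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);
      int i,idx,flag = 0;
      int nsrc = comm->np;                     /* assumed set of possible sources */
      MPI_Status status;
      MPI_Request *reqs = (MPI_Request *) Malloc(nsrc*sizeof(MPI_Request));
      DDI_Patch   *bufs = (DDI_Patch *)   Malloc(nsrc*sizeof(DDI_Patch));

   /* post one non-blocking receive per possible source */
      for(i=0; i<nsrc; i++)
         MPI_Irecv(&bufs[i],sizeof(DDI_Patch),MPI_BYTE,i,0,comm->world_comm,&reqs[i]);

   /* poll ourselves, sleeping between sweeps instead of busy-waiting */
      while(!flag) {
         MPI_Testany(nsrc,reqs,&idx,&flag,&status);
         if(!flag) usleep(1000);               /* requires <unistd.h> */
      }

      memcpy(buff,&bufs[idx],sizeof(DDI_Patch));
      *from = status.MPI_SOURCE;

      free(reqs);
      free(bufs);
   }
# endif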
/** @see ddi_armci.h */
void DDI_ARMCI_Counters_init() {
  int i;
  const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_COMM_WORLD);

  /* The offsets should be the same on all processes. */
  size_t dlb_counter_offset  = (size_t)(gv(dlb_counter))  - (size_t)(gv(dda_index));
  size_t gdlb_counter_offset = (size_t)(gv(gdlb_counter)) - (size_t)(gv(dda_index));

  for (i = 0; i < comm->np; ++i) {
    gv(armci_dlb_counter)[i]  = gv(armci_mem_addr)[i] + dlb_counter_offset;
    gv(armci_gdlb_counter)[i] = gv(armci_mem_addr)[i] + gdlb_counter_offset;
  }

  ARMCI_PutValueInt(0, gv(dlb_counter),  comm->me);
  ARMCI_PutValueInt(0, gv(gdlb_counter), comm->me);
}
/** @see ddi_armci.h */
void DDI_ARMCI_Release(int handle, int pid, int ltype) {
  DDA_Remote_Index *remoteIndex = gv(dda_remote_index)[handle];
  int commid = gv(dda_index)[handle].commId;
  const DDI_Comm *comm = (const DDI_Comm*)Comm_find(commid);
  int armciPid_;
  int mutex;

  if (pid < 0) pid = comm->me;
  armciPid_ = comm->global_pid[pid];
  mutex = remoteIndex[pid].mutex;

#if defined DDI_ARMCI_LOCK
  DDI_ARMCI_Unlock(mutex, armciPid_);
#endif
}
void DDI_ARMCI_Finalize() {
  int code;
  const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_COMM_WORLD);

#if defined DDI_ARMCI_FREE
  code = ARMCI_Free((void*)(gv(armci_mem_addr)[comm->me]));
  if (code > 0) fprintf(stderr,"ARMCI_Free(%p) failed: %i\n",gv(armci_mem_addr)[comm->me],code);
  code = ARMCI_Free((void*)(gv(armci_cnt_addr)[comm->me]));
  if (code > 0) fprintf(stderr,"ARMCI_Free(%p) failed: %i\n",gv(armci_cnt_addr)[comm->me],code);
#endif

  code = ARMCI_Destroy_mutexes();
  if (code > 0) fprintf(stderr,"ARMCI_Destroy_mutexes failed: %i\n",code);

  ARMCI_Finalize();
}
/** @see ddi_armci.h */
void DDI_ARMCI_Memory_init(size_t size) {
  int code;
  const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_COMM_WORLD);

  code = ARMCI_Malloc(gv(armci_mem_addr), size);
  if (code != 0) {
    fprintf(DDI_STDERR, "%s: ARMCI_Malloc(%p, %zu) returned %i\n",
            DDI_Id(), gv(armci_mem_addr), size, code);
    DDI_Error(DDI_ARMCI_MEMORY_INIT_ERROR, DDI_ARMCI_MEMORY_INIT_ERROR_MESSAGE);
  }
  gv(dda_index) = (DDA_Index*)gv(armci_mem_addr)[comm->me];

  code = ARMCI_Create_mutexes(MAX_DD_ARRAYS);
  if (code != 0) {
    fprintf(DDI_STDERR, "%s: ARMCI_Create_mutexes(%d) returned %i\n",
            DDI_Id(), MAX_DD_ARRAYS, code);
    DDI_Error(DDI_ARMCI_MEMORY_INIT_ERROR, DDI_ARMCI_MEMORY_INIT_ERROR_MESSAGE);
  }
}
/** @see ddi_armci.h */
void DDI_ARMCI_Acquire(int handle, int pid, int ltype, void **array, int *armciPid) {
  DDA_Remote_Index *remoteIndex = gv(dda_remote_index)[handle];
  char *buf = NULL;
  int commId = gv(dda_index)[handle].commId;
  const DDI_Comm *comm = (const DDI_Comm*)Comm_find(commId);
  int armciPid_;
  int mutex;

  if (pid < 0) pid = comm->me;
  armciPid_ = comm->global_pid[pid];
  if (armciPid != NULL) *armciPid = armciPid_;
  mutex = remoteIndex[pid].mutex;

  *array = (void*)((char*)(gv(armci_mem_addr)[armciPid_]) + remoteIndex[pid].offset);

#if defined DDI_ARMCI_LOCK
  DDI_ARMCI_Lock(mutex, armciPid_);
#endif
}
   void DDI_Scattered_comm(int handle,const DDI_Scattered* scattered,int *nsubs,
                           int *ranks,DDI_Scattered *subs,int commid,long *ibuff) {

      int i,j,k,np,me,nsub;
      DDI_Scattered s,*t;
      const DDI_Comm *comm = (const DDI_Comm *) Comm_find(commid);

      np = comm->nn;
      me = comm->my;

      for(i=0,nsub=0; i<np; i++) {

         DDI_Scattered_NDistribP(handle,i,&s);

         if(s.ihi > 0) {
            printf("Error: DDI_Scatter_Acc is not implemented for arrays with more than one row.\n\n");
            fflush(stdout);
            Fatal_error(911);
         }

         for (j=0; j<scattered->nelem; j++) {

            if(s.jlo > (ibuff[j]-1) || s.jhi < (ibuff[j]-1)) {
               // element j is not owned by node i; keep scanning
            } else {
               ranks[nsub] = i;
               t = &subs[nsub];
               t->oper   = scattered->oper;
               t->handle = handle;
               t->nelem  = scattered->nelem;
               t->nlocal = 0;
            // t->start  = j;
            // t->end    = 0;

               for(k=j; k<scattered->nelem; k++) {
                  if( s.jhi < (ibuff[k]-1) ) break;
                  else t->nlocal += 1;
               }

            // t->end = t->start + t->nlocal - 1;

               t->size  = t->nlocal;
               t->size *= sizeof(double);

               ++nsub;
               break;
            } //(s.jlo > (ibuff[j]-1) || s.jhi < (ibuff[j]-1))

         } //(j=0; j<scattered->nelem; j++)

      } // loop over nodes i

      *nsubs = nsub;
   }
   void Comm_create_gpu(int comm_id, int *new_comm_id) {
      int gpu_count = 1;   // dynamic runtime from cuda driver
      int gpu_flag  = 0;

      const DDI_Comm *cur_comm   = (DDI_Comm *) Comm_find(comm_id);
      const DDI_Comm *comm_world = &gv(ddi_base_comm);
      DDI_Comm *new_comm = (DDI_Comm *) Malloc(sizeof(DDI_Comm));
      DDI_Comm *end_comm = (DDI_Comm *) Comm_find_end();

      assert(cur_comm->np > gpu_count);

      // copy the current comm structure into the new one
      memcpy(new_comm, cur_comm, sizeof(DDI_Comm));
      new_comm->smp_with_gpu_comm     = cur_comm->smp_comm;
      new_comm->compute_with_gpu_comm = cur_comm->compute_comm;

      // split mpi comms
      if(cur_comm->me_local < gpu_count) gpu_flag = 1;
      MPI_Comm_split(cur_comm->smp_comm, gpu_flag, cur_comm->me_local, &new_comm->smp_comm);
      MPI_Comm_split(cur_comm->compute_comm, gpu_flag, cur_comm->me, &new_comm->compute_comm);

      // update new_comm with the post-split ranks and sizes
      new_comm->has_gpu = gpu_flag;
      MPI_Comm_rank(new_comm->smp_comm, &new_comm->me_local);
      MPI_Comm_size(new_comm->smp_comm, &new_comm->np_local);
      MPI_Comm_rank(new_comm->compute_comm, &new_comm->me);
      MPI_Comm_size(new_comm->compute_comm, &new_comm->np);

   /* ------------------------------- *\
      Add new_comm to the linked list
   \* ------------------------------- */
      new_comm->id   = *new_comm_id = gv(ddi_comm_id)++;
      new_comm->next = NULL;
      end_comm->next = (void *) new_comm;

      *new_comm_id = new_comm->id;
   }
  void DDI_ARMCI_Acquire(const DDA_ARMCI_Index *index, int handle, int proc, int ltype,
                         void **array, int *armci_proc_ptr) {
    char *buf = NULL;
    int semid  = index[proc].semid;
    int commid = gv(dda_comm)[handle];
    const DDI_Comm *comm = (const DDI_Comm *) Comm_find(commid);
    int armci_proc = comm->global_pid[proc];

  # if defined DDI_MAX_DEBUG
    if(ltype != DDI_READ_ACCESS && ltype != DDI_WRITE_ACCESS) {
      fprintf(stdout,"%s: Invalid lock type requested.\n",DDI_Id());
      Fatal_error(911);
    }
  # endif

    buf  = (char*)gv(armci_mem_addr)[armci_proc];
    buf += index[proc].offset;
    *array = (void*)buf;

  #if defined DDI_ARMCI_LOCK
    // lock on the ARMCI global rank, matching DDI_ARMCI_Release
    DDI_ARMCI_Lock(semid,armci_proc);
  #endif

    *armci_proc_ptr = armci_proc;
  }
/* -------------------------------------------------------------------- *\
   DDI_Create_custom(idim,jdim,jcols,handle)
   =========================================
   [IN]  idim   - Number of rows in the array to be created.
   [IN]  jdim   - Number of columns in the array to be created.
   [IN]  jcols  - Array holding the starting column offset for each
                - compute process, i.e. jcols[i] is the first column
                - owned by process i (values must be non-decreasing).
   [OUT] handle - Handle given to the newly created array.

   Creates a distributed array where the user can customize how the
   array is distributed across the processors.
\* -------------------------------------------------------------------- */
   void DDI_Create_custom(int idim,int jdim,int *jcols,int *handle) {

      int i,np,me,nn,my;
      int inode;
      DDI_INT64 totwrds;
      DDI_INT64 longrows,longcols,longslice,longnd,long2g;
    # ifndef USE_SYSV
      int remote_id;
    # endif
      DDI_Patch patch;
      const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);

      np = comm->np;
      me = comm->me;
      nn = comm->nn;
      my = comm->my;

      Comm_sync(3001,comm);

   /* find an unused handle */
      for (i=0; i<gv(nxtdda); ++i) {
         if (gv(ddacomm)[i] == DDI_COMM_NULL) break;
      }
      if (i==gv(nxtdda)) ++gv(nxtdda);
      *handle = i;

    # ifndef USE_SYSV
      remote_id = my;
    # endif

      DEBUG_ROOT(LVL2,(stdout," DDI: Entering DDI_Create_custom.\n"))
      DEBUG_ROOT(LVL2,(stdout," DDI: Creating Array [%i] - %ix%i=%i.\n",*handle,idim,jdim,idim*jdim))
      DEBUG_OUT(LVL3,(stdout,"%s: Entering DDI_Create_custom.\n",DDI_Id()))

    # ifdef DS_SIGNAL
      if(comm->me_local == 1) {
         signal(SIGALRM,DS_Thread_main);
      }
    # endif

      if(me == 0) {
         if(gv(dda_output)) {
            longrows = idim;
            longcols = jdim;
            totwrds  = longrows*longcols;
            fprintf(stdout," DDI: Creating Array [%i] - %i x %i = %li words.\n",
                    *handle,idim,jdim,totwrds);
            fflush(stdout);
         }
      }

   /*
      Make sure each slice of the distributed array will be under 2 GWords.

      Even on 64-bit hardware, most counting in this program is done with
      32-bit data types, meaning we can't count higher than 2**31-1.

      If on 32-bit hardware, the 'long' data types here will be 32 bits,
      and so we'll see crazy products, including values less than zero.
      In its present form, nothing will be trapped here on a 32-bit machine!
   */
      longrows = idim;
      longcols = jdim;
      totwrds  = longrows*longcols;

   /* A total distributed array over 2 GWords is OK, but each     */
   /* slice (MEMDDI per data server) must be under 2 GWords.      */
   /* TCP/IP has gv(nd)=-1 (uninitialized).                       */
   /* Cray on one node has gv(nd)=0 since no data server exists.  */
    # if defined DDI_MPI
      longnd = gv(nd);
      if (longnd <= 0) longnd=1;
    # endif
    # if defined DDI_SOC
      longnd = np;
    # endif

      longslice = totwrds/longnd;

   /* next is the largest signed 32-bit integer, stored as a 64-bit quantity */
      long2g = 2147483643;
      if (longslice > long2g) {
         fprintf(stdout,"\n");
         fprintf(stdout," DDI: trouble creating distributed array!\n");
         fprintf(stdout," Current number of data servers is %li\n",longnd);
         fprintf(stdout," so each data server's slice of array");
         fprintf(stdout," [%i] is %li words\n",*handle,longslice);
         fprintf(stdout,"\n");
         fprintf(stdout," Add more processors so required total array");
         fprintf(stdout," size %li words\n",totwrds);
         fprintf(stdout," divided by no. of processors (data servers)");
         fprintf(stdout," is less than 2 Gwords= %li\n",long2g);
         fprintf(stdout," For example, %li or more data servers...\n",
                 1+totwrds/long2g);
         fprintf(stdout,"\n");
         fflush(stdout);
         Fatal_error(911);
      }

   /* ------------------------------------ *\
      Ensure 'jcols' is properly formatted
   \* ------------------------------------ */
      for(i=0; i<np; i++) {
         if(jcols[i] < 0 && me == 0) {
            fprintf(stdout," Error in argument 3 of DDI_Create_custom: Values must be >= 0.\n");
            Fatal_error(911);
         }

         if(i > 0)
         if(jcols[i] < jcols[i-1]) {
            fprintf(stdout," Error in argument 3 of DDI_Create_custom: Values must increase monotonically.\n");
            Fatal_error(911);
         }
      }

   /* ----------------------------------------------------------------- *\
      Check to ensure the maximum number of arrays hasn't been reached.
   \* ----------------------------------------------------------------- */
      if( gv(nxtdda) == MAX_DD_ARRAYS ) {
         if(me == 0) {
            fprintf(stderr," DDI Error: The maximum number of distributed arrays [%i] has been reached.\n",MAX_DD_ARRAYS);
            fprintf(stderr," Information: The maximum number of distributed arrays is a DDI compile-time option.\n");
         }
         Fatal_error(911);
      }

      gv(nrow)[*handle] = idim;
      gv(ncol)[*handle] = jdim;
      gv(ddacomm)[*handle] = gv(ddi_working_comm);

   /* ---------------------------------------------------- *\
      Generate Column Mapping by Compute Process & by Node
   \* ---------------------------------------------------- */
      for(i=0,inode=-1; i<np; i++) {
         gv(pcmap)[*handle][i] = jcols[i];
      /* if(inode == gv(ddiprocs)[i].node) continue; */
         if(inode == comm->local_nid[i]) continue;
         gv(ncmap)[*handle][++inode] = gv(pcmap)[*handle][i];
      }

      gv(pcmap)[*handle][np] = jdim;
      gv(ncmap)[*handle][nn] = jdim;

   /* -------------------------- *\
      Get local patch dimensions
   \* -------------------------- */
      DDI_DistribP(*handle,me,&patch);

   /* ----------------------------- *\
      Create Distributed Data Array
   \* ----------------------------- */
      patch.handle = *handle;
    # if defined WINTEL
      patch.oper   = DDI_CREATE_OP;
    # else
      patch.oper   = DDI_CREATE;
    # endif
      patch.size   = jdim;

    # if defined USE_SYSV || defined DDI_ARMCI || defined DDI_MPI2
      DDI_Index_create(&patch);
    # else
      DDI_Send_request(&patch,&remote_id,NULL);
    # endif

   /* ----------------------------- *\
      Synchronize Compute Processes
   \* ----------------------------- */
      Comm_sync(3002,comm);

      DEBUG_OUT(LVL3,(stdout,"%s: Leaving DDI_Create_custom.\n",DDI_Id()))
   }
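/* -------------------------------------------------------------------- *\
   Illustrative sketch only (excluded from the build): build a custom
   starting-offset map that gives process 0 roughly twice as many
   columns as the others, then create the array.  The uneven split is
   an arbitrary example, not a recommendation.
\* -------------------------------------------------------------------- */
# if 0
   void example_ddi_create_custom(int idim,int jdim) {
      int i,icol,handle;
      int jcols[MAX_PROCESSORS];
      const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);
      int np   = comm->np;
      int base = jdim/(np+1);               /* unit share; process 0 gets two units */

      for(i=0,icol=0; i<np; i++) {
         jcols[i] = icol;                   /* first column owned by process i */
         icol += base;
         if(i == 0) icol += base;           /* double share for process 0      */
      }

      DDI_Create_custom(idim,jdim,jcols,&handle);
   }
# endif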