/* ----------------------------------------------------------------- *\
   DDI_GetAcc_server(patch,from)
   =============================
   [IN] patch - structure containing ilo, ihi, jlo, jhi, etc.
   [IN] from  - rank of DDI process sending data to be accumulated.

   Used by the data server to accept incoming data and perform a
   local accumulate.  Note, the fence is raised to protect the array
   from local get/put operations until the accumulate has finished.
\* ----------------------------------------------------------------- */
   void DDI_GetAcc_server(const DDI_Patch *msg,int from) {

   /* --------------- *\
      Local Variables
   \* --------------- */
      char ack = 57;
      void *buffer = NULL;

   /* -------------------------------------------------------------------- *\
      Raise protective fence.  This is necessary because a compute process
      can finish with the DDI_Acc subroutine before the remote data server
      has finished accumulating the patch.
   \* -------------------------------------------------------------------- */
    # if defined USE_SYSV
      DDI_Fence_acquire(msg->handle);
      DDI_Send(&ack,1,from);
    # endif

   /* ----------------------------------------------------------------- *\
      If enough memory is available to receive all the data in a single
      message, then do so ... otherwise receive and update in batches.
      *TODO: Implement the second option*
   \* ----------------------------------------------------------------- */
      DDI_Memory_push(msg->size,&buffer,NULL);

   /* ----------------------- *\
      Receive and update data
   \* ----------------------- */
      DDI_Recv(buffer,msg->size,from);
      DDI_GetAcc_local(msg,buffer);
      DDI_Send(buffer,msg->size,from);

   /* ------------------- *\
      Free receive buffer
   \* ------------------- */
      DDI_Memory_pop(msg->size);

   /* --------------- *\
      Take down fence
   \* --------------- */
    # if defined USE_SYSV
      DDI_Fence_release(msg->handle);
    # endif
   }
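
/* -------------------------------------------------------------------- *\
   Illustrative sketch only (not part of the DDI library): the element-
   wise fetch-and-add that DDI_GetAcc_local is assumed to perform on the
   locally stored segment of the patch.  The caller's buffer supplies
   the contributions to be accumulated and, on return, is assumed to
   hold the original array values, which is why the server above sends
   the buffer back after the local accumulate.  The names and the
   double data type are illustrative assumptions.
\* -------------------------------------------------------------------- */
   static void getacc_local_sketch(double *array_segment,double *buffer,size_t n) {
      size_t i;
      double original;
      for(i=0; i<n; i++) {
         original         = array_segment[i];       /* fetch the old value         */
         array_segment[i] = original + buffer[i];   /* accumulate the contribution */
         buffer[i]        = original;               /* hand the old value back     */
      }
   }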
/* ---------------------------------------------------------- *\
   DDI_GDLBNext(counter)
   =====================
   [OUT] counter - value of the load balance counter returned
                   to the calling process.

   An atomic operation that sets the value of counter to the
   value of the global load-balance counter, then increments
   the global counter.
\* ---------------------------------------------------------- */
   void DDI_GDLBNext(size_t *counter) {
      int np,me,nn,my,tmp_scope,remote_id=0;
      DDI_Patch Patch;

    # if defined DDI_LAPI
      lapi_cntr_t org_cntr;
      uint tgt          = gv(lapi_map)[0];
      int *tgt_var      = gv(lapi_gdlb_cntr_addr)[tgt];
      int  in_val       = 1;
      int  prev_tgt_val = -1;
    # endif

    # if defined DDI_ARMCI
      DDI_ARMCI_GDLBNext(counter);
      return;
    # endif

      DDI_NProc(&np,&me);

      if(me == 0) {
      /* ---------------------------------- *\
         We need to work in the world scope
      \* ---------------------------------- */
         tmp_scope = DDI_WORKING_COMM;
         gv(ddi_working_comm) = DDI_COMM_WORLD;

         DDI_NProc(&np,&me);
         DDI_NNode(&nn,&my);

       # if FULL_SMP
       # if defined DDI_LAPI
         if(LAPI_Setcntr(gv(lapi_hnd),&org_cntr,0) != LAPI_SUCCESS) {
            fprintf(stdout,"%s: LAPI_Setcntr failed in DDI_GDLBNext.\n",DDI_Id());
            Fatal_error(911);
         }

         if(LAPI_Rmw(gv(lapi_hnd),FETCH_AND_ADD,tgt,tgt_var,&in_val,
                     &prev_tgt_val,&org_cntr) != LAPI_SUCCESS) {
            fprintf(stdout,"%s: LAPI_Rmw failed in DDI_GDLBNext.\n",DDI_Id());
            Fatal_error(911);
         }

         if(LAPI_Waitcntr(gv(lapi_hnd),&org_cntr,1,NULL) != LAPI_SUCCESS) {
            fprintf(stdout,"%s: LAPI_Waitcntr failed in DDI_GDLBNext.\n",DDI_Id());
            Fatal_error(911);
         }

         if(prev_tgt_val == -1) {
            fprintf(stdout,"%s: LAPI version of DDI_GDLBNext is not working correctly.\n",DDI_Id());
            Fatal_error(911);
         } else {
            *counter = (size_t) prev_tgt_val;
         }
       # else
         if(my == 0) {
            DDI_GDLBNext_local(counter);
         } else {
            Patch.oper = DDI_GDLBNEXT;
            DDI_Send_request(&Patch,&remote_id,NULL);
            DDI_Recv(counter,sizeof(size_t),remote_id);
         }
       # endif
       # else
         Patch.oper = DDI_GDLBNEXT;
         DDI_Send_request(&Patch,&remote_id,NULL);
         DDI_Recv(counter,sizeof(size_t),remote_id);
       # endif

      /* --------------------------- *\
         Return to the working scope
      \* --------------------------- */
         gv(ddi_working_comm) = tmp_scope;
      }

      DDI_BCast(counter,sizeof(size_t),0);
   }
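
/* -------------------------------------------------------------------- *\
   Illustrative sketch only (not part of the DDI library): the fetch-
   and-increment that the routine above performs on the global load-
   balance counter, whether through LAPI_Rmw(FETCH_AND_ADD), the ARMCI
   path, or a request to the data server.  The shared counter and any
   locking around it are assumed to be managed by the caller.
\* -------------------------------------------------------------------- */
   static size_t gdlbnext_sketch(size_t *global_counter) {
      size_t mine = *global_counter;   /* value handed to the calling process */
      *global_counter = mine + 1;      /* advance the shared counter          */
      return mine;
   }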
/* -------------------------------------------------------------- *\
   DDI_GetAccP(handle,patch,buff)
   ==============================
   [IN] handle - Handle of the distributed array to be accessed.
   [IN] patch  - structure containing ilo, ihi, jlo, jhi, etc.
   [IN] buff   - Data segment to be operated on.
\* -------------------------------------------------------------- */
   void DDI_GetAccP(int handle,DDI_Patch *patch,void *buff) {

   /* --------------- *\
      Local Variables
   \* --------------- */
      char ack=57;
      int i,np,me,nn,my,remote_id,nsubp;
      int ranks[MAX_NODES];
      DDI_Patch subp[MAX_NODES];
      char *working_buffer = (char *) buff;

    # if defined DDI_LAPI
      DDI_Patch *local_patch = NULL;
      lapi_cntr_t cntr[MAX_NODES];
    # endif

      STD_DEBUG((stdout,"%s: Entering DDI_GetAccP.\n",DDI_Id()))

   /* -------------------- *\
      Process OR Node Rank
   \* -------------------- */
      DDI_NProc(&np,&me);
      DDI_NNode(&nn,&my);

   /* ------------------------------------- *\
      Ensure the patch has the correct info
   \* ------------------------------------- */
      patch->oper   = DDI_GETACC;
      patch->handle = handle;

   /* ---------------------------------- *\
      Check calling arguments for errors
   \* ---------------------------------- */
    # if defined DDI_CHECK_ARGS
      if(handle < 0 || handle >= gv(ndda)) {
         fprintf(stdout,"%s: Invalid handle [%i] in DDI_GetAcc.\n",DDI_Id(),handle);
         Fatal_error(911);
      }

      if(patch->ilo > patch->ihi || patch->ilo < 0 || patch->ihi >= gv(nrow)[handle]) {
         fprintf(stdout,"%s: Invalid row dimensions during DDI_GetAcc => ilo=%i ihi=%i.\n",
                 DDI_Id(),patch->ilo,patch->ihi);
         Fatal_error(911);
      }

      if(patch->jlo > patch->jhi || patch->jlo < 0 || patch->jhi >= gv(ncol)[handle]) {
         fprintf(stdout,"%s: Invalid column dimensions during DDI_GetAcc => jlo=%i jhi=%i.\n",
                 DDI_Id(),patch->jlo,patch->jhi);
         Fatal_error(911);
      }
    # endif

   /* ------------------------------ *\
      Log some simple profiling info
   \* ------------------------------ */
    # if defined DDI_COUNTERS
      gv(acc_profile).ncalls++;
      gv(acc_profile).nbytes += DDI_Patch_sizeof(patch);
    # endif

   /* ------------------------------------------------------- *\
      Determine where the pieces of the requested patch exist
   \* ------------------------------------------------------- */
      DDI_Subpatch(handle,patch,&nsubp,ranks,subp);
      MAX_DEBUG((stdout,"%s: %i subpatches.\n",DDI_Id(),nsubp))

   /* ------------------------------------------------------------------- *\
      Send data requests for all non-local pieces of the requested patch.
      GetAcc any local portion of the patch immediately.
   \* ------------------------------------------------------------------- */
      for(i=0; i<nsubp; i++) {

         ULTRA_DEBUG((stdout,"%s: GetAccumulating subpatch %i.\n",DDI_Id(),i))

      /* ------------------------------------------------------------- *\
         Using SysV, take advantage of shared-memory for a local patch
      \* ------------------------------------------------------------- */
       # if defined USE_SYSV

      /* ------------------------------------------------ *\
         Determine if the ith patch is local to 'my' node
      \* ------------------------------------------------ */
         if(ranks[i] == my) {
            MAX_DEBUG((stdout,"%s: Subpatch %i is local.\n",DDI_Id(),i))

         /* ------------------------------------------------------ *\
            Using LAPI, perform the local Getacc after all the data
            requests have been sent ==> maximize concurrency.
         \* ------------------------------------------------------ */
          # if defined DDI_LAPI
            local_patch = &subp[i];
            local_patch->cp_buffer_addr = working_buffer;
          # else
         /* ------------------------------------------------ *\
            Otherwise, perform the local Getacc immediately.
         \* ------------------------------------------------ */
            DDI_GetAcc_local(&subp[i],working_buffer);
          # endif

         /* ------------------------------------------------------- *\
            Move the working buffer to the next patch and continue.
         \* ------------------------------------------------------- */
            working_buffer += subp[i].size;
            continue;
         }
       # endif

      /* --------------------------------- *\
         If the current patch is NOT local
      \* --------------------------------- */
         remote_id = ranks[i];

      /* ----------------------------------------------- *\
         Using LAPI, include some extra information
      \* ----------------------------------------------- */
       # if defined DDI_LAPI
         subp[i].cp_lapi_id     = gv(lapi_map)[me];
         subp[i].cp_lapi_cntr   = (void *) &cntr[i];
         subp[i].cp_buffer_addr = (void *) working_buffer;
         LAPI_Setcntr(gv(lapi_hnd),&cntr[i],0);

         ULTRA_DEBUG((stdout,"%s: cp_lapi_id=%i.\n",DDI_Id(),gv(lapi_map)[me]))
         ULTRA_DEBUG((stdout,"%s: cp_lapi_cntr=%x.\n",DDI_Id(),&cntr[i]))
         ULTRA_DEBUG((stdout,"%s: cp_buffer_addr=%x.\n",DDI_Id(),working_buffer))
       # endif

      /* -------------------------------- *\
         Send data request for subpatch i
      \* -------------------------------- */
         MAX_DEBUG((stdout,"%s: Sending data request to node %i.\n",DDI_Id(),remote_id))
         DDI_Send_request(&subp[i],&remote_id,NULL);
         MAX_DEBUG((stdout,"%s: data request sent to global process %i.\n",DDI_Id(),remote_id))

      /* ------------------------------------------------------------ *\
         Receive an acknowledgement that the data server has raised
         a fence that will protect the distributed array from get or
         put access until all accumulates have finished.  This
         blocking receive ensures that the current process executing
         this accumulate can *NOT* finish until the fence has been
         raised.
      \* ------------------------------------------------------------ */
       # if !defined DDI_LAPI
       # if defined USE_SYSV
         MAX_DEBUG((stdout,"%s: Receiving remote fence ACK.\n",DDI_Id()))
         DDI_Recv(&ack,1,remote_id);
       # endif

      /* --------------------------------------------------- *\
         Send subpatch i to remote_id; recv the values back
      \* --------------------------------------------------- */
         MAX_DEBUG((stdout,"%s: Sending subpatch %i to %i.\n",DDI_Id(),i,remote_id))
         DDI_Send(working_buffer,subp[i].size,remote_id);
         DDI_Recv(working_buffer,subp[i].size,remote_id);
       # endif

      /* ------------ *\
         Shift buffer
      \* ------------ */
         working_buffer += subp[i].size;
      }

   /* ----------------------------------------------------------- *\
      Using LAPI, perform the local Getaccumulate (if needed) as
      the remote processes are getting the data to Getaccumulate
      on the target processes.  Then wait for all the data to be
      copied out of the buffer before returning.
   \* ----------------------------------------------------------- */
    # if defined DDI_LAPI

   /* ----------------------------------------- *\
      GetAccumulate the local patch (if exists)
   \* ----------------------------------------- */
      if(local_patch) DDI_GetAcc_local(local_patch,local_patch->cp_buffer_addr);

   /* ---------------------------------------------------------- *\
      Wait for all remote LAPI_Gets to finish copying local data
   \* ---------------------------------------------------------- */
      for(i=0; i<nsubp; i++) {
         if(subp[i].cp_lapi_cntr) {
            ULTRA_DEBUG((stdout,"%s: Wait for subpatch %i to be copied.\n",DDI_Id(),i))
            LAPI_Waitcntr(gv(lapi_hnd),&cntr[i],3,NULL);
            ULTRA_DEBUG((stdout,"%s: Subpatch %i copy completed.\n",DDI_Id(),i))
         }
      }
    # endif
   }
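
/* -------------------------------------------------------------------- *\
   Illustrative usage sketch only (not part of the DDI library): a
   compute process get-accumulating a block of contributions into a
   patch of distributed array 'handle'.  DDI_GetAccP fills in the
   operation code and handle itself, so the caller only sets the patch
   bounds.  The bounds, the array handle, and the assumption that the
   buffer comes back holding the original array values are illustrative.
\* -------------------------------------------------------------------- */
   void example_getaccp_usage(int handle,double *contrib) {
      DDI_Patch patch;
      patch.ilo = 0;   patch.ihi = 9;   /* rows 0..9 of the patch (hypothetical)    */
      patch.jlo = 0;   patch.jhi = 4;   /* columns 0..4 of the patch (hypothetical) */

   /* contrib must hold (ihi-ilo+1)*(jhi-jlo+1) = 50 doubles; on return
      it is assumed to contain the pre-accumulate array values.         */
      DDI_GetAccP(handle,&patch,contrib);
   }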
/* ---------------------------- *\
   FORTRAN Wrapper for DDI_Recv
\* ---------------------------- */
   void F77_Recv(void *buff,int_f77 *size,int_f77 *from) {
      size_t isize = (size_t) *size;
      int ifrom    = (int) *from;
      DDI_Recv(buff,isize,ifrom);
   }
/* -------------------------------------------------------------------- *\
   DDI_Timer_output()
   ==================
   Synchronous barrier on compute processes, but also collects total
   cpu time from each compute process and prints the totals to stdout.
\* -------------------------------------------------------------------- */
   void DDI_Timer_output() {
      int i,me,np;
      struct rusage mycputime;
      struct rusage *timings = NULL;
      struct timeval cpu_total;
      struct timeval wall_total;

      DDI_NProc(&np,&me);
      DDI_Sync(3081);

      if(me == 0) {
         timings = (struct rusage *) Malloc(np*sizeof(struct rusage));
         getrusage(RUSAGE_SELF,timings);
         gettimeofday(&wall_total,NULL);
      } else {
         getrusage(RUSAGE_SELF,&mycputime);
         timings = &mycputime;
      }

      timings->ru_utime.tv_sec  -= gv(cpu_timer).ru_utime.tv_sec;
      timings->ru_utime.tv_usec -= gv(cpu_timer).ru_utime.tv_usec;
      if(timings->ru_utime.tv_usec < 0) {
         timings->ru_utime.tv_sec--;
         timings->ru_utime.tv_usec += 1000000;
      }

      timings->ru_stime.tv_sec  -= gv(cpu_timer).ru_stime.tv_sec;
      timings->ru_stime.tv_usec -= gv(cpu_timer).ru_stime.tv_usec;
      if(timings->ru_stime.tv_usec < 0) {
         timings->ru_stime.tv_sec--;
         timings->ru_stime.tv_usec += 1000000;
      }

      wall_total.tv_sec  -= gv(wall_timer).tv_sec;
      wall_total.tv_usec -= gv(wall_timer).tv_usec;
      if(wall_total.tv_usec < 0) {
         wall_total.tv_sec--;
         wall_total.tv_usec += 1000000;
      }

      if(me == 0) {
         for(i=1; i<np; i++) DDI_Recv(&timings[i],sizeof(struct rusage),i);

         fprintf(stdout,"\n ------------------------------------------------");
         fprintf(stdout,"\n CPU timing information for all compute processes");
         fprintf(stdout,"\n ================================================");

         for(i=0; i<np; i++) {

            cpu_total.tv_sec  = timings[i].ru_utime.tv_sec  + timings[i].ru_stime.tv_sec;
            cpu_total.tv_usec = timings[i].ru_utime.tv_usec + timings[i].ru_stime.tv_usec;

            if(cpu_total.tv_usec > 1000000) {
               cpu_total.tv_sec++;
               cpu_total.tv_usec -= 1000000;
            }

            fprintf(stdout,"\n %4i: %d.%.6d + %d.%.6d = %d.%.6d",i,
                    (int)timings[i].ru_utime.tv_sec,(int)timings[i].ru_utime.tv_usec,
                    (int)timings[i].ru_stime.tv_sec,(int)timings[i].ru_stime.tv_usec,
                    (int)cpu_total.tv_sec,(int)cpu_total.tv_usec);
         }

         fprintf(stdout,"\n Wall: %d.%.6d",
                 (int) wall_total.tv_sec, (int) wall_total.tv_usec);
         fprintf(stdout,"\n ================================================\n\n");
         fflush(stdout);
         free(timings);

      } else {
         DDI_Send(&mycputime,sizeof(struct rusage),0);
      }

      DDI_Sync(3082);
   }
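
/* -------------------------------------------------------------------- *\
   Illustrative usage sketch only (not part of this file): bracketing a
   timed region on all compute processes.  DDI_Timer_reset() is assumed
   to be the companion routine that records the baselines gv(cpu_timer)
   and gv(wall_timer) subtracted above; do_expensive_work() stands in
   for a hypothetical application kernel.
\* -------------------------------------------------------------------- */
   extern void do_expensive_work(void);   /* hypothetical application kernel */

   void example_timed_region(void) {
      DDI_Timer_reset();     /* assumed companion: record baseline cpu/wall times */
      do_expensive_work();   /* hypothetical work to be timed                     */
      DDI_Timer_output();    /* collective: print per-process cpu and wall totals */
   }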