Example #1
/* -------------------------------------------------------------------- *\
   DDI_Create(idim,jdim,handle)
   ============================
   [IN]  idim   - Number of rows in the array to be created.
   [IN]  jdim   - Number of columns in the array to be created.
   [OUT] handle - Handle given to the newly created array.
   
   Creates a distributed array with the columns evenly divided amongst
   the processors.
\* -------------------------------------------------------------------- */
   void DDI_Create(int idim,int jdim,int *handle) {

   /* --------------- *\
      Local Variables
   \* --------------- */
      int i,np,me;
      int icol,mincol,lftcol;
      int jcols[MAX_PROCESSORS];
      const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);

      DEBUG_ROOT(LVL1,(stdout," DDI: Entering DDI_Create.\n"))
      DEBUG_OUT(LVL3,(stdout,"%s: Entering DDI_Create.\n",DDI_Id()))
     
      np = comm->np;
      me = comm->me;
/*      
      if(jdim < np && me == 0) {
         fprintf(stdout," DDI Error: Trying to create an array with fewer columns than processors.\n");
         fprintf(stdout," DDI Error: Reduce the number of processors and try again.\n");
         Fatal_error(911);
      }
 */           
      mincol = jdim / np;
      lftcol = jdim % np;
      
      for(i=0,icol=0; i<np; i++) {
         jcols[i] = icol;
         icol += mincol;
         if(i<lftcol) icol++;
      }
      
      DDI_Create_custom(idim,jdim,jcols,handle);
      
      DEBUG_ROOT(LVL2,(stdout," DDI: Array[%i] successfully created.\n",*handle)) 
   }
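/* Usage sketch (not part of the library source): a caller creates a
   distributed array, works with it through the returned handle, and is
   assumed to release it with DDI_Destroy.  The dimensions are illustrative. */
   void example_ddi_create_usage(void) {
      int handle;
      DDI_Create(1000,4000,&handle);   /* 1000 rows x 4000 columns, columns split across processes */
      /* ... access the array through 'handle', e.g. with DDI_Put/DDI_Get ... */
      DDI_Destroy(handle);             /* assumed cleanup counterpart of DDI_Create */
   }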
Example #2
   void Comm_divide(int ngroups, int comm_id, int *new_comm_id) {

      int i,in,nt,npg,nr;
      const DDI_Comm *cur_comm = (const DDI_Comm *) Comm_find(comm_id);

      int *list = (int *) Malloc(ngroups*sizeof(int));

      if(ngroups <=0 || ngroups > cur_comm->nn) {
         fprintf(stdout,"%s: ngroups=%i (arg #1 of DDI_Comm_divide) is invalid.\n",DDI_Id,ngroups);
         Fatal_error(911);
      }

      nt  = cur_comm->nn;
      npg = nt / ngroups;
      nr  = nt % ngroups;

      for(i=0,in=0; i<ngroups; i++) {
         list[i] = npg;
         if(i < nr) list[i]++;
      }

      Comm_divide_custom(ngroups,list,comm_id,new_comm_id);

      free(list);
   }
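/* Usage sketch (illustrative only): split the nodes of the working
   communicator into four groups of near-equal size.  Passing the
   DDI_WORKING_COMM id directly to Comm_divide is an assumption here. */
   void example_comm_divide_usage(void) {
      int group_comm;
      Comm_divide(4,DDI_WORKING_COMM,&group_comm);
      /* ... collective work inside the subgroup identified by group_comm ... */
   }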
Example #3
// figure out remote memory address of dynamic load balancer
void DDI_ARMCI_DLB_addr() {
    armci_counter_t *cnt_ptr;
    const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);
    
    cnt_ptr = gv(armci_cnt_addr)[comm->global_pid[0]];
    gv(dlb_counter) = (size_t*)cnt_ptr;
}
Example #4
/** @see ddi_armci.h */
void DDI_ARMCI_GDLBNext(size_t *counter) {
    int tmp;
    const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);
    if (comm->me == 0) ARMCI_Rmw(ARMCI_FETCH_AND_ADD, &tmp, gv(armci_gdlb_counter)[0], 1, 0);
    MPI_Bcast(&tmp, sizeof(int), MPI_BYTE, 0, comm->compute_comm);
    *counter = (size_t)tmp;
}
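/* Sketch of how the shared counter above can drive coarse dynamic load
   balancing (illustrative; do_task() is a hypothetical placeholder).  Since
   DDI_ARMCI_GDLBNext broadcasts the value, every compute process in the
   group receives the same task index on each collective call. */
   void example_gdlb_loop(size_t ntasks) {
      size_t task;
      DDI_ARMCI_GDLBNext(&task);
      while(task < ntasks) {
         /* do_task(task);  -- the whole group cooperates on this task */
         DDI_ARMCI_GDLBNext(&task);
      }
   }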
Example #5
void DDI_ARMCI_Memory_init(size_t size) {
  int code;
  const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_COMM_WORLD);
  
  // malloc ARMCI memory
  code = ARMCI_Malloc((void*)gv(armci_mem_addr),size);
  if (code > 0) {
    ARMCI_Error("ARMCI_Malloc failed",code);
    Fatal_error(911);
  }
  gv(dda_index) = (DDA_Index*)gv(armci_mem_addr)[comm->me];

  // malloc ARMCI counter block and set addresses
  code = ARMCI_Malloc((void*)gv(armci_cnt_addr),sizeof(armci_counter_t)*2);
  if (code > 0) {
    ARMCI_Error("ARMCI_Malloc failed",code);
    Fatal_error(911);
  }
  ARMCI_PutValueLong(0, (void*)(gv(armci_cnt_addr)[comm->me]+0), comm->me);
  ARMCI_PutValueLong(0, (void*)(gv(armci_cnt_addr)[comm->me]+1), comm->me);
  DDI_ARMCI_DLB_addr();
  DDI_ARMCI_GDLB_addr();
  
  // create mutexes
  code = ARMCI_Create_mutexes(MAX_DD_ARRAYS+1);
  if (code > 0) {
    ARMCI_Error("ARMCI_Create_mutexes failed",code);
    Fatal_error(911);
  }
  gv(dlb_access) = MAX_DD_ARRAYS;
}
Example #6
   void Comm_divide(int ngroups, int comm_id, int *new_comm_id) {

      int i,in,nt,npg,nr;
      int err;
      const DDI_Comm *cur_comm = (const DDI_Comm *) Comm_find(comm_id);

      int *list = (int *) Malloc(ngroups*sizeof(int));

      if(ngroups <=0 || ngroups > cur_comm->nn) {
         fprintf(stdout,"%s: ngroups=%i (arg #1 of DDI_Comm_divide) is invalid.\n",DDI_Id,ngroups);
         Fatal_error(911);
      }

      nt  = cur_comm->nn;
      npg = nt / ngroups;
      nr  = nt % ngroups;

      for(i=0,in=0; i<ngroups; i++) {
         list[i] = npg;
         if(i < nr) list[i]++;
      }

#if defined DDI_BGL
      if (DDI_BGL_Comm_divide_custom(ngroups, list, comm_id, new_comm_id) != 0) {
	  if (cur_comm->me == 0) printf("%s: Groups are not aligned to BG/L psets.\n", DDI_Id());
	  Comm_divide_custom(ngroups,list,comm_id,new_comm_id);
      }
#else
      Comm_divide_custom(ngroups,list,comm_id,new_comm_id);
#endif

      free(list);
   }
Example #7
   void DDI_ISend(void *buffer,size_t size,int to,int *req_val) {

      int global_pid;
      DDI_Request *req = &gv(isend_req);
      const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);
    
    # if defined DDI_CHECK_ARGS
      if(to < 0 || to > comm->np) {
         fprintf(stdout,"%s: can not send to ddi process %i.\n",DDI_Id(),to);
         Fatal_error(911);
      }
    # endif

      
    # if defined DDI_SOC && !defined DDI_MPI
      pthread_attr_t thread_attr;
      pthread_attr_init(&thread_attr);
      pthread_attr_setscope(&thread_attr,PTHREAD_SCOPE_SYSTEM);
      req->to     = to;
      req->size   = size;
      req->buffer = buffer;
      if(pthread_create(&req->hnd_thread,&thread_attr,DDI_ISend_thread,req) != 0) {
         fprintf(stderr,"%s: pthread_create failed in DDI_ISend.\n",DDI_Id());
         Fatal_error(911);
      }
    # endif
     
    # if defined DDI_MPI
      MPI_Isend(buffer,size,MPI_BYTE,to,1,comm->compute_comm,req);
    # endif
     
      ULTRA_DEBUG((stdout,"%s: non-blocking send to %i issued.\n",DDI_Id(),to))

      *req_val = 1;
   }
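/* Sketch of pairing DDI_ISend with a completion call so computation can
   overlap the transfer (illustrative; DDI_Wait is assumed to be the matching
   wait routine for the request value set above). */
   void example_isend_usage(void *buf, size_t nbytes, int dest) {
      int req;
      DDI_ISend(buf,nbytes,dest,&req);
      /* ... unrelated work while the send is in flight ... */
      DDI_Wait(req);
   }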
Example #8
void DDI_ARR_select_server(DDI_Patch *dAPatch, int rank) {
  DDI_ARR_Element element;
  const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);

  DDI_ARR_select_local(dAPatch, &element);
  Comm_send(&element, sizeof(DDI_ARR_Element), rank, comm);
}
Example #9
   void Comm_divide_custom(int ngroups, int *list, int comm_id, int *new_comm_id) {

      int nt;
      int i,in,ip,mygroup,sp,ep,np;
      const DDI_Comm *cur_comm = (const DDI_Comm *) Comm_find(comm_id);

      int *my_ids      = NULL;
      int *sn_by_group = (int *) Malloc((ngroups+1)*sizeof(int));

      if(ngroups <=0 || ngroups > cur_comm->nn) {
         fprintf(stdout,"%s: ngroups=%i (arg #1 of DDI_Comm_divide) is invalid.\n",DDI_Id,ngroups);
         Fatal_error(911);
      }

      for(i=0,nt=0; i<ngroups; i++) nt += list[i];

      if(nt != cur_comm->nn) {
         fprintf(stdout," DDI: invalid list of groups sizes in divide_custom.\n");
         Fatal_error(911);
      } 

      for(i=0,in=0; i<ngroups; i++) {
         sn_by_group[i] = in;
         in += list[i];
      }
      sn_by_group[ngroups] = in;

      mygroup = 0;
      while(mygroup < ngroups && sn_by_group[mygroup+1] <= cur_comm->my) mygroup++;

      if(mygroup == ngroups) {
         fprintf(stdout,"%s: unable to find my spot in the new groups.\n",DDI_Id());
         Fatal_error(911);
      }

      DEBUG_OUT(LVL4,(stdout,"%s: mygroup=%i\n",DDI_Id(),mygroup))
      
      sp = cur_comm->node_master[sn_by_group[mygroup]];

      if(mygroup+1 == ngroups) ep = cur_comm->np;
      else                     ep = cur_comm->node_master[sn_by_group[mygroup+1]];
      np = ep - sp;

      my_ids = (int *) Malloc(np*sizeof(int));

      for(ip=0,i=0; ip<cur_comm->np; ip++) {
         if(cur_comm->global_pid[ip] >= sp && cur_comm->global_pid[ip] < ep) my_ids[i++]=ip;
      }

      if(i != np) {
         fprintf(stdout,"%s: could not find %i processes expected for the new comm.\n",DDI_Id(),np);
         Fatal_error(911);
      }

      Comm_create(np,my_ids,ngroups,mygroup,comm_id,new_comm_id);

      free(my_ids);
      free(sn_by_group);
   }
Example #10
void DDI_ARMCI_DLBNext(size_t *counter) {
    long tmp;    /* ARMCI_FETCH_AND_ADD_LONG stores a long result */
    const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);

    // increment counter
    ARMCI_Rmw(ARMCI_FETCH_AND_ADD_LONG,&tmp,(void*)gv(dlb_counter),1,comm->global_pid[0]);
    *counter = (size_t)tmp;
}
Example #11
/** @see ddi_armci.h */
void DDI_ARMCI_DLBNext(size_t *counter) {
    long buf;
    const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);
    int armciPid = comm->global_pid[0];
    /* long is used in case signed int is too small */
    ARMCI_Rmw(ARMCI_FETCH_AND_ADD_LONG, (void*)(&buf), gv(armci_dlb_counter)[armciPid], 1, armciPid);
    *counter = (size_t)buf;
}
Example #12
   void DDI_Comm_destroy(int commid) {
      const DDI_Comm *comm = (const DDI_Comm *) Comm_find(commid);

      DDI_Comm *curr_comm = &gv(ddi_base_comm);
      DDI_Comm *prev_comm = NULL;
/*
      DDI_Data *curr_data = &gv(ddi_base_data);
      DDI_Data *prev_data = NULL;
*/
      Comm_sync(124,comm);

      if(commid == DDI_COMM_WORLD) {
         fprintf(stdout,"%s: Cannot destroy DDI_COMM_WORLD.\n",DDI_Id());
         Fatal_error(911);
      }

      while(curr_comm->next && curr_comm->id != commid) {
         prev_comm = curr_comm;
         curr_comm = (DDI_Comm *) curr_comm->next;
      }

      if(curr_comm->id != commid) {
         fprintf(stdout,"%s: Error in DDI_Comm_destroy - Comm not found.\n",DDI_Id());
         Fatal_error(911);
      }
/*
      while(curr_data->next && curr_data->id != curr_comm->data_id) {
         prev_data = curr_data;
         curr_data = (DDI_Data *) curr_data->next;
      }

      if(curr_data->id != curr_comm->data_id) {
         fprintf(stdout,"%s: Error in DDI_Comm_destroy - Data not found.\n",DDI_Id());
         Fatal_error(911);
      }

    * ----------------------------------------------------------------------- *\
      Delete item from DDI_Data linked-list.
   \* ----------------------------------------------------------------------- *
      if(curr_comm->me_local == 0) shmdt((void *) curr_data->sync_array);
      prev_data->next = curr_data->next;
      free(curr_data);
*/
  
   /* ----------------------------------------------------------------------- *\
      Delete item from DDI_Comm linked-list.
   \* ----------------------------------------------------------------------- */
      free(curr_comm->smp_pid);
      free(curr_comm->local_nid);
      free(curr_comm->global_pid);
      free(curr_comm->global_nid);
      free(curr_comm->global_dsid);
      free(curr_comm->node_master);
      prev_comm->next = curr_comm->next;
      free(curr_comm);

   }
Example #13
void DDI_ARMCI_Barrier(const DDI_Comm *comm) {
    if (comm == (const DDI_Comm *)Comm_find(DDI_COMM_WORLD)) {
	ARMCI_Barrier();
    }
    else {
	ARMCI_AllFence();
	MPI_Barrier(comm->compute_comm);
    }
}
Example #14
/* ---------------------------------------------- *\
   FORTRAN Wrapper to return MPI SMP communicator
\* --------------------------------------------- */
   void F77_SMP_GetMPIComm(int_f77 *commid) {
      # ifdef DDI_MPI
        DDI_Comm *comm = Comm_find(DDI_WORKING_COMM);
        *commid = comm->smp_comm;
      # else
        printf("DDI_SMP_GetMPIComm is only supported with an MPI build\n");
        Fatal_error(911);
      # endif
   }
Example #15
void DDI_ARMCI_GDLBNext(size_t *counter) {
    int tmp;
    const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);

    // increment and broadcast global counter
    if (comm->me == 0) ARMCI_Rmw(ARMCI_FETCH_AND_ADD,&tmp,(void*)gv(gdlb_counter),1,0);
    MPI_Bcast(&tmp, sizeof(int), MPI_BYTE, 0, comm->compute_comm);
    *counter = (size_t)tmp;
}
Example #16
void DDI_ARMCI_Release(const DDA_ARMCI_Index *index, int handle, int proc, int ltype) {
  int semid = index[proc].semid;
  int commid = gv(dda_comm)[handle];
  const DDI_Comm *comm = (const DDI_Comm *) Comm_find(commid);
  int armci_proc = comm->global_pid[proc];

#if defined DDI_ARMCI_LOCK
  DDI_ARMCI_Unlock(semid,armci_proc);
#endif
}
Example #17
   int SMP_create(size_t size) {

      int semflg = 0600;

      DDI_Comm *comm     = (DDI_Comm *) Comm_find(DDI_COMM_WORLD); /* hardcoded atm */
      SMP_Data *new_data = (SMP_Data *) Malloc(sizeof(SMP_Data));
      SMP_Data *end_data = (SMP_Data *) SMP_find_end();

      STD_DEBUG((stdout,"%s: Entered DDI_SMP_Create.\n",DDI_Id()))

      if(end_data) end_data->next = (void *) new_data;
      else gv(smp_base_data) = (SMP_Data *) new_data;

    # if defined USE_SYSV
      Comm_sync_smp(comm);
      if(comm->me_local == 0) { 
         new_data->handle = gv(smp_data_id)++; 
         new_data->shmid  = gv(shmid) = Shmget(IPC_PRIVATE,size,SHM_R|SHM_W);
         new_data->semid  = Semget(IPC_PRIVATE,1,semflg);
         new_data->size   = size;
         new_data->next   = NULL;
      }
      Comm_bcast_smp(new_data,sizeof(SMP_Data),0,comm);
      new_data->addr = Shmat(new_data->shmid,0,0);
      MAX_DEBUG((stdout,"%s: SMP memory [%i] shmid=%i, semid=%i, addr=%x.\n",
                 DDI_Id(),new_data->handle,new_data->shmid,new_data->semid,new_data->addr))
      Comm_sync_smp(comm);
      if(comm->me_local == 0) { Shmctl(new_data->shmid,IPC_RMID,NULL); gv(shmid) = 0; }
      Comm_sync_smp(comm);
    # else
      new_data->handle = gv(smp_data_id)++;
      new_data->size   = size;
      new_data->next   = NULL;
      new_data->addr   = Malloc(size);
/*
         MWS: May 2010
     It appears above that systems without SysV memory are expected to
     allocate Process-replicated memory instead of Node-replicated, and
     get on with it.  If these are duplicated, at full size, as it appears,
     that's likely devastating for the system total memory usage.

     The parallel CCSD(T) on IBM Blue Gene/P got into a deadlock, but
     other systems with sockets or MPI seem to work if allowed to proceed.
     At this time, we kill off just the BG here...

*/
       # ifdef IBMBG
         fprintf(stdout,"DDI compiled w/o SysV operating system support.\n");
         fprintf(stdout,"IBM/BG parallel CCSD(T) cannot run w/o SysV.\n");
         Fatal_error(911);
       # endif
    # endif
      
      return new_data->handle;
   }
Example #18
/** @see ddi_armci.h */
void DDI_ARMCI_Index_create(DDA_Index *index, int handle) {
  DDA_Remote_Index *remoteIndex = gv(dda_remote_index)[handle];
  const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);

  /* remoteIndex[pid] is indexed by DDI_WORKING_COMM process id. */
  remoteIndex[comm->me].offset = index[handle].offset;
  remoteIndex[comm->me].mutex = handle;
  /* remoteIndex[pid].offset may differ from process to process */
  MPI_Allgather(&remoteIndex[comm->me], sizeof(DDA_Remote_Index), MPI_BYTE,
		remoteIndex, sizeof(DDA_Remote_Index), MPI_BYTE, comm->compute_comm);
}
Example #19
void DDI_ARMCI_Index_create(DDA_Index *index, int handle) {
  DDA_ARMCI_Index *armci_index = gv(dda_armci_index)[handle];
  const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);

  armci_index[comm->me].offset = index[handle].offset;
  armci_index[comm->me].semid = handle;
  gv(dda_comm)[handle] = DDI_WORKING_COMM;

  MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, 
		armci_index,sizeof(DDA_ARMCI_Index),MPI_BYTE,
		comm->compute_comm);
}
Example #20
void DDI_ARMCI_Zero_local(int handle) {
    double *da = NULL;
    const DDA_Index *Index = gv(dda_index);
    size_t i,size = Index[handle].size;

    const DDA_ARMCI_Index *armci_index = gv(dda_armci_index)[handle];
    int commid = gv(dda_comm)[handle];
    const DDI_Comm *comm = (const DDI_Comm *) Comm_find(commid);
    int armci_proc, proc = comm->me;

    DDI_ARMCI_Acquire(armci_index,handle,proc,DDI_WRITE_ACCESS,(void **) &da, &armci_proc);
    for (i=0; i<size; i++) da[i] = 0.0;
    DDI_ARMCI_Release(armci_index,handle,proc,DDI_WRITE_ACCESS);
}
Example #21
/* ------------------------------------------------------------------------- *\
   DDI_Recv_request(buff,from)
   ===========================
   [OUT] buff - address in which incoming data is passed.
   [IN]  from - rank of process sending data stream.
   
   Used to receive a data request sent by DDI_Send_request.  This subroutine
   is only called from a data server.
\* ------------------------------------------------------------------------- */
   void DDI_Recv_request(void *buff,int *from) {

    # if defined DDI_SOC
      char ack=37;
      size_t size = sizeof(DDI_Patch);
      DDI_Recvany(buff,size,from);
      Send(gv(sockets)[*from],&ack,1,0);
    # endif

   /* ---------------------------------- *\
      The stand-alone MPI version of DDI
   \* ---------------------------------- */

   /*
      This routine is the one that is responsible for the poor
      performance of a 100% MPI-1 model.  The "receive from anywhere"
      option caused by MPI_ANY_SOURCE typically is implemented by
      repeated checking (polling) on all open MPI processes.  This
      can sometimes be influenced by looking for options to control
      the polling mechanism, at the "mpirun" time.  However, a more
      practical solution is to use the "mixed" model, which uses
      our TCP/IP code for handling the small control messages, which
      do no polling at all.  If your adapter allows TCP/IP to coexist
      with the MPI-1, by all means try "mixed" over "mpi" for DDI.

      Here's a short note from Ryan, 
         Having talked to some MPI people at past SC conferences,
         it seems that the Irecv/wait combination has the best 
         chance to be implemented in a blocking fashion.  However, 
         I never worked out all the details to make it happen.

      What I think this means is that you set up an IRECV on every
      possible source, and then loop around "polling" with the wait
      yourself, perhaps with a delay from "sleep" added before the
      loop repeats itself.

    */

    # if defined DDI_MPI && !defined DDI_SOC && !defined DDI_LAPI
      const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);
      MPI_Status status;
      size_t size = sizeof(DDI_Patch);
      DEBUG_OUT(LVL4,(stdout,"%s: call mpi_recv from any source.\n",DDI_Id()))
      MPI_Recv(buff,size,MPI_BYTE,MPI_ANY_SOURCE,0,comm->world_comm,&status);
      *from = status.MPI_SOURCE;
      DEBUG_OUT(LVL4,(stdout,"%s: received request from %i.\n",DDI_Id(),*from))
    # endif

   }
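/* Sketch of the Irecv/Wait idea described in the comment above (not code used
   by DDI itself): post one non-blocking receive per possible source and block
   until any of them completes.  'nsrc', the tag value, and the use of plain
   malloc/free are illustrative assumptions. */
   static void example_recv_from_any(DDI_Patch *msgs, int nsrc, MPI_Comm comm, int *from) {
      int i,idx;
      MPI_Status status;
      MPI_Request *reqs = (MPI_Request *) malloc(nsrc*sizeof(MPI_Request));
      for(i=0; i<nsrc; i++)
         MPI_Irecv(&msgs[i],sizeof(DDI_Patch),MPI_BYTE,i,0,comm,&reqs[i]);
      MPI_Waitany(nsrc,reqs,&idx,&status);    /* returns when any source delivers a request */
      *from = status.MPI_SOURCE;
      for(i=0; i<nsrc; i++) {
         if(i == idx) continue;               /* the completed request was already freed */
         MPI_Cancel(&reqs[i]);                /* withdraw the receives that did not complete */
         MPI_Request_free(&reqs[i]);
      }
      free(reqs);
   }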
Example #22
/** @see ddi_armci.h */
void DDI_ARMCI_Counters_init() {
  int i;
  const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_COMM_WORLD);

  /* The offsets should be the same on all processes. */
  size_t dlb_counter_offset = (size_t)(gv(dlb_counter)) - (size_t)(gv(dda_index));
  size_t gdlb_counter_offset = (size_t)(gv(gdlb_counter)) - (size_t)(gv(dda_index));
  for (i = 0; i < comm->np; ++i) {
    gv(armci_dlb_counter)[i] = gv(armci_mem_addr)[i] + dlb_counter_offset;
    gv(armci_gdlb_counter)[i] = gv(armci_mem_addr)[i] + gdlb_counter_offset;
  }

  ARMCI_PutValueInt(0, gv(dlb_counter), comm->me);
  ARMCI_PutValueInt(0, gv(gdlb_counter), comm->me);
}
Example #23
/** @see ddi_armci.h */
void DDI_ARMCI_Release(int handle, int pid, int ltype) {
  DDA_Remote_Index *remoteIndex = gv(dda_remote_index)[handle];
  int commid = gv(dda_index)[handle].commId;
  const DDI_Comm *comm = (const DDI_Comm*)Comm_find(commid);
  int armciPid_;
  int mutex;

  if (pid < 0) pid = comm->me;
  armciPid_ = comm->global_pid[pid];
  mutex = remoteIndex[pid].mutex;
  
#if defined DDI_ARMCI_LOCK
  DDI_ARMCI_Unlock(mutex, armciPid_);
#endif
}
Example #24
void DDI_ARMCI_Finalize() {
  int code;
  const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_COMM_WORLD);
  
#if defined DDI_ARMCI_FREE
  code = ARMCI_Free((void*)(gv(armci_mem_addr)[comm->me]));
  if (code > 0) fprintf(stderr,"ARMCI_Free(%p) failed: %i",gv(armci_mem_addr)[comm->me]);

  code = ARMCI_Free((void*)(gv(armci_cnt_addr)[comm->me]));
  if (code > 0) fprintf(stderr,"ARMCI_Free(%p) failed: %i",gv(armci_cnt_addr)[comm->me]);
#endif

  code = ARMCI_Destroy_mutexes();
  if (code > 0) fprintf(stderr,"ARMCI_Destory_mutexes failed: %i",code);

  ARMCI_Finalize();
}
Example #25
/** @see ddi_armci.h */
void DDI_ARMCI_Memory_init(size_t size) {
  int code;
  const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_COMM_WORLD);
  
  code = ARMCI_Malloc(gv(armci_mem_addr), size);
  if (code != 0) {
    fprintf(DDI_STDERR, "%s: ARMCI_Malloc(%p, %z) returned %i\n",
	    DDI_Id(), gv(armci_mem_addr), size, code);
    DDI_Error(DDI_ARMCI_MEMORY_INIT_ERROR, DDI_ARMCI_MEMORY_INIT_ERROR_MESSAGE);
  }
  gv(dda_index) = (DDA_Index*)gv(armci_mem_addr)[comm->me];

  code = ARMCI_Create_mutexes(MAX_DD_ARRAYS);
  if (code != 0) {
    fprintf(DDI_STDERR, "%s: ARMCI_Create_mutexes(%d) returned %i\n",
	    DDI_Id(), MAX_DD_ARRAYS, code);
    DDI_Error(DDI_ARMCI_MEMORY_INIT_ERROR, DDI_ARMCI_MEMORY_INIT_ERROR_MESSAGE);
  }
}
Example #26
/** @see ddi_armci.h */
void DDI_ARMCI_Acquire(int handle, int pid, int ltype, void **array, int *armciPid) {
  DDA_Remote_Index *remoteIndex = gv(dda_remote_index)[handle];
  char *buf = NULL;
  int commId = gv(dda_index)[handle].commId;
  const DDI_Comm *comm = (const DDI_Comm*)Comm_find(commId);
  int armciPid_;
  int mutex;

  if (pid < 0) pid = comm->me;
  armciPid_ = comm->global_pid[pid];
  if (armciPid != NULL) *armciPid = armciPid_;
  mutex = remoteIndex[pid].mutex;

  *array = (void*)((char*)(gv(armci_mem_addr)[armciPid_]) + remoteIndex[pid].offset);

#if defined DDI_ARMCI_LOCK
  DDI_ARMCI_Lock(mutex, armciPid_);
#endif
}
Example #27
void DDI_Scattered_comm(int handle,const DDI_Scattered *scattered,int *nsubs,int *ranks,DDI_Scattered *subs,int commid,long *ibuff) {

      int i,j,k,np,me,nsub;
      DDI_Scattered s,*t;
      const DDI_Comm *comm = (const DDI_Comm *) Comm_find(commid);

      np = comm->nn;
      me = comm->my;

      for(i=0,nsub=0; i<np; i++) {

         DDI_Scattered_NDistribP(handle,i,&s);
         if(s.ihi > 0) {
            printf("Error: DDI_Scatter_Acc is not implemented for arrays with more than one row.\n\n");
            fflush(stdout);
            Fatal_error(911);
         }

         for(j=0; j<scattered->nelem; j++) {

         /* skip elements that fall outside the column range owned by node i */
            if(s.jlo > (ibuff[j]-1) || s.jhi < (ibuff[j]-1)) continue;

            ranks[nsub] = i;
            t = &subs[nsub];
            t->oper   = scattered->oper;
            t->handle = handle;
            t->nelem  = scattered->nelem;
            t->nlocal = 0;
         /* t->start  = j; */
         /* t->end    = 0; */

         /* count how many subsequent elements still fall at or below this node's last column */
            for(k=j; k<scattered->nelem; k++) {
               if(s.jhi < (ibuff[k]-1)) break;
               t->nlocal += 1;
            }
         /* t->end    = t->start + t->nlocal - 1; */

            t->size  = t->nlocal;
            t->size *= sizeof(double);
            ++nsub;
            break;
         }
      }

      *nsubs = nsub;
   }
Example #28
   void
   Comm_create_gpu(int comm_id, int *new_comm_id)
   {
        int gpu_count = 1; // placeholder; should be obtained from the CUDA driver at runtime
        int gpu_flag  = 0;
        const DDI_Comm *cur_comm   = (const DDI_Comm *) Comm_find(comm_id);
        const DDI_Comm *comm_world = &gv(ddi_base_comm);
        DDI_Comm *new_comm = (DDI_Comm *) Malloc(sizeof(DDI_Comm));
        DDI_Comm *end_comm = (DDI_Comm *) Comm_find_end();

        assert(cur_comm->np > gpu_count);

        // copy the current comm structure into the new
        memcpy(new_comm, cur_comm, sizeof(DDI_Comm));

        new_comm->smp_with_gpu_comm = cur_comm->smp_comm;
        new_comm->compute_with_gpu_comm = cur_comm->compute_comm;

        // split mpi comms
        if(cur_comm->me_local < gpu_count) gpu_flag = 1;
        MPI_Comm_split(cur_comm->smp_comm, gpu_flag, cur_comm->me_local, &new_comm->smp_comm);       
        MPI_Comm_split(cur_comm->compute_comm, gpu_flag, cur_comm->me, &new_comm->compute_comm);       

        // update new_comm with the ranks and sizes of the split communicators
        new_comm->has_gpu = gpu_flag;
        MPI_Comm_rank(new_comm->smp_comm, &new_comm->me_local);
        MPI_Comm_size(new_comm->smp_comm, &new_comm->np_local);
        MPI_Comm_rank(new_comm->compute_comm, &new_comm->me);
        MPI_Comm_size(new_comm->compute_comm, &new_comm->np);

     /* ------------------------------- *\
        Add new_comm to the linked list
     \* ------------------------------- */
        new_comm->id = *new_comm_id = gv(ddi_comm_id)++;
        new_comm->next = NULL;
        end_comm->next = (void *) new_comm; 

        *new_comm_id =  new_comm->id;
   }
Example #29
void DDI_ARMCI_Acquire(const DDA_ARMCI_Index *index, int handle, int proc, int ltype, void **array, int *armci_proc_ptr) {
  char *buf = NULL;
  int semid = index[proc].semid;
  int commid = gv(dda_comm)[handle];
  const DDI_Comm *comm = (const DDI_Comm *) Comm_find(commid);
  int armci_proc = comm->global_pid[proc];
  
# if defined DDI_MAX_DEBUG
  if(ltype != DDI_READ_ACCESS && ltype != DDI_WRITE_ACCESS) {
    fprintf(stdout,"%s: Invalid lock type requested.\n",DDI_Id());
    Fatal_error(911);
  }
# endif

  buf = (char*)gv(armci_mem_addr)[armci_proc];
  buf += index[proc].offset;
  *array  = (void*)buf;

#if defined DDI_ARMCI_LOCK
  DDI_ARMCI_Lock(semid,proc);
#endif

  *armci_proc_ptr = armci_proc;
}
Example #30
/* -------------------------------------------------------------------- *\
   DDI_Create_custom(idim,jdim,jcols,handle)
   =========================================
   [IN]  idim   - Number of rows in the array to be created.
   [IN]  jdim   - Number of columns in the array to be created.
   [IN]  jcols  - Array of starting column offsets: jcols[i] is the first
                - column assigned to process i (values must be non-decreasing).
   [OUT] handle - Handle given to the newly created array.
   
   Creates a distributed array where the user can customize how the
   array is distributed across the processors.
\* -------------------------------------------------------------------- */
   void DDI_Create_custom(int idim,int jdim,int *jcols,int *handle) {
   
      int i,np,me,nn,my;
      int inode;
      DDI_INT64 totwrds;
      DDI_INT64 longrows,longcols,longslice,longnd,long2g;
    # ifndef USE_SYSV
      int remote_id;
    # endif
      DDI_Patch patch;
      const DDI_Comm *comm = (const DDI_Comm *) Comm_find(DDI_WORKING_COMM);
      
      np = comm->np;
      me = comm->me;
      nn = comm->nn;
      my = comm->my;

      Comm_sync(3001,comm);

      /* find an unused handle */
      for (i=0; i<gv(nxtdda); ++i) {
        if (gv(ddacomm)[i] == DDI_COMM_NULL) break;
      }
      if (i==gv(nxtdda)) ++gv(nxtdda);
      *handle = i;
     
    # ifndef USE_SYSV
      remote_id = my;
    # endif

      DEBUG_ROOT(LVL2,(stdout," DDI: Entering DDI_Create_custom.\n"))
      DEBUG_ROOT(LVL2,(stdout," DDI: Creating Array [%i] - %ix%i=%i.\n",*handle,idim,jdim,idim*jdim))
      DEBUG_OUT(LVL3,(stdout,"%s: Entering DDI_Create_custom.\n",DDI_Id()))

    # ifdef DS_SIGNAL
      if(comm->me_local == 1) {
         signal(SIGALRM,DS_Thread_main);
      }
    # endif
      
      if(me == 0) {
         if(gv(dda_output)) {
            longrows = idim;
            longcols = jdim;
            totwrds = longrows*longcols;
            fprintf(stdout," DDI: Creating Array [%i] - %i x %i = %li words.\n",
                                    *handle,idim,jdim,totwrds);
            fflush(stdout);
         }
      }

   /*
       Make sure each slice of the distributed array will be under 2 GWords.

       Even on 64-bit hardware, most counting in this program is done
       with 32-bit data types, meaning we can't count higher than 2**31-1.

       If on 32-bit hardware, the 'long' data types here will be 32-bits,
       and so we'll see crazy products, including less than zero.
       In present form, nothing will be trapped here on a 32 bit machine!
   */
      longrows  = idim;
      longcols  = jdim;
      totwrds   = longrows*longcols;
   /*     Total distributed array over 2 Gwords is OK, but each  */
   /*     slice (MEMDDI per data server) must be under 2 GWords. */
   /*     TCP/IP has gv(nd)=-1 (uninitialized)                   */
   /*     Cray on one node has gv(nd)=0 since no d.s. exists.    */
      # if defined DDI_MPI
         longnd    = gv(nd);
         if (longnd <= 0) longnd=1;
      # endif
      # if defined DDI_SOC
         longnd = np;
      # endif
      longslice = totwrds/longnd;
   /*  next is just below the largest signed 32-bit integer (2**31-1), stored as a 64-bit quantity  */
      long2g   = 2147483643;
      if (longslice > long2g)
         {
            fprintf(stdout,"\n");
            fprintf(stdout," DDI: trouble creating distributed array!\n");
            fprintf(stdout," Current number of data servers is %li\n",longnd);
            fprintf(stdout," so each data server's slice of array");
            fprintf(stdout," [%i] is %li words\n",*handle,longslice);
            fprintf(stdout,"\n");
            fprintf(stdout," Add more processors so required total array");
            fprintf(stdout," size %li words\n",totwrds);
            fprintf(stdout," divided by no. of processors (data servers)");
            fprintf(stdout," is less than 2 Gwords= %li\n",long2g);
            fprintf(stdout," For example, %li or more data servers...\n",
                                1+totwrds/long2g);
            fprintf(stdout,"\n");
            fflush(stdout);
            Fatal_error(911);
         }

   /* ------------------------------------ *\
      Ensure 'jcols' is properly formatted
   \* ------------------------------------ */
      for(i=0; i<np; i++) {
         if(jcols[i] < 0 && me == 0) {
            fprintf(stdout," Error in argument 3 of DDI_Create_custom: Values must be >= 0.\n");
            Fatal_error(911);
         }
         
         if(i > 0)
         if(jcols[i] < jcols[i-1]) {
            fprintf(stdout," Error in argument 3 of DDI_Create_custom: Values must increase monotonically.\n");
            Fatal_error(911);
         }
      }
   
   /* ----------------------------------------------------------------- *\
      Check to ensure the maximum number of arrays hasn't been reached.
   \* ----------------------------------------------------------------- */
      if( gv(nxtdda) == MAX_DD_ARRAYS ) {
        if(me == 0) {
           fprintf(stderr," DDI Error:  The maximum number of distributed arrays [%i] has been reached.\n",MAX_DD_ARRAYS);
           fprintf(stderr," Information:  The maximum number of distributed arrays is a DDI compile-time option.\n");
        }
        Fatal_error(911);
      }

      gv(nrow)[*handle] = idim;
      gv(ncol)[*handle] = jdim;
      gv(ddacomm)[*handle]=gv(ddi_working_comm);
      
 
   /* ---------------------------------------------------- *\
      Generate Column Mapping by Compute Process & by Node
   \* ---------------------------------------------------- */
      for(i=0,inode=-1; i<np; i++) {
        gv(pcmap)[*handle][i] = jcols[i];

     /* if(inode == gv(ddiprocs)[i].node) continue; */
        if(inode == comm->local_nid[i]) continue;
        gv(ncmap)[*handle][++inode] = gv(pcmap)[*handle][i];
      }

      gv(pcmap)[*handle][np] = jdim;
      gv(ncmap)[*handle][nn] = jdim;


   /* -------------------------- *\
      Get local patch dimensions
   \* -------------------------- */
      DDI_DistribP(*handle,me,&patch);
 
   /* ----------------------------- *\
      Create Distributed Data Array
   \* ----------------------------- */
      patch.handle = *handle;
# if defined WINTEL
      patch.oper   = DDI_CREATE_OP;
# else
      patch.oper   = DDI_CREATE;
# endif
      patch.size   = jdim;


# if defined USE_SYSV || defined DDI_ARMCI || defined DDI_MPI2
      DDI_Index_create(&patch);
# else
      DDI_Send_request(&patch,&remote_id,NULL);
# endif 


   /* ----------------------------- *\
      Synchronize Compute Processes
   \* ----------------------------- */
      Comm_sync(3002,comm);

      DEBUG_OUT(LVL3,(stdout,"%s: Leaving DDI_Create_custom.\n",DDI_Id()))
   }
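/* Usage sketch (illustrative only): build a column-offset map equivalent to
   the even split performed by DDI_Create and hand it to DDI_Create_custom.
   In a real caller 'np' would come from the active communicator. */
   void example_create_custom_usage(int idim, int jdim, int np) {
      int i,icol,handle;
      int *jcols = (int *) malloc(np*sizeof(int));
      for(i=0,icol=0; i<np; i++) {
         jcols[i] = icol;                /* first column assigned to process i */
         icol += jdim/np;
         if(i < jdim%np) icol++;
      }
      DDI_Create_custom(idim,jdim,jcols,&handle);
      free(jcols);
   }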