コード例 #1
0
ファイル: ma_hwloc.c プロジェクト: 12182007/cp2k
/*
 * Get the GPU ID for an MPI process.
 *
 *  coreId : id of the core where the process is running
 *  myRank : MPI rank number of the process
 *  nMPIs  : number of MPI processes (not used by the current algorithm)
 *
 * Returns the GPU ID to connect to, or -1 when the topology reports no GPU
 * or the temporary device list cannot be allocated.
 *
 * The algorithm:
 *  if UMA machine (local_topo->nnodes == 0) - similar costs to access devices
 *    just make a round-robin distribution of the GPUs
 *  if NUMA machine - not similar costs to access devices
 *    -the NUMA node of the process reports no GPU: round-robin over the
 *     whole device list
 *    -the NUMA node of the process reports nDev GPUs: round-robin over the
 *     first nDev entries of the device list
 */
int hw_my_gpu(int coreId, int myRank, int nMPIs)
{
  int *devList;
  int nDev = 0;
  int nodeId, gpuId;
  int ngpus = local_topo->ngpus;

  (void)nMPIs;

  /* Without GPUs there is nothing to pick, and "% ngpus" would divide by 0. */
  if (ngpus <= 0)
    return -1;

  devList = malloc(ngpus * sizeof(int));
  if (devList == NULL)
    return -1;

  nodeId = hw_get_myNode(coreId);
  hw_my_gpuList(nodeId, devList);

  if (local_topo->nnodes == 0)
    gpuId = devList[myRank % ngpus];
  else {
    ma_get_nDevcu(nodeId, &nDev);
    /* devList only holds ngpus entries: fall back to the full list when the
     * per-node count is zero or cannot be used as a modulus safely. */
    if (nDev <= 0 || nDev > ngpus)
      gpuId = devList[myRank % ngpus];
    else
      gpuId = devList[myRank % nDev];
  }

  /* The list is only needed to look up the id; release it before returning. */
  free(devList);
  return gpuId;
}
コード例 #2
0
ファイル: ma_linux.c プロジェクト: Jwonsever/shirleyxas-newt
/*
 * Get the GPU ID for a MPI process
 *  coreId : core id where the process is running
 *  myRank : MPI rank number
 *  nMPIs  : number of MPI processes (not used by the current algorithm)
 * Returns the GPU ID to connect to, or -1 when the topology reports no GPU
 * or the temporary device list cannot be allocated.
 *
 * The algorithm:
 *  if UMA machine (local_topo->nnodes == 0) - similar costs to access devices
 *    just make a round-robin distribution of the GPUs
 *  if NUMA machine - not similar costs to access devices
 *    if the MPI is in a NUMA node that reports no GPU
 *      just make a round-robin distribution of the GPUs
 *    otherwise
 *      round-robin over the first nDev entries of the device list, where
 *      nDev is the number of GPUs reported for that NUMA node
 */
int linux_my_gpu(int coreId, int myRank, int nMPIs)
{
  int *devList;
  int nDev = 0;
  int nodeId, gpuId;
  int ngpus = local_topo->ngpus;

  (void)nMPIs;

  /* Without GPUs there is nothing to pick, and "% ngpus" would divide by 0. */
  if (ngpus <= 0)
    return -1;

  devList = malloc(ngpus * sizeof(int));
  if (devList == NULL)
    return -1;

  nodeId = linux_get_nodeid_cpu(coreId);
  linux_my_gpuList(nodeId, devList);

  if (local_topo->nnodes == 0)
    gpuId = devList[myRank % ngpus];
  else {
    ma_get_nDevcu(nodeId, &nDev);
    /* devList only holds ngpus entries: fall back to the full list when the
     * per-node count is zero or cannot be used as a modulus safely. */
    if (nDev <= 0 || nDev > ngpus)
      gpuId = devList[myRank % ngpus];
    else
      gpuId = devList[myRank % nDev];
  }

  /* The list is only needed to look up the id; release it before returning. */
  free(devList);
  return gpuId;
}
コード例 #3
0
ファイル: ma_hwloc.c プロジェクト: 12182007/cp2k
/*
 * Prints one branch of the machine topology.
 * Recursive function that goes through the machine topology objects and
 * appends one text entry per object to console_output.
 *  topology: the HWLOC object that represents the machine
 *  obj: the current object of a level
 *  depth: width of the indentation used for the current level
 *  obj_type: object type below which only the first child is followed
 *
 * NOTE: the text is appended to console_output with strcat; the capacity of
 * that buffer is not visible here, so it must be large enough for the whole
 * description.
 */
void print_machine_branch(hwloc_topology_t topology, hwloc_obj_t obj, int depth, int obj_type)
{
    char string[128], out_string[128];
    unsigned i, arity;

    if (obj->type != HWLOC_OBJ_MACHINE) {
        if (obj->type == HWLOC_OBJ_NODE) {
            hwloc_obj_snprintf(string, sizeof(string), topology, obj, "#", 0);
            snprintf(out_string, sizeof(out_string), "%*s%s\n", depth, "", string);
            strcat(console_output, out_string);
#if defined  (__DBCSR_ACC) || defined (__PW_CUDA)
            /* Fewer GPUs than cores: list the GPUs attached to this node. */
            if ((local_topo->ngpus > 0) && (local_topo->ngpus < local_topo->ncores)) {
                int *devIds = NULL;
                int countDev = 0;

                ma_get_nDevcu(obj->logical_index, &countDev);
                if (countDev > 0) {
                    devIds = malloc(countDev * sizeof(int));
                    if (devIds != NULL)
                        ma_get_cu(obj->logical_index, devIds);
                }
                strcat(console_output, " Shared GPUS: ");
                if (devIds != NULL) {
                    for (i = 0; i < (unsigned)countDev; i++) {
                        snprintf(out_string, sizeof(out_string), "#%d ", devIds[i]);
                        strcat(console_output, out_string);
                    }
                }
                strcat(console_output, "\n");
                free(devIds);
            }
#endif
        }
        else if (obj->type == HWLOC_OBJ_SOCKET) {
            hwloc_obj_snprintf(string, sizeof(string), topology, obj, "#", 0);
            snprintf(out_string, sizeof(out_string), "%*s%s\n", depth, "", string);
            strcat(console_output, out_string);
        }
        else {
            hwloc_obj_snprintf(string, sizeof(string), topology, obj, "#", 0);
            if (obj->type == HWLOC_OBJ_PU) {
#if defined  (__DBCSR_ACC) || defined (__PW_CUDA)
                snprintf(out_string, sizeof(out_string), "%*s%s\t", depth, "", string);
                strcat(console_output, out_string);
                /* One GPU per core: print the GPU next to the PU. */
                if (local_topo->ngpus > 0 && local_topo->ngpus == local_topo->ncores) {
                    int devId = 0;

                    ma_get_core_cu(obj->logical_index, &devId);
                    strcat(console_output, " GPU: ");
                    snprintf(out_string, sizeof(out_string), "%d ", devId);
                    strcat(console_output, out_string);
                }
                strcat(console_output, "\n");
#else
                snprintf(out_string, sizeof(out_string), "%*s%s\n", depth, "", string);
                strcat(console_output, out_string);
#endif
            }
            else if (obj->type == HWLOC_OBJ_CACHE && obj->arity > 1) {
                hwloc_obj_type_snprintf(string, sizeof(string), obj, 0);
                snprintf(out_string, sizeof(out_string), "%*s%s", depth, "", string);
                strcat(console_output, out_string);
                /* The cache size is a 64-bit quantity: print it with a
                 * matching conversion instead of %d. */
                snprintf(out_string, sizeof(out_string), " (%lluMB)\n",
                         (unsigned long long)(obj->attr->cache.size / (1024 * 1024)));
                strcat(console_output, out_string);
            }
            else {
                snprintf(out_string, sizeof(out_string), "%*s%s\t", depth, "", string);
                strcat(console_output, out_string);
            }
        }
    }

    if (obj->type != HWLOC_OBJ_PU) { /* it is not a PU */
        /* first_child is NULL for an object without children, so it must be
         * checked before either of the type comparisons. */
        if (obj->first_child &&
            (obj->first_child->type == HWLOC_OBJ_PU ||
             (int)obj->first_child->type == obj_type))
            arity = 1; /* follow only the first child */
        else
            arity = obj->arity;

        for (i = 0; i < arity; i++)
            print_machine_branch(topology, obj->children[i], depth + 1, obj_type);
    }
}
コード例 #4
0
ファイル: ma_hwloc.c プロジェクト: 12182007/cp2k
/*
 * Prints the machine hierarchy.
 * Recursive function that goes through the machine topology objects and
 * appends one text entry per object to console_output.
 *  topo: the HWLOC object
 *  obj: the current object in the topology
 *  depth: the horizontal level in the machine topology (used as the
 *         indentation width)
 *
 * NOTE: the text is appended to console_output with strcat; the capacity of
 * that buffer is not visible here, so it must be large enough for the whole
 * description.
 */
void print_machine(hwloc_topology_t topo, hwloc_obj_t obj, int depth)
{
    char string[256], out_string[256];
    unsigned i, arity;

    if (obj->type == HWLOC_OBJ_SOCKET || obj->type == HWLOC_OBJ_MACHINE) {
        hwloc_obj_snprintf(string, sizeof(string), topo, obj, "#", 0);
        snprintf(out_string, sizeof(out_string), "%*s%s\n", depth, "", string);
        strcat(console_output, out_string);
    }
    else if (obj->type == HWLOC_OBJ_NODE) {
        hwloc_obj_snprintf(string, sizeof(string), topo, obj, "#", 0);
        snprintf(out_string, sizeof(out_string), "%*s%s\n", depth, "", string);
        strcat(console_output, out_string);
/* if the machine has shared GPUs */
#if defined  (__DBCSR_ACC) || defined (__PW_CUDA)
        if ((local_topo->ngpus > 0) && (local_topo->ngpus < local_topo->ncores)) {
            int *devIds = NULL;
            int countDev = 0;

            ma_get_nDevcu(obj->logical_index, &countDev);
            if (countDev > 0) {
                devIds = malloc(countDev * sizeof(int));
                if (devIds != NULL)
                    ma_get_cu(obj->logical_index, devIds);
            }
            strcat(console_output, " Shared GPUS: ");
            if (devIds != NULL) {
                for (i = 0; i < (unsigned)countDev; i++) {
                    snprintf(out_string, sizeof(out_string), "#%d ", devIds[i]);
                    strcat(console_output, out_string);
                }
            }
            strcat(console_output, "\n");
            free(devIds);
        }
#endif
    }
    else {
        hwloc_obj_snprintf(string, sizeof(string), topo, obj, "#", 0);
        if (obj->type == HWLOC_OBJ_PU) {
#if defined  (__DBCSR_ACC) || defined (__PW_CUDA)
            snprintf(out_string, sizeof(out_string), "%*s%s\t", depth, "", string);
            strcat(console_output, out_string);
            /* One GPU per core: print the GPU next to the PU. */
            if (local_topo->ngpus > 0 && local_topo->ngpus == local_topo->ncores) {
                int devId = 0;

                ma_get_core_cu(obj->logical_index, &devId);
                strcat(console_output, " GPU: ");
                snprintf(out_string, sizeof(out_string), "%d ", devId);
                strcat(console_output, out_string);
            }
            strcat(console_output, "\n");
#else
            snprintf(out_string, sizeof(out_string), "%*s%s\n", depth, "", string);
            strcat(console_output, out_string);
#endif
        }
        else if (obj->type == HWLOC_OBJ_CACHE && obj->arity > 1) {
            hwloc_obj_type_snprintf(string, sizeof(string), obj, 0);
            snprintf(out_string, sizeof(out_string), "%*s%s", depth, "", string);
            strcat(console_output, out_string);
            /* The cache size is a 64-bit quantity: print it with a matching
             * conversion instead of %d. */
            snprintf(out_string, sizeof(out_string), " (%lluMB)\n",
                     (unsigned long long)(obj->attr->cache.size / (1024 * 1024)));
            strcat(console_output, out_string);
        }
        else if (obj->type == HWLOC_OBJ_OS_DEVICE ||
                 obj->type == HWLOC_OBJ_PCI_DEVICE ||
                 obj->type == HWLOC_OBJ_BRIDGE) {
            /* I/O objects print nothing except network cards. The osdev
             * attributes are only read for OS devices; PCI devices and
             * bridges store a different member in the attribute union. */
            if (obj->type == HWLOC_OBJ_OS_DEVICE &&
                obj->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) {
                snprintf(out_string, sizeof(out_string), "%*s%s\n", depth, "--", "Network Card");
                strcat(console_output, out_string);
            }
        }
        else if (obj->type == HWLOC_OBJ_CORE) {
            /* Cores are labelled with their logical index. */
            snprintf(string, sizeof(string), "Core#%u", obj->logical_index);
            snprintf(out_string, sizeof(out_string), "%*s%s\t", depth, "", string);
            strcat(console_output, out_string);
        }
        else {
            snprintf(out_string, sizeof(out_string), "%*s%s\t", depth, "", string);
            strcat(console_output, out_string);
        }
    }

    if (obj->type != HWLOC_OBJ_PU) { /* it is not a PU */
        if (obj->first_child && obj->first_child->type == HWLOC_OBJ_PU)
            arity = 1; /* follow only the first PU of the object */
        else
            arity = obj->arity;

        for (i = 0; i < arity; i++)
            print_machine(topo, obj->children[i], depth + 1);
    }
}
コード例 #5
0
ファイル: ma_linux.c プロジェクト: Jwonsever/shirleyxas-newt
/*
 * Fill in the machine description using the Linux query helpers.
 *
 * Every count is stored both in *topo and in the file-level copy local_topo
 * (allocated here), and machine_nodes is allocated with one entry per NUMA
 * node.
 *
 *  topo : description to be filled in by this function
 *
 * Returns 0 on success, and -1 if an allocation fails or if the number of
 * nodes, cores, PUs or sockets was reported as -1.
 */
int linux_topology_init(struct arch_topology *topo)
{
  int i;

#ifdef  __DBCSR_CUDA
  int nDev = 0;
  ma_get_ndevices_cu(&nDev);
  topo->ngpus = nDev;
#endif

  /* Zero-initialised so that fields not assigned below have a defined value. */
  local_topo = calloc(1, sizeof(struct arch_topology));
  if (local_topo == NULL)
    return -1;

#ifdef  __DBCSR_CUDA
  /* Keep the local copy in step with topo, as for every other field. */
  local_topo->ngpus = topo->ngpus;
#endif

  topo->nnodes = linux_get_nnodes();
  local_topo->nnodes = topo->nnodes;
  topo->ncores = linux_get_ncores();
  local_topo->ncores = topo->ncores;
  topo->npus = topo->ncores;
  local_topo->npus = topo->npus;

  //libnuma has no support for I/O devices
  topo->nnetcards = 0;
  local_topo->nnetcards = 0;
  topo->nsockets = linux_get_nsockets();
  local_topo->nsockets = topo->nsockets;

  //Compute number of memory controllers per socket
  //basically the number of NUMA nodes per socket
  //(the nsockets > 0 test avoids a division by zero)
  if (topo->nsockets > 0 && topo->nnodes > topo->nsockets)
    topo->nmemcontroller = topo->nnodes/topo->nsockets;
  else
    topo->nmemcontroller = 1;

  topo->ncaches = linux_get_ncaches();

  local_topo->nmemcontroller = topo->nmemcontroller;
  local_topo->ncaches = topo->ncaches;

  topo->nshared_caches = linux_get_nshared_caches();
  topo->nsiblings = linux_get_nsiblings();

  local_topo->nshared_caches = topo->nshared_caches;
  local_topo->nsiblings = topo->nsiblings;

  //Machine node and core representation
  //Only built when at least one NUMA node was reported: a count of 0 would
  //divide by zero below and a negative count is not a valid allocation size.
  machine_nodes = NULL;
  if (topo->nnodes > 0)
   {
     int ncore_node = topo->ncores/topo->nnodes;

     machine_nodes = malloc (topo->nnodes*sizeof(struct node));
     if (machine_nodes == NULL)
       return -1;

     for (i = 0; i < topo->nnodes ; i++)
      {
        machine_nodes[i].id = i;
        machine_nodes[i].memory = 0;
        machine_nodes[i].ncores = ncore_node;
#ifdef  __DBCSR_CUDA
        nDev = 0;
        ma_get_nDevcu(i,&nDev);
        machine_nodes[i].mygpus = NULL;
        if (nDev > 0)
         {
           machine_nodes[i].mygpus = malloc (nDev*sizeof(int));
           if (machine_nodes[i].mygpus == NULL)
             return -1;
           ma_get_cu(i,machine_nodes[i].mygpus);
         }
#endif
      }
   }

  if (topo->nnodes == -1 || topo->ncores == -1 || topo->npus == -1 ||
      topo->nsockets == -1)
    return -1;

  return 0;
}