/*
 * Gets the GPU a given MPI process should bind to (hwloc backend).
 *  coreId: core where the process is running
 *  myRank: MPI rank number
 *  nMPIs : total number of MPI processes (currently unused, kept for ABI)
 * Returns the GPU id to connect to, or -1 when no GPU is available or
 * the device list cannot be allocated.
 *
 * The algorithm:
 *  - UMA machine (nnodes == 0): similar cost to access every device,
 *    so just make a round-robin distribution of the GPUs.
 *  - NUMA machine: costs differ per device.
 *      - MPI running on a node without GPUs: bind it to some GPU in the
 *        closest NUMA node (round-robin over the closeness-ordered list).
 *      - MPI whose NUMA node has GPUs: bind it to one of those GPUs.
 */
int hw_my_gpu(int coreId, int myRank, int nMPIs)
{
  int *devList, nDev, nodeId, gpu;
  int ngpus = local_topo->ngpus;

  (void) nMPIs; /* reserved for future balancing strategies */

  /* no GPUs: the original performed malloc(0) and myRank % 0 (UB) here */
  if (ngpus <= 0)
    return -1;

  devList = malloc(ngpus * sizeof(*devList));
  if (devList == NULL)
    return -1;

  nodeId = hw_get_myNode(coreId);
  hw_my_gpuList(nodeId, devList);

  if (local_topo->nnodes == 0) {
    /* UMA: plain round-robin over all GPUs */
    gpu = devList[myRank % ngpus];
  } else {
    /* number of GPUs attached to this process' NUMA node */
    ma_get_nDevcu(nodeId, &nDev);
    if (nDev == 0)
      gpu = devList[myRank % ngpus]; /* no local GPU: use closest-node list */
    else
      gpu = devList[myRank % nDev];  /* bind within the node's own GPUs */
  }

  free(devList); /* the original leaked this buffer on every return */
  return gpu;
}
/*
 * Gets the GPU a given MPI process should bind to (libnuma/Linux backend).
 *  coreId: core where the process is running
 *  myRank: MPI rank number
 *  nMPIs : total number of MPI processes (currently unused, kept for ABI)
 * Returns the GPU id to connect to, or -1 when no GPU is available or
 * the device list cannot be allocated.
 *
 * The algorithm:
 *  - UMA machine (nnodes == 0): similar cost to access every device,
 *    so just make a round-robin distribution of the GPUs.
 *  - NUMA machine: costs differ per device.
 *      - process in a NUMA node without GPUs: round-robin over the
 *        closeness-ordered GPU list for that node.
 *      - process whose NUMA node has GPUs: balance over those GPUs.
 */
int linux_my_gpu(int coreId, int myRank, int nMPIs)
{
  int *devList, nDev, nodeId, gpu;
  int ngpus = local_topo->ngpus;

  (void) nMPIs; /* reserved for future balancing strategies */

  /* no GPUs: the original performed malloc(0) and myRank % 0 (UB) here */
  if (ngpus <= 0)
    return -1;

  devList = malloc(ngpus * sizeof(*devList));
  if (devList == NULL)
    return -1;

  nodeId = linux_get_nodeid_cpu(coreId);
  linux_my_gpuList(nodeId, devList);

  if (local_topo->nnodes == 0) {
    /* UMA: plain round-robin over all GPUs */
    gpu = devList[myRank % ngpus];
  } else {
    /* number of GPUs attached to this process' NUMA node */
    ma_get_nDevcu(nodeId, &nDev);
    if (nDev == 0)
      gpu = devList[myRank % ngpus]; /* no local GPU: use closest-node list */
    else
      gpu = devList[myRank % nDev];  /* bind within the node's own GPUs */
  }

  free(devList); /* the original leaked this buffer on every return */
  return gpu;
}
/*
 * Prints one branch of the machine topology into console_output.
 * Recursive function that walks the machine topology object and
 * groups components into hierarchical groups.
 *  topology: the HWLOC object that represents the machine
 *  obj     : the current object of a level
 *  depth   : indentation level for the printout
 *  obj_type: object type at which only the first child is descended
 */
void print_machine_branch(hwloc_topology_t topology, hwloc_obj_t obj,
                          int depth, int obj_type)
{
  char string[128], out_string[128];
  unsigned i, arity;
  int *devIds, devId, countDev;

  if (obj->type != HWLOC_OBJ_MACHINE) {
    if (obj->type == HWLOC_OBJ_NODE) {
      hwloc_obj_snprintf(string, sizeof(string), topology, obj, "#", 0);
      sprintf(out_string, "%*s%s\n", depth, "", string);
      strcat(console_output, out_string);
#if defined (__DBCSR_ACC) || defined (__PW_CUDA)
      /* GPUs are shared between cores when there are fewer GPUs than cores */
      if ((local_topo->ngpus > 0) && (local_topo->ngpus < local_topo->ncores)) {
        ma_get_nDevcu(obj->logical_index, &countDev);
        devIds = malloc(countDev * sizeof(*devIds));
        if (devIds != NULL) {
          ma_get_cu(obj->logical_index, devIds);
          strcat(console_output, " Shared GPUS: ");
          for (i = 0; i < (unsigned) countDev; i++) {
            devId = devIds[i];
            sprintf(out_string, "#%d ", devId);
            strcat(console_output, out_string);
          }
          strcat(console_output, "\n");
          free(devIds); /* was leaked in the original code */
        }
      }
#endif
    } else if (obj->type == HWLOC_OBJ_SOCKET) {
      hwloc_obj_snprintf(string, sizeof(string), topology, obj, "#", 0);
      sprintf(out_string, "%*s%s\n", depth, "", string);
      strcat(console_output, out_string);
    } else {
      hwloc_obj_snprintf(string, sizeof(string), topology, obj, "#", 0);
      if (obj->type == HWLOC_OBJ_PU) {
#if defined (__DBCSR_ACC) || defined (__PW_CUDA)
        sprintf(out_string, "%*s%s\t", depth, "", string);
        strcat(console_output, out_string);
        /* one GPU per core: report the dedicated device */
        if (local_topo->ngpus > 0 && local_topo->ngpus == local_topo->ncores) {
          ma_get_core_cu(obj->logical_index, &devId);
          strcat(console_output, " GPU: ");
          sprintf(out_string, "%d ", devId);
          strcat(console_output, out_string);
        }
        strcat(console_output, "\n");
#else
        sprintf(out_string, "%*s%s\n", depth, "", string);
        strcat(console_output, out_string);
#endif
      } else if (obj->type == HWLOC_OBJ_CACHE && obj->arity > 1) {
        hwloc_obj_type_snprintf(string, sizeof(string), obj, 0);
        sprintf(out_string, "%*s%s", depth, "", string);
        strcat(console_output, out_string);
        /* cache.size is a 64-bit quantity in hwloc: the original passed it
           to a %d conversion, which is undefined behavior */
        sprintf(out_string, " (%lluMB)\n",
                (unsigned long long) (obj->attr->cache.size / (1024 * 1024)));
        strcat(console_output, out_string);
      } else {
        sprintf(out_string, "%*s%s\t", depth, "", string);
        strcat(console_output, out_string);
      }
    }
  }

  if (obj->type != HWLOC_OBJ_PU) { /* it is not a PU: keep descending */
    /* Original dereferenced obj->first_child for the obj_type comparison
       even when it was NULL (the && only guarded the first clause). */
    if (obj->first_child != NULL &&
        (obj->first_child->type == HWLOC_OBJ_PU ||
         obj->first_child->type == obj_type))
      arity = 1; /* print a single representative child */
    else
      arity = obj->arity; /* number of children (0 when childless) */
    for (i = 0; i < arity; i++)
      print_machine_branch(topology, obj->children[i], depth + 1, obj_type);
  }
}
/*
 * Prints the machine hierarchy into console_output.
 * Recursive function that walks the machine topology object and
 * groups components into hierarchical groups.
 *  topo : the HWLOC object
 *  obj  : the current object in the topology
 *  depth: the horizontal level in the machine topology
 */
void print_machine(hwloc_topology_t topo, hwloc_obj_t obj, int depth)
{
  char string[256], out_string[256];
  unsigned i, arity;
  int *devIds, devId, countDev;

  /* NOTE(review): the original body passed an identifier `topology` to the
     hwloc_obj_snprintf calls although the parameter is `topo` (and the
     recursion passes `topo`); the parameter is used consistently here. */
  if (obj->type == HWLOC_OBJ_SOCKET || obj->type == HWLOC_OBJ_MACHINE) {
    hwloc_obj_snprintf(string, sizeof(string), topo, obj, "#", 0);
    sprintf(out_string, "%*s%s\n", depth, "", string);
    strcat(console_output, out_string);
  } else if (obj->type == HWLOC_OBJ_NODE) {
    hwloc_obj_snprintf(string, sizeof(string), topo, obj, "#", 0);
    sprintf(out_string, "%*s%s\n", depth, "", string);
    strcat(console_output, out_string);
    /* if the machine has shared GPUs (fewer GPUs than cores) */
#if defined (__DBCSR_ACC) || defined (__PW_CUDA)
    if ((local_topo->ngpus > 0) && (local_topo->ngpus < local_topo->ncores)) {
      ma_get_nDevcu(obj->logical_index, &countDev);
      devIds = malloc(countDev * sizeof(*devIds));
      if (devIds != NULL) {
        ma_get_cu(obj->logical_index, devIds);
        strcat(console_output, " Shared GPUS: ");
        for (i = 0; i < (unsigned) countDev; i++) {
          devId = devIds[i];
          sprintf(out_string, "#%d ", devId);
          strcat(console_output, out_string);
        }
        strcat(console_output, "\n");
        free(devIds); /* was leaked in the original code */
      }
    }
#endif
  } else {
    hwloc_obj_snprintf(string, sizeof(string), topo, obj, "#", 0);
    if (obj->type == HWLOC_OBJ_PU) {
#if defined (__DBCSR_ACC) || defined (__PW_CUDA)
      sprintf(out_string, "%*s%s\t", depth, "", string);
      strcat(console_output, out_string);
      /* one GPU per core: report the dedicated device */
      if (local_topo->ngpus > 0 && local_topo->ngpus == local_topo->ncores) {
        ma_get_core_cu(obj->logical_index, &devId);
        strcat(console_output, " GPU: ");
        sprintf(out_string, "%d ", devId);
        strcat(console_output, out_string);
      }
      strcat(console_output, "\n");
#else
      sprintf(out_string, "%*s%s\n", depth, "", string);
      strcat(console_output, out_string);
#endif
    } else if (obj->type == HWLOC_OBJ_CACHE && obj->arity > 1) {
      hwloc_obj_type_snprintf(string, sizeof(string), obj, 0);
      sprintf(out_string, "%*s%s", depth, "", string);
      strcat(console_output, out_string);
      /* cache.size is a 64-bit quantity in hwloc: the original passed it
         to a %d conversion, which is undefined behavior */
      sprintf(out_string, " (%lluMB)\n",
              (unsigned long long) (obj->attr->cache.size / (1024 * 1024)));
      strcat(console_output, out_string);
    } else if (obj->type == HWLOC_OBJ_OS_DEVICE ||
               obj->type == HWLOC_OBJ_PCI_DEVICE ||
               obj->type == HWLOC_OBJ_BRIDGE) {
      /* obj->attr is a union: osdev is only meaningful for OS devices,
         the original read it for PCI/bridge objects as well */
      if (obj->type == HWLOC_OBJ_OS_DEVICE &&
          obj->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) {
        sprintf(out_string, "%*s%s\n", depth, "--", "Network Card");
        strcat(console_output, out_string);
      }
    } else if (obj->type == HWLOC_OBJ_CORE) {
      char number[33];
      strcpy(string, "Core#");
      sprintf(number, "%d", obj->logical_index);
      strcat(string, number);
      sprintf(out_string, "%*s%s\t", depth, "", string);
      strcat(console_output, out_string);
    } else {
      sprintf(out_string, "%*s%s\t", depth, "", string);
      strcat(console_output, out_string);
    }
  }

  if (obj->type != HWLOC_OBJ_PU) { /* it is not a PU: keep descending */
    if (obj->first_child && obj->first_child->type == HWLOC_OBJ_PU)
      arity = 1; /* print a single representative PU child */
    else
      arity = obj->arity; /* number of children (0 when childless) */
    for (i = 0; i < arity; i++)
      print_machine(topo, obj->children[i], depth + 1);
  }
}
int linux_topology_init(struct arch_topology *topo) { int count, i, j, error,k,tmpNode; #ifdef __DBCSR_CUDA int nDev; ma_get_ndevices_cu(&nDev); topo->ngpus = nDev; #endif local_topo = malloc(sizeof(struct arch_topology)); topo->nnodes = linux_get_nnodes(); local_topo->nnodes = topo->nnodes; topo->ncores = linux_get_ncores(); local_topo->ncores = topo->ncores; topo->npus = topo->ncores; local_topo->npus = topo->npus; //libnuma has no support for I/O devices topo->nnetcards = 0; local_topo->nnetcards = 0; topo->nsockets = linux_get_nsockets(); local_topo->nsockets = topo->nsockets; //Compute number of memory controlers per socket //basically the number of NUMA nodes per socket if (topo->nnodes > topo->nsockets) topo->nmemcontroller = topo->nnodes/topo->nsockets; else topo->nmemcontroller = 1; topo->ncaches = linux_get_ncaches(); local_topo->nmemcontroller = topo->nmemcontroller; local_topo->ncaches = topo->ncaches; topo->nshared_caches = linux_get_nshared_caches(); topo->nsiblings = linux_get_nsiblings(); local_topo->nshared_caches = topo->nshared_caches; local_topo->nsiblings = topo->nsiblings; //Machine node and core representation machine_nodes = (struct node*) malloc (topo->nnodes*sizeof(struct node)); int ncore_node = topo->ncores/topo->nnodes; for (i = 0; i < topo->nnodes ; i++) { machine_nodes[i].id = i; machine_nodes[i].memory = 0; machine_nodes[i].ncores = ncore_node; #ifdef __DBCSR_CUDA ma_get_nDevcu(i,&nDev); machine_nodes[i].mygpus = malloc (nDev*sizeof(int)); ma_get_cu(i,machine_nodes[i].mygpus); #endif } if (topo->nnodes == -1 || topo->ncores == -1 || topo->npus == -1 || topo->nsockets == -1) return -1; else return 0; }