/*
 * Prints one branch of the machine topology.
 *
 * Recursive function that walks the machine topology and appends one
 * formatted entry per visited object to the file-level console_output
 * buffer. The machine (root) object itself is not printed.
 *
 *  topology: the HWLOC object that represents the machine
 *  obj:      the current object of a level
 *  depth:    indentation width used for this object; children are
 *            printed with depth+1
 *  obj_type: object type at which the walk is narrowed: when the first
 *            child of obj is a PU or has this type, only that first
 *            child is descended into
 *
 * NOTE(review): console_output is appended to with strcat and its
 * capacity is not visible from this function - confirm it is large
 * enough for the biggest expected topology.
 */
void print_machine_branch(hwloc_topology_t topology, hwloc_obj_t obj, int depth, int obj_type)
{
  char string[128], out_string[128];
  unsigned i, arity;
  hwloc_obj_t child;

  if (obj->type != HWLOC_OBJ_MACHINE) {
    /* Default label "<type>#<index>"; the shared-cache branch replaces it. */
    hwloc_obj_snprintf(string, sizeof(string), topology, obj, "#", 0);

    if (obj->type == HWLOC_OBJ_NODE) {
      snprintf(out_string, sizeof(out_string), "%*s%s\n", depth, "", string);
      strcat(console_output, out_string);
#if defined (__DBCSR_ACC) || defined (__PW_CUDA)
      /* Fewer GPUs than cores: list the devices attached to this node. */
      if ((local_topo->ngpus > 0) && (local_topo->ngpus < local_topo->ncores)) {
        int countDev = 0;
        int *devIds = NULL;
        int d;

        ma_get_nDevcu(obj->logical_index, &countDev);
        if (countDev > 0)
          devIds = malloc((size_t)countDev * sizeof(int));
        strcat(console_output, " Shared GPUS: ");
        if (devIds) {
          ma_get_cu(obj->logical_index, devIds);
          for (d = 0; d < countDev; d++) {
            snprintf(out_string, sizeof(out_string), "#%d ", devIds[d]);
            strcat(console_output, out_string);
          }
        }
        /* The id list is only needed for printing. */
        free(devIds);
        strcat(console_output, "\n");
      }
#endif
    }
    else if (obj->type == HWLOC_OBJ_SOCKET) {
      snprintf(out_string, sizeof(out_string), "%*s%s\n", depth, "", string);
      strcat(console_output, out_string);
    }
    else if (obj->type == HWLOC_OBJ_PU) {
#if defined (__DBCSR_ACC) || defined (__PW_CUDA)
      snprintf(out_string, sizeof(out_string), "%*s%s\t", depth, "", string);
      strcat(console_output, out_string);
      /* As many GPUs as cores: print the device paired with this PU. */
      if (local_topo->ngpus > 0 && local_topo->ngpus == local_topo->ncores) {
        int devId;

        ma_get_core_cu(obj->logical_index, &devId);
        strcat(console_output, " GPU: ");
        snprintf(out_string, sizeof(out_string), "%d ", devId);
        strcat(console_output, out_string);
      }
      strcat(console_output, "\n");
#else
      snprintf(out_string, sizeof(out_string), "%*s%s\n", depth, "", string);
      strcat(console_output, out_string);
#endif
    }
    else if (obj->type == HWLOC_OBJ_CACHE && obj->arity > 1) {
      /* Shared cache: print its type name followed by its size in MB. */
      hwloc_obj_type_snprintf(string, sizeof(string), obj, 0);
      snprintf(out_string, sizeof(out_string), "%*s%s", depth, "", string);
      strcat(console_output, out_string);
      /* cache.size is a 64-bit unsigned value, so it must not go through %d. */
      snprintf(out_string, sizeof(out_string), " (%lluMB)\n",
               (unsigned long long)(obj->attr->cache.size / (1024 * 1024)));
      strcat(console_output, out_string);
    }
    else {
      snprintf(out_string, sizeof(out_string), "%*s%s\t", depth, "", string);
      strcat(console_output, out_string);
    }
  }

  if (obj->type != HWLOC_OBJ_PU) { /* it is not a PU */
    /* Both type tests need a child; without one fall back to obj->arity. */
    child = obj->first_child;
    if (child && (child->type == HWLOC_OBJ_PU || child->type == obj_type))
      arity = 1; /* only follow the first child */
    else
      arity = obj->arity;

    for (i = 0; i < arity; i++)
      print_machine_branch(topology, obj->children[i], depth + 1, obj_type);
  }
}
//Initializes HWLOC and load the machine architecture int hw_topology_init (struct arch_topology *topo) { hwloc_obj_t obj, core1, core2; int count, i, j, error; //Create the machine representation error = hwloc_topology_init(&topology); //Go throught the topology only if HWLOC is //successifully initialized if(!error) { hwloc_topology_load(topology); local_topo = malloc(sizeof(struct arch_topology)); #if defined (__DBCSR_ACC) || defined (__PW_CUDA) int nDev; ma_get_ndevices_cu(&nDev); #endif //Extract number of NUMA nodes if (hwloc_get_type_depth (topology, HWLOC_OBJ_NODE)) topo->nnodes = hwloc_get_nbobjs_by_depth (topology, hwloc_get_type_depth (topology, HWLOC_OBJ_NODE)); else topo->nnodes = 0; //Get number of cores, sockets and processing units topo->ncores = hwloc_get_nbobjs_by_depth (topology, hwloc_get_type_depth (topology, HWLOC_OBJ_CORE)); topo->nsockets = hwloc_get_nbobjs_by_depth (topology, hwloc_get_type_depth (topology, HWLOC_OBJ_SOCKET)); topo->npus = hwloc_get_nbobjs_by_depth (topology, hwloc_get_type_depth (topology, HWLOC_OBJ_PU)); //Compute number of memory controlers per socket //basically the number of NUMA nodes per socket if (topo->nnodes > topo->nsockets) topo->nmemcontroller = topo->nnodes/topo->nsockets; else topo->nmemcontroller = 1; count = 0; topo->nshared_caches = 0; //Get derivate information - get number of cache per PU for(obj = hwloc_get_obj_by_type(topology,HWLOC_OBJ_PU,0); obj; obj = obj->parent) { if (obj->type == HWLOC_OBJ_CACHE) { if (obj->arity>1) topo->nshared_caches++; else { count++; topo->ncaches = count; } } } //Number of direct siblings //Siblings cores are the ones that share at least one component //level of the architecture count = 0; core1 = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, 0); core2 = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, 1); obj = hwloc_get_common_ancestor_obj(topology, core1, core2); if (obj) topo->nsiblings = obj->arity; //Machine node and core representation machine_nodes = (struct node*) 
malloc (topo->nnodes*sizeof(struct node)); machine_cores = (struct core*) malloc (topo->ncores*sizeof(struct core)); phys_cpus = malloc (topo->ncores*sizeof(int)); get_phys_id(topology, topo->ncores, 0); //Get the caches sizes and other information for each core for (i = 0; i < topo->ncores ; i++) { machine_cores[i].caches = malloc (topo->ncaches*sizeof(size_t)); machine_cores[i].shared_caches = malloc (topo->ncaches*sizeof(int)); for (j = 0; j < topo->ncaches; j++) machine_cores[i].shared_caches[j] = 0; for (j = topo->ncaches ; j > topo->ncaches - topo->nshared_caches; j--) machine_cores[i].shared_caches[j-1] = 1; machine_cores[i].nsiblings = topo->nsiblings; machine_cores[i].siblings_id = malloc (topo->nsiblings*sizeof(unsigned)); if(topo->ncores == topo->npus){ core1 = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, i); machine_cores[i].id = core1->os_index; count = 0; for(obj = hwloc_get_obj_by_type(topology,HWLOC_OBJ_PU,i); obj; obj = obj->parent) { if (obj->type == HWLOC_OBJ_CACHE){ machine_cores[i].caches[count] = obj->attr->cache.size / 1024; count++; } if (obj->type == HWLOC_OBJ_NODE) machine_cores[i].numaNode = obj->logical_index; } } else{ core1 = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, i); machine_cores[i].id = core1->os_index; count = 0; for(obj = hwloc_get_obj_by_type(topology,HWLOC_OBJ_CORE,i); obj; obj = obj->parent) { if (obj->type == HWLOC_OBJ_CACHE) { machine_cores[i].caches[count] = obj->attr->cache.size / 1024; count++; } if (obj->type == HWLOC_OBJ_NODE) machine_cores[i].numaNode = obj->logical_index; } } } //Get siblings id - so each core knows its siblings for (i = 0; i < topo->ncores ; i++) { if(topo->ncores == topo->npus){ core1 = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, i); set_phys_siblings(i,machine_cores[i].id,core1,topo->ncores,topo->nsiblings,HWLOC_OBJ_PU); } else{ core1 = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, i); set_phys_siblings(i,machine_cores[i].id,core1,topo->ncores,topo->nsiblings,HWLOC_OBJ_CORE); } } int 
ncore_node = topo->ncores/topo->nnodes; int count_cores; //Get the information for each NUMAnode for (i = 0; i < topo->nnodes ; i++) { obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NODE, i); machine_nodes[i].id = obj->os_index; machine_nodes[i].memory = obj->memory.total_memory; machine_nodes[i].ncores = ncore_node; machine_nodes[i].mycores = malloc (ncore_node*sizeof(unsigned)); //Get the cores id of each NUMAnode count_cores = 0; set_node_cores(topology, obj, i, &count_cores); //GPU support #if defined (__DBCSR_ACC) || defined (__PW_CUDA) int *devIds; devIds = malloc (nDev*sizeof(int)); topo->ngpus = nDev; ma_get_cu(i,devIds); machine_nodes[i].mygpus = devIds; #endif } //counting network cards count = 0; hwloc_topology_t topo_net; error = hwloc_topology_init(&topo_net); hwloc_topology_set_flags(topo_net, HWLOC_TOPOLOGY_FLAG_IO_DEVICES); if (!error){ hwloc_topology_load(topo_net); for (obj = hwloc_get_obj_by_type(topo_net, HWLOC_OBJ_OS_DEVICE, 0); obj; obj = hwloc_get_next_osdev(topo_net,obj)) if (obj->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK || obj->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) count++; topo->nnetcards = count; } else //if can not load I/O devices topo->nnetcards = 0; hwloc_topology_destroy(topo_net); /*Local copy of the machine topology components*/ local_topo->nnodes = topo->nnodes; local_topo->nsockets = topo->nsockets; local_topo->ncores = topo->ncores; local_topo->npus = topo->npus; local_topo->ngpus = topo->ngpus; local_topo->ncaches = topo->ncaches; local_topo->nshared_caches = topo->nshared_caches; local_topo->nsiblings = topo->nsiblings; local_topo->nmemcontroller = topo->nmemcontroller; local_topo->nnetcards = topo->nnetcards; } return error; }
/*
 * Prints the machine hierarchy.
 *
 * Recursive function that walks the machine topology and appends one
 * formatted entry per visited object to the file-level console_output
 * buffer.
 *
 *  topo:  the HWLOC topology (presumably the one obj belongs to); it is
 *         handed to hwloc_obj_snprintf and to the recursive calls
 *  obj:   the current object in the topology
 *  depth: indentation width used for this object; children are printed
 *         with depth+1
 *
 * When the first child of an object is a PU, only that first child is
 * descended into.
 *
 * NOTE(review): console_output is appended to with strcat and its
 * capacity is not visible from this function - confirm it is large
 * enough for the biggest expected topology.
 */
void print_machine(hwloc_topology_t topo, hwloc_obj_t obj, int depth)
{
  char string[256], out_string[256];
  unsigned i, arity;

  /* Default label "<type>#<index>"; some branches below replace it. */
  hwloc_obj_snprintf(string, sizeof(string), topo, obj, "#", 0);

  if (obj->type == HWLOC_OBJ_SOCKET || obj->type == HWLOC_OBJ_MACHINE) {
    snprintf(out_string, sizeof(out_string), "%*s%s\n", depth, "", string);
    strcat(console_output, out_string);
  }
  else if (obj->type == HWLOC_OBJ_NODE) {
    snprintf(out_string, sizeof(out_string), "%*s%s\n", depth, "", string);
    strcat(console_output, out_string);
    /* if the machine has shared GPUs */
#if defined (__DBCSR_ACC) || defined (__PW_CUDA)
    if ((local_topo->ngpus > 0) && (local_topo->ngpus < local_topo->ncores)) {
      int countDev = 0;
      int *devIds = NULL;
      int d;

      ma_get_nDevcu(obj->logical_index, &countDev);
      if (countDev > 0)
        devIds = malloc((size_t)countDev * sizeof(int));
      strcat(console_output, " Shared GPUS: ");
      if (devIds) {
        ma_get_cu(obj->logical_index, devIds);
        for (d = 0; d < countDev; d++) {
          snprintf(out_string, sizeof(out_string), "#%d ", devIds[d]);
          strcat(console_output, out_string);
        }
      }
      /* The id list is only needed for printing. */
      free(devIds);
      strcat(console_output, "\n");
    }
#endif
  }
  else if (obj->type == HWLOC_OBJ_PU) {
#if defined (__DBCSR_ACC) || defined (__PW_CUDA)
    snprintf(out_string, sizeof(out_string), "%*s%s\t", depth, "", string);
    strcat(console_output, out_string);
    /* As many GPUs as cores: print the device paired with this PU. */
    if (local_topo->ngpus > 0 && local_topo->ngpus == local_topo->ncores) {
      int devId;

      ma_get_core_cu(obj->logical_index, &devId);
      strcat(console_output, " GPU: ");
      snprintf(out_string, sizeof(out_string), "%d ", devId);
      strcat(console_output, out_string);
    }
    strcat(console_output, "\n");
#else
    snprintf(out_string, sizeof(out_string), "%*s%s\n", depth, "", string);
    strcat(console_output, out_string);
#endif
  }
  else if (obj->type == HWLOC_OBJ_CACHE && obj->arity > 1) {
    /* Shared cache: print its type name followed by its size in MB. */
    hwloc_obj_type_snprintf(string, sizeof(string), obj, 0);
    snprintf(out_string, sizeof(out_string), "%*s%s", depth, "", string);
    strcat(console_output, out_string);
    /* cache.size is a 64-bit unsigned value, so it must not go through %d. */
    snprintf(out_string, sizeof(out_string), " (%lluMB)\n",
             (unsigned long long)(obj->attr->cache.size / (1024 * 1024)));
    strcat(console_output, out_string);
  }
  else if (obj->type == HWLOC_OBJ_OS_DEVICE ||
           obj->type == HWLOC_OBJ_PCI_DEVICE ||
           obj->type == HWLOC_OBJ_BRIDGE) {
    /* I/O objects print nothing, except network OS devices. The osdev
       attributes are only valid on OS-device objects, so PCI devices and
       bridges must not be inspected through them. */
    if (obj->type == HWLOC_OBJ_OS_DEVICE &&
        obj->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) {
      snprintf(out_string, sizeof(out_string), "%*s%s\n", depth, "--", "Network Card");
      strcat(console_output, out_string);
    }
  }
  else if (obj->type == HWLOC_OBJ_CORE) {
    /* Cores are labelled with their logical index. */
    snprintf(string, sizeof(string), "Core#%u", obj->logical_index);
    snprintf(out_string, sizeof(out_string), "%*s%s\t", depth, "", string);
    strcat(console_output, out_string);
  }
  else {
    snprintf(out_string, sizeof(out_string), "%*s%s\t", depth, "", string);
    strcat(console_output, out_string);
  }

  if (obj->type != HWLOC_OBJ_PU) { /* it is not a PU */
    if (obj->first_child && obj->first_child->type == HWLOC_OBJ_PU)
      arity = 1; /* only follow the first child */
    else
      arity = obj->arity;

    for (i = 0; i < arity; i++)
      print_machine(topo, obj->children[i], depth + 1);
  }
}
int linux_topology_init(struct arch_topology *topo) { int count, i, j, error,k,tmpNode; #ifdef __DBCSR_CUDA int nDev; ma_get_ndevices_cu(&nDev); topo->ngpus = nDev; #endif local_topo = malloc(sizeof(struct arch_topology)); topo->nnodes = linux_get_nnodes(); local_topo->nnodes = topo->nnodes; topo->ncores = linux_get_ncores(); local_topo->ncores = topo->ncores; topo->npus = topo->ncores; local_topo->npus = topo->npus; //libnuma has no support for I/O devices topo->nnetcards = 0; local_topo->nnetcards = 0; topo->nsockets = linux_get_nsockets(); local_topo->nsockets = topo->nsockets; //Compute number of memory controlers per socket //basically the number of NUMA nodes per socket if (topo->nnodes > topo->nsockets) topo->nmemcontroller = topo->nnodes/topo->nsockets; else topo->nmemcontroller = 1; topo->ncaches = linux_get_ncaches(); local_topo->nmemcontroller = topo->nmemcontroller; local_topo->ncaches = topo->ncaches; topo->nshared_caches = linux_get_nshared_caches(); topo->nsiblings = linux_get_nsiblings(); local_topo->nshared_caches = topo->nshared_caches; local_topo->nsiblings = topo->nsiblings; //Machine node and core representation machine_nodes = (struct node*) malloc (topo->nnodes*sizeof(struct node)); int ncore_node = topo->ncores/topo->nnodes; for (i = 0; i < topo->nnodes ; i++) { machine_nodes[i].id = i; machine_nodes[i].memory = 0; machine_nodes[i].ncores = ncore_node; #ifdef __DBCSR_CUDA ma_get_nDevcu(i,&nDev); machine_nodes[i].mygpus = malloc (nDev*sizeof(int)); ma_get_cu(i,machine_nodes[i].mygpus); #endif } if (topo->nnodes == -1 || topo->ncores == -1 || topo->npus == -1 || topo->nsockets == -1) return -1; else return 0; }