/*
 * Allocate nbytes bytes of memory at the place pl.
 *
 *   pl     - target place; must be non-NULL.
 *   nbytes - number of bytes requested; must be greater than zero.
 *   flags  - allocation flags. On a CPU place in a HC_CUDA build, PHYSICAL
 *            requests pinned host memory through cudaMallocHost. In every
 *            other supported case flags is asserted to be NONE.
 *
 * Returns the new allocation, or NULL when a CUDA allocation fails. Plain CPU
 * allocations return whatever malloc returns. Successful pinned host
 * allocations are recorded in hclib_context->pinned_host_allocs.
 */
void *hclib_allocate_at(place_t *pl, size_t nbytes, int flags) {
    HASSERT(pl);
    HASSERT(nbytes > 0);

#ifdef VERBOSE
    fprintf(stderr, "hclib_allocate_at: pl=%p nbytes=%lu flags=%d, is_cpu? %s",
            pl, (unsigned long)nbytes, flags,
            is_cpu_place(pl) ? "true" : "false");
#ifdef HC_CUDA
    fprintf(stderr, ", is_nvgpu? %s, cuda_id=%d",
            is_nvgpu_place(pl) ? "true" : "false", pl->cuda_id);
#endif
    fprintf(stderr, "\n");
#endif

    /* Host-side allocation: pinned (CUDA builds only) or ordinary heap. */
    if (is_cpu_place(pl)) {
#ifdef HC_CUDA
        if (flags & PHYSICAL) {
            void *pinned;
            const cudaError_t status = cudaMallocHost((void **)&pinned, nbytes);
            if (status == cudaSuccess) {
                /* Track the pinned range in the context's pinned-allocation tree. */
                hclib_memory_tree_insert(pinned, nbytes,
                        &hclib_context->pinned_host_allocs);
                return pinned;
            }
#ifdef VERBOSE
            fprintf(stderr, "Physical allocation at CPU place failed with "
                    "reason \"%s\"\n", cudaGetErrorString(status));
#endif
            return NULL;
        }
#else
        HASSERT(flags == NONE);
#endif
        return malloc(nbytes);
    }

#ifdef HC_CUDA
    /* Device-side allocation on the GPU bound to this place. */
    if (is_nvgpu_place(pl)) {
        HASSERT(flags == NONE);
        void *device_mem;
        HASSERT(pl->cuda_id >= 0);
        CHECK_CUDA(cudaSetDevice(pl->cuda_id));
        const cudaError_t status = cudaMalloc((void **)&device_mem, nbytes);
        if (status == cudaSuccess) {
            return device_mem;
        }
#ifdef VERBOSE
        fprintf(stderr, "Allocation at NVGPU place failed with reason "
                "\"%s\"\n", cudaGetErrorString(status));
#endif
        return NULL;
    }
#endif

    unsupported_place_type_err(pl);
    return NULL; // will never reach here
}
static void write_hpt_tree(std::ofstream &output, hwloc_obj_t obj, int indent) { #ifdef VERBOSE std::cout << "obj " << obj->os_index << " (type=" << std::string(str_for_type(obj->type)) << ", memory=" << is_memory_place(obj) << ", cache=" << is_cache_place(obj) << ", worker=" << is_cpu_worker(obj) << ", nvgpu=" << is_nvgpu_place(obj) << std::endl; #endif if (is_memory_place(obj)) { write_indent(output, indent); output << "<place num=\"1\" type=\"mem\">" << std::endl; recur_on_children(output, obj, indent + 1); write_indent(output, indent); output << "</place>" << std::endl; } else if (is_cache_place(obj)) { write_indent(output, indent); output << "<place num=\"1\" type=\"cache\">" << std::endl; recur_on_children(output, obj, indent + 1); write_indent(output, indent); output << "</place>" << std::endl; } else if (is_cpu_worker(obj)) { assert(obj->arity == 0); write_indent(output, indent); output << "<worker num=\"1\"/>" << std::endl; } else if (is_nvgpu_place(obj)) { write_indent(output, indent); output << "<place num=\"1\" type=\"nvgpu\" info=\"" << std::string(get_hwloc_info(obj, "GPUVendor")) << ", " << std::string(get_hwloc_info(obj, "GPUModel")) << "\">" << std::endl; recur_on_children(output, obj, indent + 1); write_indent(output, indent); output << "</place>" << std::endl; } else { // Just continue down the tree, ignoring whatever the current node is recur_on_children(output, obj, indent); } }
/*
 * Return a printable name for the place pl.
 *
 * For a CPU place this is a pointer to the shared cpu_place_name string. For
 * an NVGPU place (HC_CUDA builds) it is a freshly malloc'ed copy of the
 * device name reported by cudaGetDeviceProperties for pl->cuda_id.
 *
 * NOTE(review): only the NVGPU result is heap-allocated, so a caller cannot
 * free the return value without first checking the place type -- confirm how
 * callers handle ownership.
 */
char *hclib_get_place_name(place_t *pl) {
    HASSERT(pl);

    if (is_cpu_place(pl)) {
        return (char *)cpu_place_name;
#ifdef HC_CUDA
    } else if (is_nvgpu_place(pl)) {
        struct cudaDeviceProp props;
        CHECK_CUDA(cudaGetDeviceProperties(&props, pl->cuda_id));

        /* Copy the whole fixed-size name field out of the stack-local props. */
        char *gpu_name = (char *)malloc(sizeof(props.name));
        /* Fail loudly instead of passing a NULL destination to memcpy. */
        HASSERT(gpu_name);
        memcpy(gpu_name, props.name, sizeof(props.name));
        return gpu_name;
#endif
    } else {
        return unsupported_place_type_err(pl);
    }
}
/*
 * Release memory at the place pl.
 *
 *   pl  - place the memory belongs to.
 *   ptr - pointer to release.
 *
 * On a CPU place, a pointer that is_pinned_cpu_mem reports as pinned host
 * memory is removed from hclib_context->pinned_host_allocs and released with
 * cudaFreeHost; any other pointer goes to free(). Pinned host memory is only
 * handled in HC_CUDA builds, so the pinned path is compiled only there. On an
 * NVGPU place the pointer is released with cudaFree.
 */
void hclib_free_at(place_t *pl, void *ptr) {
    if (is_cpu_place(pl)) {
#ifdef HC_CUDA
        if (is_pinned_cpu_mem(ptr)) {
            /* Drop the bookkeeping entry before handing the memory back. */
            hclib_memory_tree_remove(ptr, &hclib_context->pinned_host_allocs);
            CHECK_CUDA(cudaFreeHost(ptr));
        } else {
            free(ptr);
        }
#else
        /* Without CUDA every CPU allocation is treated as ordinary heap. */
        free(ptr);
#endif
#ifdef HC_CUDA
    } else if (is_nvgpu_place(pl)) {
        CHECK_CUDA(cudaFree(ptr));
#endif
    } else {
        unsupported_place_type_err(pl);
    }
}
/*
 * init the hpt and place deques
 *
 * Allocates and initializes the deque array of every place in context,
 * assigns CUDA device ordinals and streams to NVGPU places (HC_CUDA builds),
 * then links each worker to its per-place deques from its own place up to the
 * root of the HPT. With VERBOSE defined the resulting tree is printed to
 * stdout.
 */
void hc_hpt_init(hc_context *context) {
    int i, j;
#ifdef HPT_DESCENTWORKER_PERPLACE
    /*
     * each place has a deque for all workers beneath it (transitively) in the
     * HPT.
     */
    for (i = 0; i < context->nplaces; i++) {
        place_t *pl = context->places[i];
        int nworkers = pl->ndeques;
        pl->deques = (hc_deque_t *) malloc(sizeof(hc_deque_t) * nworkers);
        HASSERT(pl->deques);
        for (j = 0; j < nworkers; j++) {
            hc_deque_t *deq = &(pl->deques[j]);
            init_hc_deque_t(deq, pl);
        }
    }
#else // HPT_ALLWORKER_PERPLACE each place has a deque for each worker
    for (i = 0; i < context->nplaces; i++) {
        place_t *pl = context->places[i];
        /* Not const: the TODO path below overrides it for device places. */
        int ndeques = context->nworkers;
#ifdef TODO
        if (is_device_place(pl)) ndeques = 1;
#endif
        pl->ndeques = ndeques;
        pl->deques = (hc_deque_t *) malloc(sizeof(hc_deque_t) * ndeques);
        /* Same allocation check as the per-descendant-worker configuration. */
        HASSERT(pl->deques);
        for (j = 0; j < ndeques; j++) {
            hc_deque_t *hc_deq = &(pl->deques[j]);
            init_hc_deque_t(hc_deq, pl);
        }
    }
#endif

    /*
     * link the deques for each cpu workers. the deque index is the same as
     * ws->id to simplify the search. For every worker, iterate over all places
     * and store a pointer from the place's deque for that worker to the worker
     * state for that worker.
     *
     * This builds a tree of deques from the worker, to its parent's deque for
     * it, to its grandparent's deque for it, up to the root. It would seem that
     * the majority of deques are therefore unused (i.e. even though we allocate
     * a dequeue for every worker in a platform in every place, only the deques
     * for workers that are beneath that place in the HPT are used). However,
     * this does make lookups of the deque in a place for a given worker
     * constant time based on offset in place->deques.
     */
#ifdef HC_CUDA
    int ngpus = 0;
    int gpu_counter = 0;
    cudaError_t cuda_err = cudaGetDeviceCount(&ngpus);
    if (cuda_err != cudaSuccess) {
        /*
         * Any failure to query devices (no device, no usable driver, ...) is
         * treated as "no GPUs available" so CPU-only runs keep working.
         */
        ngpus = 0;
    }

    for (i = 0; i < context->nplaces; i++) {
        place_t *pl = context->places[i];
        pl->cuda_id = -1;
        if (is_nvgpu_place(pl)) {
            /* GPU places get device ordinals in the order they appear. */
            HASSERT(gpu_counter < ngpus);
            pl->cuda_id = gpu_counter++;
            CHECK_CUDA(cudaSetDevice(pl->cuda_id));
            CHECK_CUDA(cudaStreamCreate(&pl->cuda_stream));
        }
    }
#endif

    for (i = 0; i < context->nworkers; i++) {
        hclib_worker_state *ws = context->workers[i];
        const int id = ws->id;
        for (j = 0; j < context->nplaces; j++) {
            place_t *pl = context->places[j];
            if (is_cpu_place(pl)) {
                hc_deque_t *hc_deq = &(pl->deques[id]);
                hc_deq->ws = ws;
#ifdef HC_CUDA
            } else if (is_nvgpu_place(pl)) {
                hc_deque_t *hc_deq = &(pl->deques[id]);
                hc_deq->ws = ws;
#endif
            } else {
                /* unhandled or ignored situation */
                HASSERT(0);
            }
        }

        /* here we link the deques of the ancestor places for this worker */
        place_t *parent = ws->pl;
        place_t *current = parent;
        ws->deques = &(current->deques[id]);
        while (parent->parent != NULL) {
            parent = parent->parent;
            current->deques[id].prev = &(parent->deques[id]);
            parent->deques[id].nnext = &(current->deques[id]);
            current = parent;
        }
        /* current is now the topmost ancestor reached by the walk above. */
        ws->current = &(current->deques[id]);
    }

#ifdef VERBOSE
    /*Print HPT*/
    int level = context->places[0]->level;
    printf("Level %d: ", level);
    for (i = 0; i < context->nplaces; i++) {
        place_t *pl = context->places[i];
        if (level != pl->level) {
            printf("\n");
            level = pl->level;
            printf("Level %d: ", level);
        }

        printf("Place %d %s ", pl->id, place_type_to_str(pl->type));
        hclib_worker_state *w = pl->workers;
        if (w != NULL) {
            printf("[ ");
            while (w != NULL) {
                printf("W%d ", w->id);
                w = w->next_worker;
            }
            printf("] ");
        }

        place_t *c = pl->child;
        if (c != NULL) {
            printf("{ ");
            while (c != NULL) {
                printf("%d ", c->id);
                c = c->nnext;
            }
            printf("} ");
        }
        printf("\t");
    }
    printf("\n");
#endif
}