Beispiel #1
0
/*
 * Allocate nbytes bytes of memory at the place pl.
 *
 * CPU place:
 *   - HC_CUDA builds with the PHYSICAL bit set in flags: the buffer is
 *     obtained from cudaMallocHost and recorded in
 *     hclib_context->pinned_host_allocs.
 *   - Builds without HC_CUDA assert that flags == NONE.
 *   - Otherwise the buffer comes from malloc (its result is returned
 *     unchecked).
 * NVGPU place (HC_CUDA builds only): flags must be NONE; device pl->cuda_id
 *   is selected and the buffer is obtained from cudaMalloc.
 * Any other place type is reported through unsupported_place_type_err.
 *
 * Returns the new allocation, or NULL when a CUDA allocation fails.
 */
void *hclib_allocate_at(place_t *pl, size_t nbytes, int flags) {
    HASSERT(pl);
    HASSERT(nbytes > 0);

#ifdef VERBOSE
    /* Trace the request; the line is assembled piecewise and ends below. */
    fprintf(stderr, "hclib_allocate_at: pl=%p nbytes=%lu flags=%d, is_cpu? %s",
            pl, (unsigned long)nbytes, flags,
            is_cpu_place(pl) ? "true" : "false");
#ifdef HC_CUDA
    fprintf(stderr, ", is_nvgpu? %s, cuda_id=%d",
            is_nvgpu_place(pl) ? "true" : "false", pl->cuda_id);
#endif
    fprintf(stderr, "\n");
#endif

    if (is_cpu_place(pl)) {
#ifdef HC_CUDA
        if (flags & PHYSICAL) {
            void *pinned;
            const cudaError_t pin_err =
                cudaMallocHost((void **)&pinned, nbytes);
            if (pin_err == cudaSuccess) {
                /* Record the buffer in the pinned-allocation tree. */
                hclib_memory_tree_insert(pinned, nbytes,
                                         &hclib_context->pinned_host_allocs);
                return pinned;
            }
#ifdef VERBOSE
            fprintf(stderr, "Physical allocation at CPU place failed with "
                    "reason \"%s\"\n", cudaGetErrorString(pin_err));
#endif
            return NULL;
        }
#else
        HASSERT(flags == NONE);
#endif
        return malloc(nbytes);
    }

#ifdef HC_CUDA
    if (is_nvgpu_place(pl)) {
        HASSERT(flags == NONE);
        void *dev_mem;
        HASSERT(pl->cuda_id >= 0);
        /* Select this place's device before allocating on it. */
        CHECK_CUDA(cudaSetDevice(pl->cuda_id));
        const cudaError_t dev_err = cudaMalloc((void **)&dev_mem, nbytes);
        if (dev_err == cudaSuccess) {
            return dev_mem;
        }
#ifdef VERBOSE
        fprintf(stderr, "Allocation at NVGPU place failed with reason "
                "\"%s\"\n", cudaGetErrorString(dev_err));
#endif
        return NULL;
    }
#endif

    unsupported_place_type_err(pl);
    return NULL; /* will never reach here */
}
Beispiel #2
0
/*
 * Emit the HPT XML fragment for hwloc object obj, and for whatever
 * recur_on_children visits beneath it, to output at nesting depth indent.
 *
 * Memory, cache and NVGPU objects each become a <place> element wrapping
 * their children; a CPU worker becomes a childless <worker/> element; any
 * other object emits nothing itself and its children are processed at the
 * same depth.
 */
static void write_hpt_tree(std::ofstream &output, hwloc_obj_t obj, int indent) {
#ifdef VERBOSE
    std::cout << "obj " << obj->os_index
        << " (type=" << std::string(str_for_type(obj->type))
        << ", memory=" << is_memory_place(obj)
        << ", cache=" << is_cache_place(obj)
        << ", worker=" << is_cpu_worker(obj)
        << ", nvgpu=" << is_nvgpu_place(obj) << std::endl;
#endif

    // Step 1: write the opening tag for place-like objects. The two cases
    // that are not <place> elements finish here and return early.
    if (is_memory_place(obj)) {
        write_indent(output, indent);
        output << "<place num=\"1\" type=\"mem\">" << std::endl;
    } else if (is_cache_place(obj)) {
        write_indent(output, indent);
        output << "<place num=\"1\" type=\"cache\">" << std::endl;
    } else if (is_cpu_worker(obj)) {
        // Workers are expected to be leaves of the topology.
        assert(obj->arity == 0);
        write_indent(output, indent);
        output << "<worker num=\"1\"/>" << std::endl;
        return;
    } else if (is_nvgpu_place(obj)) {
        write_indent(output, indent);
        const std::string gpu_vendor(get_hwloc_info(obj, "GPUVendor"));
        const std::string gpu_model(get_hwloc_info(obj, "GPUModel"));
        output << "<place num=\"1\" type=\"nvgpu\" info=\"" << gpu_vendor
            << ", " << gpu_model << "\">" << std::endl;
    } else {
        // Not represented in the output: keep descending without adding a
        // nesting level.
        recur_on_children(output, obj, indent);
        return;
    }

    // Step 2: shared tail of every <place> element - the children one level
    // deeper, followed by the closing tag.
    recur_on_children(output, obj, indent + 1);
    write_indent(output, indent);
    output << "</place>" << std::endl;
}
Beispiel #3
0
/*
 * Return a name for the place pl.
 *
 * CPU place: returns cpu_place_name cast to char *.
 *   NOTE(review): cpu_place_name is defined elsewhere and looks like shared
 *   storage, so callers presumably must not modify or free it - confirm.
 * NVGPU place (HC_CUDA builds only): queries the properties of device
 *   pl->cuda_id and returns a newly malloc'ed copy of the device-name buffer
 *   (sizeof(props.name) bytes), or NULL if that allocation fails.
 * Any other place type: returns the result of unsupported_place_type_err(pl).
 */
char *hclib_get_place_name(place_t *pl) {
    if (is_cpu_place(pl)) {
        return (char *)cpu_place_name;
#ifdef HC_CUDA
    } else if (is_nvgpu_place(pl)) {
        struct cudaDeviceProp props;
        CHECK_CUDA(cudaGetDeviceProperties(&props, pl->cuda_id));
        char *gpu_name = (char *)malloc(sizeof(props.name));
        /* Never copy into a failed allocation; hand the NULL to the caller. */
        if (gpu_name != NULL) {
            memcpy(gpu_name, props.name, sizeof(props.name));
        }
        return gpu_name;
#endif
    } else {
        return unsupported_place_type_err(pl);
    }
}
Beispiel #4
0
/*
 * Release the memory at ptr, which belongs to the place pl.
 *
 * CPU place: in HC_CUDA builds, a pointer that is_pinned_cpu_mem reports as
 *   pinned is removed from hclib_context->pinned_host_allocs and released
 *   with cudaFreeHost; every other pointer is released with free. Builds
 *   without HC_CUDA always use free, because hclib_allocate_at only creates
 *   pinned host allocations when HC_CUDA is defined.
 * NVGPU place (HC_CUDA builds only): released with cudaFree.
 * Any other place type is reported through unsupported_place_type_err.
 */
void hclib_free_at(place_t *pl, void *ptr) {
    if (is_cpu_place(pl)) {
#ifdef HC_CUDA
        if (is_pinned_cpu_mem(ptr)) {
            /* Drop the bookkeeping entry before handing the buffer back. */
            hclib_memory_tree_remove(ptr, &hclib_context->pinned_host_allocs);
            CHECK_CUDA(cudaFreeHost(ptr));
        } else {
            free(ptr);
        }
#else
        /* No CUDA runtime in this build, so nothing can be pinned. */
        free(ptr);
#endif
#ifdef HC_CUDA
    } else if (is_nvgpu_place(pl)) {
        CHECK_CUDA(cudaFree(ptr));
#endif
    } else {
        unsupported_place_type_err(pl);
    }
}
Beispiel #5
0
/*
 * Initialize the HPT and the per-place deques for context.
 *
 * Steps, in order:
 *   1. Allocate and initialize an array of deques for every place.
 *   2. (HC_CUDA builds) Give every NVGPU place a CUDA device id and a stream;
 *      every other place gets cuda_id == -1.
 *   3. For every worker, point its deque slot in each place back at the
 *      worker, then chain the worker's deques from its own place up through
 *      its ancestor places.
 *   4. (VERBOSE builds) Print the place tree to stdout.
 */
void hc_hpt_init(hc_context *context) {
    int i, j;
#ifdef HPT_DESCENTWORKER_PERPLACE
    /*
     * each place has a deque for all workers beneath it (transitively) in the
     * HPT.
     */
    for (i = 0; i < context->nplaces; i++) {
        place_t *pl = context->places[i];
        int nworkers = pl->ndeques;
        pl->deques = malloc(sizeof(hc_deque_t) * nworkers);
        HASSERT(pl->deques);
        for (j = 0; j < nworkers; j++) {
            hc_deque_t *deq = &(pl->deques[j]);
            init_hc_deque_t(deq, pl);
        }
    }
#else // HPT_ALLWORKER_PERPLACE each place has a deque for each worker
    for (i = 0; i < context->nplaces; i++) {
        place_t *pl = context->places[i];
        /* Not const: the TODO-guarded override below assigns to it. */
        int ndeques = context->nworkers;
#ifdef TODO
        if (is_device_place(pl)) ndeques = 1;
#endif
        pl->ndeques = ndeques;
        pl->deques = (hc_deque_t *) malloc(sizeof(hc_deque_t) * ndeques);
        /* Check the allocation here too, as the branch above does. */
        HASSERT(pl->deques);
        for (j = 0; j < ndeques; j++) {
            hc_deque_t *hc_deq = &(pl->deques[j]);
            init_hc_deque_t(hc_deq, pl);
        }
    }
#endif

    /*
     * link the deques for each cpu workers. the deque index is the same as
     * ws->id to simplify the search. For every worker, iterate over all places
     * and store a pointer from the place's deque for that worker to the worker
     * state for that worker.
     *
     * This builds a tree of deques from the worker, to its parent's deque for
     * it, to its grandparent's deque for it, up to the root. It would seem that
     * the majority of deques are therefore unused (i.e. even though we allocate
     * a deque for every worker in a platform in every place, only the deques
     * for workers that are beneath that place in the HPT are used). However,
     * this does make lookups of the deque in a place for a given worker
     * constant time based on offset in place->deques.
     */
#ifdef HC_CUDA
    /*
     * NOTE(review): ngpus is queried but not consulted again in this
     * function; nothing here bounds gpu_counter by it.
     */
    int ngpus = 0;
    int gpu_counter = 0;
    cudaError_t cuda_err = cudaGetDeviceCount(&ngpus);
    if (cuda_err == cudaErrorNoDevice) {
        ngpus = 0;
    }

    /*
     * Device ids are handed out in the order NVGPU places appear in
     * context->places; each such place also gets its own stream.
     */
    for (i = 0; i < context->nplaces; i++) {
        place_t *pl = context->places[i];
        pl->cuda_id = -1;
        if (is_nvgpu_place(pl)) {
            pl->cuda_id = gpu_counter++;
            CHECK_CUDA(cudaSetDevice(pl->cuda_id));
            CHECK_CUDA(cudaStreamCreate(&pl->cuda_stream));
        }
    }
#endif

    for (i = 0; i < context->nworkers; i++) {
        hclib_worker_state *ws = context->workers[i];
        const int id = ws->id;
        /* Point this worker's deque slot in every place back at the worker. */
        for (j = 0; j < context->nplaces; j++) {
            place_t *pl = context->places[j];
            if (is_cpu_place(pl)) {
                hc_deque_t *hc_deq = &(pl->deques[id]);
                hc_deq->ws = ws;
#ifdef HC_CUDA
            } else if (is_nvgpu_place(pl)) {
                hc_deque_t *hc_deq = &(pl->deques[id]);
                hc_deq->ws = ws;

#endif
            } else {
                /* unhandled or ignored situation */
                HASSERT(0);
            }
        }

        /*
         * here we link the deques of the ancestor places for this worker:
         * each deque's prev points at the parent place's deque and the
         * parent's nnext points back down. ws->deques is the deque in the
         * worker's own place; ws->current ends up as the deque in the
         * topmost ancestor (the place whose parent is NULL).
         */
        place_t *parent = ws->pl;
        place_t *current = parent;
        ws->deques = &(current->deques[id]);
        while (parent->parent != NULL) {
            parent = parent->parent;
            current->deques[id].prev = &(parent->deques[id]);
            parent->deques[id].nnext = &(current->deques[id]);
            current = parent;
        }
        ws->current = &(current->deques[id]);
    }

#ifdef VERBOSE
    /*
     * Print HPT: a new output line starts whenever the level changes between
     * consecutive entries of context->places. Each place lists its workers
     * in [ ] and its child place ids in { }.
     */
    int level = context->places[0]->level;
    printf("Level %d: ", level);
    for (i = 0; i < context->nplaces; i++) {
        place_t *pl = context->places[i];
        if (level != pl->level) {
            printf("\n");
            level = pl->level;
            printf("Level %d: ", level);
        }

        printf("Place %d %s ", pl->id, place_type_to_str(pl->type));
        hclib_worker_state *w = pl->workers;
        if (w != NULL) {
            printf("[ ");
            while (w != NULL) {
                printf("W%d ", w->id);
                w = w->next_worker;
            }
            printf("] ");
        }

        place_t *c = pl->child;
        if (c != NULL) {
            printf("{ ");
            while (c != NULL) {
                printf("%d ", c->id);
                c = c->nnext;
            }
            printf("} ");
        }
        printf("\t");
    }
    printf("\n");
#endif
}