Beispiel #1
0
WEAK int halide_dev_run(void *user_context,
                        void *state_ptr,
                        const char* entry_name,
                        int blocksX, int blocksY, int blocksZ,
                        int threadsX, int threadsY, int threadsZ,
                        int shared_mem_bytes,
                        size_t arg_sizes[],
                        void* args[]) {
    DEBUG_PRINTF( user_context, "CUDA: halide_dev_run (user_context: %p, entry: %s, blocks: %dx%dx%d, threads: %dx%dx%d, shmem: %d)\n",
                  user_context, entry_name,
                  blocksX, blocksY, blocksZ,
                  threadsX, threadsY, threadsZ,
                  shared_mem_bytes );

    CUresult err;
    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    halide_assert(user_context, state_ptr);
    CUmodule mod = ((module_state*)state_ptr)->module;
    halide_assert(user_context, mod);
    CUfunction f;
    err = cuModuleGetFunction(&f, mod, entry_name);
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuModuleGetFunction failed (%s)",
                             _get_error_name(err));
        return err;
    }

    err = cuLaunchKernel(f,
                         blocksX,  blocksY,  blocksZ,
                         threadsX, threadsY, threadsZ,
                         shared_mem_bytes,
                         NULL, // stream
                         args,
                         NULL);
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuLaunchKernel failed (%s)",
                             _get_error_name(err));
        return err;
    }

    #ifdef DEBUG
    err = cuCtxSynchronize();
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuCtxSynchronize failed (%s)\n",
                             _get_error_name(err));
        return err;
    }
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif
    return 0;
}
Beispiel #2
0
WEAK int halide_dev_free(void *user_context, buffer_t* buf) {
    DEBUG_PRINTF( user_context, "CL: halide_dev_free (user_context: %p, buf: %p)\n", user_context, buf );

    ClContext ctx(user_context);

    // halide_dev_free, at present, can be exposed to clients and they
    // should be allowed to call halide_dev_free on any buffer_t
    // including ones that have never been used with a GPU.
    if (buf->dev == 0) {
      return 0;
    }

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));
    DEBUG_PRINTF(user_context, "    clReleaseMemObject %p\n", (cl_mem)buf->dev );
    cl_int result = clReleaseMemObject((cl_mem)buf->dev);
    // If clReleaseMemObject fails, it is unlikely to succeed in a later call, so
    // we just end our reference to it regardless.
    buf->dev = 0;
    if (result != CL_SUCCESS) {
        halide_error_varargs(user_context, "CL: clReleaseMemObject failed (%d)", result);
        return result;
    }

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return 0;
}
Beispiel #3
0
// Used to generate correct timings when tracing
WEAK int halide_dev_sync(void *user_context) {
    DEBUG_PRINTF( user_context, "CUDA: halide_dev_sync (user_context: %p)\n", user_context );

    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    CUresult err = cuCtxSynchronize();
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuCtxSynchronize failed (%s)",
                             _get_error_name(err));
        return err;
    }

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return 0;
}
Beispiel #4
0
WEAK int halide_copy_to_host(void *user_context, buffer_t* buf) {
    if (!buf->dev_dirty) {
        return 0;
    }

    DEBUG_PRINTF( user_context, "CUDA: halide_copy_to_host (user_context: %p, buf: %p)\n", user_context, buf );

    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    // Need to check dev_dirty again, in case another thread did the
    // copy_to_host before the serialization point above.
    if (buf->dev_dirty) {
        #ifdef DEBUG
        uint64_t t_before = halide_current_time_ns(user_context);
        #endif

        halide_assert(user_context, buf->dev && buf->dev);
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));

        _dev_copy c = _make_dev_to_host_copy(buf);

        for (int w = 0; w < c.extent[3]; w++) {
            for (int z = 0; z < c.extent[2]; z++) {
                for (int y = 0; y < c.extent[1]; y++) {
                    for (int x = 0; x < c.extent[0]; x++) {
                        uint64_t off = (x * c.stride_bytes[0] +
                                        y * c.stride_bytes[1] +
                                        z * c.stride_bytes[2] +
                                        w * c.stride_bytes[3]);
                        CUdeviceptr src = (CUdeviceptr)(c.src + off);
                        void *dst = (void *)(c.dst + off);
                        uint64_t size = c.chunk_size;
                        DEBUG_PRINTF( user_context, "    cuMemcpyDtoH (%d, %d, %d, %d), %p -> %p, %lld bytes\n",
                                      x, y, z, w,
                                      (void *)src, dst, (long long)size );
                        CUresult err = cuMemcpyDtoH(dst, src, size);
                        if (err != CUDA_SUCCESS) {
                            halide_error_varargs(user_context, "CUDA: cuMemcpyDtoH failed (%s)",
                                                 _get_error_name(err));
                            return err;
                        }
                    }
                }
            }
        }

        #ifdef DEBUG
        uint64_t t_after = halide_current_time_ns(user_context);
        halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
        #endif
    }
    buf->dev_dirty = false;
    return 0;
}
Beispiel #5
0
WEAK int halide_dev_malloc(void *user_context, buffer_t *buf) {
    DEBUG_PRINTF( user_context, "CUDA: halide_dev_malloc (user_context: %p, buf: %p)\n", user_context, buf );

    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    size_t size = _buf_size(user_context, buf);
    if (buf->dev) {
        // This buffer already has a device allocation
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf, size));
        return 0;
    }

    halide_assert(user_context, buf->stride[0] >= 0 && buf->stride[1] >= 0 &&
                                buf->stride[2] >= 0 && buf->stride[3] >= 0);

    DEBUG_PRINTF(user_context, "    allocating buffer of %lld bytes, "
                 "extents: %lldx%lldx%lldx%lld strides: %lldx%lldx%lldx%lld (%d bytes per element)\n",
                 (long long)size,
                 (long long)buf->extent[0], (long long)buf->extent[1],
                 (long long)buf->extent[2], (long long)buf->extent[3],
                 (long long)buf->stride[0], (long long)buf->stride[1],
                 (long long)buf->stride[2], (long long)buf->stride[3],
                 buf->elem_size);

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    CUdeviceptr p;
    DEBUG_PRINTF( user_context, "    cuMemAlloc %lld -> ", size );
    CUresult err = cuMemAlloc(&p, size);
    if (err != CUDA_SUCCESS) {
        DEBUG_PRINTF( user_context, "%s\n", _get_error_name(err));
        halide_error_varargs(user_context, "CUDA: cuMemAlloc failed (%s)",
                             _get_error_name(err));
        return err;
    } else {
        DEBUG_PRINTF( user_context, "%p\n", p );
    }
    halide_assert(user_context, p);
    buf->dev = (uint64_t)p;

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return 0;
}
Beispiel #6
0
WEAK int halide_dev_malloc(void *user_context, buffer_t* buf) {
    DEBUG_PRINTF( user_context, "CL: halide_dev_malloc (user_context: %p, buf: %p)\n", user_context, buf );

    ClContext ctx(user_context);
    if (ctx.error != CL_SUCCESS) {
        return ctx.error;
    }

    size_t size = _buf_size(user_context, buf);
    if (buf->dev) {
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf, size));
        return 0;
    }

    halide_assert(user_context, buf->stride[0] >= 0 && buf->stride[1] >= 0 &&
                                buf->stride[2] >= 0 && buf->stride[3] >= 0);

    DEBUG_PRINTF(user_context, "    Allocating buffer of %lld bytes, "
                 "extents: %lldx%lldx%lldx%lld strides: %lldx%lldx%lldx%lld (%d bytes per element)\n",
                 (long long)size,
                 (long long)buf->extent[0], (long long)buf->extent[1],
                 (long long)buf->extent[2], (long long)buf->extent[3],
                 (long long)buf->stride[0], (long long)buf->stride[1],
                 (long long)buf->stride[2], (long long)buf->stride[3],
                 buf->elem_size);

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    cl_int err;
    DEBUG_PRINTF( user_context, "    clCreateBuffer -> ", size );
    buf->dev = (uint64_t)clCreateBuffer(ctx.context, CL_MEM_READ_WRITE, size, NULL, &err);
    if (err != CL_SUCCESS || buf->dev == 0) {
        DEBUG_PRINTF( user_context, "%d\n", err);
        halide_error_varargs(user_context, "CL: clCreateBuffer failed (%d)\n", err);
        return err;
    } else {
        DEBUG_PRINTF( user_context, "%p\n", (cl_mem)buf->dev );
    }

    DEBUG_PRINTF(user_context, "    Allocated device buffer %p for buffer %p\n",
                 (void *)buf->dev, buf);

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return CL_SUCCESS;
}
Beispiel #7
0
WEAK int halide_init_kernels(void *user_context, void **state_ptr, const char* ptx_src, int size) {
    DEBUG_PRINTF( user_context, "CUDA: halide_init_kernels (user_context: %p, state_ptr: %p, ptx_src: %p, %i)\n",
                  user_context, state_ptr, ptx_src, size );

    CudaContext ctx(user_context);
    if (ctx.error != 0) {
        return ctx.error;
    }

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    // Create the state object if necessary. This only happens once, regardless
    // of how many times halide_init_kernels/halide_release is called.
    // halide_release traverses this list and releases the module objects, but
    // it does not modify the list nodes created/inserted here.
    module_state **state = (module_state**)state_ptr;
    if (!(*state)) {
        *state = (module_state*)malloc(sizeof(module_state));
        (*state)->module = NULL;
        (*state)->next = state_list;
        state_list = *state;
    }

    // Create the module itself if necessary.
    if (!(*state)->module) {
        DEBUG_PRINTF( user_context, "    cuModuleLoadData %p, %i -> ", ptx_src, size );
        CUmodule module;
        CUresult err = cuModuleLoadData(&(*state)->module, ptx_src);
        if (err != CUDA_SUCCESS) {
            DEBUG_PRINTF( user_context, "%s\n", _get_error_name(err) );
            halide_error_varargs(user_context, "CUDA: cuModuleLoadData failed (%s)",
                                 _get_error_name(err));
            return err;
        } else {
            DEBUG_PRINTF( user_context, "%p\n", module );
        }
    }

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return 0;
}
Beispiel #8
0
WEAK int halide_dev_free(void *user_context, buffer_t* buf) {
    // halide_dev_free, at present, can be exposed to clients and they
    // should be allowed to call halide_dev_free on any buffer_t
    // including ones that have never been used with a GPU.
    if (buf->dev == 0) {
        return 0;
    }

    DEBUG_PRINTF( user_context, "CUDA: halide_dev_free (user_context: %p, buf: %p)\n", user_context, buf );

    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS)
        return ctx.error;

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));

    DEBUG_PRINTF( user_context, "    cuMemFree %p\n", buf->dev );
    CUresult err = cuMemFree(buf->dev);
    // If cuMemFree fails, it isn't likely to succeed later, so just drop
    // the reference.
    buf->dev = 0;
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuMemFree failed (%s)",
                             _get_error_name(err));
        return err;
    }

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return 0;
}
Beispiel #9
0
// Used to generate correct timings when tracing
WEAK int halide_dev_sync(void *user_context) {
    DEBUG_PRINTF( user_context, "CL: halide_dev_sync (user_context: %p)\n", user_context );

    ClContext ctx(user_context);
    halide_assert(user_context, ctx.error == CL_SUCCESS);

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    cl_int err = clFinish(ctx.cmd_queue);
    if (err != CL_SUCCESS) {
        halide_error_varargs(user_context, "CL: clFinish failed (%d)\n", err);
        return err;
    }

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return CL_SUCCESS;
}
Beispiel #10
0
static CUresult create_context(void *user_context, CUcontext *ctx) {
    // Initialize CUDA
    CUresult err = cuInit(0);
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuInit failed (%s)",
                             _get_error_name(err));
        return err;
    }

    // Make sure we have a device
    int deviceCount = 0;
    err = cuDeviceGetCount(&deviceCount);
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuGetDeviceCount failed (%s)",
                             _get_error_name(err));
        return err;
    }
    if (deviceCount <= 0) {
        halide_error(user_context, "CUDA: No devices available");
        return CUDA_ERROR_NO_DEVICE;
    }

    int device = halide_get_gpu_device(user_context);
    if (device == -1) {
        device = deviceCount - 1;
    }

    // Get device
    CUdevice dev;
    CUresult status = cuDeviceGet(&dev, device);
    if (status != CUDA_SUCCESS) {
        halide_error(user_context, "CUDA: Failed to get device\n");
        return status;
    }

    DEBUG_PRINTF( user_context, "    Got device %d\n", dev );

    // Dump device attributes
    #ifdef DEBUG
    {
        char name[256];
        name[0] = 0;
        err = cuDeviceGetName(name, 256, dev);
        DEBUG_PRINTF(user_context, "      %s\n", name);

        if (err != CUDA_SUCCESS) {
            halide_error_varargs(user_context, "CUDA: cuDeviceGetName failed (%s)",
                                 _get_error_name(err));
            return err;
        }

        size_t memory = 0;
        err = cuDeviceTotalMem(&memory, dev);
        DEBUG_PRINTF(user_context, "      total memory: %d MB\n", (int)(memory >> 20));

        if (err != CUDA_SUCCESS) {
            halide_error_varargs(user_context, "CUDA: cuDeviceTotalMem failed (%s)",
                                 _get_error_name(err));
            return err;
        }

        // Declare variables for other state we want to query.
        int max_threads_per_block = 0, warp_size = 0, num_cores = 0;
        int max_block_size[] = {0, 0, 0};
        int max_grid_size[] = {0, 0, 0};
        int max_shared_mem = 0, max_constant_mem = 0;
        int cc_major = 0, cc_minor = 0;

        struct {int *dst; CUdevice_attribute attr;} attrs[] = {
            {&max_threads_per_block, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK},
            {&warp_size,             CU_DEVICE_ATTRIBUTE_WARP_SIZE},
            {&num_cores,             CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT},
            {&max_block_size[0],     CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X},
            {&max_block_size[1],     CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y},
            {&max_block_size[2],     CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z},
            {&max_grid_size[0],      CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X},
            {&max_grid_size[1],      CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y},
            {&max_grid_size[2],      CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z},
            {&max_shared_mem,        CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK},
            {&max_constant_mem,      CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY},
            {&cc_major,              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR},
            {&cc_minor,              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR},
            {NULL,                   CU_DEVICE_ATTRIBUTE_MAX}};

        // Do all the queries.
        for (int i = 0; attrs[i].dst; i++) {
            err = cuDeviceGetAttribute(attrs[i].dst, attrs[i].attr, dev);
            if (err != CUDA_SUCCESS) {
                halide_error_varargs(user_context,
                                     "CUDA: cuDeviceGetAttribute failed (%s) for attribute %d",
                                     _get_error_name(err), (int)attrs[i].attr);
                return err;
            }
        }

        // threads per core is a function of the compute capability
        int threads_per_core = (cc_major == 1 ? 8 :
                                cc_major == 2 ? (cc_minor == 0 ? 32 : 48) :
                                cc_major == 3 ? 192 :
                                cc_major == 5 ? 128 : 0);

        DEBUG_PRINTF(user_context,
                     "      max threads per block: %d\n"
                     "      warp size: %d\n"
                     "      max block size: %d %d %d\n"
                     "      max grid size: %d %d %d\n"
                     "      max shared memory per block: %d\n"
                     "      max constant memory per block: %d\n"
                     "      compute capability %d.%d\n"
                     "      cuda cores: %d x %d = %d\n",
                     max_threads_per_block, warp_size,
                     max_block_size[0], max_block_size[1], max_block_size[2],
                     max_grid_size[0], max_grid_size[1], max_grid_size[2],
                     max_shared_mem, max_constant_mem,
                     cc_major, cc_minor,
                     num_cores, threads_per_core, num_cores * threads_per_core);
    }
    #endif

    // Create context
    DEBUG_PRINTF( user_context, "    cuCtxCreate %d -> ", dev );
    err = cuCtxCreate(ctx, 0, dev);
    if (err != CUDA_SUCCESS) {
        DEBUG_PRINTF( user_context, "%s\n", _get_error_name(err) );
        halide_error_varargs(user_context, "CUDA: cuCtxCreate failed (%s)",
                             _get_error_name(err));
        return err;
    } else {
        unsigned int version = 0;
        cuCtxGetApiVersion(*ctx, &version);
        DEBUG_PRINTF( user_context, "%p (%d)\n", *ctx, version);
    }

    return CUDA_SUCCESS;
}
Beispiel #11
0
WEAK int halide_dev_run(void *user_context,
                        void *state_ptr,
                        const char* entry_name,
                        int blocksX, int blocksY, int blocksZ,
                        int threadsX, int threadsY, int threadsZ,
                        int shared_mem_bytes,
                        size_t arg_sizes[],
                        void* args[]) {
    DEBUG_PRINTF( user_context, "CL: halide_dev_run (user_context: %p, entry: %s, blocks: %dx%dx%d, threads: %dx%dx%d, shmem: %d)\n",
                  user_context, entry_name,
                  blocksX, blocksY, blocksZ,
                  threadsX, threadsY, threadsZ,
                  shared_mem_bytes );

    cl_int err;
    ClContext ctx(user_context);
    if (ctx.error != CL_SUCCESS) {
        return ctx.error;
    }

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    // Create kernel object for entry_name from the program for this module.
    halide_assert(user_context, state_ptr);
    cl_program program = ((module_state*)state_ptr)->program;

    halide_assert(user_context, program);
    DEBUG_PRINTF( user_context, "    clCreateKernel %s -> ", entry_name );
    cl_kernel f = clCreateKernel(program, entry_name, &err);
    if (err != CL_SUCCESS) {
        DEBUG_PRINTF( user_context, "%d\n", err );
        halide_error_varargs(user_context, "CL: clCreateKernel (%s) failed (%d)\n", entry_name, err);
        return err;
    } else {
        #ifdef DEBUG
        uint64_t t_create_kernel = halide_current_time_ns(user_context);
        halide_printf( user_context, "%p (%f ms)\n",
                       f, (t_create_kernel - t_before) / 1.0e6 );
        #endif
    }

    // Pack dims
    size_t global_dim[3] = {blocksX*threadsX,  blocksY*threadsY,  blocksZ*threadsZ};
    size_t local_dim[3] = {threadsX, threadsY, threadsZ};

    // Set args
    int i = 0;
    while (arg_sizes[i] != 0) {
        DEBUG_PRINTF(user_context, "    clSetKernelArg %i %i [0x%x ...]\n", i, arg_sizes[i], *(int *)args[i]);
        cl_int err = clSetKernelArg(f, i, arg_sizes[i], args[i]);
        if (err != CL_SUCCESS) {
            halide_error_varargs(user_context, "CL: clSetKernelArg failed (%d)\n", err);
            return err;
        }
        i++;
    }
    // Set the shared mem buffer last
    // Always set at least 1 byte of shmem, to keep the launch happy
    DEBUG_PRINTF(user_context, "    clSetKernelArg %i %i [NULL]\n", i, shared_mem_bytes);
    err = clSetKernelArg(f, i, (shared_mem_bytes > 0) ? shared_mem_bytes : 1, NULL);
    if (err != CL_SUCCESS) {
        halide_error_varargs(user_context, "CL: clSetKernelArg failed (%d)\n", err);
        return err;
    }

    // Launch kernel
    DEBUG_PRINTF( user_context, "    clEnqueueNDRangeKernel %dx%dx%d, %dx%dx%d -> ",
                  blocksX, blocksY, blocksZ,
                  threadsX, threadsY, threadsZ );
    err = clEnqueueNDRangeKernel(ctx.cmd_queue, f,
                                 // NDRange
                                 3, NULL, global_dim, local_dim,
                                 // Events
                                 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        DEBUG_PRINTF( user_context, "%d\n", err );
        halide_error_varargs(user_context, "CL: clEnqueueNDRangeKernel failed (%d)\n", err);
        return err;
    } else {
        DEBUG_PRINTF ( user_context, "CL_SUCCESS\n" );
    }

    DEBUG_PRINTF( user_context, "    clReleaseKernel %p\n", f );
    clReleaseKernel(f);

    #ifdef DEBUG
    err = clFinish(ctx.cmd_queue);
    if (err != CL_SUCCESS) {
        halide_error_varargs(user_context, "CL: clFinish failed (%d)\n", err);
        return err;
    }
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif
    return 0;
}
Beispiel #12
0
WEAK int halide_copy_to_host(void *user_context, buffer_t* buf) {
    DEBUG_PRINTF(user_context, "CL: halide_copy_to_host (user_context: %p, buf: %p)\n", user_context, buf );

    // Acquire the context so we can use the command queue. This also avoids multiple
    // redundant calls to clEnqueueReadBuffer when multiple threads are trying to copy
    // the same buffer.
    ClContext ctx(user_context);
    if (ctx.error != CL_SUCCESS) {
        return ctx.error;
    }

    if (buf->dev_dirty) {
        #ifdef DEBUG
        uint64_t t_before = halide_current_time_ns(user_context);
        #endif

        halide_assert(user_context, buf->dev && buf->dev);
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));

        _dev_copy c = _make_dev_to_host_copy(buf);

        for (int w = 0; w < c.extent[3]; w++) {
            for (int z = 0; z < c.extent[2]; z++) {
#ifdef ENABLE_OPENCL_11
                // OpenCL 1.1 supports stride-aware memory transfers up to 3D, so we
                // can deal with the 2 innermost strides with OpenCL.
                uint64_t off = z * c.stride_bytes[2] + w * c.stride_bytes[3];

                size_t offset[3] = { off, 0, 0 };
                size_t region[3] = { c.chunk_size, c.extent[0], c.extent[1] };

                DEBUG_PRINTF( user_context, "    clEnqueueReadBufferRect ((%d, %d), (%p -> %p) + %d, %dx%dx%d bytes, %dx%d)\n",
                              z, w,
                              (void *)c.src, c.dst, (int)off,
                              (int)region[0], (int)region[1], (int)region[2],
                              (int)c.stride_bytes[0], (int)c.stride_bytes[1]);

                cl_int err = clEnqueueReadBufferRect(ctx.cmd_queue, (cl_mem)c.src, CL_FALSE,
                                                     offset, offset, region,
                                                     c.stride_bytes[0], c.stride_bytes[1],
                                                     c.stride_bytes[0], c.stride_bytes[1],
                                                     (void *)c.dst,
                                                     0, NULL, NULL);

                if (err != CL_SUCCESS) {
                    halide_error_varargs(user_context, "CL: clEnqueueReadBufferRect failed (%d)\n", err);
                    return err;
                }
#else
                for (int y = 0; y < c.extent[1]; y++) {
                    for (int x = 0; x < c.extent[0]; x++) {
                        uint64_t off = (x * c.stride_bytes[0] +
                                        y * c.stride_bytes[1] +
                                        z * c.stride_bytes[2] +
                                        w * c.stride_bytes[3]);
                        void *src = (void *)(c.src + off);
                        void *dst = (void *)(c.dst + off);
                        uint64_t size = c.chunk_size;

                        DEBUG_PRINTF( user_context, "    clEnqueueReadBuffer ((%d, %d, %d, %d), %lld bytes, %p -> %p)\n",
                                      x, y, z, w,
                                      (long long)size, (void *)src, dst );

                        cl_int err = clEnqueueReadBuffer(ctx.cmd_queue, (cl_mem)c.src,
                                                         CL_FALSE, off, size, dst, 0, NULL, NULL);
                        if (err != CL_SUCCESS) {
                            halide_error_varargs(user_context, "CL: clEnqueueReadBuffer failed (%d)\n", err);
                            return err;
                        }
                    }
                }
#endif
            }
        }
        // The writes above are all non-blocking, so empty the command
        // queue before we proceed so that other host code won't read
        // bad data.
        clFinish(ctx.cmd_queue);

        #ifdef DEBUG
        uint64_t t_after = halide_current_time_ns(user_context);
        halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
        #endif
    }
    buf->dev_dirty = false;
    return 0;
}
Beispiel #13
0
WEAK int halide_init_kernels(void *user_context, void **state_ptr, const char* src, int size) {
    DEBUG_PRINTF( user_context, "CL: halide_init_kernels (user_context: %p, state_ptr: %p, program: %p, %i)\n",
                  user_context, state_ptr, src, size );

    ClContext ctx(user_context);
    if (ctx.error != CL_SUCCESS) {
        return ctx.error;
    }

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    // Create the state object if necessary. This only happens once, regardless
    // of how many times halide_init_kernels/halide_release is called.
    // halide_release traverses this list and releases the program objects, but
    // it does not modify the list nodes created/inserted here.
    module_state **state = (module_state**)state_ptr;
    if (!(*state)) {
        *state = (module_state*)malloc(sizeof(module_state));
        (*state)->program = NULL;
        (*state)->next = state_list;
        state_list = *state;
    }

    // Create the program if necessary. TODO: The program object needs to not
    // only already exist, but be created for the same context/device as the
    // calling context/device.
    if (!(*state && (*state)->program) && size > 1) {
        cl_int err = 0;
        cl_device_id dev;

        err = clGetContextInfo(ctx.context, CL_CONTEXT_DEVICES, sizeof(dev), &dev, NULL);
        if (err != CL_SUCCESS) {
            halide_error_varargs(user_context, "CL: clGetContextInfo(CL_CONTEXT_DEVICES) failed (%d)\n", err);
            return err;
        }

        cl_device_id devices[] = { dev };
        size_t lengths[] = { size };

        // Get the max constant buffer size supported by this OpenCL implementation.
        cl_ulong max_constant_buffer_size = 0;
        err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(max_constant_buffer_size), &max_constant_buffer_size, NULL);
        if (err != CL_SUCCESS) {
            halide_error_varargs(user_context, "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) failed (%d)\n", err);
            return err;
        }
        // Get the max number of constant arguments supported by this OpenCL implementation.
        cl_uint max_constant_args = 0;
        err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_ARGS, sizeof(max_constant_args), &max_constant_args, NULL);
        if (err != CL_SUCCESS) {
            halide_error_varargs(user_context, "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_ARGS) failed (%d)\n", err);
            return err;
        }

        // Build the compile argument options.
        char options[256];
        snprintf(options, sizeof(options),
                 "-D MAX_CONSTANT_BUFFER_SIZE=%lld -D MAX_CONSTANT_ARGS=%i",
                 max_constant_buffer_size,
                 max_constant_args);

        const char * sources[] = { src };
        DEBUG_PRINTF( user_context, "    clCreateProgramWithSource -> " );
        cl_program program = clCreateProgramWithSource(ctx.context, 1, &sources[0], NULL, &err );
        if (err != CL_SUCCESS) {
            DEBUG_PRINTF( user_context, "%d\n", err );
            halide_error_varargs(user_context, "CL: clCreateProgramWithSource failed (%d)\n", err);
            return err;
        } else {
            DEBUG_PRINTF( user_context, "%p\n", program );
        }
        (*state)->program = program;

        DEBUG_PRINTF( user_context, "    clBuildProgram %p %s\n", program, options );
        err = clBuildProgram(program, 1, devices, options, NULL, NULL );
        if (err != CL_SUCCESS) {
            halide_error_varargs(user_context, "CL: clBuildProgram failed (%d)\n", err);

            // Allocate an appropriately sized buffer for the build log.
            size_t len = 0;
            char *buffer = NULL;
            if (clGetProgramBuildInfo(program,
                                      dev,
                                      CL_PROGRAM_BUILD_LOG,
                                      0, NULL,
                                      &len) == CL_SUCCESS) {
                buffer = (char*)malloc((++len)*sizeof(char));
            }

            // Get build log
            if (buffer && clGetProgramBuildInfo(program,
                                                dev,
                                                CL_PROGRAM_BUILD_LOG,
                                                len, buffer,
                                                NULL) == CL_SUCCESS) {
                halide_printf(user_context, "    Build Log:\n %s\n-----\n", buffer);
            } else {
                halide_printf(user_context, "    clGetProgramBuildInfo failed\n");
            }

            if (buffer) {
                free(buffer);
            }

            halide_assert(user_context, err == CL_SUCCESS);
            return err;
        }
    }

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return 0;
}
Beispiel #14
0
// Initializes the context used by the default implementation
// of halide_acquire_context.
static int create_context(void *user_context, cl_context *ctx, cl_command_queue *q) {
    DEBUG_PRINTF( user_context, "    create_context (user_context: %p)\n", user_context );

    halide_assert(user_context, ctx != NULL && *ctx == NULL);
    halide_assert(user_context, q != NULL && *q == NULL);

    cl_int err = 0;

    const cl_uint maxPlatforms = 4;
    cl_platform_id platforms[maxPlatforms];
    cl_uint platformCount = 0;

    err = clGetPlatformIDs( maxPlatforms, platforms, &platformCount );
    if (err != CL_SUCCESS) {
        halide_error_varargs(user_context, "CL: clGetPlatformIDs failed (%d)\n", err);
        return err;
    }

    cl_platform_id platform = NULL;

    // Find the requested platform, or the first if none specified.
    const char * name = getenv("HL_OCL_PLATFORM_NAME");
    if (name != NULL) {
        for (cl_uint i = 0; i < platformCount; ++i) {
            const cl_uint maxPlatformName = 256;
            char platformName[maxPlatformName];
            err = clGetPlatformInfo( platforms[i], CL_PLATFORM_NAME, maxPlatformName, platformName, NULL );
            if (err != CL_SUCCESS) continue;

            // A platform matches the request if it is a substring of the platform name.
            if (strstr(platformName, name)) {
                platform = platforms[i];
                break;
            }
        }
    } else if (platformCount > 0) {
        platform = platforms[0];
    }
    if (platform == NULL){
        halide_error(user_context, "CL: Failed to find platform\n");
        return CL_INVALID_PLATFORM;
    }

    #ifdef DEBUG
    const cl_uint maxPlatformName = 256;
    char platformName[maxPlatformName];
    err = clGetPlatformInfo( platform, CL_PLATFORM_NAME, maxPlatformName, platformName, NULL );
    if (err != CL_SUCCESS) {
        halide_printf(user_context, "    clGetPlatformInfo(CL_PLATFORM_NAME) failed (%d)\n", err);
        // This is just debug info, report the error but don't fail context creation due to it.
        //return err;
    } else {
        halide_printf(user_context, "    Got platform '%s', about to create context (t=%lld)\n",
                      platformName, (long long)halide_current_time_ns(user_context));
    }
    #endif

    // Get the types of devices requested.
    cl_device_type device_type = 0;
    const char * dev_type = getenv("HL_OCL_DEVICE_TYPE");
    if (dev_type != NULL) {
        if (strstr("cpu", dev_type)) {
            device_type |= CL_DEVICE_TYPE_CPU;
        }
        if (strstr("gpu", dev_type)) {
            device_type |= CL_DEVICE_TYPE_GPU;
        }
    }
    // If no device types are specified, use all the available
    // devices.
    if (device_type == 0) {
        device_type = CL_DEVICE_TYPE_ALL;
    }

    // Get all the devices of the specified type.
    const cl_uint maxDevices = 4;
    cl_device_id devices[maxDevices];
    cl_uint deviceCount = 0;
    err = clGetDeviceIDs( platform, device_type, maxDevices, devices, &deviceCount );
    if (err != CL_SUCCESS) {
        halide_error_varargs(user_context, "CL: clGetDeviceIDs failed (%d)\n", err);
        return err;
    }

    // If the user indicated a specific device index to use, use
    // that. Note that this is an index within the set of devices
    // specified by the device type.
    char *device_str = getenv("HL_GPU_DEVICE");
    cl_uint device = deviceCount - 1;
    if (device_str) {
        device = atoi(device_str);
    }

    if (device >= deviceCount) {
        halide_error_varargs(user_context, "CL: Failed to get device %i\n", device);
        return CL_DEVICE_NOT_FOUND;
    }

    cl_device_id dev = devices[device];

    #ifdef DEBUG
    const cl_uint maxDeviceName = 256;
    char deviceName[maxDeviceName];
    err = clGetDeviceInfo( dev, CL_DEVICE_NAME, maxDeviceName, deviceName, NULL );
    if (err != CL_SUCCESS) {
        halide_printf(user_context, "    clGetDeviceInfo(CL_DEVICE_NAME) failed (%d)\n", err);
        // This is just debug info, report the error but don't fail context create if it fails.
        //return err;
    } else {
        halide_printf(user_context, "    Got device '%s'\n", deviceName);
    }
    #endif


    // Create context and command queue.
    cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };
    DEBUG_PRINTF( user_context, "    clCreateContext -> " );
    *ctx = clCreateContext(properties, 1, &dev, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        DEBUG_PRINTF( user_context, "%d", err);
        halide_error_varargs(user_context, "CL: clCreateContext failed (%d)\n", err);
        return err;
    } else {
        DEBUG_PRINTF( user_context, "%p\n", *ctx );
    }

    DEBUG_PRINTF(user_context, "    clCreateCommandQueue ");
    *q = clCreateCommandQueue(*ctx, dev, 0, &err);
    if (err != CL_SUCCESS) {
        DEBUG_PRINTF( user_context, "%d", err );
        halide_error_varargs(user_context, "CL: clCreateCommandQueue failed (%d)\n", err);
        return err;
    } else {
        DEBUG_PRINTF( user_context, "%p\n", *q );
    }

    return err;
}