// Give `buf` a CUDA device allocation if it does not already have one.
//
// When buf->dev is non-zero the function returns immediately. Otherwise it
// allocates __buf_size(user_context, buf) bytes with cuMemAlloc and stores
// the resulting device pointer in buf->dev.
//
// user_context: opaque pointer forwarded to the runtime helpers.
// buf:          buffer descriptor; buf->dev is written on allocation.
WEAK void halide_dev_malloc(void *user_context, buffer_t* buf) {
    if (buf->dev) {
        // This buffer already has a device allocation
        return;
    }

    size_t size = __buf_size(user_context, buf);

    #ifdef DEBUG
    // Every integer is widened to long long so that the conversion
    // specifiers are correct whatever width the buffer_t fields have.
    halide_printf(user_context, "dev_malloc allocating buffer of %lld bytes, "
                  "extents: %lldx%lldx%lldx%lld strides: %lldx%lldx%lldx%lld (%d bytes per element)\n",
                  (long long)size,
                  (long long)buf->extent[0], (long long)buf->extent[1],
                  (long long)buf->extent[2], (long long)buf->extent[3],
                  (long long)buf->stride[0], (long long)buf->stride[1],
                  (long long)buf->stride[2], (long long)buf->stride[3],
                  buf->elem_size);
    #endif

    // Start from zero so the assert below fires deterministically if
    // cuMemAlloc leaves `p` untouched.
    CUdeviceptr p = 0;
    TIME_CALL( cuMemAlloc(&p, size), "dev_malloc");
    buf->dev = (uint64_t)p;
    halide_assert(user_context, buf->dev);

    #ifdef DEBUG
    halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));
    #endif
}
// Release the OpenCL memory object attached to `buf`, if any.
//
// Returns 0 when there was nothing to free or the release succeeded,
// ctx.error when the ClContext reports a failure, or the cl_int returned by
// clReleaseMemObject when the release fails. buf->dev is zeroed whenever a
// release was attempted.
WEAK int halide_dev_free(void *user_context, buffer_t* buf) {
    DEBUG_PRINTF( user_context, "CL: halide_dev_free (user_context: %p, buf: %p)\n", user_context, buf );

    // halide_dev_free, at present, can be exposed to clients and they
    // should be allowed to call halide_dev_free on any buffer_t
    // including ones that have never been used with a GPU.
    // Checked before touching the context so such buffers never require one.
    if (buf->dev == 0) {
        return 0;
    }

    ClContext ctx(user_context);
    if (ctx.error != CL_SUCCESS) {
        // Do not call into OpenCL with a context that failed to set up.
        return ctx.error;
    }

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));
    DEBUG_PRINTF(user_context, " clReleaseMemObject %p\n", (cl_mem)buf->dev );
    cl_int result = clReleaseMemObject((cl_mem)buf->dev);
    // If clReleaseMemObject fails, it is unlikely to succeed in a later call, so
    // we just end our reference to it regardless.
    buf->dev = 0;
    if (result != CL_SUCCESS) {
        halide_error_varargs(user_context, "CL: clReleaseMemObject failed (%d)", result);
        return result;
    }

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return 0;
}
// Copy a buffer's contents from its CUDA device allocation to host memory
// when the device copy is marked dirty.
//
// Returns 0 on success (including when nothing needed copying), ctx.error
// when the CudaContext reports a failure, or the CUresult of the first
// cuMemcpyDtoH that fails. buf->dev_dirty is cleared only on the success
// path.
WEAK int halide_copy_to_host(void *user_context, buffer_t* buf) {
    if (!buf->dev_dirty) {
        return 0;
    }

    DEBUG_PRINTF( user_context, "CUDA: halide_copy_to_host (user_context: %p, buf: %p)\n", user_context, buf );

    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    // Need to check dev_dirty again, in case another thread did the
    // copy_to_host before the serialization point above.
    if (buf->dev_dirty) {
        #ifdef DEBUG
        uint64_t t_before = halide_current_time_ns(user_context);
        #endif

        // Both ends of the transfer must exist: the host destination and
        // the device source.
        halide_assert(user_context, buf->host && buf->dev);
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));

        // The copy is issued as one cuMemcpyDtoH of c.chunk_size bytes per
        // (x, y, z, w) index, each at the offset given by c.stride_bytes.
        _dev_copy c = _make_dev_to_host_copy(buf);

        for (int w = 0; w < c.extent[3]; w++) {
            for (int z = 0; z < c.extent[2]; z++) {
                for (int y = 0; y < c.extent[1]; y++) {
                    for (int x = 0; x < c.extent[0]; x++) {
                        uint64_t off = (x * c.stride_bytes[0] +
                                        y * c.stride_bytes[1] +
                                        z * c.stride_bytes[2] +
                                        w * c.stride_bytes[3]);
                        CUdeviceptr src = (CUdeviceptr)(c.src + off);
                        void *dst = (void *)(c.dst + off);
                        uint64_t size = c.chunk_size;
                        DEBUG_PRINTF( user_context, " cuMemcpyDtoH (%d, %d, %d, %d), %p -> %p, %lld bytes\n",
                                      x, y, z, w,
                                      (void *)src, dst, (long long)size );
                        CUresult err = cuMemcpyDtoH(dst, src, size);
                        if (err != CUDA_SUCCESS) {
                            halide_error_varargs(user_context, "CUDA: cuMemcpyDtoH failed (%s)",
                                                 _get_error_name(err));
                            return err;
                        }
                    }
                }
            }
        }

        #ifdef DEBUG
        uint64_t t_after = halide_current_time_ns(user_context);
        halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6);
        #endif
    }
    buf->dev_dirty = false;
    return 0;
}
// Free the CUDA device allocation attached to `buf` and clear buf->dev.
//
// A buffer whose dev field is zero has nothing to free and is left alone.
WEAK void halide_dev_free(buffer_t* buf) {
    #ifndef NDEBUG
    // %p requires a void pointer, and the handle is widened to unsigned
    // long long so the specifier matches whatever integer width dev has.
    fprintf(stderr, "In dev_free of %p - dev: 0x%llx\n",
            (void *)buf, (unsigned long long)buf->dev);
    #endif

    // Nothing was ever allocated on the device for this buffer, so there is
    // nothing to validate or hand to cuMemFree.
    if (buf->dev == 0) {
        return;
    }

    assert(halide_validate_dev_pointer(buf));
    CHECK_CALL( cuMemFree(buf->dev), "cuMemFree" );
    buf->dev = 0;
}
// Free the CUDA device allocation attached to `buf` and clear buf->dev.
//
// A buffer whose dev field is zero has no device allocation; it is left
// untouched instead of being passed to cuMemFree.
WEAK void halide_dev_free(buffer_t* buf) {
    // Callers may hand over buffers that never received a device
    // allocation; there is nothing to release for those.
    if (buf->dev == 0) {
        return;
    }

    #ifdef DEBUG
    halide_printf("In dev_free of %p - dev: 0x%p\n", buf, (void*)buf->dev);
    halide_assert(halide_validate_dev_pointer(buf));
    #endif

    CHECK_CALL( cuMemFree(buf->dev), "cuMemFree" );
    buf->dev = 0;
}
// Ensure `buf` has a CUDA device allocation of _buf_size(user_context, buf)
// bytes.
//
// Returns 0 when the buffer already had an allocation or a new one was
// created, ctx.error when the CudaContext reports a failure, or the
// CUresult from cuMemAlloc when allocation fails. On success the device
// pointer is stored in buf->dev.
WEAK int halide_dev_malloc(void *user_context, buffer_t *buf) {
    DEBUG_PRINTF( user_context, "CUDA: halide_dev_malloc (user_context: %p, buf: %p)\n", user_context, buf );

    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    size_t size = _buf_size(user_context, buf);
    if (buf->dev) {
        // This buffer already has a device allocation
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf, size));
        return 0;
    }

    halide_assert(user_context, buf->stride[0] >= 0 && buf->stride[1] >= 0 &&
                                buf->stride[2] >= 0 && buf->stride[3] >= 0);

    DEBUG_PRINTF(user_context, " allocating buffer of %lld bytes, "
                 "extents: %lldx%lldx%lldx%lld strides: %lldx%lldx%lldx%lld (%d bytes per element)\n",
                 (long long)size,
                 (long long)buf->extent[0], (long long)buf->extent[1],
                 (long long)buf->extent[2], (long long)buf->extent[3],
                 (long long)buf->stride[0], (long long)buf->stride[1],
                 (long long)buf->stride[2], (long long)buf->stride[3],
                 buf->elem_size);

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    CUdeviceptr p;
    // size_t is not guaranteed to be long long; cast to match %lld.
    DEBUG_PRINTF( user_context, " cuMemAlloc %lld -> ", (long long)size );
    CUresult err = cuMemAlloc(&p, size);
    if (err != CUDA_SUCCESS) {
        DEBUG_PRINTF( user_context, "%s\n", _get_error_name(err));
        halide_error_varargs(user_context, "CUDA: cuMemAlloc failed (%s)",
                             _get_error_name(err));
        return err;
    } else {
        // CUdeviceptr is an integer handle; %p needs a real pointer argument.
        DEBUG_PRINTF( user_context, "%p\n", (void *)p );
    }
    halide_assert(user_context, p);
    buf->dev = (uint64_t)p;

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return 0;
}
// Upload a buffer's host contents to its OpenCL device object when the host
// copy is marked dirty, then clear the host_dirty flag.
//
// The transfer is a single blocking clEnqueueWriteBuffer of
// __buf_size(user_context, buf) bytes on the queue *cl_q.
WEAK void halide_copy_to_dev(void *user_context, buffer_t* buf) {
    if (buf->host_dirty) {
        // Source and destination must both exist before copying.
        halide_assert(user_context, buf->host && buf->dev);

        size_t size = __buf_size(user_context, buf);

        #ifdef DEBUG
        halide_printf(user_context, "copy_to_dev (%lld bytes) %p -> %p\n",
                      (long long)size, buf->host, (void*)buf->dev);
        #endif

        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));

        // The device handle is stored as an integer in buf->dev.
        cl_mem device_mem = (cl_mem)((void*)buf->dev);
        int err = clEnqueueWriteBuffer(*cl_q,
                                       device_mem,
                                       CL_TRUE,   // blocking write
                                       0,         // offset into device buffer
                                       size,
                                       buf->host,
                                       0, NULL, NULL);
        CHECK_ERR( err, "clEnqueueWriteBuffer" );
    }

    buf->host_dirty = false;
}
WEAK int halide_dev_malloc(void *user_context, buffer_t* buf) { DEBUG_PRINTF( user_context, "CL: halide_dev_malloc (user_context: %p, buf: %p)\n", user_context, buf ); ClContext ctx(user_context); if (ctx.error != CL_SUCCESS) { return ctx.error; } size_t size = _buf_size(user_context, buf); if (buf->dev) { halide_assert(user_context, halide_validate_dev_pointer(user_context, buf, size)); return 0; } halide_assert(user_context, buf->stride[0] >= 0 && buf->stride[1] >= 0 && buf->stride[2] >= 0 && buf->stride[3] >= 0); DEBUG_PRINTF(user_context, " Allocating buffer of %lld bytes, " "extents: %lldx%lldx%lldx%lld strides: %lldx%lldx%lldx%lld (%d bytes per element)\n", (long long)size, (long long)buf->extent[0], (long long)buf->extent[1], (long long)buf->extent[2], (long long)buf->extent[3], (long long)buf->stride[0], (long long)buf->stride[1], (long long)buf->stride[2], (long long)buf->stride[3], buf->elem_size); #ifdef DEBUG uint64_t t_before = halide_current_time_ns(user_context); #endif cl_int err; DEBUG_PRINTF( user_context, " clCreateBuffer -> ", size ); buf->dev = (uint64_t)clCreateBuffer(ctx.context, CL_MEM_READ_WRITE, size, NULL, &err); if (err != CL_SUCCESS || buf->dev == 0) { DEBUG_PRINTF( user_context, "%d\n", err); halide_error_varargs(user_context, "CL: clCreateBuffer failed (%d)\n", err); return err; } else { DEBUG_PRINTF( user_context, "%p\n", (cl_mem)buf->dev ); } DEBUG_PRINTF(user_context, " Allocated device buffer %p for buffer %p\n", (void *)buf->dev, buf); #ifdef DEBUG uint64_t t_after = halide_current_time_ns(user_context); halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6); #endif return CL_SUCCESS; }
// Bring a buffer's contents back from its CUDA device allocation when the
// device copy is marked dirty, then clear the dev_dirty flag.
//
// The whole buffer (buf_size(buf) bytes) is transferred with a single
// cuMemcpyDtoH wrapped in TIME_CALL.
WEAK void halide_copy_to_host(buffer_t* buf) {
    if (buf->dev_dirty) {
        // Source (device) and destination (host) must both be present.
        halide_assert(buf->dev);
        halide_assert(buf->host);

        size_t size = buf_size(buf);

        #ifdef DEBUG
        // NOTE(review): msg only exists in DEBUG builds; presumably
        // TIME_CALL discards its second argument otherwise -- confirm.
        char msg[256];
        snprintf(msg, sizeof(msg), "copy_to_host (%zu bytes) %p -> %p",
                 size, (void*)buf->dev, buf->host );
        halide_assert(halide_validate_dev_pointer(buf));
        #endif

        TIME_CALL( cuMemcpyDtoH(buf->host, buf->dev, size), msg );
    }

    buf->dev_dirty = false;
}
// Push a buffer's host contents to its CUDA device allocation when the host
// copy is marked dirty, then clear the host_dirty flag.
//
// The whole buffer (buf_size(buf) bytes) is transferred with a single
// cuMemcpyHtoD wrapped in TIME_CALL.
WEAK void halide_copy_to_dev(buffer_t* buf) {
    if (buf->host_dirty) {
        // Both the host source and the device destination must exist.
        halide_assert(buf->host && buf->dev);

        size_t size = buf_size(buf);

        #ifdef DEBUG
        // NOTE(review): msg only exists in DEBUG builds; presumably
        // TIME_CALL discards its second argument otherwise -- confirm.
        char msg[256];
        snprintf(msg, sizeof(msg), "copy_to_dev (%zu bytes) %p -> %p (t=%lld)",
                 size, buf->host, (void*)buf->dev,
                 (long long)halide_current_time_ns() );
        halide_assert(halide_validate_dev_pointer(buf));
        #endif

        TIME_CALL( cuMemcpyHtoD(buf->dev, buf->host, size), msg );
    }

    buf->host_dirty = false;
}
// Release the OpenCL memory object attached to `buf` and clear buf->dev.
//
// Buffers whose dev field is zero are left untouched: this entry point may
// be exposed to clients, who are allowed to call it on any buffer_t,
// including ones that have never been used with a GPU.
WEAK void halide_dev_free(buffer_t* buf) {
    if (buf->dev != 0) {
        #ifdef DEBUG
        halide_printf("In dev_free of %p - dev: 0x%p\n", buf, (void*)buf->dev);
        #endif

        halide_assert(halide_validate_dev_pointer(buf));
        CHECK_CALL( clReleaseMemObject((cl_mem)buf->dev), "clReleaseMemObject" );

        // Drop our reference to the released object.
        buf->dev = 0;
    }
}
// Read a buffer's contents back from its OpenCL device object when the
// device copy is marked dirty, then clear the dev_dirty flag.
//
// The queue *cl_q is drained with clFinish first, after which the whole
// buffer (__buf_size(user_context, buf) bytes) is fetched with one blocking
// clEnqueueReadBuffer.
WEAK void halide_copy_to_host(void *user_context, buffer_t* buf) {
    if (buf->dev_dirty) {
        // block on completion before read back
        clFinish(*cl_q);

        halide_assert(user_context, buf->host && buf->dev);

        size_t size = __buf_size(user_context, buf);

        #ifdef DEBUG
        halide_printf(user_context, "copy_to_host buf %p (%lld bytes) %p -> %p\n",
                      buf, (long long)size, (void*)buf->dev, buf->host );
        #endif

        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf, size));

        // The device handle is stored as an integer in buf->dev.
        cl_mem device_mem = (cl_mem)((void*)buf->dev);
        int err = clEnqueueReadBuffer(*cl_q,
                                      device_mem,
                                      CL_TRUE,   // blocking read
                                      0,         // offset into device buffer
                                      size,
                                      buf->host,
                                      0, NULL, NULL);
        CHECK_ERR( err, "clEnqueueReadBuffer" );
    }

    buf->dev_dirty = false;
}
// Free the CUDA device allocation attached to `buf` and clear buf->dev.
//
// This entry point may be exposed to clients, who are allowed to call it on
// any buffer_t, including ones that have never been used with a GPU; such
// buffers (dev == 0) are left untouched.
WEAK void halide_dev_free(void *user_context, buffer_t* buf) {
    if (buf->dev == 0) {
        // No device allocation to release.
        return;
    }

    #ifdef DEBUG
    halide_printf(user_context, "In dev_free of %p - dev: 0x%p\n", buf, (void*)buf->dev);
    halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));
    #endif

    CHECK_CALL( cuMemFree(buf->dev), "cuMemFree" );

    // Forget the freed pointer so a second call is a no-op.
    buf->dev = 0;
}
// Upload a buffer's host contents to its OpenCL device object when the host
// copy is marked dirty, then clear the host_dirty flag.
//
// The transfer is a single blocking clEnqueueWriteBuffer of __buf_size(buf)
// bytes on the queue cl_q, bracketed by TIME_START / TIME_CHECK.
WEAK void halide_copy_to_dev(buffer_t* buf) {
    if (buf->host_dirty) {
        halide_assert(buf->host && buf->dev);
        size_t size = __buf_size(buf);

        // msg is passed to CHECK_ERR and TIME_CHECK unconditionally, so it
        // must be declared in every configuration, not only under DEBUG.
        #ifdef DEBUG
        char msg[256];
        snprintf(msg, 256, "copy_to_dev (%lld bytes) %p -> %p (t=%lld)",
                 (long long)size, buf->host, (void*)buf->dev,
                 (long long)halide_current_time_ns() );
        #else
        char msg[1] = { 0 };
        #endif

        halide_assert(halide_validate_dev_pointer(buf));
        TIME_START();
        int err = clEnqueueWriteBuffer( cl_q, (cl_mem)((void*)buf->dev),
                                        CL_TRUE, 0, size, buf->host, 0, NULL, NULL );
        CHECK_ERR( err, msg );
        TIME_CHECK(msg);
    }
    buf->host_dirty = false;
}
// Give `buf` a device allocation if it does not already have one.
//
// When buf->dev is non-zero the existing pointer is validated and the
// function returns. Otherwise __buf_size(buf) bytes are obtained from
// __dev_malloc and the result is stored in buf->dev.
WEAK void halide_dev_malloc(buffer_t* buf) {
    if (buf->dev) {
        halide_assert(halide_validate_dev_pointer(buf));
        return;
    }

    size_t size = __buf_size(buf);

    #ifdef DEBUG
    // Every integer is widened to long long so that the conversion
    // specifiers are correct whatever width the buffer_t fields have.
    halide_printf("dev_malloc allocating buffer of %lld bytes, extents: %lldx%lldx%lldx%lld strides: %lldx%lldx%lldx%lld (%d bytes per element)\n",
                  (long long)size,
                  (long long)buf->extent[0], (long long)buf->extent[1],
                  (long long)buf->extent[2], (long long)buf->extent[3],
                  (long long)buf->stride[0], (long long)buf->stride[1],
                  (long long)buf->stride[2], (long long)buf->stride[3],
                  buf->elem_size);
    #endif

    buf->dev = (uint64_t)__dev_malloc(size);
    halide_assert(buf->dev);
}
// Give `buf` a CUDA device allocation if it does not already have one.
//
// When buf->dev is zero, buf_size(buf) bytes are allocated with cuMemAlloc
// and the resulting device pointer is stored in buf->dev. A buffer that
// already has a device allocation is left untouched.
WEAK void halide_dev_malloc(buffer_t* buf) {
    if (!buf->dev) {
        #ifndef NDEBUG
        // NOTE(review): the %zd conversions assume the extent and elem_size
        // fields are size_t-sized -- confirm against buffer_t.
        fprintf(stderr, "dev_malloc of %zdx%zdx%zdx%zd (%zd bytes per element) (buf->dev = %p) buffer\n",
                buf->extent[0], buf->extent[1], buf->extent[2], buf->extent[3],
                buf->elem_size, (void*)buf->dev);
        #endif

        CUdeviceptr p;
        TIME_CALL( cuMemAlloc(&p, buf_size(buf)), "dev_malloc");
        buf->dev = (uint64_t)p;
        assert(buf->dev);

        #ifndef NDEBUG
        assert(halide_validate_dev_pointer(buf));
        #endif
    }
    // else: this buffer already has a device allocation
}
// Read a buffer's contents back from its OpenCL device object when the
// device copy is marked dirty, then clear the dev_dirty flag.
//
// The queue cl_q is drained with clFinish first, after which the whole
// buffer (__buf_size(buf) bytes) is fetched with one blocking
// clEnqueueReadBuffer, bracketed by TIME_START / TIME_CHECK.
WEAK void halide_copy_to_host(buffer_t* buf) {
    if (buf->dev_dirty) {
        // block on completion before read back
        clFinish(cl_q);

        halide_assert(buf->host && buf->dev);
        size_t size = __buf_size(buf);

        // msg is consumed by CHECK_ERR / TIME_CHECK in every build; only
        // DEBUG builds fill it with a description of the transfer.
        #ifdef DEBUG
        char msg[256];
        snprintf(msg, sizeof(msg), "copy_to_host (%lld bytes) %p -> %p",
                 (long long)size, (void*)buf->dev, buf->host );
        #else
        char msg[1] = { 0 };
        #endif

        halide_assert(halide_validate_dev_pointer(buf, size));
        TIME_START();

        #ifdef DEBUG
        halide_printf("%s\n", msg);
        #endif

        int err = clEnqueueReadBuffer(cl_q,
                                      (cl_mem)((void*)buf->dev),
                                      CL_TRUE,   // blocking read
                                      0,         // offset into device buffer
                                      size,
                                      buf->host,
                                      0, NULL, NULL);
        CHECK_ERR( err, msg );
        TIME_CHECK(msg);
    }

    buf->dev_dirty = false;
}
// Free the CUDA device allocation attached to `buf`, if any.
//
// Returns 0 when there was nothing to free or cuMemFree succeeded,
// ctx.error when the CudaContext reports a failure, or the CUresult from
// cuMemFree when the free fails. buf->dev is zeroed whenever a free was
// attempted.
WEAK int halide_dev_free(void *user_context, buffer_t* buf) {
    // halide_dev_free, at present, can be exposed to clients and they
    // should be allowed to call halide_dev_free on any buffer_t
    // including ones that have never been used with a GPU.
    if (buf->dev == 0) {
        return 0;
    }

    DEBUG_PRINTF( user_context, "CUDA: halide_dev_free (user_context: %p, buf: %p)\n", user_context, buf );

    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));

    // buf->dev is an integer handle; %p needs a real pointer argument.
    DEBUG_PRINTF( user_context, " cuMemFree %p\n", (void *)buf->dev );
    CUresult err = cuMemFree(buf->dev);
    // If cuMemFree fails, it isn't likely to succeed later, so just drop
    // the reference.
    buf->dev = 0;
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuMemFree failed (%s)",
                             _get_error_name(err));
        return err;
    }

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return 0;
}
// Give `buf` a device allocation if it does not already have one.
//
// A buffer with a non-zero dev field only has that pointer validated.
// Otherwise __buf_size(user_context, buf) bytes are obtained from
// __dev_malloc and the result is stored in buf->dev.
WEAK void halide_dev_malloc(void *user_context, buffer_t* buf) {
    if (!buf->dev) {
        size_t size = __buf_size(user_context, buf);

        #ifdef DEBUG
        halide_printf(user_context, "dev_malloc allocating buffer of %lld bytes, "
                      "extents: %lldx%lldx%lldx%lld strides: %lldx%lldx%lldx%lld (%d bytes per element)\n",
                      (long long)size,
                      (long long)buf->extent[0], (long long)buf->extent[1],
                      (long long)buf->extent[2], (long long)buf->extent[3],
                      (long long)buf->stride[0], (long long)buf->stride[1],
                      (long long)buf->stride[2], (long long)buf->stride[3],
                      buf->elem_size);
        #endif

        buf->dev = (uint64_t)__dev_malloc(user_context, size);

        #ifdef DEBUG
        halide_printf(user_context, "dev_malloc allocated buffer %p of with buf->dev of %p\n",
                      buf, (void *)buf->dev);
        #endif

        // A zero handle means the allocation did not produce a usable buffer.
        halide_assert(user_context, buf->dev);
    } else {
        // Already allocated: just sanity-check the existing handle.
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));
    }
}
// Copy a buffer's contents from its OpenCL device object to host memory
// when the device copy is marked dirty.
//
// Returns 0 on success (including when nothing needed copying), ctx.error
// when the ClContext reports a failure, or the cl_int of the first OpenCL
// call that fails. buf->dev_dirty is cleared only on the success path.
WEAK int halide_copy_to_host(void *user_context, buffer_t* buf) {
    DEBUG_PRINTF(user_context, "CL: halide_copy_to_host (user_context: %p, buf: %p)\n", user_context, buf );

    // Acquire the context so we can use the command queue. This also avoids multiple
    // redundant calls to clEnqueueReadBuffer when multiple threads are trying to copy
    // the same buffer.
    ClContext ctx(user_context);
    if (ctx.error != CL_SUCCESS) {
        return ctx.error;
    }

    if (buf->dev_dirty) {
        #ifdef DEBUG
        uint64_t t_before = halide_current_time_ns(user_context);
        #endif

        // Both ends of the transfer must exist: the host destination and
        // the device source.
        halide_assert(user_context, buf->host && buf->dev);
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));

        _dev_copy c = _make_dev_to_host_copy(buf);

        for (int w = 0; w < c.extent[3]; w++) {
            for (int z = 0; z < c.extent[2]; z++) {
#ifdef ENABLE_OPENCL_11
                // OpenCL 1.1 supports stride-aware memory transfers up to 3D, so we
                // can deal with the 2 innermost strides with OpenCL.
                uint64_t off = z * c.stride_bytes[2] + w * c.stride_bytes[3];

                // Explicit casts: brace-initialising size_t from a wider
                // integer is a narrowing conversion.
                size_t offset[3] = { (size_t)off, 0, 0 };
                size_t region[3] = { (size_t)c.chunk_size, (size_t)c.extent[0], (size_t)c.extent[1] };

                DEBUG_PRINTF( user_context, " clEnqueueReadBufferRect ((%d, %d), (%p -> %p) + %d, %dx%dx%d bytes, %dx%d)\n",
                              z, w,
                              (void *)c.src, c.dst, (int)off,
                              (int)region[0], (int)region[1], (int)region[2],
                              (int)c.stride_bytes[0], (int)c.stride_bytes[1]);

                cl_int err = clEnqueueReadBufferRect(ctx.cmd_queue, (cl_mem)c.src, CL_FALSE,
                                                     offset, offset, region,
                                                     c.stride_bytes[0], c.stride_bytes[1],
                                                     c.stride_bytes[0], c.stride_bytes[1],
                                                     (void *)c.dst,
                                                     0, NULL, NULL);

                if (err != CL_SUCCESS) {
                    halide_error_varargs(user_context, "CL: clEnqueueReadBufferRect failed (%d)\n", err);
                    return err;
                }
#else
                for (int y = 0; y < c.extent[1]; y++) {
                    for (int x = 0; x < c.extent[0]; x++) {
                        uint64_t off = (x * c.stride_bytes[0] +
                                        y * c.stride_bytes[1] +
                                        z * c.stride_bytes[2] +
                                        w * c.stride_bytes[3]);
                        void *src = (void *)(c.src + off);
                        void *dst = (void *)(c.dst + off);
                        uint64_t size = c.chunk_size;

                        DEBUG_PRINTF( user_context, " clEnqueueReadBuffer ((%d, %d, %d, %d), %lld bytes, %p -> %p)\n",
                                      x, y, z, w,
                                      (long long)size, (void *)src, dst );

                        cl_int err = clEnqueueReadBuffer(ctx.cmd_queue, (cl_mem)c.src,
                                                         CL_FALSE, off, size, dst, 0, NULL, NULL);
                        if (err != CL_SUCCESS) {
                            halide_error_varargs(user_context, "CL: clEnqueueReadBuffer failed (%d)\n", err);
                            return err;
                        }
                    }
                }
#endif
            }
        }

        // The reads above are all non-blocking, so empty the command
        // queue before we proceed so that other host code won't read
        // bad data. A failure here means the host data cannot be trusted,
        // so report it instead of clearing dev_dirty.
        cl_int finish_err = clFinish(ctx.cmd_queue);
        if (finish_err != CL_SUCCESS) {
            halide_error_varargs(user_context, "CL: clFinish failed (%d)\n", finish_err);
            return finish_err;
        }

        #ifdef DEBUG
        uint64_t t_after = halide_current_time_ns(user_context);
        halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6);
        #endif
    }
    buf->dev_dirty = false;
    return 0;
}