WEAK int halide_copy_to_dev(void *user_context, buffer_t* buf) { int err = halide_dev_malloc(user_context, buf); if (err) { return err; } DEBUG_PRINTF( user_context, "CUDA: halide_copy_to_dev (user_context: %p, buf: %p)\n", user_context, buf ); CudaContext ctx(user_context); if (ctx.error != CUDA_SUCCESS) { return ctx.error; } if (buf->host_dirty) { #ifdef DEBUG uint64_t t_before = halide_current_time_ns(user_context); #endif halide_assert(user_context, buf->host && buf->dev); halide_assert(user_context, halide_validate_dev_pointer(user_context, buf)); _dev_copy c = _make_host_to_dev_copy(buf); for (int w = 0; w < c.extent[3]; w++) { for (int z = 0; z < c.extent[2]; z++) { for (int y = 0; y < c.extent[1]; y++) { for (int x = 0; x < c.extent[0]; x++) { uint64_t off = (x * c.stride_bytes[0] + y * c.stride_bytes[1] + z * c.stride_bytes[2] + w * c.stride_bytes[3]); void *src = (void *)(c.src + off); CUdeviceptr dst = (CUdeviceptr)(c.dst + off); uint64_t size = c.chunk_size; DEBUG_PRINTF( user_context, " cuMemcpyHtoD (%d, %d, %d, %d), %p -> %p, %lld bytes\n", x, y, z, w, src, (void *)dst, (long long)size ); CUresult err = cuMemcpyHtoD(dst, src, size); if (err != CUDA_SUCCESS) { halide_error_varargs(user_context, "CUDA: cuMemcpyHtoD failed (%s)", _get_error_name(err)); return err; } } } } } #ifdef DEBUG uint64_t t_after = halide_current_time_ns(user_context); halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6); #endif } buf->host_dirty = false; return 0; }
int main(int argc, char* argv[]) { halide_init_kernels(src, 0); const int N = 2048; buffer_t in, out; in.dev = 0; in.host = (uint8_t*)malloc(N*sizeof(float)); in.elem_size = sizeof(float); in.extent[0] = N; in.extent[1] = 1; in.extent[2] = 1; in.extent[3] = 1; out.dev = 0; out.host = (uint8_t*)malloc(N*sizeof(float)); out.elem_size = sizeof(float); out.extent[0] = N; out.extent[1] = 1; out.extent[2] = 1; out.extent[3] = 1; for (int i = 0; i < N; i++) { ((float*)in.host)[i] = i / 2.0; } in.host_dirty = true; halide_dev_malloc(&in); halide_dev_malloc(&out); halide_copy_to_dev(&in); f( &in, &out, N ); out.dev_dirty = true; halide_copy_to_host(&out); for (int i = 0; i < N; i++) { float a = ((float*)in.host)[i]; float b = ((float*)out.host)[i]; if (b != a*a) { printf("[%d] %f != %f^2\n", i, b, a); } } }
// Copy the host-side contents of `buf` to its OpenCL device allocation.
//
// halide_dev_malloc is called on the buffer first (presumably so that a
// device allocation exists -- confirm against its definition). If
// buf->host_dirty is set, the transfer plan returned by
// _make_host_to_dev_copy is enqueued as non-blocking writes on the
// context's command queue, and the queue is drained with clFinish before
// returning. buf->host_dirty is cleared only on the success path.
//
// Returns 0 on success. Otherwise returns the nonzero result of
// halide_dev_malloc, the error recorded in the ClContext, the error from
// the first enqueue call that fails, or the error from clFinish.
WEAK int halide_copy_to_dev(void *user_context, buffer_t* buf) {
    int err = halide_dev_malloc(user_context, buf);
    if (err) {
        return err;
    }

    DEBUG_PRINTF(user_context, "CL: halide_copy_to_dev (user_context: %p, buf: %p)\n", user_context, buf );

    // Acquire the context so we can use the command queue. This also avoids multiple
    // redundant calls to clEnqueueWriteBuffer when multiple threads are trying to copy
    // the same buffer.
    ClContext ctx(user_context);
    if (ctx.error != CL_SUCCESS) {
        return ctx.error;
    }

    if (buf->host_dirty) {
#ifdef DEBUG
        uint64_t t_before = halide_current_time_ns(user_context);
#endif

        halide_assert(user_context, buf->host && buf->dev);
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));

        _dev_copy c = _make_host_to_dev_copy(buf);

        for (int w = 0; w < c.extent[3]; w++) {
            for (int z = 0; z < c.extent[2]; z++) {
#ifdef ENABLE_OPENCL_11
                // OpenCL 1.1 supports stride-aware memory transfers up to 3D, so we
                // can deal with the 2 innermost strides with OpenCL.
                uint64_t off = z * c.stride_bytes[2] + w * c.stride_bytes[3];

                // Explicit casts: brace-initializing size_t from a wider
                // integer is a narrowing error in C++11 on 32-bit targets.
                // The same origin is used for the buffer and the host side.
                size_t offset[3] = { (size_t)off, 0, 0 };
                size_t region[3] = { (size_t)c.chunk_size, (size_t)c.extent[0], (size_t)c.extent[1] };

                DEBUG_PRINTF( user_context, " clEnqueueWriteBufferRect ((%d, %d), (%p -> %p) + %d, %dx%dx%d bytes, %dx%d)\n",
                              z, w, (void *)c.src, (void *)c.dst, (int)off,
                              (int)region[0], (int)region[1], (int)region[2],
                              (int)c.stride_bytes[0], (int)c.stride_bytes[1]);

                cl_int err = clEnqueueWriteBufferRect(ctx.cmd_queue, (cl_mem)c.dst, CL_FALSE,
                                                      offset, offset, region,
                                                      c.stride_bytes[0], c.stride_bytes[1],
                                                      c.stride_bytes[0], c.stride_bytes[1],
                                                      (void *)c.src,
                                                      0, NULL, NULL);
                if (err != CL_SUCCESS) {
                    halide_error_varargs(user_context, "CL: clEnqueueWriteBufferRect failed (%d)\n", err);
                    // Writes enqueued by earlier iterations are non-blocking
                    // and may still be reading host memory; drain them
                    // before handing control back to the caller.
                    clFinish(ctx.cmd_queue);
                    return err;
                }
#else
                for (int y = 0; y < c.extent[1]; y++) {
                    for (int x = 0; x < c.extent[0]; x++) {
                        uint64_t off = (x * c.stride_bytes[0] +
                                        y * c.stride_bytes[1] +
                                        z * c.stride_bytes[2] +
                                        w * c.stride_bytes[3]);
                        void *src = (void *)(c.src + off);
                        uint64_t size = c.chunk_size;

                        // The destination value is only needed for the debug
                        // message, so it is computed inline rather than held
                        // in an otherwise unused local.
                        DEBUG_PRINTF( user_context, " clEnqueueWriteBuffer ((%d, %d, %d, %d), %lld bytes, %p -> %p)\n",
                                      x, y, z, w, (long long)size, src, (void *)(c.dst + off) );

                        cl_int err = clEnqueueWriteBuffer(ctx.cmd_queue, (cl_mem)c.dst,
                                                          CL_FALSE, off, size, src, 0, NULL, NULL);
                        if (err != CL_SUCCESS) {
                            halide_error_varargs(user_context, "CL: clEnqueueWriteBuffer failed (%d)\n", err);
                            // Writes enqueued by earlier iterations are
                            // non-blocking and may still be reading host
                            // memory; drain them before returning.
                            clFinish(ctx.cmd_queue);
                            return err;
                        }
                    }
                }
#endif
            }
        }

        // The writes above are all non-blocking, so empty the command
        // queue before we proceed so that other host code won't write
        // to the buffer while the above writes are still running.
        cl_int finish_err = clFinish(ctx.cmd_queue);
        if (finish_err != CL_SUCCESS) {
            // Leave host_dirty set: the transfer cannot be assumed complete.
            halide_error_varargs(user_context, "CL: clFinish failed (%d)\n", finish_err);
            return finish_err;
        }

#ifdef DEBUG
        uint64_t t_after = halide_current_time_ns(user_context);
        halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6);
#endif
    }

    buf->host_dirty = false;
    return 0;
}