WEAK int halide_dev_free(void *user_context, buffer_t* buf) { DEBUG_PRINTF( user_context, "CL: halide_dev_free (user_context: %p, buf: %p)\n", user_context, buf ); ClContext ctx(user_context); // halide_dev_free, at present, can be exposed to clients and they // should be allowed to call halide_dev_free on any buffer_t // including ones that have never been used with a GPU. if (buf->dev == 0) { return 0; } #ifdef DEBUG uint64_t t_before = halide_current_time_ns(user_context); #endif halide_assert(user_context, halide_validate_dev_pointer(user_context, buf)); DEBUG_PRINTF(user_context, " clReleaseMemObject %p\n", (cl_mem)buf->dev ); cl_int result = clReleaseMemObject((cl_mem)buf->dev); // If clReleaseMemObject fails, it is unlikely to succeed in a later call, so // we just end our reference to it regardless. buf->dev = 0; if (result != CL_SUCCESS) { halide_error_varargs(user_context, "CL: clReleaseMemObject failed (%d)", result); return result; } #ifdef DEBUG uint64_t t_after = halide_current_time_ns(user_context); halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6); #endif return 0; }
// Used to generate correct timings when tracing WEAK int halide_dev_sync(void *user_context) { DEBUG_PRINTF( user_context, "CUDA: halide_dev_sync (user_context: %p)\n", user_context ); CudaContext ctx(user_context); if (ctx.error != CUDA_SUCCESS) { return ctx.error; } #ifdef DEBUG uint64_t t_before = halide_current_time_ns(user_context); #endif CUresult err = cuCtxSynchronize(); if (err != CUDA_SUCCESS) { halide_error_varargs(user_context, "CUDA: cuCtxSynchronize failed (%s)", _get_error_name(err)); return err; } #ifdef DEBUG uint64_t t_after = halide_current_time_ns(user_context); halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6); #endif return 0; }
WEAK void sampling_profiler_thread(void *) { halide_profiler_state *s = halide_profiler_get_state(); // grab the lock halide_mutex_lock(&s->lock); while (s->current_func != halide_profiler_please_stop) { uint64_t t1 = halide_current_time_ns(NULL); uint64_t t = t1; while (1) { uint64_t t_now = halide_current_time_ns(NULL); int func = s->current_func; if (func == halide_profiler_please_stop) { break; } else if (func >= 0) { // Assume all time since I was last awake is due to // the currently running func. bill_func(s, func, t_now - t); } t = t_now; // Release the lock, sleep, reacquire. int sleep_ms = s->sleep_time; halide_mutex_unlock(&s->lock); halide_sleep_ms(NULL, sleep_ms); halide_mutex_lock(&s->lock); } } s->started = false; halide_mutex_unlock(&s->lock); }
WEAK int halide_dev_run(void *user_context, void *state_ptr, const char* entry_name, int blocksX, int blocksY, int blocksZ, int threadsX, int threadsY, int threadsZ, int shared_mem_bytes, size_t arg_sizes[], void* args[]) { DEBUG_PRINTF( user_context, "CUDA: halide_dev_run (user_context: %p, entry: %s, blocks: %dx%dx%d, threads: %dx%dx%d, shmem: %d)\n", user_context, entry_name, blocksX, blocksY, blocksZ, threadsX, threadsY, threadsZ, shared_mem_bytes ); CUresult err; CudaContext ctx(user_context); if (ctx.error != CUDA_SUCCESS) { return ctx.error; } #ifdef DEBUG uint64_t t_before = halide_current_time_ns(user_context); #endif halide_assert(user_context, state_ptr); CUmodule mod = ((module_state*)state_ptr)->module; halide_assert(user_context, mod); CUfunction f; err = cuModuleGetFunction(&f, mod, entry_name); if (err != CUDA_SUCCESS) { halide_error_varargs(user_context, "CUDA: cuModuleGetFunction failed (%s)", _get_error_name(err)); return err; } err = cuLaunchKernel(f, blocksX, blocksY, blocksZ, threadsX, threadsY, threadsZ, shared_mem_bytes, NULL, // stream args, NULL); if (err != CUDA_SUCCESS) { halide_error_varargs(user_context, "CUDA: cuLaunchKernel failed (%s)", _get_error_name(err)); return err; } #ifdef DEBUG err = cuCtxSynchronize(); if (err != CUDA_SUCCESS) { halide_error_varargs(user_context, "CUDA: cuCtxSynchronize failed (%s)\n", _get_error_name(err)); return err; } uint64_t t_after = halide_current_time_ns(user_context); halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6); #endif return 0; }
WEAK int halide_copy_to_host(void *user_context, buffer_t* buf) { if (!buf->dev_dirty) { return 0; } DEBUG_PRINTF( user_context, "CUDA: halide_copy_to_host (user_context: %p, buf: %p)\n", user_context, buf ); CudaContext ctx(user_context); if (ctx.error != CUDA_SUCCESS) { return ctx.error; } // Need to check dev_dirty again, in case another thread did the // copy_to_host before the serialization point above. if (buf->dev_dirty) { #ifdef DEBUG uint64_t t_before = halide_current_time_ns(user_context); #endif halide_assert(user_context, buf->dev && buf->dev); halide_assert(user_context, halide_validate_dev_pointer(user_context, buf)); _dev_copy c = _make_dev_to_host_copy(buf); for (int w = 0; w < c.extent[3]; w++) { for (int z = 0; z < c.extent[2]; z++) { for (int y = 0; y < c.extent[1]; y++) { for (int x = 0; x < c.extent[0]; x++) { uint64_t off = (x * c.stride_bytes[0] + y * c.stride_bytes[1] + z * c.stride_bytes[2] + w * c.stride_bytes[3]); CUdeviceptr src = (CUdeviceptr)(c.src + off); void *dst = (void *)(c.dst + off); uint64_t size = c.chunk_size; DEBUG_PRINTF( user_context, " cuMemcpyDtoH (%d, %d, %d, %d), %p -> %p, %lld bytes\n", x, y, z, w, (void *)src, dst, (long long)size ); CUresult err = cuMemcpyDtoH(dst, src, size); if (err != CUDA_SUCCESS) { halide_error_varargs(user_context, "CUDA: cuMemcpyDtoH failed (%s)", _get_error_name(err)); return err; } } } } } #ifdef DEBUG uint64_t t_after = halide_current_time_ns(user_context); halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6); #endif } buf->dev_dirty = false; return 0; }
WEAK int halide_dev_malloc(void *user_context, buffer_t *buf) { DEBUG_PRINTF( user_context, "CUDA: halide_dev_malloc (user_context: %p, buf: %p)\n", user_context, buf ); CudaContext ctx(user_context); if (ctx.error != CUDA_SUCCESS) { return ctx.error; } size_t size = _buf_size(user_context, buf); if (buf->dev) { // This buffer already has a device allocation halide_assert(user_context, halide_validate_dev_pointer(user_context, buf, size)); return 0; } halide_assert(user_context, buf->stride[0] >= 0 && buf->stride[1] >= 0 && buf->stride[2] >= 0 && buf->stride[3] >= 0); DEBUG_PRINTF(user_context, " allocating buffer of %lld bytes, " "extents: %lldx%lldx%lldx%lld strides: %lldx%lldx%lldx%lld (%d bytes per element)\n", (long long)size, (long long)buf->extent[0], (long long)buf->extent[1], (long long)buf->extent[2], (long long)buf->extent[3], (long long)buf->stride[0], (long long)buf->stride[1], (long long)buf->stride[2], (long long)buf->stride[3], buf->elem_size); #ifdef DEBUG uint64_t t_before = halide_current_time_ns(user_context); #endif CUdeviceptr p; DEBUG_PRINTF( user_context, " cuMemAlloc %lld -> ", size ); CUresult err = cuMemAlloc(&p, size); if (err != CUDA_SUCCESS) { DEBUG_PRINTF( user_context, "%s\n", _get_error_name(err)); halide_error_varargs(user_context, "CUDA: cuMemAlloc failed (%s)", _get_error_name(err)); return err; } else { DEBUG_PRINTF( user_context, "%p\n", p ); } halide_assert(user_context, p); buf->dev = (uint64_t)p; #ifdef DEBUG uint64_t t_after = halide_current_time_ns(user_context); halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6); #endif return 0; }
WEAK int halide_dev_malloc(void *user_context, buffer_t* buf) { DEBUG_PRINTF( user_context, "CL: halide_dev_malloc (user_context: %p, buf: %p)\n", user_context, buf ); ClContext ctx(user_context); if (ctx.error != CL_SUCCESS) { return ctx.error; } size_t size = _buf_size(user_context, buf); if (buf->dev) { halide_assert(user_context, halide_validate_dev_pointer(user_context, buf, size)); return 0; } halide_assert(user_context, buf->stride[0] >= 0 && buf->stride[1] >= 0 && buf->stride[2] >= 0 && buf->stride[3] >= 0); DEBUG_PRINTF(user_context, " Allocating buffer of %lld bytes, " "extents: %lldx%lldx%lldx%lld strides: %lldx%lldx%lldx%lld (%d bytes per element)\n", (long long)size, (long long)buf->extent[0], (long long)buf->extent[1], (long long)buf->extent[2], (long long)buf->extent[3], (long long)buf->stride[0], (long long)buf->stride[1], (long long)buf->stride[2], (long long)buf->stride[3], buf->elem_size); #ifdef DEBUG uint64_t t_before = halide_current_time_ns(user_context); #endif cl_int err; DEBUG_PRINTF( user_context, " clCreateBuffer -> ", size ); buf->dev = (uint64_t)clCreateBuffer(ctx.context, CL_MEM_READ_WRITE, size, NULL, &err); if (err != CL_SUCCESS || buf->dev == 0) { DEBUG_PRINTF( user_context, "%d\n", err); halide_error_varargs(user_context, "CL: clCreateBuffer failed (%d)\n", err); return err; } else { DEBUG_PRINTF( user_context, "%p\n", (cl_mem)buf->dev ); } DEBUG_PRINTF(user_context, " Allocated device buffer %p for buffer %p\n", (void *)buf->dev, buf); #ifdef DEBUG uint64_t t_after = halide_current_time_ns(user_context); halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6); #endif return CL_SUCCESS; }
WEAK void halide_dev_run( const char* entry_name, int blocksX, int blocksY, int blocksZ, int threadsX, int threadsY, int threadsZ, int shared_mem_bytes, size_t arg_sizes[], void* args[]) { CUfunction f = __get_kernel(entry_name); #ifdef DEBUG char msg[256]; snprintf( msg, 256, "dev_run %s with (%dx%dx%d) blks, (%dx%dx%d) threads, %d shmem (t=%lld)", entry_name, blocksX, blocksY, blocksZ, threadsX, threadsY, threadsZ, shared_mem_bytes, (long long)halide_current_time_ns() ); #endif TIME_CALL( cuLaunchKernel( f, blocksX, blocksY, blocksZ, threadsX, threadsY, threadsZ, shared_mem_bytes, NULL, // stream args, NULL ), msg ); }
WEAK int halide_init_kernels(void *user_context, void **state_ptr, const char* ptx_src, int size) { DEBUG_PRINTF( user_context, "CUDA: halide_init_kernels (user_context: %p, state_ptr: %p, ptx_src: %p, %i)\n", user_context, state_ptr, ptx_src, size ); CudaContext ctx(user_context); if (ctx.error != 0) { return ctx.error; } #ifdef DEBUG uint64_t t_before = halide_current_time_ns(user_context); #endif // Create the state object if necessary. This only happens once, regardless // of how many times halide_init_kernels/halide_release is called. // halide_release traverses this list and releases the module objects, but // it does not modify the list nodes created/inserted here. module_state **state = (module_state**)state_ptr; if (!(*state)) { *state = (module_state*)malloc(sizeof(module_state)); (*state)->module = NULL; (*state)->next = state_list; state_list = *state; } // Create the module itself if necessary. if (!(*state)->module) { DEBUG_PRINTF( user_context, " cuModuleLoadData %p, %i -> ", ptx_src, size ); CUmodule module; CUresult err = cuModuleLoadData(&(*state)->module, ptx_src); if (err != CUDA_SUCCESS) { DEBUG_PRINTF( user_context, "%s\n", _get_error_name(err) ); halide_error_varargs(user_context, "CUDA: cuModuleLoadData failed (%s)", _get_error_name(err)); return err; } else { DEBUG_PRINTF( user_context, "%p\n", module ); } } #ifdef DEBUG uint64_t t_after = halide_current_time_ns(user_context); halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6); #endif return 0; }
WEAK void sampling_profiler_thread(void *) { halide_profiler_state *s = halide_profiler_get_state(); // grab the lock halide_mutex_lock(&s->lock); while (s->current_func != halide_profiler_please_stop) { uint64_t t1 = halide_current_time_ns(NULL); uint64_t t = t1; while (1) { int func, active_threads; if (s->get_remote_profiler_state) { // Execution has disappeared into remote code running // on an accelerator (e.g. Hexagon DSP) s->get_remote_profiler_state(&func, &active_threads); } else { func = s->current_func; active_threads = s->active_threads; } uint64_t t_now = halide_current_time_ns(NULL); if (func == halide_profiler_please_stop) { break; } else if (func >= 0) { // Assume all time since I was last awake is due to // the currently running func. bill_func(s, func, t_now - t, active_threads); } t = t_now; // Release the lock, sleep, reacquire. int sleep_ms = s->sleep_time; halide_mutex_unlock(&s->lock); halide_sleep_ms(NULL, sleep_ms); halide_mutex_lock(&s->lock); } } s->started = false; halide_mutex_unlock(&s->lock); }
WEAK int halide_dev_free(void *user_context, buffer_t* buf) { // halide_dev_free, at present, can be exposed to clients and they // should be allowed to call halide_dev_free on any buffer_t // including ones that have never been used with a GPU. if (buf->dev == 0) { return 0; } DEBUG_PRINTF( user_context, "CUDA: halide_dev_free (user_context: %p, buf: %p)\n", user_context, buf ); CudaContext ctx(user_context); if (ctx.error != CUDA_SUCCESS) return ctx.error; #ifdef DEBUG uint64_t t_before = halide_current_time_ns(user_context); #endif halide_assert(user_context, halide_validate_dev_pointer(user_context, buf)); DEBUG_PRINTF( user_context, " cuMemFree %p\n", buf->dev ); CUresult err = cuMemFree(buf->dev); // If cuMemFree fails, it isn't likely to succeed later, so just drop // the reference. buf->dev = 0; if (err != CUDA_SUCCESS) { halide_error_varargs(user_context, "CUDA: cuMemFree failed (%s)", _get_error_name(err)); return err; } #ifdef DEBUG uint64_t t_after = halide_current_time_ns(user_context); halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6); #endif return 0; }
WEAK void halide_copy_to_dev(buffer_t* buf) { if (buf->host_dirty) { halide_assert(buf->host && buf->dev); size_t size = buf_size(buf); #ifdef DEBUG char msg[256]; snprintf(msg, 256, "copy_to_dev (%zu bytes) %p -> %p (t=%lld)", size, buf->host, (void*)buf->dev, (long long)halide_current_time_ns() ); halide_assert(halide_validate_dev_pointer(buf)); #endif TIME_CALL( cuMemcpyHtoD(buf->dev, buf->host, size), msg ); } buf->host_dirty = false; }
static CUfunction __get_kernel(const char* entry_name) { CUfunction f; #ifdef DEBUG char msg[256]; snprintf(msg, 256, "get_kernel %s (t=%lld)", entry_name, (long long)halide_current_time_ns() ); #endif // Get kernel function ptr TIME_CALL( cuModuleGetFunction(&f, __mod, entry_name), msg ); return f; }
// Used to generate correct timings when tracing WEAK int halide_dev_sync(void *user_context) { DEBUG_PRINTF( user_context, "CL: halide_dev_sync (user_context: %p)\n", user_context ); ClContext ctx(user_context); halide_assert(user_context, ctx.error == CL_SUCCESS); #ifdef DEBUG uint64_t t_before = halide_current_time_ns(user_context); #endif cl_int err = clFinish(ctx.cmd_queue); if (err != CL_SUCCESS) { halide_error_varargs(user_context, "CL: clFinish failed (%d)\n", err); return err; } #ifdef DEBUG uint64_t t_after = halide_current_time_ns(user_context); halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6); #endif return CL_SUCCESS; }
WEAK void halide_dev_run( const char* entry_name, int blocksX, int blocksY, int blocksZ, int threadsX, int threadsY, int threadsZ, int shared_mem_bytes, size_t arg_sizes[], void* args[]) { cl_kernel f = __get_kernel(entry_name); #ifndef DEBUG char msg[1]; #else char msg[256]; snprintf( msg, 256, "dev_run %s with (%dx%dx%d) blks, (%dx%dx%d) threads, %d shmem (t=%lld)", entry_name, blocksX, blocksY, blocksZ, threadsX, threadsY, threadsZ, shared_mem_bytes, (long long)halide_current_time_ns() ); #endif // Pack dims size_t global_dim[3] = {blocksX*threadsX, blocksY*threadsY, blocksZ*threadsZ}; size_t local_dim[3] = {threadsX, threadsY, threadsZ}; // Set args int i = 0; while (arg_sizes[i] != 0) { CHECK_CALL( clSetKernelArg(f, i, arg_sizes[i], args[i]), "clSetKernelArg" ); i++; } // Set the shared mem buffer last // Always set at least 1 byte of shmem, to keep the launch happy CHECK_CALL( clSetKernelArg(f, i, (shared_mem_bytes > 0) ? shared_mem_bytes : 1, NULL), "clSetKernelArg" ); // Launch kernel TIME_START(); int err = clEnqueueNDRangeKernel( cl_q, f, 3, NULL, global_dim, local_dim, 0, NULL, NULL ); CHECK_ERR(err, "clEnqueueNDRangeKernel"); TIME_CHECK(msg); }
static cl_kernel __get_kernel(const char* entry_name) { cl_kernel f; #ifndef DEBUG // char msg[1]; #else char msg[256]; snprintf(msg, 256, "get_kernel %s (t=%lld)", entry_name, (long long)halide_current_time_ns() ); #endif // Get kernel function ptr TIME_START(); int err; f = clCreateKernel(__mod, entry_name, &err); CHECK_ERR(err, "clCreateKernel"); TIME_CHECK(msg); return f; }
WEAK void halide_copy_to_dev(buffer_t* buf) { if (buf->host_dirty) { halide_assert(buf->host && buf->dev); size_t size = __buf_size(buf); #ifdef DEBUG char msg[256]; snprintf(msg, 256, "copy_to_dev (%lld bytes) %p -> %p (t=%lld)", (long long)size, buf->host, (void*)buf->dev, (long long)halide_current_time_ns() ); #endif halide_assert(halide_validate_dev_pointer(buf)); TIME_START(); int err = clEnqueueWriteBuffer( cl_q, (cl_mem)((void*)buf->dev), CL_TRUE, 0, size, buf->host, 0, NULL, NULL ); CHECK_ERR( err, msg ); TIME_CHECK(msg); } buf->host_dirty = false; }
static cl_mem __dev_malloc(size_t bytes) { cl_mem p; #ifndef DEBUG // char msg[1]; #else char msg[256]; snprintf(msg, 256, "dev_malloc (%lld bytes) (t=%lld)", (long long)bytes, (long long)halide_current_time_ns() ); #endif TIME_START(); int err; p = clCreateBuffer(cl_ctx, CL_MEM_READ_WRITE, bytes, NULL, &err ); TIME_CHECK(msg); #ifdef DEBUG halide_printf(" returned: %p (err: %d)\n", (void*)p, err); #endif halide_assert(p); return p; }
WEAK void halide_init_kernels(void *user_context, const char* src, int size) { int err; cl_device_id dev; // Initialize one shared context for all Halide compiled instances if (!(*cl_ctx)) { const cl_uint maxPlatforms = 4; cl_platform_id platforms[maxPlatforms]; cl_uint platformCount = 0; err = clGetPlatformIDs( maxPlatforms, platforms, &platformCount ); CHECK_ERR( err, "clGetPlatformIDs" ); cl_platform_id platform = NULL; // Find the requested platform, or the first if none specified. const char * name = getenv("HL_OCL_PLATFORM"); if (name != NULL) { for (cl_uint i = 0; i < platformCount; ++i) { const cl_uint maxPlatformName = 256; char platformName[maxPlatformName]; err = clGetPlatformInfo( platforms[i], CL_PLATFORM_NAME, maxPlatformName, platformName, NULL ); if (err != CL_SUCCESS) continue; if (strstr(platformName, name)) { platform = platforms[i]; break; } } } else if (platformCount > 0) { platform = platforms[0]; } if (platform == NULL){ halide_printf(user_context, "Failed to find OpenCL platform\n"); return; } #ifdef DEBUG const cl_uint maxPlatformName = 256; char platformName[maxPlatformName]; err = clGetPlatformInfo( platform, CL_PLATFORM_NAME, maxPlatformName, platformName, NULL ); CHECK_ERR( err, "clGetPlatformInfo" ); halide_printf(user_context, "Got platform '%s', about to create context (t=%lld)\n", platformName, (long long)halide_current_time_ns(user_context)); #endif cl_device_type device_type = 0; // Find the device types requested. const char * dev_type = getenv("HL_OCL_DEVICE"); if (dev_type != NULL) { if (strstr("cpu", dev_type)) device_type |= CL_DEVICE_TYPE_CPU; if (strstr("gpu", dev_type)) device_type |= CL_DEVICE_TYPE_GPU; } // If no devices are specified yet, just use all. if (device_type == 0) device_type = CL_DEVICE_TYPE_ALL; // Make sure we have a device const cl_uint maxDevices = 4; cl_device_id devices[maxDevices]; cl_uint deviceCount = 0; err = clGetDeviceIDs( platform, device_type, maxDevices, devices, &deviceCount ); CHECK_ERR( err, "clGetDeviceIDs" ); if (deviceCount == 0) { halide_printf(user_context, "Failed to get device\n"); return; } dev = devices[deviceCount-1]; #ifdef DEBUG const cl_uint maxDeviceName = 256; char deviceName[maxDeviceName]; err = clGetDeviceInfo( dev, CL_DEVICE_NAME, maxDeviceName, deviceName, NULL ); CHECK_ERR( err, "clGetDeviceInfo" ); halide_printf(user_context, "Got device '%s', about to create context (t=%lld)\n", deviceName, (long long)halide_current_time_ns(user_context)); #endif // Create context cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; *cl_ctx = clCreateContext(properties, 1, &dev, NULL, NULL, &err); CHECK_ERR( err, "clCreateContext" ); // cuEventCreate(&__start, 0); // cuEventCreate(&__end, 0); halide_assert(user_context, !(*cl_q)); *cl_q = clCreateCommandQueue(*cl_ctx, dev, 0, &err); CHECK_ERR( err, "clCreateCommandQueue" ); } else { #ifdef DEBUG halide_printf(user_context, "Already had context %p\n", *cl_ctx); #endif // Maintain ref count of context. CHECK_CALL( clRetainContext(*cl_ctx), "clRetainContext" ); CHECK_CALL( clRetainCommandQueue(*cl_q), "clRetainCommandQueue" ); CHECK_CALL( clGetContextInfo(*cl_ctx, CL_CONTEXT_DEVICES, sizeof(dev), &dev, NULL), "clGetContextInfo" ); } // Initialize a module for just this Halide module if ((!__mod) && (size > 1)) { // Create module cl_device_id devices[] = { dev }; size_t lengths[] = { size }; if (strstr(src, "/*OpenCL C*/")) { // Program is OpenCL C. #ifdef DEBUG halide_printf(user_context, "Compiling OpenCL C kernel: %s\n\n", src); #endif const char * sources[] = { src }; __mod = clCreateProgramWithSource(*cl_ctx, 1, &sources[0], NULL, &err ); CHECK_ERR( err, "clCreateProgramWithSource" ); } else { // Program is SPIR binary. #ifdef DEBUG halide_printf(user_context, "Compiling SPIR kernel (%i bytes)\n", size); #endif const unsigned char * binaries[] = { (unsigned char *)src }; __mod = clCreateProgramWithBinary(*cl_ctx, 1, devices, lengths, &binaries[0], NULL, &err ); CHECK_ERR( err, "clCreateProgramWithBinary" ); } err = clBuildProgram( __mod, 1, &dev, NULL, NULL, NULL ); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; halide_printf(user_context, "Error: Failed to build program executable! err = %d\n", err); if (clGetProgramBuildInfo(__mod, dev, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len) == CL_SUCCESS) halide_printf(user_context, "Build Log:\n %s\n-----\n", buffer); else halide_printf(user_context, "clGetProgramBuildInfo failed to get build log!\n"); halide_assert(user_context, err == CL_SUCCESS); } } }
WEAK void halide_init_kernels(const char* src, int size) { int err; cl_device_id dev; // Initialize one shared context for all Halide compiled instances if (!cl_ctx) { const cl_uint maxPlatforms = 4; cl_platform_id platforms[maxPlatforms]; cl_uint platformCount = 0; err = clGetPlatformIDs( maxPlatforms, platforms, &platformCount ); CHECK_ERR( err, "clGetPlatformIDs" ); cl_platform_id platform = NULL; const char * name = get_opencl_platform(); if (name != NULL) { for (cl_uint i = 0; i < platformCount; ++i) { const cl_uint maxPlatformName = 256; char platformName[maxPlatformName]; err = clGetPlatformInfo( platforms[i], CL_PLATFORM_NAME, maxPlatformName, platformName, NULL ); if (err != CL_SUCCESS) continue; if (strstr(platformName, name)) { platform = platforms[i]; break; } } } else if (platformCount > 0) { platform = platforms[0]; } if (platform == NULL){ halide_printf("Failed to find OpenCL platform\n"); return; } #ifdef DEBUG const cl_uint maxPlatformName = 256; char platformName[maxPlatformName]; err = clGetPlatformInfo( platform, CL_PLATFORM_NAME, maxPlatformName, platformName, NULL ); CHECK_ERR( err, "clGetPlatformInfo" ); halide_printf("Got platform '%s', about to create context (t=%lld)\n", platformName, (long long)halide_current_time_ns()); #endif // Make sure we have a device const cl_uint maxDevices = 4; cl_device_id devices[maxDevices]; cl_uint deviceCount = 0; err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_ALL, maxDevices, devices, &deviceCount ); CHECK_ERR( err, "clGetDeviceIDs" ); if (deviceCount == 0) { halide_printf("Failed to get device\n"); return; } dev = devices[deviceCount-1]; #ifdef DEBUG const cl_uint maxDeviceName = 256; char deviceName[maxDeviceName]; err = clGetDeviceInfo( dev, CL_DEVICE_NAME, maxDeviceName, deviceName, NULL ); CHECK_ERR( err, "clGetDeviceInfo" ); halide_printf("Got device '%s', about to create context (t=%lld)\n", deviceName, (long long)halide_current_time_ns()); #endif // Create context cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; cl_ctx = clCreateContext(properties, 1, &dev, NULL, NULL, &err); CHECK_ERR( err, "clCreateContext" ); // cuEventCreate(&__start, 0); // cuEventCreate(&__end, 0); halide_assert(!cl_q); cl_q = clCreateCommandQueue(cl_ctx, dev, 0, &err); CHECK_ERR( err, "clCreateCommandQueue" ); } else { // Maintain ref count of context. clRetainContext(cl_ctx); clRetainCommandQueue(cl_q); } // Initialize a module for just this Halide module if ((!__mod) && (size > 1)) { #ifdef DEBUG halide_printf("Compiling kernel (%i bytes)\n", size); #endif // Create module cl_device_id devices[] = { dev }; size_t lengths[] = { size }; if (strstr(src, "/*OpenCL C*/")) { // Program is OpenCL C. const char * sources[] = { src }; __mod = clCreateProgramWithSource(cl_ctx, 1, &sources[0], NULL, &err ); CHECK_ERR( err, "clCreateProgramWithSource" ); } else { // Program is SPIR binary. const unsigned char * binaries[] = { (unsigned char *)src }; __mod = clCreateProgramWithBinary(cl_ctx, 1, devices, lengths, &binaries[0], NULL, &err ); CHECK_ERR( err, "clCreateProgramWithBinary" ); } err = clBuildProgram( __mod, 1, &dev, NULL, NULL, NULL ); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; halide_printf("Error: Failed to build program executable! err = %d\n", err); if (clGetProgramBuildInfo(__mod, dev, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len) == CL_SUCCESS) halide_printf("%s\n", buffer); else halide_printf("clGetProgramBuildInfo failed to get build log!\n"); halide_assert(err == CL_SUCCESS); } } }
// Override halide's print to use logd and also print the time. extern "C" void halide_print(void *, const char *msg) { static int64_t t0 = halide_current_time_ns(); int64_t t1 = halide_current_time_ns(); LOGD("%d: %s\n", (int)(t1 - t0)/1000000, msg); t0 = t1; }
JNIEXPORT bool JNICALL Java_com_example_helloandroidcamera2_JNIUtils_edgeDetect( JNIEnv *env, jobject obj, jint srcWidth, jint srcHeight, jobject srcLumaByteBuffer, jint srcLumaRowStrideBytes, jobject dstSurface) { uint8_t *srcLumaPtr = reinterpret_cast<uint8_t *>( env->GetDirectBufferAddress(srcLumaByteBuffer)); if (srcLumaPtr == NULL) { return false; } ANativeWindow *win = ANativeWindow_fromSurface(env, dstSurface); ANativeWindow_acquire(win); ANativeWindow_Buffer buf; if (int err = ANativeWindow_lock(win, &buf, NULL)) { LOGE("ANativeWindow_lock failed with error code %d\n", err); ANativeWindow_release(win); return false; } ANativeWindow_setBuffersGeometry(win, srcWidth, srcHeight, 0 /*format unchanged*/); uint8_t *dstLumaPtr = reinterpret_cast<uint8_t *>(buf.bits); if (dstLumaPtr == NULL) { ANativeWindow_unlockAndPost(win); ANativeWindow_release(win); return false; } if (buf.format != IMAGE_FORMAT_YV12) { LOGE("ANativeWindow buffer locked but its format was not YV12."); ANativeWindow_unlockAndPost(win); ANativeWindow_release(win); return false; } if (!checkBufferSizesMatch(srcWidth, srcHeight, &buf)) { LOGE("ANativeWindow buffer locked but its size was %d x %d, expected " "%d x %d", buf.width, buf.height, srcWidth, srcHeight); ANativeWindow_unlockAndPost(win); ANativeWindow_release(win); return false; } uint32_t dstLumaSizeBytes = buf.stride * buf.height; uint32_t dstChromaRowStrideBytes = ALIGN(buf.stride / 2, 16); // Size of one chroma plane. uint32_t dstChromaSizeBytes = dstChromaRowStrideBytes * buf.height / 2; uint8_t *dstChromaVPtr = dstLumaPtr + dstLumaSizeBytes; uint8_t *dstChromaUPtr = dstLumaPtr + dstLumaSizeBytes + dstChromaSizeBytes; // Make these static so that we can reuse device allocations across frames. // It doesn't matter now, but useful for GPU backends. static buffer_t srcBuf = { 0 }; static buffer_t dstBuf = { 0 }; static buffer_t dstChromaBuf = { 0 }; srcBuf.host = srcLumaPtr; srcBuf.host_dirty = true; srcBuf.extent[0] = srcWidth; srcBuf.extent[1] = srcHeight; srcBuf.extent[2] = 0; srcBuf.extent[3] = 0; srcBuf.stride[0] = 1; srcBuf.stride[1] = srcLumaRowStrideBytes; srcBuf.min[0] = 0; srcBuf.min[1] = 0; srcBuf.elem_size = 1; dstBuf.host = dstLumaPtr; dstBuf.extent[0] = buf.width; // src and dst width/height actually match. dstBuf.extent[1] = buf.height; dstBuf.extent[2] = 0; dstBuf.extent[3] = 0; dstBuf.stride[0] = 1; dstBuf.stride[1] = buf.stride; // src and dst strides actually match. dstBuf.min[0] = 0; dstBuf.min[1] = 0; dstBuf.elem_size = 1; static bool first_call = true; static unsigned counter = 0; static unsigned times[16]; if (first_call) { LOGD("According to Halide, host system has %d cpus\n", halide_host_cpu_count()); first_call = false; for (int t = 0; t < 16; t++) { times[t] = 0; } } // Set chrominance to 128 to appear grayscale. // The dst chroma is guaranteed to be tightly packed since it's YV12. memset(dstChromaVPtr, 128, dstChromaSizeBytes * 2); int64_t t1 = halide_current_time_ns(); int err = edge_detect(&srcBuf, &dstBuf); if (err != halide_error_code_success) { LOGE("edge_detect failed with error code: %d", err); } int64_t t2 = halide_current_time_ns(); unsigned elapsed_us = (t2 - t1) / 1000; times[counter & 15] = elapsed_us; counter++; unsigned min = times[0]; for (int i = 1; i < 16; i++) { if (times[i] < min) { min = times[i]; } } LOGD("Time taken: %d us (minimum: %d us)", elapsed_us, min); ANativeWindow_unlockAndPost(win); ANativeWindow_release(win); return (err != halide_error_code_success); }
JNIEXPORT void JNICALL Java_com_example_hellohalide_CameraPreview_processFrame( JNIEnv *env, jobject obj, jbyteArray jSrc, jint j_w, jint j_h, jobject surf) { const int w = j_w, h = j_h; halide_set_error_handler(handler); unsigned char *src = (unsigned char *)env->GetByteArrayElements(jSrc, NULL); if (!src) { LOGD("src is null\n"); return; } ANativeWindow *win = ANativeWindow_fromSurface(env, surf); ANativeWindow_acquire(win); static bool first_call = true; static unsigned counter = 0; static unsigned times[16]; if (first_call) { LOGD("According to Halide, host system has %d cpus\n", halide_host_cpu_count()); LOGD("Resetting buffer format"); ANativeWindow_setBuffersGeometry(win, w, h, 0); first_call = false; for (int t = 0; t < 16; t++) times[t] = 0; } ANativeWindow_Buffer buf; ARect rect = {0, 0, w, h}; if (int err = ANativeWindow_lock(win, &buf, NULL)) { LOGD("ANativeWindow_lock failed with error code %d\n", err); return; } uint8_t *dst = (uint8_t *)buf.bits; // If we're using opencl, use the gpu backend for it. halide_set_ocl_device_type("gpu"); // Make these static so that we can reuse device allocations across frames. static buffer_t srcBuf = {0}; static buffer_t dstBuf = {0}; if (dst) { srcBuf.host = (uint8_t *)src; srcBuf.host_dirty = true; srcBuf.extent[0] = w; srcBuf.extent[1] = h; srcBuf.extent[2] = 0; srcBuf.extent[3] = 0; srcBuf.stride[0] = 1; srcBuf.stride[1] = w; srcBuf.min[0] = 0; srcBuf.min[1] = 0; srcBuf.elem_size = 1; dstBuf.host = dst; dstBuf.extent[0] = w; dstBuf.extent[1] = h; dstBuf.extent[2] = 0; dstBuf.extent[3] = 0; dstBuf.stride[0] = 1; dstBuf.stride[1] = w; dstBuf.min[0] = 0; dstBuf.min[1] = 0; dstBuf.elem_size = 1; // Just copy over chrominance untouched memcpy(dst + w*h, src + w*h, (w*h)/2); int64_t t1 = halide_current_time_ns(); halide_generated(&srcBuf, &dstBuf); if (dstBuf.dev) { halide_copy_to_host(NULL, &dstBuf); } int64_t t2 = halide_current_time_ns(); unsigned elapsed_us = (t2 - t1)/1000; times[counter & 15] = elapsed_us; counter++; unsigned min = times[0]; for (int i = 1; i < 16; i++) { if (times[i] < min) min = times[i]; } LOGD("Time taken: %d (%d)", elapsed_us, min); } ANativeWindow_unlockAndPost(win); ANativeWindow_release(win); env->ReleaseByteArrayElements(jSrc, (jbyte *)src, 0); }
WEAK void halide_init_kernels(const char* ptx_src) { // If the context pointer isn't hooked up yet, point it at this module's weak-linkage context. if (cuda_ctx_ptr == NULL) { cuda_ctx_ptr = &weak_cuda_ctx; } // Initialize one shared context for all Halide compiled instances if (*cuda_ctx_ptr == 0) { // Initialize CUDA CHECK_CALL( cuInit(0), "cuInit" ); // Make sure we have a device int deviceCount = 0; CHECK_CALL( cuDeviceGetCount(&deviceCount), "cuDeviceGetCount" ); halide_assert(deviceCount > 0); char *device_str = getenv("HL_GPU_DEVICE"); CUdevice dev; // Get device CUresult status; if (device_str) { status = cuDeviceGet(&dev, atoi(device_str)); } else { for (int id = 2; id >= 0; id--) { // Try to get a device >0 first, since 0 should be our display device status = cuDeviceGet(&dev, id); if (status == CUDA_SUCCESS) break; } } if (status != CUDA_SUCCESS) { halide_printf("Failed to get device\n"); exit(-1); } #ifdef DEBUG halide_printf("Got device %d, about to create context (t=%lld)\n", dev, (long long)halide_current_time_ns()); #endif // Create context CHECK_CALL( cuCtxCreate(cuda_ctx_ptr, 0, dev), "cuCtxCreate" ); } else { //CHECK_CALL( cuCtxPushCurrent(*cuda_ctx_ptr), "cuCtxPushCurrent" ); } // Initialize a module for just this Halide module if (!__mod) { // Create module CHECK_CALL( cuModuleLoadData(&__mod, ptx_src), "cuModuleLoadData" ); #ifdef DEBUG halide_printf("-------\nCompiling PTX:\n%s\n--------\n", ptx_src); #endif } // Create two events for timing if (!__start) { cuEventCreate(&__start, 0); cuEventCreate(&__end, 0); } }
// Initializes the context used by the default implementation // of halide_acquire_context. static int create_context(void *user_context, cl_context *ctx, cl_command_queue *q) { DEBUG_PRINTF( user_context, " create_context (user_context: %p)\n", user_context ); halide_assert(user_context, ctx != NULL && *ctx == NULL); halide_assert(user_context, q != NULL && *q == NULL); cl_int err = 0; const cl_uint maxPlatforms = 4; cl_platform_id platforms[maxPlatforms]; cl_uint platformCount = 0; err = clGetPlatformIDs( maxPlatforms, platforms, &platformCount ); if (err != CL_SUCCESS) { halide_error_varargs(user_context, "CL: clGetPlatformIDs failed (%d)\n", err); return err; } cl_platform_id platform = NULL; // Find the requested platform, or the first if none specified. const char * name = getenv("HL_OCL_PLATFORM_NAME"); if (name != NULL) { for (cl_uint i = 0; i < platformCount; ++i) { const cl_uint maxPlatformName = 256; char platformName[maxPlatformName]; err = clGetPlatformInfo( platforms[i], CL_PLATFORM_NAME, maxPlatformName, platformName, NULL ); if (err != CL_SUCCESS) continue; // A platform matches the request if it is a substring of the platform name. if (strstr(platformName, name)) { platform = platforms[i]; break; } } } else if (platformCount > 0) { platform = platforms[0]; } if (platform == NULL){ halide_error(user_context, "CL: Failed to find platform\n"); return CL_INVALID_PLATFORM; } #ifdef DEBUG const cl_uint maxPlatformName = 256; char platformName[maxPlatformName]; err = clGetPlatformInfo( platform, CL_PLATFORM_NAME, maxPlatformName, platformName, NULL ); if (err != CL_SUCCESS) { halide_printf(user_context, " clGetPlatformInfo(CL_PLATFORM_NAME) failed (%d)\n", err); // This is just debug info, report the error but don't fail context creation due to it. //return err; } else { halide_printf(user_context, " Got platform '%s', about to create context (t=%lld)\n", platformName, (long long)halide_current_time_ns(user_context)); } #endif // Get the types of devices requested. cl_device_type device_type = 0; const char * dev_type = getenv("HL_OCL_DEVICE_TYPE"); if (dev_type != NULL) { if (strstr("cpu", dev_type)) { device_type |= CL_DEVICE_TYPE_CPU; } if (strstr("gpu", dev_type)) { device_type |= CL_DEVICE_TYPE_GPU; } } // If no device types are specified, use all the available // devices. if (device_type == 0) { device_type = CL_DEVICE_TYPE_ALL; } // Get all the devices of the specified type. const cl_uint maxDevices = 4; cl_device_id devices[maxDevices]; cl_uint deviceCount = 0; err = clGetDeviceIDs( platform, device_type, maxDevices, devices, &deviceCount ); if (err != CL_SUCCESS) { halide_error_varargs(user_context, "CL: clGetDeviceIDs failed (%d)\n", err); return err; } // If the user indicated a specific device index to use, use // that. Note that this is an index within the set of devices // specified by the device type. char *device_str = getenv("HL_GPU_DEVICE"); cl_uint device = deviceCount - 1; if (device_str) { device = atoi(device_str); } if (device >= deviceCount) { halide_error_varargs(user_context, "CL: Failed to get device %i\n", device); return CL_DEVICE_NOT_FOUND; } cl_device_id dev = devices[device]; #ifdef DEBUG const cl_uint maxDeviceName = 256; char deviceName[maxDeviceName]; err = clGetDeviceInfo( dev, CL_DEVICE_NAME, maxDeviceName, deviceName, NULL ); if (err != CL_SUCCESS) { halide_printf(user_context, " clGetDeviceInfo(CL_DEVICE_NAME) failed (%d)\n", err); // This is just debug info, report the error but don't fail context create if it fails. //return err; } else { halide_printf(user_context, " Got device '%s'\n", deviceName); } #endif // Create context and command queue. cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; DEBUG_PRINTF( user_context, " clCreateContext -> " ); *ctx = clCreateContext(properties, 1, &dev, NULL, NULL, &err); if (err != CL_SUCCESS) { DEBUG_PRINTF( user_context, "%d", err); halide_error_varargs(user_context, "CL: clCreateContext failed (%d)\n", err); return err; } else { DEBUG_PRINTF( user_context, "%p\n", *ctx ); } DEBUG_PRINTF(user_context, " clCreateCommandQueue "); *q = clCreateCommandQueue(*ctx, dev, 0, &err); if (err != CL_SUCCESS) { DEBUG_PRINTF( user_context, "%d", err ); halide_error_varargs(user_context, "CL: clCreateCommandQueue failed (%d)\n", err); return err; } else { DEBUG_PRINTF( user_context, "%p\n", *q ); } return err; }
WEAK int halide_init_kernels(void *user_context, void **state_ptr, const char* src, int size) { DEBUG_PRINTF( user_context, "CL: halide_init_kernels (user_context: %p, state_ptr: %p, program: %p, %i)\n", user_context, state_ptr, src, size ); ClContext ctx(user_context); if (ctx.error != CL_SUCCESS) { return ctx.error; } #ifdef DEBUG uint64_t t_before = halide_current_time_ns(user_context); #endif // Create the state object if necessary. This only happens once, regardless // of how many times halide_init_kernels/halide_release is called. // halide_release traverses this list and releases the program objects, but // it does not modify the list nodes created/inserted here. module_state **state = (module_state**)state_ptr; if (!(*state)) { *state = (module_state*)malloc(sizeof(module_state)); (*state)->program = NULL; (*state)->next = state_list; state_list = *state; } // Create the program if necessary. TODO: The program object needs to not // only already exist, but be created for the same context/device as the // calling context/device. if (!(*state && (*state)->program) && size > 1) { cl_int err = 0; cl_device_id dev; err = clGetContextInfo(ctx.context, CL_CONTEXT_DEVICES, sizeof(dev), &dev, NULL); if (err != CL_SUCCESS) { halide_error_varargs(user_context, "CL: clGetContextInfo(CL_CONTEXT_DEVICES) failed (%d)\n", err); return err; } cl_device_id devices[] = { dev }; size_t lengths[] = { size }; // Get the max constant buffer size supported by this OpenCL implementation. cl_ulong max_constant_buffer_size = 0; err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(max_constant_buffer_size), &max_constant_buffer_size, NULL); if (err != CL_SUCCESS) { halide_error_varargs(user_context, "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) failed (%d)\n", err); return err; } // Get the max number of constant arguments supported by this OpenCL implementation. cl_uint max_constant_args = 0; err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_ARGS, sizeof(max_constant_args), &max_constant_args, NULL); if (err != CL_SUCCESS) { halide_error_varargs(user_context, "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_ARGS) failed (%d)\n", err); return err; } // Build the compile argument options. char options[256]; snprintf(options, sizeof(options), "-D MAX_CONSTANT_BUFFER_SIZE=%lld -D MAX_CONSTANT_ARGS=%i", max_constant_buffer_size, max_constant_args); const char * sources[] = { src }; DEBUG_PRINTF( user_context, " clCreateProgramWithSource -> " ); cl_program program = clCreateProgramWithSource(ctx.context, 1, &sources[0], NULL, &err ); if (err != CL_SUCCESS) { DEBUG_PRINTF( user_context, "%d\n", err ); halide_error_varargs(user_context, "CL: clCreateProgramWithSource failed (%d)\n", err); return err; } else { DEBUG_PRINTF( user_context, "%p\n", program ); } (*state)->program = program; DEBUG_PRINTF( user_context, " clBuildProgram %p %s\n", program, options ); err = clBuildProgram(program, 1, devices, options, NULL, NULL ); if (err != CL_SUCCESS) { halide_error_varargs(user_context, "CL: clBuildProgram failed (%d)\n", err); // Allocate an appropriately sized buffer for the build log. size_t len = 0; char *buffer = NULL; if (clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &len) == CL_SUCCESS) { buffer = (char*)malloc((++len)*sizeof(char)); } // Get build log if (buffer && clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, len, buffer, NULL) == CL_SUCCESS) { halide_printf(user_context, " Build Log:\n %s\n-----\n", buffer); } else { halide_printf(user_context, " clGetProgramBuildInfo failed\n"); } if (buffer) { free(buffer); } halide_assert(user_context, err == CL_SUCCESS); return err; } } #ifdef DEBUG uint64_t t_after = halide_current_time_ns(user_context); halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6); #endif return 0; }
WEAK int halide_dev_run(void *user_context, void *state_ptr, const char* entry_name, int blocksX, int blocksY, int blocksZ, int threadsX, int threadsY, int threadsZ, int shared_mem_bytes, size_t arg_sizes[], void* args[]) { DEBUG_PRINTF( user_context, "CL: halide_dev_run (user_context: %p, entry: %s, blocks: %dx%dx%d, threads: %dx%dx%d, shmem: %d)\n", user_context, entry_name, blocksX, blocksY, blocksZ, threadsX, threadsY, threadsZ, shared_mem_bytes ); cl_int err; ClContext ctx(user_context); if (ctx.error != CL_SUCCESS) { return ctx.error; } #ifdef DEBUG uint64_t t_before = halide_current_time_ns(user_context); #endif // Create kernel object for entry_name from the program for this module. halide_assert(user_context, state_ptr); cl_program program = ((module_state*)state_ptr)->program; halide_assert(user_context, program); DEBUG_PRINTF( user_context, " clCreateKernel %s -> ", entry_name ); cl_kernel f = clCreateKernel(program, entry_name, &err); if (err != CL_SUCCESS) { DEBUG_PRINTF( user_context, "%d\n", err ); halide_error_varargs(user_context, "CL: clCreateKernel (%s) failed (%d)\n", entry_name, err); return err; } else { #ifdef DEBUG uint64_t t_create_kernel = halide_current_time_ns(user_context); halide_printf( user_context, "%p (%f ms)\n", f, (t_create_kernel - t_before) / 1.0e6 ); #endif } // Pack dims size_t global_dim[3] = {blocksX*threadsX, blocksY*threadsY, blocksZ*threadsZ}; size_t local_dim[3] = {threadsX, threadsY, threadsZ}; // Set args int i = 0; while (arg_sizes[i] != 0) { DEBUG_PRINTF(user_context, " clSetKernelArg %i %i [0x%x ...]\n", i, arg_sizes[i], *(int *)args[i]); cl_int err = clSetKernelArg(f, i, arg_sizes[i], args[i]); if (err != CL_SUCCESS) { halide_error_varargs(user_context, "CL: clSetKernelArg failed (%d)\n", err); return err; } i++; } // Set the shared mem buffer last // Always set at least 1 byte of shmem, to keep the launch happy DEBUG_PRINTF(user_context, " clSetKernelArg %i %i [NULL]\n", i, shared_mem_bytes); err = clSetKernelArg(f, i, (shared_mem_bytes > 0) ? shared_mem_bytes : 1, NULL); if (err != CL_SUCCESS) { halide_error_varargs(user_context, "CL: clSetKernelArg failed (%d)\n", err); return err; } // Launch kernel DEBUG_PRINTF( user_context, " clEnqueueNDRangeKernel %dx%dx%d, %dx%dx%d -> ", blocksX, blocksY, blocksZ, threadsX, threadsY, threadsZ ); err = clEnqueueNDRangeKernel(ctx.cmd_queue, f, // NDRange 3, NULL, global_dim, local_dim, // Events 0, NULL, NULL); if (err != CL_SUCCESS) { DEBUG_PRINTF( user_context, "%d\n", err ); halide_error_varargs(user_context, "CL: clEnqueueNDRangeKernel failed (%d)\n", err); return err; } else { DEBUG_PRINTF ( user_context, "CL_SUCCESS\n" ); } DEBUG_PRINTF( user_context, " clReleaseKernel %p\n", f ); clReleaseKernel(f); #ifdef DEBUG err = clFinish(ctx.cmd_queue); if (err != CL_SUCCESS) { halide_error_varargs(user_context, "CL: clFinish failed (%d)\n", err); return err; } uint64_t t_after = halide_current_time_ns(user_context); halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6); #endif return 0; }
WEAK int halide_copy_to_host(void *user_context, buffer_t* buf) { DEBUG_PRINTF(user_context, "CL: halide_copy_to_host (user_context: %p, buf: %p)\n", user_context, buf ); // Acquire the context so we can use the command queue. This also avoids multiple // redundant calls to clEnqueueReadBuffer when multiple threads are trying to copy // the same buffer. ClContext ctx(user_context); if (ctx.error != CL_SUCCESS) { return ctx.error; } if (buf->dev_dirty) { #ifdef DEBUG uint64_t t_before = halide_current_time_ns(user_context); #endif halide_assert(user_context, buf->dev && buf->dev); halide_assert(user_context, halide_validate_dev_pointer(user_context, buf)); _dev_copy c = _make_dev_to_host_copy(buf); for (int w = 0; w < c.extent[3]; w++) { for (int z = 0; z < c.extent[2]; z++) { #ifdef ENABLE_OPENCL_11 // OpenCL 1.1 supports stride-aware memory transfers up to 3D, so we // can deal with the 2 innermost strides with OpenCL. uint64_t off = z * c.stride_bytes[2] + w * c.stride_bytes[3]; size_t offset[3] = { off, 0, 0 }; size_t region[3] = { c.chunk_size, c.extent[0], c.extent[1] }; DEBUG_PRINTF( user_context, " clEnqueueReadBufferRect ((%d, %d), (%p -> %p) + %d, %dx%dx%d bytes, %dx%d)\n", z, w, (void *)c.src, c.dst, (int)off, (int)region[0], (int)region[1], (int)region[2], (int)c.stride_bytes[0], (int)c.stride_bytes[1]); cl_int err = clEnqueueReadBufferRect(ctx.cmd_queue, (cl_mem)c.src, CL_FALSE, offset, offset, region, c.stride_bytes[0], c.stride_bytes[1], c.stride_bytes[0], c.stride_bytes[1], (void *)c.dst, 0, NULL, NULL); if (err != CL_SUCCESS) { halide_error_varargs(user_context, "CL: clEnqueueReadBufferRect failed (%d)\n", err); return err; } #else for (int y = 0; y < c.extent[1]; y++) { for (int x = 0; x < c.extent[0]; x++) { uint64_t off = (x * c.stride_bytes[0] + y * c.stride_bytes[1] + z * c.stride_bytes[2] + w * c.stride_bytes[3]); void *src = (void *)(c.src + off); void *dst = (void *)(c.dst + off); uint64_t size = c.chunk_size; DEBUG_PRINTF( user_context, " clEnqueueReadBuffer ((%d, %d, %d, %d), %lld bytes, %p -> %p)\n", x, y, z, w, (long long)size, (void *)src, dst ); cl_int err = clEnqueueReadBuffer(ctx.cmd_queue, (cl_mem)c.src, CL_FALSE, off, size, dst, 0, NULL, NULL); if (err != CL_SUCCESS) { halide_error_varargs(user_context, "CL: clEnqueueReadBuffer failed (%d)\n", err); return err; } } } #endif } } // The writes above are all non-blocking, so empty the command // queue before we proceed so that other host code won't read // bad data. clFinish(ctx.cmd_queue); #ifdef DEBUG uint64_t t_after = halide_current_time_ns(user_context); halide_printf(user_context, " Time: %f ms\n", (t_after - t_before) / 1.0e6); #endif } buf->dev_dirty = false; return 0; }
JNIEXPORT void JNICALL Java_com_example_hellohalide_CameraPreview_processFrame( JNIEnv *env, jobject obj, jbyteArray jSrc, jint j_w, jint j_h, jint j_orientation, jobject surf) { const int w = j_w, h = j_h, orientation = j_orientation; halide_start_clock(NULL); halide_set_error_handler(handler); unsigned char *src = (unsigned char *)env->GetByteArrayElements(jSrc, NULL); if (!src) { LOGD("src is null\n"); return; } LOGD("[output window size] j_w = %d, j_h = %d", j_w, j_h); LOGD("[src array length] jSrc.length = %d", env->GetArrayLength(jSrc)); ANativeWindow *win = ANativeWindow_fromSurface(env, surf); static bool first_call = true; static unsigned counter = 0; static unsigned times[16]; if (first_call) { LOGD("According to Halide, host system has %d cpus\n", halide_host_cpu_count()); LOGD("Resetting buffer format"); ANativeWindow_setBuffersGeometry(win, w, h, 0); first_call = false; for (int t = 0; t < 16; t++) times[t] = 0; } ANativeWindow_Buffer buf; ARect rect = {0, 0, w, h}; if (int err = ANativeWindow_lock(win, &buf, NULL)) { LOGD("ANativeWindow_lock failed with error code %d\n", err); return; } uint8_t *dst = (uint8_t *)buf.bits; // If we're using opencl, use the gpu backend for it. #if COMPILING_FOR_OPENCL halide_opencl_set_device_type("gpu"); #endif // Make these static so that we can reuse device allocations across frames. static halide_buffer_t srcBuf = {0}; static halide_dimension_t srcDim[2]; static halide_buffer_t dstBuf = {0}; static halide_dimension_t dstDim[2]; if (dst) { srcBuf.host = (uint8_t *)src; srcBuf.set_host_dirty(); srcBuf.dim = srcDim; srcBuf.dim[0].min = 0; srcBuf.dim[0].extent = w; srcBuf.dim[0].stride = 1; srcBuf.dim[1].min = 0; srcBuf.dim[1].extent = h; srcBuf.dim[1].stride = w; srcBuf.type = halide_type_of<uint8_t>(); if (orientation >= 180) { // Camera sensor is probably upside down (e.g. Nexus 5x) srcBuf.host += w*h-1; srcBuf.dim[0].stride = -1; srcBuf.dim[1].stride = -w; } dstBuf.host = dst; dstBuf.dim = dstDim; dstBuf.dim[0].min = 0; dstBuf.dim[0].extent = w; dstBuf.dim[0].stride = 1; dstBuf.dim[1].min = 0; dstBuf.dim[1].extent = h; dstBuf.dim[1].stride = w; dstBuf.type = halide_type_of<uint8_t>(); // Just set chroma to gray. memset(dst + w*h, 128, (w*h)/2); int64_t t1 = halide_current_time_ns(); hello(&srcBuf, &dstBuf); halide_copy_to_host(NULL, &dstBuf); int64_t t2 = halide_current_time_ns(); unsigned elapsed_us = (t2 - t1)/1000; times[counter & 15] = elapsed_us; counter++; unsigned min = times[0]; for (int i = 1; i < 16; i++) { if (times[i] < min) min = times[i]; } LOGD("Time taken: %d (%d)", elapsed_us, min); } ANativeWindow_unlockAndPost(win); ANativeWindow_release(win); env->ReleaseByteArrayElements(jSrc, (jbyte *)src, 0); }