Пример #1
0
WEAK int halide_dev_free(void *user_context, buffer_t* buf) {
    DEBUG_PRINTF( user_context, "CL: halide_dev_free (user_context: %p, buf: %p)\n", user_context, buf );

    ClContext ctx(user_context);

    // halide_dev_free, at present, can be exposed to clients and they
    // should be allowed to call halide_dev_free on any buffer_t
    // including ones that have never been used with a GPU.
    if (buf->dev == 0) {
      return 0;
    }

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));
    DEBUG_PRINTF(user_context, "    clReleaseMemObject %p\n", (cl_mem)buf->dev );
    cl_int result = clReleaseMemObject((cl_mem)buf->dev);
    // If clReleaseMemObject fails, it is unlikely to succeed in a later call, so
    // we just end our reference to it regardless.
    buf->dev = 0;
    if (result != CL_SUCCESS) {
        halide_error_varargs(user_context, "CL: clReleaseMemObject failed (%d)", result);
        return result;
    }

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return 0;
}
Пример #2
0
// Used to generate correct timings when tracing
WEAK int halide_dev_sync(void *user_context) {
    DEBUG_PRINTF( user_context, "CUDA: halide_dev_sync (user_context: %p)\n", user_context );

    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    CUresult err = cuCtxSynchronize();
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuCtxSynchronize failed (%s)",
                             _get_error_name(err));
        return err;
    }

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return 0;
}
Пример #3
0
WEAK void sampling_profiler_thread(void *) {
    halide_profiler_state *s = halide_profiler_get_state();

    // grab the lock
    halide_mutex_lock(&s->lock);

    while (s->current_func != halide_profiler_please_stop) {

        uint64_t t1 = halide_current_time_ns(NULL);
        uint64_t t = t1;
        while (1) {
            uint64_t t_now = halide_current_time_ns(NULL);
            int func = s->current_func;
            if (func == halide_profiler_please_stop) {
                break;
            } else if (func >= 0) {
                // Assume all time since I was last awake is due to
                // the currently running func.
                bill_func(s, func, t_now - t);
            }
            t = t_now;

            // Release the lock, sleep, reacquire.
            int sleep_ms = s->sleep_time;
            halide_mutex_unlock(&s->lock);
            halide_sleep_ms(NULL, sleep_ms);
            halide_mutex_lock(&s->lock);
        }
    }

    s->started = false;

    halide_mutex_unlock(&s->lock);
}
Пример #4
0
WEAK int halide_dev_run(void *user_context,
                        void *state_ptr,
                        const char* entry_name,
                        int blocksX, int blocksY, int blocksZ,
                        int threadsX, int threadsY, int threadsZ,
                        int shared_mem_bytes,
                        size_t arg_sizes[],
                        void* args[]) {
    DEBUG_PRINTF( user_context, "CUDA: halide_dev_run (user_context: %p, entry: %s, blocks: %dx%dx%d, threads: %dx%dx%d, shmem: %d)\n",
                  user_context, entry_name,
                  blocksX, blocksY, blocksZ,
                  threadsX, threadsY, threadsZ,
                  shared_mem_bytes );

    CUresult err;
    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    halide_assert(user_context, state_ptr);
    CUmodule mod = ((module_state*)state_ptr)->module;
    halide_assert(user_context, mod);
    CUfunction f;
    err = cuModuleGetFunction(&f, mod, entry_name);
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuModuleGetFunction failed (%s)",
                             _get_error_name(err));
        return err;
    }

    err = cuLaunchKernel(f,
                         blocksX,  blocksY,  blocksZ,
                         threadsX, threadsY, threadsZ,
                         shared_mem_bytes,
                         NULL, // stream
                         args,
                         NULL);
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuLaunchKernel failed (%s)",
                             _get_error_name(err));
        return err;
    }

    #ifdef DEBUG
    err = cuCtxSynchronize();
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuCtxSynchronize failed (%s)\n",
                             _get_error_name(err));
        return err;
    }
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif
    return 0;
}
Пример #5
0
WEAK int halide_copy_to_host(void *user_context, buffer_t* buf) {
    if (!buf->dev_dirty) {
        return 0;
    }

    DEBUG_PRINTF( user_context, "CUDA: halide_copy_to_host (user_context: %p, buf: %p)\n", user_context, buf );

    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    // Need to check dev_dirty again, in case another thread did the
    // copy_to_host before the serialization point above.
    if (buf->dev_dirty) {
        #ifdef DEBUG
        uint64_t t_before = halide_current_time_ns(user_context);
        #endif

        halide_assert(user_context, buf->dev && buf->dev);
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));

        _dev_copy c = _make_dev_to_host_copy(buf);

        for (int w = 0; w < c.extent[3]; w++) {
            for (int z = 0; z < c.extent[2]; z++) {
                for (int y = 0; y < c.extent[1]; y++) {
                    for (int x = 0; x < c.extent[0]; x++) {
                        uint64_t off = (x * c.stride_bytes[0] +
                                        y * c.stride_bytes[1] +
                                        z * c.stride_bytes[2] +
                                        w * c.stride_bytes[3]);
                        CUdeviceptr src = (CUdeviceptr)(c.src + off);
                        void *dst = (void *)(c.dst + off);
                        uint64_t size = c.chunk_size;
                        DEBUG_PRINTF( user_context, "    cuMemcpyDtoH (%d, %d, %d, %d), %p -> %p, %lld bytes\n",
                                      x, y, z, w,
                                      (void *)src, dst, (long long)size );
                        CUresult err = cuMemcpyDtoH(dst, src, size);
                        if (err != CUDA_SUCCESS) {
                            halide_error_varargs(user_context, "CUDA: cuMemcpyDtoH failed (%s)",
                                                 _get_error_name(err));
                            return err;
                        }
                    }
                }
            }
        }

        #ifdef DEBUG
        uint64_t t_after = halide_current_time_ns(user_context);
        halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
        #endif
    }
    buf->dev_dirty = false;
    return 0;
}
Пример #6
0
WEAK int halide_dev_malloc(void *user_context, buffer_t *buf) {
    DEBUG_PRINTF( user_context, "CUDA: halide_dev_malloc (user_context: %p, buf: %p)\n", user_context, buf );

    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    size_t size = _buf_size(user_context, buf);
    if (buf->dev) {
        // This buffer already has a device allocation
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf, size));
        return 0;
    }

    halide_assert(user_context, buf->stride[0] >= 0 && buf->stride[1] >= 0 &&
                                buf->stride[2] >= 0 && buf->stride[3] >= 0);

    DEBUG_PRINTF(user_context, "    allocating buffer of %lld bytes, "
                 "extents: %lldx%lldx%lldx%lld strides: %lldx%lldx%lldx%lld (%d bytes per element)\n",
                 (long long)size,
                 (long long)buf->extent[0], (long long)buf->extent[1],
                 (long long)buf->extent[2], (long long)buf->extent[3],
                 (long long)buf->stride[0], (long long)buf->stride[1],
                 (long long)buf->stride[2], (long long)buf->stride[3],
                 buf->elem_size);

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    CUdeviceptr p;
    DEBUG_PRINTF( user_context, "    cuMemAlloc %lld -> ", size );
    CUresult err = cuMemAlloc(&p, size);
    if (err != CUDA_SUCCESS) {
        DEBUG_PRINTF( user_context, "%s\n", _get_error_name(err));
        halide_error_varargs(user_context, "CUDA: cuMemAlloc failed (%s)",
                             _get_error_name(err));
        return err;
    } else {
        DEBUG_PRINTF( user_context, "%p\n", p );
    }
    halide_assert(user_context, p);
    buf->dev = (uint64_t)p;

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return 0;
}
Пример #7
0
WEAK int halide_dev_malloc(void *user_context, buffer_t* buf) {
    DEBUG_PRINTF( user_context, "CL: halide_dev_malloc (user_context: %p, buf: %p)\n", user_context, buf );

    ClContext ctx(user_context);
    if (ctx.error != CL_SUCCESS) {
        return ctx.error;
    }

    size_t size = _buf_size(user_context, buf);
    if (buf->dev) {
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf, size));
        return 0;
    }

    halide_assert(user_context, buf->stride[0] >= 0 && buf->stride[1] >= 0 &&
                                buf->stride[2] >= 0 && buf->stride[3] >= 0);

    DEBUG_PRINTF(user_context, "    Allocating buffer of %lld bytes, "
                 "extents: %lldx%lldx%lldx%lld strides: %lldx%lldx%lldx%lld (%d bytes per element)\n",
                 (long long)size,
                 (long long)buf->extent[0], (long long)buf->extent[1],
                 (long long)buf->extent[2], (long long)buf->extent[3],
                 (long long)buf->stride[0], (long long)buf->stride[1],
                 (long long)buf->stride[2], (long long)buf->stride[3],
                 buf->elem_size);

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    cl_int err;
    DEBUG_PRINTF( user_context, "    clCreateBuffer -> ", size );
    buf->dev = (uint64_t)clCreateBuffer(ctx.context, CL_MEM_READ_WRITE, size, NULL, &err);
    if (err != CL_SUCCESS || buf->dev == 0) {
        DEBUG_PRINTF( user_context, "%d\n", err);
        halide_error_varargs(user_context, "CL: clCreateBuffer failed (%d)\n", err);
        return err;
    } else {
        DEBUG_PRINTF( user_context, "%p\n", (cl_mem)buf->dev );
    }

    DEBUG_PRINTF(user_context, "    Allocated device buffer %p for buffer %p\n",
                 (void *)buf->dev, buf);

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return CL_SUCCESS;
}
Пример #8
0
WEAK void halide_dev_run(
    const char* entry_name,
    int blocksX, int blocksY, int blocksZ,
    int threadsX, int threadsY, int threadsZ,
    int shared_mem_bytes,
    size_t arg_sizes[],
    void* args[])
{
    CUfunction f = __get_kernel(entry_name);

    #ifdef DEBUG
    char msg[256];
    snprintf(
        msg, 256,
        "dev_run %s with (%dx%dx%d) blks, (%dx%dx%d) threads, %d shmem (t=%lld)",
        entry_name, blocksX, blocksY, blocksZ, threadsX, threadsY, threadsZ, shared_mem_bytes,
        (long long)halide_current_time_ns()
    );
    #endif

    TIME_CALL(
        cuLaunchKernel(
            f,
            blocksX,  blocksY,  blocksZ,
            threadsX, threadsY, threadsZ,
            shared_mem_bytes,
            NULL, // stream
            args,
            NULL
        ),
        msg
    );
}
Пример #9
0
WEAK int halide_init_kernels(void *user_context, void **state_ptr, const char* ptx_src, int size) {
    DEBUG_PRINTF( user_context, "CUDA: halide_init_kernels (user_context: %p, state_ptr: %p, ptx_src: %p, %i)\n",
                  user_context, state_ptr, ptx_src, size );

    CudaContext ctx(user_context);
    if (ctx.error != 0) {
        return ctx.error;
    }

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    // Create the state object if necessary. This only happens once, regardless
    // of how many times halide_init_kernels/halide_release is called.
    // halide_release traverses this list and releases the module objects, but
    // it does not modify the list nodes created/inserted here.
    module_state **state = (module_state**)state_ptr;
    if (!(*state)) {
        *state = (module_state*)malloc(sizeof(module_state));
        (*state)->module = NULL;
        (*state)->next = state_list;
        state_list = *state;
    }

    // Create the module itself if necessary.
    if (!(*state)->module) {
        DEBUG_PRINTF( user_context, "    cuModuleLoadData %p, %i -> ", ptx_src, size );
        CUmodule module;
        CUresult err = cuModuleLoadData(&(*state)->module, ptx_src);
        if (err != CUDA_SUCCESS) {
            DEBUG_PRINTF( user_context, "%s\n", _get_error_name(err) );
            halide_error_varargs(user_context, "CUDA: cuModuleLoadData failed (%s)",
                                 _get_error_name(err));
            return err;
        } else {
            DEBUG_PRINTF( user_context, "%p\n", module );
        }
    }

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return 0;
}
Пример #10
0
WEAK void sampling_profiler_thread(void *) {
    halide_profiler_state *s = halide_profiler_get_state();

    // grab the lock
    halide_mutex_lock(&s->lock);

    while (s->current_func != halide_profiler_please_stop) {

        uint64_t t1 = halide_current_time_ns(NULL);
        uint64_t t = t1;
        while (1) {
            int func, active_threads;
            if (s->get_remote_profiler_state) {
                // Execution has disappeared into remote code running
                // on an accelerator (e.g. Hexagon DSP)
                s->get_remote_profiler_state(&func, &active_threads);
            } else {
                func = s->current_func;
                active_threads = s->active_threads;
            }
            uint64_t t_now = halide_current_time_ns(NULL);
            if (func == halide_profiler_please_stop) {
                break;
            } else if (func >= 0) {
                // Assume all time since I was last awake is due to
                // the currently running func.
                bill_func(s, func, t_now - t, active_threads);
            }
            t = t_now;

            // Release the lock, sleep, reacquire.
            int sleep_ms = s->sleep_time;
            halide_mutex_unlock(&s->lock);
            halide_sleep_ms(NULL, sleep_ms);
            halide_mutex_lock(&s->lock);
        }
    }

    s->started = false;

    halide_mutex_unlock(&s->lock);
}
Пример #11
0
WEAK int halide_dev_free(void *user_context, buffer_t* buf) {
    // halide_dev_free, at present, can be exposed to clients and they
    // should be allowed to call halide_dev_free on any buffer_t
    // including ones that have never been used with a GPU.
    if (buf->dev == 0) {
        return 0;
    }

    DEBUG_PRINTF( user_context, "CUDA: halide_dev_free (user_context: %p, buf: %p)\n", user_context, buf );

    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS)
        return ctx.error;

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));

    DEBUG_PRINTF( user_context, "    cuMemFree %p\n", buf->dev );
    CUresult err = cuMemFree(buf->dev);
    // If cuMemFree fails, it isn't likely to succeed later, so just drop
    // the reference.
    buf->dev = 0;
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuMemFree failed (%s)",
                             _get_error_name(err));
        return err;
    }

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return 0;
}
Пример #12
0
WEAK void halide_copy_to_dev(buffer_t* buf) {
    if (buf->host_dirty) {
        halide_assert(buf->host && buf->dev);
        size_t size = buf_size(buf);
        #ifdef DEBUG
        char msg[256];
        snprintf(msg, 256, "copy_to_dev (%zu bytes) %p -> %p (t=%lld)",
                 size, buf->host, (void*)buf->dev, (long long)halide_current_time_ns() );
        halide_assert(halide_validate_dev_pointer(buf));
        #endif
        TIME_CALL( cuMemcpyHtoD(buf->dev, buf->host, size), msg );
    }
    buf->host_dirty = false;
}
Пример #13
0
static CUfunction __get_kernel(const char* entry_name)
{
    CUfunction f;

    #ifdef DEBUG
    char msg[256];
    snprintf(msg, 256, "get_kernel %s (t=%lld)", entry_name, (long long)halide_current_time_ns() );
    #endif

    // Get kernel function ptr
    TIME_CALL( cuModuleGetFunction(&f, __mod, entry_name), msg );

    return f;
}
Пример #14
0
// Used to generate correct timings when tracing
WEAK int halide_dev_sync(void *user_context) {
    DEBUG_PRINTF( user_context, "CL: halide_dev_sync (user_context: %p)\n", user_context );

    ClContext ctx(user_context);
    halide_assert(user_context, ctx.error == CL_SUCCESS);

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    cl_int err = clFinish(ctx.cmd_queue);
    if (err != CL_SUCCESS) {
        halide_error_varargs(user_context, "CL: clFinish failed (%d)\n", err);
        return err;
    }

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return CL_SUCCESS;
}
Пример #15
0
WEAK void halide_dev_run(
    const char* entry_name,
    int blocksX, int blocksY, int blocksZ,
    int threadsX, int threadsY, int threadsZ,
    int shared_mem_bytes,
    size_t arg_sizes[],
    void* args[])
{
    cl_kernel f = __get_kernel(entry_name);
    #ifndef DEBUG
    char msg[1];
    #else
    char msg[256];
    snprintf(
        msg, 256,
        "dev_run %s with (%dx%dx%d) blks, (%dx%dx%d) threads, %d shmem (t=%lld)",
        entry_name, blocksX, blocksY, blocksZ, threadsX, threadsY, threadsZ, shared_mem_bytes,
        (long long)halide_current_time_ns()
    );
    #endif
    // Pack dims
    size_t global_dim[3] = {blocksX*threadsX,  blocksY*threadsY,  blocksZ*threadsZ};
    size_t local_dim[3] = {threadsX, threadsY, threadsZ};

    // Set args
    int i = 0;
    while (arg_sizes[i] != 0) {
        CHECK_CALL( clSetKernelArg(f, i, arg_sizes[i], args[i]), "clSetKernelArg" );
        i++;
    }
    // Set the shared mem buffer last
    // Always set at least 1 byte of shmem, to keep the launch happy
    CHECK_CALL( clSetKernelArg(f, i, (shared_mem_bytes > 0) ? shared_mem_bytes : 1, NULL), "clSetKernelArg" );

    // Launch kernel
    TIME_START();
    int err =
    clEnqueueNDRangeKernel(
        cl_q,
        f,
        3,
        NULL,
        global_dim,
        local_dim,
        0, NULL, NULL
    );
    CHECK_ERR(err, "clEnqueueNDRangeKernel");
    TIME_CHECK(msg);
}
Пример #16
0
static cl_kernel __get_kernel(const char* entry_name) {
    cl_kernel f;
    #ifndef DEBUG
    // char msg[1];
    #else
    char msg[256];
    snprintf(msg, 256, "get_kernel %s (t=%lld)",
             entry_name, (long long)halide_current_time_ns() );
    #endif
    // Get kernel function ptr
    TIME_START();
    int err;
    f = clCreateKernel(__mod, entry_name, &err);
    CHECK_ERR(err, "clCreateKernel");
    TIME_CHECK(msg);
    return f;
}
Пример #17
0
WEAK void halide_copy_to_dev(buffer_t* buf) {
    if (buf->host_dirty) {
        halide_assert(buf->host && buf->dev);
        size_t size = __buf_size(buf);
        #ifdef DEBUG
        char msg[256];
        snprintf(msg, 256, "copy_to_dev (%lld bytes) %p -> %p (t=%lld)",
                 (long long)size, buf->host, (void*)buf->dev, (long long)halide_current_time_ns() );
        #endif
        halide_assert(halide_validate_dev_pointer(buf));
        TIME_START();
        int err = clEnqueueWriteBuffer( cl_q, (cl_mem)((void*)buf->dev), CL_TRUE, 0, size, buf->host, 0, NULL, NULL );
        CHECK_ERR( err, msg );
        TIME_CHECK(msg);
    }
    buf->host_dirty = false;
}
Пример #18
0
static cl_mem __dev_malloc(size_t bytes) {
    cl_mem p;
    #ifndef DEBUG
    // char msg[1];
    #else
    char msg[256];
    snprintf(msg, 256, "dev_malloc (%lld bytes) (t=%lld)",
             (long long)bytes, (long long)halide_current_time_ns() );
    #endif
    TIME_START();
    int err;
    p = clCreateBuffer(cl_ctx, CL_MEM_READ_WRITE, bytes, NULL, &err );
    TIME_CHECK(msg);
    #ifdef DEBUG
    halide_printf("    returned: %p (err: %d)\n", (void*)p, err);
    #endif
    halide_assert(p);
    return p;
}
Пример #19
0
WEAK void halide_init_kernels(void *user_context, const char* src, int size) {
    int err;
    cl_device_id dev;
    // Initialize one shared context for all Halide compiled instances
    if (!(*cl_ctx)) {
        const cl_uint maxPlatforms = 4;
        cl_platform_id platforms[maxPlatforms];
        cl_uint platformCount = 0;

        err = clGetPlatformIDs( maxPlatforms, platforms, &platformCount );
        CHECK_ERR( err, "clGetPlatformIDs" );

        cl_platform_id platform = NULL;

        // Find the requested platform, or the first if none specified.
        const char * name = getenv("HL_OCL_PLATFORM");
        if (name != NULL) {
            for (cl_uint i = 0; i < platformCount; ++i) {
                const cl_uint maxPlatformName = 256;
                char platformName[maxPlatformName];
                err = clGetPlatformInfo( platforms[i], CL_PLATFORM_NAME, maxPlatformName, platformName, NULL );
                if (err != CL_SUCCESS) continue;

                if (strstr(platformName, name))
                {
                    platform = platforms[i];
                    break;
                }
            }
        } else if (platformCount > 0) {
            platform = platforms[0];
        }
        if (platform == NULL){
            halide_printf(user_context, "Failed to find OpenCL platform\n");
            return;
        }

        #ifdef DEBUG
        const cl_uint maxPlatformName = 256;
        char platformName[maxPlatformName];
        err = clGetPlatformInfo( platform, CL_PLATFORM_NAME, maxPlatformName, platformName, NULL );
        CHECK_ERR( err, "clGetPlatformInfo" );

        halide_printf(user_context, "Got platform '%s', about to create context (t=%lld)\n",
                      platformName, (long long)halide_current_time_ns(user_context));
        #endif

        cl_device_type device_type = 0;
        // Find the device types requested.
        const char * dev_type = getenv("HL_OCL_DEVICE");
        if (dev_type != NULL) {
            if (strstr("cpu", dev_type))
                device_type |= CL_DEVICE_TYPE_CPU;
            if (strstr("gpu", dev_type))
                device_type |= CL_DEVICE_TYPE_GPU;
        } 
        // If no devices are specified yet, just use all.
        if (device_type == 0)
            device_type = CL_DEVICE_TYPE_ALL;
        
        // Make sure we have a device
        const cl_uint maxDevices = 4;
        cl_device_id devices[maxDevices];
        cl_uint deviceCount = 0;
        err = clGetDeviceIDs( platform, device_type, maxDevices, devices, &deviceCount );
        CHECK_ERR( err, "clGetDeviceIDs" );
        if (deviceCount == 0) {
            halide_printf(user_context, "Failed to get device\n");
            return;
        }

        dev = devices[deviceCount-1];

        #ifdef DEBUG
        const cl_uint maxDeviceName = 256;
        char deviceName[maxDeviceName];
        err = clGetDeviceInfo( dev, CL_DEVICE_NAME, maxDeviceName, deviceName, NULL );
        CHECK_ERR( err, "clGetDeviceInfo" );

        halide_printf(user_context, "Got device '%s', about to create context (t=%lld)\n",
                      deviceName, (long long)halide_current_time_ns(user_context));
        #endif


        // Create context
        cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };
        *cl_ctx = clCreateContext(properties, 1, &dev, NULL, NULL, &err);
        CHECK_ERR( err, "clCreateContext" );
        // cuEventCreate(&__start, 0);
        // cuEventCreate(&__end, 0);

        halide_assert(user_context, !(*cl_q));
        *cl_q = clCreateCommandQueue(*cl_ctx, dev, 0, &err);
        CHECK_ERR( err, "clCreateCommandQueue" );
    } else {
        #ifdef DEBUG
        halide_printf(user_context, "Already had context %p\n", *cl_ctx);
        #endif

        // Maintain ref count of context.
        CHECK_CALL( clRetainContext(*cl_ctx), "clRetainContext" );
        CHECK_CALL( clRetainCommandQueue(*cl_q), "clRetainCommandQueue" );

        CHECK_CALL( clGetContextInfo(*cl_ctx, CL_CONTEXT_DEVICES, sizeof(dev), &dev, NULL), "clGetContextInfo" );
    }

    // Initialize a module for just this Halide module
    if ((!__mod) && (size > 1)) {
        // Create module

        cl_device_id devices[] = { dev };
        size_t lengths[] = { size };

        if (strstr(src, "/*OpenCL C*/")) {
            // Program is OpenCL C.

            #ifdef DEBUG
            halide_printf(user_context, "Compiling OpenCL C kernel: %s\n\n", src);
            #endif

            const char * sources[] = { src };
            __mod = clCreateProgramWithSource(*cl_ctx, 1, &sources[0], NULL, &err );
            CHECK_ERR( err, "clCreateProgramWithSource" );
        } else {
            // Program is SPIR binary.

            #ifdef DEBUG
            halide_printf(user_context, "Compiling SPIR kernel (%i bytes)\n", size);
            #endif

            const unsigned char * binaries[] = { (unsigned char *)src };
            __mod = clCreateProgramWithBinary(*cl_ctx, 1, devices, lengths, &binaries[0], NULL, &err );
            CHECK_ERR( err, "clCreateProgramWithBinary" );
        }

        err = clBuildProgram( __mod, 1, &dev, NULL, NULL, NULL );
        if (err != CL_SUCCESS) {
            size_t len;
            char buffer[2048];

            halide_printf(user_context, "Error: Failed to build program executable! err = %d\n", err);
            if (clGetProgramBuildInfo(__mod, dev, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len) == CL_SUCCESS)
                halide_printf(user_context, "Build Log:\n %s\n-----\n", buffer);
            else
                halide_printf(user_context, "clGetProgramBuildInfo failed to get build log!\n");
            halide_assert(user_context, err == CL_SUCCESS);
        }
    }
}
Пример #20
0
WEAK void halide_init_kernels(const char* src, int size) {
    int err;
    cl_device_id dev;
    // Initialize one shared context for all Halide compiled instances
    if (!cl_ctx) {
        const cl_uint maxPlatforms = 4;
        cl_platform_id platforms[maxPlatforms];
        cl_uint platformCount = 0;

        err = clGetPlatformIDs( maxPlatforms, platforms, &platformCount );
        CHECK_ERR( err, "clGetPlatformIDs" );

        cl_platform_id platform = NULL;

        const char * name = get_opencl_platform();
        if (name != NULL) {
            for (cl_uint i = 0; i < platformCount; ++i) {
                const cl_uint maxPlatformName = 256;
                char platformName[maxPlatformName];
                err = clGetPlatformInfo( platforms[i], CL_PLATFORM_NAME, maxPlatformName, platformName, NULL );
                if (err != CL_SUCCESS) continue;

                if (strstr(platformName, name))
                {
                    platform = platforms[i];
                    break;
                }
            }
        } else if (platformCount > 0) {
            platform = platforms[0];
        }
        if (platform == NULL){
            halide_printf("Failed to find OpenCL platform\n");
            return;
        }

        #ifdef DEBUG
        const cl_uint maxPlatformName = 256;
        char platformName[maxPlatformName];
        err = clGetPlatformInfo( platform, CL_PLATFORM_NAME, maxPlatformName, platformName, NULL );
        CHECK_ERR( err, "clGetPlatformInfo" );

        halide_printf("Got platform '%s', about to create context (t=%lld)\n",
                      platformName, (long long)halide_current_time_ns());
        #endif

        // Make sure we have a device
        const cl_uint maxDevices = 4;
        cl_device_id devices[maxDevices];
        cl_uint deviceCount = 0;
        err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_ALL, maxDevices, devices, &deviceCount );
        CHECK_ERR( err, "clGetDeviceIDs" );
        if (deviceCount == 0) {
            halide_printf("Failed to get device\n");
            return;
        }

        dev = devices[deviceCount-1];

        #ifdef DEBUG
        const cl_uint maxDeviceName = 256;
        char deviceName[maxDeviceName];
        err = clGetDeviceInfo( dev, CL_DEVICE_NAME, maxDeviceName, deviceName, NULL );
        CHECK_ERR( err, "clGetDeviceInfo" );

        halide_printf("Got device '%s', about to create context (t=%lld)\n",
                      deviceName, (long long)halide_current_time_ns());
        #endif


        // Create context
        cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };
        cl_ctx = clCreateContext(properties, 1, &dev, NULL, NULL, &err);
        CHECK_ERR( err, "clCreateContext" );
        // cuEventCreate(&__start, 0);
        // cuEventCreate(&__end, 0);

        halide_assert(!cl_q);
        cl_q = clCreateCommandQueue(cl_ctx, dev, 0, &err);
        CHECK_ERR( err, "clCreateCommandQueue" );
    } else {
        // Maintain ref count of context.
        clRetainContext(cl_ctx);
        clRetainCommandQueue(cl_q);
    }

    // Initialize a module for just this Halide module
    if ((!__mod) && (size > 1)) {
        #ifdef DEBUG
        halide_printf("Compiling kernel (%i bytes)\n", size);
        #endif

        // Create module

        cl_device_id devices[] = { dev };
        size_t lengths[] = { size };

        if (strstr(src, "/*OpenCL C*/")) {
            // Program is OpenCL C.
            const char * sources[] = { src };
            __mod = clCreateProgramWithSource(cl_ctx, 1, &sources[0], NULL, &err );
            CHECK_ERR( err, "clCreateProgramWithSource" );
        } else {
            // Program is SPIR binary.
            const unsigned char * binaries[] = { (unsigned char *)src };
            __mod = clCreateProgramWithBinary(cl_ctx, 1, devices, lengths, &binaries[0], NULL, &err );
            CHECK_ERR( err, "clCreateProgramWithBinary" );
        }

        err = clBuildProgram( __mod, 1, &dev, NULL, NULL, NULL );
        if (err != CL_SUCCESS) {
            size_t len;
            char buffer[2048];

            halide_printf("Error: Failed to build program executable! err = %d\n", err);
            if (clGetProgramBuildInfo(__mod, dev, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len) == CL_SUCCESS)
                halide_printf("%s\n", buffer);
            else
                halide_printf("clGetProgramBuildInfo failed to get build log!\n");
            halide_assert(err == CL_SUCCESS);
        }
    }
}
Пример #21
0
// Override halide's print to use logd and also print the time.
extern "C" void halide_print(void *, const char *msg) {
    static int64_t t0 = halide_current_time_ns();
    int64_t t1 = halide_current_time_ns();
    LOGD("%d: %s\n", (int)(t1 - t0)/1000000, msg);
    t0 = t1;
}
Пример #22
0
JNIEXPORT bool JNICALL Java_com_example_helloandroidcamera2_JNIUtils_edgeDetect(
    JNIEnv *env, jobject obj, jint srcWidth, jint srcHeight,
    jobject srcLumaByteBuffer, jint srcLumaRowStrideBytes, jobject dstSurface) {
    uint8_t *srcLumaPtr = reinterpret_cast<uint8_t *>(
        env->GetDirectBufferAddress(srcLumaByteBuffer));
    if (srcLumaPtr == NULL) {
        return false;
    }

    ANativeWindow *win = ANativeWindow_fromSurface(env, dstSurface);
    ANativeWindow_acquire(win);

    ANativeWindow_Buffer buf;
    if (int err = ANativeWindow_lock(win, &buf, NULL)) {
        LOGE("ANativeWindow_lock failed with error code %d\n", err);
        ANativeWindow_release(win);
        return false;
    }

    ANativeWindow_setBuffersGeometry(win, srcWidth, srcHeight, 0 /*format unchanged*/);

    uint8_t *dstLumaPtr = reinterpret_cast<uint8_t *>(buf.bits);
    if (dstLumaPtr == NULL) {
        ANativeWindow_unlockAndPost(win);
        ANativeWindow_release(win);
        return false;
    }

    if (buf.format != IMAGE_FORMAT_YV12) {
        LOGE("ANativeWindow buffer locked but its format was not YV12.");
        ANativeWindow_unlockAndPost(win);
        ANativeWindow_release(win);
        return false;
    }

    if (!checkBufferSizesMatch(srcWidth, srcHeight, &buf)) {
        LOGE("ANativeWindow buffer locked but its size was %d x %d, expected "
                "%d x %d", buf.width, buf.height, srcWidth, srcHeight);
        ANativeWindow_unlockAndPost(win);
        ANativeWindow_release(win);
        return false;
    }

    uint32_t dstLumaSizeBytes = buf.stride * buf.height;
    uint32_t dstChromaRowStrideBytes = ALIGN(buf.stride / 2, 16);
    // Size of one chroma plane.
    uint32_t dstChromaSizeBytes = dstChromaRowStrideBytes * buf.height / 2;
    uint8_t *dstChromaVPtr = dstLumaPtr + dstLumaSizeBytes;
    uint8_t *dstChromaUPtr = dstLumaPtr + dstLumaSizeBytes + dstChromaSizeBytes;

    // Make these static so that we can reuse device allocations across frames.
    // It doesn't matter now, but useful for GPU backends.
    static buffer_t srcBuf = { 0 };
    static buffer_t dstBuf = { 0 };
    static buffer_t dstChromaBuf = { 0 };

    srcBuf.host = srcLumaPtr;
    srcBuf.host_dirty = true;
    srcBuf.extent[0] = srcWidth;
    srcBuf.extent[1] = srcHeight;
    srcBuf.extent[2] = 0;
    srcBuf.extent[3] = 0;
    srcBuf.stride[0] = 1;
    srcBuf.stride[1] = srcLumaRowStrideBytes;
    srcBuf.min[0] = 0;
    srcBuf.min[1] = 0;
    srcBuf.elem_size = 1;

    dstBuf.host = dstLumaPtr;
    dstBuf.extent[0] = buf.width;  // src and dst width/height actually match.
    dstBuf.extent[1] = buf.height;
    dstBuf.extent[2] = 0;
    dstBuf.extent[3] = 0;
    dstBuf.stride[0] = 1;
    dstBuf.stride[1] = buf.stride;  // src and dst strides actually match.
    dstBuf.min[0] = 0;
    dstBuf.min[1] = 0;
    dstBuf.elem_size = 1;

    static bool first_call = true;
    static unsigned counter = 0;
    static unsigned times[16];
    if (first_call) {
        LOGD("According to Halide, host system has %d cpus\n",
             halide_host_cpu_count());
        first_call = false;
        for (int t = 0; t < 16; t++) {
            times[t] = 0;
        }
    }

    // Set chrominance to 128 to appear grayscale.
    // The dst chroma is guaranteed to be tightly packed since it's YV12.
    memset(dstChromaVPtr, 128, dstChromaSizeBytes * 2);

    int64_t t1 = halide_current_time_ns();
    int err = edge_detect(&srcBuf, &dstBuf);
    if (err != halide_error_code_success) {
        LOGE("edge_detect failed with error code: %d", err);
    }

    int64_t t2 = halide_current_time_ns();
    unsigned elapsed_us = (t2 - t1) / 1000;

    times[counter & 15] = elapsed_us;
    counter++;
    unsigned min = times[0];
    for (int i = 1; i < 16; i++) {
        if (times[i] < min) {
            min = times[i];
        }
    }
    LOGD("Time taken: %d us (minimum: %d us)", elapsed_us, min);

    ANativeWindow_unlockAndPost(win);
    ANativeWindow_release(win);

    return (err != halide_error_code_success);
}
Пример #23
0
JNIEXPORT void JNICALL Java_com_example_hellohalide_CameraPreview_processFrame(
    JNIEnv *env, jobject obj, jbyteArray jSrc, jint j_w, jint j_h, jobject surf) {

    const int w = j_w, h = j_h;

    halide_set_error_handler(handler);

    unsigned char *src = (unsigned char *)env->GetByteArrayElements(jSrc, NULL);
    if (!src) {
        LOGD("src is null\n");
        return;
    }

    ANativeWindow *win = ANativeWindow_fromSurface(env, surf);
    ANativeWindow_acquire(win);

    static bool first_call = true;
    static unsigned counter = 0;
    static unsigned times[16];
    if (first_call) {
        LOGD("According to Halide, host system has %d cpus\n", halide_host_cpu_count());
        LOGD("Resetting buffer format");
        ANativeWindow_setBuffersGeometry(win, w, h, 0);
        first_call = false;
        for (int t = 0; t < 16; t++) times[t] = 0;
    }

    ANativeWindow_Buffer buf;
    ARect rect = {0, 0, w, h};

    if (int err = ANativeWindow_lock(win, &buf, NULL)) {
        LOGD("ANativeWindow_lock failed with error code %d\n", err);
        return;
    }

    uint8_t *dst = (uint8_t *)buf.bits;

    // If we're using opencl, use the gpu backend for it.
    halide_set_ocl_device_type("gpu");

    // Make these static so that we can reuse device allocations across frames.
    static buffer_t srcBuf = {0};
    static buffer_t dstBuf = {0};

    if (dst) {
        srcBuf.host = (uint8_t *)src;
        srcBuf.host_dirty = true;
        srcBuf.extent[0] = w;
        srcBuf.extent[1] = h;
        srcBuf.extent[2] = 0;
        srcBuf.extent[3] = 0;
        srcBuf.stride[0] = 1;
        srcBuf.stride[1] = w;
        srcBuf.min[0] = 0;
        srcBuf.min[1] = 0;
        srcBuf.elem_size = 1;

        dstBuf.host = dst;
        dstBuf.extent[0] = w;
        dstBuf.extent[1] = h;
        dstBuf.extent[2] = 0;
        dstBuf.extent[3] = 0;
        dstBuf.stride[0] = 1;
        dstBuf.stride[1] = w;
        dstBuf.min[0] = 0;
        dstBuf.min[1] = 0;
        dstBuf.elem_size = 1;

        // Just copy over chrominance untouched
        memcpy(dst + w*h, src + w*h, (w*h)/2);

        int64_t t1 = halide_current_time_ns();
        halide_generated(&srcBuf, &dstBuf);

        if (dstBuf.dev) {
            halide_copy_to_host(NULL, &dstBuf);
        }

        int64_t t2 = halide_current_time_ns();
        unsigned elapsed_us = (t2 - t1)/1000;


        times[counter & 15] = elapsed_us;
        counter++;
        unsigned min = times[0];
        for (int i = 1; i < 16; i++) {
            if (times[i] < min) min = times[i];
        }
        LOGD("Time taken: %d (%d)", elapsed_us, min);
    }

    ANativeWindow_unlockAndPost(win);
    ANativeWindow_release(win);

    env->ReleaseByteArrayElements(jSrc, (jbyte *)src, 0);
}
Пример #24
0
WEAK void halide_init_kernels(const char* ptx_src) {
    // If the context pointer isn't hooked up yet, point it at this module's weak-linkage context.
    if (cuda_ctx_ptr == NULL) {
        cuda_ctx_ptr = &weak_cuda_ctx;
    }

    // Initialize one shared context for all Halide compiled instances
    if (*cuda_ctx_ptr == 0) {
        // Initialize CUDA
        CHECK_CALL( cuInit(0), "cuInit" );

        // Make sure we have a device
        int deviceCount = 0;
        CHECK_CALL( cuDeviceGetCount(&deviceCount), "cuDeviceGetCount" );
        halide_assert(deviceCount > 0);

        char *device_str = getenv("HL_GPU_DEVICE");

        CUdevice dev;
        // Get device
        CUresult status;
        if (device_str) {
            status = cuDeviceGet(&dev, atoi(device_str));
        } else {
            for (int id = 2; id >= 0; id--) {
                // Try to get a device >0 first, since 0 should be our display device
                status = cuDeviceGet(&dev, id);
                if (status == CUDA_SUCCESS) break;
            }
        }

        if (status != CUDA_SUCCESS) {
            halide_printf("Failed to get device\n");
            exit(-1);
        }

        #ifdef DEBUG
        halide_printf("Got device %d, about to create context (t=%lld)\n", dev, (long long)halide_current_time_ns());
        #endif


        // Create context
        CHECK_CALL( cuCtxCreate(cuda_ctx_ptr, 0, dev), "cuCtxCreate" );

    } else {
        //CHECK_CALL( cuCtxPushCurrent(*cuda_ctx_ptr), "cuCtxPushCurrent" );
    }

    // Initialize a module for just this Halide module
    if (!__mod) {
        // Create module
        CHECK_CALL( cuModuleLoadData(&__mod, ptx_src), "cuModuleLoadData" );

        #ifdef DEBUG
        halide_printf("-------\nCompiling PTX:\n%s\n--------\n", ptx_src);
        #endif
    }

    // Create two events for timing
    if (!__start) {
        cuEventCreate(&__start, 0);
        cuEventCreate(&__end, 0);
    }
}
Пример #25
0
// Initializes the context used by the default implementation
// of halide_acquire_context.
static int create_context(void *user_context, cl_context *ctx, cl_command_queue *q) {
    DEBUG_PRINTF( user_context, "    create_context (user_context: %p)\n", user_context );

    halide_assert(user_context, ctx != NULL && *ctx == NULL);
    halide_assert(user_context, q != NULL && *q == NULL);

    cl_int err = 0;

    const cl_uint maxPlatforms = 4;
    cl_platform_id platforms[maxPlatforms];
    cl_uint platformCount = 0;

    err = clGetPlatformIDs( maxPlatforms, platforms, &platformCount );
    if (err != CL_SUCCESS) {
        halide_error_varargs(user_context, "CL: clGetPlatformIDs failed (%d)\n", err);
        return err;
    }

    cl_platform_id platform = NULL;

    // Find the requested platform, or the first if none specified.
    const char * name = getenv("HL_OCL_PLATFORM_NAME");
    if (name != NULL) {
        for (cl_uint i = 0; i < platformCount; ++i) {
            const cl_uint maxPlatformName = 256;
            char platformName[maxPlatformName];
            err = clGetPlatformInfo( platforms[i], CL_PLATFORM_NAME, maxPlatformName, platformName, NULL );
            if (err != CL_SUCCESS) continue;

            // A platform matches the request if it is a substring of the platform name.
            if (strstr(platformName, name)) {
                platform = platforms[i];
                break;
            }
        }
    } else if (platformCount > 0) {
        platform = platforms[0];
    }
    if (platform == NULL){
        halide_error(user_context, "CL: Failed to find platform\n");
        return CL_INVALID_PLATFORM;
    }

    #ifdef DEBUG
    const cl_uint maxPlatformName = 256;
    char platformName[maxPlatformName];
    err = clGetPlatformInfo( platform, CL_PLATFORM_NAME, maxPlatformName, platformName, NULL );
    if (err != CL_SUCCESS) {
        halide_printf(user_context, "    clGetPlatformInfo(CL_PLATFORM_NAME) failed (%d)\n", err);
        // This is just debug info, report the error but don't fail context creation due to it.
        //return err;
    } else {
        halide_printf(user_context, "    Got platform '%s', about to create context (t=%lld)\n",
                      platformName, (long long)halide_current_time_ns(user_context));
    }
    #endif

    // Get the types of devices requested.
    cl_device_type device_type = 0;
    const char * dev_type = getenv("HL_OCL_DEVICE_TYPE");
    if (dev_type != NULL) {
        if (strstr("cpu", dev_type)) {
            device_type |= CL_DEVICE_TYPE_CPU;
        }
        if (strstr("gpu", dev_type)) {
            device_type |= CL_DEVICE_TYPE_GPU;
        }
    }
    // If no device types are specified, use all the available
    // devices.
    if (device_type == 0) {
        device_type = CL_DEVICE_TYPE_ALL;
    }

    // Get all the devices of the specified type.
    const cl_uint maxDevices = 4;
    cl_device_id devices[maxDevices];
    cl_uint deviceCount = 0;
    err = clGetDeviceIDs( platform, device_type, maxDevices, devices, &deviceCount );
    if (err != CL_SUCCESS) {
        halide_error_varargs(user_context, "CL: clGetDeviceIDs failed (%d)\n", err);
        return err;
    }

    // If the user indicated a specific device index to use, use
    // that. Note that this is an index within the set of devices
    // specified by the device type.
    char *device_str = getenv("HL_GPU_DEVICE");
    cl_uint device = deviceCount - 1;
    if (device_str) {
        device = atoi(device_str);
    }

    if (device >= deviceCount) {
        halide_error_varargs(user_context, "CL: Failed to get device %i\n", device);
        return CL_DEVICE_NOT_FOUND;
    }

    cl_device_id dev = devices[device];

    #ifdef DEBUG
    const cl_uint maxDeviceName = 256;
    char deviceName[maxDeviceName];
    err = clGetDeviceInfo( dev, CL_DEVICE_NAME, maxDeviceName, deviceName, NULL );
    if (err != CL_SUCCESS) {
        halide_printf(user_context, "    clGetDeviceInfo(CL_DEVICE_NAME) failed (%d)\n", err);
        // This is just debug info, report the error but don't fail context create if it fails.
        //return err;
    } else {
        halide_printf(user_context, "    Got device '%s'\n", deviceName);
    }
    #endif


    // Create context and command queue.
    cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };
    DEBUG_PRINTF( user_context, "    clCreateContext -> " );
    *ctx = clCreateContext(properties, 1, &dev, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        DEBUG_PRINTF( user_context, "%d", err);
        halide_error_varargs(user_context, "CL: clCreateContext failed (%d)\n", err);
        return err;
    } else {
        DEBUG_PRINTF( user_context, "%p\n", *ctx );
    }

    DEBUG_PRINTF(user_context, "    clCreateCommandQueue ");
    *q = clCreateCommandQueue(*ctx, dev, 0, &err);
    if (err != CL_SUCCESS) {
        DEBUG_PRINTF( user_context, "%d", err );
        halide_error_varargs(user_context, "CL: clCreateCommandQueue failed (%d)\n", err);
        return err;
    } else {
        DEBUG_PRINTF( user_context, "%p\n", *q );
    }

    return err;
}
Пример #26
0
WEAK int halide_init_kernels(void *user_context, void **state_ptr, const char* src, int size) {
    DEBUG_PRINTF( user_context, "CL: halide_init_kernels (user_context: %p, state_ptr: %p, program: %p, %i)\n",
                  user_context, state_ptr, src, size );

    ClContext ctx(user_context);
    if (ctx.error != CL_SUCCESS) {
        return ctx.error;
    }

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    // Create the state object if necessary. This only happens once, regardless
    // of how many times halide_init_kernels/halide_release is called.
    // halide_release traverses this list and releases the program objects, but
    // it does not modify the list nodes created/inserted here.
    module_state **state = (module_state**)state_ptr;
    if (!(*state)) {
        *state = (module_state*)malloc(sizeof(module_state));
        (*state)->program = NULL;
        (*state)->next = state_list;
        state_list = *state;
    }

    // Create the program if necessary. TODO: The program object needs to not
    // only already exist, but be created for the same context/device as the
    // calling context/device.
    if (!(*state && (*state)->program) && size > 1) {
        cl_int err = 0;
        cl_device_id dev;

        err = clGetContextInfo(ctx.context, CL_CONTEXT_DEVICES, sizeof(dev), &dev, NULL);
        if (err != CL_SUCCESS) {
            halide_error_varargs(user_context, "CL: clGetContextInfo(CL_CONTEXT_DEVICES) failed (%d)\n", err);
            return err;
        }

        cl_device_id devices[] = { dev };
        size_t lengths[] = { size };

        // Get the max constant buffer size supported by this OpenCL implementation.
        cl_ulong max_constant_buffer_size = 0;
        err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(max_constant_buffer_size), &max_constant_buffer_size, NULL);
        if (err != CL_SUCCESS) {
            halide_error_varargs(user_context, "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) failed (%d)\n", err);
            return err;
        }
        // Get the max number of constant arguments supported by this OpenCL implementation.
        cl_uint max_constant_args = 0;
        err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_ARGS, sizeof(max_constant_args), &max_constant_args, NULL);
        if (err != CL_SUCCESS) {
            halide_error_varargs(user_context, "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_ARGS) failed (%d)\n", err);
            return err;
        }

        // Build the compile argument options.
        char options[256];
        snprintf(options, sizeof(options),
                 "-D MAX_CONSTANT_BUFFER_SIZE=%lld -D MAX_CONSTANT_ARGS=%i",
                 max_constant_buffer_size,
                 max_constant_args);

        const char * sources[] = { src };
        DEBUG_PRINTF( user_context, "    clCreateProgramWithSource -> " );
        cl_program program = clCreateProgramWithSource(ctx.context, 1, &sources[0], NULL, &err );
        if (err != CL_SUCCESS) {
            DEBUG_PRINTF( user_context, "%d\n", err );
            halide_error_varargs(user_context, "CL: clCreateProgramWithSource failed (%d)\n", err);
            return err;
        } else {
            DEBUG_PRINTF( user_context, "%p\n", program );
        }
        (*state)->program = program;

        DEBUG_PRINTF( user_context, "    clBuildProgram %p %s\n", program, options );
        err = clBuildProgram(program, 1, devices, options, NULL, NULL );
        if (err != CL_SUCCESS) {
            halide_error_varargs(user_context, "CL: clBuildProgram failed (%d)\n", err);

            // Allocate an appropriately sized buffer for the build log.
            size_t len = 0;
            char *buffer = NULL;
            if (clGetProgramBuildInfo(program,
                                      dev,
                                      CL_PROGRAM_BUILD_LOG,
                                      0, NULL,
                                      &len) == CL_SUCCESS) {
                buffer = (char*)malloc((++len)*sizeof(char));
            }

            // Get build log
            if (buffer && clGetProgramBuildInfo(program,
                                                dev,
                                                CL_PROGRAM_BUILD_LOG,
                                                len, buffer,
                                                NULL) == CL_SUCCESS) {
                halide_printf(user_context, "    Build Log:\n %s\n-----\n", buffer);
            } else {
                halide_printf(user_context, "    clGetProgramBuildInfo failed\n");
            }

            if (buffer) {
                free(buffer);
            }

            halide_assert(user_context, err == CL_SUCCESS);
            return err;
        }
    }

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return 0;
}
Пример #27
0
WEAK int halide_dev_run(void *user_context,
                        void *state_ptr,
                        const char* entry_name,
                        int blocksX, int blocksY, int blocksZ,
                        int threadsX, int threadsY, int threadsZ,
                        int shared_mem_bytes,
                        size_t arg_sizes[],
                        void* args[]) {
    DEBUG_PRINTF( user_context, "CL: halide_dev_run (user_context: %p, entry: %s, blocks: %dx%dx%d, threads: %dx%dx%d, shmem: %d)\n",
                  user_context, entry_name,
                  blocksX, blocksY, blocksZ,
                  threadsX, threadsY, threadsZ,
                  shared_mem_bytes );

    cl_int err;
    ClContext ctx(user_context);
    if (ctx.error != CL_SUCCESS) {
        return ctx.error;
    }

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    // Create kernel object for entry_name from the program for this module.
    halide_assert(user_context, state_ptr);
    cl_program program = ((module_state*)state_ptr)->program;

    halide_assert(user_context, program);
    DEBUG_PRINTF( user_context, "    clCreateKernel %s -> ", entry_name );
    cl_kernel f = clCreateKernel(program, entry_name, &err);
    if (err != CL_SUCCESS) {
        DEBUG_PRINTF( user_context, "%d\n", err );
        halide_error_varargs(user_context, "CL: clCreateKernel (%s) failed (%d)\n", entry_name, err);
        return err;
    } else {
        #ifdef DEBUG
        uint64_t t_create_kernel = halide_current_time_ns(user_context);
        halide_printf( user_context, "%p (%f ms)\n",
                       f, (t_create_kernel - t_before) / 1.0e6 );
        #endif
    }

    // Pack dims
    size_t global_dim[3] = {blocksX*threadsX,  blocksY*threadsY,  blocksZ*threadsZ};
    size_t local_dim[3] = {threadsX, threadsY, threadsZ};

    // Set args
    int i = 0;
    while (arg_sizes[i] != 0) {
        DEBUG_PRINTF(user_context, "    clSetKernelArg %i %i [0x%x ...]\n", i, arg_sizes[i], *(int *)args[i]);
        cl_int err = clSetKernelArg(f, i, arg_sizes[i], args[i]);
        if (err != CL_SUCCESS) {
            halide_error_varargs(user_context, "CL: clSetKernelArg failed (%d)\n", err);
            return err;
        }
        i++;
    }
    // Set the shared mem buffer last
    // Always set at least 1 byte of shmem, to keep the launch happy
    DEBUG_PRINTF(user_context, "    clSetKernelArg %i %i [NULL]\n", i, shared_mem_bytes);
    err = clSetKernelArg(f, i, (shared_mem_bytes > 0) ? shared_mem_bytes : 1, NULL);
    if (err != CL_SUCCESS) {
        halide_error_varargs(user_context, "CL: clSetKernelArg failed (%d)\n", err);
        return err;
    }

    // Launch kernel
    DEBUG_PRINTF( user_context, "    clEnqueueNDRangeKernel %dx%dx%d, %dx%dx%d -> ",
                  blocksX, blocksY, blocksZ,
                  threadsX, threadsY, threadsZ );
    err = clEnqueueNDRangeKernel(ctx.cmd_queue, f,
                                 // NDRange
                                 3, NULL, global_dim, local_dim,
                                 // Events
                                 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        DEBUG_PRINTF( user_context, "%d\n", err );
        halide_error_varargs(user_context, "CL: clEnqueueNDRangeKernel failed (%d)\n", err);
        return err;
    } else {
        DEBUG_PRINTF ( user_context, "CL_SUCCESS\n" );
    }

    DEBUG_PRINTF( user_context, "    clReleaseKernel %p\n", f );
    clReleaseKernel(f);

    #ifdef DEBUG
    err = clFinish(ctx.cmd_queue);
    if (err != CL_SUCCESS) {
        halide_error_varargs(user_context, "CL: clFinish failed (%d)\n", err);
        return err;
    }
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif
    return 0;
}
Пример #28
0
WEAK int halide_copy_to_host(void *user_context, buffer_t* buf) {
    DEBUG_PRINTF(user_context, "CL: halide_copy_to_host (user_context: %p, buf: %p)\n", user_context, buf );

    // Acquire the context so we can use the command queue. This also avoids multiple
    // redundant calls to clEnqueueReadBuffer when multiple threads are trying to copy
    // the same buffer.
    ClContext ctx(user_context);
    if (ctx.error != CL_SUCCESS) {
        return ctx.error;
    }

    if (buf->dev_dirty) {
        #ifdef DEBUG
        uint64_t t_before = halide_current_time_ns(user_context);
        #endif

        halide_assert(user_context, buf->dev && buf->dev);
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));

        _dev_copy c = _make_dev_to_host_copy(buf);

        for (int w = 0; w < c.extent[3]; w++) {
            for (int z = 0; z < c.extent[2]; z++) {
#ifdef ENABLE_OPENCL_11
                // OpenCL 1.1 supports stride-aware memory transfers up to 3D, so we
                // can deal with the 2 innermost strides with OpenCL.
                uint64_t off = z * c.stride_bytes[2] + w * c.stride_bytes[3];

                size_t offset[3] = { off, 0, 0 };
                size_t region[3] = { c.chunk_size, c.extent[0], c.extent[1] };

                DEBUG_PRINTF( user_context, "    clEnqueueReadBufferRect ((%d, %d), (%p -> %p) + %d, %dx%dx%d bytes, %dx%d)\n",
                              z, w,
                              (void *)c.src, c.dst, (int)off,
                              (int)region[0], (int)region[1], (int)region[2],
                              (int)c.stride_bytes[0], (int)c.stride_bytes[1]);

                cl_int err = clEnqueueReadBufferRect(ctx.cmd_queue, (cl_mem)c.src, CL_FALSE,
                                                     offset, offset, region,
                                                     c.stride_bytes[0], c.stride_bytes[1],
                                                     c.stride_bytes[0], c.stride_bytes[1],
                                                     (void *)c.dst,
                                                     0, NULL, NULL);

                if (err != CL_SUCCESS) {
                    halide_error_varargs(user_context, "CL: clEnqueueReadBufferRect failed (%d)\n", err);
                    return err;
                }
#else
                for (int y = 0; y < c.extent[1]; y++) {
                    for (int x = 0; x < c.extent[0]; x++) {
                        uint64_t off = (x * c.stride_bytes[0] +
                                        y * c.stride_bytes[1] +
                                        z * c.stride_bytes[2] +
                                        w * c.stride_bytes[3]);
                        void *src = (void *)(c.src + off);
                        void *dst = (void *)(c.dst + off);
                        uint64_t size = c.chunk_size;

                        DEBUG_PRINTF( user_context, "    clEnqueueReadBuffer ((%d, %d, %d, %d), %lld bytes, %p -> %p)\n",
                                      x, y, z, w,
                                      (long long)size, (void *)src, dst );

                        cl_int err = clEnqueueReadBuffer(ctx.cmd_queue, (cl_mem)c.src,
                                                         CL_FALSE, off, size, dst, 0, NULL, NULL);
                        if (err != CL_SUCCESS) {
                            halide_error_varargs(user_context, "CL: clEnqueueReadBuffer failed (%d)\n", err);
                            return err;
                        }
                    }
                }
#endif
            }
        }
        // The writes above are all non-blocking, so empty the command
        // queue before we proceed so that other host code won't read
        // bad data.
        clFinish(ctx.cmd_queue);

        #ifdef DEBUG
        uint64_t t_after = halide_current_time_ns(user_context);
        halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
        #endif
    }
    buf->dev_dirty = false;
    return 0;
}
Пример #29
0
JNIEXPORT void JNICALL Java_com_example_hellohalide_CameraPreview_processFrame(
    JNIEnv *env, jobject obj, jbyteArray jSrc, jint j_w, jint j_h, jint j_orientation, jobject surf) {

    const int w = j_w, h = j_h, orientation = j_orientation;

    halide_start_clock(NULL);
    halide_set_error_handler(handler);

    unsigned char *src = (unsigned char *)env->GetByteArrayElements(jSrc, NULL);
    if (!src) {
        LOGD("src is null\n");
        return;
    }

    LOGD("[output window size] j_w = %d, j_h = %d", j_w, j_h);
    LOGD("[src array length] jSrc.length = %d", env->GetArrayLength(jSrc));

    ANativeWindow *win = ANativeWindow_fromSurface(env, surf);


    static bool first_call = true;
    static unsigned counter = 0;
    static unsigned times[16];
    if (first_call) {
        LOGD("According to Halide, host system has %d cpus\n", halide_host_cpu_count());
        LOGD("Resetting buffer format");
        ANativeWindow_setBuffersGeometry(win, w, h, 0);
        first_call = false;
        for (int t = 0; t < 16; t++) times[t] = 0;
    }

    ANativeWindow_Buffer buf;
    ARect rect = {0, 0, w, h};

    if (int err = ANativeWindow_lock(win, &buf, NULL)) {
        LOGD("ANativeWindow_lock failed with error code %d\n", err);
        return;
    }

    uint8_t *dst = (uint8_t *)buf.bits;

    // If we're using opencl, use the gpu backend for it.
#if COMPILING_FOR_OPENCL
    halide_opencl_set_device_type("gpu");
#endif

    // Make these static so that we can reuse device allocations across frames.
    static halide_buffer_t srcBuf = {0};
    static halide_dimension_t srcDim[2];
    static halide_buffer_t dstBuf = {0};
    static halide_dimension_t dstDim[2];

    if (dst) {
        srcBuf.host = (uint8_t *)src;
        srcBuf.set_host_dirty();
        srcBuf.dim = srcDim;
        srcBuf.dim[0].min = 0;
        srcBuf.dim[0].extent = w;
        srcBuf.dim[0].stride = 1;
        srcBuf.dim[1].min = 0;
        srcBuf.dim[1].extent = h;
        srcBuf.dim[1].stride = w;
        srcBuf.type = halide_type_of<uint8_t>();

        if (orientation >= 180) {
            // Camera sensor is probably upside down (e.g. Nexus 5x)
            srcBuf.host += w*h-1;
            srcBuf.dim[0].stride = -1;
            srcBuf.dim[1].stride = -w;
        }

        dstBuf.host = dst;
        dstBuf.dim = dstDim;
        dstBuf.dim[0].min = 0;
        dstBuf.dim[0].extent = w;
        dstBuf.dim[0].stride = 1;
        dstBuf.dim[1].min = 0;
        dstBuf.dim[1].extent = h;
        dstBuf.dim[1].stride = w;
        dstBuf.type = halide_type_of<uint8_t>();

        // Just set chroma to gray.
        memset(dst + w*h, 128, (w*h)/2);

        int64_t t1 = halide_current_time_ns();
        hello(&srcBuf, &dstBuf);

        halide_copy_to_host(NULL, &dstBuf);

        int64_t t2 = halide_current_time_ns();
        unsigned elapsed_us = (t2 - t1)/1000;


        times[counter & 15] = elapsed_us;
        counter++;
        unsigned min = times[0];
        for (int i = 1; i < 16; i++) {
            if (times[i] < min) min = times[i];
        }
        LOGD("Time taken: %d (%d)", elapsed_us, min);
    }

    ANativeWindow_unlockAndPost(win);
    ANativeWindow_release(win);

    env->ReleaseByteArrayElements(jSrc, (jbyte *)src, 0);
}