Exemplo n.º 1
0
// Launches the kernel called entry_name from the module stored in state_ptr,
// using the supplied grid/block dimensions and dynamic shared memory size.
// arg_sizes is not read here; args is handed straight to cuLaunchKernel.
// Returns 0 on success, the context acquisition error if the CUDA context
// could not be obtained, or the failing driver call's result code.
WEAK int halide_dev_run(void *user_context,
                        void *state_ptr,
                        const char* entry_name,
                        int blocksX, int blocksY, int blocksZ,
                        int threadsX, int threadsY, int threadsZ,
                        int shared_mem_bytes,
                        size_t arg_sizes[],
                        void* args[]) {
    DEBUG_PRINTF( user_context, "CUDA: halide_dev_run (user_context: %p, entry: %s, blocks: %dx%dx%d, threads: %dx%dx%d, shmem: %d)\n",
                  user_context, entry_name,
                  blocksX, blocksY, blocksZ,
                  threadsX, threadsY, threadsZ,
                  shared_mem_bytes );

    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    #ifdef DEBUG
    uint64_t start_ns = halide_current_time_ns(user_context);
    #endif

    // Resolve the kernel entry point inside the module.
    halide_assert(user_context, state_ptr);
    module_state *state = (module_state*)state_ptr;
    CUmodule mod = state->module;
    halide_assert(user_context, mod);
    CUfunction kernel;
    CUresult status = cuModuleGetFunction(&kernel, mod, entry_name);
    if (CUDA_SUCCESS != status) {
        halide_error_varargs(user_context, "CUDA: cuModuleGetFunction failed (%s)",
                             _get_error_name(status));
        return status;
    }

    // Launch on the NULL stream; kernel parameters come from args, and the
    // trailing "extra" options pointer is unused.
    status = cuLaunchKernel(kernel,
                            blocksX,  blocksY,  blocksZ,
                            threadsX, threadsY, threadsZ,
                            shared_mem_bytes,
                            NULL,
                            args,
                            NULL);
    if (CUDA_SUCCESS != status) {
        halide_error_varargs(user_context, "CUDA: cuLaunchKernel failed (%s)",
                             _get_error_name(status));
        return status;
    }

    #ifdef DEBUG
    // Debug builds wait for the kernel so the elapsed time can be reported.
    status = cuCtxSynchronize();
    if (CUDA_SUCCESS != status) {
        halide_error_varargs(user_context, "CUDA: cuCtxSynchronize failed (%s)\n",
                             _get_error_name(status));
        return status;
    }
    uint64_t end_ns = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (end_ns - start_ns) / 1.0e6);
    #endif
    return 0;
}
Exemplo n.º 2
0
// The default implementation of halide_acquire_cuda_context uses the global
// pointers above, and serializes access with a spin lock.
// Overriding implementations of acquire/release must implement the following
// behavior:
// - halide_acquire_cuda_context should always store a valid context in ctx,
//   or return an error code.
// - A call to halide_acquire_cuda_context is followed by a matching call to
//   halide_release_cuda_context. halide_acquire_cuda_context should block while a
//   previous call (if any) has not yet been released via halide_release_cuda_context.
WEAK int halide_acquire_cuda_context(void *user_context, CUcontext *ctx) {
    // TODO: consider a more "assertive" assert; these asserts do not block
    // execution on failure.
    halide_assert(user_context, ctx != NULL);

    // No context storage hooked up yet: point at the weak globals.
    if (!cuda_ctx_ptr) {
        cuda_ctx_ptr = &weak_cuda_ctx;
        cuda_lock_ptr = &weak_cuda_lock;
    }

    // Spin until the lock is taken. On success it is still held when this
    // function returns.
    halide_assert(user_context, cuda_lock_ptr != NULL);
    for (;;) {
        if (!__sync_lock_test_and_set(cuda_lock_ptr, 1)) {
            break;
        }
    }

    // Create the context on first use.
    halide_assert(user_context, cuda_ctx_ptr != NULL);
    if (!*cuda_ctx_ptr) {
        CUresult status = create_context(user_context, cuda_ctx_ptr);
        if (status != CUDA_SUCCESS) {
            // Creation failed: drop the lock before reporting the error.
            __sync_lock_release(cuda_lock_ptr);
            return status;
        }
    }

    *ctx = *cuda_ctx_ptr;
    return 0;
}
Exemplo n.º 3
0
// Allocates device memory for buf unless it already has a device allocation.
// The byte count comes from __buf_size and the resulting device pointer is
// stored in buf->dev.
WEAK void halide_dev_malloc(void *user_context, buffer_t* buf) {
    if (buf->dev) {
        // This buffer already has a device allocation
        return;
    }

    size_t size = __buf_size(user_context, buf);

    #ifdef DEBUG
    // Each integer is widened to long long so that the arguments match the
    // %lld conversions whatever the declared width of the fields is.
    halide_printf(user_context, "dev_malloc allocating buffer of %lld bytes, "
                  "extents: %lldx%lldx%lldx%lld strides: %lldx%lldx%lldx%lld (%d bytes per element)\n",
                  (long long)size,
                  (long long)buf->extent[0], (long long)buf->extent[1],
                  (long long)buf->extent[2], (long long)buf->extent[3],
                  (long long)buf->stride[0], (long long)buf->stride[1],
                  (long long)buf->stride[2], (long long)buf->stride[3],
                  buf->elem_size);
    #endif

    CUdeviceptr p;
    TIME_CALL( cuMemAlloc(&p, size), "dev_malloc");

    buf->dev = (uint64_t)p;
    halide_assert(user_context, buf->dev);

    #ifdef DEBUG
    halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));
    #endif
}
Exemplo n.º 4
0
// The default implementation of halide_acquire_cl_context uses the global
// pointers above, and serializes access with a spin lock.
// Overriding implementations of acquire/release must implement the following
// behavior:
// - halide_acquire_cl_context should always store a valid context/command
//   queue in ctx/q, or return an error code.
// - A call to halide_acquire_cl_context is followed by a matching call to
//   halide_release_cl_context. halide_acquire_cl_context should block while a
//   previous call (if any) has not yet been released via halide_release_cl_context.
WEAK int halide_acquire_cl_context(void *user_context, cl_context *ctx, cl_command_queue *q) {
    // TODO: consider a more "assertive" assert; these asserts do not block
    // execution on failure.
    halide_assert(user_context, ctx != NULL);
    halide_assert(user_context, q != NULL);

    // No context storage hooked up yet: point at the weak globals.
    if (!cl_ctx_ptr) {
        cl_ctx_ptr = &weak_cl_ctx;
        cl_q_ptr = &weak_cl_q;
        cl_lock_ptr = &weak_cl_lock;
    }

    // Spin until the lock is taken. On success it is still held when this
    // function returns.
    halide_assert(user_context, cl_lock_ptr != NULL);
    for (;;) {
        if (!__sync_lock_test_and_set(cl_lock_ptr, 1)) {
            break;
        }
    }

    // Create the context and command queue on first use.
    halide_assert(user_context, cl_ctx_ptr != NULL);
    halide_assert(user_context, cl_q_ptr != NULL);
    if (!(*cl_ctx_ptr)) {
        cl_int status = create_context(user_context, cl_ctx_ptr, cl_q_ptr);
        if (status != CL_SUCCESS) {
            // Creation failed: drop the lock before reporting the error.
            __sync_lock_release(cl_lock_ptr);
            return status;
        }
    }

    *ctx = *cl_ctx_ptr;
    *q = *cl_q_ptr;
    return 0;
}
Exemplo n.º 5
0
// Subtracts 'decr' from the memory_current counter of the pipeline described
// by 'pipeline_state' and from the memory_current counter of its function
// number 'func_id'. Does nothing when 'decr' is zero.
WEAK void halide_profiler_memory_free(void *user_context,
                                      void *pipeline_state,
                                      int func_id,
                                      uint64_t decr) {
    // 'decr' may be zero if the allocation is not executed conditionally; in
    // that case there is nothing to account for.
    if (decr != 0) {
        halide_profiler_pipeline_stats *p_stats = (halide_profiler_pipeline_stats *) pipeline_state;
        halide_assert(user_context, p_stats != NULL);
        halide_assert(user_context, func_id >= 0);
        halide_assert(user_context, func_id < p_stats->num_funcs);

        // The counters are updated atomically but without taking the state's
        // lock, to reduce lock contention. A concurrent call that frees the
        // pipeline and function stats structs would be a hazard; however, the
        // current destructor (called on profiler shutdown) does not free the
        // structs unless the user specifically calls halide_profiler_reset().

        // Per-pipeline counter first...
        __sync_sub_and_fetch(&p_stats->memory_current, decr);

        // ...then the per-function counter.
        __sync_sub_and_fetch(&p_stats->funcs[func_id].memory_current, decr);
    }
}
Exemplo n.º 6
0
// Copies the device contents of buf back to the host when the device copy is
// marked dirty, then clears dev_dirty.
// Returns 0 on success, the context acquisition error if the CUDA context
// could not be obtained, or the result code of the cuMemcpyDtoH call that
// failed (in which case dev_dirty is left set).
WEAK int halide_copy_to_host(void *user_context, buffer_t* buf) {
    // Cheap early-out taken before the context is acquired.
    if (!buf->dev_dirty) {
        return 0;
    }

    DEBUG_PRINTF( user_context, "CUDA: halide_copy_to_host (user_context: %p, buf: %p)\n", user_context, buf );

    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    // Need to check dev_dirty again, in case another thread did the
    // copy_to_host before the serialization point above.
    if (buf->dev_dirty) {
        #ifdef DEBUG
        uint64_t t_before = halide_current_time_ns(user_context);
        #endif

        // A device-to-host copy needs both a host pointer and a device
        // pointer, so check both before describing the copy.
        halide_assert(user_context, buf->host && buf->dev);
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));

        _dev_copy c = _make_dev_to_host_copy(buf);

        // Issue one cuMemcpyDtoH of c.chunk_size bytes per (x, y, z, w)
        // position; the same byte offset is applied to source and destination.
        for (int w = 0; w < c.extent[3]; w++) {
            for (int z = 0; z < c.extent[2]; z++) {
                for (int y = 0; y < c.extent[1]; y++) {
                    for (int x = 0; x < c.extent[0]; x++) {
                        uint64_t off = (x * c.stride_bytes[0] +
                                        y * c.stride_bytes[1] +
                                        z * c.stride_bytes[2] +
                                        w * c.stride_bytes[3]);
                        CUdeviceptr src = (CUdeviceptr)(c.src + off);
                        void *dst = (void *)(c.dst + off);
                        uint64_t size = c.chunk_size;
                        DEBUG_PRINTF( user_context, "    cuMemcpyDtoH (%d, %d, %d, %d), %p -> %p, %lld bytes\n",
                                      x, y, z, w,
                                      (void *)src, dst, (long long)size );
                        CUresult err = cuMemcpyDtoH(dst, src, size);
                        if (err != CUDA_SUCCESS) {
                            halide_error_varargs(user_context, "CUDA: cuMemcpyDtoH failed (%s)",
                                                 _get_error_name(err));
                            return err;
                        }
                    }
                }
            }
        }

        #ifdef DEBUG
        uint64_t t_after = halide_current_time_ns(user_context);
        halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
        #endif
    }
    buf->dev_dirty = false;
    return 0;
}
Exemplo n.º 7
0
// Copies the host data of 'from' into the host memory of 'to'. The two
// buffers are asserted to agree on element size and on all four extents and
// strides; the number of bytes copied is full_extent(from) * from.elem_size.
WEAK void copy_from_to(void *user_context, const buffer_t &from, buffer_t &to) {
    const size_t element_count = full_extent(from);
    halide_assert(user_context, from.elem_size == to.elem_size);
    int i = 0;
    while (i < 4) {
        halide_assert(user_context, from.extent[i] == to.extent[i]);
        halide_assert(user_context, from.stride[i] == to.stride[i]);
        ++i;
    }
    const size_t byte_count = element_count * from.elem_size;
    memcpy(to.host, from.host, byte_count);
}
    // Acquires the RenderScript device and context for user_context.
    // 'error' starts out as RS_ERROR_RUNTIME_ERROR and is then overwritten
    // with the result of halide_renderscript_acquire_context.
    Context(void *user_context)
        : user_context(user_context),
          mDev(NULL),
          mContext(NULL),
          error(RS_ERROR_RUNTIME_ERROR) {
#ifdef DEBUG_RUNTIME
        halide_start_clock(user_context);
#endif
        error = halide_renderscript_acquire_context(user_context, &mDev, &mContext);

        // Both handles are expected to have been filled in by the call above.
        halide_assert(user_context, mDev != NULL);
        halide_assert(user_context, mContext != NULL);
    }
Exemplo n.º 9
0
// Ensures buf has a CUDA device allocation of _buf_size(buf) bytes.
// If buf->dev is already set, it is only validated. Otherwise memory is
// obtained with cuMemAlloc and the device pointer is stored in buf->dev.
// Returns 0 on success, the context acquisition error if the CUDA context
// could not be obtained, or the cuMemAlloc result code on failure.
WEAK int halide_dev_malloc(void *user_context, buffer_t *buf) {
    DEBUG_PRINTF( user_context, "CUDA: halide_dev_malloc (user_context: %p, buf: %p)\n", user_context, buf );

    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    size_t size = _buf_size(user_context, buf);
    if (buf->dev) {
        // This buffer already has a device allocation
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf, size));
        return 0;
    }

    halide_assert(user_context, buf->stride[0] >= 0 && buf->stride[1] >= 0 &&
                                buf->stride[2] >= 0 && buf->stride[3] >= 0);

    DEBUG_PRINTF(user_context, "    allocating buffer of %lld bytes, "
                 "extents: %lldx%lldx%lldx%lld strides: %lldx%lldx%lldx%lld (%d bytes per element)\n",
                 (long long)size,
                 (long long)buf->extent[0], (long long)buf->extent[1],
                 (long long)buf->extent[2], (long long)buf->extent[3],
                 (long long)buf->stride[0], (long long)buf->stride[1],
                 (long long)buf->stride[2], (long long)buf->stride[3],
                 buf->elem_size);

    #ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    CUdeviceptr p;
    // size is a size_t; widen it explicitly so it matches the %lld conversion.
    DEBUG_PRINTF( user_context, "    cuMemAlloc %lld -> ", (long long)size );
    CUresult err = cuMemAlloc(&p, size);
    if (err != CUDA_SUCCESS) {
        DEBUG_PRINTF( user_context, "%s\n", _get_error_name(err));
        halide_error_varargs(user_context, "CUDA: cuMemAlloc failed (%s)",
                             _get_error_name(err));
        return err;
    } else {
        // %p requires a pointer argument, so convert the device pointer value.
        DEBUG_PRINTF( user_context, "%p\n", (void *)p );
    }
    halide_assert(user_context, p);
    buf->dev = (uint64_t)p;

    #ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
    #endif

    return 0;
}
Exemplo n.º 10
0
// Writes the host data of buf to its OpenCL device buffer when the host copy
// is marked dirty, then clears host_dirty.
WEAK void halide_copy_to_dev(void *user_context, buffer_t* buf) {
    if (buf->host_dirty) {
        // Both a host pointer and a device handle are required for the copy.
        halide_assert(user_context, buf->host && buf->dev);
        size_t size = __buf_size(user_context, buf);
        #ifdef DEBUG
        halide_printf(user_context, "copy_to_dev (%lld bytes) %p -> %p\n", (long long)size, buf->host, (void*)buf->dev);
        #endif
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));
        // CL_TRUE makes this a blocking write of 'size' bytes from offset 0.
        int err = clEnqueueWriteBuffer( *cl_q, (cl_mem)((void*)buf->dev), CL_TRUE, 0, size, buf->host, 0, NULL, NULL );
        // NOTE(review): CHECK_ERR is defined elsewhere; how it reacts to a
        // failure is not visible here.
        CHECK_ERR( err, "clEnqueueWriteBuffer" );
    }
    buf->host_dirty = false;
}
Exemplo n.º 11
0
// Ensures buf has an OpenCL device buffer of _buf_size(buf) bytes.
// If buf->dev is already set, it is only validated. Otherwise a read/write
// buffer is created with clCreateBuffer and its handle is stored in buf->dev.
// Returns CL_SUCCESS (0) on success, the context acquisition error if the
// OpenCL context could not be obtained, or the value clCreateBuffer reported.
WEAK int halide_dev_malloc(void *user_context, buffer_t* buf) {
    DEBUG_PRINTF( user_context, "CL: halide_dev_malloc (user_context: %p, buf: %p)\n", user_context, buf );

    ClContext ctx(user_context);
    if (ctx.error != CL_SUCCESS) {
        return ctx.error;
    }

    size_t size = _buf_size(user_context, buf);
    if (buf->dev) {
        // Already allocated: just check the existing handle.
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf, size));
        return 0;
    }

    halide_assert(user_context, buf->stride[0] >= 0 && buf->stride[1] >= 0 &&
                                buf->stride[2] >= 0 && buf->stride[3] >= 0);

    DEBUG_PRINTF(user_context, "    Allocating buffer of %lld bytes, "
                 "extents: %lldx%lldx%lldx%lld strides: %lldx%lldx%lldx%lld (%d bytes per element)\n",
                 (long long)size,
                 (long long)buf->extent[0], (long long)buf->extent[1],
                 (long long)buf->extent[2], (long long)buf->extent[3],
                 (long long)buf->stride[0], (long long)buf->stride[1],
                 (long long)buf->stride[2], (long long)buf->stride[3],
                 buf->elem_size);

    #ifdef DEBUG
    uint64_t start_ns = halide_current_time_ns(user_context);
    #endif

    cl_int err;
    DEBUG_PRINTF( user_context, "    clCreateBuffer -> ", size );
    cl_mem created = clCreateBuffer(ctx.context, CL_MEM_READ_WRITE, size, NULL, &err);
    buf->dev = (uint64_t)created;
    if (err != CL_SUCCESS || buf->dev == 0) {
        DEBUG_PRINTF( user_context, "%d\n", err);
        halide_error_varargs(user_context, "CL: clCreateBuffer failed (%d)\n", err);
        return err;
    }
    DEBUG_PRINTF( user_context, "%p\n", (cl_mem)buf->dev );

    DEBUG_PRINTF(user_context, "    Allocated device buffer %p for buffer %p\n",
                 (void *)buf->dev, buf);

    #ifdef DEBUG
    uint64_t end_ns = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (end_ns - start_ns) / 1.0e6);
    #endif

    return CL_SUCCESS;
}
Exemplo n.º 12
0
// Copies the host data of buf to its CUDA device allocation when the host
// copy is marked dirty, then clears host_dirty.
WEAK void halide_copy_to_dev(buffer_t* buf) {
    if (buf->host_dirty) {
        // Both a host pointer and a device pointer are required for the copy.
        halide_assert(buf->host && buf->dev);
        size_t size = buf_size(buf);
        #ifdef DEBUG
        // Debug builds format a description of the copy and validate the
        // device pointer.
        char msg[256];
        snprintf(msg, 256, "copy_to_dev (%zu bytes) %p -> %p (t=%lld)",
                 size, buf->host, (void*)buf->dev, (long long)halide_current_time_ns() );
        halide_assert(halide_validate_dev_pointer(buf));
        #endif
        // NOTE(review): msg is declared only under DEBUG; presumably TIME_CALL
        // discards its second argument in non-debug builds — confirm.
        TIME_CALL( cuMemcpyHtoD(buf->dev, buf->host, size), msg );
    }
    buf->host_dirty = false;
}
Exemplo n.º 13
0
// Copies the CUDA device allocation of buf back to its host memory when the
// device copy is marked dirty, then clears dev_dirty.
WEAK void halide_copy_to_host(buffer_t* buf) {
    if (buf->dev_dirty) {
        // Both a device pointer and a host pointer are required for the copy.
        halide_assert(buf->dev);
        halide_assert(buf->host);
        size_t size = buf_size(buf);
        #ifdef DEBUG
        // Debug builds format a description of the copy and validate the
        // device pointer.
        char msg[256];
        snprintf(msg, 256, "copy_to_host (%zu bytes) %p -> %p", size, (void*)buf->dev, buf->host );
        halide_assert(halide_validate_dev_pointer(buf));
        #endif
        // NOTE(review): msg is declared only under DEBUG; presumably TIME_CALL
        // discards its second argument in non-debug builds — confirm.
        TIME_CALL( cuMemcpyDtoH(buf->host, buf->dev, size), msg );
    }
    buf->dev_dirty = false;
}
Exemplo n.º 14
0
/** Free host and device memory associated with a buffer_t.
 *
 * A NULL buf is accepted and treated as "nothing to free": the function
 * returns 0 without touching it. When buf has a device interface, that
 * interface's device_and_host_free does the work and a non-zero result is
 * reported as halide_error_code_device_free_failed. Otherwise any remaining
 * host pointer is released with halide_free and the device-dirty flag is
 * cleared. */
WEAK int halide_device_and_host_free(void *user_context, struct halide_buffer_t *buf) {
    if (buf == NULL) {
        // Must not read any field of a NULL buffer, not even for logging.
        debug(user_context) << "halide_device_and_host_free: " << buf << "\n";
        return 0;
    }

    debug(user_context) << "halide_device_and_host_free: " << buf
                        << " buf dev " << buf->device
                        << " interface " << buf->device_interface << "\n";

    const halide_device_interface_t *device_interface = buf->device_interface;
    if (device_interface != NULL) {
        // Ensure interface is not freed prematurely.
        // TODO: Exception safety...
        device_interface->use_module();
        int result = device_interface->device_and_host_free(user_context, buf);
        device_interface->release_module();
        halide_assert(user_context, buf->device == 0);
        if (result) {
            return halide_error_code_device_free_failed;
        } else {
            return 0;
        }
    } else if (buf->host) {
        // device_free must have been called on this buffer (which
        // must be legal for the device interface that was
        // used). We'd better still free the host pointer.
        halide_free(user_context, buf->host);
        buf->host = NULL;
    }

    buf->set_device_dirty(false);
    return 0;
}
Exemplo n.º 15
0
// Reads the OpenCL device buffer of buf back into its host memory when the
// device copy is marked dirty, then clears dev_dirty.
WEAK void halide_copy_to_host(void *user_context, buffer_t* buf) {
    if (buf->dev_dirty) {
        clFinish(*cl_q); // block on completion before read back
        // Both a host pointer and a device handle are required for the copy.
        halide_assert(user_context, buf->host && buf->dev);
        size_t size = __buf_size(user_context, buf);
        #ifdef DEBUG
        halide_printf(user_context, "copy_to_host buf %p (%lld bytes) %p -> %p\n", buf, (long long)size,
                      (void*)buf->dev, buf->host );
        #endif

        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf, size));
        // CL_TRUE makes this a blocking read of 'size' bytes from offset 0.
        int err = clEnqueueReadBuffer( *cl_q, (cl_mem)((void*)buf->dev), CL_TRUE, 0, size, buf->host, 0, NULL, NULL );
        // NOTE(review): CHECK_ERR is defined elsewhere; how it reacts to a
        // failure is not visible here.
        CHECK_ERR( err, "clEnqueueReadBuffer" );
    }
    buf->dev_dirty = false;
}
Exemplo n.º 16
0
// Releases the OpenCL memory object stored in buf->dev, if there is one, and
// zeroes the field. Returns 0 when there was nothing to free or the release
// succeeded; otherwise returns the clReleaseMemObject error code.
WEAK int halide_dev_free(void *user_context, buffer_t* buf) {
    DEBUG_PRINTF( user_context, "CL: halide_dev_free (user_context: %p, buf: %p)\n", user_context, buf );

    ClContext ctx(user_context);

    // halide_dev_free may currently be exposed to clients, who are allowed
    // to call it on any buffer_t, including one that has never been used
    // with a GPU.
    if (!buf->dev) {
        return 0;
    }

    #ifdef DEBUG
    uint64_t start_ns = halide_current_time_ns(user_context);
    #endif

    halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));
    DEBUG_PRINTF(user_context, "    clReleaseMemObject %p\n", (cl_mem)buf->dev );
    const cl_int release_status = clReleaseMemObject((cl_mem)buf->dev);

    // A failed release is unlikely to succeed on a later attempt, so the
    // reference is dropped whatever the outcome.
    buf->dev = 0;
    if (release_status != CL_SUCCESS) {
        halide_error_varargs(user_context, "CL: clReleaseMemObject failed (%d)", release_status);
        return release_status;
    }

    #ifdef DEBUG
    uint64_t end_ns = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (end_ns - start_ns) / 1.0e6);
    #endif

    return 0;
}
Exemplo n.º 17
0
 // Constructor sets 'error' if any occurs.
 ClContext(void *user_context) : user_context(user_context),
                                 context(NULL),
                                 cmd_queue(NULL),
                                 error(CL_SUCCESS) {
     error = halide_acquire_cl_context(user_context, &context, &cmd_queue);
     halide_assert(user_context, context != NULL && cmd_queue != NULL);
 }
Exemplo n.º 18
0
/** Free any device memory associated with a buffer_t.
 *
 * A NULL buf is accepted and treated as "nothing to free": the function
 * returns 0 without touching it. When a device interface is found for
 * buf->dev, that interface's device_free does the work and a non-zero result
 * is reported as halide_error_code_device_free_failed. When no interface is
 * found, only the dev_dirty flag is cleared. */
WEAK int halide_device_free(void *user_context, struct buffer_t *buf) {
    if (buf == NULL) {
        // Must not read any field of a NULL buffer, not even for logging.
        debug(user_context) << "halide_device_free: " << buf << "\n";
        return 0;
    }

    uint64_t dev_field = buf->dev;
    debug(user_context) << "halide_device_free: " << buf
                        << " buf dev " << buf->dev
                        << " interface " << halide_get_device_interface(dev_field) << "\n";

    const halide_device_interface *interface = halide_get_device_interface(dev_field);
    if (interface != NULL) {
        // Ensure interface is not freed prematurely.
        // TODO: Exception safety...
        interface->use_module();
        int result = interface->device_free(user_context, buf);
        interface->release_module();
        halide_assert(user_context, buf->dev == 0);
        if (result) {
            return halide_error_code_device_free_failed;
        } else {
            return 0;
        }
    }

    buf->dev_dirty = false;
    return 0;
}
Exemplo n.º 19
0
// Tears down the OpenCL state: waits for the queue to drain, releases every
// program object recorded in state_list, and, when the acquired context is
// the weak default one, releases the default command queue and context too.
// Returns silently if the context cannot be acquired.
WEAK void halide_release(void *user_context) {
    DEBUG_PRINTF( user_context, "CL: halide_release (user_context: %p)\n", user_context );

    // The ClContext object does not allow the context storage to be modified,
    // so the acquire function is called directly.
    cl_context ctx;
    cl_command_queue q;
    int err = halide_acquire_cl_context(user_context, &ctx, &q);
    if (err != 0 || !ctx) {
        return;
    }

    err = clFinish(q);
    halide_assert(user_context, err == CL_SUCCESS);

    // Release the programs attached to this context. The list nodes stay
    // allocated; only the program objects go away, so a later
    // halide_init_kernels may re-create a program in the same node.
    for (module_state *state = state_list; state; state = state->next) {
        if (!state->program) {
            continue;
        }
        DEBUG_PRINTF(user_context, "    clReleaseProgram %p\n", state->program);
        err = clReleaseProgram(state->program);
        halide_assert(user_context, err == CL_SUCCESS);
        state->program = NULL;
    }

    // Only release the queue and context if they are the weak defaults.
    if (ctx == weak_cl_ctx) {
        DEBUG_PRINTF( user_context, "    clReleaseCommandQueue %p\n", weak_cl_q );
        err = clReleaseCommandQueue(weak_cl_q);
        halide_assert(user_context, err == CL_SUCCESS);
        weak_cl_q = NULL;

        DEBUG_PRINTF( user_context, "    clReleaseContext %p\n", weak_cl_ctx );
        err = clReleaseContext(weak_cl_ctx);
        halide_assert(user_context, err == CL_SUCCESS);
        weak_cl_ctx = NULL;
    }

    halide_release_cl_context(user_context);
}
Exemplo n.º 20
0
// Writes the host data of buf to its OpenCL device buffer when the host copy
// is marked dirty, then clears host_dirty.
WEAK void halide_copy_to_dev(buffer_t* buf) {
    if (buf->host_dirty) {
        // Both a host pointer and a device handle are required for the copy.
        halide_assert(buf->host && buf->dev);
        size_t size = __buf_size(buf);
        #ifdef DEBUG
        // Debug builds format a description of the copy.
        char msg[256];
        snprintf(msg, 256, "copy_to_dev (%lld bytes) %p -> %p (t=%lld)",
                 (long long)size, buf->host, (void*)buf->dev, (long long)halide_current_time_ns() );
        #endif
        halide_assert(halide_validate_dev_pointer(buf));
        TIME_START();
        // CL_TRUE makes this a blocking write of 'size' bytes from offset 0.
        int err = clEnqueueWriteBuffer( cl_q, (cl_mem)((void*)buf->dev), CL_TRUE, 0, size, buf->host, 0, NULL, NULL );
        // NOTE(review): msg is declared only under DEBUG; presumably CHECK_ERR
        // and TIME_CHECK discard their arguments in non-debug builds — confirm.
        CHECK_ERR( err, msg );
        TIME_CHECK(msg);
    }
    buf->host_dirty = false;
}
Exemplo n.º 21
0
// Gives buf a device allocation unless it already has one. An existing
// allocation is only validated. Otherwise __buf_size(buf) bytes are obtained
// from __dev_malloc and the handle is stored in buf->dev.
WEAK void halide_dev_malloc(buffer_t* buf) {
    if (buf->dev) {
        halide_assert(halide_validate_dev_pointer(buf));
        return;
    }

    size_t size = __buf_size(buf);
    #ifdef DEBUG
    // Each integer is widened to long long so that the arguments match the
    // %lld conversions whatever the declared width of the fields is.
    halide_printf("dev_malloc allocating buffer of %lld bytes, extents: %lldx%lldx%lldx%lld strides: %lldx%lldx%lldx%lld (%d bytes per element)\n",
                  (long long)size,
                  (long long)buf->extent[0], (long long)buf->extent[1],
                  (long long)buf->extent[2], (long long)buf->extent[3],
                  (long long)buf->stride[0], (long long)buf->stride[1],
                  (long long)buf->stride[2], (long long)buf->stride[3],
                  buf->elem_size);
    #endif

    buf->dev = (uint64_t)__dev_malloc(size);
    halide_assert(buf->dev);
}
Exemplo n.º 22
0
Arquivo: cuda.cpp Projeto: kgnk/Halide
// Load a CUDA shared object/dll and get the CUDA API function pointers from it.
// Asserts that cuInit is still NULL on entry.
WEAK void load_libcuda(void *user_context) {
    debug(user_context) << "    load_libcuda (user_context: " << user_context << ")\n";
    halide_assert(user_context, cuInit == NULL);

    // Each macro assigns the function pointer 'fn' from get_cuda_symbol. The
    // symbol looked up is the stringified name: #fn for CUDA_FN, and the
    // versioned name (#fn_3020 / #fn_4000) for the other two.
    // NOTE(review): presumably cuda_functions.h is a list of CUDA_FN*
    // invocations that expand here into one assignment per API entry — confirm.
    #define CUDA_FN(ret, fn, args) fn = get_cuda_symbol<ret (CUDAAPI *)args>(user_context, #fn);
    #define CUDA_FN_3020(ret, fn, fn_3020, args) fn = get_cuda_symbol<ret (CUDAAPI *)args>(user_context, #fn_3020);
    #define CUDA_FN_4000(ret, fn, fn_4000, args) fn = get_cuda_symbol<ret (CUDAAPI *)args>(user_context, #fn_4000);
    #include "cuda_functions.h"
}
Exemplo n.º 23
0
// Evicts cache entries, starting from the least recently used one, until
// current_cache_size no longer exceeds max_cache_size or the recency list is
// exhausted. Entries whose in_use_count is non-zero are skipped. Each evicted
// entry is unlinked from its hash bucket chain and from the recency list,
// its buffers' sizes are subtracted from current_cache_size, and it is then
// destroyed and freed.
WEAK void prune_cache() {
#if CACHE_DEBUGGING
    validate_cache();
#endif
    CacheEntry *prune_candidate = least_recently_used;
    while (current_cache_size > max_cache_size &&
           prune_candidate != NULL) {
        // Remember the successor now: the candidate may be freed below.
        CacheEntry *more_recent = prune_candidate->more_recent;

        if (prune_candidate->in_use_count == 0) {
            uint32_t h = prune_candidate->hash;
            uint32_t index = h % kHashTableSize;

            // Remove from hash table
            CacheEntry *prev_hash_entry = cache_entries[index];
            if (prev_hash_entry == prune_candidate) {
                cache_entries[index] = prune_candidate->next;
            } else {
                while (prev_hash_entry != NULL && prev_hash_entry->next != prune_candidate) {
                    prev_hash_entry = prev_hash_entry->next;
                }
                halide_assert(NULL, prev_hash_entry != NULL);
                prev_hash_entry->next = prune_candidate->next;
            }

            // Remove from less recent chain.
            if (least_recently_used == prune_candidate) {
                least_recently_used = more_recent;
            }
            if (more_recent != NULL) {
                more_recent->less_recent = prune_candidate->less_recent;
            }

            // Remove from more recent chain.
            if (most_recently_used == prune_candidate) {
                most_recently_used = prune_candidate->less_recent;
            }
            if (prune_candidate->less_recent != NULL) {
                // The older neighbour (an in-use entry that was skipped) must
                // now point past the candidate; otherwise it would keep a
                // pointer to the entry freed below.
                prune_candidate->less_recent->more_recent = more_recent;
            }

            // Decrease cache used amount.
            for (int32_t i = 0; i < prune_candidate->tuple_count; i++) {
                current_cache_size -= full_extent(prune_candidate->buffer(i));
            }

            // Deallocate the entry.
            prune_candidate->destroy();
            halide_free(NULL, prune_candidate);
        }

        prune_candidate = more_recent;
    }
#if CACHE_DEBUGGING
    validate_cache();
#endif
}
Exemplo n.º 24
0
// Returns elem_size multiplied by every non-zero extent among the first four
// dimensions of buf. Zero extents are skipped rather than zeroing the result.
static inline size_t buf_size(buffer_t* buf) {
    size_t sz = buf->elem_size;
    for (int dim = 0; dim < 4; dim++) {
        if (buf->extent[dim] != 0) {
            sz *= buf->extent[dim];
        }
    }
    halide_assert(sz);
    return sz;
}
Exemplo n.º 25
0
// Returns the largest value of elem_size * extent[i] * stride[i] over all
// dimensions of buf, and asserts that the result is non-zero.
static size_t __buf_size(buffer_t* buf) {
    size_t size = 0;
    // Use an unsigned index so the comparison against sizeof is not a mixed
    // signed/unsigned one.
    const size_t dims = sizeof(buf->stride) / sizeof(buf->stride[0]);
    for (size_t i = 0; i < dims; i++) {
        // Widen every factor to size_t before multiplying, so the product is
        // not computed (and possibly overflowed) in a narrower field type.
        size_t total_dim_size = (size_t)buf->elem_size *
                                (size_t)buf->extent[i] *
                                (size_t)buf->stride[i];
        if (total_dim_size > size) {
            size = total_dim_size;
        }
    }
    halide_assert(size);
    return size;
}
Exemplo n.º 26
0
// Tears down the CUDA state: synchronizes the context, unloads every module
// recorded in state_list, and destroys the context when it is the weak
// default one. Returns silently if the context cannot be acquired.
WEAK void halide_release(void *user_context) {
    DEBUG_PRINTF( user_context, "CUDA: halide_release (user_context: %p)\n", user_context );

    CUcontext ctx;
    int err = halide_acquire_cuda_context(user_context, &ctx);
    if (err != CUDA_SUCCESS || !ctx) {
        return;
    }

    // This may run from the destructor of a static variable, by which time
    // the driver may already be shutting down; CUDA_ERROR_DEINITIALIZED is
    // therefore tolerated throughout.
    err = cuCtxSynchronize();
    halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);

    // Unload the modules attached to this context. The list nodes stay
    // allocated; only the module objects go away, so a later
    // halide_init_kernels may re-create a module in the same node.
    for (module_state *state = state_list; state; state = state->next) {
        if (!state->module) {
            continue;
        }
        DEBUG_PRINTF(user_context, "    cuModuleUnload %p\n", state->module);
        err = cuModuleUnload(state->module);
        halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
        state->module = 0;
    }

    // Only destroy the context if it is the weak default one.
    if (ctx == weak_cuda_ctx) {
        DEBUG_PRINTF(user_context, "    cuCtxDestroy %p\n", weak_cuda_ctx);
        err = cuCtxDestroy(weak_cuda_ctx);
        halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
        weak_cuda_ctx = NULL;
    }

    halide_release_cuda_context(user_context);
}
Exemplo n.º 27
0
    // Acquires the CUDA context for user_context and pushes it as the
    // current context. 'error' holds the acquisition failure if there was
    // one, otherwise the result of cuCtxPushCurrent.
    CudaContext(void *user_context) : user_context(user_context),
                                      context(NULL),
                                      error(CUDA_SUCCESS) {
        error = halide_acquire_cuda_context(user_context, &context);
        if (error != 0) {
            // Acquisition failed, so 'context' is not promised to be valid;
            // report the error without tripping the validity assertion.
            return;
        }
        halide_assert(user_context, context != NULL);

        error = cuCtxPushCurrent(context);
    }
Exemplo n.º 28
0
// Frees the CUDA device allocation held in buf->dev with cuMemFree and then
// zeroes the field.
WEAK void halide_dev_free(buffer_t* buf) {

    #ifdef DEBUG
    // Debug builds log the buffer and validate its device pointer first.
    halide_printf("In dev_free of %p - dev: 0x%p\n", buf, (void*)buf->dev);
    halide_assert(halide_validate_dev_pointer(buf));
    #endif

    // NOTE(review): CHECK_CALL is defined elsewhere; how it reacts to a
    // failing cuMemFree is not visible here.
    CHECK_CALL( cuMemFree(buf->dev), "cuMemFree" );
    buf->dev = 0;

}
Exemplo n.º 29
0
// Reads the OpenCL device buffer of buf back into its host memory when the
// device copy is marked dirty, then clears dev_dirty.
WEAK void halide_copy_to_host(buffer_t* buf) {
    if (buf->dev_dirty) {
        clFinish(cl_q); // block on completion before read back
        // Both a host pointer and a device handle are required for the copy.
        halide_assert(buf->host && buf->dev);
        size_t size = __buf_size(buf);
        // msg exists in both build modes: an empty string without DEBUG, a
        // formatted description of the copy with it.
        #ifndef DEBUG
        char msg[1] = { 0 };
        #else
        char msg[256];
        snprintf(msg, 256, "copy_to_host (%lld bytes) %p -> %p", (long long)size, (void*)buf->dev, buf->host );
        #endif
        halide_assert(halide_validate_dev_pointer(buf, size));
        TIME_START();
        #ifdef DEBUG
        halide_printf("%s\n", msg);
        #endif
        // CL_TRUE makes this a blocking read of 'size' bytes from offset 0.
        int err = clEnqueueReadBuffer( cl_q, (cl_mem)((void*)buf->dev), CL_TRUE, 0, size, buf->host, 0, NULL, NULL );
        // NOTE(review): CHECK_ERR and TIME_CHECK are defined elsewhere; their
        // behavior on failure is not visible here.
        CHECK_ERR( err, msg );
        TIME_CHECK(msg);
    }
    buf->dev_dirty = false;
}
Exemplo n.º 30
0
// Creates a read/write OpenCL buffer of 'bytes' bytes in the context *cl_ctx
// and returns its handle. The handle is asserted to be non-null; the error
// code from clCreateBuffer is only printed, and only in debug builds.
static cl_mem __dev_malloc(void *user_context, size_t bytes) {
    #ifdef DEBUG
    halide_printf(user_context, "dev_malloc (%lld bytes)\n", (long long)bytes);
    #endif

    int err;
    cl_mem p = clCreateBuffer(*cl_ctx, CL_MEM_READ_WRITE, bytes, NULL, &err );

    #ifdef DEBUG
    halide_printf(user_context, "    returned: %p (err: %d)\n", (void*)p, err);
    #endif

    halide_assert(user_context, p);
    return p;
}