static int ga_extcopy(GpuArray *dst, const GpuArray *src) { struct extcopy_args a, *aa; gpucontext *ctx = gpudata_context(dst->data); GpuElemwise *k = NULL; void *args[2]; if (ctx != gpudata_context(src->data)) return GA_INVALID_ERROR; a.itype = src->typecode; a.otype = dst->typecode; if (ctx->extcopy_cache != NULL) k = cache_get(ctx->extcopy_cache, &a); if (k == NULL) { gpuelemwise_arg gargs[2]; gargs[0].name = "src"; gargs[0].typecode = src->typecode; gargs[0].flags = GE_READ; gargs[1].name = "dst"; gargs[1].typecode = dst->typecode; gargs[1].flags = GE_WRITE; k = GpuElemwise_new(ctx, "", "dst = src", 2, gargs, 0, 0); if (k == NULL) return GA_MISC_ERROR; aa = memdup(&a, sizeof(a)); if (aa == NULL) { GpuElemwise_free(k); return GA_MEMORY_ERROR; } if (ctx->extcopy_cache == NULL) ctx->extcopy_cache = cache_twoq(4, 8, 8, 2, extcopy_eq, extcopy_hash, extcopy_free, (cache_freev_fn)GpuElemwise_free, ctx->err); if (ctx->extcopy_cache == NULL) return GA_MISC_ERROR; if (cache_add(ctx->extcopy_cache, aa, k) != 0) return GA_MISC_ERROR; } args[0] = (void *)src; args[1] = (void *)dst; return GpuElemwise_call(k, args, GE_BROADCAST); }
static gpukernel *cuda_newkernel(void *c, unsigned int count, const char **strings, const size_t *lengths, const char *fname, unsigned int argcount, const int *types, int flags, int *ret, char **err_str) { cuda_context *ctx = (cuda_context *)c; strb sb = STRB_STATIC_INIT; char *bin, *log = NULL; srckey k, *ak; binval *av; gpukernel *res; size_t bin_len = 0, log_len = 0; CUdevice dev; unsigned int i; int ptx_mode = 0; int binary_mode = 0; int major, minor; if (count == 0) FAIL(NULL, GA_VALUE_ERROR); if (flags & GA_USE_OPENCL) FAIL(NULL, GA_DEVSUP_ERROR); if (flags & GA_USE_BINARY) { // GA_USE_BINARY is exclusive if (flags & ~GA_USE_BINARY) FAIL(NULL, GA_INVALID_ERROR); // We need the length for binary data and there is only one blob. if (count != 1 || lengths == NULL || lengths[0] == 0) FAIL(NULL, GA_VALUE_ERROR); } cuda_enter(ctx); ctx->err = cuCtxGetDevice(&dev); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } ctx->err = cuDeviceComputeCapability(&major, &minor, dev); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } // GA_USE_CLUDA is done later // GA_USE_SMALL will always work if (flags & GA_USE_DOUBLE) { if (major < 1 || (major == 1 && minor < 3)) { cuda_exit(ctx); FAIL(NULL, GA_DEVSUP_ERROR); } } if (flags & GA_USE_COMPLEX) { // just for now since it is most likely broken cuda_exit(ctx); FAIL(NULL, GA_DEVSUP_ERROR); } // GA_USE_HALF should always work if (flags & GA_USE_PTX) { ptx_mode = 1; } else if (flags & GA_USE_BINARY) { binary_mode = 1; } if (binary_mode) { bin = memdup(strings[0], lengths[0]); bin_len = lengths[0]; if (bin == NULL) { cuda_exit(ctx); FAIL(NULL, GA_MEMORY_ERROR); } } else { if (flags & GA_USE_CLUDA) { strb_appends(&sb, CUDA_PREAMBLE); } if (lengths == NULL) { for (i = 0; i < count; i++) strb_appends(&sb, strings[i]); } else { for (i = 0; i < count; i++) { if (lengths[i] == 0) strb_appends(&sb, strings[i]); else strb_appendn(&sb, strings[i], lengths[i]); } } strb_append0(&sb); if (strb_error(&sb)) { strb_clear(&sb); cuda_exit(ctx); return NULL; } if (ptx_mode) { bin = sb.s; bin_len = sb.l; } else { bin = NULL; if (compile_cache != NULL) { k.src = sb.s; k.len = sb.l; memcpy(k.arch, ctx->bin_id, BIN_ID_LEN); av = cache_get(compile_cache, &k); if (av != NULL) { bin = memdup(av->bin, av->len); bin_len = av->len; } } if (bin == NULL) { bin = call_compiler(sb.s, sb.l, ctx->bin_id, &bin_len, &log, &log_len, ret); } if (bin == NULL) { if (err_str != NULL) { strb debug_msg = STRB_STATIC_INIT; // We're substituting debug_msg for a string with this first line: strb_appends(&debug_msg, "CUDA kernel build failure ::\n"); /* Delete the final NUL */ sb.l--; gpukernel_source_with_line_numbers(1, (const char **)&sb.s, &sb.l, &debug_msg); if (log != NULL) { strb_appends(&debug_msg, "\nCompiler log:\n"); strb_appendn(&debug_msg, log, log_len); free(log); } *err_str = strb_cstr(&debug_msg); // *err_str will be free()d by the caller (see docs in kernel.h) } strb_clear(&sb); cuda_exit(ctx); return NULL; } if (compile_cache == NULL) compile_cache = cache_twoq(16, 16, 16, 8, src_eq, src_hash, src_free, bin_free); if (compile_cache != NULL) { ak = malloc(sizeof(*ak)); av = malloc(sizeof(*av)); if (ak == NULL || av == NULL) { free(ak); free(av); goto done; } ak->src = memdup(sb.s, sb.l); if (ak->src == NULL) { free(ak); free(av); goto done; } ak->len = sb.l; memmove(ak->arch, ctx->bin_id, BIN_ID_LEN); av->len = bin_len; av->bin = memdup(bin, bin_len); if (av->bin == NULL) { src_free(ak); free(av); goto done; } cache_add(compile_cache, ak, av); } done: strb_clear(&sb); } } res = calloc(1, sizeof(*res)); if (res == NULL) { free(bin); cuda_exit(ctx); FAIL(NULL, GA_SYS_ERROR); } res->bin_sz = bin_len; res->bin = bin; res->refcnt = 1; res->argcount = argcount; res->types = calloc(argcount, sizeof(int)); if (res->types == NULL) { _cuda_freekernel(res); cuda_exit(ctx); FAIL(NULL, GA_MEMORY_ERROR); } memcpy(res->types, types, argcount*sizeof(int)); res->args = calloc(argcount, sizeof(void *)); if (res->args == NULL) { _cuda_freekernel(res); cuda_exit(ctx); FAIL(NULL, GA_MEMORY_ERROR); } ctx->err = cuModuleLoadData(&res->m, bin); if (ctx->err != CUDA_SUCCESS) { _cuda_freekernel(res); cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } ctx->err = cuModuleGetFunction(&res->k, res->m, fname); if (ctx->err != CUDA_SUCCESS) { _cuda_freekernel(res); cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } res->ctx = ctx; ctx->refcnt++; cuda_exit(ctx); TAG_KER(res); return res; }