/**
 * Append to ctx->s the kernel statements that fill in, for the current
 * thread, the per-axis variables used by the generated loops:
 *
 *   iNDim    extent of source axis N           (srcSize, remapped by axisList)
 *   iNSStep  source step of axis N             (srcSteps, remapped by axisList)
 *   iNMStep  step of axis N in the max output  (first ndd axes only)
 *   iNAStep  step of axis N in the argmax out  (first ndd axes only)
 *   iNPDim   cumulative product of the extents of the axes after N
 *            (axes ndd..nds-1 only)
 *   iNStart / iNEnd   half-open range iterated by this thread on axis N
 *
 * @param ctx  Generation context; reads nds, ndd, ndh, axisList and
 *             hwAxisList, and appends to ctx->s.
 */
static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx){
	size_t hwDim;
	int    i;

	/* Use internal remapping when computing the ranges for this thread. */
	strb_appends(&ctx->s, "\t/* Compute ranges for this thread. */\n");

	for(i=0;i<ctx->nds;i++){
		strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, ctx->axisList[i]);
	}
	for(i=0;i<ctx->nds;i++){
		strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->axisList[i]);
	}
	for(i=0;i<ctx->ndd;i++){
		strb_appendf(&ctx->s, "\ti%dMStep = dstMaxSteps[%d];\n", i, i);
	}
	for(i=0;i<ctx->ndd;i++){
		strb_appendf(&ctx->s, "\ti%dAStep = dstArgmaxSteps[%d];\n", i, i);
	}

	for(i=ctx->nds-1;i>=ctx->ndd;i--){
		/**
		 * If this is the last index, it's the first cumulative dimension
		 * product we generate, and thus we initialize to 1.
		 */

		if(i == ctx->nds-1){
			strb_appendf(&ctx->s, "\ti%dPDim = 1;\n", i);
		}else{
			strb_appendf(&ctx->s, "\ti%dPDim = i%dPDim * i%dDim;\n", i, i+1, i+1);
		}
	}

	for(i=0;i<ctx->nds;i++){
		/**
		 * Up to 3 dimensions get to rely on hardware loops.
		 * The others, if any, have to use software looping beginning at 0.
		 */

		if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){
			/*
			 * hwDim is a size_t but the conversion is %d: cast explicitly so
			 * the variadic argument really is an int. The value indexes the
			 * gi / ci kernel variables declared elsewhere in the generated
			 * code (presumably its position in hwAxisList -- confirm against
			 * axisInSet).
			 */
			strb_appendf(&ctx->s, "\ti%dStart = gi%d * ci%d;\n",
			             i, (int)hwDim, (int)hwDim);
		}else{
			strb_appendf(&ctx->s, "\ti%dStart = 0;\n", i);
		}
	}

	for(i=0;i<ctx->nds;i++){
		/**
		 * Hardware-looped axes cover one chunk (ciN) starting at iNStart;
		 * software-looped axes cover the whole extent of the axis.
		 */

		if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){
			strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + ci%d;\n",
			             i, i, (int)hwDim);
		}else{
			strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + i%dDim;\n", i, i, i);
		}
	}

	strb_appends(&ctx->s, "\t\n");
	strb_appends(&ctx->s, "\t\n");
}
/**
 * Append the typedef section of the generated kernel source.
 *
 * T is aliased to ctx->dstMaxType and X to ctx->dstArgmaxType; the section
 * is followed by three blank lines.
 */
static void maxandargmaxAppendTypedefs (maxandargmax_ctx* ctx){
	strb* const out = &ctx->s;
	int         blank;

	strb_appends(out, "/* Typedefs */\n");
	strb_appendf(out,
	             "typedef %s T;/* The type of the array being processed. */\n",
	             ctx->dstMaxType);
	strb_appendf(out,
	             "typedef %s X;/* Index type: signed 32/64-bit. */\n",
	             ctx->dstArgmaxType);

	/* Trailing vertical whitespace. */
	for(blank = 0; blank < 3; blank++){
		strb_appends(out, "\n");
	}
}
/**
 * Append the loop section of the kernel body: a "FREE LOOPS" banner comment,
 * the helper macro definitions, the outer loop nest (which in turn emits the
 * inner one) and finally the matching macro #undefs.
 */
static void maxandargmaxAppendLoops (maxandargmax_ctx* ctx){
	static const char* const banner[] = {
		"\t/**\n",
		"\t * FREE LOOPS.\n",
		"\t */\n",
		"\t\n",
	};
	size_t line;

	for(line = 0; line < sizeof(banner)/sizeof(banner[0]); line++){
		strb_appends(&ctx->s, banner[line]);
	}

	maxandargmaxAppendLoopMacroDefs(ctx);
	maxandargmaxAppendLoopOuter(ctx);
	maxandargmaxAppendLoopMacroUndefs(ctx);
}
/**
 * Append the complete kernel source to ctx->s: typedefs, prototype, and a
 * brace-delimited body made of the offset fix-ups, index declarations, range
 * calculations and loops, in that order.
 */
static void maxandargmaxAppendKernel (maxandargmax_ctx* ctx){
	strb* const out = &ctx->s;

	/* Everything that precedes the function body. */
	maxandargmaxAppendTypedefs(ctx);
	maxandargmaxAppendPrototype(ctx);

	/* Function body. */
	strb_appends(out, "{\n");
	maxandargmaxAppendOffsets(ctx);
	maxandargmaxAppendIndexDeclarations(ctx);
	maxandargmaxAppendRangeCalculations(ctx);
	maxandargmaxAppendLoops(ctx);
	strb_appends(out, "}\n");
}
/**
 * Append the outer loop nest: one FOROVER/ESCAPE header per axis in
 * [0, ctx->ndd), then the inner loop code, then one closing brace per header.
 */
static void maxandargmaxAppendLoopOuter (maxandargmax_ctx* ctx){
	strb* const out = &ctx->s;
	int         axis;
	int         open;

	/* Headers: open one loop per axis. */
	axis = 0;
	while(axis < ctx->ndd){
		strb_appendf(out, "\tFOROVER(%d){ESCAPE(%d)\n", axis, axis);
		axis++;
	}

	/* Body: the inner loops. */
	maxandargmaxAppendLoopInner(ctx);

	/* Trailers: close every loop opened above. */
	open = 0;
	while(open < ctx->ndd){
		strb_appends(out, "\t}\n");
		open++;
	}
}
/**
 * Append a comma-separated list of numbered identifiers to a string buffer.
 *
 * The output is
 *     prologue  prefix<startIdx>suffix , ... , prefix<endIdx-1>suffix  epilogue
 * with no comma after the last element. If endIdx <= startIdx only the
 * prologue and epilogue are written.
 *
 * @param s         Buffer to append to.
 * @param prologue  Text written before the list (NULL is treated as "").
 * @param prefix    Text written before each index (NULL is treated as "").
 * @param startIdx  First index, inclusive.
 * @param endIdx    Last index, exclusive.
 * @param suffix    Text written after each index (NULL is treated as "").
 * @param epilogue  Text written after the list (NULL is treated as "").
 */
static void appendIdxes (strb* s, const char* prologue, const char* prefix, int startIdx, int endIdx, const char* suffix, const char* epilogue){
	int idx;

	/* Normalise missing pieces to empty strings. */
	if(!prologue){prologue = "";}
	if(!prefix)  {prefix   = "";}
	if(!suffix)  {suffix   = "";}
	if(!epilogue){epilogue = "";}

	strb_appends(s, prologue);
	for(idx = startIdx; idx < endIdx; idx++){
		const char* separator = (idx == endIdx-1) ? "" : ",";

		strb_appendf(s, "%s%d%s%s", prefix, idx, suffix, separator);
	}
	strb_appends(s, epilogue);
}
/**
 * Append the #undef lines for the six helper macros used by the generated
 * loops (FOROVER, ESCAPE, SRCINDEXER, RDXINDEXER, DSTMINDEXER, DSTAINDEXER).
 */
static void maxandargmaxAppendLoopMacroUndefs (maxandargmax_ctx* ctx){
	static const char* const undefs[] = {
		"#undef FOROVER\n",
		"#undef ESCAPE\n",
		"#undef SRCINDEXER\n",
		"#undef RDXINDEXER\n",
		"#undef DSTMINDEXER\n",
		"#undef DSTAINDEXER\n",
	};
	size_t n;

	for(n = 0; n < sizeof(undefs)/sizeof(undefs[0]); n++){
		strb_appends(&ctx->s, undefs[n]);
	}
}
/**
 * Append the kernel statements that advance src, dstMax and dstArgmax by
 * their byte offsets (srcOff, dstMaxOff, dstArgmaxOff), followed by two
 * whitespace-only lines.
 */
static void maxandargmaxAppendOffsets (maxandargmax_ctx* ctx){
	static const char* const text[] = {
		"\t/* Add offsets */\n",
		"\tsrc = (const GLOBAL_MEM T*)((const GLOBAL_MEM char*)src + srcOff);\n",
		"\tdstMax = (GLOBAL_MEM T*) ((GLOBAL_MEM char*) dstMax + dstMaxOff);\n",
		"\tdstArgmax = (GLOBAL_MEM X*) ((GLOBAL_MEM char*) dstArgmax + dstArgmaxOff);\n",
		"\t\n",
		"\t\n",
	};
	size_t n;

	for(n = 0; n < sizeof(text)/sizeof(text[0]); n++){
		strb_appends(&ctx->s, text[n]);
	}
}
/**
 * Append the #define lines for the helper macros used by the generated loops.
 *
 *   FOROVER(idx)      for-loop header over [i<idx>Start, i<idx>End)
 *   ESCAPE(idx)       skips the iteration when i<idx> >= i<idx>Dim
 *   SRCINDEXER(...)   element of src addressed by all nds indices
 *   RDXINDEXER(...)   sum of i<k>*i<k>PDim over the axes ndd..nds-1
 *   DSTMINDEXER(...)  element of dstMax addressed by the first ndd indices
 *   DSTAINDEXER(...)  element of dstArgmax addressed by the first ndd indices
 *
 * Each indexer body is a sum of per-axis terms, one per continuation line,
 * terminated by a literal 0.
 */
static void maxandargmaxAppendLoopMacroDefs (maxandargmax_ctx* ctx){
	strb* const out = &ctx->s;
	int         axis;

	/* FOROVER and ESCAPE take the axis number and paste it into names. */
	strb_appends(out, "#define FOROVER(idx) for(i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n");
	strb_appends(out, "#define ESCAPE(idx) if(i##idx >= i##idx##Dim){continue;}\n");

	/* SRCINDEXER: byte offset into src over every source axis. */
	appendIdxes(out, "#define SRCINDEXER(", "i", 0, ctx->nds, "",
	            ") (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)src + ");
	axis = 0;
	while(axis < ctx->nds){
		strb_appendf(out, "i%d*i%dSStep + \\\n ", axis, axis);
		axis++;
	}
	strb_appends(out, "0))\n");

	/* RDXINDEXER: flattened index over the axes ndd..nds-1. */
	appendIdxes(out, "#define RDXINDEXER(", "i", ctx->ndd, ctx->nds, "",
	            ") (");
	axis = ctx->ndd;
	while(axis < ctx->nds){
		strb_appendf(out, "i%d*i%dPDim + \\\n ", axis, axis);
		axis++;
	}
	strb_appends(out, "0)\n");

	/* DSTMINDEXER: byte offset into dstMax over the first ndd axes. */
	appendIdxes(out, "#define DSTMINDEXER(", "i", 0, ctx->ndd, "",
	            ") (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)dstMax + ");
	axis = 0;
	while(axis < ctx->ndd){
		strb_appendf(out, "i%d*i%dMStep + \\\n ", axis, axis);
		axis++;
	}
	strb_appends(out, "0))\n");

	/* DSTAINDEXER: byte offset into dstArgmax over the first ndd axes. */
	appendIdxes(out, "#define DSTAINDEXER(", "i", 0, ctx->ndd, "",
	            ") (*(GLOBAL_MEM X*)((GLOBAL_MEM char*)dstArgmax + ");
	axis = 0;
	while(axis < ctx->ndd){
		strb_appendf(out, "i%d*i%dAStep + \\\n ", axis, axis);
		axis++;
	}
	strb_appends(out, "0))\n");
}
/**
 * Append the code that sits inside the outer loop nest:
 *
 *   1. initialisation of maxV / maxI from the element at the start of the
 *      reduction ranges,
 *   2. one FOROVER/ESCAPE loop per axis in [ndd, nds) whose body updates
 *      maxV / maxI whenever a strictly greater value is read,
 *   3. the write-back of maxV and maxI through DSTMINDEXER / DSTAINDEXER.
 */
static void maxandargmaxAppendLoopInner (maxandargmax_ctx* ctx){
	static const char* const initBanner[] = {
		"\t/**\n",
		"\t * Reduction initialization.\n",
		"\t */\n",
		"\t\n",
	};
	static const char* const loopBanner[] = {
		"\t\n",
		"\t/**\n",
		"\t * REDUCTION LOOPS.\n",
		"\t */\n",
		"\t\n",
	};
	static const char* const writebackBanner[] = {
		"\t/**\n",
		"\t * Destination writeback.\n",
		"\t */\n",
		"\t\n",
	};
	strb* const out = &ctx->s;
	size_t      line;
	int         axis;

	/* ---- Prologue: seed the running maximum. ---- */
	for(line = 0; line < sizeof(initBanner)/sizeof(initBanner[0]); line++){
		strb_appends(out, initBanner[line]);
	}

	/*
	 * The SRCINDEXER argument list is built in two halves (plain indices,
	 * then the Start values of the remaining axes), so the comma joining
	 * them is only needed when both halves are non-empty.
	 */
	appendIdxes(out, "\tT maxV = SRCINDEXER(", "i", 0, ctx->ndd, "", "");
	if(ctx->ndd && ctx->ndr){
		strb_appends(out, ",");
	}
	appendIdxes(out, "", "i", ctx->ndd, ctx->nds, "Start", ");\n");
	appendIdxes(out, "\tX maxI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "Start", ");\n");

	for(line = 0; line < sizeof(loopBanner)/sizeof(loopBanner[0]); line++){
		strb_appends(out, loopBanner[line]);
	}

	/* ---- Loop headers. ---- */
	for(axis = ctx->ndd; axis < ctx->nds; axis++){
		strb_appendf(out, "\tFOROVER(%d){ESCAPE(%d)\n", axis, axis);
	}

	/* ---- Loop body: compare and update. ---- */
	appendIdxes(out, "\tT V = SRCINDEXER(", "i", 0, ctx->nds, "", ");\n");
	strb_appends(out, "\t\n");
	strb_appends(out, "\tif(V > maxV){\n");
	strb_appends(out, "\t\tmaxV = V;\n");
	appendIdxes(out, "\t\tmaxI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n");
	strb_appends(out, "\t}\n");

	/* ---- Loop trailers. ---- */
	for(axis = ctx->ndd; axis < ctx->nds; axis++){
		strb_appends(out, "\t}\n");
	}
	strb_appends(out, "\t\n");

	/* ---- Epilogue: store the results. ---- */
	for(line = 0; line < sizeof(writebackBanner)/sizeof(writebackBanner[0]); line++){
		strb_appends(out, writebackBanner[line]);
	}
	appendIdxes(out, "\tDSTMINDEXER(", "i", 0, ctx->ndd, "", ") = maxV;\n");
	appendIdxes(out, "\tDSTAINDEXER(", "i", 0, ctx->ndd, "", ") = maxI;\n");
}
/**
 * Append the local variable declarations of the generated kernel.
 *
 * Emits, in order:
 *   - the 3D kernel coordinates (bi*, bd*, ti*, gi*),
 *   - one ciN variable per hardware axis, loaded from chunkSize[N]
 *     (only when ctx->ndh > 0),
 *   - the per-axis index variables iN, iNDim, iNStart, iNEnd, iNSStep for
 *     all ctx->nds axes, iNMStep / iNAStep for the first ctx->ndd axes, and
 *     iNPDim for the axes ndd..nds-1.
 *
 * Each group is skipped entirely when it would be empty, so that no
 * declaration without a declarator ("X ;") is generated.
 */
static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx){
	int i;

	strb_appends(&ctx->s, "\t/* GPU kernel coordinates. Always 3D. */\n");
	strb_appends(&ctx->s, "\tX bi0 = GID_0, bi1 = GID_1, bi2 = GID_2;\n");
	strb_appends(&ctx->s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n");
	strb_appends(&ctx->s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n");
	strb_appends(&ctx->s, "\tX gi0 = bi0*bd0+ti0, gi1 = bi1*bd1+ti1, gi2 = bi2*bd2+ti2;\n");

	if(ctx->ndh>0){
		strb_appends(&ctx->s, "\tX ");
		for(i=0;i<ctx->ndh;i++){
			/* i is an int, so the conversion must be %d rather than %u. */
			strb_appendf(&ctx->s, "ci%d = chunkSize[%d]%s",
			             i, i, (i==ctx->ndh-1) ? ";\n" : ", ");
		}
	}

	strb_appends(&ctx->s, "\t\n");
	strb_appends(&ctx->s, "\t\n");
	strb_appends(&ctx->s, "\t/* Free indices & Reduction indices */\n");

	/* Variables that exist for every source axis. */
	if(ctx->nds > 0){
		appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "",      ";\n");
		appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Dim",   ";\n");
		appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Start", ";\n");
		appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "End",   ";\n");
		appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "SStep", ";\n");
	}

	/* Destination steps exist only for the first ndd axes. */
	if(ctx->ndd > 0){
		appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "MStep", ";\n");
		appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "AStep", ";\n");
	}

	/* Cumulative products exist only for the axes ndd..nds-1. */
	if(ctx->nds > ctx->ndd){
		appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim", ";\n");
	}

	strb_appends(&ctx->s, "\t\n");
	strb_appends(&ctx->s, "\t\n");
}
/**
 * Append the prototype of the generated "maxandargmax" kernel, up to and
 * including the closing parenthesis of the parameter list. No newline or
 * opening brace is written after it.
 */
static void maxandargmaxAppendPrototype (maxandargmax_ctx* ctx){
	static const char* const proto[] = {
		"KERNEL void maxandargmax(const GLOBAL_MEM T* src,\n",
		" const X srcOff,\n",
		" const GLOBAL_MEM X* srcSteps,\n",
		" const GLOBAL_MEM X* srcSize,\n",
		" const GLOBAL_MEM X* chunkSize,\n",
		" GLOBAL_MEM T* dstMax,\n",
		" const X dstMaxOff,\n",
		" const GLOBAL_MEM X* dstMaxSteps,\n",
		" GLOBAL_MEM X* dstArgmax,\n",
		" const X dstArgmaxOff,\n",
		" const GLOBAL_MEM X* dstArgmaxSteps)",
	};
	size_t n;

	for(n = 0; n < sizeof(proto)/sizeof(proto[0]); n++){
		strb_appends(&ctx->s, proto[n]);
	}
}
static gpukernel *cuda_newkernel(void *c, unsigned int count, const char **strings, const size_t *lengths, const char *fname, unsigned int argcount, const int *types, int flags, int *ret, char **err_str) { cuda_context *ctx = (cuda_context *)c; strb sb = STRB_STATIC_INIT; char *bin, *log = NULL; srckey k, *ak; binval *av; gpukernel *res; size_t bin_len = 0, log_len = 0; CUdevice dev; unsigned int i; int ptx_mode = 0; int binary_mode = 0; int major, minor; if (count == 0) FAIL(NULL, GA_VALUE_ERROR); if (flags & GA_USE_OPENCL) FAIL(NULL, GA_DEVSUP_ERROR); if (flags & GA_USE_BINARY) { // GA_USE_BINARY is exclusive if (flags & ~GA_USE_BINARY) FAIL(NULL, GA_INVALID_ERROR); // We need the length for binary data and there is only one blob. if (count != 1 || lengths == NULL || lengths[0] == 0) FAIL(NULL, GA_VALUE_ERROR); } cuda_enter(ctx); ctx->err = cuCtxGetDevice(&dev); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } ctx->err = cuDeviceComputeCapability(&major, &minor, dev); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } // GA_USE_CLUDA is done later // GA_USE_SMALL will always work if (flags & GA_USE_DOUBLE) { if (major < 1 || (major == 1 && minor < 3)) { cuda_exit(ctx); FAIL(NULL, GA_DEVSUP_ERROR); } } if (flags & GA_USE_COMPLEX) { // just for now since it is most likely broken cuda_exit(ctx); FAIL(NULL, GA_DEVSUP_ERROR); } // GA_USE_HALF should always work if (flags & GA_USE_PTX) { ptx_mode = 1; } else if (flags & GA_USE_BINARY) { binary_mode = 1; } if (binary_mode) { bin = memdup(strings[0], lengths[0]); bin_len = lengths[0]; if (bin == NULL) { cuda_exit(ctx); FAIL(NULL, GA_MEMORY_ERROR); } } else { if (flags & GA_USE_CLUDA) { strb_appends(&sb, CUDA_PREAMBLE); } if (lengths == NULL) { for (i = 0; i < count; i++) strb_appends(&sb, strings[i]); } else { for (i = 0; i < count; i++) { if (lengths[i] == 0) strb_appends(&sb, strings[i]); else strb_appendn(&sb, strings[i], lengths[i]); } } strb_append0(&sb); if 
(strb_error(&sb)) { strb_clear(&sb); cuda_exit(ctx); return NULL; } if (ptx_mode) { bin = sb.s; bin_len = sb.l; } else { bin = NULL; if (compile_cache != NULL) { k.src = sb.s; k.len = sb.l; memcpy(k.arch, ctx->bin_id, BIN_ID_LEN); av = cache_get(compile_cache, &k); if (av != NULL) { bin = memdup(av->bin, av->len); bin_len = av->len; } } if (bin == NULL) { bin = call_compiler(sb.s, sb.l, ctx->bin_id, &bin_len, &log, &log_len, ret); } if (bin == NULL) { if (err_str != NULL) { strb debug_msg = STRB_STATIC_INIT; // We're substituting debug_msg for a string with this first line: strb_appends(&debug_msg, "CUDA kernel build failure ::\n"); /* Delete the final NUL */ sb.l--; gpukernel_source_with_line_numbers(1, (const char **)&sb.s, &sb.l, &debug_msg); if (log != NULL) { strb_appends(&debug_msg, "\nCompiler log:\n"); strb_appendn(&debug_msg, log, log_len); free(log); } *err_str = strb_cstr(&debug_msg); // *err_str will be free()d by the caller (see docs in kernel.h) } strb_clear(&sb); cuda_exit(ctx); return NULL; } if (compile_cache == NULL) compile_cache = cache_twoq(16, 16, 16, 8, src_eq, src_hash, src_free, bin_free); if (compile_cache != NULL) { ak = malloc(sizeof(*ak)); av = malloc(sizeof(*av)); if (ak == NULL || av == NULL) { free(ak); free(av); goto done; } ak->src = memdup(sb.s, sb.l); if (ak->src == NULL) { free(ak); free(av); goto done; } ak->len = sb.l; memmove(ak->arch, ctx->bin_id, BIN_ID_LEN); av->len = bin_len; av->bin = memdup(bin, bin_len); if (av->bin == NULL) { src_free(ak); free(av); goto done; } cache_add(compile_cache, ak, av); } done: strb_clear(&sb); } } res = calloc(1, sizeof(*res)); if (res == NULL) { free(bin); cuda_exit(ctx); FAIL(NULL, GA_SYS_ERROR); } res->bin_sz = bin_len; res->bin = bin; res->refcnt = 1; res->argcount = argcount; res->types = calloc(argcount, sizeof(int)); if (res->types == NULL) { _cuda_freekernel(res); cuda_exit(ctx); FAIL(NULL, GA_MEMORY_ERROR); } memcpy(res->types, types, argcount*sizeof(int)); res->args = 
calloc(argcount, sizeof(void *)); if (res->args == NULL) { _cuda_freekernel(res); cuda_exit(ctx); FAIL(NULL, GA_MEMORY_ERROR); } ctx->err = cuModuleLoadData(&res->m, bin); if (ctx->err != CUDA_SUCCESS) { _cuda_freekernel(res); cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } ctx->err = cuModuleGetFunction(&res->k, res->m, fname); if (ctx->err != CUDA_SUCCESS) { _cuda_freekernel(res); cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } res->ctx = ctx; ctx->refcnt++; cuda_exit(ctx); TAG_KER(res); return res; }
/**
 * Generate and build the "take1" kernel.
 *
 * The generated kernel takes, in order: r and r_off, v and v_off, one
 * (stride, dimension) pair per dimension of `v`, ind and i_off, n0, n1 and
 * an error flag buffer. For every pair (i0, i1) it reads ind[i0], wraps a
 * negative value by d0, writes -1 to *err and skips the row when the value
 * is still out of range, and otherwise copies one element of v into
 * r[i0*n1 + i1].
 *
 * @param k        Kernel object initialised through GpuKernel_init().
 * @param ctx      Context the kernel is built in.
 * @param err_str  Forwarded to GpuKernel_init().
 * @param a        Array whose element type is used for `r`.
 * @param v        Array whose element type and number of dimensions are
 *                 used for `v` and the stride/dimension arguments.
 * @param ind      Array whose element type is used for `ind`.
 * @param addr32   Non-zero to use ga_uint / ga_int for the index arithmetic,
 *                 zero to use ga_size / ga_ssize.
 * @return         GA_MEMORY_ERROR if an allocation or the string buffer
 *                 failed, otherwise the result of GpuKernel_init().
 */
static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str,
                            GpuArray *a, const GpuArray *v,
                            const GpuArray *ind, int addr32) {
  strb src = STRB_STATIC_INIT;
  const char *utype = addr32 ? "ga_uint" : "ga_size";   /* unsigned index */
  const char *stype = addr32 ? "ga_int" : "ga_ssize";   /* signed index */
  unsigned int nargs = 9 + 2 * v->nd;
  unsigned int filled = 0;
  unsigned int dim;
  int flags = GA_USE_CLUDA;
  int status;
  int *argtypes;

  argtypes = calloc(nargs, sizeof(int));
  if (argtypes == NULL)
    return GA_MEMORY_ERROR;

  /* Signature: destination and source buffers with their offsets. */
  strb_appendf(&src, "KERNEL void take1(GLOBAL_MEM %s *r, ga_size r_off, "
               "GLOBAL_MEM const %s *v, ga_size v_off,",
               gpuarray_get_type(a->typecode)->cluda_name,
               gpuarray_get_type(v->typecode)->cluda_name);
  argtypes[filled++] = GA_BUFFER;
  argtypes[filled++] = GA_SIZE;
  argtypes[filled++] = GA_BUFFER;
  argtypes[filled++] = GA_SIZE;

  /* Signature: one stride/dimension pair per dimension of v. */
  for (dim = 0; dim < v->nd; dim++) {
    strb_appendf(&src, " ga_ssize s%u, ga_size d%u,", dim, dim);
    argtypes[filled++] = GA_SSIZE;
    argtypes[filled++] = GA_SIZE;
  }

  /* Signature: index buffer, sizes and the error flag. */
  strb_appendf(&src, " GLOBAL_MEM const %s *ind, ga_size i_off, "
               "ga_size n0, ga_size n1, GLOBAL_MEM int* err) {\n",
               gpuarray_get_type(ind->typecode)->cluda_name);
  argtypes[filled++] = GA_BUFFER;
  argtypes[filled++] = GA_SIZE;
  argtypes[filled++] = GA_SIZE;
  argtypes[filled++] = GA_SIZE;
  argtypes[filled++] = GA_BUFFER;
  assert(filled == nargs);

  /* Body: thread coordinates and early exit. */
  strb_appendf(&src, " const %s idx0 = LDIM_0 * GID_0 + LID_0;\n"
               " const %s numThreads0 = LDIM_0 * GDIM_0;\n"
               " const %s idx1 = LDIM_1 * GID_1 + LID_1;\n"
               " const %s numThreads1 = LDIM_1 * GDIM_1;\n"
               " %s i0, i1;\n",
               utype, utype, utype, utype, utype);
  strb_appends(&src, " if (idx0 >= n0 || idx1 >= n1) return;\n");

  /* Body: apply the byte offsets of r and ind. */
  strb_appendf(&src, " r = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)r) + r_off);\n"
               " ind = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)ind) + i_off);\n",
               gpuarray_get_type(a->typecode)->cluda_name,
               gpuarray_get_type(ind->typecode)->cluda_name);

  /* Body: outer loop over the indices, with bounds checking. */
  strb_appendf(&src, " for (i0 = idx0; i0 < n0; i0 += numThreads0) {\n"
               " %s ii0 = ind[i0];\n"
               " %s pos0 = v_off;\n"
               " if (ii0 < 0) ii0 += d0;\n"
               " if ((ii0 < 0) || (ii0 >= d0)) {\n"
               " *err = -1;\n"
               " continue;\n"
               " }\n"
               " pos0 += ii0 * (%s)s0;\n"
               " for (i1 = idx1; i1 < n1; i1 += numThreads1) {\n"
               " %s p = pos0;\n",
               stype, utype, utype, utype);

  /*
   * Body: decompose i1 into per-dimension positions, walking from the last
   * dimension down to dimension 1. Dimension 1 takes whatever remains.
   */
  if (v->nd > 1) {
    strb_appendf(&src, " %s pos, ii = i1;\n", utype);
    for (dim = v->nd - 1; dim >= 1; dim--) {
      if (dim > 1) {
        strb_appendf(&src, " pos = ii %% (%s)d%u;\n"
                     " ii /= (%s)d%u;\n",
                     utype, dim, utype, dim);
      } else {
        strb_appends(&src, " pos = ii;\n");
      }
      strb_appendf(&src, " p += pos * (%s)s%u;\n", stype, dim);
    }
  }

  /* Body: the actual copy, then close both loops and the function. */
  strb_appendf(&src, " r[i0*((%s)n1) + i1] = *((GLOBAL_MEM %s *)(((GLOBAL_MEM char *)v) + p));\n",
               utype, gpuarray_get_type(v->typecode)->cluda_name);
  strb_appends(&src, " }\n"
               " }\n"
               "}\n");

  if (strb_error(&src)) {
    status = GA_MEMORY_ERROR;
  } else {
    flags |= gpuarray_type_flags(a->typecode, v->typecode, GA_BYTE, -1);
    status = GpuKernel_init(k, ctx, 1, (const char **)&src.s, &src.l, "take1",
                            nargs, argtypes, flags, err_str);
  }

  free(argtypes);
  strb_clear(&src);
  return status;
}