static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx){
	size_t hwDim;
	int    i;

	/* Use internal remapping when computing the ranges for this thread. */
	strb_appends(&ctx->s, "\t/* Compute ranges for this thread. */\n");

	for(i=0;i<ctx->nds;i++){
		strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, ctx->axisList[i]);
	}
	for(i=0;i<ctx->nds;i++){
		strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->axisList[i]);
	}
	for(i=0;i<ctx->ndd;i++){
		strb_appendf(&ctx->s, "\ti%dMStep = dstMaxSteps[%d];\n", i, i);
	}
	for(i=0;i<ctx->ndd;i++){
		strb_appendf(&ctx->s, "\ti%dAStep = dstArgmaxSteps[%d];\n", i, i);
	}
	for(i=ctx->nds-1;i>=ctx->ndd;i--){
		/**
		 * If this is the last index, it's the first cumulative dimension
		 * product we generate, and thus we initialize to 1.
		 */
		if(i == ctx->nds-1){
			strb_appendf(&ctx->s, "\ti%dPDim = 1;\n", i);
		}else{
			strb_appendf(&ctx->s, "\ti%dPDim = i%dPDim * i%dDim;\n", i, i+1, i+1);
		}
	}
	for(i=0;i<ctx->nds;i++){
		/**
		 * Up to 3 dimensions get to rely on hardware loops.
		 * The others, if any, have to use software looping beginning at 0.
		 */
		if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){
			strb_appendf(&ctx->s, "\ti%dStart = gi%d * ci%d;\n",
			             i, (int)hwDim, (int)hwDim);
		}else{
			strb_appendf(&ctx->s, "\ti%dStart = 0;\n", i);
		}
	}
	for(i=0;i<ctx->nds;i++){
		/**
		 * Up to 3 dimensions get to rely on hardware loops.
		 * The others, if any, have to use software looping beginning at 0.
		 */
		if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){
			strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + ci%d;\n",
			             i, i, (int)hwDim);
		}else{
			strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + i%dDim;\n", i, i, i);
		}
	}

	strb_appends(&ctx->s, "\t\n");
	strb_appends(&ctx->s, "\t\n");
}
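/*
 * For illustration only (the shapes below are assumed, not taken from a
 * real call): with nds == 3, ndd == 1, axisList == {0, 1, 2} and axis 0 as
 * the only hardware axis, the function above emits kernel code roughly
 * like:
 *
 *     i0Dim = srcSize[0];
 *     i1Dim = srcSize[1];
 *     i2Dim = srcSize[2];
 *     i0SStep = srcSteps[0];
 *     i1SStep = srcSteps[1];
 *     i2SStep = srcSteps[2];
 *     i0MStep = dstMaxSteps[0];
 *     i0AStep = dstArgmaxSteps[0];
 *     i2PDim = 1;
 *     i1PDim = i2PDim * i2Dim;
 *     i0Start = gi0 * ci0;
 *     i1Start = 0;
 *     i2Start = 0;
 *     i0End = i0Start + ci0;
 *     i1End = i1Start + i1Dim;
 *     i2End = i2Start + i2Dim;
 */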
static void maxandargmaxAppendTypedefs(maxandargmax_ctx* ctx){
	strb_appends(&ctx->s, "/* Typedefs */\n");
	strb_appendf(&ctx->s, "typedef %s T;/* The type of the array being processed. */\n", ctx->dstMaxType);
	strb_appendf(&ctx->s, "typedef %s X;/* Index type: signed 32/64-bit. */\n", ctx->dstArgmaxType);
	strb_appends(&ctx->s, "\n");
	strb_appends(&ctx->s, "\n");
	strb_appends(&ctx->s, "\n");
}
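/*
 * E.g. with dstMaxType == "float" and dstArgmaxType == "long" (assumed
 * values), this produces (trailing comments elided, since they cannot be
 * nested inside this comment):
 *
 *     typedef float T;
 *     typedef long X;
 */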
static void maxandargmaxAppendLoopMacroDefs(maxandargmax_ctx* ctx){
	int i;

	/**
	 * FOROVER Macro
	 */
	strb_appends(&ctx->s, "#define FOROVER(idx) for(i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n");

	/**
	 * ESCAPE Macro
	 */
	strb_appends(&ctx->s, "#define ESCAPE(idx) if(i##idx >= i##idx##Dim){continue;}\n");

	/**
	 * SRCINDEXER Macro
	 */
	appendIdxes (&ctx->s, "#define SRCINDEXER(", "i", 0, ctx->nds, "",
	             ") (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)src + ");
	for(i=0;i<ctx->nds;i++){
		strb_appendf(&ctx->s, "i%d*i%dSStep + \\\n ", i, i);
	}
	strb_appends(&ctx->s, "0))\n");

	/**
	 * RDXINDEXER Macro
	 */
	appendIdxes (&ctx->s, "#define RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ") (");
	for(i=ctx->ndd;i<ctx->nds;i++){
		strb_appendf(&ctx->s, "i%d*i%dPDim + \\\n ", i, i);
	}
	strb_appends(&ctx->s, "0)\n");

	/**
	 * DSTMINDEXER Macro
	 */
	appendIdxes (&ctx->s, "#define DSTMINDEXER(", "i", 0, ctx->ndd, "",
	             ") (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)dstMax + ");
	for(i=0;i<ctx->ndd;i++){
		strb_appendf(&ctx->s, "i%d*i%dMStep + \\\n ", i, i);
	}
	strb_appends(&ctx->s, "0))\n");

	/**
	 * DSTAINDEXER Macro
	 */
	appendIdxes (&ctx->s, "#define DSTAINDEXER(", "i", 0, ctx->ndd, "",
	             ") (*(GLOBAL_MEM X*)((GLOBAL_MEM char*)dstArgmax + ");
	for(i=0;i<ctx->ndd;i++){
		strb_appendf(&ctx->s, "i%d*i%dAStep + \\\n ", i, i);
	}
	strb_appends(&ctx->s, "0))\n");
}
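/*
 * A hedged sketch of the macros this generates for nds == 3, ndd == 1
 * (assumed dimensionality; line continuations and padding elided):
 *
 *     #define FOROVER(idx) for(i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)
 *     #define ESCAPE(idx) if(i##idx >= i##idx##Dim){continue;}
 *     #define SRCINDEXER(i0,i1,i2) (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)src + i0*i0SStep + i1*i1SStep + i2*i2SStep + 0))
 *     #define RDXINDEXER(i1,i2) (i1*i1PDim + i2*i2PDim + 0)
 *     #define DSTMINDEXER(i0) (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)dstMax + i0*i0MStep + 0))
 *     #define DSTAINDEXER(i0) (*(GLOBAL_MEM X*)((GLOBAL_MEM char*)dstArgmax + i0*i0AStep + 0))
 */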
static void maxandargmaxAppendLoopOuter(maxandargmax_ctx* ctx){
	int i;

	/**
	 * Outer Loop Header Generation
	 */
	for(i=0;i<ctx->ndd;i++){
		strb_appendf(&ctx->s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i);
	}

	/**
	 * Inner Loop Generation
	 */
	maxandargmaxAppendLoopInner(ctx);

	/**
	 * Outer Loop Trailer Generation
	 */
	for(i=0;i<ctx->ndd;i++){
		strb_appends(&ctx->s, "\t}\n");
	}
}
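/*
 * For ndd == 2 (assumed) the emitted wrapper is simply:
 *
 *     FOROVER(0){ESCAPE(0)
 *     FOROVER(1){ESCAPE(1)
 *     ...body generated by maxandargmaxAppendLoopInner()...
 *     }
 *     }
 */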
static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx){
	int i;

	strb_appends(&ctx->s, "\t/* GPU kernel coordinates. Always 3D. */\n");

	strb_appends(&ctx->s, "\tX bi0 = GID_0, bi1 = GID_1, bi2 = GID_2;\n");
	strb_appends(&ctx->s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n");
	strb_appends(&ctx->s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n");
	strb_appends(&ctx->s, "\tX gi0 = bi0*bd0+ti0, gi1 = bi1*bd1+ti1, gi2 = bi2*bd2+ti2;\n");

	if(ctx->ndh>0){
		strb_appends(&ctx->s, "\tX ");
		for(i=0;i<ctx->ndh;i++){
			strb_appendf(&ctx->s, "ci%d = chunkSize[%d]%s", i, i,
			             (i==ctx->ndh-1) ? ";\n" : ", ");
		}
	}

	strb_appends(&ctx->s, "\t\n");
	strb_appends(&ctx->s, "\t\n");
	strb_appends(&ctx->s, "\t/* Free indices & Reduction indices */\n");

	if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,        ctx->nds, "",      ";\n");}
	if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,        ctx->nds, "Dim",   ";\n");}
	if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,        ctx->nds, "Start", ";\n");}
	if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,        ctx->nds, "End",   ";\n");}
	if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,        ctx->nds, "SStep", ";\n");}
	if(ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,        ctx->ndd, "MStep", ";\n");}
	if(ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,        ctx->ndd, "AStep", ";\n");}
	if(ctx->nds > ctx->ndd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim", ";\n");}

	strb_appends(&ctx->s, "\t\n");
	strb_appends(&ctx->s, "\t\n");
}
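/*
 * A sketch of the generated declarations, assuming nds == 3, ndd == 1,
 * ndh == 2 (none of these values come from a real call):
 *
 *     X bi0 = GID_0, bi1 = GID_1, bi2 = GID_2;
 *     X bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;
 *     X ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;
 *     X gi0 = bi0*bd0+ti0, gi1 = bi1*bd1+ti1, gi2 = bi2*bd2+ti2;
 *     X ci0 = chunkSize[0], ci1 = chunkSize[1];
 *
 *     X i0,i1,i2;
 *     X i0Dim,i1Dim,i2Dim;
 *     X i0Start,i1Start,i2Start;
 *     X i0End,i1End,i2End;
 *     X i0SStep,i1SStep,i2SStep;
 *     X i0MStep;
 *     X i0AStep;
 *     X i1PDim,i2PDim;
 */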
void gpukernel_source_with_line_numbers(unsigned int count, const char **news,
                                        size_t *newl, strb *src) {
  unsigned int section, line, i, j;
  size_t len;

  assert(src != NULL);

  line = 1;  /* Start the line counter at 1. */
  for (section = 0; section < count; section++) {
    /* If newl is NULL, or records a zero length for this section, fall
       back on strlen() to determine the length. */
    len = (newl == NULL) ? 0 : newl[section];
    if (len == 0)
      len = strlen(news[section]);
    i = 0;  /* Position of line starts within news[section]. */
    while (i < len) {
      strb_appendf(src, "%04u\t", line);
      /* Look for the next line end. */
      for (j = i; j < len && news[section][j] != '\n'; j++);
      strb_appendn(src, news[section] + i, (j - i));
      strb_appendc(src, '\n');
      i = j + 1;  /* Character after the newline. */
      line++;
    }
  }
}
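/*
 * A minimal usage sketch (the kernel strings are hypothetical; strb usage
 * follows the conventions of this codebase). Note that the line counter
 * runs across section boundaries rather than restarting per section:
 *
 *     const char *srcs[2] = {"KERNEL void f() {\n}\n",
 *                            "KERNEL void g() {\n}\n"};
 *     strb sb = STRB_STATIC_INIT;
 *     gpukernel_source_with_line_numbers(2, srcs, NULL, &sb);
 *     // sb.s now reads:
 *     //   0001    KERNEL void f() {
 *     //   0002    }
 *     //   0003    KERNEL void g() {
 *     //   0004    }
 *     strb_clear(&sb);
 */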
void gpuarray_elem_perdim(strb *sb, unsigned int nd,
                          const size_t *dims, const ssize_t *str,
                          const char *id) {
  int i;

  if (nd > 0) {
    strb_appendf(sb, "int %si = i;", id);

    for (i = nd-1; i > 0; i--) {
      strb_appendf(sb, "%s %c= ((%si %% %" SPREFIX "u) * "
                   "%" SPREFIX "d);%si = %si / %" SPREFIX "u;", id,
                   (str[i] < 0 ? '-' : '+'), id, dims[i],
                   ssabs(str[i]), id, id, dims[i]);
    }

    strb_appendf(sb, "%s %c= (%si * %" SPREFIX "d);", id,
                 (str[0] < 0 ? '-' : '+'), id, ssabs(str[0]));
  }
}
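/*
 * For example (assumed arguments, shown only to illustrate the output):
 * gpuarray_elem_perdim(&sb, 2, dims, str, "a") with dims == {3, 4} and
 * str == {16, 4} appends C source along these lines:
 *
 *     int ai = i;
 *     a += ((ai % 4) * 4);ai = ai / 4;
 *     a += (ai * 16);
 *
 * i.e. it peels the flattened element index i into per-dimension
 * coordinates and accumulates the element's offset into a.
 */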
static void cuda_perdim_ptx(strb *sb, unsigned int nd,
                            const size_t *dims, const ssize_t *str,
                            const char *id, unsigned int bits) {
  int i;

  if (nd > 0) {
    strb_appendf(sb, "mov.u%u %si, i;\n", bits, id);

    for (i = nd-1; i > 0; i--) {
      strb_appendf(sb, "rem.u%u rl1, %si, %" SPREFIX "uU;\n"
                   "mad.lo.s%u %s, rl1, %" SPREFIX "d, %s;\n"
                   "div.u%u %si, %si, %" SPREFIX "uU;\n",
                   bits, id, dims[i],
                   bits, id, str[i], id,
                   bits, id, id, dims[i]);
    }

    strb_appendf(sb, "mad.lo.s%u %s, %si, %" SPREFIX "d, %s;\n",
                 bits, id, id, str[0], id);
  }
}
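/*
 * Under the same assumed arguments as the example above (nd == 2,
 * dims == {3, 4}, str == {16, 4}, id == "a_p", bits == 64), this emits
 * PTX roughly like the following; the registers a_p, a_pi, rl1 and i are
 * assumed to be declared by the surrounding kernel template
 * (ELEM_HEADER_PTX). Unlike the C generator, the raw signed strides are
 * used directly rather than being split into sign and magnitude:
 *
 *     mov.u64 a_pi, i;
 *     rem.u64 rl1, a_pi, 4U;
 *     mad.lo.s64 a_p, rl1, 4, a_p;
 *     div.u64 a_pi, a_pi, 4U;
 *     mad.lo.s64 a_p, a_pi, 16, a_p;
 */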
static void appendIdxes(strb* s, const char* prologue, const char* prefix,
                        int startIdx, int endIdx, const char* suffix,
                        const char* epilogue){
	int i;

	prologue = prologue ? prologue : "";
	prefix   = prefix   ? prefix   : "";
	suffix   = suffix   ? suffix   : "";
	epilogue = epilogue ? epilogue : "";

	strb_appends(s, prologue);
	for(i=startIdx;i<endIdx;i++){
		/* &","[1] points at the string's NUL terminator, i.e. "", so the
		 * comma separator is omitted after the last index. */
		strb_appendf(s, "%s%d%s%s", prefix, i, suffix, &","[i==endIdx-1]);
	}
	strb_appends(s, epilogue);
}
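/*
 * Two illustrative calls (the buffer s is assumed already initialized):
 *
 *     appendIdxes(&s, "\tX ", "i", 0, 3, "", ";\n");
 *         appends "\tX i0,i1,i2;\n"
 *     appendIdxes(&s, "RDXINDEXER(", "i", 1, 3, "Start", ")");
 *         appends "RDXINDEXER(i1Start,i2Start)"
 */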
static void maxandargmaxAppendLoopInner(maxandargmax_ctx* ctx){
	int i;

	/**
	 * Inner Loop Prologue
	 */
	strb_appends(&ctx->s, "\t/**\n");
	strb_appends(&ctx->s, "\t * Reduction initialization.\n");
	strb_appends(&ctx->s, "\t */\n");
	strb_appends(&ctx->s, "\t\n");

	appendIdxes (&ctx->s, "\tT maxV = SRCINDEXER(", "i", 0, ctx->ndd, "", "");
	if(ctx->ndd && ctx->ndr){strb_appends(&ctx->s, ",");}
	appendIdxes (&ctx->s, "", "i", ctx->ndd, ctx->nds, "Start", ");\n");
	appendIdxes (&ctx->s, "\tX maxI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "Start", ");\n");

	strb_appends(&ctx->s, "\t\n");
	strb_appends(&ctx->s, "\t/**\n");
	strb_appends(&ctx->s, "\t * REDUCTION LOOPS.\n");
	strb_appends(&ctx->s, "\t */\n");
	strb_appends(&ctx->s, "\t\n");

	/**
	 * Inner Loop Header Generation
	 */
	for(i=ctx->ndd;i<ctx->nds;i++){
		strb_appendf(&ctx->s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i);
	}

	/**
	 * Inner Loop Body Generation
	 */
	appendIdxes (&ctx->s, "\tT V = SRCINDEXER(", "i", 0, ctx->nds, "", ");\n");
	strb_appends(&ctx->s, "\t\n");
	strb_appends(&ctx->s, "\tif(V > maxV){\n");
	strb_appends(&ctx->s, "\t\tmaxV = V;\n");
	appendIdxes (&ctx->s, "\t\tmaxI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n");
	strb_appends(&ctx->s, "\t}\n");

	/**
	 * Inner Loop Trailer Generation
	 */
	for(i=ctx->ndd;i<ctx->nds;i++){
		strb_appends(&ctx->s, "\t}\n");
	}
	strb_appends(&ctx->s, "\t\n");

	/**
	 * Inner Loop Epilogue Generation
	 */
	strb_appends(&ctx->s, "\t/**\n");
	strb_appends(&ctx->s, "\t * Destination writeback.\n");
	strb_appends(&ctx->s, "\t */\n");
	strb_appends(&ctx->s, "\t\n");

	appendIdxes (&ctx->s, "\tDSTMINDEXER(", "i", 0, ctx->ndd, "", ") = maxV;\n");
	appendIdxes (&ctx->s, "\tDSTAINDEXER(", "i", 0, ctx->ndd, "", ") = maxI;\n");
}
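/*
 * A hedged sketch of the generated inner loop for nds == 2, ndd == 1
 * (assumed: one free index i0 and one reduction index i1; the generated
 * block comments are elided here):
 *
 *     T maxV = SRCINDEXER(i0,i1Start);
 *     X maxI = RDXINDEXER(i1Start);
 *
 *     FOROVER(1){ESCAPE(1)
 *     T V = SRCINDEXER(i0,i1);
 *
 *     if(V > maxV){
 *         maxV = V;
 *         maxI = RDXINDEXER(i1);
 *     }
 *     }
 *
 *     DSTMINDEXER(i0) = maxV;
 *     DSTAINDEXER(i0) = maxI;
 */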
static inline int gen_extcopy_kernel(const extcopy_args *a,
                                     cuda_context *ctx, gpukernel **v,
                                     size_t nEls) {
  strb sb = STRB_STATIC_INIT;
  int res = GA_SYS_ERROR;
  int flags = GA_USE_PTX;
  unsigned int bits = sizeof(void *)*8;
  int types[2];
  const char *in_t, *in_ld_t;
  const char *out_t, *out_ld_t;
  const char *rmod;

  in_t = map_t(a->itype);
  out_t = map_t(a->otype);
  /* Since float16 ('f16') is not a fully-supported type we need to use
     it as b16 (basically uint16) for read and write operations. */
  if (a->itype == GA_HALF)
    in_ld_t = "b16";
  else
    in_ld_t = in_t;
  if (a->otype == GA_HALF)
    out_ld_t = "b16";
  else
    out_ld_t = out_t;
  rmod = get_rmod(a->itype, a->otype);
  if (in_t == NULL || out_t == NULL) return GA_DEVSUP_ERROR;

  strb_appendf(&sb, ELEM_HEADER_PTX, "4.1", ctx->bin_id, bits, bits, bits,
               bits, in_t, out_t, bits, bits, bits, bits, bits,
               nEls, bits, bits);

  cuda_perdim_ptx(&sb, a->ind, a->idims, a->istr, "a_p", bits);
  cuda_perdim_ptx(&sb, a->ond, a->odims, a->ostr, "b_p", bits);

  strb_appendf(&sb, "ld.param.u%u rp1, [a_data];\n"
               "cvt.s%u.s%u rp2, a_p;\n"
               "add.s%u rp1, rp1, rp2;\n"
               "ld.global.%s tmpa, [rp1+%" SPREFIX "u];\n"
               "cvt%s.%s.%s tmpb, tmpa;\n"
               "ld.param.u%u rp1, [b_data];\n"
               "cvt.s%u.s%u rp2, b_p;\n"
               "add.s%u rp1, rp1, rp2;\n"
               "st.global.%s [rp1+%" SPREFIX "u], tmpb;\n",
               bits, bits, bits, bits, in_ld_t, a->ioff, rmod, out_t, in_t,
               bits, bits, bits, bits, out_ld_t, a->ooff);

  strb_appendf(&sb, ELEM_FOOTER_PTX, bits, bits, nEls);

  if (strb_error(&sb))
    goto fail;

  if (a->itype == GA_DOUBLE || a->otype == GA_DOUBLE ||
      a->itype == GA_CDOUBLE || a->otype == GA_CDOUBLE) {
    flags |= GA_USE_DOUBLE;
  }

  if (a->otype == GA_HALF || a->itype == GA_HALF) {
    flags |= GA_USE_HALF;
  }

  if (gpuarray_get_elsize(a->otype) < 4 || gpuarray_get_elsize(a->itype) < 4) {
    /* Should check for non-mod4 strides too */
    flags |= GA_USE_SMALL;
  }

  if (a->otype == GA_CFLOAT || a->itype == GA_CFLOAT ||
      a->otype == GA_CDOUBLE || a->itype == GA_CDOUBLE) {
    flags |= GA_USE_COMPLEX;
  }

  types[0] = types[1] = GA_BUFFER;
  res = GA_NO_ERROR;
  *v = cuda_newkernel(ctx, 1, (const char **)&sb.s, &sb.l, "extcpy",
                      2, types, flags, &res, NULL);
 fail:
  strb_clear(&sb);
  return res;
}
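/*
 * The PTX core emitted for, e.g., a float -> double copy on a 64-bit
 * context would look roughly like this (zero offsets assumed; the exact
 * cvt modifier depends on what get_rmod() returns for the type pair):
 *
 *     ld.param.u64 rp1, [a_data];
 *     cvt.s64.s64 rp2, a_p;
 *     add.s64 rp1, rp1, rp2;
 *     ld.global.f32 tmpa, [rp1+0];
 *     cvt.f64.f32 tmpb, tmpa;
 *     ld.param.u64 rp1, [b_data];
 *     cvt.s64.s64 rp2, b_p;
 *     add.s64 rp1, rp1, rp2;
 *     st.global.f64 [rp1+0], tmpb;
 */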
static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str,
                            GpuArray *a, const GpuArray *v,
                            const GpuArray *ind, int addr32) {
  strb sb = STRB_STATIC_INIT;
  int *atypes;
  char *sz, *ssz;
  unsigned int i, i2;
  unsigned int nargs, apos;
  int flags = GA_USE_CLUDA;
  int res;

  nargs = 9 + 2 * v->nd;

  atypes = calloc(nargs, sizeof(int));
  if (atypes == NULL)
    return GA_MEMORY_ERROR;

  if (addr32) {
    sz = "ga_uint";
    ssz = "ga_int";
  } else {
    sz = "ga_size";
    ssz = "ga_ssize";
  }

  apos = 0;
  strb_appendf(&sb, "KERNEL void take1(GLOBAL_MEM %s *r, ga_size r_off, "
               "GLOBAL_MEM const %s *v, ga_size v_off,",
               gpuarray_get_type(a->typecode)->cluda_name,
               gpuarray_get_type(v->typecode)->cluda_name);
  atypes[apos++] = GA_BUFFER;
  atypes[apos++] = GA_SIZE;
  atypes[apos++] = GA_BUFFER;
  atypes[apos++] = GA_SIZE;
  for (i = 0; i < v->nd; i++) {
    strb_appendf(&sb, " ga_ssize s%u, ga_size d%u,", i, i);
    atypes[apos++] = GA_SSIZE;
    atypes[apos++] = GA_SIZE;
  }
  strb_appendf(&sb, " GLOBAL_MEM const %s *ind, ga_size i_off, "
               "ga_size n0, ga_size n1, GLOBAL_MEM int* err) {\n",
               gpuarray_get_type(ind->typecode)->cluda_name);
  atypes[apos++] = GA_BUFFER;
  atypes[apos++] = GA_SIZE;
  atypes[apos++] = GA_SIZE;
  atypes[apos++] = GA_SIZE;
  atypes[apos++] = GA_BUFFER;
  assert(apos == nargs);

  strb_appendf(&sb, "  const %s idx0 = LDIM_0 * GID_0 + LID_0;\n"
               "  const %s numThreads0 = LDIM_0 * GDIM_0;\n"
               "  const %s idx1 = LDIM_1 * GID_1 + LID_1;\n"
               "  const %s numThreads1 = LDIM_1 * GDIM_1;\n"
               "  %s i0, i1;\n",
               sz, sz, sz, sz, sz);
  strb_appends(&sb, "  if (idx0 >= n0 || idx1 >= n1) return;\n");
  strb_appendf(&sb, "  r = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)r) + r_off);\n"
               "  ind = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)ind) + i_off);\n",
               gpuarray_get_type(a->typecode)->cluda_name,
               gpuarray_get_type(ind->typecode)->cluda_name);
  strb_appendf(&sb, "  for (i0 = idx0; i0 < n0; i0 += numThreads0) {\n"
               "    %s ii0 = ind[i0];\n"
               "    %s pos0 = v_off;\n"
               "    if (ii0 < 0) ii0 += d0;\n"
               "    if ((ii0 < 0) || (ii0 >= d0)) {\n"
               "      *err = -1;\n"
               "      continue;\n"
               "    }\n"
               "    pos0 += ii0 * (%s)s0;\n"
               "    for (i1 = idx1; i1 < n1; i1 += numThreads1) {\n"
               "      %s p = pos0;\n",
               ssz, sz, sz, sz);
  if (v->nd > 1) {
    strb_appendf(&sb, "      %s pos, ii = i1;\n", sz);
    for (i2 = v->nd; i2 > 1; i2--) {
      i = i2 - 1;
      if (i > 1)
        strb_appendf(&sb, "      pos = ii %% (%s)d%u;\n"
                     "      ii /= (%s)d%u;\n", sz, i, sz, i);
      else
        strb_appends(&sb, "      pos = ii;\n");
      strb_appendf(&sb, "      p += pos * (%s)s%u;\n", ssz, i);
    }
  }
  strb_appendf(&sb, "      r[i0*((%s)n1) + i1] = *((GLOBAL_MEM %s *)"
               "(((GLOBAL_MEM char *)v) + p));\n",
               sz, gpuarray_get_type(v->typecode)->cluda_name);
  strb_appends(&sb, "    }\n"
               "  }\n"
               "}\n");

  if (strb_error(&sb)) {
    res = GA_MEMORY_ERROR;
    goto bail;
  }

  flags |= gpuarray_type_flags(a->typecode, v->typecode, GA_BYTE, -1);
  res = GpuKernel_init(k, ctx, 1, (const char **)&sb.s, &sb.l, "take1",
                       nargs, atypes, flags, err_str);
 bail:
  free(atypes);
  strb_clear(&sb);
  return res;
}
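/*
 * The generated CLUDA source, sketched for a 1-D float v gathered by a
 * long index array with addr32 == 0 (the type names here stand in for
 * whatever cluda_name yields for the actual typecodes):
 *
 *     KERNEL void take1(GLOBAL_MEM float *r, ga_size r_off,
 *                       GLOBAL_MEM const float *v, ga_size v_off,
 *                       ga_ssize s0, ga_size d0,
 *                       GLOBAL_MEM const ga_long *ind, ga_size i_off,
 *                       ga_size n0, ga_size n1, GLOBAL_MEM int* err) {
 *       ...
 *       for (i0 = idx0; i0 < n0; i0 += numThreads0) {
 *         ga_ssize ii0 = ind[i0];
 *         ...          // negative indices wrap; out-of-range sets *err = -1
 *       }
 *     }
 */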