/**
 * Initialize `a` as an nd-dimensional view over an existing buffer `data`.
 *
 * Retains a reference on `data`; copies `dims`/`strides` into freshly
 * allocated arrays owned by `a`.  `writeable` controls the GA_WRITEABLE
 * flag; the remaining flags are recomputed by GpuArray_fix_flags().
 *
 * Returns GA_NO_ERROR on success, GA_VALUE_ERROR for an unknown typecode,
 * GA_MEMORY_ERROR on allocation failure (in which case `a` is cleared).
 */
int GpuArray_fromdata(GpuArray *a, gpudata *data, size_t offset,
                      int typecode, unsigned int nd, const size_t *dims,
                      const ssize_t *strides, int writeable) {
  if (gpuarray_get_type(typecode)->typecode != typecode)
    return GA_VALUE_ERROR;
  assert(data != NULL);
  a->data = data;
  gpudata_retain(a->data);
  a->nd = nd;
  a->offset = offset;
  a->typecode = typecode;
  a->dimensions = calloc(nd, sizeof(size_t));
  a->strides = calloc(nd, sizeof(ssize_t));
  a->flags = (writeable ? GA_WRITEABLE : 0);
  /* calloc(0, ...) may legally return NULL (C11 7.22.3); for a 0-d
     array that is not an out-of-memory condition, so only treat NULL
     as an error when nd > 0. */
  if (nd > 0 && (a->dimensions == NULL || a->strides == NULL)) {
    GpuArray_clear(a);
    return GA_MEMORY_ERROR;
  }
  /* Skip the copies for nd == 0: memcpy with a NULL destination is UB
     even when the byte count is zero. */
  if (nd > 0) {
    memcpy(a->dimensions, dims, nd * sizeof(size_t));
    memcpy(a->strides, strides, nd * sizeof(ssize_t));
  }
  GpuArray_fix_flags(a);
  return GA_NO_ERROR;
}
/**
 * Report whether every element of `a` sits at a properly aligned address.
 *
 * An array is aligned iff its byte offset and each of its strides are
 * multiples of the element type's required alignment.  Returns 1 when
 * aligned, 0 otherwise.
 */
int GpuArray_is_aligned(const GpuArray *a) {
  size_t align = gpuarray_get_type(a->typecode)->align;
  int ok = (a->offset % align) == 0;
  unsigned int d;

  for (d = 0; ok && d < a->nd; d++)
    ok = (a->strides[d] % align) == 0;
  return ok;
}
/**
 * Validate the arguments stored in the max-and-argmax context and reset
 * the parts of the context this pass owns.
 *
 * On any invalid argument, records and returns GA_INVALID_ERROR in
 * ctx->ret.  On success, caches the total (nds), reduced (ndr) and
 * destination (ndd) dimension counts and returns GA_NO_ERROR.
 */
static int maxandargmaxCheckargs(maxandargmax_ctx* ctx){
	int k;

	/* Reset every context field this stage is responsible for. */
	ctx->ret           = GA_NO_ERROR;
	ctx->axisList      = NULL;
	ctx->gpuCtx        = NULL;
	ctx->dstMaxType    = NULL;
	ctx->dstArgmaxType = NULL;
	ctx->ndh           = 0;
	ctx->sourceCode    = NULL;
	for(k=0;k<3;k++){
		ctx->hwAxisList[k] = 0;
		ctx->blockSize [k] = 1;
		ctx->gridSize  [k] = 1;
		ctx->chunkSize [k] = 1;
	}
	ctx->srcStepsGD       = NULL;
	ctx->srcSizeGD        = NULL;
	ctx->chunkSizeGD      = NULL;
	ctx->dstMaxStepsGD    = NULL;
	ctx->dstArgmaxStepsGD = NULL;

	/* Missing arguments, a 0-d source or a bad reduction count? */
	if(ctx->dstMax == NULL || ctx->dstArgmax == NULL || ctx->src == NULL ||
	   ctx->src->nd == 0   || ctx->reduxLen  == 0    ||
	   ctx->reduxLen > (int)ctx->src->nd){
		return ctx->ret=GA_INVALID_ERROR;
	}

	/* Out-of-range or duplicated reduction axis? */
	for(k=0;k<ctx->reduxLen;k++){
		int axis = ctx->reduxList[k];
		if(axis < 0 || axis >= (int)ctx->src->nd ||
		   axisInSet(axis, ctx->reduxList, k, 0)){
			return ctx->ret=GA_INVALID_ERROR;
		}
	}

	/* Resolve the CLUDA type names; bail out on unknown types. */
	ctx->dstMaxType    = gpuarray_get_type(ctx->src->typecode)->cluda_name;
	ctx->dstArgmaxType = gpuarray_get_type(GA_SSIZE)          ->cluda_name;
	if(ctx->dstMaxType == NULL || ctx->dstArgmaxType == NULL){
		return ctx->ret=GA_INVALID_ERROR;
	}

	/* The source array must belong to some GPU context. */
	ctx->gpuCtx = GpuArray_context(ctx->src);
	if(ctx->gpuCtx == NULL){
		return ctx->ret=GA_INVALID_ERROR;
	}

	/* Arguments are sane; cache the dimension counts. */
	ctx->nds = ctx->src->nd;
	ctx->ndr = ctx->reduxLen;
	ctx->ndd = ctx->nds - ctx->ndr;

	return ctx->ret;
}
size_t gpuarray_get_elsize(int typecode) { return gpuarray_get_type(typecode)->size; }
/**
 * Build and compile the "take1" GPU kernel used for fancy indexing along
 * axis 0: r[i0, i1] = v[ind[i0], i1...], with i1 flattened over the
 * trailing dimensions of `v`.
 *
 * k        kernel object to initialize (via GpuKernel_init).
 * ctx      GPU context to compile in.
 * err_str  receives the compiler error log on failure (may be NULL).
 * a        destination array (determines the output element type).
 * v        source array (determines element type, nd, and per-axis
 *          size/stride kernel parameters).
 * ind      index array (element type ends up as the index type).
 * addr32   nonzero -> use 32-bit (ga_uint/ga_int) in-kernel arithmetic,
 *          otherwise ga_size/ga_ssize.
 *
 * Returns GA_NO_ERROR on success, GA_MEMORY_ERROR on allocation failure,
 * or whatever GpuKernel_init reports.
 */
static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str,
                            GpuArray *a, const GpuArray *v,
                            const GpuArray *ind, int addr32) {
  strb sb = STRB_STATIC_INIT;
  int *atypes;
  char *sz, *ssz;
  unsigned int i, i2;
  unsigned int nargs, apos;
  int flags = GA_USE_CLUDA;
  int res;

  /* Fixed args: r, r_off, v, v_off, ind, i_off, n0, n1, err (9 total),
     plus one (stride, dim) pair per dimension of v. */
  nargs = 9 + 2 * v->nd;

  atypes = calloc(nargs, sizeof(int));
  if (atypes == NULL)
    return GA_MEMORY_ERROR;

  /* Pick the in-kernel size/offset types per the addressing mode. */
  if (addr32) {
    sz = "ga_uint";
    ssz = "ga_int";
  } else {
    sz = "ga_size";
    ssz = "ga_ssize";
  }

  /* Emit the kernel signature while recording each argument's gpuarray
     type in atypes[] (apos tracks the argument position). */
  apos = 0;
  strb_appendf(&sb, "KERNEL void take1(GLOBAL_MEM %s *r, ga_size r_off, "
               "GLOBAL_MEM const %s *v, ga_size v_off,",
               gpuarray_get_type(a->typecode)->cluda_name,
               gpuarray_get_type(v->typecode)->cluda_name);
  atypes[apos++] = GA_BUFFER;
  atypes[apos++] = GA_SIZE;
  atypes[apos++] = GA_BUFFER;
  atypes[apos++] = GA_SIZE;
  /* One signed stride and one size per dimension of v. */
  for (i = 0; i < v->nd; i++) {
    strb_appendf(&sb, " ga_ssize s%u, ga_size d%u,", i, i);
    atypes[apos++] = GA_SSIZE;
    atypes[apos++] = GA_SIZE;
  }
  strb_appendf(&sb, " GLOBAL_MEM const %s *ind, ga_size i_off, "
               "ga_size n0, ga_size n1, GLOBAL_MEM int* err) {\n",
               gpuarray_get_type(ind->typecode)->cluda_name);
  atypes[apos++] = GA_BUFFER;
  atypes[apos++] = GA_SIZE;
  atypes[apos++] = GA_SIZE;
  atypes[apos++] = GA_SIZE;
  atypes[apos++] = GA_BUFFER;
  assert(apos == nargs);

  /* Kernel body: a 2-D thread grid; axis 0 walks the indices, axis 1
     walks the flattened trailing elements. */
  strb_appendf(&sb, " const %s idx0 = LDIM_0 * GID_0 + LID_0;\n"
               " const %s numThreads0 = LDIM_0 * GDIM_0;\n"
               " const %s idx1 = LDIM_1 * GID_1 + LID_1;\n"
               " const %s numThreads1 = LDIM_1 * GDIM_1;\n"
               " %s i0, i1;\n", sz, sz, sz, sz, sz);
  strb_appends(&sb, " if (idx0 >= n0 || idx1 >= n1) return;\n");
  /* Apply the byte offsets of the output and index buffers up front. */
  strb_appendf(&sb, " r = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)r) + r_off);\n"
               " ind = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)ind) + i_off);\n",
               gpuarray_get_type(a->typecode)->cluda_name,
               gpuarray_get_type(ind->typecode)->cluda_name);
  /* Outer loop: fetch each index, wrap negatives, and flag any index
     that is still out of range through *err (the element is skipped). */
  strb_appendf(&sb, " for (i0 = idx0; i0 < n0; i0 += numThreads0) {\n"
               " %s ii0 = ind[i0];\n"
               " %s pos0 = v_off;\n"
               " if (ii0 < 0) ii0 += d0;\n"
               " if ((ii0 < 0) || (ii0 >= d0)) {\n"
               " *err = -1;\n"
               " continue;\n"
               " }\n"
               " pos0 += ii0 * (%s)s0;\n"
               " for (i1 = idx1; i1 < n1; i1 += numThreads1) {\n"
               " %s p = pos0;\n", ssz, sz, sz, sz);
  /* Inner loop: decompose the flat index i1 into per-axis coordinates
     (last axis varies fastest) and accumulate the strided byte offset. */
  if (v->nd > 1) {
    strb_appendf(&sb, " %s pos, ii = i1;\n", sz);
    for (i2 = v->nd; i2 > 1; i2--) {
      i = i2 - 1;
      if (i > 1)
        strb_appendf(&sb, " pos = ii %% (%s)d%u;\n"
                     " ii /= (%s)d%u;\n", sz, i, sz, i);
      else
        /* First remaining axis takes whatever quotient is left. */
        strb_appends(&sb, " pos = ii;\n");
      strb_appendf(&sb, " p += pos * (%s)s%u;\n", ssz, i);
    }
  }
  /* Gather: read the source element at byte offset p, store row-major. */
  strb_appendf(&sb, " r[i0*((%s)n1) + i1] = *((GLOBAL_MEM %s *)(((GLOBAL_MEM char *)v) + p));\n",
               sz, gpuarray_get_type(v->typecode)->cluda_name);
  strb_appends(&sb, " }\n"
               " }\n"
               "}\n");

  if (strb_error(&sb)) {
    res = GA_MEMORY_ERROR;
    goto bail;
  }

  /* GA_BYTE covers the char-based pointer arithmetic in the kernel. */
  flags |= gpuarray_type_flags(a->typecode, v->typecode, GA_BYTE, -1);
  res = GpuKernel_init(k, ctx, 1, (const char **)&sb.s, &sb.l, "take1",
                       nargs, atypes, flags, err_str);
 bail:
  free(atypes);
  strb_clear(&sb);
  return res;
}