static int maxandargmaxCompile (maxandargmax_ctx* ctx){ const int ARG_TYPECODES[] = { GA_BUFFER, /* src */ GA_SIZE, /* srcOff */ GA_BUFFER, /* srcSteps */ GA_BUFFER, /* srcSize */ GA_BUFFER, /* chnkSize */ GA_BUFFER, /* dstMax */ GA_SIZE, /* dstMaxOff */ GA_BUFFER, /* dstMaxSteps */ GA_BUFFER, /* dstArgmax */ GA_SIZE, /* dstArgmaxOff */ GA_BUFFER /* dstArgmaxSteps */ }; const unsigned int ARG_TYPECODES_LEN = sizeof(ARG_TYPECODES)/sizeof(*ARG_TYPECODES); const char* SRCS[1]; SRCS[0] = ctx->sourceCode; ctx->ret = GpuKernel_init(&ctx->kernel, ctx->gpuCtx, 1, SRCS, NULL, "maxandargmax", ARG_TYPECODES_LEN, ARG_TYPECODES, GA_USE_CLUDA, (char**)0); free(ctx->sourceCode); ctx->sourceCode = NULL; return ctx->ret; }
static int setup(void *c) { cuda_context *ctx = (cuda_context *)c; blas_handle *handle; const char *tmp[2]; cublasStatus_t err; int e; int types[10]; if (ctx->blas_handle != NULL) return GA_NO_ERROR; handle = calloc(1, sizeof(*handle)); if (handle == NULL) return GA_MEMORY_ERROR; cuda_enter(ctx); err = cublasCreate(&handle->h); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); free(handle); return GA_BLAS_ERROR; } err = cublasSetStream(handle->h, ctx->s); if (err != CUBLAS_STATUS_SUCCESS) { e = GA_BLAS_ERROR; goto e1; } cublasSetPointerMode(handle->h, CUBLAS_POINTER_MODE_HOST); cublasSetAtomicsMode(handle->h, CUBLAS_ATOMICS_ALLOWED); types[0] = GA_BUFFER; types[1] = GA_SIZE; types[2] = GA_BUFFER; types[3] = GA_SIZE; types[4] = GA_BUFFER; types[5] = GA_SIZE; types[6] = GA_SIZE; types[7] = GA_SIZE; types[8] = GA_SIZE; e = GpuKernel_init(&handle->sgemvBH_N_a1_b1_small, &cuda_ops, ctx, 1, &code_sgemvBH_N_a1_b1_small, NULL, "sgemv", 9, types, 0, NULL); if (e != GA_NO_ERROR) goto e1; e = GpuKernel_init(&handle->sgemvBH_T_a1_b1_small, &cuda_ops, ctx, 1, &code_sgemvBH_T_a1_b1_small, NULL, "sgemv", 9, types, 0, NULL); if (e != GA_NO_ERROR) goto e2; tmp[0] = atomicadd_double; tmp[1] = code_dgemvBH_N_a1_b1_small; e = GpuKernel_init(&handle->dgemvBH_N_a1_b1_small, &cuda_ops, ctx, 2, tmp, NULL, "dgemv", 9, types, GA_USE_DOUBLE, NULL); if (e != GA_NO_ERROR) goto e3; tmp[0] = atomicadd_double; tmp[1] = code_dgemvBH_T_a1_b1_small; e = GpuKernel_init(&handle->dgemvBH_T_a1_b1_small, &cuda_ops, ctx, 2, tmp, NULL, "dgemv", 9, types, GA_USE_DOUBLE, NULL); if (e != GA_NO_ERROR) goto e4; types[0] = GA_BUFFER; types[1] = GA_SIZE; types[2] = GA_BUFFER; types[3] = GA_SIZE; types[4] = GA_FLOAT; types[5] = GA_BUFFER; types[6] = GA_SIZE; types[7] = GA_SIZE; types[8] = GA_SIZE; types[9] = GA_SIZE; e = GpuKernel_init(&handle->sgerBH_gen_small, &cuda_ops, ctx, 1, &code_sgerBH_gen_small, NULL, "_sgerBH_gen_small", 10, types, 0, NULL); if (e != GA_NO_ERROR) goto e5; types[4] = GA_DOUBLE; tmp[0] = atomicadd_double; tmp[1] = code_dgerBH_gen_small; e = GpuKernel_init(&handle->dgerBH_gen_small, &cuda_ops, ctx, 2, tmp, NULL, "_dgerBH_gen_small", 10, types, GA_USE_DOUBLE, NULL); if (e != GA_NO_ERROR) goto e6; ctx->blas_handle = handle; cuda_exit(ctx); return GA_NO_ERROR; e6: GpuKernel_clear(&handle->sgerBH_gen_small); e5: GpuKernel_clear(&handle->dgemvBH_T_a1_b1_small); e4: GpuKernel_clear(&handle->dgemvBH_N_a1_b1_small); e3: GpuKernel_clear(&handle->sgemvBH_T_a1_b1_small); e2: GpuKernel_clear(&handle->sgemvBH_N_a1_b1_small); e1: cublasDestroy(handle->h); cuda_exit(ctx); free(handle); return e; }
static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str, GpuArray *a, const GpuArray *v, const GpuArray *ind, int addr32) { strb sb = STRB_STATIC_INIT; int *atypes; char *sz, *ssz; unsigned int i, i2; unsigned int nargs, apos; int flags = GA_USE_CLUDA; int res; nargs = 9 + 2 * v->nd; atypes = calloc(nargs, sizeof(int)); if (atypes == NULL) return GA_MEMORY_ERROR; if (addr32) { sz = "ga_uint"; ssz = "ga_int"; } else { sz = "ga_size"; ssz = "ga_ssize"; } apos = 0; strb_appendf(&sb, "KERNEL void take1(GLOBAL_MEM %s *r, ga_size r_off, " "GLOBAL_MEM const %s *v, ga_size v_off,", gpuarray_get_type(a->typecode)->cluda_name, gpuarray_get_type(v->typecode)->cluda_name); atypes[apos++] = GA_BUFFER; atypes[apos++] = GA_SIZE; atypes[apos++] = GA_BUFFER; atypes[apos++] = GA_SIZE; for (i = 0; i < v->nd; i++) { strb_appendf(&sb, " ga_ssize s%u, ga_size d%u,", i, i); atypes[apos++] = GA_SSIZE; atypes[apos++] = GA_SIZE; } strb_appendf(&sb, " GLOBAL_MEM const %s *ind, ga_size i_off, " "ga_size n0, ga_size n1, GLOBAL_MEM int* err) {\n", gpuarray_get_type(ind->typecode)->cluda_name); atypes[apos++] = GA_BUFFER; atypes[apos++] = GA_SIZE; atypes[apos++] = GA_SIZE; atypes[apos++] = GA_SIZE; atypes[apos++] = GA_BUFFER; assert(apos == nargs); strb_appendf(&sb, " const %s idx0 = LDIM_0 * GID_0 + LID_0;\n" " const %s numThreads0 = LDIM_0 * GDIM_0;\n" " const %s idx1 = LDIM_1 * GID_1 + LID_1;\n" " const %s numThreads1 = LDIM_1 * GDIM_1;\n" " %s i0, i1;\n", sz, sz, sz, sz, sz); strb_appends(&sb, " if (idx0 >= n0 || idx1 >= n1) return;\n"); strb_appendf(&sb, " r = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)r) + r_off);\n" " ind = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)ind) + i_off);\n", gpuarray_get_type(a->typecode)->cluda_name, gpuarray_get_type(ind->typecode)->cluda_name); strb_appendf(&sb, " for (i0 = idx0; i0 < n0; i0 += numThreads0) {\n" " %s ii0 = ind[i0];\n" " %s pos0 = v_off;\n" " if (ii0 < 0) ii0 += d0;\n" " if ((ii0 < 0) || (ii0 >= d0)) {\n" " *err = -1;\n" " continue;\n" " }\n" " pos0 += ii0 * (%s)s0;\n" " for (i1 = idx1; i1 < n1; i1 += numThreads1) {\n" " %s p = pos0;\n", ssz, sz, sz, sz); if (v->nd > 1) { strb_appendf(&sb, " %s pos, ii = i1;\n", sz); for (i2 = v->nd; i2 > 1; i2--) { i = i2 - 1; if (i > 1) strb_appendf(&sb, " pos = ii %% (%s)d%u;\n" " ii /= (%s)d%u;\n", sz, i, sz, i); else strb_appends(&sb, " pos = ii;\n"); strb_appendf(&sb, " p += pos * (%s)s%u;\n", ssz, i); } } strb_appendf(&sb, " r[i0*((%s)n1) + i1] = *((GLOBAL_MEM %s *)(((GLOBAL_MEM char *)v) + p));\n", sz, gpuarray_get_type(v->typecode)->cluda_name); strb_appends(&sb, " }\n" " }\n" "}\n"); if (strb_error(&sb)) { res = GA_MEMORY_ERROR; goto bail; } flags |= gpuarray_type_flags(a->typecode, v->typecode, GA_BYTE, -1); res = GpuKernel_init(k, ctx, 1, (const char **)&sb.s, &sb.l, "take1", nargs, atypes, flags, err_str); bail: free(atypes); strb_clear(&sb); return res; }