int APPLY_SPECIFIC(tstgpueye)(PyArrayObject *n, PyArrayObject *m,
                              PyGpuArrayObject **z, PyGpuContextObject *ctx) {
  size_t dims[2] = {0, 0};
  size_t ls, gs;
  void *args[3];
  int err;

  dims[0] = ((DTYPE_INPUT_0 *)PyArray_DATA(n))[0];
  dims[1] = ((DTYPE_INPUT_1 *)PyArray_DATA(m))[0];

  Py_XDECREF(*z);
  *z = pygpu_zeros(2, dims, TYPECODE, GA_C_ORDER, ctx, Py_None);
  if (*z == NULL)
    return -1;

  args[0] = (*z)->ga.data;
  args[1] = &dims[0];
  args[2] = &dims[1];
  ls = 1;
  gs = 256;
  /* The k_eye name comes from the kernel declaration above. */
  err = GpuKernel_call(&k_eye, 1, &ls, &gs, 0, args);
  if (err != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError,
                 "gpuarray error: kEye: %s. n=%lu, m=%lu.",
                 GpuKernel_error(&k_eye, err),
                 (unsigned long)dims[0], (unsigned long)dims[1]);
    return -1;
  }
  return 0;
}
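/*
 * For reference, a minimal sketch of the kind of kernel the "k_eye" comment
 * above refers to.  This is illustrative, not the exact generated source:
 * the parameter list mirrors args[0..2] (output buffer, n, m), and with the
 * ls = 1, gs = 256 launch above each of the 256 single-thread blocks strides
 * over the diagonal.  DTYPE_OUTPUT_0 stands in for the output dtype macro.
 *
 *   KERNEL void k_eye(GLOBAL_MEM DTYPE_OUTPUT_0 *a, ga_size n, ga_size m) {
 *     ga_size nb = n < m ? n : m;
 *     for (ga_size i = GID_0 * LDIM_0 + LID_0; i < nb; i += GDIM_0 * LDIM_0) {
 *       a[i * m + i] = 1;  // write ones on the main diagonal
 *     }
 *   }
 */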
static int maxandargmaxInvoke(maxandargmax_ctx *ctx) {
  void *args[11];

  /**
   * Argument Marshalling. This is the grossest gross thing in here.
   */

  const int flags = GA_BUFFER_READ_ONLY | GA_BUFFER_INIT;
  ctx->srcStepsGD       = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t),
                                        ctx->src->strides,       flags, 0);
  ctx->srcSizeGD        = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t),
                                        ctx->src->dimensions,    flags, 0);
  ctx->chunkSizeGD      = gpudata_alloc(ctx->gpuCtx, ctx->ndh * sizeof(size_t),
                                        ctx->chunkSize,          flags, 0);
  ctx->dstMaxStepsGD    = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t),
                                        ctx->dstMax->strides,    flags, 0);
  ctx->dstArgmaxStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t),
                                        ctx->dstArgmax->strides, flags, 0);
  args[ 0] = (void *) ctx->src->data;
  args[ 1] = (void *)&ctx->src->offset;
  args[ 2] = (void *) ctx->srcStepsGD;
  args[ 3] = (void *) ctx->srcSizeGD;
  args[ 4] = (void *) ctx->chunkSizeGD;
  args[ 5] = (void *) ctx->dstMax->data;
  args[ 6] = (void *)&ctx->dstMax->offset;
  args[ 7] = (void *) ctx->dstMaxStepsGD;
  args[ 8] = (void *) ctx->dstArgmax->data;
  args[ 9] = (void *)&ctx->dstArgmax->offset;
  args[10] = (void *) ctx->dstArgmaxStepsGD;

  if (ctx->srcStepsGD    && ctx->srcSizeGD && ctx->chunkSizeGD &&
      ctx->dstMaxStepsGD && ctx->dstArgmaxStepsGD) {
    ctx->ret = GpuKernel_call(&ctx->kernel,
                              ctx->ndh > 0 ? ctx->ndh : 1,
                              ctx->gridSize,
                              ctx->blockSize,
                              0,
                              args);
  } else {
    ctx->ret = GA_MEMORY_ERROR;
  }

  gpudata_release(ctx->srcStepsGD);
  gpudata_release(ctx->srcSizeGD);
  gpudata_release(ctx->chunkSizeGD);
  gpudata_release(ctx->dstMaxStepsGD);
  gpudata_release(ctx->dstArgmaxStepsGD);

  return ctx->ret;
}
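/*
 * For orientation, a hedged sketch of how args[0..10] above would line up
 * with the generated kernel's parameter list (the real signature is built
 * elsewhere in this module; "T" stands in for the source element type):
 *
 *   KERNEL void maxandargmax(GLOBAL_MEM const T *src,        ga_size srcOff,
 *                            GLOBAL_MEM const ga_size *srcSteps,
 *                            GLOBAL_MEM const ga_size *srcSize,
 *                            GLOBAL_MEM const ga_size *chunkSize,
 *                            GLOBAL_MEM T *dstMax,           ga_size dstMaxOff,
 *                            GLOBAL_MEM const ga_size *dstMaxSteps,
 *                            GLOBAL_MEM ga_size *dstArgmax,  ga_size dstArgmaxOff,
 *                            GLOBAL_MEM const ga_size *dstArgmaxSteps);
 *
 * The five gpudata_alloc() calls exist only to copy the host-side stride,
 * size and chunk arrays into device buffers so they can be passed by pointer.
 */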
static int dgerBatch(cb_order order, size_t M, size_t N, double alpha,
                     gpudata **x, size_t *offX, size_t incX,
                     gpudata **y, size_t *offY, size_t incY,
                     gpudata **A, size_t *offA, size_t lda,
                     size_t batchCount, int flags) {
  cuda_context *ctx;
  size_t t, *tp, i;
  size_t ls[3] = {M, N, 1}, gs[3] = {1, 1, batchCount};
  void *args[10];
  gpudata **T;
  gpudata *Aa, *xa, *ya;
  int err;

  if (flags != 0) return GA_INVALID_ERROR;
  if (batchCount == 0) return GA_NO_ERROR;

  if (incX == 1) {
    if (ls[0] > 32) {
      gs[0] = (ls[0] + 31) / 32;
      ls[0] = 32;
    }
    if (ls[0] * ls[1] > 512) {
      gs[1] = (ls[1] + 15) / 16;
      ls[1] = 16;
    }
  } else {
    if (ls[1] > 32) {
      gs[1] = (ls[1] + 31) / 32;
      ls[1] = 32;
    }
    if (ls[0] * ls[1] > 512) {
      gs[0] = (ls[0] + 15) / 16;
      ls[0] = 16;
    }
  }
  if (gs[0] * gs[1] * gs[2] > 65535) {
    if (gs[0] * gs[1] > 65535)
      return GA_VALUE_ERROR;
    gs[2] = (65535 / (gs[0] * gs[1]));
  }

  if (order == cb_c) {
    t = M; M = N; N = t;
    tp = offX; offX = offY; offY = tp;
    t = incX; incX = incY; incY = t;
    T = x; x = y; y = T;
  }

  ASSERT_BUF(x[0]);
  ctx = x[0]->ctx;
  cuda_enter(ctx);

  {
    double **T_l = alloca(sizeof(double *) * batchCount * 3);
    const double **A_l = (const double **)T_l;
    const double **x_l = (const double **)T_l + batchCount;
    double **y_l = T_l + (batchCount * 2);

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(x[i]);
      ASSERT_BUF(y[i]);
      cuda_wait(A[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      cuda_wait(x[i], CUDA_WAIT_READ);
      cuda_wait(y[i], CUDA_WAIT_READ);
      A_l[i] = (double *)(A[i]->ptr + offA[i]);
      x_l[i] = (double *)(x[i]->ptr + offX[i]);
      y_l[i] = (double *)(y[i]->ptr + offY[i]);
    }

    Aa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, A_l,
                               GA_BUFFER_INIT, &err);
    if (Aa == NULL)
      return err;
    xa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, x_l,
                               GA_BUFFER_INIT, &err);
    if (xa == NULL) {
      cuda_ops.buffer_release(Aa);
      return err;
    }
    ya = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, y_l,
                               GA_BUFFER_INIT, &err);
    if (ya == NULL) {
      cuda_ops.buffer_release(Aa);
      cuda_ops.buffer_release(xa);
      return err;
    }
  }

  args[0] = xa;
  args[1] = &incX;
  args[2] = ya;
  args[3] = &incY;
  args[4] = &alpha;
  args[5] = Aa;
  args[6] = &lda;
  args[7] = &batchCount;
  args[8] = &M;
  args[9] = &N;

  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgerBH_gen_small,
                       3, ls, gs, 0, args);

  cuda_ops.buffer_release(Aa);
  cuda_ops.buffer_release(xa);
  cuda_ops.buffer_release(ya);

  if (err != GA_NO_ERROR) {
    cuda_exit(ctx);
    return err;
  }

  for (i = 0; i < batchCount; i++) {
    cuda_record(A[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    cuda_record(x[i], CUDA_WAIT_READ);
    cuda_record(y[i], CUDA_WAIT_READ);
  }

  cuda_exit(ctx);

  return GA_NO_ERROR;
}
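/*
 * A worked example of the launch-geometry arithmetic in dgerBatch above
 * (illustrative numbers): with incX == 1, M = 100, N = 100, batchCount = 4,
 * ls starts as {100, 100, 1} and gs as {1, 1, 4}.  Since ls[0] > 32,
 * gs[0] = (100 + 31) / 32 = 4 and ls[0] = 32; then ls[0] * ls[1] = 3200 > 512,
 * so gs[1] = (100 + 15) / 16 = 7 and ls[1] = 16.  The kernel is launched with
 * 32 * 16 = 512 threads per block over a 4 x 7 x 4 grid, which stays under
 * the 65535 grid limit checked above.
 */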
static int dgemvBatch(cb_order order, cb_transpose transA,
                      size_t M, size_t N, double alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **x, size_t *offX, size_t incX,
                      double beta, gpudata **y, size_t *offY, size_t incY,
                      size_t batchCount, int flags) {
  cuda_context *ctx;
  size_t t, i;
  size_t ls[2], gs[2];
  void *args[9];
  gpudata *Aa, *xa, *ya;
  int err;

  if (flags != 0) return GA_INVALID_ERROR;
  if (batchCount == 0) return GA_NO_ERROR;

  if (alpha != 1.0 || beta != 1.0) return GA_UNSUPPORTED_ERROR;
  if (M < 512) {
    ls[0] = 32;
    if (batchCount > 16)
      ls[1] = 16;
    else
      ls[1] = batchCount;
  } else {
    ls[0] = 512;
    ls[1] = 1;
  }
  gs[0] = (M + ls[0] - 1) / ls[0];
  gs[1] = (batchCount + ls[1] - 1) / ls[1];
  if (gs[0] * gs[1] > 65535) {
    gs[1] = (65535 / gs[0]);
  }

  if (order == cb_c) {
    t = N;
    N = M;
    M = t;
    if (transA == cb_no_trans) {
      transA = cb_trans;
    } else {
      transA = cb_no_trans;
    }
  }

  ASSERT_BUF(A[0]);
  ctx = A[0]->ctx;
  cuda_enter(ctx);

  {
    double **T_l = alloca(sizeof(double *) * batchCount * 3);
    const double **A_l = (const double **)T_l;
    const double **x_l = (const double **)T_l + batchCount;
    double **y_l = T_l + (batchCount * 2);

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(x[i]);
      ASSERT_BUF(y[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(x[i], CUDA_WAIT_READ);
      cuda_wait(y[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      A_l[i] = (double *)(A[i]->ptr + offA[i]);
      x_l[i] = (double *)(x[i]->ptr + offX[i]);
      y_l[i] = (double *)(y[i]->ptr + offY[i]);
    }

    Aa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, A_l,
                               GA_BUFFER_INIT, &err);
    if (Aa == NULL)
      return err;
    xa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, x_l,
                               GA_BUFFER_INIT, &err);
    if (xa == NULL) {
      cuda_ops.buffer_release(Aa);
      return err;
    }
    ya = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, y_l,
                               GA_BUFFER_INIT, &err);
    if (ya == NULL) {
      cuda_ops.buffer_release(Aa);
      cuda_ops.buffer_release(xa);
      return err;
    }
  }

  args[0] = Aa;
  args[1] = &lda;
  args[2] = xa;
  args[3] = &incX;
  args[4] = ya;
  args[5] = &incY;
  args[6] = &batchCount;
  args[7] = &M;
  args[8] = &N;

  if (transA == cb_no_trans) {
    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small,
                         2, ls, gs, 0, args);
  } else {
    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small,
                         2, ls, gs, 0, args);
  }

  cuda_ops.buffer_release(Aa);
  cuda_ops.buffer_release(xa);
  cuda_ops.buffer_release(ya);

  if (err != GA_NO_ERROR) {
    cuda_exit(ctx);
    return err;
  }

  for (i = 0; i < batchCount; i++) {
    cuda_record(A[i], CUDA_WAIT_READ);
    cuda_record(x[i], CUDA_WAIT_READ);
    cuda_record(y[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
  }

  cuda_exit(ctx);

  return GA_NO_ERROR;
}
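/*
 * Same idea for dgemvBatch above, with illustrative numbers: M = 100 (< 512)
 * gives ls[0] = 32, and batchCount = 20 (> 16) gives ls[1] = 16, so
 * gs[0] = (100 + 31) / 32 = 4 and gs[1] = (20 + 15) / 16 = 2.  The product
 * gs[0] * gs[1] = 8 is well under 65535, so gs[1] is not clamped.  Note that
 * only the alpha == 1.0, beta == 1.0 case is handled; anything else returns
 * GA_UNSUPPORTED_ERROR, presumably so the caller can fall back to another path.
 */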
int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
                   int check_error) {
  size_t n[2], ls[2] = {0, 0}, gs[2] = {0, 0};
  size_t pl;
  gpudata *errbuf;
#if DEBUG
  char *errstr = NULL;
#endif
  GpuKernel k;
  unsigned int j;
  unsigned int argp;
  int err, kerr = 0;
  int addr32 = 0;

  if (!GpuArray_ISWRITEABLE(a))
    return GA_INVALID_ERROR;

  if (!GpuArray_ISALIGNED(a) || !GpuArray_ISALIGNED(v) ||
      !GpuArray_ISALIGNED(i))
    return GA_UNALIGNED_ERROR;

  /* a and i have to be C contiguous */
  if (!GpuArray_IS_C_CONTIGUOUS(a) || !GpuArray_IS_C_CONTIGUOUS(i))
    return GA_INVALID_ERROR;

  /* Check that the dimensions match namely a[0] == i[0] and a[>0] == v[>0] */
  if (v->nd == 0 || a->nd == 0 || i->nd != 1 || a->nd != v->nd ||
      a->dimensions[0] != i->dimensions[0])
    return GA_INVALID_ERROR;

  n[0] = i->dimensions[0];
  n[1] = 1;

  for (j = 1; j < v->nd; j++) {
    if (a->dimensions[j] != v->dimensions[j])
      return GA_INVALID_ERROR;
    n[1] *= v->dimensions[j];
  }

  if (n[0] * n[1] < SADDR32_MAX) {
    addr32 = 1;
  }

  err = gpudata_property(v->data, GA_CTX_PROP_ERRBUF, &errbuf);
  if (err != GA_NO_ERROR)
    return err;

  err = gen_take1_kernel(&k, GpuArray_context(a),
#if DEBUG
                         &errstr,
#else
                         NULL,
#endif
                         a, v, i, addr32);
#if DEBUG
  if (errstr != NULL) {
    fprintf(stderr, "%s\n", errstr);
    free(errstr);
  }
#endif
  if (err != GA_NO_ERROR)
    return err;

  err = GpuKernel_sched(&k, n[0]*n[1], &gs[1], &ls[1]);
  if (err != GA_NO_ERROR)
    goto out;

  /* This may not be the best scheduling, but it's good enough */
  err = gpukernel_property(k.k, GA_KERNEL_PROP_PREFLSIZE, &pl);
  ls[0] = ls[1] / pl;
  ls[1] = pl;
  if (n[1] > n[0]) {
    pl = ls[0];
    ls[0] = ls[1];
    ls[1] = pl;
    gs[0] = 1;
  } else {
    gs[0] = gs[1];
    gs[1] = 1;
  }

  argp = 0;
  GpuKernel_setarg(&k, argp++, a->data);
  GpuKernel_setarg(&k, argp++, (void *)&a->offset);
  GpuKernel_setarg(&k, argp++, v->data);
  /* The cast is to avoid a warning about const */
  GpuKernel_setarg(&k, argp++, (void *)&v->offset);
  for (j = 0; j < v->nd; j++) {
    GpuKernel_setarg(&k, argp++, &v->strides[j]);
    GpuKernel_setarg(&k, argp++, &v->dimensions[j]);
  }
  GpuKernel_setarg(&k, argp++, i->data);
  GpuKernel_setarg(&k, argp++, (void *)&i->offset);
  GpuKernel_setarg(&k, argp++, &n[0]);
  GpuKernel_setarg(&k, argp++, &n[1]);
  GpuKernel_setarg(&k, argp++, errbuf);

  err = GpuKernel_call(&k, 2, gs, ls, 0, NULL);
  if (check_error && err == GA_NO_ERROR) {
    err = gpudata_read(&kerr, errbuf, 0, sizeof(int));
    if (err == GA_NO_ERROR && kerr != 0) {
      err = GA_VALUE_ERROR;
      kerr = 0;
      /* We suppose this will not fail */
      gpudata_write(errbuf, 0, &kerr, sizeof(int));
    }
  }
out:
  GpuKernel_clear(&k);
  return err;
}
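/*
 * Hedged usage sketch for GpuArray_take1 (names are illustrative and error
 * handling is elided): given a C-contiguous destination `out` of shape
 * (k, d1, ...), a source `v` of shape (n, d1, ...) and a C-contiguous index
 * vector `idx` of length k, the call gathers rows so that out[j] = v[idx[j]]:
 *
 *   int err = GpuArray_take1(&out, &v, &idx, 1);
 *   if (err == GA_VALUE_ERROR) {
 *     // with check_error != 0, an out-of-range index is reported this way
 *   }
 */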