Example #1
0
static int   maxandargmaxCompile                (maxandargmax_ctx*  ctx){
	const int    ARG_TYPECODES[]   = {
		GA_BUFFER, /* src */
		GA_SIZE,   /* srcOff */
		GA_BUFFER, /* srcSteps */
		GA_BUFFER, /* srcSize */
		GA_BUFFER, /* chnkSize */
		GA_BUFFER, /* dstMax */
		GA_SIZE,   /* dstMaxOff */
		GA_BUFFER, /* dstMaxSteps */
		GA_BUFFER, /* dstArgmax */
		GA_SIZE,   /* dstArgmaxOff */
		GA_BUFFER  /* dstArgmaxSteps */
	};
	const unsigned int ARG_TYPECODES_LEN = sizeof(ARG_TYPECODES)/sizeof(*ARG_TYPECODES);
	const char*  SRCS[1];

	SRCS[0] = ctx->sourceCode;

	ctx->ret = GpuKernel_init(&ctx->kernel,
	                          ctx->gpuCtx,
	                          1,
	                          SRCS,
	                          NULL,
	                          "maxandargmax",
	                          ARG_TYPECODES_LEN,
	                          ARG_TYPECODES,
	                          GA_USE_CLUDA,
	                          (char**)0);
	free(ctx->sourceCode);
	ctx->sourceCode = NULL;

	return ctx->ret;
}
static int setup(void *c) {
  cuda_context *ctx = (cuda_context *)c;
  blas_handle *handle;
  const char *tmp[2];
  cublasStatus_t err;
  int e;
  int types[10];

  if (ctx->blas_handle != NULL)
    return GA_NO_ERROR;

  handle = calloc(1, sizeof(*handle));
  if (handle == NULL)
    return GA_MEMORY_ERROR;

  cuda_enter(ctx);
  err = cublasCreate(&handle->h);
  if (err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    free(handle);
    return GA_BLAS_ERROR;
  }

  err = cublasSetStream(handle->h, ctx->s);
  if (err != CUBLAS_STATUS_SUCCESS) {
    e = GA_BLAS_ERROR;
    goto e1;
  }

  cublasSetPointerMode(handle->h, CUBLAS_POINTER_MODE_HOST);
  cublasSetAtomicsMode(handle->h, CUBLAS_ATOMICS_ALLOWED);

  types[0] = GA_BUFFER;
  types[1] = GA_SIZE;
  types[2] = GA_BUFFER;
  types[3] = GA_SIZE;
  types[4] = GA_BUFFER;
  types[5] = GA_SIZE;
  types[6] = GA_SIZE;
  types[7] = GA_SIZE;
  types[8] = GA_SIZE;
  e = GpuKernel_init(&handle->sgemvBH_N_a1_b1_small, &cuda_ops, ctx, 1, &code_sgemvBH_N_a1_b1_small, NULL, "sgemv", 9, types, 0, NULL);
  if (e != GA_NO_ERROR) goto e1;
  e = GpuKernel_init(&handle->sgemvBH_T_a1_b1_small, &cuda_ops, ctx, 1, &code_sgemvBH_T_a1_b1_small, NULL, "sgemv", 9, types, 0, NULL);
  if (e != GA_NO_ERROR) goto e2;
  tmp[0] = atomicadd_double;
  tmp[1] = code_dgemvBH_N_a1_b1_small;
  e = GpuKernel_init(&handle->dgemvBH_N_a1_b1_small, &cuda_ops, ctx, 2, tmp, NULL, "dgemv", 9, types, GA_USE_DOUBLE, NULL);
  if (e != GA_NO_ERROR) goto e3;
  tmp[0] = atomicadd_double;
  tmp[1] = code_dgemvBH_T_a1_b1_small;
  e = GpuKernel_init(&handle->dgemvBH_T_a1_b1_small, &cuda_ops, ctx, 2, tmp, NULL, "dgemv", 9, types, GA_USE_DOUBLE, NULL);
  if (e != GA_NO_ERROR) goto e4;

  types[0] = GA_BUFFER;
  types[1] = GA_SIZE;
  types[2] = GA_BUFFER;
  types[3] = GA_SIZE;
  types[4] = GA_FLOAT;
  types[5] = GA_BUFFER;
  types[6] = GA_SIZE;
  types[7] = GA_SIZE;
  types[8] = GA_SIZE;
  types[9] = GA_SIZE;
  e = GpuKernel_init(&handle->sgerBH_gen_small, &cuda_ops, ctx, 1, &code_sgerBH_gen_small, NULL, "_sgerBH_gen_small", 10, types, 0, NULL);
  if (e != GA_NO_ERROR) goto e5;
  types[4] = GA_DOUBLE;
  tmp[0] = atomicadd_double;
  tmp[1] = code_dgerBH_gen_small;
  e = GpuKernel_init(&handle->dgerBH_gen_small, &cuda_ops, ctx, 2, tmp, NULL, "_dgerBH_gen_small", 10, types, GA_USE_DOUBLE, NULL);
  if (e != GA_NO_ERROR) goto e6;

  ctx->blas_handle = handle;

  cuda_exit(ctx);

  return GA_NO_ERROR;

 e6:
  GpuKernel_clear(&handle->sgerBH_gen_small);
 e5:
  GpuKernel_clear(&handle->dgemvBH_T_a1_b1_small);
 e4:
  GpuKernel_clear(&handle->dgemvBH_N_a1_b1_small);
 e3:
  GpuKernel_clear(&handle->sgemvBH_T_a1_b1_small);
 e2:
  GpuKernel_clear(&handle->sgemvBH_N_a1_b1_small);
 e1:
  cublasDestroy(handle->h);
  cuda_exit(ctx);
  free(handle);
  return e;
}
Example #3
0
static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str,
                            GpuArray *a, const GpuArray *v,
                            const GpuArray *ind, int addr32) {
  strb sb = STRB_STATIC_INIT;
  int *atypes;
  char *sz, *ssz;
  unsigned int i, i2;
  unsigned int nargs, apos;
  int flags = GA_USE_CLUDA;
  int res;

  nargs = 9 + 2 * v->nd;

  atypes = calloc(nargs, sizeof(int));
  if (atypes == NULL)
    return GA_MEMORY_ERROR;

  if (addr32) {
    sz = "ga_uint";
    ssz = "ga_int";
  } else {
    sz = "ga_size";
    ssz = "ga_ssize";
  }

  apos = 0;
  strb_appendf(&sb, "KERNEL void take1(GLOBAL_MEM %s *r, ga_size r_off, "
               "GLOBAL_MEM const %s *v, ga_size v_off,",
               gpuarray_get_type(a->typecode)->cluda_name,
               gpuarray_get_type(v->typecode)->cluda_name);
  atypes[apos++] = GA_BUFFER;
  atypes[apos++] = GA_SIZE;
  atypes[apos++] = GA_BUFFER;
  atypes[apos++] = GA_SIZE;
  for (i = 0; i < v->nd; i++) {
    strb_appendf(&sb, " ga_ssize s%u, ga_size d%u,", i, i);
    atypes[apos++] = GA_SSIZE;
    atypes[apos++] = GA_SIZE;
  }
  strb_appendf(&sb, " GLOBAL_MEM const %s *ind, ga_size i_off, "
               "ga_size n0, ga_size n1, GLOBAL_MEM int* err) {\n",
               gpuarray_get_type(ind->typecode)->cluda_name);
  atypes[apos++] = GA_BUFFER;
  atypes[apos++] = GA_SIZE;
  atypes[apos++] = GA_SIZE;
  atypes[apos++] = GA_SIZE;
  atypes[apos++] = GA_BUFFER;
  assert(apos == nargs);
  strb_appendf(&sb, "  const %s idx0 = LDIM_0 * GID_0 + LID_0;\n"
               "  const %s numThreads0 = LDIM_0 * GDIM_0;\n"
               "  const %s idx1 = LDIM_1 * GID_1 + LID_1;\n"
               "  const %s numThreads1 = LDIM_1 * GDIM_1;\n"
               "  %s i0, i1;\n", sz, sz, sz, sz, sz);
  strb_appends(&sb, "  if (idx0 >= n0 || idx1 >= n1) return;\n");
  strb_appendf(&sb, "  r = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)r) + r_off);\n"
               "  ind = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)ind) + i_off);\n",
               gpuarray_get_type(a->typecode)->cluda_name,
               gpuarray_get_type(ind->typecode)->cluda_name);
  strb_appendf(&sb, "  for (i0 = idx0; i0 < n0; i0 += numThreads0) {\n"
               "    %s ii0 = ind[i0];\n"
               "    %s pos0 = v_off;\n"
               "    if (ii0 < 0) ii0 += d0;\n"
               "    if ((ii0 < 0) || (ii0 >= d0)) {\n"
               "      *err = -1;\n"
               "      continue;\n"
               "    }\n"
               "    pos0 += ii0 * (%s)s0;\n"
               "    for (i1 = idx1; i1 < n1; i1 += numThreads1) {\n"
               "      %s p = pos0;\n", ssz, sz, sz, sz);
  if (v->nd > 1) {
    strb_appendf(&sb, "      %s pos, ii = i1;\n", sz);
    for (i2 = v->nd; i2 > 1; i2--) {
      i = i2 - 1;
      if (i > 1)
        strb_appendf(&sb, "      pos = ii %% (%s)d%u;\n"
                     "      ii /= (%s)d%u;\n", sz, i, sz, i);
      else
        strb_appends(&sb, "      pos = ii;\n");
      strb_appendf(&sb, "      p += pos * (%s)s%u;\n", ssz, i);
    }
  }
  strb_appendf(&sb, "      r[i0*((%s)n1) + i1] = *((GLOBAL_MEM %s *)(((GLOBAL_MEM char *)v) + p));\n",
               sz, gpuarray_get_type(v->typecode)->cluda_name);
  strb_appends(&sb, "    }\n"
               "  }\n"
               "}\n");
  if (strb_error(&sb)) {
    res = GA_MEMORY_ERROR;
    goto bail;
  }
  flags |= gpuarray_type_flags(a->typecode, v->typecode, GA_BYTE, -1);
  res = GpuKernel_init(k, ctx, 1, (const char **)&sb.s, &sb.l, "take1",
                       nargs, atypes, flags, err_str);
bail:
  free(atypes);
  strb_clear(&sb);
  return res;
}