示例#1
0
static gpukernel *cuda_newkernel(void *c, unsigned int count,
                                 const char **strings, const size_t *lengths,
                                 const char *fname, unsigned int argcount,
                                 const int *types, int flags, int *ret,
                                 char **err_str) {
    cuda_context *ctx = (cuda_context *)c;
    strb sb = STRB_STATIC_INIT;
    char *bin, *log = NULL;
    srckey k, *ak;
    binval *av;
    gpukernel *res;
    size_t bin_len = 0, log_len = 0;
    CUdevice dev;
    unsigned int i;
    int ptx_mode = 0;
    int binary_mode = 0;
    int major, minor;

    if (count == 0) FAIL(NULL, GA_VALUE_ERROR);

    if (flags & GA_USE_OPENCL)
      FAIL(NULL, GA_DEVSUP_ERROR);

    if (flags & GA_USE_BINARY) {
      // GA_USE_BINARY is exclusive
      if (flags & ~GA_USE_BINARY)
        FAIL(NULL, GA_INVALID_ERROR);
      // We need the length for binary data and there is only one blob.
      if (count != 1 || lengths == NULL || lengths[0] == 0)
        FAIL(NULL, GA_VALUE_ERROR);
    }

    cuda_enter(ctx);

    ctx->err = cuCtxGetDevice(&dev);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }
    ctx->err = cuDeviceComputeCapability(&major, &minor, dev);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }

    // GA_USE_CLUDA is done later
    // GA_USE_SMALL will always work
    if (flags & GA_USE_DOUBLE) {
      if (major < 1 || (major == 1 && minor < 3)) {
        cuda_exit(ctx);
        FAIL(NULL, GA_DEVSUP_ERROR);
      }
    }
    if (flags & GA_USE_COMPLEX) {
      // just for now since it is most likely broken
      cuda_exit(ctx);
      FAIL(NULL, GA_DEVSUP_ERROR);
    }
    // GA_USE_HALF should always work

    if (flags & GA_USE_PTX) {
      ptx_mode = 1;
    } else if (flags & GA_USE_BINARY) {
      binary_mode = 1;
    }

    if (binary_mode) {
      bin = memdup(strings[0], lengths[0]);
      bin_len = lengths[0];
      if (bin == NULL) {
        cuda_exit(ctx);
        FAIL(NULL, GA_MEMORY_ERROR);
      }
    } else {
      if (flags & GA_USE_CLUDA) {
        strb_appends(&sb, CUDA_PREAMBLE);
      }

      if (lengths == NULL) {
        for (i = 0; i < count; i++)
        strb_appends(&sb, strings[i]);
      } else {
        for (i = 0; i < count; i++) {
          if (lengths[i] == 0)
            strb_appends(&sb, strings[i]);
          else
            strb_appendn(&sb, strings[i], lengths[i]);
        }
      }

      strb_append0(&sb);

      if (strb_error(&sb)) {
        strb_clear(&sb);
        cuda_exit(ctx);
        return NULL;
      }

      if (ptx_mode) {
        bin = sb.s;
        bin_len = sb.l;
      } else {
        bin = NULL;
        if (compile_cache != NULL) {
          k.src = sb.s;
          k.len = sb.l;
          memcpy(k.arch, ctx->bin_id, BIN_ID_LEN);
          av = cache_get(compile_cache, &k);
          if (av != NULL) {
            bin = memdup(av->bin, av->len);
            bin_len = av->len;
          }
        }
        if (bin == NULL) {
          bin = call_compiler(sb.s, sb.l, ctx->bin_id, &bin_len,
                              &log, &log_len, ret);
        }
        if (bin == NULL) {
          if (err_str != NULL) {
            strb debug_msg = STRB_STATIC_INIT;

            // We're substituting debug_msg for a string with this first line:
            strb_appends(&debug_msg, "CUDA kernel build failure ::\n");

            /* Delete the final NUL */
            sb.l--;
            gpukernel_source_with_line_numbers(1, (const char **)&sb.s,
                                               &sb.l, &debug_msg);

            if (log != NULL) {
              strb_appends(&debug_msg, "\nCompiler log:\n");
              strb_appendn(&debug_msg, log, log_len);
              free(log);
            }
            *err_str = strb_cstr(&debug_msg);
            // *err_str will be free()d by the caller (see docs in kernel.h)
          }
          strb_clear(&sb);
          cuda_exit(ctx);
          return NULL;
        }
        if (compile_cache == NULL)
          compile_cache = cache_twoq(16, 16, 16, 8, src_eq, src_hash, src_free,
                                     bin_free);

        if (compile_cache != NULL) {
          ak = malloc(sizeof(*ak));
          av = malloc(sizeof(*av));
          if (ak == NULL || av == NULL) {
            free(ak);
            free(av);
            goto done;
          }
          ak->src = memdup(sb.s, sb.l);
          if (ak->src == NULL) {
            free(ak);
            free(av);
            goto done;
          }
          ak->len = sb.l;
          memmove(ak->arch, ctx->bin_id, BIN_ID_LEN);
          av->len = bin_len;
          av->bin = memdup(bin, bin_len);
          if (av->bin == NULL) {
            src_free(ak);
            free(av);
            goto done;
          }
          cache_add(compile_cache, ak, av);
        }
      done:
        strb_clear(&sb);
      }
    }

    res = calloc(1, sizeof(*res));
    if (res == NULL) {
      free(bin);
      cuda_exit(ctx);
      FAIL(NULL, GA_SYS_ERROR);
    }

    res->bin_sz = bin_len;
    res->bin = bin;

    res->refcnt = 1;
    res->argcount = argcount;
    res->types = calloc(argcount, sizeof(int));
    if (res->types == NULL) {
      _cuda_freekernel(res);
      cuda_exit(ctx);
      FAIL(NULL, GA_MEMORY_ERROR);
    }
    memcpy(res->types, types, argcount*sizeof(int));
    res->args = calloc(argcount, sizeof(void *));
    if (res->args == NULL) {
      _cuda_freekernel(res);
      cuda_exit(ctx);
      FAIL(NULL, GA_MEMORY_ERROR);
    }

    ctx->err = cuModuleLoadData(&res->m, bin);

    if (ctx->err != CUDA_SUCCESS) {
      _cuda_freekernel(res);
      cuda_exit(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }

    ctx->err = cuModuleGetFunction(&res->k, res->m, fname);
    if (ctx->err != CUDA_SUCCESS) {
      _cuda_freekernel(res);
      cuda_exit(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }

    res->ctx = ctx;
    ctx->refcnt++;
    cuda_exit(ctx);
    TAG_KER(res);
    return res;
}
示例#2
0
static inline int gen_extcopy_kernel(const extcopy_args *a,
				     cuda_context *ctx, gpukernel **v,
				     size_t nEls) {
  strb sb = STRB_STATIC_INIT;
  int res = GA_SYS_ERROR;
  int flags = GA_USE_PTX;
  unsigned int bits = sizeof(void *)*8;
  int types[2];
  const char *in_t, *in_ld_t;
  const char *out_t, *out_ld_t;
  const char *rmod;

  in_t = map_t(a->itype);
  out_t = map_t(a->otype);
  /* Since float16 ('f16') is not a fully-supported type we need to use
     it as b16 (basically uint16) for read and write operations. */
  if (a->itype == GA_HALF)
    in_ld_t = "b16";
  else
    in_ld_t = in_t;
  if (a->otype == GA_HALF)
    out_ld_t = "b16";
  else
    out_ld_t = out_t;
  rmod = get_rmod(a->itype, a->otype);
  if (in_t == NULL || out_t == NULL) return GA_DEVSUP_ERROR;

  strb_appendf(&sb, ELEM_HEADER_PTX, "4.1", ctx->bin_id,
               bits, bits, bits, bits, in_t, out_t, bits,
               bits, bits, bits, bits, nEls, bits, bits);

  cuda_perdim_ptx(&sb, a->ind, a->idims, a->istr, "a_p", bits);
  cuda_perdim_ptx(&sb, a->ond, a->odims, a->ostr, "b_p", bits);

  strb_appendf(&sb, "ld.param.u%u rp1, [a_data];\n"
	       "cvt.s%u.s%u rp2, a_p;\n"
	       "add.s%u rp1, rp1, rp2;\n"
	       "ld.global.%s tmpa, [rp1+%" SPREFIX "u];\n"
	       "cvt%s.%s.%s tmpb, tmpa;\n"
	       "ld.param.u%u rp1, [b_data];\n"
	       "cvt.s%u.s%u rp2, b_p;\n"
	       "add.s%u rp1, rp1, rp2;\n"
	       "st.global.%s [rp1+%" SPREFIX "u], tmpb;\n", bits,
	       bits, bits,
	       bits,
	       in_ld_t, a->ioff,
	       rmod, out_t, in_t,
	       bits,
	       bits, bits,
	       bits,
	       out_ld_t, a->ooff);

  strb_appendf(&sb, ELEM_FOOTER_PTX, bits, bits, nEls);

  if (strb_error(&sb))
    goto fail;

  if (a->itype == GA_DOUBLE || a->otype == GA_DOUBLE ||
      a->itype == GA_CDOUBLE || a->otype == GA_CDOUBLE) {
    flags |= GA_USE_DOUBLE;
  }

  if (a->otype == GA_HALF || a->itype == GA_HALF) {
    flags |= GA_USE_HALF;
  }

  if (gpuarray_get_elsize(a->otype) < 4 || gpuarray_get_elsize(a->itype) < 4) {
    /* Should check for non-mod4 strides too */
    flags |= GA_USE_SMALL;
  }

  if (a->otype == GA_CFLOAT || a->itype == GA_CFLOAT ||
      a->otype == GA_CDOUBLE || a->itype == GA_CDOUBLE) {
    flags |= GA_USE_COMPLEX;
  }

  types[0] = types[1] = GA_BUFFER;
  res = GA_NO_ERROR;
  *v = cuda_newkernel(ctx, 1, (const char **)&sb.s, &sb.l, "extcpy",
                      2, types, flags, &res, NULL);
 fail:
  strb_clear(&sb);
  return res;
}
示例#3
0
static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str,
                            GpuArray *a, const GpuArray *v,
                            const GpuArray *ind, int addr32) {
  strb sb = STRB_STATIC_INIT;
  int *atypes;
  char *sz, *ssz;
  unsigned int i, i2;
  unsigned int nargs, apos;
  int flags = GA_USE_CLUDA;
  int res;

  nargs = 9 + 2 * v->nd;

  atypes = calloc(nargs, sizeof(int));
  if (atypes == NULL)
    return GA_MEMORY_ERROR;

  if (addr32) {
    sz = "ga_uint";
    ssz = "ga_int";
  } else {
    sz = "ga_size";
    ssz = "ga_ssize";
  }

  apos = 0;
  strb_appendf(&sb, "KERNEL void take1(GLOBAL_MEM %s *r, ga_size r_off, "
               "GLOBAL_MEM const %s *v, ga_size v_off,",
               gpuarray_get_type(a->typecode)->cluda_name,
               gpuarray_get_type(v->typecode)->cluda_name);
  atypes[apos++] = GA_BUFFER;
  atypes[apos++] = GA_SIZE;
  atypes[apos++] = GA_BUFFER;
  atypes[apos++] = GA_SIZE;
  for (i = 0; i < v->nd; i++) {
    strb_appendf(&sb, " ga_ssize s%u, ga_size d%u,", i, i);
    atypes[apos++] = GA_SSIZE;
    atypes[apos++] = GA_SIZE;
  }
  strb_appendf(&sb, " GLOBAL_MEM const %s *ind, ga_size i_off, "
               "ga_size n0, ga_size n1, GLOBAL_MEM int* err) {\n",
               gpuarray_get_type(ind->typecode)->cluda_name);
  atypes[apos++] = GA_BUFFER;
  atypes[apos++] = GA_SIZE;
  atypes[apos++] = GA_SIZE;
  atypes[apos++] = GA_SIZE;
  atypes[apos++] = GA_BUFFER;
  assert(apos == nargs);
  strb_appendf(&sb, "  const %s idx0 = LDIM_0 * GID_0 + LID_0;\n"
               "  const %s numThreads0 = LDIM_0 * GDIM_0;\n"
               "  const %s idx1 = LDIM_1 * GID_1 + LID_1;\n"
               "  const %s numThreads1 = LDIM_1 * GDIM_1;\n"
               "  %s i0, i1;\n", sz, sz, sz, sz, sz);
  strb_appends(&sb, "  if (idx0 >= n0 || idx1 >= n1) return;\n");
  strb_appendf(&sb, "  r = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)r) + r_off);\n"
               "  ind = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)ind) + i_off);\n",
               gpuarray_get_type(a->typecode)->cluda_name,
               gpuarray_get_type(ind->typecode)->cluda_name);
  strb_appendf(&sb, "  for (i0 = idx0; i0 < n0; i0 += numThreads0) {\n"
               "    %s ii0 = ind[i0];\n"
               "    %s pos0 = v_off;\n"
               "    if (ii0 < 0) ii0 += d0;\n"
               "    if ((ii0 < 0) || (ii0 >= d0)) {\n"
               "      *err = -1;\n"
               "      continue;\n"
               "    }\n"
               "    pos0 += ii0 * (%s)s0;\n"
               "    for (i1 = idx1; i1 < n1; i1 += numThreads1) {\n"
               "      %s p = pos0;\n", ssz, sz, sz, sz);
  if (v->nd > 1) {
    strb_appendf(&sb, "      %s pos, ii = i1;\n", sz);
    for (i2 = v->nd; i2 > 1; i2--) {
      i = i2 - 1;
      if (i > 1)
        strb_appendf(&sb, "      pos = ii %% (%s)d%u;\n"
                     "      ii /= (%s)d%u;\n", sz, i, sz, i);
      else
        strb_appends(&sb, "      pos = ii;\n");
      strb_appendf(&sb, "      p += pos * (%s)s%u;\n", ssz, i);
    }
  }
  strb_appendf(&sb, "      r[i0*((%s)n1) + i1] = *((GLOBAL_MEM %s *)(((GLOBAL_MEM char *)v) + p));\n",
               sz, gpuarray_get_type(v->typecode)->cluda_name);
  strb_appends(&sb, "    }\n"
               "  }\n"
               "}\n");
  if (strb_error(&sb)) {
    res = GA_MEMORY_ERROR;
    goto bail;
  }
  flags |= gpuarray_type_flags(a->typecode, v->typecode, GA_BYTE, -1);
  res = GpuKernel_init(k, ctx, 1, (const char **)&sb.s, &sb.l, "take1",
                       nargs, atypes, flags, err_str);
bail:
  free(atypes);
  strb_clear(&sb);
  return res;
}