Ejemplo n.º 1
0
/*
 * Copy `src` into `dst` by running an elementwise "dst = src" kernel.
 *
 * The kernel is looked up in the context's extcopy_cache, keyed on the
 * (source typecode, destination typecode) pair.  On a miss a new kernel is
 * built and added to the cache (the cache itself is created on first use).
 *
 * Returns:
 *   GA_INVALID_ERROR  if src and dst do not share the same context,
 *   GA_MISC_ERROR     if the kernel, the cache or the cache entry cannot be
 *                     created,
 *   GA_MEMORY_ERROR   if the cache key cannot be duplicated,
 *   otherwise the result of GpuElemwise_call().
 */
static int ga_extcopy(GpuArray *dst, const GpuArray *src) {
  struct extcopy_args a, *aa;
  gpucontext *ctx = gpudata_context(dst->data);
  GpuElemwise *k = NULL;
  void *args[2];

  if (ctx != gpudata_context(src->data))
    return GA_INVALID_ERROR;

  /* Cache key: the pair of element types involved in the copy. */
  a.itype = src->typecode;
  a.otype = dst->typecode;

  if (ctx->extcopy_cache != NULL)
    k = cache_get(ctx->extcopy_cache, &a);
  if (k == NULL) {
    /* Cache miss (or no cache yet): build the copy kernel. */
    gpuelemwise_arg gargs[2];
    gargs[0].name = "src";
    gargs[0].typecode = src->typecode;
    gargs[0].flags = GE_READ;
    gargs[1].name = "dst";
    gargs[1].typecode = dst->typecode;
    gargs[1].flags = GE_WRITE;
    k = GpuElemwise_new(ctx, "", "dst = src", 2, gargs, 0, 0);
    if (k == NULL)
      return GA_MISC_ERROR;
    /* The cache needs its own heap copy of the key. */
    aa = memdup(&a, sizeof(a));
    if (aa == NULL) {
      GpuElemwise_free(k);
      return GA_MEMORY_ERROR;
    }
    if (ctx->extcopy_cache == NULL)
      ctx->extcopy_cache = cache_twoq(4, 8, 8, 2, extcopy_eq, extcopy_hash,
                                      extcopy_free,
                                      (cache_freev_fn)GpuElemwise_free,
                                      ctx->err);
    if (ctx->extcopy_cache == NULL) {
      /*
       * There is no cache to take ownership of the key and the kernel, so
       * release both here instead of leaking them.
       */
      extcopy_free(aa);
      GpuElemwise_free(k);
      return GA_MISC_ERROR;
    }
    /*
     * NOTE(review): whether cache_add() releases the key/value itself when
     * it fails is not visible from here, so this path is left untouched to
     * avoid a possible double free -- confirm against the cache
     * implementation.
     */
    if (cache_add(ctx->extcopy_cache, aa, k) != 0)
      return GA_MISC_ERROR;
  }
  args[0] = (void *)src;
  args[1] = (void *)dst;
  return GpuElemwise_call(k, args, GE_BROADCAST);
}
Ejemplo n.º 2
0
/*
 * Build a gpukernel for the CUDA context `c`.
 *
 * Parameters:
 *   c         context, used as a cuda_context.
 *   count     number of entries in `strings` (must be non-zero).
 *   strings   source fragments, or a single binary blob with GA_USE_BINARY.
 *   lengths   per-fragment lengths; may be NULL, and a 0 entry means the
 *             fragment is NUL-terminated.  Required (and non-zero) with
 *             GA_USE_BINARY.
 *   fname     name of the function to fetch from the loaded module.
 *   argcount  number of kernel arguments.
 *   types     `argcount` type codes, copied into the kernel object.
 *   flags     GA_USE_* flags:
 *               GA_USE_OPENCL  -> rejected (GA_DEVSUP_ERROR)
 *               GA_USE_BINARY  -> load the blob as-is; must be the only flag
 *               GA_USE_PTX     -> load the concatenated source without
 *                                 compiling it
 *               GA_USE_CLUDA   -> prepend CUDA_PREAMBLE to the source
 *               GA_USE_DOUBLE  -> requires compute capability >= 1.3
 *               GA_USE_COMPLEX -> rejected (GA_DEVSUP_ERROR)
 *   ret       error code output, written by FAIL() and call_compiler()
 *             (presumably optional -- FAIL's definition is not shown here).
 *   err_str   if non-NULL, receives a heap-allocated build report (numbered
 *             source plus compiler log) when compilation fails.
 *
 * Returns the new kernel (refcnt 1, holding a reference on the context) or
 * NULL on failure.
 */
static gpukernel *cuda_newkernel(void *c, unsigned int count,
                                 const char **strings, const size_t *lengths,
                                 const char *fname, unsigned int argcount,
                                 const int *types, int flags, int *ret,
                                 char **err_str) {
    cuda_context *ctx = (cuda_context *)c;
    strb sb = STRB_STATIC_INIT;
    char *bin, *log = NULL;
    srckey k, *ak;
    binval *av;
    gpukernel *res;
    size_t bin_len = 0, log_len = 0;
    CUdevice dev;
    unsigned int i;
    int ptx_mode = 0;
    int binary_mode = 0;
    int major, minor;

    if (count == 0) FAIL(NULL, GA_VALUE_ERROR);

    if (flags & GA_USE_OPENCL)
      FAIL(NULL, GA_DEVSUP_ERROR);

    if (flags & GA_USE_BINARY) {
      // GA_USE_BINARY is exclusive
      if (flags & ~GA_USE_BINARY)
        FAIL(NULL, GA_INVALID_ERROR);
      // We need the length for binary data and there is only one blob.
      if (count != 1 || lengths == NULL || lengths[0] == 0)
        FAIL(NULL, GA_VALUE_ERROR);
    }

    // Every return after this point must be paired with cuda_exit(ctx).
    cuda_enter(ctx);

    ctx->err = cuCtxGetDevice(&dev);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }
    ctx->err = cuDeviceComputeCapability(&major, &minor, dev);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }

    // GA_USE_CLUDA is done later
    // GA_USE_SMALL will always work
    if (flags & GA_USE_DOUBLE) {
      // Reject devices below compute capability 1.3.
      if (major < 1 || (major == 1 && minor < 3)) {
        cuda_exit(ctx);
        FAIL(NULL, GA_DEVSUP_ERROR);
      }
    }
    if (flags & GA_USE_COMPLEX) {
      // just for now since it is most likely broken
      cuda_exit(ctx);
      FAIL(NULL, GA_DEVSUP_ERROR);
    }
    // GA_USE_HALF should always work

    if (flags & GA_USE_PTX) {
      ptx_mode = 1;
    } else if (flags & GA_USE_BINARY) {
      binary_mode = 1;
    }

    if (binary_mode) {
      // The kernel object owns its own copy of the caller's blob.
      bin = memdup(strings[0], lengths[0]);
      bin_len = lengths[0];
      if (bin == NULL) {
        cuda_exit(ctx);
        FAIL(NULL, GA_MEMORY_ERROR);
      }
    } else {
      if (flags & GA_USE_CLUDA) {
        strb_appends(&sb, CUDA_PREAMBLE);
      }

      // Concatenate all source fragments into sb.
      if (lengths == NULL) {
        for (i = 0; i < count; i++)
          strb_appends(&sb, strings[i]);
      } else {
        for (i = 0; i < count; i++) {
          if (lengths[i] == 0)
            strb_appends(&sb, strings[i]);
          else
            strb_appendn(&sb, strings[i], lengths[i]);
        }
      }

      strb_append0(&sb);

      if (strb_error(&sb)) {
        strb_clear(&sb);
        cuda_exit(ctx);
        // Report the failure through ret like every other early exit does.
        FAIL(NULL, GA_MEMORY_ERROR);
      }

      if (ptx_mode) {
        // The buffer built in sb is handed over as the module image; it is
        // deliberately not cleared here.
        bin = sb.s;
        bin_len = sb.l;
      } else {
        bin = NULL;
        // Try the compile cache first, keyed on (source, length, arch).
        if (compile_cache != NULL) {
          k.src = sb.s;
          k.len = sb.l;
          memcpy(k.arch, ctx->bin_id, BIN_ID_LEN);
          av = cache_get(compile_cache, &k);
          if (av != NULL) {
            // If this copy fails we simply fall through and recompile.
            bin = memdup(av->bin, av->len);
            bin_len = av->len;
          }
        }
        if (bin == NULL) {
          bin = call_compiler(sb.s, sb.l, ctx->bin_id, &bin_len,
                              &log, &log_len, ret);
        }
        if (bin == NULL) {
          if (err_str != NULL) {
            strb debug_msg = STRB_STATIC_INIT;

            // The report starts with this header line, followed by the
            // numbered source and (if any) the compiler log.
            strb_appends(&debug_msg, "CUDA kernel build failure ::\n");

            /* Delete the final NUL */
            sb.l--;
            gpukernel_source_with_line_numbers(1, (const char **)&sb.s,
                                               &sb.l, &debug_msg);

            if (log != NULL) {
              strb_appends(&debug_msg, "\nCompiler log:\n");
              strb_appendn(&debug_msg, log, log_len);
            }
            *err_str = strb_cstr(&debug_msg);
            // *err_str will be free()d by the caller (see docs in kernel.h)
          }
          // Release the log whether or not the caller asked for a report.
          free(log);
          strb_clear(&sb);
          cuda_exit(ctx);
          return NULL;
        }
        // The compiler may have handed back a log even though the build
        // succeeded; it is not needed any more.  free(NULL) is a no-op when
        // there was none (e.g. on a cache hit).
        free(log);
        log = NULL;

        if (compile_cache == NULL)
          compile_cache = cache_twoq(16, 16, 16, 8, src_eq, src_hash, src_free,
                                     bin_free);

        // Storing the binary in the cache is best effort: any allocation
        // failure below just skips caching and carries on with `bin`.
        if (compile_cache != NULL) {
          ak = malloc(sizeof(*ak));
          av = malloc(sizeof(*av));
          if (ak == NULL || av == NULL) {
            free(ak);
            free(av);
            goto done;
          }
          ak->src = memdup(sb.s, sb.l);
          if (ak->src == NULL) {
            free(ak);
            free(av);
            goto done;
          }
          ak->len = sb.l;
          memmove(ak->arch, ctx->bin_id, BIN_ID_LEN);
          av->len = bin_len;
          av->bin = memdup(bin, bin_len);
          if (av->bin == NULL) {
            src_free(ak);
            free(av);
            goto done;
          }
          cache_add(compile_cache, ak, av);
        }
      done:
        strb_clear(&sb);
      }
    }

    res = calloc(1, sizeof(*res));
    if (res == NULL) {
      free(bin);
      cuda_exit(ctx);
      FAIL(NULL, GA_SYS_ERROR);
    }

    // From here on `bin` is stored in res, so the error paths below release
    // everything through _cuda_freekernel(res).
    res->bin_sz = bin_len;
    res->bin = bin;

    res->refcnt = 1;
    res->argcount = argcount;
    res->types = calloc(argcount, sizeof(int));
    if (res->types == NULL) {
      _cuda_freekernel(res);
      cuda_exit(ctx);
      FAIL(NULL, GA_MEMORY_ERROR);
    }
    memcpy(res->types, types, argcount*sizeof(int));
    res->args = calloc(argcount, sizeof(void *));
    if (res->args == NULL) {
      _cuda_freekernel(res);
      cuda_exit(ctx);
      FAIL(NULL, GA_MEMORY_ERROR);
    }

    ctx->err = cuModuleLoadData(&res->m, bin);

    if (ctx->err != CUDA_SUCCESS) {
      _cuda_freekernel(res);
      cuda_exit(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }

    ctx->err = cuModuleGetFunction(&res->k, res->m, fname);
    if (ctx->err != CUDA_SUCCESS) {
      _cuda_freekernel(res);
      cuda_exit(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }

    // The kernel keeps a reference on its context.
    res->ctx = ctx;
    ctx->refcnt++;
    cuda_exit(ctx);
    TAG_KER(res);
    return res;
}