/*
 * Wrap an existing CUcontext in a freshly malloc'd cuda_context.
 *
 * ctx   - driver context handle stored in the new object.
 * flags - stored verbatim in the new object's flags field.
 *
 * Returns the new cuda_context (as void *) with refcnt set to 1, or NULL
 * if any step fails.  On failure everything acquired so far is released
 * in reverse order of acquisition before returning.
 *
 * NOTE(review): `err` is not declared in this function; it is assumed to
 * be a file-scope CUresult that records the last driver error -- confirm.
 */
void *cuda_make_ctx(CUcontext ctx, int flags) {
  cuda_context *nctx;
  void *host_buf;

  nctx = malloc(sizeof(*nctx));
  if (nctx == NULL)
    return NULL;

  /* Plain field initialisation; nothing here can fail. */
  nctx->ctx = ctx;
  nctx->flags = flags;
  nctx->err = CUDA_SUCCESS;
  nctx->refcnt = 1;
  nctx->enter = 0;
  nctx->blas_handle = NULL;
  nctx->freeblocks = NULL;

  /* detect_arch fills bin_id; a non-zero return is treated as failure. */
  if (detect_arch(ARCH_PREFIX, nctx->bin_id, &err)) {
    free(nctx);
    return NULL;
  }

  nctx->extcopy_cache = cache_lru(64, 32, (cache_eq_fn)extcopy_eq,
                                  (cache_hash_fn)extcopy_hash,
                                  (cache_freek_fn)extcopy_free,
                                  (cache_freev_fn)cuda_freekernel);
  if (nctx->extcopy_cache == NULL) {
    free(nctx);
    return NULL;
  }

  /* Main stream, created with default flags. */
  err = cuStreamCreate(&nctx->s, 0);
  if (err != CUDA_SUCCESS) {
    cache_destroy(nctx->extcopy_cache);
    free(nctx);
    return NULL;
  }

  /* Second stream, created non-blocking. */
  err = cuStreamCreate(&nctx->mem_s, CU_STREAM_NON_BLOCKING);
  if (err != CUDA_SUCCESS) {
    cuStreamDestroy(nctx->s);
    cache_destroy(nctx->extcopy_cache);
    free(nctx);
    return NULL;
  }

  /* 16 bytes of host memory from the driver, zeroed, backing errbuf. */
  err = cuMemAllocHost(&host_buf, 16);
  if (err != CUDA_SUCCESS) {
    cuStreamDestroy(nctx->mem_s);
    cuStreamDestroy(nctx->s);
    cache_destroy(nctx->extcopy_cache);
    free(nctx);
    return NULL;
  }
  memset(host_buf, 0, 16);

  /* The context has to be tagged before new_gpudata is handed it. */
  TAG_CTX(nctx);
  nctx->errbuf = new_gpudata(nctx, (CUdeviceptr)host_buf, 16);
  if (nctx->errbuf == NULL) {
    /* Propagate the error new_gpudata left in the context. */
    err = nctx->err;
    cuMemFreeHost(host_buf);
    cuStreamDestroy(nctx->mem_s);
    cuStreamDestroy(nctx->s);
    cache_destroy(nctx->extcopy_cache);
    free(nctx);
    return NULL;
  }

  /* Mark the buffer as wrapping a mapped (host-allocated) pointer. */
  nctx->errbuf->flags |= CUDA_MAPPED_PTR;
  return nctx;
}
/*
 * Wrap an existing CUcontext in a freshly malloc'd cuda_context.
 *
 * ctx   - driver context handle stored in the new object.
 * flags - stored verbatim in the new object's flags field.
 *
 * Returns the new cuda_context (as void *), or NULL if allocation,
 * architecture detection, cache creation, stream creation or the error
 * buffer allocation fails.  Partial state is torn down before returning
 * NULL.
 *
 * NOTE(review): `err` is not declared in this function; it is assumed to
 * be a file-scope CUresult that records the last driver error -- confirm.
 */
void *cuda_make_ctx(CUcontext ctx, int flags) {
  cuda_context *nctx;
  int64_t zero = 0;
  int status = 0;

  nctx = malloc(sizeof(*nctx));
  if (nctx == NULL)
    return NULL;

  /* Plain field initialisation; nothing here can fail. */
  nctx->ctx = ctx;
  nctx->flags = flags;
  nctx->err = CUDA_SUCCESS;
  nctx->refcnt = 1;
  nctx->enter = 0;
  nctx->blas_handle = NULL;

  /* detect_arch fills bin_id; a non-zero return is treated as failure. */
  if (detect_arch(ARCH_PREFIX, nctx->bin_id, &err))
    goto undo_ctx;

  nctx->extcopy_cache = cache_alloc(64, 32);
  if (nctx->extcopy_cache == NULL)
    goto undo_ctx;

  err = cuStreamCreate(&nctx->s, 0);
  if (err != CUDA_SUCCESS)
    goto undo_cache;

  /* The context has to be tagged before cuda_alloc is handed it. */
  TAG_CTX(nctx);
  /* 8-byte buffer initialised from the zero-valued int64 above. */
  nctx->errbuf = cuda_alloc(nctx, 8, &zero, GA_BUFFER_INIT, &status);
  if (status != GA_NO_ERROR) {
    /* Propagate the error cuda_alloc left in the context. */
    err = nctx->err;
    goto undo_errbuf;
  }

  /* Drop one count so the errbuf does not form a reference loop. */
  nctx->refcnt--;
  return nctx;

 undo_errbuf:
  /* Stream and cache both exist here; cache goes first, then stream. */
  cache_free(nctx->extcopy_cache);
  cuStreamDestroy(nctx->s);
  goto undo_ctx;
 undo_cache:
  cache_free(nctx->extcopy_cache);
 undo_ctx:
  free(nctx);
  return NULL;
}
/*
 * Wrap an existing CUcontext in a freshly malloc'd cuda_context.
 *
 * ctx   - driver context handle stored in the new object.
 * flags - stored verbatim in the new object's flags field.
 *
 * Returns the new cuda_context (as void *) with refcnt set to 1, or NULL
 * if the allocation, the extcopy cache or the stream cannot be created.
 *
 * NOTE(review): `err` is not declared in this function; it is assumed to
 * be a file-scope CUresult that records the last driver error -- confirm.
 */
void *cuda_make_ctx(CUcontext ctx, int flags) {
  cuda_context *nctx = malloc(sizeof(*nctx));

  if (nctx == NULL)
    return NULL;

  /* Plain field initialisation; nothing here can fail. */
  nctx->ctx = ctx;
  nctx->flags = flags;
  nctx->err = CUDA_SUCCESS;
  nctx->refcnt = 1;
  nctx->blas_handle = NULL;

  nctx->extcopy_cache = cache_alloc(64, 32);
  if (nctx->extcopy_cache == NULL)
    goto undo_ctx;

  /* Stream created with default flags. */
  err = cuStreamCreate(&nctx->s, 0);
  if (err != CUDA_SUCCESS)
    goto undo_cache;

  TAG_CTX(nctx);
  return nctx;

 undo_cache:
  cache_free(nctx->extcopy_cache);
 undo_ctx:
  free(nctx);
  return NULL;
}