Example 1
static gpudata *cuda_transfer(gpudata *src, size_t offset, size_t sz,
                              void *dst_c, int may_share) {
  cuda_context *ctx = src->ctx;
  cuda_context *dst_ctx = (cuda_context *)dst_c;
  gpudata *dst;

  ASSERT_BUF(src);
  ASSERT_CTX(ctx);
  ASSERT_CTX(dst_ctx);

  if (ctx == dst_ctx) {
    if (may_share && offset == 0) {
      /* Alias the source buffer instead of copying it */
      cuda_retain(src);
      return src;
    }
    dst = cuda_alloc(ctx, sz, NULL, 0, NULL);
    if (dst == NULL) return NULL;
    cuda_enter(ctx);

    cuda_wait(src, CUDA_WAIT_READ);
    cuda_wait(dst, CUDA_WAIT_WRITE);

    ctx->err = cuMemcpyDtoDAsync(dst->ptr, src->ptr+offset, sz, ctx->s);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      cuda_free(dst);
      return NULL;
    }
    cuda_record(src, CUDA_WAIT_READ);
    cuda_record(dst, CUDA_WAIT_WRITE);

    cuda_exit(ctx);
    return dst;
  }

  dst = cuda_alloc(dst_ctx, sz, NULL, 0, NULL);
  if (dst == NULL)
    return NULL;
  cuda_enter(ctx);
  cuda_waits(src, CUDA_WAIT_READ, dst_ctx->mem_s);
  cuda_waits(dst, CUDA_WAIT_WRITE, dst_ctx->mem_s);
  ctx->err = cuMemcpyPeerAsync(dst->ptr, dst->ctx->ctx, src->ptr+offset,
                               src->ctx->ctx, sz, dst_ctx->mem_s);
  if (ctx->err != CUDA_SUCCESS) {
    cuda_free(dst);
    cuda_exit(ctx);
    return NULL;
  }

  cuda_records(dst, CUDA_WAIT_WRITE, dst_ctx->mem_s);
  cuda_records(src, CUDA_WAIT_READ, dst_ctx->mem_s);

  cuda_exit(ctx);
  return dst;
}
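The same-context branch above shows the ordering discipline the library enforces around every async copy: wait on the events guarding both buffers, issue the copy, then record fresh events. Below is a minimal standalone sketch of that pattern using raw driver-API events; it is not the library's code path, and all handles are assumed to have been created elsewhere.

#include <stddef.h>
#include <cuda.h>

/* Hedged sketch of the wait/copy/record ordering in cuda_transfer:
 * make the stream wait on the producer's event, queue the async copy,
 * then record an event that later consumers can wait on. */
static CUresult ordered_d2d_copy(CUdeviceptr dst, CUdeviceptr src, size_t sz,
                                 CUstream s, CUevent last_write, CUevent done) {
  CUresult err;

  /* Do not read `src` before its last writer has finished. */
  err = cuStreamWaitEvent(s, last_write, 0);
  if (err != CUDA_SUCCESS) return err;

  err = cuMemcpyDtoDAsync(dst, src, sz, s);
  if (err != CUDA_SUCCESS) return err;

  /* Publish completion so later work can synchronize on `done`. */
  return cuEventRecord(done, s);
}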
Example 2
void cuda_exit(cuda_context *ctx) {
  ASSERT_CTX(ctx);
  assert(ctx->enter > 0);
  ctx->enter--;
  if (!ctx->enter)
    cuCtxPopCurrent(NULL);
}
Example 3
static void cuda_free_ctx(cuda_context *ctx) {
  gpuarray_blas_ops *blas_ops;
  gpudata *next, *curr;

  ASSERT_CTX(ctx);
  ctx->refcnt--;
  if (ctx->refcnt == 0) {
    assert(ctx->enter == 0 && "Context was active when freed!");
    if (ctx->blas_handle != NULL) {
      ctx->err = cuda_property(ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS,
                               &blas_ops);
      blas_ops->teardown(ctx);
    }
    cuMemFreeHost((void *)ctx->errbuf->ptr);
    deallocate(ctx->errbuf);

    cuStreamDestroy(ctx->s);

    /* Clear out the freelist */
    for (curr = ctx->freeblocks; curr != NULL; curr = next) {
      next = curr->next;
      cuMemFree(curr->ptr);
      deallocate(curr);
    }

    if (!(ctx->flags & DONTFREE))
      cuCtxDestroy(ctx->ctx);
    cache_destroy(ctx->extcopy_cache);
    CLEAR(ctx);
    free(ctx);
  }
}
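The freelist loop above is the standard way to destroy a singly linked list: read `next` before releasing the current node. A generic sketch of the same walk (the `block` type here is illustrative, not the library's gpudata):

#include <stdlib.h>

struct block {
  struct block *next;
  /* ... payload ... */
};

/* Free every node; `next` must be saved before the node is released,
 * because the node cannot be touched afterwards. */
static void free_list(struct block *head) {
  struct block *curr, *next;
  for (curr = head; curr != NULL; curr = next) {
    next = curr->next;
    free(curr);
  }
}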
Example 4
/**
 * \brief NCCL implementation of \ref gpucomm_new.
 */
static int comm_new(gpucomm **comm_ptr, gpucontext *ctx,
                    gpucommCliqueId comm_id, int ndev, int rank) {
  gpucomm *comm;
  ncclResult_t err;

  ASSERT_CTX(ctx);

  GA_CHECK(setup_lib(ctx->err));

  comm = calloc(1, sizeof(*comm));
  if (comm == NULL) {
    *comm_ptr = NULL;  // Set to NULL on failure
    return error_sys(ctx->err, "calloc");
  }
  comm->ctx = (cuda_context *)ctx;  // cast to the underlying CUDA context
  // Hold a reference so the context is not destroyed before the communicator
  comm->ctx->refcnt++;
  cuda_enter(comm->ctx);  // Make the context active on this thread
  err = ncclCommInitRank(&comm->c, ndev, *((ncclUniqueId *)&comm_id), rank);
  cuda_exit(comm->ctx);
  TAG_COMM(comm);
  if (err != ncclSuccess) {
    *comm_ptr = NULL;  // Set to NULL on failure
    comm_clear(comm);
    return error_nccl(ctx->err, "ncclCommInitRank", err);
  }
  *comm_ptr = comm;
  return GA_NO_ERROR;
}
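For context, comm_new wraps the standard NCCL bootstrap: one process generates a unique id and every rank calls ncclCommInitRank with that same id and an agreed-on rank count. A hedged sketch of the full handshake, with the out-of-band id exchange left as comments since NCCL does not provide it:

#include <nccl.h>

/* Sketch: rank 0 creates the clique id and distributes it out of band
 * (MPI, sockets, a file, ...); all ranks then join the communicator. */
static ncclResult_t bootstrap(ncclComm_t *comm, int nranks, int rank) {
  ncclUniqueId id;

  if (rank == 0) {
    ncclResult_t err = ncclGetUniqueId(&id);
    if (err != ncclSuccess) return err;
    /* ... send `id` to the other ranks ... */
  } else {
    /* ... receive `id` from rank 0 ... */
  }
  return ncclCommInitRank(comm, nranks, id, rank);
}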
Example 5
void cuda_enter(cuda_context *ctx) {
  ASSERT_CTX(ctx);
  cuCtxGetCurrent(&ctx->old);
  if (ctx->old != ctx->ctx)
    ctx->err = cuCtxSetCurrent(ctx->ctx);
  /* If no context was current in the first place, take over the thread
     so the get/set dance can be skipped on later entries. */
  if (ctx->old == NULL) ctx->old = ctx->ctx;
}
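This variant of cuda_enter switches contexts with get/set instead of push/pop. A standalone sketch of the save/switch/restore dance it implies (the body between the switches is a placeholder):

#include <cuda.h>

/* Sketch: remember the current context, switch only when needed, and
 * restore the old one afterwards. */
static CUresult run_in_ctx(CUcontext target) {
  CUcontext old;
  CUresult err;

  err = cuCtxGetCurrent(&old);
  if (err != CUDA_SUCCESS) return err;
  if (old != target) {
    err = cuCtxSetCurrent(target);
    if (err != CUDA_SUCCESS) return err;
  }

  /* ... work that must run against `target` ... */

  if (old != NULL && old != target)
    err = cuCtxSetCurrent(old);  /* restore the previous context */
  return err;
}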
Example 6
static void cuda_free_ctx(cuda_context *ctx) {
  gpuarray_blas_ops *blas_ops;

  ASSERT_CTX(ctx);
  ctx->refcnt--;
  if (ctx->refcnt == 0) {
    if (ctx->blas_handle != NULL) {
      ctx->err = cuda_property(ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS, &blas_ops);
      blas_ops->teardown(ctx);
    }
    cuStreamDestroy(ctx->s);
    if (!(ctx->flags & DONTFREE))
      cuCtxDestroy(ctx->ctx);
    cache_free(ctx->extcopy_cache);
    CLEAR(ctx);
    free(ctx);
  }
}
Example 7
static void cuda_free_ctx(cuda_context *ctx) {
  gpuarray_blas_ops *blas_ops;

  ASSERT_CTX(ctx);
  ctx->refcnt--;
  if (ctx->refcnt == 0) {
    assert(ctx->enter == 0 && "Context was active when freed!");
    if (ctx->blas_handle != NULL) {
      ctx->err = cuda_property(ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS,
                               &blas_ops);
      blas_ops->teardown(ctx);
    }
    ctx->refcnt = 2; /* Prevent recursive calls */
    cuda_free(ctx->errbuf);
    cuStreamDestroy(ctx->s);
    if (!(ctx->flags & DONTFREE))
      cuCtxDestroy(ctx->ctx);
    cache_free(ctx->extcopy_cache);
    CLEAR(ctx);
    free(ctx);
  }
}
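The `ctx->refcnt = 2` line is the interesting one: cuda_free(ctx->errbuf) can re-enter cuda_free_ctx, because the error buffer holds a reference on its own context, and pre-loading the count keeps that nested call from reaching zero and running the teardown twice. A generic sketch of the guard (the types are illustrative, not the library's):

/* Re-entrancy guard in a refcounted destructor: releasing an owned
 * resource may call back into release(), so the count is bumped first
 * and the nested call only drops it from 2 to 1. */
struct obj {
  unsigned int refcnt;
  struct obj *owned;   /* holds a reference back to us */
};

static void destroy(struct obj *o);

static void release(struct obj *o) {
  if (--o->refcnt == 0)
    destroy(o);
}

static void destroy(struct obj *o) {
  o->refcnt = 2;       /* prevent recursion into destroy() */
  if (o->owned != NULL)
    release(o->owned); /* may re-enter release(o) */
  /* ... final teardown and free(o) ... */
}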
Example 8
static int cuda_property(void *c, gpudata *buf, gpukernel *k, int prop_id,
                         void *res) {
  cuda_context *ctx = NULL;
  if (c != NULL) {
    ctx = (cuda_context *)c;
    ASSERT_CTX(ctx);
  } else if (buf != NULL) {
    ASSERT_BUF(buf);
    ctx = buf->ctx;
  } else if (k != NULL) {
    ASSERT_KER(k);
    ctx = k->ctx;
  }
  /* 512 and 1024 are magic numbers; buffer.h documents the property-ID
     ranges they delimit. */
  if (prop_id < 512) {
    if (ctx == NULL)
      return GA_VALUE_ERROR;
  } else if (prop_id < 1024) {
    if (buf == NULL)
      return GA_VALUE_ERROR;
  } else {
    if (k == NULL)
      return GA_VALUE_ERROR;
  }

  switch (prop_id) {
    char *s;
    CUdevice id;
    int i;
    size_t sz;

  case GA_CTX_PROP_DEVNAME:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    /* 256 is what the CUDA API uses so it's good enough for me */
    s = malloc(256);
    if (s == NULL) {
      cuda_exit(ctx);
      return GA_MEMORY_ERROR;
    }
    ctx->err = cuDeviceGetName(s, 256, id);
    if (ctx->err != CUDA_SUCCESS) {
      free(s);  /* don't leak the name buffer on the error path */
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((char **)res) = s;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_MAXLSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_LMEMSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_NUMPROCS:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i,
                                    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((unsigned int *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_MAXGSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_BLAS_OPS:
#ifdef WITH_CUDA_CUBLAS
    *((gpuarray_blas_ops **)res) = &cublas_ops;
    return GA_NO_ERROR;
#else
    *((void **)res) = NULL;
    return GA_DEVSUP_ERROR;
#endif

  case GA_CTX_PROP_BIN_ID:
    *((const char **)res) = ctx->bin_id;
    return GA_NO_ERROR;

  case GA_CTX_PROP_ERRBUF:
    *((gpudata **)res) = ctx->errbuf;
    return GA_NO_ERROR;

  case GA_CTX_PROP_TOTAL_GMEM:
    cuda_enter(ctx);
    ctx->err = cuMemGetInfo(&sz, (size_t *)res);
    cuda_exit(ctx);
    return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR;

  case GA_CTX_PROP_FREE_GMEM:
    cuda_enter(ctx);
    ctx->err = cuMemGetInfo((size_t *)res, &sz);
    cuda_exit(ctx);
    return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR;

  case GA_BUFFER_PROP_REFCNT:
    *((unsigned int *)res) = buf->refcnt;
    return GA_NO_ERROR;

  case GA_BUFFER_PROP_SIZE:
    *((size_t *)res) = buf->sz;
    return GA_NO_ERROR;

  case GA_BUFFER_PROP_CTX:
  case GA_KERNEL_PROP_CTX:
    *((void **)res) = (void *)ctx;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_MAXLSIZE:
    cuda_enter(ctx);
    ctx->err = cuFuncGetAttribute(&i,
                                  CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                  k->k);
    cuda_exit(ctx);
    if (ctx->err != CUDA_SUCCESS)
      return GA_IMPL_ERROR;
    *((size_t *)res) = i;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_PREFLSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_WARP_SIZE, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    cuda_exit(ctx);
    *((size_t *)res) = i;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_NUMARGS:
    *((unsigned int *)res) = k->argcount;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_TYPES:
    *((const int **)res) = k->types;
    return GA_NO_ERROR;

  default:
    return GA_INVALID_ERROR;
  }
}
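Most of the GA_CTX_PROP_* cases above reduce to cuCtxGetDevice followed by cuDeviceGetAttribute. A minimal standalone sketch of that query pattern (any CUdevice_attribute can be passed):

#include <cuda.h>

/* Sketch: fetch an integer attribute of the device backing the
 * currently bound context, as the property cases above do. */
static CUresult current_device_attr(CUdevice_attribute attr, int *out) {
  CUdevice dev;
  CUresult err;

  err = cuCtxGetDevice(&dev);
  if (err != CUDA_SUCCESS) return err;
  return cuDeviceGetAttribute(out, attr, dev);
}

For example, current_device_attr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, &i) mirrors the GA_KERNEL_PROP_PREFLSIZE case above.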
Example 9
void cuda_enter(cuda_context *ctx) {
  ASSERT_CTX(ctx);
  if (!ctx->enter)
    cuCtxPushCurrent(ctx->ctx);
  ctx->enter++;
}
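Paired with the cuda_exit of Example 2, this forms a counted push/pop guard: only the outermost enter pushes the context and only the matching outermost exit pops it, so nested pairs are cheap. A hedged standalone sketch of the same idea:

#include <cuda.h>

/* Counted context guard: push on the first enter, pop on the last
 * exit, so enter/exit pairs nest freely. */
struct ctx_guard {
  CUcontext ctx;
  unsigned int depth;
};

static void guard_enter(struct ctx_guard *g) {
  if (g->depth == 0)
    cuCtxPushCurrent(g->ctx);
  g->depth++;
}

static void guard_exit(struct ctx_guard *g) {
  g->depth--;
  if (g->depth == 0)
    cuCtxPopCurrent(NULL);
}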
Example 10
CUstream cuda_get_stream(void *ctx) {
  ASSERT_CTX((cuda_context *)ctx);
  return ((cuda_context *)ctx)->s;
}
Example 11
CUcontext cuda_get_ctx(void *ctx) {
  ASSERT_CTX((cuda_context *)ctx);
  return ((cuda_context *)ctx)->ctx;
}
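These accessors are the bridge for code that wants to talk to the driver API directly. A hedged usage sketch, assuming a valid cuda_context pointer obtained elsewhere:

#include <cuda.h>

CUstream cuda_get_stream(void *ctx);  /* from Example 10 */

/* Sketch: block until everything queued on the wrapper's stream
 * has completed. */
static CUresult sync_wrapper_stream(void *ctx) {
  return cuStreamSynchronize(cuda_get_stream(ctx));
}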
Example 12
/**
 * \brief NCCL implementation of \ref gpucomm_gen_clique_id.
 */
static int generate_clique_id(gpucontext *c, gpucommCliqueId *comm_id) {
  ASSERT_CTX(c);

  GA_CHECK(setup_lib(c->err));
  NCCL_CHKFAIL(c, ncclGetUniqueId((ncclUniqueId *)comm_id));
}
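A clique id is just an opaque blob of sizeof(ncclUniqueId) bytes (128 in current NCCL); generate_clique_id fills it on one process, and the caller is responsible for shipping it verbatim to every other rank. A minimal sketch of the producing side, with the transport left out:

#include <string.h>
#include <nccl.h>

/* Sketch: create a unique id and serialize it for an out-of-band
 * channel (file, socket, MPI broadcast, ...). */
static int make_and_pack_id(char buf[sizeof(ncclUniqueId)]) {
  ncclUniqueId id;

  if (ncclGetUniqueId(&id) != ncclSuccess)
    return -1;
  memcpy(buf, &id, sizeof(id));  /* the id is plain bytes */
  return 0;
}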