Code example #1
static int cuda_share(gpudata *a, gpudata *b, int *ret) {
  ASSERT_BUF(a);
  ASSERT_BUF(b);
  return (a->ctx == b->ctx && a->sz != 0 && b->sz != 0 &&
          ((a->ptr <= b->ptr && a->ptr + a->sz > b->ptr) ||
           (b->ptr <= a->ptr && b->ptr + b->sz > a->ptr)));
}
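
cuda_share treats each buffer as the half-open address range [ptr, ptr + sz) and reports sharing only when both ranges are non-empty, belong to the same context, and overlap. A minimal, CUDA-free sketch of the same overlap test (hypothetical helper name, not part of the library):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Half-open interval test, as in cuda_share: [a, a+an) and [b, b+bn)
 * overlap iff each range starts before the other one ends. */
static int ranges_overlap(uintptr_t a, size_t an, uintptr_t b, size_t bn) {
  return an != 0 && bn != 0 &&
         ((a <= b && a + an > b) || (b <= a && b + bn > a));
}

int main(void) {
  assert(ranges_overlap(0x1000, 0x100, 0x10ff, 0x10));   /* last byte shared */
  assert(!ranges_overlap(0x1000, 0x100, 0x1100, 0x10));  /* adjacent, no overlap */
  assert(!ranges_overlap(0x1000, 0, 0x1000, 0x10));      /* empty buffer never shares */
  return 0;
}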
Code example #2
static int cuda_move(gpudata *dst, size_t dstoff, gpudata *src,
                     size_t srcoff, size_t sz) {
    cuda_context *ctx = dst->ctx;
    int res = GA_NO_ERROR;
    ASSERT_BUF(dst);
    ASSERT_BUF(src);
    if (src->ctx != dst->ctx) return GA_VALUE_ERROR;

    if (sz == 0) return GA_NO_ERROR;

    if ((dst->sz - dstoff) < sz || (src->sz - srcoff) < sz)
        return GA_VALUE_ERROR;

    cuda_enter(ctx);

    cuda_wait(src, CUDA_WAIT_READ);
    cuda_wait(dst, CUDA_WAIT_WRITE);

    ctx->err = cuMemcpyDtoDAsync(dst->ptr + dstoff, src->ptr + srcoff, sz,
                                 ctx->s);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }

    cuda_record(src, CUDA_WAIT_READ);
    cuda_record(dst, CUDA_WAIT_WRITE);
    cuda_exit(ctx);
    return res;
}
Code example #3
/**
 * \brief NCCL implementation of \ref gpucomm_all_reduce.
 */
static int all_reduce(gpudata *src, size_t offsrc, gpudata *dest,
                      size_t offdest, size_t count, int typecode, int opcode,
                      gpucomm *comm) {
  // need dummy init so that compiler shuts up
  ncclRedOp_t op = ncclNumOps;
  ncclDataType_t datatype = ncclNumTypes;
  cuda_context *ctx;

  ASSERT_BUF(src);
  ASSERT_COMM(comm);
  ASSERT_BUF(dest);
  GA_CHECK(check_restrictions(src, offsrc, dest, offdest, count, typecode,
                              opcode, comm, &datatype, &op));

  ctx = comm->ctx;
  cuda_enter(ctx);

  // sync: wait till a write has finished (out of concurrent kernels)
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(src, CUDA_WAIT_READ));
  // sync: wait till a read/write has finished (out of concurrent kernels)
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE));

  // change stream of nccl ops to enable concurrency
  NCCL_EXIT_ON_ERROR(ctx, ncclAllReduce((void *)(src->ptr + offsrc),
                                        (void *)(dest->ptr + offdest), count,
                                        datatype, op, comm->c, ctx->s));

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE));

  cuda_exit(ctx);

  return GA_NO_ERROR;
}
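
For reference, the sketch below issues the same ncclAllReduce call outside the library, using the CUDA runtime API and a single-device communicator (with one rank the reduction is an identity, but the call sequence mirrors what the backend enqueues on ctx->s). A minimal, assumption-laden example, not libgpuarray code; error checking is omitted.

#include <stdio.h>
#include <cuda_runtime.h>
#include <nccl.h>

int main(void) {
  const size_t n = 4;
  float h_in[4] = {1, 2, 3, 4}, h_out[4];
  float *d_in, *d_out;
  int dev = 0;
  ncclComm_t comm;
  cudaStream_t s;

  cudaSetDevice(dev);
  cudaStreamCreate(&s);
  cudaMalloc((void **)&d_in, n * sizeof(float));
  cudaMalloc((void **)&d_out, n * sizeof(float));
  cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);

  /* one communicator spanning a single device */
  ncclCommInitAll(&comm, 1, &dev);
  /* same call shape as in all_reduce(): src, dest, count, type, op, comm, stream */
  ncclAllReduce(d_in, d_out, n, ncclFloat, ncclSum, comm, s);
  cudaStreamSynchronize(s);

  cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
  printf("%g %g %g %g\n", h_out[0], h_out[1], h_out[2], h_out[3]);

  ncclCommDestroy(comm);
  cudaFree(d_in);
  cudaFree(d_out);
  cudaStreamDestroy(s);
  return 0;
}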
Code example #4
static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, double alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb,
                 double beta, gpudata *C, size_t offC, size_t ldc) {
  cuda_context *ctx = A->ctx;
  gpudata *T;
  size_t t;
  cublasStatus_t err;
  cb_transpose transT;

  ASSERT_BUF(A);
  ASSERT_BUF(B);
  ASSERT_BUF(C);

  if (order == cb_c) {
    /* swap A and B */
    t = N;
    N = M;
    M = t;
    T = A;
    A = B;
    B = T;
    t = lda;
    lda = ldb;
    ldb = t;
    transT = transA;
    transA = transB;
    transB = transT;
    t = offA;
    offA = offB;
    offB = t;
  }

  cuda_enter(ctx);

  cuda_wait(A, CUDA_WAIT_READ);
  cuda_wait(B, CUDA_WAIT_READ);
  cuda_wait(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE);

  err = cublasDgemm(((blas_handle *)ctx->blas_handle)->h,
                    convT(transA), convT(transB), M, N, K,
                    &alpha, ((double *)A->ptr) + offA, lda,
                    ((double *)B->ptr) + offB, ldb, &beta,
                    ((double *)C->ptr) + offC, ldc);
  if (err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    if (err == CUBLAS_STATUS_ARCH_MISMATCH)
      return GA_DEVSUP_ERROR;
    return GA_BLAS_ERROR;
  }

  cuda_record(A, CUDA_WAIT_READ);
  cuda_record(B, CUDA_WAIT_READ);
  cuda_record(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE);

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
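
The cb_c branch relies on the transpose identity (A·B)ᵀ = Bᵀ·Aᵀ: a row-major matrix reinterpreted as column-major data is its transpose, so swapping A with B (together with M/N, lda/ldb, the offsets and the transpose flags) lets a column-major GEMM write the row-major result directly. A CPU-only sketch of the same trick with a naive column-major kernel (hypothetical helper, not library code):

#include <stdio.h>

/* naive column-major GEMM: C = alpha * A * B + beta * C (no transposes) */
static void cm_gemm(int M, int N, int K, double alpha,
                    const double *A, int lda,
                    const double *B, int ldb,
                    double beta, double *C, int ldc) {
  for (int j = 0; j < N; j++)
    for (int i = 0; i < M; i++) {
      double acc = 0.0;
      for (int k = 0; k < K; k++)
        acc += A[i + k * lda] * B[k + j * ldb];
      C[i + j * ldc] = alpha * acc + beta * C[i + j * ldc];
    }
}

int main(void) {
  /* row-major operands: A is 2x3 (lda = 3), B is 3x2 (ldb = 2), C is 2x2 (ldc = 2) */
  double A[2 * 3] = {1, 2, 3,
                     4, 5, 6};
  double B[3 * 2] = {7, 8,
                     9, 10,
                     11, 12};
  double C[2 * 2] = {0};

  /* row-major C = A*B via a column-major kernel: swap the operands and M/N,
     keep each array's own leading dimension, exactly as dgemm() does for cb_c */
  cm_gemm(/*M=*/2, /*N=*/2, /*K=*/3, 1.0, B, 2, A, 3, 0.0, C, 2);

  /* expected row-major product: 58 64 / 139 154 */
  printf("%g %g\n%g %g\n", C[0], C[1], C[2], C[3]);
  return 0;
}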
Code example #5
static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X,
                size_t offX, int incX, gpudata *Y, size_t offY, int incY,
                gpudata *A, size_t offA, size_t lda) {
  cuda_context *ctx = X->ctx;
  blas_handle *h = (blas_handle *)ctx->blas_handle;
  gpudata *td;
  size_t t;

  ASSERT_BUF(X);
  ASSERT_BUF(Y);
  ASSERT_BUF(A);

  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) ||
      LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY))
    return GA_XLARGE_ERROR;

  if (order == cb_c) {
    t = M;
    M = N;
    N = t;
    t = offX;
    offX = offY;
    offY = t;
    t = incX;
    incX = incY;
    incY = t;
    td = X;
    X = Y;
    Y = td;
  }

  cuda_enter(ctx);

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_ALL));

  h->err = cublasDger(h->h, M, N, &alpha,
                      ((double *)X->ptr) + offX, incX,
                      ((double *)Y->ptr) + offY, incY,
                      ((double *)A->ptr) + offA, lda);
  if (h->err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
      return GA_DEVSUP_ERROR;
    return GA_BLAS_ERROR;
  }

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_ALL));

  cuda_exit(ctx);

  return GA_NO_ERROR;
}
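
The cb_c swap in dger follows from transposing the rank-1 update: a row-major A is the column-major layout of Aᵀ, so the column-major routine can be reused by exchanging x with y and M with N (together with their offsets and increments):

$$A \leftarrow \alpha\, x y^{\top} + A \quad\Longleftrightarrow\quad A^{\top} \leftarrow \alpha\, y x^{\top} + A^{\top}.$$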
Code example #6
static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
                 double alpha, gpudata *A, size_t offA, size_t lda,
                 gpudata *X, size_t offX, int incX,
                 double beta, gpudata *Y, size_t offY, int incY) {
  cuda_context *ctx = A->ctx;
  blas_handle *h = (blas_handle *)ctx->blas_handle;
  size_t t;

  ASSERT_BUF(A);
  ASSERT_BUF(X);
  ASSERT_BUF(Y);

  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) ||
      LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY))
    return GA_XLARGE_ERROR;

  if (order == cb_c) {
    t = N;
    N = M;
    M = t;

    if (transA == cb_no_trans) {
      transA = cb_trans;
    } else {
      transA = cb_no_trans;
    }
  }

  cuda_enter(ctx);

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_ALL));

  h->err = cublasDgemv(h->h,
                       convT(transA), M, N, &alpha,
                       ((double *)A->ptr) + offA, lda,
                       ((double *)X->ptr) + offX, incX,
                       &beta, ((double *)Y->ptr) + offY, incY);
  if (h->err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
      return GA_DEVSUP_ERROR;
    return GA_BLAS_ERROR;
  }

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_ALL));

  cuda_exit(ctx);

  return GA_NO_ERROR;
}
Code example #7
static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X,
                size_t offX, int incX, gpudata *Y, size_t offY, int incY,
                gpudata *A, size_t offA, size_t lda) {
  cuda_context *ctx = X->ctx;
  gpudata *td;
  size_t t;
  cublasStatus_t err;

  ASSERT_BUF(X);
  ASSERT_BUF(Y);
  ASSERT_BUF(A);

  if (order == cb_c) {
    t = M;
    M = N;
    N = t;
    t = offX;
    offX = offY;
    offY = t;
    t = incX;
    incX = incY;
    incY = t;
    td = X;
    X = Y;
    Y = td;
  }

  cuda_enter(ctx);

  cuda_wait(X, CUDA_WAIT_READ);
  cuda_wait(Y, CUDA_WAIT_READ);
  cuda_wait(A, CUDA_WAIT_READ|CUDA_WAIT_WRITE);

  err = cublasDger(((blas_handle *)ctx->blas_handle)->h, M, N, &alpha,
                   ((double *)X->ptr) + offX, incX,
                   ((double *)Y->ptr) + offY, incY,
                   ((double *)A->ptr) + offA, lda);
  if (err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    if (err == CUBLAS_STATUS_ARCH_MISMATCH)
      return GA_DEVSUP_ERROR;
    return GA_BLAS_ERROR;
  }

  cuda_record(X, CUDA_WAIT_READ);
  cuda_record(Y, CUDA_WAIT_READ);
  cuda_record(A, CUDA_WAIT_READ|CUDA_WAIT_WRITE);

  cuda_exit(ctx);

  return GA_NO_ERROR;
}
Code example #8
static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
                 double alpha, gpudata *A, size_t offA, size_t lda,
                 gpudata *X, size_t offX, int incX,
                 double beta, gpudata *Y, size_t offY, int incY) {
  cuda_context *ctx = A->ctx;
  cublasStatus_t err;
  size_t t;

  ASSERT_BUF(A);
  ASSERT_BUF(X);
  ASSERT_BUF(Y);

  if (order == cb_c) {
    t = N;
    N = M;
    M = t;

    if (transA == cb_no_trans) {
      transA = cb_trans;
    } else {
      transA = cb_no_trans;
    }
  }

  cuda_enter(ctx);

  cuda_wait(A, CUDA_WAIT_READ);
  cuda_wait(X, CUDA_WAIT_READ);
  cuda_wait(Y, CUDA_WAIT_READ|CUDA_WAIT_WRITE);

  err = cublasDgemv(((blas_handle *)ctx->blas_handle)->h,
                    convT(transA), M, N, &alpha,
                    ((double *)A->ptr) + offA, lda,
                    ((double *)X->ptr) + offX, incX,
                    &beta, ((double *)Y->ptr) + offY, incY);
  if (err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    if (err == CUBLAS_STATUS_ARCH_MISMATCH)
      return GA_DEVSUP_ERROR;
    return GA_BLAS_ERROR;
  }

  cuda_record(A, CUDA_WAIT_READ);
  cuda_record(X, CUDA_WAIT_READ);
  cuda_record(Y, CUDA_WAIT_READ|CUDA_WAIT_WRITE);

  cuda_exit(ctx);

  return GA_NO_ERROR;
}
Code example #9
static int cuda_read(void *dst, gpudata *src, size_t srcoff, size_t sz) {
    cuda_context *ctx = src->ctx;

    ASSERT_BUF(src);

    if (sz == 0) return GA_NO_ERROR;

    if ((src->sz - srcoff) < sz)
        return GA_VALUE_ERROR;

    cuda_enter(ctx);

    if (src->flags & CUDA_MAPPED_PTR) {
      ctx->err = cuEventSynchronize(src->wev);
      if (ctx->err != CUDA_SUCCESS) {
        cuda_exit(ctx);
        return GA_IMPL_ERROR;
      }
      memcpy(dst, (void *)(src->ptr + srcoff), sz);
    } else {
      cuda_waits(src, CUDA_WAIT_READ, ctx->mem_s);

      ctx->err = cuMemcpyDtoHAsync(dst, src->ptr + srcoff, sz, ctx->mem_s);
      if (ctx->err != CUDA_SUCCESS) {
        cuda_exit(ctx);
        return GA_IMPL_ERROR;
      }
      cuda_records(src, CUDA_WAIT_READ, ctx->mem_s);
    }
    cuda_exit(ctx);
    return GA_NO_ERROR;
}
Code example #10
static int cuda_write(gpudata *dst, size_t dstoff, const void *src,
                      size_t sz) {
    cuda_context *ctx = dst->ctx;

    ASSERT_BUF(dst);

    if (sz == 0) return GA_NO_ERROR;

    if ((dst->sz - dstoff) < sz)
        return GA_VALUE_ERROR;

    cuda_enter(ctx);

    if (dst->flags & CUDA_MAPPED_PTR) {
      ctx->err = cuEventSynchronize(dst->rev);
      if (ctx->err != CUDA_SUCCESS) {
        cuda_exit(ctx);
        return GA_IMPL_ERROR;
      }
      memcpy((void *)(dst->ptr + dstoff), src, sz);
    } else {
      cuda_waits(dst, CUDA_WAIT_WRITE, ctx->mem_s);

      ctx->err = cuMemcpyHtoDAsync(dst->ptr + dstoff, src, sz, ctx->mem_s);
      if (ctx->err != CUDA_SUCCESS) {
        cuda_exit(ctx);
        return GA_IMPL_ERROR;
      }

      cuda_records(dst, CUDA_WAIT_WRITE, ctx->mem_s);
    }
    cuda_exit(ctx);
    return GA_NO_ERROR;
}
Code example #11
/**
 * \brief NCCL implementation of \ref gpucomm_broadcast.
 */
static int broadcast(gpudata *array, size_t offset, size_t count, int typecode,
                     int root, gpucomm *comm) {
  // need dummy init so that compiler shuts up
  ncclDataType_t datatype = ncclNumTypes;
  int rank = 0;
  cuda_context *ctx;

  ASSERT_BUF(array);
  ASSERT_COMM(comm);
  GA_CHECK(check_restrictions(array, offset, NULL, 0, count, typecode, 0, comm,
                              &datatype, NULL));
  GA_CHECK(get_rank(comm, &rank));

  ctx = comm->ctx;
  cuda_enter(ctx);

  // sync: wait till a write has finished (out of concurrent kernels)
  if (rank == root)
    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(array, CUDA_WAIT_READ));
  else
    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(array, CUDA_WAIT_WRITE));

  // change stream of nccl ops to enable concurrency
  NCCL_EXIT_ON_ERROR(ctx, ncclBcast((void *)(array->ptr + offset), count,
                                    datatype, root, comm->c, ctx->s));

  if (rank == root)
    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(array, CUDA_WAIT_READ));
  else
    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(array, CUDA_WAIT_WRITE));

  cuda_exit(ctx);

  return GA_NO_ERROR;
}
Code example #12
static void cuda_free(gpudata *d) {
  /* We ignore errors on free */
  ASSERT_BUF(d);
  d->refcnt--;
  if (d->refcnt == 0) {
    /* Keep a reference to the context since we deallocate the gpudata
     * object */
    cuda_context *ctx = d->ctx;
    if (d->flags & DONTFREE) {
      /* This is the path for "external" buffers */
      deallocate(d);
    } else if (ctx->flags & GA_CTX_DISABLE_ALLOCATION_CACHE) {
      /* Just free the pointer */
      cuMemFree(d->ptr);
      deallocate(d);
    } else {
      /* Find the position in the freelist.  Freelist is kept in order
         of allocation address */
      gpudata *next = d->ctx->freeblocks, *prev = NULL;
      for (; next && next->ptr < d->ptr; next = next->next) {
        prev = next;
      }
      next = prev != NULL ? prev->next : d->ctx->freeblocks;

      /* See if we can merge the block with the previous one */
      if (!(d->flags & CUDA_HEAD_ALLOC) &&
            prev != NULL && prev->ptr + prev->sz == d->ptr) {
        prev->sz = prev->sz + d->sz;
        cuda_wait(d, CUDA_WAIT_ALL);
        cuda_record(prev, CUDA_WAIT_ALL);
        deallocate(d);
        d = prev;
      } else if (prev != NULL) {
        prev->next = d;
      } else {
        d->ctx->freeblocks = d;
      }

      /* See if we can merge with next */
      if (next && !(next->flags & CUDA_HEAD_ALLOC) &&
          d->ptr + d->sz == next->ptr) {
        d->sz = d->sz + next->sz;
        d->next = next->next;
        cuda_wait(next, CUDA_WAIT_ALL);
        cuda_record(d, CUDA_WAIT_ALL);
        deallocate(next);
      } else {
        d->next = next;
      }
    }
    /* We keep this at the end since the freed buffer could hold the
     * last reference to the context, and clearing that reference
     * could trigger the freeing of the whole context, including the
     * freelist we just manipulated. */
    cuda_free_ctx(ctx);
  }
}
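
The freelist path above keeps blocks sorted by device address and coalesces a freed block with its neighbours whenever their [ptr, ptr + sz) ranges touch. The standalone sketch below reproduces just that merge logic on a plain linked list (hypothetical types, no CUDA_HEAD_ALLOC guard and no event bookkeeping):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* a free block, kept in a list sorted by address as ctx->freeblocks is */
typedef struct block {
  uintptr_t ptr;       /* start of the free range */
  size_t sz;           /* size of the free range */
  struct block *next;
} block;

/* insert b into *head (sorted by ptr), merging with adjacent neighbours */
static void insert_free(block **head, block *b) {
  block *prev = NULL, *next = *head;
  while (next && next->ptr < b->ptr) {
    prev = next;
    next = next->next;
  }
  /* merge with the previous block if it ends exactly where b starts */
  if (prev && prev->ptr + prev->sz == b->ptr) {
    prev->sz += b->sz;
    free(b);
    b = prev;
  } else if (prev) {
    prev->next = b;
  } else {
    *head = b;
  }
  /* merge with the next block if b ends exactly where it starts */
  if (next && b->ptr + b->sz == next->ptr) {
    b->sz += next->sz;
    b->next = next->next;
    free(next);
  } else {
    b->next = next;
  }
}

static block *mk(uintptr_t ptr, size_t sz) {
  block *b = malloc(sizeof(*b));
  b->ptr = ptr; b->sz = sz; b->next = NULL;
  return b;
}

int main(void) {
  block *head = NULL;
  insert_free(&head, mk(0x1000, 0x100));
  insert_free(&head, mk(0x1200, 0x100));
  insert_free(&head, mk(0x1100, 0x100));   /* bridges the two blocks above */
  for (block *b = head; b; b = b->next)    /* prints a single merged range */
    printf("[%#lx, %#lx)\n", (unsigned long)b->ptr,
           (unsigned long)(b->ptr + b->sz));
  return 0;
}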
Code example #13
/**
 * \brief NCCL implementation of \ref gpucomm_reduce_scatter.
 */
static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest,
                          size_t offdest, size_t count, int typecode,
                          int opcode, gpucomm *comm) {
  // need dummy init so that compiler shuts up
  ncclRedOp_t op = ncclNumOps;
  ncclDataType_t datatype = ncclNumTypes;
  int ndev = 0;
  size_t resc_size;
  cuda_context *ctx;

  ASSERT_BUF(src);
  ASSERT_COMM(comm);
  ASSERT_BUF(dest);
  GA_CHECK(get_count(comm, &ndev));
  GA_CHECK(check_restrictions(src, offsrc, NULL, 0, count * ndev, typecode,
                              opcode, comm, &datatype, &op));
  if (dest->ctx != comm->ctx)
    return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination and comm context differ");
  resc_size = count * gpuarray_get_elsize(typecode);
  if ((dest->sz - offdest) < resc_size)
    return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination too small for operation");
  assert(!(offdest > dest->sz));

  ctx = comm->ctx;
  cuda_enter(ctx);

  // sync: wait till a write has finished (out of concurrent kernels)
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(src, CUDA_WAIT_READ));
  // sync: wait till a read/write has finished (out of concurrent kernels)
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE));

  // change stream of nccl ops to enable concurrency
  NCCL_EXIT_ON_ERROR(ctx, ncclReduceScatter((void *)(src->ptr + offsrc),
                                            (void *)(dest->ptr + offdest), count,
                                            datatype, op, comm->c, ctx->s));

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE));

  cuda_exit(ctx);

  return GA_NO_ERROR;
}
Code example #14
static int cuda_records(gpudata *a, int flags, CUstream s) {
  ASSERT_BUF(a);
  cuda_enter(a->ctx);
  if (flags & CUDA_WAIT_READ)
    a->ctx->err = cuEventRecord(a->rev, s);
  if (flags & CUDA_WAIT_WRITE)
    a->ctx->err = cuEventRecord(a->wev, s);
  cuda_exit(a->ctx);
  return GA_NO_ERROR;
}
Code example #15
static gpudata *cuda_transfer(gpudata *src, size_t offset, size_t sz,
                              void *dst_c, int may_share) {
  cuda_context *ctx = src->ctx;
  cuda_context *dst_ctx = (cuda_context *)dst_c;
  gpudata *dst;

  ASSERT_BUF(src);
  ASSERT_CTX(ctx);
  ASSERT_CTX(dst_ctx);

  if (ctx == dst_ctx) {
    if (may_share && offset == 0) {
        cuda_retain(src);
        return src;
    }
    dst = cuda_alloc(ctx, sz, NULL, 0, NULL);
    if (dst == NULL) return NULL;
    cuda_enter(ctx);

    cuda_wait(src, CUDA_WAIT_READ);
    cuda_wait(dst, CUDA_WAIT_WRITE);

    ctx->err = cuMemcpyDtoDAsync(dst->ptr, src->ptr+offset, sz, ctx->s);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      cuda_free(dst);
      return NULL;
    }
    cuda_record(src, CUDA_WAIT_READ);
    cuda_record(dst, CUDA_WAIT_WRITE);

    cuda_exit(ctx);
    return dst;
  }

  dst = cuda_alloc(dst_ctx, sz, NULL, 0, NULL);
  if (dst == NULL)
    return NULL;
  cuda_enter(ctx);
  cuda_waits(src, CUDA_WAIT_READ, dst_ctx->mem_s);
  cuda_waits(dst, CUDA_WAIT_WRITE, dst_ctx->mem_s);
  ctx->err = cuMemcpyPeerAsync(dst->ptr, dst->ctx->ctx, src->ptr+offset,
                               src->ctx->ctx, sz, dst_ctx->mem_s);
  if (ctx->err != CUDA_SUCCESS) {
    cuda_free(dst);
    cuda_exit(ctx);
    return NULL;
  }

  cuda_records(dst, CUDA_WAIT_WRITE, dst_ctx->mem_s);
  cuda_records(src, CUDA_WAIT_READ, dst_ctx->mem_s);

  cuda_exit(ctx);
  return dst;
}
Code example #16
static int cuda_sync(gpudata *b) {
  cuda_context *ctx = (cuda_context *)b->ctx;
  int err = GA_NO_ERROR;

  ASSERT_BUF(b);
  cuda_enter(ctx);
  ctx->err = cuEventSynchronize(b->wev);
  if (ctx->err != CUDA_SUCCESS)
    err = GA_IMPL_ERROR;
  ctx->err = cuEventSynchronize(b->rev);
  if (ctx->err != CUDA_SUCCESS)
    err = GA_IMPL_ERROR;
  cuda_exit(ctx);
  return err;
}
Code example #17
static int cuda_memset(gpudata *dst, size_t dstoff, int data) {
    cuda_context *ctx = dst->ctx;

    ASSERT_BUF(dst);

    if ((dst->sz - dstoff) == 0) return GA_NO_ERROR;

    cuda_enter(ctx);

    cuda_wait(dst, CUDA_WAIT_WRITE);

    ctx->err = cuMemsetD8Async(dst->ptr + dstoff, data, dst->sz - dstoff,
                               ctx->s);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }

    cuda_record(dst, CUDA_WAIT_WRITE);
    cuda_exit(ctx);
    return GA_NO_ERROR;
}
Code example #18
static int cuda_waits(gpudata *a, int flags, CUstream s) {
  ASSERT_BUF(a);
  /* If others are only reads, no need to wait */
  cuda_enter(a->ctx);
  if (flags & CUDA_WAIT_READ) {
    /* We wait for writes that happened before since multiple reads at
     * the same time are fine */
    a->ctx->err = cuStreamWaitEvent(s, a->wev, 0);
    if (a->ctx->err != CUDA_SUCCESS) {
      cuda_exit(a->ctx);
      return GA_IMPL_ERROR;
    }
  }
  if (flags & CUDA_WAIT_WRITE) {
    /* Make sure to not disturb previous reads */
    a->ctx->err = cuStreamWaitEvent(s, a->rev, 0);
    if (a->ctx->err != CUDA_SUCCESS) {
      cuda_exit(a->ctx);
      return GA_IMPL_ERROR;
    }
  }
  cuda_exit(a->ctx);
  return GA_NO_ERROR;
}
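
cuda_waits() and cuda_records() implement a one-writer/many-readers protocol with two events per buffer: readers wait on the last write event (wev), writers wait on the last read event (rev). Below is a minimal, standalone driver-API sketch of the read-after-write half of that protocol on two streams (assumed setup, error checks omitted, not library code):

#include <stdio.h>
#include <cuda.h>

int main(void) {
  CUdevice dev;
  CUcontext ctx;
  CUstream s_write, s_read;
  CUevent wev;                 /* plays the role of gpudata->wev */
  CUdeviceptr buf;
  unsigned char host[16];

  cuInit(0);
  cuDeviceGet(&dev, 0);
  cuCtxCreate(&ctx, 0, dev);
  cuStreamCreate(&s_write, CU_STREAM_NON_BLOCKING);
  cuStreamCreate(&s_read, CU_STREAM_NON_BLOCKING);
  cuEventCreate(&wev, CU_EVENT_DISABLE_TIMING);
  cuMemAlloc(&buf, sizeof(host));

  /* writer: fill the buffer on s_write, then record the write event,
     as cuda_records(dst, CUDA_WAIT_WRITE, s_write) would */
  cuMemsetD8Async(buf, 0x2a, sizeof(host), s_write);
  cuEventRecord(wev, s_write);

  /* reader: make s_read wait for the write before copying out,
     as cuda_waits(src, CUDA_WAIT_READ, s_read) would */
  cuStreamWaitEvent(s_read, wev, 0);
  cuMemcpyDtoHAsync(host, buf, sizeof(host), s_read);
  cuStreamSynchronize(s_read);

  printf("%d\n", host[0]);     /* 42 */

  cuMemFree(buf);
  cuEventDestroy(wev);
  cuStreamDestroy(s_read);
  cuStreamDestroy(s_write);
  cuCtxDestroy(ctx);
  return 0;
}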
Code example #19
static int dgemvBatch(cb_order order, cb_transpose transA,
                      size_t M, size_t N, double alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **x, size_t *offX, size_t incX,
                      double beta, gpudata **y, size_t *offY, size_t incY,
                      size_t batchCount, int flags) {
  cuda_context *ctx;
  size_t t, i;
  size_t ls[2], gs[2];
  void *args[9];
  gpudata *Aa, *xa, *ya;
  int err;

  if (flags != 0) return GA_INVALID_ERROR;
  if (batchCount == 0) return GA_NO_ERROR;

  if (alpha != 1.0 || beta != 1.0) return GA_UNSUPPORTED_ERROR;

  if (M < 512) {
    ls[0] = 32;
    if (batchCount > 16)
      ls[1] = 16;
    else
      ls[1] = batchCount;
  } else {
    ls[0] = 512;
    ls[1] = 1;
  }
  gs[0] = (M + ls[0] - 1) / ls[0];
  gs[1] = (batchCount + ls[1] - 1) / ls[1];
  /* cap gs[1] so that gs[0] * gs[1] stays within the 65535 grid limit */
  if (gs[0] * gs[1] >= 65535) {
    gs[1] = (65535 / gs[0]);
  }

  if (order == cb_c) {
    t = N;
    N = M;
    M = t;
    if (transA == cb_no_trans) {
      transA = cb_trans;
    } else {
      transA = cb_no_trans;
    }
  }

  ASSERT_BUF(A[0]);

  ctx = A[0]->ctx;

  cuda_enter(ctx);

  {
    double **T_l = alloca(sizeof(double *) * batchCount * 3);
    const double **A_l = (const double **)T_l;
    const double **x_l = (const double **)T_l + batchCount;
    double **y_l = T_l + (batchCount * 2);

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(x[i]);
      ASSERT_BUF(y[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(x[i], CUDA_WAIT_READ);
      cuda_wait(y[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      A_l[i] = (double *)(A[i]->ptr + offA[i]);
      x_l[i] = (double *)(x[i]->ptr + offX[i]);
      y_l[i] = (double *)(y[i]->ptr + offY[i]);
    }

    Aa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, A_l,
                               GA_BUFFER_INIT, &err);
    if (Aa == NULL)
      return err;
    xa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, x_l,
                               GA_BUFFER_INIT, &err);
    if (xa == NULL) {
      cuda_ops.buffer_release(Aa);
      return err;
    }
    ya = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, y_l,
                               GA_BUFFER_INIT, &err);
    if (ya == NULL) {
      cuda_ops.buffer_release(Aa);
      cuda_ops.buffer_release(xa);
      return err;
    }
  }

  args[0] = Aa;
  args[1] = &lda;
  args[2] = xa;
  args[3] = &incX;
  args[4] = ya;
  args[5] = &incY;
  args[6] = &batchCount;
  args[7] = &M;
  args[8] = &N;

  if (transA == cb_no_trans) {
    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small, 2, ls, gs, 0, args);
  } else {
    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small, 2, ls, gs, 0, args);
  }

  cuda_ops.buffer_release(Aa);
  cuda_ops.buffer_release(xa);
  cuda_ops.buffer_release(ya);

  if (err != GA_NO_ERROR) {
    cuda_exit(ctx);
    return err;
  }

  for (i = 0; i < batchCount; i++) {
    cuda_record(A[i], CUDA_WAIT_READ);
    cuda_record(x[i], CUDA_WAIT_READ);
    cuda_record(y[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
  }

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
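
The launch geometry in dgemvBatch is plain ceiling division: one block dimension covers the rows of each problem, the other covers the batch, and the second grid dimension is capped so the total block count stays within the 65535 limit. A tiny standalone illustration of that arithmetic (values chosen arbitrarily):

#include <stdio.h>

static size_t ceil_div(size_t a, size_t b) { return (a + b - 1) / b; }

int main(void) {
  size_t M = 1000, batchCount = 40;
  size_t ls[2] = {512, 1};                /* the M >= 512 path above */
  size_t gs[2];

  gs[0] = ceil_div(M, ls[0]);             /* 2 blocks over the rows */
  gs[1] = ceil_div(batchCount, ls[1]);    /* 40 blocks over the batch */
  if (gs[0] * gs[1] >= 65535)             /* cap the grid, as above */
    gs[1] = 65535 / gs[0];

  printf("ls=(%zu,%zu) gs=(%zu,%zu)\n", ls[0], ls[1], gs[0], gs[1]);
  return 0;
}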
Code example #20
static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, double alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb,
                 double beta, gpudata *C, size_t offC, size_t ldc) {
  cuda_context *ctx = A->ctx;
  blas_handle *h = (blas_handle *)ctx->blas_handle;
  gpudata *T;
  size_t t;
  cb_transpose transT;

  ASSERT_BUF(A);
  ASSERT_BUF(B);
  ASSERT_BUF(C);

  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) ||
      LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) ||
      LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N))
    return GA_XLARGE_ERROR;

  if (order == cb_c) {
    /* swap A and B */
    t = N;
    N = M;
    M = t;
    T = A;
    A = B;
    B = T;
    t = lda;
    lda = ldb;
    ldb = t;
    transT = transA;
    transA = transB;
    transB = transT;
    t = offA;
    offA = offB;
    offB = t;
  }

  cuda_enter(ctx);

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL));

  h->err = cublasDgemm(h->h,
                       convT(transA), convT(transB), M, N, K,
                       &alpha, ((double *)A->ptr) + offA, lda,
                       ((double *)B->ptr) + offB, ldb, &beta,
                       ((double *)C->ptr) + offC, ldc);
  if (h->err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
      return GA_DEVSUP_ERROR;
    return GA_BLAS_ERROR;
  }

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C, CUDA_WAIT_ALL));

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
Code example #21
static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, float alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb,
                      float beta, gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cuda_context *ctx;
  blas_handle *h;
  size_t *lt, t;
  gpudata **T;
  size_t i;
  const size_t threshold = 650;
  cb_transpose transT;

  if (batchCount == 0) return GA_NO_ERROR;

  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) ||
      LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) ||
      LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N))
    return GA_XLARGE_ERROR;

  ASSERT_BUF(A[0]);
  ctx = A[0]->ctx;
  h = (blas_handle *)ctx->blas_handle;
  cuda_enter(ctx);

  if (order == cb_c) {
    /* swap A and B */
    t = N;
    N = M;
    M = t;
    T = A;
    A = B;
    B = T;
    t = lda;
    lda = ldb;
    ldb = t;
    transT = transA;
    transA = transB;
    transB = transT;
    lt = offA;
    offA = offB;
    offB = lt;
  }

  /* use parallel cublasSgemm calls rather than cublasSgemmBatched for
   * large products */
  if (M * N * K > threshold * threshold * threshold) {
    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C[i], CUDA_WAIT_ALL));

      h->err = cublasSgemm(h->h,
                           convT(transA), convT(transB),
                           M, N, K, &alpha,
                           (float*)A[i]->ptr + offA[i], lda,
                           (float*)B[i]->ptr + offB[i], ldb,
                           &beta,
                           (float*)C[i]->ptr + offC[i], ldc);
      if (h->err != CUBLAS_STATUS_SUCCESS) {
        cuda_exit(ctx);
        if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
          return GA_DEVSUP_ERROR;
        return GA_BLAS_ERROR;
      }

      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C[i], CUDA_WAIT_ALL));
    }
  } else {
    float **T_l = alloca(sizeof(float *) * batchCount * 3);
    const float **A_l = (const float **)T_l;
    const float **B_l = (const float **)T_l + batchCount;
    float **C_l = T_l + (batchCount * 2);
    CUdeviceptr Ta, Aa, Ba, Ca;

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C[i], CUDA_WAIT_ALL));
      A_l[i] = ((float *)A[i]->ptr) + offA[i];
      B_l[i] = ((float *)B[i]->ptr) + offB[i];
      C_l[i] = ((float *)C[i]->ptr) + offC[i];
    }

    cuMemAlloc(&Ta, sizeof(float *) * batchCount * 3);
    Aa = Ta;
    Ba = Ta + (batchCount * sizeof(float *));
    Ca = Ta + (batchCount * sizeof(float *) * 2);

    cuMemcpyHtoD(Ta, T_l, sizeof(float *) * batchCount * 3);

    h->err = cublasSgemmBatched(h->h,
                                convT(transA), convT(transB),
                                M, N, K, &alpha,
                                (const float **)Aa, lda,
                                (const float **)Ba, ldb, &beta,
                                (float **)Ca, ldc, batchCount);
    cuMemFree(Ta);
    if (h->err != CUBLAS_STATUS_SUCCESS) {
      cuda_exit(ctx);
      if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
        return GA_DEVSUP_ERROR;
      return GA_BLAS_ERROR;
    }

    for (i = 0; i < batchCount; i++) {
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C[i], CUDA_WAIT_ALL));
    }
  }

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
Code example #22
CUdeviceptr cuda_get_ptr(gpudata *g) { ASSERT_BUF(g); return g->ptr; }
Code example #23
static int dgerBatch(cb_order order, size_t M, size_t N, double alpha,
                     gpudata **x, size_t *offX, size_t incX,
                     gpudata **y, size_t *offY, size_t incY,
                     gpudata **A, size_t *offA, size_t lda,
                     size_t batchCount, int flags) {
  cuda_context *ctx;
  size_t t, *tp, i;
  size_t ls[3] = {M, N, 1}, gs[3] = {1, 1, batchCount};
  void *args[10];
  gpudata **T;
  gpudata *Aa, *xa, *ya;
  int err;

  if (flags != 0) return GA_INVALID_ERROR;
  if (batchCount == 0) return GA_NO_ERROR;

  if (incX == 1) {
    if (ls[0] > 32) {
      gs[0] = (ls[0] + 31) / 32;
      ls[0] = 32;
    }
    if (ls[0] * ls[1] > 512) {
      gs[1] = (ls[1] + 15) / 16;
      ls[1] = 16;
    }
  } else {
    if (ls[1] > 32) {
      gs[1] = (ls[1] + 31) / 32;
      ls[1] = 32;
    }
    if (ls[0] * ls[1] > 512) {
      gs[0] = (ls[0] + 15) / 16;
      ls[0] = 16;
    }
  }
  if (gs[0] * gs[1] * gs[2] > 65535) {
    if (gs[0] * gs[1] > 65535)
      return GA_VALUE_ERROR;
    gs[2] = (65535 / (gs[0] * gs[1]));
  }

  if (order == cb_c) {
    t = M;
    M = N;
    N = t;
    tp = offX;
    offX = offY;
    offY = tp;
    t = incX;
    incX = incY;
    incY = t;
    T = x;
    x = y;
    y = T;
  }

  ASSERT_BUF(x[0]);

  ctx = x[0]->ctx;

  cuda_enter(ctx);

  {
    double **T_l = alloca(sizeof(double *) * batchCount * 3);
    const double **A_l = (const double **)T_l;
    const double **x_l = (const double **)T_l + batchCount;
    double **y_l = T_l + (batchCount * 2);

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(x[i]);
      ASSERT_BUF(y[i]);
      cuda_wait(A[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      cuda_wait(x[i], CUDA_WAIT_READ);
      cuda_wait(y[i], CUDA_WAIT_READ);
      A_l[i] = (double *)(A[i]->ptr + offA[i]);
      x_l[i] = (double *)(x[i]->ptr + offX[i]);
      y_l[i] = (double *)(y[i]->ptr + offY[i]);
    }

    Aa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, A_l,
                               GA_BUFFER_INIT, &err);
    if (Aa == NULL)
      return err;
    xa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, x_l,
                               GA_BUFFER_INIT, &err);
    if (xa == NULL) {
      cuda_ops.buffer_release(Aa);
      return err;
    }
    ya = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, y_l,
                               GA_BUFFER_INIT, &err);
    if (ya == NULL) {
      cuda_ops.buffer_release(Aa);
      cuda_ops.buffer_release(xa);
      return err;
    }
  }

  args[0] = xa;
  args[1] = &incX;
  args[2] = ya;
  args[3] = &incY;
  args[4] = &alpha;
  args[5] = Aa;
  args[6] = &lda;
  args[7] = &batchCount;
  args[8] = &M;
  args[9] = &N;

  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgerBH_gen_small, 3, ls, gs, 0, args);

  cuda_ops.buffer_release(Aa);
  cuda_ops.buffer_release(xa);
  cuda_ops.buffer_release(ya);

  if (err != GA_NO_ERROR) {
    cuda_exit(ctx);
    return err;
  }


  for (i = 0; i < batchCount; i++) {
    cuda_record(A[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    cuda_record(x[i], CUDA_WAIT_READ);
    cuda_record(y[i], CUDA_WAIT_READ);
  }

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
Code example #24
static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, float alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb,
                 float beta, gpudata *C, size_t offC, size_t ldc) {
#ifdef HAVE_CUBLAS_SGEMMEX
  /* This will use float32 for computation as it's the best we can
   * do right now. In the future, when native float16 support is
   * available, we will switch to that. */
  cuda_context *ctx = A->ctx;
  gpudata *T;
  size_t t;
  cublasStatus_t err;
  cb_transpose transT;

  ASSERT_BUF(A);
  ASSERT_BUF(B);
  ASSERT_BUF(C);

  if (order == cb_c) {
    /* swap A and B */
    t = N;
    N = M;
    M = t;
    T = A;
    A = B;
    B = T;
    t = lda;
    lda = ldb;
    ldb = t;
    transT = transA;
    transA = transB;
    transB = transT;
    t = offA;
    offA = offB;
    offB = t;
  }

  cuda_enter(ctx);

  cuda_wait(A, CUDA_WAIT_READ);
  cuda_wait(B, CUDA_WAIT_READ);
  cuda_wait(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE);

  err = cublasSgemmEx(((blas_handle *)ctx->blas_handle)->h,
                      convT(transA), convT(transB), M, N, K,
                      &alpha,
                      ((uint16_t *)A->ptr) + offA, CUBLAS_DATA_HALF, lda,
                      ((uint16_t *)B->ptr) + offB, CUBLAS_DATA_HALF, ldb,
                      &beta,
                      ((uint16_t *)C->ptr) + offC, CUBLAS_DATA_HALF, ldc);
  if (err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    if (err == CUBLAS_STATUS_ARCH_MISMATCH)
      return GA_DEVSUP_ERROR;
    return GA_BLAS_ERROR;
  }

  cuda_record(A, CUDA_WAIT_READ);
  cuda_record(B, CUDA_WAIT_READ);
  cuda_record(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE);

  cuda_exit(ctx);
  return GA_NO_ERROR;
#else
  return GA_DEVSUP_ERROR;
#endif
}
Code example #25
static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, double alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb,
                      double beta, gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cuda_context *ctx;
  size_t *lt, t;
  gpudata **T;
  size_t i;
  cb_transpose transT;
  cublasStatus_t err;

  if (batchCount == 0) return GA_NO_ERROR;

  ASSERT_BUF(A[0]);
  ctx = A[0]->ctx;
  cuda_enter(ctx);

  if (order == cb_c) {
    /* swap A and B */
    t = N;
    N = M;
    M = t;
    T = A;
    A = B;
    B = T;
    t = lda;
    lda = ldb;
    ldb = t;
    transT = transA;
    transA = transB;
    transB = transT;
    lt = offA;
    offA = offB;
    offB = lt;
  }

  // use parallel cublasDgemm calls rather than cublasDgemmBatched for large products
  const size_t threshold = 650;
  const int multiple_dispatch = M * N * K > threshold * threshold * threshold;
  if (multiple_dispatch) {
    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(B[i], CUDA_WAIT_READ);
      cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);

      err = cublasDgemm(((blas_handle *)ctx->blas_handle)->h,
                        convT(transA), convT(transB),
                        M, N, K, &alpha,
                        (double*)A[i]->ptr + offA[i], lda,
                        (double*)B[i]->ptr + offB[i], ldb,
                        &beta,
                        (double*)C[i]->ptr + offC[i], ldc);
      if (err != CUBLAS_STATUS_SUCCESS) {
        cuda_exit(ctx);
        if (err == CUBLAS_STATUS_ARCH_MISMATCH)
          return GA_DEVSUP_ERROR;
        return GA_BLAS_ERROR;
      }

      cuda_record(A[i], CUDA_WAIT_READ);
      cuda_record(B[i], CUDA_WAIT_READ);
      cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    }
  } else {
    double **T_l = alloca(sizeof(double *) * batchCount * 3);
    const double **A_l = (const double **)T_l;
    const double **B_l = (const double **)T_l + batchCount;
    double **C_l = T_l + (batchCount * 2);
    CUdeviceptr Ta, Aa, Ba, Ca;

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(B[i], CUDA_WAIT_READ);
      cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      A_l[i] = ((double *)A[i]->ptr) + offA[i];
      B_l[i] = ((double *)B[i]->ptr) + offB[i];
      C_l[i] = ((double *)C[i]->ptr) + offC[i];
    }

    cuMemAlloc(&Ta, sizeof(double *) * batchCount * 3);
    Aa = Ta;
    Ba = Ta + (batchCount * sizeof(double *));
    Ca = Ta + (batchCount * sizeof(double *) * 2);

    cuMemcpyHtoD(Ta, T_l, sizeof(double *) * batchCount * 3);

    err = cublasDgemmBatched(((blas_handle *)ctx->blas_handle)->h,
                             convT(transA), convT(transB),
                             M, N, K, &alpha, (const double **)Aa, lda,
                             (const double **)Ba, ldb, &beta,
                             (double **)Ca, ldc, batchCount);
    cuMemFree(Ta);
    if (err != CUBLAS_STATUS_SUCCESS) {
      cuda_exit(ctx);
      if (err == CUBLAS_STATUS_ARCH_MISMATCH)
        return GA_DEVSUP_ERROR;
      return GA_BLAS_ERROR;
    }

    for (i = 0; i < batchCount; i++) {
      cuda_record(A[i], CUDA_WAIT_READ);
      cuda_record(B[i], CUDA_WAIT_READ);
      cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    }
  }

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
Code example #26
static void cuda_retain(gpudata *d) {
  ASSERT_BUF(d);
  d->refcnt++;
}
Code example #27
size_t cuda_get_sz(gpudata *g) { ASSERT_BUF(g); return g->sz; }
Code example #28
static int cuda_property(void *c, gpudata *buf, gpukernel *k, int prop_id,
                         void *res) {
  cuda_context *ctx = NULL;
  if (c != NULL) {
    ctx = (cuda_context *)c;
    ASSERT_CTX(ctx);
  } else if (buf != NULL) {
    ASSERT_BUF(buf);
    ctx = buf->ctx;
  } else if (k != NULL) {
    ASSERT_KER(k);
    ctx = k->ctx;
  }
  /* I know that 512 and 1024 are magic numbers.
     There is an indication in buffer.h, though. */
  if (prop_id < 512) {
    if (ctx == NULL)
      return GA_VALUE_ERROR;
  } else if (prop_id < 1024) {
    if (buf == NULL)
      return GA_VALUE_ERROR;
  } else {
    if (k == NULL)
      return GA_VALUE_ERROR;
  }

  switch (prop_id) {
    char *s;
    CUdevice id;
    int i;
    size_t sz;

  case GA_CTX_PROP_DEVNAME:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    /* 256 is what the CUDA API uses so it's good enough for me */
    s = malloc(256);
    if (s == NULL) {
      cuda_exit(ctx);
      return GA_MEMORY_ERROR;
    }
    ctx->err = cuDeviceGetName(s, 256, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((char **)res) = s;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_MAXLSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_LMEMSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_NUMPROCS:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i,
                                    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((unsigned int *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_MAXGSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_BLAS_OPS:
#ifdef WITH_CUDA_CUBLAS
    *((gpuarray_blas_ops **)res) = &cublas_ops;
    return GA_NO_ERROR;
#else
    *((void **)res) = NULL;
    return GA_DEVSUP_ERROR;
#endif

  case GA_CTX_PROP_BIN_ID:
    *((const char **)res) = ctx->bin_id;
    return GA_NO_ERROR;

  case GA_CTX_PROP_ERRBUF:
    *((gpudata **)res) = ctx->errbuf;
    return GA_NO_ERROR;

  case GA_CTX_PROP_TOTAL_GMEM:
    cuda_enter(ctx);
    ctx->err = cuMemGetInfo(&sz, (size_t *)res);
    cuda_exit(ctx);
    return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR;

  case GA_CTX_PROP_FREE_GMEM:
    cuda_enter(ctx);
    ctx->err = cuMemGetInfo((size_t *)res, &sz);
    cuda_exit(ctx);
    return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR;

  case GA_BUFFER_PROP_REFCNT:
    *((unsigned int *)res) = buf->refcnt;
    return GA_NO_ERROR;

  case GA_BUFFER_PROP_SIZE:
    *((size_t *)res) = buf->sz;
    return GA_NO_ERROR;

  case GA_BUFFER_PROP_CTX:
  case GA_KERNEL_PROP_CTX:
    *((void **)res) = (void *)ctx;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_MAXLSIZE:
    cuda_enter(ctx);
    ctx->err = cuFuncGetAttribute(&i,
                                  CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                  k->k);
    cuda_exit(ctx);
    if (ctx->err != CUDA_SUCCESS)
      return GA_IMPL_ERROR;
    *((size_t *)res) = i;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_PREFLSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_WARP_SIZE, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    cuda_exit(ctx);
    *((size_t *)res) = i;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_NUMARGS:
    *((unsigned int *)res) = k->argcount;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_TYPES:
    *((const int **)res) = k->types;
    return GA_NO_ERROR;

  default:
    return GA_INVALID_ERROR;
  }
}
Code example #29
static int cuda_extcopy(gpudata *input, size_t ioff, gpudata *output,
                        size_t ooff, int intype, int outtype,
                        unsigned int a_nd, const size_t *a_dims,
                        const ssize_t *a_str, unsigned int b_nd,
                        const size_t *b_dims, const ssize_t *b_str) {
  cuda_context *ctx = input->ctx;
  void *args[2];
  int res = GA_SYS_ERROR;
  unsigned int i;
  size_t nEls = 1, ls, gs;
  gpukernel *k;
  extcopy_args a, *aa;

  ASSERT_BUF(input);
  ASSERT_BUF(output);
  if (input->ctx != output->ctx)
    return GA_INVALID_ERROR;

  for (i = 0; i < a_nd; i++) {
    nEls *= a_dims[i];
  }
  if (nEls == 0) return GA_NO_ERROR;

  a.ind = a_nd;
  a.ond = b_nd;
  a.itype = intype;
  a.otype = outtype;
  a.ioff = ioff;
  a.ooff = ooff;
  a.idims = a_dims;
  a.odims = b_dims;
  a.istr = a_str;
  a.ostr = b_str;

  k = cache_get(ctx->extcopy_cache, &a);
  if (k == NULL) {
    res = gen_extcopy_kernel(&a, input->ctx, &k, nEls);
    if (res != GA_NO_ERROR)
      return res;

    /* Cache the kernel */
    aa = memdup(&a, sizeof(a));
    if (aa == NULL) goto done;
    aa->idims = memdup(a_dims, a_nd*sizeof(size_t));
    aa->odims = memdup(b_dims, b_nd*sizeof(size_t));
    aa->istr = memdup(a_str, a_nd*sizeof(ssize_t));
    aa->ostr = memdup(b_str, b_nd*sizeof(ssize_t));
    if (aa->idims == NULL || aa->odims == NULL ||
        aa->istr == NULL || aa->ostr == NULL) {
      extcopy_free(aa);
      goto done;
    }
    /* One ref is given to the cache, we manage the other */
    cuda_retainkernel(k);
    cache_add(ctx->extcopy_cache, aa, k);
  } else {
    /* This is our reference */
    cuda_retainkernel(k);
  }
done:

  /* Cheap kernel scheduling */
  res = cuda_property(NULL, NULL, k, GA_KERNEL_PROP_MAXLSIZE, &ls);
  if (res != GA_NO_ERROR) goto fail;

  gs = ((nEls-1) / ls) + 1;
  args[0] = input;
  args[1] = output;
  res = cuda_callkernel(k, 1, &ls, &gs, 0, args);

fail:
  /* We free our reference here */
  cuda_freekernel(k);
  return res;
}