static void strmm_cuda(void *descr[], void *args) { float *a = (float *)STARPU_MATRIX_GET_PTR(descr[0]); float *b = (float *)STARPU_MATRIX_GET_PTR(descr[1]); float *c = (float *)STARPU_MATRIX_GET_PTR(descr[2]); unsigned w = STARPU_MATRIX_GET_NY(descr[0]); unsigned h = STARPU_MATRIX_GET_NX(descr[1]); unsigned lda = STARPU_MATRIX_GET_LD(descr[0]); unsigned ldb = STARPU_MATRIX_GET_LD(descr[1]); unsigned ldc = STARPU_MATRIX_GET_LD(descr[2]); struct strmm_arg * arg = (struct strmm_arg *)args; cublasSideMode_t side = arg->side ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT; cublasFillMode_t uplo = arg->uplo ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; cublasDiagType_t diag = arg->unit ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; cublasOperation_t trans = CUBLAS_OP_T; const float factor = 1.0f; cublasSetStream(cublas_handle, starpu_cuda_get_local_stream()); cublasStrmm(cublas_handle, side, uplo, trans, diag, w, h, &factor, a, lda, b, ldb, c, ldc); cudaStreamSynchronize(starpu_cuda_get_local_stream()); free(arg); }
static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args) { float *sub11; sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]); unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]); unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]); unsigned long z; switch (s) { case 0: for (z = 0; z < nx; z++) { float pivot; pivot = sub11[z+z*ld]; STARPU_ASSERT(pivot != 0.0f); STARPU_SSCAL(nx - z - 1, (1.0f/pivot), &sub11[z+(z+1)*ld], ld); STARPU_SGER(nx - z - 1, nx - z - 1, -1.0f, &sub11[z+(z+1)*ld], ld, &sub11[(z+1)+z*ld], 1, &sub11[(z+1) + (z+1)*ld],ld); } break; #ifdef STARPU_USE_CUDA case 1: for (z = 0; z < nx; z++) { float pivot; cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream()); cudaStreamSynchronize(starpu_cuda_get_local_stream()); STARPU_ASSERT(pivot != 0.0f); cublasSscal(nx - z - 1, 1.0f/pivot, &sub11[z+(z+1)*ld], ld); cublasSger(nx - z - 1, nx - z - 1, -1.0f, &sub11[z+(z+1)*ld], ld, &sub11[(z+1)+z*ld], 1, &sub11[(z+1) + (z+1)*ld],ld); } cudaStreamSynchronize(starpu_cuda_get_local_stream()); break; #endif default: STARPU_ABORT(); break; } }
static void memset_cuda(void *descr[], void *arg) { STARPU_SKIP_IF_VALGRIND; int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]); unsigned n = STARPU_VECTOR_GET_NX(descr[0]); cudaMemsetAsync(ptr, 42, n * sizeof(*ptr), starpu_cuda_get_local_stream()); }
static void redux_cuda_kernel(void *descr[], void *arg) { STARPU_SKIP_IF_VALGRIND; unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]); unsigned *src = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]); unsigned host_dst, host_src; /* This is a dummy technique of course */ cudaMemcpyAsync(&host_src, src, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream()); cudaMemcpyAsync(&host_dst, dst, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream()); cudaStreamSynchronize(starpu_cuda_get_local_stream()); host_dst += host_src; cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream()); }
static void neutral_cuda_kernel(void *descr[], void *arg) { STARPU_SKIP_IF_VALGRIND; unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]); /* This is a dummy technique of course */ unsigned host_dst = 0; cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream()); }
static void sgemm_cuda(void *descr[], void *args) { float *a = (float *)STARPU_MATRIX_GET_PTR(descr[0]); float *b = (float *)STARPU_MATRIX_GET_PTR(descr[1]); float *c = (float *)STARPU_MATRIX_GET_PTR(descr[2]); unsigned w = STARPU_MATRIX_GET_NY(descr[2]); unsigned h = STARPU_MATRIX_GET_NX(descr[2]); unsigned k = STARPU_MATRIX_GET_NY(descr[0]); unsigned lda = STARPU_MATRIX_GET_LD(descr[0]); unsigned ldb = STARPU_MATRIX_GET_LD(descr[1]); unsigned ldc = STARPU_MATRIX_GET_LD(descr[2]); struct sgemm_arg * arg = (struct sgemm_arg*)args; cublasSetStream(cublas_handle, starpu_cuda_get_local_stream()); cublasSgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, h, w, k, &arg->alpha, a, lda, b, ldb, &arg->beta, c, ldc); cudaStreamSynchronize(starpu_cuda_get_local_stream()); free(arg); }
void dot_cuda_func(void *descr[], void *cl_arg) { DOT_TYPE current_dot; DOT_TYPE local_dot; float *local_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]); float *local_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]); DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[2]); unsigned n = STARPU_VECTOR_GET_NX(descr[0]); cudaMemcpyAsync(¤t_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream()); cudaStreamSynchronize(starpu_cuda_get_local_stream()); local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1); /* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */ current_dot += local_dot; cudaMemcpyAsync(dot, ¤t_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream()); cudaStreamSynchronize(starpu_cuda_get_local_stream()); }
void init_cuda_func(void *descr[], void *cl_arg) { DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]); cudaMemsetAsync(dot, 0, sizeof(DOT_TYPE), starpu_cuda_get_local_stream()); }