void ds_kernel_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg) { uint8_t *input = (uint8_t *)STARPU_MATRIX_GET_PTR(descr[0]); const unsigned input_ld = STARPU_MATRIX_GET_LD(descr[0]); uint8_t *output = (uint8_t *)STARPU_MATRIX_GET_PTR(descr[1]); const unsigned output_ld = STARPU_MATRIX_GET_LD(descr[1]); const unsigned ncols = STARPU_MATRIX_GET_NX(descr[0]); const unsigned nlines = STARPU_MATRIX_GET_NY(descr[0]); unsigned line, col; for (line = 0; line < nlines; line+=FACTOR) for (col = 0; col < ncols; col+=FACTOR) { unsigned sum = 0; unsigned lline, lcol; for (lline = 0; lline < FACTOR; lline++) for (lcol = 0; lcol < FACTOR; lcol++) { unsigned in_index = (lcol + col) + (lline + line)*input_ld; sum += input[in_index]; } unsigned out_index = (col / FACTOR) + (line / FACTOR)*output_ld; output[out_index] = (uint8_t)(sum/(FACTOR*FACTOR)); } }
static inline void common_block_spmv(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args) { /* printf("22\n"); */ float *block = (float *)STARPU_MATRIX_GET_PTR(descr[0]); float *in = (float *)STARPU_VECTOR_GET_PTR(descr[1]); float *out = (float *)STARPU_VECTOR_GET_PTR(descr[2]); unsigned dx = STARPU_MATRIX_GET_NX(descr[0]); unsigned dy = STARPU_MATRIX_GET_NY(descr[0]); unsigned ld = STARPU_MATRIX_GET_LD(descr[0]); switch (s) { case 0: cblas_sgemv(CblasRowMajor, CblasNoTrans, dx, dy, 1.0f, block, ld, in, 1, 1.0f, out, 1); break; #ifdef STARPU_USE_CUDA case 1: cublasSgemv ('t', dx, dy, 1.0f, block, ld, in, 1, 1.0f, out, 1); break; #endif default: STARPU_ABORT(); break; } }
static void strmm_cuda(void *descr[], void *args) { float *a = (float *)STARPU_MATRIX_GET_PTR(descr[0]); float *b = (float *)STARPU_MATRIX_GET_PTR(descr[1]); float *c = (float *)STARPU_MATRIX_GET_PTR(descr[2]); unsigned w = STARPU_MATRIX_GET_NY(descr[0]); unsigned h = STARPU_MATRIX_GET_NX(descr[1]); unsigned lda = STARPU_MATRIX_GET_LD(descr[0]); unsigned ldb = STARPU_MATRIX_GET_LD(descr[1]); unsigned ldc = STARPU_MATRIX_GET_LD(descr[2]); struct strmm_arg * arg = (struct strmm_arg *)args; cublasSideMode_t side = arg->side ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT; cublasFillMode_t uplo = arg->uplo ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; cublasDiagType_t diag = arg->unit ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; cublasOperation_t trans = CUBLAS_OP_T; const float factor = 1.0f; cublasSetStream(cublas_handle, starpu_cuda_get_local_stream()); cublasStrmm(cublas_handle, side, uplo, trans, diag, w, h, &factor, a, lda, b, ldb, c, ldc); cudaStreamSynchronize(starpu_cuda_get_local_stream()); free(arg); }
static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args) { float *sub11; sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]); unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]); unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]); unsigned long z; switch (s) { case 0: for (z = 0; z < nx; z++) { float pivot; pivot = sub11[z+z*ld]; STARPU_ASSERT(pivot != 0.0f); STARPU_SSCAL(nx - z - 1, (1.0f/pivot), &sub11[z+(z+1)*ld], ld); STARPU_SGER(nx - z - 1, nx - z - 1, -1.0f, &sub11[z+(z+1)*ld], ld, &sub11[(z+1)+z*ld], 1, &sub11[(z+1) + (z+1)*ld],ld); } break; #ifdef STARPU_USE_CUDA case 1: for (z = 0; z < nx; z++) { float pivot; cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream()); cudaStreamSynchronize(starpu_cuda_get_local_stream()); STARPU_ASSERT(pivot != 0.0f); cublasSscal(nx - z - 1, 1.0f/pivot, &sub11[z+(z+1)*ld], ld); cublasSger(nx - z - 1, nx - z - 1, -1.0f, &sub11[z+(z+1)*ld], ld, &sub11[(z+1)+z*ld], 1, &sub11[(z+1) + (z+1)*ld],ld); } cudaStreamSynchronize(starpu_cuda_get_local_stream()); break; #endif default: STARPU_ABORT(); break; } }
static void getri(void *descr[], int type) { // Computes the Echelon form of A, stores the corresponding inverted multiples // in the lower part // ------------------------------ // | A | | | | | | | // |----------------------------| // | | | | | | | | // |----------------------------| // | | | | | | | | // |----------------------------| // | | | | | | | | // ------------------------------ unsigned int i, j, k; unsigned int *sub_a = (unsigned int *)STARPU_MATRIX_GET_PTR(descr[0]); unsigned int x_dim = STARPU_MATRIX_GET_NX(descr[0]); unsigned int y_dim = STARPU_MATRIX_GET_NY(descr[0]); unsigned int offset_a = STARPU_MATRIX_GET_OFFSET(descr[0]); unsigned int ld_a = STARPU_MATRIX_GET_LD(descr[0]); unsigned int mult = 0; #if DEBUG0 printf("\n --- GETRI ---\n"); #endif #if DEBUG printf("x_dim = %u\n", x_dim); printf("y_dim = %u\n", y_dim); printf("ld_a = %u\n", ld_a); #endif for (i = 0; i < y_dim; ++i) { // compute inverse neg_inv_piv[i+offset_a] = negInverseModP(sub_a[i+i*ld_a], prime); #if DEBUG printf("sub_a[%u] = %u\n", i+i*ld_a,sub_a[i+i*ld_a]); printf("inv = %u\n", neg_inv_piv[i+offset_a]); #endif for (j = i+1; j < x_dim; ++j) { // multiply by corresponding coeff mult = (neg_inv_piv[i+offset_a] * sub_a[i+j*ld_a]); //% prime; #if DEBUG printf("sub_a[%u] = %u\n", i+j*ld_a,sub_a[i+j*ld_a]); printf("mult = %u\n", mult); #endif sub_a[i+j*ld_a] = mult; for (k = i+1; k < y_dim; ++k) { sub_a[k+j*ld_a] += (sub_a[k+i*ld_a] * mult); //sub_a[k+j*ld_a] %= prime; } } } #if DEBUG0 printf("\n --- GETRI DONE ---\n"); printf("TASKS READY %d\n", starpu_task_nready()); printf("TASKS SUBMITTED %d\n", starpu_task_nsubmitted()); #endif }
static void cublas_mult(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg) { TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]); TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]); TYPE *subC = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]); unsigned nxC = STARPU_MATRIX_GET_NX(descr[2]); unsigned nyC = STARPU_MATRIX_GET_NY(descr[2]); unsigned nyA = STARPU_MATRIX_GET_NY(descr[0]); unsigned ldA = STARPU_MATRIX_GET_LD(descr[0]); unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]); unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]); CUBLAS_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC); }
static void sgemm_cpu(void *descr[], void *args) { float *a = (float *)STARPU_MATRIX_GET_PTR(descr[0]); float *b = (float *)STARPU_MATRIX_GET_PTR(descr[1]); float *c = (float *)STARPU_MATRIX_GET_PTR(descr[2]); unsigned w = STARPU_MATRIX_GET_NY(descr[2]); unsigned h = STARPU_MATRIX_GET_NX(descr[2]); unsigned ks = STARPU_MATRIX_GET_NY(descr[0]); unsigned lda = STARPU_MATRIX_GET_LD(descr[0]); unsigned ldb = STARPU_MATRIX_GET_LD(descr[1]); unsigned ldc = STARPU_MATRIX_GET_LD(descr[2]); struct sgemm_arg * arg = (struct sgemm_arg*)args; sgemm_("N", "N", (int*)&h, (int*)&w, (int*)&ks, &arg->alpha, a, (int*)&lda, b, (int*)&ldb, &arg->beta, c, (int*)&ldc); free(arg); }
static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args) { float *left = (float *)STARPU_MATRIX_GET_PTR(descr[0]); float *right = (float *)STARPU_MATRIX_GET_PTR(descr[1]); float *center = (float *)STARPU_MATRIX_GET_PTR(descr[2]); unsigned dx = STARPU_MATRIX_GET_NX(descr[2]); unsigned dy = STARPU_MATRIX_GET_NY(descr[2]); unsigned dz = STARPU_MATRIX_GET_NY(descr[0]); unsigned ld12 = STARPU_MATRIX_GET_LD(descr[0]); unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]); unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]); #ifdef STARPU_USE_CUDA cublasStatus status; #endif switch (s) { case 0: STARPU_SGEMM("N", "N", dy, dx, dz, -1.0f, left, ld21, right, ld12, 1.0f, center, ld22); break; #ifdef STARPU_USE_CUDA case 1: cublasSgemm('n', 'n', dx, dy, dz, -1.0f, left, ld21, right, ld12, 1.0f, center, ld22); status = cublasGetError(); if (status != CUBLAS_STATUS_SUCCESS) STARPU_CUBLAS_REPORT_ERROR(status); break; #endif default: STARPU_ABORT(); break; } }
static void sgemm_cuda(void *descr[], void *args) { float *a = (float *)STARPU_MATRIX_GET_PTR(descr[0]); float *b = (float *)STARPU_MATRIX_GET_PTR(descr[1]); float *c = (float *)STARPU_MATRIX_GET_PTR(descr[2]); unsigned w = STARPU_MATRIX_GET_NY(descr[2]); unsigned h = STARPU_MATRIX_GET_NX(descr[2]); unsigned k = STARPU_MATRIX_GET_NY(descr[0]); unsigned lda = STARPU_MATRIX_GET_LD(descr[0]); unsigned ldb = STARPU_MATRIX_GET_LD(descr[1]); unsigned ldc = STARPU_MATRIX_GET_LD(descr[2]); struct sgemm_arg * arg = (struct sgemm_arg*)args; cublasSetStream(cublas_handle, starpu_cuda_get_local_stream()); cublasSgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, h, w, k, &arg->alpha, a, lda, b, ldb, &arg->beta, c, ldc); cudaStreamSynchronize(starpu_cuda_get_local_stream()); free(arg); }
static inline void dw_common_codelet_update_u12(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args) { float *sub11; float *sub12; sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]); sub12 = (float *)STARPU_MATRIX_GET_PTR(descr[1]); unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]); unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]); unsigned nx12 = STARPU_MATRIX_GET_NX(descr[1]); unsigned ny12 = STARPU_MATRIX_GET_NY(descr[1]); #ifdef STARPU_USE_CUDA cublasStatus status; #endif /* solve L11 U12 = A12 (find U12) */ switch (s) { case 0: STARPU_STRSM("L", "L", "N", "N", nx12, ny12, 1.0f, sub11, ld11, sub12, ld12); break; #ifdef STARPU_USE_CUDA case 1: cublasStrsm('L', 'L', 'N', 'N', ny12, nx12, 1.0f, sub11, ld11, sub12, ld12); status = cublasGetError(); if (status != CUBLAS_STATUS_SUCCESS) STARPU_CUBLAS_REPORT_ERROR(status); break; #endif default: STARPU_ABORT(); break; } }
static inline void dw_common_codelet_update_u21(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args) { float *sub11; float *sub21; sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]); sub21 = (float *)STARPU_MATRIX_GET_PTR(descr[1]); unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]); unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]); unsigned nx21 = STARPU_MATRIX_GET_NX(descr[1]); unsigned ny21 = STARPU_MATRIX_GET_NY(descr[1]); #ifdef STARPU_USE_CUDA cublasStatus status; #endif switch (s) { case 0: STARPU_STRSM("R", "U", "N", "U", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21); break; #ifdef STARPU_USE_CUDA case 1: cublasStrsm('R', 'U', 'N', 'U', ny21, nx21, 1.0f, sub11, ld11, sub21, ld21); status = cublasGetError(); if (status != CUBLAS_STATUS_SUCCESS) STARPU_CUBLAS_REPORT_ERROR(status); break; #endif default: STARPU_ABORT(); break; } }
static void cpu_mult(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg) { TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]); TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]); TYPE *subC = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]); unsigned nxC = STARPU_MATRIX_GET_NX(descr[2]); unsigned nyC = STARPU_MATRIX_GET_NY(descr[2]); unsigned nyA = STARPU_MATRIX_GET_NY(descr[0]); unsigned ldA = STARPU_MATRIX_GET_LD(descr[0]); unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]); unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]); int worker_size = starpu_combined_worker_get_size(); if (worker_size == 1) { /* Sequential CPU task */ CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC); } else { /* Parallel CPU task */ unsigned rank = starpu_combined_worker_get_rank(); unsigned block_size = (nyC + worker_size - 1)/worker_size; unsigned new_nyC = STARPU_MIN(nyC, block_size*(rank+1)) - block_size*rank; STARPU_ASSERT(nyC = STARPU_MATRIX_GET_NY(descr[1])); TYPE *new_subB = &subB[block_size*rank]; TYPE *new_subC = &subC[block_size*rank]; CPU_GEMM("N", "N", nxC, new_nyC, nyA, (TYPE)1.0, subA, ldA, new_subB, ldB, (TYPE)0.0, new_subC, ldC); } }
fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name, count_22_per_worker[worker], count_22_total, (100.0*count_22_per_worker[worker])/count_22_total); } } } /* * U22 */ static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, __attribute__((unused)) void *_args) { float *left = (float *)STARPU_MATRIX_GET_PTR(descr[0]); float *right = (float *)STARPU_MATRIX_GET_PTR(descr[1]); float *center = (float *)STARPU_MATRIX_GET_PTR(descr[2]); unsigned dx = STARPU_MATRIX_GET_NX(descr[2]); unsigned dy = STARPU_MATRIX_GET_NY(descr[2]); unsigned dz = STARPU_MATRIX_GET_NY(descr[0]); unsigned ld12 = STARPU_MATRIX_GET_LD(descr[0]); unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]); unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]); #ifdef STARPU_USE_CUDA cublasStatus status; #endif switch (s) { case 0: SGEMM("N", "N", dy, dx, dz, -1.0f, left, ld21, right, ld12,
static unsigned xdim = 3; static unsigned ydim = 3; static int no_of_iteration=1; static void cpu_jacobi(void *descr[], __attribute__((unused)) void *arg) { float *subB; uint32_t nxB, nyB; uint32_t ldB; float east,west,north,south; uint32_t offset,dimension,offset_row_shift; subB = (float *)STARPU_MATRIX_GET_PTR(descr[0]); nxB = STARPU_MATRIX_GET_NX(descr[0]); nyB = STARPU_MATRIX_GET_NY(descr[0]); ldB = STARPU_MATRIX_GET_LD(descr[0]); unsigned i,j; unsigned int x,y; offset=STARPU_MATRIX_GET_OFFSET(descr[0]); //Based on offset we need to calculate the location of north,south,east and west in matrix A. offset_row_shift=offset; for(i=0;i<nxB;i++) { for(j=0;j<nyB;j++) { dimension=offset/4; x=floor(dimension/ydim);
*/ #include <starpu.h> #define NX 6 #define NY 6 #define PARTS 2 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0) void matrix_fill(void *buffers[], void *cl_arg STARPU_ATTRIBUTE_UNUSED) { unsigned i, j; /* length of the matrix */ unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]); unsigned ny = STARPU_MATRIX_GET_NY(buffers[0]); unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]); int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]); for(j=0; j<ny ; j++) { for(i=0; i<nx ; i++) val[(j*ld)+i] = i+100*j; } } struct starpu_codelet cl_fill = { .cpu_funcs = {matrix_fill}, .cpu_funcs_name = {"matrix_fill"},
//Cpu function static void cpu_mult(void *descr[], __attribute__((unused)) void *arg) { float *subA, *subB, *subC; uint32_t nxC, nyC, nyA; uint32_t ldA, ldB, ldC; //The below function would return pointer to the block subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]); subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]); subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]); //nx = return the number of elements in x -axis of the handle //ny = return the number of elements in y- axis of the handle nxC = STARPU_MATRIX_GET_NX(descr[2]); nyC = STARPU_MATRIX_GET_NY(descr[2]); nyA = STARPU_MATRIX_GET_NY(descr[0]); //LD returns the number of elements in each row of matrix //It will be useful in calculating the offset to read and write in memory address ldA = STARPU_MATRIX_GET_LD(descr[0]); ldB = STARPU_MATRIX_GET_LD(descr[1]); ldC = STARPU_MATRIX_GET_LD(descr[2]); unsigned i,j,k; for (i=0;i<nyC;i++) { for(j=0;j<nxC;j++) { float sum=0.0;