void ds_kernel_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
{
	uint8_t *input = (uint8_t *)STARPU_MATRIX_GET_PTR(descr[0]);
	const unsigned input_ld = STARPU_MATRIX_GET_LD(descr[0]);

	uint8_t *output = (uint8_t *)STARPU_MATRIX_GET_PTR(descr[1]);
	const unsigned output_ld = STARPU_MATRIX_GET_LD(descr[1]);

	const unsigned ncols = STARPU_MATRIX_GET_NX(descr[0]);
	const unsigned nlines = STARPU_MATRIX_GET_NY(descr[0]);

	unsigned line, col;
	for (line = 0; line < nlines; line+=FACTOR)
	for (col = 0; col < ncols; col+=FACTOR)
	{
		unsigned sum = 0;

		unsigned lline, lcol;
		for (lline = 0; lline < FACTOR; lline++)
		for (lcol = 0; lcol < FACTOR; lcol++)
		{
			unsigned in_index = (lcol + col) + (lline + line)*input_ld;

			sum += input[in_index];
		}

		unsigned out_index = (col / FACTOR) + (line / FACTOR)*output_ld;
		output[out_index] = (uint8_t)(sum/(FACTOR*FACTOR));
	}
}
static inline void common_block_spmv(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
{
	/* printf("22\n"); */
	float *block 	= (float *)STARPU_MATRIX_GET_PTR(descr[0]);
	float *in 	= (float *)STARPU_VECTOR_GET_PTR(descr[1]);
	float *out 	= (float *)STARPU_VECTOR_GET_PTR(descr[2]);

	unsigned dx = STARPU_MATRIX_GET_NX(descr[0]);
	unsigned dy = STARPU_MATRIX_GET_NY(descr[0]);

	unsigned ld = STARPU_MATRIX_GET_LD(descr[0]);

	switch (s)
	{
		case 0:
			cblas_sgemv(CblasRowMajor, CblasNoTrans, dx, dy, 1.0f, block, ld, in, 1, 1.0f, out, 1);
			break;
#ifdef STARPU_USE_CUDA
		case 1:
			cublasSgemv ('t', dx, dy, 1.0f, block, ld, in, 1, 1.0f, out, 1);
			break;
#endif
		default:
			STARPU_ABORT();
			break;
	}
}
Example #3
0
static void strmm_cuda(void *descr[], void *args) {
  float *a = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
  float *b = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
  float *c = (float *)STARPU_MATRIX_GET_PTR(descr[2]);

  unsigned w = STARPU_MATRIX_GET_NY(descr[0]);
  unsigned h = STARPU_MATRIX_GET_NX(descr[1]);

  unsigned lda = STARPU_MATRIX_GET_LD(descr[0]);
  unsigned ldb = STARPU_MATRIX_GET_LD(descr[1]);
  unsigned ldc = STARPU_MATRIX_GET_LD(descr[2]);
  
  struct strmm_arg * arg = (struct strmm_arg *)args;

  cublasSideMode_t side = arg->side ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
  cublasFillMode_t uplo = arg->uplo ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
  cublasDiagType_t diag = arg->unit ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
  cublasOperation_t trans = CUBLAS_OP_T;

  const float factor = 1.0f;
  
  cublasSetStream(cublas_handle, starpu_cuda_get_local_stream());

  cublasStrmm(cublas_handle, side, uplo, trans, diag, w, h, &factor, a, lda, b, ldb, c, ldc);
  cudaStreamSynchronize(starpu_cuda_get_local_stream());

  free(arg);
}
static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args) 
{
	float *sub11;

	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]); 

	unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
	unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);

	unsigned long z;

	switch (s)
	{
		case 0:
			for (z = 0; z < nx; z++)
			{
				float pivot;
				pivot = sub11[z+z*ld];
				STARPU_ASSERT(pivot != 0.0f);
		
				STARPU_SSCAL(nx - z - 1, (1.0f/pivot), &sub11[z+(z+1)*ld], ld);
		
				STARPU_SGER(nx - z - 1, nx - z - 1, -1.0f,
						&sub11[z+(z+1)*ld], ld,
						&sub11[(z+1)+z*ld], 1,
						&sub11[(z+1) + (z+1)*ld],ld);
			}
			break;
#ifdef STARPU_USE_CUDA
		case 1:
			for (z = 0; z < nx; z++)
			{
				float pivot;
				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
				cudaStreamSynchronize(starpu_cuda_get_local_stream());

				STARPU_ASSERT(pivot != 0.0f);
				
				cublasSscal(nx - z - 1, 1.0f/pivot, &sub11[z+(z+1)*ld], ld);
				
				cublasSger(nx - z - 1, nx - z - 1, -1.0f,
								&sub11[z+(z+1)*ld], ld,
								&sub11[(z+1)+z*ld], 1,
								&sub11[(z+1) + (z+1)*ld],ld);
			}

			cudaStreamSynchronize(starpu_cuda_get_local_stream());

			break;
#endif
		default:
			STARPU_ABORT();
			break;
	}
}
Example #5
0
static void getri(void *descr[], int type) {
  // Computes the Echelon form of A, stores the corresponding inverted multiples
  // in the lower part
  // ------------------------------
  // | A |   |   |   |   |   |    | 
  // |----------------------------|
  // |   |   |   |   |   |   |    |
  // |----------------------------|
  // |   |   |   |   |   |   |    |
  // |----------------------------|
  // |   |   |   |   |   |   |    |
  // ------------------------------
  unsigned int i, j, k;
  unsigned int *sub_a   = (unsigned int *)STARPU_MATRIX_GET_PTR(descr[0]);
  unsigned int x_dim    = STARPU_MATRIX_GET_NX(descr[0]);
  unsigned int y_dim    = STARPU_MATRIX_GET_NY(descr[0]);
  unsigned int offset_a = STARPU_MATRIX_GET_OFFSET(descr[0]);
  unsigned int ld_a     = STARPU_MATRIX_GET_LD(descr[0]);
  unsigned int mult     = 0;
 
#if DEBUG0
  printf("\n --- GETRI ---\n");
#endif
#if DEBUG
  printf("x_dim = %u\n", x_dim);
  printf("y_dim = %u\n", y_dim);
  printf("ld_a  = %u\n", ld_a);
#endif
  for (i = 0; i < y_dim; ++i) {  
    // compute inverse
    neg_inv_piv[i+offset_a] = negInverseModP(sub_a[i+i*ld_a], prime);
#if DEBUG
    printf("sub_a[%u] = %u\n", i+i*ld_a,sub_a[i+i*ld_a]);
    printf("inv  = %u\n", neg_inv_piv[i+offset_a]);
#endif
    for (j = i+1; j < x_dim; ++j) {
      // multiply by corresponding coeff
      mult  = (neg_inv_piv[i+offset_a] * sub_a[i+j*ld_a]); //% prime;
#if DEBUG
      printf("sub_a[%u] = %u\n", i+j*ld_a,sub_a[i+j*ld_a]);
      printf("mult      = %u\n", mult);
#endif
      sub_a[i+j*ld_a] = mult;
      for (k = i+1; k < y_dim; ++k) {
        sub_a[k+j*ld_a] +=  (sub_a[k+i*ld_a] * mult);
        //sub_a[k+j*ld_a] %=  prime;
      }
    }
  }  
#if DEBUG0
  printf("\n --- GETRI DONE ---\n");
  printf("TASKS READY     %d\n", starpu_task_nready());
  printf("TASKS SUBMITTED %d\n", starpu_task_nsubmitted());
#endif
}
Example #6
0
static void cublas_mult(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
{
	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
	TYPE *subC = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);

	unsigned nxC = STARPU_MATRIX_GET_NX(descr[2]);
	unsigned nyC = STARPU_MATRIX_GET_NY(descr[2]);
	unsigned nyA = STARPU_MATRIX_GET_NY(descr[0]);

	unsigned ldA = STARPU_MATRIX_GET_LD(descr[0]);
	unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
	unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);

	CUBLAS_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
				     (TYPE)0.0, subC, ldC);
}
Example #7
0
static void sgemm_cpu(void *descr[], void *args) {
    float *a = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
    float *b = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
    float *c = (float *)STARPU_MATRIX_GET_PTR(descr[2]);

    unsigned w = STARPU_MATRIX_GET_NY(descr[2]);
    unsigned h = STARPU_MATRIX_GET_NX(descr[2]);
    unsigned ks = STARPU_MATRIX_GET_NY(descr[0]);

    unsigned lda = STARPU_MATRIX_GET_LD(descr[0]);
    unsigned ldb = STARPU_MATRIX_GET_LD(descr[1]);
    unsigned ldc = STARPU_MATRIX_GET_LD(descr[2]);

    struct sgemm_arg * arg = (struct sgemm_arg*)args;

    sgemm_("N", "N", (int*)&h, (int*)&w, (int*)&ks, &arg->alpha, a, (int*)&lda, b, (int*)&ldb, &arg->beta, c, (int*)&ldc);
    free(arg);
}
static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
{
	float *left 	= (float *)STARPU_MATRIX_GET_PTR(descr[0]);
	float *right 	= (float *)STARPU_MATRIX_GET_PTR(descr[1]);
	float *center 	= (float *)STARPU_MATRIX_GET_PTR(descr[2]);

	unsigned dx = STARPU_MATRIX_GET_NX(descr[2]);
	unsigned dy = STARPU_MATRIX_GET_NY(descr[2]);
	unsigned dz = STARPU_MATRIX_GET_NY(descr[0]);

	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[0]);
	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
	unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);

#ifdef STARPU_USE_CUDA
	cublasStatus status;
#endif

	switch (s)
	{
		case 0:
			STARPU_SGEMM("N", "N",	dy, dx, dz, 
				-1.0f, left, ld21, right, ld12,
					     1.0f, center, ld22);
			break;

#ifdef STARPU_USE_CUDA
		case 1:
			cublasSgemm('n', 'n', dx, dy, dz, -1.0f, left, ld21,
					right, ld12, 1.0f, center, ld22);
			status = cublasGetError();
			if (status != CUBLAS_STATUS_SUCCESS)
				STARPU_CUBLAS_REPORT_ERROR(status);

			break;
#endif
		default:
			STARPU_ABORT();
			break;
	}
}
Example #9
0
static void sgemm_cuda(void *descr[], void *args) {
    float *a = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
    float *b = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
    float *c = (float *)STARPU_MATRIX_GET_PTR(descr[2]);

    unsigned w = STARPU_MATRIX_GET_NY(descr[2]);
    unsigned h = STARPU_MATRIX_GET_NX(descr[2]);
    unsigned k = STARPU_MATRIX_GET_NY(descr[0]);

    unsigned lda = STARPU_MATRIX_GET_LD(descr[0]);
    unsigned ldb = STARPU_MATRIX_GET_LD(descr[1]);
    unsigned ldc = STARPU_MATRIX_GET_LD(descr[2]);

    struct sgemm_arg * arg = (struct sgemm_arg*)args;

    cublasSetStream(cublas_handle, starpu_cuda_get_local_stream());

    cublasSgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, h, w, k, &arg->alpha, a, lda, b, ldb, &arg->beta, c, ldc);
    cudaStreamSynchronize(starpu_cuda_get_local_stream());
    free(arg);
}
static inline void dw_common_codelet_update_u12(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
{
	float *sub11;
	float *sub12;

	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);	
	sub12 = (float *)STARPU_MATRIX_GET_PTR(descr[1]);

	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]);

	unsigned nx12 = STARPU_MATRIX_GET_NX(descr[1]);
	unsigned ny12 = STARPU_MATRIX_GET_NY(descr[1]);
	
#ifdef STARPU_USE_CUDA
	cublasStatus status;
#endif

	/* solve L11 U12 = A12 (find U12) */
	switch (s)
	{
		case 0:
			STARPU_STRSM("L", "L", "N", "N",
					 nx12, ny12, 1.0f, sub11, ld11, sub12, ld12);
			break;
#ifdef STARPU_USE_CUDA
		case 1:
			cublasStrsm('L', 'L', 'N', 'N', ny12, nx12,
					1.0f, sub11, ld11, sub12, ld12);
			status = cublasGetError();
			if (status != CUBLAS_STATUS_SUCCESS)
				STARPU_CUBLAS_REPORT_ERROR(status);

			break;
#endif
		default:
			STARPU_ABORT();
			break;
	}
}
static inline void dw_common_codelet_update_u21(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
{
	float *sub11;
	float *sub21;

	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
	sub21 = (float *)STARPU_MATRIX_GET_PTR(descr[1]);

	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);

	unsigned nx21 = STARPU_MATRIX_GET_NX(descr[1]);
	unsigned ny21 = STARPU_MATRIX_GET_NY(descr[1]);
	
#ifdef STARPU_USE_CUDA
	cublasStatus status;
#endif

	switch (s)
	{
		case 0:
			STARPU_STRSM("R", "U", "N", "U", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
			break;
#ifdef STARPU_USE_CUDA
		case 1:
			cublasStrsm('R', 'U', 'N', 'U', ny21, nx21, 1.0f, sub11, ld11, sub21, ld21);
			status = cublasGetError();
			if (status != CUBLAS_STATUS_SUCCESS)
				STARPU_CUBLAS_REPORT_ERROR(status);

			break;
#endif
		default:
			STARPU_ABORT();
			break;
	}
}
Example #12
0
static void cpu_mult(void *descr[], STARPU_ATTRIBUTE_UNUSED  void *arg)
{
	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
	TYPE *subC = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);

	unsigned nxC = STARPU_MATRIX_GET_NX(descr[2]);
	unsigned nyC = STARPU_MATRIX_GET_NY(descr[2]);
	unsigned nyA = STARPU_MATRIX_GET_NY(descr[0]);

	unsigned ldA = STARPU_MATRIX_GET_LD(descr[0]);
	unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
	unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);

	int worker_size = starpu_combined_worker_get_size();

	if (worker_size == 1)
	{
		/* Sequential CPU task */
		CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC);
	}
	else
	{
		/* Parallel CPU task */
		unsigned rank = starpu_combined_worker_get_rank();

		unsigned block_size = (nyC + worker_size - 1)/worker_size;
		unsigned new_nyC = STARPU_MIN(nyC, block_size*(rank+1)) - block_size*rank;

		STARPU_ASSERT(nyC = STARPU_MATRIX_GET_NY(descr[1]));

		TYPE *new_subB = &subB[block_size*rank];
		TYPE *new_subC = &subC[block_size*rank];

		CPU_GEMM("N", "N", nxC, new_nyC, nyA, (TYPE)1.0, subA, ldA, new_subB, ldB, (TYPE)0.0, new_subC, ldC);
	}
}
Example #13
0
			fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name, count_22_per_worker[worker], count_22_total, (100.0*count_22_per_worker[worker])/count_22_total);
		}
	}
}

/*
 *   U22 
 */

static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, __attribute__((unused)) void *_args)
{
	float *left 	= (float *)STARPU_MATRIX_GET_PTR(descr[0]);
	float *right 	= (float *)STARPU_MATRIX_GET_PTR(descr[1]);
	float *center 	= (float *)STARPU_MATRIX_GET_PTR(descr[2]);

	unsigned dx = STARPU_MATRIX_GET_NX(descr[2]);
	unsigned dy = STARPU_MATRIX_GET_NY(descr[2]);
	unsigned dz = STARPU_MATRIX_GET_NY(descr[0]);

	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[0]);
	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
	unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);

#ifdef STARPU_USE_CUDA
	cublasStatus status;
#endif

	switch (s) {
		case 0:
			SGEMM("N", "N",	dy, dx, dz, 
				-1.0f, left, ld21, right, ld12,
static unsigned xdim = 3;
static unsigned ydim = 3;
static int no_of_iteration=1;



static void cpu_jacobi(void *descr[], __attribute__((unused)) void *arg)
{ 
  float *subB;
  uint32_t nxB, nyB;
  uint32_t ldB;
  float east,west,north,south;
  uint32_t offset,dimension,offset_row_shift;
  subB = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
    
  nxB = STARPU_MATRIX_GET_NX(descr[0]);
  nyB = STARPU_MATRIX_GET_NY(descr[0]);
  
  ldB = STARPU_MATRIX_GET_LD(descr[0]);

  unsigned i,j;
  unsigned int x,y;
  offset=STARPU_MATRIX_GET_OFFSET(descr[0]);
  //Based on offset we need to calculate the location of north,south,east and west in matrix A. 
  offset_row_shift=offset;
  for(i=0;i<nxB;i++)
  {
    for(j=0;j<nyB;j++)
    {
        dimension=offset/4;
        x=floor(dimension/ydim);
 */

#include <starpu.h>

#define NX    6
#define NY    6
#define PARTS 2

#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)

void matrix_fill(void *buffers[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
{
	unsigned i, j;

	/* length of the matrix */
	unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);
	unsigned ny = STARPU_MATRIX_GET_NY(buffers[0]);
	unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
	int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);

	for(j=0; j<ny ; j++)
	{
		for(i=0; i<nx ; i++)
			val[(j*ld)+i] = i+100*j;
	}
}

struct starpu_codelet cl_fill =
{
	.cpu_funcs = {matrix_fill},
	.cpu_funcs_name = {"matrix_fill"},
//Cpu function
static void cpu_mult(void *descr[], __attribute__((unused)) void *arg)
{ 
  float *subA, *subB, *subC;
  uint32_t nxC, nyC, nyA;
  uint32_t ldA, ldB, ldC;
  
  //The below function would return pointer to the block
  subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
  subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
  subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);

  //nx = return the number of elements in x -axis of the handle
  //ny = return the number of elements in y- axis of the handle
  nxC = STARPU_MATRIX_GET_NX(descr[2]);
  nyC = STARPU_MATRIX_GET_NY(descr[2]);
  nyA = STARPU_MATRIX_GET_NY(descr[0]);
  
  //LD returns the number of elements in each row of matrix
  //It will be useful in calculating the offset to read and write in memory address
  ldA = STARPU_MATRIX_GET_LD(descr[0]);
  ldB = STARPU_MATRIX_GET_LD(descr[1]);
  ldC = STARPU_MATRIX_GET_LD(descr[2]);
  
  unsigned i,j,k;
  for (i=0;i<nyC;i++)
  { 
    for(j=0;j<nxC;j++)
     {
      float sum=0.0;