Example #1
0
Integer util_gnxtval_(Integer *val) {

    if(*val > 0) {
       if(!initialized) ga_error("nxtval: not yet initialized", 0L);
       return (Integer) NGA_Read_inc(g_T, &subscript, 1);
    }
    else if(*val==0) {
       int n = 1;
       initialized=1;

       /* create task array */
       g_T = NGA_Create(C_LONG, 1, &n,"Atomic Task", NULL);
       
       /* Initialize the task array */
       if(GA_Nodeid()==0) {
	  int lo=0, hi=0;
	  NGA_Put (g_T, &lo, &hi, &initval, &hi);
	  initval=0;
       }
              GA_Sync();
       return 0;
    }
    else if (*val < 0) { GA_Destroy(g_T); initialized=0; initval=0; return 0;}
    
    ga_error("nxtval: invalid value passed", 0L);
    return -1;
}
Integer util_tcesublock_(Integer *val,Integer *p_handle) {

//    if(*p_handle==0) exit(1);
//ga_error("nxtask: p_handle is zero", 1);

    if(*val > 0) {
//       if(!initialized) exit(1);
//ga_error("nxtask: not yet initialized", 1);
       return (Integer) NGA_Read_inc(g_T, &subscript, 1);
    }
    else if(*val==0) {
       int n = 1;
       initialized=1;
       int p_h = (int)*p_handle;

       /* create task array */
//       g_T = NGA_Create(C_LONG, 1, &n,"Atomic Task", NULL);

       g_T = NGA_Create_config(C_LONG,1,&n,"Atomic Task",NULL,p_h);

       /* Initialize the task array */
       if(GA_Pgroup_nodeid(p_h)==0) {
	  int lo=0, hi=0;
	  NGA_Put (g_T, &lo, &hi, &initval, &hi);
//          printf("PUT %i %i %i\n",sizeof(*p_handle),sizeof(Integer),sizeof(int));
	  initval=0;
       }

       GA_Pgroup_sync(p_h);
//       printf("CREATE %i %i \n",*p_handle,g_T);
       return 0;
    }
    else if (*val < 0) {
        GA_Destroy(g_T);
//        printf("DELETE %i %i \n",*p_handle,g_T);
//        ga_pgroup_sync_(p_handle);
        initialized=0; 
        initval=0; 
        return 0;
   }
    
//    ga_error("nxtval: invalid value passed", 0L);
    return -1;
}
/* Square matrix-matrix multiplication */
void matrix_multiply(int M, int N, int K, 
		int blockX_len, int blockY_len) 
{
	/* Local buffers and Global arrays declaration */
	double *a=NULL, *b=NULL, *c=NULL;

	int dims[NDIMS], ld[NDIMS], chunks[NDIMS];
	int lo[NDIMS], hi[NDIMS], cdims[NDIMS]; /* dim of blocks */

	int g_a, g_b, g_c, g_cnt, g_cnt2;
	int offset;
	double alpha = 1.0, beta=0.0;
	int count_p = 0, next_p = 0;
	int count_gac = 0, next_gac = 0;
	double t1,t2,seconds;
        ga_nbhdl_t nbh;
        int count_acc = 0;

	/* Find local processor ID and the number of processes */
	int proc=GA_Nodeid(), nprocs=GA_Nnodes();

	if ((M % blockX_len) != 0 || (M % blockY_len) != 0 || (N % blockX_len) != 0 || (N % blockY_len) != 0 
			|| (K % blockX_len) != 0 || (K % blockY_len) != 0)
		GA_Error("Dimension size M/N/K is not divisible by X/Y block sizes", 101);

	/* Allocate/Set process local buffers */
	a = malloc (blockX_len * blockY_len * sizeof(double)); 
	b = malloc (blockX_len * blockY_len * sizeof(double)); 
	c = malloc (blockX_len * blockY_len * sizeof(double));

	cdims[0] = blockX_len;
	cdims[1] = blockY_len;	

	/* Configure array dimensions */
	for(int i = 0; i < NDIMS; i++) {
		dims[i]  = N;
		chunks[i] = -1;
		ld[i]    = cdims[i]; /* leading dimension/stride of the local buffer */
	}

	/* create a global array g_a and duplicate it to get g_b and g_c*/
	g_a = NGA_Create(C_DBL, NDIMS, dims, "array A", chunks);

	if (!g_a) 
		GA_Error("NGA_Create failed: A", NDIMS);

#if DEBUG>1
	if (proc == 0) 
		printf("  Created Array A\n");
#endif
	/* Ditto for C and B */
	g_b = GA_Duplicate(g_a, "array B");
	g_c = GA_Duplicate(g_a, "array C");

	if (!g_b || !g_c) 
		GA_Error("GA_Duplicate failed",NDIMS);
	if (proc == 0) 
		printf("Created Arrays B and C\n");

	/* Subscript array for read-incr, which is nothing but proc */
	int * rdcnt = malloc (nprocs * sizeof(int));
	memset (rdcnt, 0, nprocs * sizeof(int));
	int * rdcnt2 = malloc (nprocs * sizeof(int));
	memset (rdcnt2, 0, nprocs * sizeof(int));

	/* Create global array of nprocs elements for nxtval */	
	int counter_dim[1];
	counter_dim[0] = nprocs;

	g_cnt = NGA_Create(C_INT, 1, counter_dim, "Shared counter", NULL);

	if (!g_cnt) 
		GA_Error("Shared counter failed",1);

	g_cnt2 = GA_Duplicate(g_cnt, "another shared counter");

	if (!g_cnt2) 
		GA_Error("Another shared counter failed",1);

	GA_Zero(g_cnt);
	GA_Zero(g_cnt2);

#if DEBUG>1	
	/* initialize data in matrices a and b */
	if(proc == 0)
		printf("Initializing local buffers - a and b\n");
#endif
	int w = 0; 
	int l = 7;
	for(int i = 0; i < cdims[0]; i++) {
		for(int j = 0; j < cdims[1]; j++) {
			a[i*cdims[1] + j] = (double)(++w%29);
			b[i*cdims[1] + j] = (double)(++l%37);
		}
	}

	/* Copy data to global arrays g_a and g_b from local buffers */
	next_p = NGA_Read_inc(g_cnt2,&rdcnt[proc],(long)1);
	for (int i = 0; i < N; i+=cdims[0]) 
	{
		if (next_p == count_p) {
			for (int j = 0; j < N; j+=cdims[1])
			{
				/* Indices of patch */
				lo[0] = i;
				lo[1] = j;
				hi[0] = lo[0] + cdims[0];
				hi[1] = lo[1] + cdims[1];

				hi[0] = hi[0]-1;
				hi[1] = hi[1]-1;
#if DEBUG>1
				printf ("%d: PUT_GA_A_B: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
				NGA_Put(g_a, lo, hi, a, ld);
				NGA_Put(g_b, lo, hi, b, ld);

			}
			next_p = NGA_Read_inc(g_cnt2,&rdcnt[proc],(long)1);
		}		
		count_p++;
	}


#if DEBUG>1
	printf ("After NGA_PUT to global - A and B arrays\n");
#endif
	/* Synchronize all processors to make sure puts from 
	   nprocs has finished before proceeding with dgemm */
	GA_Sync();

	t1 = GA_Wtime();

	next_gac = NGA_Read_inc(g_cnt,&rdcnt2[proc],(long)1);
	for (int m = 0; m < N; m+=cdims[0])
	{
		for (int k = 0; k < N; k+=cdims[0])
		{
			if (next_gac == count_gac)	
			{
				/* A = m x k */
				lo[0] = m; lo[1] = k;
				hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1];

				hi[0] = hi[0]-1; hi[1] = hi[1]-1;
#if DEBUG>3
				printf ("%d: GET GA_A: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
				NGA_Get(g_a, lo, hi, a, ld);

				for (int n = 0; n < N; n+=cdims[1])
				{
					memset (c, 0, sizeof(double) * cdims[0] * cdims[1]);
					/* B = k x n */
					lo[0] = k; lo[1] = n;
					hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1];				

					hi[0] = hi[0]-1; hi[1] = hi[1]-1;
#if DEBUG>3
					printf ("%d: GET_GA_B: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
					NGA_Get(g_b, lo, hi, b, ld);


					//_my_dgemm_ (a, local_N, b, local_N, c, local_N, local_N, local_N, local_N, alpha, beta=1.0);

					/* TODO I am assuming square matrix blocks, further testing/work 
					   required for rectangular matrices */
					cblas_dgemm ( CblasRowMajor, CblasNoTrans, /* TransA */CblasNoTrans, /* TransB */
							cdims[0] /* M */, cdims[1] /* N */, cdims[0] /* K */, alpha,
							a, cdims[0], /* lda */ b, cdims[1], /* ldb */
							beta=1.0, c, cdims[0] /* ldc */);

					NGA_NbWait(&nbh);

					/* C = m x n */
					lo[0] = m; lo[1] = n;
					hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1];				

					hi[0] = hi[0]-1; hi[1] = hi[1]-1;
#if DEBUG>3
					printf ("%d: ACC_GA_C: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
					NGA_NbAcc(g_c, lo, hi, c, ld, &alpha, &nbh);
					count_acc += 1;
				} /* END LOOP N */
				next_gac = NGA_Read_inc(g_cnt,&rdcnt2[proc],(long)1);
			} /* ENDIF if count == next */
			count_gac++;
		} /* END LOOP K */
	} /* END LOOP M */

	GA_Sync();
	t2 = GA_Wtime();
	seconds = t2 - t1;
	if (proc == 0)
		printf("Time taken for MM (secs):%lf \n", seconds);

        printf("Number of ACC: %d\n", count_acc);

	/* Correctness test - modify data again before this function */
	for (int i = 0; i < NDIMS; i++) {
		lo[i] = 0;
		hi[i] = dims[i]-1;
		ld[i] = dims[i];
	}

	verify(g_a, g_b, g_c, lo, hi, ld, N);

	/* Clear local buffers */
	free(a);
	free(b);
	free(c);
	free(rdcnt);
	free(rdcnt2);

	GA_Sync();

	/* Deallocate arrays */
	GA_Destroy(g_a);
	GA_Destroy(g_b);
	GA_Destroy(g_c);
	GA_Destroy(g_cnt);
	GA_Destroy(g_cnt2);
}