int main(int argc, char **argv) { int rank, nprocs; int g_A; int *local_A=NULL, *local_B=NULL, *output_A=NULL; int dims[DIM]={SIZE,SIZE}, dims2[DIM], lo[DIM]={SIZE-SIZE,SIZE-SIZE}, hi[DIM]={SIZE-1,SIZE-1}, ld=SIZE; int value=SIZE; #if defined(USE_ELEMENTAL) // initialize Elemental (which will initialize MPI) ElInitialize( &argc, &argv ); ElMPICommRank( MPI_COMM_WORLD, &rank ); ElMPICommSize( MPI_COMM_WORLD, &nprocs ); // instantiate el::global array ElGlobalArraysConstruct_i( &eliga ); // initialize global arrays ElGlobalArraysInitialize_i( eliga ); #else MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MA_init(C_INT, 1000, 1000); GA_Initialize(); #endif local_A=(int*)malloc(SIZE*SIZE*sizeof(int)); output_A=(int*)malloc(SIZE*SIZE*sizeof(int)); memset (output_A, 0, SIZE*SIZE*sizeof(int)); for(int j=0; j<SIZE; j++) for(int i=0; i<SIZE; i++) local_A[i+j*ld]=(i + j); //for(int i=0; i<SIZE; i++) local_A[i+j*ld]=(rand()%10); local_B=(int*)malloc(SIZE*SIZE*sizeof(int)); memset (local_B, 0, SIZE*SIZE*sizeof(int)); // nb handle #if defined(USE_ELEMENTAL) typedef ElInt ga_nbhdl_t; #endif ga_nbhdl_t nbnb; #if defined(USE_ELEMENTAL) ElGlobalArraysCreate_i( eliga, DIM, dims, "array_A", NULL, &g_A ); ElGlobalArraysFill_i( eliga, g_A, &value ); #else g_A = NGA_Create(C_INT, DIM, dims, "array_A", NULL); GA_Fill(g_A, &value); #endif if (rank == 0) printf ("Initial global array:\n"); #if defined(USE_ELEMENTAL) ElGlobalArraysPrint_i( eliga, g_A ); #else GA_Print(g_A); #endif for (int i = 0; i < NITERS; i++) { // acc data #if defined(USE_ELEMENTAL) ElGlobalArraysNBAccumulate_i( eliga, g_A, lo, hi, local_A, &ld, &value, &nbnb ); #else NGA_NbAcc(g_A, lo, hi, local_A, &ld, &value, &nbnb); #endif // updated output MPI_Reduce (local_A, output_A, SIZE*SIZE, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); #if defined(USE_ELEMENTAL) ElGlobalArraysNBWait_i( eliga, &nbnb ); #else NGA_NbWait (&nbnb); #endif // get if (rank == 0) printf ("Get in iter #%d\n", i); #if defined(USE_ELEMENTAL) ElGlobalArraysSync_i( eliga ); ElGlobalArraysGet_i( eliga, g_A, lo, hi, local_B, &ld ); ElGlobalArraysPrint_i( eliga, g_A ); #else GA_Sync(); NGA_Get(g_A, lo, hi, local_B, &ld); GA_Print(g_A); #endif } // end of iters if(rank==0) { printf(" Alpha (multiplier): %d\n", value); printf(" Original local buffer (before accumulation): \n"); for(int i=0; i<SIZE; i++) { for(int j=0; j<SIZE; j++) printf("%d ", local_A[i*ld+j]); printf("\n"); } printf("\n"); printf(" Get returns: \n"); for(int i=0; i<SIZE; i++) { for(int j=0; j<SIZE; j++) printf("%d ", local_B[i*ld + j]); printf("\n"); } printf("\n"); for(int i=0; i<SIZE; i++) { for(int j=0; j<SIZE; j++) { if(local_B[i*ld+j]!=(value + (NITERS * value * (output_A[i*ld+j])))) GA_Error("ERROR", -99); } } } #if defined(USE_ELEMENTAL) ElGlobalArraysDestroy_i( eliga, g_A ); #else GA_Destroy(g_A); #endif if(rank == 0) printf ("OK. Test passed\n"); free (local_A); free (local_B); free (output_A); #if defined(USE_ELEMENTAL) ElGlobalArraysTerminate_i( eliga ); // call el::global arrays destructor ElGlobalArraysDestruct_i( eliga ); ElFinalize(); #else GA_Terminate(); MPI_Finalize(); #endif }
/* Square matrix-matrix multiplication */ void matrix_multiply(int M, int N, int K, int blockX_len, int blockY_len) { /* Local buffers and Global arrays declaration */ double *a=NULL, *b=NULL, *c=NULL; int dims[NDIMS], ld[NDIMS], chunks[NDIMS]; int lo[NDIMS], hi[NDIMS], cdims[NDIMS]; /* dim of blocks */ int g_a, g_b, g_c, g_cnt, g_cnt2; int offset; double alpha = 1.0, beta=0.0; int count_p = 0, next_p = 0; int count_gac = 0, next_gac = 0; double t1,t2,seconds; ga_nbhdl_t nbh; int count_acc = 0; /* Find local processor ID and the number of processes */ int proc=GA_Nodeid(), nprocs=GA_Nnodes(); if ((M % blockX_len) != 0 || (M % blockY_len) != 0 || (N % blockX_len) != 0 || (N % blockY_len) != 0 || (K % blockX_len) != 0 || (K % blockY_len) != 0) GA_Error("Dimension size M/N/K is not divisible by X/Y block sizes", 101); /* Allocate/Set process local buffers */ a = malloc (blockX_len * blockY_len * sizeof(double)); b = malloc (blockX_len * blockY_len * sizeof(double)); c = malloc (blockX_len * blockY_len * sizeof(double)); cdims[0] = blockX_len; cdims[1] = blockY_len; /* Configure array dimensions */ for(int i = 0; i < NDIMS; i++) { dims[i] = N; chunks[i] = -1; ld[i] = cdims[i]; /* leading dimension/stride of the local buffer */ } /* create a global array g_a and duplicate it to get g_b and g_c*/ g_a = NGA_Create(C_DBL, NDIMS, dims, "array A", chunks); if (!g_a) GA_Error("NGA_Create failed: A", NDIMS); #if DEBUG>1 if (proc == 0) printf(" Created Array A\n"); #endif /* Ditto for C and B */ g_b = GA_Duplicate(g_a, "array B"); g_c = GA_Duplicate(g_a, "array C"); if (!g_b || !g_c) GA_Error("GA_Duplicate failed",NDIMS); if (proc == 0) printf("Created Arrays B and C\n"); /* Subscript array for read-incr, which is nothing but proc */ int * rdcnt = malloc (nprocs * sizeof(int)); memset (rdcnt, 0, nprocs * sizeof(int)); int * rdcnt2 = malloc (nprocs * sizeof(int)); memset (rdcnt2, 0, nprocs * sizeof(int)); /* Create global array of nprocs elements for nxtval */ int counter_dim[1]; counter_dim[0] = nprocs; g_cnt = NGA_Create(C_INT, 1, counter_dim, "Shared counter", NULL); if (!g_cnt) GA_Error("Shared counter failed",1); g_cnt2 = GA_Duplicate(g_cnt, "another shared counter"); if (!g_cnt2) GA_Error("Another shared counter failed",1); GA_Zero(g_cnt); GA_Zero(g_cnt2); #if DEBUG>1 /* initialize data in matrices a and b */ if(proc == 0) printf("Initializing local buffers - a and b\n"); #endif int w = 0; int l = 7; for(int i = 0; i < cdims[0]; i++) { for(int j = 0; j < cdims[1]; j++) { a[i*cdims[1] + j] = (double)(++w%29); b[i*cdims[1] + j] = (double)(++l%37); } } /* Copy data to global arrays g_a and g_b from local buffers */ next_p = NGA_Read_inc(g_cnt2,&rdcnt[proc],(long)1); for (int i = 0; i < N; i+=cdims[0]) { if (next_p == count_p) { for (int j = 0; j < N; j+=cdims[1]) { /* Indices of patch */ lo[0] = i; lo[1] = j; hi[0] = lo[0] + cdims[0]; hi[1] = lo[1] + cdims[1]; hi[0] = hi[0]-1; hi[1] = hi[1]-1; #if DEBUG>1 printf ("%d: PUT_GA_A_B: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]); #endif NGA_Put(g_a, lo, hi, a, ld); NGA_Put(g_b, lo, hi, b, ld); } next_p = NGA_Read_inc(g_cnt2,&rdcnt[proc],(long)1); } count_p++; } #if DEBUG>1 printf ("After NGA_PUT to global - A and B arrays\n"); #endif /* Synchronize all processors to make sure puts from nprocs has finished before proceeding with dgemm */ GA_Sync(); t1 = GA_Wtime(); next_gac = NGA_Read_inc(g_cnt,&rdcnt2[proc],(long)1); for (int m = 0; m < N; m+=cdims[0]) { for (int k = 0; k < N; k+=cdims[0]) { if (next_gac == count_gac) { /* A = m x k */ lo[0] = m; lo[1] = k; hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1]; hi[0] = hi[0]-1; hi[1] = hi[1]-1; #if DEBUG>3 printf ("%d: GET GA_A: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]); #endif NGA_Get(g_a, lo, hi, a, ld); for (int n = 0; n < N; n+=cdims[1]) { memset (c, 0, sizeof(double) * cdims[0] * cdims[1]); /* B = k x n */ lo[0] = k; lo[1] = n; hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1]; hi[0] = hi[0]-1; hi[1] = hi[1]-1; #if DEBUG>3 printf ("%d: GET_GA_B: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]); #endif NGA_Get(g_b, lo, hi, b, ld); //_my_dgemm_ (a, local_N, b, local_N, c, local_N, local_N, local_N, local_N, alpha, beta=1.0); /* TODO I am assuming square matrix blocks, further testing/work required for rectangular matrices */ cblas_dgemm ( CblasRowMajor, CblasNoTrans, /* TransA */CblasNoTrans, /* TransB */ cdims[0] /* M */, cdims[1] /* N */, cdims[0] /* K */, alpha, a, cdims[0], /* lda */ b, cdims[1], /* ldb */ beta=1.0, c, cdims[0] /* ldc */); NGA_NbWait(&nbh); /* C = m x n */ lo[0] = m; lo[1] = n; hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1]; hi[0] = hi[0]-1; hi[1] = hi[1]-1; #if DEBUG>3 printf ("%d: ACC_GA_C: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]); #endif NGA_NbAcc(g_c, lo, hi, c, ld, &alpha, &nbh); count_acc += 1; } /* END LOOP N */ next_gac = NGA_Read_inc(g_cnt,&rdcnt2[proc],(long)1); } /* ENDIF if count == next */ count_gac++; } /* END LOOP K */ } /* END LOOP M */ GA_Sync(); t2 = GA_Wtime(); seconds = t2 - t1; if (proc == 0) printf("Time taken for MM (secs):%lf \n", seconds); printf("Number of ACC: %d\n", count_acc); /* Correctness test - modify data again before this function */ for (int i = 0; i < NDIMS; i++) { lo[i] = 0; hi[i] = dims[i]-1; ld[i] = dims[i]; } verify(g_a, g_b, g_c, lo, hi, ld, N); /* Clear local buffers */ free(a); free(b); free(c); free(rdcnt); free(rdcnt2); GA_Sync(); /* Deallocate arrays */ GA_Destroy(g_a); GA_Destroy(g_b); GA_Destroy(g_c); GA_Destroy(g_cnt); GA_Destroy(g_cnt2); }