/*
 * Fill the patch [alo, ahi] of a SIZE x SIZE integer global array with val1,
 * scale the whole array by val2, fetch the patch back into a local buffer and,
 * on rank 1 only, verify that every fetched element equals val1 * val2.
 *
 * rank   - rank of the calling process; only rank 1 runs the comparison.
 * nprocs - number of processes (not used by this routine).
 *
 * Returns 0.  Mismatches are reported on stdout only.
 */
int fillandscale(int rank, int nprocs)
{
  int g_A, val1 = 5, val2 = 5, local_A[SIZE][SIZE], i, j;
  int dims[DIM] = {SIZE, SIZE}, alo[DIM] = {1, 1}, ahi[DIM] = {2, 2};
  /* Leading dimension of the local buffer: it must match the row length of
   * local_A (SIZE), not a hard-coded 5, or rows land at the wrong offset. */
  int ld = SIZE;
  /* Extent of the fetched patch in each dimension. */
  int rows = ahi[0] - alo[0] + 1;
  int cols = ahi[1] - alo[1] + 1;

  (void)nprocs;

  g_A = NGA_Create(C_INT, DIM, dims, "array_A", NULL);
  GA_Zero(g_A);

  NGA_Fill_patch(g_A, alo, ahi, &val1);
  GA_Print(g_A);

  GA_Scale(g_A, &val2);
  GA_Print(g_A);

  NGA_Get(g_A, alo, ahi, local_A, &ld);

  if (rank == 1) {
    for (i = 0; i < rows; i++) {
      for (j = 0; j < cols; j++) {
        if (local_A[i][j] != val1 * val2)
          printf(" GA ERROR: \n");
      }
    }
  }

  GA_Destroy(g_A);
  return 0;
}
main(int argc, char **argv) { int rank, nprocs, i, j; int g_A, g_B, g_C, local_C[DIM][DIM], dims[DIM]={5,5}; int val_A=5, val_B=3, ld=DIM, max; int lo[DIM]={2,2}, hi[DIM]={4,4}, blo[DIM]={0,0}, bhi[DIM]={2,2}, clo[DIM]={1,1}, chi[DIM]={3,3}; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MA_init(C_INT, 1000, 1000); GA_Initialize(); g_A = NGA_Create(C_INT, DIM, dims, "array_A", NULL); g_B = NGA_Create(C_INT, DIM, dims, "array_B", NULL); g_C = NGA_Create(C_INT, DIM, dims, "array_C", NULL); GA_Fill(g_A, &val_A); GA_Fill(g_B, &val_B); GA_Zero(g_C); GA_Elem_maximum_patch(g_A, lo, hi, g_B, blo, bhi, g_C, clo, chi); GA_Print(g_C); GA_Sync(); NGA_Get(g_C, clo, chi, local_C, &ld); if(rank==1) { for(i=0; i<DIM; i++) { for(j=0; j<DIM; j++)printf("%d ", local_C[i][j]); printf("\n"); } if(val_A>val_B) max=val_A; else max=val_B; for(i=0; i<DIM; i++) { for(j=0; j<DIM; j++) if(local_C[i][j]!=max) printf("GA Error : \n"); } } GA_Sync(); if(rank == 0) printf("Test Completed \n"); GA_Terminate(); MPI_Finalize(); }
// ------------------------------------------------------------- // MatZeroEntries_DenseGA // ------------------------------------------------------------- static PetscErrorCode MatZeroEntries_DenseGA(Mat A) { PetscErrorCode ierr = 0; struct MatGACtx *ctx; ierr = MatShellGetContext(A, &ctx); CHKERRQ(ierr); GA_Zero(ctx->ga); return ierr; }
main(int argc, char **argv) { int rank, nprocs, i, j; int g_A, **local_A=NULL, **local_B=NULL; int dims[DIM]={SIZE,SIZE}, dims2[DIM], lo[DIM]={SIZE-SIZE,SIZE-SIZE}, hi[DIM]={SIZE-1,SIZE-1}, ld=5, value=5; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MA_init(C_INT, 1000, 1000); GA_Initialize(); local_A=(int**)malloc(SIZE*sizeof(int*)); for(i=0; i<SIZE; i++) { local_A[i]=(int*)malloc(SIZE*sizeof(int)); for(j=0; j<SIZE; j++) local_A[i][j]=rand()%10; } local_B=(int**)malloc(SIZE*sizeof(int*)); for(i=0; i<SIZE; i++) { local_B[i]=(int*)malloc(SIZE*sizeof(int)); for(j=0; j<SIZE; j++) local_B[i][j]=rand()%10; } g_A = NGA_Create(C_INT, DIM, dims, "array_A", NULL); GA_Zero(g_A); if(rank==0) { NGA_Put(g_A, lo, hi, local_A, &ld); NGA_Get(g_A, lo, hi, local_B, &ld); for(i=0; i<SIZE; i++) { for(j=0; j<SIZE; j++) if(local_A[i][j]!=local_B[i][j]) GA_ERROR_MSG(); } } GA_Sync(); GA_Destroy(g_A); if(rank == 0) GA_PRINT_MSG(); GA_Terminate(); MPI_Finalize(); }
/*
 * Exercise NGA_Matmul_patch for one element type.
 *
 * data_type - one of C_FLOAT, C_DBL, C_DCPL or C_SCPL; any other value is
 *             passed to GA_Error.
 *
 * Sequence of operations:
 *   1. Allocate three local N*N buffers of the selected element type.
 *   2. Create global array A; the distribution is selected at compile time by
 *      USE_REGULAR, USE_SIMPLE_CYCLIC, USE_SCALAPACK or USE_TILED.  B and C
 *      are duplicates of A.
 *   3. Process 0 fills A with i*N+j and B with j*N+i (imaginary parts set to
 *      1.0 for the complex types) and puts them into the global arrays.
 *   4. Call NGA_Matmul_patch on the full arrays with alpha = 1, beta = 0.
 *   5. Re-allocate the local buffers at a quarter of the size, zero A, B, C,
 *      and repeat steps 3-4 on the patch [N/4, 3*N/4-1] in both dimensions.
 *   6. Free the buffers and destroy the arrays.
 *
 * NOTE: both result-checking sections are wrapped in "#if 0", so as compiled
 * this routine performs the multiplications but does not verify the results.
 */
void test(int data_type) {
  int me=GA_Nodeid();
  int nproc = GA_Nnodes();
  int g_a, g_b, g_c;
  int ndim = 2;
  int dims[2]={N,N};
  int lo[2]={0,0};
  int hi[2]={N-1,N-1};
  int block_size[2]={NB,NB-1};
  int proc_grid[2];
  int i,j,l,k,m,n, ld;

  /* Per-type scalars; alpha/beta below point at the pair matching data_type. */
  double alpha_dbl = 1.0, beta_dbl = 0.0;
  double dzero = 0.0;
  double ddiff;

  float alpha_flt = 1.0, beta_flt = 0.0;
  float fzero = 0.0;
  float fdiff;
  float ftmp;
  double dtmp;
  SingleComplex ctmp;
  DoubleComplex ztmp;

  DoubleComplex alpha_dcpl = {1.0, 0.0} , beta_dcpl = {0.0, 0.0};
  DoubleComplex zzero = {0.0,0.0};
  DoubleComplex zdiff;

  SingleComplex alpha_scpl = {1.0, 0.0} , beta_scpl = {0.0, 0.0};
  SingleComplex czero = {0.0,0.0};
  SingleComplex cdiff;

  void *alpha=NULL, *beta=NULL;
  void *abuf=NULL, *bbuf=NULL, *cbuf=NULL, *c_ptr=NULL;

  /* Select the scalar pair and allocate full N x N local buffers. */
  switch (data_type) {
    case C_FLOAT:
      alpha = (void *)&alpha_flt;
      beta = (void *)&beta_flt;
      abuf = (void*)malloc(N*N*sizeof(float));
      bbuf = (void*)malloc(N*N*sizeof(float));
      cbuf = (void*)malloc(N*N*sizeof(float));
      if(me==0) printf("Single Precision: Testing GA_Sgemm,NGA_Matmul_patch for %d-Dimension", ndim);
      break;
    case C_DBL:
      alpha = (void *)&alpha_dbl;
      beta = (void *)&beta_dbl;
      abuf = (void*)malloc(N*N*sizeof(double));
      bbuf = (void*)malloc(N*N*sizeof(double));
      cbuf = (void*)malloc(N*N*sizeof(double));
      if(me==0) printf("Double Precision: Testing GA_Dgemm,NGA_Matmul_patch for %d-Dimension", ndim);
      break;
    case C_DCPL:
      alpha = (void *)&alpha_dcpl;
      beta = (void *)&beta_dcpl;
      abuf = (void*)malloc(N*N*sizeof(DoubleComplex));
      bbuf = (void*)malloc(N*N*sizeof(DoubleComplex));
      cbuf = (void*)malloc(N*N*sizeof(DoubleComplex));
      if(me==0) printf("Double Complex: Testing GA_Zgemm,NGA_Matmul_patch for %d-Dimension", ndim);
      break;
    case C_SCPL:
      alpha = (void *)&alpha_scpl;
      beta = (void *)&beta_scpl;
      abuf = (void*)malloc(N*N*sizeof(SingleComplex));
      bbuf = (void*)malloc(N*N*sizeof(SingleComplex));
      cbuf = (void*)malloc(N*N*sizeof(SingleComplex));
      if(me==0) printf("Single Complex: Testing GA_Cgemm,NGA_Matmul_patch for %d-Dimension", ndim);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }

  if (me==0) printf("\nCreate A, B, C\n");
  /* Create A; exactly one of the following macros is expected to be defined,
   * otherwise g_a is never assigned. */
#ifdef USE_REGULAR
  g_a = NGA_Create(data_type, ndim, dims, "array A", NULL);
#endif
#ifdef USE_SIMPLE_CYCLIC
  g_a = NGA_Create_handle();
  NGA_Set_data(g_a,ndim,dims,data_type);
  NGA_Set_array_name(g_a,"array A");
  NGA_Set_block_cyclic(g_a,block_size);
  if (!GA_Allocate(g_a)) {
    GA_Error("Failed: create: g_a",40);
  }
#endif
#ifdef USE_SCALAPACK
  g_a = NGA_Create_handle();
  NGA_Set_data(g_a,ndim,dims,data_type);
  NGA_Set_array_name(g_a,"array A");
  grid_factor(nproc,&i,&j);
  proc_grid[0] = i;
  proc_grid[1] = j;
  NGA_Set_block_cyclic_proc_grid(g_a,block_size,proc_grid);
  if (!GA_Allocate(g_a)) {
    GA_Error("Failed: create: g_a",40);
  }
#endif
#ifdef USE_TILED
  g_a = NGA_Create_handle();
  NGA_Set_data(g_a,ndim,dims,data_type);
  NGA_Set_array_name(g_a,"array A");
  grid_factor(nproc,&i,&j);
  proc_grid[0] = i;
  proc_grid[1] = j;
  NGA_Set_tiled_proc_grid(g_a,block_size,proc_grid);
  if (!GA_Allocate(g_a)) {
    GA_Error("Failed: create: g_a",40);
  }
#endif
  /* B and C are created as duplicates of A. */
  g_b = GA_Duplicate(g_a, "array B");
  g_c = GA_Duplicate(g_a, "array C");
  if(!g_a || !g_b || !g_c) GA_Error("Create failed: a, b or c",1);

  /* Leading dimension of the full-size local buffers. */
  ld = N;
  if (me==0) printf("\nInitialize A\n");
  /* Set up matrix A: element (i,j) = i*N+j, written by process 0 only */
  if (me == 0) {
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)abuf)[i*N+j] = (float)(i*N+j);
            break;
          case C_DBL:
            ((double*)abuf)[i*N+j] = (double)(i*N+j);
            break;
          case C_DCPL:
            ((DoubleComplex*)abuf)[i*N+j].real = (double)(i*N+j);
            ((DoubleComplex*)abuf)[i*N+j].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)abuf)[i*N+j].real = (float)(i*N+j);
            ((SingleComplex*)abuf)[i*N+j].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_a,lo,hi,abuf,&ld);
  }
  GA_Sync();

  if (me==0) printf("\nInitialize B\n");
  /* Set up matrix B: element (i,j) = j*N+i, written by process 0 only */
  if (me == 0) {
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)bbuf)[i*N+j] = (float)(j*N+i);
            break;
          case C_DBL:
            ((double*)bbuf)[i*N+j] = (double)(j*N+i);
            break;
          case C_DCPL:
            ((DoubleComplex*)bbuf)[i*N+j].real = (double)(j*N+i);
            ((DoubleComplex*)bbuf)[i*N+j].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)bbuf)[i*N+j].real = (float)(j*N+i);
            ((SingleComplex*)bbuf)[i*N+j].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_b,lo,hi,bbuf,&ld);
  }
  GA_Sync();

  if (me==0) printf("\nPerform matrix multiply\n");
  /* Full-array multiply; the typed scalars are passed directly here. */
  switch (data_type) {
    case C_FLOAT:
      NGA_Matmul_patch('N','N',&alpha_flt,&beta_flt,g_a,lo,hi,
          g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DBL:
      NGA_Matmul_patch('N','N',&alpha_dbl,&beta_dbl,g_a,lo,hi,
          g_b,lo,hi,g_c,lo,hi);
      break;
    case C_SCPL:
      NGA_Matmul_patch('N','N',&alpha_scpl,&beta_scpl,g_a,lo,hi,
          g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DCPL:
      NGA_Matmul_patch('N','N',&alpha_dcpl,&beta_dcpl,g_a,lo,hi,
          g_b,lo,hi,g_c,lo,hi);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  GA_Sync();

  /* Disabled: verification of the full multiply.  It recomputes the product
   * by hand into cbuf, compares it element-wise with g_c, then compares
   * norms via GA_Add and the GA_*dot routines. */
#if 0
  if (me==0) printf("\nCheck answer\n");
  /*
     GA_Print(g_a);
     if (me == 0) printf("\n\n\n\n");
     GA_Print(g_b);
     if (me == 0) printf("\n\n\n\n");
     GA_Print(g_c);
     */

  /* Check answer */
  NGA_Get(g_a,lo,hi,abuf,&ld);
  NGA_Get(g_b,lo,hi,bbuf,&ld);
  for (i=0; i<N; i++) {
    for (j=0; j<N; j++) {
      switch (data_type) {
        case C_FLOAT:
          ((float*)cbuf)[i*N+j] = fzero;
          break;
        case C_DBL:
          ((double*)cbuf)[i*N+j] = dzero;
          break;
        case C_DCPL:
          ((DoubleComplex*)cbuf)[i*N+j] = zzero;
          break;
        case C_SCPL:
          ((SingleComplex*)cbuf)[i*N+j] = czero;
          break;
        default:
          GA_Error("wrong data type", data_type);
      }
      for (k=0; k<N; k++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)cbuf)[i*N+j] += ((float*)abuf)[i*N+k]
              *((float*)bbuf)[k*N+j];
            break;
          case C_DBL:
            ((double*)cbuf)[i*N+j] += ((double*)abuf)[i*N+k]
              *((double*)bbuf)[k*N+j];
            break;
          case C_DCPL:
            ((DoubleComplex*)cbuf)[i*N+j].real +=
              (((DoubleComplex*)abuf)[i*N+k].real
               *((DoubleComplex*)bbuf)[k*N+j].real
               -(((DoubleComplex*)abuf)[i*N+k].imag
                 *((DoubleComplex*)bbuf)[k*N+j].imag));
            ((DoubleComplex*)cbuf)[i*N+j].imag +=
              (((DoubleComplex*)abuf)[i*N+k].real
               *((DoubleComplex*)bbuf)[k*N+j].imag
               +(((DoubleComplex*)abuf)[i*N+k].imag
                 *((DoubleComplex*)bbuf)[k*N+j].real));
            break;
          case C_SCPL:
            ((SingleComplex*)cbuf)[i*N+j].real +=
              (((SingleComplex*)abuf)[i*N+k].real
               *((SingleComplex*)bbuf)[k*N+j].real
               -(((SingleComplex*)abuf)[i*N+k].imag
                 *((SingleComplex*)bbuf)[k*N+j].imag));
            ((SingleComplex*)cbuf)[i*N+j].imag +=
              (((SingleComplex*)abuf)[i*N+k].real
               *((SingleComplex*)bbuf)[k*N+j].imag
               +(((SingleComplex*)abuf)[i*N+k].imag
                 *((SingleComplex*)bbuf)[k*N+j].real));
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
  }
  GA_Sync();
  if (me == 0) {
    NGA_Get(g_c,lo,hi,abuf,&ld);
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        switch (data_type) {
          case C_FLOAT:
            fdiff = ((float*)abuf)[i*N+j]-((float*)cbuf)[i*N+j];
            if (((float*)abuf)[i*N+j] != 0.0) {
              fdiff /= ((float*)abuf)[i*N+j];
            }
            if (fabs(fdiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: %f Expected: %f\n",me,i,j,
                  ((float*)abuf)[i*N+j],((float*)cbuf)[i*N+j]);
            }
            break;
          case C_DBL:
            ddiff = ((double*)abuf)[i*N+j]-((double*)cbuf)[i*N+j];
            if (((double*)abuf)[i*N+j] != 0.0) {
              ddiff /= ((double*)abuf)[i*N+j];
            }
            if (fabs(ddiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: %f Expected: %f\n",me,i,j,
                  ((double*)abuf)[i*N+j],((double*)cbuf)[i*N+j]);
            }
            break;
          case C_DCPL:
            zdiff.real = ((DoubleComplex*)abuf)[i*N+j].real
              -((DoubleComplex*)cbuf)[i*N+j].real;
            zdiff.imag = ((DoubleComplex*)abuf)[i*N+j].imag
              -((DoubleComplex*)cbuf)[i*N+j].imag;
            if (((DoubleComplex*)abuf)[i*N+j].real != 0.0 ||
                ((DoubleComplex*)abuf)[i*N+j].imag != 0.0) {
              ztmp = ((DoubleComplex*)abuf)[i*N+j];
              ddiff = sqrt((zdiff.real*zdiff.real+zdiff.imag*zdiff.imag)
                  /(ztmp.real*ztmp.real+ztmp.imag*ztmp.imag));
            } else {
              ddiff = sqrt(zdiff.real*zdiff.real+zdiff.imag*zdiff.imag);
            }
            if (fabs(ddiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: (%f,%f) Expected: (%f,%f)\n",me,i,j,
                  ((DoubleComplex*)abuf)[i*N+j].real,
                  ((DoubleComplex*)abuf)[i*N+j].imag,
                  ((DoubleComplex*)cbuf)[i*N+j].real,
                  ((DoubleComplex*)cbuf)[i*N+j].imag);
            }
            break;
          case C_SCPL:
            cdiff.real = ((SingleComplex*)abuf)[i*N+j].real
              -((SingleComplex*)cbuf)[i*N+j].real;
            cdiff.imag = ((SingleComplex*)abuf)[i*N+j].imag
              -((SingleComplex*)cbuf)[i*N+j].imag;
            if (((SingleComplex*)abuf)[i*N+j].real != 0.0 ||
                ((SingleComplex*)abuf)[i*N+j].imag != 0.0) {
              ctmp = ((SingleComplex*)abuf)[i*N+j];
              fdiff = sqrt((cdiff.real*cdiff.real+cdiff.imag*cdiff.imag)
                  /(ctmp.real*ctmp.real+ctmp.imag*ctmp.imag));
            } else {
              fdiff = sqrt(cdiff.real*cdiff.real+cdiff.imag*cdiff.imag);
            }
            if (fabs(fdiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: (%f,%f) Expected: (%f,%f)\n",me,i,j,
                  ((SingleComplex*)abuf)[i*N+j].real,
                  ((SingleComplex*)abuf)[i*N+j].imag,
                  ((SingleComplex*)cbuf)[i*N+j].real,
                  ((SingleComplex*)cbuf)[i*N+j].imag);
            }
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
  }
  GA_Sync();

  /* copy cbuf back to g_a */
  if (me == 0) {
    NGA_Put(g_a,lo,hi,cbuf,&ld);
  }
  GA_Sync();

  /* Get norm of g_a */
  switch (data_type) {
    case C_FLOAT:
      ftmp = GA_Fdot(g_a,g_a);
      break;
    case C_DBL:
      dtmp = GA_Ddot(g_a,g_a);
      break;
    case C_DCPL:
      ztmp = GA_Zdot(g_a,g_a);
      break;
    case C_SCPL:
      ctmp = GA_Cdot(g_a,g_a);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  /* subtract C from A and put the results in B */
  beta_flt = -1.0;
  beta_dbl = -1.0;
  beta_scpl.real = -1.0;
  beta_dcpl.real = -1.0;
  GA_Zero(g_b);
  GA_Add(alpha,g_a,beta,g_c,g_b);
  /* evaluate the norm of the difference between the two matrices */
  switch (data_type) {
    case C_FLOAT:
      fdiff = GA_Fdot(g_b, g_b);
      if (ftmp != 0.0) {
        fdiff /= ftmp;
      }
      if(fabs(fdiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(fdiff), TOLERANCE);
        GA_Error("GA_Sgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Sgemm OK\n\n");
      }
      break;
    case C_DBL:
      ddiff = GA_Ddot(g_b, g_b);
      if (dtmp != 0.0) {
        ddiff /= dtmp;
      }
      if(fabs(ddiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(ddiff), TOLERANCE);
        GA_Error("GA_Dgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Dgemm OK\n\n");
      }
      break;
    case C_DCPL:
      zdiff = GA_Zdot(g_b, g_b);
      if (ztmp.real != 0.0 || ztmp.imag != 0.0) {
        ddiff = sqrt((zdiff.real*zdiff.real+zdiff.imag*zdiff.imag)
            /(ztmp.real*ztmp.real+ztmp.imag*ztmp.imag));
      } else {
        ddiff = sqrt(zdiff.real*zdiff.real+zdiff.imag*zdiff.imag);
      }
      if(fabs(ddiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(zdiff.real), TOLERANCE);
        GA_Error("GA_Zgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Zgemm OK\n\n");
      }
      break;
    case C_SCPL:
      cdiff = GA_Cdot(g_b, g_b);
      if (ctmp.real != 0.0 || ctmp.imag != 0.0) {
        fdiff = sqrt((cdiff.real*cdiff.real+cdiff.imag*cdiff.imag)
            /(ctmp.real*ctmp.real+ctmp.imag*ctmp.imag));
      } else {
        fdiff = sqrt(cdiff.real*cdiff.real+cdiff.imag*cdiff.imag);
      }
      if(fabs(fdiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(cdiff.real), TOLERANCE);
        GA_Error("GA_Cgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Cgemm OK\n\n");
      }
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
#endif

  /* Replace the full-size buffers with quarter-size ones (N*N/4 elements,
   * i.e. an N/2 x N/2 block) for the patch test. */
  free(abuf);
  free(bbuf);
  free(cbuf);

  switch (data_type) {
    case C_FLOAT:
      abuf = (void*)malloc(N*N*sizeof(float)/4);
      bbuf = (void*)malloc(N*N*sizeof(float)/4);
      cbuf = (void*)malloc(N*N*sizeof(float)/4);
      break;
    case C_DBL:
      abuf = (void*)malloc(N*N*sizeof(double)/4);
      bbuf = (void*)malloc(N*N*sizeof(double)/4);
      cbuf = (void*)malloc(N*N*sizeof(double)/4);
      break;
    case C_DCPL:
      abuf = (void*)malloc(N*N*sizeof(DoubleComplex)/4);
      bbuf = (void*)malloc(N*N*sizeof(DoubleComplex)/4);
      cbuf = (void*)malloc(N*N*sizeof(DoubleComplex)/4);
      break;
    case C_SCPL:
      abuf = (void*)malloc(N*N*sizeof(SingleComplex)/4);
      bbuf = (void*)malloc(N*N*sizeof(SingleComplex)/4);
      cbuf = (void*)malloc(N*N*sizeof(SingleComplex)/4);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }

  /* Test multiply on a fraction of matrix.
     Start by reinitializing
   * A and B */
  GA_Zero(g_a);
  GA_Zero(g_b);
  GA_Zero(g_c);

  if (me==0) printf("\nTest patch multiply\n");

  /* Patch [N/4, 3*N/4-1] in both dimensions; the local buffers now have a
   * leading dimension of N/2. */
  lo[0] = N/4;
  lo[1] = N/4;
  hi[0] = 3*N/4-1;
  hi[1] = 3*N/4-1;
  ld = N/2;

  /* Set up matrix A */
  if (me==0) printf("\nInitialize A\n");
  if (me == 0) {
    for (i=N/4; i<3*N/4; i++) {
      for (j=N/4; j<3*N/4; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)abuf)[(i-N/4)*N/2+(j-N/4)] = (float)(i*N+j);
            break;
          case C_DBL:
            ((double*)abuf)[(i-N/4)*N/2+(j-N/4)] = (double)(i*N+j);
            break;
          case C_DCPL:
            ((DoubleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].real = (double)(i*N+j);
            ((DoubleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].real = (float)(i*N+j);
            ((SingleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_a,lo,hi,abuf,&ld);
  }
  GA_Sync();

  if (me==0) printf("\nInitialize B\n");
  /* Set up matrix B */
  if (me == 0) {
    for (i=N/4; i<3*N/4; i++) {
      for (j=N/4; j<3*N/4; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)bbuf)[(i-N/4)*N/2+(j-N/4)] = (float)(j*N+i);
            break;
          case C_DBL:
            ((double*)bbuf)[(i-N/4)*N/2+(j-N/4)] = (double)(j*N+i);
            break;
          case C_DCPL:
            ((DoubleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].real = (double)(j*N+i);
            ((DoubleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].real = (float)(j*N+i);
            ((SingleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_b,lo,hi,bbuf,&ld);
  }
  GA_Sync();

  /* Reset beta to zero for the patch multiply. */
  beta_flt = 0.0;
  beta_dbl = 0.0;
  beta_scpl.real = 0.0;
  beta_dcpl.real = 0.0;

  if (me==0) printf("\nPerform matrix multiply on sub-blocks\n");
  switch (data_type) {
    case C_FLOAT:
      NGA_Matmul_patch('N','N',&alpha_flt,&beta_flt,g_a,lo,hi,
          g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DBL:
      NGA_Matmul_patch('N','N',&alpha_dbl,&beta_dbl,g_a,lo,hi,
          g_b,lo,hi,g_c,lo,hi);
      break;
    case C_SCPL:
      NGA_Matmul_patch('N','N',&alpha_scpl,&beta_scpl,g_a,lo,hi,
          g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DCPL:
      NGA_Matmul_patch('N','N',&alpha_dcpl,&beta_dcpl,g_a,lo,hi,
          g_b,lo,hi,g_c,lo,hi);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  GA_Sync();

  /* Disabled: verification of the patch multiply (also guarded by "if (0)"
   * inside the "#if 0" region). */
#if 0
  if (0) {
    /*
       if (data_type != C_SCPL && data_type != C_DCPL) {
       */

    if (me==0) printf("\nCheck answer\n");

    /* Multiply buffers by hand */
    if (me == 0) {
      for (i=0; i<N/2; i++) {
        for (j=0; j<N/2; j++) {
          switch (data_type) {
            case C_FLOAT:
              ((float*)cbuf)[i*N/2+j] = fzero;
              break;
            case C_DBL:
              ((double*)cbuf)[i*N/2+j] = dzero;
              break;
            case C_DCPL:
              ((DoubleComplex*)cbuf)[i*N/2+j] = zzero;
              break;
            case C_SCPL:
              ((SingleComplex*)cbuf)[i*N/2+j] = czero;
              break;
            default:
              GA_Error("wrong data type", data_type);
          }
          for (k=0; k<N/2; k++) {
            switch (data_type) {
              case C_FLOAT:
                ((float*)cbuf)[i*N/2+j] += ((float*)abuf)[i*N/2+k]
                  *((float*)bbuf)[k*N/2+j];
                break;
              case C_DBL:
                ((double*)cbuf)[i*N/2+j] += ((double*)abuf)[i*N/2+k]
                  *((double*)bbuf)[k*N/2+j];
                break;
              case C_DCPL:
                ((DoubleComplex*)cbuf)[i*N/2+j].real +=
                  (((DoubleComplex*)abuf)[i*N/2+k].real
                   *((DoubleComplex*)bbuf)[k*N/2+j].real
                   -(((DoubleComplex*)abuf)[i*N/2+k].imag
                     *((DoubleComplex*)bbuf)[k*N/2+j].imag));
                ((DoubleComplex*)cbuf)[i*N/2+j].imag +=
                  (((DoubleComplex*)abuf)[i*N/2+k].real
                   *((DoubleComplex*)bbuf)[k*N/2+j].imag
                   +(((DoubleComplex*)abuf)[i*N/2+k].imag
                     *((DoubleComplex*)bbuf)[k*N/2+j].real));
                break;
              case C_SCPL:
                ((SingleComplex*)cbuf)[i*N/2+j].real +=
                  (((SingleComplex*)abuf)[i*N/2+k].real
                   *((SingleComplex*)bbuf)[k*N/2+j].real
                   -(((SingleComplex*)abuf)[i*N/2+k].imag
                     *((SingleComplex*)bbuf)[k*N/2+j].imag));
                ((SingleComplex*)cbuf)[i*N/2+j].imag +=
                  (((SingleComplex*)abuf)[i*N/2+k].real
                   *((SingleComplex*)bbuf)[k*N/2+j].imag
                   +(((SingleComplex*)abuf)[i*N/2+k].imag
                     *((SingleComplex*)bbuf)[k*N/2+j].real));
                break;
              default:
                GA_Error("wrong data type", data_type);
            }
          }
        }
      }
      NGA_Put(g_a,lo,hi,cbuf,&ld);
    }
    if (me == 0) printf("\n\n\n\n");

    /* Get norm of g_a */
    switch (data_type) {
      case C_FLOAT:
        ftmp = NGA_Fdot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
        break;
      case C_DBL:
        dtmp = NGA_Ddot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
        break;
      case C_DCPL:
        ztmp = NGA_Zdot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
        break;
      case C_SCPL:
        ctmp = NGA_Cdot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
        break;
      default:
        GA_Error("wrong data type", data_type);
    }
    /* subtract C from A and put the results in B */
    beta_flt = -1.0;
    beta_dbl = -1.0;
    beta_scpl.real = -1.0;
    beta_dcpl.real = -1.0;
    NGA_Zero_patch(g_b,lo,hi);
    NGA_Add_patch(alpha,g_a,lo,hi,beta,g_c,lo,hi,g_b,lo,hi);
    /* evaluate the norm of the difference between the two matrices */
    switch (data_type) {
      case C_FLOAT:
        fdiff = NGA_Fdot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
        if (ftmp != 0.0) {
          fdiff /= ftmp;
        }
        if(fabs(fdiff) > TOLERANCE) {
          printf("\nabs(result) = %f > %f\n", fabsf(fdiff), TOLERANCE);
          GA_Error("GA_Sgemm Failed", 1);
        } else if (me == 0) {
          printf("\nGA_Sgemm OK\n\n");
        }
        break;
      case C_DBL:
        ddiff = NGA_Ddot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
        if (dtmp != 0.0) {
          ddiff /= dtmp;
        }
        if(fabs(ddiff) > TOLERANCE) {
          printf("\nabs(result) = %f > %f\n", fabsf(ddiff), TOLERANCE);
          GA_Error("GA_Dgemm Failed", 1);
        } else if (me == 0) {
          printf("\nGA_Dgemm OK\n\n");
        }
        break;
      case C_DCPL:
        zdiff = NGA_Zdot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
        if (ztmp.real != 0.0 || ztmp.imag != 0.0) {
          ddiff = sqrt((zdiff.real*zdiff.real+zdiff.imag*zdiff.imag)
              /(ztmp.real*ztmp.real+ztmp.imag*ztmp.imag));
        } else {
          ddiff = sqrt(zdiff.real*zdiff.real+zdiff.imag*zdiff.imag);
        }
        if(fabs(ddiff) > TOLERANCE) {
          printf("\nabs(result) = %f > %f\n", fabsf(zdiff.real), TOLERANCE);
          GA_Error("GA_Zgemm Failed", 1);
        } else if (me == 0) {
          printf("\nGA_Zgemm OK\n\n");
        }
        break;
      case C_SCPL:
        cdiff = NGA_Cdot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
        if (ctmp.real != 0.0 || ctmp.imag != 0.0) {
          fdiff = sqrt((cdiff.real*cdiff.real+cdiff.imag*cdiff.imag)
              /(ctmp.real*ctmp.real+ctmp.imag*ctmp.imag));
        } else {
          fdiff = sqrt(cdiff.real*cdiff.real+cdiff.imag*cdiff.imag);
        }
        if(fabs(fdiff) > TOLERANCE) {
          printf("\nabs(result) = %f > %f\n", fabsf(cdiff.real), TOLERANCE);
          GA_Error("GA_Cgemm Failed", 1);
        } else if (me == 0) {
          printf("\nGA_Cgemm OK\n\n");
        }
        break;
      default:
        GA_Error("wrong data type", data_type);
    }
  }
#endif

  /* Release local buffers and the three global arrays. */
  free(abuf);
  free(bbuf);
  free(cbuf);

  GA_Destroy(g_a);
  GA_Destroy(g_b);
  GA_Destroy(g_c);
}
main(int argc, char **argv) { int rank, nprocs, i, j; int g_A, g_B, g_C, local_C[DIM][DIM], dims[DIM]={5,5}, val1=5, val2=4, alpha=3, beta=2, ld=5; int alo[DIM]={2,2}, ahi[DIM]={3,3}, blo[DIM]={2,2}, bhi[DIM]={3,3}, clo[DIM]={1,1}, chi[DIM]={2,2}; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MA_init(C_INT, 1000, 1000); GA_Initialize(); g_A = NGA_Create(C_INT, DIM, dims, "array_A", NULL); g_B = GA_Duplicate(g_A, "array_B"); g_C = GA_Duplicate(g_A, "array_C"); GA_Fill(g_A, &val1); GA_Fill(g_B, &val2); GA_Zero(g_C); NGA_Add_patch(&alpha, g_A, clo, chi, &beta, g_B, blo, bhi, g_C, clo, chi); GA_Sync(); GA_Print(g_A); GA_Print(g_B); GA_Print(g_C); NGA_Get(g_C, clo, chi, local_C, &ld); //printf("check 1 \n"); for(i=0; i<DIM; i++) { for(j=0; j<DIM; j++)printf("%d ", local_C[i][j]); printf("\n"); } if(rank == 0) { printf("check 2\n"); for(i=0; i<DIM; i++) { for(j=0; j<DIM; j++) if(local_C[i][j]!=(alpha*val1)+(beta*val2)) printf("GA Error : \n"); } } if(rank==0) GA_PRINT_MSG(); GA_Sync(); /* GA_Destroy(g_A); GA_Destroy(g_B); GA_Destroy(g_C); */ //******************************************************************* /* what would be the possible reason for GA_destroy to get failed .., * solve this before consolidate the whole */ GA_Terminate(); MPI_Finalize(); }
void do_work() { int ZERO=0; /* useful constants */ int g_a, g_b; int n=N, ndim=2,type=MT_F_DBL,dims[2]={N,N},coord[2]; int me=GA_Nodeid(), nproc=GA_Nnodes(); int row, i, j; int lo[2], hi[2]; /* Note: on all current platforms DoublePrecision = double */ DoublePrecision buf[N], *max_row=NULL; MPI_Comm WORLD_COMM; MPI_Comm ROW_COMM; int ilo,ihi, jlo,jhi, ld, prow, pcol; int root=0, grp_me=-1; WORLD_COMM = GA_MPI_Comm_pgroup_default(); if(me==0)printf("Creating matrix A\n"); dims[0]=n; dims[1]=n; g_a = NGA_Create(type, ndim, dims, "A", NULL); if(!g_a) GA_Error("create failed: A",n); if(me==0)printf("OK\n"); if(me==0)printf("Creating matrix B\n"); dims[0]=n; g_b = NGA_Create(type, 1, dims, "B", NULL); if(!g_b) GA_Error("create failed: B",n); if(me==0)printf("OK\n"); GA_Zero(g_a); /* zero the matrix */ if(me==0)printf("Initializing matrix A\n"); /* fill in matrix A with values: A(i,j) = (i+j) */ for(row=me; row<n; row+= nproc){ /** * simple load balancing: * each process works on a different row in MIMD style */ for(i=0; i<n; i++) buf[i]=(DoublePrecision)(i+row+1); lo[0]=hi[0]=row; lo[1]=ZERO; hi[1]=n-1; NGA_Put(g_a, lo, hi, buf, &n); } /* GA_print(&g_a);*/ NGA_Distribution(g_a, me, lo, hi); ilo=lo[0]; ihi=hi[0]; jlo=lo[1]; jhi=hi[1]; GA_Sync(); if(ihi-ilo+1 >0){ max_row=(DoublePrecision*)malloc(sizeof(DoublePrecision)*(ihi-ilo+1)); if (!max_row) GA_Error("malloc 3 failed",(ihi-ilo+1)); for (i=0; i<(ihi-ilo+1); i++) { max_row[i] = 0.0; } } NGA_Proc_topology(g_a, me, coord); /* block coordinates */ prow = coord[0]; pcol = coord[1]; if(me==0)printf("Splitting comm according to distribution of A\n"); /* GA on SP1 requires synchronization before & after message-passing !!*/ GA_Sync(); if(me==0)printf("Computing max row elements\n"); /* create communicator for processes that 'own' A[:,jlo:jhi] */ MPI_Barrier(WORLD_COMM); if(pcol < 0 || prow <0) MPI_Comm_split(WORLD_COMM,MPI_UNDEFINED,MPI_UNDEFINED, &ROW_COMM); else MPI_Comm_split(WORLD_COMM, (int)pcol, (int)prow, &ROW_COMM); 
if(ROW_COMM != MPI_COMM_NULL){ double *ptr; MPI_Comm_rank(ROW_COMM, &grp_me); /* each process computes max elements in the block it 'owns' */ lo[0]=ilo; hi[0]=ihi; lo[1]=jlo; hi[1]=jhi; NGA_Access(g_a, lo, hi, &ptr, &ld); for(i=0; i<ihi-ilo+1; i++){ for(j=0; j<jhi-jlo+1; j++) if(max_row[i] < ptr[i*ld + j]){ max_row[i] = ptr[i*ld + j]; } } MPI_Reduce(max_row, buf, ihi-ilo+1, MPI_DOUBLE, MPI_MAX, root, ROW_COMM); }else fprintf(stderr,"process %d not participating\n",me); GA_Sync(); /* processes with rank=root in ROW_COMM put results into g_b */ ld = 1; if(grp_me == root) { lo[0]=ilo; hi[0]=ihi; NGA_Put(g_b, lo, hi, buf, &ld); } GA_Sync(); if(me==0)printf("Checking the result\n"); if(me==0){ lo[0]=ZERO; hi[0]=n-1; NGA_Get(g_b, lo, hi, buf, &n); for(i=0; i< n; i++)if(buf[i] != (double)n+i){ fprintf(stderr,"error:%d max=%f should be:%d\n",i,buf[i],n+i); GA_Error("terminating...",1); } } if(me==0)printf("OK\n"); GA_Destroy(g_a); GA_Destroy(g_b); }
int main(int argc, char **argv) { int me; int nproc; int status; int g_a; int dims[NDIM]; int chunk[NDIM]; int pg_world; size_t num = 10; double *p1 = NULL; double *p2 = NULL; size_t i; int num_mutex; int lo[1]; int hi[1]; int ld[1]={1}; MPI_Comm comm; MP_INIT(argc,argv); GA_INIT(argc,argv); me = GA_Nodeid(); nproc = GA_Nnodes(); comm = GA_MPI_Comm_pgroup_default(); printf("%d: Hello world!\n",me); if (me==0) printf("%d: GA_Initialize\n",me); /*if (me==0) printf("%d: ARMCI_Init\n",me);*/ /*ARMCI_Init();*/ /*if (me==0) printf("%d: MA_Init\n",me);*/ /*MA_init(MT_DBL, 8*1024*1024, 2*1024*1024);*/ if (me==0) printf("%d: GA_Create_handle\n",me); g_a = GA_Create_handle(); if (me==0) printf("%d: GA_Set_array_name\n",me); GA_Set_array_name(g_a,"test array A"); dims[0] = 30; if (me==0) printf("%d: GA_Set_data\n",me); GA_Set_data(g_a,NDIM,dims,MT_DBL); chunk[0] = -1; if (me==0) printf("%d: GA_Set_chunk\n",me); GA_Set_chunk(g_a,chunk); if (me==0) printf("%d: GA_Pgroup_get_world\n",me); pg_world = GA_Pgroup_get_world(); if (me==0) printf("%d: GA_Set_pgroup\n",me); GA_Set_pgroup(g_a,pg_world); if (me==0) printf("%d: GA_Allocate\n",me); status = GA_Allocate(g_a); if(0 == status) MPI_Abort(comm,100); if (me==0) printf("%d: GA_Zero\n",me); GA_Zero(g_a); if (me==0) printf("%d: GA_Sync\n",me); GA_Sync(); num = 10; p1 = malloc(num*sizeof(double)); /*double* p1 = ARMCI_Malloc_local(num*sizeof(double));*/ if (p1==NULL) MPI_Abort(comm,1000); p2 = malloc(num*sizeof(double)); /*double* p2 = ARMCI_Malloc_local(num*sizeof(double));*/ if (p2==NULL) MPI_Abort(comm,2000); for ( i=0 ; i<num ; i++ ) p1[i] = 7.0; for ( i=0 ; i<num ; i++ ) p2[i] = 3.0; num_mutex = 17; status = GA_Create_mutexes(num_mutex); if (me==0) printf("%d: GA_Create_mutexes = %d\n",me,status); /***************************************************************/ if (me==0) { printf("%d: before GA_Lock\n",me); GA_Lock(0); lo[0] = 0; hi[0] = num-1; GA_Init_fence(); NGA_Put(g_a,lo,hi,p1,ld); GA_Fence(); GA_Unlock(0); printf("%d: 
after GA_Unlock\n",me); } GA_Print(g_a); if (me==1) { printf("%d: before GA_Lock\n",me); GA_Lock(0); lo[0] = 0; hi[0] = num-1; GA_Init_fence(); NGA_Get(g_a,lo,hi,p2,ld); GA_Fence(); GA_Unlock(0); printf("%d: after GA_Unlock\n",me); for ( i=0 ; i<num ; i++ ) printf("p2[%2lu] = %20.10f\n", (long unsigned)i,p2[i]); } /***************************************************************/ status = GA_Destroy_mutexes(); if (me==0) printf("%d: GA_Destroy_mutexes = %d\n",me,status); /*ARMCI_Free(p2);*/ /*ARMCI_Free(p1);*/ free(p2); free(p1); if (me==0) printf("%d: GA_Destroy\n",me); GA_Destroy(g_a); /*if (me==0) printf("%d: ARMCI_Finalize\n",me);*/ /*ARMCI_Finalize();*/ if (me==0) printf("%d: GA_Terminate\n",me); GA_Terminate(); if (me==0) printf("%d: MPI_Finalize\n",me); MPI_Finalize(); return(0); }
/*
 * Two checks on an N x N double-precision global array:
 *   1. Fill A row by row with sin(col + 0.1*(row+1)), symmetrize it, then
 *      form B = A - A' and print dot(B, B) as the symmetry error.
 *   2. Zero A, have every process accumulate the vector 0..N-1 into row N/2,
 *      and let process 0 verify each element equals nproc * column.
 */
void do_work()
{
  int ONE=1 ;   /* useful constants */
  int g_a, g_b;
  int n=N, type=MT_F_DBL;
  int me=GA_Nodeid(), nproc=GA_Nnodes();
  int col, r;
  int dims[2]={N,N};
  int lo[2], hi[2], ld;
  int root = (me == 0);

  /* Note: on all current platforms DoublePrecision == double */
  double buf[N], err, alpha, beta;

  if (root) { printf("Creating matrix A\n"); }
  g_a = NGA_Create(type, 2, dims, "A", NULL);
  if (g_a == 0) { GA_Error("create failed: A",n); }
  if (root) { printf("OK\n"); }

  if (root) { printf("Creating matrix B\n"); }
  /* B takes its dimensions and distribution from A. */
  g_b = GA_Duplicate(g_a, "B");
  if (g_b == 0) { GA_Error("duplicate failed",n); }
  if (root) { printf("OK\n"); }

  GA_Zero(g_a);

  if (root) { printf("Initializing matrix A\n"); }
  /* Rows are dealt out round-robin: process `me` writes rows me, me+nproc, ... */
  lo[1] = 0;
  hi[1] = n-1;
  r = me;
  while (r < n) {
    lo[0] = r;
    hi[0] = r;
    for (col = 0; col < n; col++) {
      buf[col] = sin((double)col + 0.1*(r+1));
    }
    NGA_Put(g_a, lo, hi, buf, &n);
    r += nproc;
  }

  if (root) { printf("Symmetrizing matrix A\n"); }
  GA_Symmetrize(g_a);   /* A = 0.5*(A+A') */

  /* Symmetry check: B = A', then B = A - B, then err = dot(B, B). */
  if (root) { printf("Checking if matrix A is symmetric\n"); }
  GA_Transpose(g_a, g_b);
  alpha = 1.;
  beta = -1.;
  GA_Add(&alpha, g_a, &beta, g_b, g_b);
  err = GA_Ddot(g_b, g_b);

  if (root) { printf("Error=%f\n",(double)err); }

  if (root) { printf("\nChecking atomic accumulate \n"); }

  GA_Zero(g_a);
  for (col = 0; col < n; col++) {
    buf[col] = (double)col;
  }

  /* Every process accumulates the same vector into the same row. */
  alpha = 1.0;
  r = n/2;
  lo[0] = r;
  hi[0] = r;
  lo[1] = 0;
  hi[1] = n-1;
  ld = hi[1]-lo[1]+1;
  NGA_Acc(g_a, lo, hi, buf, &ld, &alpha );
  GA_Sync();

  if (root) {
    /* Process 0 reads the row back and checks the accumulated values. */
    NGA_Get(g_a, lo, hi, buf,&ld);
    for (col = 0; col < n; col++) {
      if (buf[col] != (double)nproc*col) {
        GA_Error("failed: column=",col);
      }
    }
    printf("OK\n\n");
  }

  GA_Destroy(g_a);
  GA_Destroy(g_b);
}
static int test(int shape_idx, int type_idx, int dist_idx) { int type = TYPES[type_idx]; int *dims = SHAPES[shape_idx]; int ndim = SHAPES_NDIM[shape_idx]; mock_ga_t *mock_a, *result_a; int g_a; int buffer[100]; int lo[GA_MAX_DIM], hi[GA_MAX_DIM], ld[GA_MAX_DIM], shape[GA_MAX_DIM]; int result=0, error_index=-1, error_proc=-1; mock_a = Mock_Create(type, ndim, dims, "mock", NULL); result_a = Mock_Create(type, ndim, dims, "mock", NULL); g_a = create_function[dist_idx](type, ndim, dims); mock_data(mock_a, g_a); mock_to_global(mock_a, g_a); Mock_Zero(mock_a); GA_Zero(g_a); global_to_mock(g_a, result_a); result = neq_mock(mock_a, result_a, &error_index); if (0 != result) { error_proc = GA_Nodeid(); } GA_Igop(&result, 1, "+"); GA_Igop(&error_proc, 1, "max"); if (error_proc != GA_Nodeid()) { error_index = 0; } GA_Igop(&error_index, 1, "+"); if (0 != result) { if (error_proc == GA_Nodeid()) { printf("ERROR: local result failed to compare to global result\n"); printf("\terror_proc=%d\n", error_proc); printf("\terror_index=%d\n", error_index); printf("***LOCAL RESULT***\n"); Mock_Print(mock_a); printf("***GLOBAL RESULT***\n"); Mock_Print(result_a); printf("\tprinting array distribution\n"); } GA_Sync(); GA_Print(g_a); GA_Print_distribution(g_a); return 1; } Mock_Destroy(mock_a); Mock_Destroy(result_a); GA_Destroy(g_a); return 0; }
void test_io_dbl() { int n, ndim = NDIM; double err, tt0, tt1, mbytes; int g_a, g_b, d_a; int i, itmp, j, req, loop; int glo[MAXDIM],ghi[MAXDIM]; dra_size_t dlo[MAXDIM],dhi[MAXDIM]; dra_size_t ddims[MAXDIM],reqdims[MAXDIM]; dra_size_t m; int index[MAXDIM], dims[MAXDIM]; int me, nproc, isize; double *ptr; double plus, minus; int ld[MAXDIM], chunk[MAXDIM]; char filename[80]; FILE *fd; n = SIZE; m = ((dra_size_t)NFACTOR)*((dra_size_t)SIZE); loop = 1; for (i=0; i<ndim; i++) loop *= NFACTOR; req = -1; nproc = GA_Nnodes(); me = GA_Nodeid(); if (me == 0) { printf("Creating temporary global arrays %d",n); for (i=1; i<ndim; i++) { printf(" x %d",n); } printf("\n"); } if (me == 0) fflush(stdout); GA_Sync(); for (i=0; i<ndim; i++) { dims[i] = n; chunk[i] = 1; } g_a = NGA_Create(MT_DBL, ndim, dims, "a", chunk); if (!g_a) GA_Error("NGA_Create failed: a", 0); g_b = NGA_Create(MT_DBL, ndim, dims, "b", chunk); if (!g_b) GA_Error("NGA_Create failed: b", 0); if (me == 0) printf("done\n"); if (me == 0) fflush(stdout); /* initialize g_a, g_b with random values ... use ga_access to avoid allocating local buffers for ga_put */ GA_Sync(); NGA_Distribution(g_a, me, glo, ghi); NGA_Access(g_a, glo, ghi, &ptr, ld); isize = 1; for (i=0; i<ndim; i++) isize *= (ghi[i]-glo[i]+1); fill_random(ptr, isize); GA_Sync(); GA_Zero(g_b); /*.......................................................................*/ if (me == 0) { printf("Creating Disk array %ld",m); for (i=1; i<ndim; i++) { printf(" x %ld",m); } printf("\n"); } if (me == 0) fflush(stdout); for (i=0; i<ndim; i++) { ddims[i] = m; reqdims[i] = (dra_size_t)n; } GA_Sync(); strcpy(filename,FNAME); if (! (fd = fopen(filename, "w"))) { strcpy(filename,FNAME_ALT); if (! 
(fd = fopen(filename, "w"))) { GA_Error("open failed",0); } } fclose(fd); if (NDRA_Create(MT_DBL, ndim, ddims, "A", filename, DRA_RW, reqdims, &d_a) != 0) { GA_Error("NDRA_Create failed(d_a): ",0); } if (me == 0) printf("testing write\n"); fflush(stdout); tt1 = 0.0; for (i=0; i<loop; i++) { itmp=i; for (j=0; j<ndim; j++) { index[j] = itmp%NFACTOR; itmp = (itmp - index[j])/NFACTOR; } for (j=0; j<ndim; j++) { glo[j] = 0; ghi[j] = SIZE - 1; dlo[j] = ((dra_size_t)index[j])*((dra_size_t)SIZE); dhi[j] = (((dra_size_t)index[j])+(dra_size_t)1) * ((dra_size_t)SIZE) - (dra_size_t)1; } tt0 = MP_TIMER(); if (NDRA_Write_section(FALSE, g_a, glo, ghi, d_a, dlo, dhi, &req) != 0) { GA_Error("ndra_write_section failed:",0); } if (DRA_Wait(req) != 0) { GA_Error("DRA_Wait failed(d_a): ",req); } tt1 += (MP_TIMER() - tt0); } GA_Dgop(&tt1,1,"+"); tt1 = tt1/((double)nproc); mbytes = 1.e-6 * (double)(pow(m,ndim)*sizeof(double)); if (me == 0) { printf("%11.2f MB time = %11.2f rate = %11.3f MB/s\n", mbytes,tt1,mbytes/tt1); } if (DRA_Close(d_a) != 0) { GA_Error("DRA_Close failed(d_a): ",d_a); } if (me == 0) printf("\n"); if (me == 0) printf("disk array closed\n"); if (me == 0) fflush(stdout); /*..........................................................*/ if (me == 0) printf("\n"); if (me == 0) printf("opening disk array\n"); if (DRA_Open(filename, DRA_R, &d_a) != 0) { GA_Error("DRA_Open failed",0); } if (me == 0) printf("testing read\n"); /* printf("testing read on proc %d\n",me); */ if (me == 0) fflush(stdout); tt1 = 0.0; for (i=0; i<loop; i++) { itmp=i; for (j=0; j<ndim; j++) { index[j] = itmp%NFACTOR; itmp = (itmp - index[j])/NFACTOR; } for (j=0; j<ndim; j++) { glo[j] = 0; ghi[j] = SIZE - 1; dlo[j] = ((dra_size_t)index[j])*((dra_size_t)SIZE); dhi[j] = (((dra_size_t)index[j])+(dra_size_t)1) * ((dra_size_t)SIZE) - (dra_size_t)1; } tt0 = MP_TIMER(); if (NDRA_Read_section(FALSE, g_b, glo, ghi, d_a, dlo, dhi, &req) != 0) { GA_Error("ndra_read_section failed:",0); } if (DRA_Wait(req) != 0) { 
GA_Error("DRA_Wait failed(d_a): ",req); } tt1 += (MP_TIMER() - tt0); plus = 1.0; minus = -1.0; GA_Add(&plus, g_a, &minus, g_b, g_b); err = GA_Ddot(g_b, g_b); if (err != 0) { if (me == 0) { printf("BTW, we have error = %f on loop value %d\n", err,i); } GA_Error(" bye",0); } } GA_Dgop(&tt1,1,"+"); tt1 = tt1/((double)nproc); if (me == 0) { printf("%11.2f MB time = %11.2f rate = %11.3f MB/s\n", mbytes,tt1,mbytes/tt1); } if (DRA_Delete(d_a) != 0) GA_Error("DRA_Delete failed",0); /*.......................................................................*/ GA_Destroy(g_a); GA_Destroy(g_b); }
/** * Evaluate offsets for each network component */ void setOffsets(void) { // Interleave contributions from buses and branches to match matrices int i,j,jdx,jdx1,jdx2; int *i_bus_offsets = new int[p_nBuses]; int *i_branch_offsets = new int[p_nBranches]; for (i=0; i<p_nBuses; i++) { i_bus_offsets[i] = 0; } for (i=0; i<p_nBranches; i++) { i_branch_offsets[i] = 0; } int icnt = 0; int nsize; // Evaluate offsets for individual network components for (i=0; i<p_nBuses; i++) { if (p_network->getActiveBus(i)) { i_bus_offsets[i] = icnt; icnt += p_network->getBus(i)->vectorNumElements(); std::vector<int> nghbrs = p_network->getConnectedBranches(i); nsize = nghbrs.size(); for (j=0; j<nsize; j++) { // Need to avoid double counting of branches when evaluating offsets. // If branch is non-local and it is active, then include it in offsets. // Otherwise, if branch is local and bus i is equal to the "from" bus, // then include it in the offsets. jdx = nghbrs[j]; if (isLocalBranch(jdx)) { p_network->getBranchEndpoints(jdx,&jdx1,&jdx2); if (jdx1 == i) { i_branch_offsets[jdx] = icnt; icnt += p_network->getBranch(jdx)->vectorNumElements(); } } else { if (p_network->getActiveBranch(jdx)) { i_branch_offsets[jdx] = icnt; icnt += p_network->getBranch(jdx)->vectorNumElements(); } } } } } // Total number of rows and columns from this processor have been evaluated, // now create buffers that can scatter individual offsets to global arrays int **i_bus_index = new int*[p_nBuses]; int **i_branch_index = new int*[p_nBranches]; int *i_bus_index_buf = new int[p_nBuses]; int *i_branch_index_buf = new int[p_nBranches]; int *i_bus_value_buf = new int[p_nBuses]; int *i_branch_value_buf = new int[p_nBranches]; int i_bus_cnt = 0; int i_branch_cnt = 0; int row_offset = p_Offsets[p_me]; int nbus = 0; int nbranch = 0; for (i=0; i<p_nBuses; i++) { if (p_network->getActiveBus(i)) { nbus++; i_bus_value_buf[i_bus_cnt] = i_bus_offsets[i]+row_offset; i_bus_index_buf[i_bus_cnt] = p_network->getGlobalBusIndex(i); 
i_bus_index[i_bus_cnt] = &i_bus_index_buf[i_bus_cnt]; i_bus_cnt++; } } for (i=0; i<p_nBranches; i++) { if (p_network->getActiveBranch(i)) { nbranch++; i_branch_value_buf[i_branch_cnt] = i_branch_offsets[i]+row_offset; i_branch_index_buf[i_branch_cnt] = p_network->getGlobalBranchIndex(i); i_branch_index[i_branch_cnt] = &i_branch_index_buf[i_branch_cnt]; i_branch_cnt++; } } delete [] i_bus_offsets; delete [] i_branch_offsets; // Create global arrays that hold column and row offsets for all buses and // branches in the network. First create map array for global arrays int *t_busMap = new int[p_nNodes]; int *t_branchMap = new int[p_nNodes]; for (i=0; i<p_nNodes; i++) { t_busMap[i] = 0; t_branchMap[i] = 0; } t_busMap[p_me] = nbus; t_branchMap[p_me] = nbranch; char plus[2]; strcpy(plus,"+"); GA_Pgroup_igop(p_GAgrp, t_busMap, p_nNodes, plus); GA_Pgroup_igop(p_GAgrp, t_branchMap, p_nNodes, plus); int *busMap = new int[p_nNodes]; int *branchMap = new int[p_nNodes]; busMap[0] = 0; branchMap[0] = 0; int total_buses = t_busMap[0]; int total_branches = t_branchMap[0]; for (i=1; i<p_nNodes; i++) { busMap[i] = busMap[i-1] + t_busMap[i-1]; total_buses += t_busMap[i]; branchMap[i] = branchMap[i-1] + t_branchMap[i-1]; total_branches += t_branchMap[i]; } delete [] t_busMap; delete [] t_branchMap; int one = 1; g_bus_offsets = GA_Create_handle(); GA_Set_data(g_bus_offsets, one, &total_buses, C_INT); GA_Set_irreg_distr(g_bus_offsets, busMap, &p_nNodes); GA_Set_pgroup(g_bus_offsets, p_GAgrp); if (!GA_Allocate(g_bus_offsets)) { char buf[256]; sprintf(buf,"GenVectorMap::setOffsets: Unable to allocate distributed array for bus offsets\n"); printf("%s",buf); throw gridpack::Exception(buf); } GA_Zero(g_bus_offsets); g_branch_offsets = GA_Create_handle(); GA_Set_data(g_branch_offsets, one, &total_branches, C_INT); GA_Set_irreg_distr(g_branch_offsets, branchMap, &p_nNodes); GA_Set_pgroup(g_branch_offsets, p_GAgrp); if (!GA_Allocate(g_branch_offsets)) { char buf[256]; 
sprintf(buf,"GenVectorMap::setOffsets: Unable to allocate distributed array for branch offsets\n"); printf("%s",buf); throw gridpack::Exception(buf); } GA_Zero(g_branch_offsets); delete [] busMap; delete [] branchMap; // Scatter offsets to global arrays NGA_Scatter(g_bus_offsets, i_bus_value_buf, i_bus_index, i_bus_cnt); NGA_Scatter(g_branch_offsets, i_branch_value_buf, i_branch_index, i_branch_cnt); NGA_Pgroup_sync(p_GAgrp); delete [] i_bus_index; delete [] i_branch_index; delete [] i_bus_index_buf; delete [] i_branch_index_buf; delete [] i_bus_value_buf; delete [] i_branch_value_buf; }
// note: Sayan: brings down memory requirement to about 268 MB int main(int argc, char **argv) { int me, nproc, g_a = -1, i, j; #if defined(USE_ELEMENTAL) int ndim=2, dims[2]= {N1,N2}; #else int ndim=2, type=MT_F_DBL, dims[2]= {N1,N2}; #endif double *buf; int lo[2], hi[2], ld[1]; double alpha = 1.0; #if defined(USE_ELEMENTAL) // initialize Elemental (which will initialize MPI) ElInitialize( &argc, &argv ); ElMPICommRank( MPI_COMM_WORLD, &me ); ElMPICommSize( MPI_COMM_WORLD, &nproc ); ElGlobalArrays_d eldga; // instantiate el::global array ElGlobalArraysConstruct_d( &eldga ); // initialize global arrays ElGlobalArraysInitialize_d( eldga ); printf ("INITIALIZED elemental global array...\n"); #else MP_INIT(argc,argv); GA_Initialize_ltd(-1); me=GA_Nodeid(); nproc=GA_Nnodes(); #endif if(me==0) printf("Using %ld processes\n",(long)nproc); if(me==0) printf("memory = %ld bytes\n",((long)N1)*((long)N2)*8); #if defined(USE_ELEMENTAL) // create and allocate a global array printf ("ndim = %d\n", ndim); printf ("dim[0] = %d and dim[1] = %d\n", dims[0], dims[1]); ElGlobalArraysCreate_d( eldga, ndim, dims, "A", &g_a); printf ("CREATED elemental global array...\n"); // print distribution ElGlobalArraysPrint_d( eldga, g_a ); #else g_a = NGA_Create(type, ndim, dims, "A", NULL); GA_Zero(g_a); /* zero the matrix */ GA_Print_distribution(g_a); #endif if(me == 0) { // buf = (double*)(malloc(N1*1024*sizeof(double))); buf = (double*)(malloc(N1*128*sizeof(double))); // for(j = 0; j < N1*1024; ++j) buf[j] = 1.0; // for(i = 0; i < N2/1024; ++i) { for(j = 0; j < N1*128; ++j) buf[j] = 1.0; for(i = 0; i < N2/128; ++i) { lo[0] = 0; hi[0] = lo[0] + N1 -1; /* lo[1] = i*1024; hi[1] = lo[1] + 1024 -1; ld[0] = 1024; */ lo[1] = i*128; hi[1] = lo[1] + 128 -1; ld[0] = 128; printf("NGA_Acc.%d: %d:%d %d:%d\n",i,lo[0],hi[0],lo[1],hi[1]); #if defined(USE_ELEMENTAL) ElGlobalArraysAccumulate_d( eldga, g_a, lo, hi, buf, ld, &alpha ); // there is an explicit flush in NGA_Acc/Put, so when it returns, the buffer 
// can be reused and data has reached the destination #else NGA_Init_fence(); NGA_Acc(g_a, lo, hi, buf, ld, &alpha); NGA_Fence(); #endif } } #if defined(USE_ELEMENTAL) ElGlobalArraysSync_d( eldga ); ElGlobalArraysDestroy_d( eldga, g_a ); ElGlobalArraysTerminate_d( eldga ); // call el::global arrays destructor ElGlobalArraysDestruct_d( eldga ); ElFinalize(); #else GA_Sync(); GA_Destroy(g_a); GA_Terminate(); MP_FINALIZE(); #endif return 0; }
/* Square matrix-matrix multiplication */ void matrix_multiply(int M, int N, int K, int blockX_len, int blockY_len) { /* Local buffers and Global arrays declaration */ double *a=NULL, *b=NULL, *c=NULL; int dims[NDIMS], ld[NDIMS], chunks[NDIMS]; int lo[NDIMS], hi[NDIMS], cdims[NDIMS]; /* dim of blocks */ int g_a, g_b, g_c, g_cnt, g_cnt2; int offset; double alpha = 1.0, beta=0.0; int count_p = 0, next_p = 0; int count_gac = 0, next_gac = 0; double t1,t2,seconds; ga_nbhdl_t nbh; int count_acc = 0; /* Find local processor ID and the number of processes */ int proc=GA_Nodeid(), nprocs=GA_Nnodes(); if ((M % blockX_len) != 0 || (M % blockY_len) != 0 || (N % blockX_len) != 0 || (N % blockY_len) != 0 || (K % blockX_len) != 0 || (K % blockY_len) != 0) GA_Error("Dimension size M/N/K is not divisible by X/Y block sizes", 101); /* Allocate/Set process local buffers */ a = malloc (blockX_len * blockY_len * sizeof(double)); b = malloc (blockX_len * blockY_len * sizeof(double)); c = malloc (blockX_len * blockY_len * sizeof(double)); cdims[0] = blockX_len; cdims[1] = blockY_len; /* Configure array dimensions */ for(int i = 0; i < NDIMS; i++) { dims[i] = N; chunks[i] = -1; ld[i] = cdims[i]; /* leading dimension/stride of the local buffer */ } /* create a global array g_a and duplicate it to get g_b and g_c*/ g_a = NGA_Create(C_DBL, NDIMS, dims, "array A", chunks); if (!g_a) GA_Error("NGA_Create failed: A", NDIMS); #if DEBUG>1 if (proc == 0) printf(" Created Array A\n"); #endif /* Ditto for C and B */ g_b = GA_Duplicate(g_a, "array B"); g_c = GA_Duplicate(g_a, "array C"); if (!g_b || !g_c) GA_Error("GA_Duplicate failed",NDIMS); if (proc == 0) printf("Created Arrays B and C\n"); /* Subscript array for read-incr, which is nothing but proc */ int * rdcnt = malloc (nprocs * sizeof(int)); memset (rdcnt, 0, nprocs * sizeof(int)); int * rdcnt2 = malloc (nprocs * sizeof(int)); memset (rdcnt2, 0, nprocs * sizeof(int)); /* Create global array of nprocs elements for nxtval */ int 
counter_dim[1]; counter_dim[0] = nprocs; g_cnt = NGA_Create(C_INT, 1, counter_dim, "Shared counter", NULL); if (!g_cnt) GA_Error("Shared counter failed",1); g_cnt2 = GA_Duplicate(g_cnt, "another shared counter"); if (!g_cnt2) GA_Error("Another shared counter failed",1); GA_Zero(g_cnt); GA_Zero(g_cnt2); #if DEBUG>1 /* initialize data in matrices a and b */ if(proc == 0) printf("Initializing local buffers - a and b\n"); #endif int w = 0; int l = 7; for(int i = 0; i < cdims[0]; i++) { for(int j = 0; j < cdims[1]; j++) { a[i*cdims[1] + j] = (double)(++w%29); b[i*cdims[1] + j] = (double)(++l%37); } } /* Copy data to global arrays g_a and g_b from local buffers */ next_p = NGA_Read_inc(g_cnt2,&rdcnt[proc],(long)1); for (int i = 0; i < N; i+=cdims[0]) { if (next_p == count_p) { for (int j = 0; j < N; j+=cdims[1]) { /* Indices of patch */ lo[0] = i; lo[1] = j; hi[0] = lo[0] + cdims[0]; hi[1] = lo[1] + cdims[1]; hi[0] = hi[0]-1; hi[1] = hi[1]-1; #if DEBUG>1 printf ("%d: PUT_GA_A_B: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]); #endif NGA_Put(g_a, lo, hi, a, ld); NGA_Put(g_b, lo, hi, b, ld); } next_p = NGA_Read_inc(g_cnt2,&rdcnt[proc],(long)1); } count_p++; } #if DEBUG>1 printf ("After NGA_PUT to global - A and B arrays\n"); #endif /* Synchronize all processors to make sure puts from nprocs has finished before proceeding with dgemm */ GA_Sync(); t1 = GA_Wtime(); next_gac = NGA_Read_inc(g_cnt,&rdcnt2[proc],(long)1); for (int m = 0; m < N; m+=cdims[0]) { for (int k = 0; k < N; k+=cdims[0]) { if (next_gac == count_gac) { /* A = m x k */ lo[0] = m; lo[1] = k; hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1]; hi[0] = hi[0]-1; hi[1] = hi[1]-1; #if DEBUG>3 printf ("%d: GET GA_A: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]); #endif NGA_Get(g_a, lo, hi, a, ld); for (int n = 0; n < N; n+=cdims[1]) { memset (c, 0, sizeof(double) * cdims[0] * cdims[1]); /* B = k x n */ lo[0] = k; lo[1] = n; hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + 
lo[1]; hi[0] = hi[0]-1; hi[1] = hi[1]-1; #if DEBUG>3 printf ("%d: GET_GA_B: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]); #endif NGA_Get(g_b, lo, hi, b, ld); //_my_dgemm_ (a, local_N, b, local_N, c, local_N, local_N, local_N, local_N, alpha, beta=1.0); /* TODO I am assuming square matrix blocks, further testing/work required for rectangular matrices */ cblas_dgemm ( CblasRowMajor, CblasNoTrans, /* TransA */CblasNoTrans, /* TransB */ cdims[0] /* M */, cdims[1] /* N */, cdims[0] /* K */, alpha, a, cdims[0], /* lda */ b, cdims[1], /* ldb */ beta=1.0, c, cdims[0] /* ldc */); NGA_NbWait(&nbh); /* C = m x n */ lo[0] = m; lo[1] = n; hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1]; hi[0] = hi[0]-1; hi[1] = hi[1]-1; #if DEBUG>3 printf ("%d: ACC_GA_C: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]); #endif NGA_NbAcc(g_c, lo, hi, c, ld, &alpha, &nbh); count_acc += 1; } /* END LOOP N */ next_gac = NGA_Read_inc(g_cnt,&rdcnt2[proc],(long)1); } /* ENDIF if count == next */ count_gac++; } /* END LOOP K */ } /* END LOOP M */ GA_Sync(); t2 = GA_Wtime(); seconds = t2 - t1; if (proc == 0) printf("Time taken for MM (secs):%lf \n", seconds); printf("Number of ACC: %d\n", count_acc); /* Correctness test - modify data again before this function */ for (int i = 0; i < NDIMS; i++) { lo[i] = 0; hi[i] = dims[i]-1; ld[i] = dims[i]; } verify(g_a, g_b, g_c, lo, hi, ld, N); /* Clear local buffers */ free(a); free(b); free(c); free(rdcnt); free(rdcnt2); GA_Sync(); /* Deallocate arrays */ GA_Destroy(g_a); GA_Destroy(g_b); GA_Destroy(g_c); GA_Destroy(g_cnt); GA_Destroy(g_cnt2); }
/* * test ga_dgemm * Note: - change nummax for large arrays * - turn off "dgemm_verify" for large arrays due to memory * limitations, as dgemm_verify=1 for large arrays produces * segfault, dumps core,or any crap. */ int main(int argc, char **argv) { int num_m; int num_n; int num_k; int i; int ii; double *h0; int g_c; int g_b; int g_a; double a; double t1; double mf; double avg_t[ntrans]; double avg_mf[ntrans]; int itime; int ntimes; int nums_m[/*howmany*/] = {512,1024}; int nums_n[/*howmany*/] = {512,1024}; int nums_k[/*howmany*/] = {512,1024}; char transa[/*ntrans*/] = "ntnt"; char transb[/*ntrans*/] = "nntt"; char ta; char tb; double *tmpa; double *tmpb; double *tmpc; int ndim; int dims[2]; #ifdef BLOCK_CYCLIC int block_size[2]; #endif #if defined(USE_ELEMENTAL) // initialize Elemental (which will initialize MPI) ElInitialize( &argc, &argv ); ElMPICommRank( MPI_COMM_WORLD, &me ); ElMPICommSize( MPI_COMM_WORLD, &nproc ); // instantiate el::global array ElGlobalArraysConstruct_d( &eldga ); // initialize global arrays ElGlobalArraysInitialize_d( eldga ); #else MP_INIT(argc,argv); if (!MA_init(MT_DBL,1,20000000)) { GA_Error("failed: ma_init(MT_DBL,1,20000000)",10); } GA_INIT(argc,argv); me = GA_Nodeid(); #endif h0 = (double*)malloc(sizeof(double) * nummax*nummax); tmpa = (double*)malloc(sizeof(double) * nummax*nummax); tmpb = (double*)malloc(sizeof(double) * nummax*nummax); tmpc = (double*)malloc(sizeof(double) * nummax*nummax); ii = 0; for (i=0; i<nummax*nummax; i++) { ii = ii + 1; if (ii > nummax) { ii = 0; } h0[i] = ii; } /* Compute times assuming 500 mflops and 5 second target time */ /* ntimes = max(3.0d0,5.0d0/(4.0d-9*num**3)); */ ntimes = 5; for (ii=0; ii<howmany; ii++) { num_m = nums_m[ii]; num_n = nums_n[ii]; num_k = nums_k[ii]; a = 0.5/(num_m*num_n); if (num_m > nummax || num_n > nummax || num_k > nummax) { GA_Error("Insufficient memory: check nummax", 1); } #ifndef BLOCK_CYCLIC ndim = 2; /* dims[0] = num_m; dims[1] = num_n; */ dims[1] = num_m; dims[0] = 
num_n; #if defined(USE_ELEMENTAL) ElGlobalArraysCreate_d( eldga, ndim, dims, "g_c", NULL, &g_c ); #else if (!((g_c = NGA_Create(MT_DBL,ndim,dims,"g_c",NULL)))) { GA_Error("failed: create g_c",20); } #endif /* dims[0] = num_k; dims[1] = num_n; */ dims[1] = num_k; dims[0] = num_n; #if defined(USE_ELEMENTAL) ElGlobalArraysCreate_d( eldga, ndim, dims, "g_b", NULL, &g_b ); #else if (!((g_b = NGA_Create(MT_DBL,ndim,dims,"g_b",NULL)))) { GA_Error("failed: create g_b",30); } #endif /* dims[0] = num_m; dims[1] = num_k; */ dims[1] = num_m; dims[0] = num_k; #if defined(USE_ELEMENTAL) ElGlobalArraysCreate_d( eldga, ndim, dims, "g_a", NULL, &g_a ); #else if (!((g_a = NGA_Create(MT_DBL,ndim,dims,"g_a",NULL)))) { GA_Error("failed: create g_a",40); } #endif #else ndim = 2; block_size[0] = 128; block_size[1] = 128; dims[0] = num_m; dims[1] = num_n; g_c = GA_Create_handle(); GA_Set_data(g_c,ndim,dims,MT_DBL); GA_Set_array_name(g_c,"g_c"); GA_Set_block_cyclic(g_c,block_size); if (!GA_Allocate(g_c)) { GA_Error("failed: create g_c",40); } dims[0] = num_k; dims[1] = num_n; g_b = GA_Create_handle(); GA_Set_data(g_b,ndim,dims,MT_DBL); GA_Set_array_name(g_b,"g_b"); GA_Set_block_cyclic(g_b,block_size); if (!ga_allocate(g_b)) { GA_Error("failed: create g_b",40); } dims[0] = num_m; dims[1] = num_k; g_a = GA_Create_handle(); GA_Set_data(g_a,ndim,dims,MT_DBL); GA_Set_array_name(g_a,"g_a"); GA_Set_block_cyclic(g_a,block_size); if (!ga_allocate(g_a)) { GA_Error('failed: create g_a',40); } #endif /* Initialize matrices A and B */ if (me == 0) { load_ga(g_a, h0, num_m, num_k); load_ga(g_b, h0, num_k, num_n); } #if defined(USE_ELEMENTAL) double zero = 0.0; ElGlobalArraysFill_d( eldga, g_c, &zero ); ElGlobalArraysSync_d( eldga ); #else GA_Zero(g_c); GA_Sync(); #endif #if defined(USE_ELEMENTAL) if (me == 0) { #else if (GA_Nodeid() == 0) { #endif printf("\nMatrix Multiplication on C = A[%ld,%ld]xB[%ld,%ld]\n", (long)num_m, (long)num_k, (long)num_k, (long)num_n); fflush(stdout); } for (i=0; i<ntrans; 
i++) { avg_t[i] = 0.0; avg_mf[i] = 0.0; } for (itime=0; itime<ntimes; itime++) { for (i=0; i<ntrans; i++) { #if defined(USE_ELEMENTAL) ElGlobalArraysSync_d( eldga ); #else GA_Sync(); #endif ta = transa[i]; tb = transb[i]; t1 = MP_TIMER(); #if defined(USE_ELEMENTAL) ElGlobalArraysDgemm_d( eldga, ta, tb, num_m, num_n, num_k, 1.0, g_a, g_b, 0.0, g_c ); #else GA_Dgemm(ta,tb,num_m,num_n,num_k,1.0, g_a, g_b, 0.0, g_c); #endif t1 = MP_TIMER() - t1; #if defined(USE_ELEMENTAL) if (me == 0) { #else if (GA_Nodeid() == 0) { #endif #if defined(USE_ELEMENTAL) mf = 2e0*num_m*num_n*num_k/t1*1e-6/nproc; #else mf = 2e0*num_m*num_n*num_k/t1*1e-6/GA_Nnodes(); #endif avg_t[i] = avg_t[i]+t1; avg_mf[i] = avg_mf[i] + mf; printf("%15s%2d: %12.4f seconds %12.1f mflops/proc %c %c\n", "Run#", itime, t1, mf, ta, tb); fflush(stdout); if (dgemm_verify && itime == 0) { /* recall the C API swaps the matrix order */ /* we swap it here for the Fortran-based verify */ verify_ga_dgemm(tb, ta, num_n, num_m, num_k, 1.0, g_b, g_a, 0.0, g_c, tmpb, tmpa, tmpc); } } } } #if defined(USE_ELEMENTAL) if (me == 0) { #else if (GA_Nodeid() == 0) { #endif printf("\n"); for (i=0; i<ntrans; i++) { printf("%17s: %12.4f seconds %12.1f mflops/proc %c %c\n", "Average", avg_t[i]/ntimes, avg_mf[i]/ntimes, transa[i], transb[i]); } if(dgemm_verify) { printf("All GA_Dgemms are verified...O.K.\n"); } fflush(stdout); } /* GA_Print(g_a); GA_Print(g_b); GA_Print(g_c); */ #if defined(USE_ELEMENTAL) ElGlobalArraysDestroy_d( eldga, g_a ); ElGlobalArraysDestroy_d( eldga, g_b ); ElGlobalArraysDestroy_d( eldga, g_c ); #else GA_Destroy(g_c); GA_Destroy(g_b); GA_Destroy(g_a); #endif } /* ??? format(a15, i2, ': ', e12.4, ' seconds ',f12.1, . 
' mflops/proc ', 3a2) */ #if defined(USE_ELEMENTAL) if (me == 0) { #else if (GA_Nodeid() == 0) { #endif printf("All tests successful\n"); } free(h0); free(tmpa); free(tmpb); free(tmpc); #if defined(USE_ELEMENTAL) // call el::global arrays destructor ElGlobalArraysTerminate_d( eldga ); ElGlobalArraysDestruct_d( eldga ); ElFinalize(); #else GA_Terminate(); MP_FINALIZE(); #endif return 0; } /* * Verify for correctness. Process 0 computes BLAS dgemm * locally. For larger arrays, disbale this test as memory * might not be sufficient */ void verify_ga_dgemm(char xt1, char xt2, int num_m, int num_n, int num_k, double alpha, int g_a, int g_b, double beta, int g_c, double *tmpa, double *tmpb, double *tmpc) { int i,j,type,ndim,dims[2],lo[2],hi[2]; double abs_value; for (i=0; i<num_n; i++) { for (j=0; j<num_m; j++) { tmpc[j+i*num_m] = -1.0; tmpa[j+i*num_m] = -2.0; } } #if defined(USE_ELEMENTAL) ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims ); #else NGA_Inquire(g_a, &type, &ndim, dims); #endif lo[0] = 0; lo[1] = 0; hi[0] = dims[0]-1; hi[1] = dims[1]-1; #if defined(USE_ELEMENTAL) ElGlobalArraysGet_d( eldga, g_a, lo, hi, tmpa, &dims[1] ); #else NGA_Get(g_a, lo, hi, tmpa, &dims[1]); #endif #if defined(USE_ELEMENTAL) ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims ); #else NGA_Inquire(g_a, &type, &ndim, dims); #endif lo[0] = 0; lo[1] = 0; hi[0] = dims[0]-1; hi[1] = dims[1]-1; #if defined(USE_ELEMENTAL) ElGlobalArraysGet_d( eldga, g_b, lo, hi, tmpb, &dims[1] ); #else NGA_Get(g_b, lo, hi, tmpb, &dims[1]); #endif /* compute dgemm sequentially */ #if defined(USE_ELEMENTAL) cblas_dgemm ( CblasRowMajor, ( xt1 == 'n'? CblasNoTrans: CblasTrans ), ( xt2 == 'n'? 
CblasNoTrans: CblasTrans ), num_m /* M */, num_n /* N */, num_k /* K */, alpha, tmpa, num_m, /* lda */ tmpb, num_k, /* ldb */ beta, tmpc, num_m /* ldc */); #else xb_dgemm(&xt1, &xt2, &num_m, &num_n, &num_k, &alpha, tmpa, &num_m, tmpb, &num_k, &beta, tmpc, &num_m); #endif /* after computing c locally, verify it with the values in g_c */ #if defined(USE_ELEMENTAL) ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims ); #else NGA_Inquire(g_a, &type, &ndim, dims); #endif lo[0] = 0; lo[1] = 0; hi[0] = dims[0]-1; hi[1] = dims[1]-1; #if defined(USE_ELEMENTAL) ElGlobalArraysGet_d( eldga, g_c, lo, hi, tmpa, &dims[1] ); #else NGA_Get(g_c, lo, hi, tmpa, &dims[1]); #endif for (i=0; i<num_n; i++) { for (j=0; j<num_m; j++) { abs_value = fabs(tmpc[j+i*num_m]-tmpa[j+i*num_m]); if(abs_value > 1.0 || abs_value < -1.0) { printf("Values are = %f %f\n", tmpc[j+i*num_m], tmpa[j+i*num_m]); printf("Values are = %f %f\n", fabs(tmpc[j+i*num_m]-tmpa[j*i*num_m]), abs_value); fflush(stdout); GA_Error("verify ga_dgemm failed", 1); } } } } /** * called by process '0' (or your master process ) */ void load_ga(int handle, double *f, int dim1, int dim2) { int lo[2], hi[2]; if (dim1 < 0 || dim2 < 0) { return; } lo[0] = 0; lo[1] = 0; hi[0] = dim1-1; hi[1] = dim2-1; #if defined(USE_ELEMENTAL) ElGlobalArraysPut_d( eldga, handle, lo, hi, f, &dim1 ); #else NGA_Put(handle, lo, hi, f, &dim1); #endif }