// ------------------------------------------------------------- // MatSetValues_DenseGA // ------------------------------------------------------------- static PetscErrorCode MatSetValues_DenseGA(Mat mat, PetscInt m, const PetscInt idxm[], PetscInt n, const PetscInt idxn[], const PetscScalar v[],InsertMode addv) { PetscErrorCode ierr = 0; struct MatGACtx *ctx; int i, j, idx; PetscScalar vij, one(1.0); int lo[2], hi[2], ld[2] = {1, 1}; ierr = MatShellGetContext(mat, (void *)&ctx); CHKERRQ(ierr); idx = 0; for (i = 0; i < m; ++i) { for (j = 0; j < n; ++j, ++idx) { lo[0] = idxm[i]; hi[0] = idxm[i]; lo[1] = idxn[j]; hi[1] = idxn[j]; vij = v[idx]; switch (addv) { case INSERT_VALUES: NGA_Put(ctx->ga, lo, hi, (void *)&vij, ld); break; case ADD_VALUES: NGA_Acc(ctx->ga, lo, hi, (void *)&vij, ld, &one); break; default: BOOST_ASSERT_MSG(false, "Unknown set operation"); } } } return ierr; }
/*
 * Exercise a few Global Arrays operations on an N x N double matrix:
 *   1. create A and a duplicate B,
 *   2. fill A row by row (rows are dealt out round-robin to processes),
 *   3. symmetrize A and print the dot product of (A - A') with itself,
 *   4. have every process accumulate the same vector into one row and
 *      let process 0 verify the sum.
 * Both arrays are destroyed before returning.
 */
void do_work()
{
    int order = N;                  /* matrix is order x order */
    int elem_type = MT_F_DBL;
    int rank = GA_Nodeid();
    int nranks = GA_Nnodes();
    int is_root = (rank == 0);      /* only process 0 prints */
    int shape[2] = {N, N};
    int lo[2], hi[2], ld;
    int g_a, g_b;
    int col, r;
    /* Note: on all current platforms DoublePrecision == double */
    double rowbuf[N];
    double coef_a, coef_b, residual;

    if (is_root) {
        printf("Creating matrix A\n");
    }
    g_a = NGA_Create(elem_type, 2, shape, "A", NULL);
    if (!g_a) {
        GA_Error("create failed: A", order);
    }
    if (is_root) {
        printf("OK\n");
    }

    if (is_root) {
        printf("Creating matrix B\n");
    }
    /* B takes its dimensions and distribution from A */
    g_b = GA_Duplicate(g_a, "B");
    if (!g_b) {
        GA_Error("duplicate failed", order);
    }
    if (is_root) {
        printf("OK\n");
    }

    GA_Zero(g_a);

    if (is_root) {
        printf("Initializing matrix A\n");
    }
    /* Every patch written below spans all columns of a single row. */
    lo[1] = 0;
    hi[1] = order - 1;
    /* MIMD style: process p handles rows p, p+nranks, p+2*nranks, ... */
    for (r = rank; r < order; r += nranks) {
        lo[0] = r;
        hi[0] = r;
        for (col = 0; col < order; col++) {
            rowbuf[col] = sin((double)col + 0.1 * (r + 1));
        }
        NGA_Put(g_a, lo, hi, rowbuf, &order);
    }

    if (is_root) {
        printf("Symmetrizing matrix A\n");
    }
    GA_Symmetrize(g_a);             /* A = 0.5*(A+A') */

    /* Symmetry check: B = A', then B = A - B, then report B.B */
    if (is_root) {
        printf("Checking if matrix A is symmetric\n");
    }
    GA_Transpose(g_a, g_b);
    coef_a = 1.;
    coef_b = -1.;
    GA_Add(&coef_a, g_a, &coef_b, g_b, g_b);
    residual = GA_Ddot(g_b, g_b);
    if (is_root) {
        printf("Error=%f\n", (double)residual);
    }

    if (is_root) {
        printf("\nChecking atomic accumulate \n");
    }
    GA_Zero(g_a);
    for (col = 0; col < order; col++) {
        rowbuf[col] = (double)col;
    }

    /* All processes accumulate into the same (middle) row. */
    coef_a = 1.0;
    r = order / 2;
    lo[0] = r;
    hi[0] = r;
    lo[1] = 0;
    hi[1] = order - 1;
    ld = hi[1] - lo[1] + 1;
    NGA_Acc(g_a, lo, hi, rowbuf, &ld, &coef_a);
    GA_Sync();

    if (is_root) {
        /* Process 0 reads the row back; element i should be nranks*i. */
        NGA_Get(g_a, lo, hi, rowbuf, &ld);
        for (col = 0; col < order; col++) {
            if (rowbuf[col] != (double)nranks * col) {
                GA_Error("failed: column=", col);
            }
        }
        printf("OK\n\n");
    }

    GA_Destroy(g_a);
    GA_Destroy(g_b);
}
// note: Sayan: brings down memory requirement to about 268 MB int main(int argc, char **argv) { int me, nproc, g_a = -1, i, j; #if defined(USE_ELEMENTAL) int ndim=2, dims[2]= {N1,N2}; #else int ndim=2, type=MT_F_DBL, dims[2]= {N1,N2}; #endif double *buf; int lo[2], hi[2], ld[1]; double alpha = 1.0; #if defined(USE_ELEMENTAL) // initialize Elemental (which will initialize MPI) ElInitialize( &argc, &argv ); ElMPICommRank( MPI_COMM_WORLD, &me ); ElMPICommSize( MPI_COMM_WORLD, &nproc ); ElGlobalArrays_d eldga; // instantiate el::global array ElGlobalArraysConstruct_d( &eldga ); // initialize global arrays ElGlobalArraysInitialize_d( eldga ); printf ("INITIALIZED elemental global array...\n"); #else MP_INIT(argc,argv); GA_Initialize_ltd(-1); me=GA_Nodeid(); nproc=GA_Nnodes(); #endif if(me==0) printf("Using %ld processes\n",(long)nproc); if(me==0) printf("memory = %ld bytes\n",((long)N1)*((long)N2)*8); #if defined(USE_ELEMENTAL) // create and allocate a global array printf ("ndim = %d\n", ndim); printf ("dim[0] = %d and dim[1] = %d\n", dims[0], dims[1]); ElGlobalArraysCreate_d( eldga, ndim, dims, "A", &g_a); printf ("CREATED elemental global array...\n"); // print distribution ElGlobalArraysPrint_d( eldga, g_a ); #else g_a = NGA_Create(type, ndim, dims, "A", NULL); GA_Zero(g_a); /* zero the matrix */ GA_Print_distribution(g_a); #endif if(me == 0) { // buf = (double*)(malloc(N1*1024*sizeof(double))); buf = (double*)(malloc(N1*128*sizeof(double))); // for(j = 0; j < N1*1024; ++j) buf[j] = 1.0; // for(i = 0; i < N2/1024; ++i) { for(j = 0; j < N1*128; ++j) buf[j] = 1.0; for(i = 0; i < N2/128; ++i) { lo[0] = 0; hi[0] = lo[0] + N1 -1; /* lo[1] = i*1024; hi[1] = lo[1] + 1024 -1; ld[0] = 1024; */ lo[1] = i*128; hi[1] = lo[1] + 128 -1; ld[0] = 128; printf("NGA_Acc.%d: %d:%d %d:%d\n",i,lo[0],hi[0],lo[1],hi[1]); #if defined(USE_ELEMENTAL) ElGlobalArraysAccumulate_d( eldga, g_a, lo, hi, buf, ld, &alpha ); // there is an explicit flush in NGA_Acc/Put, so when it returns, the buffer 
// can be reused and data has reached the destination #else NGA_Init_fence(); NGA_Acc(g_a, lo, hi, buf, ld, &alpha); NGA_Fence(); #endif } } #if defined(USE_ELEMENTAL) ElGlobalArraysSync_d( eldga ); ElGlobalArraysDestroy_d( eldga, g_a ); ElGlobalArraysTerminate_d( eldga ); // call el::global arrays destructor ElGlobalArraysDestruct_d( eldga ); ElFinalize(); #else GA_Sync(); GA_Destroy(g_a); GA_Terminate(); MP_FINALIZE(); #endif return 0; }
double time_op(int g_a, double *buf_, int chunk, int loop, int proc, int ndim, int op) { double start_time = 0; double stop_time = 0; double total_time = 0; int lo[2] = {-1,-1}; int hi[2] = {-1,-1}; int ld = -1; int i = 0; int bal = 0; double *buf = buf_; double alpha = 1; /* get the location within the g_a for the given proc */ #if defined(USE_ELEMENTAL) ElGlobalArraysDistribution_d( eldga, g_a, proc, lo, hi ); #else NGA_Distribution(g_a, proc, lo, hi); #endif /* determine how much data to grab based on the chunk and dimensionality */ if (ndim == 1) { hi[0] = lo[0] + chunk*chunk - 1; } else if (ndim == 2) { hi[0] = lo[0] + chunk - 1; hi[1] = lo[1] + chunk - 1; ld = chunk; } else { GA_Error("invalid ndim for time_op", ndim); } start_time = TIMER(); for (i=0; i<loop; ++i) { switch (op) { case OP_GET: #if defined(USE_ELEMENTAL) ElGlobalArraysGet_d( eldga, g_a, lo, hi, buf, &ld ); #else NGA_Get(g_a, lo, hi, buf, &ld); #endif break; case OP_PUT: #if defined(USE_ELEMENTAL) ElGlobalArraysPut_d( eldga, g_a, lo, hi, buf, &ld ); #else NGA_Put(g_a, lo, hi, buf, &ld); #endif break; case OP_ACC: #if defined(USE_ELEMENTAL) ElGlobalArraysPut_d( eldga, g_a, lo, hi, buf, &ld ); #else NGA_Acc(g_a, lo, hi, buf, &ld, &alpha); #endif break; default: GA_Error("bad case value for op", op); } /* prepare next src location and dst ptr: avoid cache locality */ if (bal == 0) { lo[0] += 128; lo[1] += 128; hi[0] += 128; hi[1] += 128; buf += 128; bal = 1; } else { lo[0] -= 128; lo[1] -= 128; hi[0] -= 128; hi[1] -= 128; buf -= 128; bal = 0; } } stop_time = TIMER(); total_time = (stop_time - start_time); if (total_time == 0.0) { total_time = 0.000001; /* workaround for inaccurate timers */ warn_accuracy++; } return(total_time / loop); }