/* Square matrix-matrix multiplication */ void matrix_multiply(int M, int N, int K, int blockX_len, int blockY_len) { /* Local buffers and Global arrays declaration */ double *a=NULL, *b=NULL, *c=NULL; int dims[NDIMS], ld[NDIMS], chunks[NDIMS]; int lo[NDIMS], hi[NDIMS], cdims[NDIMS]; /* dim of blocks */ int g_a, g_b, g_c, g_cnt, g_cnt2; int offset; double alpha = 1.0, beta=0.0; int count_p = 0, next_p = 0; int count_gac = 0, next_gac = 0; double t1,t2,seconds; ga_nbhdl_t nbh; int count_acc = 0; /* Find local processor ID and the number of processes */ int proc=GA_Nodeid(), nprocs=GA_Nnodes(); if ((M % blockX_len) != 0 || (M % blockY_len) != 0 || (N % blockX_len) != 0 || (N % blockY_len) != 0 || (K % blockX_len) != 0 || (K % blockY_len) != 0) GA_Error("Dimension size M/N/K is not divisible by X/Y block sizes", 101); /* Allocate/Set process local buffers */ a = malloc (blockX_len * blockY_len * sizeof(double)); b = malloc (blockX_len * blockY_len * sizeof(double)); c = malloc (blockX_len * blockY_len * sizeof(double)); cdims[0] = blockX_len; cdims[1] = blockY_len; /* Configure array dimensions */ for(int i = 0; i < NDIMS; i++) { dims[i] = N; chunks[i] = -1; ld[i] = cdims[i]; /* leading dimension/stride of the local buffer */ } /* create a global array g_a and duplicate it to get g_b and g_c*/ g_a = NGA_Create(C_DBL, NDIMS, dims, "array A", chunks); if (!g_a) GA_Error("NGA_Create failed: A", NDIMS); #if DEBUG>1 if (proc == 0) printf(" Created Array A\n"); #endif /* Ditto for C and B */ g_b = GA_Duplicate(g_a, "array B"); g_c = GA_Duplicate(g_a, "array C"); if (!g_b || !g_c) GA_Error("GA_Duplicate failed",NDIMS); if (proc == 0) printf("Created Arrays B and C\n"); /* Subscript array for read-incr, which is nothing but proc */ int * rdcnt = malloc (nprocs * sizeof(int)); memset (rdcnt, 0, nprocs * sizeof(int)); int * rdcnt2 = malloc (nprocs * sizeof(int)); memset (rdcnt2, 0, nprocs * sizeof(int)); /* Create global array of nprocs elements for nxtval */ int counter_dim[1]; counter_dim[0] = nprocs; g_cnt = NGA_Create(C_INT, 1, counter_dim, "Shared counter", NULL); if (!g_cnt) GA_Error("Shared counter failed",1); g_cnt2 = GA_Duplicate(g_cnt, "another shared counter"); if (!g_cnt2) GA_Error("Another shared counter failed",1); GA_Zero(g_cnt); GA_Zero(g_cnt2); #if DEBUG>1 /* initialize data in matrices a and b */ if(proc == 0) printf("Initializing local buffers - a and b\n"); #endif int w = 0; int l = 7; for(int i = 0; i < cdims[0]; i++) { for(int j = 0; j < cdims[1]; j++) { a[i*cdims[1] + j] = (double)(++w%29); b[i*cdims[1] + j] = (double)(++l%37); } } /* Copy data to global arrays g_a and g_b from local buffers */ next_p = NGA_Read_inc(g_cnt2,&rdcnt[proc],(long)1); for (int i = 0; i < N; i+=cdims[0]) { if (next_p == count_p) { for (int j = 0; j < N; j+=cdims[1]) { /* Indices of patch */ lo[0] = i; lo[1] = j; hi[0] = lo[0] + cdims[0]; hi[1] = lo[1] + cdims[1]; hi[0] = hi[0]-1; hi[1] = hi[1]-1; #if DEBUG>1 printf ("%d: PUT_GA_A_B: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]); #endif NGA_Put(g_a, lo, hi, a, ld); NGA_Put(g_b, lo, hi, b, ld); } next_p = NGA_Read_inc(g_cnt2,&rdcnt[proc],(long)1); } count_p++; } #if DEBUG>1 printf ("After NGA_PUT to global - A and B arrays\n"); #endif /* Synchronize all processors to make sure puts from nprocs has finished before proceeding with dgemm */ GA_Sync(); t1 = GA_Wtime(); next_gac = NGA_Read_inc(g_cnt,&rdcnt2[proc],(long)1); for (int m = 0; m < N; m+=cdims[0]) { for (int k = 0; k < N; k+=cdims[0]) { if (next_gac == count_gac) { /* A = m x k */ lo[0] = m; lo[1] = k; hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1]; hi[0] = hi[0]-1; hi[1] = hi[1]-1; #if DEBUG>3 printf ("%d: GET GA_A: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]); #endif NGA_Get(g_a, lo, hi, a, ld); for (int n = 0; n < N; n+=cdims[1]) { memset (c, 0, sizeof(double) * cdims[0] * cdims[1]); /* B = k x n */ lo[0] = k; lo[1] = n; hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1]; hi[0] = hi[0]-1; hi[1] = hi[1]-1; #if DEBUG>3 printf ("%d: GET_GA_B: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]); #endif NGA_Get(g_b, lo, hi, b, ld); //_my_dgemm_ (a, local_N, b, local_N, c, local_N, local_N, local_N, local_N, alpha, beta=1.0); /* TODO I am assuming square matrix blocks, further testing/work required for rectangular matrices */ cblas_dgemm ( CblasRowMajor, CblasNoTrans, /* TransA */CblasNoTrans, /* TransB */ cdims[0] /* M */, cdims[1] /* N */, cdims[0] /* K */, alpha, a, cdims[0], /* lda */ b, cdims[1], /* ldb */ beta=1.0, c, cdims[0] /* ldc */); NGA_NbWait(&nbh); /* C = m x n */ lo[0] = m; lo[1] = n; hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1]; hi[0] = hi[0]-1; hi[1] = hi[1]-1; #if DEBUG>3 printf ("%d: ACC_GA_C: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]); #endif NGA_NbAcc(g_c, lo, hi, c, ld, &alpha, &nbh); count_acc += 1; } /* END LOOP N */ next_gac = NGA_Read_inc(g_cnt,&rdcnt2[proc],(long)1); } /* ENDIF if count == next */ count_gac++; } /* END LOOP K */ } /* END LOOP M */ GA_Sync(); t2 = GA_Wtime(); seconds = t2 - t1; if (proc == 0) printf("Time taken for MM (secs):%lf \n", seconds); printf("Number of ACC: %d\n", count_acc); /* Correctness test - modify data again before this function */ for (int i = 0; i < NDIMS; i++) { lo[i] = 0; hi[i] = dims[i]-1; ld[i] = dims[i]; } verify(g_a, g_b, g_c, lo, hi, ld, N); /* Clear local buffers */ free(a); free(b); free(c); free(rdcnt); free(rdcnt2); GA_Sync(); /* Deallocate arrays */ GA_Destroy(g_a); GA_Destroy(g_b); GA_Destroy(g_c); GA_Destroy(g_cnt); GA_Destroy(g_cnt2); }
int main(int argc, char **argv) { int nmax, nprocs, me, me_plus; int g_a_data, g_a_i, g_a_j, isize; int gt_a_data, gt_a_i, gt_a_j; int g_b, g_c; int i, j, jj, k, one, jcnt; int chunk, kp1, ld; int *p_i, *p_j; double *p_data, *p_b, *p_c; double t_beg, t_beg2, t_ga_tot, t_get, t_mult, t_cnstrct, t_mpi_in, t_ga_in; double t_hypre_strct, t_ga_trans, t_gp_get; double t_get_blk_csr, t_trans_blk_csr, t_trans_blk, t_create_csr_ga, t_beg3; double t_gp_tget, t_gp_malloc, t_gp_assign, t_beg4; double prdot, dotga, dothypre, tempc; double prtot, gatot, hypretot, gatot2, hypretot2; double prdot2, prtot2; int status; int idim, jdim, kdim, idum, memsize; int lsize, ntot; int heap=200000, fudge=100, stack=200000, ma_heap; double *cbuf, *vector; int pdi, pdj, pdk, ip, jp, kp, ncells; int lo[3],hi[3]; int blo[3], bhi[3]; int ld_a, ld_b, ld_c, ld_i, ld_j, irows, ioff, joff, total_procs; int iproc, iblock, btot; double *amat, *bvec; int *ivec, *jvec; int *proclist, *proc_inv, *icnt; int *voffset, *offset, *mapc; int iloop, lo_bl, hi_bl; char *buf, **buf_ptr; int *iparams, *jval, *ival; double *rval, *rvalt; int imin, imax, jmin, jmax, irow, icol, nnz; int nrows, kmin, kmax, lmin, lmax, jdx; int LOOPNUM = 100; void **blk_ptr; void *blk; int blk_size, tsize, zero; int *iblk, *jblk, *blkidx; int *tblk_ptr; int *ivalt, *jvalt, *iparamst; int *iblk_t, *jblk_t, *blkidx_t; /* Hypre declarations */ int ierr; #if USE_HYPRE HYPRE_StructGrid grid; HYPRE_StructStencil stencil; HYPRE_StructMatrix matrix; HYPRE_StructVector vec_x, vec_y; int i4, j4, ndim, nelems, offsets[7][3]; int stencil_indices[7], hlo[3], hhi[3]; double weights[7]; double *values; double alpha, beta; int *rows, *cols; #endif /* *** Intitialize a message passing library */ zero = 0; one = 1; ierr = MPI_Init(&argc, &argv); /* *** Initialize GA There are 2 choices: ga_initialize or ga_initialize_ltd. In the first case, there is no explicit limit on memory usage. In the second, user can set limit (per processor) in bytes. */ t_beg = GA_Wtime(); NGA_Initialize(); GP_Initialize(); t_ga_in = GA_Wtime() - t_beg; NGA_Dgop(&t_ga_in,one,"+"); t_ga_tot = 0.0; t_ga_trans = 0.0; t_get_blk_csr = 0.0; t_create_csr_ga = 0.0; t_trans_blk_csr = 0.0; t_trans_blk = 0.0; t_gp_get = 0.0; t_gp_malloc = 0.0; t_gp_assign = 0.0; t_mult = 0.0; t_get = 0.0; t_gp_tget = 0.0; t_hypre_strct = 0.0; prtot = 0.0; prtot2 = 0.0; gatot = 0.0; hypretot = 0.0; me = NGA_Nodeid(); me_plus = me + 1; nprocs = NGA_Nnodes(); if (me == 0) { printf("Time to initialize GA: %12.4f\n", t_ga_in/((double)nprocs)); } /* we can also use GA_set_memory_limit BEFORE first ga_create call */ ma_heap = heap + fudge; /* call GA_set_memory_limit(util_mdtob(ma_heap)) */ if (me == 0) { printf("\nNumber of cores used: %d\n\nGA initialized\n\n",nprocs); } /* *** Initialize the MA package MA must be initialized before any global array is allocated */ if (!MA_init(MT_DBL, stack, ma_heap)) NGA_Error("ma_init failed",-1); /* create a sparse LMAX x LMAX matrix and two vectors of length LMAX. The matrix is stored in compressed row format. One of the vectors is filled with random data and the other is filled with zeros. */ idim = IMAX; jdim = JMAX; kdim = KMAX; ntot = idim*jdim*kdim; if (me == 0) { printf("\nDimension of matrix: %d\n\n",ntot); } t_beg = GA_Wtime(); grid_factor(nprocs,idim,jdim,kdim,&pdi,&pdj,&pdk); if (me == 0) { printf("\nProcessor grid configuration\n"); printf(" PDX: %d\n",pdi); printf(" PDY: %d\n",pdj); printf(" PDZ: %d\n\n",pdk); printf(" Number of Loops: %d\n",LOOPNUM); } create_laplace_mat(idim,jdim,kdim,pdi,pdj,pdk,&g_a_data,&g_a_j,&g_a_i,&mapc); t_cnstrct = GA_Wtime() - t_beg; g_b = NGA_Create_handle(); NGA_Set_data(g_b,one,&ntot,MT_DBL); NGA_Set_irreg_distr(g_b,mapc,&nprocs); status = NGA_Allocate(g_b); /* fill g_b with random values */ NGA_Distribution(g_b,me,blo,bhi); NGA_Access(g_b,blo,bhi,&p_b,&ld); ld = bhi[0]-blo[0]+1; btot = ld; vector = (double*)malloc(ld*sizeof(double)); for (i=0; i<ld; i++) { idum = 0; p_b[i] = ran3(&idum); vector[i] = p_b[i]; } NGA_Release(g_b,blo,bhi); NGA_Sync(); g_c = NGA_Create_handle(); NGA_Set_data(g_c,one,&ntot,MT_DBL); NGA_Set_irreg_distr(g_c,mapc,&nprocs); status = NGA_Allocate(g_c); NGA_Zero(g_c); #if USE_HYPRE /* Assemble HYPRE grid and use that to create matrix. Start by creating grid partition */ ndim = 3; i = me; ip = i%pdi; i = (i-ip)/pdi; jp = i%pdj; kp = (i-jp)/pdj; lo[0] = (int)(((double)idim)*((double)ip)/((double)pdi)); if (ip < pdi-1) { hi[0] = (int)(((double)idim)*((double)(ip+1))/((double)pdi)) - 1; } else { hi[0] = idim - 1; } lo[1] = (int)(((double)jdim)*((double)jp)/((double)pdj)); if (jp < pdj-1) { hi[1] = (int)(((double)jdim)*((double)(jp+1))/((double)pdj)) - 1; } else { hi[1] = jdim - 1; } lo[2] = (int)(((double)kdim)*((double)kp)/((double)pdk)); if (kp < pdk-1) { hi[2] = (int)(((double)kdim)*((double)(kp+1))/((double)pdk)) - 1; } else { hi[2] = kdim - 1; } /* Create grid */ hlo[0] = lo[0]; hlo[1] = lo[1]; hlo[2] = lo[2]; hhi[0] = hi[0]; hhi[1] = hi[1]; hhi[2] = hi[2]; ierr = HYPRE_StructGridCreate(MPI_COMM_WORLD, ndim, &grid); ierr = HYPRE_StructGridSetExtents(grid, hlo, hhi); ierr = HYPRE_StructGridAssemble(grid); /* Create stencil */ offsets[0][0] = 0; offsets[0][1] = 0; offsets[0][2] = 0; offsets[1][0] = 1; offsets[1][1] = 0; offsets[1][2] = 0; offsets[2][0] = 0; offsets[2][1] = 1; offsets[2][2] = 0; offsets[3][0] = 0; offsets[3][1] = 0; offsets[3][2] = 1; offsets[4][0] = -1; offsets[4][1] = 0; offsets[4][2] = 0; offsets[5][0] = 0; offsets[5][1] = -1; offsets[5][2] = 0; offsets[6][0] = 0; offsets[6][1] = 0; offsets[6][2] = -1; nelems = 7; ierr = HYPRE_StructStencilCreate(ndim, nelems, &stencil); for (i=0; i<nelems; i++) { ierr = HYPRE_StructStencilSetElement(stencil, i, offsets[i]); } ncells = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1)*(hi[2]-lo[2]+1); jcnt = 7*ncells; values = (double*)malloc(jcnt*sizeof(double)); jcnt = 0; weights[0] = 6.0; weights[1] = -1.0; weights[2] = -1.0; weights[3] = -1.0; weights[4] = -1.0; weights[5] = -1.0; weights[6] = -1.0; for (i=0; i<ncells; i++) { for (j=0; j<7; j++) { values[jcnt] = weights[j]; jcnt++; } } ierr = HYPRE_StructMatrixCreate(MPI_COMM_WORLD, grid, stencil, &matrix); ierr = HYPRE_StructMatrixInitialize(matrix); for (i=0; i<7; i++) { stencil_indices[i] = i; } ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, 7, stencil_indices, values); free(values); /* Check all six sides of current box to see if any are boundaries. Set values to zero if they are. */ if (hi[0] == idim-1) { ncells = (hi[1]-lo[1]+1)*(hi[2]-lo[2]+1); hlo[0] = idim-1; hhi[0] = idim-1; hlo[1] = lo[1]; hhi[1] = hi[1]; hlo[2] = lo[2]; hhi[2] = hi[2]; values = (double*)malloc(ncells*sizeof(double)); for (i=0; i<ncells; i++) values[i] = 0.0; i4 = 1; j4 = 1; ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values); free(values); } if (hi[1] == jdim-1) { ncells = (hi[0]-lo[0]+1)*(hi[2]-lo[2]+1); hlo[0] = lo[0]; hhi[0] = hi[0]; hlo[1] = jdim-1; hhi[1] = jdim-1; hlo[2] = lo[2]; hhi[2] = hi[2]; values = (double*)malloc(ncells*sizeof(double)); for (i=0; i<ncells; i++) values[i] = 0.0; i4 = 1; j4 = 2; ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values); free(values); } if (hi[2] == kdim-1) { ncells = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1); hlo[0] = lo[0]; hhi[0] = hi[0]; hlo[1] = lo[1]; hhi[1] = hi[1]; hlo[2] = kdim-1; hhi[2] = kdim-1; values = (double*)malloc(ncells*sizeof(double)); for (i=0; i<ncells; i++) values[i] = 0.0; i4 = 1; j4 = 3; ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values); free(values); } if (lo[0] == 0) { ncells = (hi[1]-lo[1]+1)*(hi[2]-lo[2]+1); hlo[0] = 0; hhi[0] = 0; hlo[1] = lo[1]; hhi[1] = hi[1]; hlo[2] = lo[2]; hhi[2] = hi[2]; values = (double*)malloc(ncells*sizeof(double)); for (i=0; i<ncells; i++) values[i] = 0.0; i4 = 1; j4 = 4; ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values); free(values); } if (lo[1] == 0) { ncells = (hi[0]-lo[0]+1)*(hi[2]-lo[2]+1); hlo[0] = lo[0]; hhi[0] = hi[0]; hlo[1] = 0; hhi[1] = 0; hlo[2] = lo[2]; hhi[2] = hi[2]; values = (double*)malloc(ncells*sizeof(double)); for (i=0; i<ncells; i++) values[i] = 0.0; i4 = 1; j4 = 5; ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values); free(values); } if (lo[2] == 1) { ncells = (hi[1]-lo[1]+1)*(hi[2]-lo[2]+1); hlo[0] = lo[0]; hhi[0] = hi[0]; hlo[1] = lo[1]; hhi[1] = hi[1]; hlo[2] = 0; hhi[2] = 0; values = (double*)malloc(ncells*sizeof(double)); for (i=0; i<ncells; i++) values[i] = 0.0; i4 = 1; j4 = 6; ierr = HYPRE_StructMatrixSetBoxValues(matrix, hlo, hhi, i4, &j4, values); free(values); } ierr = HYPRE_StructMatrixAssemble(matrix); /* Create vectors for matrix-vector multiply */ ierr = HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &vec_x); ierr = HYPRE_StructVectorInitialize(vec_x); hlo[0] = lo[0]; hlo[1] = lo[1]; hlo[2] = lo[2]; hhi[0] = hi[0]; hhi[1] = hi[1]; hhi[2] = hi[2]; ierr = HYPRE_StructVectorSetBoxValues(vec_x, hlo, hhi, vector); ierr = HYPRE_StructVectorAssemble(vec_x); NGA_Distribution(g_a_i,me,blo,bhi); if (bhi[1] > ntot-1) { bhi[1] = ntot-1; } btot = (hi[0]-lo[0]+1)*(hi[1]-lo[1]+1)*(hi[2]-lo[2]+1); for (i=0; i<btot; i++) vector[i] = 0.0; hlo[0] = lo[0]; hlo[1] = lo[1]; hlo[2] = lo[2]; hhi[0] = hi[0]; hhi[1] = hi[1]; hhi[2] = hi[2]; ierr = HYPRE_StructVectorGetBoxValues(vec_x, hlo, hhi, vector); for (i=0; i<btot; i++) vector[i] = 0.0; ierr = HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &vec_y); ierr = HYPRE_StructVectorInitialize(vec_y); ierr = HYPRE_StructVectorSetBoxValues(vec_y, hlo, hhi, vector); ierr = HYPRE_StructVectorAssemble(vec_y); #endif /* Multiply sparse matrix. Start by accessing pointers to local portions of g_a_data, g_a_j, g_a_i */ NGA_Sync(); for (iloop=0; iloop<LOOPNUM; iloop++) { t_beg2 = GA_Wtime(); NGA_Distribution(g_c,me,blo,bhi); NGA_Access(g_c,blo,bhi,&p_c,&ld_c); for (i = 0; i<bhi[0]-blo[0]+1; i++) { p_c[i] = 0.0; } /* get number of matrix blocks coupled to this process */ NGA_Get(g_a_i,&me,&me,&lo_bl,&one); #if 1 NGA_Get(g_a_i,&me_plus,&me_plus,&hi_bl,&one); hi_bl--; total_procs = hi_bl - lo_bl + 1; blk_ptr = (void**)malloc(sizeof(void*)); /* Loop through matrix blocks */ ioff = 0; for (iblock = 0; iblock<total_procs; iblock++) { t_beg = GA_Wtime(); jdx = lo_bl+iblock; #if 0 GP_Access_element(g_a_data, &jdx, &blk_ptr[0], &isize); #endif #if 1 GP_Get_size(g_a_data, &jdx, &jdx, &isize); #endif blk = (void*)malloc(isize); #if 1 GP_Get(g_a_data, &jdx, &jdx, blk, blk_ptr, &one, &blk_size, &one, &tsize, 0); #endif t_gp_get = t_gp_get + GA_Wtime() - t_beg; iparams = (int*)blk_ptr[0]; rval = (double*)(iparams+7); imin = iparams[0]; imax = iparams[1]; jmin = iparams[2]; jmax = iparams[3]; irow = iparams[4]; icol = iparams[5]; nnz = iparams[6]; jval = (int*)(rval+nnz); ival = (int*)(jval+nnz); nrows = imax - imin + 1; bvec = (double*)malloc((jmax-jmin+1)*sizeof(double)); j = 0; t_beg = GA_Wtime(); NGA_Get(g_b,&jmin,&jmax,bvec,&j); t_get = t_get + GA_Wtime() - t_beg; t_beg = GA_Wtime(); for (i=0; i<nrows; i++) { kmin = ival[i]; kmax = ival[i+1]-1; tempc = 0.0; for (j = kmin; j<=kmax; j++) { jj = jval[j]; tempc = tempc + rval[j]*bvec[jj]; } p_c[i] = p_c[i] + tempc; } t_mult = t_mult + GA_Wtime() - t_beg; free(bvec); free(blk); } NGA_Sync(); t_ga_tot = t_ga_tot + GA_Wtime() - t_beg2; NGA_Distribution(g_c,me,blo,bhi); NGA_Release(g_c,blo,bhi); #if USE_HYPRE alpha = 1.0; beta = 0.0; t_beg = GA_Wtime(); ierr = HYPRE_StructMatrixMatvec(alpha, matrix, vec_x, beta, vec_y); t_hypre_strct = t_hypre_strct + GA_Wtime() - t_beg; hlo[0] = lo[0]; hlo[1] = lo[1]; hlo[2] = lo[2]; hhi[0] = hi[0]; hhi[1] = hi[1]; hhi[2] = hi[2]; ierr = HYPRE_StructVectorGetBoxValues(vec_y, hlo, hhi, vector); NGA_Distribution(g_c,me,hlo,hhi); cbuf = (double*)malloc((hhi[0]-hlo[0]+1)*sizeof(double)); NGA_Get(g_c,hlo,hhi,cbuf,&one); prdot = 0.0; dotga = 0.0; dothypre = 0.0; for (i=0; i<(hhi[0]-hlo[0]+1); i++) { dothypre = dothypre + vector[i]*vector[i]; dotga = dotga + cbuf[i]*cbuf[i]; prdot = prdot + (vector[i]-cbuf[i])*(vector[i]-cbuf[i]); } NGA_Dgop(&dotga,1,"+"); NGA_Dgop(&dothypre,1,"+"); NGA_Dgop(&prdot,1,"+"); gatot += sqrt(dotga); hypretot += sqrt(dothypre); prtot += sqrt(prdot); free(cbuf); #endif /* Transpose matrix. Start by making local copies of ival and jval arrays for the sparse matrix of blocks stored in the GP array */ #if 1 t_beg2 = GA_Wtime(); t_beg3 = GA_Wtime(); iblk = (int*)malloc((nprocs+1)*sizeof(int)); iblk_t = (int*)malloc((nprocs+1)*sizeof(int)); #if 0 NGA_Get(g_a_i,&zero,&nprocs,iblk,&one); #else if (me == 0) { NGA_Get(g_a_i,&zero,&nprocs,iblk,&one); } else { for (i=0; i<nprocs+1; i++) { iblk[i] = 0; } } GA_Igop(iblk,nprocs+1,"+"); #endif jblk = (int*)malloc(iblk[nprocs]*sizeof(int)); jblk_t = (int*)malloc(iblk[nprocs]*sizeof(int)); iblock = iblk[nprocs]-1; #if 0 NGA_Get(g_a_j,&zero,&iblock,jblk,&one); #else if (me == 0) { NGA_Get(g_a_j,&zero,&iblock,jblk,&one); } else { for (i=0; i<iblock+1; i++) { jblk[i] = 0; } } GA_Igop(jblk,iblock+1,"+"); #endif iblock++; blkidx = (int*)malloc(iblk[nprocs]*sizeof(int)); blkidx_t = (int*)malloc(iblk[nprocs]*sizeof(int)); for (i=0; i<iblock; i++) { blkidx[i] = i; } iblock = nprocs; t_get_blk_csr = t_get_blk_csr + GA_Wtime() - t_beg3; t_beg3 = GA_Wtime(); stran(iblock, iblock, iblk, jblk, blkidx, iblk_t, jblk_t, blkidx_t); t_trans_blk_csr = t_trans_blk_csr + GA_Wtime() - t_beg3; t_beg3 = GA_Wtime(); gt_a_data = GP_Create_handle(); i = iblk_t[nprocs]; GP_Set_dimensions(gt_a_data, one, &i); GP_Set_irreg_distr(gt_a_data, iblk_t, &nprocs); GP_Allocate(gt_a_data); gt_a_j = NGA_Create_handle(); i = iblk_t[nprocs]; NGA_Set_data(gt_a_j, one, &i, C_INT); NGA_Set_irreg_distr(gt_a_j, iblk_t, &nprocs); NGA_Allocate(gt_a_j); gt_a_i = NGA_Create_handle(); i = nprocs+1; NGA_Set_data(gt_a_i,one,&i,C_INT); for (i=0; i<nprocs; i++) mapc[i] = i; NGA_Set_irreg_distr(gt_a_i, mapc, &nprocs); NGA_Allocate(gt_a_i); /* copy i and j arrays of transposed matrix into distributed arrays */ if (me==0) { lo_bl = 0; hi_bl = nprocs; NGA_Put(gt_a_i,&lo_bl,&hi_bl,iblk_t,&one); lo_bl = 0; hi_bl = iblk_t[nprocs]-1; NGA_Put(gt_a_j,&lo_bl,&hi_bl,jblk_t,&one); } NGA_Sync(); lo_bl = iblk[me]; hi_bl = iblk[me+1]; total_procs = hi_bl - lo_bl + 1; total_procs = hi_bl - lo_bl; t_create_csr_ga = t_create_csr_ga + GA_Wtime() - t_beg3; for (iblock = lo_bl; iblock < hi_bl; iblock++) { t_beg4 = GA_Wtime(); jdx = blkidx_t[iblock]; GP_Get_size(g_a_data, &jdx, &jdx, &isize); blk = (void*)malloc(isize); GP_Get(g_a_data, &jdx, &jdx, blk, blk_ptr, &one, &blk_size, &one, &tsize, 0); /* Parameters for original block */ iparams = (int*)blk_ptr[0]; rval = (double*)(iparams+7); imin = iparams[0]; imax = iparams[1]; jmin = iparams[2]; jmax = iparams[3]; irow = iparams[4]; icol = iparams[5]; nnz = iparams[6]; jval = (int*)(rval+nnz); ival = (int*)(jval+nnz); /* Create transposed block */ isize = 7*sizeof(int) + nnz*(sizeof(double)+sizeof(int)) + (jmax-jmin+2)*sizeof(int); t_gp_tget = t_gp_tget + GA_Wtime() - t_beg4; t_beg4 = GA_Wtime(); tblk_ptr = (int*)GP_Malloc(isize); t_gp_malloc = t_gp_malloc + GA_Wtime() - t_beg4; t_beg3 = GA_Wtime(); iparamst = (int*)tblk_ptr; rvalt = (double*)(iparamst+7); jvalt = (int*)(rvalt+nnz); ivalt = (int*)(jvalt+nnz); iparamst[0] = jmin; iparamst[1] = jmax; iparamst[2] = imin; iparamst[3] = imax; iparamst[4] = icol; iparamst[5] = irow; iparamst[6] = nnz; i = imax-imin+1; j = jmax-jmin+1; stranr(i, j, ival, jval, rval, ivalt, jvalt, rvalt); t_trans_blk = t_trans_blk + GA_Wtime() - t_beg3; t_beg4 = GA_Wtime(); GP_Assign_local_element(gt_a_data, &iblock, (void*)tblk_ptr, isize); t_gp_assign = t_gp_assign + GA_Wtime() - t_beg4; #if 1 free(blk); #endif } /* Clean up after transpose */ #if 1 free(iblk); free(iblk_t); free(jblk); free(jblk_t); free(blkidx); free(blkidx_t); #endif NGA_Sync(); t_ga_trans = t_ga_trans + GA_Wtime() - t_beg2; #if USE_HYPRE alpha = 1.0; beta = 0.0; ierr = HYPRE_StructMatrixMatvec(alpha, matrix, vec_x, beta, vec_y); hlo[0] = lo[0]; hlo[1] = lo[1]; hlo[2] = lo[2]; hhi[0] = hi[0]; hhi[1] = hi[1]; hhi[2] = hi[2]; ierr = HYPRE_StructVectorGetBoxValues(vec_y, hlo, hhi, vector); NGA_Distribution(g_c,me,hlo,hhi); cbuf = (double*)malloc((hhi[0]-hlo[0]+1)*sizeof(double)); NGA_Get(g_c,hlo,hhi,cbuf,&one); dothypre = 0.0; dotga = 0.0; prdot2 = 0.0; for (i=0; i<(hhi[0]-hlo[0]+1); i++) { dothypre = dothypre + vector[i]*vector[i]; dotga = dotga + cbuf[i]*cbuf[i]; if (fabs(vector[i]-cbuf[i]) > 1.0e-10) { printf("p[%d] i: %d vector: %f cbuf: %f\n",me,i,vector[i],cbuf[i]); } prdot2 = prdot2 + (vector[i]-cbuf[i])*(vector[i]-cbuf[i]); } NGA_Dgop(&dotga,1,"+"); NGA_Dgop(&dothypre,1,"+"); NGA_Dgop(&prdot2,1,"+"); prtot2 += sqrt(prdot2); gatot2 += sqrt(dotga); hypretot2 += sqrt(dothypre); free(cbuf); free(blk_ptr); #endif /* Clean up transposed matrix */ GP_Distribution(gt_a_data,me,blo,bhi); for (i=blo[0]; i<bhi[0]; i++) { GP_Free(GP_Free_local_element(gt_a_data,&i)); } GP_Destroy(gt_a_data); NGA_Destroy(gt_a_i); NGA_Destroy(gt_a_j); #endif #endif } free(vector); #if USE_HYPRE if (me == 0) { printf("Magnitude of GA solution: %e\n", gatot/((double)LOOPNUM)); printf("Magnitude of HYPRE solution: %e\n", hypretot/((double)LOOPNUM)); printf("Magnitude of GA solution(2): %e\n", gatot2/((double)LOOPNUM)); printf("Magnitude of HYPRE solution(2): %e\n", hypretot2/((double)LOOPNUM)); printf("Difference between GA and HYPRE (Struct) results: %e\n", prtot/((double)LOOPNUM)); printf("Difference between transpose and HYPRE results: %e\n", prtot2/((double)LOOPNUM)); } #endif /* Clean up arrays */ NGA_Destroy(g_b); NGA_Destroy(g_c); GP_Distribution(g_a_data,me,blo,bhi); for (i=blo[0]; i<bhi[0]; i++) { GP_Free(GP_Free_local_element(g_a_data,&i)); } GP_Destroy(g_a_data); NGA_Destroy(g_a_i); NGA_Destroy(g_a_j); #if USE_HYPRE ierr = HYPRE_StructStencilDestroy(stencil); ierr = HYPRE_StructGridDestroy(grid); ierr = HYPRE_StructMatrixDestroy(matrix); ierr = HYPRE_StructVectorDestroy(vec_x); ierr = HYPRE_StructVectorDestroy(vec_y); #endif NGA_Dgop(&t_cnstrct,1,"+"); NGA_Dgop(&t_get,1,"+"); NGA_Dgop(&t_gp_get,1,"+"); NGA_Dgop(&t_mult,1,"+"); NGA_Dgop(&t_ga_tot,1,"+"); NGA_Dgop(&t_ga_trans,1,"+"); NGA_Dgop(&t_get_blk_csr,1,"+"); NGA_Dgop(&t_trans_blk_csr,1,"+"); NGA_Dgop(&t_trans_blk,1,"+"); NGA_Dgop(&t_create_csr_ga,1,"+"); NGA_Dgop(&t_gp_tget,1,"+"); NGA_Dgop(&t_gp_malloc,1,"+"); NGA_Dgop(&t_gp_assign,1,"+"); #if USE_HYPRE NGA_Dgop(&t_hypre_strct,1,"+"); #endif free(mapc); if (me == 0) { printf("Time to create sparse matrix: %12.4f\n", t_cnstrct/((double)(nprocs*LOOPNUM))); printf("Time to get right hand side vector: %12.4f\n", t_get/((double)(nprocs*LOOPNUM))); printf("Time to get GP blocks: %12.4f\n", t_gp_get/((double)(nprocs*LOOPNUM))); printf("Time for sparse matrix block multiplication: %12.4f\n", t_mult/((double)(nprocs*LOOPNUM))); printf("Time for total sparse matrix multiplication: %12.4f\n", t_ga_tot/((double)(nprocs*LOOPNUM))); #if USE_HYPRE printf("Total time for HYPRE (Struct) matrix-vector multiply:%12.4f\n", t_hypre_strct/((double)(nprocs*LOOPNUM))); #endif printf("Time to get block CSR distribution: %12.4f\n", t_get_blk_csr/((double)(nprocs*LOOPNUM))); printf("Time for transposing block CSR distribution: %12.4f\n", t_trans_blk_csr/((double)(nprocs*LOOPNUM))); printf("Time for creating transposed block CSR GA: %12.4f\n", t_create_csr_ga/((double)(nprocs*LOOPNUM))); printf("Time for transposing blocks: %12.4f\n", t_trans_blk/((double)(nprocs*LOOPNUM))); printf("Time to get GP blocks for transpose: %12.4f\n", t_gp_tget/((double)(nprocs*LOOPNUM))); printf("Time to malloc GP blocks for transpose: %12.4f\n", t_gp_malloc/((double)(nprocs*LOOPNUM))); printf("Time to assign GP blocks for transpose: %12.4f\n", t_gp_assign/((double)(nprocs*LOOPNUM))); printf("Time for total sparse matrix transpose: %12.4f\n", t_ga_trans/((double)(nprocs*LOOPNUM))); } if (me==0) { printf("Terminating GA library\n"); } NGA_Terminate(); /* *** Tidy up after message-passing library */ ierr = MPI_Finalize(); }