/*--------------------------------------------------------------------------
 * hypre_SStructSharedDOF_ParcsrMatRowsComm
 *   Given an sstruct_grid & parcsr matrix with rows corresponding to the
 *   sstruct_grid, determine and extract the rows that must be communicated.
 *   These rows are for shared dofs that geometrically lie on processor
 *   boundaries but internally are stored on one processor.
 *
 *   Algo:
 *      for each cellbox
 *         RECVs:
 *          i)   stretch the cellbox to the variable box
 *          ii)  in the appropriate (dof-dependent) direction, take the
 *               boundary and boxman_intersect to extract boxmanentries
 *               that contain these boundary edges.
 *          iii) loop over the boxmanentries and see if they belong
 *               on this proc or another proc
 *                 a) if they belong on another proc, these are the recvs:
 *                    count and prepare the communication buffers and
 *                    values.
 *
 *         SENDs:
 *          i)   form the layer of cells that is one layer off the cellbox
 *               (stretches in the appropriate direction)
 *          ii)  boxman_intersect with the cellgrid boxman
 *          iii) loop over the boxmanentries and see if they belong
 *               on this proc or another proc
 *                 a) if they belong on another proc, these are the sends:
 *                    count and prepare the communication buffers and
 *                    values.
 *
 * Note: For the recv data, the dof can come from only one processor.
 *       For the send data, the dof can go to more than one processor
 *       (the same dof is on the boundary of several cells).
 *--------------------------------------------------------------------------*/

HYPRE_Int
hypre_SStructSharedDOF_ParcsrMatRowsComm( hypre_SStructGrid        *grid,
                                          hypre_ParCSRMatrix       *A,
                                          HYPRE_Int                *num_offprocrows_ptr,
                                          hypre_MaxwellOffProcRow ***OffProcRows_ptr )
{
   MPI_Comm                   A_comm    = hypre_ParCSRMatrixComm(A);
   MPI_Comm                   grid_comm = hypre_SStructGridComm(grid);

   HYPRE_Int                  matrix_type = HYPRE_PARCSR;

   HYPRE_Int                  nparts = hypre_SStructGridNParts(grid);
   HYPRE_Int                  ndim   = hypre_SStructGridNDim(grid);

   hypre_SStructGrid         *cell_ssgrid;

   hypre_SStructPGrid        *pgrid;
   hypre_StructGrid          *cellgrid;
   hypre_BoxArray            *cellboxes;
   hypre_Box                 *box, *cellbox, vbox, boxman_entry_box;

   hypre_Index                loop_size, start;
   HYPRE_Int                  loopi, loopj, loopk;
   HYPRE_Int                  start_rank, end_rank, rank;

   HYPRE_Int                  i, j, k, m, n, t, part, var, nvars;

   HYPRE_SStructVariable     *vartypes;
   HYPRE_Int                  nbdry_slabs;
   hypre_BoxArray            *recv_slabs, *send_slabs;
   hypre_Index                varoffset;

   hypre_BoxManager         **boxmans, *cell_boxman;
   hypre_BoxManEntry        **boxman_entries, *entry;
   HYPRE_Int                  nboxman_entries;

   hypre_Index                ishift, jshift, kshift, zero_index;
   hypre_Index                ilower, iupper, index;

   HYPRE_Int                  proc, nprocs, myproc;
   HYPRE_Int                 *SendToProcs, *RecvFromProcs;

   HYPRE_Int                **send_RowsNcols;        /* buffer for rows & ncols */
   HYPRE_Int                 *send_RowsNcols_alloc;
   HYPRE_Int                 *send_ColsData_alloc;
   HYPRE_Int                 *tot_nsendRowsNcols, *tot_sendColsData;
   double                   **vals;                  /* buffer for cols & data */

   HYPRE_Int                 *col_inds;
   double                    *values;

   hypre_MPI_Request         *requests;
   hypre_MPI_Status          *status;
   HYPRE_Int                **rbuffer_RowsNcols;
   double                   **rbuffer_ColsData;
   HYPRE_Int                  num_sends, num_recvs;

   hypre_MaxwellOffProcRow  **OffProcRows;
   HYPRE_Int                 *starts;

   HYPRE_Int                  ierr = 0;

   hypre_MPI_Comm_rank(A_comm, &myproc);
   hypre_MPI_Comm_size(grid_comm, &nprocs);

   start_rank = hypre_ParCSRMatrixFirstRowIndex(A);
   end_rank   = hypre_ParCSRMatrixLastRowIndex(A);

   hypre_SetIndex(ishift, 1, 0, 0);
   hypre_SetIndex(jshift, 0, 1, 0);
   hypre_SetIndex(kshift, 0, 0, 1);
   hypre_SetIndex(zero_index, 0, 0, 0);

   /* need a cellgrid boxman to determine the send boxes -> only the cell
      dofs are unique so a boxman intersect can be used to get the edges
      that must be sent. */
   HYPRE_SStructGridCreate(grid_comm, ndim, nparts, &cell_ssgrid);

   vartypes = hypre_CTAlloc(HYPRE_SStructVariable, 1);
   vartypes[0] = HYPRE_SSTRUCT_VARIABLE_CELL;

   for (i = 0; i < nparts; i++)
   {
      pgrid = hypre_SStructGridPGrid(grid, i);
      cellgrid = hypre_SStructPGridCellSGrid(pgrid);

      cellboxes = hypre_StructGridBoxes(cellgrid);
      hypre_ForBoxI(j, cellboxes)
      {
         box = hypre_BoxArrayBox(cellboxes, j);
         HYPRE_SStructGridSetExtents(cell_ssgrid, i,
                                     hypre_BoxIMin(box), hypre_BoxIMax(box));
      }
      HYPRE_SStructGridSetVariables(cell_ssgrid, i, 1, vartypes);
   }
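   /* Illustrative sketch (the original routine is truncated in this excerpt):
    * roughly how the RECV steps i)-iii) from the header comment could look
    * for a single cellbox, part, and variable. The box-manager query calls
    * used here (hypre_SStructGridBoxManager, hypre_BoxManIntersect,
    * hypre_SStructBoxManEntryGetProcess) are assumptions about this hypre
    * version; treat the fragment as pseudocode, not the actual
    * implementation. */
#if 0
   hypre_CopyBox(cellbox, &vbox);
   hypre_SStructVariableGetOffset(vartypes[var], ndim, varoffset);

   /* i) stretch the cellbox to the variable box */
   for (t = 0; t < ndim; t++)
   {
      hypre_BoxIMinD(&vbox, t) -= hypre_IndexD(varoffset, t);
   }

   /* ii) take one (dof-dependent) boundary slab of the variable box and
         intersect it with that variable's box manager */
   hypre_CopyBox(&vbox, &boxman_entry_box);
   hypre_BoxIMaxD(&boxman_entry_box, 0) = hypre_BoxIMinD(&vbox, 0);
   hypre_BoxManIntersect(hypre_SStructGridBoxManager(grid, part, var),
                         hypre_BoxIMin(&boxman_entry_box),
                         hypre_BoxIMax(&boxman_entry_box),
                         &boxman_entries, &nboxman_entries);

   /* iii) entries owned by another proc are the recvs: count them so the
           recv buffers can be sized (assumes RecvFromProcs was allocated
           with length nprocs) */
   for (m = 0; m < nboxman_entries; m++)
   {
      hypre_SStructBoxManEntryGetProcess(boxman_entries[m], &proc);
      if (proc != myproc)
      {
         RecvFromProcs[proc]++;
      }
   }
   hypre_TFree(boxman_entries);
#endif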
int
main(int argc, char *argv[])
{
    GRID *g;
    DOF *u_h;
    MAT *A, *A0, *B;
    MAP *map;
    INT i;
    size_t nnz, mem, mem_peak;
    VEC *x, *y0, *y1, *y2;
    double t0, t1, dnz, dnz1, mflops, mop;
    char *fn = "../test/cube.dat";
    FLOAT mem_max = 300;
    INT refine = 0;

    phgOptionsRegisterFilename("-mesh_file", "Mesh file", (char **)&fn);
    phgOptionsRegisterInt("-loop_count", "Loop count", &loop_count);
    phgOptionsRegisterInt("-refine", "Refinement level", &refine);
    phgOptionsRegisterFloat("-mem_max", "Maximum memory", &mem_max);

    phgInit(&argc, &argv);

    g = phgNewGrid(-1);
    if (!phgImport(g, fn, FALSE))
        phgError(1, "can't read file \"%s\".\n", fn);
    phgRefineAllElements(g, refine);
    u_h = phgDofNew(g, DOF_DEFAULT, 1, "u_h", DofNoAction);

    while (TRUE) {
        phgPrintf("\n");
        if (phgBalanceGrid(g, 1.2, 1, NULL, 0.))
            phgPrintf("Repartition mesh, %d submeshes, load imbalance: %lg\n",
                      g->nprocs, (double)g->lif);

        map = phgMapCreate(u_h, NULL);
        A = phgMapCreateMat(map, map);
        A->handle_bdry_eqns = TRUE;
        build_matrix(A, u_h);
        phgMatAssemble(A);
        /* Note: A is unsymmetric (A' != A) if boundary entries not removed */
        phgMatRemoveBoundaryEntries(A);

#if 0
        /* test block matrix operation */
        A0 = phgMatCreateBlockMatrix(g->comm, 1, 1, &A, NULL);
#else
        A0 = A;
#endif

        phgPrintf("%d DOF, %d elems, %d submeshes, matrix size: %d, LIF: %lg\n",
                  DofGetDataCountGlobal(u_h), g->nleaf_global, g->nprocs,
                  A->rmap->nglobal, (double)g->lif);

        /* test PHG mat-vec multiply */
        x = phgMapCreateVec(A->cmap, 1);
        y1 = phgMapCreateVec(A->rmap, 1);
        phgVecRandomize(x, 123);
        phgMatVec(MAT_OP_N, 1.0, A0, x, 0.0, &y1);

        phgPerfGetMflops(g, NULL, NULL);  /* reset flops counter */
        t0 = phgGetTime(NULL);
        for (i = 0; i < loop_count; i++) {
            phgMatVec(MAT_OP_N, 1.0, A0, x, 0.0, &y1);
        }
        t1 = phgGetTime(NULL);
        mflops = phgPerfGetMflops(g, NULL, NULL);
        y0 = phgVecCopy(y1, NULL);
        nnz = A->nnz_d + A->nnz_o;
#if USE_MPI
        dnz1 = nnz;
        MPI_Reduce(&dnz1, &dnz, 1, MPI_DOUBLE, MPI_SUM, 0, g->comm);
#else
        dnz = nnz;
#endif
        mop = loop_count * (dnz + dnz - A->rmap->nlocal) * 1e-6;

        phgPrintf("\n");
        t1 -= t0;
        phgPrintf(" PHG: time %0.4lf, nnz %0.16lg, %0.2lfMF (%0.2lfMF)\n",
                  t1, dnz, mop / (t1 == 0 ? 1. : t1), mflops);

        /* test trans(A)*x */
        phgPerfGetMflops(g, NULL, NULL);  /* reset flops counter */
        t0 = phgGetTime(NULL);
        for (i = 0; i < loop_count; i++) {
            phgMatVec(MAT_OP_T, 1.0, A0, x, 0.0, &y1);
        }
        t1 = phgGetTime(NULL);
        mflops = phgPerfGetMflops(g, NULL, NULL);
        t1 -= t0;
        phgPrintf(" A'*x: time %0.4lf, nnz %0.16lg, %0.2lfMF (%0.2lfMF), "
                  "err: %le\n", t1, dnz, mop / (t1 == 0 ? 1. : t1), mflops,
                  (double)phgVecNorm2(phgVecAXPBY(-1.0, y0, 1.0, &y1), 0, NULL));

        /* time B = A * A */
        phgPerfGetMflops(g, NULL, NULL);  /* reset flops counter */
        t0 = phgGetTime(NULL);
        B = phgMatMat(MAT_OP_N, MAT_OP_N, 1.0, A, A, 0.0, NULL);
        t1 = phgGetTime(NULL);
        mflops = phgPerfGetMflops(g, NULL, NULL);
        nnz = B->nnz_d + B->nnz_o;
#if USE_MPI
        dnz1 = nnz;
        MPI_Reduce(&dnz1, &dnz, 1, MPI_DOUBLE, MPI_SUM, 0, g->comm);
#else
        dnz = nnz;
#endif
        /* compare B*x <--> A*A*x */
        y2 = phgMatVec(MAT_OP_N, 1.0, B, x, 0.0, NULL);
        phgMatVec(MAT_OP_N, 1.0, A0, y0, 0.0, &y1);
        phgMatDestroy(&B);
        t1 -= t0;
        phgPrintf(" A*A: time %0.4lf, nnz %0.16lg, %0.2lfMF, err: %le\n",
                  t1, dnz, mflops,
                  (double)phgVecNorm2(phgVecAXPBY(-1.0, y1, 1.0, &y2), 0, NULL));

#if USE_PETSC
        {
            Mat ma, mb;
            MatInfo info;
            Vec va, vb, vc;
            PetscScalar *vec;

            ma = phgPetscCreateMatAIJ(A);
            MatGetVecs(ma, PETSC_NULL, &va);
            VecDuplicate(va, &vb);
            VecGetArray(va, &vec);
            memcpy(vec, x->data, x->map->nlocal * sizeof(*vec));
            VecRestoreArray(va, &vec);
            MatMult(ma, va, vb);

            phgPerfGetMflops(g, NULL, NULL);  /* reset flops counter */
            t0 = phgGetTime(NULL);
            for (i = 0; i < loop_count; i++) {
                MatMult(ma, va, vb);
            }
            t1 = phgGetTime(NULL);
            mflops = phgPerfGetMflops(g, NULL, NULL);
            VecGetArray(vb, &vec);
            memcpy(y1->data, vec, x->map->nlocal * sizeof(*vec));
            VecRestoreArray(vb, &vec);
            MatGetInfo(ma, MAT_GLOBAL_SUM, &info);
            /*phgPrintf(" --------------------------------------------"
                        "-------------------------\n");*/
            phgPrintf("\n");
            t1 -= t0;
            dnz = info.nz_used;
            phgPrintf(" PETSc: time %0.4lf, nnz %0.16lg, %0.2lfMF (%0.2lfMF), "
                      "err: %le\n", t1, dnz, mop / (t1 == 0 ? 1. : t1), mflops,
                      (double)phgVecNorm2(phgVecAXPBY(-1.0, y0, 1.0, &y1), 0, NULL));

            phgPerfGetMflops(g, NULL, NULL);  /* reset flops counter */
            t0 = phgGetTime(NULL);
            for (i = 0; i < loop_count; i++) {
                MatMultTranspose(ma, va, vb);
            }
            t1 = phgGetTime(NULL);
            mflops = phgPerfGetMflops(g, NULL, NULL);
            VecGetArray(vb, &vec);
            memcpy(y1->data, vec, x->map->nlocal * sizeof(*vec));
            VecRestoreArray(vb, &vec);
            t1 -= t0;
            phgPrintf(" A'*x: time %0.4lf, nnz %0.16lg, %0.2lfMF (%0.2lfMF), "
                      "err: %le\n", t1, dnz, mop / (t1 == 0 ? 1. : t1), mflops,
                      (double)phgVecNorm2(phgVecAXPBY(-1.0, y0, 1.0, &y1), 0, NULL));

            phgPerfGetMflops(g, NULL, NULL);  /* reset flops counter */
            t0 = phgGetTime(NULL);
            MatMatMult(ma, ma, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &mb);
            t1 = phgGetTime(NULL);
            mflops = phgPerfGetMflops(g, NULL, NULL);
            t1 -= t0;
            MatGetInfo(mb, MAT_GLOBAL_SUM, &info);
            dnz = info.nz_used;
            VecDuplicate(va, &vc);
            /* compare B*x <--> A*A*x */
            MatMult(ma, vb, vc);
            MatMult(mb, va, vb);
            VecGetArray(vb, &vec);
            memcpy(y1->data, vec, x->map->nlocal * sizeof(*vec));
            VecRestoreArray(vb, &vec);
            VecGetArray(vc, &vec);
            memcpy(y2->data, vec, x->map->nlocal * sizeof(*vec));
            VecRestoreArray(vc, &vec);
            phgPrintf(" A*A: time %0.4lf, nnz %0.16lg, %0.2lfMF, err: %le\n",
                      t1, dnz, mflops,
                      (double)phgVecNorm2(phgVecAXPBY(-1.0, y1, 1.0, &y2), 0, NULL));

            phgPetscMatDestroy(&mb);
            phgPetscMatDestroy(&ma);
            phgPetscVecDestroy(&va);
            phgPetscVecDestroy(&vb);
            phgPetscVecDestroy(&vc);
        }
#endif /* USE_PETSC */

#if USE_HYPRE
        {
            HYPRE_IJMatrix ma;
            HYPRE_IJVector va, vb, vc;
            HYPRE_ParCSRMatrix par_ma;
            hypre_ParCSRMatrix *par_mb;
            HYPRE_ParVector par_va, par_vb, par_vc;
            HYPRE_Int offset, *ni, start, end;

            assert(sizeof(INT) == sizeof(int) && sizeof(FLOAT) == sizeof(double));
            setup_hypre_mat(A, &ma);
            ni = phgAlloc(2 * A->rmap->nlocal * sizeof(*ni));
            offset = A->cmap->partition[A->cmap->rank];
            for (i = 0; i < A->rmap->nlocal; i++)
                ni[i] = i + offset;
            HYPRE_IJVectorCreate(g->comm, offset, offset + A->rmap->nlocal - 1, &va);
            HYPRE_IJVectorCreate(g->comm, offset, offset + A->rmap->nlocal - 1, &vb);
            HYPRE_IJVectorCreate(g->comm, offset, offset + A->rmap->nlocal - 1, &vc);
            HYPRE_IJVectorSetObjectType(va, HYPRE_PARCSR);
            HYPRE_IJVectorSetObjectType(vb, HYPRE_PARCSR);
            HYPRE_IJVectorSetObjectType(vc, HYPRE_PARCSR);
            HYPRE_IJVectorSetMaxOffProcElmts(va, 0);
            HYPRE_IJVectorSetMaxOffProcElmts(vb, 0);
            HYPRE_IJVectorSetMaxOffProcElmts(vc, 0);
            HYPRE_IJVectorInitialize(va);
            HYPRE_IJVectorInitialize(vb);
            HYPRE_IJVectorInitialize(vc);
            HYPRE_IJMatrixGetObject(ma, (void **)(void *)&par_ma);
            HYPRE_IJVectorGetObject(va, (void **)(void *)&par_va);
            HYPRE_IJVectorGetObject(vb, (void **)(void *)&par_vb);
            HYPRE_IJVectorGetObject(vc, (void **)(void *)&par_vc);
            HYPRE_IJVectorSetValues(va, A->cmap->nlocal, ni, (double *)x->data);
            HYPRE_IJVectorAssemble(va);
            HYPRE_IJVectorAssemble(vb);
            HYPRE_IJVectorAssemble(vc);

            HYPRE_IJMatrixGetRowCounts(ma, A->cmap->nlocal, ni, ni + A->rmap->nlocal);
            for (i = 0, nnz = 0; i < A->rmap->nlocal; i++)
                nnz += ni[A->rmap->nlocal + i];
#if USE_MPI
            dnz1 = nnz;
            MPI_Reduce(&dnz1, &dnz, 1, MPI_DOUBLE, MPI_SUM, 0, g->comm);
#else
            dnz = nnz;
#endif
            HYPRE_ParCSRMatrixMatvec(1.0, par_ma, par_va, 0.0, par_vb);
            phgPerfGetMflops(g, NULL, NULL);  /* reset flops counter */
            t0 = phgGetTime(NULL);
            for (i = 0; i < loop_count; i++) {
                HYPRE_ParCSRMatrixMatvec(1.0, par_ma, par_va, 0.0, par_vb);
            }
            t1 = phgGetTime(NULL);
            mflops = phgPerfGetMflops(g, NULL, NULL);
            HYPRE_IJVectorGetValues(vb, A->rmap->nlocal, ni, (double *)y1->data);
            /*phgPrintf(" --------------------------------------------"
                        "-------------------------\n");*/
            phgPrintf("\n");
            t1 -= t0;
            phgPrintf(" HYPRE: time %0.4lf, nnz %0.16lg, %0.2lfMF (%0.2lfMF), "
                      "err: %le\n", t1, dnz, mop / (t1 == 0 ? 1. : t1), mflops,
                      (double)phgVecNorm2(phgVecAXPBY(-1.0, y0, 1.0, &y1), 0, NULL));

            phgPerfGetMflops(g, NULL, NULL);  /* reset flops counter */
            t0 = phgGetTime(NULL);
            for (i = 0; i < loop_count; i++) {
                HYPRE_ParCSRMatrixMatvecT(1.0, par_ma, par_va, 0.0, par_vb);
            }
            t1 = phgGetTime(NULL);
            mflops = phgPerfGetMflops(g, NULL, NULL);
            HYPRE_IJVectorGetValues(vb, A->rmap->nlocal, ni, (double *)y1->data);
            t1 -= t0;
            phgPrintf(" A'*x: time %0.4lf, nnz %0.16lg, %0.2lfMF (%0.2lfMF), "
                      "err: %le\n", t1, dnz, mop / (t1 == 0 ? 1. : t1), mflops,
                      (double)phgVecNorm2(phgVecAXPBY(-1.0, y0, 1.0, &y1), 0, NULL));

            phgPerfGetMflops(g, NULL, NULL);  /* reset flops counter */
            t0 = phgGetTime(NULL);
            /* Note: 'HYPRE_ParCSRMatrix' is currently typedef'ed to
             * 'hypre_ParCSRMatrix *' */
            par_mb = hypre_ParMatmul((hypre_ParCSRMatrix *)par_ma,
                                     (hypre_ParCSRMatrix *)par_ma);
            t1 = phgGetTime(NULL);
            mflops = phgPerfGetMflops(g, NULL, NULL);
            start = hypre_ParCSRMatrixFirstRowIndex(par_mb);
            end = hypre_ParCSRMatrixLastRowIndex(par_mb) + 1;
            for (i = start, nnz = 0; i < end; i++) {
                HYPRE_Int ncols;
                hypre_ParCSRMatrixGetRow(par_mb, i, &ncols, NULL, NULL);
                hypre_ParCSRMatrixRestoreRow(par_mb, i, &ncols, NULL, NULL);
                nnz += ncols;
            }
#if USE_MPI
            dnz1 = nnz;
            MPI_Reduce(&dnz1, &dnz, 1, MPI_DOUBLE, MPI_SUM, 0, g->comm);
#else
            dnz = nnz;
#endif
            /* compare B*x <--> A*A*x */
            HYPRE_ParCSRMatrixMatvec(1.0, par_ma, par_vb, 0.0, par_vc);
            HYPRE_ParCSRMatrixMatvec(1.0, (void *)par_mb, par_va, 0.0, par_vb);
            HYPRE_IJVectorGetValues(vb, A->rmap->nlocal, ni, (double *)y1->data);
            HYPRE_IJVectorGetValues(vc, A->rmap->nlocal, ni, (double *)y2->data);
            hypre_ParCSRMatrixDestroy(par_mb);
            t1 -= t0;
            phgPrintf(" A*A: time %0.4lf, nnz %0.16lg, %0.2lfMF, err: %le\n",
                      t1, dnz, mflops,
                      (double)phgVecNorm2(phgVecAXPBY(-1.0, y1, 1.0, &y2), 0, NULL));

            phgFree(ni);
            HYPRE_IJMatrixDestroy(ma);
            HYPRE_IJVectorDestroy(va);
            HYPRE_IJVectorDestroy(vb);
            HYPRE_IJVectorDestroy(vc);
        }
#endif /* USE_HYPRE */

        if (A0 != A)
            phgMatDestroy(&A0);

#if 0
        if (A->rmap->nglobal > 1000) {
            VEC *v = phgMapCreateVec(A->rmap, 3);
            for (i = 0; i < v->map->nlocal; i++) {
                v->data[i + 0 * v->map->nlocal] =
                    1 * (i + v->map->partition[g->rank]);
                v->data[i + 1 * v->map->nlocal] =
                    2 * (i + v->map->partition[g->rank]);
                v->data[i + 2 * v->map->nlocal] =
                    3 * (i + v->map->partition[g->rank]);
            }
            phgMatDumpMATLAB(A, "A", "A.m");
            phgVecDumpMATLAB(v, "v", "v.m");
            phgFinalize();
            exit(0);
        }
#endif

        phgMatDestroy(&A);
        phgVecDestroy(&x);
        phgVecDestroy(&y0);
        phgVecDestroy(&y1);
        phgVecDestroy(&y2);
        phgMapDestroy(&map);

        mem = phgMemoryUsage(g, &mem_peak);
        dnz = mem / (1024.0 * 1024.0);
        dnz1 = mem_peak / (1024.0 * 1024.0);
        /*phgPrintf(" --------------------------------------------"
                    "-------------------------\n");*/
        phgPrintf("\n");
        phgPrintf(" Memory: current %0.4lgMB, peak %0.4lgMB\n", dnz, dnz1);

#if 0
        {
            static int loop_count = 0;
            if (++loop_count == 4)
                break;
        }
#endif
        if (mem_peak > 1024 * (size_t)1024 * mem_max)
            break;
        phgRefineAllElements(g, 1);
    }

    phgDofFree(&u_h);
    phgFreeGrid(&g);
    phgFinalize();

    return 0;
}