int main(int argc, char **argv )
{
  int
    me,                 /* holds the index of "this" process */
    nprocs,             /* holds the number of processes involved */
    nprows, npcols,     /* mesh sizes */
    myrow, mycol,       /* this node's mesh coordinates */
    n,                  /* global matrix size */
    local_m,            /* local row size of A, B, C */
    local_n;            /* local column size of A, B, C */
  double
    *global_A,          /* array in which to hold matrix A */
    *global_B,          /* array in which to hold matrix B */
    *global_C,          /* array in which to hold matrix C */
    *local_A,           /* array in which to hold local part of matrix A */
    *local_B,           /* array in which to hold local part of matrix B */
    *local_C,           /* array in which to hold local part of matrix C */
    *local_C_ref,       /* array in which to hold local part of matrix C */
    local_diff,         /* hold difference between sequential and */
    diff,               /* parallel result */
    d_one = 1.0;        /* double precision one, to pass by address */
  MPI_Comm comm_row, comm_col;  /* communicators for the row and col in which this node exists */

  /* Initialize MPI, passing in the command-line parameters.  MPI_Init strips out
     the command-line parameters for MPI (e.g., the -np 5 in our example) and then
     returns argc and argv without those parameters. */
  MPI_Init( &argc, &argv );

  /* Inquire how many processes were started up by mpiexec */
  MPI_Comm_size( MPI_COMM_WORLD, &nprocs );

  /* Inquire the index (rank) of "this" process within the processes started up by mpiexec */
  MPI_Comm_rank( MPI_COMM_WORLD, &me );

  /* Process 0 accepts input */
  if ( me == 0 ){
    printf("enter matrix size n:");
    scanf( "%d", &n );
    printf("enter nprows, npcols:");
    scanf( "%d%d", &nprows, &npcols );
  }

  /* share parameters with all nodes */
  MPI_Bcast( &n, 1, MPI_INT, 0, MPI_COMM_WORLD );
  MPI_Bcast( &nprows, 1, MPI_INT, 0, MPI_COMM_WORLD );
  MPI_Bcast( &npcols, 1, MPI_INT, 0, MPI_COMM_WORLD );

  if ( nprows * npcols != nprocs ){
    printf( "mesh not of right size\n" );
    exit( 0 );
  }

  /* Figure out what my index is */
  mycol = me / nprows;
  myrow = me % nprows;

  /* create a communicator for the row of which I am part */
  MPI_Comm_split( MPI_COMM_WORLD, myrow, mycol, &comm_row );

  /* create a communicator for the column of which I am part */
  MPI_Comm_split( MPI_COMM_WORLD, mycol, myrow, &comm_col );

  /* create buffers in which to hold the global A, B, C (everyone will have a copy) */
  global_A = ( double * ) malloc ( sizeof( double ) * n * n );
  global_B = ( double * ) malloc ( sizeof( double ) * n * n );
  global_C = ( double * ) malloc ( sizeof( double ) * n * n );

  /* create random matrices on node zero and share with all nodes */
  if ( me == 0 ){
    random_matrix( n, n, global_A, n );
    random_matrix( n, n, global_B, n );
    random_matrix( n, n, global_C, n );
  }
  MPI_Bcast( global_A, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD );
  MPI_Bcast( global_B, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD );
  MPI_Bcast( global_C, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD );

  /* compute local matrix sizes */
  local_m = n / nprows + ( myrow < n % nprows ? 1 : 0 );
  local_n = n / npcols + ( mycol < n % npcols ? 1 : 0 );

  /* create buffers in which to hold the local A, B, C */
  local_A = ( double * ) malloc ( sizeof( double ) * local_m * local_n );
  local_B = ( double * ) malloc ( sizeof( double ) * local_m * local_n );
  local_C = ( double * ) malloc ( sizeof( double ) * local_m * local_n );

  /* copy the local parts */
  CopyMatrixGlobalToLocal( n, n, global_A, n, local_A, local_m, comm_row, comm_col );
  CopyMatrixGlobalToLocal( n, n, global_B, n, local_B, local_m, comm_row, comm_col );
  CopyMatrixGlobalToLocal( n, n, global_C, n, local_C, local_m, comm_row, comm_col );

  /* Compute parallel matrix-matrix multiply */
  ParallelMMult( n, n, n, local_A, local_m, local_B, local_m, local_C, local_m,
                 comm_row, comm_col );

  /* Compute sequential matrix-matrix multiply on all nodes */
  dgemm_( "N", "N", &n, &n, &n, &d_one, global_A, &n, global_B, &n, &d_one, global_C, &n );

  local_C_ref = ( double * ) malloc ( sizeof( double ) * local_m * local_n );
  CopyMatrixGlobalToLocal( n, n, global_C, n, local_C_ref, local_m, comm_row, comm_col );

  local_diff = compare_matrices( local_m, local_n, local_C, local_m, local_C_ref, local_m );
  MPI_Allreduce( &local_diff, &diff, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );

  if ( me == 0 )
    printf("\ndiff = %le\n", diff );

  free( global_A );
  free( global_B );
  free( global_C );
  free( local_A );
  free( local_B );
  free( local_C );
  free( local_C_ref );

  MPI_Comm_free( &comm_row );
  MPI_Comm_free( &comm_col );

  /* Clean up the MPI environment */
  MPI_Finalize();

  exit( 0 );
}
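The driver above relies on two small helpers, random_matrix and compare_matrices, that are not shown in this listing. The sketch below is an assumption about what they might look like: column-major storage with an explicit leading dimension is assumed (consistent with the dgemm_ call), and returning the largest absolute elementwise difference is only one plausible convention for compare_matrices.

/* Hypothetical helper sketches; the real routines may differ. */
#include <stdlib.h>
#include <math.h>

void random_matrix( int m, int n, double *a, int lda )
{
  int i, j;
  for ( j = 0; j < n; j++ )
    for ( i = 0; i < m; i++ )
      a[ j * lda + i ] = (double) rand() / RAND_MAX;   /* entries in [0,1] */
}

double compare_matrices( int m, int n, double *a, int lda, double *b, int ldb )
{
  int i, j;
  double d, diff = 0.0;
  for ( j = 0; j < n; j++ )
    for ( i = 0; i < m; i++ ){
      d = fabs( a[ j * lda + i ] - b[ j * ldb + i ] );
      if ( d > diff ) diff = d;
    }
  return diff;
}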
static PetscErrorCode triangulateAndFormProl(IS selected_2,PetscInt data_stride,PetscReal coords[],PetscInt nselected_1,const PetscInt clid_lid_1[],const PetscCoarsenData *agg_lists_1, const PetscInt crsGID[],PetscInt bs,Mat a_Prol,PetscReal *a_worst_best) { #if defined(PETSC_HAVE_TRIANGLE) PetscErrorCode ierr; PetscInt jj,tid,tt,idx,nselected_2; struct triangulateio in,mid; const PetscInt *selected_idx_2; PetscMPIInt rank; PetscInt Istart,Iend,nFineLoc,myFine0; int kk,nPlotPts,sid; MPI_Comm comm; PetscReal tm; PetscFunctionBegin; ierr = PetscObjectGetComm((PetscObject)a_Prol,&comm);CHKERRQ(ierr); ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); ierr = ISGetSize(selected_2, &nselected_2);CHKERRQ(ierr); if (nselected_2 == 1 || nselected_2 == 2) { /* 0 happens on idle processors */ *a_worst_best = 100.0; /* this will cause a stop, but not globalized (should not happen) */ } else *a_worst_best = 0.0; ierr = MPI_Allreduce(a_worst_best, &tm, 1, MPIU_REAL, MPIU_MAX, comm);CHKERRQ(ierr); if (tm > 0.0) { *a_worst_best = 100.0; PetscFunctionReturn(0); } ierr = MatGetOwnershipRange(a_Prol, &Istart, &Iend);CHKERRQ(ierr); nFineLoc = (Iend-Istart)/bs; myFine0 = Istart/bs; nPlotPts = nFineLoc; /* locals */ /* traingle */ /* Define input points - in*/ in.numberofpoints = nselected_2; in.numberofpointattributes = 0; /* get nselected points */ ierr = PetscMalloc1(2*nselected_2, &in.pointlist);CHKERRQ(ierr); ierr = ISGetIndices(selected_2, &selected_idx_2);CHKERRQ(ierr); for (kk=0,sid=0; kk<nselected_2; kk++,sid += 2) { PetscInt lid = selected_idx_2[kk]; in.pointlist[sid] = coords[lid]; in.pointlist[sid+1] = coords[data_stride + lid]; if (lid>=nFineLoc) nPlotPts++; } if (sid != 2*nselected_2) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"sid %D != 2*nselected_2 %D",sid,nselected_2); in.numberofsegments = 0; in.numberofedges = 0; in.numberofholes = 0; in.numberofregions = 0; in.trianglelist = 0; in.segmentmarkerlist = 0; in.pointattributelist = 0; in.pointmarkerlist = 0; in.triangleattributelist = 0; in.trianglearealist = 0; in.segmentlist = 0; in.holelist = 0; in.regionlist = 0; in.edgelist = 0; in.edgemarkerlist = 0; in.normlist = 0; /* triangulate */ mid.pointlist = 0; /* Not needed if -N switch used. */ /* Not needed if -N switch used or number of point attributes is zero: */ mid.pointattributelist = 0; mid.pointmarkerlist = 0; /* Not needed if -N or -B switch used. */ mid.trianglelist = 0; /* Not needed if -E switch used. */ /* Not needed if -E switch used or number of triangle attributes is zero: */ mid.triangleattributelist = 0; mid.neighborlist = 0; /* Needed only if -n switch used. */ /* Needed only if segments are output (-p or -c) and -P not used: */ mid.segmentlist = 0; /* Needed only if segments are output (-p or -c) and -P and -B not used: */ mid.segmentmarkerlist = 0; mid.edgelist = 0; /* Needed only if -e switch used. */ mid.edgemarkerlist = 0; /* Needed if -e used and -B not used. */ mid.numberoftriangles = 0; /* Triangulate the points. Switches are chosen to read and write a */ /* PSLG (p), preserve the convex hull (c), number everything from */ /* zero (z), assign a regional attribute to each element (A), and */ /* produce an edge list (e), a Voronoi diagram (v), and a triangle */ /* neighbor list (n). */ if (nselected_2 != 0) { /* inactive processor */ char args[] = "npczQ"; /* c is needed ? 
*/ triangulate(args, &in, &mid, (struct triangulateio*) NULL); /* output .poly files for 'showme' */ if (!PETSC_TRUE) { static int level = 1; FILE *file; char fname[32]; sprintf(fname,"C%d_%d.poly",level,rank); file = fopen(fname, "w"); /*First line: <# of vertices> <dimension (must be 2)> <# of attributes> <# of boundary markers (0 or 1)>*/ fprintf(file, "%d %d %d %d\n",in.numberofpoints,2,0,0); /*Following lines: <vertex #> <x> <y> */ for (kk=0,sid=0; kk<in.numberofpoints; kk++,sid += 2) { fprintf(file, "%d %e %e\n",kk,in.pointlist[sid],in.pointlist[sid+1]); } /*One line: <# of segments> <# of boundary markers (0 or 1)> */ fprintf(file, "%d %d\n",0,0); /*Following lines: <segment #> <endpoint> <endpoint> [boundary marker] */ /* One line: <# of holes> */ fprintf(file, "%d\n",0); /* Following lines: <hole #> <x> <y> */ /* Optional line: <# of regional attributes and/or area constraints> */ /* Optional following lines: <region #> <x> <y> <attribute> <maximum area> */ fclose(file); /* elems */ sprintf(fname,"C%d_%d.ele",level,rank); file = fopen(fname, "w"); /* First line: <# of triangles> <nodes per triangle> <# of attributes> */ fprintf(file, "%d %d %d\n",mid.numberoftriangles,3,0); /* Remaining lines: <triangle #> <node> <node> <node> ... [attributes] */ for (kk=0,sid=0; kk<mid.numberoftriangles; kk++,sid += 3) { fprintf(file, "%d %d %d %d\n",kk,mid.trianglelist[sid],mid.trianglelist[sid+1],mid.trianglelist[sid+2]); } fclose(file); sprintf(fname,"C%d_%d.node",level,rank); file = fopen(fname, "w"); /* First line: <# of vertices> <dimension (must be 2)> <# of attributes> <# of boundary markers (0 or 1)> */ /* fprintf(file, "%d %d %d %d\n",in.numberofpoints,2,0,0); */ fprintf(file, "%d %d %d %d\n",nPlotPts,2,0,0); /*Following lines: <vertex #> <x> <y> */ for (kk=0,sid=0; kk<in.numberofpoints; kk++,sid+=2) { fprintf(file, "%d %e %e\n",kk,in.pointlist[sid],in.pointlist[sid+1]); } sid /= 2; for (jj=0; jj<nFineLoc; jj++) { PetscBool sel = PETSC_TRUE; for (kk=0; kk<nselected_2 && sel; kk++) { PetscInt lid = selected_idx_2[kk]; if (lid == jj) sel = PETSC_FALSE; } if (sel) fprintf(file, "%d %e %e\n",sid++,coords[jj],coords[data_stride + jj]); } fclose(file); if (sid != nPlotPts) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"sid %D != nPlotPts %D",sid,nPlotPts); level++; } } #if defined PETSC_GAMG_USE_LOG ierr = PetscLogEventBegin(petsc_gamg_setup_events[FIND_V],0,0,0,0);CHKERRQ(ierr); #endif { /* form P - setup some maps */ PetscInt clid,mm,*nTri,*node_tri; ierr = PetscMalloc2(nselected_2, &node_tri,nselected_2, &nTri);CHKERRQ(ierr); /* need list of triangles on node */ for (kk=0; kk<nselected_2; kk++) nTri[kk] = 0; for (tid=0,kk=0; tid<mid.numberoftriangles; tid++) { for (jj=0; jj<3; jj++) { PetscInt cid = mid.trianglelist[kk++]; if (nTri[cid] == 0) node_tri[cid] = tid; nTri[cid]++; } } #define EPS 1.e-12 /* find points and set prolongation */ for (mm = clid = 0; mm < nFineLoc; mm++) { PetscBool ise; ierr = PetscCDEmptyAt(agg_lists_1,mm,&ise);CHKERRQ(ierr); if (!ise) { const PetscInt lid = mm; /* for (clid_iterator=0;clid_iterator<nselected_1;clid_iterator++) { */ PetscScalar AA[3][3]; PetscBLASInt N=3,NRHS=1,LDA=3,IPIV[3],LDB=3,INFO; PetscCDPos pos; ierr = PetscCDGetHeadPos(agg_lists_1,lid,&pos);CHKERRQ(ierr); while (pos) { PetscInt flid; ierr = PetscLLNGetID(pos, &flid);CHKERRQ(ierr); ierr = PetscCDGetNextPos(agg_lists_1,lid,&pos);CHKERRQ(ierr); if (flid < nFineLoc) { /* could be a ghost */ PetscInt bestTID = -1; PetscReal best_alpha = 1.e10; const PetscInt fgid = flid + myFine0; /* compute shape 
function for gid */ const PetscReal fcoord[3] = {coords[flid],coords[data_stride+flid],1.0}; PetscBool haveit =PETSC_FALSE; PetscScalar alpha[3]; PetscInt clids[3]; /* look for it */ for (tid = node_tri[clid], jj=0; jj < 5 && !haveit && tid != -1; jj++) { for (tt=0; tt<3; tt++) { PetscInt cid2 = mid.trianglelist[3*tid + tt]; PetscInt lid2 = selected_idx_2[cid2]; AA[tt][0] = coords[lid2]; AA[tt][1] = coords[data_stride + lid2]; AA[tt][2] = 1.0; clids[tt] = cid2; /* store for interp */ } for (tt=0; tt<3; tt++) alpha[tt] = (PetscScalar)fcoord[tt]; /* SUBROUTINE DGESV(N, NRHS, A, LDA, IPIV, B, LDB, INFO) */ PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&N, &NRHS, (PetscScalar*)AA, &LDA, IPIV, alpha, &LDB, &INFO)); { PetscBool have=PETSC_TRUE; PetscReal lowest=1.e10; for (tt = 0, idx = 0; tt < 3; tt++) { if (PetscRealPart(alpha[tt]) > (1.0+EPS) || PetscRealPart(alpha[tt]) < -EPS) have = PETSC_FALSE; if (PetscRealPart(alpha[tt]) < lowest) { lowest = PetscRealPart(alpha[tt]); idx = tt; } } haveit = have; } tid = mid.neighborlist[3*tid + idx]; } if (!haveit) { /* brute force */ for (tid=0; tid<mid.numberoftriangles && !haveit; tid++) { for (tt=0; tt<3; tt++) { PetscInt cid2 = mid.trianglelist[3*tid + tt]; PetscInt lid2 = selected_idx_2[cid2]; AA[tt][0] = coords[lid2]; AA[tt][1] = coords[data_stride + lid2]; AA[tt][2] = 1.0; clids[tt] = cid2; /* store for interp */ } for (tt=0; tt<3; tt++) alpha[tt] = fcoord[tt]; /* SUBROUTINE DGESV(N, NRHS, A, LDA, IPIV, B, LDB, INFO) */ PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&N, &NRHS, (PetscScalar*)AA, &LDA, IPIV, alpha, &LDB, &INFO)); { PetscBool have=PETSC_TRUE; PetscReal worst=0.0, v; for (tt=0; tt<3 && have; tt++) { if (PetscRealPart(alpha[tt]) > 1.0+EPS || PetscRealPart(alpha[tt]) < -EPS) have=PETSC_FALSE; if ((v=PetscAbs(PetscRealPart(alpha[tt])-0.5)) > worst) worst = v; } if (worst < best_alpha) { best_alpha = worst; bestTID = tid; } haveit = have; } } } if (!haveit) { if (best_alpha > *a_worst_best) *a_worst_best = best_alpha; /* use best one */ for (tt=0; tt<3; tt++) { PetscInt cid2 = mid.trianglelist[3*bestTID + tt]; PetscInt lid2 = selected_idx_2[cid2]; AA[tt][0] = coords[lid2]; AA[tt][1] = coords[data_stride + lid2]; AA[tt][2] = 1.0; clids[tt] = cid2; /* store for interp */ } for (tt=0; tt<3; tt++) alpha[tt] = fcoord[tt]; /* SUBROUTINE DGESV(N, NRHS, A, LDA, IPIV, B, LDB, INFO) */ PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&N, &NRHS, (PetscScalar*)AA, &LDA, IPIV, alpha, &LDB, &INFO)); } /* put in row of P */ for (idx=0; idx<3; idx++) { PetscScalar shp = alpha[idx]; if (PetscAbs(PetscRealPart(shp)) > 1.e-6) { PetscInt cgid = crsGID[clids[idx]]; PetscInt jj = cgid*bs, ii = fgid*bs; /* need to gloalize */ for (tt=0; tt < bs; tt++, ii++, jj++) { ierr = MatSetValues(a_Prol,1,&ii,1,&jj,&shp,INSERT_VALUES);CHKERRQ(ierr); } } } } } /* aggregates iterations */ clid++; } /* a coarse agg */ } /* for all fine nodes */ ierr = ISRestoreIndices(selected_2, &selected_idx_2);CHKERRQ(ierr); ierr = MatAssemblyBegin(a_Prol,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); ierr = MatAssemblyEnd(a_Prol,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); ierr = PetscFree2(node_tri,nTri);CHKERRQ(ierr); } #if defined PETSC_GAMG_USE_LOG ierr = PetscLogEventEnd(petsc_gamg_setup_events[FIND_V],0,0,0,0);CHKERRQ(ierr); #endif free(mid.trianglelist); free(mid.neighborlist); ierr = PetscFree(in.pointlist);CHKERRQ(ierr); PetscFunctionReturn(0); #else SETERRQ(PetscObjectComm((PetscObject)a_Prol),PETSC_ERR_PLIB,"configure with TRIANGLE to use geometric MG"); #endif }
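The repeated 3x3 LAPACKgesv solve in the routine above computes barycentric coordinates: the weights alpha satisfy alpha1*(x1,y1,1) + alpha2*(x2,y2,1) + alpha3*(x3,y3,1) = (x,y,1), and the fine point lies inside the coarse triangle exactly when every alpha is in [0,1] (up to EPS), which is the test applied to the solution. A standalone illustration using Cramer's rule rather than LAPACK (not PETSc code):

/* Barycentric coordinates of (x,y) with respect to triangle (x1,y1),(x2,y2),(x3,y3).
   Illustration only; the routine above obtains the same weights via LAPACKgesv. */
static int barycentric(double x1, double y1, double x2, double y2,
                       double x3, double y3, double x, double y, double alpha[3])
{
  double det = (x2 - x1)*(y3 - y1) - (x3 - x1)*(y2 - y1); /* 2 * signed area */
  if (det == 0.0) return -1;                              /* degenerate triangle */
  alpha[0] = ((x2 - x)*(y3 - y) - (x3 - x)*(y2 - y)) / det;
  alpha[1] = ((x3 - x)*(y1 - y) - (x1 - x)*(y3 - y)) / det;
  alpha[2] = ((x1 - x)*(y2 - y) - (x2 - x)*(y1 - y)) / det;
  return 0; /* point is inside (or on) the triangle iff all alpha lie in [0,1] */
}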
bool isFinished()
{
    int ret;
    MPI_Allreduce(&active, &ret, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    return ret == 0;
}
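A hedged usage sketch for the collective test above: isFinished() must be called by every rank in step, so a typical pattern keeps even idle ranks spinning through the reduction until the summed flags reach zero. The global active flag and the loop below are assumptions for illustration, not code from the application.

/* Assumed usage pattern; 'active' is taken to be the flag reduced in isFinished(). */
int active = 1;

void work_loop( int my_items )          /* my_items: this rank's share of work */
{
    int done = 0;
    while (1) {
        active = (done < my_items);     /* still have local work? */
        if (isFinished())               /* collective: all ranks call it together */
            break;                      /* every rank leaves in the same iteration */
        if (active)
            done++;                     /* stand-in for processing one work item */
    }
}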
//============================================================================ //Here's the main... //============================================================================ int poisson_main(int argc, char** argv, MPI_Comm comm, int numProcs, int localProc){ int outputLevel; int masterProc = 0, err = 0; double start_time = fei::utils::cpu_time(); std::vector<std::string> stdstrings; CHK_ERR( fei_test_utils::get_filename_and_read_input(argc, argv, comm, localProc, stdstrings) ); const char** params = NULL; int numParams = 0; fei::utils::strings_to_char_ptrs(stdstrings, numParams, params); fei::ParameterSet paramset; fei::utils::parse_strings(stdstrings, " ", paramset); std::string which_fei; std::string solverName; int L = 0; int errcode = 0; errcode += paramset.getStringParamValue("SOLVER_LIBRARY", solverName); errcode += paramset.getStringParamValue("WHICH_FEI", which_fei); errcode += paramset.getIntParamValue("L", L); if (errcode != 0) { fei::console_out() << "Failed to find one or more required parameters in input-file." << FEI_ENDL << "Required parameters:"<<FEI_ENDL << "SOLVER_LIBRARY" << FEI_ENDL << "WHICH_FEI" << FEI_ENDL << "L" << FEI_ENDL; return(-1); } if (localProc == 0) { int nodes = (L+1)*(L+1); int eqns = nodes; FEI_COUT << FEI_ENDL; FEI_COUT << "========================================================" << FEI_ENDL; FEI_COUT << "Square size L: " << L << " elements." << FEI_ENDL; FEI_COUT << "Global number of elements: " << L*L << FEI_ENDL; FEI_COUT << "Global number of nodes: " << nodes << FEI_ENDL; FEI_COUT << "Global number of equations: " << eqns <<FEI_ENDL; FEI_COUT << "========================================================" << FEI_ENDL; } outputLevel = fei_test_utils::whichArg(numParams, params, "outputLevel 1"); if (outputLevel >= 0) outputLevel = 1; if (outputLevel < 0) outputLevel = 0; if ((masterProc == localProc)&&(outputLevel>0)) { fei_test_utils::print_args(argc, argv); } if (outputLevel == 1) { if (localProc != 0) outputLevel = 0; } //PoissonData is the object that will be in charge of generating the //data to pump into the FEI object. PoissonData poissonData(L, numProcs, localProc, outputLevel); double start_init_time = fei::utils::cpu_time(); fei::SharedPtr<fei::Factory> factory; fei::SharedPtr<LibraryWrapper> wrapper; fei::SharedPtr<FEI> fei; if (which_fei == "OLDFEI") { try { wrapper = fei::create_LibraryWrapper(comm, solverName.c_str()); } catch (std::runtime_error& exc) { fei::console_out() << exc.what()<<FEI_ENDL; ERReturn(-1); } fei.reset(new FEI_Implementation(wrapper, comm)); } else if (which_fei == "fei::FEI_Impl") { try { factory = fei::create_fei_Factory(comm, solverName.c_str()); } catch (std::runtime_error& exc) { fei::console_out() << exc.what()<<FEI_ENDL; ERReturn(-1); } fei = factory->createFEI(comm); } else { fei::console_out() << "poisson_main ERROR, value of 'WHICH_FEI' must be 'OLDFEI' or 'fei::FEI_Impl'"<< FEI_ENDL; ERReturn(-1); } const char* feiVersionString; CHK_ERR( fei->version(feiVersionString) ); if (localProc==0) FEI_COUT << feiVersionString << FEI_ENDL; //load some parameters. 
CHK_ERR( fei->parameters( numParams, params ) ); delete [] params; if (outputLevel>0 && localProc==0) FEI_COUT << "setSolveType" << FEI_ENDL; CHK_ERR( fei->setSolveType(FEI_SINGLE_SYSTEM) ); int numFields = poissonData.getNumFields(); int* fieldSizes = poissonData.getFieldSizes(); int* fieldIDs = poissonData.getFieldIDs(); if (outputLevel>0 && localProc==0) FEI_COUT << "initFields" << FEI_ENDL; CHK_ERR( fei->initFields( numFields, fieldSizes, fieldIDs ) ); CHK_ERR( init_elem_connectivities(fei.get(), poissonData) ); CHK_ERR( set_shared_nodes(fei.get(), poissonData) ); //The FEI_COUT and IOS_... macros are defined in base/fei_iostream.hpp FEI_COUT.setf(IOS_FIXED, IOS_FLOATFIELD); if (outputLevel>0 && localProc==0) FEI_COUT << "initComplete" << FEI_ENDL; CHK_ERR( fei->initComplete() ); double fei_init_time = fei::utils::cpu_time() - start_init_time; //Now the initialization phase is complete. Next we'll do the load phase, //which for this problem just consists of loading the element data //(element-wise stiffness arrays and load vectors) and the boundary //condition data. //This simple problem doesn't have an constraint relations, etc. double start_load_time = fei::utils::cpu_time(); CHK_ERR( load_elem_data(fei.get(), poissonData) ); CHK_ERR( load_BC_data(fei.get(), poissonData) ); double fei_load_time = fei::utils::cpu_time() - start_load_time; // //now the load phase is complete, so we're ready to launch the underlying //solver and solve Ax=b // int status; if (outputLevel>0 && localProc==0) FEI_COUT << "solve..." << FEI_ENDL; double start_solve_time = fei::utils::cpu_time(); err = fei->solve(status); if (err) { if (localProc==0) FEI_COUT << "solve returned err: " << err << FEI_ENDL; } double iTime, lTime, sTime, rTime; CHK_ERR(fei->cumulative_cpu_times(iTime, lTime, sTime, rTime) ); double solve_time = fei::utils::cpu_time() - start_solve_time; if (localProc == 0) { FEI_COUT << "FEI cpu-times:" << FEI_ENDL << " init. phase: " << iTime << FEI_ENDL << " load phase: " << lTime << FEI_ENDL << " solve time: " << sTime << FEI_ENDL; } double norm = 0.0; FEI_COUT.setf(IOS_SCIENTIFIC, IOS_FLOATFIELD); CHK_ERR( fei->residualNorm(1, 1, &fieldIDs[0], &norm) ); if (localProc == 0) { FEI_COUT << "returned residual norm: " << norm << FEI_ENDL; } int itersTaken = 0; CHK_ERR( fei->iterations(itersTaken) ); // //We oughtta make sure the solution we just computed is correct... // int numNodes = 0; CHK_ERR( fei->getNumLocalNodes(numNodes) ); double maxErr = 0.0; if (numNodes > 0) { int lenNodeIDs = numNodes; GlobalID* nodeIDs = new GlobalID[lenNodeIDs]; double* soln = new double[lenNodeIDs]; if (nodeIDs != NULL && soln != NULL) { CHK_ERR( fei->getLocalNodeIDList(numNodes, nodeIDs, lenNodeIDs) ); int fieldID = 1; CHK_ERR( fei->getNodalFieldSolution(fieldID, numNodes, nodeIDs, soln)); for(int i=0; i<numNodes; i++) { int nID = (int)nodeIDs[i]; double x = (1.0* ((nID-1)%(L+1)))/L; double y = (1.0* ((nID-1)/(L+1)))/L; double exactSoln = x*x + y*y; double error = std::abs(exactSoln - soln[i]); if (maxErr < error) maxErr = error; } delete [] nodeIDs; delete [] soln; } else { fei::console_out() << "allocation of nodeIDs or soln failed." 
<< FEI_ENDL; } } #ifndef FEI_SER double globalMaxErr = 0.0; MPI_Allreduce(&maxErr, &globalMaxErr, 1, MPI_DOUBLE, MPI_MAX, comm); maxErr = globalMaxErr; #endif bool testPassed = true; if (maxErr > 1.e-6) testPassed = false; if (testPassed && localProc == 0) { FEI_COUT << "poisson: TEST PASSED, maxErr = " << maxErr << ", iterations: " << itersTaken << FEI_ENDL; //This is something the SIERRA runtest tool looks for in test output... FEI_COUT << "SIERRA execution successful" << FEI_ENDL; } if (testPassed == false && localProc == 0) { FEI_COUT << "maxErr = " << maxErr << ", TEST FAILED" << FEI_ENDL; FEI_COUT << "(Test is deemed to have passed if the maximum difference" << " between the exact and computed solutions is 1.e-6 or less.)" << FEI_ENDL; } double elapsed_cpu_time = fei::utils::cpu_time() - start_time; //The following IOS_... macros are defined in base/fei_macros.h FEI_COUT.setf(IOS_FIXED, IOS_FLOATFIELD); if (localProc==0) { FEI_COUT << "Proc0 cpu times (seconds):" << FEI_ENDL << " FEI initialize: " << fei_init_time << FEI_ENDL << " FEI load: " << fei_load_time << FEI_ENDL << " solve: " << solve_time << FEI_ENDL << "Total program time: " << elapsed_cpu_time << FEI_ENDL; } wrapper.reset(); fei.reset(); factory.reset(); //If Prometheus is being used, we need to make sure that the //LibraryWrapper inside factory is deleted before MPI_Finalize() is called. //(That's why we call the 'reset' methods on these shared-pointers rather //than just letting them destroy themselves when they go out of scope.) return(0); }
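The correctness check near the end of poisson_main depends only on the node numbering of the (L+1) x (L+1) grid over the L x L element square and the manufactured solution u(x,y) = x*x + y*y. A standalone restatement of that mapping (illustration, not FEI code):

/* Map a 1-based node id back to grid coordinates and return the error
   against the exact solution u(x,y) = x^2 + y^2, as in the loop above. */
#include <math.h>

double poisson_exact_error( int nID, int L, double computed )
{
    double x = (1.0 * ((nID - 1) % (L + 1))) / L;   /* column index / L */
    double y = (1.0 * ((nID - 1) / (L + 1))) / L;   /* row index / L    */
    double exactSoln = x*x + y*y;
    return fabs( exactSoln - computed );
}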
double cell_mindt( struct Cell *** theCells, struct Sim * theSim, struct GravMass * theGravMasses ){ int i_m,j_m,k_m; double dt_m = 1.e100;//HUGE_VAL; double a_m,r_m,dx_m; double mag_vel_m; int i,j,k; for( k=sim_Nghost_min(theSim,Z_DIR) ; k<sim_N(theSim,Z_DIR)-sim_Nghost_max(theSim,Z_DIR) ; ++k ){ double zm = sim_FacePos(theSim,k-1,Z_DIR); double zp = sim_FacePos(theSim,k,Z_DIR); double dz = zp-zm; for( i=sim_Nghost_min(theSim,R_DIR) ; i<sim_N(theSim,R_DIR)-sim_Nghost_max(theSim,R_DIR) ; ++i ){ double rm = sim_FacePos(theSim,i-1,R_DIR); double rp = sim_FacePos(theSim,i,R_DIR); double dr = rp-rm; double r = .5*(rp+rm); for( j=0 ; j<sim_N_p(theSim,i) ; ++j ){ int jm = j-1; if( j==0 ) jm = sim_N_p(theSim,i)-1; double w = .5*(theCells[k][i][j].wiph+theCells[k][i][jm].wiph); double dx = dr; double rdphi = .5*(rp+rm)*theCells[k][i][j].dphi; if( rdphi<dr ) { dx = rdphi; } if( dx>dz ) { dx = dz; } double a = maxvel( theCells[k][i][j].prim , w , r ,theSim); double rho = theCells[k][i][j].prim[RHO]; double Pp = theCells[k][i][j].prim[PPP]; double dt = sim_CFL(theSim)*dx/a; if( sim_EXPLICIT_VISCOSITY(theSim)>0.0 ){ double nu; if (sim_VISC_CONST(theSim)==1){ nu = sim_EXPLICIT_VISCOSITY(theSim); } else{ double tiph = theCells[k][i][j].tiph - 0.5*theCells[k][i][j].dphi; if (sim_InitialDataType(theSim)==SHEAR){ double HoR = 0.1; //nu = sim_EXPLICIT_VISCOSITY(theSim)*HoR*HoR*pow(fabs((r*cos(tiph))),1.5); nu = sim_EXPLICIT_VISCOSITY(theSim)*sim_GAMMALAW(theSim)*Pp/rho*pow(fabs(r*cos(tiph)),2.0); if (r*cos(tiph)>20.) nu=0.000000001; } else{ double M0 = gravMass_M(theGravMasses,0); double M1 = gravMass_M(theGravMasses,1); double dist_bh0 = gravMass_dist(theGravMasses,0,r,tiph,0.); double dist_bh1 = gravMass_dist(theGravMasses,1,r,tiph,0.); double alpha = sim_EXPLICIT_VISCOSITY(theSim); double eps = sim_G_EPS(theSim); //nu = sim_EXPLICIT_VISCOSITY(theSim)*sim_GAMMALAW(theSim)*Pp/rho*pow(r,1.5); nu = alpha*Pp/rho/sqrt(pow(dist_bh0*dist_bh0+eps*eps,-1.5)*M0+pow(dist_bh1*dist_bh1+eps*eps,-1.5)*M1); } } double dt_visc = .25*dx*dx/nu; dt = dt/( 1. + dt/dt_visc ); } if( dt_m > dt ) { dt_m = dt; } } } } double dt2; MPI_Allreduce( &dt_m , &dt2 , 1 , MPI_DOUBLE , MPI_MIN , sim_comm ); return( dt2 ); }
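Two details of cell_mindt are worth spelling out: dt/(1 + dt/dt_visc) is the harmonic combination 1/(1/dt_adv + 1/dt_visc) of the advective (CFL) and viscous limits, so the result is always below the smaller of the two, and the closing MPI_Allreduce with MPI_MIN makes every rank adopt the most restrictive local step. A minimal sketch of just those two steps, with sim_comm standing in for the solver's communicator:

#include <mpi.h>

/* Combine the advective and viscous step limits and take the global minimum. */
double combine_and_reduce_dt( double dt_adv, double dt_visc, MPI_Comm sim_comm )
{
    double dt_local = dt_adv / (1. + dt_adv / dt_visc);  /* 1/dt = 1/dt_adv + 1/dt_visc */
    double dt_global;
    MPI_Allreduce( &dt_local, &dt_global, 1, MPI_DOUBLE, MPI_MIN, sim_comm );
    return dt_global;
}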
/* Check the process affinity mask and if it is found to be non-zero, * will honor it and disable mdrun internal affinity setting. * Note that this will only work on Linux as we use a GNU feature. */ void gmx_check_thread_affinity_set(FILE *fplog, const t_commrec *cr, gmx_hw_opt_t *hw_opt, int gmx_unused nthreads_hw_avail, gmx_bool bAfterOpenmpInit) { GMX_RELEASE_ASSERT(hw_opt, "hw_opt must be a non-NULL pointer"); if (!bAfterOpenmpInit) { /* Check for externally set OpenMP affinity and turn off internal * pinning if any is found. We need to do this check early to tell * thread-MPI whether it should do pinning when spawning threads. * TODO: the above no longer holds, we should move these checks later */ if (hw_opt->thread_affinity != threadaffOFF) { char *message; if (!gmx_omp_check_thread_affinity(&message)) { /* TODO: with -pin auto we should only warn when using all cores */ md_print_warn(cr, fplog, "%s", message); sfree(message); hw_opt->thread_affinity = threadaffOFF; } } /* With thread-MPI this is needed as pinning might get turned off, * which needs to be known before starting thread-MPI. * With thread-MPI hw_opt is processed here on the master rank * and passed to the other ranks later, so we only do this on master. */ if (!SIMMASTER(cr)) { return; } #ifndef GMX_THREAD_MPI return; #endif } #ifdef HAVE_SCHED_AFFINITY int ret; cpu_set_t mask_current; if (hw_opt->thread_affinity == threadaffOFF) { /* internal affinity setting is off, don't bother checking process affinity */ return; } CPU_ZERO(&mask_current); if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0) { /* failed to query affinity mask, will just return */ if (debug) { fprintf(debug, "Failed to query affinity mask (error %d)", ret); } return; } /* Before proceeding with the actual check, make sure that the number of * detected CPUs is >= the CPUs in the current set. * We need to check for CPU_COUNT as it was added only in glibc 2.6. */ #ifdef CPU_COUNT if (nthreads_hw_avail < CPU_COUNT(&mask_current)) { if (debug) { fprintf(debug, "%d hardware threads detected, but %d was returned by CPU_COUNT", nthreads_hw_avail, CPU_COUNT(&mask_current)); } return; } #endif /* CPU_COUNT */ gmx_bool bAllSet = TRUE; for (int i = 0; (i < nthreads_hw_avail && i < CPU_SETSIZE); i++) { bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0); } #ifdef GMX_LIB_MPI gmx_bool bAllSet_All; MPI_Allreduce(&bAllSet, &bAllSet_All, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD); bAllSet = bAllSet_All; #endif if (!bAllSet) { if (hw_opt->thread_affinity == threadaffAUTO) { if (!bAfterOpenmpInit) { md_print_warn(cr, fplog, "Non-default thread affinity set, disabling internal thread affinity"); } else { md_print_warn(cr, fplog, "Non-default thread affinity set probably by the OpenMP library,\n" "disabling internal thread affinity"); } hw_opt->thread_affinity = threadaffOFF; } else { /* Only warn once, at the last check (bAfterOpenmpInit==TRUE) */ if (bAfterOpenmpInit) { md_print_warn(cr, fplog, "Overriding thread affinity set outside %s\n", gmx::getProgramContext().displayName()); } } if (debug) { fprintf(debug, "Non-default affinity mask found\n"); } } else { if (debug) { fprintf(debug, "Default affinity mask found\n"); } } #endif /* HAVE_SCHED_AFFINITY */ }
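The MPI_LAND reduction above is the standard way for every rank to learn whether a local condition holds on all ranks, so that they all take the same branch afterwards. A standalone sketch of the pattern (not GROMACS code):

#include <mpi.h>

/* Returns nonzero on every rank iff local_ok is nonzero on every rank. */
int condition_holds_on_all_ranks( int local_ok, MPI_Comm comm )
{
    int global_ok;
    MPI_Allreduce( &local_ok, &global_ok, 1, MPI_INT, MPI_LAND, comm );
    return global_ok;
}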
void hunt_problem(Comm_Ex *cx, /* array of communications structures */ Exo_DB *exo, /* ptr to the finite element mesh database */ Dpi *dpi) /* distributed processing information */ { int *ija=NULL; /* column pointer array */ double *a=NULL; /* nonzero array */ double *a_old=NULL; /* nonzero array */ double *x=NULL; /* solution vector */ int iAC; /* COUNTER */ double *x_AC = NULL; /* SOLUTION VECTOR OF EXTRA UNKNOWNS */ double *x_AC_old=NULL; /* old SOLUTION VECTOR OF EXTRA UNKNOWNS */ double *x_AC_dot = NULL; int iHC; /* COUNTER */ int *ija_attic=NULL; /* storage for external dofs */ int eb_indx, ev_indx; /* * variables for path traversal */ double *x_old=NULL; /* old solution vector */ double *x_older=NULL; /* older solution vector */ double *x_oldest=NULL; /* oldest solution vector saved */ double *xdot=NULL; /* current path derivative of soln */ double *xdot_old=NULL; double *x_update=NULL; double *x_sens=NULL; /* solution sensitivity */ double **x_sens_p=NULL; /* solution sensitivity for parameters */ int num_pvector=0; /* number of solution sensitivity vectors */ #ifdef COUPLED_FILL struct Aztec_Linear_Solver_System *ams[NUM_ALSS]={NULL}; #else /* COUPLED_FILL */ struct Aztec_Linear_Solver_System *ams[NUM_ALSS]={NULL, NULL}; #endif /* COUPLED_FILL */ /* sl_util_structs.h */ double *resid_vector=NULL; /* residual */ double *resid_vector_sens=NULL; /* residual sensitivity */ double *scale=NULL; /* scale vector for modified newton */ int *node_to_fill = NULL; int n; /* total number of path steps attempted */ int ni; /* total number of nonlinear solves */ int nt; /* total number of successful path steps */ int path_step_reform; /* counter for jacobian reformation stride */ int converged; /* success or failure of Newton iteration */ int success_ds; /* success or failure of path step */ int i; int nprint=0, num_total_nodes; int numProcUnknowns; int *const_delta_s=NULL; int step_print; double i_print; int good_mesh = TRUE; double *path=NULL, *path1=NULL; double *delta_s=NULL, *delta_s_new=NULL, *delta_s_old=NULL; double *delta_s_older=NULL, *delta_s_oldest=NULL; double *hDelta_s0=NULL, *hDelta_s_min=NULL, *hDelta_s_max=NULL; double delta_t; double theta=0.0; double eps; double *lambda=NULL, *lambdaEnd=NULL; double hunt_par, dhunt_par, hunt_par_old; /* hunting continuation parameter */ double timeValueRead = 0.0; /* * ALC management variables */ int alqALC; int *aldALC=NULL; /* * Other local variables */ int error, err, is_steady_state, inewton; int *gindex = NULL, gsize; int *p_gsize=NULL; double *gvec=NULL; double ***gvec_elem; FILE *file=NULL; double toler_org[3],damp_org; struct Results_Description *rd=NULL; int tnv; /* total number of nodal variables and kinds */ int tev; /* total number of elem variables and kinds */ int tnv_post; /* total number of nodal variables and kinds for post processing */ int tev_post; /* total number of elem variables and kinds for post processing */ #ifdef HAVE_FRONT int max_unk_elem, one, three; /* variables used as mf_setup arguments*/ #endif unsigned int matrix_systems_mask; double evol_local=0.0; #ifdef PARALLEL double evol_global=0.0; #endif static char yo[]="hunt_problem"; /* * BEGIN EXECUTION */ #ifdef DEBUG fprintf(stderr, "hunt_problem() begins...\n"); #endif toler_org[0] = custom_tol1; toler_org[1] = custom_tol2; toler_org[2] = custom_tol3; damp_org = damp_factor1; is_steady_state = TRUE; p_gsize = &gsize; /* * set aside space for gather global vectors to print to exoII file * note: this is temporary * * For 2D prototype problem: allocate space for 
T, dx, dy arrays */ if( strlen( Soln_OutFile) ) { #ifdef DEBUG printf("Trying to open \"%s\" for writing.\n", Soln_OutFile); #endif file = fopen(Soln_OutFile, "w"); if (file == NULL) { DPRINTF(stderr, "%s: opening soln file for writing\n", yo); EH(-1, "\t"); } } #ifdef PARALLEL check_parallel_error("Soln output file error"); #endif /* * Some preliminaries to help setup EXODUS II database output. */ #ifdef DEBUG fprintf(stderr, "cnt_nodal_vars() begins...\n"); #endif tnv = cnt_nodal_vars(); /* tnv_post is calculated in load_nodal_tkn*/ tev = cnt_elem_vars(); /* tev_post is calculated in load_elem_tkn*/ #ifdef DEBUG fprintf(stderr, "Found %d total primitive nodal variables to output.\n", tnv); fprintf(stderr, "Found %d total primitive elem variables to output.\n", tev); #endif if ( tnv < 0 ) { DPRINTF(stderr, "%s:\tbad tnv.\n", yo); EH(-1, "\t"); } if ( tev < 0 ) { DPRINTF(stderr, "%s:\tMaybe bad tev? See goma design committee ;) \n", yo); /* exit(-1); */ } rd = (struct Results_Description *) smalloc(sizeof(struct Results_Description)); if (rd == NULL) { EH(-1, "Could not grab Results Description."); } (void) memset((void *) rd, 0, sizeof(struct Results_Description)); rd->nev = 0; /* number element variables in results */ rd->ngv = 0; /* number global variables in results */ rd->nhv = 0; /* number history variables in results */ if ( is_steady_state == TRUE ) { rd->ngv = 5; /* number global variables in results see load_global_var_info for names*/ error = load_global_var_info(rd, 0, "CONV"); error = load_global_var_info(rd, 1, "NEWT_IT"); error = load_global_var_info(rd, 2, "MAX_IT"); error = load_global_var_info(rd, 3, "CONVRATE"); error = load_global_var_info(rd, 4, "MESH_VOLUME"); } /* load nodal types, kinds, names */ error = load_nodal_tkn( rd, &tnv, &tnv_post); /* load nodal types, kinds, names */ if (error !=0) { DPRINTF(stderr, "%s: problem with load_nodal_tkn()\n", yo); EH(-1,"\t"); } /* load elem types, names */ error = load_elem_tkn( rd, exo, tev, &tev_post); /* load elem types, names */ if ( error !=0 ) { DPRINTF(stderr, "%s: problem with load_elem_tkn()\n", yo); EH(-1,"\t"); } /* * Write out the names of the nodal variables that we will be sending to * the EXODUS II output file later. */ #ifdef DEBUG fprintf(stderr, "wr_result_prelim() starts...\n", tnv); #endif gvec_elem = (double ***) smalloc ( (exo->num_elem_blocks)*sizeof(double **)); for (i = 0; i < exo->num_elem_blocks; i++) { gvec_elem[i] = (double **) smalloc ( (tev + tev_post)*sizeof(double *)); } wr_result_prelim_exo( rd, exo, ExoFileOut, gvec_elem ); #ifdef DEBUG fprintf(stderr, "P_%d: wr_result_prelim_exo() ends...\n", ProcID, tnv); #endif /* * This gvec workhorse transports output variables as nodal based vectors * that are gather from the solution vector. Note: it is NOT a global * vector at all and only carries this processor's nodal variables to * the exodus database. */ asdv(&gvec, Num_Node); /* * Allocate space and manipulate for all the nodes that this processor * is aware of... */ num_total_nodes = dpi->num_universe_nodes; numProcUnknowns = NumUnknowns + NumExtUnknowns; /* allocate memory for Volume Constraint Jacobian. 
ACS 2/99 */ if ( nAC > 0) { for(iAC=0;iAC<nAC;iAC++) { augc[iAC].d_evol_dx = (double*) malloc(numProcUnknowns*sizeof(double)); } } asdv(&resid_vector, numProcUnknowns); asdv(&resid_vector_sens, numProcUnknowns); asdv(&scale, numProcUnknowns); for (i=0;i<NUM_ALSS;i++) { ams[i] = (struct Aztec_Linear_Solver_System *) array_alloc(1, 1, sizeof(struct Aztec_Linear_Solver_System )); } #ifdef MPI AZ_set_proc_config( ams[0]->proc_config, MPI_COMM_WORLD ); #ifndef COUPLED_FILL if( Explicit_Fill ) AZ_set_proc_config( ams[1]->proc_config, MPI_COMM_WORLD ); #endif /* not COUPLED_FILL */ #else /* MPI */ AZ_set_proc_config( ams[0]->proc_config, 0 ); #ifndef COUPLED_FILL if( Explicit_Fill ) AZ_set_proc_config( ams[1]->proc_config, 0 ); #endif /* not COUPLED_FILL */ #endif /* MPI */ /* * allocate space for and initialize solution arrays */ asdv(&x, numProcUnknowns); asdv(&x_old, numProcUnknowns); asdv(&x_older, numProcUnknowns); asdv(&x_oldest, numProcUnknowns); asdv(&xdot, numProcUnknowns); asdv(&xdot_old, numProcUnknowns); asdv(&x_update, numProcUnknowns); asdv(&x_sens, numProcUnknowns); /* * Initialize solid inertia flag */ set_solid_inertia(); /* * ALLOCATE ALL THOSE WORK VECTORS FOR HUNTING */ asdv(&lambda, nHC); asdv(&lambdaEnd, nHC); asdv(&path, nHC); asdv(&path1, nHC); asdv(&hDelta_s0, nHC); asdv(&hDelta_s_min, nHC); asdv(&hDelta_s_max, nHC); asdv(&delta_s, nHC); asdv(&delta_s_new, nHC); asdv(&delta_s_old, nHC); asdv(&delta_s_older, nHC); asdv(&delta_s_oldest, nHC); aldALC = Ivector_birth(nHC); const_delta_s = Ivector_birth(nHC); /* HUNTING BY ZERO AND FIRST ORDER CONTINUATION */ alqALC = 1; delta_t = 0.0; tran->delta_t = 0.0; /*for Newmark-Beta terms in Lagrangian Solid*/ nprint = 0; MaxPathSteps = cont->MaxPathSteps; eps = cont->eps; for (iHC=0;iHC<nHC;iHC++) { const_delta_s[iHC] = 0; lambda[iHC] = hunt[iHC].BegParameterValue; lambdaEnd[iHC] = hunt[iHC].EndParameterValue; if ((lambdaEnd[iHC]-lambda[iHC]) > 0.0) { aldALC[iHC] = +1; } else { aldALC[iHC] = -1; } if (hunt[iHC].ramp == 1) { hunt[iHC].Delta_s0 = fabs(lambdaEnd[iHC]-lambda[iHC])/((double)(MaxPathSteps-1)); const_delta_s[iHC] = 1; } hDelta_s0[iHC] = hunt[iHC].Delta_s0; hDelta_s_min[iHC] = hunt[iHC].Delta_s_min; hDelta_s_max[iHC] = hunt[iHC].Delta_s_max; path[iHC] = path1[iHC] = lambda[iHC]; if (Debug_Flag && ProcID == 0) { fprintf(stderr,"MaxPathSteps: %d \tlambdaEnd: %f\n", MaxPathSteps, lambdaEnd[iHC]); fprintf(stderr,"continuation in progress\n"); } if (hDelta_s0[iHC] > hDelta_s_max[iHC]) { hDelta_s0[iHC] = hDelta_s_max[iHC]; } delta_s[iHC] = delta_s_old[iHC] = delta_s_older[iHC] = hDelta_s0[iHC]; /* * ADJUST NATURAL PARAMETER */ update_parameterHC(iHC, path1[iHC], x, xdot, x_AC, delta_s[iHC], cx, exo, dpi); } /* define continuation parameter */ if(hunt[0].EndParameterValue == hunt[0].BegParameterValue) { hunt_par = 1.0; } else { hunt_par = (path1[0]-hunt[0].BegParameterValue) /(hunt[0].EndParameterValue - hunt[0].BegParameterValue) ; hunt_par=fabs(hunt_par); } hunt_par_old = hunt_par; /* Call prefront (or mf_setup) if necessary */ if (Linear_Solver == FRONT) { if (Num_Proc > 1) EH(-1, "Whoa. No front allowed with nproc>1"); #ifdef HAVE_FRONT /* Also got to define these because it wants pointers to these numbers */ max_unk_elem = (MAX_PROB_VAR + MAX_CONC)*MDE; one = 1; three = 3; /* NOTE: We need a overall flag in the vn_glob struct that tells whether FULL_DG is on anywhere in domain. This assumes only one material. See sl_front_setup for test. that test needs to be in the input parser. 
*/ if(vn_glob[0]->dg_J_model == FULL_DG) { max_unk_elem = (MAX_PROB_VAR + MAX_CONC)*MDE + 4*vn_glob[0]->modes*4*MDE; } err = mf_setup(&exo->num_elems, &NumUnknowns, &max_unk_elem, &three, &one, exo->elem_order_map, fss->el_proc_assign, fss->level, fss->nopdof, fss->ncn, fss->constraint, front_scratch_directory, &fss->ntra); EH(err,"problems in frontal setup "); #else EH(-1,"Don't have frontal solver compiled and linked in"); #endif } /* * if compute parameter sensitivities, allocate space for solution * sensitivity vectors */ for(i=0;i<nn_post_fluxes_sens;i++) { num_pvector=MAX(num_pvector,pp_fluxes_sens[i]->vector_id);} for(i=0;i<nn_post_data_sens;i++) { num_pvector=MAX(num_pvector,pp_data_sens[i]->vector_id);} if((nn_post_fluxes_sens + nn_post_data_sens) > 0) { num_pvector++; num_pvector = MAX(num_pvector,2); x_sens_p = Dmatrix_birth(num_pvector,numProcUnknowns); } else { x_sens_p = NULL; } if (nAC > 0) { asdv(&x_AC, nAC); asdv(&x_AC_old, nAC); asdv(&x_AC_dot, nAC); } /* Allocate sparse matrix */ if( strcmp( Matrix_Format, "msr" ) == 0) { log_msg("alloc_MSR_sparse_arrays..."); alloc_MSR_sparse_arrays(&ija, &a, &a_old, 0, node_to_fill, exo, dpi); /* * An attic to store external dofs column names is needed when * running in parallel. */ alloc_extern_ija_buffer(num_universe_dofs, num_internal_dofs+num_boundary_dofs, ija, &ija_attic); /* * Any necessary one time initialization of the linear * solver package (Aztec). */ ams[JAC]->bindx = ija; ams[JAC]->val = a; ams[JAC]->belfry = ija_attic; ams[JAC]->val_old = a_old; /* * These point to nowhere since we're using MSR instead of VBR * format. */ ams[JAC]->indx = NULL; ams[JAC]->bpntr = NULL; ams[JAC]->rpntr = NULL; ams[JAC]->cpntr = NULL; ams[JAC]->npn = dpi->num_internal_nodes + dpi->num_boundary_nodes; ams[JAC]->npn_plus = dpi->num_internal_nodes + dpi->num_boundary_nodes + dpi->num_external_nodes; ams[JAC]->npu = num_internal_dofs+num_boundary_dofs; ams[JAC]->npu_plus = num_universe_dofs; ams[JAC]->nnz = ija[num_internal_dofs+num_boundary_dofs] - 1; ams[JAC]->nnz_plus = ija[num_universe_dofs]; } else if( strcmp( Matrix_Format, "vbr" ) == 0) { log_msg("alloc_VBR_sparse_arrays..."); alloc_VBR_sparse_arrays ( ams[JAC], exo, dpi); ija_attic = NULL; ams[JAC]->belfry = ija_attic; a = ams[JAC]->val; if( !save_old_A ) a_old = ams[JAC]->val_old; } else if ( strcmp( Matrix_Format, "front") == 0 ) { /* Don't allocate any sparse matrix space when using front */ ams[JAC]->bindx = NULL; ams[JAC]->val = NULL; ams[JAC]->belfry = NULL; ams[JAC]->val_old = NULL; ams[JAC]->indx = NULL; ams[JAC]->bpntr = NULL; ams[JAC]->rpntr = NULL; ams[JAC]->cpntr = NULL; } else { EH(-1,"Attempted to allocate unknown sparse matrix format"); } init_vec(x, cx, exo, dpi, x_AC, nAC, &timeValueRead); /* if read ACs, update data floats */ if (nAC > 0) { if(augc[0].iread == 1) { for(iAC=0 ; iAC<nAC ; iAC++) { update_parameterAC(iAC, x, xdot, x_AC, cx, exo, dpi); } } } /* * set boundary conditions on the initial conditions */ find_and_set_Dirichlet(x, xdot, exo, dpi); exchange_dof(cx, dpi, x); dcopy1(numProcUnknowns,x,x_old); dcopy1(numProcUnknowns,x_old,x_older); dcopy1(numProcUnknowns,x_older,x_oldest); if( nAC > 0) { dcopy1(nAC,x_AC, x_AC_old);} /* * initialize the counters for when to print out data */ step_print = 1; matrix_systems_mask = 1; log_msg("sl_init()..."); sl_init(matrix_systems_mask, ams, exo, dpi, cx); #ifdef PARALLEL /* * Make sure the solver was properly initialized on all processors. 
*/ check_parallel_error("Solver initialization problems"); #endif ams[JAC]->options[AZ_keep_info] = 1; DPRINTF(stderr, "\nINITIAL ELEMENT QUALITY CHECK---\n"); good_mesh = element_quality(exo, x, ams[0]->proc_config); /* * set the number of successful path steps to zero */ nt = 0; /* * LOOP THROUGH PARAMETER UNTIL MAX NUMBER * OF STEPS SURPASSED */ for (n=0;n<MaxPathSteps;n++) { alqALC = 1; for (iHC=0;iHC<nHC;iHC++) { switch (aldALC[iHC]) { case -1: /* REDUCING PARAMETER DIRECTION */ if (path1[iHC] <= lambdaEnd[iHC]) { alqALC = -1; path1[iHC] = lambdaEnd[iHC]; delta_s[iHC] = path[iHC]-path1[iHC]; } break; case +1: /* RISING PARAMETER DIRECTION */ if (path1[iHC] >= lambdaEnd[iHC]) { alqALC = -1; path1[iHC] = lambdaEnd[iHC]; delta_s[iHC] = path1[iHC]-path[iHC]; } break; } /* * ADJUST NATURAL PARAMETER */ update_parameterHC(iHC, path1[iHC], x, xdot, x_AC, delta_s[iHC], cx, exo, dpi); } /* end of iHC loop */ if(hunt[0].EndParameterValue == hunt[0].BegParameterValue) { hunt_par = 1.0; } else { hunt_par = (path1[0]-hunt[0].BegParameterValue) /(hunt[0].EndParameterValue - hunt[0].BegParameterValue) ; hunt_par=fabs(hunt_par); } /* * IF STEP CHANGED, REDO FIRST ORDER PREDICTION */ if(alqALC == -1) { DPRINTF(stderr,"\n\t ******** LAST PATH STEP!\n"); dcopy1(numProcUnknowns,x_old,x); dhunt_par = hunt_par-hunt_par_old; switch (Continuation) { case HUN_ZEROTH: break; case HUN_FIRST: v1add(numProcUnknowns, &x[0], dhunt_par, &x_sens[0]); break; } } /* * reset Dirichlet condition Mask, node->DBC to -1 where it * is set in order for Dirichlet conditions to be * set appropriately for each path step */ nullify_dirichlet_bcs(); find_and_set_Dirichlet (x, xdot, exo, dpi); exchange_dof(cx, dpi, x); if(ProcID ==0) { DPRINTF(stderr, "\n\t----------------------------------"); switch (Continuation) { case HUN_ZEROTH: DPRINTF(stderr, "\n\tZero Order Hunting:"); break; case HUN_FIRST: DPRINTF(stderr, "\n\tFirst Order Hunting:"); break; } DPRINTF(stderr, "\n\tStep number: %4d of %4d (max)", n+1, MaxPathSteps); DPRINTF(stderr, "\n\tAttempting solution at: theta = %g",hunt_par); for (iHC=0;iHC<nHC;iHC++) { switch (hunt[iHC].Type) { case 1: /* BC */ DPRINTF(stderr, "\n\tBCID=%3d DFID=%5d", hunt[iHC].BCID, hunt[iHC].DFID); break; case 2: /* MT */ DPRINTF(stderr, "\n\tMTID=%3d MPID=%5d", hunt[iHC].MTID, hunt[iHC].MPID); break; case 3: /* AC */ DPRINTF(stderr, "\n\tACID=%3d DFID=%5d", hunt[iHC].BCID, hunt[iHC].DFID); break; } DPRINTF(stderr, " Parameter= % 10.6e delta_s= %10.6e", path1[iHC], delta_s[iHC]); } } ni = 0; do { #ifdef DEBUG fprintf(stderr, "%s: starting solve_nonlinear_problem\n", yo); #endif err = solve_nonlinear_problem(ams[JAC], x, delta_t, theta, x_old, x_older, xdot, xdot_old, resid_vector, x_update, scale, &converged, &nprint, tev, tev_post, NULL, rd, gindex, p_gsize, gvec, gvec_elem, path1[0], exo, dpi, cx, 0, &path_step_reform, is_steady_state, x_AC, x_AC_dot, hunt_par, resid_vector_sens, x_sens, x_sens_p, NULL); #ifdef DEBUG fprintf(stderr, "%s: returned from solve_nonlinear_problem\n", yo); #endif if (err == -1) converged = 0; inewton = err; if (converged) { EH(error, "error writing ASCII soln file."); /* srs need to check */ if (Write_Intermediate_Solutions == 0) { #ifdef DEBUG fprintf(stderr, "%s: write_solution call WIS\n", yo); #endif write_solution(ExoFileOut, resid_vector, x, x_sens_p, x_old, xdot, xdot_old, tev, tev_post,NULL, rd, gindex, p_gsize, gvec, gvec_elem, &nprint, delta_s[0], theta, path1[0], NULL, exo, dpi); #ifdef DEBUG fprintf(stderr, "%s: write_solution end call WIS\n", yo); #endif 
} /* * PRINT OUT VALUES OF EXTRA UNKNOWNS * FROM AUGMENTING CONDITIONS */ if (nAC > 0) { DPRINTF(stderr, "\n------------------------------\n"); DPRINTF(stderr, "Augmenting Conditions: %4d\n", nAC); DPRINTF(stderr, "Number of extra unknowns: %4d\n\n", nAC); for (iAC = 0; iAC < nAC; iAC++) { if (augc[iAC].Type == AC_USERBC) { DPRINTF(stderr, "\tAC[%4d] DF[%4d] = %10.6e\n", augc[iAC].BCID, augc[iAC].DFID, x_AC[iAC]); } else if (augc[iAC].Type == AC_USERMAT || augc[iAC].Type == AC_FLUX_MAT ) { DPRINTF(stderr, "\n MT[%4d] MP[%4d] = %10.6e\n", augc[iAC].MTID, augc[iAC].MPID, x_AC[iAC]); } else if(augc[iAC].Type == AC_VOLUME) { evol_local = augc[iAC].evol; #ifdef PARALLEL if( Num_Proc > 1 ) { MPI_Allreduce( &evol_local, &evol_global, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); } evol_local = evol_global; #endif DPRINTF(stderr, "\tMT[%4d] VC[%4d]=%10.6e Param=%10.6e\n", augc[iAC].MTID, augc[iAC].VOLID, evol_local, x_AC[iAC]); } else if(augc[iAC].Type == AC_POSITION) { evol_local = augc[iAC].evol; #ifdef PARALLEL if( Num_Proc > 1 ) { MPI_Allreduce( &evol_local, &evol_global, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); } evol_local = evol_global; #endif DPRINTF(stderr, "\tMT[%4d] XY[%4d]=%10.6e Param=%10.6e\n", augc[iAC].MTID, augc[iAC].VOLID, evol_local, x_AC[iAC]); } else if(augc[iAC].Type == AC_FLUX) { DPRINTF(stderr, "\tBC[%4d] DF[%4d]=%10.6e\n", augc[iAC].BCID, augc[iAC].DFID, x_AC[iAC]); } } } /* Check element quality */ good_mesh = element_quality(exo, x, ams[0]->proc_config); /* INTEGRATE FLUXES, FORCES */ for (i = 0; i < nn_post_fluxes; i++) { evaluate_flux ( exo, dpi, pp_fluxes[i]->ss_id, pp_fluxes[i]->flux_type , pp_fluxes[i]->flux_type_name , pp_fluxes[i]->blk_id , pp_fluxes[i]->species_number, pp_fluxes[i]->flux_filenm, pp_fluxes[i]->profile_flag, x,xdot,NULL,delta_s[0],path1[0],1); } /* COMPUTE FLUX, FORCE SENSITIVITIES */ for (i = 0; i < nn_post_fluxes_sens; i++) { evaluate_flux_sens ( exo, dpi, pp_fluxes_sens[i]->ss_id, pp_fluxes_sens[i]->flux_type , pp_fluxes_sens[i]->flux_type_name , pp_fluxes_sens[i]->blk_id , pp_fluxes_sens[i]->species_number, pp_fluxes_sens[i]->sens_type, pp_fluxes_sens[i]->sens_id, pp_fluxes_sens[i]->sens_flt, pp_fluxes_sens[i]->sens_flt2, pp_fluxes_sens[i]->vector_id, pp_fluxes_sens[i]->flux_filenm, pp_fluxes_sens[i]->profile_flag, x,xdot,x_sens_p,delta_s[0],path1[0],1); } /* * Compute global volumetric quantities */ for (i = 0; i < nn_volume; i++ ) { evaluate_volume_integral(exo, dpi, pp_volume[i]->volume_type, pp_volume[i]->volume_name, pp_volume[i]->blk_id, pp_volume[i]->species_no, pp_volume[i]->volume_fname, pp_volume[i]->params, NULL, x, xdot, delta_s[0], path1[0], 1); } } /* end of if converged block */ /* * INCREMENT COUNTER */ ni++; /* * * DID IT CONVERGE ? * IF NOT, REDUCE STEP SIZE AND TRY AGAIN * */ if (!converged) { if (ni > 10) { DPRINTF(stderr,"\n ************************************\n"); DPRINTF(stderr," W: Did not converge in Newton steps.\n"); DPRINTF(stderr," Find better initial guess. 
\n"); DPRINTF(stderr," ************************************\n"); exit(0); } /* * ADJUST STEP SIZE - unless failed on first step */ if ( nt != 0 ) { DPRINTF(stderr, "\n\tFailed to converge:\n"); for (iHC=0;iHC<nHC;iHC++) { delta_s[iHC] *= 0.5; switch (aldALC[iHC]) { case -1: path1[iHC] = path[iHC] - delta_s[iHC]; break; case +1: path1[iHC] = path[iHC] + delta_s[iHC]; break; } /* * RESET */ alqALC = 1; DPRINTF(stderr, "Decreasing step-length to %10.6e.\n", delta_s[iHC]); if (delta_s[iHC] < hDelta_s_min[iHC]) { DPRINTF(stderr,"\n X: C step-length reduced below minimum."); DPRINTF(stderr,"\n Program terminated.\n"); /* This needs to have a return value of 0, indicating * success, for the continuation script to not treat this * as a failed command. */ exit(0); } #ifdef PARALLEL check_parallel_error("\t"); #endif /* * ADJUST NATURAL PARAMETER */ update_parameterHC(iHC, path1[iHC], x, xdot, x_AC, delta_s[iHC], cx, exo, dpi); } /* end of iHC loop */ if(hunt[0].EndParameterValue == hunt[0].BegParameterValue) { hunt_par = 1.0; } else { hunt_par = (path1[0]-hunt[0].BegParameterValue) /(hunt[0].EndParameterValue - hunt[0].BegParameterValue) ; hunt_par=fabs(hunt_par); } /* * GET ZERO OR FIRST ORDER PREDICTION */ dhunt_par = hunt_par-hunt_par_old; switch (Continuation) { case HUN_ZEROTH: vcopy(numProcUnknowns, &x[0], 1.0, &x_old[0]); break; case HUN_FIRST: v2sum(numProcUnknowns, &x[0], 1.0, &x_old[0], dhunt_par, &x_sens[0]); break; } /* MMH: Needed to put this in, o/w it may find that the * solution and residual HAPPEN to satisfy the convergence * criterion for the next newton solve... */ find_and_set_Dirichlet(x, xdot, exo, dpi); exchange_dof(cx, dpi, x); if (nAC > 0) { dcopy1(nAC, x_AC_old, x_AC); for(iAC=0 ; iAC<nAC ; iAC++) { update_parameterAC(iAC, x, xdot, x_AC, cx, exo, dpi); } } if(hunt[0].EndParameterValue == hunt[0].BegParameterValue) { hunt_par = 1.0; } else { hunt_par = (path1[0]-hunt[0].BegParameterValue) /(hunt[0].EndParameterValue - hunt[0].BegParameterValue) ; hunt_par=fabs(hunt_par); } } else if (inewton == -1) { DPRINTF(stderr,"\nHmm... trouble on first step \n Let's try some more relaxation \n"); if((damp_factor1 <= 1. && damp_factor1 >= 0.) && (damp_factor2 <= 1. && damp_factor2 >= 0.) && (damp_factor3 <= 1. && damp_factor3 >= 0.)) { custom_tol1 *= 0.01; custom_tol2 *= 0.01; custom_tol3 *= 0.01; DPRINTF(stderr," custom tolerances %g %g %g \n",custom_tol1,custom_tol2,custom_tol3); } else { damp_factor1 *= 0.5; DPRINTF(stderr," damping factor %g \n",damp_factor1); } vcopy(numProcUnknowns, &x[0], 1.0, &x_old[0]); /* MMH: Needed to put this in, o/w it may find that the * solution and residual HAPPEN to satisfy the convergence * criterion for the next newton solve... */ find_and_set_Dirichlet(x, xdot, exo, dpi); exchange_dof(cx, dpi, x); if (nAC > 0) { dcopy1(nAC, x_AC_old, x_AC); for(iAC=0 ; iAC<nAC ; iAC++) { update_parameterAC(iAC, x, xdot, x_AC, cx, exo, dpi); } } } else { DPRINTF(stderr,"\nHmm... could not converge on first step\n Let's try some more iterations\n"); if((damp_factor1 <= 1. && damp_factor1 >= 0.) && (damp_factor2 <= 1. && damp_factor2 >= 0.) && (damp_factor3 <= 1. 
&& damp_factor3 >= 0.)) { custom_tol1 *= 100.; custom_tol2 *= 100.; custom_tol3 *= 100.; DPRINTF(stderr," custom tolerances %g %g %g \n",custom_tol1,custom_tol2,custom_tol3); } else { damp_factor1 *= 2.0; damp_factor1 = MIN(damp_factor1,1.0); DPRINTF(stderr," damping factor %g \n",damp_factor1); } } } /* end of !converged */ } while (converged == 0); /* * CONVERGED */ nt++; custom_tol1 = toler_org[0]; custom_tol2 = toler_org[1]; custom_tol3 = toler_org[2]; damp_factor1 = damp_org; DPRINTF(stderr, "\n\tStep accepted, theta (proportion complete) = %10.6e\n", hunt_par); for (iHC=0;iHC<nHC;iHC++) { switch (hunt[iHC].Type) { case 1: /* BC */ DPRINTF(stderr, "\tStep accepted, BCID=%3d DFID=%5d", hunt[iHC].BCID, hunt[iHC].DFID); break; case 2: /* MT */ DPRINTF(stderr, "\tStep accepted, MTID=%3d MPID=%5d", hunt[iHC].MTID, hunt[iHC].MPID); break; case 3: /* AC */ DPRINTF(stderr, "\tStep accepted, ACID=%3d DFID=%5d", hunt[iHC].BCID, hunt[iHC].DFID); break; } DPRINTF(stderr, " Parameter= % 10.6e\n", path1[iHC]); } /* * check path step error, if too large do not enlarge path step */ for (iHC=0;iHC<nHC;iHC++) { if ((ni == 1) && (n != 0) && (!const_delta_s[iHC])) { delta_s_new[iHC] = path_step_control(num_total_nodes, delta_s[iHC], delta_s_old[iHC], x, eps, &success_ds, cont->use_var_norm, inewton); if (delta_s_new[iHC] > hDelta_s_max[iHC]) {delta_s_new[iHC] = hDelta_s_max[iHC];} } else { success_ds = 1; delta_s_new[iHC] = delta_s[iHC]; } } /* * determine whether to print out the data or not */ i_print = 0; if (nt == step_print) { i_print = 1; step_print += cont->print_freq; } if (alqALC == -1) { i_print = 1; } if (i_print) { error = write_ascii_soln(x, resid_vector, numProcUnknowns, x_AC, nAC, path1[0], file); if (error) { DPRINTF(stderr, "%s: error writing ASCII soln file\n", yo); } if ( Write_Intermediate_Solutions == 0 ) { write_solution(ExoFileOut, resid_vector, x, x_sens_p, x_old, xdot, xdot_old, tev, tev_post, NULL, rd, gindex, p_gsize, gvec, gvec_elem, &nprint, delta_s[0], theta, path1[0], NULL, exo, dpi); nprint++; } } /* * backup old solutions * can use previous solutions for prediction one day */ dcopy1(numProcUnknowns,x_older,x_oldest); dcopy1(numProcUnknowns,x_old,x_older); dcopy1(numProcUnknowns,x,x_old); dcopy1(nHC,delta_s_older,delta_s_oldest); dcopy1(nHC,delta_s_old ,delta_s_older ); dcopy1(nHC,delta_s ,delta_s_old ); dcopy1(nHC,delta_s_new ,delta_s ); /* delta_s_oldest = delta_s_older; delta_s_older = delta_s_old; delta_s_old = delta_s; delta_s = delta_s_new; */ hunt_par_old=hunt_par; if ( nAC > 0) { dcopy1(nAC, x_AC, x_AC_old); } /* * INCREMENT/DECREMENT PARAMETER */ for (iHC=0;iHC<nHC;iHC++) { path[iHC] = path1[iHC]; switch (aldALC[iHC]) { case -1: path1[iHC] = path[iHC] - delta_s[iHC]; break; case +1: path1[iHC] = path[iHC] + delta_s[iHC]; break; } /* * ADJUST NATURAL PARAMETER */ update_parameterHC(iHC, path1[iHC], x, xdot, x_AC, delta_s[iHC], cx, exo, dpi); } /* end of iHC loop */ /* * GET FIRST ORDER PREDICTION */ if(hunt[0].EndParameterValue == hunt[0].BegParameterValue) { hunt_par = 1.0; } else { hunt_par = (path1[0]-hunt[0].BegParameterValue) /(hunt[0].EndParameterValue - hunt[0].BegParameterValue) ; hunt_par=fabs(hunt_par); } dhunt_par = hunt_par-hunt_par_old; switch (Continuation) { case HUN_ZEROTH: break; case HUN_FIRST: v1add(numProcUnknowns, &x[0], dhunt_par, &x_sens[0]); break; } if (!good_mesh) goto free_and_clear; /* * * CHECK END CONTINUATION * */ if (alqALC == -1) { alqALC = 0; } else { alqALC = 1; } if (alqALC == 0) { DPRINTF(stderr,"\n\n\t I will continue no 
more!\n\t No more continuation for you!\n"); goto free_and_clear; } } /* n */ if(n == MaxPathSteps && aldALC[0] * (lambdaEnd[0] - path[0]) > 0) { DPRINTF(stderr,"\n\tFailed to reach end of hunt in maximum number of successful steps (%d).\n\tSorry.\n", MaxPathSteps); exit(0); } #ifdef PARALLEL check_parallel_error("Hunting error"); #endif /* * DONE CONTINUATION */ free_and_clear: /* * Transform the node point coordinates according to the * displacements and write out all the results using the * displaced coordinates. Set the displacement field to * zero, too. */ if (Anneal_Mesh) { #ifdef DEBUG fprintf(stderr, "%s: anneal_mesh()...\n", yo); #endif err = anneal_mesh(x, tev, tev_post, NULL, rd, path1[0], exo, dpi); #ifdef DEBUG DPRINTF(stderr, "%s: anneal_mesh()-done\n", yo); #endif EH(err, "anneal_mesh() bad return."); } /* * Free a bunch of variables that aren't needed anymore */ safer_free((void **) &ROT_Types); safer_free((void **) &node_to_fill); safer_free( (void **) &resid_vector); safer_free( (void **) &resid_vector_sens); safer_free( (void **) &scale); safer_free( (void **) &x); if (nAC > 0) { safer_free( (void **) &x_AC); safer_free( (void **) &x_AC_old); safer_free( (void **) &x_AC_dot); } safer_free( (void **) &x_old); safer_free( (void **) &x_older); safer_free( (void **) &x_oldest); safer_free( (void **) &xdot); safer_free( (void **) &xdot_old); safer_free( (void **) &x_update); safer_free( (void **) &x_sens); if((nn_post_data_sens+nn_post_fluxes_sens) > 0) Dmatrix_death(x_sens_p,num_pvector,numProcUnknowns); for(i = 0; i < MAX_NUMBER_MATLS; i++) { for(n = 0; n < MAX_MODES; n++) { safer_free((void **) &(ve_glob[i][n]->gn)); safer_free((void **) &(ve_glob[i][n])); } safer_free((void **) &(vn_glob[i])); } sl_free(matrix_systems_mask, ams); for (i=0;i<NUM_ALSS;i++) { safer_free( (void**) &(ams[i])); } safer_free( (void **) &gvec); safer_free( (void **) &lambda); safer_free( (void **) &lambdaEnd); safer_free( (void **) &path); safer_free( (void **) &path1); safer_free( (void **) &hDelta_s0); safer_free( (void **) &hDelta_s_min); safer_free( (void **) &hDelta_s_max); safer_free( (void **) &delta_s); safer_free( (void **) &delta_s_new); safer_free( (void **) &delta_s_old); safer_free( (void **) &delta_s_older); safer_free( (void **) &delta_s_oldest); Ivector_death(&aldALC[0], nHC); Ivector_death(&const_delta_s[0], nHC); i = 0; for ( eb_indx = 0; eb_indx < exo->num_elem_blocks; eb_indx++ ) { for ( ev_indx = 0; ev_indx < rd->nev; ev_indx++ ) { if ( exo->elem_var_tab[i++] == 1 ) { safer_free ((void **) &(gvec_elem [eb_indx][ev_indx]) ); } } safer_free ((void **) &(gvec_elem [eb_indx])); } safer_free( (void **) &gvec_elem); safer_free( (void **) &rd); safer_free( (void **) &Local_Offset); safer_free( (void **) &Dolphin); if( strlen( Soln_OutFile) ) { fclose(file); } return; } /* END of routine hunt_problem */
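Two expressions recur throughout hunt_problem: the normalized hunting parameter (the fraction of the continuation path already covered) and the first-order predictor that nudges the solution along the stored sensitivity vector. The sketch below restates them with illustrative names; it assumes v1add() performs the axpy-style update x += dhunt_par * x_sens, which is consistent with how it is paired with v2sum() in the driver.

#include <math.h>

/* hunt_par as computed repeatedly above: proportion of the path completed. */
double hunting_parameter( double path, double beg, double end )
{
    if (end == beg) return 1.0;               /* degenerate range, as in the driver */
    return fabs( (path - beg) / (end - beg) );
}

/* First-order (HUN_FIRST) predictor: move along the solution sensitivity. */
void first_order_predict( int n, double *x, const double *x_sens,
                          double hunt_par, double hunt_par_old )
{
    int i;
    double dhunt_par = hunt_par - hunt_par_old;
    for (i = 0; i < n; i++) x[i] += dhunt_par * x_sens[i];
}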
int main( int argc, char **argv ) { MPI_Comm comm; int *sbuf, *rbuf; int rank, size; int *sendcounts, *recvcounts, *rdispls, *sdispls; int i, j, *p, err, toterr; MPI_Init( &argc, &argv ); err = 0; comm = MPI_COMM_WORLD; /* Create the buffer */ MPI_Comm_size( comm, &size ); MPI_Comm_rank( comm, &rank ); sbuf = (int *)malloc( size * size * sizeof(int) ); rbuf = (int *)malloc( size * size * sizeof(int) ); if (!sbuf || !rbuf) { fprintf( stderr, "Could not allocate buffers!\n" ); MPI_Abort( comm, 1 ); } /* Load up the buffers */ for (i=0; i<size*size; i++) { sbuf[i] = i + 100*rank; rbuf[i] = -i; } /* Create and load the arguments to alltoallv */ sendcounts = (int *)malloc( size * sizeof(int) ); recvcounts = (int *)malloc( size * sizeof(int) ); rdispls = (int *)malloc( size * sizeof(int) ); sdispls = (int *)malloc( size * sizeof(int) ); if (!sendcounts || !recvcounts || !rdispls || !sdispls) { fprintf( stderr, "Could not allocate arg items!\n" ); MPI_Abort( comm, 1 ); } for (i=0; i<size; i++) { sendcounts[i] = i; recvcounts[i] = rank; rdispls[i] = i * rank; sdispls[i] = (i * (i+1))/2; } MPI_Alltoallv( sbuf, sendcounts, sdispls, MPI_INT, rbuf, recvcounts, rdispls, MPI_INT, comm ); /* Check rbuf */ for (i=0; i<size; i++) { p = rbuf + rdispls[i]; for (j=0; j<rank; j++) { if (p[j] != i * 100 + (rank*(rank+1))/2 + j) { fprintf( stderr, "[%d] got %d expected %d for %dth\n", rank, p[j], i * 100 + (rank*(rank+1))/2 + j, j ); err++; } } } free( sdispls ); free( rdispls ); free( recvcounts ); free( sendcounts ); free( rbuf ); free( sbuf ); MPI_Allreduce( &err, &toterr, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD ); MPI_Comm_rank( MPI_COMM_WORLD, &rank ); if (rank == 0) { if (toterr > 0) fprintf( stderr, "Test FAILED with %d errors\n", toterr ); else fprintf( stderr, "Test passed\n" ); } MPI_Finalize(); return 0; }
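/*
 * Aside: in the MPI_Alltoallv test above every rank posts sendcounts[i] = i and
 * sdispls[i] = i*(i+1)/2, so the block that rank s ships to rank r is
 * sbuf[r*(r+1)/2 .. r*(r+1)/2 + r - 1] = 100*s + r*(r+1)/2 + j for j in [0, r).
 * The helper below (alltoallv_expected is a hypothetical name, not part of the
 * test) just restates that arithmetic so the verification loop's expected value
 * is easy to audit.
 */
static int alltoallv_expected(int sender, int receiver, int j)
{
  /* value the receiver should find at rbuf[rdispls[sender] + j] */
  return 100 * sender + (receiver * (receiver + 1)) / 2 + j;
}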
void LU_decomp(struct problem *info, struct fmatrix *X, int *reorder, MPI_Datatype pivot_type, MPI_Op best_pivot_op) { MPI_Request req_spiv, req_sa, req_sm; MPI_Status status; number_type *m = malloc(info->blksz * sizeof(*m)); int diag; for (diag = 0; diag < info->n; diag++) { /* we do partial pivoting, so the proc with the pivot is on this column: */ int pivot_h = diag / info->blksz; int r, c, i; double start_time = MPI_Wtime(); double start_time2; struct pivot pivot = { -1, 0. }; /* choose pivot across the column */ if (info->coords[HDIM] == pivot_h) { /* column with pivot in block */ int pivot_c = diag % info->blksz; /* Argo doesn't want aliasing in allreduce */ struct pivot pivot_cand = { -1, 0. }; for (i = 0; i < info->blksz; i++) { if (reorder[i] > diag && fabs(CELL(X, i, pivot_c)) > fabs(pivot_cand.value)) { pivot_cand.row = info->blksz*info->coords[VDIM] + i; pivot_cand.value = CELL(X, i, pivot_c); } } start_time2 = MPI_Wtime(); MPI_Allreduce(&pivot_cand, &pivot, 1, pivot_type, best_pivot_op, info->vcomm); pivot_allr_time += MPI_Wtime() - start_time2; } /* broadcast pivot choice across row towards the right */ start_time2 = MPI_Wtime(); pipeline_right(info, pivot_h, &pivot, 1, pivot_type, 45, &req_spiv); pivot_bcast_time += MPI_Wtime() - start_time2; pivot_time += MPI_Wtime() - start_time; /* find rank of proc with pivot on the vertical communicator */ int pivot_v = pivot.row / info->blksz; /* fill in reorder */ if (info->coords[VDIM] == pivot_v) { reorder[pivot.row % info->blksz] = diag; } /* calculate and distribute the ms */ for (r = 0; r < info->blksz; r++) { if (reorder[r] > diag) { if (info->coords[HDIM] == pivot_h) { int pivot_c = diag % info->blksz; m[r] = CELL(X, r, pivot_c) / pivot.value; CELL(X, r, pivot_c) = m[r]; } /* broadcast m towards right */ start_time = MPI_Wtime(); pipeline_right(info, pivot_h, &m[r], 1, MPI_number_type, 64, &req_sm); m_bcast_time += MPI_Wtime() - start_time; } } /* distribute the pivot row and eliminate */ int startc = 0; if (info->coords[HDIM] == pivot_h) startc = (diag+1) % info->blksz; if (info->coords[HDIM] < pivot_h) startc = info->blksz; /* elimination */ for (c = startc; c < info->blksz; c++) { number_type a; if (info->coords[VDIM] == pivot_v) { a = CELL(X, pivot.row % info->blksz, c); } start_time = MPI_Wtime(); int up = (info->coords[VDIM]+info->sqp-1)%info->sqp; int down = (info->coords[VDIM]+1)%info->sqp; if (info->coords[VDIM] != pivot_v) { MPI_Recv(&a, 1, MPI_number_type, up, 78, info->vcomm, &status); } if (down != pivot_v) { MPI_Isend(&a, 1, MPI_number_type, down, 78, info->vcomm, &req_sa); } a_bcast_time += MPI_Wtime() - start_time; for (r = 0; r < info->blksz; r++) { if (reorder[r] > diag) { CELL(X, r,c) -= m[r]*a; } } if (down != pivot_v) MPI_Wait(&req_sa, &status); } } }
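/*
 * Aside: LU_decomp above assumes the caller supplies an MPI datatype and reduction
 * op that pick, out of one {row, value} pivot candidate per process, the entry with
 * the largest |value|.  One minimal way to build such a pair is sketched below under
 * stated assumptions; make_pivot_reduction and pick_best_pivot are illustrative
 * names, and struct pivot is re-declared here only so the sketch is self-contained
 * (it mirrors the struct pivot used above).
 */
#include <math.h>
#include <mpi.h>

struct pivot { int row; double value; };

static void pick_best_pivot(void *invec, void *inoutvec, int *len,
                            MPI_Datatype *dtype)
{
  /* user-defined reduction: keep the candidate with the larger |value| */
  struct pivot *in = (struct pivot *)invec;
  struct pivot *io = (struct pivot *)inoutvec;
  int i;
  (void)dtype;
  for (i = 0; i < *len; i++) {
    if (fabs(in[i].value) > fabs(io[i].value)) io[i] = in[i];
  }
}

static void make_pivot_reduction(MPI_Datatype *pivot_type, MPI_Op *best_pivot_op)
{
  /* describe struct pivot to MPI: one int followed by one double */
  int blocklens[2] = { 1, 1 };
  MPI_Aint disps[2];
  MPI_Datatype types[2] = { MPI_INT, MPI_DOUBLE };
  struct pivot probe;
  MPI_Aint base;

  MPI_Get_address(&probe, &base);
  MPI_Get_address(&probe.row, &disps[0]);
  MPI_Get_address(&probe.value, &disps[1]);
  disps[0] -= base;
  disps[1] -= base;

  MPI_Type_create_struct(2, blocklens, disps, types, pivot_type);
  MPI_Type_commit(pivot_type);
  MPI_Op_create(pick_best_pivot, 1 /* commutative */, best_pivot_op);
}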
PetscErrorCode DMPlexPreallocateOperator(DM dm, PetscInt bs, PetscSection section, PetscSection sectionGlobal, PetscInt dnz[], PetscInt onz[], PetscInt dnzu[], PetscInt onzu[], Mat A, PetscBool fillMatrix) { MPI_Comm comm; MatType mtype; PetscSF sf, sfDof, sfAdj; PetscSection leafSectionAdj, rootSectionAdj, sectionAdj, anchorSectionAdj; PetscInt nroots, nleaves, l, p; const PetscInt *leaves; const PetscSFNode *remotes; PetscInt dim, pStart, pEnd, numDof, globalOffStart, globalOffEnd, numCols; PetscInt *tmpAdj = NULL, *adj, *rootAdj, *anchorAdj = NULL, *cols, *remoteOffsets; PetscInt adjSize; PetscLayout rLayout; PetscInt locRows, rStart, rEnd, r; PetscMPIInt size; PetscBool doCommLocal, doComm, debug = PETSC_FALSE, isSymBlock, isSymSeqBlock, isSymMPIBlock; PetscBool useAnchors; PetscErrorCode ierr; PetscFunctionBegin; PetscValidHeaderSpecific(dm, DM_CLASSID, 1); PetscValidHeaderSpecific(section, PETSC_SECTION_CLASSID, 3); PetscValidHeaderSpecific(sectionGlobal, PETSC_SECTION_CLASSID, 4); PetscValidHeaderSpecific(A, MAT_CLASSID, 9); if (dnz) PetscValidPointer(dnz,5); if (onz) PetscValidPointer(onz,6); if (dnzu) PetscValidPointer(dnzu,7); if (onzu) PetscValidPointer(onzu,8); ierr = PetscLogEventBegin(DMPLEX_Preallocate,dm,0,0,0);CHKERRQ(ierr); ierr = PetscObjectGetComm((PetscObject)dm,&comm);CHKERRQ(ierr); ierr = PetscOptionsGetBool(NULL, "-dm_view_preallocation", &debug, NULL);CHKERRQ(ierr); ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr); ierr = DMGetDimension(dm, &dim);CHKERRQ(ierr); ierr = DMGetPointSF(dm, &sf);CHKERRQ(ierr); ierr = PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL);CHKERRQ(ierr); doCommLocal = (size > 1) && (nroots >= 0) ? PETSC_TRUE : PETSC_FALSE; ierr = MPI_Allreduce(&doCommLocal, &doComm, 1, MPIU_BOOL, MPI_LAND, comm);CHKERRQ(ierr); /* Create dof SF based on point SF */ if (debug) { ierr = PetscPrintf(comm, "Input Section for Preallocation:\n");CHKERRQ(ierr); ierr = PetscSectionView(section, PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr); ierr = PetscPrintf(comm, "Input Global Section for Preallocation:\n");CHKERRQ(ierr); ierr = PetscSectionView(sectionGlobal, PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr); ierr = PetscPrintf(comm, "Input SF for Preallocation:\n");CHKERRQ(ierr); ierr = PetscSFView(sf, NULL);CHKERRQ(ierr); } ierr = PetscSFCreateRemoteOffsets(sf, section, section, &remoteOffsets);CHKERRQ(ierr); ierr = PetscSFCreateSectionSF(sf, section, remoteOffsets, section, &sfDof);CHKERRQ(ierr); if (debug) { ierr = PetscPrintf(comm, "Dof SF for Preallocation:\n");CHKERRQ(ierr); ierr = PetscSFView(sfDof, NULL);CHKERRQ(ierr); } /* Create section for dof adjacency (dof ==> # adj dof) */ ierr = PetscSectionGetChart(section, &pStart, &pEnd);CHKERRQ(ierr); ierr = PetscSectionGetStorageSize(section, &numDof);CHKERRQ(ierr); ierr = PetscSectionCreate(comm, &leafSectionAdj);CHKERRQ(ierr); ierr = PetscSectionSetChart(leafSectionAdj, 0, numDof);CHKERRQ(ierr); ierr = PetscSectionCreate(comm, &rootSectionAdj);CHKERRQ(ierr); ierr = PetscSectionSetChart(rootSectionAdj, 0, numDof);CHKERRQ(ierr); /* Fill in the ghost dofs on the interface */ ierr = PetscSFGetGraph(sf, NULL, &nleaves, &leaves, &remotes);CHKERRQ(ierr); /* use constraints in finding adjacency in this routine */ ierr = DMPlexGetAdjacencyUseAnchors(dm,&useAnchors);CHKERRQ(ierr); ierr = DMPlexSetAdjacencyUseAnchors(dm,PETSC_TRUE);CHKERRQ(ierr); /* section - maps points to (# dofs, local dofs) sectionGlobal - maps points to (# dofs, global dofs) leafSectionAdj - maps unowned local dofs to # adj dofs rootSectionAdj - maps owned local dofs 
to # adj dofs adj - adj global dofs indexed by leafSectionAdj rootAdj - adj global dofs indexed by rootSectionAdj sf - describes shared points across procs sfDof - describes shared dofs across procs sfAdj - describes shared adjacent dofs across procs ** The bootstrapping process involves six rounds with similar structure of visiting neighbors of each point. (0). If there are point-to-point constraints, add the adjacencies of constrained points to anchors in anchorAdj (This is done in DMPlexComputeAnchorAdjacencies()) 1. Visit unowned points on interface, count adjacencies placing in leafSectionAdj Reduce those counts to rootSectionAdj (now redundantly counting some interface points) 2. Visit owned points on interface, count adjacencies placing in rootSectionAdj Create sfAdj connecting rootSectionAdj and leafSectionAdj 3. Visit unowned points on interface, write adjacencies to adj Gather adj to rootAdj (note that there is redundancy in rootAdj when multiple procs find the same adjacencies) 4. Visit owned points on interface, write adjacencies to rootAdj Remove redundancy in rootAdj ** The last two traversals use transitive closure 5. Visit all owned points in the subdomain, count dofs for each point (sectionAdj) Allocate memory addressed by sectionAdj (cols) 6. Visit all owned points in the subdomain, insert dof adjacencies into cols ** Knowing all the column adjacencies, check ownership and sum into dnz and onz */ ierr = DMPlexComputeAnchorAdjacencies(dm,section,sectionGlobal,&anchorSectionAdj,&anchorAdj);CHKERRQ(ierr); for (l = 0; l < nleaves; ++l) { PetscInt dof, off, d, q, anDof; PetscInt p = leaves[l], numAdj = PETSC_DETERMINE; if ((p < pStart) || (p >= pEnd)) continue; ierr = PetscSectionGetDof(section, p, &dof);CHKERRQ(ierr); ierr = PetscSectionGetOffset(section, p, &off);CHKERRQ(ierr); ierr = DMPlexGetAdjacency(dm, p, &numAdj, &tmpAdj);CHKERRQ(ierr); for (q = 0; q < numAdj; ++q) { const PetscInt padj = tmpAdj[q]; PetscInt ndof, ncdof; if ((padj < pStart) || (padj >= pEnd)) continue; ierr = PetscSectionGetDof(section, padj, &ndof);CHKERRQ(ierr); ierr = PetscSectionGetConstraintDof(section, padj, &ncdof);CHKERRQ(ierr); for (d = off; d < off+dof; ++d) { ierr = PetscSectionAddDof(leafSectionAdj, d, ndof-ncdof);CHKERRQ(ierr); } } ierr = PetscSectionGetDof(anchorSectionAdj, p, &anDof);CHKERRQ(ierr); if (anDof) { for (d = off; d < off+dof; ++d) { ierr = PetscSectionAddDof(leafSectionAdj, d, anDof);CHKERRQ(ierr); } } } ierr = PetscSectionSetUp(leafSectionAdj);CHKERRQ(ierr); if (debug) { ierr = PetscPrintf(comm, "Adjacency Section for Preallocation on Leaves:\n");CHKERRQ(ierr); ierr = PetscSectionView(leafSectionAdj, PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr); } /* Get maximum remote adjacency sizes for owned dofs on interface (roots) */ if (doComm) { ierr = PetscSFReduceBegin(sfDof, MPIU_INT, leafSectionAdj->atlasDof, rootSectionAdj->atlasDof, MPI_SUM);CHKERRQ(ierr); ierr = PetscSFReduceEnd(sfDof, MPIU_INT, leafSectionAdj->atlasDof, rootSectionAdj->atlasDof, MPI_SUM);CHKERRQ(ierr); } if (debug) { ierr = PetscPrintf(comm, "Adjancency Section for Preallocation on Roots:\n");CHKERRQ(ierr); ierr = PetscSectionView(rootSectionAdj, PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr); } /* Add in local adjacency sizes for owned dofs on interface (roots) */ for (p = pStart; p < pEnd; ++p) { PetscInt numAdj = PETSC_DETERMINE, adof, dof, off, d, q, anDof; ierr = PetscSectionGetDof(section, p, &dof);CHKERRQ(ierr); ierr = PetscSectionGetOffset(section, p, &off);CHKERRQ(ierr); if (!dof) continue; ierr = 
PetscSectionGetDof(rootSectionAdj, off, &adof);CHKERRQ(ierr); if (adof <= 0) continue; ierr = DMPlexGetAdjacency(dm, p, &numAdj, &tmpAdj);CHKERRQ(ierr); for (q = 0; q < numAdj; ++q) { const PetscInt padj = tmpAdj[q]; PetscInt ndof, ncdof; if ((padj < pStart) || (padj >= pEnd)) continue; ierr = PetscSectionGetDof(section, padj, &ndof);CHKERRQ(ierr); ierr = PetscSectionGetConstraintDof(section, padj, &ncdof);CHKERRQ(ierr); for (d = off; d < off+dof; ++d) { ierr = PetscSectionAddDof(rootSectionAdj, d, ndof-ncdof);CHKERRQ(ierr); } } ierr = PetscSectionGetDof(anchorSectionAdj, p, &anDof);CHKERRQ(ierr); if (anDof) { for (d = off; d < off+dof; ++d) { ierr = PetscSectionAddDof(rootSectionAdj, d, anDof);CHKERRQ(ierr); } } } ierr = PetscSectionSetUp(rootSectionAdj);CHKERRQ(ierr); if (debug) { ierr = PetscPrintf(comm, "Adjancency Section for Preallocation on Roots after local additions:\n");CHKERRQ(ierr); ierr = PetscSectionView(rootSectionAdj, PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr); } /* Create adj SF based on dof SF */ ierr = PetscSFCreateRemoteOffsets(sfDof, rootSectionAdj, leafSectionAdj, &remoteOffsets);CHKERRQ(ierr); ierr = PetscSFCreateSectionSF(sfDof, rootSectionAdj, remoteOffsets, leafSectionAdj, &sfAdj);CHKERRQ(ierr); if (debug) { ierr = PetscPrintf(comm, "Adjacency SF for Preallocation:\n");CHKERRQ(ierr); ierr = PetscSFView(sfAdj, NULL);CHKERRQ(ierr); } ierr = PetscSFDestroy(&sfDof);CHKERRQ(ierr); /* Create leaf adjacency */ ierr = PetscSectionSetUp(leafSectionAdj);CHKERRQ(ierr); ierr = PetscSectionGetStorageSize(leafSectionAdj, &adjSize);CHKERRQ(ierr); ierr = PetscCalloc1(adjSize, &adj);CHKERRQ(ierr); for (l = 0; l < nleaves; ++l) { PetscInt dof, off, d, q, anDof, anOff; PetscInt p = leaves[l], numAdj = PETSC_DETERMINE; if ((p < pStart) || (p >= pEnd)) continue; ierr = PetscSectionGetDof(section, p, &dof);CHKERRQ(ierr); ierr = PetscSectionGetOffset(section, p, &off);CHKERRQ(ierr); ierr = DMPlexGetAdjacency(dm, p, &numAdj, &tmpAdj);CHKERRQ(ierr); ierr = PetscSectionGetDof(anchorSectionAdj, p, &anDof);CHKERRQ(ierr); ierr = PetscSectionGetOffset(anchorSectionAdj, p, &anOff);CHKERRQ(ierr); for (d = off; d < off+dof; ++d) { PetscInt aoff, i = 0; ierr = PetscSectionGetOffset(leafSectionAdj, d, &aoff);CHKERRQ(ierr); for (q = 0; q < numAdj; ++q) { const PetscInt padj = tmpAdj[q]; PetscInt ndof, ncdof, ngoff, nd; if ((padj < pStart) || (padj >= pEnd)) continue; ierr = PetscSectionGetDof(section, padj, &ndof);CHKERRQ(ierr); ierr = PetscSectionGetConstraintDof(section, padj, &ncdof);CHKERRQ(ierr); ierr = PetscSectionGetOffset(sectionGlobal, padj, &ngoff);CHKERRQ(ierr); for (nd = 0; nd < ndof-ncdof; ++nd) { adj[aoff+i] = (ngoff < 0 ? 
-(ngoff+1) : ngoff) + nd; ++i; } } for (q = 0; q < anDof; q++) { adj[aoff+i] = anchorAdj[anOff+q]; ++i; } } } /* Debugging */ if (debug) { IS tmp; ierr = PetscPrintf(comm, "Leaf adjacency indices\n");CHKERRQ(ierr); ierr = ISCreateGeneral(comm, adjSize, adj, PETSC_USE_POINTER, &tmp);CHKERRQ(ierr); ierr = ISView(tmp, NULL);CHKERRQ(ierr); ierr = ISDestroy(&tmp);CHKERRQ(ierr); } /* Gather adjacent indices to root */ ierr = PetscSectionGetStorageSize(rootSectionAdj, &adjSize);CHKERRQ(ierr); ierr = PetscMalloc1(adjSize, &rootAdj);CHKERRQ(ierr); for (r = 0; r < adjSize; ++r) rootAdj[r] = -1; if (doComm) { const PetscInt *indegree; PetscInt *remoteadj, radjsize = 0; ierr = PetscSFComputeDegreeBegin(sfAdj, &indegree);CHKERRQ(ierr); ierr = PetscSFComputeDegreeEnd(sfAdj, &indegree);CHKERRQ(ierr); for (p = 0; p < adjSize; ++p) radjsize += indegree[p]; ierr = PetscMalloc1(radjsize, &remoteadj);CHKERRQ(ierr); ierr = PetscSFGatherBegin(sfAdj, MPIU_INT, adj, remoteadj);CHKERRQ(ierr); ierr = PetscSFGatherEnd(sfAdj, MPIU_INT, adj, remoteadj);CHKERRQ(ierr); for (p = 0, l = 0, r = 0; p < adjSize; ++p, l = PetscMax(p, l + indegree[p-1])) { PetscInt s; for (s = 0; s < indegree[p]; ++s, ++r) rootAdj[l+s] = remoteadj[r]; } if (r != radjsize) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Inconsistency in communication %d != %d", r, radjsize); if (l != adjSize) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Inconsistency in communication %d != %d", l, adjSize); ierr = PetscFree(remoteadj);CHKERRQ(ierr); } ierr = PetscSFDestroy(&sfAdj);CHKERRQ(ierr); ierr = PetscFree(adj);CHKERRQ(ierr); /* Debugging */ if (debug) { IS tmp; ierr = PetscPrintf(comm, "Root adjacency indices after gather\n");CHKERRQ(ierr); ierr = ISCreateGeneral(comm, adjSize, rootAdj, PETSC_USE_POINTER, &tmp);CHKERRQ(ierr); ierr = ISView(tmp, NULL);CHKERRQ(ierr); ierr = ISDestroy(&tmp);CHKERRQ(ierr); } /* Add in local adjacency indices for owned dofs on interface (roots) */ for (p = pStart; p < pEnd; ++p) { PetscInt numAdj = PETSC_DETERMINE, adof, dof, off, d, q, anDof, anOff; ierr = PetscSectionGetDof(section, p, &dof);CHKERRQ(ierr); ierr = PetscSectionGetOffset(section, p, &off);CHKERRQ(ierr); if (!dof) continue; ierr = PetscSectionGetDof(rootSectionAdj, off, &adof);CHKERRQ(ierr); if (adof <= 0) continue; ierr = DMPlexGetAdjacency(dm, p, &numAdj, &tmpAdj);CHKERRQ(ierr); ierr = PetscSectionGetDof(anchorSectionAdj, p, &anDof);CHKERRQ(ierr); ierr = PetscSectionGetOffset(anchorSectionAdj, p, &anOff);CHKERRQ(ierr); for (d = off; d < off+dof; ++d) { PetscInt adof, aoff, i; ierr = PetscSectionGetDof(rootSectionAdj, d, &adof);CHKERRQ(ierr); ierr = PetscSectionGetOffset(rootSectionAdj, d, &aoff);CHKERRQ(ierr); i = adof-1; for (q = 0; q < anDof; q++) { rootAdj[aoff+i] = anchorAdj[anOff+q]; --i; } for (q = 0; q < numAdj; ++q) { const PetscInt padj = tmpAdj[q]; PetscInt ndof, ncdof, ngoff, nd; if ((padj < pStart) || (padj >= pEnd)) continue; ierr = PetscSectionGetDof(section, padj, &ndof);CHKERRQ(ierr); ierr = PetscSectionGetConstraintDof(section, padj, &ncdof);CHKERRQ(ierr); ierr = PetscSectionGetOffset(sectionGlobal, padj, &ngoff);CHKERRQ(ierr); for (nd = 0; nd < ndof-ncdof; ++nd) { rootAdj[aoff+i] = ngoff < 0 ? 
-(ngoff+1)+nd : ngoff+nd; --i; } } } } /* Debugging */ if (debug) { IS tmp; ierr = PetscPrintf(comm, "Root adjacency indices\n");CHKERRQ(ierr); ierr = ISCreateGeneral(comm, adjSize, rootAdj, PETSC_USE_POINTER, &tmp);CHKERRQ(ierr); ierr = ISView(tmp, NULL);CHKERRQ(ierr); ierr = ISDestroy(&tmp);CHKERRQ(ierr); } /* Compress indices */ ierr = PetscSectionSetUp(rootSectionAdj);CHKERRQ(ierr); for (p = pStart; p < pEnd; ++p) { PetscInt dof, cdof, off, d; PetscInt adof, aoff; ierr = PetscSectionGetDof(section, p, &dof);CHKERRQ(ierr); ierr = PetscSectionGetConstraintDof(section, p, &cdof);CHKERRQ(ierr); ierr = PetscSectionGetOffset(section, p, &off);CHKERRQ(ierr); if (!dof) continue; ierr = PetscSectionGetDof(rootSectionAdj, off, &adof);CHKERRQ(ierr); if (adof <= 0) continue; for (d = off; d < off+dof-cdof; ++d) { ierr = PetscSectionGetDof(rootSectionAdj, d, &adof);CHKERRQ(ierr); ierr = PetscSectionGetOffset(rootSectionAdj, d, &aoff);CHKERRQ(ierr); ierr = PetscSortRemoveDupsInt(&adof, &rootAdj[aoff]);CHKERRQ(ierr); ierr = PetscSectionSetDof(rootSectionAdj, d, adof);CHKERRQ(ierr); } } /* Debugging */ if (debug) { IS tmp; ierr = PetscPrintf(comm, "Adjacency Section for Preallocation on Roots after compression:\n");CHKERRQ(ierr); ierr = PetscSectionView(rootSectionAdj, PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr); ierr = PetscPrintf(comm, "Root adjacency indices after compression\n");CHKERRQ(ierr); ierr = ISCreateGeneral(comm, adjSize, rootAdj, PETSC_USE_POINTER, &tmp);CHKERRQ(ierr); ierr = ISView(tmp, NULL);CHKERRQ(ierr); ierr = ISDestroy(&tmp);CHKERRQ(ierr); } /* Build adjacency section: Maps global indices to sets of adjacent global indices */ ierr = PetscSectionGetOffsetRange(sectionGlobal, &globalOffStart, &globalOffEnd);CHKERRQ(ierr); ierr = PetscSectionCreate(comm, &sectionAdj);CHKERRQ(ierr); ierr = PetscSectionSetChart(sectionAdj, globalOffStart, globalOffEnd);CHKERRQ(ierr); for (p = pStart; p < pEnd; ++p) { PetscInt numAdj = PETSC_DETERMINE, dof, cdof, off, goff, d, q, anDof; PetscBool found = PETSC_TRUE; ierr = PetscSectionGetDof(section, p, &dof);CHKERRQ(ierr); ierr = PetscSectionGetConstraintDof(section, p, &cdof);CHKERRQ(ierr); ierr = PetscSectionGetOffset(section, p, &off);CHKERRQ(ierr); ierr = PetscSectionGetOffset(sectionGlobal, p, &goff);CHKERRQ(ierr); for (d = 0; d < dof-cdof; ++d) { PetscInt ldof, rdof; ierr = PetscSectionGetDof(leafSectionAdj, off+d, &ldof);CHKERRQ(ierr); ierr = PetscSectionGetDof(rootSectionAdj, off+d, &rdof);CHKERRQ(ierr); if (ldof > 0) { /* We do not own this point */ } else if (rdof > 0) { ierr = PetscSectionSetDof(sectionAdj, goff+d, rdof);CHKERRQ(ierr); } else { found = PETSC_FALSE; } } if (found) continue; ierr = PetscSectionGetDof(section, p, &dof);CHKERRQ(ierr); ierr = PetscSectionGetOffset(sectionGlobal, p, &goff);CHKERRQ(ierr); ierr = DMPlexGetAdjacency(dm, p, &numAdj, &tmpAdj);CHKERRQ(ierr); for (q = 0; q < numAdj; ++q) { const PetscInt padj = tmpAdj[q]; PetscInt ndof, ncdof, noff; if ((padj < pStart) || (padj >= pEnd)) continue; ierr = PetscSectionGetDof(section, padj, &ndof);CHKERRQ(ierr); ierr = PetscSectionGetConstraintDof(section, padj, &ncdof);CHKERRQ(ierr); ierr = PetscSectionGetOffset(section, padj, &noff);CHKERRQ(ierr); for (d = goff; d < goff+dof-cdof; ++d) { ierr = PetscSectionAddDof(sectionAdj, d, ndof-ncdof);CHKERRQ(ierr); } } ierr = PetscSectionGetDof(anchorSectionAdj, p, &anDof);CHKERRQ(ierr); if (anDof) { for (d = goff; d < goff+dof-cdof; ++d) { ierr = PetscSectionAddDof(sectionAdj, d, anDof);CHKERRQ(ierr); } } } ierr =
PetscSectionSetUp(sectionAdj);CHKERRQ(ierr); if (debug) { ierr = PetscPrintf(comm, "Adjacency Section for Preallocation:\n");CHKERRQ(ierr); ierr = PetscSectionView(sectionAdj, PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr); } /* Get adjacent indices */ ierr = PetscSectionGetStorageSize(sectionAdj, &numCols);CHKERRQ(ierr); ierr = PetscMalloc1(numCols, &cols);CHKERRQ(ierr); for (p = pStart; p < pEnd; ++p) { PetscInt numAdj = PETSC_DETERMINE, dof, cdof, off, goff, d, q, anDof, anOff; PetscBool found = PETSC_TRUE; ierr = PetscSectionGetDof(section, p, &dof);CHKERRQ(ierr); ierr = PetscSectionGetConstraintDof(section, p, &cdof);CHKERRQ(ierr); ierr = PetscSectionGetOffset(section, p, &off);CHKERRQ(ierr); ierr = PetscSectionGetOffset(sectionGlobal, p, &goff);CHKERRQ(ierr); for (d = 0; d < dof-cdof; ++d) { PetscInt ldof, rdof; ierr = PetscSectionGetDof(leafSectionAdj, off+d, &ldof);CHKERRQ(ierr); ierr = PetscSectionGetDof(rootSectionAdj, off+d, &rdof);CHKERRQ(ierr); if (ldof > 0) { /* We do not own this point */ } else if (rdof > 0) { PetscInt aoff, roff; ierr = PetscSectionGetOffset(sectionAdj, goff+d, &aoff);CHKERRQ(ierr); ierr = PetscSectionGetOffset(rootSectionAdj, off+d, &roff);CHKERRQ(ierr); ierr = PetscMemcpy(&cols[aoff], &rootAdj[roff], rdof * sizeof(PetscInt));CHKERRQ(ierr); } else { found = PETSC_FALSE; } } if (found) continue; ierr = DMPlexGetAdjacency(dm, p, &numAdj, &tmpAdj);CHKERRQ(ierr); ierr = PetscSectionGetDof(anchorSectionAdj, p, &anDof);CHKERRQ(ierr); ierr = PetscSectionGetOffset(anchorSectionAdj, p, &anOff);CHKERRQ(ierr); for (d = goff; d < goff+dof-cdof; ++d) { PetscInt adof, aoff, i = 0; ierr = PetscSectionGetDof(sectionAdj, d, &adof);CHKERRQ(ierr); ierr = PetscSectionGetOffset(sectionAdj, d, &aoff);CHKERRQ(ierr); for (q = 0; q < numAdj; ++q) { const PetscInt padj = tmpAdj[q]; PetscInt ndof, ncdof, ngoff, nd; const PetscInt *ncind; /* Adjacent points may not be in the section chart */ if ((padj < pStart) || (padj >= pEnd)) continue; ierr = PetscSectionGetDof(section, padj, &ndof);CHKERRQ(ierr); ierr = PetscSectionGetConstraintDof(section, padj, &ncdof);CHKERRQ(ierr); ierr = PetscSectionGetConstraintIndices(section, padj, &ncind);CHKERRQ(ierr); ierr = PetscSectionGetOffset(sectionGlobal, padj, &ngoff);CHKERRQ(ierr); for (nd = 0; nd < ndof-ncdof; ++nd, ++i) { cols[aoff+i] = ngoff < 0 ? 
-(ngoff+1)+nd : ngoff+nd; } } for (q = 0; q < anDof; q++, i++) { cols[aoff+i] = anchorAdj[anOff + q]; } if (i != adof) SETERRQ4(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Invalid number of entries %D != %D for dof %D (point %D)", i, adof, d, p); } } ierr = PetscSectionDestroy(&anchorSectionAdj);CHKERRQ(ierr); ierr = PetscSectionDestroy(&leafSectionAdj);CHKERRQ(ierr); ierr = PetscSectionDestroy(&rootSectionAdj);CHKERRQ(ierr); ierr = PetscFree(anchorAdj);CHKERRQ(ierr); ierr = PetscFree(rootAdj);CHKERRQ(ierr); ierr = PetscFree(tmpAdj);CHKERRQ(ierr); /* Debugging */ if (debug) { IS tmp; ierr = PetscPrintf(comm, "Column indices\n");CHKERRQ(ierr); ierr = ISCreateGeneral(comm, numCols, cols, PETSC_USE_POINTER, &tmp);CHKERRQ(ierr); ierr = ISView(tmp, NULL);CHKERRQ(ierr); ierr = ISDestroy(&tmp);CHKERRQ(ierr); } /* Create allocation vectors from adjacency graph */ ierr = MatGetLocalSize(A, &locRows, NULL);CHKERRQ(ierr); ierr = PetscLayoutCreate(PetscObjectComm((PetscObject)A), &rLayout);CHKERRQ(ierr); ierr = PetscLayoutSetLocalSize(rLayout, locRows);CHKERRQ(ierr); ierr = PetscLayoutSetBlockSize(rLayout, 1);CHKERRQ(ierr); ierr = PetscLayoutSetUp(rLayout);CHKERRQ(ierr); ierr = PetscLayoutGetRange(rLayout, &rStart, &rEnd);CHKERRQ(ierr); ierr = PetscLayoutDestroy(&rLayout);CHKERRQ(ierr); /* Only loop over blocks of rows */ if (rStart%bs || rEnd%bs) SETERRQ3(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Invalid layout [%d, %d) for matrix, must be divisible by block size %d", rStart, rEnd, bs); for (r = rStart/bs; r < rEnd/bs; ++r) { const PetscInt row = r*bs; PetscInt numCols, cStart, c; ierr = PetscSectionGetDof(sectionAdj, row, &numCols);CHKERRQ(ierr); ierr = PetscSectionGetOffset(sectionAdj, row, &cStart);CHKERRQ(ierr); for (c = cStart; c < cStart+numCols; ++c) { if ((cols[c] >= rStart*bs) && (cols[c] < rEnd*bs)) { ++dnz[r-rStart]; if (cols[c] >= row) ++dnzu[r-rStart]; } else { ++onz[r-rStart]; if (cols[c] >= row) ++onzu[r-rStart]; } } } if (bs > 1) { for (r = 0; r < locRows/bs; ++r) { dnz[r] /= bs; onz[r] /= bs; dnzu[r] /= bs; onzu[r] /= bs; } } /* Set matrix pattern */ ierr = MatXAIJSetPreallocation(A, bs, dnz, onz, dnzu, onzu);CHKERRQ(ierr); ierr = MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); /* Check for symmetric storage */ ierr = MatGetType(A, &mtype);CHKERRQ(ierr); ierr = PetscStrcmp(mtype, MATSBAIJ, &isSymBlock);CHKERRQ(ierr); ierr = PetscStrcmp(mtype, MATSEQSBAIJ, &isSymSeqBlock);CHKERRQ(ierr); ierr = PetscStrcmp(mtype, MATMPISBAIJ, &isSymMPIBlock);CHKERRQ(ierr); if (isSymBlock || isSymSeqBlock || isSymMPIBlock) {ierr = MatSetOption(A, MAT_IGNORE_LOWER_TRIANGULAR, PETSC_TRUE);CHKERRQ(ierr);} /* Fill matrix with zeros */ if (fillMatrix) { PetscScalar *values; PetscInt maxRowLen = 0; for (r = rStart; r < rEnd; ++r) { PetscInt len; ierr = PetscSectionGetDof(sectionAdj, r, &len);CHKERRQ(ierr); maxRowLen = PetscMax(maxRowLen, len); } ierr = PetscCalloc1(maxRowLen, &values);CHKERRQ(ierr); for (r = rStart; r < rEnd; ++r) { PetscInt numCols, cStart; ierr = PetscSectionGetDof(sectionAdj, r, &numCols);CHKERRQ(ierr); ierr = PetscSectionGetOffset(sectionAdj, r, &cStart);CHKERRQ(ierr); ierr = MatSetValues(A, 1, &r, numCols, &cols[cStart], values, INSERT_VALUES);CHKERRQ(ierr); } ierr = PetscFree(values);CHKERRQ(ierr); ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); } /* restore original useAnchors */ ierr = DMPlexSetAdjacencyUseAnchors(dm,useAnchors);CHKERRQ(ierr); ierr = 
PetscSectionDestroy(&sectionAdj);CHKERRQ(ierr); ierr = PetscFree(cols);CHKERRQ(ierr); ierr = PetscLogEventEnd(DMPLEX_Preallocate,dm,0,0,0);CHKERRQ(ierr); PetscFunctionReturn(0); }
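/*
 * Aside: the dnz/onz counting loop in DMPlexPreallocateOperator classifies each
 * adjacent column by a single test: a column belongs to the diagonal block exactly
 * when its global index falls inside this process's owned row range, otherwise it
 * is counted as off-diagonal.  The sketch below restates that rule in isolation
 * under stated assumptions; count_row_nnz is an illustrative name, not a PETSc
 * routine, and it ignores the block-size handling done above.
 */
static void count_row_nnz(int ncols, const int cols[], int rStart, int rEnd,
                          int *dnz, int *onz)
{
  int c;
  *dnz = *onz = 0;
  for (c = 0; c < ncols; c++) {
    if (cols[c] >= rStart && cols[c] < rEnd) (*dnz)++;  /* owned column range */
    else                                     (*onz)++;  /* off-process block  */
  }
}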
struct fft_plan_3d *fft_3d_create_plan( MPI_Comm comm, int nfast, int nmid, int nslow, int in_ilo, int in_ihi, int in_jlo, int in_jhi, int in_klo, int in_khi, int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, int scaled, int permute, int *nbuf) { struct fft_plan_3d *plan; int me,nprocs; int /*i,num,*/flag,remapflag;//,fftflag; int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi; int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi; int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi; int out_size,first_size,second_size,third_size,copy_size,scratch_size; int np1,np2,ip1,ip2; // int list[50]; /* system specific variables */ #ifdef FFT_INTEL FFT_DATA dummy; #endif #ifdef FFT_T3E FFT_DATA dummy[5]; int isign,isys; double scalef; #endif /* query MPI info */ MPI_Comm_rank(comm,&me); MPI_Comm_size(comm,&nprocs); /* compute division of procs in 2 dimensions not on-processor */ bifactor(nprocs,&np1,&np2); ip1 = me % np1; ip2 = me/np1; /* allocate memory for plan data struct */ plan = (struct fft_plan_3d *) malloc(sizeof(struct fft_plan_3d)); if (plan == NULL) return NULL; /* remap from initial distribution to layout needed for 1st set of 1d FFTs not needed if all procs own entire fast axis initially first indices = distribution after 1st set of FFTs */ if (in_ilo == 0 && in_ihi == nfast-1) flag = 0; else flag = 1; MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm); if (remapflag == 0) { first_ilo = in_ilo; first_ihi = in_ihi; first_jlo = in_jlo; first_jhi = in_jhi; first_klo = in_klo; first_khi = in_khi; plan->pre_plan = NULL; } else { first_ilo = 0; first_ihi = nfast - 1; first_jlo = ip1*nmid/np1; first_jhi = (ip1+1)*nmid/np1 - 1; first_klo = ip2*nslow/np2; first_khi = (ip2+1)*nslow/np2 - 1; plan->pre_plan = remap_3d_create_plan(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, first_ilo,first_ihi,first_jlo,first_jhi, first_klo,first_khi, FFT_PRECISION,0,0,2); if (plan->pre_plan == NULL) return NULL; } /* 1d FFTs along fast axis */ plan->length1 = nfast; plan->total1 = nfast * (first_jhi-first_jlo+1) * (first_khi-first_klo+1); /* remap from 1st to 2nd FFT choose which axis is split over np1 vs np2 to minimize communication second indices = distribution after 2nd set of FFTs */ second_ilo = ip1*nfast/np1; second_ihi = (ip1+1)*nfast/np1 - 1; second_jlo = 0; second_jhi = nmid - 1; second_klo = ip2*nslow/np2; second_khi = (ip2+1)*nslow/np2 - 1; plan->mid1_plan = remap_3d_create_plan(comm, first_ilo,first_ihi,first_jlo,first_jhi, first_klo,first_khi, second_ilo,second_ihi,second_jlo,second_jhi, second_klo,second_khi, FFT_PRECISION,1,0,2); if (plan->mid1_plan == NULL) return NULL; /* 1d FFTs along mid axis */ plan->length2 = nmid; plan->total2 = (second_ihi-second_ilo+1) * nmid * (second_khi-second_klo+1); /* remap from 2nd to 3rd FFT if final distribution is permute=2 with all procs owning entire slow axis then this remapping goes directly to final distribution third indices = distribution after 3rd set of FFTs */ if (permute == 2 && out_klo == 0 && out_khi == nslow-1) flag = 0; else flag = 1; MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm); if (remapflag == 0) { third_ilo = out_ilo; third_ihi = out_ihi; third_jlo = out_jlo; third_jhi = out_jhi; third_klo = out_klo; third_khi = out_khi; } else { third_ilo = ip1*nfast/np1; third_ihi = (ip1+1)*nfast/np1 - 1; third_jlo = ip2*nmid/np2; third_jhi = (ip2+1)*nmid/np2 - 1; third_klo = 0; third_khi = nslow - 1; } plan->mid2_plan = remap_3d_create_plan(comm, 
second_jlo,second_jhi,second_klo,second_khi, second_ilo,second_ihi, third_jlo,third_jhi,third_klo,third_khi, third_ilo,third_ihi, FFT_PRECISION,1,0,2); if (plan->mid2_plan == NULL) return NULL; /* 1d FFTs along slow axis */ plan->length3 = nslow; plan->total3 = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) * nslow; /* remap from 3rd FFT to final distribution not needed if permute = 2 and third indices = out indices on all procs */ if (permute == 2 && out_ilo == third_ilo && out_ihi == third_ihi && out_jlo == third_jlo && out_jhi == third_jhi && out_klo == third_klo && out_khi == third_khi) flag = 0; else flag = 1; MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm); if (remapflag == 0) plan->post_plan = NULL; else { plan->post_plan = remap_3d_create_plan(comm, third_klo,third_khi,third_ilo,third_ihi, third_jlo,third_jhi, out_klo,out_khi,out_ilo,out_ihi, out_jlo,out_jhi, FFT_PRECISION,(permute+1)%3,0,2); if (plan->post_plan == NULL) return NULL; } /* configure plan memory pointers and allocate work space out_size = amount of memory given to FFT by user first/second/third_size = amount of memory needed after pre,mid1,mid2 remaps copy_size = amount needed internally for extra copy of data scratch_size = amount needed internally for remap scratch space for each remap: use out space for result if big enough, else require copy buffer accumulate largest required remap scratch space */ out_size = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1); first_size = (first_ihi-first_ilo+1) * (first_jhi-first_jlo+1) * (first_khi-first_klo+1); second_size = (second_ihi-second_ilo+1) * (second_jhi-second_jlo+1) * (second_khi-second_klo+1); third_size = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) * (third_khi-third_klo+1); copy_size = 0; scratch_size = 0; if (plan->pre_plan) { if (first_size <= out_size) plan->pre_target = 0; else { plan->pre_target = 1; copy_size = MAX(copy_size,first_size); } scratch_size = MAX(scratch_size,first_size); } if (plan->mid1_plan) { if (second_size <= out_size) plan->mid1_target = 0; else { plan->mid1_target = 1; copy_size = MAX(copy_size,second_size); } scratch_size = MAX(scratch_size,second_size); } if (plan->mid2_plan) { if (third_size <= out_size) plan->mid2_target = 0; else { plan->mid2_target = 1; copy_size = MAX(copy_size,third_size); } scratch_size = MAX(scratch_size,third_size); } if (plan->post_plan) scratch_size = MAX(scratch_size,out_size); *nbuf = copy_size + scratch_size; if (copy_size) { plan->copy = (FFT_DATA *) malloc(copy_size*sizeof(FFT_DATA)); if (plan->copy == NULL) return NULL; } else plan->copy = NULL; if (scratch_size) { plan->scratch = (FFT_DATA *) malloc(scratch_size*sizeof(FFT_DATA)); if (plan->scratch == NULL) return NULL; } else plan->scratch = NULL; /* system specific pre-computation of 1d FFT coeffs and scaling normalization */ #ifdef FFT_SGI plan->coeff1 = (FFT_DATA *) malloc((nfast+15)*sizeof(FFT_DATA)); plan->coeff2 = (FFT_DATA *) malloc((nmid+15)*sizeof(FFT_DATA)); plan->coeff3 = (FFT_DATA *) malloc((nslow+15)*sizeof(FFT_DATA)); if (plan->coeff1 == NULL || plan->coeff2 == NULL || plan->coeff3 == NULL) return NULL; FFT_1D_INIT(nfast,plan->coeff1); FFT_1D_INIT(nmid,plan->coeff2); FFT_1D_INIT(nslow,plan->coeff3); if (scaled == 0) plan->scaled = 0; else { plan->scaled = 1; plan->norm = 1.0/(nfast*nmid*nslow); plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1); } #endif #ifdef FFT_INTEL flag = 0; num = 0; factor(nfast,&num,list); for (i = 0; i < num; i++) if (list[i] != 2 && list[i] != 3 && 
list[i] != 5) flag = 1; num = 0; factor(nmid,&num,list); for (i = 0; i < num; i++) if (list[i] != 2 && list[i] != 3 && list[i] != 5) flag = 1; num = 0; factor(nslow,&num,list); for (i = 0; i < num; i++) if (list[i] != 2 && list[i] != 3 && list[i] != 5) flag = 1; MPI_Allreduce(&flag,&fftflag,1,MPI_INT,MPI_MAX,comm); if (fftflag) { if (me == 0) printf("ERROR: FFTs are not power of 2,3,5\n"); return NULL; } plan->coeff1 = (FFT_DATA *) malloc((3*nfast/2+1)*sizeof(FFT_DATA)); plan->coeff2 = (FFT_DATA *) malloc((3*nmid/2+1)*sizeof(FFT_DATA)); plan->coeff3 = (FFT_DATA *) malloc((3*nslow/2+1)*sizeof(FFT_DATA)); if (plan->coeff1 == NULL || plan->coeff2 == NULL || plan->coeff3 == NULL) return NULL; flag = 0; FFT_1D_INIT(&dummy,&nfast,&flag,plan->coeff1); FFT_1D_INIT(&dummy,&nmid,&flag,plan->coeff2); FFT_1D_INIT(&dummy,&nslow,&flag,plan->coeff3); if (scaled == 0) { plan->scaled = 1; plan->norm = nfast*nmid*nslow; plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1); } else plan->scaled = 0; #endif #ifdef FFT_DEC if (scaled == 0) { plan->scaled = 1; plan->norm = nfast*nmid*nslow; plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1); } else plan->scaled = 0; #endif #ifdef FFT_T3E plan->coeff1 = (double *) malloc((12*nfast)*sizeof(double)); plan->coeff2 = (double *) malloc((12*nmid)*sizeof(double)); plan->coeff3 = (double *) malloc((12*nslow)*sizeof(double)); if (plan->coeff1 == NULL || plan->coeff2 == NULL || plan->coeff3 == NULL) return NULL; plan->work1 = (double *) malloc((8*nfast)*sizeof(double)); plan->work2 = (double *) malloc((8*nmid)*sizeof(double)); plan->work3 = (double *) malloc((8*nslow)*sizeof(double)); if (plan->work1 == NULL || plan->work2 == NULL || plan->work3 == NULL) return NULL; isign = 0; scalef = 1.0; isys = 0; FFT_1D_INIT(&isign,&nfast,&scalef,dummy,dummy,plan->coeff1,dummy,&isys); FFT_1D_INIT(&isign,&nmid,&scalef,dummy,dummy,plan->coeff2,dummy,&isys); FFT_1D_INIT(&isign,&nslow,&scalef,dummy,dummy,plan->coeff3,dummy,&isys); if (scaled == 0) plan->scaled = 0; else { plan->scaled = 1; plan->norm = 1.0/(nfast*nmid*nslow); plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1); } #endif #ifdef FFT_FFTW plan->plan_fast_forward = fftw_create_plan(nfast,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE); plan->plan_fast_backward = fftw_create_plan(nfast,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE); if (nmid == nfast) { plan->plan_mid_forward = plan->plan_fast_forward; plan->plan_mid_backward = plan->plan_fast_backward; } else { plan->plan_mid_forward = fftw_create_plan(nmid,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE); plan->plan_mid_backward = fftw_create_plan(nmid,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE); } if (nslow == nfast) { plan->plan_slow_forward = plan->plan_fast_forward; plan->plan_slow_backward = plan->plan_fast_backward; } else if (nslow == nmid) { plan->plan_slow_forward = plan->plan_mid_forward; plan->plan_slow_backward = plan->plan_mid_backward; } else { plan->plan_slow_forward = fftw_create_plan(nslow,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE); plan->plan_slow_backward = fftw_create_plan(nslow,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE); } if (scaled == 0) plan->scaled = 0; else { plan->scaled = 1; plan->norm = 1.0/(nfast*nmid*nslow); plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1); } #endif return plan; }
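/*
 * Aside: every intermediate data distribution chosen in fft_3d_create_plan splits
 * an axis of length n over p processes with lo = ip*n/p and hi = (ip+1)*n/p - 1,
 * which partitions the axis into contiguous chunks whose sizes differ by at most
 * one.  The helper below is that formula in isolation; block_range is an
 * illustrative name and not part of the FFT package.
 */
static void block_range(int n, int p, int ip, int *lo, int *hi)
{
  *lo = (int)((long)ip * n / p);            /* first index owned by proc ip */
  *hi = (int)((long)(ip + 1) * n / p) - 1;  /* last index owned by proc ip  */
}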
/* This version is the traditional level-synchronized BFS using two queues. A * bitmap is used to indicate which vertices have been visited. Messages are * sent and processed asynchronously throughout the code to hopefully overlap * communication with computation. */ void run_bfs(int64_t root, int64_t* pred) { allocate_memory(); const ptrdiff_t nlocalverts = g.nlocalverts; const size_t* const restrict rowstarts = g.rowstarts; const int64_t* const restrict column = g.column; /* Set up the visited bitmap. */ const int ulong_bits = sizeof(unsigned long) * CHAR_BIT; const int ulong_bits_squared = ulong_bits * ulong_bits; int64_t local_queue_summary_size = g_local_queue_summary_size; int64_t local_queue_size = g_local_queue_size; int lg_local_queue_size = g_lg_local_queue_size; int64_t global_queue_summary_size = g_global_queue_summary_size; int64_t global_queue_size = g_global_queue_size; #define SWIZZLE_VERTEX(c) (((int64_t)(VERTEX_OWNER(c)) << lg_local_queue_size) | (int64_t)(VERTEX_LOCAL(c))) #if 0 int64_t* restrict column_swizzled = (int64_t*)xmalloc(nlocaledges * sizeof(int64_t)); { size_t i; for (i = 0; i < nlocaledges; ++i) { int64_t c = column[i]; column_swizzled[i] = SWIZZLE_VERTEX(c); } } #endif unsigned long* restrict in_queue = g_in_queue; memset(in_queue, 0, global_queue_size * sizeof(unsigned long)); unsigned long* restrict in_queue_summary = g_in_queue_summary; memset(in_queue_summary, 0, global_queue_summary_size * sizeof(unsigned long)); unsigned long* restrict out_queue = g_out_queue; unsigned long* restrict out_queue_summary = g_out_queue_summary; unsigned long* restrict visited = g_visited; memset(visited, 0, local_queue_size * sizeof(unsigned long)); #define SET_IN(v) do {int64_t vs = SWIZZLE_VERTEX(v); size_t word_idx = vs / ulong_bits; int bit_idx = vs % ulong_bits; unsigned long mask = (1UL << bit_idx); in_queue_summary[word_idx / ulong_bits] |= (1UL << (word_idx % ulong_bits)); in_queue[word_idx] |= mask;} while (0) #define TEST_IN(vs) (((in_queue_summary[vs / ulong_bits / ulong_bits] & (1UL << ((vs / ulong_bits) % ulong_bits))) != 0) && ((in_queue[vs / ulong_bits] & (1UL << (vs % ulong_bits))) != 0)) #define TEST_VISITED_LOCAL(v) ((visited[(v) / ulong_bits] & (1UL << ((v) % ulong_bits))) != 0) // #define SET_VISITED_LOCAL(v) do {size_t word_idx = (v) / ulong_bits; int bit_idx = (v) % ulong_bits; unsigned long mask = (1UL << bit_idx); __sync_fetch_and_or(&visited[word_idx], mask); __sync_fetch_and_or(&out_queue[word_idx], mask);} while (0) #define SET_VISITED_LOCAL(v) do {size_t word_idx = (v) / ulong_bits; int bit_idx = (v) % ulong_bits; unsigned long mask = (1UL << bit_idx); visited[word_idx] |= mask; out_queue[word_idx] |= mask;} while (0) SET_IN(root); {ptrdiff_t i; _Pragma("omp parallel for schedule(static)") for (i = 0; i < nlocalverts; ++i) pred[i] = -1;} if (VERTEX_OWNER(root) == rank) { pred[VERTEX_LOCAL(root)] = root; SET_VISITED_LOCAL(VERTEX_LOCAL(root)); } uint16_t cur_level = 0; while (1) { ++cur_level; #if 0 if (rank == 0) fprintf(stderr, "BFS level %" PRIu16 "\n", cur_level); #endif memset(out_queue, 0, (nlocalverts + ulong_bits - 1) / ulong_bits * sizeof(unsigned long)); // memset(out_queue_summary, 0, (nlocalverts + ulong_bits_squared - 1) / ulong_bits_squared * sizeof(unsigned long)); ptrdiff_t i, ii; #if 0 #pragma omp parallel for schedule(static) for (i = 0; i < global_queue_summary_size; ++i) { unsigned long val = 0UL; int j; unsigned long mask = 1UL; for (j = 0; j < ulong_bits; ++j, mask <<= 1) { if (in_queue[i * ulong_bits + j]) val |= mask; } 
in_queue_summary[i] = val; } #endif unsigned long not_done = 0; #pragma omp parallel for schedule(static) reduction(|:not_done) for (ii = 0; ii < nlocalverts; ii += ulong_bits) { size_t i, i_end = ii + ulong_bits; if (i_end > nlocalverts) i_end = nlocalverts; for (i = ii; i < i_end; ++i) { if (!TEST_VISITED_LOCAL(i)) { size_t j, j_end = rowstarts[i + 1]; for (j = rowstarts[i]; j < j_end; ++j) { int64_t v1 = column[j]; int64_t v1_swizzled = SWIZZLE_VERTEX(v1); if (TEST_IN(v1_swizzled)) { pred[i] = (v1 & INT64_C(0xFFFFFFFFFFFF)) | ((int64_t)cur_level << 48); not_done |= 1; SET_VISITED_LOCAL(i); break; } } } } } #if 1 #pragma omp parallel for schedule(static) for (i = 0; i < local_queue_summary_size; ++i) { unsigned long val = 0UL; int j; unsigned long mask = 1UL; for (j = 0; j < ulong_bits; ++j, mask <<= 1) { unsigned long full_val = out_queue[i * ulong_bits + j]; visited[i * ulong_bits + j] |= full_val; if (full_val) val |= mask; } out_queue_summary[i] = val; // not_done |= val; } #endif MPI_Allreduce(MPI_IN_PLACE, &not_done, 1, MPI_UNSIGNED_LONG, MPI_BOR, MPI_COMM_WORLD); if (not_done == 0) break; MPI_Allgather(out_queue, local_queue_size, MPI_UNSIGNED_LONG, in_queue, local_queue_size, MPI_UNSIGNED_LONG, MPI_COMM_WORLD); MPI_Allgather(out_queue_summary, local_queue_summary_size, MPI_UNSIGNED_LONG, in_queue_summary, local_queue_summary_size, MPI_UNSIGNED_LONG, MPI_COMM_WORLD); } deallocate_memory(); }
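/*
 * Aside: the bitmap macros used by run_bfs pack one bit per local vertex into an
 * array of unsigned long (word index v / bits, bit position v % bits), with a
 * second "summary" level that sets one bit per word of the first level so whole
 * empty words can be skipped.  A plain-function version of the two basic
 * operations is sketched below; bitmap_set and bitmap_test are illustrative names
 * and omit the summary level.
 */
#include <limits.h>
#include <stddef.h>

static void bitmap_set(unsigned long *bm, size_t v)
{
  const int bits = sizeof(unsigned long) * CHAR_BIT;
  bm[v / bits] |= 1UL << (v % bits);         /* mark vertex v visited */
}

static int bitmap_test(const unsigned long *bm, size_t v)
{
  const int bits = sizeof(unsigned long) * CHAR_BIT;
  return (bm[v / bits] >> (v % bits)) & 1UL; /* nonzero if v already visited */
}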
void star_density(void) { int j; #ifdef EDDINGTON_TENSOR_STARS int i, dummy; int ngrp, sendTask, recvTask, place, nexport, nimport, ndone, ndone_flag; #endif /* clear Je in all gas particles */ for(j = 0; j < N_gas; j++) { if(P[j].Type == 0) SphP[j].Je = 0; #ifdef SFR if(P[j].Type == 0) { SphP[j].Je += SphP[j].Sfr * All.IonizingLumPerSFR * (PROTONMASS / (P[j].Mass * All.UnitMass_in_g / All.HubbleParam)) * All.UnitTime_in_s / All.HubbleParam; } #endif } #ifdef EDDINGTON_TENSOR_STARS /* allocate buffers to arrange communication */ Ngblist = (int *) mymalloc(NumPart * sizeof(int)); All.BunchSize = (int) ((All.BufferSize * 1024 * 1024) / (sizeof(struct data_index) + sizeof(struct data_nodelist) + 2 * sizeof(struct stardata_in))); DataIndexTable = (struct data_index *) mymalloc(All.BunchSize * sizeof(struct data_index)); DataNodeList = (struct data_nodelist *) mymalloc(All.BunchSize * sizeof(struct data_nodelist)); i = FirstActiveParticle; /* beginn with this index */ do { for(j = 0; j < NTask; j++) { Send_count[j] = 0; Exportflag[j] = -1; } /* do local particles and prepare export list */ for(nexport = 0; i >= 0; i = NextActiveParticle[i]) { if(P[i].Type == 4) { if(star_density_evaluate(i, 0, &nexport, Send_count) < 0) break; } } #ifdef MYSORT mysort_dataindex(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare); #else qsort(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare); #endif MPI_Allgather(Send_count, NTask, MPI_INT, Sendcount_matrix, NTask, MPI_INT, MPI_COMM_WORLD); for(j = 0, nimport = 0, Recv_offset[0] = 0, Send_offset[0] = 0; j < NTask; j++) { Recv_count[j] = Sendcount_matrix[j * NTask + ThisTask]; nimport += Recv_count[j]; if(j > 0) { Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1]; Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1]; } } StarDataGet = (struct stardata_in *) mymalloc(nimport * sizeof(struct stardata_in)); StarDataIn = (struct stardata_in *) mymalloc(nexport * sizeof(struct stardata_in)); /* prepare particle data for export */ for(j = 0; j < nexport; j++) { place = DataIndexTable[j].Index; StarDataIn[j].Pos[0] = P[place].Pos[0]; StarDataIn[j].Pos[1] = P[place].Pos[1]; StarDataIn[j].Pos[2] = P[place].Pos[2]; StarDataIn[j].Hsml = PPP[place].Hsml; StarDataIn[j].Density = P[place].DensAroundStar; StarDataIn[j].Mass = P[place].Mass; memcpy(StarDataIn[j].NodeList, DataNodeList[DataIndexTable[j].IndexGet].NodeList, NODELISTLENGTH * sizeof(int)); } /* exchange particle data */ for(ngrp = 1; ngrp < (1 << PTask); ngrp++) { sendTask = ThisTask; recvTask = ThisTask ^ ngrp; if(recvTask < NTask) { if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0) { /* get the particles */ MPI_Sendrecv(&StarDataIn[Send_offset[recvTask]], Send_count[recvTask] * sizeof(struct stardata_in), MPI_BYTE, recvTask, TAG_DENS_A, &StarDataGet[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(struct stardata_in), MPI_BYTE, recvTask, TAG_DENS_A, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } } } myfree(StarDataIn); /* now do the particles that were sent to us */ for(j = 0; j < nimport; j++) star_density_evaluate(j, 1, &dummy, &dummy); /* check whether this is the last iteration */ if(i < 0) ndone_flag = 1; else ndone_flag = 0; MPI_Allreduce(&ndone_flag, &ndone, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); myfree(StarDataGet); } while(ndone < NTask); myfree(DataNodeList); myfree(DataIndexTable); myfree(Ngblist); #endif //for EDDINGTON_TENSOR_STARS }
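/*
 * Aside: the export exchange in star_density pairs ranks with
 * recvTask = ThisTask ^ ngrp for ngrp = 1 .. 2^PTask - 1, the usual
 * hypercube-style pairwise pattern in which every rank talks to exactly one
 * partner per stage.  The sketch below shows just that pairing with a plain
 * MPI_Sendrecv of one int per stage, assuming a power-of-two communicator size;
 * pairwise_exchange and the TAG value are illustrative and not Gadget code.
 */
#include <mpi.h>

#define TAG 99

static void pairwise_exchange(MPI_Comm comm, int send_val, int *recv_vals)
{
  int rank, size, ngrp;
  MPI_Comm_size(comm, &size);
  MPI_Comm_rank(comm, &rank);
  for (ngrp = 1; ngrp < size; ngrp++) {      /* assumes size is a power of two */
    int partner = rank ^ ngrp;               /* unique partner for this stage  */
    MPI_Sendrecv(&send_val, 1, MPI_INT, partner, TAG,
                 &recv_vals[partner], 1, MPI_INT, partner, TAG,
                 comm, MPI_STATUS_IGNORE);
  }
}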
PetscErrorCode heavyEdgeMatchAgg(IS perm,Mat a_Gmat,PetscInt verbose,PetscCoarsenData **a_locals_llist) { PetscErrorCode ierr; PetscBool isMPI; MPI_Comm wcomm = ((PetscObject)a_Gmat)->comm; PetscInt sub_it,kk,n,ix,*idx,*ii,iter,Iend,my0; PetscMPIInt rank,size; const PetscInt nloc = a_Gmat->rmap->n,n_iter=6; /* need to figure out how to stop this */ PetscInt *lid_cprowID,*lid_gid; PetscBool *lid_matched; Mat_SeqAIJ *matA, *matB=0; Mat_MPIAIJ *mpimat=0; PetscScalar one=1.; PetscCoarsenData *agg_llists = PETSC_NULL,*deleted_list = PETSC_NULL; Mat cMat,tMat,P; MatScalar *ap; PetscMPIInt tag1,tag2; PetscFunctionBegin; ierr = MPI_Comm_rank(wcomm, &rank);CHKERRQ(ierr); ierr = MPI_Comm_size(wcomm, &size);CHKERRQ(ierr); ierr = MatGetOwnershipRange(a_Gmat, &my0, &Iend);CHKERRQ(ierr); ierr = PetscCommGetNewTag(wcomm, &tag1);CHKERRQ(ierr); ierr = PetscCommGetNewTag(wcomm, &tag2);CHKERRQ(ierr); ierr = PetscMalloc(nloc*sizeof(PetscInt), &lid_gid);CHKERRQ(ierr); /* explicit array needed */ ierr = PetscMalloc(nloc*sizeof(PetscInt), &lid_cprowID);CHKERRQ(ierr); ierr = PetscMalloc(nloc*sizeof(PetscBool), &lid_matched);CHKERRQ(ierr); ierr = PetscCDCreate(nloc, &agg_llists);CHKERRQ(ierr); /* ierr = PetscCDSetChuckSize(agg_llists, nloc+1);CHKERRQ(ierr); */ *a_locals_llist = agg_llists; ierr = PetscCDCreate(size, &deleted_list);CHKERRQ(ierr); ierr = PetscCDSetChuckSize(deleted_list, 100);CHKERRQ(ierr); /* setup 'lid_gid' for scatters and add self to all lists */ for (kk=0;kk<nloc;kk++) { lid_gid[kk] = kk + my0; ierr = PetscCDAppendID(agg_llists, kk, my0+kk);CHKERRQ(ierr); } /* make a copy of the graph, this gets destroyed in iterates */ ierr = MatDuplicate(a_Gmat,MAT_COPY_VALUES,&cMat);CHKERRQ(ierr); ierr = PetscObjectTypeCompare((PetscObject)a_Gmat, MATMPIAIJ, &isMPI);CHKERRQ(ierr); iter = 0; while(iter++ < n_iter) { PetscScalar *cpcol_gid,*cpcol_max_ew,*cpcol_max_pe,*lid_max_ew; PetscBool *cpcol_matched; PetscMPIInt *cpcol_pe,proc; Vec locMaxEdge,locMaxPE,ghostMaxEdge,ghostMaxPE; PetscInt nEdges,n_nz_row,jj; Edge *Edges; PetscInt gid; const PetscInt *perm_ix, n_sub_its = 120; /* get submatrices of cMat */ if (isMPI) { mpimat = (Mat_MPIAIJ*)cMat->data; matA = (Mat_SeqAIJ*)mpimat->A->data; matB = (Mat_SeqAIJ*)mpimat->B->data; /* force compressed storage of B */ matB->compressedrow.check = PETSC_TRUE; ierr = MatCheckCompressedRow(mpimat->B,&matB->compressedrow,matB->i,cMat->rmap->n,-1.0);CHKERRQ(ierr); assert(matB->compressedrow.use); } else { matA = (Mat_SeqAIJ*)cMat->data; } assert(matA && !matA->compressedrow.use); assert(matB==0 || matB->compressedrow.use); /* set max edge on nodes */ ierr = MatGetVecs(cMat, &locMaxEdge, 0);CHKERRQ(ierr); ierr = MatGetVecs(cMat, &locMaxPE, 0);CHKERRQ(ierr); /* get 'cpcol_pe' & 'cpcol_gid' & init. 
'cpcol_matched' using 'mpimat->lvec' */ if (mpimat) { Vec vec; PetscScalar vval; ierr = MatGetVecs(cMat, &vec, 0);CHKERRQ(ierr); /* cpcol_pe */ vval = (PetscScalar)(rank); for (kk=0,gid=my0;kk<nloc;kk++,gid++) { ierr = VecSetValues(vec, 1, &gid, &vval, INSERT_VALUES);CHKERRQ(ierr); /* set with GID */ } ierr = VecAssemblyBegin(vec);CHKERRQ(ierr); ierr = VecAssemblyEnd(vec);CHKERRQ(ierr); ierr = VecScatterBegin(mpimat->Mvctx,vec,mpimat->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); ierr = VecScatterEnd(mpimat->Mvctx,vec,mpimat->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); ierr = VecGetArray(mpimat->lvec, &cpcol_gid);CHKERRQ(ierr); /* get proc ID in 'cpcol_gid' */ ierr = VecGetLocalSize(mpimat->lvec, &n);CHKERRQ(ierr); ierr = PetscMalloc(n*sizeof(PetscInt), &cpcol_pe);CHKERRQ(ierr); for (kk=0;kk<n;kk++) cpcol_pe[kk] = (PetscMPIInt)PetscRealPart(cpcol_gid[kk]); ierr = VecRestoreArray(mpimat->lvec, &cpcol_gid);CHKERRQ(ierr); /* cpcol_gid */ for (kk=0,gid=my0;kk<nloc;kk++,gid++) { vval = (PetscScalar)(gid); ierr = VecSetValues(vec, 1, &gid, &vval, INSERT_VALUES);CHKERRQ(ierr); /* set with GID */ } ierr = VecAssemblyBegin(vec);CHKERRQ(ierr); ierr = VecAssemblyEnd(vec);CHKERRQ(ierr); ierr = VecScatterBegin(mpimat->Mvctx,vec,mpimat->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); ierr = VecScatterEnd(mpimat->Mvctx,vec,mpimat->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); ierr = VecDestroy(&vec);CHKERRQ(ierr); ierr = VecGetArray(mpimat->lvec, &cpcol_gid);CHKERRQ(ierr); /* get proc ID in 'cpcol_gid' */ /* cpcol_matched */ ierr = VecGetLocalSize(mpimat->lvec, &n);CHKERRQ(ierr); ierr = PetscMalloc(n*sizeof(PetscBool), &cpcol_matched);CHKERRQ(ierr); for (kk=0;kk<n;kk++) cpcol_matched[kk] = PETSC_FALSE; } /* need an inverse map - locals */ for (kk=0;kk<nloc;kk++) lid_cprowID[kk] = -1; /* set index into compressed row 'lid_cprowID' */ if (matB) { ii = matB->compressedrow.i; for (ix=0; ix<matB->compressedrow.nrows; ix++) { lid_cprowID[matB->compressedrow.rindex[ix]] = ix; } } /* get removed IS, use '' */ /* if (iter==1) { */ /* PetscInt *lid_rem,idx; */ /* ierr = PetscMalloc(nloc*sizeof(PetscInt), &lid_rem);CHKERRQ(ierr); */ /* for (kk=idx=0;kk<nloc;kk++){ */ /* PetscInt nn,lid=kk; */ /* ii = matA->i; nn = ii[lid+1] - ii[lid]; */ /* if ((ix=lid_cprowID[lid]) != -1) { /\* if I have any ghost neighbors *\/ */ /* ii = matB->compressedrow.i; */ /* nn += ii[ix+1] - ii[ix]; */ /* } */ /* if (nn < 2) { */ /* lid_rem[idx++] = kk + my0; */ /* } */ /* } */ /* ierr = PetscCDSetRemovedIS(agg_llists, wcomm, idx, lid_rem);CHKERRQ(ierr); */ /* ierr = PetscFree(lid_rem);CHKERRQ(ierr); */ /* } */ /* compute 'locMaxEdge' & 'locMaxPE', and create list of edges, count edges' */ for (nEdges=0,kk=0,gid=my0;kk<nloc;kk++,gid++){ PetscReal max_e = 0., tt; PetscScalar vval; PetscInt lid = kk; PetscMPIInt max_pe=rank,pe; ii = matA->i; n = ii[lid+1] - ii[lid]; idx = matA->j + ii[lid]; ap = matA->a + ii[lid]; for (jj=0; jj<n; jj++) { PetscInt lidj = idx[jj]; if (lidj != lid && PetscRealPart(ap[jj]) > max_e) max_e = PetscRealPart(ap[jj]); if (lidj > lid) nEdges++; } if ((ix=lid_cprowID[lid]) != -1) { /* if I have any ghost neighbors */ ii = matB->compressedrow.i; n = ii[ix+1] - ii[ix]; ap = matB->a + ii[ix]; idx = matB->j + ii[ix]; for (jj=0 ; jj<n ; jj++) { if ((tt=PetscRealPart(ap[jj])) > max_e) max_e = tt; nEdges++; if ((pe=cpcol_pe[idx[jj]]) > max_pe) max_pe = pe; } } vval = max_e; ierr = VecSetValues(locMaxEdge, 1, &gid, &vval, INSERT_VALUES);CHKERRQ(ierr); vval = (PetscScalar)max_pe; ierr = 
VecSetValues(locMaxPE, 1, &gid, &vval, INSERT_VALUES);CHKERRQ(ierr); } ierr = VecAssemblyBegin(locMaxEdge);CHKERRQ(ierr); ierr = VecAssemblyEnd(locMaxEdge);CHKERRQ(ierr); ierr = VecAssemblyBegin(locMaxPE);CHKERRQ(ierr); ierr = VecAssemblyEnd(locMaxPE);CHKERRQ(ierr); /* get 'cpcol_max_ew' & 'cpcol_max_pe' */ if (mpimat) { ierr = VecDuplicate(mpimat->lvec, &ghostMaxEdge);CHKERRQ(ierr); ierr = VecScatterBegin(mpimat->Mvctx,locMaxEdge,ghostMaxEdge,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); ierr = VecScatterEnd(mpimat->Mvctx,locMaxEdge,ghostMaxEdge,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); ierr = VecGetArray(ghostMaxEdge, &cpcol_max_ew);CHKERRQ(ierr); ierr = VecDuplicate(mpimat->lvec, &ghostMaxPE);CHKERRQ(ierr); ierr = VecScatterBegin(mpimat->Mvctx,locMaxPE,ghostMaxPE,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); ierr = VecScatterEnd(mpimat->Mvctx,locMaxPE,ghostMaxPE,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); ierr = VecGetArray(ghostMaxPE, &cpcol_max_pe);CHKERRQ(ierr); } /* setup sorted list of edges */ ierr = PetscMalloc(nEdges*sizeof(Edge), &Edges);CHKERRQ(ierr); ierr = ISGetIndices(perm, &perm_ix);CHKERRQ(ierr); for (nEdges=n_nz_row=kk=0;kk<nloc;kk++){ PetscInt nn, lid = perm_ix[kk]; ii = matA->i; nn = n = ii[lid+1] - ii[lid]; idx = matA->j + ii[lid]; ap = matA->a + ii[lid]; for (jj=0; jj<n; jj++) { PetscInt lidj = idx[jj]; assert(PetscRealPart(ap[jj])>0.); if (lidj > lid) { Edges[nEdges].lid0 = lid; Edges[nEdges].gid1 = lidj + my0; Edges[nEdges].cpid1 = -1; Edges[nEdges].weight = PetscRealPart(ap[jj]); nEdges++; } } if ((ix=lid_cprowID[lid]) != -1) { /* if I have any ghost neighbors */ ii = matB->compressedrow.i; n = ii[ix+1] - ii[ix]; ap = matB->a + ii[ix]; idx = matB->j + ii[ix]; nn += n; for (jj=0 ; jj<n ; jj++) { assert(PetscRealPart(ap[jj])>0.); Edges[nEdges].lid0 = lid; Edges[nEdges].gid1 = (PetscInt)PetscRealPart(cpcol_gid[idx[jj]]); Edges[nEdges].cpid1 = idx[jj]; Edges[nEdges].weight = PetscRealPart(ap[jj]); nEdges++; } } if (nn > 1) n_nz_row++; else if (iter == 1){ /* should select this because it is technically in the MIS but lets not */ ierr = PetscCDRemoveAll(agg_llists, lid);CHKERRQ(ierr); } } ierr = ISRestoreIndices(perm,&perm_ix);CHKERRQ(ierr); qsort(Edges, nEdges, sizeof(Edge), gamg_hem_compare); /* projection matrix */ ierr = MatCreateAIJ(wcomm, nloc, nloc, PETSC_DETERMINE, PETSC_DETERMINE, 1, 0, 1, 0, &P);CHKERRQ(ierr); /* clear matched flags */ for (kk=0;kk<nloc;kk++) lid_matched[kk] = PETSC_FALSE; /* process - communicate - process */ for (sub_it=0;sub_it<n_sub_its;sub_it++){ PetscInt nactive_edges; ierr = VecGetArray(locMaxEdge, &lid_max_ew);CHKERRQ(ierr); for (kk=nactive_edges=0;kk<nEdges;kk++){ /* HEM */ const Edge *e = &Edges[kk]; const PetscInt lid0=e->lid0,gid1=e->gid1,cpid1=e->cpid1,gid0=lid0+my0,lid1=gid1-my0; PetscBool isOK = PETSC_TRUE; /* skip if either (local) vertex is done already */ if (lid_matched[lid0] || (gid1>=my0 && gid1<Iend && lid_matched[gid1-my0])) { continue; } /* skip if ghost vertex is done */ if (cpid1 != -1 && cpcol_matched[cpid1]) { continue; } nactive_edges++; /* skip if I have a bigger edge someplace (lid_max_ew gets updated) */ if (PetscRealPart(lid_max_ew[lid0]) > e->weight + 1.e-12) { continue; } if (cpid1 == -1) { if (PetscRealPart(lid_max_ew[lid1]) > e->weight + 1.e-12) { continue; } } else { /* see if edge might get matched on other proc */ PetscReal g_max_e = PetscRealPart(cpcol_max_ew[cpid1]); if (g_max_e > e->weight + 1.e-12) { continue; } else if (e->weight > g_max_e - 1.e-12 && 
(PetscMPIInt)PetscRealPart(cpcol_max_pe[cpid1]) > rank) { /* check for max_e == to this edge and larger processor that will deal with this */ continue; } } /* check ghost for v0 */ if (isOK){ PetscReal max_e,ew; if ((ix=lid_cprowID[lid0]) != -1) { /* if I have any ghost neighbors */ ii = matB->compressedrow.i; n = ii[ix+1] - ii[ix]; ap = matB->a + ii[ix]; idx = matB->j + ii[ix]; for (jj=0 ; jj<n && isOK; jj++) { PetscInt lidj = idx[jj]; if (cpcol_matched[lidj]) continue; ew = PetscRealPart(ap[jj]); max_e = PetscRealPart(cpcol_max_ew[lidj]); /* check for max_e == to this edge and larger processor that will deal with this */ if (ew > max_e - 1.e-12 && ew > PetscRealPart(lid_max_ew[lid0]) - 1.e-12 && (PetscMPIInt)PetscRealPart(cpcol_max_pe[lidj]) > rank){ isOK = PETSC_FALSE; } } } /* for v1 */ if (cpid1 == -1 && isOK){ if ((ix=lid_cprowID[lid1]) != -1) { /* if I have any ghost neighbors */ ii = matB->compressedrow.i; n = ii[ix+1] - ii[ix]; ap = matB->a + ii[ix]; idx = matB->j + ii[ix]; for (jj=0 ; jj<n && isOK ; jj++) { PetscInt lidj = idx[jj]; if (cpcol_matched[lidj]) continue; ew = PetscRealPart(ap[jj]); max_e = PetscRealPart(cpcol_max_ew[lidj]); /* check for max_e == to this edge and larger processor that will deal with this */ if (ew > max_e - 1.e-12 && ew > PetscRealPart(lid_max_ew[lid1]) - 1.e-12 && (PetscMPIInt)PetscRealPart(cpcol_max_pe[lidj]) > rank) { isOK = PETSC_FALSE; } } } } } /* do it */ if (isOK){ if (cpid1 == -1) { lid_matched[lid1] = PETSC_TRUE; /* keep track of what we've done this round */ ierr = PetscCDAppendRemove(agg_llists, lid0, lid1);CHKERRQ(ierr); } else if (sub_it != n_sub_its-1) { /* add gid1 to list of ghost deleted by me -- I need their children */ proc = cpcol_pe[cpid1]; cpcol_matched[cpid1] = PETSC_TRUE; /* messing with VecGetArray array -- needed??? 
*/ ierr = PetscCDAppendID(deleted_list, proc, cpid1);CHKERRQ(ierr); /* cache to send messages */ ierr = PetscCDAppendID(deleted_list, proc, lid0);CHKERRQ(ierr); } else { continue; } lid_matched[lid0] = PETSC_TRUE; /* keep track of what we've done this round */ /* set projection */ ierr = MatSetValues(P,1,&gid0,1,&gid0,&one,INSERT_VALUES);CHKERRQ(ierr); ierr = MatSetValues(P,1,&gid1,1,&gid0,&one,INSERT_VALUES);CHKERRQ(ierr); } /* matched */ } /* edge loop */ /* deal with deleted ghost on first pass */ if (size>1 && sub_it != n_sub_its-1){ PetscCDPos pos; PetscBool ise = PETSC_FALSE; PetscInt nSend1, **sbuffs1,nSend2; #define REQ_BF_SIZE 100 MPI_Request *sreqs2[REQ_BF_SIZE],*rreqs2[REQ_BF_SIZE]; MPI_Status status; /* send request */ for (proc=0,nSend1=0;proc<size;proc++){ ierr = PetscCDEmptyAt(deleted_list,proc,&ise);CHKERRQ(ierr); if (!ise) nSend1++; } ierr = PetscMalloc(nSend1*sizeof(PetscInt*), &sbuffs1);CHKERRQ(ierr); /* ierr = PetscMalloc4(nSend1, PetscInt*, sbuffs1, nSend1, PetscInt*, rbuffs1, nSend1, MPI_Request*, sreqs1, nSend1, MPI_Request*, rreqs1);CHKERRQ(ierr); */ /* PetscFree4(sbuffs1,rbuffs1,sreqs1,rreqs1); */ for (proc=0,nSend1=0;proc<size;proc++){ /* count ghosts */ ierr = PetscCDSizeAt(deleted_list,proc,&n);CHKERRQ(ierr); if (n>0){ #define CHUNCK_SIZE 100 PetscInt *sbuff,*pt; MPI_Request *request; assert(n%2==0); n /= 2; ierr = PetscMalloc((2 + 2*n + n*CHUNCK_SIZE)*sizeof(PetscInt) + 2*sizeof(MPI_Request), &sbuff);CHKERRQ(ierr); /* PetscMalloc4(2+2*n,PetscInt,sbuffs1[nSend1],n*CHUNCK_SIZE,PetscInt,rbuffs1[nSend1],1,MPI_Request,rreqs2[nSend1],1,MPI_Request,sreqs2[nSend1]); */ /* save requests */ sbuffs1[nSend1] = sbuff; request = (MPI_Request*)sbuff; sbuff = pt = (PetscInt*)(request+1); *pt++ = n; *pt++ = rank; ierr = PetscCDGetHeadPos(deleted_list,proc,&pos);CHKERRQ(ierr); while(pos){ PetscInt lid0, cpid, gid; ierr = PetscLLNGetID(pos, &cpid);CHKERRQ(ierr); gid = (PetscInt)PetscRealPart(cpcol_gid[cpid]); ierr = PetscCDGetNextPos(deleted_list,proc,&pos);CHKERRQ(ierr); ierr = PetscLLNGetID(pos, &lid0);CHKERRQ(ierr); ierr = PetscCDGetNextPos(deleted_list,proc,&pos);CHKERRQ(ierr); *pt++ = gid; *pt++ = lid0; } /* send request tag1 [n, proc, n*[gid1,lid0] ] */ ierr = MPI_Isend(sbuff, 2*n+2, MPIU_INT, proc, tag1, wcomm, request);CHKERRQ(ierr); /* post recieve */ request = (MPI_Request*)pt; rreqs2[nSend1] = request; /* cache recv request */ pt = (PetscInt*)(request+1); ierr = MPI_Irecv(pt, n*CHUNCK_SIZE, MPIU_INT, proc, tag2, wcomm, request);CHKERRQ(ierr); /* clear list */ ierr = PetscCDRemoveAll(deleted_list, proc);CHKERRQ(ierr); nSend1++; } } /* recieve requests, send response, clear lists */ kk = nactive_edges; ierr = MPI_Allreduce(&kk,&nactive_edges,1,MPIU_INT,MPI_SUM,wcomm);CHKERRQ(ierr); /* not correct syncronization and global */ nSend2 = 0; while(1){ #define BF_SZ 10000 PetscMPIInt flag,count; PetscInt rbuff[BF_SZ],*pt,*pt2,*pt3,count2,*sbuff,count3; MPI_Request *request; ierr = MPI_Iprobe(MPI_ANY_SOURCE, tag1, wcomm, &flag, &status);CHKERRQ(ierr); if (!flag) break; ierr = MPI_Get_count(&status, MPIU_INT, &count);CHKERRQ(ierr); if (count > BF_SZ) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"buffer too small for recieve: %d",count); proc = status.MPI_SOURCE; /* recieve request tag1 [n, proc, n*[gid1,lid0] ] */ ierr = MPI_Recv(rbuff, count, MPIU_INT, proc, tag1, wcomm, &status);CHKERRQ(ierr); /* count sends */ pt = rbuff; count3 = count2 = 0; n = *pt++; kk = *pt++; assert(kk==proc); while(n--){ PetscInt gid1=*pt++, lid1=gid1-my0; kk=*pt++; assert(lid1>=0 && lid1<nloc); if 
(lid_matched[lid1]){ PetscPrintf(PETSC_COMM_SELF,"\t *** [%d]%s %d) ERROR recieved deleted gid %d, deleted by (lid) %d from proc %d\n",rank,__FUNCT__,sub_it,gid1,kk); PetscSleep(1); } assert(!lid_matched[lid1]); lid_matched[lid1] = PETSC_TRUE; /* keep track of what we've done this round */ ierr = PetscCDSizeAt(agg_llists, lid1, &kk);CHKERRQ(ierr); count2 += kk + 2; count3++; /* number of verts requested (n) */ } assert(pt-rbuff==count); if (count2 > count3*CHUNCK_SIZE) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Irecv will be too small: %d",count2); /* send tag2 *[lid0, n, n*[gid] ] */ ierr = PetscMalloc(count2*sizeof(PetscInt) + sizeof(MPI_Request), &sbuff);CHKERRQ(ierr); request = (MPI_Request*)sbuff; sreqs2[nSend2++] = request; /* cache request */ if (nSend2==REQ_BF_SIZE) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"buffer too small for requests: %d",nSend2); pt2 = sbuff = (PetscInt*)(request+1); pt = rbuff; n = *pt++; kk = *pt++; assert(kk==proc); while(n--){ /* read [n, proc, n*[gid1,lid0] */ PetscInt gid1=*pt++, lid1=gid1-my0, lid0=*pt++; assert(lid1>=0 && lid1<nloc); /* write [lid0, n, n*[gid] ] */ *pt2++ = lid0; pt3 = pt2++; /* save pointer for later */ /* for (pos=PetscCDGetHeadPos(agg_llists,lid1) ; pos ; pos=PetscCDGetNextPos(agg_llists,lid1,pos)){ */ ierr = PetscCDGetHeadPos(agg_llists,lid1,&pos);CHKERRQ(ierr); while(pos){ PetscInt gid; ierr = PetscLLNGetID(pos, &gid);CHKERRQ(ierr); ierr = PetscCDGetNextPos(agg_llists,lid1,&pos);CHKERRQ(ierr); *pt2++ = gid; } *pt3 = (pt2-pt3)-1; /* clear list */ ierr = PetscCDRemoveAll(agg_llists, lid1);CHKERRQ(ierr); } assert(pt2-sbuff==count2); assert(pt-rbuff==count); /* send requested data tag2 *[lid0, n, n*[gid1] ] */ ierr = MPI_Isend(sbuff, count2, MPIU_INT, proc, tag2, wcomm, request);CHKERRQ(ierr); } /* recieve tag2 *[lid0, n, n*[gid] ] */ for (kk=0;kk<nSend1;kk++){ PetscMPIInt count; MPI_Request *request; PetscInt *pt, *pt2; request = rreqs2[kk]; /* no need to free -- buffer is in 'sbuffs1' */ ierr = MPI_Wait(request, &status);CHKERRQ(ierr); ierr = MPI_Get_count(&status, MPIU_INT, &count);CHKERRQ(ierr); pt = pt2 = (PetscInt*)(request+1); while(pt-pt2 < count){ PetscInt lid0 = *pt++, n = *pt++; assert(lid0>=0 && lid0<nloc); while(n--){ PetscInt gid1 = *pt++; ierr = PetscCDAppendID(agg_llists, lid0, gid1);CHKERRQ(ierr); } } assert(pt-pt2==count); } /* wait for tag1 isends */ while(nSend1--){ MPI_Request *request; request = (MPI_Request*)sbuffs1[nSend1]; ierr = MPI_Wait(request, &status);CHKERRQ(ierr); ierr = PetscFree(request);CHKERRQ(ierr); } ierr = PetscFree(sbuffs1);CHKERRQ(ierr); /* wait for tag2 isends */ while(nSend2--){ MPI_Request *request = sreqs2[nSend2]; ierr = MPI_Wait(request, &status);CHKERRQ(ierr); ierr = PetscFree(request);CHKERRQ(ierr); } ierr = VecRestoreArray(ghostMaxEdge, &cpcol_max_ew);CHKERRQ(ierr); ierr = VecRestoreArray(ghostMaxPE, &cpcol_max_pe);CHKERRQ(ierr); /* get 'cpcol_matched' - use locMaxPE, ghostMaxEdge, cpcol_max_ew */ for (kk=0,gid=my0;kk<nloc;kk++,gid++) { PetscScalar vval = lid_matched[kk] ? 
1.0 : 0.0; ierr = VecSetValues(locMaxPE, 1, &gid, &vval, INSERT_VALUES);CHKERRQ(ierr); /* set with GID */ } ierr = VecAssemblyBegin(locMaxPE);CHKERRQ(ierr); ierr = VecAssemblyEnd(locMaxPE);CHKERRQ(ierr); ierr = VecScatterBegin(mpimat->Mvctx,locMaxPE,ghostMaxEdge,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); ierr = VecScatterEnd(mpimat->Mvctx,locMaxPE,ghostMaxEdge,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); ierr = VecGetArray(ghostMaxEdge, &cpcol_max_ew);CHKERRQ(ierr); ierr = VecGetLocalSize(mpimat->lvec, &n);CHKERRQ(ierr); for (kk=0;kk<n;kk++) { cpcol_matched[kk] = (PetscBool)(PetscRealPart(cpcol_max_ew[kk]) != 0.0); } ierr = VecRestoreArray(ghostMaxEdge, &cpcol_max_ew);CHKERRQ(ierr); } /* size > 1 */ /* compute 'locMaxEdge' */ ierr = VecRestoreArray(locMaxEdge, &lid_max_ew);CHKERRQ(ierr); for (kk=0,gid=my0;kk<nloc;kk++,gid++){ PetscReal max_e = 0.,tt; PetscScalar vval; PetscInt lid = kk; if (lid_matched[lid]) vval = 0.; else { ii = matA->i; n = ii[lid+1] - ii[lid]; idx = matA->j + ii[lid]; ap = matA->a + ii[lid]; for (jj=0; jj<n; jj++) { PetscInt lidj = idx[jj]; if (lid_matched[lidj]) continue; /* this is new - can change local max */ if (lidj != lid && PetscRealPart(ap[jj]) > max_e) max_e = PetscRealPart(ap[jj]); } if (lid_cprowID && (ix=lid_cprowID[lid]) != -1) { /* if I have any ghost neighbors */ ii = matB->compressedrow.i; n = ii[ix+1] - ii[ix]; ap = matB->a + ii[ix]; idx = matB->j + ii[ix]; for (jj=0 ; jj<n ; jj++) { PetscInt lidj = idx[jj]; if (cpcol_matched[lidj]) continue; if ((tt=PetscRealPart(ap[jj])) > max_e) max_e = tt; } } } vval = (PetscScalar)max_e; ierr = VecSetValues(locMaxEdge, 1, &gid, &vval, INSERT_VALUES);CHKERRQ(ierr); /* set with GID */ } ierr = VecAssemblyBegin(locMaxEdge);CHKERRQ(ierr); ierr = VecAssemblyEnd(locMaxEdge);CHKERRQ(ierr); if (size>1 && sub_it != n_sub_its-1){ /* compute 'cpcol_max_ew' */ ierr = VecScatterBegin(mpimat->Mvctx,locMaxEdge,ghostMaxEdge,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); ierr = VecScatterEnd(mpimat->Mvctx,locMaxEdge,ghostMaxEdge,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); ierr = VecGetArray(ghostMaxEdge, &cpcol_max_ew);CHKERRQ(ierr); ierr = VecGetArray(locMaxEdge, &lid_max_ew);CHKERRQ(ierr); /* compute 'cpcol_max_pe' */ for (kk=0,gid=my0;kk<nloc;kk++,gid++){ PetscInt lid = kk; PetscReal ew,v1_max_e,v0_max_e=PetscRealPart(lid_max_ew[lid]); PetscScalar vval; PetscMPIInt max_pe=rank,pe; if (lid_matched[lid]) vval = (PetscScalar)rank; else if ((ix=lid_cprowID[lid]) != -1) { /* if I have any ghost neighbors */ ii = matB->compressedrow.i; n = ii[ix+1] - ii[ix]; ap = matB->a + ii[ix]; idx = matB->j + ii[ix]; for (jj=0 ; jj<n ; jj++) { PetscInt lidj = idx[jj]; if (cpcol_matched[lidj]) continue; ew = PetscRealPart(ap[jj]); v1_max_e = PetscRealPart(cpcol_max_ew[lidj]); /* get max pe that has a max_e == to this edge w */ if ((pe=cpcol_pe[idx[jj]]) > max_pe && ew > v1_max_e - 1.e-12 && ew > v0_max_e - 1.e-12) max_pe = pe; assert(ew < v0_max_e + 1.e-12 && ew < v1_max_e + 1.e-12); } vval = (PetscScalar)max_pe; } ierr = VecSetValues(locMaxPE, 1, &gid, &vval, INSERT_VALUES);CHKERRQ(ierr); } ierr = VecAssemblyBegin(locMaxPE);CHKERRQ(ierr); ierr = VecAssemblyEnd(locMaxPE);CHKERRQ(ierr); ierr = VecScatterBegin(mpimat->Mvctx,locMaxPE,ghostMaxPE,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); ierr = VecScatterEnd(mpimat->Mvctx,locMaxPE,ghostMaxPE,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); ierr = VecGetArray(ghostMaxPE, &cpcol_max_pe);CHKERRQ(ierr); ierr = VecRestoreArray(locMaxEdge, &lid_max_ew);CHKERRQ(ierr); } /* deal with deleted ghost */ if 
(verbose>2) PetscPrintf(wcomm,"\t[%d]%s %d.%d: %d active edges.\n", rank,__FUNCT__,iter,sub_it,nactive_edges); if (!nactive_edges) break; } /* sub_it loop */ /* clean up iteration */ ierr = PetscFree(Edges);CHKERRQ(ierr); if (mpimat){ ierr = VecRestoreArray(ghostMaxEdge, &cpcol_max_ew);CHKERRQ(ierr); ierr = VecDestroy(&ghostMaxEdge);CHKERRQ(ierr); ierr = VecRestoreArray(ghostMaxPE, &cpcol_max_pe);CHKERRQ(ierr); ierr = VecDestroy(&ghostMaxPE);CHKERRQ(ierr); ierr = PetscFree(cpcol_pe);CHKERRQ(ierr); ierr = PetscFree(cpcol_matched);CHKERRQ(ierr); } ierr = VecDestroy(&locMaxEdge);CHKERRQ(ierr); ierr = VecDestroy(&locMaxPE);CHKERRQ(ierr); if (mpimat){ ierr = VecRestoreArray(mpimat->lvec, &cpcol_gid);CHKERRQ(ierr); } /* create next G if needed */ if (iter == n_iter) { /* hard wired test - need to look at full surrounded nodes or something */ ierr = MatDestroy(&P);CHKERRQ(ierr); ierr = MatDestroy(&cMat);CHKERRQ(ierr); break; } else { Vec diag; /* add identity for unmatched vertices so they stay alive */ for (kk=0,gid=my0;kk<nloc;kk++,gid++){ if (!lid_matched[kk]) { gid = kk+my0; ierr = MatGetRow(cMat,gid,&n,0,0);CHKERRQ(ierr); if (n>1){ ierr = MatSetValues(P,1,&gid,1,&gid,&one,INSERT_VALUES);CHKERRQ(ierr); } ierr = MatRestoreRow(cMat,gid,&n,0,0);CHKERRQ(ierr); } } ierr = MatAssemblyBegin(P,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); ierr = MatAssemblyEnd(P,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); /* project to make new graph with colapsed edges */ ierr = MatPtAP(cMat,P,MAT_INITIAL_MATRIX,1.0,&tMat);CHKERRQ(ierr); ierr = MatDestroy(&P);CHKERRQ(ierr); ierr = MatDestroy(&cMat);CHKERRQ(ierr); cMat = tMat; ierr = MatGetVecs(cMat, &diag, 0);CHKERRQ(ierr); ierr = MatGetDiagonal(cMat, diag);CHKERRQ(ierr); /* effectively PCJACOBI */ ierr = VecReciprocal(diag);CHKERRQ(ierr); ierr = VecSqrtAbs(diag);CHKERRQ(ierr); ierr = MatDiagonalScale(cMat, diag, diag);CHKERRQ(ierr); ierr = VecDestroy(&diag);CHKERRQ(ierr); } } /* coarsen iterator */ /* make fake matrix */ if (size>1){ Mat mat; PetscCDPos pos; PetscInt gid, NN, MM, jj = 0, mxsz = 0; for (kk=0;kk<nloc;kk++){ ierr = PetscCDSizeAt(agg_llists, kk, &jj);CHKERRQ(ierr); if (jj > mxsz) mxsz = jj; } ierr = MatGetSize(a_Gmat, &MM, &NN);CHKERRQ(ierr); if (mxsz > MM-nloc) mxsz = MM-nloc; ierr = MatCreateAIJ(wcomm, nloc, nloc,PETSC_DETERMINE, PETSC_DETERMINE,0, 0, mxsz, 0, &mat);CHKERRQ(ierr); /* */ for (kk=0,gid=my0;kk<nloc;kk++,gid++){ /* for (pos=PetscCDGetHeadPos(agg_llists,kk) ; pos ; pos=PetscCDGetNextPos(agg_llists,kk,pos)){ */ ierr = PetscCDGetHeadPos(agg_llists,kk,&pos);CHKERRQ(ierr); while(pos){ PetscInt gid1; ierr = PetscLLNGetID(pos, &gid1);CHKERRQ(ierr); ierr = PetscCDGetNextPos(agg_llists,kk,&pos);CHKERRQ(ierr); if (gid1 < my0 || gid1 >= my0+nloc) { ierr = MatSetValues(mat,1,&gid,1,&gid1,&one,ADD_VALUES);CHKERRQ(ierr); } } } ierr = MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); ierr = MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); ierr = PetscCDSetMat(agg_llists, mat);CHKERRQ(ierr); } ierr = PetscFree(lid_cprowID);CHKERRQ(ierr); ierr = PetscFree(lid_gid);CHKERRQ(ierr); ierr = PetscFree(lid_matched);CHKERRQ(ierr); ierr = PetscCDDestroy(deleted_list);CHKERRQ(ierr); PetscFunctionReturn(0); }
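/* Illustrative sketch (not part of the PETSc routine above): the core of the
   matching step it performs is greedy heavy-edge matching (HEM) -- visit the
   edges from heaviest to lightest and match both endpoints whenever neither
   is matched yet.  The EdgeEx type, comparator and the tiny test graph below
   are assumptions for illustration only; the real code additionally resolves
   ties across MPI ranks through the locMaxEdge/locMaxPE ghost vectors. */
#include <stdio.h>
#include <stdlib.h>

typedef struct { int v0, v1; double weight; } EdgeEx;

/* sort edges by decreasing weight */
static int edge_cmp(const void *a, const void *b)
{
  double wa = ((const EdgeEx*)a)->weight, wb = ((const EdgeEx*)b)->weight;
  return (wa < wb) - (wa > wb);
}

/* greedy HEM: match both endpoints of an edge if neither is matched yet */
static void heavy_edge_matching(int nvert, int nedge, EdgeEx *edges, int *match)
{
  int k;
  for (k = 0; k < nvert; k++) match[k] = -1;          /* -1 = unmatched */
  qsort(edges, nedge, sizeof(EdgeEx), edge_cmp);
  for (k = 0; k < nedge; k++) {
    int a = edges[k].v0, b = edges[k].v1;
    if (match[a] == -1 && match[b] == -1) { match[a] = b; match[b] = a; }
  }
}

int main(void)
{
  EdgeEx e[] = { {0,1,0.9}, {1,2,0.5}, {2,3,0.8}, {0,3,0.1} };
  int match[4], i;
  heavy_edge_matching(4, 4, e, match);
  for (i = 0; i < 4; i++) printf("vertex %d matched with %d\n", i, match[i]);
  return 0;
}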
int read_exoII_file(int Proc, int Num_Proc, PROB_INFO_PTR prob, PARIO_INFO_PTR pio_info, MESH_INFO_PTR mesh) { #ifndef ZOLTAN_NEMESIS Gen_Error(0, "Fatal: Nemesis requested but not linked with driver."); return 0; #else /* ZOLTAN_NEMESIS */ /* Local declarations. */ char *yo = "read_exoII_mesh"; char par_nem_fname[FILENAME_MAX+1], title[MAX_LINE_LENGTH+1]; char cmesg[256]; float ver; int i, pexoid, cpu_ws = 0, io_ws = 0; int *nnodes = NULL, *etypes = NULL; #ifdef DEBUG_EXO int j, k, elem; #endif FILE *fdtmp; /***************************** BEGIN EXECUTION ******************************/ DEBUG_TRACE_START(Proc, yo); /* since this is a test driver, set error reporting in exodus */ ex_opts(EX_VERBOSE | EX_DEBUG); /* generate the parallel filename for this processor */ gen_par_filename(pio_info->pexo_fname, par_nem_fname, pio_info, Proc, Num_Proc); /* * check whether parallel file exists. do the check with fopen * as ex_open coredumps on the paragon when files do not exist. */ if ((fdtmp = fopen(par_nem_fname, "r")) == NULL) { sprintf(cmesg,"fatal: parallel Exodus II file %s does not exist", par_nem_fname); Gen_Error(0, cmesg); return 0; } else fclose(fdtmp); /* * now open the existing parallel file using Exodus calls. */ if ((pexoid = ex_open(par_nem_fname, EX_READ, &cpu_ws, &io_ws, &ver)) < 0) { sprintf(cmesg,"fatal: could not open parallel Exodus II file %s", par_nem_fname); Gen_Error(0, cmesg); return 0; } /* and get initial information */ if (ex_get_init(pexoid, title, &(mesh->num_dims), &(mesh->num_nodes), &(mesh->num_elems), &(mesh->num_el_blks), &(mesh->num_node_sets), &(mesh->num_side_sets)) < 0) { Gen_Error(0, "fatal: Error returned from ex_get_init"); return 0; } /* alocate some memory for the element blocks */ mesh->data_type = MESH; mesh->vwgt_dim = 1; /* One weight for now. */ mesh->ewgt_dim = 1; /* One weight for now. */ mesh->eb_etypes = (int *) malloc (5 * mesh->num_el_blks * sizeof(int)); if (!mesh->eb_etypes) { Gen_Error(0, "fatal: insufficient memory"); return 0; } mesh->eb_ids = mesh->eb_etypes + mesh->num_el_blks; mesh->eb_cnts = mesh->eb_ids + mesh->num_el_blks; mesh->eb_nnodes = mesh->eb_cnts + mesh->num_el_blks; mesh->eb_nattrs = mesh->eb_nnodes + mesh->num_el_blks; mesh->eb_names = (char **) malloc (mesh->num_el_blks * sizeof(char *)); if (!mesh->eb_names) { Gen_Error(0, "fatal: insufficient memory"); return 0; } mesh->hindex = (int *) malloc(sizeof(int)); mesh->hindex[0] = 0; if (ex_get_elem_blk_ids(pexoid, mesh->eb_ids) < 0) { Gen_Error(0, "fatal: Error returned from ex_get_elem_blk_ids"); return 0; } /* allocate temporary storage for items needing global reduction. */ /* nemesis does not store most element block info about blocks for */ /* which the processor owns no elements. */ /* we, however, use this information in migration, so we need to */ /* accumulate it for all element blocks. 
kdd 2/2001 */ if (mesh->num_el_blks > 0) { nnodes = (int *) malloc(2 * mesh->num_el_blks * sizeof(int)); if (!nnodes) { Gen_Error(0, "fatal: insufficient memory"); return 0; } etypes = nnodes + mesh->num_el_blks; } /* get the element block information */ for (i = 0; i < mesh->num_el_blks; i++) { /* allocate space for name */ mesh->eb_names[i] = (char *) malloc((MAX_STR_LENGTH+1) * sizeof(char)); if (!mesh->eb_names[i]) { Gen_Error(0, "fatal: insufficient memory"); return 0; } if (ex_get_elem_block(pexoid, mesh->eb_ids[i], mesh->eb_names[i], &(mesh->eb_cnts[i]), &(nnodes[i]), &(mesh->eb_nattrs[i])) < 0) { Gen_Error(0, "fatal: Error returned from ex_get_elem_block"); return 0; } if (mesh->eb_cnts[i] > 0) { if ((etypes[i] = (int) get_elem_type(mesh->eb_names[i], nnodes[i], mesh->num_dims)) == E_TYPE_ERROR) { Gen_Error(0, "fatal: could not get element type"); return 0; } } else etypes[i] = (int) NULL_EL; } /* Perform reduction on necessary fields of element blocks. kdd 2/2001 */ MPI_Allreduce(nnodes, mesh->eb_nnodes, mesh->num_el_blks, MPI_INT, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(etypes, mesh->eb_etypes, mesh->num_el_blks, MPI_INT, MPI_MIN, MPI_COMM_WORLD); for (i = 0; i < mesh->num_el_blks; i++) { strcpy(mesh->eb_names[i], get_elem_name(mesh->eb_etypes[i])); } free(nnodes); /* * allocate memory for the elements * allocate a little extra for element migration latter */ mesh->elem_array_len = mesh->num_elems + 5; mesh->elements = (ELEM_INFO_PTR) malloc (mesh->elem_array_len * sizeof(ELEM_INFO)); if (!(mesh->elements)) { Gen_Error(0, "fatal: insufficient memory"); return 0; } /* * intialize all of the element structs as unused by * setting the globalID to -1 */ for (i = 0; i < mesh->elem_array_len; i++) initialize_element(&(mesh->elements[i])); /* read the information for the individual elements */ if (!read_elem_info(pexoid, Proc, prob, mesh)) { Gen_Error(0, "fatal: Error returned from read_elem_info"); return 0; } /* read the communication information */ if (!read_comm_map_info(pexoid, Proc, prob, mesh)) { Gen_Error(0, "fatal: Error returned from read_comm_map_info"); return 0; } /* Close the parallel file */ if(ex_close (pexoid) < 0) { Gen_Error(0, "fatal: Error returned from ex_close"); return 0; } /* print out the distributed mesh */ if (Debug_Driver > 3) print_distributed_mesh(Proc, Num_Proc, mesh); DEBUG_TRACE_END(Proc, yo); return 1; #endif /* ZOLTAN_NEMESIS */ }
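/* Illustrative sketch (not part of the Zoltan driver above): the element-block
   bookkeeping relies on a simple pattern -- every rank fills a local integer
   array (a sentinel for blocks it does not own) and one MPI_Allreduce with
   MPI_MAX (or MPI_MIN) gives every rank the globally known value per block.
   The block count and per-rank values below are placeholders. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  int rank, i;
  int local_nnodes[4]  = {0, 0, 0, 0};   /* value for blocks this rank owns, 0 otherwise */
  int global_nnodes[4];

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  /* pretend each rank only knows about one block */
  local_nnodes[rank % 4] = 8;

  /* every rank ends up with the value known by whichever rank owns the block */
  MPI_Allreduce(local_nnodes, global_nnodes, 4, MPI_INT, MPI_MAX, MPI_COMM_WORLD);

  if (rank == 0)
    for (i = 0; i < 4; i++)
      printf("block %d: %d nodes per element\n", i, global_nnodes[i]);

  MPI_Finalize();
  return 0;
}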
int main(int argc, char **argv) { const int MAX_ITER = 50; const double RELTOL = 1e-2; const double ABSTOL = 1e-4; /* * Some bookkeeping variables for MPI. The 'rank' of a process is its numeric id * in the process pool. For example, if we run a program via `mpirun -np 4 foo', then * the process ranks are 0 through 3. Here, N and size are the total number of processes * running (in this example, 4). */ int rank; int size; MPI_Init(&argc, &argv); // Initialize the MPI execution environment MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Determine current running process MPI_Comm_size(MPI_COMM_WORLD, &size); // Total number of processes double N = (double) size; // Number of subsystems/slaves for ADMM /* Read in local data */ int skinny; // A flag indicating whether the matrix A is fat or skinny FILE *f; int m, n; int row, col; double entry; /* * Subsystem n will look for files called An.dat and bn.dat * in the current directory; these are its local data and do not need to be * visible to any other processes. Note that * m and n here refer to the dimensions of the *local* coefficient matrix. */ /* Read A */ char s[20]; sprintf(s, "data/A%d.dat", rank + 1); printf("[%d] reading %s\n", rank, s); f = fopen(s, "r"); if (f == NULL) { printf("[%d] ERROR: %s does not exist, exiting.\n", rank, s); exit(EXIT_FAILURE); } mm_read_mtx_array_size(f, &m, &n); gsl_matrix *A = gsl_matrix_calloc(m, n); for (int i = 0; i < m*n; i++) { row = i % m; col = floor(i/m); fscanf(f, "%lf", &entry); gsl_matrix_set(A, row, col, entry); } fclose(f); /* Read b */ sprintf(s, "data/b%d.dat", rank + 1); printf("[%d] reading %s\n", rank, s); f = fopen(s, "r"); if (f == NULL) { printf("[%d] ERROR: %s does not exist, exiting.\n", rank, s); exit(EXIT_FAILURE); } mm_read_mtx_array_size(f, &m, &n); gsl_vector *b = gsl_vector_calloc(m); for (int i = 0; i < m; i++) { fscanf(f, "%lf", &entry); gsl_vector_set(b, i, entry); } fclose(f); m = A->size1; n = A->size2; skinny = (m >= n); /* * These are all variables related to ADMM itself. There are many * more variables than in the Matlab implementation because we also * require vectors and matrices to store various intermediate results. * The naming scheme follows the Matlab version of this solver. */ double rho = 1.0; gsl_vector *x = gsl_vector_calloc(n); gsl_vector *u = gsl_vector_calloc(n); gsl_vector *z = gsl_vector_calloc(n); gsl_vector *y = gsl_vector_calloc(n); gsl_vector *r = gsl_vector_calloc(n); gsl_vector *zprev = gsl_vector_calloc(n); gsl_vector *zdiff = gsl_vector_calloc(n); gsl_vector *q = gsl_vector_calloc(n); gsl_vector *w = gsl_vector_calloc(n); gsl_vector *Aq = gsl_vector_calloc(m); gsl_vector *p = gsl_vector_calloc(m); gsl_vector *Atb = gsl_vector_calloc(n); double send[3]; // an array used to aggregate 3 scalars at once double recv[3]; // used to receive the results of these aggregations double nxstack = 0; double nystack = 0; double prires = 0; double dualres = 0; double eps_pri = 0; double eps_dual = 0; /* Precompute and cache factorizations */ gsl_blas_dgemv(CblasTrans, 1, A, b, 0, Atb); // Atb = A^T b /* * The lasso regularization parameter here is just hardcoded * to 0.5 for simplicity. Using the lambda_max heuristic would require * network communication, since it requires looking at the *global* A^T b. 
*/ double lambda = 0.5; if (rank == 0) { printf("using lambda: %.4f\n", lambda); } gsl_matrix *L; /* Use the matrix inversion lemma for efficiency; see section 4.2 of the paper */ if (skinny) { /* L = chol(AtA + rho*I) */ L = gsl_matrix_calloc(n,n); gsl_matrix *AtA = gsl_matrix_calloc(n,n); gsl_blas_dsyrk(CblasLower, CblasTrans, 1, A, 0, AtA); gsl_matrix *rhoI = gsl_matrix_calloc(n,n); gsl_matrix_set_identity(rhoI); gsl_matrix_scale(rhoI, rho); gsl_matrix_memcpy(L, AtA); gsl_matrix_add(L, rhoI); gsl_linalg_cholesky_decomp(L); gsl_matrix_free(AtA); gsl_matrix_free(rhoI); } else { /* L = chol(I + 1/rho*AAt) */ L = gsl_matrix_calloc(m,m); gsl_matrix *AAt = gsl_matrix_calloc(m,m); gsl_blas_dsyrk(CblasLower, CblasNoTrans, 1, A, 0, AAt); gsl_matrix_scale(AAt, 1/rho); gsl_matrix *eye = gsl_matrix_calloc(m,m); gsl_matrix_set_identity(eye); gsl_matrix_memcpy(L, AAt); gsl_matrix_add(L, eye); gsl_linalg_cholesky_decomp(L); gsl_matrix_free(AAt); gsl_matrix_free(eye); } /* Main ADMM solver loop */ int iter = 0; if (rank == 0) { printf("%3s %10s %10s %10s %10s %10s\n", "#", "r norm", "eps_pri", "s norm", "eps_dual", "objective"); } double startAllTime, endAllTime; startAllTime = MPI_Wtime(); while (iter < MAX_ITER) { /* u-update: u = u + x - z */ gsl_vector_sub(x, z); gsl_vector_add(u, x); /* x-update: x = (A^T A + rho I) \ (A^T b + rho z - y) */ gsl_vector_memcpy(q, z); gsl_vector_sub(q, u); gsl_vector_scale(q, rho); gsl_vector_add(q, Atb); // q = A^T b + rho*(z - u) double tmp, tmpq; gsl_blas_ddot(x, x, &tmp); gsl_blas_ddot(q, q, &tmpq); if (skinny) { /* x = U \ (L \ q) */ gsl_linalg_cholesky_solve(L, q, x); } else { /* x = q/rho - 1/rho^2 * A^T * (U \ (L \ (A*q))) */ gsl_blas_dgemv(CblasNoTrans, 1, A, q, 0, Aq); gsl_linalg_cholesky_solve(L, Aq, p); gsl_blas_dgemv(CblasTrans, 1, A, p, 0, x); /* now x = A^T * (U \ (L \ (A*q)) */ gsl_vector_scale(x, -1/(rho*rho)); gsl_vector_scale(q, 1/rho); gsl_vector_add(x, q); } /* * Message-passing: compute the global sum over all processors of the * contents of w and t. Also, update z. 
*/ gsl_vector_memcpy(w, x); gsl_vector_add(w, u); // w = x + u gsl_blas_ddot(r, r, &send[0]); gsl_blas_ddot(x, x, &send[1]); gsl_blas_ddot(u, u, &send[2]); send[2] /= pow(rho, 2); gsl_vector_memcpy(zprev, z); // could be reduced to a single Allreduce call by concatenating send to w MPI_Allreduce(w->data, z->data, n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(send, recv, 3, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); prires = sqrt(recv[0]); /* sqrt(sum ||r_i||_2^2) */ nxstack = sqrt(recv[1]); /* sqrt(sum ||x_i||_2^2) */ nystack = sqrt(recv[2]); /* sqrt(sum ||y_i||_2^2) */ gsl_vector_scale(z, 1/N); soft_threshold(z, lambda/(N*rho)); /* Termination checks */ /* dual residual */ gsl_vector_memcpy(zdiff, z); gsl_vector_sub(zdiff, zprev); dualres = sqrt(N) * rho * gsl_blas_dnrm2(zdiff); /* ||s^k||_2^2 = N rho^2 ||z - zprev||_2^2 */ /* compute primal and dual feasibility tolerances */ eps_pri = sqrt(n*N)*ABSTOL + RELTOL * fmax(nxstack, sqrt(N)*gsl_blas_dnrm2(z)); eps_dual = sqrt(n*N)*ABSTOL + RELTOL * nystack; if (rank == 0) { printf("%3d %10.4f %10.4f %10.4f %10.4f %10.4f\n", iter, prires, eps_pri, dualres, eps_dual, objective(A, b, lambda, z)); } if (prires <= eps_pri && dualres <= eps_dual) { break; } /* Compute residual: r = x - z */ gsl_vector_memcpy(r, x); gsl_vector_sub(r, z); iter++; } /* Have the master write out the results to disk */ if (rank == 0) { endAllTime = MPI_Wtime(); printf("Elapsed time is: %lf \n", endAllTime - startAllTime); f = fopen("data/solution.dat", "w"); gsl_vector_fprintf(f, z, "%lf"); fclose(f); } MPI_Finalize(); /* Shut down the MPI execution environment */ /* Clear memory */ gsl_matrix_free(A); gsl_matrix_free(L); gsl_vector_free(b); gsl_vector_free(x); gsl_vector_free(u); gsl_vector_free(z); gsl_vector_free(y); gsl_vector_free(r); gsl_vector_free(w); gsl_vector_free(zprev); gsl_vector_free(zdiff); gsl_vector_free(q); gsl_vector_free(Aq); gsl_vector_free(Atb); gsl_vector_free(p); return EXIT_SUCCESS; }
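/* Illustrative sketch: soft_threshold() is called in the z-update above but
   not defined in this listing.  The function below is one plausible
   implementation of the usual elementwise soft-thresholding (shrinkage)
   operator S_k(a) = sign(a) * max(|a| - k, 0) on a gsl_vector; the name and
   signature are assumed to match the call site above. */
#include <gsl/gsl_vector.h>

void soft_threshold(gsl_vector *v, double k)
{
  for (size_t i = 0; i < v->size; i++) {
    double vi = gsl_vector_get(v, i);
    if (vi > k)        gsl_vector_set(v, i, vi - k);   /* shrink toward zero */
    else if (vi < -k)  gsl_vector_set(v, i, vi + k);
    else               gsl_vector_set(v, i, 0.0);      /* inside the dead zone */
  }
}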
/* Set CPU affinity. Can be important for performance. On some systems (e.g. Cray) CPU Affinity is set by default. But default assigning doesn't work (well) with only some ranks having threads. This causes very low performance. External tools have cumbersome syntax for setting affinity in the case that only some ranks have threads. Thus it is important that GROMACS sets the affinity internally if only PME is using threads. */ void gmx_set_thread_affinity(FILE *fplog, const t_commrec *cr, const gmx_hw_opt_t *hw_opt, const gmx_hw_info_t *hwinfo) { int nth_affinity_set, thread0_id_node, nthread_local, nthread_node; int offset; const int *locality_order; int rc; if (hw_opt->thread_affinity == threadaffOFF) { /* Nothing to do */ return; } /* If the tMPI thread affinity setting is not supported encourage the user * to report it as it's either a bug or an exotic platform which we might * want to support. */ if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES) { /* we know Mac OS & BlueGene do not support setting thread affinity, so there's no point in warning the user in that case. In any other case the user might be able to do something about it. */ #if !defined(__APPLE__) && !defined(__bg__) md_print_warn(NULL, fplog, "NOTE: Cannot set thread affinities on the current platform."); #endif /* __APPLE__ */ return; } /* threads on this MPI process or TMPI thread */ if (cr->duty & DUTY_PP) { nthread_local = gmx_omp_nthreads_get(emntNonbonded); } else { nthread_local = gmx_omp_nthreads_get(emntPME); } /* map the current process to cores */ thread0_id_node = 0; nthread_node = nthread_local; #ifdef GMX_MPI if (PAR(cr) || MULTISIM(cr)) { /* We need to determine a scan of the thread counts in this * compute node. */ MPI_Comm comm_intra; MPI_Comm_split(MPI_COMM_WORLD, gmx_physicalnode_id_hash(), cr->rank_intranode, &comm_intra); MPI_Scan(&nthread_local, &thread0_id_node, 1, MPI_INT, MPI_SUM, comm_intra); /* MPI_Scan is inclusive, but here we need exclusive */ thread0_id_node -= nthread_local; /* Get the total number of threads on this physical node */ MPI_Allreduce(&nthread_local, &nthread_node, 1, MPI_INT, MPI_SUM, comm_intra); MPI_Comm_free(&comm_intra); } #endif if (hw_opt->thread_affinity == threadaffAUTO && nthread_node != hwinfo->nthreads_hw_avail) { if (nthread_node > 1 && nthread_node < hwinfo->nthreads_hw_avail) { md_print_warn(cr, fplog, "NOTE: The number of threads is not equal to the number of (logical) cores\n" " and the -pin option is set to auto: will not pin thread to cores.\n" " This can lead to significant performance degradation.\n" " Consider using -pin on (and -pinoffset in case you run multiple jobs).\n"); } return; } offset = 0; if (hw_opt->core_pinning_offset != 0) { offset = hw_opt->core_pinning_offset; md_print_info(cr, fplog, "Applying core pinning offset %d\n", offset); } int core_pinning_stride = hw_opt->core_pinning_stride; rc = get_thread_affinity_layout(fplog, cr, hwinfo, nthread_node, offset, &core_pinning_stride, &locality_order); if (rc != 0) { /* Incompatible layout, don't pin, warning was already issued */ return; } /* Set the per-thread affinity. In order to be able to check the success * of affinity settings, we will set nth_affinity_set to 1 on threads * where the affinity setting succeded and to 0 where it failed. * Reducing these 0/1 values over the threads will give the total number * of threads on which we succeeded. 
*/ // To avoid warnings from the static analyzer we initialize nth_affinity_set // to zero outside the OpenMP block, and then add to it inside the block. // The value will still always be 0 or 1 from each thread. nth_affinity_set = 0; #pragma omp parallel num_threads(nthread_local) reduction(+:nth_affinity_set) { try { int thread_id, thread_id_node; int index, core; gmx_bool setaffinity_ret; thread_id = gmx_omp_get_thread_num(); thread_id_node = thread0_id_node + thread_id; index = offset + thread_id_node*core_pinning_stride; if (locality_order != NULL) { core = locality_order[index]; } else { core = index; } setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core); /* store the per-thread success-values of the setaffinity */ nth_affinity_set += (setaffinity_ret == 0); if (debug) { fprintf(debug, "On rank %2d, thread %2d, index %2d, core %2d the affinity setting returned %d\n", cr->nodeid, gmx_omp_get_thread_num(), index, core, setaffinity_ret); } } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; } if (nth_affinity_set > nthread_local) { char msg[STRLEN]; sprintf(msg, "Looks like we have set affinity for more threads than " "we have (%d > %d)!\n", nth_affinity_set, nthread_local); gmx_incons(msg); } else { /* check & warn if some threads failed to set their affinities */ if (nth_affinity_set != nthread_local) { char sbuf1[STRLEN], sbuf2[STRLEN]; /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */ sbuf1[0] = sbuf2[0] = '\0'; /* Only add rank info if we have more than one rank. */ if (cr->nnodes > 1) { #ifdef GMX_MPI #ifdef GMX_THREAD_MPI sprintf(sbuf1, "In tMPI thread #%d: ", cr->nodeid); #else /* GMX_LIB_MPI */ sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid); #endif #endif /* GMX_MPI */ } if (nthread_local > 1) { sprintf(sbuf2, "for %d/%d thread%s ", nthread_local - nth_affinity_set, nthread_local, nthread_local > 1 ? "s" : ""); } md_print_warn(NULL, fplog, "NOTE: %sAffinity setting %sfailed. This can cause performance degradation.\n" " If you think your settings are correct, ask on the gmx-users list.", sbuf1, sbuf2); } } return; }
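/* Illustrative sketch (independent of GROMACS): the routine above derives the
   first thread id of each rank on a node with an inclusive MPI_Scan followed
   by subtracting the local count.  The same value can be obtained directly
   with MPI_Exscan; the per-rank thread counts below are made up. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  int rank, nthread_local, first_incl, first_excl = 0;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  nthread_local = 2 + rank % 3;   /* pretend ranks run different thread counts */

  /* inclusive scan, then remove our own contribution */
  MPI_Scan(&nthread_local, &first_incl, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  first_incl -= nthread_local;

  /* exclusive scan gives the same value directly (the result is undefined on
     rank 0, which is why first_excl is pre-initialized to 0) */
  MPI_Exscan(&nthread_local, &first_excl, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

  printf("rank %d: first thread id = %d (scan) = %d (exscan)\n",
         rank, first_incl, first_excl);

  MPI_Finalize();
  return 0;
}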
void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc,double cycles[]) { wallcc_t *wcc; double buf[ewcNR],*cyc_all,*buf_all; int i; if (wc == NULL) { return; } wcc = wc->wcc; if (wcc[ewcDDCOMMLOAD].n > 0) { wcc[ewcDOMDEC].c -= wcc[ewcDDCOMMLOAD].c; } if (wcc[ewcDDCOMMBOUND].n > 0) { wcc[ewcDOMDEC].c -= wcc[ewcDDCOMMBOUND].c; } if (wcc[ewcPMEMESH].n > 0) { wcc[ewcFORCE].c -= wcc[ewcPMEMESH].c; } if (wcc[ewcPMEMESH_SEP].n > 0) { /* This must be a PME only node, calculate the Wait + Comm. time */ wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH_SEP].c; } else { /* Correct the PME mesh only call count */ wcc[ewcPMEMESH_SEP].n = wcc[ewcFORCE].n; wcc[ewcPMEWAITCOMM].n = wcc[ewcFORCE].n; } /* Store the cycles in a double buffer for summing */ for(i=0; i<ewcNR; i++) { cycles[i] = (double)wcc[i].c; } if (wcc[ewcUPDATE].n > 0) { /* Remove the constraint part from the update count */ cycles[ewcUPDATE] -= cycles[ewcCONSTR]; } #ifdef GMX_MPI if (cr->nnodes > 1) { MPI_Allreduce(cycles,buf,ewcNR,MPI_DOUBLE,MPI_SUM, cr->mpi_comm_mysim); for(i=0; i<ewcNR; i++) { cycles[i] = buf[i]; } if (wc->wcc_all != NULL) { snew(cyc_all,ewcNR*ewcNR); snew(buf_all,ewcNR*ewcNR); for(i=0; i<ewcNR*ewcNR; i++) { cyc_all[i] = wc->wcc_all[i].c; } MPI_Allreduce(cyc_all,buf_all,ewcNR*ewcNR,MPI_DOUBLE,MPI_SUM, cr->mpi_comm_mysim); for(i=0; i<ewcNR*ewcNR; i++) { wc->wcc_all[i].c = buf_all[i]; } sfree(buf_all); sfree(cyc_all); } } #endif }
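/* Illustrative sketch (plain MPI, no GROMACS types): wallcycle_sum() copies
   the reduced cycle counts out of a temporary buffer; the same result can be
   obtained without the extra buffer by reducing in place with MPI_IN_PLACE.
   The counter count and values below are placeholders. */
#include <mpi.h>
#include <stdio.h>

#define NCOUNTERS 4

int main(int argc, char **argv)
{
  double cycles[NCOUNTERS];
  int rank, i;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  for (i = 0; i < NCOUNTERS; i++) cycles[i] = (double)(rank + 1) * (i + 1);

  /* sum each counter over all ranks; the result overwrites the send buffer */
  MPI_Allreduce(MPI_IN_PLACE, cycles, NCOUNTERS, MPI_DOUBLE, MPI_SUM,
                MPI_COMM_WORLD);

  if (rank == 0)
    for (i = 0; i < NCOUNTERS; i++)
      printf("counter %d: %g cycles\n", i, cycles[i]);

  MPI_Finalize();
  return 0;
}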
void system::set_geometry(const bool init) { const double dt_max = 1.0/512; scheduler = Scheduler(dt_max); int np; float lx, ly, lz; FILE *fin = NULL; if (myproc == 0) { float wp; fin = fopen(fin_data, "r"); int ival; size_t nread; nread = fread(&ival, sizeof(int), 1, fin); assert(ival == 2*sizeof(int)); nread = fread(&np, sizeof(int), 1, fin); nread = fread(&wp, sizeof(float), 1, fin); nread = fread(&ival, sizeof(int), 1, fin); assert(ival == 2*sizeof(int)); nread = fread(&ival, sizeof(int), 1, fin); assert(ival == 3*sizeof(float)); nread = fread(&lx, sizeof(float), 1, fin); nread = fread(&ly, sizeof(float), 1, fin); nread = fread(&lz, sizeof(float), 1, fin); nread = fread(&ival, sizeof(int), 1, fin); assert(ival == 3*sizeof(float)); fprintf(stderr, " np= %d wp= %g \n",np, wp); fprintf(stderr, " lx= %g ly= %g lz= %g \n", lx, ly, lz); } MPI_Bcast(&lx, 1, MPI_FLOAT, 0, MPI_COMM_WORLD); MPI_Bcast(&ly, 1, MPI_FLOAT, 0, MPI_COMM_WORLD); MPI_Bcast(&lz, 1, MPI_FLOAT, 0, MPI_COMM_WORLD); t_end = 0.2; n_restart = 2; dt_restart = dt_max; dt_dump = 0.01; di_log = 100; global_n = local_n = 0; // eulerian = true; const vec3 rmin(0.0); const vec3 rmax(lx, ly, lz); global_domain = boundary(rmin, rmax); global_domain_size = global_domain.hsize() * 2.0; const vec3 Len3 = global_domain.hsize() * 2.0; pfloat<0>::set_scale(Len3.x); pfloat<1>::set_scale(Len3.y); pfloat<2>::set_scale(Len3.z); if (myproc == 0) { ptcl.resize(np); const int nx = (int)std::pow(np, 1.0/3.0); const dvec3 dr = dvec3(Len3.x/nx, Len3.y/nx, Len3.z/nx); const real rmax = dr.abs() * 1.0; fprintf(stderr, "dr= %g %g %g \n", dr.x, dr.y, dr.z); local_n = ptcl.size(); global_n = local_n; { std::vector<float> x(local_n), y(local_n), z(local_n); size_t nread; int ival; nread = fread(&ival, sizeof(int), 1, fin); assert(ival == local_n*(int)sizeof(float)); nread = fread(&x[0], sizeof(float), local_n, fin); assert((int)nread == local_n); nread = fread(&ival, sizeof(int), 1, fin); assert(ival == local_n*(int)sizeof(float)); nread = fread(&ival, sizeof(int), 1, fin); assert(ival == local_n*(int)sizeof(float)); nread = fread(&y[0], sizeof(float), local_n, fin); assert((int)nread == local_n); nread = fread(&ival, sizeof(int), 1, fin); assert(ival == local_n*(int)sizeof(float)); nread = fread(&ival, sizeof(int), 1, fin); assert(ival == local_n*(int)sizeof(float)); nread = fread(&z[0], sizeof(float), local_n, fin); assert((int)nread == local_n); nread = fread(&ival, sizeof(int), 1, fin); assert(ival == local_n*(int)sizeof(float)); for (int i = 0; i < local_n; i++) { const dvec3 vel(0.0, 0.0, 0.0); ptcl[i] = Particle(x[i], y[i], z[i], vel.x, vel.y, vel.z, i); ptcl[i].rmax = rmax; ptcl[i].unset_derefine(); } } U.resize(local_n); const int var_list[7] = { Fluid::VELX, Fluid::VELY, Fluid::VELZ, Fluid::DENS, Fluid::BX, Fluid::BY, Fluid::BZ}; std::vector<float> data(local_n); for (int var = 0; var < 7; var++) { fprintf(stderr, " reading vat %d out of %d \n", var+1, 7); int ival; size_t nread; nread = fread(&ival, sizeof(int), 1, fin); assert(ival == local_n*(int)sizeof(float)); nread = fread(&data[0], sizeof(float), local_n, fin); assert((int)nread == local_n); nread = fread(&ival, sizeof(int), 1, fin); assert(ival == local_n*(int)sizeof(float)); for (int i = 0; i < local_n; i++) U[i][var_list[var]] = data[i]; } for (int i = 0; i < local_n; i++) { assert(U[i][Fluid::DENS] > 0.0); U[i][Fluid::ETHM] = cs2 * U[i][Fluid::DENS]; } fclose(fin); fprintf(stderr, " *** proc= %d : local_n= %d global_n= %d \n", myproc, local_n, global_n); } // myproc == 0 
MPI_Bcast(&global_n, 1, MPI_INT, 0, MPI_COMM_WORLD); fprintf(stderr, " proc= %d distrubite \n", myproc); MPI_Barrier(MPI_COMM_WORLD); Distribute::int3 nt(1, 1, 1); switch(nproc) { case 1: break; case 2: nt.x = 2; nt.y = 1; nt.z = 1; break; case 4: nt.x = 2; nt.y = 2; nt.z = 1; break; case 6: nt.x = 3; nt.y = 2; nt.z = 1; break; case 8: nt.x = 2; nt.y = 2; nt.z = 2; break; case 16: nt.x = 4; nt.y = 2; nt.z = 2; break; case 32: nt.x = 4; nt.y = 4; nt.z = 2; break; case 64: nt.x = 4; nt.y = 4; nt.z = 4; break; case 128: nt.x = 8; nt.y = 4; nt.z = 4; break; case 256: nt.x = 8; nt.y = 8; nt.z = 4; break; case 512: nt.x = 8; nt.y = 8; nt.z = 8; break; default: assert(false); } const Distribute::int3 nt_glb(nt); const pBoundary pglobal_domain(pfloat3(0.0), pfloat3(Len3)); distribute_glb.set(nproc, nt, pglobal_domain); for (int k = 0; k < 5; k++) distribute_data(true, false); const int nloc_reserve = (int)(2.0*global_n/nproc); fit_reserve_vec(ptcl, nloc_reserve); fit_reserve_vec(ptcl_ppos, nloc_reserve); fit_reserve_vec(U, nloc_reserve); fit_reserve_vec(dU, nloc_reserve); fit_reserve_vec(Wgrad, nloc_reserve); fit_reserve_vec(gradPsi, nloc_reserve); fit_reserve_vec(cells, nloc_reserve); MPI_Barrier(MPI_COMM_WORLD); fprintf(stderr, " *** proc= %d : local_n= %d global_n= %d \n", myproc, local_n, global_n); fprintf(stderr, " proc= %d building_mesh \n", myproc); MPI_Barrier(MPI_COMM_WORLD); const double t10 = mytimer::get_wtime(); clear_mesh(); int nattempt = build_mesh(true); double dt10 = mytimer::get_wtime() - t10; double volume_loc = 0.0; { std::vector<TREAL> v(local_n); for (int i = 0; i < local_n; i++) v[i] = cells[i].Volume; std::sort(v.begin(), v.end()); // sort volumes from low to high, to avoid roundoff errors for (int i = 0; i < local_n; i++) volume_loc += v[i]; } double dt10max; MPI_Allreduce(&dt10, &dt10max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); double volume_glob = 0.0; int nattempt_max, nattempt_min; MPI_Allreduce(&volume_loc, &volume_glob, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&nattempt, &nattempt_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(&nattempt, &nattempt_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); const double volume_exact = global_domain_size.x*global_domain_size.y*global_domain_size.z; if (myproc == 0) { fprintf(stderr, "first call build_mesh:[ %g sec :: %g cells/s/proc/thread ]\n", dt10max, global_n/nproc/dt10max); fprintf(stderr, " computed_volume= %g exact_volume= %g diff= %g [ %g ] nattempt= %d %d \n", volume_glob, volume_exact, volume_glob - volume_exact, (volume_glob - volume_exact)/volume_exact, nattempt_min, nattempt_max); } exchange_ptcl(); }
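/* Illustrative sketch (an alternative, not what the code above does): the
   switch statement above hard-wires a 3-D process grid for a fixed set of
   process counts.  MPI provides MPI_Dims_create to compute a balanced
   factorization for an arbitrary number of processes. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  int rank, nproc, dims[3] = {0, 0, 0};   /* zeros let MPI choose every dimension */

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nproc);

  /* factor nproc into a roughly cubic nx x ny x nz grid */
  MPI_Dims_create(nproc, 3, dims);

  if (rank == 0)
    printf("nproc = %d -> grid %d x %d x %d\n", nproc, dims[0], dims[1], dims[2]);

  MPI_Finalize();
  return 0;
}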
/* ************************************************************************ */
static void calculate_jacobi (struct calculation_arguments const* arguments, struct calculation_results *results, struct options const* options)
{
    int i, j;           /* local variables for loops */
    int m1, m2;         /* used as indices for old and new matrices */
    double star;        /* four times center value minus 4 neighboring values */
    double residuum;    /* residuum of current iteration */
    double maxresiduum; /* maximum residuum value of a process in iteration */

    const int nproc = arguments->nproc;
    const int rank = arguments->rank;
    int const N = arguments->N;
    int const N_global = arguments->N_global;
    double const h = arguments->h;
    int term_iteration = options->term_iteration;

    /* initialize m1 and m2 depending on algorithm */
    if (options->method == METH_JACOBI)
    {
        m1 = 0;
        m2 = 1;
    }
    else
    {
        m1 = 0;
        m2 = 0;
    }

    while (term_iteration > 0)
    {
        double** Matrix_Out = arguments->Matrix[m1];
        double** Matrix_In = arguments->Matrix[m2];

        maxresiduum = 0;

        /* over all rows */
        for (i = 1; i < N; i++)
        {
            /* over all columns */
            for (j = 1; j < N_global; j++)
            {
                star = 0.25 * (Matrix_In[i-1][j] + Matrix_In[i][j-1] + Matrix_In[i][j+1] + Matrix_In[i+1][j]);

                if (options->inf_func == FUNC_FPISIN)
                {
                    star += (0.25 * TWO_PI_SQUARE * h * h) * sin((PI * h) * ((double)i + arguments->offset)) * sin((PI * h) * (double)j);
                }

                if (options->termination == TERM_PREC || term_iteration == 1)
                {
                    residuum = Matrix_In[i][j] - star;
                    residuum = (residuum < 0) ? -residuum : residuum;
                    maxresiduum = (residuum < maxresiduum) ? maxresiduum : residuum;
                }

                Matrix_Out[i][j] = star;
            }
        }

        // Exchange boundary rows with the neighboring ranks, into each other's extra allocated (ghost) row
        if (rank > 0)
        {
            MPI_Sendrecv(Matrix_Out[1], N_global, MPI_DOUBLE, rank - 1, rank,
                         Matrix_Out[0], N_global, MPI_DOUBLE, rank - 1, rank - 1,
                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }
        // Last rank can't communicate with higher ranks because there aren't any
        if (rank != nproc - 1)
        {
            MPI_Sendrecv(Matrix_Out[N - 1], N_global, MPI_DOUBLE, rank + 1, rank,
                         Matrix_Out[N], N_global, MPI_DOUBLE, rank + 1, rank + 1,
                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }

        /* exchange m1 and m2 */
        i = m1;
        m1 = m2;
        m2 = i;

        // Find the highest maxresiduum across all processes (hence MPI_MAX)
        MPI_Allreduce(MPI_IN_PLACE, &maxresiduum, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);

        results->stat_iteration++;
        results->stat_precision = maxresiduum;

        /* check for stopping calculation, depending on termination method */
        if (options->termination == TERM_PREC)
        {
            if (maxresiduum < options->term_precision)
            {
                term_iteration = 0;
            }
        }
        else if (options->termination == TERM_ITER)
        {
            term_iteration--;
        }
    }

    results->m = m2;
}
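/* Illustrative sketch (not part of the solver above): the two guarded
   MPI_Sendrecv calls can also be written without the rank checks by
   addressing MPI_PROC_NULL at the domain boundaries; a send or receive
   involving MPI_PROC_NULL completes immediately and does nothing.  The
   row length NG and the rows[] layout below are assumptions made to keep
   the sketch self-contained. */
#include <mpi.h>

#define NG 128   /* global row length, placeholder value */

static void exchange_ghost_rows(double **rows, int nlocal, int rank, int nproc)
{
  int up   = (rank > 0)         ? rank - 1 : MPI_PROC_NULL;
  int down = (rank < nproc - 1) ? rank + 1 : MPI_PROC_NULL;

  /* first interior row goes up, ghost row 0 is filled from above */
  MPI_Sendrecv(rows[1],        NG, MPI_DOUBLE, up,   0,
               rows[0],        NG, MPI_DOUBLE, up,   1,
               MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  /* last interior row goes down, ghost row nlocal is filled from below */
  MPI_Sendrecv(rows[nlocal-1], NG, MPI_DOUBLE, down, 1,
               rows[nlocal],   NG, MPI_DOUBLE, down, 0,
               MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}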
int main(int argc, char* argv[]) { LIS_MATRIX A,A0; LIS_VECTOR b,x,v; LIS_SCALAR ntimes,nmflops,nnrm2; LIS_SCALAR *value; int nprocs,my_rank; int nthreads, maxthreads; int gn,nnz,mode; int i,j,jj,j0,j1,l,k,n,np,h,ih; int m,nn,ii; int block; int rn,rmin,rmax,rb; int is,ie,clsize,ci,*iw; int err,iter,storage; int *ptr,*index; double mem,val,ra,rs,ri,ria,ca,time,time2,convtime,val2,nnzs,nnzap,nnzt; double commtime,comptime,flops; FILE *file; char path[1024]; LIS_DEBUG_FUNC_IN; lis_initialize(&argc, &argv); #ifdef USE_MPI MPI_Comm_size(MPI_COMM_WORLD,&nprocs); MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); #else nprocs = 1; my_rank = 0; #endif if( argc < 4 ) { if( my_rank==0 ) printf("Usage: spmvtest4 matrix_filename matrix_type iter [block] \n"); lis_finalize(); exit(0); } file = fopen(argv[1], "r"); if( file==NULL ) CHKERR(1); storage = atoi(argv[2]); iter = atoi(argv[3]); if (argv[4] == NULL) { block = 2; } else { block = atoi(argv[4]); } if( storage<1 || storage>11 ) { if( my_rank==0 ) printf("storage=%d <1 or storage=%d >11\n",storage,storage); CHKERR(1); } if( iter<=0 ) { if( my_rank==0 ) printf("iter=%d <= 0\n",iter); CHKERR(1); } if( my_rank==0 ) { printf("\n"); printf("number of processes = %d\n",nprocs); } #ifdef _OPENMP if( my_rank==0 ) { nthreads = omp_get_num_procs(); maxthreads = omp_get_max_threads(); printf("max number of threads = %d\n", nthreads); printf("number of threads = %d\n", maxthreads); } #else nthreads = 1; maxthreads = 1; #endif /* create matrix and vectors */ lis_matrix_create(LIS_COMM_WORLD,&A0); err = lis_input(A0,NULL,NULL,argv[1]); CHKERR(err); n = A0->n; gn = A0->gn; nnz = A0->nnz; np = A0->np-n; #ifdef USE_MPI MPI_Allreduce(&nnz,&i,1,MPI_INT,MPI_SUM,A0->comm); nnzap = (double)i / (double)nprocs; nnzt = ((double)nnz -nnzap)*((double)nnz -nnzap); nnz = i; MPI_Allreduce(&nnzt,&nnzs,1,MPI_DOUBLE,MPI_SUM,A0->comm); nnzs = (nnzs / (double)nprocs)/nnzap; MPI_Allreduce(&np,&i,1,MPI_INT,MPI_SUM,A0->comm); np = i; #endif err = lis_vector_duplicate(A0,&x); if( err ) CHKERR(err); err = lis_vector_duplicate(A0,&b); if( err ) CHKERR(err); lis_matrix_get_range(A0,&is,&ie); for(i=0;i<n;i++) { err = lis_vector_set_value(LIS_INS_VALUE,i+is,1.0,x); } lis_matrix_duplicate(A0,&A); lis_matrix_set_type(A,storage); err = lis_matrix_convert(A0,A); if( err ) CHKERR(err); comptime = 0.0; commtime = 0.0; for(i=0;i<iter;i++) { #ifdef USE_MPI MPI_Barrier(A->comm); time = lis_wtime(); lis_send_recv(A->commtable,x->value); commtime += lis_wtime() - time; #endif time2 = lis_wtime(); lis_matvec(A,x,b); comptime += lis_wtime() - time2; } lis_vector_nrm2(b,&val); if( my_rank==0 ) { flops = 2.0*nnz*iter*1.0e-6 / comptime; if( A->matrix_type==LIS_MATRIX_BSR || A->matrix_type==LIS_MATRIX_BSC ) { A->bnr = block; A->bnc = block; printf("format = %s(%dx%d) (%2d), iteration = %d, computation = %e sec., %8.3f MFLOPS, communication = %e sec., communication/computation = %3.3f %%, 2-norm = %e\n",lis_storagename2[storage-1],block,block,storage,iter,comptime,flops,commtime,commtime/comptime*100,val); } else { printf("format = %s (%2d), iteration = %d, computation = %e sec., %8.3f MFLOPS, communication = %e sec., communication/computation = %3.3f %%, 2-norm = %e\n",lis_storagename2[storage-1],storage,iter,comptime,flops,commtime,commtime/comptime*100,val); } lis_matrix_destroy(A); } lis_matrix_destroy(A); lis_matrix_destroy(A0); lis_vector_destroy(b); lis_vector_destroy(x); lis_finalize(); LIS_DEBUG_FUNC_OUT; return 0; }
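/* Illustrative sketch: the benchmark above times lis_matvec for the selected
   storage format and reports 2*nnz*iter/comptime as MFLOPS.  For reference,
   a bare CSR matrix-vector product -- the kernel the CSR case reduces to --
   looks like the function below; ptr/index/value follow the usual CSR
   convention and are assumptions for illustration, not LIS internals. */
static void csr_matvec(int n, const int *ptr, const int *index,
                       const double *value, const double *x, double *y)
{
  int i, j;
  for (i = 0; i < n; i++) {
    double t = 0.0;
    for (j = ptr[i]; j < ptr[i+1]; j++)   /* 2 flops per stored nonzero */
      t += value[j] * x[index[j]];
    y[i] = t;
  }
}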
void _Pouliquen_etal_UpdateDrawParameters( void* rheology ) { Pouliquen_etal* self = (Pouliquen_etal*) rheology; Particle_Index lParticle_I; Particle_Index particleLocalCount; StrainWeakening* strainWeakening = self->strainWeakening; MaterialPoint* materialPoint; double length; double brightness; double opacity; double strainWeakeningRatio; double localMaxStrainIncrement; double localMeanStrainIncrement; Particle_Index localFailed; double globalMaxStrainIncrement; double globalMeanStrainIncrement; Particle_Index globalFailed; double averagedGlobalMaxStrainIncrement = 0.0; double oneOverGlobalMaxStrainIncrement; double postFailureWeakeningIncrement; /* Note : this function defines some drawing parameters (brightness, opacity, diameter) as * functions of the strain weakening - this needs to be improved since most of the parameters * that define this dependency are hard coded here. We need to have a more flexible way * to construct the viz parameters as functions of material parameters */ /* We should only update the drawing parameters if the strain weakening is defined */ if (strainWeakening==NULL) return; localMaxStrainIncrement = 0.0; localMeanStrainIncrement = 0.0; localFailed = 0; /* Update all variables */ Variable_Update( self->hasYieldedVariable->variable ); Variable_Update( self->brightness->variable ); Variable_Update( self->opacity->variable ); Variable_Update( self->diameter->variable ); Variable_Update( strainWeakening->postFailureWeakeningIncrement->variable ); particleLocalCount = self->hasYieldedVariable->variable->arraySize; for ( lParticle_I = 0 ; lParticle_I < particleLocalCount ; lParticle_I++ ) { if ( Variable_GetValueChar( self->hasYieldedVariable->variable, lParticle_I )) { localFailed++; postFailureWeakeningIncrement = Variable_GetValueDouble( strainWeakening->postFailureWeakeningIncrement->variable, lParticle_I ); localMeanStrainIncrement += postFailureWeakeningIncrement; if(localMaxStrainIncrement < postFailureWeakeningIncrement) localMaxStrainIncrement = postFailureWeakeningIncrement; } } (void)MPI_Allreduce( &localMaxStrainIncrement, &globalMaxStrainIncrement, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD ); (void)MPI_Allreduce( &localMeanStrainIncrement, &globalMeanStrainIncrement, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); (void)MPI_Allreduce( &localFailed, &globalFailed, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD ); if(globalFailed == 0) return; globalMeanStrainIncrement /= (double) globalFailed; averagedGlobalMaxStrainIncrement = 0.5 * averagedGlobalMaxStrainIncrement + 0.25 * globalMeanStrainIncrement + 0.25 * globalMaxStrainIncrement; /* Let's simply assume that twice the mean is a good place to truncate these values */ oneOverGlobalMaxStrainIncrement = 1.0 / averagedGlobalMaxStrainIncrement; for ( lParticle_I = 0 ; lParticle_I < particleLocalCount ; lParticle_I++ ) { materialPoint = (MaterialPoint*)Swarm_ParticleAt( strainWeakening->swarm, lParticle_I ); if ( Variable_GetValueChar( self->hasYieldedVariable->variable, lParticle_I ) == False || StrainWeakening_GetPostFailureWeakening( strainWeakening, materialPoint ) < 0.0 ) { Variable_SetValueFloat( self->brightness->variable, lParticle_I, 0.0 ); Variable_SetValueFloat( self->opacity->variable, lParticle_I, 0.0 ); Variable_SetValueFloat( self->diameter->variable, lParticle_I, 0.0 ); continue; } postFailureWeakeningIncrement = Variable_GetValueDouble( strainWeakening->postFailureWeakeningIncrement->variable, lParticle_I ); strainWeakeningRatio = StrainWeakening_CalcRatio( strainWeakening, materialPoint ); length = 0.001 + 
0.003 * strainWeakeningRatio; brightness = strainWeakeningRatio * postFailureWeakeningIncrement * oneOverGlobalMaxStrainIncrement; opacity = 0.5 * brightness; if( brightness > 1.0 ) brightness = 1.0; if( opacity > 0.5 ) opacity = 0.5; if( opacity < 0.1 ) opacity = 0.0; Variable_SetValueFloat( self->brightness->variable, lParticle_I, brightness ); Variable_SetValueFloat( self->opacity->variable, lParticle_I, opacity ); Variable_SetValueFloat( self->diameter->variable, lParticle_I, (float) length ); } }
int main(int argc, char **argv) { char myhost[256]; real_t dt = 0; int nvtk = 0; char outnum[80]; int time_output = 0; long flops = 0; // real_t output_time = 0.0; real_t next_output_time = 0; double start_time = 0, end_time = 0; double start_iter = 0, end_iter = 0; double elaps = 0; struct timespec start, end; double cellPerCycle = 0; double avgCellPerCycle = 0; long nbCycle = 0; // array of timers to profile the code memset(functim, 0, TIM_END * sizeof(functim[0])); #ifdef MPI MPI_Init(&argc, &argv); #endif process_args(argc, argv, &H); hydro_init(&H, &Hv); if (H.mype == 0) fprintf(stdout, "Hydro starts in %s precision.\n", ((sizeof(real_t) == sizeof(double))? "double": "single")); gethostname(myhost, 255); if (H.mype == 0) { fprintf(stdout, "Hydro: Main process running on %s\n", myhost); } #ifdef _OPENMP if (H.mype == 0) { fprintf(stdout, "Hydro: OpenMP mode ON\n"); fprintf(stdout, "Hydro: OpenMP %d max threads\n", omp_get_max_threads()); fprintf(stdout, "Hydro: OpenMP %d num threads\n", omp_get_num_threads()); fprintf(stdout, "Hydro: OpenMP %d num procs\n", omp_get_num_procs()); } #endif #ifdef MPI if (H.mype == 0) { fprintf(stdout, "Hydro: MPI run with %d procs\n", H.nproc); } #else fprintf(stdout, "Hydro: standard build\n"); #endif // PRINTUOLD(H, &Hv); #ifdef MPI if (H.nproc > 1) #if FTI>0 MPI_Barrier(FTI_COMM_WORLD); #endif #if FTI==0 MPI_Barrier(MPI_COMM_WORLD); #endif #endif if (H.dtoutput > 0) { // outputs are in physical time not in time steps time_output = 1; next_output_time = next_output_time + H.dtoutput; } if (H.dtoutput > 0 || H.noutput > 0) vtkfile(++nvtk, H, &Hv); if (H.mype == 0) fprintf(stdout, "Hydro starts main loop.\n"); //pre-allocate memory before entering in loop //For godunov scheme start = cclock(); start = cclock(); allocate_work_space(H.nxyt, H, &Hw_godunov, &Hvw_godunov); compute_deltat_init_mem(H, &Hw_deltat, &Hvw_deltat); end = cclock(); #ifdef MPI #if FTI==1 FTI_Protect(0,functim, TIM_END,FTI_DBLE); FTI_Protect(1,&nvtk,1,FTI_INTG); FTI_Protect(2,&next_output_time,1,FTI_DBLE); FTI_Protect(3,&dt,1,FTI_DBLE); FTI_Protect(4,&MflopsSUM,1,FTI_DBLE); FTI_Protect(5,&nbFLOPS,1,FTI_LONG); FTI_Protect(6,&(H.nstep),1,FTI_INTG); FTI_Protect(7,&(H.t),1,FTI_DBLE); FTI_Protect(8,Hv.uold,H.nvar * H.nxt * H.nyt,FTI_DBLE); #endif #endif if (H.mype == 0) fprintf(stdout, "Hydro: init mem %lfs\n", ccelaps(start, end)); // we start timings here to avoid the cost of initial memory allocation start_time = dcclock(); while ((H.t < H.tend) && (H.nstep < H.nstepmax)) { //system("top -b -n1"); // reset perf counter for this iteration flopsAri = flopsSqr = flopsMin = flopsTra = 0; start_iter = dcclock(); outnum[0] = 0; if ((H.nstep % 2) == 0) { dt = 0; // if (H.mype == 0) fprintf(stdout, "Hydro computes deltat.\n"); start = cclock(); compute_deltat(&dt, H, &Hw_deltat, &Hv, &Hvw_deltat); end = cclock(); functim[TIM_COMPDT] += ccelaps(start, end); if (H.nstep == 0) { dt = dt / 2.0; if (H.mype == 0) fprintf(stdout, "Hydro computes initial deltat: %le\n", dt); } #ifdef MPI if (H.nproc > 1) { real_t dtmin; // printf("pe=%4d\tdt=%lg\n",H.mype, dt); #if FTI==0 if (sizeof(real_t) == sizeof(double)) { MPI_Allreduce(&dt, &dtmin, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); } else { MPI_Allreduce(&dt, &dtmin, 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD); } #endif #if FTI>0 if (sizeof(real_t) == sizeof(double)) { MPI_Allreduce(&dt, &dtmin, 1, MPI_DOUBLE, MPI_MIN, FTI_COMM_WORLD); } else { MPI_Allreduce(&dt, &dtmin, 1, MPI_FLOAT, MPI_MIN, FTI_COMM_WORLD); } #endif dt = dtmin; } #endif } // dt = 1.e-3; // 
if (H.mype == 1) fprintf(stdout, "Hydro starts godunov.\n"); if ((H.nstep % 2) == 0) { hydro_godunov(1, dt, H, &Hv, &Hw_godunov, &Hvw_godunov); // hydro_godunov(2, dt, H, &Hv, &Hw, &Hvw); } else { hydro_godunov(2, dt, H, &Hv, &Hw_godunov, &Hvw_godunov); // hydro_godunov(1, dt, H, &Hv, &Hw, &Hvw); } end_iter = dcclock(); cellPerCycle = (double) (H.globnx * H.globny) / (end_iter - start_iter) / 1000000.0L; avgCellPerCycle += cellPerCycle; nbCycle++; H.nstep++; H.t += dt; { real_t iter_time = (real_t) (end_iter - start_iter); #ifdef MPI long flopsAri_t, flopsSqr_t, flopsMin_t, flopsTra_t; start = cclock(); #if FTI==0 MPI_Allreduce(&flopsAri, &flopsAri_t, 1, MPI_LONG, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&flopsSqr, &flopsSqr_t, 1, MPI_LONG, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&flopsMin, &flopsMin_t, 1, MPI_LONG, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&flopsTra, &flopsTra_t, 1, MPI_LONG, MPI_SUM, MPI_COMM_WORLD); #endif #if FTI>0 MPI_Allreduce(&flopsAri, &flopsAri_t, 1, MPI_LONG, MPI_SUM, FTI_COMM_WORLD); MPI_Allreduce(&flopsSqr, &flopsSqr_t, 1, MPI_LONG, MPI_SUM, FTI_COMM_WORLD); MPI_Allreduce(&flopsMin, &flopsMin_t, 1, MPI_LONG, MPI_SUM, FTI_COMM_WORLD); MPI_Allreduce(&flopsTra, &flopsTra_t, 1, MPI_LONG, MPI_SUM, FTI_COMM_WORLD); #endif // if (H.mype == 1) // printf("%ld %ld %ld %ld %ld %ld %ld %ld \n", flopsAri, flopsSqr, flopsMin, flopsTra, flopsAri_t, flopsSqr_t, flopsMin_t, flopsTra_t); flops = flopsAri_t * FLOPSARI + flopsSqr_t * FLOPSSQR + flopsMin_t * FLOPSMIN + flopsTra_t * FLOPSTRA; end = cclock(); functim[TIM_ALLRED] += ccelaps(start, end); #else flops = flopsAri * FLOPSARI + flopsSqr * FLOPSSQR + flopsMin * FLOPSMIN + flopsTra * FLOPSTRA; #endif nbFLOPS++; if (flops > 0) { if (iter_time > 1.e-9) { double mflops = (double) flops / (double) 1.e+6 / iter_time; MflopsSUM += mflops; sprintf(outnum, "%s {%.2f Mflops %ld Ops} (%.3fs)", outnum, mflops, flops, iter_time); } } else { sprintf(outnum, "%s (%.3fs)", outnum, iter_time); } } if (time_output == 0 && H.noutput > 0) { if ((H.nstep % H.noutput) == 0) { vtkfile(++nvtk, H, &Hv); sprintf(outnum, "%s [%04d]", outnum, nvtk); } } else { if (time_output == 1 && H.t >= next_output_time) { vtkfile(++nvtk, H, &Hv); next_output_time = next_output_time + H.dtoutput; sprintf(outnum, "%s [%04d]", outnum, nvtk); } } if (H.mype == 0) { fprintf(stdout, "--> step=%4d, %12.5e, %10.5e %.3lf MC/s%s\n", H.nstep, H.t, dt, cellPerCycle, outnum); fflush(stdout); } #ifdef MPI #if FTI==1 FTI_Snapshot(); #endif #endif } // while end_time = dcclock(); // Deallocate work spaces deallocate_work_space(H.nxyt, H, &Hw_godunov, &Hvw_godunov); compute_deltat_clean_mem(H, &Hw_deltat, &Hvw_deltat); hydro_finish(H, &Hv); elaps = (double) (end_time - start_time); timeToString(outnum, elaps); if (H.mype == 0) { fprintf(stdout, "Hydro ends in %ss (%.3lf) <%.2lf MFlops>.\n", outnum, elaps, (float) (MflopsSUM / nbFLOPS)); fprintf(stdout, " "); } if (H.nproc == 1) { int sizeFmt = sizeLabel(functim, TIM_END); printTimingsLabel(TIM_END, sizeFmt); fprintf(stdout, "\n"); if (sizeof(real_t) == sizeof(double)) { fprintf(stdout, "PE0_DP "); } else { fprintf(stdout, "PE0_SP "); } printTimings(functim, TIM_END, sizeFmt); fprintf(stdout, "\n"); fprintf(stdout, "%% "); percentTimings(functim, TIM_END); printTimings(functim, TIM_END, sizeFmt); fprintf(stdout, "\n"); } #ifdef MPI if (H.nproc > 1) { double timMAX[TIM_END]; double timMIN[TIM_END]; double timSUM[TIM_END]; #if FTI==0 MPI_Allreduce(functim, timMAX, TIM_END, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); MPI_Allreduce(functim, 
timMIN, TIM_END, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(functim, timSUM, TIM_END, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); #endif #if FTI>0 MPI_Allreduce(functim, timMAX, TIM_END, MPI_DOUBLE, MPI_MAX, FTI_COMM_WORLD); MPI_Allreduce(functim, timMIN, TIM_END, MPI_DOUBLE, MPI_MIN, FTI_COMM_WORLD); MPI_Allreduce(functim, timSUM, TIM_END, MPI_DOUBLE, MPI_SUM, FTI_COMM_WORLD); #endif if (H.mype == 0) { int sizeFmt = sizeLabel(timMAX, TIM_END); printTimingsLabel(TIM_END, sizeFmt); fprintf(stdout, "\n"); fprintf(stdout, "MIN "); printTimings(timMIN, TIM_END, sizeFmt); fprintf(stdout, "\n"); fprintf(stdout, "MAX "); printTimings(timMAX, TIM_END, sizeFmt); fprintf(stdout, "\n"); fprintf(stdout, "AVG "); avgTimings(timSUM, TIM_END, H.nproc); printTimings(timSUM, TIM_END, sizeFmt); fprintf(stdout, "\n"); } } #endif if (H.mype == 0) { fprintf(stdout, "Average MC/s: %.3lf\n", (double)(avgCellPerCycle / nbCycle)); } #ifdef MPI #if FTI>0 FTI_Finalize(); #endif MPI_Finalize(); #endif return 0; }
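/* Illustrative sketch (independent of the Hydro code): the timing summary
   above reduces the per-function timers with MIN, MAX and SUM.  If one also
   wants to know which rank owns the maximum (to spot a slow node), the
   MPI_MAXLOC operator can be used; the per-rank time below is made up. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  int rank;
  struct { double val; int rank; } in, out;   /* layout required by MPI_DOUBLE_INT */

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  in.val  = 1.0 + 0.01 * rank;   /* pretend per-rank elapsed time */
  in.rank = rank;

  /* max over val, carrying along the rank that owns it */
  MPI_Allreduce(&in, &out, 1, MPI_DOUBLE_INT, MPI_MAXLOC, MPI_COMM_WORLD);

  if (rank == 0)
    printf("slowest rank: %d (%.3f s)\n", out.rank, out.val);

  MPI_Finalize();
  return 0;
}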
//============================================================================== //Here's the main... //============================================================================== int main(int argc, char** argv) { MPI_Comm comm; int numProcs = 1, localProc = 0; CHK_ERR( fei_test_utils::initialize_mpi(argc, argv, localProc, numProcs) ); comm = MPI_COMM_WORLD; double start_time = fei::utils::cpu_time(); //read input parameters from a file specified on the command-line with // '-i file' std::vector<std::string> stdstrings; CHK_ERR( fei_test_utils::get_filename_and_read_input(argc, argv, comm, localProc, stdstrings) ); //parse the strings from the input file into a fei::ParameterSet object. fei::ParameterSet paramset; fei::utils::parse_strings(stdstrings, " ", paramset); std::string solverName; int L = 0; int outputLevel = 0; int errcode = 0; errcode += paramset.getStringParamValue("SOLVER_LIBRARY", solverName); errcode += paramset.getIntParamValue("L", L); paramset.getIntParamValue("outputLevel", outputLevel); if (errcode != 0) { fei::console_out() << "Failed to find one or more required parameters in input-file." << FEI_ENDL << "Required parameters:"<<FEI_ENDL << "SOLVER_LIBRARY" << FEI_ENDL << "L" << FEI_ENDL; #ifndef FEI_SER MPI_Finalize(); #endif return(-1); } if (localProc == 0) { int nodes = (L+1)*(L+1); int eqns = nodes; //macros FEI_COUT and FEI_ENDL are aliases for std::cout and std::endl, //defined in fei_iostream.hpp. FEI_COUT << "\n========================================================\n"; FEI_COUT << "FEI version: " << fei::utils::version() << "\n"; FEI_COUT << "Square size L: " << L << " elements.\n"; FEI_COUT << "Global number of elements: " << L*L << "\n"; FEI_COUT << "Global number of nodes: " << nodes << "\n"; FEI_COUT << "Global number of equations: " << eqns <<"\n"; FEI_COUT << "========================================================" << FEI_ENDL; } if (outputLevel == 1) { if (localProc != 0) outputLevel = 0; } if (outputLevel>0) { fei_test_utils::print_args(argc, argv); } //PoissonData is the object that will be in charge of generating the //data to pump into the FEI objects. PoissonData poissonData(L, numProcs, localProc, outputLevel); double start_init_time = fei::utils::cpu_time(); fei::SharedPtr<fei::Factory> factory; try { factory = fei::create_fei_Factory(comm, solverName.c_str()); } catch (std::runtime_error& exc) { FEI_COUT << "library " << solverName << " not available."<<FEI_ENDL; #ifndef FEI_SER MPI_Finalize(); #endif return(-1); } if (factory.get() == NULL) { FEI_COUT << "fei::Factory creation failed." << FEI_ENDL; #ifndef FEI_SER MPI_Finalize(); #endif return(-1); } factory->parameters(paramset); fei::SharedPtr<fei::VectorSpace> nodeSpace = factory->createVectorSpace(comm, "poisson3"); fei::SharedPtr<fei::VectorSpace> dummy; fei::SharedPtr<fei::MatrixGraph> matrixGraph = factory->createMatrixGraph(nodeSpace, dummy, "poisson3"); //load some control parameters. matrixGraph->setParameters(paramset); int numFields = poissonData.getNumFields(); int* fieldSizes = poissonData.getFieldSizes(); int* fieldIDs = poissonData.getFieldIDs(); int nodeIDType = 0; if (outputLevel>0 && localProc==0) FEI_COUT << "defineFields" << FEI_ENDL; nodeSpace->defineFields( numFields, fieldIDs, fieldSizes ); if (outputLevel>0 && localProc==0) FEI_COUT << "defineIDTypes" << FEI_ENDL; nodeSpace->defineIDTypes( 1, &nodeIDType ); CHK_ERR( init_elem_connectivities(matrixGraph.get(), poissonData) ); CHK_ERR( set_shared_nodes(nodeSpace.get(), poissonData) ); //The following IOS_... 
macros are defined in base/fei_macros.h FEI_COUT.setf(IOS_FIXED, IOS_FLOATFIELD); if (outputLevel>0 && localProc==0) FEI_COUT << "initComplete" << FEI_ENDL; CHK_ERR( matrixGraph->initComplete() ); double fei_init_time = fei::utils::cpu_time() - start_init_time; //Now the initialization phase is complete. Next we'll do the load phase, //which for this problem just consists of loading the element data //(element-wise stiffness arrays and load vectors) and the boundary //condition data. //This simple problem doesn't have any constraint relations, etc. double start_load_time = fei::utils::cpu_time(); fei::SharedPtr<fei::Matrix> mat = factory->createMatrix(matrixGraph); fei::SharedPtr<fei::Vector> solnVec = factory->createVector(nodeSpace, true); fei::SharedPtr<fei::Vector> rhsVec = factory->createVector(nodeSpace); fei::SharedPtr<fei::LinearSystem> linSys= factory->createLinearSystem(matrixGraph); linSys->setMatrix(mat); linSys->setSolutionVector(solnVec); linSys->setRHS(rhsVec); CHK_ERR( linSys->parameters(paramset)); CHK_ERR( load_elem_data(matrixGraph.get(), mat.get(), rhsVec.get(), poissonData) ); CHK_ERR( load_BC_data(linSys.get(), poissonData) ); CHK_ERR( linSys->loadComplete() ); double fei_load_time = fei::utils::cpu_time() - start_load_time; // //now the load phase is complete, so we're ready to launch the underlying //solver and solve Ax=b // fei::SharedPtr<fei::Solver> solver = factory->createSolver(); int status; int itersTaken = 0; if (outputLevel>0 && localProc==0) FEI_COUT << "solve..." << FEI_ENDL; double start_solve_time = fei::utils::cpu_time(); int err = solver->solve(linSys.get(), NULL, //preconditioningMatrix paramset, itersTaken, status); double solve_time = fei::utils::cpu_time() - start_solve_time; if (err!=0) { if (localProc==0) FEI_COUT << "solve returned err: " << err <<", status: " << status << FEI_ENDL; } CHK_ERR( solnVec->scatterToOverlap() ); // //We should make sure the solution we just computed is correct... // int numNodes = nodeSpace->getNumOwnedAndSharedIDs(nodeIDType); double maxErr = 0.0; if (numNodes > 0) { int lenNodeIDs = numNodes; GlobalID* nodeIDs = new GlobalID[lenNodeIDs]; double* soln = new double[lenNodeIDs]; if (nodeIDs != NULL && soln != NULL) { CHK_ERR( nodeSpace->getOwnedAndSharedIDs(nodeIDType, numNodes, nodeIDs, lenNodeIDs) ); int fieldID = 1; CHK_ERR( solnVec->copyOutFieldData(fieldID, nodeIDType, numNodes, nodeIDs, soln)); for(int i=0; i<numNodes; i++) { int nID = (int)nodeIDs[i]; double x = (1.0* ((nID-1)%(L+1)))/L; double y = (1.0* ((nID-1)/(L+1)))/L; double exactSoln = x*x + y*y; double error = std::abs(exactSoln - soln[i]); if (maxErr < error) maxErr = error; } delete [] nodeIDs; delete [] soln; } else { fei::console_out() << "allocation of nodeIDs or soln failed." << FEI_ENDL; } } #ifndef FEI_SER double globalMaxErr = 0.0; MPI_Allreduce(&maxErr, &globalMaxErr, 1, MPI_DOUBLE, MPI_MAX, comm); maxErr = globalMaxErr; #endif bool testPassed = true; if (maxErr > 1.e-6) testPassed = false; double elapsed_cpu_time = fei::utils::cpu_time() - start_time; int returnValue = 0; //The following IOS_... 
macros are defined in base/fei_macros.h FEI_COUT.setf(IOS_FIXED, IOS_FLOATFIELD); if (localProc==0) { FEI_COUT << "Proc0 cpu times (seconds):" << FEI_ENDL << " FEI initialize: " << fei_init_time << FEI_ENDL << " FEI load: " << fei_load_time << FEI_ENDL << " solve: " << solve_time << FEI_ENDL << "Total program time: " << elapsed_cpu_time << FEI_ENDL; } if (testPassed && returnValue==0 && localProc == 0) { FEI_COUT.setf(IOS_SCIENTIFIC, IOS_FLOATFIELD); FEI_COUT << "poisson: TEST PASSED, maxErr = " << maxErr << ", iterations: " << itersTaken << FEI_ENDL; FEI_COUT << "Poisson test successful" << FEI_ENDL; } if ((testPassed == false || returnValue != 0) && localProc == 0) { FEI_COUT << "maxErr = " << maxErr << ", TEST FAILED\n"; FEI_COUT << "(Test is deemed to have passed if the maximum difference" << " between the exact and computed solutions is 1.e-6 or less, *AND*" << " time-taken matches file-benchmark if available.)" << FEI_ENDL; } #ifndef FEI_SER MPI_Finalize(); #endif return(returnValue); }
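The correctness check in the Poisson driver reduces each rank's largest nodal error with MPI_MAX so the pass/fail decision is based on the worst error anywhere in the domain. A stripped-down sketch of just that check, with the per-rank error replaced by a placeholder value:

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  int rank;
  double maxErr, globalMaxErr;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  /* Stand-in for this rank's largest |exact - computed| nodal error. */
  maxErr = 1.0e-8 * (rank + 1);

  /* Every rank learns the worst error anywhere in the domain. */
  MPI_Allreduce(&maxErr, &globalMaxErr, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);

  if (rank == 0) {
    if (globalMaxErr <= 1.e-6)
      printf("TEST PASSED, maxErr = %e\n", globalMaxErr);
    else
      printf("TEST FAILED, maxErr = %e\n", globalMaxErr);
  }

  MPI_Finalize();
  return 0;
}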
int pdshrk(int ndim, int beta, int *resize, int *num)
{
  /*******************************************************************
   *
   * If the best vertex is not replaced at the current step, the
   * convergence theorem requires us to shrink the simplex towards the
   * best vertex.  Since the choice of search schemes may allow us
   * to "look-ahead" several iterations, we would like to know the
   * size of the smallest simplex seen during the step (which, by the
   * way the scheme is generated, is guaranteed to be a contraction
   * towards the best vertex) and use this smallest simplex to start
   * the search at the next iteration so that we avoid as much as
   * possible needless repetition of points already investigated
   * during the current iteration.
   *
   * This is easily accomplished by simply searching the list of
   * scalars necessary to reconstruct the simplex associated with a
   * given point I (contained in the -1 row of SCHEME) for the
   * smallest value.  We also count the number of times this smallest
   * value has been seen to ensure that the entire shrink was
   * computed.
   *
   * Written by Virginia Torczon.
   *
   * Last modification:  January 6, 1994.
   *
   * Parameters
   *
   *    Input
   *
   *       NDIM     dimension of the problem to be solved
   *
   *       BETA     factor by which RESIZE is scaled when the smallest
   *                shrink factor was seen fewer than NDIM times
   *
   *    Input/Output
   *
   *       RESIZE   on entry, the smallest shrink factor found in the
   *                search scheme on this processor; on exit, the
   *                smallest shrink factor found across all processors
   *
   *       NUM      on entry, the number of times this processor saw
   *                its smallest shrink factor; on exit, the number of
   *                times the global smallest shrink factor was seen
   *
   *******************************************************************/

#ifdef OPTPP_HAVE_MPI

  int my_resize, my_num, resultlen, error = 0;
  char buffer[MPI_MAX_ERROR_STRING], emesg[256];

  my_resize = *resize;
  my_num    = *num;

  /* Global minimum of the per-processor shrink factors. */
  error = MPI_Allreduce(&my_resize, resize, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);

  if (error != MPI_SUCCESS) {
    MPI_Error_string(error, buffer, &resultlen);
    printf("\npdshrk: MPI Error - %s\n", buffer);
    strcpy(emesg, "pdshrk: error returned by MPI_Allreduce\n");
    return 15;
  }

  /* Only processors whose local minimum matches the global minimum
     contribute to the count. */
  if (my_resize != *resize) my_num = 0;

  error = MPI_Allreduce(&my_num, num, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

  if (error != MPI_SUCCESS) {
    MPI_Error_string(error, buffer, &resultlen);
    printf("\npdshrk: MPI Error - %s\n", buffer);
    strcpy(emesg, "pdshrk: error returned by MPI_Allreduce\n");
    return 15;
  }

#endif

  if (*num < ndim) *resize *= beta;

  return 0;
}
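pdshrk combines two reductions: MPI_MIN finds the globally smallest shrink factor, then counts are zeroed on processors whose local minimum differs before an MPI_SUM, so the result counts only occurrences of the global minimum. A minimal sketch of that two-stage pattern outside the OPT++ framework, with made-up local values:

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  int rank, my_resize, my_num, resize, num;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  /* Pretend each rank found a different smallest shrink factor. */
  my_resize = 4 + rank % 3;
  my_num    = 1;

  /* Stage 1: everyone learns the globally smallest shrink factor. */
  MPI_Allreduce(&my_resize, &resize, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);

  /* Stage 2: only ranks whose local value matches the minimum contribute
     to the count of how many times that factor was seen. */
  if (my_resize != resize) my_num = 0;
  MPI_Allreduce(&my_num, &num, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

  if (rank == 0)
    printf("smallest shrink factor %d seen %d time(s)\n", resize, num);

  MPI_Finalize();
  return 0;
}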
/*---< main() >-------------------------------------------------------------*/ int main(int argc, char **argv) { int opt; extern char *optarg; extern int optind; int i, j; int isInFileBinary, isOutFileBinary; int is_output_timing, is_print_usage; int numClusters, numCoords, numObjs, totalNumObjs; int *membership; /* [numObjs] */ char *filename; float **objects; /* [numObjs][numCoords] data objects */ float **clusters; /* [numClusters][numCoords] cluster center */ float threshold; double timing, io_timing, clustering_timing; int rank, nproc, mpi_namelen; char mpi_name[MPI_MAX_PROCESSOR_NAME]; MPI_Status status; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nproc); MPI_Get_processor_name(mpi_name,&mpi_namelen); /* some default values */ _debug = 0; threshold = 0.001; numClusters = 0; isInFileBinary = 0; isOutFileBinary = 0; is_output_timing = 0; is_print_usage = 0; filename = NULL; while ( (opt=getopt(argc,argv,"p:i:n:t:abdorh"))!= EOF) { switch (opt) { case 'i': filename=optarg; break; case 'b': isInFileBinary = 1; break; case 'r': isOutFileBinary = 1; break; case 't': threshold=atof(optarg); break; case 'n': numClusters = atoi(optarg); break; case 'o': is_output_timing = 1; break; case 'd': _debug = 1; break; case 'h': is_print_usage = 1; break; default: is_print_usage = 1; break; } } if (filename == 0 || numClusters <= 1 || is_print_usage == 1) { if (rank == 0) usage(argv[0], threshold); MPI_Finalize(); exit(1); } if (_debug) printf("Proc %d of %d running on %s\n", rank, nproc, mpi_name); MPI_Barrier(MPI_COMM_WORLD); io_timing = MPI_Wtime(); /* read data points from file ------------------------------------------*/ objects = mpi_read(isInFileBinary, filename, &numObjs, &numCoords, MPI_COMM_WORLD); if (_debug) { /* print the first 4 objects' coordinates */ int num = (numObjs < 4) ? 
numObjs : 4; for (i=0; i<num; i++) { char strline[1024], strfloat[16]; sprintf(strline,"%d: objects[%d]= ",rank,i); for (j=0; j<numCoords; j++) { sprintf(strfloat,"%10f",objects[i][j]); strcat(strline, strfloat); } strcat(strline, "\n"); printf("%s",strline); } } timing = MPI_Wtime(); io_timing = timing - io_timing; clustering_timing = timing; /* allocate a 2D space for clusters[] (coordinates of cluster centers) this array should be the same across all processes */ clusters = (float**) malloc(numClusters * sizeof(float*)); assert(clusters != NULL); clusters[0] = (float*) malloc(numClusters * numCoords * sizeof(float)); assert(clusters[0] != NULL); for (i=1; i<numClusters; i++) clusters[i] = clusters[i-1] + numCoords; MPI_Allreduce(&numObjs, &totalNumObjs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); /* pick first numClusters elements in feature[] as initial cluster centers*/ if (rank == 0) { for (i=0; i<numClusters; i++) for (j=0; j<numCoords; j++) clusters[i][j] = objects[i][j]; } MPI_Bcast(clusters[0], numClusters*numCoords, MPI_FLOAT, 0, MPI_COMM_WORLD); /* membership: the cluster id for each data object */ membership = (int*) malloc(numObjs * sizeof(int)); assert(membership != NULL); /* start the core computation -------------------------------------------*/ mpi_kmeans(objects, numCoords, numObjs, numClusters, threshold, membership, clusters, MPI_COMM_WORLD); free(objects[0]); free(objects); timing = MPI_Wtime(); clustering_timing = timing - clustering_timing; /* output: the coordinates of the cluster centres ----------------------*/ mpi_write(isOutFileBinary, filename, numClusters, numObjs, numCoords, clusters, membership, totalNumObjs, MPI_COMM_WORLD); free(membership); free(clusters[0]); free(clusters); /*---- output performance numbers ---------------------------------------*/ if (is_output_timing) { double max_io_timing, max_clustering_timing; io_timing += MPI_Wtime() - timing; /* get the max timing measured among all processes */ MPI_Reduce(&io_timing, &max_io_timing, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Reduce(&clustering_timing, &max_clustering_timing, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); if (rank == 0) { printf("\nPerforming **** Simple Kmeans (MPI) ****\n"); printf("Num of processes = %d\n", nproc); printf("Input file: %s\n", filename); printf("numObjs = %d\n", totalNumObjs); printf("numCoords = %d\n", numCoords); printf("numClusters = %d\n", numClusters); printf("threshold = %.4f\n", threshold); printf("I/O time = %10.4f sec\n", max_io_timing); printf("Computation timing = %10.4f sec\n", max_clustering_timing); } } MPI_Finalize(); return(0); }
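The k-means driver reports timings with MPI_Reduce and MPI_MAX so that rank 0 prints the slowest rank's I/O and clustering times, which is what bounds the wall-clock time of the run. A small sketch of that pattern, with the timed sections left as placeholders:

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  int rank;
  double t0, io_timing, clustering_timing;
  double max_io, max_clu;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  t0 = MPI_Wtime();
  /* ... file reading would go here ... */
  io_timing = MPI_Wtime() - t0;

  t0 = MPI_Wtime();
  /* ... clustering would go here ... */
  clustering_timing = MPI_Wtime() - t0;

  /* Report the slowest rank's time, since the job finishes only when the
     last rank does. */
  MPI_Reduce(&io_timing, &max_io, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
  MPI_Reduce(&clustering_timing, &max_clu, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

  if (rank == 0)
    printf("I/O time           = %10.4f sec\nComputation timing = %10.4f sec\n",
           max_io, max_clu);

  MPI_Finalize();
  return 0;
}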
PetscErrorCode PCGAMGProlongator_GEO(PC pc,Mat Amat,Mat Gmat,PetscCoarsenData *agg_lists,Mat *a_P_out) { PC_MG *mg = (PC_MG*)pc->data; PC_GAMG *pc_gamg = (PC_GAMG*)mg->innerctx; const PetscInt dim = pc_gamg->data_cell_cols, data_cols = pc_gamg->data_cell_cols; PetscErrorCode ierr; PetscInt Istart,Iend,nloc,my0,jj,kk,ncols,nLocalSelected,bs,*clid_flid; Mat Prol; PetscMPIInt rank, size; MPI_Comm comm; IS selected_2,selected_1; const PetscInt *selected_idx; MatType mtype; PetscFunctionBegin; ierr = PetscObjectGetComm((PetscObject)Amat,&comm);CHKERRQ(ierr); ierr = PetscLogEventBegin(PC_GAMGProlongator_GEO,0,0,0,0);CHKERRQ(ierr); ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); ierr = MatGetOwnershipRange(Amat, &Istart, &Iend);CHKERRQ(ierr); ierr = MatGetBlockSize(Amat, &bs);CHKERRQ(ierr); nloc = (Iend-Istart)/bs; my0 = Istart/bs; if ((Iend-Istart) % bs) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB,"(Iend %D - Istart %D) % bs %D",Iend,Istart,bs); /* get 'nLocalSelected' */ ierr = PetscCDGetMIS(agg_lists, &selected_1);CHKERRQ(ierr); ierr = ISGetSize(selected_1, &jj);CHKERRQ(ierr); ierr = PetscMalloc1(jj, &clid_flid);CHKERRQ(ierr); ierr = ISGetIndices(selected_1, &selected_idx);CHKERRQ(ierr); for (kk=0,nLocalSelected=0; kk<jj; kk++) { PetscInt lid = selected_idx[kk]; if (lid<nloc) { ierr = MatGetRow(Gmat,lid+my0,&ncols,0,0);CHKERRQ(ierr); if (ncols>1) clid_flid[nLocalSelected++] = lid; /* filter out singletons */ ierr = MatRestoreRow(Gmat,lid+my0,&ncols,0,0);CHKERRQ(ierr); } } ierr = ISRestoreIndices(selected_1, &selected_idx);CHKERRQ(ierr); ierr = ISDestroy(&selected_1);CHKERRQ(ierr); /* this is selected_1 in serial */ /* create prolongator matrix */ ierr = MatGetType(Amat,&mtype);CHKERRQ(ierr); ierr = MatCreate(comm, &Prol);CHKERRQ(ierr); ierr = MatSetSizes(Prol,nloc*bs,nLocalSelected*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); ierr = MatSetBlockSizes(Prol, bs, bs);CHKERRQ(ierr); ierr = MatSetType(Prol, mtype);CHKERRQ(ierr); ierr = MatSeqAIJSetPreallocation(Prol,3*data_cols,NULL);CHKERRQ(ierr); ierr = MatMPIAIJSetPreallocation(Prol,3*data_cols,NULL,3*data_cols,NULL);CHKERRQ(ierr); /* can get all points "removed" - but not on geomg */ ierr = MatGetSize(Prol, &kk, &jj);CHKERRQ(ierr); if (!jj) { ierr = PetscInfo(pc,"ERROR: no selected points on coarse grid\n");CHKERRQ(ierr); ierr = PetscFree(clid_flid);CHKERRQ(ierr); ierr = MatDestroy(&Prol);CHKERRQ(ierr); *a_P_out = NULL; /* out */ PetscFunctionReturn(0); } { PetscReal *coords; PetscInt data_stride; PetscInt *crsGID = NULL; Mat Gmat2; if (dim != data_cols) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"dim %D != data_cols %D",dim,data_cols); /* grow ghost data for better coarse grid cover of fine grid */ #if defined PETSC_GAMG_USE_LOG ierr = PetscLogEventBegin(petsc_gamg_setup_events[SET5],0,0,0,0);CHKERRQ(ierr); #endif /* messy method, squares graph and gets some data */ ierr = getGIDsOnSquareGraph(nLocalSelected, clid_flid, Gmat, &selected_2, &Gmat2, &crsGID);CHKERRQ(ierr); /* llist is now not valid wrt squared graph, but will work as iterator in 'triangulateAndFormProl' */ #if defined PETSC_GAMG_USE_LOG ierr = PetscLogEventEnd(petsc_gamg_setup_events[SET5],0,0,0,0);CHKERRQ(ierr); #endif /* create global vector of coordinates in 'coords' */ if (size > 1) { ierr = PCGAMGGetDataWithGhosts(Gmat2, dim, pc_gamg->data, &data_stride, &coords);CHKERRQ(ierr); } else { coords = (PetscReal*)pc_gamg->data; data_stride = pc_gamg->data_sz/pc_gamg->data_cell_cols; } ierr = MatDestroy(&Gmat2);CHKERRQ(ierr); /*
triangulate */ if (dim == 2) { PetscReal metric,tm; #if defined PETSC_GAMG_USE_LOG ierr = PetscLogEventBegin(petsc_gamg_setup_events[SET6],0,0,0,0);CHKERRQ(ierr); #endif ierr = triangulateAndFormProl(selected_2, data_stride, coords,nLocalSelected, clid_flid, agg_lists, crsGID, bs, Prol, &metric);CHKERRQ(ierr); #if defined PETSC_GAMG_USE_LOG ierr = PetscLogEventEnd(petsc_gamg_setup_events[SET6],0,0,0,0);CHKERRQ(ierr); #endif ierr = PetscFree(crsGID);CHKERRQ(ierr); /* clean up and create coordinates for coarse grid (output) */ if (size > 1) ierr = PetscFree(coords);CHKERRQ(ierr); ierr = MPI_Allreduce(&metric, &tm, 1, MPIU_REAL, MPIU_MAX, comm);CHKERRQ(ierr); if (tm > 1.) { /* needs to be globalized - should not happen */ ierr = PetscInfo1(pc," failed metric for coarse grid %e\n",(double)tm);CHKERRQ(ierr); ierr = MatDestroy(&Prol);CHKERRQ(ierr); } else if (metric > .0) { ierr = PetscInfo1(pc,"worst metric for coarse grid = %e\n",(double)metric);CHKERRQ(ierr); } } else SETERRQ(comm,PETSC_ERR_PLIB,"3D not implemented for 'geo' AMG"); { /* create next coords - output */ PetscReal *crs_crds; ierr = PetscMalloc1(dim*nLocalSelected, &crs_crds);CHKERRQ(ierr); for (kk=0; kk<nLocalSelected; kk++) { /* grab local select nodes to promote - output */ PetscInt lid = clid_flid[kk]; for (jj=0; jj<dim; jj++) crs_crds[jj*nLocalSelected + kk] = pc_gamg->data[jj*nloc + lid]; } ierr = PetscFree(pc_gamg->data);CHKERRQ(ierr); pc_gamg->data = crs_crds; /* out */ pc_gamg->data_sz = dim*nLocalSelected; } ierr = ISDestroy(&selected_2);CHKERRQ(ierr); } *a_P_out = Prol; /* out */ ierr = PetscFree(clid_flid);CHKERRQ(ierr); ierr = PetscLogEventEnd(PC_GAMGProlongator_GEO,0,0,0,0);CHKERRQ(ierr); PetscFunctionReturn(0); }
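PCGAMGProlongator_GEO globalizes the triangulation quality metric with a MAX reduction before deciding whether to discard the prolongator, so every rank takes the same branch. A plain-MPI sketch of that idea (PETSc itself uses MPIU_REAL/MPIU_MAX and CHKERRQ error checking; the metric value here is a stand-in):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  int rank;
  double metric, tm;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  /* Stand-in for the local triangulation quality metric. */
  metric = 0.1 * (rank + 1);

  /* The decision must be made identically on every rank, so reduce to the
     worst (largest) metric first. */
  MPI_Allreduce(&metric, &tm, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);

  if (tm > 1.0) {
    if (rank == 0) printf("failed metric for coarse grid %e\n", tm);
    /* ... all ranks would destroy the prolongator here ... */
  } else if (rank == 0) {
    printf("worst metric for coarse grid = %e\n", tm);
  }

  MPI_Finalize();
  return 0;
}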
int main( int argc, char **argv ) { MPI_Datatype *types; void **inbufs, **outbufs; char **names; int *counts, *bytesize, ntype; MPI_Comm comms[20]; int ncomm = 20, rank, np, partner, tag; int i, j, k, err, toterr, world_rank, errloc; MPI_Status status, statuses[2]; int flag, index; char *obuf; MPI_Request requests[2]; MPI_Init( &argc, &argv ); AllocateForData( &types, &inbufs, &outbufs, &counts, &bytesize, &names, &ntype ); GenerateData( types, inbufs, outbufs, counts, bytesize, names, &ntype ); MPI_Comm_rank( MPI_COMM_WORLD, &world_rank ); MakeComms( comms, 20, &ncomm, 0 ); /* Test over a wide range of datatypes and communicators */ err = 0; for (i=0; i<ncomm; i++) { MPI_Comm_rank( comms[i], &rank ); MPI_Comm_size( comms[i], &np ); if (np < 2) continue; tag = i; for (j=0; j<ntype; j++) { if (world_rank == 0){ /* SI make size of outputindependent of number of processes */ if (i<2) fprintf( stdout, "Testing type %s\n",names[j] ); } /* This test does an irsend between both partners, with a sendrecv after the irecv used to guarentee that the irsend has a matching receive */ if (rank == 0) { partner = np - 1; #if 0 MPIR_PrintDatatypePack( stdout, counts[j], types[j], 0, 0 ); #endif obuf = outbufs[j]; for (k=0; k<bytesize[j]; k++) obuf[k] = 0; MPI_Irecv(outbufs[j], counts[j], types[j], partner, tag, comms[i], &requests[0] ); MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, ncomm+i, MPI_BOTTOM, 0, MPI_INT, partner, ncomm+i, comms[i], &status ); MPI_Irsend( inbufs[j], counts[j], types[j], partner, tag, comms[i], &requests[1] ); do { MPI_Waitany( 2, requests, &index, &status ); } while (index != 0); /* Always the possiblity that the Irsend is still waiting */ MPI_Waitall( 2, requests, statuses ); if ((errloc = CheckData( inbufs[j], outbufs[j], bytesize[j] ))) { char *p1, *p2; fprintf( stderr, "Error in data with type %s (type %d on %d) at byte %d\n", names[j], j, world_rank, errloc - 1 ); p1 = (char *)inbufs[j]; p2 = (char *)outbufs[j]; fprintf( stderr, "Got %x expected %x\n", p1[errloc-1], p2[errloc-1] ); err++; #if 0 MPIR_PrintDatatypeUnpack( stderr, counts[j], types[j], 0, 0 ); #endif } } else if (rank == np - 1) { partner = 0; obuf = outbufs[j]; for (k=0; k<bytesize[j]; k++) obuf[k] = 0; MPI_Irecv(outbufs[j], counts[j], types[j], partner, tag, comms[i], &requests[0] ); MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, ncomm+i, MPI_BOTTOM, 0, MPI_INT, partner, ncomm+i, comms[i], &status ); /* Wait for irecv to complete */ do { MPI_Test( &requests[0], &flag, &status ); } while (!flag); if ((errloc = CheckData( inbufs[j], outbufs[j], bytesize[j] ))) { char *p1, *p2; fprintf( stderr, "Error in data with type %s (type %d on %d) at byte %d\n", names[j], j, world_rank, errloc - 1 ); p1 = (char *)inbufs[j]; p2 = (char *)outbufs[j]; fprintf( stderr, "Got %x expected %x\n", p1[errloc-1], p2[errloc-1] ); err++; #if 0 MPIR_PrintDatatypeUnpack( stderr, counts[j], types[j], 0, 0 ); #endif } MPI_Irsend( inbufs[j], counts[j], types[j], partner, tag, comms[i], &requests[1] ); MPI_Waitall(1, &requests[1], &status ); } } } if (err > 0) { fprintf( stderr, "%d errors on %d\n", err, rank ); } MPI_Allreduce( &err, &toterr, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD ); if (world_rank == 0) { if (toterr == 0) { printf( " No Errors\n" ); } else { printf (" Found %d errors\n", toterr ); } } FreeDatatypes( types, inbufs, outbufs, counts, bytesize, names, ntype ); FreeComms( comms, ncomm ); MPI_Finalize(); return err; }
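The datatype test driver above sums the per-rank error counts with MPI_Allreduce so that a single " No Errors" verdict can be printed on rank 0. A minimal sketch of that aggregation, with the actual per-rank checks elided:

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  int rank, err, toterr;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  err = 0;
  /* ... run the per-rank checks here, incrementing err on each mismatch ... */

  /* Sum the per-rank error counts so rank 0 can print one verdict
     for the whole run. */
  MPI_Allreduce(&err, &toterr, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

  if (rank == 0) {
    if (toterr == 0) printf(" No Errors\n");
    else             printf(" Found %d errors\n", toterr);
  }

  MPI_Finalize();
  return toterr != 0;
}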
/** The "checksum" is a function of all the field data at a given step. * That field data, in turn, is a function of the checksum at the previous * step. That way I can be pretty sure that when this restart thing * produces the expected hdf5 files, it's not just fooling me. */ void EBRestart::updateChecksums( Vector<LevelData<EBCellFAB>*>& a_ebvector, int a_nlevs, EBRestart::CheckSumVect& a_checksums ) { for ( int lev=0; lev<a_nlevs; ++lev ) { LevelData<EBCellFAB>& field( *a_ebvector[lev] ); DataIterator dit = field.dataIterator(); for (dit.begin(); dit.ok(); ++dit) { EBCellFAB& ebcf((*a_ebvector[lev])[dit]); BaseFab<Real>& fab(ebcf.getSingleValuedFAB()); a_checksums[lev].len_reg += fab.box().numPts(); a_checksums[lev].len_irreg += ebcf.getEBISBox().getEBGraph().getIrregCells(fab.box()).numPts(); for ( int i=0; i<fab.box().numPts(); ++i ) { long x = long(fab.dataPtr()[i]); a_checksums[lev].sum += x*x; } } // // MPI-reduce the various members of a_checksums[lev]. // #ifdef CH_MPI Real all_proc_sum; Real proc_sum( a_checksums[lev].sum ); if ( MPI_SUCCESS != MPI_Allreduce(&proc_sum, &all_proc_sum, 1, MPI_CH_REAL, MPI_SUM, Chombo_MPI::comm) ) { MayDay::Error("Failure in MPI_Allreduce()."); } a_checksums[lev].sum = long(all_proc_sum); // There's no MPI_CH_LONG. Maybe I should make .sum Real then. // // Reduce a_checksums[lev].len_reg // Real all_proc_len_reg; Real proc_len_reg( a_checksums[lev].len_reg ); if ( MPI_SUCCESS != MPI_Allreduce(&proc_len_reg, &all_proc_len_reg, 1, MPI_CH_REAL, MPI_SUM, Chombo_MPI::comm) ) { MayDay::Error("Failure in MPI_Allreduce()."); } a_checksums[lev].len_reg = long(all_proc_len_reg); // // Reduce a_checksums[lev].len_irreg // Real all_proc_len_irreg; Real proc_len_irreg( a_checksums[lev].len_irreg ); if ( MPI_SUCCESS != MPI_Allreduce(&proc_len_irreg, &all_proc_len_irreg, 1, MPI_CH_REAL, MPI_SUM, Chombo_MPI::comm) ) { MayDay::Error("Failure in MPI_Allreduce()."); } a_checksums[lev].len_irreg = long(all_proc_len_irreg); #endif } }
int main(int argc, char* argv[]) {
  /****************************************************/
  /* Parameter declaration                            */
  /****************************************************/
  // Index params
  int i, j, k = 0;
  // Directory path params
  char input_path[1024] = "/projects/isgs/lidar/champaign/las";
  char scratch_path[1024] = "/gpfs_scratch/ncasler/data/tmp";
  char out_path[1024] = "";
  char tmp_path[1024] = "";
  // MPI Params
  int world_size, world_rank, mpi_err;
  MPI_Comm world_comm = MPI_COMM_WORLD;
  MPI_Info info = MPI_INFO_NULL;
  MPI_Status status;
  MPI_Request request;

  MPI_Init(&argc, &argv);
  MPI_Comm_size(world_comm, &world_size);
  MPI_Comm_rank(world_comm, &world_rank);
  MPI_Errhandler_set(world_comm, MPI_ERRORS_RETURN);

  double starttime, endtime;
  // Set memory limit for grid allocations
  long buffer_lim = 8000000000;
  size_t max_size = buffer_lim / sizeof(Pixel);
  // File specific parameters
  int n_files, file_off, file_blk, file_end = 0;
  char ext[5] = ".las";
  // Metadata
  double g_mins[3] = {DBL_MAX, DBL_MAX, DBL_MAX};    // Global min coord
  double g_maxs[3] = {-DBL_MAX, -DBL_MAX, -DBL_MAX}; // Global max coord
  double l_mins[3] = {DBL_MAX, DBL_MAX, DBL_MAX};    // Local min coord
  double l_maxs[3] = {-DBL_MAX, -DBL_MAX, -DBL_MAX}; // Local max coord
  // File specific params
  FileCollection *g_files = NULL; // Global file list
  int g_n_files = 0;              // Global file count
  int l_n_files = 0;              // Local file count
  BBox* g_file_bbox = NULL;       // Global array of file bboxes
  BBox* l_file_bbox = NULL;       // Local array of file bboxes
  // Grid specific params
  struct Point* origin = new Point();
  DType datatype = DT_Float32;
  struct Grid* g_grid = NULL;     // Global grid
  int g_cols = 0;                 // Global grid column count
  int g_rows = 0;                 // Global grid row count
  // Specify resolution > should be parameterized in prod
  double res = 5.0f;
  // Block scheme params
  struct BlockScheme* blk_scheme = NULL;
  int *l_blks = NULL;             // Local block array
  int *g_blks = NULL;             // Global block array
  int l_n_blks = 0;               // Local block count
  int g_n_blks = 0;               // Global block count
  int *blk_n_files = NULL;        // Array holding block specific file counts
  // Las specific params
  las_file las;
  long long g_n_pts = 0;          // Global point count
  long long l_n_pts = 0;          // Local point count

  /**************************************************/
  /* Begin first file scan                          */
  /**************************************************/
  starttime = MPI_Wtime();
  g_files = new FileCollection(&input_path[0], &ext[0]);
  // Check the number of available LAS files
  g_n_files = g_files->countFiles();
  g_file_bbox = new BBox[g_n_files];
  // Create tif output dir
  sprintf(&tmp_path[0], "%s/blocks", scratch_path);
  printf("[%i] Using tmp dir: %s\n", world_rank, tmp_path);
  struct stat st = {0};
  if (stat(tmp_path, &st) == -1)
    mkdir(tmp_path, 0700);
  // Set file block size, clamping the last rank's range to the file count
  file_blk = ceil((float)g_n_files / (float)world_size);
  file_off = file_blk * world_rank;
  if (file_off + file_blk > g_n_files)
    file_end = g_n_files;
  else
    file_end = file_off + file_blk;
  // Read subset of file paths from dir
  l_n_files = g_files->getMetadata(file_off, file_end);
  // Allocate the local bbox array once the local file count is known
  l_file_bbox = new BBox[l_n_files];
  // Scan metadata from files
  for (i = file_off; i < file_end; i++) {
    las.open(g_files->fileList[i]);
    l_n_pts = l_n_pts + (long long) las.points_count();
    compareMin(l_mins, las.minimums());
    compareMax(l_maxs, las.maximums());
    double *tmp_min = las.minimums();
    double *tmp_max = las.maximums();
    j = i - file_off;
    l_file_bbox[j].updateMin(tmp_min[0], tmp_min[1], tmp_min[2]);
    l_file_bbox[j].updateMax(tmp_max[0], tmp_max[1], tmp_max[2]);
    las.close();
  }
  endtime = MPI_Wtime();
  printf("[%i] Metadata gathered in %f seconds\n", world_rank, endtime - starttime);
  MPI_Barrier(world_comm);

  /*****************************************************/
  /* Gather global min/max point count                 */
  /* COMMUNICATIONS                                    */
  /*****************************************************/
  MPI_Allreduce(&l_mins[0], &g_mins[0], 3, MPI_DOUBLE, MPI_MIN, world_comm);
  MPI_Allreduce(&l_maxs[0], &g_maxs[0], 3, MPI_DOUBLE, MPI_MAX, world_comm);
  MPI_Allreduce(&l_n_pts, &g_n_pts, 1, MPI_LONG_LONG, MPI_SUM, world_comm);
  // Gather the bounding box values for each file; counts are in bytes and
  // assume every rank contributes the same number of files
  printf("[%i] Gather bbox values\n", world_rank);
  MPI_Allgather(l_file_bbox, l_n_files * (int)sizeof(BBox), MPI_BYTE,
                g_file_bbox, l_n_files * (int)sizeof(BBox), MPI_BYTE, world_comm);
  double io_time = MPI_Wtime();
  printf("[%i] Communication finished in %f seconds\n", world_rank, io_time - endtime);
  // Create origin for global grid (min x, max y, min z)
  origin->update(g_mins[0], g_maxs[1], g_mins[2]);
  g_cols = (int) ceil((g_maxs[0] - g_mins[0]) / res);
  g_rows = (int) ceil((g_maxs[1] - g_mins[1]) / res);
  // Create global grid
  g_grid = new Grid(origin, g_cols, g_rows, datatype, res, res);
  // Create global block scheme
  blk_scheme = new BlockScheme(g_grid, max_size, datatype, world_size);
  l_n_blks = blk_scheme->getBlockCount(world_rank); // Local block count
  g_n_blks = blk_scheme->cols * blk_scheme->rows;   // Global block count
  l_blks = blk_scheme->getBlocks(world_rank);       // Local block id array
  printf("[%i] Block Total: %i, Local: %i, first: %i, last: %i\n",
         world_rank, g_n_blks, l_n_blks, l_blks[0], l_blks[l_n_blks-1]);
  /***********************************************************/
  /* Get file list for block                                 */
  /***********************************************************/
  /***********************************************************/
  /* Read blocks                                             */
  /***********************************************************/
  /***********************************************************/
  /* Write                                                   */
  /***********************************************************/
  /***********************************************************/
  /* Clean up                                                */
  /***********************************************************/
  g_files->clear();
  printf("[%i] Cleaning up\n", world_rank);
  delete(origin);
  delete(g_files);
  delete[] l_file_bbox;
  delete[] g_file_bbox;
  delete(g_grid);
  delete(blk_scheme);
  MPI_Finalize();
  return 0;
}
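The metadata scan above derives the global extent and total point count from component-wise MPI_MIN/MPI_MAX reductions over three-element coordinate arrays plus an MPI_SUM over the point counts. A self-contained sketch of just that step, with the per-rank extents faked in place of reading LAS headers:

#include <mpi.h>
#include <float.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  int rank, d;
  double l_mins[3] = { DBL_MAX,  DBL_MAX,  DBL_MAX};
  double l_maxs[3] = {-DBL_MAX, -DBL_MAX, -DBL_MAX};
  double g_mins[3], g_maxs[3];
  long long l_n_pts, g_n_pts;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  /* Stand-in for scanning this rank's share of the LAS headers. */
  for (d = 0; d < 3; d++) {
    l_mins[d] = (double)rank;
    l_maxs[d] = (double)rank + 100.0;
  }
  l_n_pts = 1000 * (rank + 1);

  /* Component-wise reductions give every rank the global extent,
     and MPI_SUM gives the total point count. */
  MPI_Allreduce(l_mins, g_mins, 3, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
  MPI_Allreduce(l_maxs, g_maxs, 3, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
  MPI_Allreduce(&l_n_pts, &g_n_pts, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);

  if (rank == 0)
    printf("extent x:[%g,%g] y:[%g,%g] z:[%g,%g], %lld points\n",
           g_mins[0], g_maxs[0], g_mins[1], g_maxs[1],
           g_mins[2], g_maxs[2], g_n_pts);

  MPI_Finalize();
  return 0;
}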