/*!
  Routine to compute the dot product of two vectors.

  This is the reference dot-product implementation.  It _CANNOT_ be modified for the
  purposes of this benchmark.

  @param[in]  n the number of vector elements (on this processor)
  @param[in]  x, y the input vectors
  @param[out] result a pointer to scalar value, on exit will contain the result.
  @param[out] time_allreduce the time it took to perform the communication between processes

  @return returns 0 upon success and non-zero otherwise

  @see ComputeDotProduct
*/
int ComputeDotProduct_ref(const local_int_t n, const Vector & x, const Vector & y,
                          double & result, double & time_allreduce) {
  assert(x.localLength >= n); // Test vector lengths
  assert(y.localLength >= n);

  double local_result = 0.0;
  double * xv = x.values;
  double * yv = y.values;
  if (yv == xv) {
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for reduction (+:local_result)
#endif
    for (local_int_t i=0; i<n; i++) local_result += xv[i]*xv[i];
  } else {
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for reduction (+:local_result)
#endif
    for (local_int_t i=0; i<n; i++) local_result += xv[i]*yv[i];
  }

#ifndef HPCG_NO_MPI
  // Use MPI's reduce function to collect all partial sums
  double t0 = mytimer();
  double global_result = 0.0;
  MPI_Allreduce(&local_result, &global_result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  result = global_result;
  time_allreduce += mytimer() - t0;
#else
  time_allreduce += 0.0;
  result = local_result;
#endif

  return 0;
}
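// Both dot-product routines in this collection time their MPI_Allreduce with a
// mytimer() helper that is not part of this excerpt. Below is a minimal sketch
// of the usual HPCG-style definition, assuming the same HPCG_NO_MPI guard
// (an assumption for illustration, not the benchmark's mandated source):

#ifndef HPCG_NO_MPI
#include <mpi.h>
// Wall-clock seconds from MPI's portable high-resolution timer.
double mytimer(void) { return MPI_Wtime(); }
#else
#include <sys/time.h>
// Fallback: microsecond-resolution wall clock via gettimeofday.
double mytimer(void) {
  struct timeval tp;
  gettimeofday(&tp, NULL);
  return ((double) tp.tv_sec + (double) tp.tv_usec * 1.0e-6);
}
#endif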
int ddot(const int n, const double * const x, const double * const y,
         double * const result, double & time_allreduce) {
  double local_result = 0.0;
  if (y == x)
    for (int i=0; i<n; i++) local_result += x[i]*x[i];
  else
    for (int i=0; i<n; i++) local_result += x[i]*y[i];

  // a little compute modeling
  SSTMAC_compute_loop(0, n, 1);

#ifdef USING_MPI
  // Use MPI's reduce function to collect all partial sums
  double t0 = mytimer();
  double global_result = 0.0;
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  // std::cout << "rank " << rank << " computed local ddot as " << local_result << "\n";
  MPI_Allreduce(&local_result, &global_result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  *result = global_result;
  time_allreduce += mytimer() - t0;
#else
  *result = local_result;
#endif

  return(0);
}
void perform_element_loop(const simple_mesh_description<GlobalOrdinal>& mesh,
                          const Box& local_elem_box,
                          MatrixType& A, VectorType& b,
                          Parameters& params)
{
  typedef typename MatrixType::ScalarType Scalar;

  if (A.rows.size() == 0) return;

  int num_threads = params.numthreads;
  timer_type t0 = mytimer();

  //We will iterate the local-element-box (local portion of the mesh), and
  //assemble the FE operators into the global sparse linear-system.

  int global_elems_x = mesh.global_box[0][1];
  int global_elems_y = mesh.global_box[1][1];
  int global_elems_z = mesh.global_box[2][1];

  GlobalOrdinal num_elems = get_num_ids<GlobalOrdinal>(local_elem_box);
  std::vector<GlobalOrdinal> elemIDs(num_elems);

  BoxIterator iter = BoxIterator::begin(local_elem_box);
  BoxIterator end  = BoxIterator::end(local_elem_box);

  for(size_t i=0; iter != end; ++iter, ++i) {
    elemIDs[i] = get_id<GlobalOrdinal>(global_elems_x, global_elems_y, global_elems_z,
                                       iter.x, iter.y, iter.z);
  }

  LockingMatrix<MatrixType> lockingA(A);
  LockingVector<VectorType> lockingb(b);

  FEAssembleSumInto<GlobalOrdinal,Scalar,MatrixType,VectorType> fe_op;
  fe_op.mesh = &mesh;
  fe_op.elemIDs = &elemIDs[0];
  fe_op.A = &lockingA;
  fe_op.b = &lockingb;

  typedef typename VectorType::ComputeNodeType ComputeNodeType;
  ComputeNodeType& compute_node = b.compute_node;
  compute_node.parallel_for(elemIDs.size(), fe_op);

  std::cout << "\n{number of matrix conflicts: " << miniFE_num_matrix_conflicts << "}" << std::endl;
  std::cout << "{number of vector conflicts: "  << miniFE_num_vector_conflicts  << "}" << std::endl;
}
int main(int argc, char *argv[]) {
  int ierr = 0, i;

#ifdef EPETRA_MPI
  // Initialize MPI
  MPI_Init(&argc,&argv);
  int rank; // My process ID
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  Epetra_MpiComm Comm(MPI_COMM_WORLD);
#else
  int rank = 0;
  Epetra_SerialComm Comm;
#endif

#ifdef HAVE_EPETRA_TEUCHOS
  Teuchos::RCP<Teuchos::FancyOStream> fancyOut =
    Teuchos::VerboseObjectBase::getDefaultOStream();
  if (Comm.NumProc() > 1) {
    fancyOut->setShowProcRank(true);
    fancyOut->setOutputToRootOnly(-1);
  }
  std::ostream &out = *fancyOut;
#else
  std::ostream &out = std::cout;
#endif

  Comm.SetTracebackMode(0); // This should shut down any error tracing
  bool verbose = false;

  // Check if we should print results to standard out
  if (argc>1)
    if (argv[1][0]=='-' && argv[1][1]=='v') verbose = true;

  // char tmp;
  // if (rank==0) out << "Press any key to continue..."<< endl;
  // if (rank==0) cin >> tmp;
  // Comm.Barrier();

  int MyPID = Comm.MyPID();
  int NumProc = Comm.NumProc();

  if (verbose && MyPID==0)
    out << Epetra_Version() << endl << endl;
  if (verbose) out << Comm << endl;

  bool verbose1 = verbose;

  // Redefine verbose to only print on PE 0
  if (verbose && rank!=0) verbose = false;

  int NumMyElements = 10000;
  int NumMyElements1 = NumMyElements; // Needed for localmap
  int NumGlobalElements = NumMyElements*NumProc+EPETRA_MIN(NumProc,3);
  if (MyPID < 3) NumMyElements++;
  int IndexBase = 0;
  int ElementSize = 7;

  // Test LocalMap constructor
  // and Petra-defined uniform linear distribution constructor
  if (verbose) out << "\n*********************************************************" << endl;
  if (verbose) out << "Checking Epetra_LocalMap(NumMyElements1, IndexBase, Comm)" << endl;
  if (verbose) out << "  and Epetra_BlockMap(NumGlobalElements, ElementSize, IndexBase, Comm)" << endl;
  if (verbose) out << "*********************************************************" << endl;

  Epetra_LocalMap *LocalMap = new Epetra_LocalMap(NumMyElements1, IndexBase, Comm);
  Epetra_BlockMap *BlockMap = new Epetra_BlockMap(NumGlobalElements, ElementSize, IndexBase, Comm);
  EPETRA_TEST_ERR(VectorTests(*BlockMap, verbose),ierr);
  EPETRA_TEST_ERR(MatrixTests(*BlockMap, *LocalMap, verbose),ierr);

  delete BlockMap;

  // Test User-defined linear distribution constructor
  if (verbose) out << "\n*********************************************************" << endl;
  if (verbose) out << "Checking Epetra_BlockMap(NumGlobalElements, NumMyElements, ElementSize, IndexBase, Comm)" << endl;
  if (verbose) out << "*********************************************************" << endl;

  BlockMap = new Epetra_BlockMap(NumGlobalElements, NumMyElements, ElementSize, IndexBase, Comm);
  EPETRA_TEST_ERR(VectorTests(*BlockMap, verbose),ierr);
  EPETRA_TEST_ERR(MatrixTests(*BlockMap, *LocalMap, verbose),ierr);

  delete BlockMap;

  // Test User-defined arbitrary distribution constructor
  // Generate Global Element List.  Do in reverse for fun!
  int * MyGlobalElements = new int[NumMyElements];
  int MaxMyGID = (Comm.MyPID()+1)*NumMyElements-1+IndexBase;
  if (Comm.MyPID()>2) MaxMyGID+=3;
  for (i = 0; i<NumMyElements; i++) MyGlobalElements[i] = MaxMyGID-i;

  if (verbose) out << "\n*********************************************************" << endl;
  if (verbose) out << "Checking Epetra_BlockMap(NumGlobalElements, NumMyElements, MyGlobalElements, ElementSize, IndexBase, Comm)" << endl;
  if (verbose) out << "*********************************************************" << endl;

  BlockMap = new Epetra_BlockMap(NumGlobalElements, NumMyElements, MyGlobalElements,
                                 ElementSize, IndexBase, Comm);
  EPETRA_TEST_ERR(VectorTests(*BlockMap, verbose),ierr);
  EPETRA_TEST_ERR(MatrixTests(*BlockMap, *LocalMap, verbose),ierr);

  delete BlockMap;

  int * ElementSizeList = new int[NumMyElements];
  int NumMyEquations = 0;
  int NumGlobalEquations = 0;
  for (i = 0; i<NumMyElements; i++) {
    ElementSizeList[i] = i%6+2; // blocksizes go from 2 to 7
    NumMyEquations += ElementSizeList[i];
  }
  ElementSize = 7; // Set to maximum for use in checkmap
  NumGlobalEquations = Comm.NumProc()*NumMyEquations;

  // Adjust NumGlobalEquations based on processor ID
  if (Comm.NumProc() > 3) {
    if (Comm.MyPID()>2)
      NumGlobalEquations += 3*((NumMyElements)%6+2);
    else
      NumGlobalEquations -= (Comm.NumProc()-3)*((NumMyElements-1)%6+2);
  }

  if (verbose) out << "\n*********************************************************" << endl;
  if (verbose) out << "Checking Epetra_BlockMap(NumGlobalElements, NumMyElements, MyGlobalElements, ElementSizeList, IndexBase, Comm)" << endl;
  if (verbose) out << "*********************************************************" << endl;

  BlockMap = new Epetra_BlockMap(NumGlobalElements, NumMyElements, MyGlobalElements,
                                 ElementSizeList, IndexBase, Comm);
  EPETRA_TEST_ERR(VectorTests(*BlockMap, verbose),ierr);
  EPETRA_TEST_ERR(MatrixTests(*BlockMap, *LocalMap, verbose),ierr);

  // Test Copy constructor
  if (verbose) out << "\n*********************************************************" << endl;
  if (verbose) out << "Checking Epetra_BlockMap(*BlockMap)" << endl;
  if (verbose) out << "*********************************************************" << endl;

  Epetra_BlockMap * BlockMap1 = new Epetra_BlockMap(*BlockMap);
  EPETRA_TEST_ERR(VectorTests(*BlockMap, verbose),ierr);
  EPETRA_TEST_ERR(MatrixTests(*BlockMap, *LocalMap, verbose),ierr);

  delete [] ElementSizeList;
  delete [] MyGlobalElements;
  delete BlockMap;
  delete BlockMap1;

  // Test Petra-defined uniform linear distribution constructor
  if (verbose) out << "\n*********************************************************" << endl;
  if (verbose) out << "Checking Epetra_Map(NumGlobalElements, IndexBase, Comm)" << endl;
  if (verbose) out << "*********************************************************" << endl;

  Epetra_Map * Map = new Epetra_Map(NumGlobalElements, IndexBase, Comm);
  EPETRA_TEST_ERR(VectorTests(*Map, verbose),ierr);
  EPETRA_TEST_ERR(MatrixTests(*Map, *LocalMap, verbose),ierr);

  delete Map;

  // Test User-defined linear distribution constructor
  if (verbose) out << "\n*********************************************************" << endl;
  if (verbose) out << "Checking Epetra_Map(NumGlobalElements, NumMyElements, IndexBase, Comm)" << endl;
  if (verbose) out << "*********************************************************" << endl;

  Map = new Epetra_Map(NumGlobalElements, NumMyElements, IndexBase, Comm);
  EPETRA_TEST_ERR(VectorTests(*Map, verbose),ierr);
  EPETRA_TEST_ERR(MatrixTests(*Map, *LocalMap, verbose),ierr);

  delete Map;

  // Test User-defined arbitrary distribution constructor
  // Generate Global Element List.  Do in reverse for fun!
  MyGlobalElements = new int[NumMyElements];
  MaxMyGID = (Comm.MyPID()+1)*NumMyElements-1+IndexBase;
  if (Comm.MyPID()>2) MaxMyGID+=3;
  for (i = 0; i<NumMyElements; i++) MyGlobalElements[i] = MaxMyGID-i;

  if (verbose) out << "\n*********************************************************" << endl;
  if (verbose) out << "Checking Epetra_Map(NumGlobalElements, NumMyElements, MyGlobalElements, IndexBase, Comm)" << endl;
  if (verbose) out << "*********************************************************" << endl;

  Map = new Epetra_Map(NumGlobalElements, NumMyElements, MyGlobalElements, IndexBase, Comm);
  EPETRA_TEST_ERR(VectorTests(*Map, verbose),ierr);
  EPETRA_TEST_ERR(MatrixTests(*Map, *LocalMap, verbose),ierr);

  // Test Copy constructor
  if (verbose) out << "\n*********************************************************" << endl;
  if (verbose) out << "Checking Epetra_Map(*Map)" << endl;
  if (verbose) out << "*********************************************************" << endl;

  Epetra_Map Map1(*Map);
  EPETRA_TEST_ERR(VectorTests(*Map, verbose),ierr);
  EPETRA_TEST_ERR(MatrixTests(*Map, *LocalMap, verbose),ierr);

  delete [] MyGlobalElements;
  delete Map;

  if (verbose1) {
    // Test Vector MFLOPS for 2D Dot Product
    int M = 1;
    int K = 1000000;
    Epetra_Map Map2(-1, K, IndexBase, Comm);
    Epetra_LocalMap Map3(M, IndexBase, Comm);
    Epetra_Vector A(Map2); A.Random();
    Epetra_Vector B(Map2); B.Random();
    Epetra_Vector C(Map3); C.Random();

    // Test Epetra_Vector label
    const char* VecLabel = A.Label();
    const char* VecLabel1 = "Epetra::Vector";
    if (verbose) out << endl << endl << "This should say " << VecLabel1 << ": " << VecLabel << endl << endl << endl;
    EPETRA_TEST_ERR(strcmp(VecLabel1,VecLabel),ierr);

    if (verbose) out << "Testing Assignment operator" << endl;
    double tmp1 = 1.00001 * (double)(MyPID+1);
    double tmp2 = tmp1;
    A[1] = tmp1;
    tmp2 = A[1];
    out << "On PE " << MyPID << " A[1] should equal = " << tmp1;
    if (tmp1==tmp2) out << " and it does!" << endl;
    else out << " but it equals " << tmp2;

    Comm.Barrier();

    if (verbose) out << endl << endl << "Testing MFLOPs" << endl;
    Epetra_Flops counter;
    C.SetFlopCounter(counter);
    Epetra_Time mytimer(Comm);
    C.Multiply('T', 'N', 0.5, A, B, 0.0);
    double Multiply_time = mytimer.ElapsedTime();
    double Multiply_flops = C.Flops();
    if (verbose) out << "\n\nTotal FLOPs = " << Multiply_flops << endl;
    if (verbose) out << "Total Time  = " << Multiply_time << endl;
    if (verbose) out << "MFLOPs      = " << Multiply_flops/Multiply_time/1000000.0 << endl;

    Comm.Barrier();

    // Test Vector ostream operator with Petra-defined uniform linear distribution constructor
    // and a small vector
    Epetra_Map Map4(100, IndexBase, Comm);
    double * Dp = new double[100];
    for (i=0; i<100; i++) Dp[i] = i;
    Epetra_Vector D(View, Map4, Dp);
    if (verbose) out << "\n\nTesting ostream operator:  Multivector should be 100-by-2 and print i,j indices" << endl << endl;
    out << D << endl;
    if (verbose) out << "Traceback Mode value = " << D.GetTracebackMode() << endl;
    delete [] Dp;
  }

#ifdef EPETRA_MPI
  MPI_Finalize();
#endif

  return ierr;
}
/*!
  Routine to compute an approximate solution to Ax = b

  @param[in]    geom The description of the problem's geometry.
  @param[inout] A    The known system matrix
  @param[inout] data The data structure with all necessary CG vectors preallocated
  @param[in]    b    The known right hand side vector
  @param[inout] x    On entry: the initial guess; on exit: the new approximate solution
  @param[in]    max_iter  The maximum number of iterations to perform, even if tolerance is not met.
  @param[in]    tolerance The stopping criterion to assert convergence: if the norm of the residual is <= tolerance.
  @param[out]   niters    The number of iterations actually performed.
  @param[out]   normr     The 2-norm of the residual vector after the last iteration.
  @param[out]   normr0    The 2-norm of the residual vector before the first iteration.
  @param[out]   times     The 7-element vector of the timing information accumulated during all of the iterations.
  @param[in]    doPreconditioning The flag to indicate whether the preconditioner should be invoked at each iteration.

  @return Returns zero on success and a non-zero value otherwise.

  @see CG_ref()
*/
int CG(const SparseMatrix & A, CGData & data, const Vector & b, Vector & x,
       const int max_iter, const double tolerance, int & niters, double & normr,
       double & normr0, double * times, bool doPreconditioning) {

  double t_begin = mytimer();  // Start timing right away
  normr = 0.0;
  double rtz = 0.0, oldrtz = 0.0, alpha = 0.0, beta = 0.0, pAp = 0.0;
  double t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0, t4 = 0.0, t5 = 0.0;
//#ifndef HPCG_NOMPI
//  double t6 = 0.0;
//#endif
  local_int_t nrow = A.localNumberOfRows;
  Vector & r  = data.r;  // Residual vector
  Vector & z  = data.z;  // Preconditioned residual vector
  Vector & p  = data.p;  // Direction vector (in MPI mode ncol>=nrow)
  Vector & Ap = data.Ap;

  if (!doPreconditioning && A.geom->rank==0)
    HPCG_fout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl;

#ifdef HPCG_DEBUG
  int print_freq = 1;
  if (print_freq>50) print_freq=50;
  if (print_freq<1)  print_freq=1;
#endif
  // p is of length ncols, copy x to p for sparse MV operation
  CopyVector(x, p); //TODO parallel
  TICK(); ComputeSPMV(A, p, Ap); TOCK(t3); // Ap = A*p
  TICK(); ComputeWAXPBY(nrow, 1.0, b, -1.0, Ap, r, A.isWaxpbyOptimized); TOCK(t2); // r = b - Ax (x stored in p)
  TICK(); ComputeDotProduct(nrow, r, r, normr, t4, A.isDotProductOptimized); TOCK(t1);
  normr = sqrt(normr);
#ifdef HPCG_DEBUG
  if (A.geom->rank==0) HPCG_fout << "Initial Residual = " << normr << std::endl;
#endif

  // Record initial residual for convergence testing
  normr0 = normr;

  // Start iterations
  for (int k=1; k<=max_iter && normr/normr0 > tolerance; k++) {
    TICK();
    if (doPreconditioning)
      ComputeMG(A, r, z); // Apply preconditioner
    else
      CopyVector(r, z);   // copy r to z (no preconditioning)
    TOCK(t5); // Preconditioner apply time

    if (k == 1) {
      TICK(); ComputeWAXPBY(nrow, 1.0, z, 0.0, z, p, A.isWaxpbyOptimized); TOCK(t2); // Copy Mr to p
      TICK(); ComputeDotProduct(nrow, r, z, rtz, t4, A.isDotProductOptimized); TOCK(t1); // rtz = r'*z
    } else {
      oldrtz = rtz;
      TICK(); ComputeDotProduct(nrow, r, z, rtz, t4, A.isDotProductOptimized); TOCK(t1); // rtz = r'*z
      beta = rtz/oldrtz;
      TICK(); ComputeWAXPBY(nrow, 1.0, z, beta, p, p, A.isWaxpbyOptimized); TOCK(t2); // p = beta*p + z
    }

    TICK(); ComputeSPMV(A, p, Ap); TOCK(t3); // Ap = A*p
    TICK(); ComputeDotProduct(nrow, p, Ap, pAp, t4, A.isDotProductOptimized); TOCK(t1); // alpha = p'*Ap
    alpha = rtz/pAp;
    TICK(); ComputeWAXPBY(nrow, 1.0, x,  alpha, p,  x, A.isWaxpbyOptimized); // x = x + alpha*p
            ComputeWAXPBY(nrow, 1.0, r, -alpha, Ap, r, A.isWaxpbyOptimized); TOCK(t2); // r = r - alpha*Ap
    TICK(); ComputeDotProduct(nrow, r, r, normr, t4, A.isDotProductOptimized); TOCK(t1);
    normr = sqrt(normr);
#ifdef HPCG_DEBUG
    if (A.geom->rank==0 && (k%print_freq == 0 || k == max_iter))
      HPCG_fout << "Iteration = " << k << "   Scaled Residual = " << normr/normr0 << std::endl;
#endif
    niters = k;
  }

  // Store times
  times[1] += t1; // dot-product time
  times[2] += t2; // WAXPBY time
  times[3] += t3; // SPMV time
  times[4] += t4; // AllReduce time
  times[5] += t5; // preconditioner apply time
//#ifndef HPCG_NOMPI
//  times[6] += t6; // exchange halo time
//#endif
  times[0] += mytimer() - t_begin;  // Total time. All done...
  return(0);
}
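// The CG routine above brackets each kernel with TICK()/TOCK() macros whose
// definitions are not shown in this excerpt. A minimal sketch of the
// conventional HPCG-style definitions, assuming the local double t0 declared
// inside the routine (an assumption for illustration):

#define TICK()  t0 = mytimer()        // record the start of a timed code section
#define TOCK(t) t += mytimer() - t0   // accumulate the section's elapsed time into t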
/// main.
int main(int argc, char** argv) {
  uint32_t maxrow=0, nrpd=0;
  int nparts, ndigits, ierr;
  // must have the output directory name
  if (argc < 2) {
    std::cerr << "\nUsage:\n" << *argv
              << " <output-dir> [#rows [#rows-per-dir [conf-file]]]\n"
              << "If the 4th argument is not provided, the number of "
                 "rows per directory will be determined by the memory cache size, "
                 "which is by default 1/2 of the physical memory size.\n"
              << std::endl;
    return -1;
  }

  //ibis::gVerbose = 8;
  // initialize the file manager with the 5th argument
  ibis::init(argc>4 ? argv[4] : (const char*)0);
  ibis::util::timer mytimer(*argv, 0);
  if (argc > 2) // user specified maxrow
    maxrow = (uint32_t)atof(argv[2]);
  if (maxrow <= 0) {
    double tmp = ibis::fileManager::currentCacheSize();
    maxrow = (uint32_t) ibis::util::compactValue(tmp / 120.0, tmp / 80.0);
    nrpd = maxrow;
  }
  if (maxrow < 10) maxrow = 10;
  if (argc > 3) // user specified nrpd
    nrpd = (uint32_t) atof(argv[3]);
  if (nrpd <= 0) {
    double tmp = ibis::fileManager::currentCacheSize();
    nrpd = (uint32_t) ibis::util::compactValue(tmp / 120.0, tmp / 80.0);
  }
  if (nrpd > maxrow) nrpd = maxrow;

  ibis::table::row val;
  std::auto_ptr<ibis::tablex> tab(ibis::tablex::create());
  initColumns(*tab, val);
  ierr = tab->reserveBuffer(nrpd);
  if (ierr > 0 && (unsigned)ierr < nrpd)
    nrpd = ierr;
  LOGGER(1) << *argv << ' ' << argv[1] << ' ' << maxrow << ' ' << nrpd << std::endl;

  nparts = maxrow / nrpd;
  nparts += (maxrow > nparts*nrpd);
  ierr = nparts;
  for (ndigits = 1, ierr >>= 4; ierr > 0; ierr >>= 4, ++ ndigits);

  for (uint32_t irow = 1; irow <= maxrow;) {
    const uint32_t end = irow - 1 + nrpd;
    TDList tdl;
    std::string dir = argv[1];
    if (nparts > 1) { // figure out the directory name
      const char* str = strrchr(argv[1], FASTBIT_DIRSEP);
      if (str != 0) {
        if (str[1] == 0) {
          while (str-1 > argv[1]) {
            if (*(str-1) == FASTBIT_DIRSEP) break;
            else -- str;
          }
        }
        else {
          ++ str;
        }
      }
      std::ostringstream oss;
      oss << FASTBIT_DIRSEP << (str ? str : "_") << std::hex
          << std::setprecision(ndigits) << std::setw(ndigits)
          << std::setfill('0') << irow / nrpd;
      dir += oss.str();
    }

    for (; irow <= end; ++ irow) {
      fillRow(irow, val, tdl);
      ierr = tab->appendRow(val);
      LOGGER(ierr != 6)
        << "Warning -- " << *argv << " failed to append row " << irow
        << " to the in-memory table, appendRow returned " << ierr;
      LOGGER(irow % 100000 == 0) << " . " << irow;
    }
    LOGGER(1) << "\n";
    ierr = tab->write(dir.c_str());
    LOGGER(ierr < 0)
      << "Warning -- " << *argv << " failed to write " << tab->mRows()
      << " rows to " << dir << ", ibis::tablex::write returned " << ierr;
    writeTDList(tdl, dir.c_str());
    tab->clearData();
    tdl.clear();
  }
  return 0;
} // main
void timer_expired()
{
  mytimer();
  keep_going = 0;
}
void cg_solve(OperatorType& A,
              const VectorType& b,
              VectorType& x,
              Matvec matvec,
              typename OperatorType::LocalOrdinalType max_iter,
              typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& tolerance,
              typename OperatorType::LocalOrdinalType& num_iters,
              typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& normr,
              timer_type* my_cg_times)
{
  typedef typename OperatorType::ScalarType ScalarType;
  typedef typename OperatorType::GlobalOrdinalType GlobalOrdinalType;
  typedef typename OperatorType::LocalOrdinalType LocalOrdinalType;
  typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type;

  timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0, tMATVECDOT = 0;
  timer_type total_time = mytimer();

  int myproc = 0;
#ifdef HAVE_MPI
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
#endif

  if (!A.has_local_indices) {
    std::cerr << "miniFE::cg_solve ERROR, A.has_local_indices is false, needs to be true. This probably means "
              << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::cg_solve."
              << std::endl;
    return;
  }

  // Select a CUDA device based on the node-local rank reported by the MPI launcher.
  char* str;
  int ngpu = 2;
  int local_rank = 0;
  int device = 0;
  int skip_gpu = 99999;
  if ((str = getenv("CUDA_NGPU")) != NULL) {
    ngpu = atoi(str);
  }
  if ((str = getenv("CUDA_SKIP_GPU")) != NULL) {
    skip_gpu = atoi(str);
  }
  if ((str = getenv("SLURM_LOCALID")) != NULL) {
    local_rank = atoi(str);
    device = local_rank % ngpu;
    if (device >= skip_gpu) device++;
  }
  if ((str = getenv("MV2_COMM_WORLD_LOCAL_RANK")) != NULL) {
    local_rank = atoi(str);
    device = local_rank % ngpu;
    if (device >= skip_gpu) device++;
  }
  if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK")) != NULL) {
    local_rank = atoi(str);
    device = local_rank % ngpu;
    if (device >= skip_gpu) device++;
  }

  size_t nrows = A.rows.size();
  LocalOrdinalType ncols = A.num_cols;

  NVAMG_SAFE_CALL(NVAMG_initialize());
  NVAMG_SAFE_CALL(NVAMG_initialize_plugins());
  NVAMG_matrix_handle matrix;
  NVAMG_vector_handle rhs;
  NVAMG_vector_handle soln;
  NVAMG_resources_handle rsrc = NULL;
  NVAMG_solver_handle solver = NULL;
  NVAMG_config_handle config;
  NVAMG_SAFE_CALL(NVAMG_config_create_from_file(&config, "NVAMG_CONFIG"));
  MPI_Comm nvamg_comm;
  MPI_Comm_dup(MPI_COMM_WORLD, &nvamg_comm);
  int devices[] = {device};
  NVAMG_resources_create(&rsrc, config, &nvamg_comm, 1, devices);
  NVAMG_SAFE_CALL(NVAMG_solver_create(&solver, rsrc, NVAMG_mode_dDDI, config));
  NVAMG_SAFE_CALL(NVAMG_matrix_create(&matrix, rsrc, NVAMG_mode_dDDI));
  NVAMG_SAFE_CALL(NVAMG_vector_create(&rhs, rsrc, NVAMG_mode_dDDI));
  NVAMG_SAFE_CALL(NVAMG_vector_create(&soln, rsrc, NVAMG_mode_dDDI));

  // Generating communication maps for NVAMG
  if (A.neighbors.size() > 0) {
    int** send_map = new int*[A.neighbors.size()];
    int** recv_map = new int*[A.neighbors.size()];
    int send_offset = 0;
    int recv_offset = A.row_offsets.size()-1;
    for (int i = 0; i < A.neighbors.size(); i++) {
      send_map[i] = &A.elements_to_send[send_offset];
      send_offset += A.send_length[i];
      recv_map[i] = new int[A.recv_length[i]];
      for (int j = 0; j < A.recv_length[i]; j++)
        recv_map[i][j] = recv_offset + j;
      recv_offset += A.recv_length[i];
    }
    const int** send_map_c = (const int**) send_map;
    const int** recv_map_c = (const int**) recv_map;
    NVAMG_SAFE_CALL(NVAMG_matrix_comm_from_maps_one_ring(
        matrix, 1, A.neighbors.size(), A.neighbors.data(),
        A.send_length.data(), send_map_c,
        A.recv_length.data(), recv_map_c));
    NVAMG_SAFE_CALL(NVAMG_vector_bind(rhs, matrix));
    NVAMG_SAFE_CALL(NVAMG_vector_bind(soln, matrix));
    for (int i = 0; i < A.neighbors.size(); i++)
      delete [] recv_map[i];
  }

  for (int i = 0; i < x.coefs.size(); i++) x.coefs[i] = 1;

  VectorType r(b.startIndex, nrows);
  VectorType p(0, ncols);
  VectorType Ap(b.startIndex, nrows);
  normr = 0;
  magnitude_type rtrans = 0;
  magnitude_type oldrtrans = 0;

  LocalOrdinalType print_freq = max_iter/10;
  if (print_freq > 50) print_freq = 50;
  if (print_freq < 1)  print_freq = 1;

  ScalarType one = 1.0;
  ScalarType zero = 0.0;

  TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY);
  TICK(); matvec(A, p, Ap); TOCK(tMATVEC);
  TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY);
  TICK(); rtrans = dot_r2(r); TOCK(tDOT);

  normr = std::sqrt(rtrans);

  if (myproc == 0) {
    std::cout << "Initial Residual = " << normr << std::endl;
  }

  {
    // Matrix upload needs to happen before vector, otherwise it crashes
    NVAMG_SAFE_CALL(NVAMG_matrix_upload_all(matrix, A.row_offsets.size()-1,
        A.packed_coefs.size(), 1, 1,
        &A.row_offsets[0], &A.packed_cols[0], &A.packed_coefs[0], NULL));
    NVAMG_SAFE_CALL(NVAMG_vector_upload(soln, p.coefs.size(), 1, &p.coefs[0]));
    NVAMG_SAFE_CALL(NVAMG_vector_upload(rhs, b.coefs.size(), 1, &b.coefs[0]));

    int n = 0;
    int bsize_x = 0, bsize_y = 0;

    NVAMG_SAFE_CALL(NVAMG_solver_setup(solver, matrix));
    NVAMG_SAFE_CALL(NVAMG_solver_solve(solver, rhs, soln));
    NVAMG_SAFE_CALL(NVAMG_vector_download(soln, &x.coefs[0]));

    int niter;
    NVAMG_SAFE_CALL(NVAMG_solver_get_iterations_number(solver, &niter));

    // Recompute the residual with the downloaded solution
    TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY);
    TICK(); matvec(A, p, Ap); TOCK(tMATVEC);
    TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY);
    TICK(); rtrans = dot_r2(r); TOCK(tDOT);
    normr = std::sqrt(rtrans);

    if (myproc == 0) {
      std::cout << "Final Residual = " << normr << " after " << niter << " iterations" << std::endl;
    }
  }

  my_cg_times[WAXPY] = tWAXPY;
  my_cg_times[DOT] = tDOT;
  my_cg_times[MATVEC] = tMATVEC;
  my_cg_times[MATVECDOT] = tMATVECDOT;
  my_cg_times[TOTAL] = mytimer() - total_time;
}
int main(int argc, char* argv[])
{
  int i, j, loop, num_alive, maxloop;
  int ldboard, ldnbngb, ldlboard;
  double t1, t2;
  double temps;
  int *board;
  int *nbngb;
  int local_alive;
  int *global_board;
  struct grid grid;
  MPI_Comm comm;
  int nb_proc_row;
  int nb_proc_tot;
  int rank;
  int nb_in_block;

  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &nb_proc_tot);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  // initialization of the grid communicator
  if (EXIT_FAILURE == compute_communicator(nb_proc_tot, &nb_proc_row, &comm, &rank)) {
    MPI_Finalize();
    return EXIT_SUCCESS;
  }

  if (argc < 2) {
    maxloop = 10;
  } else if (argc > 2) {
    maxloop = atoi(argv[1]);
    BS = atoi(argv[2]);
  } else
    maxloop = atoi(argv[1]);
  num_alive = 0;
  local_alive = 0;

  /* Leading dimension of the board array */
  ldboard = BS;
  if (ldboard % nb_proc_row != 0) {
    if (rank == 0)
      printf("Wrong BS (or wrong number of procs) ... exiting now.\n");
    MPI_Finalize();
    return EXIT_FAILURE;
  }
  /* Leading dimension of the neighbour counters array */
  nb_in_block = ldboard / nb_proc_row;
  ldnbngb = nb_in_block;
  ldlboard = nb_in_block + 2;

  board = malloc( ldlboard * ldlboard * sizeof(int) );
  nbngb = malloc( ldnbngb * ldnbngb * sizeof(int) );

  if (rank == 0) {
    global_board = malloc( ldboard * ldboard * sizeof(int) );
    num_alive = generate_initial_board( &global_cell( 1, 1), ldboard );
    printf("Starting number of living cells = %d\n", num_alive);
    t1 = mytimer();
  }
  matrix_placement_proc(nb_proc_row, nb_in_block, &comm,
                        &(global_cell( 1, 1)), &(cell( 1, 1)), SCATTER, ldlboard);
  mpi_grid_init(&comm, &grid, rank);
  //printf("rank #%d: %d %d\n", rank, grid.rank_I, grid.rank_J);

  //output_lboard( nb_in_block, board, ldlboard, 0, rank );

  for (loop = 1; loop <= maxloop; loop++) {
    MPI_Datatype blocktype; // we need a specific type for row exchange
    MPI_Type_vector(nb_in_block, 1, ldlboard, MPI_INT, &blocktype);
    MPI_Type_commit(&blocktype);

    // for upper/lower ghost row
    MPI_Sendrecv(&(cell( 1, 1)), 1, blocktype, grid.proc_above, 99,
                 &(cell( nb_in_block+1, 1)), 1, blocktype, grid.proc_under, 99,
                 comm, MPI_STATUS_IGNORE);
    MPI_Sendrecv(&(cell( nb_in_block, 1)), 1, blocktype, grid.proc_under, 99,
                 &(cell( 0, 1)), 1, blocktype, grid.proc_above, 99,
                 comm, MPI_STATUS_IGNORE);
    // for left/right ghost col
    MPI_Sendrecv(&(cell( 0, 1)), ldlboard, MPI_INT, grid.proc_left, 98,
                 &(cell( 0, nb_in_block+1)), ldlboard, MPI_INT, grid.proc_right, 98,
                 comm, MPI_STATUS_IGNORE);
    MPI_Sendrecv(&(cell( 0, nb_in_block)), ldlboard, MPI_INT, grid.proc_right, 98,
                 &(cell( 0, 0)), ldlboard, MPI_INT, grid.proc_left, 98,
                 comm, MPI_STATUS_IGNORE);

    //debug
    /* if (loop == 1) */
    /*   output_lboard( nb_in_block, board, ldlboard, 0, rank ); */

    // count the neighbours of each cell
    for (j = 1; j <= nb_in_block; j++) {
      for (i = 1; i <= nb_in_block; i++) {
        ngb( i, j ) =
          cell( i-1, j-1 ) + cell( i, j-1 ) + cell( i+1, j-1 ) +
          cell( i-1, j   ) +                  cell( i+1, j   ) +
          cell( i-1, j+1 ) + cell( i, j+1 ) + cell( i+1, j+1 );
      }
    }

    // update the local board
    local_alive = 0;
    for (j = 1; j <= nb_in_block; j++) {
      for (i = 1; i <= nb_in_block; i++) {
        if ( (ngb( i, j ) < 2) || (ngb( i, j ) > 3) ) {
          cell(i, j) = 0;
        } else {
          if ((ngb( i, j )) == 3)
            cell(i, j) = 1;
        }
        if (cell(i, j) == 1) {
          local_alive++;
        }
      }
    }
    //output_lboard( nb_in_block, board, ldlboard, loop, rank );
#ifdef PRINT_ALIVE
    MPI_Reduce(&local_alive, &num_alive, 1, MPI_INT, MPI_SUM, 0, comm);
    if (rank == 0)
      printf("%d \n", num_alive);
#endif
  }

  matrix_placement_proc(nb_proc_row, nb_in_block, &comm,
                        &(cell( 1, 1)), &(global_cell( 1, 1)), GATHER, ldlboard);
  MPI_Reduce(&local_alive, &num_alive, 1, MPI_INT, MPI_SUM, 0, comm);

  if (rank == 0) {
    t2 = mytimer();
    temps = t2 - t1;
    printf("Final number of living cells = %d\n", num_alive);
    printf("time=%.2lf ms\n", (double)temps * 1.e3);
    //output_board( BS, &(global_cell(1, 1)), ldboard, maxloop);
    free(global_board);
  }
  free(board);
  free(nbngb);

  MPI_Comm_free(&comm);
  MPI_Finalize();
  return EXIT_SUCCESS;
}
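// The Game of Life variants in this collection index their flat arrays through
// cell(), global_cell() and ngb() macros defined outside this excerpt. A
// plausible reconstruction, assuming column-major storage with the leading
// dimensions used in the MPI variant above (the sequential variants would use
// ldboard in place of ldlboard); these definitions are hypothetical:

// boards are stored with a one-cell ghost border around the live region
#define global_cell(i, j)  global_board[(i) + (j) * ldboard]      // full board, root only
#define cell(i, j)         board[(i) + (j) * ldlboard]            // local block + ghosts
#define ngb(i, j)          nbngb[((i) - 1) + ((j) - 1) * ldnbngb] // 1-based counters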
int main(int argc, char* argv[])
{
  int num_alive = 0;
  int ldboard, ldnbngb;
  double t1, t2;
  double temps;

  if (argc < 3) {
    printf("Usage: %s nb_iterations size [nb_threads]\n", argv[0]);
    return EXIT_SUCCESS;
  } else {
    maxloop = atoi(argv[1]);
    BS = atoi(argv[2]);
    //printf("Running sequential version, grid of size %d, %d iterations\n", BS, maxloop);
  }
  if (argc > 3)
    nb_threads = atoi(argv[3]);

  num_alive = 0;

  /* Leading dimension of the board array */
  ldboard = BS + 2;
  /* Leading dimension of the neighbour counters array */
  ldnbngb = BS;

  _board = malloc( ldboard * ldboard * sizeof(int) );
  _nbngb = malloc( ldnbngb * ldnbngb * sizeof(int) );
  int *board = _board;

  num_alive = generate_initial_board( BS, &(cell(1, 1)), ldboard );

  pthread_t *threads = malloc(nb_threads * sizeof(*threads));
  nbdone = malloc(nb_threads * sizeof(*nbdone));
  for (int i = 0; i < nb_threads; i++)
    sem_init(nbdone + i, 0, 0);
  pthread_cond_init(&barrier_cond, NULL);
  pthread_mutex_init(&barrier_mut, NULL);

  printf("Starting number of living cells = %d\n", num_alive);
  t1 = mytimer();

  // spawn the worker threads; each computes its share of the board
  for (int i = 0; i < nb_threads; i++) {
    int *id = malloc(sizeof(*id));
    *id = i;
    pthread_create(threads + i, NULL, thread_compute, (void *)id);
  }

  // join the workers and accumulate their local living-cell counts
  num_alive = 0;
  for (int i = 0; i < nb_threads; i++) {
    void *result_alive;
    pthread_join(threads[i], &result_alive);
    num_alive += *(int*)result_alive;
    free(result_alive);
  }

  t2 = mytimer();
  temps = t2 - t1;
  printf("Final number of living cells = %d\n", num_alive);
  printf("%.2lf\n", (double)temps * 1.e3);

  free(_board);
  free(_nbngb);
  return EXIT_SUCCESS;
}
/** Write the content of the dictionary to the named file.  The existing
    content in the named file is overwritten.  The content of the dictionary
    file is laid out as follows.

    \li Signature "#IBIS Dictionary " and version number (currently 0x020000).
    (20 bytes)

    \li N = Number of strings in the file. (4 bytes)

    \li uint64_t[N+1]: the starting positions of the strings in this file.

    \li uint32_t[N]: The integer code corresponding to each string value.

    \li the string values packed one after the other with their nil terminators.
*/
int ibis::dictionary::write(const char* name) const {
    std::string evt = "dictionary::write";
    if (name == 0 || *name == 0) {
        LOGGER(ibis::gVerbose > 1)
            << "Warning -- " << evt
            << " can not proceed with a null string as the file name";
        return -1;
    }
    if (ibis::gVerbose > 1) {
        evt += '(';
        evt += name;
        evt += ')';
    }
    if (key_.size() > raw_.size()) {
        LOGGER(ibis::gVerbose > 1)
            << "Warning -- " << evt
            << " can not write an inconsistent dictionary, key_.size("
            << key_.size() << "), raw_.size(" << raw_.size() << ")";
        return -2;
    }

    ibis::util::timer mytimer(evt.c_str(), 4);
    FILE* fptr = fopen(name, "wb");
    if (fptr == 0) {
        LOGGER(ibis::gVerbose > 1)
            << "Warning -- " << evt << " failed to open the file ... "
            << (errno ? strerror(errno) : "no free stdio stream");
        return -3;
    }
    IBIS_BLOCK_GUARD(fclose, fptr);

    int ierr = fwrite(_fastbit_dictionary_header, 1, 20, fptr);
    if (ierr != 20) {
        LOGGER(ibis::gVerbose > 1)
            << "Warning -- " << evt
            << " failed to write the header, fwrite returned " << ierr;
        return -4;
    }

    const uint32_t nkeys = key_.size();
    ierr = fwrite(&nkeys, sizeof(nkeys), 1, fptr);
    if (ierr != 1) {
        LOGGER(ibis::gVerbose > 1)
            << "Warning -- " << evt << " failed to write the size(" << nkeys
            << "), fwrite returned " << ierr;
        return -5;
    }
    if (nkeys == 0) // nothing else to write
        return 0;

    mergeBuffers();
    array_t<uint64_t> pos(nkeys+1);
    array_t<uint32_t> qos(nkeys);

    pos.clear();
    qos.clear();
    pos.push_back(0);
    if (buffer_.size() == 1) {
        for (uint32_t j = 0; j < raw_.size(); ++ j) {
            if (raw_[j] != 0) {
                pos.push_back(1U + strlen(raw_[j]));
                qos.push_back(j);
            }
        }
        ierr = writeBuffer(fptr, nkeys, pos, qos);
    }
    else {
        ierr = writeKeys(fptr, nkeys, pos, qos);
    }
    LOGGER(ibis::gVerbose > 1)
        << evt << " complete with ierr = " << ierr;
    return ierr;
} // ibis::dictionary::write
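// Given the layout documented above, the size of a dictionary file is fully
// determined by the number of strings and their packed bytes. An illustrative
// helper (not part of FastBit) that works out that arithmetic:

#include <cstddef>

// Expected size: 20-byte signature+version, 4-byte count, (N+1) uint64_t
// offsets, N uint32_t codes, then the packed strings (each with its nil
// terminator).
std::size_t dictionary_file_size(std::size_t nkeys, std::size_t packed_string_bytes) {
    return 20 + 4 + 8 * (nkeys + 1) + 4 * nkeys + packed_string_bytes;
}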
/// Read the content of the named file.  The file content is read into the
/// buffer in one-shot and then digested.
int ibis::dictionary::read(const char* name) {
    if (name == 0 || *name == 0) return -1;
    std::string evt = "dictionary::read(";
    evt += name;
    evt += ')';

    // open the file to read
    int ierr = 0;
    FILE* fptr = fopen(name, "rb");
    if (fptr == 0) {
        LOGGER(ibis::gVerbose > 3)
            << "Warning -- " << evt << " failed to open the file ... "
            << (errno ? strerror(errno) : "no free stdio stream");
        return -2;
    }

    ibis::util::timer mytimer(evt.c_str(), 4);
    IBIS_BLOCK_GUARD(fclose, fptr);
    ierr = fseek(fptr, 0, SEEK_END); // to the end
    if (ierr != 0) {
        LOGGER(ibis::gVerbose > 1)
            << "Warning -- " << evt << " failed to seek to the end of the file";
        return -3;
    }

    long int sz = ftell(fptr); // file size
    if (sz < 24) { // must be the old style dictionary file
        return readRaw(evt.c_str(), fptr);
    }
    else {
        char header[20];
        ierr = fseek(fptr, 0, SEEK_SET);
        if (ierr != 0) {
            LOGGER(ibis::gVerbose > 1)
                << "Warning -- " << evt << " failed to seek to the beginning "
                "of the file";
            return -4;
        }
        ierr = fread(header, 1, 20, fptr);
        if (ierr != 20) {
            LOGGER(ibis::gVerbose > 1)
                << "Warning -- " << evt << " failed to read the 20-byte header";
            return -5;
        }
        // compare all 20 signature bytes at once (same behavior as the
        // original byte-by-byte comparison)
        if (memcmp(header, _fastbit_dictionary_header, 20) == 0) {
            // got the expected header
            return readKeys(evt.c_str(), fptr);
        }
        else {
            LOGGER(ibis::gVerbose > 2)
                << evt << " did not find the expected header, assume "
                "to be an old-style dictionary";
            return readRaw(evt.c_str(), fptr);
        }
    }
} // ibis::dictionary::read
/// Read the content of the named file.  The file content is read into the
/// buffer in one-shot and then digested.
///
/// This function determines the version of the dictionary and invokes the
/// necessary reading function to perform the actual reading operations.
/// Currently there are three possible versions of dictionaries, plus an
/// unmarked (headerless) form:
/// 0x02000000 - the version produced by the current write function,
/// 0x01000000 - the version with 64-bit offsets and consecutive keys, where
///              strings are stored in key order,
/// 0x00000000 - the version with 32-bit offsets that stores strings in
///              sorted order,
/// unmarked   - the version without a header; only has the bare strings in
///              the code order.
int ibis::dictionary::read(const char* name) {
    if (name == 0 || *name == 0) return -1;
    std::string evt = "dictionary::read";
    if (ibis::gVerbose > 1) {
        evt += '(';
        evt += name;
        evt += ')';
    }

    // open the file to read
    int ierr = 0;
    FILE* fptr = fopen(name, "rb");
    if (fptr == 0) {
        LOGGER(ibis::gVerbose > 3)
            << "Warning -- " << evt << " failed to open the file ... "
            << (errno ? strerror(errno) : "no free stdio stream");
        return -2;
    }

    ibis::util::timer mytimer(evt.c_str(), 4);
    IBIS_BLOCK_GUARD(fclose, fptr);
    ierr = fseek(fptr, 0, SEEK_END); // to the end
    if (ierr != 0) {
        LOGGER(ibis::gVerbose > 1)
            << "Warning -- " << evt << " failed to seek to the end of the file";
        return -3;
    }

    uint32_t version = 0xFFFFFFFFU;
    long int sz = ftell(fptr); // file size
    if (sz > 24) {
        char header[20];
        ierr = fseek(fptr, 0, SEEK_SET);
        if (ierr != 0) {
            LOGGER(ibis::gVerbose > 1)
                << "Warning -- " << evt << " failed to seek to the beginning "
                "of the file";
            return -4;
        }
        ierr = fread(header, 1, 20, fptr);
        if (ierr != 20) {
            LOGGER(ibis::gVerbose > 1)
                << "Warning -- " << evt << " failed to read the 20-byte header";
            return -5;
        }
        // the first 16 bytes are the signature, the last 4 the version number
        // (same behavior as the original byte-by-byte comparison)
        if (memcmp(header, _fastbit_dictionary_header, 16) == 0) {
            version = (header[16] << 24 | header[17] << 16 |
                       header[18] << 8  | header[19]);
            LOGGER(ibis::gVerbose > 3)
                << evt << " detected dictionary version 0x" << std::hex
                << version << std::dec;
        }
        else {
            LOGGER(ibis::gVerbose > 2)
                << evt << " did not find the expected header, assume "
                "to have no header (oldest version of dictionary)";
        }
    }

    // invoke the actual reader based on the version number
    switch (version) {
    case 0x02000000:
        ierr = readKeys2(evt.c_str(), fptr);
        break;
    case 0x01000000:
        ierr = readKeys1(evt.c_str(), fptr);
        break;
    case 0x00000000:
        ierr = readKeys0(evt.c_str(), fptr);
        break;
    default:
        ierr = readRaw(evt.c_str(), fptr);
        break;
    }
    if (ibis::gVerbose > 3) {
        ibis::util::logger lg;
        lg() << evt << " completed with ";
        toASCII(lg());
    }
    return ierr;
} // ibis::dictionary::read
int main(int argc, char* argv[])
{
  int i, j, loop, num_alive, maxloop;
  int ldgboard, ldboard, ldnbngb;
  double t1, t2;
  double temps;
  int *gboard;
  int *board;
  int *nbngb;
  int size;
  int coord[2], id;
  int procs_per_lines_col;

  MPI_Init(NULL, NULL);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  procs_per_lines_col = sqrt(size);
  if (procs_per_lines_col * procs_per_lines_col != size) {
    fprintf(stderr, "Please give a perfect-square number of processes!\n");
    MPI_Finalize();
    exit(EXIT_FAILURE);
  }

  int dims[2];
  dims[0] = procs_per_lines_col;
  dims[1] = procs_per_lines_col;
  int periods[2];
  periods[0] = 1;
  periods[1] = 1;
  MPI_Comm comm_cart;
  MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &comm_cart);
  MPI_Comm_rank(comm_cart, &id);
  MPI_Cart_coords(comm_cart, id, 2, coord);

  if (argc < 3) {
    printf("Usage: %s nb_iterations size\n", argv[0]);
    return EXIT_SUCCESS;
  } else {
    maxloop = atoi(argv[1]);
    BS = atoi(argv[2]);
    //printf("Running sequential version, grid of size %d, %d iterations\n", BS, maxloop);
  }
  num_alive = 0;

  //Generate the neighbours table
  /* Leading dimension of the global board array */
  ldgboard = BS + 2;
  /* Leading dimension of the board array */
  ldboard = BS/procs_per_lines_col + 2;
  /* Leading dimension of the neighbour counters array */
  ldnbngb = BS/procs_per_lines_col;

  board = malloc( ldboard * ldboard * sizeof(int) );
  nbngb = malloc( ldnbngb * ldnbngb * sizeof(int) );

  if (id == 0) {
    gboard = malloc(ldgboard * ldgboard * sizeof(int));
    num_alive = generate_initial_board( BS, &gboard[1+ldgboard], ldgboard );
    //fprintf(stderr,"Starting number of living cells = %d\n", num_alive);
  }

  MPI_Datatype block;
  MPI_Type_vector(ldboard-2, ldboard-2, ldgboard, MPI_INT, &block);
  MPI_Type_create_resized(block, 0, sizeof(int), &block);
  MPI_Type_commit(&block);

  MPI_Datatype subblock;
  MPI_Type_vector(ldboard-2, ldboard-2, ldboard, MPI_INT, &subblock);
  MPI_Type_create_resized(subblock, 0, sizeof(int), &subblock);
  MPI_Type_commit(&subblock);

  int * counts = (int*) malloc(size*sizeof(int));
  int * displs = (int*) malloc(size*sizeof(int));
  // Define the displacement of each process' block
  for (int i = 0; i < procs_per_lines_col; ++i) {
    for (int j = 0; j < procs_per_lines_col; ++j) {
      counts[i+j*procs_per_lines_col] = 1;
      displs[i+j*procs_per_lines_col] = i*ldgboard*(ldboard-2) + j*(ldboard-2);
    }
  }

  MPI_Scatterv(&gboard[1+ldgboard], counts, displs, block,
               &board[ldboard+1], 1, subblock, 0, comm_cart);

  int neighbours[8];
  make_neighbours_table(neighbours, comm_cart);

  MPI_Request req[8];
  int block_size = ldboard - 2;
  MPI_Datatype block_line;
  MPI_Type_vector(block_size+2, 1, ldboard, MPI_INT, &block_line);
  MPI_Type_commit(&block_line);

  t1 = mytimer();
  for (loop = 1; loop <= maxloop; loop++) {
    make_communications(req, comm_cart, neighbours, block_size, board, ldboard, block_line);
    /*
    cell( 0,    0   ) = cell(BS, BS);
    cell( 0,    BS+1) = cell(BS,  1);
    cell(BS+1,  0   ) = cell( 1, BS);
    cell(BS+1,  BS+1) = cell( 1,  1);
    for (i = 1; i <= BS; i++) {
      cell( i,    0   ) = cell( i, BS);
      cell( i,    BS+1) = cell( i,  1);
      cell( 0,    i   ) = cell(BS,  i);
      cell( BS+1, i   ) = cell( 1,  i);
    }
    */

    //Inner cells
    for (j = 2; j <= block_size; j++) {
      for (i = 2; i <= block_size; i++) {
        ngb( i, j ) =
          cell( i-1, j-1 ) + cell( i, j-1 ) + cell( i+1, j-1 ) +
          cell( i-1, j   ) +                  cell( i+1, j   ) +
          cell( i-1, j+1 ) + cell( i, j+1 ) + cell( i+1, j+1 );
      }
    }

    //On LEFT
    MPI_Wait(&req[0], MPI_STATUS_IGNORE);
    MPI_Wait(&req[4], MPI_STATUS_IGNORE);
    MPI_Wait(&req[6], MPI_STATUS_IGNORE);
    // compute the left column
    for (j = 1; j <= block_size; j++) {
      ngb( 1, j ) =
        cell( 0, j-1 ) + cell( 1, j-1 ) + cell( 2, j-1 ) +
        cell( 0, j   ) +                  cell( 2, j   ) +
        cell( 0, j+1 ) + cell( 1, j+1 ) + cell( 2, j+1 );
    }

    //On TOP
    MPI_Wait(&req[1], MPI_STATUS_IGNORE);
    MPI_Wait(&req[5], MPI_STATUS_IGNORE);
    // compute the top row
    for (i = 1; i <= block_size; i++) {
      ngb( i, 1 ) =
        cell( i-1, 0 ) + cell( i, 0 ) + cell( i+1, 0 ) +
        cell( i-1, 1 ) +                cell( i+1, 1 ) +
        cell( i-1, 2 ) + cell( i, 2 ) + cell( i+1, 2 );
    }

    //On RIGHT
    MPI_Wait(&req[2], MPI_STATUS_IGNORE);
    MPI_Wait(&req[7], MPI_STATUS_IGNORE);
    // compute the right column
    for (j = 1; j <= block_size; j++) {
      ngb( block_size, j ) =
        cell( block_size-1, j-1 ) + cell( block_size, j-1 ) + cell( block_size+1, j-1 ) +
        cell( block_size-1, j   ) +                           cell( block_size+1, j   ) +
        cell( block_size-1, j+1 ) + cell( block_size, j+1 ) + cell( block_size+1, j+1 );
    }

    //On BOTTOM
    MPI_Wait(&req[3], MPI_STATUS_IGNORE);
    // compute the bottom row
    for (i = 1; i <= block_size; i++) {
      ngb( i, block_size ) =
        cell( i-1, block_size-1 ) + cell( i, block_size-1 ) + cell( i+1, block_size-1 ) +
        cell( i-1, block_size   ) +                           cell( i+1, block_size   ) +
        cell( i-1, block_size+1 ) + cell( i, block_size+1 ) + cell( i+1, block_size+1 );
    }

    num_alive = 0;
    for (j = 1; j <= block_size; j++) {
      for (i = 1; i <= block_size; i++) {
        if ( (ngb( i, j ) < 2) || (ngb( i, j ) > 3) ) {
          cell(i, j) = 0;
        } else {
          if ((ngb( i, j )) == 3)
            cell(i, j) = 1;
        }
        if (cell(i, j) == 1) {
          num_alive++;
        }
      }
    }
    /* With the border cells (useful to check the MPI communications) */
    /* output_board( BS+2, &(cell(0, 0)), ldboard, loop ); */
    /* With only the "real" cells: start at element (1,1) */
    //output_board( BS, &(cell(1, 1)), ldboard, loop);
    //printf("%d cells are alive\n", num_alive);
  }

  MPI_Gatherv(&board[ldboard+1], 1, subblock,
              &gboard[ldgboard+1], counts, displs, block, 0, comm_cart);

  t2 = mytimer();
  temps = t2 - t1;
  MPI_Allreduce(MPI_IN_PLACE, &temps, 1, MPI_DOUBLE, MPI_MAX, comm_cart);
  MPI_Allreduce(MPI_IN_PLACE, &num_alive, 1, MPI_INT, MPI_SUM, comm_cart);
  if (id == 0) {
    //printf("Final number of living cells = %d\n", num_alive);
    printf("%.2lf\n", (double)temps * 1.e3);
  }

  free(board);
  free(nbngb);
  MPI_Finalize();
  return EXIT_SUCCESS;
}
void cg_solve(OperatorType& A,
              const VectorType& b,
              VectorType& x,
              Matvec matvec,
              typename OperatorType::LocalOrdinalType max_iter,
              typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& tolerance,
              typename OperatorType::LocalOrdinalType& num_iters,
              typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& normr,
              timer_type* my_cg_times)
{
  typedef typename OperatorType::ScalarType ScalarType;
  typedef typename OperatorType::GlobalOrdinalType GlobalOrdinalType;
  typedef typename OperatorType::LocalOrdinalType LocalOrdinalType;
  typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type;

  timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0, tMATVECDOT = 0;
  timer_type total_time = mytimer();

  int myproc = 0;
#ifdef HAVE_MPI
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
#endif

  if (!A.has_local_indices) {
    std::cerr << "miniFE::cg_solve ERROR, A.has_local_indices is false, needs to be true. This probably means "
              << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::cg_solve."
              << std::endl;
    return;
  }

  size_t nrows = A.rows.size();
  LocalOrdinalType ncols = A.num_cols;

  VectorType r(b.startIndex, nrows, 256);
  VectorType p(0, ncols, 512);
  VectorType Ap(b.startIndex, nrows, 64);

  normr = 0;
  magnitude_type rtrans = 0;
  magnitude_type oldrtrans = 0;

  LocalOrdinalType print_freq = max_iter/10;
  if (print_freq > 50) print_freq = 50;
  if (print_freq < 1)  print_freq = 1;

  ScalarType one = 1.0;
  ScalarType zero = 0.0;

  TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY);
  // print_vec(p.coefs, "p");
  TICK(); matvec(A, p, Ap); TOCK(tMATVEC);
  TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY);
  TICK(); rtrans = dot_r2(r); TOCK(tDOT);
  //std::cout << "rtrans="<<rtrans<<std::endl;

  normr = std::sqrt(rtrans);

  if (myproc == 0) {
    std::cout << "Initial Residual = " << normr << std::endl;
  }

  magnitude_type brkdown_tol = std::numeric_limits<magnitude_type>::epsilon();

#ifdef MINIFE_DEBUG
  std::ostream& os = outstream();
  os << "brkdown_tol = " << brkdown_tol << std::endl;
#endif

#ifdef MINIFE_DEBUG_OPENMP
  std::cout << "Starting CG Solve Phase..." << std::endl;
#endif

  for (LocalOrdinalType k=1; k <= max_iter && normr > tolerance; ++k) {
    if (k == 1) {
      //TICK(); waxpby(one, r, zero, r, p); TOCK(tWAXPY);
      TICK(); daxpby(one, r, zero, p); TOCK(tWAXPY);
    }
    else {
      oldrtrans = rtrans;
      TICK(); rtrans = dot_r2(r); TOCK(tDOT);
      const magnitude_type beta = rtrans/oldrtrans;
      TICK(); daxpby(one, r, beta, p); TOCK(tWAXPY);
    }

    normr = sqrt(rtrans);

    if (myproc == 0 && (k%print_freq==0 || k==max_iter)) {
      std::cout << "Iteration = " << k << "   Residual = " << normr << std::endl;
    }

    magnitude_type alpha = 0;
    magnitude_type p_ap_dot = 0;

    TICK(); matvec(A, p, Ap); TOCK(tMATVEC);
    TICK(); p_ap_dot = dot(Ap, p); TOCK(tDOT);

#ifdef MINIFE_DEBUG
    os << "iter " << k << ", p_ap_dot = " << p_ap_dot;
    os.flush();
#endif
    if (p_ap_dot < brkdown_tol) {
      if (p_ap_dot < 0 || breakdown(p_ap_dot, Ap, p)) {
        std::cerr << "miniFE::cg_solve ERROR, numerical breakdown!" << std::endl;
#ifdef MINIFE_DEBUG
        os << "ERROR, numerical breakdown!" << std::endl;
#endif
        //update the timers before jumping out.
        my_cg_times[WAXPY] = tWAXPY;
        my_cg_times[DOT] = tDOT;
        my_cg_times[MATVEC] = tMATVEC;
        my_cg_times[TOTAL] = mytimer() - total_time;
        return;
      }
      else brkdown_tol = 0.1 * p_ap_dot;
    }
    alpha = rtrans/p_ap_dot;
#ifdef MINIFE_DEBUG
    os << ", rtrans = " << rtrans << ", alpha = " << alpha << std::endl;
#endif

    TICK(); daxpby(alpha, p, one, x);
            daxpby(-alpha, Ap, one, r); TOCK(tWAXPY);

    num_iters = k;
  }

  my_cg_times[WAXPY] = tWAXPY;
  my_cg_times[DOT] = tDOT;
  my_cg_times[MATVEC] = tMATVEC;
  my_cg_times[MATVECDOT] = tMATVECDOT;
  my_cg_times[TOTAL] = mytimer() - total_time;
}
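// This cg_solve variant uses an in-place daxpby alongside miniFE's usual
// three-operand waxpby. A minimal sketch of the semantics assumed by the calls
// above, where daxpby(alpha, x, beta, y) computes y = alpha*x + beta*y; the
// real miniFE kernel is templated on the vector type and may differ:

// Hypothetical scalar sketch: y = alpha*x + beta*y, in place.
template<typename Vector, typename Scalar>
void daxpby_sketch(Scalar alpha, const Vector& x, Scalar beta, Vector& y) {
  for (size_t i = 0; i < y.coefs.size(); ++i)
    y.coefs[i] = alpha * x.coefs[i] + beta * y.coefs[i];
}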
int driver(const Box& global_box, Box& my_box,
           Parameters& params, YAML_Doc& ydoc)
{
  int global_nx = global_box[0][1];
  int global_ny = global_box[1][1];
  int global_nz = global_box[2][1];

  int numprocs = 1, myproc = 0;
#ifdef HAVE_MPI
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
#endif

  if (params.load_imbalance > 0) {
    add_imbalance<GlobalOrdinal>(global_box, my_box, params.load_imbalance, ydoc);
  }

  float largest_imbalance = 0, std_dev = 0;
  compute_imbalance<GlobalOrdinal>(global_box, my_box, largest_imbalance,
                                   std_dev, ydoc, true);

  //Create a representation of the mesh:
  //Note that 'simple_mesh_description' is a virtual or conceptual
  //mesh that doesn't actually store mesh data.
#ifdef TIME_IT
  if (myproc==0) {
    std::cout.width(30);
    std::cout << "creating/filling mesh...";
    std::cout.flush();
  }
#endif

  timer_type t_start = mytimer();
  timer_type t0 = mytimer();

  simple_mesh_description<GlobalOrdinal> mesh(global_box, my_box);

  timer_type mesh_fill = mytimer() - t0;
  timer_type t_total = mytimer() - t_start;

#ifdef TIME_IT
  if (myproc==0) {
    std::cout << mesh_fill << "s, total time: " << t_total << std::endl;
  }
#endif

  //next we will generate the matrix structure.

  //Declare matrix object:
#if defined(MINIFE_ELL_MATRIX)
  typedef ELLMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType;
#else
  typedef CSRMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType;
#endif

  MatrixType A;

  timer_type gen_structure;
  RUN_TIMED_FUNCTION("generating matrix structure...",
                     generate_matrix_structure(mesh, A),
                     gen_structure, t_total);

  GlobalOrdinal local_nrows = A.rows.size();
  GlobalOrdinal my_first_row = local_nrows > 0 ? A.rows[0] : -1;

  Vector<Scalar,LocalOrdinal,GlobalOrdinal> b(my_first_row, local_nrows);
  Vector<Scalar,LocalOrdinal,GlobalOrdinal> x(my_first_row, local_nrows);

  //Assemble finite-element sub-matrices and sub-vectors into the global
  //linear system:

  timer_type fe_assembly;
  RUN_TIMED_FUNCTION("assembling FE data...",
                     assemble_FE_data(mesh, A, b, params),
                     fe_assembly, t_total);

  if (myproc == 0) {
    ydoc.add("Matrix structure generation","");
    ydoc.get("Matrix structure generation")->add("Mat-struc-gen Time",gen_structure);
    ydoc.add("FE assembly","");
    ydoc.get("FE assembly")->add("FE assembly Time",fe_assembly);
  }

#ifdef MINIFE_DEBUG
  write_matrix("A_prebc.mtx", A);
  write_vector("b_prebc.vec", b);
#endif

  //Now apply dirichlet boundary-conditions
  //(Apply the 0-valued surfaces first, then the 1-valued surface last.)

  timer_type dirbc_time;
  RUN_TIMED_FUNCTION("imposing Dirichlet BC...",
                     impose_dirichlet(0.0, A, b, global_nx+1, global_ny+1,
                                      global_nz+1, mesh.bc_rows_0),
                     dirbc_time, t_total);
  RUN_TIMED_FUNCTION("imposing Dirichlet BC...",
                     impose_dirichlet(1.0, A, b, global_nx+1, global_ny+1,
                                      global_nz+1, mesh.bc_rows_1),
                     dirbc_time, t_total);

#ifdef MINIFE_DEBUG
  write_matrix("A.mtx", A);
  write_vector("b.vec", b);
#endif

  //Transform global indices to local, set up communication information:

  timer_type make_local_time;
  RUN_TIMED_FUNCTION("making matrix indices local...",
                     make_local_matrix(A),
                     make_local_time, t_total);

#ifdef MINIFE_DEBUG
  write_matrix("A_local.mtx", A);
  write_vector("b_local.vec", b);
#endif

  size_t global_nnz = compute_matrix_stats(A, myproc, numprocs, ydoc);

  //Prepare to perform conjugate gradient solve:

  LocalOrdinal max_iters = 200;
  LocalOrdinal num_iters = 0;
  typedef typename TypeTraits<Scalar>::magnitude_type magnitude;
  magnitude rnorm = 0;
  magnitude tol = std::numeric_limits<magnitude>::epsilon();

  timer_type cg_times[NUM_TIMERS];

  typedef Vector<Scalar,LocalOrdinal,GlobalOrdinal> VectorType;

  t_total = mytimer() - t_start;

  bool matvec_with_comm_overlap = params.mv_overlap_comm_comp==1;

  int verify_result = 0;

#if MINIFE_KERNELS != 0
  if (myproc==0) {
    std::cout.width(30);
    std::cout << "Starting kernel timing loops ..." << std::endl;
  }

  max_iters = 500;
  x.coefs[0] = 0.9;
  if (matvec_with_comm_overlap) {
    time_kernels(A, b, x, matvec_overlap<MatrixType,VectorType>(), max_iters, rnorm, cg_times);
  }
  else {
    time_kernels(A, b, x, matvec_std<MatrixType,VectorType>(), max_iters, rnorm, cg_times);
  }
  num_iters = max_iters;
  std::string title("Kernel timings");
#else
  if (myproc==0) {
    std::cout << "Starting CG solver ... " << std::endl;
  }

  if (matvec_with_comm_overlap) {
#ifdef MINIFE_CSR_MATRIX
    rearrange_matrix_local_external(A);
    cg_solve(A, b, x, matvec_overlap<MatrixType,VectorType>(), max_iters, tol,
             num_iters, rnorm, cg_times);
#else
    std::cout << "ERROR, matvec with overlapping comm/comp only works with CSR matrix."<<std::endl;
#endif
  }
  else {
    cg_solve(A, b, x, matvec_std<MatrixType,VectorType>(), max_iters, tol,
             num_iters, rnorm, cg_times);
    if (myproc == 0) {
      std::cout << "Final Resid Norm: " << rnorm << std::endl;
    }

    if (params.verify_solution > 0) {
      double tolerance = 0.06;
      bool verify_whole_domain = false;
#ifdef MINIFE_DEBUG
      verify_whole_domain = true;
#endif
      if (myproc == 0) {
        if (verify_whole_domain) std::cout << "verifying solution..." << std::endl;
        else std::cout << "verifying solution at ~ (0.5, 0.5, 0.5) ..." << std::endl;
      }
      verify_result = verify_solution(mesh, x, tolerance, verify_whole_domain);
    }
  }

#ifdef MINIFE_DEBUG
  write_vector("x.vec", x);
#endif
  std::string title("CG solve");
#endif

  if (myproc == 0) {
    ydoc.get("Global Run Parameters")->add("ScalarType",TypeTraits<Scalar>::name());
    ydoc.get("Global Run Parameters")->add("GlobalOrdinalType",TypeTraits<GlobalOrdinal>::name());
    ydoc.get("Global Run Parameters")->add("LocalOrdinalType",TypeTraits<LocalOrdinal>::name());
    ydoc.add(title,"");
    ydoc.get(title)->add("Iterations",num_iters);
    ydoc.get(title)->add("Final Resid Norm",rnorm);

    GlobalOrdinal global_nrows = global_nx;
    global_nrows *= global_ny*global_nz;

    //flops-per-mv, flops-per-dot, flops-per-waxpy:
    double mv_flops = global_nnz*2.0;
    double dot_flops = global_nrows*2.0;
    double waxpy_flops = global_nrows*3.0;

#if MINIFE_KERNELS == 0
    //if MINIFE_KERNELS == 0 then we did a CG solve, and in that case
    //there were num_iters+1 matvecs, num_iters*2 dots, and num_iters*3+2 waxpys.
    mv_flops    *= (num_iters+1);
    dot_flops   *= (2*num_iters);
    waxpy_flops *= (3*num_iters+2);
#else
    //if MINIFE_KERNELS then we did one of each operation per iteration.
    mv_flops    *= num_iters;
    dot_flops   *= num_iters;
    waxpy_flops *= num_iters;
#endif

    double total_flops = mv_flops + dot_flops + waxpy_flops;

    double mv_mflops = -1;
    if (cg_times[MATVEC] > 1.e-4)
      mv_mflops = 1.e-6 * (mv_flops/cg_times[MATVEC]);

    double dot_mflops = -1;
    if (cg_times[DOT] > 1.e-4)
      dot_mflops = 1.e-6 * (dot_flops/cg_times[DOT]);

    double waxpy_mflops = -1;
    if (cg_times[WAXPY] > 1.e-4)
      waxpy_mflops = 1.e-6 * (waxpy_flops/cg_times[WAXPY]);

    double total_mflops = -1;
    if (cg_times[TOTAL] > 1.e-4)
      total_mflops = 1.e-6 * (total_flops/cg_times[TOTAL]);

    ydoc.get(title)->add("WAXPY Time",cg_times[WAXPY]);
    ydoc.get(title)->add("WAXPY Flops",waxpy_flops);
    if (waxpy_mflops >= 0) ydoc.get(title)->add("WAXPY Mflops",waxpy_mflops);
    else                   ydoc.get(title)->add("WAXPY Mflops","inf");

    ydoc.get(title)->add("DOT Time",cg_times[DOT]);
    ydoc.get(title)->add("DOT Flops",dot_flops);
    if (dot_mflops >= 0) ydoc.get(title)->add("DOT Mflops",dot_mflops);
    else                 ydoc.get(title)->add("DOT Mflops","inf");

    ydoc.get(title)->add("MATVEC Time",cg_times[MATVEC]);
    ydoc.get(title)->add("MATVEC Flops",mv_flops);
    if (mv_mflops >= 0) ydoc.get(title)->add("MATVEC Mflops",mv_mflops);
    else                ydoc.get(title)->add("MATVEC Mflops","inf");

#ifdef MINIFE_FUSED
    ydoc.get(title)->add("MATVECDOT Time",cg_times[MATVECDOT]);
    ydoc.get(title)->add("MATVECDOT Flops",mv_flops);
    if (mv_mflops >= 0) ydoc.get(title)->add("MATVECDOT Mflops",mv_mflops);
    else                ydoc.get(title)->add("MATVECDOT Mflops","inf");
#endif

#if MINIFE_KERNELS == 0
    ydoc.get(title)->add("Total","");
    ydoc.get(title)->get("Total")->add("Total CG Time",cg_times[TOTAL]);
    ydoc.get(title)->get("Total")->add("Total CG Flops",total_flops);
    if (total_mflops >= 0) ydoc.get(title)->get("Total")->add("Total CG Mflops",total_mflops);
    else                   ydoc.get(title)->get("Total")->add("Total CG Mflops","inf");
    ydoc.get(title)->add("Time per iteration",cg_times[TOTAL]/num_iters);
#endif
  }

  return verify_result;
}
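// As a sanity check of the flop accounting in driver() above, a worked example
// with invented sizes (no run of this code produced these numbers):
//   global_nrows = 1.0e6, global_nnz = 2.7e7, num_iters = 200 gives
//     mv_flops    = 2*global_nnz   * (num_iters+1)   = 5.4e7 * 201 ~ 1.09e10
//     dot_flops   = 2*global_nrows * (2*num_iters)   = 2.0e6 * 400 = 8.0e8
//     waxpy_flops = 3*global_nrows * (3*num_iters+2) = 3.0e6 * 602 ~ 1.81e9
// so the matvec dominates the total flop count, as expected for CG.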
gint anim_next_frame(struct model_pak *model)
{
  gulong time;
  gchar *text, *name;

  g_assert(model != NULL);

  /* increment and test if we should return to the start */
  model->cur_frame += model->anim_step;
  if (model->cur_frame >= model->num_frames)
    if (model->anim_loop)
      model->cur_frame = 1;

  /* continue until we run out of frames (or a stop is flagged) */
  if (model->cur_frame < model->num_frames && model->animating) {
#if DEBUG_DISPLAY_NEXT_FRAME
    printf("displaying [%d]\n", model->cur_frame);
#endif
    time = mytimer();

    /* if a dialog exists - update via the current frame spinner */
    if (dialog_exists(ANIM, model))
      gui_relation_update(model);
    else {
      /* otherwise, update manually */
      read_frame(model->afp, model->cur_frame, model);
      meas_graft_model(model);
      gui_active_refresh();
      redraw_canvas(SINGLE);
    }

    /* animation adjusted redraw time */
    time = mytimer() - time;
    model->redraw_cumulative += time;

    /* NEW - render to file */
    if (sysenv.render.animate) {
      text = g_strdup_printf("%s_%06d.pov", sysenv.render.animate_file, model->cur_frame);
      name = g_build_filename(sysenv.cwd, text, NULL);
      write_povray(name, model);

      /* NB: added this as jago keeps locking up on multi-frame renders */
      if (!sysenv.render.no_povray_exec)
        povray_exec(name);

      g_free(text);
      g_free(name);
    }
    return(TRUE);
  }

  /* FIXME - find a better way to do this... */
  if (!model->transform_list)
    fclose(model->afp);

  /* done animation */
  model->animating = FALSE;
  model->cur_frame--;

  /* create movie? */
  if (sysenv.render.animate && !sysenv.render.no_povray_exec) {
    text = NULL;
    switch (sysenv.render.animate_type) {
      case ANIM_GIF:
        text = g_strdup_printf("%s -delay %d %s_*.tga %s.gif",
                               sysenv.convert_path, (gint) sysenv.render.delay,
                               sysenv.render.animate_file, sysenv.render.animate_file);
        break;
      case ANIM_MPEG:
        text = g_strdup_printf("%s -quality %d -delay %d %s_*.tga %s.mpg",
                               sysenv.convert_path, (gint) sysenv.render.mpeg_quality,
                               (gint) sysenv.render.delay,
                               sysenv.render.animate_file, sysenv.render.animate_file);
        break;
    }
    if (text) {
      system(text);
      g_free(text);
      gui_text_show(DEFAULT, "Completed movie creation.\n");
    }
  }

  /* cleanup */
  if (sysenv.render.no_keep_tempfiles) {
#ifndef __WIN32
    text = g_strdup_printf("rm -rf %s_*.pov", sysenv.render.animate_file);
    system(text);
    g_free(text);
    text = g_strdup_printf("rm -rf %s_*.tga", sysenv.render.animate_file);
    system(text);
    g_free(text);
#endif
    /* TODO - windows equivalents */
  }

  /* done - return FALSE to terminate the timer */
  return(FALSE);
}
int main(int argc, char* argv[])
{
    int i, j, loop, num_alive, maxloop;
    int ldboard, ldnbngb;
    double t1, t2;
    double temps;

    int *board;
    int *nbngb;

    if (argc < 2) {
        maxloop = 10;
    } else {
        maxloop = atoi(argv[1]);
        if (argc > 2)
            BS = atoi(argv[2]);
        if (argc > 3)
            num_threads = atoi(argv[3]);
    }
    omp_set_num_threads(num_threads);
    num_alive = 0;

    /* Leading dimension of the board array */
    ldboard = BS + 2;
    /* Leading dimension of the neighbour counters array */
    ldnbngb = BS;

    board = malloc( ldboard * ldboard * sizeof(int) );
    nbngb = malloc( ldnbngb * ldnbngb * sizeof(int) );

    num_alive = generate_initial_board( BS, &(cell(1, 1)), ldboard );
#ifdef OUTPUT_BOARD
    output_board( BS, &(cell(1, 1)), ldboard, 0 );
#endif
    printf("Starting number of living cells = %d\n", num_alive);
    t1 = mytimer();

    for (loop = 1; loop <= maxloop; loop++) {

        /* copy corner and edge cells for the periodic boundaries */
        cell(   0,    0) = cell(BS, BS);
        cell(   0, BS+1) = cell(BS,  1);
        cell(BS+1,    0) = cell( 1, BS);
        cell(BS+1, BS+1) = cell( 1,  1);

#pragma omp parallel for
        for (i = 1; i <= BS; i++) {
            cell(   i,    0) = cell( i, BS);
            cell(   i, BS+1) = cell( i,  1);
            cell(   0,    i) = cell(BS,  i);
            cell(BS+1,    i) = cell( 1,  i);
        }

#pragma omp parallel for private(i)
        for (j = 1; j <= BS; j++) {
            for (i = 1; i <= BS; i++) {
                ngb( i, j ) =
                    cell( i-1, j-1 ) + cell( i, j-1 ) + cell( i+1, j-1 ) +
                    cell( i-1, j   ) +                  cell( i+1, j   ) +
                    cell( i-1, j+1 ) + cell( i, j+1 ) + cell( i+1, j+1 );
            }
        }

        num_alive = 0;
#pragma omp parallel for private(i) reduction(+:num_alive)
        for (j = 1; j <= BS; j++) {
            for (i = 1; i <= BS; i++) {
                if ( (ngb( i, j ) < 2) || (ngb( i, j ) > 3) ) {
                    cell(i, j) = 0;
                } else {
                    if ((ngb( i, j )) == 3)
                        cell(i, j) = 1;
                }
                if (cell(i, j) == 1)
                    num_alive++;
            }
        }

#ifdef OUTPUT_BOARD
        output_board( BS, &(cell(1, 1)), ldboard, loop);
#endif
#ifdef PRINT_ALIVE
        printf("%d \n", num_alive);
#endif
    }

    t2 = mytimer();
    temps = t2 - t1;
    printf("Final number of living cells = %d\n", num_alive);
    printf("time=%.2lf ms\n", (double)temps * 1.e3);

#ifdef BENCH
    char fname[40];
    sprintf(fname, "time_omp_%d.dat", num_threads);
    FILE* f = fopen(fname, "w");
    if (f != NULL) {
        fprintf(f, "%.2lf", temps * 1.e3);
        fclose(f);   /* only close the file if it was actually opened */
    }
#endif

#ifdef OUTPUT_BOARD
    output_board( BS, &(cell(1, 1)), ldboard, maxloop);
#endif

    free(board);
    free(nbngb);
    return EXIT_SUCCESS;
}
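The cell and ngb accessors used in both Game of Life versions are macros over the flat board and nbngb arrays; their definitions are not part of these snippets. A plausible sketch, assuming column-major storage with the leading dimensions computed above (cell indices include the halo, ngb indices cover only the interior):

/* Hypothetical definitions -- the real macros live elsewhere in this
   code base. Column-major indexing with leading dimensions
   ldboard / ldnbngb. */
#define cell(i, j)  board[(i) + (j)*ldboard]
#define ngb(i, j)   nbngb[((i)-1) + ((j)-1)*ldnbngb]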
int main(int argc, char* argv[])
{
    MPI_Init(NULL, NULL);

    int rank, size;
    int loop, num_alive, loop_iterations;
    int ldboard, ldnbngb, ldglobalboard;
    double t1, time, final_time;
    int periods[2] = {1, 1};

    int *globboard  = NULL;
    int *globboard2 = NULL;
    int *board;
    int *nbngb;

    /* Initialization of MPI */
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    MPI_Comm_size( MPI_COMM_WORLD, &size );

    if (argc >= 2) {
        if (!strcmp("-h", argv[1])) {
            if (!rank)
                helper();
            MPI_Finalize();
            return EXIT_SUCCESS;
        }
    }

    int i, j;
    int process_per_row    = (int)sqrt(size);
    int process_per_column = (int)sqrt(size);
    int dims[2] = {process_per_row, process_per_column};

    /* This decomposition only works when the number of processes is a
       perfect square. */
    if (size != process_per_column * process_per_row) {
        fprintf(stderr, "The number of MPI processes must be a perfect square.\nExiting program.\n");
        MPI_Finalize();
        return EXIT_FAILURE;
    }

    /* Initialize the periodic cartesian grid */
    MPI_Comm grid;
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &grid);
    MPI_Comm_rank(grid, &rank);

    /* User input */
    if (argc < 2) {
        loop_iterations = 10;
        BS = 30;
    } else {
        loop_iterations = atoi(argv[1]);
        if (argc > 2)
            BS = atoi(argv[2]);
        else
            BS = 30;
    }
    num_alive = 0;

    /* Leading dimension of the global board array (+2 for the halo rows
       and columns around the board) */
    ldglobalboard = BS + 2;
    /* Leading dimension of the local board array (+2 for the halo) */
    ldboard = BS/process_per_row + 2;
    /* Leading dimension of the neighbour-count array; every process holds
       the same number of interior cells */
    ldnbngb = BS/process_per_row;

    /* Local cell board */
    board = (int *)malloc( ldboard * ldboard * sizeof(int) );
    nbngb = (int *)malloc( ldnbngb * ldnbngb * sizeof(int) );

    /* Global cell board, held by rank 0 only */
    if (!rank) {
        globboard  = (int *)malloc(ldglobalboard * ldglobalboard * sizeof(int));
        globboard2 = (int *)malloc(ldglobalboard * ldglobalboard * sizeof(int));
        num_alive = generate_initial_board( BS, &globboard[1+ldglobalboard], ldglobalboard );
        output_board( BS, &globboard[1+ldglobalboard], ldglobalboard, 0 );
        fprintf(stderr, "Starting number of living cells = %d\n", num_alive);
    }

    /* Matrix block type used to scatter/gather the global board */
    MPI_Datatype block2, block;
    MPI_Type_vector(ldboard-2, ldboard-2, ldglobalboard, MPI_INT, &block2);
    MPI_Type_create_resized(block2, 0, sizeof(int), &block);
    MPI_Type_commit(&block);

    /* Matrix sub-block type describing each process's local interior */
    MPI_Datatype sub_block2, sub_block;
    MPI_Type_vector(ldboard-2, ldboard-2, ldboard, MPI_INT, &sub_block2);
    MPI_Type_create_resized(sub_block2, 0, sizeof(int), &sub_block);
    MPI_Type_commit(&sub_block);

    /* One block per process, plus the displacement of each process's block
       in the global board (prepares the scatter of the matrix) */
    int *process_count      = (int*)malloc(size*sizeof(int));
    int *cell_per_processes = (int*)malloc(size*sizeof(int));
    for (i = 0; i < process_per_row; ++i) {
        for (j = 0; j < process_per_column; ++j) {
            process_count[i+j*process_per_column] = 1;
            cell_per_processes[i+j*process_per_column] =
                i*ldglobalboard*(ldboard-2) + j*(ldboard-2);
        }
    }

    /* Explode the global matrix into one sub_block per process */
    MPI_Scatterv(&globboard[1+ldglobalboard], process_count, cell_per_processes,
                 block, &board[ldboard+1], 1, sub_block, 0, grid);

    /* Initialize, for each process, the table of its neighbours */
    int neighbours[8];
    neighbour_table(neighbours, grid, rank);

    /* Time to begin */
    t1 = mytimer();

    int blocksize = ldboard - 2;
    MPI_Datatype row_blocks;
    MPI_Type_vector(blocksize, 1, ldboard, MPI_INT, &row_blocks);
    MPI_Type_commit(&row_blocks);

    /* Status used while waiting on communications, plus one request per
       possible neighbour (8 in the worst case) */
    MPI_Status mpi_status;
    MPI_Request cart_request[8];

    for (loop = 1; loop <= loop_iterations; ++loop) {

        /* Start communications to send and receive the halos of the
           neighbouring processes */
        inter_proc_communications(cart_request, neighbours, grid, blocksize,
                                  board, ldboard, row_blocks);

        /* Compute the interior cells, which need no halo data */
        for (j = 2; j <= blocksize-1; ++j) {
            for (i = 2; i <= blocksize-1; ++i) {
                ngb( i, j ) =
                    cell( i-1, j-1 ) + cell( i, j-1 ) + cell( i+1, j-1 ) +
                    cell( i-1, j   ) +                  cell( i+1, j   ) +
                    cell( i-1, j+1 ) + cell( i, j+1 ) + cell( i+1, j+1 );
            }
        }

        /* Compute the cells on the border.
         *
         * Cell neighbour composition -- slots become usable as each wait
         * completes:
         *
         *  4 2 5      4        4 2 5      4 2 5      4 2 5
         *  0 X 1 -->  0   -->  0     -->  0 1   -->  0 1
         *  6 3 7      6        6          6 7        6 3 7
         */

        /* The left column needs data from the left process --> 4, 0, 6 */
        MPI_Wait(&cart_request[0], &mpi_status);
        MPI_Wait(&cart_request[4], &mpi_status);
        MPI_Wait(&cart_request[6], &mpi_status);
        process_frontier(1, blocksize, board, COLUMN, ldboard, nbngb, ldnbngb);

        /* The top row needs data from the process above --> 2, 5 */
        MPI_Wait(&cart_request[2], &mpi_status);
        MPI_Wait(&cart_request[5], &mpi_status);
        process_frontier(1, blocksize, board, ROW, ldboard, nbngb, ldnbngb);

        /* The right column needs data from the right process --> 1, 7 */
        MPI_Wait(&cart_request[1], &mpi_status);
        MPI_Wait(&cart_request[7], &mpi_status);
        process_frontier(blocksize, blocksize, board, COLUMN, ldboard, nbngb, ldnbngb);

        /* The bottom row needs data from the process below --> 3 */
        MPI_Wait(&cart_request[3], &mpi_status);
        process_frontier(blocksize, blocksize, board, ROW, ldboard, nbngb, ldnbngb);

        /* Update the cells */
        num_alive = 0;
        for (j = 1; j <= blocksize; ++j) {
            for (i = 1; i <= blocksize; ++i) {
                if ( (ngb( i, j ) < 2) || (ngb( i, j ) > 3) ) {
                    cell(i, j) = 0;
                } else {
                    if ((ngb( i, j )) == 3)
                        cell(i, j) = 1;
                }
                if (cell(i, j) == 1)
                    num_alive += 1;
            }
        }
        printf("%d \n", num_alive);
    }

    /* Reassemble the global matrix from the per-process sub-blocks */
    MPI_Gatherv(&board[ldboard+1], 1, sub_block, &globboard2[1+ldglobalboard],
                process_count, cell_per_processes, block, 0, grid);

    /* Reduction to determine the maximum execution time over all ranks */
    time = mytimer() - t1;
    MPI_Allreduce(&time, &final_time, 1, MPI_DOUBLE, MPI_MAX, grid);

    /* Reduction to determine the number of cells still alive across all
       processes */
    MPI_Allreduce(MPI_IN_PLACE, &num_alive, 1, MPI_INT, MPI_SUM, grid);

    /* The end: rank 0 reports the results (using the reduced maximum time) */
    if (!rank) {
        printf("Final number of living cells = %d\n", num_alive);
        printf("time=%.2lf ms\n", (double)final_time * 1.e3);

        /* write the timing to a per-run debug file */
        char str[100];
        sprintf(str, "mpi_debug_%d.dat", size);
        FILE *fd = fopen(str, "w");
        if (fd != NULL)
            fprintf(fd, "%.2lf", final_time*1.e3);
        else
            exit(EXIT_FAILURE);
        fclose(fd);

        output_board( BS, &globboard2[1+ldglobalboard], ldglobalboard, loop_iterations);
    }

    /* Free everything */
    free(process_count);
    free(cell_per_processes);
    free(board);
    free(nbngb);

    MPI_Finalize();
    return EXIT_SUCCESS;
}
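neighbour_table and inter_proc_communications are helpers defined elsewhere; the loop above only relies on the slot convention in the diagram (0..3 for the left/right/top/bottom edges, 4..7 for the corners). A minimal sketch of the rank lookup, assuming the helper works off the periodic Cartesian communicator created above:

/* Hypothetical sketch: fill neighbours[0..7] with the ranks adjacent to
   this process in the periodic Cartesian grid. Slots follow the diagram
   above: 0=left 1=right 2=above 3=below 4..7=corners. */
void neighbour_table(int neighbours[8], MPI_Comm grid, int rank)
{
    int coords[2], c[2], dims[2], periods[2];
    (void)rank;  /* the rank is implicit in the communicator */
    MPI_Cart_get(grid, 2, dims, periods, coords);

    /* offsets (drow, dcol) for the eight neighbours, in slot order */
    const int off[8][2] = { {0,-1}, {0,1}, {-1,0}, {1,0},
                            {-1,-1}, {-1,1}, {1,-1}, {1,1} };
    for (int k = 0; k < 8; k++) {
        c[0] = coords[0] + off[k][0];
        c[1] = coords[1] + off[k][1];
        MPI_Cart_rank(grid, c, &neighbours[k]);  /* periodic grid wraps */
    }
}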
ibis::table*
ibis::jRange::select(const ibis::table::stringArray& colnames) const {
    ibis::table *res = 0;
    if (nrows < 0) {
        int64_t ierr = count();
        if (ierr < 0) {
            LOGGER(ibis::gVerbose > 0)
                << "Warning -- jRange::count failed with error code " << ierr;
            return res;
        }
    }
    if (valr_ == 0 || orderr_ == 0 || vals_ == 0 || orders_ == 0 ||
        orderr_->size() != maskr_.cnt() || orders_->size() != masks_.cnt()) {
        LOGGER(ibis::gVerbose > 0)
            << "Warning -- jRange::select failed to evaluate the join";
        return res;
    }
    if (colnames.empty() || nrows == 0) {
        std::string nm = ibis::util::shortName(desc_);
        res = new ibis::tabula(nm.c_str(), desc_.c_str(), nrows);
        return res;
    }

    const uint32_t ncols = colnames.size();
    std::string evt;
    evt = "select ";
    evt += colnames[0];
    for (uint32_t j = 1; j < ncols; ++ j) {
        evt += ", ";
        evt += colnames[j];
    }
    if ((desc_[0] != 'F' && desc_[0] != 'f') ||
        (desc_[1] != 'R' && desc_[1] != 'r') ||
        (desc_[2] != 'O' && desc_[2] != 'o') ||
        (desc_[3] != 'M' && desc_[3] != 'm'))
        evt += " for ";
    else
        evt += ' ';
    evt += desc_;
    ibis::util::timer mytimer(evt.c_str());
    std::map<const char*, uint32_t, ibis::lessi> namesToPos;
    std::vector<uint32_t> ipToPos(colnames.size());
    std::vector<const ibis::column*> ircol, iscol;
    std::vector<const ibis::dictionary*> cats(colnames.size(), 0);

    // identify the names from the two data partitions
    for (uint32_t j = 0; j < ncols; ++ j) {
        ipToPos[j] = ncols+1;
        const char* cn = colnames[j];
        std::string tname;
        while (*cn != 0 && *cn != '.') {
            tname += *cn;
            ++ cn;
        }
        if (*cn == '.') {
            ++ cn;
        }
        else { // did not find '.'
            tname.erase();
            cn = colnames[j];
        }
        int match = -1; // 0 ==> partr_, 1 ==> parts_
        if (! tname.empty()) {
            match = frm_->position(tname.c_str());
            if (match >= static_cast<long>(frm_->size())) {
                if (stricmp(tname.c_str(), partr_.name()) == 0) {
                    match = 0;
                }
                else if (stricmp(tname.c_str(), parts_.name()) == 0) {
                    match = 1;
                }
            }
        }
        if (match == 0) {
            const ibis::column *col = partr_.getColumn(cn);
            if (col != 0) {
                namesToPos[colnames[j]] = j;
                ipToPos[j] = ircol.size();
                ircol.push_back(col);
                if (col->type() == ibis::CATEGORY) {
                    const ibis::category *cat =
                        static_cast<const ibis::category*>(col);
                    cats[j] = cat->getDictionary();
                }
                else if (col->type() == ibis::UINT) {
                    const ibis::bord::column *bc =
                        dynamic_cast<const ibis::bord::column*>(col);
                    if (bc != 0)
                        cats[j] = bc->getDictionary();
                }
            }
            else {
                LOGGER(ibis::gVerbose > 0)
                    << "Warning -- " << evt << " can not find column named \""
                    << colnames[j] << "\" in data partition \""
                    << partr_.name() << "\"";
                return res;
            }
        }
        else if (match == 1) {
            const ibis::column *col = parts_.getColumn(cn);
            if (col != 0) {
                namesToPos[colnames[j]] = j;
                ipToPos[j] = ncols - iscol.size();
                iscol.push_back(col);
                if (col->type() == ibis::CATEGORY) {
                    const ibis::category *cat =
                        static_cast<const ibis::category*>(col);
                    cats[j] = cat->getDictionary();
                }
                else if (col->type() == ibis::UINT) {
                    const ibis::bord::column *bc =
                        dynamic_cast<const ibis::bord::column*>(col);
                    if (bc != 0)
                        cats[j] = bc->getDictionary();
                }
            }
            else {
                LOGGER(ibis::gVerbose > 0)
                    << "Warning -- " << evt << " can not find column named \""
                    << colnames[j] << "\" in data partition \""
                    << parts_.name() << "\"";
                return res;
            }
        }
        else { // not prefixed with a data partition name; try partr_ first
            cn = colnames[j];
            const ibis::column* col = partr_.getColumn(cn);
            if (col != 0) {
                ipToPos[j] = ircol.size();
                ircol.push_back(col);
                if (col->type() == ibis::CATEGORY) {
                    const ibis::category *cat =
                        static_cast<const ibis::category*>(col);
                    cats[j] = cat->getDictionary();
                }
                else if (col->type() == ibis::UINT) {
                    const ibis::bord::column *bc =
                        dynamic_cast<const ibis::bord::column*>(col);
                    if (bc != 0)
                        cats[j] = bc->getDictionary();
                }
                LOGGER(ibis::gVerbose > 3)
                    << evt << " encountered a column name (" << colnames[j]
                    << ") that does not start with a data partition name, "
                    "assume it is for \"" << partr_.name() << "\"";
            }
            else {
                col = parts_.getColumn(cn);
                if (col != 0) {
                    ipToPos[j] = ncols - iscol.size();
                    iscol.push_back(col);
                    if (col->type() == ibis::CATEGORY) {
                        const ibis::category *cat =
                            static_cast<const ibis::category*>(col);
                        cats[j] = cat->getDictionary();
                    }
                    else if (col->type() == ibis::UINT) {
                        const ibis::bord::column *bc =
                            dynamic_cast<const ibis::bord::column*>(col);
                        if (bc != 0)
                            cats[j] = bc->getDictionary();
                    }
                    LOGGER(ibis::gVerbose > 1)
                        << evt << " encountered a column name (" << colnames[j]
                        << ") that does not start with a data partition name, "
                        "assume it is for \"" << parts_.name() << "\"";
                }
                else {
                    LOGGER(ibis::gVerbose > 0)
                        << "Warning -- " << evt << " encountered a name ("
                        << colnames[j] << ") that does not start with a data "
                        "partition name";
                    return res;
                }
            }
        }
    } // for (uint32_t j = 0; j < ncols;
    LOGGER(ibis::gVerbose > 3)
        << evt << " -- found " << ircol.size() << " column"
        << (ircol.size() > 1 ? "s" : "") << " from " << partr_.name()
        << " and " << iscol.size() << " column"
        << (iscol.size() > 1 ? "s" : "") << " from " << parts_.name();

    // change Pos values for columns in S to have offset ircol.size()
    for (uint32_t j = 0; j < ncols; ++j) {
        if (ipToPos[j] <= ncols && ipToPos[j] >= ircol.size())
            ipToPos[j] = (ncols - ipToPos[j]) + ircol.size();
    }

    ibis::table::typeArray rtypes(ircol.size(), ibis::UNKNOWN_TYPE);
    ibis::table::bufferArray rbuff(ircol.size(), 0);
    IBIS_BLOCK_GUARD(ibis::table::freeBuffers, ibis::util::ref(rbuff),
                     ibis::util::ref(rtypes));
    ibis::table::typeArray stypes(iscol.size(), ibis::UNKNOWN_TYPE);
    ibis::table::bufferArray sbuff(iscol.size(), 0);
    IBIS_BLOCK_GUARD(ibis::table::freeBuffers, ibis::util::ref(sbuff),
                     ibis::util::ref(stypes));
    bool sane = true;

    // retrieve values from partr_
    for (uint32_t j = 0; sane && j < ircol.size(); ++ j) {
        rtypes[j] = ircol[j]->type();
        switch (ircol[j]->type()) {
        case ibis::BYTE:
            rbuff[j] = ircol[j]->selectBytes(maskr_);
            if (rbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<signed char>*>(rbuff[j]), *orderr_);
            else
                sane = false;
            break;
        case ibis::UBYTE:
            rbuff[j] = ircol[j]->selectUBytes(maskr_);
            if (rbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<unsigned char>*>(rbuff[j]), *orderr_);
            else
                sane = false;
            break;
        case ibis::SHORT:
            rbuff[j] = ircol[j]->selectShorts(maskr_);
            if (rbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<int16_t>*>(rbuff[j]), *orderr_);
            else
                sane = false;
            break;
        case ibis::USHORT:
            rbuff[j] = ircol[j]->selectUShorts(maskr_);
            if (rbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<uint16_t>*>(rbuff[j]), *orderr_);
            else
                sane = false;
            break;
        case ibis::INT:
            rbuff[j] = ircol[j]->selectInts(maskr_);
            if (rbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<int32_t>*>(rbuff[j]), *orderr_);
            else
                sane = false;
            break;
        case ibis::UINT:
            rbuff[j] = ircol[j]->selectUInts(maskr_);
            if (rbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<uint32_t>*>(rbuff[j]), *orderr_);
            else
                sane = false;
            break;
        case ibis::LONG:
            rbuff[j] = ircol[j]->selectLongs(maskr_);
            if (rbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<int64_t>*>(rbuff[j]), *orderr_);
            else
                sane = false;
            break;
        case ibis::ULONG:
            rbuff[j] = ircol[j]->selectULongs(maskr_);
            if (rbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<uint64_t>*>(rbuff[j]), *orderr_);
            else
                sane = false;
            break;
        case ibis::FLOAT:
            rbuff[j] = ircol[j]->selectFloats(maskr_);
            if (rbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<float>*>(rbuff[j]), *orderr_);
            else
                sane = false;
            break;
        case ibis::DOUBLE:
            rbuff[j] = ircol[j]->selectDoubles(maskr_);
            if (rbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<double>*>(rbuff[j]), *orderr_);
            else
                sane = false;
            break;
        case ibis::TEXT:
        case ibis::CATEGORY:
            rbuff[j] = ircol[j]->selectStrings(maskr_);
            if (rbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<std::vector<std::string>*>(rbuff[j]),
                     *orderr_);
            else
                sane = false;
            break;
        default:
            sane = false;
            rbuff[j] = 0;
            LOGGER(ibis::gVerbose > 1)
                << "Warning -- jRange::select does not support column "
                "type " << ibis::TYPESTRING[(int)ircol[j]->type()]
                << " (name = " << partr_.name() << "."
                << ircol[j]->name() << ")";
            break;
        }
    }
    if (! sane) {
        return res;
    }

    // retrieve values from parts_
    for (uint32_t j = 0; sane && j < iscol.size(); ++ j) {
        stypes[j] = iscol[j]->type();
        switch (iscol[j]->type()) {
        case ibis::BYTE:
            sbuff[j] = iscol[j]->selectBytes(masks_);
            if (sbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<signed char>*>(sbuff[j]), *orders_);
            else
                sane = false;
            break;
        case ibis::UBYTE:
            sbuff[j] = iscol[j]->selectUBytes(masks_);
            if (sbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<unsigned char>*>(sbuff[j]), *orders_);
            else
                sane = false;
            break;
        case ibis::SHORT:
            sbuff[j] = iscol[j]->selectShorts(masks_);
            if (sbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<int16_t>*>(sbuff[j]), *orders_);
            else
                sane = false;
            break;
        case ibis::USHORT:
            sbuff[j] = iscol[j]->selectUShorts(masks_);
            if (sbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<uint16_t>*>(sbuff[j]), *orders_);
            else
                sane = false;
            break;
        case ibis::INT:
            sbuff[j] = iscol[j]->selectInts(masks_);
            if (sbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<int32_t>*>(sbuff[j]), *orders_);
            else
                sane = false;
            break;
        case ibis::UINT:
            sbuff[j] = iscol[j]->selectUInts(masks_);
            if (sbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<uint32_t>*>(sbuff[j]), *orders_);
            else
                sane = false;
            break;
        case ibis::LONG:
            sbuff[j] = iscol[j]->selectLongs(masks_);
            if (sbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<int64_t>*>(sbuff[j]), *orders_);
            else
                sane = false;
            break;
        case ibis::ULONG:
            sbuff[j] = iscol[j]->selectULongs(masks_);
            if (sbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<uint64_t>*>(sbuff[j]), *orders_);
            else
                sane = false;
            break;
        case ibis::FLOAT:
            sbuff[j] = iscol[j]->selectFloats(masks_);
            if (sbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<float>*>(sbuff[j]), *orders_);
            else
                sane = false;
            break;
        case ibis::DOUBLE:
            sbuff[j] = iscol[j]->selectDoubles(masks_);
            if (sbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<double>*>(sbuff[j]), *orders_);
            else
                sane = false;
            break;
        case ibis::TEXT:
        case ibis::CATEGORY:
            sbuff[j] = iscol[j]->selectStrings(masks_);
            if (sbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<std::vector<std::string>*>(sbuff[j]),
                     *orders_);
            else
                sane = false;
            break;
        default:
            sane = false;
            sbuff[j] = 0;
            LOGGER(ibis::gVerbose > 1)
                << "Warning -- jRange::select does not support column "
                "type " << ibis::TYPESTRING[(int)iscol[j]->type()]
                << " (name = " << parts_.name() << "."
                << iscol[j]->name() << ")";
            break;
        }
    }
    if (! sane) {
        return res;
    }

    // fill the in-memory buffer
    switch (colr_.type()) {
    case ibis::BYTE:
        res = fillResult
            (nrows, delta1_, delta2_, evt,
             *static_cast<array_t<signed char>*>(valr_), rtypes, rbuff,
             *static_cast<array_t<signed char>*>(vals_), stypes, sbuff,
             colnames, ipToPos);
        break;
    case ibis::UBYTE:
        res = fillResult
            (nrows, delta1_, delta2_, evt,
             *static_cast<array_t<unsigned char>*>(valr_), rtypes, rbuff,
             *static_cast<array_t<unsigned char>*>(vals_), stypes, sbuff,
             colnames, ipToPos);
        break;
    case ibis::SHORT:
        res = fillResult
            (nrows, delta1_, delta2_, evt,
             *static_cast<array_t<int16_t>*>(valr_), rtypes, rbuff,
             *static_cast<array_t<int16_t>*>(vals_), stypes, sbuff,
             colnames, ipToPos);
        break;
    case ibis::USHORT:
        res = fillResult
            (nrows, delta1_, delta2_, evt,
             *static_cast<array_t<uint16_t>*>(valr_), rtypes, rbuff,
             *static_cast<array_t<uint16_t>*>(vals_), stypes, sbuff,
             colnames, ipToPos);
        break;
    case ibis::INT:
        res = fillResult
            (nrows, delta1_, delta2_, evt,
             *static_cast<array_t<int32_t>*>(valr_), rtypes, rbuff,
             *static_cast<array_t<int32_t>*>(vals_), stypes, sbuff,
             colnames, ipToPos);
        break;
    case ibis::UINT:
        res = fillResult
            (nrows, delta1_, delta2_, evt,
             *static_cast<array_t<uint32_t>*>(valr_), rtypes, rbuff,
             *static_cast<array_t<uint32_t>*>(vals_), stypes, sbuff,
             colnames, ipToPos);
        break;
    case ibis::LONG:
        res = fillResult
            (nrows, delta1_, delta2_, evt,
             *static_cast<array_t<int64_t>*>(valr_), rtypes, rbuff,
             *static_cast<array_t<int64_t>*>(vals_), stypes, sbuff,
             colnames, ipToPos);
        break;
    case ibis::ULONG:
        res = fillResult
            (nrows, delta1_, delta2_, evt,
             *static_cast<array_t<uint64_t>*>(valr_), rtypes, rbuff,
             *static_cast<array_t<uint64_t>*>(vals_), stypes, sbuff,
             colnames, ipToPos);
        break;
    case ibis::FLOAT:
        res = fillResult
            (nrows, delta1_, delta2_, evt,
             *static_cast<array_t<float>*>(valr_), rtypes, rbuff,
             *static_cast<array_t<float>*>(vals_), stypes, sbuff,
             colnames, ipToPos);
        break;
    case ibis::DOUBLE:
        res = fillResult
            (nrows, delta1_, delta2_, evt,
             *static_cast<array_t<double>*>(valr_), rtypes, rbuff,
             *static_cast<array_t<double>*>(vals_), stypes, sbuff,
             colnames, ipToPos);
        break;
    default:
        LOGGER(ibis::gVerbose > 0)
            << "Warning -- " << evt << " can not handle join column of type "
            << ibis::TYPESTRING[(int)colr_.type()];
    }

    // attach the dictionaries to the in-memory columns; skip this when
    // fillResult produced no table
    for (unsigned j = 0; res != 0 && j < cats.size(); ++ j) {
        if (cats[j] != 0) {
            ibis::bord::column *bc = dynamic_cast<ibis::bord::column*>
                (static_cast<ibis::bord*>(res)->getColumn(j));
            if (bc != 0)
                bc->setDictionary(cats[j]);
        }
    }
    return res;
} // ibis::jRange::select
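A typical call site might look as follows; jr, the partition names R and S, and the column names are all illustrative, not taken from the code above. Unprefixed names fall through to the first partition, as the fall-through branch above documents:

// Hypothetical usage of ibis::jRange::select.
ibis::table::stringArray cols;
cols.push_back("R.a");   // explicitly from partition R
cols.push_back("S.b");   // explicitly from partition S
cols.push_back("c");     // unprefixed: resolved against partr_ first
ibis::table *tbl = jr.select(cols);
if (tbl != 0) {
    tbl->describe(std::cout);  // print the schema of the result
    delete tbl;
}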
void
cg_solve(OperatorType& A,
         const VectorType& b,
         VectorType& x,
         Matvec matvec,
         typename OperatorType::LocalOrdinalType max_iter,
         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& tolerance,
         typename OperatorType::LocalOrdinalType& num_iters,
         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& normr,
         timer_type* my_cg_times)
{
  typedef typename OperatorType::ScalarType ScalarType;
  typedef typename OperatorType::GlobalOrdinalType GlobalOrdinalType;
  typedef typename OperatorType::LocalOrdinalType LocalOrdinalType;
  typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type;

  timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0, tMATVECDOT = 0;
  timer_type total_time = mytimer();

  int myproc = 0;
#ifdef HAVE_MPI
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
#endif

  if (!A.has_local_indices) {
    std::cerr << "miniFE::cg_solve ERROR, A.has_local_indices is false, needs to be true. This probably means "
              << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::cg_solve."
              << std::endl;
    return;
  }

  size_t nrows = A.rows.size();
  LocalOrdinalType ncols = A.num_cols;

  nvtxRangeId_t r1 = nvtxRangeStartA("Allocation of Temporary Vectors");
  VectorType r(b.startIndex, nrows);
  VectorType p(0, ncols);
  VectorType Ap(b.startIndex, nrows);
  nvtxRangeEnd(r1);

#ifdef HAVE_MPI
#ifndef GPUDIRECT
  //TODO move outside?
  cudaHostRegister(&p.coefs[0], ncols*sizeof(typename VectorType::ScalarType), 0);
  cudaCheckError();
  if (A.send_buffer.size() > 0)
    cudaHostRegister(&A.send_buffer[0], A.send_buffer.size()*sizeof(typename VectorType::ScalarType), 0);
  cudaCheckError();
#endif
#endif

  normr = 0;
  magnitude_type rtrans = 0;
  magnitude_type oldrtrans = 0;

  LocalOrdinalType print_freq = max_iter/10;
  if (print_freq > 50) print_freq = 50;
  if (print_freq < 1)  print_freq = 1;

  ScalarType one = 1.0;
  ScalarType zero = 0.0;

  TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY);
  TICK(); matvec(A, p, Ap); TOCK(tMATVEC);
  TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY);
  TICK(); rtrans = dot(r, r); TOCK(tDOT);

  normr = std::sqrt(rtrans);

  if (myproc == 0) {
    std::cout << "Initial Residual = " << normr << std::endl;
  }

  magnitude_type brkdown_tol = std::numeric_limits<magnitude_type>::epsilon();

#ifdef MINIFE_DEBUG
  std::ostream& os = outstream();
  os << "brkdown_tol = " << brkdown_tol << std::endl;
#endif

  for (LocalOrdinalType k=1; k <= max_iter && normr > tolerance; ++k) {
    if (k == 1) {
      TICK(); waxpby(one, r, zero, r, p); TOCK(tWAXPY);
    }
    else {
      oldrtrans = rtrans;
      TICK(); rtrans = dot(r, r); TOCK(tDOT);
      magnitude_type beta = rtrans/oldrtrans;
      TICK(); waxpby(one, r, beta, p, p); TOCK(tWAXPY);
    }

    normr = std::sqrt(rtrans);

    if (myproc == 0 && (k%print_freq == 0 || k == max_iter)) {
      std::cout << "Iteration = " << k << "   Residual = " << normr << std::endl;
    }

    magnitude_type alpha = 0;
    magnitude_type p_ap_dot = 0;

    TICK(); matvec(A, p, Ap); TOCK(tMATVEC);
    TICK(); p_ap_dot = dot(Ap, p); TOCK(tDOT);

#ifdef MINIFE_DEBUG
    os << "iter " << k << ", p_ap_dot = " << p_ap_dot;
    os.flush();
#endif
    //TODO remove false below
    if (false && p_ap_dot < brkdown_tol) {
      if (p_ap_dot < 0 || breakdown(p_ap_dot, Ap, p)) {
        std::cerr << "miniFE::cg_solve ERROR, numerical breakdown!" << std::endl;
#ifdef MINIFE_DEBUG
        os << "ERROR, numerical breakdown!" << std::endl;
#endif
        //update the timers before jumping out.
        my_cg_times[WAXPY] = tWAXPY;
        my_cg_times[DOT] = tDOT;
        my_cg_times[MATVEC] = tMATVEC;
        my_cg_times[TOTAL] = mytimer() - total_time;
        return;
      }
      else brkdown_tol = 0.1 * p_ap_dot;
    }
    alpha = rtrans/p_ap_dot;
#ifdef MINIFE_DEBUG
    os << ", rtrans = " << rtrans << ", alpha = " << alpha << std::endl;
#endif

    TICK();
    waxpby(one, x, alpha, p, x);
    waxpby(one, r, -alpha, Ap, r);
    TOCK(tWAXPY);

    num_iters = k;
  }

#ifdef HAVE_MPI
#ifndef GPUDIRECT
  //TODO move outside?
  cudaHostUnregister(&p.coefs[0]);
  cudaCheckError();
  if (A.send_buffer.size() > 0)
    cudaHostUnregister(&A.send_buffer[0]);
  cudaCheckError();
#endif
#endif

  my_cg_times[WAXPY] = tWAXPY;
  my_cg_times[DOT] = tDOT;
  my_cg_times[MATVEC] = tMATVEC;
  my_cg_times[MATVECDOT] = tMATVECDOT;
  my_cg_times[TOTAL] = mytimer() - total_time;
}
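cg_solve brackets every kernel with the TICK()/TOCK() pair; the macros themselves are defined in miniFE's timing header, not shown here. A minimal sketch consistent with how they are used above, where TICK records a start time in the local t0 and TOCK accumulates the elapsed seconds into the named counter:

// Sketch of the timing macros as used above (the actual definitions
// live in miniFE's headers): TICK() records a start time, TOCK(t)
// adds the elapsed time to the accumulator t.
#define TICK()  (t0 = mytimer())
#define TOCK(t) ((t) += mytimer() - t0)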