int main (int argc, const char * argv[]) { typedef float ScalarType; typedef boost::numeric::ublas::compressed_matrix<ScalarType> MatrixType; typedef boost::numeric::ublas::vector<ScalarType> VectorType; typedef viennacl::compressed_matrix<ScalarType> GPUMatrixType; typedef viennacl::vector<ScalarType> GPUVectorType; MatrixType M; // // Read system matrix from file // #ifdef _MSC_VER if (!viennacl::io::read_matrix_market_file(M, "../../examples/testdata/mat65k.mtx")) #else if (!viennacl::io::read_matrix_market_file(M, "../examples/testdata/mat65k.mtx")) #endif { std::cerr<<"ERROR: Could not read matrix file " << std::endl; exit(EXIT_FAILURE); } std::cout << "Size of matrix: " << M.size1() << std::endl; std::cout << "Avg. Entries per row: " << M.nnz() / static_cast<double>(M.size1()) << std::endl; // // Use uniform load vector: // VectorType rhs(M.size2()); for (size_t i=0; i<rhs.size(); ++i) rhs(i) = 1; GPUMatrixType gpu_M(M.size1(), M.size2()); GPUVectorType gpu_rhs(M.size1()); viennacl::copy(M, gpu_M); viennacl::copy(rhs, gpu_rhs); ///////////////////////////////// Tests to follow ///////////////////////////// viennacl::linalg::bicgstab_tag solver_tag(1e-10, 50); //for simplicity and reasonably short execution times we use only 50 iterations here // // Reference: No preconditioner: // std::cout << "--- Reference 1: Pure BiCGStab on CPU ---" << std::endl; VectorType result = viennacl::linalg::solve(M, rhs, solver_tag); std::cout << " * Solver iterations: " << solver_tag.iters() << std::endl; VectorType residual = viennacl::linalg::prod(M, result) - rhs; std::cout << " * Rel. 
Residual: " << viennacl::linalg::norm_2(residual) / viennacl::linalg::norm_2(rhs) << std::endl; std::cout << "--- Reference 2: Pure BiCGStab on GPU ---" << std::endl; GPUVectorType gpu_result = viennacl::linalg::solve(gpu_M, gpu_rhs, solver_tag); std::cout << " * Solver iterations: " << solver_tag.iters() << std::endl; GPUVectorType gpu_residual = viennacl::linalg::prod(gpu_M, gpu_result) - gpu_rhs; std::cout << " * Rel. Residual: " << viennacl::linalg::norm_2(gpu_residual) / viennacl::linalg::norm_2(gpu_rhs) << std::endl; // // Reference: ILUT preconditioner: // std::cout << "--- Reference 2: BiCGStab with ILUT on CPU ---" << std::endl; std::cout << " * Preconditioner setup..." << std::endl; viennacl::linalg::ilut_precond<MatrixType> ilut(M, viennacl::linalg::ilut_tag()); std::cout << " * Iterative solver run..." << std::endl; run_solver(M, rhs, solver_tag, ilut); // // Test 1: SPAI with CPU: // std::cout << "--- Test 1: CPU-based SPAI ---" << std::endl; std::cout << " * Preconditioner setup..." << std::endl; viennacl::linalg::spai_precond<MatrixType> spai_cpu(M, viennacl::linalg::spai_tag(1e-3, 3, 5e-2)); std::cout << " * Iterative solver run..." << std::endl; run_solver(M, rhs, solver_tag, spai_cpu); // // Test 2: FSPAI with CPU: // std::cout << "--- Test 2: CPU-based FSPAI ---" << std::endl; std::cout << " * Preconditioner setup..." << std::endl; viennacl::linalg::fspai_precond<MatrixType> fspai_cpu(M, viennacl::linalg::fspai_tag()); std::cout << " * Iterative solver run..." << std::endl; run_solver(M, rhs, solver_tag, fspai_cpu); // // Test 3: SPAI with GPU: // std::cout << "--- Test 3: GPU-based SPAI ---" << std::endl; std::cout << " * Preconditioner setup..." << std::endl; viennacl::linalg::spai_precond<GPUMatrixType> spai_gpu(gpu_M, viennacl::linalg::spai_tag(1e-3, 3, 5e-2)); std::cout << " * Iterative solver run..." << std::endl; run_solver(gpu_M, gpu_rhs, solver_tag, spai_gpu); return EXIT_SUCCESS; }
/** * The main steps in this tutorial are the following: * - Setup the systems * - Run solvers without preconditioner and with ILUT preconditioner for comparison * - Run solver with SPAI preconditioner on CPU * - Run solver with SPAI preconditioner on GPU * - Run solver with factored SPAI preconditioner on CPU * - Run solver with factored SPAI preconditioner on GPU * **/ int main (int, const char **) { typedef float ScalarType; typedef boost::numeric::ublas::compressed_matrix<ScalarType> MatrixType; typedef boost::numeric::ublas::vector<ScalarType> VectorType; typedef viennacl::compressed_matrix<ScalarType> GPUMatrixType; typedef viennacl::vector<ScalarType> GPUVectorType; /** * If you have multiple OpenCL-capable devices in your system, we pick the second device for this tutorial. **/ #ifdef VIENNACL_WITH_OPENCL // Optional: Customize OpenCL backend viennacl::ocl::platform pf = viennacl::ocl::get_platforms()[0]; std::vector<viennacl::ocl::device> const & devices = pf.devices(); // Optional: Set first device to first context: viennacl::ocl::setup_context(0, devices[0]); // Optional: Set second device for second context (use the same device for the second context if only one device available): if (devices.size() > 1) viennacl::ocl::setup_context(1, devices[1]); else viennacl::ocl::setup_context(1, devices[0]); std::cout << viennacl::ocl::current_device().info() << std::endl; viennacl::context ctx(viennacl::ocl::get_context(1)); #else viennacl::context ctx; #endif /** * Create uBLAS-based sparse matrix and read system matrix from file **/ MatrixType M; if (!viennacl::io::read_matrix_market_file(M, "../examples/testdata/mat65k.mtx")) { std::cerr<<"ERROR: Could not read matrix file " << std::endl; exit(EXIT_FAILURE); } std::cout << "Size of matrix: " << M.size1() << std::endl; std::cout << "Avg. 
Entries per row: " << double(M.nnz()) / static_cast<double>(M.size1()) << std::endl; /** * Use a constant load vector for simplicity **/ VectorType rhs(M.size2()); for (std::size_t i=0; i<rhs.size(); ++i) rhs(i) = ScalarType(1); /** * Create the ViennaCL matrix and vector and initialize with uBLAS data: **/ GPUMatrixType gpu_M(M.size1(), M.size2(), ctx); GPUVectorType gpu_rhs(M.size1(), ctx); viennacl::copy(M, gpu_M); viennacl::copy(rhs, gpu_rhs); /** * <h2>Solver Runs</h2> * We use a relative tolerance of \f$ 10^{-10} \f$ with a maximum of 50 iterations for each use case. * Usually more than 50 solver iterations are required for convergence, but this choice ensures shorter execution times and suffices for this tutorial. **/ viennacl::linalg::bicgstab_tag solver_tag(1e-10, 50); //for simplicity and reasonably short execution times we use only 50 iterations here /** * The first reference is to use no preconditioner (CPU and GPU): **/ std::cout << "--- Reference 1: Pure BiCGStab on CPU ---" << std::endl; VectorType result = viennacl::linalg::solve(M, rhs, solver_tag); std::cout << " * Solver iterations: " << solver_tag.iters() << std::endl; VectorType residual = viennacl::linalg::prod(M, result) - rhs; std::cout << " * Rel. Residual: " << viennacl::linalg::norm_2(residual) / viennacl::linalg::norm_2(rhs) << std::endl; std::cout << "--- Reference 2: Pure BiCGStab on GPU ---" << std::endl; GPUVectorType gpu_result = viennacl::linalg::solve(gpu_M, gpu_rhs, solver_tag); std::cout << " * Solver iterations: " << solver_tag.iters() << std::endl; GPUVectorType gpu_residual = viennacl::linalg::prod(gpu_M, gpu_result); gpu_residual -= gpu_rhs; std::cout << " * Rel. Residual: " << viennacl::linalg::norm_2(gpu_residual) / viennacl::linalg::norm_2(gpu_rhs) << std::endl; /** * The second reference is a standard ILUT preconditioner (only CPU): **/ std::cout << "--- Reference 2: BiCGStab with ILUT on CPU ---" << std::endl; std::cout << " * Preconditioner setup..." 
<< std::endl; viennacl::linalg::ilut_precond<MatrixType> ilut(M, viennacl::linalg::ilut_tag()); std::cout << " * Iterative solver run..." << std::endl; run_solver(M, rhs, solver_tag, ilut); /** * <h2>Step 1: SPAI with CPU</h2> **/ std::cout << "--- Test 1: CPU-based SPAI ---" << std::endl; std::cout << " * Preconditioner setup..." << std::endl; viennacl::linalg::spai_precond<MatrixType> spai_cpu(M, viennacl::linalg::spai_tag(1e-3, 3, 5e-2)); std::cout << " * Iterative solver run..." << std::endl; run_solver(M, rhs, solver_tag, spai_cpu); /** * <h2>Step 2: FSPAI with CPU</h2> **/ std::cout << "--- Test 2: CPU-based FSPAI ---" << std::endl; std::cout << " * Preconditioner setup..." << std::endl; viennacl::linalg::fspai_precond<MatrixType> fspai_cpu(M, viennacl::linalg::fspai_tag()); std::cout << " * Iterative solver run..." << std::endl; run_solver(M, rhs, solver_tag, fspai_cpu); /** * <h2>Step 3: SPAI with GPU</h2> **/ std::cout << "--- Test 3: GPU-based SPAI ---" << std::endl; std::cout << " * Preconditioner setup..." << std::endl; viennacl::linalg::spai_precond<GPUMatrixType> spai_gpu(gpu_M, viennacl::linalg::spai_tag(1e-3, 3, 5e-2)); std::cout << " * Iterative solver run..." << std::endl; run_solver(gpu_M, gpu_rhs, solver_tag, spai_gpu); /** * <h2>Step 4: FSPAI with GPU</h2> **/ std::cout << "--- Test 4: GPU-based FSPAI ---" << std::endl; std::cout << " * Preconditioner setup..." << std::endl; viennacl::linalg::fspai_precond<GPUMatrixType> fspai_gpu(gpu_M, viennacl::linalg::fspai_tag()); std::cout << " * Iterative solver run..." << std::endl; run_solver(gpu_M, gpu_rhs, solver_tag, fspai_gpu); /** * That's it! Print success message and exit. **/ std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl; return EXIT_SUCCESS; }
/*---------------------------------------*/
/* Block ILUT setup on the host.
 *
 * For each of the nb row blocks, the diagonal block of `csr` is extracted
 * into h_bilu->bdiag[i] and factored with ilut() into h_bilu->blu[i].
 *
 * csr    : input matrix; ia/ja are read with 1-based values
 *          (entries of row j are ja[k-1], a[k-1] for k = ia[j] .. ia[j+1]-1)
 * h_bilu : noff[i] (first row of block i, 0-based) and nrow[i] (number of
 *          rows of block i) are read; bdiag and blu are allocated and filled
 * nb     : number of diagonal blocks
 * p, tol : forwarded unchanged to ilut() for every block
 * avgfil : output, average over all blocks of (nnz(L)+nnz(U)) / nnz(block);
 *          set to 0.0 when nb <= 0
 *
 * The program is terminated via exit(-1) if ilut() reports an error.
 */
void bilut(csr_t *csr, bilu_host_t *h_bilu, int nb, int p, double tol,
           double *avgfil) {
/*---------------------------------------*/
  int i, j, k;
  int *ia = csr->ia;
  int *ja = csr->ja;
  double *a = csr->a;
/*---------------------------------------*/
  /*---------- diagonal blocks */
  Calloc(h_bilu->bdiag, nb, csr_t);
  /*------ nnz in all diag blocks */
  int nnzds = 0;
  /*------ for each block diag */
  for (i = 0; i < nb; i++) {
    /* rows j1 .. j2-1 (0-based) belong to block i; a 1-based column index c
     * lies inside the diagonal block iff j1 < c <= j2 */
    const int j1 = h_bilu->noff[i];
    const int j2 = j1 + h_bilu->nrow[i];
    /*---------- nnz of block diag i */
    int nnzd = 0;
    for (j = j1; j < j2; j++) {
      for (k = ia[j]; k < ia[j+1]; k++) {
        const int col = ja[k-1];
        if (col > j1 && col <= j2)
          nnzd++;
      }
    }
    /*---------- alloc diag block csr */
    nnzds += nnzd;
    /* presumably malloc_csr also records the nnz that is read back from
     * bdiag[i].nnz below -- TODO confirm */
    malloc_csr(j2-j1, nnzd, h_bilu->bdiag+i);
    int *ia2 = h_bilu->bdiag[i].ia;
    int *ja2 = h_bilu->bdiag[i].ja;
    double *a2 = h_bilu->bdiag[i].a;
    /*------------- build block diag i */
    ia2[0] = 1;
    /* write position in ja2/a2; deliberately not named `p`, which is the
     * ilut parameter of this function */
    int pos = 0;
    for (j = j1; j < j2; j++) {
      int rownnz = 0;
      for (k = ia[j]; k < ia[j+1]; k++) {
        const int col = ja[k-1];
        if (col > j1 && col <= j2) {
          a2[pos] = a[k-1];
          ja2[pos] = col - j1;   /* shift to block-local 1-based column */
          pos++;
          rownnz++;
        }
      }
      ia2[j+1-j1] = ia2[j-j1] + rownnz;
    }
  }
  printf("number of blocks %d\n", nb);
  printf(" %f in diag blocks\n",
         (double)nnzds/(double)csr->nnz);
  /*------- ilut for each block diag */
  printf("begin bilut(%d,%.2e) ...\n", p, tol);
  Calloc(h_bilu->blu, nb, lu_t);
  /* running sum of the per-block fill factors, accumulated in block order */
  double filsum = 0.0;
  for (i = 0; i < nb; i++) {
    if (ilut(&h_bilu->bdiag[i], &h_bilu->blu[i], tol, p)) {
      printf("BILUT error in block %d\n", i);
      exit(-1);
    }
    int nnzl = h_bilu->blu[i].l->nnz;
    int nnzu = h_bilu->blu[i].u->nnz;
    int nnzdi = h_bilu->bdiag[i].nnz;
    filsum += (nnzl+nnzu) / (double)(nnzdi);
  }
  /*-------- average fill factor */
  (*avgfil) = (nb > 0) ? filsum / nb : 0.0;
  printf(" done, avg fillfactor %f\n", *avgfil);
}