void blacs_pdgetri_nektar(int *BLACS_PARAMS, int *DESCA, int *ipvt, double **inva_LOC){
    int row_start = 1, col_start = 1;
    int lwork, liwork, info = 0;
    double *work;
    int *iwork;
    int M;

    M      = BLACS_PARAMS[7] + ((row_start-1) % BLACS_PARAMS[10]);
    lwork  = BLACS_PARAMS[9]*numroc_(M, BLACS_PARAMS[10], BLACS_PARAMS[5], row_start, BLACS_PARAMS[3]);
    liwork = BLACS_PARAMS[7];
    work   = dvector(0, lwork-1);
    iwork  = ivector(0, liwork-1);

    /* Workspace query: passing lwork = liwork = -1 makes PDGETRI return the
       optimal workspace sizes in work[0] and iwork[0]. */
    int i = -1, j = -1;
    pdgetri_(BLACS_PARAMS[7], *inva_LOC,
             row_start, col_start, DESCA, ipvt,
             work, i,
             iwork, j,
             info);
    if (info != 0)
        fprintf(stderr, "blacs_pdgetri_nektar: ERROR - info = %d \n", info);

    /* Grow the workspaces if the query reported larger requirements. */
    if (((int) work[0]) > lwork){
        lwork = (int) work[0];
        free(work);
        work = dvector(0, lwork);
    }
    if (iwork[0] > liwork){
        liwork = iwork[0];
        free(iwork);
        iwork = ivector(0, liwork);
    }

    /* Actual in-place inversion of the LU-factorized matrix. */
    pdgetri_(BLACS_PARAMS[7], *inva_LOC,
             row_start, col_start, DESCA, ipvt,
             work, lwork,
             iwork, liwork,
             info);
    if (info != 0)
        fprintf(stderr, "blacs_pdgetri_nektar: ERROR - info = %d \n", info);

    free(work);
    free(iwork);
}
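/*
 * Illustrative sketch (not from the original source): the same two-call
 * PDGETRI workspace-query pattern written against the conventional
 * Fortran-style binding, where every argument is passed by pointer.
 * The prototype below and the helper name invert_from_lu are assumptions
 * for illustration only; the wrapper above relies on its own by-value
 * prototypes.
 */
#include <vector>

extern "C" void pdgetri_(int *n, double *a, int *ia, int *ja, int *desca,
                         int *ipiv, double *work, int *lwork,
                         int *iwork, int *liwork, int *info);

/* Inverts an already LU-factorized distributed matrix (output of PDGETRF). */
static void invert_from_lu(int n, double *a_loc, int *desca, int *ipiv)
{
    int ia = 1, ja = 1, info = 0;
    int lwork = -1, liwork = -1;      /* -1 requests a workspace query only */
    double wkopt  = 0.0;
    int    iwkopt = 0;

    /* First call: PDGETRI reports the optimal sizes in wkopt and iwkopt. */
    pdgetri_(&n, a_loc, &ia, &ja, desca, ipiv,
             &wkopt, &lwork, &iwkopt, &liwork, &info);

    lwork  = static_cast<int>(wkopt);
    liwork = iwkopt;
    std::vector<double> work(lwork);
    std::vector<int>    iwork(liwork);

    /* Second call: the actual in-place inversion. */
    pdgetri_(&n, a_loc, &ia, &ja, desca, ipiv,
             work.data(), &lwork, iwork.data(), &liwork, &info);
}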
/// The computed inverse is stored in ai (the copy of A below)
/// n_global: the order of the matrix
static void inv_driver(blas_idx_t n_global)
{
    auto grid = std::make_shared<blacs_grid_t>();

    //// Manual test matrix (commented-out debug code)
    //n_global = 3;
    //double *aaa = new double[n_global*n_global];
    //for (int i = 0; i < 9; i++)
    //{
    //    aaa[i] = i + 1;
    //}
    //aaa[8] = 10;
    //auto a = block_cyclic_mat_t::createWithArray(grid, n_global, n_global, aaa);

    // Create a NxN random matrix A
    auto a = block_cyclic_mat_t::random(grid, n_global, n_global);

    // Create a NxN matrix to hold A^{-1}
    auto ai = block_cyclic_mat_t::constant(grid, n_global, n_global);

    // Copy A to A^{-1} since it will be overwritten during factorization
    std::copy_n(a->local_data(), a->local_size(), ai->local_data());

    MPI_Barrier(MPI_COMM_WORLD);

    double t0 = MPI_Wtime();

    // Factorize A
    blas_idx_t ia = 1, ja = 1;
    std::vector<blas_idx_t> ipiv(a->local_rows() + a->row_block_size() + 100);
    blas_idx_t info;

    // The name stands for D-GE-TRF: D means the matrix is double precision,
    // GE means it is a general matrix, and TRF means triangular factorization,
    // i.e. the usual LU decomposition.
    pdgetrf_(n_global, n_global, ai->local_data(), ia, ja, ai->descriptor(),
             ipiv.data(), info);
    assert(info == 0);
    double t_factor = MPI_Wtime() - t0;

    // Compute A^{-1} based on the LU factorization

    // Compute workspace for double and integer work arrays on each process
    blas_idx_t lwork  = 10;
    blas_idx_t liwork = 10;
    std::vector<double>     work (lwork);
    std::vector<blas_idx_t> iwork(liwork);

    lwork = liwork = -1;

    // Workspace query: determine the required lwork and liwork
    pdgetri_(n_global, ai->local_data(), ia, ja, ai->descriptor(), ipiv.data(),
             work.data(), lwork, iwork.data(), liwork, info);
    assert(info == 0);
    lwork  = static_cast<blas_idx_t>(work[0]);
    liwork = static_cast<blas_idx_t>(iwork[0]);
    work.resize(lwork);
    iwork.resize(liwork);

    // Now compute the inverse
    t0 = MPI_Wtime();
    pdgetri_(n_global, ai->local_data(), ia, ja, ai->descriptor(), ipiv.data(),
             work.data(), lwork, iwork.data(), liwork, info);
    assert(info == 0);
    double t_solve = MPI_Wtime() - t0;

    // Verify that the inverse is correct using A*A^{-1} = I
    auto identity = block_cyclic_mat_t::diagonal(grid, n_global, n_global);

    // Compute I = A * A^{-1} - I and verify that ||I|| is small
    char nein = 'N';
    double alpha = 1.0, beta = -1.0;
    pdgemm_(nein, nein, n_global, n_global, n_global,
            alpha,
            a->local_data(),  ia, ja, a->descriptor(),
            ai->local_data(), ia, ja, ai->descriptor(),
            beta,
            identity->local_data(), ia, ja, identity->descriptor());

    // Compute the 1-norm of the result
    char norm = '1';
    work.resize(identity->local_cols());
    double err = pdlange_(norm, n_global, n_global,
                          identity->local_data(), ia, ja, identity->descriptor(),
                          work.data());

    double t_total = t_factor + t_solve;
    double t_glob;
    MPI_Reduce(&t_total, &t_glob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (grid->iam() == 0)
    {
        double gflops = getri_flops(n_global)/t_glob/grid->nprocs();
        printf("\n"
               "MATRIX INVERSE BENCHMARK SUMMARY\n"
               "================================\n"
               "N = %d\tNP = %d\tNP_ROW = %d\tNP_COL = %d\n"
               "Time for PxGETRF + PxGETRI = %10.7f seconds\tGflops/Proc = %10.7f, Error = %f\n",
               n_global, grid->nprocs(), grid->nprows(), grid->npcols(),
               t_glob, gflops, err);
        fflush(stdout);
    }
}
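/*
 * Illustrative sketch (not part of the original source): a minimal main()
 * that could drive inv_driver().  It assumes blacs_grid_t performs the BLACS
 * process-grid setup itself, so only MPI initialization and the matrix order
 * are needed here; the default order 4096 is an arbitrary choice.  <mpi.h>
 * and <cstdlib> are assumed to be included by this file.
 */
int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    // Matrix order from the command line, or an arbitrary default.
    blas_idx_t n_global = (argc > 1) ? std::atol(argv[1]) : 4096;
    inv_driver(n_global);

    MPI_Finalize();
    return 0;
}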