// Calculates w = A*p with the CSR matrix (a_non_zeros, a_row_index,
// a_col_index) and adds dot(w, p), taken over the cells that lie at least
// halo_depth away from every face, to *pw.
//
// x, y, z     : grid extents; vectors are indexed as k*x*y + j*x + i.
// halo_depth  : width of the border excluded from the dot product.
// pw          : accumulator, incremented (not overwritten) by the result.
// vec_kx, vec_ky, vec_kz are accepted but not used by this routine.
void cg_solver_calc_w(
        const int x,
        const int y,
        const int z,
        const int halo_depth,
        double* pw,
        double* vec_p,
        double* vec_w,
        double* vec_kx,
        double* vec_ky,
        double* vec_kz,
        int* a_row_index,
        int* a_col_index,
        double* a_non_zeros)
{
    // The matrix-vector product runs over the whole grid, halo included.
    int num_rows = x*y*z;
    mkl_cspblas_dcsrgemv(
            "n", &num_rows, a_non_zeros, a_row_index, a_col_index, vec_p, vec_w);

    // Each (k, j) pair contributes one contiguous run of interior cells.
    const int run_length = x - 2*halo_depth;
    double local_pw = 0.0;

#pragma omp parallel for reduction(+:local_pw)
    for(int k = halo_depth; k < z-halo_depth; ++k)
    {
        for(int j = halo_depth; j < y-halo_depth; ++j)
        {
            const int start = k*x*y + j*x + halo_depth;
            double* w_run = vec_w + start;
            double* p_run = vec_p + start;
            local_pw += cblas_ddot(run_length, w_run, 1, p_run, 1);
        }
    }

    *pw += local_pw;
}
void mkl_warmup(){ srand48(time(0)); hbmat_t *t = malloc(sizeof(hbmat_t)); t->m = DIM; t->n = DIM; t->vdiag = NULL; int m = t->m; int alpha = 1; int beta = 1; int *vptr = t->vptr = malloc((DIM+1) * sizeof(int)); int *vpos = t->vpos = malloc((DIM * DIM) *sizeof(int)); double *vval = t->vval = malloc((DIM*DIM)*sizeof(double)); vptr[0] = 0; int vpos_p = 0; puts("warm-up"); for ( int i = 1; i <= DIM; ++i ) { vptr[i] = vptr[i-1] + FILL; int vp = 0; for ( int j = vptr[i-1]; j < vptr[i]; ++j ) { vpos[vpos_p] = vp; vval[vpos_p] = drand48(); vp++; vpos_p++; } } double *x = malloc(DIM*sizeof(double)); for(int i = 0; i < DIM; ++i) x[i] = drand48(); double *y = malloc(DIM*sizeof(double)); mkl_dcsrmv("N", &m, &m, &alpha, "GLNC", vval, vpos, vptr, vptr+1, x, &beta, y); mkl_dcsrsv("N", &m, &alpha, "TLNC", vval, vpos, vptr, vptr+1, x, y); mkl_cspblas_dcsrgemv("N", &m, vval, vptr, vpos, x, y); free(x); free(y); free(vptr); free(vpos); free(vval); free(t); }
/*
 * For each i in [0, A->n): builds a dense vector from A via
 * array_s2d(A, ., i), multiplies it by A with mkl_cspblas_dcsrgemv, and
 * subtracts the entries of the product selected by row i of C's CSR
 * pattern from the stored values of C.
 *
 * A and C are converted with hyper_sym_csr_task2() first if their vptr
 * array has not been set yet.
 */
void dsyrk_sparse_upper(hbmat_t* A, hbmat_t* C){
	/* Make sure both matrices carry their CSR arrays. */
	if ( A->vptr == NULL )
		hyper_sym_csr_task2(A);
	if ( C->vptr == NULL )
		hyper_sym_csr_task2(C);

	int cols = A->n;
	int rows = A->m;

	int* c_ptr = C->vptr;
	int* c_pos = C->vpos;
	double* c_val = C->vval;

	/* Dense scratch vectors: input and output of the product. */
	double* dense_in = malloc(rows*sizeof(double));
	double* dense_out = malloc(rows*sizeof(double));

	for ( int i = 0; i < cols; i++ ) {
		/* presumably array_s2d scatters slice i of A into dense_in -- confirm */
		array_clear(dense_in, rows);
		array_s2d(A, dense_in, i);

		/* dense_out is the output vector of the product, so it is not cleared. */
		mkl_cspblas_dcsrgemv("N", &rows, A->vval, A->vptr, A->vpos, dense_in, dense_out);

		int k = c_ptr[i];
		const int k_end = c_ptr[i+1];
		while ( k < k_end ) {
			c_val[k] -= dense_out[c_pos[k]];
			k++;
		}
	}

	free(dense_in);
	free(dense_out);
}
/*
 * Sparse matrix-vector product y = A*x through mkl_cspblas_dcsrgemv.
 *
 * m         number of rows passed to MKL
 * values    CSR non-zero values
 * rowIndex  CSR row-start array
 * columns   CSR column indices
 * x         input vector
 * y         output vector
 */
void spmv_mkl_double(MKL_INT m,
                     double values[],
                     MKL_INT rowIndex[],
                     MKL_INT columns[],
                     double x[],
                     double y[])
{
    /* 'n' asks MKL for the non-transposed product. */
    char no_transpose = 'n';

    mkl_cspblas_dcsrgemv(&no_transpose, &m, values, rowIndex, columns, x, y);
}
// The main Chebyshev iteration.
//
// Computes w = A*u over the whole grid, then for every run of cells that is
// at least halo_depth away from each face:
//     r = u0 - w
//     p = alpha*p + beta*r
// and finally hands u and p to cheby_calc_u().
//
// vec_kx, vec_ky, vec_kz are accepted but not used by this routine.
void cheby_solver_iterate(
        const int x,
        const int y,
        const int z,
        const int halo_depth,
        double alpha,
        double beta,
        double* vec_u,
        double* vec_u0,
        double* vec_p,
        double* vec_r,
        double* vec_w,
        double* vec_kx,
        double* vec_ky,
        double* vec_kz,
        int* a_row_index,
        int* a_col_index,
        double* a_non_zeros)
{
    int num_rows = x*y*z;
    mkl_cspblas_dcsrgemv(
            "n", &num_rows, a_non_zeros, a_row_index, a_col_index, vec_u, vec_w);

    const int run_length = x - 2*halo_depth;

#pragma omp parallel for
    for(int k = halo_depth; k < z-halo_depth; ++k)
    {
        for(int j = halo_depth; j < y-halo_depth; ++j)
        {
            const int start = k*x*y + j*x + halo_depth;
            double* r_run = vec_r + start;
            double* p_run = vec_p + start;

            // r = u0 - w
            cblas_dcopy(run_length, vec_u0 + start, 1, r_run, 1);
            cblas_daxpy(run_length, -1.0, vec_w + start, 1, r_run, 1);

            // p = alpha*p + beta*r
            cblas_dscal(run_length, alpha, p_run, 1);
            cblas_daxpy(run_length, beta, r_run, 1, p_run, 1);
        }
    }

    cheby_calc_u(x, y, z, halo_depth, vec_u, vec_p);
}
// Initialises the CG solver void cg_solver_init( const int x, const int y, const int z, const int halo_depth, const int coefficient, double rx, double ry, double rz, double* rro, double* density, double* energy, double* vec_u, double* vec_p, double* vec_r, double* vec_w, double* vec_kx, double* vec_ky, double* vec_kz, int* a_row_index, int* a_col_index, double* a_non_zeros) { if(coefficient != CONDUCTIVITY && coefficient != RECIP_CONDUCTIVITY) { die(__LINE__, __FILE__, "Coefficient %d is not valid.\n", coefficient); } #pragma omp parallel for for(int ii = 0; ii < z; ++ii) { for(int jj = 0; jj < y; ++jj) { for(int kk = 0; kk < x; ++kk) { const int index = ii*y*x+jj*x+kk; vec_p[index] = 0.0; vec_r[index] = 0.0; vec_u[index] = energy[index]*density[index]; } } } #pragma omp parallel for for(int ii = 1; ii < z-1; ++ii) { for(int jj = 1; jj < y-1; ++jj) { for(int kk = 1; kk < x-1; ++kk) { const int index = ii*y*x+jj*x+kk; vec_w[index] = (coefficient == CONDUCTIVITY) ? density[index] : 1.0/density[index]; } } } #pragma omp parallel for for(int ii = halo_depth; ii < z-1; ++ii) { for(int jj = halo_depth; jj < y-1; ++jj) { for(int kk = halo_depth; kk < x-1; ++kk) { const int index = ii*x*y + jj*x + kk; vec_kx[index] = rx*(vec_w[index-1]+vec_w[index]) / (2.0*vec_w[index-1]*vec_w[index]); vec_ky[index] = ry*(vec_w[index-x]+vec_w[index]) / (2.0*vec_w[index-x]*vec_w[index]); vec_kz[index] = rz*(vec_w[index-x*y]+vec_w[index]) / (2.0*vec_w[index-x*y]*vec_w[index]); } } } // Initialise the CSR sparse coefficient matrix for(int ii = halo_depth; ii < z-1; ++ii) { for(int jj = halo_depth; jj < y-1; ++jj) { for(int kk = halo_depth; kk < x-1; ++kk) { const int index = ii*x*y + jj*x + kk; int coef_index = a_row_index[index]; if(ii >= halo_depth) { a_non_zeros[coef_index] = -vec_kz[index]; a_col_index[coef_index++] = index-x*y; } if(jj >= halo_depth) { a_non_zeros[coef_index] = -vec_ky[index]; a_col_index[coef_index++] = index-x; } if(kk >= halo_depth) { a_non_zeros[coef_index] = 
-vec_kx[index]; a_col_index[coef_index++] = index-1; } a_non_zeros[coef_index] = (1.0 + vec_kx[index+1] + vec_kx[index] + vec_ky[index+x] + vec_ky[index] + vec_kz[index+x*y] + vec_kz[index]); a_col_index[coef_index++] = index; if(ii < z-halo_depth) { a_non_zeros[coef_index] = -vec_kz[index+x*y]; a_col_index[coef_index++] = index+x*y; } if(jj < y-halo_depth) { a_non_zeros[coef_index] = -vec_ky[index+x]; a_col_index[coef_index++] = index+x; } if(kk < x-halo_depth) { a_non_zeros[coef_index] = -vec_kx[index+1]; a_col_index[coef_index] = index+1; } } } } double rro_temp = 0.0; int m = x*y*z; mkl_cspblas_dcsrgemv( "n", &m, a_non_zeros, a_row_index, a_col_index, vec_u, vec_w); int x_inner = x-2*halo_depth; #pragma omp parallel for reduction(+:rro_temp) for(int ii = halo_depth; ii < z-halo_depth; ++ii) { for(int jj = halo_depth; jj < y-halo_depth; ++jj) { const int offset = ii*y*x + jj*x + halo_depth; cblas_dcopy(x_inner, vec_u + offset, 1, vec_r + offset, 1); cblas_daxpy(x_inner, -1.0, vec_w + offset, 1, vec_r + offset, 1); cblas_dcopy(x_inner, vec_r + offset, 1, vec_p + offset, 1); rro_temp += cblas_ddot(x_inner, vec_r + offset, 1, vec_p + offset, 1); } } // Sum locally *rro += rro_temp; }
/*
 * Solves A*solution = rhs with MKL's reverse-communication (RCI) CG solver,
 * without preconditioning. A is given in CSR form (a, ia, ja) and applied
 * with mkl_cspblas_dcsrgemv.
 *
 * n         problem size
 * solution  in: initial guess, out: computed solution
 * rhs       right-hand side
 * max_iter  iteration limit handed to the solver
 * r_tol     relative residual tolerance (stored squared in dpar[0])
 * a_tol     absolute residual tolerance (stored squared in dpar[1])
 *
 * Prints the final residual norm and iteration count to stdout.
 *
 * Returns 0 when the post-solve check passes, 1 otherwise (solver error,
 * allocation failure, or check not satisfied). Work buffers are released on
 * every path.
 */
int cg_mkl_double(MKL_INT n, double a[], MKL_INT ia[], MKL_INT ja[],
                  double solution[], double rhs[],
                  MKL_INT max_iter, double r_tol, double a_tol)
{
    MKL_INT rci_request = 0;
    MKL_INT itercount = 0;
    MKL_INT incx = 1;
    MKL_INT i;

    /* parameter arrays for solver */
    MKL_INT ipar[128];
    double dpar[128];

    double euclidean_norm;
    char tr = 'n';      /* for SpMV */
    int solved = 0;     /* becomes 1 once dcg reports rci_request == 0 */
    int status = 1;

    /* The solver gets a 4*n work array; residual holds A*solution - rhs. */
    double *tmp = (double *) malloc(4 * (size_t) n * sizeof(double));
    double *residual = (double *) malloc((size_t) n * sizeof(double));
    if (tmp == NULL || residual == NULL) {
        fprintf(stderr, "cg_mkl_double: failed to allocate work arrays\n");
        free(tmp);
        free(residual);
        return 1;
    }

    /* initialize the solver */
    dcg_init(&n, solution, rhs, &rci_request, ipar, dpar, tmp);

    if (rci_request == 0) {
        ipar[1] = 6;            /* output all warnings and errors */
        ipar[4] = max_iter;     /* maximum number of iterations */
        ipar[7] = 1;            /* stop iteration at maximum iterations */
        ipar[8] = 1;            /* residual stopping test */
        ipar[9] = 0;            /* request for the user defined stopping test */
        dpar[0] = r_tol * r_tol;    /* relative residual tolerance */
        dpar[1] = a_tol * a_tol;    /* absolute residual tolerance */

        /* Check the correctness and consistency of the newly set parameters */
        dcg_check(&n, solution, rhs, &rci_request, ipar, dpar, tmp);
    }

    if (rci_request == 0) {
        /*
         * Reverse communication: while the solver asks for it
         * (rci_request == 1), compute A*tmp[0..n) into tmp[n..2n).
         * 0 means the solution was found; anything else is a failure.
         */
        for (;;) {
            dcg(&n, solution, rhs, &rci_request, ipar, dpar, tmp);
            if (rci_request != 1)
                break;
            mkl_cspblas_dcsrgemv(&tr, &n, a, ia, ja, tmp, &tmp[n]);
        }
        solved = (rci_request == 0);
    }

    if (solved) {
        /* Get the current iteration number into itercount */
        dcg_get(&n, solution, rhs, &rci_request, ipar, dpar, tmp, &itercount);

        /* Recompute the true residual A*solution - rhs and its 2-norm. */
        mkl_cspblas_dcsrgemv(&tr, &n, a, ia, ja, solution, residual);
        for (i = 0; i < n; i++)
            residual[i] -= rhs[i];
        euclidean_norm = dnrm2(&n, residual, &incx);

        /* MKL_INT may be wider than int, so print through long long. */
        printf("\nMKL CG reached %e residual in %lld iterations\n",
               euclidean_norm, (long long) itercount);

        /* NOTE(review): the threshold uses dpar[4] and dpar[5]; confirm
         * against the MKL dpar layout that this is the intended criterion. */
        if (itercount <= max_iter &&
            (euclidean_norm * euclidean_norm) < (dpar[0] * dpar[4] + dpar[5])) {
            status = 0;
        }
    } else {
        printf("This example FAILED as the solver has returned the ERROR ");
        printf("code %lld", (long long) rci_request);
    }

    /* Release internal MKL memory and our own buffers on every path. */
    MKL_FreeBuffers();
    free(tmp);
    free(residual);

    return status;
}
int main(int argc, char **argv) { if (argc < 2) { fprintf(stderr, "Usage: reordering_test matrix_in_matrix_market_format\n"); return -1; } CSR *A = new CSR(argv[1], 0, true /* force-symmetric */); int nnz = A->getNnz(); double flops = 2*nnz; double bytes = (sizeof(double) + sizeof(int))*nnz + sizeof(double)*(A->m + A->n); printf("original bandwidth %d\n", A->getBandwidth()); double *x = MALLOC(double, A->m); double *y = MALLOC(double, A->m); // allocate a large buffer to flush out cache bufToFlushLlc = (double *)_mm_malloc(LLC_CAPACITY, 64); const int REPEAT = 128; double times[REPEAT]; for (int i = 0; i < REPEAT; ++i) { flushLlc(); double t = omp_get_wtime(); A->multiplyWithVector(y, x); times[i] = omp_get_wtime() - t; } correctnessCheck(A, y); printf("SpMV BW"); printEfficiency(times, REPEAT, flops, bytes); #ifdef MKL for (int i = 0; i < REPEAT; ++i) { flushLlc(); double t = omp_get_wtime(); mkl_cspblas_dcsrgemv( "N", &A->m, A->values, A->rowptr, A->colidx, x, y); times[i] = omp_get_wtime() - t; } correctnessCheck(A, y); printf("MKL SpMV BW"); printEfficiency(times, REPEAT, flops, bytes); #endif int *perm = MALLOC(int, A->m); int *inversePerm = MALLOC(int, A->m); for (int o = BFS; o <= RCM; ++o) { Option option = (Option)o; switch (option) { case BFS: printf("\nBFS reordering\n"); break; case RCM_WO_SOURCE_SELECTION: printf("\nRCM reordering w/o source selection heuristic\n"); break; case RCM: printf("\nRCM reordering\n"); break; default: assert(false); break; } double t = -omp_get_wtime(); switch (option) { case BFS: A->getBFSPermutation(perm, inversePerm); break; case RCM_WO_SOURCE_SELECTION: A->getRCMPermutation(perm, inversePerm, false); break; case RCM: A->getRCMPermutation(perm, inversePerm); break; default: assert(false); break; } t += omp_get_wtime(); printf( "Constructing permutation takes %g (%.2f gbps)\n", t, nnz*4/t/1e9); isPerm(perm, A->m); isPerm(inversePerm, A->m); t = -omp_get_wtime(); CSR *APerm = A->permute(perm, inversePerm); t += 
omp_get_wtime(); printf("Permute takes %g (%.2f gbps)\n", t, bytes/t/1e9); printf("Permuted bandwidth %d\n", APerm->getBandwidth()); for (int i = 0; i < REPEAT; ++i) { flushLlc(); t = omp_get_wtime(); APerm->multiplyWithVector(y, x); times[i] = omp_get_wtime() - t; } printf("SpMV BW"); printEfficiency(times, REPEAT, flops, bytes); delete APerm; } FREE(x); FREE(y); delete A; }