//------------------------------------------------------------------------------------------------------------------------------ void rebuild_operator(level_type * level, level_type *fromLevel, double a, double b){ // form restriction of alpha[], beta_*[] coefficients from fromLevel if(fromLevel != NULL){ #ifdef VECTOR_ALPHA restriction(level,VECTOR_ALPHA ,fromLevel,VECTOR_ALPHA ,RESTRICT_CELL ); #endif restriction(level,VECTOR_BETA_I,fromLevel,VECTOR_BETA_I,RESTRICT_FACE_I); restriction(level,VECTOR_BETA_J,fromLevel,VECTOR_BETA_J,RESTRICT_FACE_J); restriction(level,VECTOR_BETA_K,fromLevel,VECTOR_BETA_K,RESTRICT_FACE_K); } // else case assumes alpha/beta have been set // exchange alpha/beta/... (must be done before calculating Dinv) #ifdef VECTOR_ALPHA exchange_boundary(level,VECTOR_ALPHA ,STENCIL_SHAPE_BOX); // safe #endif exchange_boundary(level,VECTOR_BETA_I,STENCIL_SHAPE_BOX); exchange_boundary(level,VECTOR_BETA_J,STENCIL_SHAPE_BOX); exchange_boundary(level,VECTOR_BETA_K,STENCIL_SHAPE_BOX); // black box rebuild of D^{-1}, l1^{-1}, dominant eigenvalue, ... rebuild_operator_blackbox(level,a,b,2); // exchange Dinv... exchange_boundary(level,VECTOR_DINV ,STENCIL_SHAPE_BOX); // safe }
//------------------------------------------------------------------------------------------------------------------------------ void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ if(NUM_SMOOTHS&1){ fprintf(stderr,"error - NUM_SMOOTHS must be even...\n"); exit(0); } #ifdef USE_L1JACOBI double weight = 1.0; #else double weight = 2.0/3.0; #endif int box,s; for(s=0;s<NUM_SMOOTHS;s++){ // exchange ghost zone data... Jacobi ping pongs between x_id and VECTOR_TEMP if((s&1)==0){exchange_boundary(level, x_id,stencil_get_shape());apply_BCs(level, x_id,stencil_get_shape());} else{exchange_boundary(level,VECTOR_TEMP,stencil_get_shape());apply_BCs(level,VECTOR_TEMP,stencil_get_shape());} // apply the smoother... Jacobi ping pongs between x_id and VECTOR_TEMP double _timeStart = getTime(); const int ghosts = level->box_ghosts; const int jStride = level->box_jStride; const int kStride = level->box_kStride; const int dim = level->box_dim; const double h2inv = 1.0/(level->h*level->h); PRAGMA_THREAD_ACROSS_BOXES(level,box) for(box=0;box<level->num_my_boxes;box++){ int i,j,k; const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain #ifdef USE_L1JACOBI const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride); #else const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_DINV ] 
+ ghosts*(1+jStride+kStride); #endif const double * __restrict__ x_n; double * __restrict__ x_np1; if((s&1)==0){x_n = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); x_np1 = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride);} else{x_n = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride); x_np1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride);} PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) for(k=0;k<dim;k++){ for(j=0;j<dim;j++){ for(i=0;i<dim;i++){ int ijk = i + j*jStride + k*kStride; double Ax_n = apply_op_ijk(x_n); x_np1[ijk] = x_n[ijk] + weight*lambda[ijk]*(rhs[ijk]-Ax_n); }}} } // box-loop level->timers.smooth += (double)(getTime()-_timeStart); } // s-loop }
//------------------------------------------------------------------------------------------------------------------------------ // Samuel Williams // [email protected] // Lawrence Berkeley National Lab //------------------------------------------------------------------------------------------------------------------------------ void apply_op(level_type * level, int Ax_id, int x_id, double a, double b){ // y=Ax // exchange the boundary of x in preparation for Ax exchange_boundary(level,x_id,stencil_is_star_shaped()); apply_BCs(level,x_id); // now do Ax proper... uint64_t _timeStart = CycleTime(); int box; PRAGMA_THREAD_ACROSS_BOXES(level,box) for(box=0;box<level->num_my_boxes;box++){ int i,j,k; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; const int dim = level->my_boxes[box].dim; const double h2inv = 1.0/(level->h*level->h); const double * __restrict__ x = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point double * __restrict__ Ax = level->my_boxes[box].vectors[ Ax_id] + ghosts*(1+jStride+kStride); const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) for(k=0;k<dim;k++){ for(j=0;j<dim;j++){ for(i=0;i<dim;i++){ int ijk = i + j*jStride + k*kStride; Ax[ijk] = apply_op_ijk(x); }}} } level->cycles.apply_op += (uint64_t)(CycleTime()-_timeStart); }
//------------------------------------------------------------------------------------------------------------------------------ void rebuild_operator(level_type * level, level_type *fromLevel, double a, double b){ // form restriction of alpha[], beta_*[] coefficients from fromLevel if(fromLevel != NULL){ #ifdef VECTOR_ALPHA restriction(level,VECTOR_ALPHA ,fromLevel,VECTOR_ALPHA ,RESTRICT_CELL ); #endif restriction(level,VECTOR_BETA_I,fromLevel,VECTOR_BETA_I,RESTRICT_FACE_I); restriction(level,VECTOR_BETA_J,fromLevel,VECTOR_BETA_J,RESTRICT_FACE_J); restriction(level,VECTOR_BETA_K,fromLevel,VECTOR_BETA_K,RESTRICT_FACE_K); } // else case assumes alpha/beta have been set // extrapolate the beta's into the ghost zones (needed for mixed derivatives) extrapolate_betas(level); //initialize_problem(level,level->h,a,b); // approach used for testing smooth beta's; destroys the black box nature of the solver // exchange alpha/beta/... (must be done before calculating Dinv) #ifdef VECTOR_ALPHA exchange_boundary(level,VECTOR_ALPHA ,STENCIL_SHAPE_BOX); // safe #endif exchange_boundary(level,VECTOR_BETA_I,STENCIL_SHAPE_BOX); exchange_boundary(level,VECTOR_BETA_J,STENCIL_SHAPE_BOX); exchange_boundary(level,VECTOR_BETA_K,STENCIL_SHAPE_BOX); // black box rebuild of D^{-1}, l1^{-1}, dominant eigenvalue, ... rebuild_operator_blackbox(level,a,b,4); // exchange Dinv/L1inv/... exchange_boundary(level,VECTOR_DINV ,STENCIL_SHAPE_BOX); // safe #ifdef VECTOR_L1INV exchange_boundary(level,VECTOR_L1INV,STENCIL_SHAPE_BOX); #endif }
//------------------------------------------------------------------------------------------------------------------------------ // Samuel Williams // [email protected] // Lawrence Berkeley National Lab //------------------------------------------------------------------------------------------------------------------------------ void smooth(level_type * level, int phi_id, int rhs_id, double a, double b){ int box,s; for(s=0;s<2*NUM_SMOOTHS;s++){ // there are two sweeps (forward/backward) per GS smooth exchange_boundary(level,phi_id,stencil_get_shape()); apply_BCs(level,phi_id,stencil_get_shape()); double _timeStart = getTime(); #ifdef _OPENMP #pragma omp parallel for private(box) #endif for(box=0;box<level->num_my_boxes;box++){ int i,j,k; const int ghosts = level->box_ghosts; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int dim = level->my_boxes[box].dim; const double h2inv = 1.0/(level->h*level->h); double * __restrict__ phi = level->my_boxes[box].vectors[ phi_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); const double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); if( (s&0x1)==0 ){ // forward sweep... 
hard to thread for(k=0;k<dim;k++){ for(j=0;j<dim;j++){ for(i=0;i<dim;i++){ int ijk = i + j*jStride + k*kStride; double Ax = apply_op_ijk(phi); phi[ijk] = phi[ijk] + Dinv[ijk]*(rhs[ijk]-Ax); }}} }else{ // backward sweep... hard to thread for(k=dim-1;k>=0;k--){ for(j=dim-1;j>=0;j--){ for(i=dim-1;i>=0;i--){ int ijk = i + j*jStride + k*kStride; double Ax = apply_op_ijk(phi); phi[ijk] = phi[ijk] + Dinv[ijk]*(rhs[ijk]-Ax); }}} } } // boxes level->timers.smooth += (double)(getTime()-_timeStart); } // s-loop }
//------------------------------------------------------------------------------------------------------------------------------ // Samuel Williams // [email protected] // Lawrence Berkeley National Lab //------------------------------------------------------------------------------------------------------------------------------ void apply_op(level_type * level, int Ax_id, int x_id, double a, double b){ // y=Ax // exchange the boundary of x in preparation for Ax exchange_boundary(level,x_id,stencil_is_star_shaped()); apply_BCs(level,x_id); // now do Ax proper... uint64_t _timeStart = CycleTime(); int block; PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) for(block=0;block<level->num_my_blocks;block++){ const int box = level->my_blocks[block].read.box; const int ilo = level->my_blocks[block].read.i; const int jlo = level->my_blocks[block].read.j; const int klo = level->my_blocks[block].read.k; const int ihi = level->my_blocks[block].dim.i + ilo; const int jhi = level->my_blocks[block].dim.j + jlo; const int khi = level->my_blocks[block].dim.k + klo; int i,j,k; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; const int dim = level->my_boxes[box].dim; const double h2inv = 1.0/(level->h*level->h); const double * __restrict__ x = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); // i.e. 
[0] = first non ghost zone point double * __restrict__ Ax = level->my_boxes[box].vectors[ Ax_id] + ghosts*(1+jStride+kStride); const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); for(k=klo;k<khi;k++){ for(j=jlo;j<jhi;j++){ for(i=ilo;i<ihi;i++){ int ijk = i + j*jStride + k*kStride; Ax[ijk] = apply_op_ijk(x); }}} } level->cycles.apply_op += (uint64_t)(CycleTime()-_timeStart); }
// Compute the residual res_id = rhs_id - A * x_id on every box this process owns.
// The ghost zones of x_id are exchanged and boundary conditions applied first.
//
//   level  - level the residual is computed on
//   res_id - vector that receives the residual
//   x_id   - current solution vector (its ghost zones are updated as a side effect)
//   rhs_id - right-hand side vector
//   a, b   - not referenced by name here; presumably consumed by the apply_op_ijk() macro - confirm against its definition
//
// Elapsed time is accumulated into level->timers.residual.
void residual(level_type * level, int res_id, int x_id, int rhs_id, double a, double b){
  // exchange the boundary for x in prep for Ax...
  exchange_boundary(level,x_id,stencil_get_shape());
  apply_BCs(level,x_id,stencil_get_shape());

  // now do residual/restriction proper...
  double _timeStart = getTime();
  const int  ghosts = level->box_ghosts;
  const int jStride = level->box_jStride;
  const int kStride = level->box_kStride;
  const int     dim = level->box_dim;
  const double h2inv = 1.0/(level->h*level->h);
  // every pointer below is shifted so that [0] = first non ghost zone point
  const int interior = ghosts*(1+jStride+kStride);
  int box;

  PRAGMA_THREAD_ACROSS_BOXES(level,box)
  for(box=0;box<level->num_my_boxes;box++){
    int i,j,k;
    double * const * const vec = level->my_boxes[box].vectors;

    // alpha/beta_*/h2inv are not referenced by name in the loop below;
    // presumably they are captured by apply_op_ijk() - do not rename them
    const double * __restrict__ x      = vec[         x_id] + interior;
    const double * __restrict__ rhs    = vec[       rhs_id] + interior;
    const double * __restrict__ alpha  = vec[VECTOR_ALPHA ] + interior;
    const double * __restrict__ beta_i = vec[VECTOR_BETA_I] + interior;
    const double * __restrict__ beta_j = vec[VECTOR_BETA_J] + interior;
    const double * __restrict__ beta_k = vec[VECTOR_BETA_K] + interior;
          double * __restrict__ res    = vec[       res_id] + interior;

    PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k)
    for(k=0;k<dim;k++){
    for(j=0;j<dim;j++){
    for(i=0;i<dim;i++){
      int ijk = i + j*jStride + k*kStride;
      double Ax = apply_op_ijk(x);
      res[ijk] = rhs[ijk]-Ax;
    }}}
  }

  level->timers.residual += (double)(getTime()-_timeStart);
}
//------------------------------------------------------------------------------------------------------------------------------ // Samuel Williams // [email protected] // Lawrence Berkeley National Lab //------------------------------------------------------------------------------------------------------------------------------ // Based on Yousef Saad's Iterative Methods for Sparse Linear Algebra, Algorithm 12.1, page 399 //------------------------------------------------------------------------------------------------------------------------------ void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ if((CHEBYSHEV_DEGREE*NUM_SMOOTHS)&1){ fprintf(stderr,"error... CHEBYSHEV_DEGREE*NUM_SMOOTHS must be even for the chebyshev smoother...\n"); exit(0); } if( (level->dominant_eigenvalue_of_DinvA<=0.0) && (level->my_rank==0) )fprintf(stderr,"dominant_eigenvalue_of_DinvA <= 0.0 !\n"); //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - int s; int block; // compute the Chebyshev coefficients... double beta = 1.000*level->dominant_eigenvalue_of_DinvA; //double alpha = 0.300000*beta; //double alpha = 0.250000*beta; //double alpha = 0.166666*beta; double alpha = 0.125000*beta; double theta = 0.5*(beta+alpha); // center of the spectral ellipse double delta = 0.5*(beta-alpha); // major axis? double sigma = theta/delta; double rho_n = 1/sigma; // rho_0 double chebyshev_c1[CHEBYSHEV_DEGREE]; // + c1*(x_n-x_nm1) == rho_n*rho_nm1 double chebyshev_c2[CHEBYSHEV_DEGREE]; // + c2*(b-Ax_n) chebyshev_c1[0] = 0.0; chebyshev_c2[0] = 1/theta; for(s=1;s<CHEBYSHEV_DEGREE;s++){ double rho_nm1 = rho_n; rho_n = 1.0/(2.0*sigma - rho_nm1); chebyshev_c1[s] = rho_n*rho_nm1; chebyshev_c2[s] = rho_n*2.0/delta; } for(s=0;s<CHEBYSHEV_DEGREE*NUM_SMOOTHS;s++){ // get ghost zone data... 
Chebyshev ping pongs between x_id and VECTOR_TEMP if((s&1)==0){exchange_boundary(level, x_id,stencil_get_shape());apply_BCs(level, x_id,stencil_get_shape());} else{exchange_boundary(level,VECTOR_TEMP,stencil_get_shape());apply_BCs(level,VECTOR_TEMP,stencil_get_shape());} // apply the smoother... Chebyshev ping pongs between x_id and VECTOR_TEMP double _timeStart = getTime(); PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) for(block=0;block<level->num_my_blocks;block++){ const int box = level->my_blocks[block].read.box; const int ilo = level->my_blocks[block].read.i; const int jlo = level->my_blocks[block].read.j; const int klo = level->my_blocks[block].read.k; const int ihi = level->my_blocks[block].dim.i + ilo; const int jhi = level->my_blocks[block].dim.j + jlo; const int khi = level->my_blocks[block].dim.k + klo; int i,j,k; const int ghosts = level->box_ghosts; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const double h2inv = 1.0/(level->h*level->h); const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); #ifdef VECTOR_ALPHA const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); #endif const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); const double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); double * __restrict__ x_np1; const double * __restrict__ x_n; const double * __restrict__ x_nm1; if((s&1)==0){x_n = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); x_nm1 = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride); x_np1 = 
level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride);} else{x_n = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride); x_nm1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); x_np1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride);} const double c1 = chebyshev_c1[s%CHEBYSHEV_DEGREE]; // limit polynomial to degree CHEBYSHEV_DEGREE. const double c2 = chebyshev_c2[s%CHEBYSHEV_DEGREE]; // limit polynomial to degree CHEBYSHEV_DEGREE. for(k=klo;k<khi;k++){ for(j=jlo;j<jhi;j++){ for(i=ilo;i<ihi;i++){ const int ijk = i + j*jStride + k*kStride; // According to Saad... but his was missing a Dinv[ijk] == D^{-1} !!! // x_{n+1} = x_{n} + rho_{n} [ rho_{n-1}(x_{n} - x_{n-1}) + (2/delta)(b-Ax_{n}) ] // x_temp[ijk] = x_n[ijk] + c1*(x_n[ijk]-x_temp[ijk]) + c2*Dinv[ijk]*(rhs[ijk]-Ax_n); const double Ax_n = apply_op_ijk(x_n); x_np1[ijk] = x_n[ijk] + c1*(x_n[ijk]-x_nm1[ijk]) + c2*Dinv[ijk]*(rhs[ijk]-Ax_n); }}} } // box-loop level->timers.smooth += (double)(getTime()-_timeStart); } // s-loop }
//------------------------------------------------------------------------------------------------------------------------------ void rebuild_operator(level_type * level, level_type *fromLevel, double a, double b){ if(level->my_rank==0){fprintf(stdout," rebuilding operator for level... h=%e ",level->h);fflush(stdout);} // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // form restriction of alpha[], beta_*[] coefficients from fromLevel if(fromLevel != NULL){ #ifdef VECTOR_ALPHA restriction(level,VECTOR_ALPHA ,fromLevel,VECTOR_ALPHA ,RESTRICT_CELL ); #endif restriction(level,VECTOR_BETA_I,fromLevel,VECTOR_BETA_I,RESTRICT_FACE_I); restriction(level,VECTOR_BETA_J,fromLevel,VECTOR_BETA_J,RESTRICT_FACE_J); restriction(level,VECTOR_BETA_K,fromLevel,VECTOR_BETA_K,RESTRICT_FACE_K); } // else case assumes alpha/beta have been set // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // exchange alpha/beta/... 
(must be done before calculating Dinv) #ifdef VECTOR_ALPHA exchange_boundary(level,VECTOR_ALPHA ,STENCIL_SHAPE_BOX); // safe #endif exchange_boundary(level,VECTOR_BETA_I,STENCIL_SHAPE_BOX); exchange_boundary(level,VECTOR_BETA_J,STENCIL_SHAPE_BOX); exchange_boundary(level,VECTOR_BETA_K,STENCIL_SHAPE_BOX); // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // calculate Dinv, L1inv, and estimate the dominant Eigenvalue double _timeStart = getTime(); int block; double dominant_eigenvalue = -1e9; PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,block,level->num_my_blocks,dominant_eigenvalue) for(block=0;block<level->num_my_blocks;block++){ const int box = level->my_blocks[block].read.box; const int ilo = level->my_blocks[block].read.i; const int jlo = level->my_blocks[block].read.j; const int klo = level->my_blocks[block].read.k; const int ihi = level->my_blocks[block].dim.i + ilo; const int jhi = level->my_blocks[block].dim.j + jlo; const int khi = level->my_blocks[block].dim.k + klo; int i,j,k; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; double h2inv = 1.0/(level->h*level->h); #ifdef VECTOR_ALPHA double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); #endif double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); #ifdef VECTOR_L1INV double * __restrict__ L1inv = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride); #endif double block_eigenvalue = -1e9; for(k=klo;k<khi;k++){ 
for(j=jlo;j<jhi;j++){ for(i=ilo;i<ihi;i++){ int ijk = i + j*jStride + k*kStride; // used for quick linear approximation to zero dirichlet BC double ilo_is_valid =1.0; double ihi_is_valid =1.0; double jlo_is_valid =1.0; double jhi_is_valid =1.0; double klo_is_valid =1.0; double khi_is_valid =1.0; if(level->boundary_condition.type != BC_PERIODIC){ if(level->my_boxes[box].low.i+i-1 < 0)ilo_is_valid = 0.0; if(level->my_boxes[box].low.j+j-1 < 0)jlo_is_valid = 0.0; if(level->my_boxes[box].low.k+k-1 < 0)klo_is_valid = 0.0; if(level->my_boxes[box].low.i+i+1 >= level->dim.i)ihi_is_valid = 0.0; if(level->my_boxes[box].low.j+j+1 >= level->dim.j)jhi_is_valid = 0.0; if(level->my_boxes[box].low.k+k+1 >= level->dim.k)khi_is_valid = 0.0; } #ifdef STENCIL_VARIABLE_COEFFICIENT // radius of Gershgorin disc is the sum of the absolute values of the off-diagonal elements... double sumAbsAij = fabs(b*h2inv) * ( fabs( beta_i[ijk ]*ilo_is_valid )+ fabs( beta_j[ijk ]*jlo_is_valid )+ fabs( beta_k[ijk ]*klo_is_valid )+ fabs( beta_i[ijk+1 ]*ihi_is_valid )+ fabs( beta_j[ijk+jStride]*jhi_is_valid )+ fabs( beta_k[ijk+kStride]*khi_is_valid ) ); // center of Gershgorin disc is the diagonal element... double Aii = -b*h2inv*( beta_i[ijk ]*( ilo_is_valid-2.0 )+ beta_j[ijk ]*( jlo_is_valid-2.0 )+ beta_k[ijk ]*( klo_is_valid-2.0 )+ beta_i[ijk+1 ]*( ihi_is_valid-2.0 )+ beta_j[ijk+jStride]*( jhi_is_valid-2.0 )+ beta_k[ijk+kStride]*( khi_is_valid-2.0 ) ); #ifdef VECTOR_ALPHA Aii += a*alpha[ijk]; #endif #else // Constant coefficient versions with fused BC's... // radius of Gershgorin disc is the sum of the absolute values of the off-diagonal elements... double sumAbsAij = fabs(b*h2inv) * ( ilo_is_valid + jlo_is_valid + klo_is_valid + ihi_is_valid + jhi_is_valid + khi_is_valid ); // center of Gershgorin disc is the diagonal element... 
double Aii = a - b*h2inv*( ilo_is_valid + jlo_is_valid + klo_is_valid + ihi_is_valid + jhi_is_valid + khi_is_valid - 12.0 ); #endif Dinv[ijk] = 1.0/Aii; // inverse of the diagonal Aii double Di = (Aii + sumAbsAij)/Aii;if(Di>block_eigenvalue)block_eigenvalue=Di; // upper limit to Gershgorin disc == bound on dominant eigenvalue #ifdef VECTOR_L1INV //L1inv[ijk] = 1.0/(Aii+sumAbsAij); // inverse of the L1 row norm... L1inv = ( D+D^{L1} )^{-1} if(Aii>=1.5*sumAbsAij)L1inv[ijk] = 1.0/(Aii ); // as suggested by eq 6.5 in Baker et al, "Multigrid smoothers for ultra-parallel computing: additional theory and discussion"... else L1inv[ijk] = 1.0/(Aii+0.5*sumAbsAij); // #endif }}} if(block_eigenvalue>dominant_eigenvalue){dominant_eigenvalue = block_eigenvalue;} } level->timers.blas1 += (double)(getTime()-_timeStart); // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // Reduce the local estimates dominant eigenvalue to a global estimate #ifdef USE_MPI double _timeStartAllReduce = getTime(); double send = dominant_eigenvalue; MPI_Allreduce(&send,&dominant_eigenvalue,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD); double _timeEndAllReduce = getTime(); level->timers.collectives += (double)(_timeEndAllReduce-_timeStartAllReduce); #endif if(level->my_rank==0){fprintf(stdout,"eigenvalue_max<%e\n",dominant_eigenvalue);} level->dominant_eigenvalue_of_DinvA = dominant_eigenvalue; // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // exchange Dinv/L1inv/... exchange_boundary(level,VECTOR_DINV ,STENCIL_SHAPE_BOX); // safe #ifdef VECTOR_L1INV exchange_boundary(level,VECTOR_L1INV,STENCIL_SHAPE_BOX); #endif // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - }
//------------------------------------------------------------------------------------------------------------------------------ void rebuild_operator(level_type * level, level_type *fromLevel, double a, double b){ if(level->my_rank==0){printf(" rebuilding operator for level... h=%e ",level->h);fflush(stdout);} // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // form restriction of alpha[], beta_*[] coefficients from fromLevel if(fromLevel != NULL){ restriction(level,VECTOR_ALPHA ,fromLevel,VECTOR_ALPHA ,RESTRICT_CELL ); restriction(level,VECTOR_BETA_I,fromLevel,VECTOR_BETA_I,RESTRICT_FACE_I); restriction(level,VECTOR_BETA_J,fromLevel,VECTOR_BETA_J,RESTRICT_FACE_J); restriction(level,VECTOR_BETA_K,fromLevel,VECTOR_BETA_K,RESTRICT_FACE_K); } // else case assumes alpha/beta have been set // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // exchange alpha/beta/... 
(must be done before calculating Dinv) exchange_boundary(level,VECTOR_ALPHA ,0); // must be 0(faces,edges,corners) for CA version or 27pt exchange_boundary(level,VECTOR_BETA_I,0); exchange_boundary(level,VECTOR_BETA_J,0); exchange_boundary(level,VECTOR_BETA_K,0); // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // calculate Dinv, L1inv, and estimate the dominant Eigenvalue uint64_t _timeStart = CycleTime(); int printedError=0; int box; double dominant_eigenvalue = -1e9; #pragma omp parallel for private(box) OMP_THREAD_ACROSS_BOXES(level->concurrent_boxes) reduction(max:dominant_eigenvalue) schedule(static) for(box=0;box<level->num_my_boxes;box++){ int i,j,k; int lowi = level->my_boxes[box].low.i; int lowj = level->my_boxes[box].low.j; int lowk = level->my_boxes[box].low.k; int jStride = level->my_boxes[box].jStride; int kStride = level->my_boxes[box].kStride; int ghosts = level->my_boxes[box].ghosts; int dim = level->my_boxes[box].dim; double h2inv = 1.0/(level->h*level->h); double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); double * __restrict__ L1inv = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride); double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); double box_eigenvalue = -1e9; #pragma omp parallel for private(k,j,i) OMP_THREAD_WITHIN_A_BOX(level->threads_per_box) reduction(max:box_eigenvalue) schedule(static) for(k=0;k<dim;k++){ 
for(j=0;j<dim;j++){ for(i=0;i<dim;i++){ int ijk = i + j*jStride + k*kStride; #if 0 // FIX This looks wrong, but is faster... theory is because its doing something akin to SOR // radius of Gershgorin disc is the sum of the absolute values of the off-diagonal elements... double sumAbsAij = fabs(b*h2inv*beta_i[ijk]) + fabs(b*h2inv*beta_i[ijk+ 1]) + fabs(b*h2inv*beta_j[ijk]) + fabs(b*h2inv*beta_j[ijk+jStride]) + fabs(b*h2inv*beta_k[ijk]) + fabs(b*h2inv*beta_k[ijk+kStride]); // centr of Gershgorin disc is the diagonal element... double Aii = a*alpha[ijk] - b*h2inv*( -beta_i[ijk]-beta_i[ijk+ 1] -beta_j[ijk]-beta_j[ijk+jStride] -beta_k[ijk]-beta_k[ijk+kStride] ); #endif #if 1 // radius of Gershgorin disc is the sum of the absolute values of the off-diagonal elements... double sumAbsAij = fabs(b*h2inv) * ( fabs( beta_i[ijk ]*valid[ijk-1 ] )+ fabs( beta_j[ijk ]*valid[ijk-jStride] )+ fabs( beta_k[ijk ]*valid[ijk-kStride] )+ fabs( beta_i[ijk+1 ]*valid[ijk+1 ] )+ fabs( beta_j[ijk+jStride]*valid[ijk+jStride] )+ fabs( beta_k[ijk+kStride]*valid[ijk+kStride] ) ); // centr of Gershgorin disc is the diagonal element... double Aii = a*alpha[ijk] - b*h2inv*( beta_i[ijk ]*( valid[ijk-1 ]-2.0 )+ beta_j[ijk ]*( valid[ijk-jStride]-2.0 )+ beta_k[ijk ]*( valid[ijk-kStride]-2.0 )+ beta_i[ijk+1 ]*( valid[ijk+1 ]-2.0 )+ beta_j[ijk+jStride]*( valid[ijk+jStride]-2.0 )+ beta_k[ijk+kStride]*( valid[ijk+kStride]-2.0 ) ); #endif Dinv[ijk] = 1.0/Aii; // inverse of the diagonal Aii //L1inv[ijk] = 1.0/(Aii+sumAbsAij); // inverse of the L1 row norm // L1inv = ( D+D^{L1} )^{-1} // as suggested by eq 6.5 in Baker et al, "Multigrid smoothers for ultra-parallel computing: additional theory and discussion"... 
if(Aii>=1.5*sumAbsAij)L1inv[ijk] = 1.0/(Aii ); // else L1inv[ijk] = 1.0/(Aii+0.5*sumAbsAij); // double Di = (Aii + sumAbsAij)/Aii;if(Di>box_eigenvalue)box_eigenvalue=Di; // upper limit to Gershgorin disc == bound on dominant eigenvalue }}} if(box_eigenvalue>dominant_eigenvalue){dominant_eigenvalue = box_eigenvalue;} } level->cycles.blas1 += (uint64_t)(CycleTime()-_timeStart); // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // Reduce the local estimates dominant eigenvalue to a global estimate #ifdef USE_MPI uint64_t _timeStartAllReduce = CycleTime(); double send = dominant_eigenvalue; MPI_Allreduce(&send,&dominant_eigenvalue,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD); uint64_t _timeEndAllReduce = CycleTime(); level->cycles.collectives += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); #endif if(level->my_rank==0){printf("eigenvalue_max<%e\n",dominant_eigenvalue);fflush(stdout);} level->dominant_eigenvalue_of_DinvA = dominant_eigenvalue; // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // exchange Dinv/L1inv/... exchange_boundary(level,VECTOR_DINV ,0); // must be 0(faces,edges,corners) for CA version exchange_boundary(level,VECTOR_L1INV,0); // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - }
//------------------------------------------------------------------------------------------------------------------------------
// smooth() - Gauss-Seidel Red-Black (GSRB) smoother, flux/wavefront formulation.
//
//   level  : level whose boxes/blocks are smoothed (also owns the lazily allocated flux scratch buffer)
//   x_id   : vector id of the solution; read on even sweeps, written on odd sweeps
//   rhs_id : vector id of the right-hand side (read only)
//   a, b   : operator coefficients; 'a' is only used when USE_HELMHOLTZ is defined
//
// Performs 2*NUM_SMOOTHS sweeps.  Sweeps ping-pong between x_id and VECTOR_TEMP: even sweeps read x_id and
// write VECTOR_TEMP, odd sweeps read VECTOR_TEMP and write x_id.  As the sweep count is even, the last sweep
// writes x_id.  Before every sweep the vector about to be read has its ghost zones exchanged and its
// boundary conditions applied.  The time spent in the sweep itself (not the exchange) is added to
// level->timers.smooth.
//
// NOTE(review): beta_dxdi/beta_dxdj/beta_dxdk are defined outside this block and are only handed (x,index).
// They presumably pick up beta_i/beta_j/beta_k (and possibly jStride/kStride/h2inv) from the enclosing scope
// by name, so those locals must not be renamed or removed without checking the macro definitions.
void smooth(level_type * level, int x_id, int rhs_id, double a, double b){
  // allocate a buffer to hold fluxes...
  // done once (only while level->fluxes is NULL) and not released in this function.
  // size = 4 planes per thread, each (BLOCKCOPY_TILE_J+1) rows of box_jStride doubles, plus BOX_ALIGN_JSTRIDE doubles of slack for alignment
  // NOTE(review): the result of MALLOC is not checked here - confirm MALLOC handles allocation failure.
  if(level->fluxes==NULL)level->fluxes = (double*)MALLOC( ( (4*level->num_threads)*(BLOCKCOPY_TILE_J+1)*(level->box_jStride) + BOX_ALIGN_JSTRIDE)*sizeof(double) );
  // align fluxes to BOX_ALIGN_JSTRIDE
  double * __restrict__ fluxes_aligned = level->fluxes;
  // NOTE(review): '*' binds tighter than '&', so the mask below is (BOX_ALIGN_JSTRIDE-1)*sizeof(double), not BOX_ALIGN_JSTRIDE*sizeof(double)-1.
  // That equals the byte misalignment only if the pointer is already a multiple of sizeof(double) and BOX_ALIGN_JSTRIDE is a power of two - confirm both hold.
  uint64_t unaligned_by = (uint64_t)(fluxes_aligned) & (BOX_ALIGN_JSTRIDE-1)*sizeof(double);
  // round up to the next aligned address; the slack allocated above leaves room for this
  if(unaligned_by)fluxes_aligned = (double*)( (uint64_t)(fluxes_aligned) + BOX_ALIGN_JSTRIDE*sizeof(double) - unaligned_by );

  int s;for(s=0;s<2*NUM_SMOOTHS;s++){ // there are two sweeps per GSRB smooth

    // exchange the ghost zone...
    // (always for the vector that this sweep reads from)
    if((s&1)==0){
      exchange_boundary(level, x_id,stencil_get_shape());
      apply_BCs(level, x_id,stencil_get_shape());
    }else{
      exchange_boundary(level,VECTOR_TEMP,stencil_get_shape());
      apply_BCs(level,VECTOR_TEMP,stencil_get_shape());
    }

    // apply the smoother...
    double _timeStart = getTime();
    double h2inv = 1.0/(level->h*level->h); // not referenced directly below; presumably consumed by the stencil macros - TODO confirm

    // loop over all block/tiles this process owns...
    #ifdef _OPENMP
    #pragma omp parallel if(level->num_my_blocks>1)
    #endif
    {
      int block;
      int threadID=0;
      #ifdef _OPENMP
      threadID=omp_get_thread_num();
      #endif

      // [thread][flux][ij] layout
      // each thread owns four consecutive planes of the scratch buffer: flux_i, flux_j, and two flux_k planes
      // NOTE(review): the buffer was sized with level->num_threads; this assumes the OpenMP team is no larger than that.
      double * __restrict__ flux_i = fluxes_aligned + (4*threadID + 0)*(BLOCKCOPY_TILE_J+1)*(level->box_jStride);
      double * __restrict__ flux_j = fluxes_aligned + (4*threadID + 1)*(BLOCKCOPY_TILE_J+1)*(level->box_jStride);
      double * __restrict__ flux_k[2] = {fluxes_aligned + (4*threadID + 2)*(BLOCKCOPY_TILE_J+1)*(level->box_jStride),
                                         fluxes_aligned + (4*threadID + 3)*(BLOCKCOPY_TILE_J+1)*(level->box_jStride)};

      // loop over (cache) blocks...
      #ifdef _OPENMP
      #pragma omp for schedule(static,1)
      #endif
      for(block=0;block<level->num_my_blocks;block++){
        const int box = level->my_blocks[block].read.box;
        const int jlo = level->my_blocks[block].read.j;
        const int klo = level->my_blocks[block].read.k;
        const int jdim = level->my_blocks[block].dim.j;
        const int kdim = level->my_blocks[block].dim.k;

        const int ghosts = level->my_boxes[box].ghosts;
        const int jStride = level->my_boxes[box].jStride;
        const int kStride = level->my_boxes[box].kStride;

        // every pointer below is moved past the ghost zone and then to row jlo / plane klo of this block,
        // so index 0 addresses the block's first (i=0) element
        const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
        #ifdef VECTOR_ALPHA
        const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
        #else
        const double * __restrict__ alpha = NULL; // only dereferenced when USE_HELMHOLTZ is defined
        #endif
        const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
        const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
        const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
        const double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
        // ping-pong: even sweeps read x_id and write VECTOR_TEMP, odd sweeps do the reverse
        const double * __restrict__ x_n;
        double * __restrict__ x_np1;
        if((s&1)==0){x_n = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
                     x_np1 = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);}
        else{x_n = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
             x_np1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);}

        // compiler-specific alignment / trip-count hints; these do not change the computation
        #ifdef __INTEL_COMPILER
        // superfluous with OMP4 simd (?)
        //__assume_aligned(x_n ,BOX_ALIGN_JSTRIDE*sizeof(double));
        //__assume_aligned(x_np1 ,BOX_ALIGN_JSTRIDE*sizeof(double));
        //__assume_aligned(rhs ,BOX_ALIGN_JSTRIDE*sizeof(double));
        //__assume_aligned(alpha ,BOX_ALIGN_JSTRIDE*sizeof(double));
        //__assume_aligned(beta_i ,BOX_ALIGN_JSTRIDE*sizeof(double));
        //__assume_aligned(beta_j ,BOX_ALIGN_JSTRIDE*sizeof(double));
        //__assume_aligned(beta_k ,BOX_ALIGN_JSTRIDE*sizeof(double));
        //__assume_aligned(Dinv ,BOX_ALIGN_JSTRIDE*sizeof(double));
        //__assume_aligned(flux_i ,BOX_ALIGN_JSTRIDE*sizeof(double));
        //__assume_aligned(flux_j ,BOX_ALIGN_JSTRIDE*sizeof(double));
        //__assume_aligned(flux_k[0],BOX_ALIGN_JSTRIDE*sizeof(double));
        //__assume_aligned(flux_k[1],BOX_ALIGN_JSTRIDE*sizeof(double));
        __assume( jStride % BOX_ALIGN_JSTRIDE == 0); // e.g. jStride%4==0 or jStride%8==0, hence x+jStride is aligned
        __assume( kStride % BOX_ALIGN_JSTRIDE == 0);
        __assume( jStride >= BOX_ALIGN_JSTRIDE);
        __assume( kStride >= 3*BOX_ALIGN_JSTRIDE);
        __assume( jdim > 0);
        __assume( kdim > 0);
        #elif __xlC__
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), rhs );
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), alpha );
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), beta_i );
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), beta_j );
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), beta_k );
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), Dinv );
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), x_n );
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), x_np1 );
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), flux_i );
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), flux_j );
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), flux_k[0]);
        __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), flux_k[1]);
        #endif

        int ij,k;
        // all ij loops below run over jdim whole rows (jdim*jStride entries), i.e. including each row's padding/ghost columns
        // NOTE(review): presumably the values produced for those non-interior entries are overwritten by the next exchange_boundary/apply_BCs - confirm.
        double * __restrict__ flux_klo = flux_k[0];
        // startup / prolog... calculate flux_klo (bottom of cell)...
        // seeds plane 0 of the double buffer so that iteration k==0 of the wavefront loop finds its bottom flux there
        #if (_OPENMP>=201307)
        #pragma omp simd aligned(beta_k,x_n,flux_klo:BOX_ALIGN_JSTRIDE*sizeof(double))
        #endif
        for(ij=0;ij<jdim*jStride;ij++){
          flux_klo[ij] = beta_dxdk(x_n,ij); // k==0
        }

        // wavefront loop...
        for(k=0;k<kdim;k++){
          // these shadow the outer flux_klo: the two flux_k planes swap roles every k,
          // so the top flux computed for plane k is reused as the bottom flux of plane k+1
          double * __restrict__ flux_klo = flux_k[(k )&0x1];
          double * __restrict__ flux_khi = flux_k[(k+1)&0x1];

          // calculate flux_i and flux_j together
          #if (_OPENMP>=201307)
          #pragma omp simd aligned(beta_i,beta_j,x_n,flux_i,flux_j:BOX_ALIGN_JSTRIDE*sizeof(double))
          #endif
          for(ij=0;ij<jdim*jStride;ij++){
            int ijk = ij + k*kStride;
            flux_i[ij] = beta_dxdi(x_n,ijk);
            flux_j[ij] = beta_dxdj(x_n,ijk);
          }

          // calculate flux_jhi
          // one extra row of flux_j, because the update below reads flux_j[ij+jStride]
          #if (_OPENMP>=201307)
          #pragma omp simd aligned(beta_j,x_n,flux_j:BOX_ALIGN_JSTRIDE*sizeof(double))
          #endif
          for(ij=jdim*jStride;ij<(jdim+1)*jStride;ij++){
            int ijk = ij + k*kStride;
            flux_j[ij] = beta_dxdj(x_n,ijk);
          }

          // calculate flux_khi (top of cell)
          #if (_OPENMP>=201307)
          #pragma omp simd aligned(beta_k,x_n,flux_khi:BOX_ALIGN_JSTRIDE*sizeof(double))
          #endif
          for(ij=0;ij<jdim*jStride;ij++){
            int ijk = ij + k*kStride;
            flux_khi[ij] = beta_dxdk(x_n,ijk+kStride); // k+1
          }

          // only the low bit of color000 is used (via (k^color000)&0x1); it folds in the box origin, the block offset and the sweep number
          const int color000 = (level->my_boxes[box].low.i^level->my_boxes[box].low.j^level->my_boxes[box].low.k^jlo^klo^s); // is element 000 of this *BLOCK* 000 red or black on this sweep
          // the parity bit shifts the start of the mask by one row (jStride)
          const double * __restrict__ RedBlack = level->RedBlack_FP + ghosts*(1+jStride) + jStride*((k^color000)&0x1); // Red/Black pencils... presumes ghost zones were correctly colored

          // update: RedBlack[ij] scales the correction, so wherever it is zero x_np1 is just a copy of x_n
          #if (_OPENMP>=201307)
          #pragma omp simd aligned(flux_i,flux_j,flux_klo,flux_khi,alpha,rhs,Dinv,x_n,x_np1,RedBlack:BOX_ALIGN_JSTRIDE*sizeof(double))
          #endif
          #ifdef __INTEL_COMPILER
          #pragma vector nontemporal // generally, we don't expect to reuse x_np1
          #endif
          for(ij=0;ij<jdim*jStride;ij++){
            int ijk = ij + k*kStride;
            // Lx = sum over the three directions of (high-face flux - low-face flux)
            double Lx = - flux_i[ ij] + flux_i[ ij+ 1]
                        - flux_j[ ij] + flux_j[ ij+jStride]
                        - flux_klo[ij] + flux_khi[ij ];
            #ifdef USE_HELMHOLTZ
            double Ax = a*alpha[ijk]*x_n[ijk] - b*Lx;
            #else
            double Ax = -b*Lx;
            #endif
            x_np1[ijk] = x_n[ijk] + RedBlack[ij]*Dinv[ijk]*(rhs[ijk]-Ax);
          }

        } // kdim

      } // block
    } // omp
    level->timers.smooth += (double)(getTime()-_timeStart);

  } // s-loop
}
//------------------------------------------------------------------------------------------------------------------------------ void rebuild_operator(level_type * level, level_type *fromLevel, double a, double b){ if(level->my_rank==0){fprintf(stdout," rebuilding 27pt CC operator for level... h=%e ",level->h);} // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // form restriction of alpha[], beta_*[] coefficients from fromLevel if(fromLevel != NULL){ restriction(level,VECTOR_ALPHA ,fromLevel,VECTOR_ALPHA ,RESTRICT_CELL ); restriction(level,VECTOR_BETA_I,fromLevel,VECTOR_BETA_I,RESTRICT_FACE_I); restriction(level,VECTOR_BETA_J,fromLevel,VECTOR_BETA_J,RESTRICT_FACE_J); restriction(level,VECTOR_BETA_K,fromLevel,VECTOR_BETA_K,RESTRICT_FACE_K); } // else case assumes alpha/beta have been set // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // exchange alpha/beta/... 
(must be done before calculating Dinv) exchange_boundary(level,VECTOR_ALPHA ,0); // must be 0(faces,edges,corners) for CA version or 27pt exchange_boundary(level,VECTOR_BETA_I,0); exchange_boundary(level,VECTOR_BETA_J,0); exchange_boundary(level,VECTOR_BETA_K,0); // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // calculate Dinv, L1inv, and estimate the dominant Eigenvalue uint64_t _timeStart = CycleTime(); int block; double dominant_eigenvalue = -1e9; PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,block,level->num_my_blocks,dominant_eigenvalue) for(block=0;block<level->num_my_blocks;block++){ const int box = level->my_blocks[block].read.box; const int ilo = level->my_blocks[block].read.i; const int jlo = level->my_blocks[block].read.j; const int klo = level->my_blocks[block].read.k; const int ihi = level->my_blocks[block].dim.i + ilo; const int jhi = level->my_blocks[block].dim.j + jlo; const int khi = level->my_blocks[block].dim.k + klo; int i,j,k; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; double h2inv = 1.0/(level->h*level->h); double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); double * __restrict__ L1inv = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride); double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); double block_eigenvalue = -1e9; 
for(k=klo;k<khi;k++){ for(j=jlo;j<jhi;j++){ for(i=ilo;i<ihi;i++){ int ijk = i + j*jStride + k*kStride; // radius of Gershgorin disc is the sum of the absolute values of the off-diagonal elements... double sumAbsAij = fabs(b*h2inv*6.0*STENCIL_COEF1) + fabs(b*h2inv*12.0*STENCIL_COEF2) + fabs(b*h2inv*8.0*STENCIL_COEF3); // center of Gershgorin disc is the diagonal element... double Aii = a - b*h2inv*( STENCIL_COEF0 ); Dinv[ijk] = 1.0/Aii; // inverse of the diagonal Aii //L1inv[ijk] = 1.0/(Aii+sumAbsAij); // inverse of the L1 row norm... L1inv = ( D+D^{L1} )^{-1} // as suggested by eq 6.5 in Baker et al, "Multigrid smoothers for ultra-parallel computing: additional theory and discussion"... if(Aii>=1.5*sumAbsAij)L1inv[ijk] = 1.0/(Aii ); // else L1inv[ijk] = 1.0/(Aii+0.5*sumAbsAij); // double Di = (Aii + sumAbsAij)/Aii;if(Di>block_eigenvalue)block_eigenvalue=Di; // upper limit to Gershgorin disc == bound on dominant eigenvalue }}} if(block_eigenvalue>dominant_eigenvalue){dominant_eigenvalue = block_eigenvalue;} } level->cycles.blas1 += (uint64_t)(CycleTime()-_timeStart); // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // Reduce the local estimates dominant eigenvalue to a global estimate #ifdef USE_MPI uint64_t _timeStartAllReduce = CycleTime(); double send = dominant_eigenvalue; MPI_Allreduce(&send,&dominant_eigenvalue,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD); uint64_t _timeEndAllReduce = CycleTime(); level->cycles.collectives += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); #endif if(level->my_rank==0){fprintf(stdout,"eigenvalue_max<%e\n",dominant_eigenvalue);} level->dominant_eigenvalue_of_DinvA = dominant_eigenvalue; // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // exchange Dinv/L1inv/... 
exchange_boundary(level,VECTOR_DINV ,0); // must be 0(faces,edges,corners) for CA version exchange_boundary(level,VECTOR_L1INV,0); // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - }