//------------------------------------------------------------------------------------------------------------------------------ void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ if(NUM_SMOOTHS&1){ fprintf(stderr,"error - NUM_SMOOTHS must be even...\n"); exit(0); } #ifdef USE_L1JACOBI double weight = 1.0; #else double weight = 2.0/3.0; #endif int box,s; for(s=0;s<NUM_SMOOTHS;s++){ // exchange ghost zone data... Jacobi ping pongs between x_id and VECTOR_TEMP if((s&1)==0){exchange_boundary(level, x_id,stencil_get_shape());apply_BCs(level, x_id,stencil_get_shape());} else{exchange_boundary(level,VECTOR_TEMP,stencil_get_shape());apply_BCs(level,VECTOR_TEMP,stencil_get_shape());} // apply the smoother... Jacobi ping pongs between x_id and VECTOR_TEMP double _timeStart = getTime(); const int ghosts = level->box_ghosts; const int jStride = level->box_jStride; const int kStride = level->box_kStride; const int dim = level->box_dim; const double h2inv = 1.0/(level->h*level->h); PRAGMA_THREAD_ACROSS_BOXES(level,box) for(box=0;box<level->num_my_boxes;box++){ int i,j,k; const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain #ifdef USE_L1JACOBI const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride); #else const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_DINV ] 
+ ghosts*(1+jStride+kStride); #endif const double * __restrict__ x_n; double * __restrict__ x_np1; if((s&1)==0){x_n = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); x_np1 = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride);} else{x_n = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride); x_np1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride);} PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) for(k=0;k<dim;k++){ for(j=0;j<dim;j++){ for(i=0;i<dim;i++){ int ijk = i + j*jStride + k*kStride; double Ax_n = apply_op_ijk(x_n); x_np1[ijk] = x_n[ijk] + weight*lambda[ijk]*(rhs[ijk]-Ax_n); }}} } // box-loop level->timers.smooth += (double)(getTime()-_timeStart); } // s-loop }
//------------------------------------------------------------------------------------------------------------------------------ // Samuel Williams // [email protected] // Lawrence Berkeley National Lab //------------------------------------------------------------------------------------------------------------------------------ void apply_op(level_type * level, int Ax_id, int x_id, double a, double b){ // y=Ax // exchange the boundary of x in preparation for Ax exchange_boundary(level,x_id,stencil_is_star_shaped()); apply_BCs(level,x_id); // now do Ax proper... uint64_t _timeStart = CycleTime(); int box; PRAGMA_THREAD_ACROSS_BOXES(level,box) for(box=0;box<level->num_my_boxes;box++){ int i,j,k; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; const int dim = level->my_boxes[box].dim; const double h2inv = 1.0/(level->h*level->h); const double * __restrict__ x = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point double * __restrict__ Ax = level->my_boxes[box].vectors[ Ax_id] + ghosts*(1+jStride+kStride); const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) for(k=0;k<dim;k++){ for(j=0;j<dim;j++){ for(i=0;i<dim;i++){ int ijk = i + j*jStride + k*kStride; Ax[ijk] = apply_op_ijk(x); }}} } level->cycles.apply_op += (uint64_t)(CycleTime()-_timeStart); }
void display(void) // This function is called automatically, over and over again, by GLUT { int i,j,ip1,jp1,i0,icol,i1,i2,i3,i4,isol; float minvar,maxvar,frac; // set upper and lower limits for plotting minvar=0.0; maxvar=0.2; // do one Lattice Boltzmann step: stream, BC, collide: stream(); apply_BCs(); collide(); // convert the plotvar array into an array of colors to plot // if the mesh point is solid, make it black for (j=0;j<nj;j++){ for (i=0;i<ni;i++){ i0=I2D(ni,i,j); frac=(plotvar[i0]-minvar)/(maxvar-minvar); icol=frac*ncol; isol=(int)solid[i0]; plot_rgba[i0] = isol*cmap_rgba[icol]; } } // Fill the pixel buffer with the plot_rgba array glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB,ni*nj*sizeof(unsigned int), (void **)plot_rgba,GL_STREAM_COPY); // Copy the pixel buffer to the texture, ready to display glTexSubImage2D(GL_TEXTURE_2D,0,0,0,ni,nj,GL_RGBA,GL_UNSIGNED_BYTE,0); // Render one quad to the screen and colour it using our texture // i.e. plot our plotvar data to the screen glClear(GL_COLOR_BUFFER_BIT); glBegin(GL_QUADS); glTexCoord2f (0.0, 0.0); glVertex3f (0.0, 0.0, 0.0); glTexCoord2f (1.0, 0.0); glVertex3f (ni, 0.0, 0.0); glTexCoord2f (1.0, 1.0); glVertex3f (ni, nj, 0.0); glTexCoord2f (0.0, 1.0); glVertex3f (0.0, nj, 0.0); glEnd(); glutSwapBuffers(); }
//------------------------------------------------------------------------------------------------------------------------------ // Samuel Williams // [email protected] // Lawrence Berkeley National Lab //------------------------------------------------------------------------------------------------------------------------------ void smooth(level_type * level, int phi_id, int rhs_id, double a, double b){ int box,s; for(s=0;s<2*NUM_SMOOTHS;s++){ // there are two sweeps (forward/backward) per GS smooth exchange_boundary(level,phi_id,stencil_get_shape()); apply_BCs(level,phi_id,stencil_get_shape()); double _timeStart = getTime(); #ifdef _OPENMP #pragma omp parallel for private(box) #endif for(box=0;box<level->num_my_boxes;box++){ int i,j,k; const int ghosts = level->box_ghosts; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int dim = level->my_boxes[box].dim; const double h2inv = 1.0/(level->h*level->h); double * __restrict__ phi = level->my_boxes[box].vectors[ phi_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); const double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); if( (s&0x1)==0 ){ // forward sweep... 
hard to thread for(k=0;k<dim;k++){ for(j=0;j<dim;j++){ for(i=0;i<dim;i++){ int ijk = i + j*jStride + k*kStride; double Ax = apply_op_ijk(phi); phi[ijk] = phi[ijk] + Dinv[ijk]*(rhs[ijk]-Ax); }}} }else{ // backward sweep... hard to thread for(k=dim-1;k>=0;k--){ for(j=dim-1;j>=0;j--){ for(i=dim-1;i>=0;i--){ int ijk = i + j*jStride + k*kStride; double Ax = apply_op_ijk(phi); phi[ijk] = phi[ijk] + Dinv[ijk]*(rhs[ijk]-Ax); }}} } } // boxes level->timers.smooth += (double)(getTime()-_timeStart); } // s-loop }
//------------------------------------------------------------------------------------------------------------------------------ // Samuel Williams // [email protected] // Lawrence Berkeley National Lab //------------------------------------------------------------------------------------------------------------------------------ void apply_op(level_type * level, int Ax_id, int x_id, double a, double b){ // y=Ax // exchange the boundary of x in preparation for Ax exchange_boundary(level,x_id,stencil_is_star_shaped()); apply_BCs(level,x_id); // now do Ax proper... uint64_t _timeStart = CycleTime(); int block; PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) for(block=0;block<level->num_my_blocks;block++){ const int box = level->my_blocks[block].read.box; const int ilo = level->my_blocks[block].read.i; const int jlo = level->my_blocks[block].read.j; const int klo = level->my_blocks[block].read.k; const int ihi = level->my_blocks[block].dim.i + ilo; const int jhi = level->my_blocks[block].dim.j + jlo; const int khi = level->my_blocks[block].dim.k + klo; int i,j,k; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; const int dim = level->my_boxes[box].dim; const double h2inv = 1.0/(level->h*level->h); const double * __restrict__ x = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); // i.e. 
[0] = first non ghost zone point double * __restrict__ Ax = level->my_boxes[box].vectors[ Ax_id] + ghosts*(1+jStride+kStride); const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); for(k=klo;k<khi;k++){ for(j=jlo;j<jhi;j++){ for(i=ilo;i<ihi;i++){ int ijk = i + j*jStride + k*kStride; Ax[ijk] = apply_op_ijk(x); }}} } level->cycles.apply_op += (uint64_t)(CycleTime()-_timeStart); }
// residual: computes res = rhs - A*x on every box owned by this process.
//
//   level  : multigrid level to operate on
//   res_id : id of the vector that receives the residual
//   x_id   : id of the current solution; its ghost zones are exchanged and boundary conditions applied first
//   rhs_id : id of the right-hand-side vector
//   a, b   : operator coefficients; not referenced directly here - presumably consumed by the
//            apply_op_ijk() macro, which is defined outside this block (TODO confirm)
//
// Time spent in the compute phase is accumulated into level->timers.residual.
void residual(level_type * level, int res_id, int x_id, int rhs_id, double a, double b){
  // exchange the boundary for x in prep for Ax...
  exchange_boundary(level,x_id,stencil_get_shape());
  apply_BCs(level,x_id,stencil_get_shape());

  // now do residual/restriction proper...
  double residual_begin = getTime();
  const int  ghosts = level->box_ghosts;
  const int jStride = level->box_jStride;
  const int kStride = level->box_kStride;
  const int     dim = level->box_dim;
  const double h2inv = 1.0/(level->h*level->h);
  const int interior = ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point
  int box;

  PRAGMA_THREAD_ACROSS_BOXES(level,box)
  for(box=0;box<level->num_my_boxes;box++){
    int i,j,k;
    const double * __restrict__ x      = level->my_boxes[box].vectors[         x_id] + interior;
    const double * __restrict__ rhs    = level->my_boxes[box].vectors[       rhs_id] + interior;
    const double * __restrict__ alpha  = level->my_boxes[box].vectors[VECTOR_ALPHA ] + interior;
    const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + interior;
    const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + interior;
    const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + interior;
          double * __restrict__ res    = level->my_boxes[box].vectors[       res_id] + interior;

    PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k)
    for(k=0;k<dim;k++){
    for(j=0;j<dim;j++){
    for(i=0;i<dim;i++){
      int ijk = i + j*jStride + k*kStride;
      double Ax = apply_op_ijk(x);
      res[ijk] = rhs[ijk]-Ax;
    }}}
  }

  level->timers.residual += (double)(getTime()-residual_begin);
}
//------------------------------------------------------------------------------------------------------------------------------ // Samuel Williams // [email protected] // Lawrence Berkeley National Lab //------------------------------------------------------------------------------------------------------------------------------ // Based on Yousef Saad's Iterative Methods for Sparse Linear Algebra, Algorithm 12.1, page 399 //------------------------------------------------------------------------------------------------------------------------------ void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ if((CHEBYSHEV_DEGREE*NUM_SMOOTHS)&1){ fprintf(stderr,"error... CHEBYSHEV_DEGREE*NUM_SMOOTHS must be even for the chebyshev smoother...\n"); exit(0); } if( (level->dominant_eigenvalue_of_DinvA<=0.0) && (level->my_rank==0) )fprintf(stderr,"dominant_eigenvalue_of_DinvA <= 0.0 !\n"); //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - int s; int block; // compute the Chebyshev coefficients... double beta = 1.000*level->dominant_eigenvalue_of_DinvA; //double alpha = 0.300000*beta; //double alpha = 0.250000*beta; //double alpha = 0.166666*beta; double alpha = 0.125000*beta; double theta = 0.5*(beta+alpha); // center of the spectral ellipse double delta = 0.5*(beta-alpha); // major axis? double sigma = theta/delta; double rho_n = 1/sigma; // rho_0 double chebyshev_c1[CHEBYSHEV_DEGREE]; // + c1*(x_n-x_nm1) == rho_n*rho_nm1 double chebyshev_c2[CHEBYSHEV_DEGREE]; // + c2*(b-Ax_n) chebyshev_c1[0] = 0.0; chebyshev_c2[0] = 1/theta; for(s=1;s<CHEBYSHEV_DEGREE;s++){ double rho_nm1 = rho_n; rho_n = 1.0/(2.0*sigma - rho_nm1); chebyshev_c1[s] = rho_n*rho_nm1; chebyshev_c2[s] = rho_n*2.0/delta; } for(s=0;s<CHEBYSHEV_DEGREE*NUM_SMOOTHS;s++){ // get ghost zone data... 
Chebyshev ping pongs between x_id and VECTOR_TEMP if((s&1)==0){exchange_boundary(level, x_id,stencil_get_shape());apply_BCs(level, x_id,stencil_get_shape());} else{exchange_boundary(level,VECTOR_TEMP,stencil_get_shape());apply_BCs(level,VECTOR_TEMP,stencil_get_shape());} // apply the smoother... Chebyshev ping pongs between x_id and VECTOR_TEMP double _timeStart = getTime(); PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) for(block=0;block<level->num_my_blocks;block++){ const int box = level->my_blocks[block].read.box; const int ilo = level->my_blocks[block].read.i; const int jlo = level->my_blocks[block].read.j; const int klo = level->my_blocks[block].read.k; const int ihi = level->my_blocks[block].dim.i + ilo; const int jhi = level->my_blocks[block].dim.j + jlo; const int khi = level->my_blocks[block].dim.k + klo; int i,j,k; const int ghosts = level->box_ghosts; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const double h2inv = 1.0/(level->h*level->h); const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); #ifdef VECTOR_ALPHA const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); #endif const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); const double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); double * __restrict__ x_np1; const double * __restrict__ x_n; const double * __restrict__ x_nm1; if((s&1)==0){x_n = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); x_nm1 = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride); x_np1 = 
level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride);} else{x_n = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride); x_nm1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); x_np1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride);} const double c1 = chebyshev_c1[s%CHEBYSHEV_DEGREE]; // limit polynomial to degree CHEBYSHEV_DEGREE. const double c2 = chebyshev_c2[s%CHEBYSHEV_DEGREE]; // limit polynomial to degree CHEBYSHEV_DEGREE. for(k=klo;k<khi;k++){ for(j=jlo;j<jhi;j++){ for(i=ilo;i<ihi;i++){ const int ijk = i + j*jStride + k*kStride; // According to Saad... but his was missing a Dinv[ijk] == D^{-1} !!! // x_{n+1} = x_{n} + rho_{n} [ rho_{n-1}(x_{n} - x_{n-1}) + (2/delta)(b-Ax_{n}) ] // x_temp[ijk] = x_n[ijk] + c1*(x_n[ijk]-x_temp[ijk]) + c2*Dinv[ijk]*(rhs[ijk]-Ax_n); const double Ax_n = apply_op_ijk(x_n); x_np1[ijk] = x_n[ijk] + c1*(x_n[ijk]-x_nm1[ijk]) + c2*Dinv[ijk]*(rhs[ijk]-Ax_n); }}} } // box-loop level->timers.smooth += (double)(getTime()-_timeStart); } // s-loop }
//------------------------------------------------------------------------------------------------------------------------------ void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ // allocate a buffer to hold fluxes... if(level->fluxes==NULL)level->fluxes = (double*)MALLOC( ( (4*level->num_threads)*(BLOCKCOPY_TILE_J+1)*(level->box_jStride) + BOX_ALIGN_JSTRIDE)*sizeof(double) ); // align fluxes to BOX_ALIGN_JSTRIDE double * __restrict__ fluxes_aligned = level->fluxes; uint64_t unaligned_by = (uint64_t)(fluxes_aligned) & (BOX_ALIGN_JSTRIDE-1)*sizeof(double); if(unaligned_by)fluxes_aligned = (double*)( (uint64_t)(fluxes_aligned) + BOX_ALIGN_JSTRIDE*sizeof(double) - unaligned_by ); int s;for(s=0;s<2*NUM_SMOOTHS;s++){ // there are two sweeps per GSRB smooth // exchange the ghost zone... if((s&1)==0){ exchange_boundary(level, x_id,stencil_get_shape()); apply_BCs(level, x_id,stencil_get_shape()); }else{ exchange_boundary(level,VECTOR_TEMP,stencil_get_shape()); apply_BCs(level,VECTOR_TEMP,stencil_get_shape()); } // apply the smoother... double _timeStart = getTime(); double h2inv = 1.0/(level->h*level->h); // loop over all block/tiles this process owns... #ifdef _OPENMP #pragma omp parallel if(level->num_my_blocks>1) #endif { int block; int threadID=0; #ifdef _OPENMP threadID=omp_get_thread_num(); #endif // [thread][flux][ij] layout double * __restrict__ flux_i = fluxes_aligned + (4*threadID + 0)*(BLOCKCOPY_TILE_J+1)*(level->box_jStride); double * __restrict__ flux_j = fluxes_aligned + (4*threadID + 1)*(BLOCKCOPY_TILE_J+1)*(level->box_jStride); double * __restrict__ flux_k[2] = {fluxes_aligned + (4*threadID + 2)*(BLOCKCOPY_TILE_J+1)*(level->box_jStride), fluxes_aligned + (4*threadID + 3)*(BLOCKCOPY_TILE_J+1)*(level->box_jStride)}; // loop over (cache) blocks... 
#ifdef _OPENMP #pragma omp for schedule(static,1) #endif for(block=0;block<level->num_my_blocks;block++){ const int box = level->my_blocks[block].read.box; const int jlo = level->my_blocks[block].read.j; const int klo = level->my_blocks[block].read.k; const int jdim = level->my_blocks[block].dim.j; const int kdim = level->my_blocks[block].dim.k; const int ghosts = level->my_boxes[box].ghosts; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride); #ifdef VECTOR_ALPHA const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride); #else const double * __restrict__ alpha = NULL; #endif const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride); const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride); const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride); const double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride); const double * __restrict__ x_n; double * __restrict__ x_np1; if((s&1)==0){x_n = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride); x_np1 = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);} else{x_n = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride); x_np1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);} #ifdef __INTEL_COMPILER // superfluous with OMP4 simd (?) 
//__assume_aligned(x_n ,BOX_ALIGN_JSTRIDE*sizeof(double)); //__assume_aligned(x_np1 ,BOX_ALIGN_JSTRIDE*sizeof(double)); //__assume_aligned(rhs ,BOX_ALIGN_JSTRIDE*sizeof(double)); //__assume_aligned(alpha ,BOX_ALIGN_JSTRIDE*sizeof(double)); //__assume_aligned(beta_i ,BOX_ALIGN_JSTRIDE*sizeof(double)); //__assume_aligned(beta_j ,BOX_ALIGN_JSTRIDE*sizeof(double)); //__assume_aligned(beta_k ,BOX_ALIGN_JSTRIDE*sizeof(double)); //__assume_aligned(Dinv ,BOX_ALIGN_JSTRIDE*sizeof(double)); //__assume_aligned(flux_i ,BOX_ALIGN_JSTRIDE*sizeof(double)); //__assume_aligned(flux_j ,BOX_ALIGN_JSTRIDE*sizeof(double)); //__assume_aligned(flux_k[0],BOX_ALIGN_JSTRIDE*sizeof(double)); //__assume_aligned(flux_k[1],BOX_ALIGN_JSTRIDE*sizeof(double)); __assume( jStride % BOX_ALIGN_JSTRIDE == 0); // e.g. jStride%4==0 or jStride%8==0, hence x+jStride is aligned __assume( kStride % BOX_ALIGN_JSTRIDE == 0); __assume( jStride >= BOX_ALIGN_JSTRIDE); __assume( kStride >= 3*BOX_ALIGN_JSTRIDE); __assume( jdim > 0); __assume( kdim > 0); #elif __xlC__ __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), rhs ); __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), alpha ); __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), beta_i ); __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), beta_j ); __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), beta_k ); __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), Dinv ); __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), x_n ); __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), x_np1 ); __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), flux_i ); __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), flux_j ); __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), flux_k[0]); __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), flux_k[1]); #endif int ij,k; double * __restrict__ flux_klo = flux_k[0]; // startup / prolog... calculate flux_klo (bottom of cell)... #if (_OPENMP>=201307) #pragma omp simd aligned(beta_k,x_n,flux_klo:BOX_ALIGN_JSTRIDE*sizeof(double)) #endif for(ij=0;ij<jdim*jStride;ij++){ flux_klo[ij] = beta_dxdk(x_n,ij); // k==0 } // wavefront loop... 
for(k=0;k<kdim;k++){ double * __restrict__ flux_klo = flux_k[(k )&0x1]; double * __restrict__ flux_khi = flux_k[(k+1)&0x1]; // calculate flux_i and flux_j together #if (_OPENMP>=201307) #pragma omp simd aligned(beta_i,beta_j,x_n,flux_i,flux_j:BOX_ALIGN_JSTRIDE*sizeof(double)) #endif for(ij=0;ij<jdim*jStride;ij++){ int ijk = ij + k*kStride; flux_i[ij] = beta_dxdi(x_n,ijk); flux_j[ij] = beta_dxdj(x_n,ijk); } // calculate flux_jhi #if (_OPENMP>=201307) #pragma omp simd aligned(beta_j,x_n,flux_j:BOX_ALIGN_JSTRIDE*sizeof(double)) #endif for(ij=jdim*jStride;ij<(jdim+1)*jStride;ij++){ int ijk = ij + k*kStride; flux_j[ij] = beta_dxdj(x_n,ijk); } // calculate flux_khi (top of cell) #if (_OPENMP>=201307) #pragma omp simd aligned(beta_k,x_n,flux_khi:BOX_ALIGN_JSTRIDE*sizeof(double)) #endif for(ij=0;ij<jdim*jStride;ij++){ int ijk = ij + k*kStride; flux_khi[ij] = beta_dxdk(x_n,ijk+kStride); // k+1 } const int color000 = (level->my_boxes[box].low.i^level->my_boxes[box].low.j^level->my_boxes[box].low.k^jlo^klo^s); // is element 000 of this *BLOCK* 000 red or black on this sweep const double * __restrict__ RedBlack = level->RedBlack_FP + ghosts*(1+jStride) + jStride*((k^color000)&0x1); // Red/Black pencils... presumes ghost zones were corectly colored #if (_OPENMP>=201307) #pragma omp simd aligned(flux_i,flux_j,flux_klo,flux_khi,alpha,rhs,Dinv,x_n,x_np1,RedBlack:BOX_ALIGN_JSTRIDE*sizeof(double)) #endif #ifdef __INTEL_COMPILER #pragma vector nontemporal // generally, we don't expect to reuse x_np1 #endif for(ij=0;ij<jdim*jStride;ij++){ int ijk = ij + k*kStride; double Lx = - flux_i[ ij] + flux_i[ ij+ 1] - flux_j[ ij] + flux_j[ ij+jStride] - flux_klo[ij] + flux_khi[ij ]; #ifdef USE_HELMHOLTZ double Ax = a*alpha[ijk]*x_n[ijk] - b*Lx; #else double Ax = -b*Lx; #endif x_np1[ijk] = x_n[ijk] + RedBlack[ij]*Dinv[ijk]*(rhs[ijk]-Ax); } } // kdim } // block } // omp level->timers.smooth += (double)(getTime()-_timeStart); } // s-loop }