// Writes vector id_c = vector id_a + shift_a on the non-ghost cells of every box in level->my_boxes.
// The elapsed CycleTime() is added to level->cycles.blas1.
// PRAGMA_THREAD_* are threading macros defined elsewhere; the k/j/i loops are kept perfectly nested for them.
void shift_vector(level_type * level, int id_c, int id_a, double shift_a)
{
  uint64_t t_begin = CycleTime();
  int box;

  PRAGMA_THREAD_ACROSS_BOXES(level,box)
  for(box=0; box<level->num_my_boxes; box++){
    int i,j,k;
    const int dim     = level->my_boxes[box].dim;
    const int ghosts  = level->my_boxes[box].ghosts;
    const int jStride = level->my_boxes[box].jStride;
    const int kStride = level->my_boxes[box].kStride;
    // offset that makes index [0] the first non ghost zone point
    const int first_interior = ghosts*(1+jStride+kStride);
    double * __restrict__ dst = level->my_boxes[box].vectors[id_c] + first_interior;
    double * __restrict__ src = level->my_boxes[box].vectors[id_a] + first_interior;

    PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k)
    for(k=0; k<dim; k++){
      for(j=0; j<dim; j++){
        for(i=0; i<dim; i++){
          const int ijk = i + j*jStride + k*kStride;
          dst[ijk] = src[ijk] + shift_a;
        }
      }
    }
  }

  level->cycles.blas1 += (uint64_t)(CycleTime()-t_begin);
}
//------------------------------------------------------------------------------------------------------------------------------ void initialize_grid_to_scalar(level_type * level, int component_id, double scalar) { // initializes the grid to a scalar while zero'ing the ghost zones... uint64_t _timeStart = CycleTime(); int box; PRAGMA_THREAD_ACROSS_BOXES(level,box) for(box=0; box<level->num_my_boxes; box++) { int i,j,k; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; const int dim = level->my_boxes[box].dim; double * __restrict__ grid = level->my_boxes[box].vectors[component_id] + ghosts*(1+jStride+kStride); PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) for(k=-ghosts; k<dim+ghosts; k++) { for(j=-ghosts; j<dim+ghosts; j++) { for(i=-ghosts; i<dim+ghosts; i++) { int ijk = i + j*jStride + k*kStride; int ghostZone = (i<0) || (j<0) || (k<0) || (i>=dim) || (j>=dim) || (k>=dim); grid[ijk] = ghostZone ? 0.0 : scalar; } } } } level->cycles.blas1 += (uint64_t)(CycleTime()-_timeStart); }
//------------------------------------------------------------------------------------------------------------------------------ void add_vectors(level_type * level, int id_c, double scale_a, int id_a, double scale_b, int id_b) { // c=scale_a*id_a + scale_b*id_b uint64_t _timeStart = CycleTime(); int box; PRAGMA_THREAD_ACROSS_BOXES(level,box) for(box=0; box<level->num_my_boxes; box++) { int i,j,k; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; const int dim = level->my_boxes[box].dim; double * __restrict__ grid_c = level->my_boxes[box].vectors[id_c] + ghosts*(1+jStride+kStride); double * __restrict__ grid_a = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); double * __restrict__ grid_b = level->my_boxes[box].vectors[id_b] + ghosts*(1+jStride+kStride); PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) for(k=0; k<dim; k++) { for(j=0; j<dim; j++) { for(i=0; i<dim; i++) { int ijk = i + j*jStride + k*kStride; grid_c[ijk] = scale_a*grid_a[ijk] + scale_b*grid_b[ijk]; } } } } level->cycles.blas1 += (uint64_t)(CycleTime()-_timeStart); }
//------------------------------------------------------------------------------------------------------------------------------ void initialize_valid_region(level_type * level) { uint64_t _timeStart = CycleTime(); int box; PRAGMA_THREAD_ACROSS_BOXES(level,box) for(box=0; box<level->num_my_boxes; box++) { int i,j,k; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; const int dim = level->my_boxes[box].dim; double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID] + ghosts*(1+jStride+kStride); PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) for(k=-ghosts; k<dim+ghosts; k++) { for(j=-ghosts; j<dim+ghosts; j++) { for(i=-ghosts; i<dim+ghosts; i++) { int ijk = i + j*jStride + k*kStride; valid[ijk] = 1.0; // i.e. all cells including ghosts are valid for periodic BC's if(level->domain_boundary_condition == BC_DIRICHLET) { // cells outside the domain boundaries are not valid if(i + level->my_boxes[box].low.i < 0)valid[ijk] = 0.0; if(j + level->my_boxes[box].low.j < 0)valid[ijk] = 0.0; if(k + level->my_boxes[box].low.k < 0)valid[ijk] = 0.0; if(i + level->my_boxes[box].low.i >= level->dim.i)valid[ijk] = 0.0; if(j + level->my_boxes[box].low.j >= level->dim.j)valid[ijk] = 0.0; if(k + level->my_boxes[box].low.k >= level->dim.k)valid[ijk] = 0.0; } } } } } level->cycles.blas1 += (uint64_t)(CycleTime()-_timeStart); }
//------------------------------------------------------------------------------------------------------------------------------ // Samuel Williams // [email protected] // Lawrence Berkeley National Lab //------------------------------------------------------------------------------------------------------------------------------ void apply_op(level_type * level, int Ax_id, int x_id, double a, double b){ // y=Ax // exchange the boundary of x in preparation for Ax exchange_boundary(level,x_id,stencil_is_star_shaped()); apply_BCs(level,x_id); // now do Ax proper... uint64_t _timeStart = CycleTime(); int box; PRAGMA_THREAD_ACROSS_BOXES(level,box) for(box=0;box<level->num_my_boxes;box++){ int i,j,k; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; const int dim = level->my_boxes[box].dim; const double h2inv = 1.0/(level->h*level->h); const double * __restrict__ x = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point double * __restrict__ Ax = level->my_boxes[box].vectors[ Ax_id] + ghosts*(1+jStride+kStride); const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) for(k=0;k<dim;k++){ for(j=0;j<dim;j++){ for(i=0;i<dim;i++){ int ijk = i + j*jStride + k*kStride; Ax[ijk] = apply_op_ijk(x); }}} } level->cycles.apply_op += (uint64_t)(CycleTime()-_timeStart); }
//------------------------------------------------------------------------------------------------------------------------------ void mul_vectors(level_type * level, int id_c, double scale, int id_a, int id_b){ // id_c=scale*id_a*id_b uint64_t _timeStart = CycleTime(); int block; PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) for(block=0;block<level->num_my_blocks;block++){ const int box = level->my_blocks[block].read.box; const int ilo = level->my_blocks[block].read.i; const int jlo = level->my_blocks[block].read.j; const int klo = level->my_blocks[block].read.k; const int ihi = level->my_blocks[block].dim.i + ilo; const int jhi = level->my_blocks[block].dim.j + jlo; const int khi = level->my_blocks[block].dim.k + klo; int i,j,k; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; double * __restrict__ grid_c = level->my_boxes[box].vectors[id_c] + ghosts*(1+jStride+kStride); double * __restrict__ grid_a = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); double * __restrict__ grid_b = level->my_boxes[box].vectors[id_b] + ghosts*(1+jStride+kStride); for(k=klo;k<khi;k++){ for(j=jlo;j<jhi;j++){ for(i=ilo;i<ihi;i++){ int ijk = i + j*jStride + k*kStride; grid_c[ijk] = scale*grid_a[ijk]*grid_b[ijk]; }}} } level->cycles.blas1 += (uint64_t)(CycleTime()-_timeStart); }
//------------------------------------------------------------------------------------------------------------------------------ void shift_vector(level_type * level, int id_c, int id_a, double shift_a){ uint64_t _timeStart = CycleTime(); int block; PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) for(block=0;block<level->num_my_blocks;block++){ const int box = level->my_blocks[block].read.box; const int ilo = level->my_blocks[block].read.i; const int jlo = level->my_blocks[block].read.j; const int klo = level->my_blocks[block].read.k; const int ihi = level->my_blocks[block].dim.i + ilo; const int jhi = level->my_blocks[block].dim.j + jlo; const int khi = level->my_blocks[block].dim.k + klo; int i,j,k; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; double * __restrict__ grid_c = level->my_boxes[box].vectors[id_c] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point double * __restrict__ grid_a = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point for(k=klo;k<khi;k++){ for(j=jlo;j<jhi;j++){ for(i=ilo;i<ihi;i++){ int ijk = i + j*jStride + k*kStride; grid_c[ijk] = grid_a[ijk] + shift_a; }}} } level->cycles.blas1 += (uint64_t)(CycleTime()-_timeStart); }
//------------------------------------------------------------------------------------------------------------------------------ // Samuel Williams // [email protected] // Lawrence Berkeley National Lab //------------------------------------------------------------------------------------------------------------------------------ void zero_vector(level_type * level, int component_id) { // zero's the entire grid INCLUDING ghost zones... uint64_t _timeStart = CycleTime(); int box; PRAGMA_THREAD_ACROSS_BOXES(level,box) for(box=0; box<level->num_my_boxes; box++) { int i,j,k; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; const int dim = level->my_boxes[box].dim; double * __restrict__ grid = level->my_boxes[box].vectors[component_id] + ghosts*(1+jStride+kStride); PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) for(k=-ghosts; k<dim+ghosts; k++) { for(j=-ghosts; j<dim+ghosts; j++) { for(i=-ghosts; i<dim+ghosts; i++) { int ijk = i + j*jStride + k*kStride; grid[ijk] = 0.0; } } } } level->cycles.blas1 += (uint64_t)(CycleTime()-_timeStart); }
void FRichCurve::RemapTimeValue(float& InTime, float& CycleValueOffset) const { const int32 NumKeys = Keys.Num(); if (NumKeys < 2) { return; } if (InTime <= Keys[0].Time) { if (PreInfinityExtrap != RCCE_Linear && PreInfinityExtrap != RCCE_Constant) { float MinTime = Keys[0].Time; float MaxTime = Keys[NumKeys - 1].Time; int CycleCount = 0; CycleTime(MinTime, MaxTime, InTime, CycleCount); if (PreInfinityExtrap == RCCE_CycleWithOffset) { float DV = Keys[0].Value - Keys[NumKeys - 1].Value; CycleValueOffset = DV * CycleCount; } else if (PreInfinityExtrap == RCCE_Oscillate) { if (CycleCount % 2 == 1) { InTime = MinTime + (MaxTime - InTime); } } } } else if (InTime >= Keys[NumKeys - 1].Time) { if (PostInfinityExtrap != RCCE_Linear && PostInfinityExtrap != RCCE_Constant) { float MinTime = Keys[0].Time; float MaxTime = Keys[NumKeys - 1].Time; int CycleCount = 0; CycleTime(MinTime, MaxTime, InTime, CycleCount); if (PostInfinityExtrap == RCCE_CycleWithOffset) { float DV = Keys[NumKeys - 1].Value - Keys[0].Value; CycleValueOffset = DV * CycleCount; } else if (PostInfinityExtrap == RCCE_Oscillate) { if (CycleCount % 2 == 1) { InTime = MinTime + (MaxTime - InTime); } } } } }
//------------------------------------------------------------------------------------------------------------------------------ void smooth(level_type * level, int x_id, int rhs_id, double a, double b){ if(NUM_SMOOTHS&1){ fprintf(stderr,"error - NUM_SMOOTHS must be even...\n"); exit(0); } #ifdef USE_L1JACOBI double weight = 1.0; #else double weight = 2.0/3.0; #endif int box,s; for(s=0;s<NUM_SMOOTHS;s++){ // exchange ghost zone data... Jacobi ping pongs between x_id and VECTOR_TEMP if((s&1)==0){exchange_boundary(level, x_id,stencil_is_star_shaped());apply_BCs(level, x_id);} else{exchange_boundary(level,VECTOR_TEMP,stencil_is_star_shaped());apply_BCs(level,VECTOR_TEMP);} // apply the smoother... Jacobi ping pongs between x_id and VECTOR_TEMP uint64_t _timeStart = CycleTime(); PRAGMA_THREAD_ACROSS_BOXES(level,box) for(box=0;box<level->num_my_boxes;box++){ int i,j,k; const int ghosts = level->box_ghosts; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int dim = level->my_boxes[box].dim; const double h2inv = 1.0/(level->h*level->h); const double * __restrict__ rhs = level->my_boxes[box].vectors[ rhs_id] + ghosts*(1+jStride+kStride); const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); // cell is inside the domain #ifdef USE_L1JACOBI const double * __restrict__ lambda = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride); #else const double * __restrict__ lambda = 
level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); #endif const double * __restrict__ x_n; double * __restrict__ x_np1; if((s&1)==0){x_n = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); x_np1 = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride);} else{x_n = level->my_boxes[box].vectors[VECTOR_TEMP ] + ghosts*(1+jStride+kStride); x_np1 = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride);} PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) for(k=0;k<dim;k++){ for(j=0;j<dim;j++){ for(i=0;i<dim;i++){ int ijk = i + j*jStride + k*kStride; double Ax_n = apply_op_ijk(x_n); x_np1[ijk] = x_n[ijk] + weight*lambda[ijk]*(rhs[ijk]-Ax_n); }}} } // box-loop level->cycles.smooth += (uint64_t)(CycleTime()-_timeStart); } // s-loop }
//------------------------------------------------------------------------------------------------------------------------------ void initialize_valid_region(level_type * level){ uint64_t _timeStart = CycleTime(); int block; PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) for(block=0;block<level->num_my_blocks;block++){ const int box = level->my_blocks[block].read.box; int ilo = level->my_blocks[block].read.i; int jlo = level->my_blocks[block].read.j; int klo = level->my_blocks[block].read.k; int ihi = level->my_blocks[block].dim.i + ilo; int jhi = level->my_blocks[block].dim.j + jlo; int khi = level->my_blocks[block].dim.k + klo; int i,j,k; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; const int dim = level->my_boxes[box].dim; // expand the size of the block to include the ghost zones... if(ilo<= 0)ilo-=ghosts; if(jlo<= 0)jlo-=ghosts; if(klo<= 0)klo-=ghosts; if(ihi>=dim)ihi+=ghosts; if(jhi>=dim)jhi+=ghosts; if(khi>=dim)khi+=ghosts; double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID] + ghosts*(1+jStride+kStride); for(k=klo;k<khi;k++){ for(j=jlo;j<jhi;j++){ for(i=ilo;i<ihi;i++){ int ijk = i + j*jStride + k*kStride; valid[ijk] = 1.0; // i.e. all cells including ghosts are valid for periodic BC's if(level->domain_boundary_condition == BC_DIRICHLET){ // cells outside the domain boundaries are not valid if(i + level->my_boxes[box].low.i < 0)valid[ijk] = 0.0; if(j + level->my_boxes[box].low.j < 0)valid[ijk] = 0.0; if(k + level->my_boxes[box].low.k < 0)valid[ijk] = 0.0; if(i + level->my_boxes[box].low.i >= level->dim.i)valid[ijk] = 0.0; if(j + level->my_boxes[box].low.j >= level->dim.j)valid[ijk] = 0.0; if(k + level->my_boxes[box].low.k >= level->dim.k)valid[ijk] = 0.0; } }}} } level->cycles.blas1 += (uint64_t)(CycleTime()-_timeStart); }
//------------------------------------------------------------------------------------------------------------------------------ double mean(level_type * level, int id_a){ uint64_t _timeStart = CycleTime(); int block; double sum_level = 0.0; PRAGMA_THREAD_ACROSS_BLOCKS_SUM(level,block,level->num_my_blocks,sum_level) for(block=0;block<level->num_my_blocks;block++){ const int box = level->my_blocks[block].read.box; const int ilo = level->my_blocks[block].read.i; const int jlo = level->my_blocks[block].read.j; const int klo = level->my_blocks[block].read.k; const int ihi = level->my_blocks[block].dim.i + ilo; const int jhi = level->my_blocks[block].dim.j + jlo; const int khi = level->my_blocks[block].dim.k + klo; int i,j,k; int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; double * __restrict__ grid_a = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point double sum_block = 0.0; for(k=klo;k<khi;k++){ for(j=jlo;j<jhi;j++){ for(i=ilo;i<ihi;i++){ int ijk = i + j*jStride + k*kStride; sum_block += grid_a[ijk]; }}} sum_level+=sum_block; } level->cycles.blas1 += (uint64_t)(CycleTime()-_timeStart); double ncells_level = (double)level->dim.i*(double)level->dim.j*(double)level->dim.k; #ifdef USE_MPI uint64_t _timeStartAllReduce = CycleTime(); double send = sum_level; MPI_Allreduce(&send,&sum_level,1,MPI_DOUBLE,MPI_SUM,level->MPI_COMM_ALLREDUCE); uint64_t _timeEndAllReduce = CycleTime(); level->cycles.collectives += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); #endif double mean_level = sum_level / ncells_level; return(mean_level); }
//------------------------------------------------------------------------------------------------------------------------------ double norm(level_type * level, int component_id){ // implements the max norm uint64_t _timeStart = CycleTime(); int block; double max_norm = 0.0; PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,block,level->num_my_blocks,max_norm) for(block=0;block<level->num_my_blocks;block++){ const int box = level->my_blocks[block].read.box; const int ilo = level->my_blocks[block].read.i; const int jlo = level->my_blocks[block].read.j; const int klo = level->my_blocks[block].read.k; const int ihi = level->my_blocks[block].dim.i + ilo; const int jhi = level->my_blocks[block].dim.j + jlo; const int khi = level->my_blocks[block].dim.k + klo; int i,j,k; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; double * __restrict__ grid = level->my_boxes[box].vectors[component_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point double block_norm = 0.0; for(k=klo;k<khi;k++){ for(j=jlo;j<jhi;j++){ for(i=ilo;i<ihi;i++){ int ijk = i + j*jStride + k*kStride; double fabs_grid_ijk = fabs(grid[ijk]); if(fabs_grid_ijk>block_norm){block_norm=fabs_grid_ijk;} // max norm }}} if(block_norm>max_norm){max_norm = block_norm;} } // block list level->cycles.blas1 += (uint64_t)(CycleTime()-_timeStart); #ifdef USE_MPI uint64_t _timeStartAllReduce = CycleTime(); double send = max_norm; MPI_Allreduce(&send,&max_norm,1,MPI_DOUBLE,MPI_MAX,level->MPI_COMM_ALLREDUCE); uint64_t _timeEndAllReduce = CycleTime(); level->cycles.collectives += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); #endif return(max_norm); }
//------------------------------------------------------------------------------------------------------------------------------ double norm(level_type * level, int component_id) { // implements the max norm uint64_t _timeStart = CycleTime(); int box; double max_norm = 0.0; // FIX, schedule(static) is a stand in to guarantee reproducibility... PRAGMA_THREAD_ACROSS_BOXES_MAX(level,box,max_norm) for(box=0; box<level->num_my_boxes; box++) { int i,j,k; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; const int dim = level->my_boxes[box].dim; double * __restrict__ grid = level->my_boxes[box].vectors[component_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point double box_norm = 0.0; PRAGMA_THREAD_WITHIN_A_BOX_MAX(level,i,j,k,box_norm) for(k=0; k<dim; k++) { for(j=0; j<dim; j++) { for(i=0; i<dim; i++) { int ijk = i + j*jStride + k*kStride; double fabs_grid_ijk = fabs(grid[ijk]); if(fabs_grid_ijk>box_norm) { box_norm=fabs_grid_ijk; // max norm } } } } if(box_norm>max_norm) { max_norm = box_norm; } } // box list level->cycles.blas1 += (uint64_t)(CycleTime()-_timeStart); #ifdef USE_MPI uint64_t _timeStartAllReduce = CycleTime(); double send = max_norm; MPI_Allreduce(&send,&max_norm,1,MPI_DOUBLE,MPI_MAX,level->MPI_COMM_ALLREDUCE); uint64_t _timeEndAllReduce = CycleTime(); level->cycles.collectives += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); #endif return(max_norm); }
//------------------------------------------------------------------------------------------------------------------------------ // Samuel Williams // [email protected] // Lawrence Berkeley National Lab //------------------------------------------------------------------------------------------------------------------------------ void apply_op(level_type * level, int Ax_id, int x_id, double a, double b){ // y=Ax // exchange the boundary of x in preparation for Ax exchange_boundary(level,x_id,stencil_is_star_shaped()); apply_BCs(level,x_id); // now do Ax proper... uint64_t _timeStart = CycleTime(); int block; PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) for(block=0;block<level->num_my_blocks;block++){ const int box = level->my_blocks[block].read.box; const int ilo = level->my_blocks[block].read.i; const int jlo = level->my_blocks[block].read.j; const int klo = level->my_blocks[block].read.k; const int ihi = level->my_blocks[block].dim.i + ilo; const int jhi = level->my_blocks[block].dim.j + jlo; const int khi = level->my_blocks[block].dim.k + klo; int i,j,k; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; const int dim = level->my_boxes[box].dim; const double h2inv = 1.0/(level->h*level->h); const double * __restrict__ x = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); // i.e. 
[0] = first non ghost zone point double * __restrict__ Ax = level->my_boxes[box].vectors[ Ax_id] + ghosts*(1+jStride+kStride); const double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); const double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); for(k=klo;k<khi;k++){ for(j=jlo;j<jhi;j++){ for(i=ilo;i<ihi;i++){ int ijk = i + j*jStride + k*kStride; Ax[ijk] = apply_op_ijk(x); }}} } level->cycles.apply_op += (uint64_t)(CycleTime()-_timeStart); }
//------------------------------------------------------------------------------------------------------------------------------ double dot(level_type * level, int id_a, int id_b) { uint64_t _timeStart = CycleTime(); int box; double a_dot_b_level = 0.0; // FIX, schedule(static) is a stand in to guarantee reproducibility... PRAGMA_THREAD_ACROSS_BOXES_SUM(level,box,a_dot_b_level) for(box=0; box<level->num_my_boxes; box++) { int i,j,k; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; const int dim = level->my_boxes[box].dim; double * __restrict__ grid_a = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point double * __restrict__ grid_b = level->my_boxes[box].vectors[id_b] + ghosts*(1+jStride+kStride); double a_dot_b_box = 0.0; PRAGMA_THREAD_WITHIN_A_BOX_SUM(level,i,j,k,a_dot_b_box) for(k=0; k<dim; k++) { for(j=0; j<dim; j++) { for(i=0; i<dim; i++) { int ijk = i + j*jStride + k*kStride; a_dot_b_box += grid_a[ijk]*grid_b[ijk]; } } } a_dot_b_level+=a_dot_b_box; } level->cycles.blas1 += (uint64_t)(CycleTime()-_timeStart); #ifdef USE_MPI uint64_t _timeStartAllReduce = CycleTime(); double send = a_dot_b_level; MPI_Allreduce(&send,&a_dot_b_level,1,MPI_DOUBLE,MPI_SUM,level->MPI_COMM_ALLREDUCE); uint64_t _timeEndAllReduce = CycleTime(); level->cycles.collectives += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); #endif return(a_dot_b_level); }
//------------------------------------------------------------------------------------------------------------------------------ void initialize_grid_to_scalar(level_type * level, int component_id, double scalar){ // initializes the grid to a scalar while zero'ing the ghost zones... uint64_t _timeStart = CycleTime(); int block; PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) for(block=0;block<level->num_my_blocks;block++){ const int box = level->my_blocks[block].read.box; int ilo = level->my_blocks[block].read.i; int jlo = level->my_blocks[block].read.j; int klo = level->my_blocks[block].read.k; int ihi = level->my_blocks[block].dim.i + ilo; int jhi = level->my_blocks[block].dim.j + jlo; int khi = level->my_blocks[block].dim.k + klo; int i,j,k; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; const int dim = level->my_boxes[box].dim; // expand the size of the block to include the ghost zones... if(ilo<= 0)ilo-=ghosts; if(jlo<= 0)jlo-=ghosts; if(klo<= 0)klo-=ghosts; if(ihi>=dim)ihi+=ghosts; if(jhi>=dim)jhi+=ghosts; if(khi>=dim)khi+=ghosts; double * __restrict__ grid = level->my_boxes[box].vectors[component_id] + ghosts*(1+jStride+kStride); for(k=klo;k<khi;k++){ for(j=jlo;j<jhi;j++){ for(i=ilo;i<ihi;i++){ int ijk = i + j*jStride + k*kStride; int ghostZone = (i<0) || (j<0) || (k<0) || (i>=dim) || (j>=dim) || (k>=dim); grid[ijk] = ghostZone ? 0.0 : scalar; }}} } level->cycles.blas1 += (uint64_t)(CycleTime()-_timeStart); }
//------------------------------------------------------------------------------------------------------------------------------ double mean(level_type * level, int id_a) { uint64_t _timeStart = CycleTime(); int box; double sum_level = 0.0; PRAGMA_THREAD_ACROSS_BOXES_SUM(level,box,sum_level) for(box=0; box<level->num_my_boxes; box++) { int i,j,k; int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; const int dim = level->my_boxes[box].dim; double * __restrict__ grid_a = level->my_boxes[box].vectors[id_a] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point double sum_box = 0.0; PRAGMA_THREAD_WITHIN_A_BOX_SUM(level,i,j,k,sum_box) for(k=0; k<dim; k++) { for(j=0; j<dim; j++) { for(i=0; i<dim; i++) { int ijk = i + j*jStride + k*kStride; sum_box += grid_a[ijk]; } } } sum_level+=sum_box; } level->cycles.blas1 += (uint64_t)(CycleTime()-_timeStart); double ncells_level = (double)level->dim.i*(double)level->dim.j*(double)level->dim.k; #ifdef USE_MPI uint64_t _timeStartAllReduce = CycleTime(); double send = sum_level; MPI_Allreduce(&send,&sum_level,1,MPI_DOUBLE,MPI_SUM,level->MPI_COMM_ALLREDUCE); uint64_t _timeEndAllReduce = CycleTime(); level->cycles.collectives += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); #endif double mean_level = sum_level / ncells_level; return(mean_level); }
//------------------------------------------------------------------------------------------------------------------------------ void project_cell_to_face(level_type * level, int id_cell, int id_face, int dir) { uint64_t _timeStart = CycleTime(); int box; PRAGMA_THREAD_ACROSS_BOXES(level,box) for(box=0; box<level->num_my_boxes; box++) { int i,j,k; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; const int dim = level->my_boxes[box].dim; double * __restrict__ grid_cell = level->my_boxes[box].vectors[id_cell] + ghosts*(1+jStride+kStride); double * __restrict__ grid_face = level->my_boxes[box].vectors[id_face] + ghosts*(1+jStride+kStride); int stride; switch(dir) { case 0: stride = 1; break;//i-direction case 1: stride = jStride; break;//j-direction case 2: stride = kStride; break;//k-direction } PRAGMA_THREAD_WITHIN_A_BOX(level,i,j,k) for(k=0; k<=dim; k++) { // <= to ensure you do low and high faces for(j=0; j<=dim; j++) { for(i=0; i<=dim; i++) { int ijk = i + j*jStride + k*kStride; grid_face[ijk] = 0.5*(grid_cell[ijk-stride] + grid_cell[ijk]); // simple linear interpolation } } } } level->cycles.blas1 += (uint64_t)(CycleTime()-_timeStart); }
//------------------------------------------------------------------------------------------------------------------------------ void project_cell_to_face(level_type * level, int id_cell, int id_face, int dir){ uint64_t _timeStart = CycleTime(); int block; PRAGMA_THREAD_ACROSS_BLOCKS(level,block,level->num_my_blocks) for(block=0;block<level->num_my_blocks;block++){ const int box = level->my_blocks[block].read.box; const int ilo = level->my_blocks[block].read.i; const int jlo = level->my_blocks[block].read.j; const int klo = level->my_blocks[block].read.k; const int ihi = level->my_blocks[block].dim.i + ilo; const int jhi = level->my_blocks[block].dim.j + jlo; const int khi = level->my_blocks[block].dim.k + klo; int i,j,k; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; double * __restrict__ grid_cell = level->my_boxes[box].vectors[id_cell] + ghosts*(1+jStride+kStride); double * __restrict__ grid_face = level->my_boxes[box].vectors[id_face] + ghosts*(1+jStride+kStride); int stride; switch(dir){ case 0: stride = 1;break;//i-direction case 1: stride = jStride;break;//j-direction case 2: stride = kStride;break;//k-direction } for(k=klo;k<=khi;k++){ // <= to ensure you do low and high faces for(j=jlo;j<=jhi;j++){ for(i=ilo;i<=ihi;i++){ int ijk = i + j*jStride + k*kStride; grid_face[ijk] = 0.5*(grid_cell[ijk-stride] + grid_cell[ijk]); // simple linear interpolation }}} } level->cycles.blas1 += (uint64_t)(CycleTime()-_timeStart); }
//------------------------------------------------------------------------------------------------------------------------------ // Samuel Williams // [email protected] // Lawrence Berkeley National Lab //------------------------------------------------------------------------------------------------------------------------------ void apply_BCs_linear(level_type * level, int x_id){ if(level->boundary_condition.type == BC_PERIODIC)return; // no BC's to apply ! // for cell-centered, we need to fill in the ghost zones to apply any BC's // this code does a simple linear interpolation for homogeneous dirichlet // // . . . . . . . . . . . . . . . . . . // . . . . . . // . ? . ? . .+x(0,0).-x(0,0). // . . . . . . // . . . . +-------+ . . . . +-------+ // . | | . | | // . ? | x(0,0)| .-x(0,0)| x(0,0)| // . | | . | | // . . . . +-------+ . . . . +-------+ // ^ // domain boundary is the face... i.e. between two array indices !!! // uint64_t _timeStart = CycleTime(); int box; PRAGMA_THREAD_ACROSS_BOXES(level,box) for(box=0;box<level->num_my_boxes;box++){ const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; const int dim = level->my_boxes[box].dim; double * __restrict__ x = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); // i.e. 
[0] = first non ghost zone point //double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID] + ghosts*(1+jStride+kStride); int box_on_low_i = (level->my_boxes[box].low.i == 0); int box_on_low_j = (level->my_boxes[box].low.j == 0); int box_on_low_k = (level->my_boxes[box].low.k == 0); int box_on_high_i = (level->my_boxes[box].low.i+dim == level->dim.i); int box_on_high_j = (level->my_boxes[box].low.j+dim == level->dim.j); int box_on_high_k = (level->my_boxes[box].low.k+dim == level->dim.k); if(level->boundary_condition.type == BC_DIRICHLET){ int i,j,k,normal; double s; // note, just because you are in a corner ghost zone, doesn't mean you are on the corner of the domain. // thus, one needs to calculate the normal to the domain (not normal to box) in each ghost zone region // depending on whether this normal is on a domain face, edge, or corner, one needs to choose 's' appropriately // calculate a normal vector for this face // if face is on a domain boundary, impose the boundary condition using the calculated normal s=1;if(box_on_low_i ){normal= 1+ 0+ 0;s*=-1;} if(box_on_low_i ){i= -1;j =0;k =0;for(j=0;j<dim;j++)for(k=0;k<dim;k++){int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} s=1;if(box_on_low_j ){normal= 0+jStride+ 0;s*=-1;} if(box_on_low_j ){i= 0;j= -1;k =0;for(k=0;k<dim;k++)for(i=0;i<dim;i++){int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} s=1;if(box_on_low_k ){normal= 0+ 0+kStride;s*=-1;} if(box_on_low_k ){i= 0;j =0;k= -1;for(j=0;j<dim;j++)for(i=0;i<dim;i++){int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} s=1;if(box_on_high_i){normal=-1+ 0+ 0;s*=-1;} if(box_on_high_i){i=dim;j =0;k =0;for(j=0;j<dim;j++)for(k=0;k<dim;k++){int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} s=1;if(box_on_high_j){normal= 0-jStride+ 0;s*=-1;} if(box_on_high_j){i= 0;j=dim;k =0;for(k=0;k<dim;k++)for(i=0;i<dim;i++){int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} s=1;if(box_on_high_k){normal= 0+ 0-kStride;s*=-1;} if(box_on_high_k){i= 
0;j =0;k=dim;for(j=0;j<dim;j++)for(i=0;i<dim;i++){int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} // calculate a normal vector for this edge // if edge is on a domain boundary, impose the boundary condition using the calculated normal s=1;normal=0;if(box_on_low_j ){normal+=jStride;s*=-1;}if(box_on_low_k ){normal+=kStride;s*=-1;} if(box_on_low_j ||box_on_low_k ){i= 0;j= -1;k= -1;for(i=0;i<dim;i++){int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} s=1;normal=0;if(box_on_high_j){normal-=jStride;s*=-1;}if(box_on_low_k ){normal+=kStride;s*=-1;} if(box_on_high_j||box_on_low_k ){i= 0;j=dim;k= -1;for(i=0;i<dim;i++){int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} s=1;normal=0;if(box_on_low_j ){normal+=jStride;s*=-1;}if(box_on_high_k){normal-=kStride;s*=-1;} if(box_on_low_j ||box_on_high_k){i= 0;j= -1;k=dim;for(i=0;i<dim;i++){int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} s=1;normal=0;if(box_on_high_j){normal-=jStride;s*=-1;}if(box_on_high_k){normal-=kStride;s*=-1;} if(box_on_high_j||box_on_high_k){i= 0;j=dim;k=dim;for(i=0;i<dim;i++){int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} s=1;normal=0;if(box_on_low_i ){normal+= 1;s*=-1;}if(box_on_low_k ){normal+=kStride;s*=-1;} if(box_on_low_i ||box_on_low_k ){i= -1;j= 0;k= -1;for(j=0;j<dim;j++){int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} s=1;normal=0;if(box_on_high_i){normal-= 1;s*=-1;}if(box_on_low_k ){normal+=kStride;s*=-1;} if(box_on_high_i||box_on_low_k ){i=dim;j= 0;k= -1;for(j=0;j<dim;j++){int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} s=1;normal=0;if(box_on_low_i ){normal+= 1;s*=-1;}if(box_on_high_k){normal-=kStride;s*=-1;} if(box_on_low_i ||box_on_high_k){i= -1;j= 0;k=dim;for(j=0;j<dim;j++){int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} s=1;normal=0;if(box_on_high_i){normal-= 1;s*=-1;}if(box_on_high_k){normal-=kStride;s*=-1;} if(box_on_high_i||box_on_high_k){i=dim;j= 0;k=dim;for(j=0;j<dim;j++){int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} 
s=1;normal=0;if(box_on_low_i ){normal+= 1;s*=-1;}if(box_on_low_j ){normal+=jStride;s*=-1;} if(box_on_low_i ||box_on_low_j ){i= -1;j= -1;k= 0;for(k=0;k<dim;k++){int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} s=1;normal=0;if(box_on_high_i){normal-= 1;s*=-1;}if(box_on_low_j ){normal+=jStride;s*=-1;} if(box_on_high_i||box_on_low_j ){i=dim;j= -1;k= 0;for(k=0;k<dim;k++){int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} s=1;normal=0;if(box_on_low_i ){normal+= 1;s*=-1;}if(box_on_high_j){normal-=jStride;s*=-1;} if(box_on_low_i ||box_on_high_j){i= -1;j=dim;k= 0;for(k=0;k<dim;k++){int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} s=1;normal=0;if(box_on_high_i){normal-= 1;s*=-1;}if(box_on_high_j){normal-=jStride;s*=-1;} if(box_on_high_i||box_on_high_j){i=dim;j=dim;k= 0;for(k=0;k<dim;k++){int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} // calculate a normal vector for this corner // if corner is on a domain boundary, impose the boundary condition using the calculated normal s=1;normal=0;if(box_on_low_i ){normal+= 1;s*=-1;}if(box_on_low_j ){normal+=jStride;s*=-1;}if(box_on_low_k ){normal+=kStride;s*=-1;}if(box_on_low_i || box_on_low_j || box_on_low_k ){i= -1;j= -1;k= -1;{int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} s=1;normal=0;if(box_on_high_i){normal-= 1;s*=-1;}if(box_on_low_j ){normal+=jStride;s*=-1;}if(box_on_low_k ){normal+=kStride;s*=-1;}if(box_on_high_i|| box_on_low_j || box_on_low_k ){i=dim;j= -1;k= -1;{int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} s=1;normal=0;if(box_on_low_i ){normal+= 1;s*=-1;}if(box_on_high_j){normal-=jStride;s*=-1;}if(box_on_low_k ){normal+=kStride;s*=-1;}if(box_on_low_i || box_on_high_j|| box_on_low_k ){i= -1;j=dim;k= -1;{int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} s=1;normal=0;if(box_on_high_i){normal-= 1;s*=-1;}if(box_on_high_j){normal-=jStride;s*=-1;}if(box_on_low_k ){normal+=kStride;s*=-1;}if(box_on_high_i|| box_on_high_j|| box_on_low_k ){i=dim;j=dim;k= -1;{int 
ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} s=1;normal=0;if(box_on_low_i ){normal+= 1;s*=-1;}if(box_on_low_j ){normal+=jStride;s*=-1;}if(box_on_high_k){normal-=kStride;s*=-1;}if(box_on_low_i || box_on_low_j || box_on_high_k){i= -1;j= -1;k=dim;{int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} s=1;normal=0;if(box_on_high_i){normal-= 1;s*=-1;}if(box_on_low_j ){normal+=jStride;s*=-1;}if(box_on_high_k){normal-=kStride;s*=-1;}if(box_on_high_i|| box_on_low_j || box_on_high_k){i=dim;j= -1;k=dim;{int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} s=1;normal=0;if(box_on_low_i ){normal+= 1;s*=-1;}if(box_on_high_j){normal-=jStride;s*=-1;}if(box_on_high_k){normal-=kStride;s*=-1;}if(box_on_low_i || box_on_high_j|| box_on_high_k){i= -1;j=dim;k=dim;{int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} s=1;normal=0;if(box_on_high_i){normal-= 1;s*=-1;}if(box_on_high_j){normal-=jStride;s*=-1;}if(box_on_high_k){normal-=kStride;s*=-1;}if(box_on_high_i|| box_on_high_j|| box_on_high_k){i=dim;j=dim;k=dim;{int ijk=i+j*jStride+k*kStride;x[ijk]=s*x[ijk+normal];}} } } level->cycles.boundary_conditions += (uint64_t)(CycleTime()-_timeStart); }
//------------------------------------------------------------------------------------------------------------------------------ void rebuild_operator(level_type * level, level_type *fromLevel, double a, double b){ if(level->my_rank==0){printf(" rebuilding operator for level... h=%e ",level->h);fflush(stdout);} // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // form restriction of alpha[], beta_*[] coefficients from fromLevel if(fromLevel != NULL){ restriction(level,VECTOR_ALPHA ,fromLevel,VECTOR_ALPHA ,RESTRICT_CELL ); restriction(level,VECTOR_BETA_I,fromLevel,VECTOR_BETA_I,RESTRICT_FACE_I); restriction(level,VECTOR_BETA_J,fromLevel,VECTOR_BETA_J,RESTRICT_FACE_J); restriction(level,VECTOR_BETA_K,fromLevel,VECTOR_BETA_K,RESTRICT_FACE_K); } // else case assumes alpha/beta have been set // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // exchange alpha/beta/... 
(must be done before calculating Dinv) exchange_boundary(level,VECTOR_ALPHA ,0); // must be 0(faces,edges,corners) for CA version or 27pt exchange_boundary(level,VECTOR_BETA_I,0); exchange_boundary(level,VECTOR_BETA_J,0); exchange_boundary(level,VECTOR_BETA_K,0); // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // calculate Dinv, L1inv, and estimate the dominant Eigenvalue uint64_t _timeStart = CycleTime(); int printedError=0; int box; double dominant_eigenvalue = -1e9; #pragma omp parallel for private(box) OMP_THREAD_ACROSS_BOXES(level->concurrent_boxes) reduction(max:dominant_eigenvalue) schedule(static) for(box=0;box<level->num_my_boxes;box++){ int i,j,k; int lowi = level->my_boxes[box].low.i; int lowj = level->my_boxes[box].low.j; int lowk = level->my_boxes[box].low.k; int jStride = level->my_boxes[box].jStride; int kStride = level->my_boxes[box].kStride; int ghosts = level->my_boxes[box].ghosts; int dim = level->my_boxes[box].dim; double h2inv = 1.0/(level->h*level->h); double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); double * __restrict__ L1inv = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride); double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); double box_eigenvalue = -1e9; #pragma omp parallel for private(k,j,i) OMP_THREAD_WITHIN_A_BOX(level->threads_per_box) reduction(max:box_eigenvalue) schedule(static) for(k=0;k<dim;k++){ 
for(j=0;j<dim;j++){ for(i=0;i<dim;i++){ int ijk = i + j*jStride + k*kStride; #if 0 // FIX This looks wrong, but is faster... theory is because its doing something akin to SOR // radius of Gershgorin disc is the sum of the absolute values of the off-diagonal elements... double sumAbsAij = fabs(b*h2inv*beta_i[ijk]) + fabs(b*h2inv*beta_i[ijk+ 1]) + fabs(b*h2inv*beta_j[ijk]) + fabs(b*h2inv*beta_j[ijk+jStride]) + fabs(b*h2inv*beta_k[ijk]) + fabs(b*h2inv*beta_k[ijk+kStride]); // centr of Gershgorin disc is the diagonal element... double Aii = a*alpha[ijk] - b*h2inv*( -beta_i[ijk]-beta_i[ijk+ 1] -beta_j[ijk]-beta_j[ijk+jStride] -beta_k[ijk]-beta_k[ijk+kStride] ); #endif #if 1 // radius of Gershgorin disc is the sum of the absolute values of the off-diagonal elements... double sumAbsAij = fabs(b*h2inv) * ( fabs( beta_i[ijk ]*valid[ijk-1 ] )+ fabs( beta_j[ijk ]*valid[ijk-jStride] )+ fabs( beta_k[ijk ]*valid[ijk-kStride] )+ fabs( beta_i[ijk+1 ]*valid[ijk+1 ] )+ fabs( beta_j[ijk+jStride]*valid[ijk+jStride] )+ fabs( beta_k[ijk+kStride]*valid[ijk+kStride] ) ); // centr of Gershgorin disc is the diagonal element... double Aii = a*alpha[ijk] - b*h2inv*( beta_i[ijk ]*( valid[ijk-1 ]-2.0 )+ beta_j[ijk ]*( valid[ijk-jStride]-2.0 )+ beta_k[ijk ]*( valid[ijk-kStride]-2.0 )+ beta_i[ijk+1 ]*( valid[ijk+1 ]-2.0 )+ beta_j[ijk+jStride]*( valid[ijk+jStride]-2.0 )+ beta_k[ijk+kStride]*( valid[ijk+kStride]-2.0 ) ); #endif Dinv[ijk] = 1.0/Aii; // inverse of the diagonal Aii //L1inv[ijk] = 1.0/(Aii+sumAbsAij); // inverse of the L1 row norm // L1inv = ( D+D^{L1} )^{-1} // as suggested by eq 6.5 in Baker et al, "Multigrid smoothers for ultra-parallel computing: additional theory and discussion"... 
if(Aii>=1.5*sumAbsAij)L1inv[ijk] = 1.0/(Aii ); // else L1inv[ijk] = 1.0/(Aii+0.5*sumAbsAij); // double Di = (Aii + sumAbsAij)/Aii;if(Di>box_eigenvalue)box_eigenvalue=Di; // upper limit to Gershgorin disc == bound on dominant eigenvalue }}} if(box_eigenvalue>dominant_eigenvalue){dominant_eigenvalue = box_eigenvalue;} } level->cycles.blas1 += (uint64_t)(CycleTime()-_timeStart); // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // Reduce the local estimates dominant eigenvalue to a global estimate #ifdef USE_MPI uint64_t _timeStartAllReduce = CycleTime(); double send = dominant_eigenvalue; MPI_Allreduce(&send,&dominant_eigenvalue,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD); uint64_t _timeEndAllReduce = CycleTime(); level->cycles.collectives += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); #endif if(level->my_rank==0){printf("eigenvalue_max<%e\n",dominant_eigenvalue);fflush(stdout);} level->dominant_eigenvalue_of_DinvA = dominant_eigenvalue; // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // exchange Dinv/L1inv/... exchange_boundary(level,VECTOR_DINV ,0); // must be 0(faces,edges,corners) for CA version exchange_boundary(level,VECTOR_L1INV,0); // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - }
//------------------------------------------------------------------------------------------------------------------------------ void init_timer() { uint64_t t0 = CycleTime(); sleep(1); uint64_t t1 = CycleTime(); frequency = (double)(t1-t0); }
//------------------------------------------------------------------------------------------------------------------------------ // Samuel Williams // [email protected] // Lawrence Berkeley National Lab //------------------------------------------------------------------------------------------------------------------------------ void apply_BCs_linear(level_type * level, int x_id){ if(level->domain_boundary_condition == BC_PERIODIC)return; // no BC's to apply ! // for cell-centered, we need to fill in the ghost zones to apply any BC's // this code does a simple linear interpolation for homogeneous dirichlet // // . . . . . . . . . . . . . . . . . . // . . . . . . // . ? . ? . .+x(0,0).-x(0,0). // . . . . . . // . . . . +-------+ . . . . +-------+ // . | | . | | // . ? | x(0,0)| .-x(0,0)| x(0,0)| // . | | . | | // . . . . +-------+ . . . . +-------+ // ^ // domain boundary is the face... i.e. between two array indices !!! // uint64_t _timeStart = CycleTime(); int omp_across_boxes = 1; int omp_within_a_box = 0; int box; #pragma omp parallel for private(box) OMP_THREAD_ACROSS_BOXES(level->concurrent_boxes) for(box=0;box<level->num_my_boxes;box++){ int i,j,k,s; int jStride = level->my_boxes[box].jStride; int kStride = level->my_boxes[box].kStride; int ghosts = level->my_boxes[box].ghosts; int dim = level->my_boxes[box].dim; double h2inv = 1.0/(level->h*level->h); double * __restrict__ x = level->my_boxes[box].vectors[ x_id] + ghosts*(1+jStride+kStride); // i.e. [0] = first non ghost zone point double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID] + ghosts*(1+jStride+kStride); if(level->domain_boundary_condition == BC_DIRICHLET){ // why these and not -1, -5, +77 ??? 
k= -1;if((level->my_boxes[box].low.k == 0)) for(j=0;j<dim;j++)for(i=0;i<dim;i++){int ijk=i+j*jStride+k*kStride;x[ijk]=-x[ijk +kStride];} // face j= -1; if((level->my_boxes[box].low.j == 0)) for(k=0;k<dim;k++)for(i=0;i<dim;i++){int ijk=i+j*jStride+k*kStride;x[ijk]=-x[ijk +jStride ];} // face i= -1; if((level->my_boxes[box].low.i == 0)) for(k=0;k<dim;k++)for(j=0;j<dim;j++){int ijk=i+j*jStride+k*kStride;x[ijk]=-x[ijk+1 ];} // face i=dim; if((level->my_boxes[box].low.i+dim == level->dim.i)) for(k=0;k<dim;k++)for(j=0;j<dim;j++){int ijk=i+j*jStride+k*kStride;x[ijk]=-x[ijk-1 ];} // face j=dim; if((level->my_boxes[box].low.j+dim == level->dim.j)) for(k=0;k<dim;k++)for(i=0;i<dim;i++){int ijk=i+j*jStride+k*kStride;x[ijk]=-x[ijk -jStride ];} // face k=dim;if((level->my_boxes[box].low.k+dim == level->dim.k)) for(j=0;j<dim;j++)for(i=0;i<dim;i++){int ijk=i+j*jStride+k*kStride;x[ijk]=-x[ijk -kStride];} // face j= -1;k= -1;if((level->my_boxes[box].low.j == 0)&&(level->my_boxes[box].low.k == 0)) for(i=0;i<dim;i++){int ijk=i+j*jStride+k*kStride;x[ijk]=x[ijk +jStride+kStride];} // edge i= -1; k= -1;if((level->my_boxes[box].low.i == 0)&&(level->my_boxes[box].low.k == 0)) for(j=0;j<dim;j++){int ijk=i+j*jStride+k*kStride;x[ijk]=x[ijk+1 +kStride];} // edge i=dim; k= -1;if((level->my_boxes[box].low.i+dim == level->dim.i)&&(level->my_boxes[box].low.k == 0)) for(j=0;j<dim;j++){int ijk=i+j*jStride+k*kStride;x[ijk]=x[ijk-1 +kStride];} // edge j=dim;k= -1;if((level->my_boxes[box].low.j+dim == level->dim.j)&&(level->my_boxes[box].low.k == 0)) for(i=0;i<dim;i++){int ijk=i+j*jStride+k*kStride;x[ijk]=x[ijk -jStride+kStride];} // edge i= -1;j= -1; if((level->my_boxes[box].low.i == 0)&&(level->my_boxes[box].low.j == 0)) for(k=0;k<dim;k++){int ijk=i+j*jStride+k*kStride;x[ijk]=x[ijk+1+jStride ];} // edge i=dim;j= -1; if((level->my_boxes[box].low.i+dim == level->dim.i)&&(level->my_boxes[box].low.j == 0)) for(k=0;k<dim;k++){int ijk=i+j*jStride+k*kStride;x[ijk]=x[ijk-1+jStride ];} // edge i= -1;j=dim; 
if((level->my_boxes[box].low.i == 0)&&(level->my_boxes[box].low.j+dim == level->dim.j)) for(k=0;k<dim;k++){int ijk=i+j*jStride+k*kStride;x[ijk]=x[ijk+1-jStride ];} // edge i=dim;j=dim; if((level->my_boxes[box].low.i+dim == level->dim.i)&&(level->my_boxes[box].low.j+dim == level->dim.j)) for(k=0;k<dim;k++){int ijk=i+j*jStride+k*kStride;x[ijk]=x[ijk-1-jStride ];} // edge j= -1;k=dim;if((level->my_boxes[box].low.j == 0)&&(level->my_boxes[box].low.k+dim == level->dim.k)) for(i=0;i<dim;i++){int ijk=i+j*jStride+k*kStride;x[ijk]=x[ijk +jStride-kStride];} // edge i= -1; k=dim;if((level->my_boxes[box].low.i == 0)&&(level->my_boxes[box].low.k+dim == level->dim.k)) for(j=0;j<dim;j++){int ijk=i+j*jStride+k*kStride;x[ijk]=x[ijk+1 -kStride];} // edge i=dim; k=dim;if((level->my_boxes[box].low.i+dim == level->dim.i)&&(level->my_boxes[box].low.k+dim == level->dim.k)) for(j=0;j<dim;j++){int ijk=i+j*jStride+k*kStride;x[ijk]=x[ijk-1 -kStride];} // edge j=dim;k=dim;if((level->my_boxes[box].low.j+dim == level->dim.j)&&(level->my_boxes[box].low.k+dim == level->dim.k)) for(i=0;i<dim;i++){int ijk=i+j*jStride+k*kStride;x[ijk]=x[ijk -jStride-kStride];} // edge i= -1;j= -1;k= -1;if((level->my_boxes[box].low.i == 0)&&(level->my_boxes[box].low.j == 0)&&(level->my_boxes[box].low.k == 0)){int ijk=i+j*jStride+k*kStride;x[ijk]=-x[ijk+1+jStride+kStride];} // corner i=dim;j= -1;k= -1;if((level->my_boxes[box].low.i+dim == level->dim.i)&&(level->my_boxes[box].low.j == 0)&&(level->my_boxes[box].low.k == 0)){int ijk=i+j*jStride+k*kStride;x[ijk]=-x[ijk-1+jStride+kStride];} // corner i= -1;j=dim;k= -1;if((level->my_boxes[box].low.i == 0)&&(level->my_boxes[box].low.j+dim == level->dim.j)&&(level->my_boxes[box].low.k == 0)){int ijk=i+j*jStride+k*kStride;x[ijk]=-x[ijk+1-jStride+kStride];} // corner i=dim;j=dim;k= -1;if((level->my_boxes[box].low.i+dim == level->dim.i)&&(level->my_boxes[box].low.j+dim == level->dim.j)&&(level->my_boxes[box].low.k == 0)){int 
ijk=i+j*jStride+k*kStride;x[ijk]=-x[ijk-1-jStride+kStride];} // corner i= -1;j= -1;k=dim;if((level->my_boxes[box].low.i == 0)&&(level->my_boxes[box].low.j == 0)&&(level->my_boxes[box].low.k+dim == level->dim.k)){int ijk=i+j*jStride+k*kStride;x[ijk]=-x[ijk+1+jStride-kStride];} // corner i=dim;j= -1;k=dim;if((level->my_boxes[box].low.i+dim == level->dim.i)&&(level->my_boxes[box].low.j == 0)&&(level->my_boxes[box].low.k+dim == level->dim.k)){int ijk=i+j*jStride+k*kStride;x[ijk]=-x[ijk-1+jStride-kStride];} // corner i= -1;j=dim;k=dim;if((level->my_boxes[box].low.i == 0)&&(level->my_boxes[box].low.j+dim == level->dim.j)&&(level->my_boxes[box].low.k+dim == level->dim.k)){int ijk=i+j*jStride+k*kStride;x[ijk]=-x[ijk+1-jStride-kStride];} // corner i=dim;j=dim;k=dim;if((level->my_boxes[box].low.i+dim == level->dim.i)&&(level->my_boxes[box].low.j+dim == level->dim.j)&&(level->my_boxes[box].low.k+dim == level->dim.k)){int ijk=i+j*jStride+k*kStride;x[ijk]=-x[ijk-1-jStride-kStride];} // corner } } level->cycles.boundary_conditions += (uint64_t)(CycleTime()-_timeStart); }
//------------------------------------------------------------------------------------------------------------------------------ // Samuel Williams // [email protected] // Lawrence Berkeley National Lab //------------------------------------------------------------------------------------------------------------------------------ // perform a (intra-level) ghost zone exchange // NOTE exchange_boundary() only exchanges the boundary. // It will not enforce any boundary conditions // BC's are either the responsibility of a separate function or should be fused into the stencil void exchange_boundary(level_type * level, int id, int justFaces){ uint64_t _timeCommunicationStart = CycleTime(); uint64_t _timeStart,_timeEnd; int buffer=0; int n; if(justFaces)justFaces=1;else justFaces=0; // must be 0 or 1 in order to index into exchange_ghosts[] #ifdef USE_MPI int nMessages = level->exchange_ghosts[justFaces].num_recvs + level->exchange_ghosts[justFaces].num_sends; //MPI_Request *recv_requests = level->exchange_ghosts[justFaces].requests; //MPI_Request *send_requests = level->exchange_ghosts[justFaces].requests + level->exchange_ghosts[justFaces].num_recvs; MPI_Request *send_requests = level->exchange_ghosts[justFaces].requests; MPI_Request *recv_requests = level->exchange_ghosts[justFaces].requests + level->exchange_ghosts[justFaces].num_sends; // loop through packed list of MPI receives and prepost Irecv's... 
_timeStart = CycleTime(); #ifdef USE_MPI_THREAD_MULTIPLE #pragma omp parallel for schedule(dynamic,1) #endif for(n=0;n<level->exchange_ghosts[justFaces].num_recvs;n++){ MPI_Irecv(level->exchange_ghosts[justFaces].recv_buffers[n], level->exchange_ghosts[justFaces].recv_sizes[n], MPI_DOUBLE, level->exchange_ghosts[justFaces].recv_ranks[n], 0, // by convention, ghost zone exchanges use tag=0 MPI_COMM_WORLD, //&level->exchange_ghosts[justFaces].requests[n] //&recv_requests[n] &recv_requests[n] ); } _timeEnd = CycleTime(); level->cycles.ghostZone_recv += (_timeEnd-_timeStart); // pack MPI send buffers... _timeStart = CycleTime(); #pragma omp parallel for if(level->exchange_ghosts[justFaces].num_blocks[0]>1) schedule(static,1) for(buffer=0;buffer<level->exchange_ghosts[justFaces].num_blocks[0];buffer++){CopyBlock(level,id,&level->exchange_ghosts[justFaces].blocks[0][buffer]);} _timeEnd = CycleTime(); level->cycles.ghostZone_pack += (_timeEnd-_timeStart); // loop through MPI send buffers and post Isend's... _timeStart = CycleTime(); #ifdef USE_MPI_THREAD_MULTIPLE #pragma omp parallel for schedule(dynamic,1) #endif for(n=0;n<level->exchange_ghosts[justFaces].num_sends;n++){ MPI_Isend(level->exchange_ghosts[justFaces].send_buffers[n], level->exchange_ghosts[justFaces].send_sizes[n], MPI_DOUBLE, level->exchange_ghosts[justFaces].send_ranks[n], 0, // by convention, ghost zone exchanges use tag=0 MPI_COMM_WORLD, &send_requests[n] //&level->exchange_ghosts[justFaces].requests[n+level->exchange_ghosts[justFaces].num_recvs] // requests[0..num_recvs-1] were used by recvs. So sends start at num_recvs ); } _timeEnd = CycleTime(); level->cycles.ghostZone_send += (_timeEnd-_timeStart); #endif // exchange locally... try and hide within Isend latency... 
_timeStart = CycleTime(); #pragma omp parallel for if(level->exchange_ghosts[justFaces].num_blocks[1]>1) schedule(static,1) for(buffer=0;buffer<level->exchange_ghosts[justFaces].num_blocks[1];buffer++){CopyBlock(level,id,&level->exchange_ghosts[justFaces].blocks[1][buffer]);} _timeEnd = CycleTime(); level->cycles.ghostZone_local += (_timeEnd-_timeStart); // wait for MPI to finish... #ifdef USE_MPI _timeStart = CycleTime(); if(nMessages)MPI_Waitall(nMessages,level->exchange_ghosts[justFaces].requests,level->exchange_ghosts[justFaces].status); _timeEnd = CycleTime(); level->cycles.ghostZone_wait += (_timeEnd-_timeStart); // unpack MPI receive buffers _timeStart = CycleTime(); #pragma omp parallel for if(level->exchange_ghosts[justFaces].num_blocks[2]>1) schedule(static,1) for(buffer=0;buffer<level->exchange_ghosts[justFaces].num_blocks[2];buffer++){CopyBlock(level,id,&level->exchange_ghosts[justFaces].blocks[2][buffer]);} _timeEnd = CycleTime(); level->cycles.ghostZone_unpack += (_timeEnd-_timeStart); #endif level->cycles.ghostZone_total += (uint64_t)(CycleTime()-_timeCommunicationStart); }
//------------------------------------------------------------------------------------------------------------------------------ // perform a (inter-level) restriction void restriction(level_type * level_c, int id_c, level_type *level_f, int id_f, int restrictionType){ uint64_t _timeCommunicationStart = CycleTime(); uint64_t _timeStart,_timeEnd; int buffer=0; int n; int my_tag = (level_f->tag<<4) | 0x5; #ifdef USE_MPI // by convention, level_f allocates a combined array of requests for both level_f sends and level_c recvs... int nMessages = level_c->restriction[restrictionType].num_recvs + level_f->restriction[restrictionType].num_sends; MPI_Request *recv_requests = level_f->restriction[restrictionType].requests; MPI_Request *send_requests = level_f->restriction[restrictionType].requests + level_c->restriction[restrictionType].num_recvs; // loop through packed list of MPI receives and prepost Irecv's... _timeStart = CycleTime(); #ifdef USE_MPI_THREAD_MULTIPLE #pragma omp parallel for schedule(dynamic,1) #endif for(n=0;n<level_c->restriction[restrictionType].num_recvs;n++){ MPI_Irecv(level_c->restriction[restrictionType].recv_buffers[n], level_c->restriction[restrictionType].recv_sizes[n], MPI_DOUBLE, level_c->restriction[restrictionType].recv_ranks[n], my_tag, MPI_COMM_WORLD, &recv_requests[n] ); } _timeEnd = CycleTime(); level_f->cycles.restriction_recv += (_timeEnd-_timeStart); // pack MPI send buffers... _timeStart = CycleTime(); PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_f->restriction[restrictionType].num_blocks[0]) for(buffer=0;buffer<level_f->restriction[restrictionType].num_blocks[0];buffer++){RestrictBlock(level_c,id_c,level_f,id_f,&level_f->restriction[restrictionType].blocks[0][buffer],restrictionType);} _timeEnd = CycleTime(); level_f->cycles.restriction_pack += (_timeEnd-_timeStart); // loop through MPI send buffers and post Isend's... 
_timeStart = CycleTime(); #ifdef USE_MPI_THREAD_MULTIPLE #pragma omp parallel for schedule(dynamic,1) #endif for(n=0;n<level_f->restriction[restrictionType].num_sends;n++){ MPI_Isend(level_f->restriction[restrictionType].send_buffers[n], level_f->restriction[restrictionType].send_sizes[n], MPI_DOUBLE, level_f->restriction[restrictionType].send_ranks[n], my_tag, MPI_COMM_WORLD, &send_requests[n] ); } _timeEnd = CycleTime(); level_f->cycles.restriction_send += (_timeEnd-_timeStart); #endif // perform local restriction[restrictionType]... try and hide within Isend latency... _timeStart = CycleTime(); PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_f->restriction[restrictionType].num_blocks[1]) for(buffer=0;buffer<level_f->restriction[restrictionType].num_blocks[1];buffer++){RestrictBlock(level_c,id_c,level_f,id_f,&level_f->restriction[restrictionType].blocks[1][buffer],restrictionType);} _timeEnd = CycleTime(); level_f->cycles.restriction_local += (_timeEnd-_timeStart); // wait for MPI to finish... #ifdef USE_MPI _timeStart = CycleTime(); if(nMessages)MPI_Waitall(nMessages,level_f->restriction[restrictionType].requests,level_f->restriction[restrictionType].status); _timeEnd = CycleTime(); level_f->cycles.restriction_wait += (_timeEnd-_timeStart); // unpack MPI receive buffers _timeStart = CycleTime(); PRAGMA_THREAD_ACROSS_BLOCKS(level_f,buffer,level_c->restriction[restrictionType].num_blocks[2]) for(buffer=0;buffer<level_c->restriction[restrictionType].num_blocks[2];buffer++){CopyBlock(level_c,id_c,&level_c->restriction[restrictionType].blocks[2][buffer]);} _timeEnd = CycleTime(); level_f->cycles.restriction_unpack += (_timeEnd-_timeStart); #endif level_f->cycles.restriction_total += (uint64_t)(CycleTime()-_timeCommunicationStart); }
//------------------------------------------------------------------------------------------------------------------------------ void rebuild_operator(level_type * level, level_type *fromLevel, double a, double b){ if(level->my_rank==0){fprintf(stdout," rebuilding 27pt CC operator for level... h=%e ",level->h);} // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // form restriction of alpha[], beta_*[] coefficients from fromLevel if(fromLevel != NULL){ restriction(level,VECTOR_ALPHA ,fromLevel,VECTOR_ALPHA ,RESTRICT_CELL ); restriction(level,VECTOR_BETA_I,fromLevel,VECTOR_BETA_I,RESTRICT_FACE_I); restriction(level,VECTOR_BETA_J,fromLevel,VECTOR_BETA_J,RESTRICT_FACE_J); restriction(level,VECTOR_BETA_K,fromLevel,VECTOR_BETA_K,RESTRICT_FACE_K); } // else case assumes alpha/beta have been set // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // exchange alpha/beta/... 
(must be done before calculating Dinv) exchange_boundary(level,VECTOR_ALPHA ,0); // must be 0(faces,edges,corners) for CA version or 27pt exchange_boundary(level,VECTOR_BETA_I,0); exchange_boundary(level,VECTOR_BETA_J,0); exchange_boundary(level,VECTOR_BETA_K,0); // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // calculate Dinv, L1inv, and estimate the dominant Eigenvalue uint64_t _timeStart = CycleTime(); int block; double dominant_eigenvalue = -1e9; PRAGMA_THREAD_ACROSS_BLOCKS_MAX(level,block,level->num_my_blocks,dominant_eigenvalue) for(block=0;block<level->num_my_blocks;block++){ const int box = level->my_blocks[block].read.box; const int ilo = level->my_blocks[block].read.i; const int jlo = level->my_blocks[block].read.j; const int klo = level->my_blocks[block].read.k; const int ihi = level->my_blocks[block].dim.i + ilo; const int jhi = level->my_blocks[block].dim.j + jlo; const int khi = level->my_blocks[block].dim.k + klo; int i,j,k; const int jStride = level->my_boxes[box].jStride; const int kStride = level->my_boxes[box].kStride; const int ghosts = level->my_boxes[box].ghosts; double h2inv = 1.0/(level->h*level->h); double * __restrict__ alpha = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride); double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride); double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride); double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride); double * __restrict__ Dinv = level->my_boxes[box].vectors[VECTOR_DINV ] + ghosts*(1+jStride+kStride); double * __restrict__ L1inv = level->my_boxes[box].vectors[VECTOR_L1INV ] + ghosts*(1+jStride+kStride); double * __restrict__ valid = level->my_boxes[box].vectors[VECTOR_VALID ] + ghosts*(1+jStride+kStride); double block_eigenvalue = -1e9; 
for(k=klo;k<khi;k++){ for(j=jlo;j<jhi;j++){ for(i=ilo;i<ihi;i++){ int ijk = i + j*jStride + k*kStride; // radius of Gershgorin disc is the sum of the absolute values of the off-diagonal elements... double sumAbsAij = fabs(b*h2inv*6.0*STENCIL_COEF1) + fabs(b*h2inv*12.0*STENCIL_COEF2) + fabs(b*h2inv*8.0*STENCIL_COEF3); // center of Gershgorin disc is the diagonal element... double Aii = a - b*h2inv*( STENCIL_COEF0 ); Dinv[ijk] = 1.0/Aii; // inverse of the diagonal Aii //L1inv[ijk] = 1.0/(Aii+sumAbsAij); // inverse of the L1 row norm... L1inv = ( D+D^{L1} )^{-1} // as suggested by eq 6.5 in Baker et al, "Multigrid smoothers for ultra-parallel computing: additional theory and discussion"... if(Aii>=1.5*sumAbsAij)L1inv[ijk] = 1.0/(Aii ); // else L1inv[ijk] = 1.0/(Aii+0.5*sumAbsAij); // double Di = (Aii + sumAbsAij)/Aii;if(Di>block_eigenvalue)block_eigenvalue=Di; // upper limit to Gershgorin disc == bound on dominant eigenvalue }}} if(block_eigenvalue>dominant_eigenvalue){dominant_eigenvalue = block_eigenvalue;} } level->cycles.blas1 += (uint64_t)(CycleTime()-_timeStart); // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // Reduce the local estimates dominant eigenvalue to a global estimate #ifdef USE_MPI uint64_t _timeStartAllReduce = CycleTime(); double send = dominant_eigenvalue; MPI_Allreduce(&send,&dominant_eigenvalue,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD); uint64_t _timeEndAllReduce = CycleTime(); level->cycles.collectives += (uint64_t)(_timeEndAllReduce-_timeStartAllReduce); #endif if(level->my_rank==0){fprintf(stdout,"eigenvalue_max<%e\n",dominant_eigenvalue);} level->dominant_eigenvalue_of_DinvA = dominant_eigenvalue; // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // exchange Dinv/L1inv/... 
exchange_boundary(level,VECTOR_DINV ,0); // must be 0(faces,edges,corners) for CA version exchange_boundary(level,VECTOR_L1INV,0); // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - }