Beispiel #1
0
/**************************************************************************
 Function: reference_jacobi

 This routine contains the main iteration loop for the Jacobi iteration
 reference implementation (no OpenCL).

 params:
    a           two arrays to compute solution into
    max_iter    maximum number of iterations   
    size        size of array for this MPI rank
    tolerance   all differences should be les than this tolerance value
    mpi_ranks   number of MPI ranks in each dimension
    rank_pos    cartesian position of this rank    
    origin      origin for this rank
    d           discretion size
    mpi_comm    MPI communications structure
 **************************************************************************/
static void reference_jacobi(value_type *a[2],
                            unsigned int max_iter,
                            size_t size[DIMENSIONS],
                            value_type tolerance,
                            value_type d[DIMENSIONS]) {

    unsigned int rc, iter = 0;
    value_type max_diff, timer;

    /* init arrays by setting the initial value and the boundary conditions */
    set_initial_solution(a[OLD], size, INITIAL_GUESS);
    set_initial_solution(a[NEW], size, INITIAL_GUESS);
    set_boundary_conditions(a[OLD], size, d);
    set_boundary_conditions(a[NEW], size, d);

    /* print the initial solution guess */
    print_array("Init ", a[NEW], size, d);

    /*  iterate until maximum difference is less than the given tolerance
        or number of iterations is too high
     */
    do {
        /* swap array pointers for next iteration */
        SWAP_PTR(a[OLD], a[NEW]);

        /* iterate using a[OLD] as the input and a[NEW] as the output */
        max_diff = reference_jacobi_kernel(a[OLD], a[NEW], size);

        /* output status for user, overwrite the same line */
        if (0 == iter % 100) {
            printf("Iteration=%5d, max difference=%0.7f, target=%0.7f\r",
                iter, max_diff, tolerance);
            fflush(stdout);
        }

        /* increment counter */
        iter++;
    } while (max_diff > tolerance && max_iter > iter); /* do loop */

    /* output final iteration count and maximum difference value */
    printf("Iteration=%5d, max difference=%0.7f, execution time=%.3f seconds\n",
                    iter, max_diff, timer);

}
Beispiel #2
0
int invert_doublet_eo_quda(spinor * const Even_new_s, spinor * const Odd_new_s,
                           spinor * const Even_new_c, spinor * const Odd_new_c,
                           spinor * const Even_s, spinor * const Odd_s,
                           spinor * const Even_c, spinor * const Odd_c,
                           const double precision, const int max_iter,
                           const int solver_flag, const int rel_prec, const int even_odd_flag,
                           const SloppyPrecision sloppy_precision,
                           CompressionType compression) {

  spinor ** solver_field = NULL;
  const int nr_sf = 4;
  init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);

  convert_eo_to_lexic(solver_field[0],   Even_s,  Odd_s);
  convert_eo_to_lexic(solver_field[1],   Even_c,  Odd_c);
//  convert_eo_to_lexic(g_spinor_field[DUM_DERI+1], Even_new, Odd_new);

  void *spinorIn    = (void*)solver_field[0]; // source
  void *spinorIn_c  = (void*)solver_field[1]; // charme source
  void *spinorOut   = (void*)solver_field[2]; // solution
  void *spinorOut_c = (void*)solver_field[3]; // charme solution

  if ( rel_prec )
    inv_param.residual_type = QUDA_L2_RELATIVE_RESIDUAL;
  else
    inv_param.residual_type = QUDA_L2_ABSOLUTE_RESIDUAL;

  inv_param.kappa = g_kappa;

  // IMPORTANT: use opposite TM mu-flavor since gamma5 -> -gamma5
  inv_param.mu      = -g_mubar /2./g_kappa;
  inv_param.epsilon =  g_epsbar/2./g_kappa;


  // figure out which BC to use (theta, trivial...)
  set_boundary_conditions(&compression);

  // set the sloppy precision of the mixed prec solver
  set_sloppy_prec(sloppy_precision);

  // load gauge after setting precision
   _loadGaugeQuda(compression);

  // choose dslash type
  if( g_c_sw > 0.0 ) {
    inv_param.dslash_type = QUDA_TWISTED_CLOVER_DSLASH;
    inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
    inv_param.solution_type = QUDA_MAT_SOLUTION;
    inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
    inv_param.clover_coeff = g_c_sw*g_kappa;
  }
  else {
    inv_param.dslash_type = QUDA_TWISTED_MASS_DSLASH;
    inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN_ASYMMETRIC;
    inv_param.solution_type = QUDA_MAT_SOLUTION;
  }

  // choose solver
  if(solver_flag == BICGSTAB) {
    if(g_proc_id == 0) {printf("# QUDA: Using BiCGstab!\n"); fflush(stdout);}
    inv_param.inv_type = QUDA_BICGSTAB_INVERTER;
  }
  else {
    /* Here we invert the hermitean operator squared */
    inv_param.inv_type = QUDA_CG_INVERTER;
    if(g_proc_id == 0) {
      printf("# QUDA: Using mixed precision CG!\n");
      printf("# QUDA: mu = %f, kappa = %f\n", g_mu/2./g_kappa, g_kappa);
      fflush(stdout);
    }
  }

  if( even_odd_flag ) {
    inv_param.solve_type = QUDA_NORMOP_PC_SOLVE;
    if(g_proc_id == 0) printf("# QUDA: Using preconditioning!\n");
  }
  else {
    inv_param.solve_type = QUDA_NORMOP_SOLVE;
    if(g_proc_id == 0) printf("# QUDA: Not using preconditioning!\n");
  }

  inv_param.tol = sqrt(precision)*0.25;
  inv_param.maxiter = max_iter;

  inv_param.twist_flavor = QUDA_TWIST_NONDEG_DOUBLET;
  inv_param.Ls = 2;

  // NULL pointers to host fields to force
  // construction instead of download of the clover field:
  if( g_c_sw > 0.0 )
    loadCloverQuda(NULL, NULL, &inv_param);

  // reorder spinor
  reorder_spinor_toQuda( (double*)spinorIn,   inv_param.cpu_prec, 1, (double*)spinorIn_c );

  // perform the inversion
  invertQuda(spinorOut, spinorIn, &inv_param);

  if( inv_param.verbosity == QUDA_VERBOSE )
    if(g_proc_id == 0)
      printf("# QUDA: Device memory used:  Spinor: %f GiB,  Gauge: %f GiB, Clover: %f GiB\n",
   inv_param.spinorGiB, gauge_param.gaugeGiB, inv_param.cloverGiB);
  if( inv_param.verbosity > QUDA_SILENT )
    if(g_proc_id == 0)
      printf("# QUDA: Done: %i iter / %g secs = %g Gflops\n",
   inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs);

  // number of CG iterations
  int iteration = inv_param.iter;

  // reorder spinor
  reorder_spinor_fromQuda( (double*)spinorIn,    inv_param.cpu_prec, 1, (double*)spinorIn_c );
  reorder_spinor_fromQuda( (double*)spinorOut,   inv_param.cpu_prec, 1, (double*)spinorOut_c );
  convert_lexic_to_eo(Even_s,     Odd_s,     solver_field[0]);
  convert_lexic_to_eo(Even_c,     Odd_c,     solver_field[1]);
  convert_lexic_to_eo(Even_new_s, Odd_new_s, solver_field[2]);
  convert_lexic_to_eo(Even_new_c, Odd_new_c, solver_field[3]);

  finalize_solver(solver_field, nr_sf);
  freeGaugeQuda();

  if(iteration >= max_iter)
    return(-1);

  return(iteration);
}
int main()
{
      Array3<double> level_set;				// level-set field
      Array3<double> pressure;				// pressure field
      Array3<double> u_1_velocity_old;			// velocity field at old time level x1 direction
      Array3<double> u_2_velocity_old;			// velocity field at old time level x2 direction
      Array3<double> u_3_velocity_old;			// velocity field at old time level x3 direction
      Array3<double> u_1_velocity_new;			// velocity field at new time level x1 direction
      Array3<double> u_2_velocity_new;			// velocity field at new time level x2 direction
      Array3<double> u_3_velocity_new;			// velocity field at new time level x3 direction
      Array3<double> momentum_source_term_u_1;		// source term of the momentum equation in x1 direction
					       		// defined on all u1 points (including boundaries)
      Array3<double> momentum_source_term_u_2;		// source term of the momentum equation in x2 direction
					        	// defined on all u1 points (including boundaries)
      Array3<double> momentum_source_term_u_3;		// source term of the momentum equation in x3 direction
					        	// defined on all u1 points (including boundaries)
      Array3<double> surface_tension_body_force_x1;	// source term of the momentum equation in x1 direction
					        	// defined on all u1 points (including boundaries)
      Array3<double> surface_tension_body_force_x2;	// source term of the momentum equation in x2 direction
					        	// defined on all u1 points (including boundaries)
      Array3<double> surface_tension_body_force_x3;	// source term of the momentum equation in x3 direction
					        	// defined on all u1 points (including boundaries)
      Array3<double> scaled_density_u1;			// scaled density for the controlvolumes
							// of the momentum equation in x1 direction
      Array3<double> scaled_density_u2;			// scaled density for the controlvolumes
							// of the momentum equation in x2 direction
      Array3<double> scaled_density_u3;			// scaled density for the controlvolumes
							// of the momentum equation in x3 direction
      boundary_face boundary_faces[6];		// array with all the information
							// for the boundary conditions 
      double mesh_width_x1;				// grid spacing in x1 direction (uniform)
      double mesh_width_x2;				// grid spacing in x2 direction (uniform)
      double mesh_width_x3;				// grid spacing in x3 direction (uniform)
      int number_primary_cells_i;			// number of primary (pressure) cells in x1 direction
      int number_primary_cells_j;			// number of primary (pressure) cells in x2 direction
      int number_primary_cells_k;			// number of primary (pressure) cells in x3 direction
      int number_matrix_connections = 7;			// number of connections in momentum matrix				       
      vector gravity;					// gravitational acceleration vector 
      double actual_time_step_navier_stokes;	// time step used for level-set advection
							// computed from all stability restrictions and 
							// possibly subscycling
      double rho_plus_over_rho_minus;		// ratio of the densities of the two phases
      double smoothing_distance_factor;		// the smoothing distance is smoothing_distance_factor
							// times the smallest mesh width
      double rho_minus_over_mu_minus;		// this was the 'Reynolds' number
							// in the original implementation of Sander
      double mu_plus_over_mu_minus;			// ratio of the viscosities of both phases
      double tolerance_pressure = 1e-8;	  		// the tolerance with which the system for the pressure is solved	
      int maximum_iterations_allowed_pressure=1e3;	// maximum number of iterations allowed for the
							// conjugate gradient method
      double tolerance_velocity=1e-7;	  		// the tolerance with which the system for the momentum predictor is solved	
      int maximum_iterations_allowed_velocity=1e2;	// maximum number of iterations allowed for the
							// conjugate gradient method
      int continuous_surface_force_model = 1;       	// =1, the continuous surface force model is applied
					        	// =0, the exact interface boundary conditions are applied
      int source_terms_in_momentum_predictor =1;   	// =1, the source terms are applied in the momentum predictor
					        	// equation
					        	// =0, the source terms are applied in the momentum corrector
					        	// equation
      vector initial_velocity;				// some initial velocity to fill the arrays


int i, j, k;  		// local variables for loop indexing
double x,y,z;		// coordinates
double x_length = 1;
double y_length = 2;
double z_length = 3; 

      double a,b,c,d,e,f,g,h,l,m,n,o,p,q,r,s,t,u;  // input coefficients
      
      double u_1, u_2, u_3; 
      double viscosity_over_density;
      double output_polynomial; 
      double scaled_density_constant = 1; 

      initial_velocity.u1 = 0.0;
      initial_velocity.u2 = 0.0;
      initial_velocity.u3 = 0.0;
 
      smoothing_distance_factor = 2; 
      rho_plus_over_rho_minus = 10.0;
      rho_minus_over_mu_minus = 1e20;
      mu_plus_over_mu_minus = 1; 
      
      
      gravity.u1 = 00.0;
      gravity.u2 = 00.0;
      gravity.u3 = 00.0;
 	
	actual_time_step_navier_stokes = 0.1; 

set_boundary_conditions(boundary_faces, initial_velocity); 
//	set_boundary_conditions(boundary_face[6], initial_velocity);
	
      number_primary_cells_i = pow(2,1); 
      number_primary_cells_j = pow(2,2); 
      number_primary_cells_k = pow(2,2); 

      	mesh_width_x1 = x_length/number_primary_cells_i; 
	mesh_width_x2 = y_length/number_primary_cells_j; 
	mesh_width_x3 = z_length/number_primary_cells_k; 

viscosity_over_density = (1/scaled_density_constant)*compute_scaled_viscosity(100,mesh_width_x1,mesh_width_x2,mesh_width_x3, smoothing_distance_factor, rho_minus_over_mu_minus,mu_plus_over_mu_minus);
      
      level_set.create(number_primary_cells_i+2,number_primary_cells_j+2,number_primary_cells_k+2);
      set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+2, 
			    number_primary_cells_k+2, level_set, 0.1);

      pressure.create(number_primary_cells_i+2, number_primary_cells_j+2, number_primary_cells_k+2);
      set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+2, 
			    number_primary_cells_k+2, pressure, 10000.0);
	// velocity old
      u_1_velocity_old.create(number_primary_cells_i+1, number_primary_cells_j+2, number_primary_cells_k+2);
      u_2_velocity_old.create(number_primary_cells_i+2, number_primary_cells_j+1, number_primary_cells_k+2);
      u_3_velocity_old.create(number_primary_cells_i+2, number_primary_cells_j+2, number_primary_cells_k+1);
      set_constant_matrix2(number_primary_cells_i+1, number_primary_cells_j+2,
			    number_primary_cells_k+2, u_1_velocity_old, initial_velocity.u1);
      set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+1, 
			    number_primary_cells_k+2, u_2_velocity_old, initial_velocity.u2);
      set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+2, 
			    number_primary_cells_k+1, u_3_velocity_old, initial_velocity.u3);


// multiple input sets of parameters
//	a=1;	b=2;	c=3;	d=4;	e=5;	f=6;	g=7;	h=8;	l=9;	m=1;	n=2;	o=3;	p=4; 	q=5; 	r=6;	s=7;	t=8;	u=9; // full test
	a=0;	b=0;	c=3;	d=4;	e=5;	f=6;	g=7;	h=8;	l=0;	m=0;	n=-3;	o=3;	p=4; 	q=5; 	r=6;	s=7;	t=0;	u=0; // divergence free velocity input b+m+n = 0
//	a=0;	b=2;	c=0;	d=4;	e=0;	f=6;	g=0;	h=8;	l=0;	m=1;	n=0;	o=3;	p=0; 	q=5; 	r=0;	s=7;	t=0;	u=9; // linear test
//	a=0;	b=0;	c=0;	d=0;	e=0;	f=0;	g=0;	h=1;	l=0;	m=0;	n=0;	o=0;	p=0; 	q=0; 	r=0;	s=0;	t=0;	u=0; // random test
//	a=0;	b=1;	c=0;	d=0;	e=0;	f=0;	g=0;	h=0;	l=0;	m=0;	n=0;	o=0;	p=0; 	q=0; 	r=0;	s=0;	t=0;	u=0; // random test

//	a=1;	b=2;	c=3;	d=4;	e=5;	f=6;	g=7;	h=8;	l=9;	m=1;	n=2;	o=3;	p=4; 	q=5; 	r=6;	s=7;	t=8;	u=9;  // test for diffusion, only quadratic terms

  for(i=0;i<number_primary_cells_i+2;i++)
  {
     for(j=0;j<number_primary_cells_j+2;j++)
      {
	  for(k=0;k<number_primary_cells_k+2;k++)
	  {
	  x = x_length * i/number_primary_cells_i -0.5*mesh_width_x1;
//	  printf("x = %f \n",x);
	  y = y_length * j/number_primary_cells_j -0.5*mesh_width_x2;
//	  printf("y = %f \n",y);
	  z = z_length * k/number_primary_cells_k -0.5*mesh_width_x3;
	  
	  if(i!=number_primary_cells_i+1)
	  	{x += 0.5*mesh_width_x1;
//	  	printf("x = %f \t",x);
	  	u_1_velocity_old[i][j][k]= a*x*x+b*x 	+c*y*y+d*y	+e*z*z+f*z; 	// 1st test
//	  	u_1_velocity_old[i][j][k]= a*x*x+b*y*y+c*z*z 	+d*x*y+e*x*z +f*y*z; 	// 2nd test, only quadratic terms 
//	  	printf("u_1 = %f \n", u_1_velocity_old[i][j][k]);
	  	x -= 0.5*mesh_width_x1;}
	  if(j!=number_primary_cells_j+1)
	  	{y += 0.5*mesh_width_x2;
	  	u_2_velocity_old[i][j][k]= g*x*x+h*x 	+l*y*y+m*y	+n*z*z+o*z; 	// 1st test
//	  	u_2_velocity_old[i][j][k]= g*x*x+h*y*y+l*z*z 	+m*x*y+n*x*z +o*y*z; 	// 2nd test, only quadratic terms 	  	
	  	y -= 0.5*mesh_width_x2;}
	  if(k!=number_primary_cells_k+1)
	  	{z += 0.5*mesh_width_x3;
	  	u_3_velocity_old[i][j][k]= p*x*x+q*x 	+r*y*y+s*y	+t*z*z+u*z;	// 1st test
//	  	u_3_velocity_old[i][j][k]= p*x*x+q*y*y+r*z*z 	+s*x*y+t*x*z +u*y*z; 	// 2nd test, only quadratic terms 
	  	z -= 0.5*mesh_width_x3;}
	  }  
  
      }  
     
  } 


	// velocity new
      u_1_velocity_new.create(number_primary_cells_i+1, number_primary_cells_j+2, number_primary_cells_k+2);
      u_2_velocity_new.create(number_primary_cells_i+2, number_primary_cells_j+1, number_primary_cells_k+2);
      u_3_velocity_new.create(number_primary_cells_i+2, number_primary_cells_j+2, number_primary_cells_k+1);
      set_constant_matrix2(number_primary_cells_i+1, number_primary_cells_j+2,
			    number_primary_cells_k+2, u_1_velocity_new, initial_velocity.u1);
      set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+1, 
			    number_primary_cells_k+2, u_2_velocity_new, initial_velocity.u2);
      set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+2, 
			    number_primary_cells_k+1, u_3_velocity_new, initial_velocity.u3);

	// momentum source terms
      momentum_source_term_u_1.create(number_primary_cells_i+1, number_primary_cells_j+2, number_primary_cells_k+2);
      momentum_source_term_u_2.create(number_primary_cells_i+2, number_primary_cells_j+1, number_primary_cells_k+2);
      momentum_source_term_u_3.create(number_primary_cells_i+2, number_primary_cells_j+2, number_primary_cells_k+1);
      set_constant_matrix2(number_primary_cells_i+1, number_primary_cells_j+2,
			    number_primary_cells_k+2, momentum_source_term_u_1, 0.0);
      set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+1, 
			    number_primary_cells_k+2, momentum_source_term_u_2, 0.0);
      set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+2, 
			    number_primary_cells_k+1, momentum_source_term_u_3, 0.0);
			    
	// surface_tension_body_force      
      surface_tension_body_force_x1.create(number_primary_cells_i+1, number_primary_cells_j+2, number_primary_cells_k+2);
      surface_tension_body_force_x2.create(number_primary_cells_i+2, number_primary_cells_j+1, number_primary_cells_k+2);
      surface_tension_body_force_x3.create(number_primary_cells_i+2, number_primary_cells_j+2, number_primary_cells_k+1);
      set_constant_matrix2(number_primary_cells_i+1, number_primary_cells_j+2,
			    number_primary_cells_k+2, surface_tension_body_force_x1, 0);
      set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+1, 
			    number_primary_cells_k+2, surface_tension_body_force_x2, 0);
      set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+2, 
			    number_primary_cells_k+1, surface_tension_body_force_x3, 0);			    			    			    



	// scaled density
      scaled_density_u1.create(number_primary_cells_i+1, number_primary_cells_j+2, number_primary_cells_k+2);
      scaled_density_u2.create(number_primary_cells_i+2, number_primary_cells_j+1, number_primary_cells_k+2);
      scaled_density_u3.create(number_primary_cells_i+2, number_primary_cells_j+2, number_primary_cells_k+1);
      set_constant_matrix2(number_primary_cells_i+1, number_primary_cells_j+2,
			    number_primary_cells_k+2, scaled_density_u1, 1);
      set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+1, 
			    number_primary_cells_k+2, scaled_density_u2, 1);
      set_constant_matrix2(number_primary_cells_i+2, number_primary_cells_j+2, 
			    number_primary_cells_k+1, scaled_density_u3, 1);

 	
	advance_flow_field(level_set, pressure,
                       u_1_velocity_old, u_2_velocity_old, u_3_velocity_old,
                         u_1_velocity_new, u_2_velocity_new, u_3_velocity_new,
                           momentum_source_term_u_1, momentum_source_term_u_2, momentum_source_term_u_3,
                            surface_tension_body_force_x1, surface_tension_body_force_x2, surface_tension_body_force_x3,
                             scaled_density_u1, scaled_density_u2, scaled_density_u3,
                              boundary_faces,
                                mesh_width_x1, mesh_width_x2, mesh_width_x3,
                                  number_primary_cells_i, number_primary_cells_j, number_primary_cells_k,
                                   number_matrix_connections, gravity,
                                     actual_time_step_navier_stokes,
                                       rho_plus_over_rho_minus,
                                         smoothing_distance_factor,
                                          rho_minus_over_mu_minus, mu_plus_over_mu_minus,
                                           tolerance_pressure, maximum_iterations_allowed_pressure,
                                             tolerance_velocity, maximum_iterations_allowed_velocity,
                                                continuous_surface_force_model,
                                                 source_terms_in_momentum_predictor
	);
	
	
// check for u,v,w momentum 
for(i=1;i<number_primary_cells_i+1;i++)
{
  for(j=1;j<number_primary_cells_j+1;j++)
  {
      for(k=1;k<number_primary_cells_k+1;k++)
      {
	  x = x_length * i/number_primary_cells_i -0.5*mesh_width_x1;
	  y = y_length * j/number_primary_cells_j -0.5*mesh_width_x2;
	  z = z_length * k/number_primary_cells_k -0.5*mesh_width_x3;
	
	printf("\n i=%i j=%i k=%i \n", i,j,k);


	if(i!=number_primary_cells_i){
		x += 0.5*mesh_width_x1;
		u_1 = a*x*x+b*x 	+c*y*y+d*y	+e*z*z+f*z;
		u_2 = g*x*x+h*x 	+l*y*y+m*y	+n*z*z+o*z;
		u_3 = p*x*x+q*x 	+r*y*y+s*y	+t*z*z+u*z;
		output_polynomial = 2*u_1*(2*a*x+b) + u_1*(2*y*l+m) + u_2*(2*y*c+d) + u_1*(2*z*t+u) + u_3*(2*z*e+f)	+viscosity_over_density*(2*2*a+2*c+2*e); // 1st test
//		output_polynomial = viscosity_over_density*(4*a+2*b+m+2*c+t); // 2nd test, only quadratic terms 
		x -= 0.5*mesh_width_x1;
		printf("\n u_1_velocity_old = %f \n", u_1);		
		printf("u_1_velocity_new = %f \n", u_1_velocity_new[i][j][k]);		
		printf("new - old = %f \n", u_1_velocity_new[i][j][k]-u_1);			
		printf("polynomial   = %f \n", output_polynomial);
	}	
	
	if(j!=number_primary_cells_j){
		y += 0.5*mesh_width_x2;
		u_1 = a*x*x+b*x 	+c*y*y+d*y	+e*z*z+f*z;
		u_2 = g*x*x+h*x 	+l*y*y+m*y	+n*z*z+o*z;
		u_3 = p*x*x+q*x 	+r*y*y+s*y	+t*z*z+u*z;
		output_polynomial = u_1*(2*x*g+h)+u_2*(2*x*a+b) + 2*u_2*(2*y*l+m) + u_3*(2*z*n+o)+u_2*(2*z*t+u)		+viscosity_over_density*(2*2*l+2*g+2*n); // 1st test
//		output_polynomial = viscosity_over_density*(4*h+2*g+d+2*l+u); // 2nd test, only quadratic terms 
		y -= 0.5*mesh_width_x2;
		printf(" \n u_2_velocity_old = %f \n", u_2);		
		printf("u_2_velocity_new = %f \n", u_2_velocity_new[i][j][k]);		
		printf("new - old = %f \n", u_2_velocity_new[i][j][k]-u_2);			
		printf("polynomial   = %f \n", output_polynomial);	
	}	
	if(k!=number_primary_cells_k){
		z += 0.5*mesh_width_x3;
		u_1 = a*x*x+b*x 	+c*y*y+d*y	+e*z*z+f*z;
		u_2 = g*x*x+h*x 	+l*y*y+m*y	+n*z*z+o*z;
		u_3 = p*x*x+q*x 	+r*y*y+s*y	+t*z*z+u*z;
		output_polynomial = u_1*(2*x*p+q)+u_3*(2*x*a+b) + u_2*(2*y*r+s)+u_3*(2*y*l+m) + 2*u_3*(2*z*t+u)		+viscosity_over_density*(2*2*t+2*p+2*r); // 1st test
//		output_polynomial = viscosity_over_density*(4*r+2*p+e+2*q+o); // 2nd test, only quadratic terms 
		z -= 0.5*mesh_width_x3;
		printf("\n u_3_velocity_old = %f \n", u_3);		
		printf("u_3_velocity_new = %f \n", u_3_velocity_new[i][j][k]);		
		printf("new - old = %f \n", u_3_velocity_new[i][j][k]-u_3);			
		printf("polynomial   = %f \n", output_polynomial);
	}
}}}


//		printf("u_1_velocity_new = %f \n", u_1_velocity_new[i][j][k]);
//		printf("u_2_velocity_old = %f \n", u_2_velocity_old[i][j][k]);
//		printf("u_2_velocity_new = %f \n", u_2_velocity_new[i][j][k]);
//		printf("u_3_velocity_old = %f \n", u_3_velocity_old[i][j][k]);
//		printf("u_3_velocity_new = %f \n", u_3_velocity_new[i][j][k]);
//		printf("pressure = %f \n", pressure[i][j][k]);
	

      printf("%s \n", "advance_flow_field_unit_test finish");

      return 0;
}
Beispiel #4
0
/**************************************************************************
 Function: ocl_jacobi

  This routine contains the main iteration loop for the Jacobi iteration
  using OpenCL kernel.

 params:
    a                       two arrays to compute solution into
    max_iter                maximum number of iterations
    size                    size of array for this MPI rank
    tolerance               all differences should be les than this tolerance value
    mpi_ranks               number of MPI ranks in each dimension
    rank_pos                cartesian position of this rank
    origin                  origin for this rank
    d                       discretion size
    mpi_comm                MPI communications structure
    local_workblock_size    size of local workblock for OpenCL kernel
    device_type             OpenCL device type
    full_copy               boolean if full buffer copy is to be done
 **************************************************************************/
static void ocl_jacobi(value_type *a[2],
                        unsigned int max_iter,
                        size_t size[DIMENSIONS],
                        value_type tolerance,
                        value_type d[DIMENSIONS],
                        size_t local_workblock_size[DIMENSIONS],
                        cl_device_type device_type,
                        unsigned int full_copy) {

    size_t array_size;
    unsigned int i, j, rc, iter = 0;
    size_t delta_buffer_size, delta_size[DIMENSIONS];
    size_t tile_delta_size, tile_cache_size;
    value_type max_diff, timer;
    icl_device* device_id;
    icl_kernel* kernel;
    cl_int err;
    icl_buffer *a_buf[2], *delta_buf;
    value_type *delta;
 
    /* convenience for y stride in array */
    cl_uint ystride = size[Y]+2*GHOST_CELL_WIDTH;
    
    /* init devices */
    icl_init_devices(device_type);
    
    /* find OpenCL device */
    device_id  = icl_get_device(0);


    /* build the kernel and verify the kernel */
    kernel = icl_create_kernel(device_id, "jacsolver_kernel.cl", "ocl_jacobi_local_copy", "", ICL_SOURCE);

    /* calculate size of kernel local memory  - also used later for kernel params */
    tile_delta_size = local_workblock_size[X] * local_workblock_size[Y];
    tile_cache_size = (local_workblock_size[X]+2*GHOST_CELL_WIDTH) * (local_workblock_size[Y]+2*GHOST_CELL_WIDTH);

    /* verify the device has enough resources for this device */
/*  I'm an optimist, we just hope for the best
  	if ((cluGetAvailableLocalMem(device_id, kernel) < tile_delta_size + tile_cache_size) ||
        (! cluCheckLocalWorkgroupSize(device_id, kernel, DIMENSIONS, local_workblock_size))) {
        local_workblock_size[X] = 1;
        local_workblock_size[Y] = 1;
    }
*/
    printf("Estimating solution using OpenCL Jacobi iteration with %d x %d workblock.\n", (int)local_workblock_size[X], (int)local_workblock_size[Y]);
    fflush(stdout);

    /* init arrays by setting the initial value and the boundary conditions */
    set_initial_solution(a[OLD], size, INITIAL_GUESS);
    set_initial_solution(a[NEW], size, INITIAL_GUESS);
    set_boundary_conditions(a[OLD], size, d);
    set_boundary_conditions(a[NEW], size, d);

    /* print the initial solution guess */ 
    print_array("Init ", a[NEW], size, d);

    /* allocate memory for differences */
    delta_size[X] = size[X] / local_workblock_size[X];
    delta_size[Y] = size[Y] / local_workblock_size[Y];
    delta_buffer_size = delta_size[X] * delta_size[Y];
    delta = (value_type *)malloc(sizeof(value_type) * delta_buffer_size);
    
    /* initialize deltas so that first execution of kernel with overlapping 
     * reduction on the host will work correctly and not prematurely exit
     */
    for (i=0; i<delta_size[X]; ++i) {
        for (j=0; j<delta_size[Y]; ++j) {
            delta[i * delta_size[Y] + j] = 1.0;
        }
    }

    /* create buffers for OpenCL device using host memory */
    array_size = (size[X]+2*GHOST_CELL_WIDTH) * ystride;
    a_buf[OLD] = icl_create_buffer(device_id, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(value_type) * array_size);
    a_buf[NEW] = icl_create_buffer(device_id, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(value_type) * array_size);
    delta_buf = icl_create_buffer(device_id, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(value_type) * delta_buffer_size);

    /* copy over buffers to device */
    icl_write_buffer(a_buf[OLD], CL_TRUE, sizeof(value_type) * array_size, a[OLD], NULL, NULL);
    icl_write_buffer(a_buf[NEW], CL_TRUE, sizeof(value_type) * array_size, a[NEW], NULL, NULL);

    /* set the kernel execution type  - data parallel */
 //   cluSetKernelNDRange(clu, kernel, DIMENSIONS, NULL, size, local_workblock_size);

    /*  iterate until maximum difference is less than the given tolerance
        or number of iterations is too high */
    do {
        /* swap array pointers for next iteration */
        SWAP_PTR(a[OLD], a[NEW]);
        SWAP_BUF(a_buf[OLD], a_buf[NEW]);
        icl_run_kernel(kernel, DIMENSIONS, size, local_workblock_size, NULL, NULL, 6,
                    (size_t)0,(void *) a_buf[OLD],
                    (size_t)0, (void *) a_buf[NEW],
                    sizeof(value_type) * tile_delta_size, NULL,
                    sizeof(value_type) * tile_cache_size, NULL,
                    (size_t)0, (void *) delta_buf,
                    sizeof(cl_uint), (void *) &ystride);

        /* while the kernel is running, calculate the reduction for the previous iteration */
        max_diff = ocl_jacobi_reduce(delta, delta_size);
        
        /* enqueue a synchronous copy of the delta. This will not occur until the kernel 
         * has finished. The deltas for each workgroup is a much smaller array to process
         */
        icl_read_buffer(delta_buf, CL_TRUE, sizeof(value_type) * delta_buffer_size, delta, NULL, NULL);
//        clEnqueueReadBuffer(queue, a_buf[NEW], CL_TRUE,    0, sizeof(value_type) * array_size, a[NEW], 0, NULL, NULL));

        /* output status for user, overwrite the same line */
        if ((0 == iter % 100)) {
            printf("Iteration=%5d, max difference=%0.7f, target=%0.7f\r",
                        iter, max_diff, tolerance);
            fflush(stdout);
        }

        
        /* increment the iteration counter */
        iter++;
    } while (max_diff > tolerance && max_iter >= iter); /* do loop */

    /* read back the final result */
    icl_read_buffer(a_buf[NEW], CL_TRUE, sizeof(value_type) * array_size, a[NEW], NULL, NULL);

    /* output final iteration count and maximum difference value */
    printf("Iteration=%5d, max difference=%0.7f, execution time=%.3f seconds\n", iter-1, max_diff, timer);
    fflush(stdout);

    /* finish usage of OpenCL device */
    icl_release_buffers(3, a_buf[OLD], a_buf[NEW], delta_buf);
    icl_release_kernel(kernel);
    free(delta);
}