/* the "full" operators */
void Q_pm_psi_prec(spinor * const l, spinor * const k)
{
  spinorPrecWS *ws = (spinorPrecWS*)g_precWS;
  _Complex double ALIGN alpha = -1.0;

  if(g_prec_sequence_d_dagger_d[0] != 0.0) {
    alpha = g_prec_sequence_d_dagger_d[0];
    spinorPrecondition(l, k, ws, T, L, alpha, 0, 1);
  }
  else assign(l, k, VOLUME);

  g_mu = -g_mu;
  D_psi(g_spinor_field[DUM_MATRIX], l);
  gamma5(l, g_spinor_field[DUM_MATRIX], VOLUME);
  g_mu = -g_mu;

  if(g_prec_sequence_d_dagger_d[1] != 0.0) {
    alpha = g_prec_sequence_d_dagger_d[1];
    spinorPrecondition(l, l, ws, T, L, alpha, 0, 1);
  }

  D_psi(g_spinor_field[DUM_MATRIX], l);
  gamma5(l, g_spinor_field[DUM_MATRIX], VOLUME);

  if(g_prec_sequence_d_dagger_d[2] != 0.0) {
    alpha = g_prec_sequence_d_dagger_d[2];
    spinorPrecondition(l, l, ws, T, L, alpha, 0, 1);
  }
}
/* the "full" operators */
void Q_pm_psi2(spinor * const l, spinor * const k)
{
  /* first application with the twisted mass rescaled to -10*mu */
  g_mu = -10.*g_mu;
  D_psi(l, k);
  gamma5(g_spinor_field[DUM_MATRIX], l, VOLUME);
  /* this restores g_mu = -(-10*mu)/10 = +mu for the second application */
  g_mu = -g_mu/10.;
  D_psi(l, g_spinor_field[DUM_MATRIX]);
  gamma5(l, l, VOLUME);
}
/* This is the version for the GPU with interchanged order
   of gamma5 and D_psi (Florian Burger) */
void Q_pm_psi_gpu(spinor * const l, spinor * const k)
{
  gamma5(k, k, VOLUME);
  g_mu = -g_mu;
  D_psi(l, k);
  gamma5(g_spinor_field[DUM_MATRIX], l, VOLUME);
  g_mu = -g_mu;
  D_psi(l, g_spinor_field[DUM_MATRIX]);
}
void Q_minus_psi(spinor * const l, spinor * const k)
{
  g_mu = -g_mu;
  D_psi(l, k);
  g_mu = -g_mu;
  gamma5(l, l, VOLUME);
}
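/*
 * Operator identities realized by the routines above (a reading of the
 * g_mu sign flips, not stated in the code): writing D(\mu) for the
 * operator applied by D_psi at twisted mass \mu,
 *
 *   Q_{\pm} = \gamma_5 D(\pm\mu),
 *   Q^2 = Q_{+} Q_{-} = \gamma_5 D(+\mu) \gamma_5 D(-\mu).
 *
 * Q_pm_psi2 rescales the rightmost factor to D(-10\mu), and Q_pm_psi_prec
 * wraps the product in spinorPrecondition factors weighted by the entries
 * of g_prec_sequence_d_dagger_d[0..2].
 */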
/* |R> = rnorm^2 Q^2 |S> */
void norm_Q_sqr_psi(spinor * const R, spinor * const S, const double rnorm)
{
  spinor *aux;
  aux = lock_Dov_WS_spinor(1);

  /* Term -1-s is done in D_psi! Does this comment make sense for HMC? */
  /* No, it doesn't; we do have to work on this. */
  /* here we need to set kappa = 1./(2 (-1-s) + 8) */
  D_psi(R, S);
  gamma5(aux, R, VOLUME);
  D_psi(R, aux);
  gamma5(R, R, VOLUME);
  mul_r(R, rnorm*rnorm, R, VOLUME);

  unlock_Dov_WS_spinor(1);
  return;
}
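/*
 * In bra-ket form (with Q = \gamma_5 D, D the kernel applied by D_psi),
 * the routine above computes
 *
 *   |R> = rnorm^2 Q^2 |S> = rnorm^2 \gamma_5 D \gamma_5 D |S>,
 *
 * where, per the comments, the proper kernel normalization would require
 * kappa = 1/(2(-1-s) + 8).
 */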
void D_psi_prec(spinor * const P, spinor * const Q)
{
  /* todo: do preconditioning */
  spinorPrecWS *ws = (spinorPrecWS*)g_precWS;
  static _Complex double alpha = -1.0;

  alpha = -0.5;
  spinorPrecondition(P, Q, ws, T, L, alpha, 0, 1);
  D_psi(g_spinor_field[DUM_MATRIX], P);
  alpha = -0.5;
  spinorPrecondition(P, g_spinor_field[DUM_MATRIX], ws, T, L, alpha, 0, 1);
}
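/*
 * Assuming spinorPrecondition(out, in, ws, T, L, alpha, ...) applies the
 * workspace operator raised to the power alpha (an assumption; only
 * alpha = -0.5 is used above), D_psi_prec realizes the symmetric form
 *
 *   \hat D = A^{-1/2} D A^{-1/2},
 *
 * so solving \hat D \hat x = A^{-1/2} b and setting x = A^{-1/2} \hat x
 * recovers D x = b.
 */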
void Msap(spinor * const P, spinor * const Q, const int Ncy, const int Niter)
{
  int blk, ncy = 0, eo, vol;
  spinor * r, * a, * b;
  double nrm;
  spinor ** solver_field = NULL;
  const int nr_sf = 6;

  /*
   * here it would probably be better to get the working fields as a
   * parameter from the calling function
   */
  init_solver_field(&solver_field, VOLUME, nr_sf);
  r = solver_field[0];
  a = solver_field[1];
  b = solver_field[2];

  for(ncy = 0; ncy < Ncy; ncy++) {
    /* compute the global residue        */
    /* this can be done more efficiently */
    /* here only a naive implementation  */
    for(eo = 0; eo < 2; eo++) {
      D_psi(r, P);
      diff(r, Q, r, VOLUME);
      nrm = square_norm(r, VOLUME, 1);
      if(g_proc_id == 0 && g_debug_level > 2 && eo == 1) { /* GG, was 1 */
        printf("Msap: %d %1.3e\n", ncy, nrm);
        fflush(stdout);
      }
      /* choose the even (odd) blocks */
      /* blk = eolist[eo]; */
      for(blk = 0; blk < nb_blocks; blk++) {
        if(block_list[blk].evenodd == eo) {
          vol = block_list[blk].volume;

          /* get part of r corresponding to block blk into b */
          copy_global_to_block(b, r, blk);
          /* does this work?? i.e. passing solver_field[3] as work space */
          mrblk(a, b, solver_field[3], Niter, 1.e-31, 1, vol, &dummy_Di, blk);
          /* add a up to the full spinor P */
          add_block_to_global(P, a, blk);
        }
      }
    }
  }
  finalize_solver(solver_field, nr_sf);
  return;
}
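/*
 * Msap is one cycle of the multiplicative Schwarz alternating procedure:
 * the blocks carry a two-coloring (block_list[blk].evenodd), and for each
 * color the current global residual r = Q - D P is restricted to every
 * block of that color, approximately solved there (Niter mrblk steps),
 * and prolongated back. Schematically, with R_B/P_B the block
 * restriction/prolongation,
 *
 *   P <- P + \sum_{B in color} P_B \tilde{D}_B^{-1} R_B (Q - D P),
 *
 * applied first to one color, then to the other, Ncy times.
 */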
void addproj_q_invsqrt(spinor * const Q, spinor * const P, const int n, const int N)
{
  int j;
  spinor *aux;
  complex cnorm, lambda;
  static double save_ev[2] = {-1., -1.};
  static int * ev_sign = NULL;

  if(eigenvls[0] != save_ev[0] && eigenvls[1] != save_ev[1]) {
    /* NB: with &&, the signs are only recomputed when both of the first
       two eigenvalues have changed */
    if(g_proc_id == 0 && g_debug_level > 1) {
      printf("# Recomputing eigenvalue signs!\n");
      fflush(stdout);
    }
    for(j = 0; j < 2; j++) {
      save_ev[j] = eigenvls[j];
    }
    free(ev_sign);
    ev_sign = (int*) malloc(n * sizeof(int));

    aux = lock_Dov_WS_spinor(1);
    for(j = 0; j < n; j++) {
      D_psi(aux, &(eigenvectors[j*evlength]));
      gamma5(aux, aux, N);
      lambda = scalar_prod(&(eigenvectors[j*evlength]), aux, N, 1);
      if(lambda.re < 0) {
        ev_sign[j] = -1;
      }
      else {
        ev_sign[j] = 1;
      }
    }
    unlock_Dov_WS_spinor(1);
    /* free(aux_); */
  }

  for(j = 0; j < n; j++) {
    cnorm = scalar_prod(&(eigenvectors[j*evlength]), P, N, 1);
    cnorm.re *= (double)ev_sign[j];
    cnorm.im *= (double)ev_sign[j];
    assign_add_mul(Q, &(eigenvectors[j*evlength]), cnorm, N);
  }
  return;
}
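/*
 * On the stored eigenvectors of Q = \gamma_5 D the sign function is
 * exact, so the routine adds the exactly projected low-mode contribution
 *
 *   |Q> <- |Q> + \sum_{j=0}^{n-1} sign(\lambda_j) <v_j|P> |v_j>,
 *
 * with \gamma_5 D |v_j> = \lambda_j |v_j>; the signs are cached in
 * ev_sign and recomputed when the eigenvalues change.
 */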
/* |R> = rnorm^n Q^n |S> */
void norm_Q_n_psi(spinor * const R, spinor * const S, const int n, const double rnorm)
{
  int i;
  double npar = 1.;
  spinor *aux;
  aux = lock_Dov_WS_spinor(1);

  assign(aux, S, VOLUME);
  for(i = 0; i < n; i++) {
    D_psi(R, aux);
    /* Term -1-s is done in D_psi! does this comment make sense for HMC? */
    gamma5(aux, R, VOLUME);
    npar *= rnorm;
  }
  mul_r(R, npar, aux, VOLUME);

  unlock_Dov_WS_spinor(1);
  return;
}
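/*
 * Each pass of the loop above applies \gamma_5 D once and accumulates one
 * factor of rnorm, so n = 2 should reproduce norm_Q_sqr_psi. A minimal
 * consistency check (hypothetical, not part of the original; assumes two
 * free work fields at DUM_MATRIX):
 */
void check_norm_Q_consistency(spinor * const S, const double rnorm)
{
  spinor * const R1 = g_spinor_field[DUM_MATRIX];     /* work field 1 */
  spinor * const R2 = g_spinor_field[DUM_MATRIX+1];   /* work field 2 */
  double d;

  norm_Q_sqr_psi(R1, S, rnorm);    /* R1 = rnorm^2 (gamma_5 D)^2 S */
  norm_Q_n_psi(R2, S, 2, rnorm);   /* R2 = rnorm^2 (gamma_5 D)^2 S */
  diff(R1, R1, R2, VOLUME);        /* R1 = R1 - R2                 */
  d = square_norm(R1, VOLUME, 1);
  if(g_proc_id == 0) printf("# |Q_sqr - Q_n(2)|^2 = %1.3e\n", d);
}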
int main(int argc, char *argv[])
{
#ifdef _USE_HALFSPINOR
#  undef _USE_HALFSPINOR
  printf("# WARNING: USE_HALFSPINOR will be ignored (not supported here).\n");
#endif
  if(even_odd_flag) {
    even_odd_flag = 0;
    printf("# WARNING: even_odd_flag will be ignored (not supported here).\n");
  }
  int j, j_max, k, k_max = 1;
#ifdef HAVE_LIBLEMON
  paramsXlfInfo *xlfInfo;
#endif
  int status = 0;

  static double t1, t2, dt, sdt, dts, qdt, sqdt;
  double antioptaway = 0.0;

#ifdef MPI
  static double dt2;

  DUM_DERI = 6;
  DUM_SOLVER = DUM_DERI + 2;
  DUM_MATRIX = DUM_SOLVER + 6;
  NO_OF_SPINORFIELDS = DUM_MATRIX + 2;

#  ifdef OMP
  int mpi_thread_provided;
  MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_thread_provided);
#  else
  MPI_Init(&argc, &argv);
#  endif
  MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id);
#else
  g_proc_id = 0;
#endif

  g_rgi_C1 = 1.;

  /* Read the input file */
  if((status = read_input("test_Dslash.input")) != 0) {
    fprintf(stderr, "Could not find input file: test_Dslash.input\nAborting...\n");
    exit(-1);
  }

#ifdef OMP
  init_openmp();
#endif

  tmlqcd_mpi_init(argc, argv);

  if(g_proc_id == 0) {
#ifdef SSE
    printf("# The code was compiled with SSE instructions\n");
#endif
#ifdef SSE2
    printf("# The code was compiled with SSE2 instructions\n");
#endif
#ifdef SSE3
    printf("# The code was compiled with SSE3 instructions\n");
#endif
#ifdef P4
    printf("# The code was compiled for Pentium4\n");
#endif
#ifdef OPTERON
    printf("# The code was compiled for AMD Opteron\n");
#endif
#ifdef _GAUGE_COPY
    printf("# The code was compiled with -D_GAUGE_COPY\n");
#endif
#ifdef BGL
    printf("# The code was compiled for Blue Gene/L\n");
#endif
#ifdef BGP
    printf("# The code was compiled for Blue Gene/P\n");
#endif
#ifdef _USE_HALFSPINOR
    printf("# The code was compiled with -D_USE_HALFSPINOR\n");
#endif
#ifdef _USE_SHMEM
    printf("# The code was compiled with -D_USE_SHMEM\n");
#  ifdef _PERSISTENT
    printf("# The code was compiled for persistent MPI calls (halfspinor only)\n");
#  endif
#endif
#ifdef MPI
#  ifdef _NON_BLOCKING
    printf("# The code was compiled for non-blocking MPI calls (spinor and gauge)\n");
#  endif
#endif
    printf("\n");
    fflush(stdout);
  }

#ifdef _GAUGE_COPY
  init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
#else
  init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
#endif
  init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand);

  if(even_odd_flag) {
    j = init_spinor_field(VOLUMEPLUSRAND/2, 2*k_max + 1);
  }
  else {
    j = init_spinor_field(VOLUMEPLUSRAND, 2*k_max);
  }
  if(j != 0) {
    fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n");
    exit(0);
  }
  j = init_moment_field(VOLUME, VOLUMEPLUSRAND + g_dbw2rand);
  if(j != 0) {
    fprintf(stderr, "Not enough memory for moment fields! Aborting...\n");
    exit(0);
  }

  if(g_proc_id == 0) {
    fprintf(stdout, "# The number of processes is %d \n", g_nproc);
    printf("# The lattice size is %d x %d x %d x %d\n",
           (int)(T*g_nproc_t), (int)(LX*g_nproc_x), (int)(LY*g_nproc_y), (int)(g_nproc_z*LZ));
    printf("# The local lattice size is %d x %d x %d x %d\n",
           (int)(T), (int)(LX), (int)(LY), (int)(LZ));
    /*
    if(even_odd_flag) {
      printf("# benchmarking the even/odd preconditioned Dirac operator\n");
    }
    else {
      printf("# benchmarking the standard Dirac operator\n");
    }
    */
    fflush(stdout);
  }

  /* define the geometry */
  geometry();
  /* define the boundary conditions for the fermion fields */
  boundary(g_kappa);

#ifdef _USE_HALFSPINOR
  j = init_dirac_halfspinor();
  if(j != 0) {
    fprintf(stderr, "Not enough memory for halfspinor fields! Aborting...\n");
    exit(0);
  }
  if(g_sloppy_precision_flag == 1) {
    g_sloppy_precision = 1;
    j = init_dirac_halfspinor32();
    if(j != 0) {
      fprintf(stderr, "Not enough memory for 32-bit halfspinor fields! Aborting...\n");
      exit(0);
    }
  }
#  if (defined _PERSISTENT)
  init_xchange_halffield();
#  endif
#endif

  status = check_geometry();
  if(status != 0) {
    fprintf(stderr, "Checking of geometry failed. Unable to proceed.\nAborting...\n");
    exit(1);
  }

#if (defined MPI && !(defined _USE_SHMEM))
  check_xchange();
#endif

  start_ranlux(1, 123456);
  random_gauge_field(reproduce_randomnumber_flag, g_gauge_field);

#ifdef MPI
  /* for parallelization: exchange the gauge field */
  xchange_gauge(g_gauge_field);
#endif

  /* the non even/odd case now */
  /* initialize the pseudo-fermion fields */
  j_max = 1;
  sdt = 0.;
  for(k = 0; k < k_max; k++) {
    random_spinor_field_lexic(g_spinor_field[k], reproduce_randomnumber_flag, RN_GAUSS);
  }

#ifdef MPI
  MPI_Barrier(MPI_COMM_WORLD);
#endif
  t1 = gettime();

  /* here the actual Dslash application */
  D_psi(g_spinor_field[0], g_spinor_field[1]);

  t2 = gettime();
  dt = t2 - t1;
#ifdef MPI
  MPI_Allreduce(&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
#else
  sdt = dt;
#endif

  if(g_proc_id == 0) {
    printf("# Time for Dslash %e sec.\n", sdt);
    printf("\n");
    fflush(stdout);
  }

#ifdef HAVE_LIBLEMON
  if(g_proc_id == 0) {
    printf("# Performing parallel IO test ...\n");
  }
  xlfInfo = construct_paramsXlfInfo(0.5, 0);
  write_gauge_field("conf.test", 64, xlfInfo);
  free(xlfInfo);
  if(g_proc_id == 0) {
    printf("# done ...\n");
  }
#endif

#ifdef OMP
  free_omp_accumulators();
#endif
  free_gauge_field();
  free_geometry_indices();
  free_spinor_field();
  free_moment_field();
#ifdef MPI
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
#endif
  return(0);
}
int main(int argc, char *argv[])
{
  FILE *parameterfile = NULL;
  int c, j, is = 0, ic = 0;
  int x, X, y, Y, z, Z, t, tt, i, sum;
  char * filename = NULL;
  char datafilename[50];
  char parameterfilename[50];
  char conf_filename[50];
  char * input_filename = NULL;
  double plaquette_energy, nrm;
  double * norm;
  struct stout_parameters params_smear;
#ifdef _GAUGE_COPY
  int kb = 0;
#endif
#ifdef MPI
  double atime = 0., etime = 0.;
#endif
#ifdef _KOJAK_INST
#pragma pomp inst init
#pragma pomp inst begin(main)
#endif

  DUM_DERI = 6;                  /* DUM_DERI + 2 is enough (not 7)   */
  DUM_SOLVER = DUM_DERI + 2;
  DUM_MATRIX = DUM_SOLVER + 6;   /* DUM_MATRIX + 2 is enough (not 6) */
  NO_OF_SPINORFIELDS = DUM_MATRIX + 2;

  verbose = 0;
  g_use_clover_flag = 0;
  g_nr_of_psf = 1;

#ifdef MPI
  MPI_Init(&argc, &argv);
#endif

  while((c = getopt(argc, argv, "h?f:o:")) != -1) {
    switch(c) {
    case 'f':
      input_filename = calloc(200, sizeof(char));
      strcpy(input_filename, optarg);
      break;
    case 'o':
      filename = calloc(200, sizeof(char));
      strcpy(filename, optarg);
      break;
    case 'h':
    case '?':
    default:
      usage();
      break;
    }
  }
  if(input_filename == NULL) {
    input_filename = "hmc.input";
  }
  if(filename == NULL) {
    filename = "output";
  }

  /* Read the input file */
  read_input(input_filename);
  /* here we want no even/odd preconditioning */
  even_odd_flag = 0;

  /* this DBW2 stuff is not needed for the inversion! */
  g_rgi_C1 = 0;
  if(Nsave == 0) {
    Nsave = 1;
  }
  tmlqcd_mpi_init(argc, argv);
  g_dbw2rand = 0;
#ifndef MPI
  g_dbw2rand = 0;
#endif

#ifdef _GAUGE_COPY
  j = init_gauge_field(VOLUMEPLUSRAND, 1);
#else
  j = init_gauge_field(VOLUMEPLUSRAND, 0);
#endif
  if(j != 0) {
    fprintf(stderr, "Not enough memory for gauge_fields! Aborting...\n");
    exit(-1);
  }
  j = init_geometry_indices(VOLUMEPLUSRAND);
  if(j != 0) {
    fprintf(stderr, "Not enough memory for geometry indices! Aborting...\n");
    exit(-1);
  }
  if(even_odd_flag) {
    j = init_spinor_field(VOLUMEPLUSRAND/2, NO_OF_SPINORFIELDS);
  }
  else {
    j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS);
  }
  if(j != 0) {
    fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n");
    exit(-1);
  }

  g_mu = g_mu1;
  if(g_proc_id == 0) {
    /* construct the filenames for the observables and the parameters */
    strcpy(datafilename, filename);
    strcat(datafilename, ".data");
    strcpy(parameterfilename, filename);
    strcat(parameterfilename, ".para");

    parameterfile = fopen(parameterfilename, "w");
    write_first_messages(parameterfile, 0, 1);
  }

  /* define the geometry */
  geometry();
  /* define the boundary conditions for the fermion fields */
  boundary();

#ifdef _USE_HALFSPINOR
  j = init_dirac_halfspinor();
  if(j != 0) {
    fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
    exit(-1);
  }
  if(g_sloppy_precision_flag == 1) {
    j = init_dirac_halfspinor32();
    if(j != 0) {
      fprintf(stderr, "Not enough memory for 32-bit halffield! Aborting...\n");
      exit(-1);
    }
  }
#  if (defined _PERSISTENT)
  init_xchange_halffield();
#  endif
#endif

  /* +1: the distance sum below can reach 3*LX/2 + T/2 inclusive */
  norm = (double*)calloc(3*LX/2 + T/2 + 1, sizeof(double));

  for(j = 0; j < Nmeas; j++) {
    sprintf(conf_filename, "%s.%.4d", gauge_input_filename, nstore);
    if(g_proc_id == 0) {
      printf("Reading gauge field from file %s\n", conf_filename);
      fflush(stdout);
    }
    read_lime_gauge_field(conf_filename);
    if(g_proc_id == 0) {
      printf("done!\n");
      fflush(stdout);
    }
#ifdef MPI
    xchange_gauge();
#endif
#ifdef _GAUGE_COPY
    update_backward_gauge();
#endif

    /* compute minimal eigenvalues, if wanted */
    if(compute_evs != 0) {
      eigenvalues(&no_eigenvalues, 1000, eigenvalue_precision, 0, compute_evs, nstore, even_odd_flag);
    }

    /* compute the energy of the gauge field */
    plaquette_energy = measure_gauge_action();
    if(g_proc_id == 0) {
      printf("The plaquette value is %e\n", plaquette_energy/(6.*VOLUME*g_nproc));
      fflush(stdout);
    }

    if(use_stout_flag == 1) {
      params_smear.rho = stout_rho;
      params_smear.iterations = stout_no_iter;
      if(stout_smear((su3_tuple*)(g_gauge_field[0]), &params_smear, (su3_tuple*)(g_gauge_field[0])) != 0)
        exit(1);
      g_update_gauge_copy = 1;
      g_update_gauge_energy = 1;
      g_update_rectangle_energy = 1;
      plaquette_energy = measure_gauge_action();

      if(g_proc_id == 0) {
        printf("# The plaquette value after stouting is %e\n", plaquette_energy/(6.*VOLUME*g_nproc));
        fflush(stdout);
      }
    }

    source_spinor_field(g_spinor_field[0], g_spinor_field[1], 0, 0);
    convert_eo_to_lexic(g_spinor_field[DUM_DERI], g_spinor_field[0], g_spinor_field[1]);

    D_psi(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]);

    if(even_odd_flag) {
      i = invert_eo(g_spinor_field[2], g_spinor_field[3], g_spinor_field[0], g_spinor_field[1],
                    solver_precision, max_solver_iterations, solver_flag, g_relative_precision_flag,
                    sub_evs_cg_flag, even_odd_flag);
      convert_eo_to_lexic(g_spinor_field[DUM_DERI+1], g_spinor_field[2], g_spinor_field[3]);
    }

    for(i = 0; i < 3*LX/2 + T/2; i++) {
      norm[i] = 0.;
    }

    for(x = 0; x < LX; x++) {
      if(x > LX/2) X = LX - x;
      else X = x;
      for(y = 0; y < LY; y++) {
        if(y > LY/2) Y = LY - y;
        else Y = y;
        for(z = 0; z < LZ; z++) {
          if(z > LZ/2) Z = LZ - z;
          else Z = z;
          for(t = 0; t < T; t++) {
            if(t > T/2) tt = T - t;
            else tt = t;
            sum = X + Y + Z + tt;
            _spinor_norm_sq(nrm, g_spinor_field[DUM_DERI+1][ g_ipt[t][x][y][z] ]);
            /* _spinor_norm_sq(nrm, qprop[0][0][1][ g_ipt[t][x][y][z] ]); */
            printf("%e %e\n", g_spinor_field[DUM_DERI+1][ g_ipt[t][x][y][z] ].s0.c0.re,
                   g_spinor_field[DUM_DERI+1][ g_ipt[t][x][y][z] ].s0.c0.im);
            nrm = sqrt(nrm);
            printf("%1.12e\n", nrm);
            if(nrm > norm[sum]) norm[sum] = nrm;
          }
        }
      }
    }

    for(i = 0; i < 3*L/2 + T/2; i++) {
      printf("%d %1.12e\n", i, norm[i]);
    }
    printf("\n");

    nstore += Nsave;
  }

#ifdef MPI
  MPI_Finalize();
#endif
  free_gauge_field();
  free_geometry_indices();
  free_spinor_field();
  free_moment_field();
  return(0);
#ifdef _KOJAK_INST
#pragma pomp inst end(main)
#endif
}
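/*
 * The coordinate folding in the loops above measures the periodic
 * distance of each site from the source at the origin; factored out as a
 * helper (hypothetical, for illustration only):
 */
static int fold(const int x, const int Lx)
{
  /* distance to the origin on a periodic coordinate: min(x, Lx - x) */
  return (x > Lx/2) ? (Lx - x) : x;
}
/*
 * The loops then record in norm[sum], for each taxicab distance
 * sum = fold(x,LX) + fold(y,LY) + fold(z,LZ) + fold(t,T), the maximal
 * spinor norm found at that distance, which is what the final printout
 * lists against i.
 */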
void Msap_eo_old(spinor * const P, spinor * const Q, const int Ncy, const int Niter)
{
  int blk, ncy = 0, eo, vol, vols;
  spinor * r, * a, * b, * c;
  double nrm;
  double musave = g_mu;
  double kappasave = g_kappa;
  spinor * b_even, * b_odd, * a_even, * a_odd;
  spinor ** solver_field = NULL;
  /* also get space for mrblk! 6 = 3+3: fields 3-5 serve as one
     contiguous pool of three work fields per block for mrblk */
  const int nr_sf = 6;

  if(kappa_dflgen > 0) {
    g_kappa = kappa_dfl;
  }
  if(mu_dflgen > -10) {
    g_mu = mu_dfl;
    /* make sure the sign is correct! */
    if(g_mu * musave < 0) g_mu *= -1.;
  }
  boundary(g_kappa);

  /*
   * here it would probably be better to get the working fields as a
   * parameter from the calling function
   */
  vols = block_list[0].volume/2 + block_list[0].spinpad;
  vol = block_list[0].volume/2;
  init_solver_field(&solver_field, nb_blocks*2*vols, nr_sf);
  r = solver_field[0];
  a = solver_field[1];
  b = solver_field[2];

  for(ncy = 0; ncy < Ncy; ncy++) {
    /* compute the global residue        */
    /* this can be done more efficiently */
    /* here only a naive implementation  */
    for(eo = 0; eo < 2; eo++) {
      D_psi(r, P);
      diff(r, Q, r, VOLUME);
      nrm = square_norm(r, VOLUME, 1);
      if(g_proc_id == 0 && g_debug_level > 2 && eo == 0) {
        printf("Msap_eo: %d %1.3e mu = %e\n", ncy, nrm, g_mu/2./g_kappa);
        fflush(stdout);
      }
      /* choose the even (odd) blocks; rely on nested parallelism */
#ifdef TM_USE_OMP
#  pragma omp parallel for private(a_even, a_odd, b_even, b_odd, c)
#endif
      for(blk = 0; blk < nb_blocks; blk++) {
        b_even = b + blk*2*vols;
        b_odd = b + blk*2*vols + vols;
        a_even = a + blk*2*vols;
        a_odd = a + blk*2*vols + vols;
        c = solver_field[3] + blk*vols;
        if(block_list[blk].evenodd == eo) {
          /* get part of r corresponding to block blk into b_even and b_odd */
          copy_global_to_block_eo(b_even, b_odd, r, blk);
          if(g_c_sw > 0) {
            assign_mul_one_sw_pm_imu_inv_block(EE, a_even, b_even, g_mu, &block_list[blk]);
            Block_H_psi(&block_list[blk], a_odd, a_even, OE);
            /* a_odd = b_odd - a_odd */
            diff(a_odd, b_odd, a_odd, vol);
            mrblk(b_odd, a_odd, solver_field[3] + blk*2*3*vols, Niter, 1.e-31, 1, vol,
                  &Msw_plus_block_psi, blk);
            Block_H_psi(&block_list[blk], b_even, b_odd, EO);
            assign(c, b_even, vol);
            assign_mul_one_sw_pm_imu_inv_block(EE, b_even, c, g_mu, &block_list[blk]);
          }
          else {
            assign_mul_one_pm_imu_inv(a_even, b_even, +1., vol);
            Block_H_psi(&block_list[blk], a_odd, a_even, OE);
            /* a_odd = b_odd - a_odd */
            diff(a_odd, b_odd, a_odd, vol);
            mrblk(b_odd, a_odd, solver_field[3] + blk*2*3*vols, Niter, 1.e-31, 1, vol,
                  &Mtm_plus_block_psi, blk);
            Block_H_psi(&block_list[blk], b_even, b_odd, EO);
            mul_one_pm_imu_inv(b_even, +1., vol);
          }
          /* a_even = a_even - b_even */
          diff(a_even, a_even, b_even, vol);
          /* add even and odd part up to full spinor P */
          add_eo_block_to_global(P, a_even, b_odd, blk);
        }
      }
    }
  }
  finalize_solver(solver_field, nr_sf);
  g_mu = musave;
  g_kappa = kappasave;
  boundary(g_kappa);
  return;
}
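/*
 * The per-block algebra above is the standard even/odd Schur
 * factorization of the block Dirac matrix
 *
 *   D_B = ( D_ee  D_eo )
 *         ( D_oe  D_oo ),
 *
 * assuming Mtm_plus_block_psi / Msw_plus_block_psi apply the odd-odd
 * Schur operator S = D_oo - D_oe D_ee^{-1} D_eo (an assumption from the
 * call structure): with block sources (b_e, b_o),
 *
 *   x_o ~ S^{-1} (b_o - D_oe D_ee^{-1} b_e)   (approximate: Niter mrblk steps),
 *   x_e = D_ee^{-1} (b_e - D_eo x_o),
 *
 * and the pair (a_even, b_odd) handed to add_eo_block_to_global is
 * exactly (x_e, x_o).
 */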
int main(int argc, char *argv[])
{
  int j, j_max, k, k_max = 1;
#ifdef HAVE_LIBLEMON
  paramsXlfInfo *xlfInfo;
#endif
  int status = 0;

  static double t1, t2, dt, sdt, dts, qdt, sqdt;
  double antioptaway = 0.0;

#ifdef MPI
  static double dt2;

  DUM_DERI = 6;
  DUM_SOLVER = DUM_DERI + 2;
  DUM_MATRIX = DUM_SOLVER + 6;
  NO_OF_SPINORFIELDS = DUM_MATRIX + 2;

#  ifdef OMP
  int mpi_thread_provided;
  MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_thread_provided);
#  else
  MPI_Init(&argc, &argv);
#  endif
  MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id);
#else
  g_proc_id = 0;
#endif

  g_rgi_C1 = 1.;

  /* Read the input file */
  if((status = read_input("benchmark.input")) != 0) {
    fprintf(stderr, "Could not find input file: benchmark.input\nAborting...\n");
    exit(-1);
  }

#ifdef OMP
  if(omp_num_threads > 0) {
    omp_set_num_threads(omp_num_threads);
  }
  else {
    if(g_proc_id == 0)
      printf("# No value provided for OmpNumThreads, running in single-threaded mode!\n");
    omp_num_threads = 1;
    omp_set_num_threads(omp_num_threads);
  }
  init_omp_accumulators(omp_num_threads);
#endif

  tmlqcd_mpi_init(argc, argv);

  if(g_proc_id == 0) {
#ifdef SSE
    printf("# The code was compiled with SSE instructions\n");
#endif
#ifdef SSE2
    printf("# The code was compiled with SSE2 instructions\n");
#endif
#ifdef SSE3
    printf("# The code was compiled with SSE3 instructions\n");
#endif
#ifdef P4
    printf("# The code was compiled for Pentium4\n");
#endif
#ifdef OPTERON
    printf("# The code was compiled for AMD Opteron\n");
#endif
#ifdef _GAUGE_COPY
    printf("# The code was compiled with -D_GAUGE_COPY\n");
#endif
#ifdef BGL
    printf("# The code was compiled for Blue Gene/L\n");
#endif
#ifdef BGP
    printf("# The code was compiled for Blue Gene/P\n");
#endif
#ifdef _USE_HALFSPINOR
    printf("# The code was compiled with -D_USE_HALFSPINOR\n");
#endif
#ifdef _USE_SHMEM
    printf("# The code was compiled with -D_USE_SHMEM\n");
#  ifdef _PERSISTENT
    printf("# The code was compiled for persistent MPI calls (halfspinor only)\n");
#  endif
#endif
#ifdef MPI
#  ifdef _NON_BLOCKING
    printf("# The code was compiled for non-blocking MPI calls (spinor and gauge)\n");
#  endif
#endif
    printf("\n");
    fflush(stdout);
  }

#ifdef _GAUGE_COPY
  init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
#else
  init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
#endif
  init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand);

  if(even_odd_flag) {
    j = init_spinor_field(VOLUMEPLUSRAND/2, 2*k_max + 1);
  }
  else {
    j = init_spinor_field(VOLUMEPLUSRAND, 2*k_max);
  }
  if(j != 0) {
    fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n");
    exit(0);
  }
  j = init_moment_field(VOLUME, VOLUMEPLUSRAND + g_dbw2rand);
  if(j != 0) {
    fprintf(stderr, "Not enough memory for moment fields! Aborting...\n");
    exit(0);
  }

  if(g_proc_id == 0) {
    fprintf(stdout, "# The number of processes is %d \n", g_nproc);
    printf("# The lattice size is %d x %d x %d x %d\n",
           (int)(T*g_nproc_t), (int)(LX*g_nproc_x), (int)(LY*g_nproc_y), (int)(g_nproc_z*LZ));
    printf("# The local lattice size is %d x %d x %d x %d\n",
           (int)(T), (int)(LX), (int)(LY), (int)(LZ));
    if(even_odd_flag) {
      printf("# benchmarking the even/odd preconditioned Dirac operator\n");
    }
    else {
      printf("# benchmarking the standard Dirac operator\n");
    }
    fflush(stdout);
  }

  /* define the geometry */
  geometry();
  /* define the boundary conditions for the fermion fields */
  boundary(g_kappa);

#ifdef _USE_HALFSPINOR
  j = init_dirac_halfspinor();
  if(j != 0) {
    fprintf(stderr, "Not enough memory for halfspinor fields! Aborting...\n");
    exit(0);
  }
  if(g_sloppy_precision_flag == 1) {
    g_sloppy_precision = 1;
    j = init_dirac_halfspinor32();
    if(j != 0) {
      fprintf(stderr, "Not enough memory for 32-bit halfspinor fields! Aborting...\n");
      exit(0);
    }
  }
#  if (defined _PERSISTENT)
  init_xchange_halffield();
#  endif
#endif

  status = check_geometry();
  if(status != 0) {
    fprintf(stderr, "Checking of geometry failed. Unable to proceed.\nAborting...\n");
    exit(1);
  }

#if (defined MPI && !(defined _USE_SHMEM))
  check_xchange();
#endif

  start_ranlux(1, 123456);
  random_gauge_field(reproduce_randomnumber_flag);

#ifdef MPI
  /* for parallelization: exchange the gauge field */
  xchange_gauge(g_gauge_field);
#endif

  if(even_odd_flag) {
    /* initialize the pseudo-fermion fields */
    j_max = 2048;
    sdt = 0.;
    for(k = 0; k < k_max; k++) {
      random_spinor_field(g_spinor_field[k], VOLUME/2, 0);
    }

    while(sdt < 30.) {
#ifdef MPI
      MPI_Barrier(MPI_COMM_WORLD);
#endif
      t1 = gettime();
      antioptaway = 0.0;
      for(j = 0; j < j_max; j++) {
        for(k = 0; k < k_max; k++) {
          Hopping_Matrix(0, g_spinor_field[k+k_max], g_spinor_field[k]);
          Hopping_Matrix(1, g_spinor_field[2*k_max], g_spinor_field[k+k_max]);
          antioptaway += creal(g_spinor_field[2*k_max][0].s0.c0);
        }
      }
      t2 = gettime();
      dt = t2 - t1;
#ifdef MPI
      MPI_Allreduce(&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
#else
      sdt = dt;
#endif
      qdt = dt*dt;
#ifdef MPI
      MPI_Allreduce(&qdt, &sqdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
#else
      sqdt = qdt;
#endif
      sdt = sdt/((double)g_nproc);
      sqdt = sqrt(sqdt/g_nproc - sdt*sdt);
      j_max *= 2;
    }
    j_max = j_max/2;
    dts = dt;
    sdt = 1.0e6f*sdt/((double)(k_max*j_max*(VOLUME)));
    sqdt = 1.0e6f*sqdt/((double)(k_max*j_max*(VOLUME)));

    if(g_proc_id == 0) {
      printf("# The following result is just to make sure that the calculation is not optimized away: %e\n", antioptaway);
      printf("# Total compute time %e sec, variance of the time %e sec. (%d iterations).\n",
             sdt, sqdt, j_max);
      printf("# Communication switched on:\n# (%d Mflops [%d bit arithmetic])\n",
             (int)(1608.0f/sdt), (int)sizeof(spinor)/3);
#ifdef OMP
      printf("# Mflops per OpenMP thread ~ %d\n", (int)(1608.0f/(omp_num_threads*sdt)));
#endif
      printf("\n");
      fflush(stdout);
    }

#ifdef MPI
    /* isolated computation */
    t1 = gettime();
    antioptaway = 0.0;
    for(j = 0; j < j_max; j++) {
      for(k = 0; k < k_max; k++) {
        Hopping_Matrix_nocom(0, g_spinor_field[k+k_max], g_spinor_field[k]);
        Hopping_Matrix_nocom(1, g_spinor_field[2*k_max], g_spinor_field[k+k_max]);
        antioptaway += creal(g_spinor_field[2*k_max][0].s0.c0);
      }
    }
    t2 = gettime();
    dt2 = t2 - t1;
    /* compute the bandwidth */
    dt = dts - dt2;
    MPI_Allreduce(&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    sdt = sdt/((double)g_nproc);
    MPI_Allreduce(&dt2, &dt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    dt = dt/((double)g_nproc);
    dt = 1.0e6f*dt/((double)(k_max*j_max*(VOLUME)));

    if(g_proc_id == 0) {
      printf("# The following result is printed just to make sure that the calculation is not optimized away: %e\n", antioptaway);
      printf("# Communication switched off: \n# (%d Mflops [%d bit arithmetic])\n",
             (int)(1608.0f/dt), (int)sizeof(spinor)/3);
#ifdef OMP
      printf("# Mflops per OpenMP thread ~ %d\n", (int)(1608.0f/(omp_num_threads*dt)));
#endif
      printf("\n");
      fflush(stdout);
    }
    sdt = sdt/((double)k_max);
    sdt = sdt/((double)j_max);
    sdt = sdt/((double)(2*SLICE));

    if(g_proc_id == 0) {
      printf("# The size of the package is %d bytes.\n", (SLICE)*192);
#ifdef _USE_HALFSPINOR
      printf("# The bandwidth is %5.2f + %5.2f MB/sec\n",
             192./sdt/1024/1024, 192./sdt/1024./1024);
#else
      printf("# The bandwidth is %5.2f + %5.2f MB/sec\n",
             2.*192./sdt/1024/1024, 2.*192./sdt/1024./1024);
#endif
    }
#endif
    fflush(stdout);
  }
  else {
    /* the non even/odd case now */
    /* initialize the pseudo-fermion fields */
    j_max = 1;
    sdt = 0.;
    for(k = 0; k < k_max; k++) {
      random_spinor_field(g_spinor_field[k], VOLUME, 0);
    }

    while(sdt < 3.) {
#ifdef MPI
      MPI_Barrier(MPI_COMM_WORLD);
#endif
      t1 = gettime();
      for(j = 0; j < j_max; j++) {
        for(k = 0; k < k_max; k++) {
          D_psi(g_spinor_field[k+k_max], g_spinor_field[k]);
          antioptaway += creal(g_spinor_field[k+k_max][0].s0.c0);
        }
      }
      t2 = gettime();
      dt = t2 - t1;
#ifdef MPI
      MPI_Allreduce(&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
#else
      sdt = dt;
#endif
      qdt = dt*dt;
#ifdef MPI
      MPI_Allreduce(&qdt, &sqdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
#else
      sqdt = qdt;
#endif
      sdt = sdt/((double)g_nproc);
      sqdt = sqrt(sqdt/g_nproc - sdt*sdt);
      j_max *= 2;
    }
    j_max = j_max/2;
    dts = dt;
    sdt = 1.0e6f*sdt/((double)(k_max*j_max*(VOLUME)));
    sqdt = 1.0e6f*sqdt/((double)(k_max*j_max*(VOLUME)));

    if(g_proc_id == 0) {
      printf("# The following result is just to make sure that the calculation is not optimized away: %e\n", antioptaway);
      printf("# Total compute time %e sec, variance of the time %e sec. (%d iterations).\n",
             sdt, sqdt, j_max);
      printf("\n# (%d Mflops [%d bit arithmetic])\n",
             (int)(1680.0f/sdt), (int)sizeof(spinor)/3);
#ifdef OMP
      printf("# Mflops per OpenMP thread ~ %d\n", (int)(1680.0f/(omp_num_threads*sdt)));
#endif
      printf("\n");
      fflush(stdout);
    }
  }

#ifdef HAVE_LIBLEMON
  if(g_proc_id == 0) {
    printf("# Performing parallel IO test ...\n");
  }
  xlfInfo = construct_paramsXlfInfo(0.5, 0);
  write_gauge_field("conf.test", 64, xlfInfo);
  free(xlfInfo);
  if(g_proc_id == 0) {
    printf("# done ...\n");
  }
#endif

#ifdef MPI
  MPI_Finalize();
#endif
#ifdef OMP
  free_omp_accumulators();
#endif
  free_gauge_field();
  free_geometry_indices();
  free_spinor_field();
  free_moment_field();
  return(0);
}
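/*
 * Performance accounting used above: after the timing loop, sdt holds the
 * time per lattice site and operator application in microseconds,
 *
 *   t_site = 1e6 * (summed time) / (k_max * j_max * VOLUME),
 *   Mflops = flops_per_site / t_site,
 *
 * with 1608 flops/site hard-coded for the even/odd hopping-matrix pair
 * and 1680 flops/site for the full D_psi. The "(%d bit arithmetic)"
 * field, (int)sizeof(spinor)/3, evaluates to 64 for the 192-byte
 * double-precision spinor.
 */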
void Msap_eo(spinor * const P, spinor * const Q, const int Ncy)
{
  int blk, ncy = 0, eo, vol;
  spinor * r, * a, * b;
  double nrm;
  spinor * b_even, * b_odd, * a_even, * a_odd;
  spinor ** solver_field = NULL;
  const int nr_sf = 3;

  /*
   * here it would probably be better to get the working fields as a
   * parameter from the calling function
   */
  init_solver_field(&solver_field, VOLUME, nr_sf);
  r = solver_field[0];
  a = solver_field[1];
  b = solver_field[2];
  vol = block_list[0].volume/2;
  b_even = b;
  b_odd = b + vol + 1;
  a_even = a;
  a_odd = a + vol + 1;

  for(ncy = 0; ncy < Ncy; ncy++) {
    /* compute the global residue        */
    /* this can be done more efficiently */
    /* here only a naive implementation  */
    for(eo = 0; eo < 2; eo++) {
      D_psi(r, P);
      diff(r, Q, r, VOLUME);
      nrm = square_norm(r, VOLUME, 1);
      if(g_proc_id == 0 && g_debug_level > 1 && eo == 1) {
        printf("Msap: %d %1.3e\n", ncy, nrm);
      }
      /* choose the even (odd) blocks */
      for(blk = 0; blk < nb_blocks; blk++) {
        if(block_list[blk].evenodd == eo) {
          /* get part of r corresponding to block blk into b_even and b_odd */
          copy_global_to_block_eo(b_even, b_odd, r, blk);
          assign_mul_one_pm_imu_inv(a_even, b_even, +1., vol);
          Block_H_psi(&block_list[blk], a_odd, a_even, OE);
          /* a_odd = b_odd - a_odd */
          assign_mul_add_r(a_odd, -1., b_odd, vol);
          mrblk(b_odd, a_odd, 3, 1.e-31, 1, vol, &Mtm_plus_block_psi, blk);
          Block_H_psi(&block_list[blk], b_even, b_odd, EO);
          mul_one_pm_imu_inv(b_even, +1., vol);
          /* a_even = a_even - b_even */
          assign_add_mul_r(a_even, b_even, -1., vol);
          /* add even and odd part up to full spinor P */
          add_eo_block_to_global(P, a_even, b_odd, blk);
        }
      }
    }
  }
  finalize_solver(solver_field, nr_sf);
  return;
}
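/*
 * mrblk presumably performs the textbook minimal-residual iteration on
 * the block system M x = b (an assumption about its internals; only the
 * call signature is visible here):
 *
 *   alpha_k = (M r_k, r_k) / (M r_k, M r_k),
 *   x_{k+1} = x_k + alpha_k r_k,
 *   r_{k+1} = r_k - alpha_k M r_k,
 *
 * stopped after the given number of steps or at the 1.e-31
 * squared-residual threshold.
 */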
void Msap_eo(spinor * const P, spinor * const Q, const int Ncy, const int Niter)
{
  int ncy = 0, vol, vols;
  spinor * r, * a, * b;
  double nrm;
  double musave = g_mu;
  double kappasave = g_kappa;
  spinor ** solver_field = NULL;
  /* also get space for mrblk! 6 = 3+3 */
  const int nr_sf = 6;

  if(kappa_Msap > 0) {
    g_kappa = kappa_Msap;
  }
  if(mu_Msap > -10) {
    g_mu = mu_Msap;
    /* make sure the sign is correct! */
    if(g_mu * musave < 0) g_mu *= -1.;
  }
  boundary(g_kappa);

  /*
   * here it would probably be better to get the working fields as a
   * parameter from the calling function
   */
  vols = block_list[0].volume/2 + block_list[0].spinpad;
  vol = block_list[0].volume/2;
  init_solver_field(&solver_field, nb_blocks*2*vols, nr_sf);
  r = solver_field[0];
  a = solver_field[1];
  b = solver_field[2];

  /* sort the blocks into the two colors once, so that each color can be
     processed with a flat parallel loop */
  int * blk_e_list = malloc(nb_blocks/2*sizeof(int));
  int * blk_o_list = malloc(nb_blocks/2*sizeof(int));
  int iblke = 0, iblko = 0;
  for(int blk = 0; blk < nb_blocks; blk++) {
    if(block_list[blk].evenodd == 0) {
      blk_e_list[iblke] = blk;
      iblke++;
    }
    else {
      blk_o_list[iblko] = blk;
      iblko++;
    }
  }

  for(ncy = 0; ncy < Ncy; ncy++) {
    /* compute the global residue        */
    /* this can be done more efficiently */
    /* here only a naive implementation  */
    for(int eo = 0; eo < 2; eo++) {
      D_psi(r, P);
      diff(r, Q, r, VOLUME);
      nrm = square_norm(r, VOLUME, 1);
      if(g_proc_id == 0 && g_debug_level > 2 && eo == 0) {
        printf("Msap_eo: %d %1.3e mu = %e\n", ncy, nrm, g_mu/2./g_kappa);
        fflush(stdout);
      }
      int * blk_eo_list;
      if(eo == 0) {
        blk_eo_list = blk_e_list;
      }
      else {
        blk_eo_list = blk_o_list;
      }
      /* choose the even (odd) blocks; rely on nested parallelism */
#ifdef TM_USE_OMP
#  pragma omp parallel for
#endif
      for(int iblk = 0; iblk < nb_blocks/2; iblk++) {
        int blk = blk_eo_list[iblk];
        spinor32 * b_even = (spinor32*) (b + blk*2*vols);
        spinor32 * b_odd = (spinor32*) (b + blk*2*vols + vols);
        spinor32 * a_even = (spinor32*) (a + blk*2*vols);
        spinor32 * a_odd = (spinor32*) (a + blk*2*vols + vols);
        /* mrblk needs 3 solver fields which we distribute according to
           the block number */
        spinor32 * c = (spinor32*) (solver_field[3] + blk*2*3*vols);

        /* get part of r corresponding to block blk into b_even and b_odd */
        copy_global_to_block_eo_32(b_even, b_odd, r, blk);
        if(g_c_sw > 0) {
          assign_mul_one_sw_pm_imu_inv_block_32(EE, a_even, b_even, g_mu, &block_list[blk]);
          Block_H_psi_32(&block_list[blk], a_odd, a_even, OE);
          /* a_odd = b_odd - a_odd */
          diff_32(a_odd, b_odd, a_odd, vol);
          mrblk_32(b_odd, a_odd, c, Niter, 1.e-31, 1, vol, &Msw_plus_block_psi_32, blk);
          Block_H_psi_32(&block_list[blk], b_even, b_odd, EO);
          assign_32(c, b_even, vol);
          assign_mul_one_sw_pm_imu_inv_block_32(EE, b_even, c, g_mu, &block_list[blk]);
        }
        else {
          assign_mul_one_pm_imu_inv_32(a_even, b_even, +1., vol);
          Block_H_psi_32(&block_list[blk], a_odd, a_even, OE);
          /* a_odd = b_odd - a_odd */
          diff_32(a_odd, b_odd, a_odd, vol);
          mrblk_32(b_odd, a_odd, c, Niter, 1.e-31, 1, vol, &Mtm_plus_block_psi_32, blk);
          Block_H_psi_32(&block_list[blk], b_even, b_odd, EO);
          mul_one_pm_imu_inv_32(b_even, +1., vol);
        }
        /* a_even = a_even - b_even */
        diff_32(a_even, a_even, b_even, vol);
        /* add even and odd part up to full spinor P */
        add_eo_block_32_to_global(P, a_even, b_odd, blk);
      }
    }
  }
  free(blk_e_list);
  free(blk_o_list);
  finalize_solver(solver_field, nr_sf);
  g_mu = musave;
  g_kappa = kappasave;
  boundary(g_kappa);
  return;
}
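/*
 * Typical use of Msap_eo is as an approximate inverse inside a flexible
 * outer solver. A hypothetical driver (names illustrative; cycle and
 * iteration counts are tuning parameters, not values from the original):
 */
void apply_sap_preconditioner(spinor * const z, spinor * const r)
{
  /* Msap_eo accumulates block corrections onto its first argument, so
     start from z = 0 to obtain z ~ D^{-1} r */
  zero_spinor_field(z, VOLUME);
  Msap_eo(z, r, 4, 5);   /* e.g. 4 SAP cycles with 5 MR steps per block */
}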
void Q_plus_psi(spinor * const l, spinor * const k)
{
  D_psi(l, k);
  gamma5(l, l, VOLUME);
}
void poly_nonherm_precon(spinor * const R, spinor * const S,
                         const double e, const double d, const int n, const int N)
{
  int j;
  double a1, a2, dtmp, dbg;
  static spinor *work, *work_;
  static int initpnH = 0;
  spinor * psi, * chi, * tmp0, * tmp1, * cptmp;

  if(initpnH == 0) {
    work_ = calloc(4*VOLUMEPLUSRAND + 1, sizeof(spinor));
#if (defined SSE || defined SSE2 || defined SSE3)
    work = (spinor *)(((unsigned long int)(work_) + ALIGN_BASE) & ~ALIGN_BASE);
#else
    work = work_;
#endif
    initpnH = 1;
  }
  psi = work;
  chi = &work[VOLUMEPLUSRAND];
  tmp0 = &work[2*VOLUMEPLUSRAND];
  tmp1 = &work[3*VOLUMEPLUSRAND];

  /* signs to be clarified!! */
  /* P_0 * S */
  mul_r(psi, 1./d, S, N);
  /* P_1 * S = a_1 (1 + kappa*H) * S */
  a1 = d/(d*d - e*e/2.);
  boundary(g_kappa/d);
  dtmp = g_mu;   /* save g_mu; dtmp must stay untouched until the restore below */
  g_mu = g_mu/d;
  D_psi(chi, S);
  mul_r(chi, a1, chi, N);
  boundary(g_kappa);
  g_mu = dtmp;
  /* boundary(-g_kappa); */
  /* g_mu = -g_mu; */
  /* D_psi(aux, chi); */
  /* diff(aux, aux, S, N); */
  /* dtmp = square_norm(aux, N, 1); */
  /* printf("1 %1.3e\n", dtmp); */
  /* boundary(-g_kappa); */
  /* g_mu = -g_mu; */
  /* assign(chi, d, N); */

  for(j = 2; j < n+1; j++) {
    /* a_n */
    a2 = 1./(d - a1*e*e/4.);
    /* 1 - a_n */
    a1 = 1. - d*a2;
    /* aux = a_n*S + (1-a_n)*psi */
    mul_add_mul_r(tmp0, S, psi, a2, a1, N);
    /* sv = kappa H chi = (D_psi(-kappa, -2kappamu) - 1) chi */
    D_psi(tmp1, chi);
    /* why is the following sign like this? */
    diff(tmp1, chi, tmp1, N);
    /* psi = aux + a_n * sv */
    mul_add_mul_r(psi, tmp0, tmp1, 1., a2, N);
    cptmp = psi;
    psi = chi;
    chi = cptmp;

    /* boundary(-g_kappa); */
    /* g_mu = -g_mu; */
    if(g_debug_level > 4) {
      /* use dbg here rather than dtmp, which still holds the saved g_mu
         restored at the end of the function */
      D_psi(tmp0, chi);
      diff(tmp0, tmp0, S, N);
      dbg = square_norm(tmp0, N, 1);
      if(g_proc_id == 0) printf("poly %d %1.3e\n", j, dbg);
    }
    /* boundary(-g_kappa); */
    /* g_mu = -g_mu; */
    a1 = a2;
  }
  assign(R, chi, N);
  boundary(g_kappa);
  g_mu = dtmp;
  return;
}
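/*
 * A reading of the recurrence (not documented in the code): with
 * psi_0 = S/d, psi_1 = a_1 (1 + kappa H) S (the first D_psi application
 * at rescaled kappa and mu), a_1 = d/(d^2 - e^2/2) and
 * a_n = 1/(d - a_{n-1} e^2/4), the loop builds
 *
 *   psi_n = a_n ( S + (1 - D) psi_{n-1} ) + (1 - d a_n) psi_{n-2},
 *
 * i.e. a Chebyshev-like semi-iteration approximating D^{-1} on an
 * ellipse with center d and focal parameter e, used here as a
 * non-hermitian polynomial preconditioner.
 */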