void tm_times_Hopping_Matrix(const int ieo, spinor * const l, spinor * const k, complex double const cfactor) { # ifdef _GAUGE_COPY if(g_update_gauge_copy) { update_backward_gauge(g_gauge_field); } # endif # ifdef OMP # pragma omp parallel { su3 * restrict u0 ALIGN; # endif # define _MUL_G5_CMPLX # if (defined BGQ && defined XLC) complex double ALIGN bla = cfactor; vector4double ALIGN cf = vec_ld2(0, (double*) &bla); # elif (defined SSE2 || defined SSE3) _Complex double ALIGN cf = cfactor; # endif # include "operator/halfspinor_body.c" # undef _MUL_G5_CMPLX # ifdef OMP } /* OpenMP closing brace */ # endif return; }
void tm_sub_Hopping_Matrix(const int ieo, spinor * const l, spinor * p, spinor * const k, complex double const cfactor) { # ifdef XLC # pragma disjoint(*l, *k) # endif # ifdef _GAUGE_COPY if(g_update_gauge_copy) { update_backward_gauge(g_gauge_field); } # endif # if (defined TM_USE_MPI) xchange_field(k, ieo); # endif # ifdef TM_USE_OMP # pragma omp parallel { # endif # define _TM_SUB_HOP spinor * pn; # if (defined BGQ && defined XLC) complex double ALIGN bla = cfactor; vector4double ALIGN cf = vec_ld2(0, (double*) &bla); # elif (defined SSE2 || defined SSE3) _Complex double ALIGN cf = cfactor; su3_vector ALIGN psi, psi2; # endif # include "operator/hopping_body_dbl.c" # undef _TM_SUB_HOP # ifdef TM_USE_OMP } /* OpenMP closing brace */ # endif return; }
void tm_times_Hopping_Matrix(const int ieo, spinor * const l, spinor * const k, double complex const cfactor) { # ifdef XLC # pragma disjoint(*l, *k) # endif # ifdef _GAUGE_COPY if(g_update_gauge_copy) { update_backward_gauge(g_gauge_field); } # endif # if (defined MPI) xchange_field(k, ieo); # endif # ifdef OMP # pragma omp parallel { # endif # define _MUL_G5_CMPLX # if (defined BGQ && defined XLC) complex double ALIGN bla = cfactor; vector4double ALIGN cf = vec_ld2(0, (double*) &bla); # elif (defined SSE2 || defined SSE3) _Complex double ALIGN cf = cfactor; # endif # include "operator/hopping_body_dbl.c" # undef _MUL_G5_CMPLX # ifdef OMP } /* OpenMP closing brace */ # endif return; }
void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k) { # ifdef XLC # pragma disjoint(*l, *k) # endif # ifdef _GAUGE_COPY if(g_update_gauge_copy) { update_backward_gauge(g_gauge_field); } # endif # if (defined TM_USE_MPI && !(defined _NO_COMM)) xchange_field(k, ieo); # endif # ifdef TM_USE_OMP # pragma omp parallel { # endif # include "operator/hopping_body_dbl.c" # ifdef TM_USE_OMP } /* OpenMP closing brace */ # endif return; }
void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k) { #ifdef _GAUGE_COPY if(g_update_gauge_copy) { update_backward_gauge(g_gauge_field); } #endif #ifdef TM_USE_OMP #pragma omp parallel { su3 * restrict u0 ALIGN; #endif # include "operator/halfspinor_body.c" # ifdef TM_USE_OMP } /* OpenMP closing brace */ # endif return; }
int main(int argc,char *argv[]) { FILE *parameterfile=NULL,*rlxdfile=NULL, *countfile=NULL; char * filename = NULL; char datafilename[50]; char parameterfilename[50]; char gauge_filename[50]; char * nstore_filename = ".nstore_counter"; char * input_filename = NULL; int rlxd_state[105]; int j,ix,mu; int k; struct timeval t1; int g_nev, max_iter_ev; double stop_prec_ev; /* Energy corresponding to the Gauge part */ double eneg = 0., plaquette_energy = 0., rectangle_energy = 0.; /* Acceptance rate */ int Rate=0; /* Do we want to perform reversibility checks */ /* See also return_check_flag in read_input.h */ int return_check = 0; /* For getopt */ int c; /* For the Polyakov loop: */ int dir = 2; _Complex double pl, pl4; verbose = 0; g_use_clover_flag = 0; g_nr_of_psf = 1; #ifndef XLC signal(SIGUSR1,&catch_del_sig); signal(SIGUSR2,&catch_del_sig); signal(SIGTERM,&catch_del_sig); signal(SIGXCPU,&catch_del_sig); #endif while ((c = getopt(argc, argv, "h?f:o:")) != -1) { switch (c) { case 'f': input_filename = calloc(200, sizeof(char)); strcpy(input_filename,optarg); break; case 'o': filename = calloc(200, sizeof(char)); strcpy(filename,optarg); break; case 'h': case '?': default: usage(); break; } } if(input_filename == NULL){ input_filename = "hmc.input"; } if(filename == NULL){ filename = "output"; } /* Read the input file */ read_input(input_filename); mpi_init(argc, argv); if(Nsave == 0){ Nsave = 1; } if(nstore == -1) { countfile = fopen(nstore_filename, "r"); if(countfile != NULL) { fscanf(countfile, "%d\n", &nstore); fclose(countfile); } else { nstore = 0; } } if(g_rgi_C1 == 0.) { g_dbw2rand = 0; } #ifndef TM_USE_MPI g_dbw2rand = 0; #endif /* Reorder the mu parameter and the number of iterations */ if(g_mu3 > 0.) { g_mu = g_mu1; g_mu1 = g_mu3; g_mu3 = g_mu; j = int_n[1]; int_n[1] = int_n[3]; int_n[3] = j; j = g_csg_N[0]; g_csg_N[0] = g_csg_N[4]; g_csg_N[4] = j; g_csg_N[6] = j; if(fabs(g_mu3) > 0) { g_csg_N[6] = 0; } g_nr_of_psf = 3; } else if(g_mu2 > 0.) 
{ g_mu = g_mu1; g_mu1 = g_mu2; g_mu2 = g_mu; int_n[3] = int_n[1]; int_n[1] = int_n[2]; int_n[2] = int_n[3]; /* For chronological inverter */ g_csg_N[4] = g_csg_N[0]; g_csg_N[0] = g_csg_N[2]; g_csg_N[2] = g_csg_N[4]; if(fabs(g_mu2) > 0) { g_csg_N[4] = 0; } g_csg_N[6] = 0; g_nr_of_psf = 2; } else { g_csg_N[2] = g_csg_N[0]; if(fabs(g_mu2) > 0) { g_csg_N[2] = 0; } g_csg_N[4] = 0; g_csg_N[6] = 0; } for(j = 0; j < g_nr_of_psf+1; j++) { if(int_n[j] == 0) int_n[j] = 1; } if(g_nr_of_psf == 3) { g_eps_sq_force = g_eps_sq_force1; g_eps_sq_force1 = g_eps_sq_force3; g_eps_sq_force3 = g_eps_sq_force; g_eps_sq_acc = g_eps_sq_acc1; g_eps_sq_acc1 = g_eps_sq_acc3; g_eps_sq_acc3 = g_eps_sq_acc; } if(g_nr_of_psf == 2) { g_eps_sq_force = g_eps_sq_force1; g_eps_sq_force1 = g_eps_sq_force2; g_eps_sq_force2 = g_eps_sq_force; g_eps_sq_acc = g_eps_sq_acc1; g_eps_sq_acc1 = g_eps_sq_acc2; g_eps_sq_acc2 = g_eps_sq_acc; } g_mu = g_mu1; g_eps_sq_acc = g_eps_sq_acc1; g_eps_sq_force = g_eps_sq_force1; #ifdef _GAUGE_COPY j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1); #else j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0); #endif if ( j!= 0) { fprintf(stderr, "Not enough memory for gauge_fields! Aborting...\n"); exit(0); } j = init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand); if ( j!= 0) { fprintf(stderr, "Not enough memory for geometry_indices! Aborting...\n"); exit(0); } j = init_spinor_field(VOLUMEPLUSRAND/2, NO_OF_SPINORFIELDS); if ( j!= 0) { fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n"); exit(0); } j = init_bispinor_field(VOLUME/2, NO_OF_SPINORFIELDS); j = init_csg_field(VOLUMEPLUSRAND/2, g_csg_N); if ( j!= 0) { fprintf(stderr, "Not enough memory for csg fields! Aborting...\n"); exit(0); } j = init_moment_field(VOLUME, VOLUMEPLUSRAND); if ( j!= 0) { fprintf(stderr, "Not enough memory for moment fields! 
Aborting...\n"); exit(0); } zero_spinor_field(g_spinor_field[DUM_DERI+4],VOLUME/2); zero_spinor_field(g_spinor_field[DUM_DERI+5],VOLUME/2); zero_spinor_field(g_spinor_field[DUM_DERI+6],VOLUME/2); if(g_proc_id == 0){ /* fscanf(fp6,"%s",filename); */ /*construct the filenames for the observables and the parameters*/ strcpy(datafilename,filename); strcat(datafilename,".data"); strcpy(parameterfilename,filename); strcat(parameterfilename,".para"); parameterfile=fopen(parameterfilename, "w"); printf("# This is the hmc code for twisted Mass Wilson QCD\n\nVersion %s\n", Version); #ifdef SSE printf("# The code was compiled with SSE instructions\n"); #endif #ifdef SSE2 printf("# The code was compiled with SSE2 instructions\n"); #endif #ifdef SSE3 printf("# The code was compiled with SSE3 instructions\n"); #endif #ifdef P4 printf("# The code was compiled for Pentium4\n"); #endif #ifdef OPTERON printf("# The code was compiled for AMD Opteron\n"); #endif #ifdef _NEW_GEOMETRY printf("# The code was compiled with -D_NEW_GEOMETRY\n"); #endif #ifdef _GAUGE_COPY printf("# The code was compiled with -D_GAUGE_COPY\n"); #endif printf("# The lattice size is %d x %d x %d x %d\n", (int)(T*g_nproc_t), (int)(LX*g_nproc_x), (int)(LY), (int)(LZ)); printf("# The local lattice size is %d x %d x %d x %d\n", (int)(T), (int)(LX), (int)(LY),(int) LZ); printf("# beta = %f , kappa= %f\n", g_beta, g_kappa); printf("# mus = %f, %f, %f\n", g_mu1, g_mu2, g_mu3); printf("# int_n_gauge = %d, int_n_ferm1 = %d, int_n_ferm2 = %d, int_n_ferm3 = %d\n", int_n[0], int_n[1], int_n[2], int_n[3]); printf("# g_rgi_C0 = %f, g_rgi_C1 = %f\n", g_rgi_C0, g_rgi_C1); printf("# Number of pseudo-fermion fields: %d\n", g_nr_of_psf); printf("# g_eps_sq_force = %e, g_eps_sq_acc = %e\n", g_eps_sq_force, g_eps_sq_acc); printf("# Integration scheme: "); if(integtyp == 1) printf("leap-frog (single time scale)\n"); if(integtyp == 2) printf("Sexton-Weingarten (single time scale)\n"); if(integtyp == 3) printf("leap-frog (multiple 
time scales)\n"); if(integtyp == 4) printf("Sexton-Weingarten (multiple time scales)\n"); if(integtyp == 5) printf("higher order and leap-frog (multiple time scales)\n"); printf("# Using %s precision for the inversions!\n", g_relative_precision_flag ? "relative" : "absolute"); printf("# Using in chronological inverter for spinor_field 1,2,3 a history of %d, %d, %d, respectively\n", g_csg_N[0], g_csg_N[2], g_csg_N[4]); fprintf(parameterfile, "The lattice size is %d x %d x %d x %d\n", (int)(g_nproc_t*T), (int)(g_nproc_x*LX), (int)(LY), (int)(LZ)); fprintf(parameterfile, "The local lattice size is %d x %d x %d x %d\n", (int)(T), (int)(LX), (int)(LY), (int)(LZ)); fprintf(parameterfile, "g_beta = %f , g_kappa= %f, c_sw = %f \n",g_beta,g_kappa,g_c_sw); fprintf(parameterfile, "boundary of fermion fields (t,x,y,z): %f %f %f %f \n",X0,X1,X2,X3); fprintf(parameterfile, "EPS_SQ0=%e, EPS_SQ1=%e EPS_SQ2=%e, EPS_SQ3=%e \n" ,EPS_SQ0,EPS_SQ1,EPS_SQ2,EPS_SQ3); fprintf(parameterfile, "g_eps_sq_force = %e, g_eps_sq_acc = %e\n", g_eps_sq_force, g_eps_sq_acc); fprintf(parameterfile, "dtau=%f, Nsteps=%d, Nmeas=%d, Nsave=%d, integtyp=%d, nsmall=%d \n", dtau,Nsteps,Nmeas,Nsave,integtyp,nsmall); fprintf(parameterfile, "mu = %f, mu2=%f, mu3=%f\n ", g_mu, g_mu2, g_mu3); fprintf(parameterfile, "int_n_gauge = %d, int_n_ferm1 = %d, int_n_ferm2 = %d, int_n_ferm3 = %d\n ", int_n[0], int_n[1], int_n[2], int_n[3]); fprintf(parameterfile, "g_rgi_C0 = %f, g_rgi_C1 = %f\n", g_rgi_C0, g_rgi_C1); fprintf(parameterfile, "# Number of pseudo-fermion fields: %d\n", g_nr_of_psf); fprintf(parameterfile, "# Integration scheme: "); if(integtyp == 1) fprintf(parameterfile, "leap-frog (single time scale)\n"); if(integtyp == 2) fprintf(parameterfile, "Sexton-Weingarten (single time scale)\n"); if(integtyp == 3) fprintf(parameterfile, "leap-frog (multiple time scales)\n"); if(integtyp == 4) fprintf(parameterfile, "Sexton-Weingarten (multiple time scales)\n"); if(integtyp == 5) fprintf(parameterfile, "higher order 
and leap-frog (multiple time scales)\n"); fprintf(parameterfile, "Using %s precision for the inversions!\n", g_relative_precision_flag ? "relative" : "absolute"); fprintf(parameterfile, "Using in chronological inverter for spinor_field 1,2,3 a history of %d, %d, %d, respectively\n", g_csg_N[0], g_csg_N[2], g_csg_N[4]); fflush(stdout); fflush(parameterfile); } /* define the geometry */ geometry(); /* define the boundary conditions for the fermion fields */ boundary(); check_geometry(); if(g_proc_id == 0) { #if defined GEOMETRIC if(g_proc_id==0) fprintf(parameterfile,"The geometric series is used as solver \n\n"); #else if(g_proc_id==0) fprintf(parameterfile,"The BICG_stab is used as solver \n\n"); #endif fflush(parameterfile); } /* Continue */ if(startoption == 3){ rlxdfile = fopen(rlxd_input_filename,"r"); if(rlxdfile != NULL) { if(g_proc_id == 0) { fread(rlxd_state,sizeof(rlxd_state),1,rlxdfile); } } else { if(g_proc_id == 0) { printf("%s does not exist, switching to restart...\n", rlxd_input_filename); } startoption = 2; } fclose(rlxdfile); if(startoption != 2) { if(g_proc_id == 0) { rlxd_reset(rlxd_state); printf("Reading Gauge field from file %s\n", gauge_input_filename); fflush(stdout); } read_gauge_field_time_p(gauge_input_filename,g_gauge_field); } } if(startoption != 3){ /* Initialize random number generator */ if(g_proc_id == 0) { rlxd_init(1, random_seed); /* hot */ if(startoption == 1) { random_gauge_field(); } rlxd_get(rlxd_state); #ifdef TM_USE_MPI MPI_Send(&rlxd_state[0], 105, MPI_INT, 1, 99, MPI_COMM_WORLD); MPI_Recv(&rlxd_state[0], 105, MPI_INT, g_nproc-1, 99, MPI_COMM_WORLD, &status); rlxd_reset(rlxd_state); #endif } #ifdef TM_USE_MPI else { MPI_Recv(&rlxd_state[0], 105, MPI_INT, g_proc_id-1, 99, MPI_COMM_WORLD, &status); rlxd_reset(rlxd_state); /* hot */ if(startoption == 1) { random_gauge_field(); } k=g_proc_id+1; if(k==g_nproc){ k=0; } rlxd_get(rlxd_state); MPI_Send(&rlxd_state[0], 105, MPI_INT, k, 99, MPI_COMM_WORLD); } #endif /* Cold */ 
if(startoption == 0) { unit_g_gauge_field(); } /* Restart */ else if(startoption == 2) { if (g_proc_id == 0){ printf("Reading Gauge field from file %s\n", gauge_input_filename); fflush(stdout); } read_gauge_field_time_p(gauge_input_filename,g_gauge_field); } } /*For parallelization: exchange the gaugefield */ #ifdef TM_USE_MPI xchange_gauge(g_gauge_field); #endif #ifdef _GAUGE_COPY update_backward_gauge(); #endif /*compute the energy of the gauge field*/ plaquette_energy=measure_gauge_action(); if(g_rgi_C1 > 0. || g_rgi_C1 < 0.) { rectangle_energy = measure_rectangles(); if(g_proc_id==0){ fprintf(parameterfile,"#First rectangle value: %14.12f \n",rectangle_energy/(12.*VOLUME*g_nproc)); } } eneg = g_rgi_C0 * plaquette_energy + g_rgi_C1 * rectangle_energy; /* Measure and print the Polyakov loop: */ polyakov_loop(&pl, dir); if(g_proc_id==0){ fprintf(parameterfile,"#First plaquette value: %14.12f \n", plaquette_energy/(6.*VOLUME*g_nproc)); fprintf(parameterfile,"#First Polyakov loop value in %d-direction |L(%d)|= %14.12f \n", dir, dir, cabs(pl)); } dir=3; polyakov_loop(&pl, dir); if(g_proc_id==0){ fprintf(parameterfile,"#First Polyakov loop value in %d-direction |L(%d)|= %14.12f \n", dir, dir, cabs(pl)); fclose(parameterfile); } /* set ddummy to zero */ for(ix = 0; ix < VOLUME+RAND; ix++){ for(mu=0; mu<4; mu++){ ddummy[ix][mu].d1=0.; ddummy[ix][mu].d2=0.; ddummy[ix][mu].d3=0.; ddummy[ix][mu].d4=0.; ddummy[ix][mu].d5=0.; ddummy[ix][mu].d6=0.; ddummy[ix][mu].d7=0.; ddummy[ix][mu].d8=0.; } } if(g_proc_id == 0) { gettimeofday(&t1,NULL); countfile = fopen("history_hmc_tm", "a"); fprintf(countfile, "!!! 
Timestamp %ld, Nsave = %d, g_mu = %e, g_mu1 = %e, g_mu_2 = %e, g_mu3 = %e, beta = %f, kappa = %f, C1 = %f, int0 = %d, int1 = %d, int2 = %d, int3 = %d, g_eps_sq_force = %e, g_eps_sq_acc = %e, ", t1.tv_sec, Nsave, g_mu, g_mu1, g_mu2, g_mu3, g_beta, g_kappa, g_rgi_C1, int_n[0], int_n[1], int_n[2], int_n[3], g_eps_sq_force, g_eps_sq_acc); fprintf(countfile, "Nsteps = %d, dtau = %e, tau = %e, integtyp = %d, rel. prec. = %d\n", Nsteps, dtau, tau, integtyp, g_relative_precision_flag); fclose(countfile); } /* HERE THE CALLS FOR SOME EIGENVALUES */ /* for lowest g_nev = 10; */ /* for largest */ g_nev = 10; max_iter_ev = 1000; stop_prec_ev = 1.e-10; if(g_proc_id==0) { printf(" Values of mu = %e mubar = %e eps = %e precision = %e \n \n", g_mu, g_mubar, g_epsbar, stop_prec_ev); } eigenvalues(&g_nev, operator_flag, max_iter_ev, stop_prec_ev); g_nev = 4; max_iter_ev = 200; stop_prec_ev = 1.e-03; max_eigenvalues(&g_nev, operator_flag, max_iter_ev, stop_prec_ev); if(g_proc_id==0) { printf(" Values of mu = %e mubar = %e eps = %e precision = %e \n \n", g_mu, g_mubar, g_epsbar, stop_prec_ev); /* printf(" Values of mu = %e precision = %e \n \n", g_mu, stop_prec_ev); */ } /* END OF EIGENVALUES CALLS */ if(g_proc_id==0) { rlxd_get(rlxd_state); rlxdfile=fopen("last_state","w"); fwrite(rlxd_state,sizeof(rlxd_state),1,rlxdfile); fclose(rlxdfile); printf("Acceptance Rate was: %e Prozent\n", 100.*(double)Rate/(double)Nmeas); fflush(stdout); parameterfile = fopen(parameterfilename, "a"); fprintf(parameterfile, "Acceptance Rate was: %e Prozent\n", 100.*(double)Rate/(double)Nmeas); fclose(parameterfile); } #ifdef TM_USE_MPI MPI_Finalize(); #endif free_gauge_tmp(); free_gauge_field(); free_geometry_indices(); free_spinor_field(); free_bispinor_field(); free_moment_field(); return(0); }
int main(int argc,char *argv[]) { int j,j_max,k,k_max = 2; paramsXlfInfo *xlfInfo; int ix, n, *nn,*mm,i; double delta, deltamax; spinor rsp; int status = 0; #ifdef MPI DUM_DERI = 6; DUM_SOLVER = DUM_DERI+2; DUM_MATRIX = DUM_SOLVER+6; NO_OF_SPINORFIELDS = DUM_MATRIX+2; MPI_Init(&argc, &argv); #endif g_rgi_C1 = 1.; /* Read the input file */ read_input("hopping_test.input"); tmlqcd_mpi_init(argc, argv); if(g_proc_id==0) { #ifdef SSE printf("# The code was compiled with SSE instructions\n"); #endif #ifdef SSE2 printf("# The code was compiled with SSE2 instructions\n"); #endif #ifdef SSE3 printf("# The code was compiled with SSE3 instructions\n"); #endif #ifdef P4 printf("# The code was compiled for Pentium4\n"); #endif #ifdef OPTERON printf("# The code was compiled for AMD Opteron\n"); #endif #ifdef _GAUGE_COPY printf("# The code was compiled with -D_GAUGE_COPY\n"); #endif #ifdef BGL printf("# The code was compiled for Blue Gene/L\n"); #endif #ifdef BGP printf("# The code was compiled for Blue Gene/P\n"); #endif #ifdef _USE_HALFSPINOR printf("# The code was compiled with -D_USE_HALFSPINOR\n"); #endif #ifdef _USE_SHMEM printf("# the code was compiled with -D_USE_SHMEM\n"); # ifdef _PERSISTENT printf("# the code was compiled for persistent MPI calls (halfspinor only)\n"); # endif #endif #ifdef _INDEX_INDEP_GEOM printf("# the code was compiled with index independent geometry\n"); #endif #ifdef MPI # ifdef _NON_BLOCKING printf("# the code was compiled for non-blocking MPI calls (spinor and gauge)\n"); # endif # ifdef _USE_TSPLITPAR printf("# the code was compiled with tsplit parallelization\n"); # endif #endif printf("\n"); fflush(stdout); } #ifdef _GAUGE_COPY init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1); #else init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0); #endif init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand); if(even_odd_flag) { j = init_spinor_field(VOLUMEPLUSRAND/2, 2*k_max+1); } else { j = init_spinor_field(VOLUMEPLUSRAND, 2*k_max); } if ( j!= 0) { 
fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n"); exit(0); } j = init_moment_field(VOLUME, VOLUMEPLUSRAND); if ( j!= 0) { fprintf(stderr, "Not enough memory for moment fields! Aborting...\n"); exit(0); } if(g_proc_id == 0) { fprintf(stdout,"The number of processes is %d \n",g_nproc); printf("# The lattice size is %d x %d x %d x %d\n", (int)(T*g_nproc_t), (int)(LX*g_nproc_x), (int)(LY*g_nproc_y), (int)(g_nproc_z*LZ)); printf("# The local lattice size is %d x %d x %d x %d\n", (int)(T), (int)(LX), (int)(LY),(int) LZ); if(even_odd_flag) { printf("# testinging the even/odd preconditioned Dirac operator\n"); } else { printf("# testinging the standard Dirac operator\n"); } fflush(stdout); } /* define the geometry */ geometry(); /* define the boundary conditions for the fermion fields */ boundary(g_kappa); #ifdef _USE_HALFSPINOR j = init_dirac_halfspinor(); if ( j!= 0) { fprintf(stderr, "Not enough memory for halfspinor fields! Aborting...\n"); exit(0); } if(g_sloppy_precision_flag == 1) { g_sloppy_precision = 1; j = init_dirac_halfspinor32(); if ( j!= 0) { fprintf(stderr, "Not enough memory for 32-Bit halfspinor fields! Aborting...\n"); exit(0); } } # if (defined _PERSISTENT) init_xchange_halffield(); # endif #endif status = check_geometry(); if (status != 0) { fprintf(stderr, "Checking of geometry failed. 
Unable to proceed.\nAborting....\n"); exit(1); } #if (defined MPI && !(defined _USE_SHMEM)) check_xchange(); #endif start_ranlux(1, 123456); xlfInfo = construct_paramsXlfInfo(0.5, 0); random_gauge_field(reproduce_randomnumber_flag); if ( startoption == 2 ) { /* restart */ write_gauge_field(gauge_input_filename,gauge_precision_write_flag,xlfInfo); } else if ( startoption == 0 ) { /* cold */ unit_g_gauge_field(); } else if (startoption == 3 ) { /* continue */ read_gauge_field(gauge_input_filename); } else if ( startoption == 1 ) { /* hot */ } #ifdef MPI /*For parallelization: exchange the gaugefield */ xchange_gauge(); #endif #ifdef _GAUGE_COPY update_backward_gauge(); #endif if(even_odd_flag) { /*initialize the pseudo-fermion fields*/ j_max=1; for (k = 0; k < k_max; k++) { random_spinor_field(g_spinor_field[k], VOLUME/2, 0); } if (read_source_flag == 2) { /* save */ /* even first, odd second */ write_spinorfield_cm_single(g_spinor_field[0],g_spinor_field[1],SourceInfo.basename); } else if (read_source_flag == 1) { /* yes */ /* even first, odd second */ read_spinorfield_cm_single(g_spinor_field[0],g_spinor_field[1],SourceInfo.basename,-1,0); # if (!defined MPI) if (write_cp_flag == 1) { strcat(SourceInfo.basename,".2"); read_spinorfield_cm_single(g_spinor_field[2],g_spinor_field[3],SourceInfo.basename,-1,0); nn=(int*)calloc(VOLUME,sizeof(int)); if((void*)nn == NULL) return(100); mm=(int*)calloc(VOLUME,sizeof(int)); if((void*)mm == NULL) return(100); n=0; deltamax=0.0; for(ix=0;ix<VOLUME/2;ix++){ (rsp.s0).c0.re = (g_spinor_field[2][ix].s0).c0.re - (g_spinor_field[0][ix].s0).c0.re; (rsp.s0).c0.im = (g_spinor_field[2][ix].s0).c0.im - (g_spinor_field[0][ix].s0).c0.im; (rsp.s0).c1.re = (g_spinor_field[2][ix].s0).c1.re - (g_spinor_field[0][ix].s0).c1.re; (rsp.s0).c1.im = (g_spinor_field[2][ix].s0).c1.im - (g_spinor_field[0][ix].s0).c1.im; (rsp.s0).c2.re = (g_spinor_field[2][ix].s0).c2.re - (g_spinor_field[0][ix].s0).c2.re; (rsp.s0).c2.im = (g_spinor_field[2][ix].s0).c2.im 
- (g_spinor_field[0][ix].s0).c2.im; (rsp.s1).c0.re = (g_spinor_field[2][ix].s1).c0.re - (g_spinor_field[0][ix].s1).c0.re; (rsp.s1).c0.im = (g_spinor_field[2][ix].s1).c0.im - (g_spinor_field[0][ix].s1).c0.im; (rsp.s1).c1.re = (g_spinor_field[2][ix].s1).c1.re - (g_spinor_field[0][ix].s1).c1.re; (rsp.s1).c1.im = (g_spinor_field[2][ix].s1).c1.im - (g_spinor_field[0][ix].s1).c1.im; (rsp.s1).c2.re = (g_spinor_field[2][ix].s1).c2.re - (g_spinor_field[0][ix].s1).c2.re; (rsp.s1).c2.im = (g_spinor_field[2][ix].s1).c2.im - (g_spinor_field[0][ix].s1).c2.im; (rsp.s2).c0.re = (g_spinor_field[2][ix].s2).c0.re - (g_spinor_field[0][ix].s2).c0.re; (rsp.s2).c0.im = (g_spinor_field[2][ix].s2).c0.im - (g_spinor_field[0][ix].s2).c0.im; (rsp.s2).c1.re = (g_spinor_field[2][ix].s2).c1.re - (g_spinor_field[0][ix].s2).c1.re; (rsp.s2).c1.im = (g_spinor_field[2][ix].s2).c1.im - (g_spinor_field[0][ix].s2).c1.im; (rsp.s2).c2.re = (g_spinor_field[2][ix].s2).c2.re - (g_spinor_field[0][ix].s2).c2.re; (rsp.s2).c2.im = (g_spinor_field[2][ix].s2).c2.im - (g_spinor_field[0][ix].s2).c2.im; (rsp.s3).c0.re = (g_spinor_field[2][ix].s3).c0.re - (g_spinor_field[0][ix].s3).c0.re; (rsp.s3).c0.im = (g_spinor_field[2][ix].s3).c0.im - (g_spinor_field[0][ix].s3).c0.im; (rsp.s3).c1.re = (g_spinor_field[2][ix].s3).c1.re - (g_spinor_field[0][ix].s3).c1.re; (rsp.s3).c1.im = (g_spinor_field[2][ix].s3).c1.im - (g_spinor_field[0][ix].s3).c1.im; (rsp.s3).c2.re = (g_spinor_field[2][ix].s3).c2.re - (g_spinor_field[0][ix].s3).c2.re; (rsp.s3).c2.im = (g_spinor_field[2][ix].s3).c2.im - (g_spinor_field[0][ix].s3).c2.im; _spinor_norm_sq(delta,rsp); if (delta > 1.0e-12) { nn[n] = g_eo2lexic[ix]; mm[n]=ix; n++; } if(delta>deltamax) deltamax=delta; } if (n>0){ printf("mismatch in even spincolorfield in %d points:\n",n); for(i=0; i< MIN(n,1000); i++){ printf("%d,(%d,%d,%d,%d):%f vs. 
%f\n",nn[i],g_coord[nn[i]][0],g_coord[nn[i]][1],g_coord[nn[i]][2],g_coord[nn[i]][3],(g_spinor_field[2][mm[i]].s0).c0.re, (g_spinor_field[0][mm[i]].s0).c0.re);fflush(stdout); } } n = 0; for(ix=0;ix<VOLUME/2;ix++){ (rsp.s0).c0.re = (g_spinor_field[3][ix].s0).c0.re - (g_spinor_field[1][ix].s0).c0.re; (rsp.s0).c0.im = (g_spinor_field[3][ix].s0).c0.im - (g_spinor_field[1][ix].s0).c0.im; (rsp.s0).c1.re = (g_spinor_field[3][ix].s0).c1.re - (g_spinor_field[1][ix].s0).c1.re; (rsp.s0).c1.im = (g_spinor_field[3][ix].s0).c1.im - (g_spinor_field[1][ix].s0).c1.im; (rsp.s0).c2.re = (g_spinor_field[3][ix].s0).c2.re - (g_spinor_field[1][ix].s0).c2.re; (rsp.s0).c2.im = (g_spinor_field[3][ix].s0).c2.im - (g_spinor_field[1][ix].s0).c2.im; (rsp.s1).c0.re = (g_spinor_field[3][ix].s1).c0.re - (g_spinor_field[1][ix].s1).c0.re; (rsp.s1).c0.im = (g_spinor_field[3][ix].s1).c0.im - (g_spinor_field[1][ix].s1).c0.im; (rsp.s1).c1.re = (g_spinor_field[3][ix].s1).c1.re - (g_spinor_field[1][ix].s1).c1.re; (rsp.s1).c1.im = (g_spinor_field[3][ix].s1).c1.im - (g_spinor_field[1][ix].s1).c1.im; (rsp.s1).c2.re = (g_spinor_field[3][ix].s1).c2.re - (g_spinor_field[1][ix].s1).c2.re; (rsp.s1).c2.im = (g_spinor_field[3][ix].s1).c2.im - (g_spinor_field[1][ix].s1).c2.im; (rsp.s2).c0.re = (g_spinor_field[3][ix].s2).c0.re - (g_spinor_field[1][ix].s2).c0.re; (rsp.s2).c0.im = (g_spinor_field[3][ix].s2).c0.im - (g_spinor_field[1][ix].s2).c0.im; (rsp.s2).c1.re = (g_spinor_field[3][ix].s2).c1.re - (g_spinor_field[1][ix].s2).c1.re; (rsp.s2).c1.im = (g_spinor_field[3][ix].s2).c1.im - (g_spinor_field[1][ix].s2).c1.im; (rsp.s2).c2.re = (g_spinor_field[3][ix].s2).c2.re - (g_spinor_field[1][ix].s2).c2.re; (rsp.s2).c2.im = (g_spinor_field[3][ix].s2).c2.im - (g_spinor_field[1][ix].s2).c2.im; (rsp.s3).c0.re = (g_spinor_field[3][ix].s3).c0.re - (g_spinor_field[1][ix].s3).c0.re; (rsp.s3).c0.im = (g_spinor_field[3][ix].s3).c0.im - (g_spinor_field[1][ix].s3).c0.im; (rsp.s3).c1.re = (g_spinor_field[3][ix].s3).c1.re - 
(g_spinor_field[1][ix].s3).c1.re; (rsp.s3).c1.im = (g_spinor_field[3][ix].s3).c1.im - (g_spinor_field[1][ix].s3).c1.im; (rsp.s3).c2.re = (g_spinor_field[3][ix].s3).c2.re - (g_spinor_field[1][ix].s3).c2.re; (rsp.s3).c2.im = (g_spinor_field[3][ix].s3).c2.im - (g_spinor_field[1][ix].s3).c2.im; _spinor_norm_sq(delta,rsp); if (delta > 1.0e-12) { nn[n]=g_eo2lexic[ix+(VOLUME+RAND)/2]; mm[n]=ix; n++; } if(delta>deltamax) deltamax=delta; } if (n>0){ printf("mismatch in odd spincolorfield in %d points:\n",n); for(i=0; i< MIN(n,1000); i++){ printf("%d,(%d,%d,%d,%d):%f vs. %f\n",nn[i],g_coord[nn[i]][0],g_coord[nn[i]][1],g_coord[nn[i]][2],g_coord[nn[i]][3],(g_spinor_field[3][mm[i]].s0).c0.re, (g_spinor_field[1][mm[i]].s0).c0.re);fflush(stdout); } } printf("max delta=%e",deltamax);fflush(stdout); } # endif } if (read_source_flag > 0 && write_cp_flag == 0) { /* read-source yes or nobutsave; checkpoint no */ /* first spinorial arg is output, the second is input */ Hopping_Matrix(1, g_spinor_field[1], g_spinor_field[0]); /*ieo=1 M_{eo}*/ Hopping_Matrix(0, g_spinor_field[0], g_spinor_field[1]); /*ieo=0 M_{oe}*/ strcat(SourceInfo.basename,".out"); write_spinorfield_cm_single(g_spinor_field[0],g_spinor_field[1],SourceInfo.basename); printf("Check-field printed. Exiting...\n"); fflush(stdout); } #ifdef MPI MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); #endif } free_gauge_field(); free_geometry_indices(); free_spinor_field(); free_moment_field(); return(0); }
/* Serially Checked ! */
/*
 * Dtm_psi: apply the operator implemented below to the spinor field Q and
 * store the result in P, one lattice site at a time, using the _sse_*
 * macro kernels.
 *
 *   P  output field; written at every site ix in [0, VOLUME)
 *   Q  input field; read at site ix and at its eight neighbours taken from
 *      g_iup / g_idn.  Must not be the same pointer as P: the function
 *      prints an error and calls exit(1) if P == Q.
 *
 * Per site the code accumulates into the local spinor `rs`:
 *   - the site's own spinor, upper components (s0, s1) multiplied by
 *     fact1 = 1 + i*g_mu and lower components (s2, s3) by conj(fact1);
 *   - one contribution per direction +0,-0,+1,-1,+2,-2,+3,-3, each using a
 *     gauge link (up / um) and the matching phase_0..phase_3.
 * The final direction writes the finished components to P[ix].
 *
 * NOTE(review): the _sse_* macros appear to communicate through implicit
 * register state (load / load_up / add / store), so the statement order
 * below must not be changed - confirm against the macro definitions.
 */
void Dtm_psi(spinor * const P, spinor * const Q){

  if(P==Q){
    printf("Error in Dtm_psi (D_psi.c):\n");
    printf("Arguments must be differen spinor fields\n");
    printf("Program aborted\n");
    exit(1);
  }

#ifdef _GAUGE_COPY2
  /* refresh the backward gauge copy if it has been flagged as stale */
  if(g_update_gauge_copy) {
    update_backward_gauge(g_gauge_field);
  }
#endif

# if defined TM_USE_MPI
  xchange_lexicfield(Q);
# endif

#ifdef TM_USE_OMP
#pragma omp parallel
  {
#endif
  int ix,iy,iz;
  su3 *up,*um;              /* gauge links for the forward / backward hop */
  spinor *s,*sp,*sm,*rn;    /* site, forward neighbour, backward neighbour, result */
  _Complex double fact1, fact2;
  spinor rs __attribute__ ((aligned (16)));   /* per-site accumulator */

  fact1 = 1. + g_mu * I;
  fact2 = conj(fact1);

#ifndef TM_USE_OMP
  /* Serial build: sp and up for site 0 are set here; for every later site
     they are carried over from the "direction -3" section of the previous
     iteration, which already points them at site ix+1. */
  iy=g_iup[0][0];
  sp=(spinor *) Q + iy;
  up=&g_gauge_field[0][0];
#endif

  /************************ loop over all lattice sites *************************/
#ifdef TM_USE_OMP
#pragma omp for
#endif
  for (ix=0;ix<VOLUME;ix++){
#ifdef TM_USE_OMP
    /* OpenMP build: set sp and up from ix at the top of every iteration
       instead of relying on the values left by the previous one */
    iy=g_iup[ix][0];
    up=&g_gauge_field[ix][0];
    sp=(spinor *) Q + iy;
#endif
    s=(spinor *) Q + ix;
    _prefetch_spinor(s);

    /******************************* direction +0 *********************************/

    iy=g_idn[ix][0];

    sm = (spinor *) Q + iy;
    _prefetch_spinor(sm);

    _sse_load(sp->s0);
    _sse_load_up(sp->s2);
    _sse_vector_add();

    _sse_su3_multiply((*up));
    _sse_vector_cmplx_mul(phase_0);
    _sse_store_up(rs.s2);

    // the diagonal bit
    _sse_load_up(s->s0);
    _sse_vector_cmplx_mul(fact1);
    _sse_load(rs.s2);
    _sse_vector_add();
    _sse_store(rs.s0);

    // g5 in the twisted term
    _sse_load_up(s->s2);
    _sse_vector_cmplx_mul(fact2);
    _sse_load(rs.s2);
    _sse_vector_add();
    _sse_store(rs.s2);

    um=&g_gauge_field[iy][0];
    _prefetch_su3(um);

    _sse_load(sp->s1);
    _sse_load_up(sp->s3);
    _sse_vector_add();

    _sse_su3_multiply((*up));
    _sse_vector_cmplx_mul(phase_0);
    _sse_store_up(rs.s3);

    // the diagonal bit
    _sse_load_up(s->s1);
    _sse_vector_cmplx_mul(fact1);
    _sse_load(rs.s3);
    _sse_vector_add();
    _sse_store(rs.s1);

    // g5 in the twisted term
    _sse_load_up(s->s3);
    _sse_vector_cmplx_mul(fact2);
    _sse_load(rs.s3);
    _sse_vector_add();
    _sse_store(rs.s3);

    /******************************* direction -0 *********************************/

    iy=g_iup[ix][1];

    sp = (spinor *) Q + iy;
    _prefetch_spinor(sp);

    _sse_load(sm->s0);
    _sse_load_up(sm->s2);
    _sse_vector_sub();

    _sse_su3_inverse_multiply((*um));
    _sse_vector_cmplxcg_mul(phase_0);

    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s2);
    _sse_vector_sub();
    _sse_store(rs.s2);

    /* advance to the link of the next direction at this site */
    up+=1;
    _prefetch_su3(up);

    _sse_load(sm->s1);
    _sse_load_up(sm->s3);
    _sse_vector_sub();

    _sse_su3_inverse_multiply((*um));
    _sse_vector_cmplxcg_mul(phase_0);

    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s3);
    _sse_vector_sub();
    _sse_store(rs.s3);

    /******************************* direction +1 *********************************/

    iy=g_idn[ix][1];

    sm = (spinor *) Q + iy;
    _prefetch_spinor(sm);

    _sse_load(sp->s0);
    _sse_load_up(sp->s3);
    _sse_vector_i_mul();
    _sse_vector_add();

    _sse_su3_multiply((*up));
    _sse_vector_cmplx_mul(phase_1);

    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s3);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store(rs.s3);

    um=&g_gauge_field[iy][1];
    _prefetch_su3(um);

    _sse_load(sp->s1);
    _sse_load_up(sp->s2);
    _sse_vector_i_mul();
    _sse_vector_add();

    _sse_su3_multiply((*up));
    _sse_vector_cmplx_mul(phase_1);

    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s2);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store(rs.s2);

    /******************************* direction -1 *********************************/

    iy=g_iup[ix][2];

    sp = (spinor *) Q + iy;
    _prefetch_spinor(sp);

    _sse_load(sm->s0);
    _sse_load_up(sm->s3);
    _sse_vector_i_mul();
    _sse_vector_sub();

    _sse_su3_inverse_multiply((*um));
    _sse_vector_cmplxcg_mul(phase_1);

    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s3);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_store(rs.s3);

    up+=1;
    _prefetch_su3(up);

    _sse_load(sm->s1);
    _sse_load_up(sm->s2);
    _sse_vector_i_mul();
    _sse_vector_sub();

    _sse_su3_inverse_multiply((*um));
    _sse_vector_cmplxcg_mul(phase_1);

    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s2);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_store(rs.s2);

    /******************************* direction +2 *********************************/

    iy=g_idn[ix][2];

    sm = (spinor *) Q + iy;
    _prefetch_spinor(sm);

    _sse_load(sp->s0);
    _sse_load_up(sp->s3);
    _sse_vector_add();

    _sse_su3_multiply((*up));
    _sse_vector_cmplx_mul(phase_2);

    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s3);
    _sse_vector_add();
    _sse_store(rs.s3);

    um=&g_gauge_field[iy][2];
    _prefetch_su3(um);

    _sse_load(sp->s1);
    _sse_load_up(sp->s2);
    _sse_vector_sub();

    _sse_su3_multiply((*up));
    _sse_vector_cmplx_mul(phase_2);

    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s2);
    _sse_vector_sub();
    _sse_store(rs.s2);

    /******************************* direction -2 *********************************/

    iy=g_iup[ix][3];

    sp = (spinor *) Q + iy;
    _prefetch_spinor(sp);

    _sse_load(sm->s0);
    _sse_load_up(sm->s3);
    _sse_vector_sub();

    _sse_su3_inverse_multiply((*um));
    _sse_vector_cmplxcg_mul(phase_2);

    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s3);
    _sse_vector_sub();
    _sse_store(rs.s3);

    up+=1;
    _prefetch_su3(up);

    _sse_load(sm->s1);
    _sse_load_up(sm->s2);
    _sse_vector_add();

    _sse_su3_inverse_multiply((*um));
    _sse_vector_cmplxcg_mul(phase_2);

    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s2);
    _sse_vector_add();
    _sse_store(rs.s2);

    /******************************* direction +3 *********************************/

    iy=g_idn[ix][3];

    sm = (spinor *) Q + iy;
    _prefetch_spinor(sm);

    _sse_load(sp->s0);
    _sse_load_up(sp->s2);
    _sse_vector_i_mul();
    _sse_vector_add();

    _sse_su3_multiply((*up));
    _sse_vector_cmplx_mul(phase_3);

    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s2);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store(rs.s2);

    um=&g_gauge_field[iy][3];
    _prefetch_su3(um);

    _sse_load(sp->s1);
    _sse_load_up(sp->s3);
    _sse_vector_i_mul();
    _sse_vector_sub();

    _sse_su3_multiply((*up));
    _sse_vector_cmplx_mul(phase_3);

    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s3);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_store(rs.s3);

    /******************************* direction -3 *********************************/

    /* iz is the next site (wrapping to 0 after the last one); sp and up are
       pointed at it here so that the serial build enters the next iteration
       with them already set */
    iz=(ix+1+VOLUME)%VOLUME;

    iy=g_iup[iz][0];

    sp = (spinor *) Q + iy;
    _prefetch_spinor(sp);

    _sse_load(sm->s0);
    _sse_load_up(sm->s2);
    _sse_vector_i_mul();
    _sse_vector_sub();

    _sse_su3_inverse_multiply((*um));
    _sse_vector_cmplxcg_mul(phase_3);

    /* last contribution: results go to the output field instead of rs */
    rn = (spinor *) P + ix;

    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store_nt(rn->s0);

    _sse_load(rs.s2);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_store_nt(rn->s2);

    up=&g_gauge_field[iz][0];
    _prefetch_su3(up);

    _sse_load(sm->s1);
    _sse_load_up(sm->s3);
    _sse_vector_i_mul();
    _sse_vector_add();

    _sse_su3_inverse_multiply((*um));
    _sse_vector_cmplxcg_mul(phase_3);

    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store_nt(rn->s1);

    _sse_load(rs.s3);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store_nt(rn->s3);

    /******************************** end of loop *********************************/

  }
#ifdef TM_USE_OMP
  } /* OpenMP closing brace */
#endif
}
/*
 * Hopping_Matrix, half-spinor variant.
 *
 *   ieo : parity selector; it picks the gauge-field copy and the
 *         neighbour-pointer table used in each pass
 *   l   : output spinor field (written in the second pass)
 *   k   : input  spinor field (read in the first pass)
 *
 * The work is done in two passes over (VOLUME)/2 sites:
 *   1. "pre"  pass: walk the source field k and fill the half-spinor
 *      buffer through the _hop_*_pre macros;
 *   2. "post" pass: walk the half-spinor buffer, sum up and expand to a
 *      full spinor in l through the _hop_*_post macros.
 * Between the two passes the half-spinor buffer is exchanged when
 * TM_USE_MPI is defined and _NO_COMM is not.
 *
 * When both g_sloppy_precision and g_sloppy_precision_flag equal 1 the
 * "32" flavour of buffers, macros and exchange routine is used.
 *
 * NOTE(review): the _hop_* macros take no arguments, so they presumably
 * refer to the locals s, U, phi / phi32 and ix by name. Do not rename
 * these locals or reorder the increments without checking the macro
 * definitions.
 */
void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
  int ix;
  su3 * restrict ALIGN U;
  spinor * restrict ALIGN s;
  halfspinor * restrict * phi ALIGN;
  halfspinor32 * restrict * phi32 ALIGN;
  /* We have 32 registers available */
  _declare_hregs();

#ifdef _KOJAK_INST
#pragma pomp inst begin(hoppingmatrix)
#endif
#pragma disjoint(*s, *U)

#ifdef _GAUGE_COPY
  /* refresh the gauge copy if it has been flagged as stale */
  if(g_update_gauge_copy) {
    update_backward_gauge(g_gauge_field);
  }
#endif

  __alignx(16, l);
  __alignx(16, k);

  if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
    /* ------------------------- 32-bit flavour ------------------------- */
    __alignx(16, HalfSpinor32);

    /* First pass: run through the source vector, not the solution. */
    s = k;
    _prefetch_spinor(s);

    /* gauge copy 0 for ieo == 0, gauge copy 1 otherwise */
    U = g_gauge_field_copy[(ieo == 0) ? 0 : 1][0];
    phi32 = NBPointer32[ieo];

    _prefetch_su3(U);

    /* loop over all lattice sites */
    ix=0;
    for(int i = 0; i < (VOLUME)/2; i++){
      /* direction +0 */
      _hop_t_p_pre32();
      s++;
      U++;
      ix++;
      /* direction -0 */
      _hop_t_m_pre32();
      ix++;
      /* direction +1 */
      _hop_x_p_pre32();
      ix++;
      U++;
      /* direction -1 */
      _hop_x_m_pre32();
      ix++;
      /* direction +2 */
      _hop_y_p_pre32();
      ix++;
      U++;
      /* direction -2 */
      _hop_y_m_pre32();
      ix++;
      /* direction +3 */
      _hop_z_p_pre32();
      ix++;
      U++;
      /* direction -3 */
      _hop_z_m_pre32();
      ix++;
    }

#    if (defined TM_USE_MPI && !defined _NO_COMM)
    xchange_halffield32();
#    endif

    /* Second pass: sum up and expand to a full spinor in l. */
    s = l;
    phi32 = NBPointer32[2 + ieo];
    /* the other gauge copy compared with the first pass */
    U = g_gauge_field_copy[(ieo == 0) ? 1 : 0][0];
    /* disabled: _prefetch_halfspinor(phi32[0]); */
    _prefetch_su3(U);

    ix = 0;
    /* disabled: _prefetch_spinor_for_store(s); */
    for(int i = 0; i < (VOLUME)/2; i++){
      /* Prefetching s for store here "causes a lot of trouble" according */
      /* to the original author, so it stays disabled:                    */
      /*   _prefetch_spinor_for_store(s);                                 */
      /*   _prefetch_halfspinor(phi32[ix+1]);                             */

      /* direction +0 */
      _hop_t_p_post32();
      ix++;
      /* direction -0 */
      _hop_t_m_post32();
      U++;
      ix++;
      /* direction +1 */
      _hop_x_p_post32();
      ix++;
      /* direction -1 */
      _hop_x_m_post32();
      U++;
      ix++;
      /* direction +2 */
      _hop_y_p_post32();
      ix++;
      /* direction -2 */
      _hop_y_m_post32();
      U++;
      ix++;
      /* direction +3 */
      _hop_z_p_post32();
      ix++;
      /* direction -3 */
      _hop_z_m_post32();
      U++;
      ix++;
      s++;
    }
  }
  else {
    /* ----------------------- default flavour -------------------------- */
    __alignx(16, HalfSpinor);

    /* First pass: run through the source vector, not the solution. */
    s = k;
    _prefetch_spinor(s);

    /* gauge copy 0 for ieo == 0, gauge copy 1 otherwise */
    U = g_gauge_field_copy[(ieo == 0) ? 0 : 1][0];
    phi = NBPointer[ieo];

    _prefetch_su3(U);

    /* loop over all lattice sites */
    ix=0;
    for(int i = 0; i < (VOLUME)/2; i++){
      /* direction +0 */
      _hop_t_p_pre();
      s++;
      U++;
      ix++;
      /* direction -0 */
      _hop_t_m_pre();
      ix++;
      /* direction +1 */
      _hop_x_p_pre();
      ix++;
      U++;
      /* direction -1 */
      _hop_x_m_pre();
      ix++;
      /* direction +2 */
      _hop_y_p_pre();
      ix++;
      U++;
      /* direction -2 */
      _hop_y_m_pre();
      ix++;
      /* direction +3 */
      _hop_z_p_pre();
      ix++;
      U++;
      /* direction -3 */
      _hop_z_m_pre();
      ix++;
    }

#    if (defined TM_USE_MPI && !defined _NO_COMM)
    xchange_halffield();
#    endif

    /* Second pass: sum up and expand to a full spinor in l. */
    s = l;
    phi = NBPointer[2 + ieo];
    /* disabled: _prefetch_halfspinor(phi[0]); */
    /* the other gauge copy compared with the first pass */
    U = g_gauge_field_copy[(ieo == 0) ? 1 : 0][0];
    _prefetch_su3(U);

    ix = 0;
    /* disabled: _prefetch_spinor_for_store(s); */
    for(int i = 0; i < (VOLUME)/2; i++){
      /* Prefetching s for store here "causes a lot of trouble" according */
      /* to the original author, so it stays disabled:                    */
      /*   _prefetch_spinor_for_store(s);                                 */
      /*   _prefetch_halfspinor(phi[ix+1]);                               */

      /* direction +0 */
      _hop_t_p_post();
      ix++;
      /* direction -0 */
      _hop_t_m_post();
      U++;
      ix++;
      /* direction +1 */
      _hop_x_p_post();
      ix++;
      /* direction -1 */
      _hop_x_m_post();
      U++;
      ix++;
      /* direction +2 */
      _hop_y_p_post();
      ix++;
      /* direction -2 */
      _hop_y_m_post();
      U++;
      ix++;
      /* direction +3 */
      _hop_z_p_post();
      ix++;
      /* direction -3 */
      _hop_z_m_post();
      U++;
      ix++;
      s++;
    }
  }

#ifdef _KOJAK_INST
#pragma pomp inst end(hoppingmatrix)
#endif
}
/*
 * Hopping_Matrix, plain C variant.
 *
 *   ieo : parity selector; for ieo = 0, k resides on odd sites and l on
 *         even sites
 *   l   : output spinor field
 *   k   : input  spinor field, must not be the same pointer as l
 *
 * For every site of the output parity the eight neighbour contributions
 * (directions +0,-0,...,+3,-3) are accumulated in a local spinor and the
 * final result is written to l while the last direction is added.
 * psi, chi and ka0..ka3 are not declared here; they come from the
 * enclosing file scope.
 *
 * With _GAUGE_COPY the eight links of a site are read consecutively
 * starting at g_gauge_field_copy[icx][0]; without it they are looked up
 * in g_gauge_field through the neighbour index.
 */
void Hopping_Matrix(int ieo, spinor * const l, spinor * const k){
  int icx, ix, iy;
  int ioff;
  su3 * restrict up, * restrict um;
  spinor * restrict out, * restrict sp, * restrict sm;
  spinor acc;

#ifdef _GAUGE_COPY
  if(g_update_gauge_copy) {
    update_backward_gauge();
  }
#endif

  /* for parallelization */
#if (defined MPI && !(defined _NO_COMM))
  xchange_field(k, ieo);
#endif

  /* in-place application is not supported */
  if(k == l){
    printf("Error in H_psi (simple.c):\n");
    printf("Arguments k and l must be different\n");
    printf("Program aborted\n");
    exit(1);
  }

  /* first even/odd index of the output parity */
  ioff = (ieo == 0) ? 0 : (VOLUME+RAND)/2;

  /* loop over all lattice sites of the output parity */
  for (icx = ioff; icx < (VOLUME/2 + ioff); icx++){
    ix = g_eo2lexic[icx];
    out = l + (icx - ioff);

    /* ------------------------ direction +0 ------------------------ */
    sp = k + g_lexic2eosub[g_iup[ix][0]];
#ifdef _GAUGE_COPY
    up = &g_gauge_field_copy[icx][0];
#else
    up = &g_gauge_field[ix][0];
#endif
    /* the first direction initialises the accumulator */
    _vector_add(psi, sp->s0, sp->s2);
    _su3_multiply(chi, (*up), psi);
    _complex_times_vector(psi, ka0, chi);
    _vector_assign(acc.s0, psi);
    _vector_assign(acc.s2, psi);

    _vector_add(psi, sp->s1, sp->s3);
    _su3_multiply(chi, (*up), psi);
    _complex_times_vector(psi, ka0, chi);
    _vector_assign(acc.s1, psi);
    _vector_assign(acc.s3, psi);

    /* ------------------------ direction -0 ------------------------ */
    iy = g_idn[ix][0];
    sm = k + g_lexic2eosub[iy];
#ifdef _GAUGE_COPY
    um = up + 1;
#else
    um = &g_gauge_field[iy][0];
#endif
    _vector_sub(psi, sm->s0, sm->s2);
    _su3_inverse_multiply(chi, (*um), psi);
    _complexcjg_times_vector(psi, ka0, chi);
    _vector_add_assign(acc.s0, psi);
    _vector_sub_assign(acc.s2, psi);

    _vector_sub(psi, sm->s1, sm->s3);
    _su3_inverse_multiply(chi, (*um), psi);
    _complexcjg_times_vector(psi, ka0, chi);
    _vector_add_assign(acc.s1, psi);
    _vector_sub_assign(acc.s3, psi);

    /* ------------------------ direction +1 ------------------------ */
    sp = k + g_lexic2eosub[g_iup[ix][1]];
#ifdef _GAUGE_COPY
    up = um + 1;
#else
    up += 1;
#endif
    _vector_i_add(psi, sp->s0, sp->s3);
    _su3_multiply(chi, (*up), psi);
    _complex_times_vector(psi, ka1, chi);
    _vector_add_assign(acc.s0, psi);
    _vector_i_sub_assign(acc.s3, psi);

    _vector_i_add(psi, sp->s1, sp->s2);
    _su3_multiply(chi, (*up), psi);
    _complex_times_vector(psi, ka1, chi);
    _vector_add_assign(acc.s1, psi);
    _vector_i_sub_assign(acc.s2, psi);

    /* ------------------------ direction -1 ------------------------ */
    iy = g_idn[ix][1];
    sm = k + g_lexic2eosub[iy];
#ifdef _GAUGE_COPY
    um = up + 1;
#else
    um = &g_gauge_field[iy][1];
#endif
    _vector_i_sub(psi, sm->s0, sm->s3);
    _su3_inverse_multiply(chi, (*um), psi);
    _complexcjg_times_vector(psi, ka1, chi);
    _vector_add_assign(acc.s0, psi);
    _vector_i_add_assign(acc.s3, psi);

    _vector_i_sub(psi, sm->s1, sm->s2);
    _su3_inverse_multiply(chi, (*um), psi);
    _complexcjg_times_vector(psi, ka1, chi);
    _vector_add_assign(acc.s1, psi);
    _vector_i_add_assign(acc.s2, psi);

    /* ------------------------ direction +2 ------------------------ */
    sp = k + g_lexic2eosub[g_iup[ix][2]];
#ifdef _GAUGE_COPY
    up = um + 1;
#else
    up += 1;
#endif
    _vector_add(psi, sp->s0, sp->s3);
    _su3_multiply(chi, (*up), psi);
    _complex_times_vector(psi, ka2, chi);
    _vector_add_assign(acc.s0, psi);
    _vector_add_assign(acc.s3, psi);

    _vector_sub(psi, sp->s1, sp->s2);
    _su3_multiply(chi, (*up), psi);
    _complex_times_vector(psi, ka2, chi);
    _vector_add_assign(acc.s1, psi);
    _vector_sub_assign(acc.s2, psi);

    /* ------------------------ direction -2 ------------------------ */
    iy = g_idn[ix][2];
    sm = k + g_lexic2eosub[iy];
#ifdef _GAUGE_COPY
    um = up + 1;
#else
    um = &g_gauge_field[iy][2];
#endif
    _vector_sub(psi, sm->s0, sm->s3);
    _su3_inverse_multiply(chi, (*um), psi);
    _complexcjg_times_vector(psi, ka2, chi);
    _vector_add_assign(acc.s0, psi);
    _vector_sub_assign(acc.s3, psi);

    _vector_add(psi, sm->s1, sm->s2);
    _su3_inverse_multiply(chi, (*um), psi);
    _complexcjg_times_vector(psi, ka2, chi);
    _vector_add_assign(acc.s1, psi);
    _vector_add_assign(acc.s2, psi);

    /* ------------------------ direction +3 ------------------------ */
    sp = k + g_lexic2eosub[g_iup[ix][3]];
#ifdef _GAUGE_COPY
    up = um + 1;
#else
    up += 1;
#endif
    _vector_i_add(psi, sp->s0, sp->s2);
    _su3_multiply(chi, (*up), psi);
    _complex_times_vector(psi, ka3, chi);
    _vector_add_assign(acc.s0, psi);
    _vector_i_sub_assign(acc.s2, psi);

    _vector_i_sub(psi, sp->s1, sp->s3);
    _su3_multiply(chi, (*up), psi);
    _complex_times_vector(psi, ka3, chi);
    _vector_add_assign(acc.s1, psi);
    _vector_i_add_assign(acc.s3, psi);

    /* ------------------------ direction -3 ------------------------ */
    iy = g_idn[ix][3];
    sm = k + g_lexic2eosub[iy];
#ifdef _GAUGE_COPY
    um = up + 1;
#else
    um = &g_gauge_field[iy][3];
#endif
    /* the last direction adds its contribution while storing to l */
    _vector_i_sub(psi, sm->s0, sm->s2);
    _su3_inverse_multiply(chi, (*um), psi);
    _complexcjg_times_vector(psi, ka3, chi);
    _vector_add(out->s0, acc.s0, psi);
    _vector_i_add(out->s2, acc.s2, psi);

    _vector_i_add(psi, sm->s1, sm->s3);
    _su3_inverse_multiply(chi, (*um), psi);
    _complexcjg_times_vector(psi, ka3, chi);
    _vector_add(out->s1, acc.s1, psi);
    _vector_i_sub(out->s3, acc.s3, psi);
  }
}
int main(int argc,char *argv[]) { FILE *parameterfile=NULL; int c, j, is=0, ic=0; int x, X, y, Y, z, Z, t, tt, i, sum; char * filename = NULL; char datafilename[50]; char parameterfilename[50]; char conf_filename[50]; char * input_filename = NULL; double plaquette_energy, nrm; double * norm; struct stout_parameters params_smear; #ifdef _GAUGE_COPY int kb=0; #endif #ifdef MPI double atime=0., etime=0.; #endif #ifdef _KOJAK_INST #pragma pomp inst init #pragma pomp inst begin(main) #endif DUM_DERI = 6; /* DUM_DERI + 2 is enough (not 7) */ DUM_SOLVER = DUM_DERI+2; DUM_MATRIX = DUM_SOLVER+6; /* DUM_MATRIX + 2 is enough (not 6) */ NO_OF_SPINORFIELDS = DUM_MATRIX+2; verbose = 0; g_use_clover_flag = 0; g_nr_of_psf = 1; #ifdef MPI MPI_Init(&argc, &argv); #endif while ((c = getopt(argc, argv, "h?f:o:")) != -1) { switch (c) { case 'f': input_filename = calloc(200, sizeof(char)); strcpy(input_filename,optarg); break; case 'o': filename = calloc(200, sizeof(char)); strcpy(filename,optarg); break; case 'h': case '?': default: usage(); break; } } if(input_filename == NULL){ input_filename = "hmc.input"; } if(filename == NULL){ filename = "output"; } /* Read the input file */ read_input(input_filename); /* here we want no even/odd preconditioning */ even_odd_flag = 0; /* this DBW2 stuff is not needed for the inversion ! */ g_rgi_C1 = 0; if(Nsave == 0){ Nsave = 1; } tmlqcd_mpi_init(argc, argv); g_dbw2rand = 0; #ifndef MPI g_dbw2rand = 0; #endif #ifdef _GAUGE_COPY j = init_gauge_field(VOLUMEPLUSRAND, 1); #else j = init_gauge_field(VOLUMEPLUSRAND, 0); #endif if ( j!= 0) { fprintf(stderr, "Not enough memory for gauge_fields! Aborting...\n"); exit(-1); } j = init_geometry_indices(VOLUMEPLUSRAND); if ( j!= 0) { fprintf(stderr, "Not enough memory for geometry indices! 
Aborting...\n"); exit(-1); } if(even_odd_flag) { j = init_spinor_field(VOLUMEPLUSRAND/2, NO_OF_SPINORFIELDS); } else { j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS); } if ( j!= 0) { fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n"); exit(-1); } g_mu = g_mu1; if(g_proc_id == 0){ /*construct the filenames for the observables and the parameters*/ strcpy(datafilename,filename); strcat(datafilename,".data"); strcpy(parameterfilename,filename); strcat(parameterfilename,".para"); parameterfile=fopen(parameterfilename, "w"); write_first_messages(parameterfile, 0, 1); } /* define the geometry */ geometry(); /* define the boundary conditions for the fermion fields */ boundary(); #ifdef _USE_HALFSPINOR j = init_dirac_halfspinor(); if ( j!= 0) { fprintf(stderr, "Not enough memory for halffield! Aborting...\n"); exit(-1); } if(g_sloppy_precision_flag == 1) { j = init_dirac_halfspinor32(); if ( j!= 0) { fprintf(stderr, "Not enough memory for 32-Bit halffield! Aborting...\n"); exit(-1); } } # if (defined _PERSISTENT) init_xchange_halffield(); # endif #endif norm = (double*)calloc(3.*LX/2.+T/2., sizeof(double)); for(j=0;j<Nmeas; j++) { sprintf(conf_filename,"%s.%.4d", gauge_input_filename, nstore); if (g_proc_id == 0){ printf("Reading Gauge field from file %s\n", conf_filename); fflush(stdout); } read_lime_gauge_field(conf_filename); if (g_proc_id == 0){ printf("done!\n"); fflush(stdout); } #ifdef MPI xchange_gauge(); #endif #ifdef _GAUGE_COPY update_backward_gauge(); #endif /* Compute minimal eigenvalues, if wanted */ if(compute_evs != 0) { eigenvalues(&no_eigenvalues, 1000, eigenvalue_precision, 0, compute_evs, nstore, even_odd_flag); } /*compute the energy of the gauge field*/ plaquette_energy = measure_gauge_action(); if(g_proc_id == 0) { printf("The plaquette value is %e\n", plaquette_energy/(6.*VOLUME*g_nproc)); fflush(stdout); } if (use_stout_flag == 1){ params_smear.rho = stout_rho; params_smear.iterations = stout_no_iter; if 
(stout_smear((su3_tuple*)(g_gauge_field[0]), ¶ms_smear, (su3_tuple*)(g_gauge_field[0])) != 0) exit(1) ; g_update_gauge_copy = 1; g_update_gauge_energy = 1; g_update_rectangle_energy = 1; plaquette_energy = measure_gauge_action(); if (g_proc_id == 0) { printf("# The plaquette value after stouting is %e\n", plaquette_energy / (6.*VOLUME*g_nproc)); fflush(stdout); } } source_spinor_field(g_spinor_field[0], g_spinor_field[1], 0, 0); convert_eo_to_lexic(g_spinor_field[DUM_DERI], g_spinor_field[0], g_spinor_field[1]); D_psi(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]); if(even_odd_flag) { i = invert_eo(g_spinor_field[2], g_spinor_field[3], g_spinor_field[0], g_spinor_field[1], solver_precision, max_solver_iterations, solver_flag, g_relative_precision_flag, sub_evs_cg_flag, even_odd_flag); convert_eo_to_lexic(g_spinor_field[DUM_DERI+1], g_spinor_field[2], g_spinor_field[3]); } for(i = 0; i < 3*LX/2+T/2; i++){ norm[i] = 0.; } for(x = 0; x < LX; x++){ if(x > LX/2) X = LX-x; else X = x; for(y = 0; y < LY; y++){ if(y > LY/2) Y = LY-y; else Y = y; for(z = 0; z < LZ; z++){ if(z > LZ/2) Z = LZ-z; else Z = z; for(t = 0; t < T; t++){ if(t > T/2) tt = T - t; else tt = t; sum = X + Y + Z + tt; _spinor_norm_sq(nrm, g_spinor_field[DUM_DERI+1][ g_ipt[t][x][y][z] ]); /* _spinor_norm_sq(nrm, qprop[0][0][1][ g_ipt[t][x][y][z] ]); */ printf("%e %e\n", g_spinor_field[DUM_DERI+1][ g_ipt[t][x][y][z] ].s0.c0.re, g_spinor_field[DUM_DERI+1][ g_ipt[t][x][y][z] ].s0.c0.im); nrm = sqrt( nrm ); printf("%1.12e\n", nrm); if(nrm > norm[sum]) norm[sum] = nrm; } } } } for(i = 0; i < 3*L/2+T/2; i++){ printf("%d %1.12e\n", i, norm[i]); } printf("\n"); nstore+=Nsave; } #ifdef MPI MPI_Finalize(); #endif free_gauge_field(); free_geometry_indices(); free_spinor_field(); free_moment_field(); return(0); #ifdef _KOJAK_INST #pragma pomp inst end(main) #endif }
/*
 * Hopping_Matrix, time-slice variant with AVX / SSE kernels.
 *
 *   ieo : parity selector (0: even out - odd in, otherwise odd out -
 *         even in)
 *   l   : output spinor field
 *   k   : input  spinor field
 *
 * The outermost loop runs over the time slices x0 = 0..T-1. Each slice
 * is processed in two sweeps over its TEOSLICE sites:
 *   At: the two time-direction contributions (+0, -0); the +0 term is
 *       stored to l, the -0 term is added to it;
 *   Bt: the six space-direction contributions (+-1, +-2, +-3), each
 *       added to the value already stored in l.
 * With MPI (and without _NO_COMM) the border exchange for the slice is
 * opened before At and completed between At and Bt.
 *
 * The arithmetic exists only inside the AVX and SSE2/SSE3 branches; if
 * none of these macros is defined both site loops have no arithmetic
 * body. The _avx_* / _sse_* macros are called without destination
 * arguments, so they presumably pass intermediate values through fixed
 * registers; keep their order exactly as written.
 *
 * NOTE(review): icz, iz are never used; ix and jj are only written.
 * NOTE(review): REQC is defined only for PARALLELX, PARALLELXY or
 * PARALLELXYZ; with MPI defined and none of them set, the array
 * declarations below would not compile. Confirm against the build
 * configuration.
 */
void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
  int icx,icz,ioff;
  int ix,iz;
  int x0,icx0,jj;
  su3 *restrict up;
  su3 * restrict um;
  spinor * restrict sp;
  spinor * restrict sm;
  spinor * restrict rn;

  /* number of request/status slots handed to the exchange routines */
# if (defined MPI)
#  ifdef PARALLELX
#   define  REQC 4
#  elif defined PARALLELXY
#   define  REQC 8
#  elif defined PARALLELXYZ
#   define  REQC 12
#  endif
  MPI_Request requests[REQC];
  MPI_Status status[REQC];
# endif

#ifdef _GAUGE_COPY
  /* refresh the gauge copy if it has been flagged as stale */
  if(g_update_gauge_copy) {
    update_backward_gauge(g_gauge_field);
  }
#endif

  /* ioff is subtracted from the even/odd index to address l */
  if(ieo == 0){ /* even out - odd in */
    ioff = 0;
  }
  else{ /* odd out - even in */
    ioff = (VOLUME+RAND)/2;
  }

  /* Loop over time direction. This is the outermost loop */
  for(x0=0;x0<T;x0++){

    /* start the communication of the time-slice borders (non-blocking send and receive) */
#    if (defined MPI && !defined _NO_COMM)
    xchange_field_open(k, ieo, x0, requests, status);
#    endif

    /* loop over timeslice. At: contribution of timelike links */
    icx0=g_1st_eot[x0][ieo];
    jj =0;
    /* um starts one element before the first link of the slice so that
     * the first "up=um+1" below lands on that link.
     * NOTE(review): the original author already asked "allowed?" here;
     * forming a pointer one element before an object is not covered by
     * the C pointer-arithmetic rules. Confirm how g_gauge_field_copyt
     * is laid out. */
    um=&g_gauge_field_copyt[icx0][0]-1;
    for(icx = icx0; icx < icx0+TEOSLICE; icx++){
      rn=l+(icx-ioff);
      /*********************** direction +0 ************************/

      sp=k+g_iup_eo[icx][0]; /* all sp,sm,up,um could be moved up */
      up=um+1;

#if (defined AVX)
      /* +0 initialises rn: components (s0,s2) then (s1,s3) */
      _avx_load(sp->s0); _avx_load_up(sp->s2);
      _avx_vector_add();
      _avx_su3_multiply((*up));
      _avx_vector_cmplx_mul(ka0);
      _avx_store_up(rn->s0); _avx_store_up(rn->s2);

      _avx_load(sp->s1); _avx_load_up(sp->s3);
      _avx_vector_add();
      _avx_su3_multiply((*up));
      _avx_vector_cmplx_mul(ka0);
      _avx_store_up(rn->s1); _avx_store_up(rn->s3);

      /*********************** direction -0 ************************/

      sm=k+g_idn_eo[icx][0];
      um=up+1;

      _avx_load(sm->s0); _avx_load_up(sm->s2);
      _avx_vector_sub();
      _avx_su3_inverse_multiply((*um));
      _avx_vector_cmplxcg_mul(ka0);
      _avx_load(rn->s0); _avx_vector_add(); _avx_store(rn->s0);
      _avx_load(rn->s2); _avx_vector_sub(); _avx_store(rn->s2);

      _avx_load(sm->s1); _avx_load_up(sm->s3);
      _avx_vector_sub();
      _avx_su3_inverse_multiply((*um));
      _avx_vector_cmplxcg_mul(ka0);
      _avx_load(rn->s1); _avx_vector_add(); _avx_store(rn->s1);
      _avx_load(rn->s3); _avx_vector_sub(); _avx_store(rn->s3);

#elif (defined SSE2 || defined SSE3)
      /* +0 initialises rn: components (s0,s2) then (s1,s3) */
      _sse_load(sp->s0); _sse_load_up(sp->s2);
      _sse_vector_add();
      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(ka0);
      _sse_store_up(rn->s0); _sse_store_up(rn->s2);

      _sse_load(sp->s1); _sse_load_up(sp->s3);
      _sse_vector_add();
      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(ka0);
      _sse_store_up(rn->s1); _sse_store_up(rn->s3);

      /*********************** direction -0 ************************/

      sm=k+g_idn_eo[icx][0];
      um=up+1;

      _sse_load(sm->s0); _sse_load_up(sm->s2);
      _sse_vector_sub();
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(ka0);
      _sse_load(rn->s0); _sse_vector_add(); _sse_store(rn->s0);
      _sse_load(rn->s2); _sse_vector_sub(); _sse_store(rn->s2);

      _sse_load(sm->s1); _sse_load_up(sm->s3);
      _sse_vector_sub();
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(ka0);
      _sse_load(rn->s1); _sse_vector_add(); _sse_store(rn->s1);
      _sse_load(rn->s3); _sse_vector_sub(); _sse_store(rn->s3);
#endif

      jj++;
    } /* end of loop over timeslice (At)*/

    /* complete the communication of the time-slice borders (and wait) */
#if (defined MPI && !defined _NO_COMM)
    xchange_field_close(requests, status, REQC); /* MPI_Waitall */
#endif

    /* loop over timeslice. Bt: contribution of spacelike links */
    /* same "one before the first link" start as in the At sweep */
    um=&g_gauge_field_copys[icx0][0]-1;
    for(icx = icx0; icx < icx0+TEOSLICE; icx++){
      ix=g_eo2lexic[icx];
      rn=l+(icx-ioff);
      /*********************** direction +1 ************************/

      sp=k+g_iup_eo[icx][1];
      up=um+1;

#if (defined AVX)
      _avx_load(sp->s0); _avx_load_up(sp->s3);
      _avx_vector_i_mul(); _avx_vector_add();
      _avx_su3_multiply((*up));
      _avx_vector_cmplx_mul(ka1);
      _avx_load(rn->s0); _avx_vector_add(); _avx_store(rn->s0);
      _avx_load(rn->s3); _avx_vector_i_mul(); _avx_vector_sub(); _avx_store(rn->s3);

      _avx_load(sp->s1); _avx_load_up(sp->s2);
      _avx_vector_i_mul(); _avx_vector_add();
      _avx_su3_multiply((*up));
      _avx_vector_cmplx_mul(ka1);
      _avx_load(rn->s1); _avx_vector_add(); _avx_store(rn->s1);
      _avx_load(rn->s2); _avx_vector_i_mul(); _avx_vector_sub(); _avx_store(rn->s2);

      /*********************** direction -1 ************************/

      sm=k+g_idn_eo[icx][1];
      um=up+1;

      _avx_load(sm->s0); _avx_load_up(sm->s3);
      _avx_vector_i_mul(); _avx_vector_sub();
      _avx_su3_inverse_multiply((*um));
      _avx_vector_cmplxcg_mul(ka1);
      _avx_load(rn->s0); _avx_vector_add(); _avx_store(rn->s0);
      _avx_load(rn->s3); _avx_vector_i_mul(); _avx_vector_add(); _avx_store(rn->s3);

      _avx_load(sm->s1); _avx_load_up(sm->s2);
      _avx_vector_i_mul(); _avx_vector_sub();
      _avx_su3_inverse_multiply((*um));
      _avx_vector_cmplxcg_mul(ka1);
      _avx_load(rn->s1); _avx_vector_add(); _avx_store(rn->s1);
      _avx_load(rn->s2); _avx_vector_i_mul(); _avx_vector_add(); _avx_store(rn->s2);

      /*********************** direction +2 ************************/

      sp=k+g_iup_eo[icx][2];
      up=um+1;

      _avx_load(sp->s0); _avx_load_up(sp->s3);
      _avx_vector_add();
      _avx_su3_multiply((*up));
      _avx_vector_cmplx_mul(ka2);
      _avx_load(rn->s0); _avx_vector_add(); _avx_store(rn->s0);
      _avx_load(rn->s3); _avx_vector_add(); _avx_store(rn->s3);

      _avx_load(sp->s1); _avx_load_up(sp->s2);
      _avx_vector_sub();
      _avx_su3_multiply((*up));
      _avx_vector_cmplx_mul(ka2);
      _avx_load(rn->s1); _avx_vector_add(); _avx_store(rn->s1);
      _avx_load(rn->s2); _avx_vector_sub(); _avx_store(rn->s2);

      /*********************** direction -2 ************************/

      sm=k+g_idn_eo[icx][2];
      um=up+1;

      _avx_load(sm->s0); _avx_load_up(sm->s3);
      _avx_vector_sub();
      _avx_su3_inverse_multiply((*um));
      _avx_vector_cmplxcg_mul(ka2);
      _avx_load(rn->s0); _avx_vector_add(); _avx_store(rn->s0);
      _avx_load(rn->s3); _avx_vector_sub(); _avx_store(rn->s3);

      _avx_load(sm->s1); _avx_load_up(sm->s2);
      _avx_vector_add();
      _avx_su3_inverse_multiply((*um));
      _avx_vector_cmplxcg_mul(ka2);
      _avx_load(rn->s1); _avx_vector_add(); _avx_store(rn->s1);
      _avx_load(rn->s2); _avx_vector_add(); _avx_store(rn->s2);

      /*********************** direction +3 ************************/

      sp=k+g_iup_eo[icx][3];
      up=um+1;

      _avx_load(sp->s0); _avx_load_up(sp->s2);
      _avx_vector_i_mul(); _avx_vector_add();
      _avx_su3_multiply((*up));
      _avx_vector_cmplx_mul(ka3);
      _avx_load(rn->s0); _avx_vector_add(); _avx_store(rn->s0);
      _avx_load(rn->s2); _avx_vector_i_mul(); _avx_vector_sub(); _avx_store(rn->s2);

      _avx_load(sp->s1); _avx_load_up(sp->s3);
      _avx_vector_i_mul(); _avx_vector_sub();
      _avx_su3_multiply((*up));
      _avx_vector_cmplx_mul(ka3);
      _avx_load(rn->s1); _avx_vector_add(); _avx_store(rn->s1);
      _avx_load(rn->s3); _avx_vector_i_mul(); _avx_vector_add(); _avx_store(rn->s3);

      /*********************** direction -3 ************************/

      sm=k+g_idn_eo[icx][3];
      um=up+1;

      _avx_load(sm->s0); _avx_load_up(sm->s2);
      _avx_vector_i_mul(); _avx_vector_sub();
      _avx_su3_inverse_multiply((*um));
      _avx_vector_cmplxcg_mul(ka3);
      _avx_load(rn->s0); _avx_vector_add(); _avx_store(rn->s0);
      _avx_load(rn->s2); _avx_vector_i_mul(); _avx_vector_add(); _avx_store(rn->s2);

      _avx_load(sm->s1); _avx_load_up(sm->s3);
      _avx_vector_i_mul(); _avx_vector_add();
      _avx_su3_inverse_multiply((*um));
      _avx_vector_cmplxcg_mul(ka3);
      _avx_load(rn->s1); _avx_vector_add(); _avx_store(rn->s1);
      _avx_load(rn->s3); _avx_vector_i_mul(); _avx_vector_sub(); _avx_store(rn->s3);

#elif (defined SSE2 || defined SSE3)
      _sse_load(sp->s0); _sse_load_up(sp->s3);
      _sse_vector_i_mul(); _sse_vector_add();
      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(ka1);
      _sse_load(rn->s0); _sse_vector_add(); _sse_store(rn->s0);
      _sse_load(rn->s3); _sse_vector_i_mul(); _sse_vector_sub(); _sse_store(rn->s3);

      _sse_load(sp->s1); _sse_load_up(sp->s2);
      _sse_vector_i_mul(); _sse_vector_add();
      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(ka1);
      _sse_load(rn->s1); _sse_vector_add(); _sse_store(rn->s1);
      _sse_load(rn->s2); _sse_vector_i_mul(); _sse_vector_sub(); _sse_store(rn->s2);

      /*********************** direction -1 ************************/

      sm=k+g_idn_eo[icx][1];
      um=up+1;

      _sse_load(sm->s0); _sse_load_up(sm->s3);
      _sse_vector_i_mul(); _sse_vector_sub();
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(ka1);
      _sse_load(rn->s0); _sse_vector_add(); _sse_store(rn->s0);
      _sse_load(rn->s3); _sse_vector_i_mul(); _sse_vector_add(); _sse_store(rn->s3);

      _sse_load(sm->s1); _sse_load_up(sm->s2);
      _sse_vector_i_mul(); _sse_vector_sub();
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(ka1);
      _sse_load(rn->s1); _sse_vector_add(); _sse_store(rn->s1);
      _sse_load(rn->s2); _sse_vector_i_mul(); _sse_vector_add(); _sse_store(rn->s2);

      /*********************** direction +2 ************************/

      sp=k+g_iup_eo[icx][2];
      up=um+1;

      _sse_load(sp->s0); _sse_load_up(sp->s3);
      _sse_vector_add();
      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(ka2);
      _sse_load(rn->s0); _sse_vector_add(); _sse_store(rn->s0);
      _sse_load(rn->s3); _sse_vector_add(); _sse_store(rn->s3);

      _sse_load(sp->s1); _sse_load_up(sp->s2);
      _sse_vector_sub();
      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(ka2);
      _sse_load(rn->s1); _sse_vector_add(); _sse_store(rn->s1);
      _sse_load(rn->s2); _sse_vector_sub(); _sse_store(rn->s2);

      /*********************** direction -2 ************************/

      sm=k+g_idn_eo[icx][2];
      um=up+1;

      _sse_load(sm->s0); _sse_load_up(sm->s3);
      _sse_vector_sub();
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(ka2);
      _sse_load(rn->s0); _sse_vector_add(); _sse_store(rn->s0);
      _sse_load(rn->s3); _sse_vector_sub(); _sse_store(rn->s3);

      _sse_load(sm->s1); _sse_load_up(sm->s2);
      _sse_vector_add();
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(ka2);
      _sse_load(rn->s1); _sse_vector_add(); _sse_store(rn->s1);
      _sse_load(rn->s2); _sse_vector_add(); _sse_store(rn->s2);

      /*********************** direction +3 ************************/

      sp=k+g_iup_eo[icx][3];
      up=um+1;

      _sse_load(sp->s0); _sse_load_up(sp->s2);
      _sse_vector_i_mul(); _sse_vector_add();
      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(ka3);
      _sse_load(rn->s0); _sse_vector_add(); _sse_store(rn->s0);
      _sse_load(rn->s2); _sse_vector_i_mul(); _sse_vector_sub(); _sse_store(rn->s2);

      _sse_load(sp->s1); _sse_load_up(sp->s3);
      _sse_vector_i_mul(); _sse_vector_sub();
      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(ka3);
      _sse_load(rn->s1); _sse_vector_add(); _sse_store(rn->s1);
      _sse_load(rn->s3); _sse_vector_i_mul(); _sse_vector_add(); _sse_store(rn->s3);

      /*********************** direction -3 ************************/

      sm=k+g_idn_eo[icx][3];
      um=up+1;

      _sse_load(sm->s0); _sse_load_up(sm->s2);
      _sse_vector_i_mul(); _sse_vector_sub();
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(ka3);
      _sse_load(rn->s0); _sse_vector_add(); _sse_store(rn->s0);
      _sse_load(rn->s2); _sse_vector_i_mul(); _sse_vector_add(); _sse_store(rn->s2);

      _sse_load(sm->s1); _sse_load_up(sm->s3);
      _sse_vector_i_mul(); _sse_vector_add();
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(ka3);
      _sse_load(rn->s1); _sse_vector_add(); _sse_store(rn->s1);
      _sse_load(rn->s3); _sse_vector_i_mul(); _sse_vector_sub(); _sse_store(rn->s3);
#endif
    }  /* end of loop over timeslice (Bt)*/
  } /* x0=0; x0<T */
}
/*
 * jdher_bi -- block Jacobi-Davidson iteration for a hermitian operator
 * acting on bispinor fields.
 *
 * A search space V (at most jmax columns) is built up; in every outer
 * iteration the projected matrix M = V^dagger*A*V is diagonalised with
 * zheev, the leading actblksize Ritz pairs are tested against tol, accepted
 * pairs are moved to Q/lambda, and for each remaining pair a correction
 * equation is solved approximately to extend V.
 *
 * Parameters
 *   n          number of complex entries of each vector that are used
 *   lda        distance (in complex entries) between consecutive columns of
 *              V0, Q and the internal work matrices
 *   tau        shift used in the correction equation as long as the Ritz
 *              value is not used instead; with shift_mode == 1 it is moved
 *              to the most extreme eigenvalue accepted so far
 *   tol        a Ritz pair counts as converged when its residual norm is
 *              below tol
 *   kmax       number of eigenpairs after which the iteration stops
 *   jmax       maximal search space dimension (triggers a restart)
 *   jmin       search space dimension right after a restart
 *   itmax      maximal number of outer iterations
 *   blksize    number of Ritz pairs treated per outer iteration
 *   blkwise    1: accept only when the whole block converged,
 *              0: accept converged pairs individually
 *   V0dim, V0  number of / storage for user supplied start vectors
 *   solver_flag  BICGSTAB or CG; any other value falls back to
 *              bicgstab_complex_bi
 *   linitmax   iteration limit handed to the inner solver
 *   eps_tr     residual norm below which the Ritz value may replace tau
 *   toldecay   inner tolerance is toldecay^(-solvestep)
 *   verbosity  > 1 prints the parameter header, >= 1 prints the statistics
 *   k_conv     in: number of eigenpairs already present in Q (if > 0),
 *              out: number of eigenpairs found
 *   Q, lambda  receive eigenvectors (columns, stride lda) and eigenvalues
 *   it         out: number of outer iterations performed
 *   maxmin     1: maximal eigenvalues, otherwise minimal
 *   shift_mode 1: adapt tau to the accepted eigenvalues
 *   A_psi      routine applying the operator: A_psi(out, in)
 *
 * Invalid parameters and allocation failures are reported through
 * jderrorhandler().
 */
void jdher_bi(int n, int lda, double tau, double tol,
              int kmax, int jmax, int jmin, int itmax,
              int blksize, int blkwise,
              int V0dim, complex *V0,
              int solver_flag,
              int linitmax, double eps_tr, double toldecay,
              int verbosity,
              int *k_conv, complex *Q, double *lambda, int *it,
              int maxmin, const int shift_mode,
              matrix_mult_bi A_psi){

  /****************************************************************************
   *                                                                          *
   * Local variables                                                          *
   *                                                                          *
   ****************************************************************************/

  /* allocatables:
   * initialize with NULL, so we can free even unallocated ptrs */
  double *s = NULL, *resnrm = NULL, *resnrm_old = NULL, *dtemp = NULL, *rwork = NULL;

  complex *V_ = NULL, *V, *Vtmp = NULL, *U = NULL, *M = NULL,
    *Res_ = NULL, *Res, *eigwork = NULL, *temp1_ = NULL, *temp1;

  int *idx1 = NULL, *idx2 = NULL,
    *convind = NULL, *keepind = NULL, *solvestep = NULL,
    *actcorrits = NULL;

  /* non-allocated ptrs */
  complex *q, *v, *u, *r = NULL;

  /* scalar vars */
  double theta, alpha, it_tol;

  int i, k, j, actblksize, eigworklen, found, conv, keep, n2,
    N = n*sizeof(complex)/sizeof(bispinor);
  int act, cnt, idummy, info, CntCorrIts=0, endflag=0;

  /* variables for random number generator */
  int IDIST = 1;
  int ISEED[4] = {2, 3, 5, 7};
  ISEED[0] = g_proc_id;

  /****************************************************************************
   *                                                                          *
   * Of course on the CRAY everything is different :( !!                      *
   * that's why we need something more.                                       *
   *                                                                          *
   ****************************************************************************/

#ifdef CRAY
  fupl_u = _cptofcd(cupl_u, strlen(cupl_u));
  fupl_c = _cptofcd(cupl_c, strlen(cupl_c));
  fupl_n = _cptofcd(cupl_n, strlen(cupl_n));
  fupl_a = _cptofcd(cupl_a, strlen(cupl_a));
  fupl_v = _cptofcd(cupl_v, strlen(cupl_v));
  filaenv = _cptofcd(cilaenv, strlen(cilaenv));
  fvu = _cptofcd(cvu, strlen(cvu));
#endif

  /****************************************************************************
   *                                                                          *
   * Execution starts here...                                                 *
   *                                                                          *
   ****************************************************************************/

  /* NEW PART FOR GAUGE_COPY */
#ifdef _GAUGE_COPY
  update_backward_gauge();
#endif
  /* END NEW PART */

  /* print info header */
  if (verbosity > 1 && g_proc_id == 0) {
    printf("Jacobi-Davidson method for hermitian Matrices\n");
    printf("Solving A*x = lambda*x \n\n");
    printf(" N= %10d ITMAX=%4d\n", n, itmax);
    printf(" KMAX=%3d JMIN=%3d JMAX=%3d V0DIM=%3d\n", kmax, jmin, jmax, V0dim);
    printf(" BLKSIZE= %2d BLKWISE= %5s\n", blksize, blkwise ? "TRUE" : "FALSE");
    printf(" TOL= %11.4e TAU= %11.4e\n", tol, tau);
    printf(" LINITMAX= %5d EPS_TR= %10.3e TOLDECAY=%9.2e\n", linitmax, eps_tr, toldecay);
    printf("\n Computing %s eigenvalues\n", maxmin ? "maximal" : "minimal");
    printf("\n");
    fflush( stdout );
  }

  /* validate input parameters */
  if(tol <= 0) jderrorhandler(401,"");
  if(kmax <= 0 || kmax > n) jderrorhandler(402,"");
  if(jmax <= 0 || jmax > n) jderrorhandler(403,"");
  if(jmin <= 0 || jmin > jmax) jderrorhandler(404,"");
  if(itmax < 0) jderrorhandler(405,"");
  if(blksize > jmin || blksize > (jmax - jmin)) jderrorhandler(406,"");
  if(blksize <= 0 || blksize > kmax) jderrorhandler(406,"");
  if(blkwise < 0 || blkwise > 1) jderrorhandler(407,"");
  if(V0dim < 0 || V0dim >= jmax) jderrorhandler(408,"");
  if(linitmax < 0) jderrorhandler(409,"");
  if(eps_tr < 0.) jderrorhandler(500,"");
  if(toldecay <= 1.0) jderrorhandler(501,"");

  CONE.re=1.; CONE.im=0.;
  CZERO.re=0.; CZERO.im=0.;
  CMONE.re=-1.; CMONE.im=0.;

  /* Get hardware-dependent values:
   * Opt size of workspace for ZHEEV is (NB+1)*j, where NB is the opt.
   * block size... */
  eigworklen = (2 + _FT(ilaenv)(&ONE, filaenv, fvu, &jmax, &MONE, &MONE, &MONE, 6, 2)) * jmax;

  /* Allocating memory for matrices & vectors.
   * The "+4" buffers leave room for the manual alignment done below. */
  if((void*)(V_ = (complex *)malloc((lda * jmax + 4) * sizeof(complex))) == NULL) {
    errno = 0;
    jderrorhandler(300,"V in jdher_bi");
  }
#if (defined SSE || defined SSE2 || defined SSE3)
  V = (complex*)(((unsigned long int)(V_)+ALIGN_BASE)&~ALIGN_BASE);
#else
  V = V_;
#endif
  if((void*)(U = (complex *)malloc(jmax * jmax * sizeof(complex))) == NULL) {
    jderrorhandler(300,"U in jdher_bi");
  }
  if((void*)(s = (double *)malloc(jmax * sizeof(double))) == NULL) {
    jderrorhandler(300,"s in jdher_bi");
  }
  if((void*)(Res_ = (complex *)malloc((lda * blksize+4) * sizeof(complex))) == NULL) {
    jderrorhandler(300,"Res in jdher_bi");
  }
#if (defined SSE || defined SSE2 || defined SSE3)
  Res = (complex*)(((unsigned long int)(Res_)+ALIGN_BASE)&~ALIGN_BASE);
#else
  Res = Res_;
#endif
  /* resnrm is copied into resnrm_old before it is first written, so it has
   * to start out zeroed instead of holding indeterminate values. */
  if((void*)(resnrm = (double *)calloc(blksize, sizeof(double))) == NULL) {
    jderrorhandler(300,"resnrm in jdher_bi");
  }
  if((void*)(resnrm_old = (double *)calloc(blksize,sizeof(double))) == NULL) {
    jderrorhandler(300,"resnrm_old in jdher_bi");
  }
  if((void*)(M = (complex *)malloc(jmax * jmax * sizeof(complex))) == NULL) {
    jderrorhandler(300,"M in jdher_bi");
  }
  if((void*)(Vtmp = (complex *)malloc(jmax * jmax * sizeof(complex))) == NULL) {
    jderrorhandler(300,"Vtmp in jdher_bi");
  }
  if((void*)(p_work_bi = (complex *)malloc(lda * sizeof(complex))) == NULL) {
    jderrorhandler(300,"p_work_bi in jdher_bi");
  }

  /* ... */
  if((void*)(idx1 = (int *)malloc(jmax * sizeof(int))) == NULL) {
    jderrorhandler(300,"idx1 in jdher_bi");
  }
  if((void*)(idx2 = (int *)malloc(jmax * sizeof(int))) == NULL) {
    jderrorhandler(300,"idx2 in jdher_bi");
  }

  /* Indices for (non-)converged approximations */
  if((void*)(convind = (int *)malloc(blksize * sizeof(int))) == NULL) {
    jderrorhandler(300,"convind in jdher_bi");
  }
  if((void*)(keepind = (int *)malloc(blksize * sizeof(int))) == NULL) {
    jderrorhandler(300,"keepind in jdher_bi");
  }
  if((void*)(solvestep = (int *)malloc(blksize * sizeof(int))) == NULL) {
    jderrorhandler(300,"solvestep in jdher_bi");
  }
  if((void*)(actcorrits = (int *)malloc(blksize * sizeof(int))) == NULL) {
    jderrorhandler(300,"actcorrits in jdher_bi");
  }

  if((void*)(eigwork = (complex *)malloc(eigworklen * sizeof(complex))) == NULL) {
    jderrorhandler(300,"eigwork in jdher_bi");
  }
  if((void*)(rwork = (double *)malloc(3*jmax * sizeof(double))) == NULL) {
    jderrorhandler(300,"rwork in jdher_bi");
  }
  if((void*)(temp1_ = (complex *)malloc((lda+4) * sizeof(complex))) == NULL) {
    jderrorhandler(300,"temp1 in jdher_bi");
  }
#if (defined SSE || defined SSE2 || defined SSE3)
  temp1 = (complex*)(((unsigned long int)(temp1_)+ALIGN_BASE)&~ALIGN_BASE);
#else
  temp1 = temp1_;
#endif
  if((void*)(dtemp = (double *)malloc(lda * sizeof(complex))) == NULL) {
    jderrorhandler(300,"dtemp in jdher_bi");
  }

  /* Set variables for Projection routines */
  n2 = 2*n;
  p_n = n;
  p_n2 = n2;
  p_Q_bi = Q;
  p_A_psi_bi = A_psi;
  p_lda = lda;

  /**************************************************************************
   *                                                                        *
   * Generate initial search subspace V. Vectors are taken from V0 and if   *
   * necessary randomly generated.                                          *
   *                                                                        *
   **************************************************************************/

  /* copy V0 to V */
  _FT(zlacpy)(fupl_a, &n, &V0dim, V0, &lda, V, &lda, 1);
  j = V0dim;
  /* if V0dim < blksize: generate additional random vectors */
  if (V0dim < blksize) {
    idummy = (blksize - V0dim)*n; /* nof random numbers */
    _FT(zlarnv)(&IDIST, ISEED, &idummy, V + V0dim*lda);
    j = blksize;
  }
  /* orthonormalise the start vectors */
  for (cnt = 0; cnt < j; cnt ++) {
    ModifiedGS_bi(V + cnt*lda, n, cnt, V, lda);
    alpha = sqrt(square_norm_bi((bispinor*)(V+cnt*lda), N));
    alpha = 1.0 / alpha;
    _FT(dscal)(&n2, &alpha, (double *)(V + cnt*lda), &ONE);
  }

  /* Generate interaction matrix M = V^dagger*A*V. Only the upper triangle
     is computed. */
  for (cnt = 0; cnt < j; cnt++){
    A_psi((bispinor*) temp1, (bispinor*) (V+cnt*lda));
    idummy = cnt+1;
    for(i = 0; i < idummy; i++) {
      M[cnt*jmax+i] = scalar_prod_bi((bispinor*)(V+i*lda), (bispinor*) temp1, N);
    }
  }

  /* Other initializations */
  k = 0; (*it) = 0;
  if((*k_conv) > 0) {
    k = (*k_conv);
  }

  actblksize = blksize;
  for(act = 0; act < blksize; act ++){
    solvestep[act] = 1;
  }

  /****************************************************************************
   *                                                                          *
   * Main JD-iteration loop                                                   *
   *                                                                          *
   ****************************************************************************/

  while((*it) < itmax) {

    /****************************************************************************
     *                                                                          *
     * Solving the projected eigenproblem                                       *
     *                                                                          *
     * M*u = V^dagger*A*V*u = s*u                                               *
     * M is hermitian, only the upper triangle is stored                        *
     *                                                                          *
     ****************************************************************************/
    _FT(zlacpy)(fupl_u, &j, &j, M, &jmax, U, &jmax, 1);
    _FT(zheev)(fupl_v, fupl_u, &j, U, &jmax, s, eigwork, &eigworklen, rwork, &info, 1, 1);

    if (info != 0) {
      printf("error solving the projected eigenproblem.");
      printf(" zheev: info = %d\n", info);
      jderrorhandler(502,"problem in zheev for jdher_bi");
    }

    /* Reverse order of eigenvalues if maximal value is needed */
    if(maxmin == 1){
      sorteig(j, s, U, jmax, s[j-1], dtemp, idx1, idx2, 0);
    }
    else{
      sorteig(j, s, U, jmax, 0., dtemp, idx1, idx2, 0);
    }

    /****************************************************************************
     *                                                                          *
     * Convergence/Restart Check                                                *
     *                                                                          *
     * In case of convergence, strip off a whole block or just the converged    *
     * ones and put 'em into Q. Update the matrices Q, V, U, s                  *
     *                                                                          *
     * In case of a restart update the V, U and M matrices and recompute the    *
     * Eigenvectors                                                             *
     *                                                                          *
     ****************************************************************************/

    found = 1;
    while(found) {

      /* conv/keep = Number of converged/non-converged Approximations */
      conv = 0; keep = 0;

      for(act=0; act < actblksize; act++){

        /* Setting pointers for single vectors */
        q = Q + (act+k)*lda;
        u = U + act*jmax;
        r = Res + act*lda;

        /* Compute Ritz-Vector Q[:,k+cnt1]=V*U[:,cnt1] */
        theta = s[act];
        _FT(zgemv)(fupl_n, &n, &j, &CONE, V, &lda, u, &ONE, &CZERO, q, &ONE, 1);

        /* Compute the residual */
        A_psi((bispinor*) r, (bispinor*) q);
        theta = -theta;
        _FT(daxpy)(&n2, &theta, (double*) q, &ONE, (double*) r, &ONE);

        /* Compute norm of the residual and update arrays convind/keepind*/
        resnrm_old[act] = resnrm[act];
        resnrm[act] = sqrt(square_norm_bi((bispinor*) r, N));
        if (resnrm[act] < tol){
          convind[conv] = act;
          conv = conv + 1;
        }
        else{
          keepind[keep] = act;
          keep = keep + 1;
        }

      } /* for(act = 0; act < actblksize; act ++) */

      /* Check whether the blkwise-mode is chosen and ALL the
         approximations converged, or whether the strip-off mode is
         active and SOME of the approximations converged */

      found = ((blkwise==1 && conv==actblksize) || (blkwise==0 && conv!=0))
        && (j > actblksize || k == kmax - actblksize);

      /***************************************************************************
       *                                                                         *
       * Convergence Case                                                        *
       *                                                                         *
       * In case of convergence, strip off a whole block or just the converged   *
       * ones and put 'em into Q. Update the matrices Q, V, U, s                 *
       *                                                                         *
       **************************************************************************/

      if (found) {

        /* Store Eigenvalues */
        for(act = 0; act < conv; act++)
          lambda[k+act] = s[convind[act]];

        /* Re-use non approximated Ritz-Values */
        for(act = 0; act < keep; act++)
          s[act] = s[keepind[act]];

        /* Shift the others in the right position */
        for(act = 0; act < (j-actblksize); act ++)
          s[act+keep] = s[act+actblksize];

        /* Update V. Re-use the V-Vectors not looked at yet. */
        idummy = j - actblksize;
        for (act = 0; act < n; act = act + jmax) {
          cnt = act + jmax > n ? n-act : jmax;
          _FT(zlacpy)(fupl_a, &cnt, &j, V+act, &lda, Vtmp, &jmax, 1);
          _FT(zgemm)(fupl_n, fupl_n, &cnt, &idummy, &j, &CONE, Vtmp,
                     &jmax, U+actblksize*jmax, &jmax, &CZERO, V+act+keep*lda, &lda, 1, 1);
        }

        /* Insert the not converged approximations as first columns in V */
        for(act = 0; act < keep; act++){
          _FT(zlacpy)(fupl_a,&n,&ONE,Q+(k+keepind[act])*lda,&lda,V+act*lda,&lda,1);
        }

        /* Store Eigenvectors */
        for(act = 0; act < conv; act++){
          _FT(zlacpy)(fupl_a,&n,&ONE,Q+(k+convind[act])*lda,&lda,Q+(k+act)*lda,&lda,1);
        }

        /* Update SearchSpaceSize j */
        j = j - conv;

        /* Let M become a diagonalmatrix with the Ritzvalues as entries ... */
        _FT(zlaset)(fupl_u, &j, &j, &CZERO, &CZERO, M, &jmax, 1);
        for (act = 0; act < j; act++){
          M[act*jmax + act].re = s[act];
        }

        /* ... and U the Identity(jnew,jnew) */
        _FT(zlaset)(fupl_a, &j, &j, &CZERO, &CONE, U, &jmax, 1);

        /* Move the shift to the most extreme eigenvalue accepted so far */
        if(shift_mode == 1){
          if(maxmin == 0){
            for(act = 0; act < conv; act ++){
              if (lambda[k+act] > tau){
                tau = lambda[k+act];
              }
            }
          }
          else{
            for(act = 0; act < conv; act ++){
              if (lambda[k+act] < tau){
                tau = lambda[k+act];
              }
            }
          }
        }

        /* Update Converged-Eigenpair-counter and Pro_k */
        k = k + conv;

        /* Update the new blocksize */
        actblksize=min(blksize, kmax-k);

        /* Exit main iteration loop when kmax eigenpairs have been
           approximated */
        if (k == kmax){
          endflag = 1;
          break;
        }

        /* Counter for the linear-solver-accuracy */
        for(act = 0; act < keep; act++)
          solvestep[act] = solvestep[keepind[act]];

        /* Now we expect to have the next eigenvalues */
        /* allready with some accuracy                */
        /* So we do not need to start from scratch... */
        for(act = keep; act < blksize; act++)
          solvestep[act] = 1;

      } /* if(found) */

      if(endflag == 1){
        break;
      }

      /**************************************************************************
       *                                                                        *
       * Restart                                                                *
       *                                                                        *
       * The Eigenvector-Approximations corresponding to the first jmin         *
       * Ritz-Vectors are kept when the search space would exceed jmax.         *
       *                                                                        *
       **************************************************************************/
      if (j+actblksize > jmax) {

        idummy = j; j = jmin;

        for (act = 0; act < n; act = act + jmax) { /* V = V * U(:,1:j) */
          cnt = act+jmax > n ? n-act : jmax;
          _FT(zlacpy)(fupl_a, &cnt, &idummy, V+act, &lda, Vtmp, &jmax, 1);
          _FT(zgemm)(fupl_n, fupl_n, &cnt, &j, &idummy, &CONE, Vtmp,
                     &jmax, U, &jmax, &CZERO, V+act, &lda, 1, 1);
        }

        _FT(zlaset)(fupl_a, &j, &j, &CZERO, &CONE, U, &jmax, 1);
        _FT(zlaset)(fupl_u, &j, &j, &CZERO, &CZERO, M, &jmax, 1);
        for (act = 0; act < j; act++)
          M[act*jmax + act].re = s[act];
      }

    } /* while(found) */

    if(endflag == 1){
      break;
    }

    /****************************************************************************
     *                                                                          *
     * Solving the correction equations                                         *
     *                                                                          *
     ****************************************************************************/

    /* Solve actblksize times the correction equation ... */
    for (act = 0; act < actblksize; act ++) {

      /* Setting start-value for vector v as zeros(n,1). Guarantees
         orthogonality */
      v = V + j*lda;
      for (cnt = 0; cnt < n; cnt ++){
        v[cnt].re = 0.;
        v[cnt].im = 0.;
      }

      /* Adaptive accuracy and shift for the lin.solver. In case the
         residual is big, we don't need a too precise solution for the
         correction equation, since even in exact arithmetic the
         solution wouldn't be too usefull for the Eigenproblem. */
      r = Res + act*lda;

      if (resnrm[act] < eps_tr && resnrm[act] < s[act] && resnrm_old[act] > resnrm[act]){
        p_theta = s[act];
      }
      else{
        p_theta = tau;
      }
      p_k = k + actblksize;

      /* if we are in blockwise mode, we do not want to */
      /* iterate solutions much more, if they have      */
      /* allready the desired precision                 */
      if(blkwise == 1 && resnrm[act] < tol) {
        it_tol = pow(toldecay, (double)(-5));
      }
      else {
        it_tol = pow(toldecay, (double)(-solvestep[act]));
      }
      solvestep[act] = solvestep[act] + 1;

      /* equation and project if necessary */
      ModifiedGS_bi(r, n, k + actblksize, Q, lda);

      /* for(i=0;i<n;i++){ */
      /*   r[i].re*=-1.; */
      /*   r[i].im*=-1.; */
      /* } */
      g_sloppy_precision = 1;

      /* Solve the correction equation ... */
      if (solver_flag == BICGSTAB){
        info = bicgstab_complex_bi((bispinor*) v, (bispinor*) r, linitmax,
                                   it_tol*it_tol, g_relative_precision_flag,
                                   VOLUME/2, &Proj_A_psi_bi);
      }
      else if(solver_flag == CG){
        info = cg_her_bi((bispinor*) v, (bispinor*) r, linitmax,
                         it_tol*it_tol, g_relative_precision_flag,
                         VOLUME/2, &Proj_A_psi_bi);
      }
      else{
        info = bicgstab_complex_bi((bispinor*) v, (bispinor*) r, linitmax,
                                   it_tol*it_tol, g_relative_precision_flag,
                                   VOLUME/2, &Proj_A_psi_bi);
      }
      g_sloppy_precision = 0;

      /* Actualizing profiling data */
      if (info == -1){
        CntCorrIts += linitmax;
      }
      else{
        CntCorrIts += info;
      }
      actcorrits[act] = info;

      /* orthonormalize v to Q, cause the implicit
         orthogonalization in the solvers may be too inaccurate. Then
         apply "IteratedCGS" to prevent numerical breakdown
         in order to orthogonalize v to V */

      ModifiedGS_bi(v, n, k+actblksize, Q, lda);
      IteratedClassicalGS_bi(v, &alpha, n, j, V, temp1, lda);

      alpha = 1.0 / alpha;
      _FT(dscal)(&n2, &alpha, (double*) v, &ONE);

      /* update interaction matrix M */
      A_psi((bispinor*) temp1, (bispinor*) v);
      idummy = j+1;
      for(i = 0; i < idummy; i++){
        M[j*jmax+i] = scalar_prod_bi((bispinor*) (V+i*lda), (bispinor*) temp1, N);
      }

      /* Increasing SearchSpaceSize j */
      j ++;
    } /* for (act = 0;act < actblksize; act ++) */

    /* Print information line */
    if(g_proc_id == 0) {
      print_status(verbosity, *it, k, j - blksize, kmax, blksize, actblksize,
                   s, resnrm, actcorrits);
    }

    /* Increase iteration-counter for outer loop */
    (*it) = (*it) + 1;

  } /* Main iteration loop */

  /******************************************************************
   *                                                                *
   * Eigensolutions converged or iteration limit reached            *
   *                                                                *
   * Print statistics. Free memory. Return.                         *
   *                                                                *
   ******************************************************************/

  *k_conv = k;
  if (verbosity >= 1) {
    if(g_proc_id == 0) {
      printf("\nJDHER execution statistics\n\n");
      /* no outer iteration may have been done: avoid dividing by zero */
      printf("IT_OUTER=%d IT_INNER_TOT=%d IT_INNER_AVG=%8.2f\n",
             (*it), CntCorrIts, (*it) > 0 ? (double)CntCorrIts/(*it) : 0.);
      printf("\nConverged eigensolutions in order of convergence:\n");
      printf("\n I LAMBDA(I) RES(I)\n");
      printf("---------------------------------------\n");
    }

    /* Use the start of Res as scratch space for the residuals; r is still
     * NULL here when the main loop was never entered (itmax == 0). */
    r = Res;
    for (act = 0; act < *k_conv; act ++) {
      /* Compute the residual for solution act */
      q = Q + act*lda;
      theta = -lambda[act];
      A_psi((bispinor*) r, (bispinor*) q);
      _FT(daxpy)(&n2, &theta, (double*) q, &ONE, (double*) r, &ONE);
      alpha = sqrt(square_norm_bi((bispinor*) r, N));
      if(g_proc_id == 0) {
        printf("%3d %22.15e %12.5e\n", act+1, lambda[act], alpha);
      }
    }
    if(g_proc_id == 0) {
      printf("\n");
      fflush( stdout );
    }
  }

  free(V_); free(Vtmp); free(U);
  free(s); free(Res_);
  free(resnrm); free(resnrm_old);
  free(M);
  free(eigwork); free(temp1_);
  free(dtemp); free(rwork);
  free(p_work_bi);
  p_work_bi = NULL; /* not a local of this function: do not leave it dangling */
  free(idx1); free(idx2);
  free(convind); free(keepind); free(solvestep); free(actcorrits);

} /* jdher_bi(.....) */
/*
 * Hopping_Matrix -- SSE-macro variant working on half spinor buffers.
 *
 *   ieo  selects which tables are used: the first pass takes the gauge
 *        links from g_gauge_field_copy[0] for ieo == 0 and from
 *        g_gauge_field_copy[1] otherwise (the second pass uses the other
 *        one); the half spinor pointer tables are NBPointer[ieo] and
 *        NBPointer[2 + ieo].
 *   l    output field, VOLUME/2 spinors are written.
 *   k    input field, VOLUME/2 spinors are read.
 *
 * The routine makes two passes over VOLUME/2 sites:
 *   1. For every input spinor eight half spinors are written through phi[],
 *      one per direction +0,-0,...,+3,-3. For the "+" directions the
 *      combined vector is multiplied by the link *U and by ka0..ka3 before
 *      it is stored; for the "-" directions it is stored directly.
 *   2. After the (optional) half field exchange the eight half spinors
 *      belonging to each output site are summed up in rs; here the "-"
 *      directions get the inverse link multiplication and the conjugate
 *      ka factor. The last direction stores straight into the output.
 *
 * NOTE(review): the _sse_* macros are defined elsewhere and take no
 * destination operand, so they presumably communicate through implicit
 * SSE register state -- do not reorder statements inside a direction
 * block without checking the macro definitions.
 */
void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
  int ix, i;
  su3 * restrict U ALIGN;
  /* NOTE(review): static storage means this accumulator is shared by all
     calls, i.e. the function is not reentrant -- confirm this is intended. */
  static spinor rs;
  spinor * restrict s ALIGN;
  halfspinor ** phi ALIGN;
  /* how many links ahead of the current one are prefetched */
#if defined OPTERON
  const int predist=2;
#else
  const int predist=1;
#endif
#ifdef _KOJAK_INST
#pragma pomp inst begin(hoppingmatrix)
#endif
#ifdef _GAUGE_COPY
  if(g_update_gauge_copy) {
    update_backward_gauge();
  }
#endif
  /* We will run through the source vector now */
  /* instead of the solution vector */
  s = k;
  _prefetch_spinor(s);
  if(ieo == 0) {
    U = g_gauge_field_copy[0][0];
  }
  else {
    U = g_gauge_field_copy[1][0];
  }
  phi = NBPointer[ieo];
  _prefetch_su3(U);
  /**************** loop over all lattice sites ******************/
  /* ix advances by 8 per site (one slot per direction), U by 4 (one link
     per "+" direction), s by 1. */
  ix=0;
  for(i = 0; i < (VOLUME)/2; i++){
    /*********************** direction +0 ************************/
    _prefetch_su3(U+predist);
    _sse_load((*s).s0);
    _sse_load_up((*s).s2);
    _sse_vector_add();
    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka0);
    _sse_store_nt_up((*phi[ix]).s0);
    _sse_load((*s).s1);
    _sse_load_up((*s).s3);
    _sse_vector_add();
    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka0);
    _sse_store_nt_up((*phi[ix]).s1);
    U++;
    ix++;
    /*********************** direction -0 ************************/
    _sse_load((*s).s0);
    _sse_load_up((*s).s2);
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s0);
    _sse_load((*s).s1);
    _sse_load_up((*s).s3);
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s1);
    ix++;
    /*********************** direction +1 ************************/
    _prefetch_su3(U+predist);
    _sse_load((*s).s0);
    /*next not needed?*/
    _sse_load_up((*s).s3);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka1);
    _sse_store_nt_up((*phi[ix]).s0);
    _sse_load((*s).s1);
    _sse_load_up((*s).s2);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka1);
    _sse_store_nt_up((*phi[ix]).s1);
    ix++;
    U++;
    /*********************** direction -1 ************************/
    _sse_load((*s).s0);
    _sse_load_up((*s).s3);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s0);
    _sse_load((*s).s1);
    _sse_load_up((*s).s2);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s1);
    ix++;
    /*********************** direction +2 ************************/
    _prefetch_su3(U+predist);
    _sse_load((*s).s0);
    _sse_load_up((*s).s3);
    _sse_vector_add();
    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka2);
    _sse_store_nt_up((*phi[ix]).s0);
    _sse_load((*s).s1);
    _sse_load_up((*s).s2);
    _sse_vector_sub();
    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka2);
    _sse_store_nt_up((*phi[ix]).s1);
    ix++;
    U++;
    /*********************** direction -2 ************************/
    _sse_load((*s).s0);
    _sse_load_up((*s).s3);
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s0);
    _sse_load((*s).s1);
    _sse_load_up((*s).s2);
    _sse_vector_add();
    _sse_store_nt((*phi[ix]).s1);
    ix++;
    /*********************** direction +3 ************************/
    _prefetch_su3(U+predist);
    /* the next input spinor is requested before the last direction */
    _prefetch_spinor(s+1);
    _sse_load((*s).s0);
    _sse_load_up((*s).s2);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka3);
    _sse_store_nt_up((*phi[ix]).s0);
    _sse_load((*s).s1);
    _sse_load_up((*s).s3);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka3);
    _sse_store_nt_up((*phi[ix]).s1);
    ix++;
    U++;
    /*********************** direction -3 ************************/
    _sse_load((*s).s0);
    _sse_load_up((*s).s2);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s0);
    _sse_load((*s).s1);
    _sse_load_up((*s).s3);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_store_nt((*phi[ix]).s1);
    ix++;
    s++;
  }
  /* exchange of the half spinor buffers, only in MPI builds with
     communication enabled */
# if (defined MPI && !defined _NO_COMM)
  xchange_halffield();
# endif
  /* second pass: s now walks the output field, the link table is the one
     not used in the first pass */
  s = l;
  phi = NBPointer[2 + ieo];
  if(ieo == 0) {
    U = g_gauge_field_copy[1][0];
  }
  else {
    U = g_gauge_field_copy[0][0];
  }
  _prefetch_su3(U);
  /* Now we sum up and expand to a full spinor */
  ix = 0;
  for(i = 0; i < (VOLUME)/2; i++){
    /*********************** direction +0 ************************/
    /* the first contribution initialises the accumulator rs */
    _vector_assign(rs.s0, (*phi[ix]).s0);
    _vector_assign(rs.s2, (*phi[ix]).s0);
    _vector_assign(rs.s1, (*phi[ix]).s1);
    _vector_assign(rs.s3, (*phi[ix]).s1);
    ix++;
    /*********************** direction -0 ************************/
    _prefetch_su3(U+predist);
    _sse_load((*phi[ix]).s0);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka0);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);
    _sse_load(rs.s2);
    _sse_vector_sub();
    _sse_store(rs.s2);
    _sse_load((*phi[ix]).s1);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka0);
    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);
    _sse_load(rs.s3);
    _sse_vector_sub();
    _sse_store(rs.s3);
    ix++;
    U++;
    /*********************** direction +1 ************************/
    _sse_load_up((*phi[ix]).s0);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);
    _sse_load(rs.s3);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store(rs.s3);
    _sse_load_up((*phi[ix]).s1);
    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);
    _sse_load(rs.s2);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store(rs.s2);
    ix++;
    /*********************** direction -1 ************************/
    _prefetch_su3(U+predist);
    _sse_load((*phi[ix]).s0);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka1);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);
    _sse_load(rs.s3);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_store(rs.s3);
    _sse_load((*phi[ix]).s1);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka1);
    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);
    _sse_load(rs.s2);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_store(rs.s2);
    ix++;
    U++;
    /*********************** direction +2 ************************/
    _sse_load_up((*phi[ix]).s0);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);
    _sse_load(rs.s3);
    _sse_vector_add();
    _sse_store(rs.s3);
    _sse_load_up((*phi[ix]).s1);
    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);
    _sse_load(rs.s2);
    _sse_vector_sub();
    _sse_store(rs.s2);
    ix++;
    /*********************** direction -2 ************************/
    _prefetch_su3(U+predist);
    _sse_load((*phi[ix]).s0);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka2);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);
    _sse_load(rs.s3);
    _sse_vector_sub();
    _sse_store(rs.s3);
    _sse_load((*phi[ix]).s1);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka2);
    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);
    _sse_load(rs.s2);
    _sse_vector_add();
    _sse_store(rs.s2);
    ix++;
    U++;
    /*********************** direction +3 ************************/
    _sse_load_up((*phi[ix]).s0);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);
    _sse_load(rs.s2);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store(rs.s2);
    _sse_load_up((*phi[ix]).s1);
    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);
    _sse_load(rs.s3);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_store(rs.s3);
    ix++;
    /*********************** direction -3 ************************/
    /* last contribution: the sums are stored into the output spinor *s
       instead of back into rs */
    _prefetch_su3(U+predist);
    _prefetch_spinor(s+1);
    _sse_load((*phi[ix]).s0);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka3);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store_nt((*s).s0);
    _sse_load(rs.s2);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_store_nt((*s).s2);
    _sse_load((*phi[ix]).s1);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka3);
    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store_nt((*s).s1);
    _sse_load(rs.s3);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store_nt((*s).s3);
    ix++;
    U++;
    s++;
  }
#ifdef _KOJAK_INST
#pragma pomp inst end(hoppingmatrix)
#endif
}
/*
 * Hopping_Matrix -- portable variant working on half spinor buffers.
 *
 * for ieo=0, k resides on odd sites and l on even sites
 *
 *   ieo  selects the tables: the first pass reads links from
 *        g_gauge_field_copy[0] for ieo == 0 and from g_gauge_field_copy[1]
 *        otherwise, the second pass uses the other one; half spinors are
 *        addressed through NBPointer[ieo] / NBPointer[2 + ieo] (or the
 *        NBPointer32 tables in the sloppy branch).
 *   l    output field, VOLUME/2 spinors are written.
 *   k    input field, VOLUME/2 spinors are read. Must differ from l,
 *        otherwise the program prints an error and calls exit(1).
 *
 * When g_sloppy_precision and g_sloppy_precision_flag are both 1 the
 * intermediate half spinors are kept in the halfspinor32 buffers and
 * exchanged with xchange_halffield32(); otherwise the halfspinor buffers
 * and xchange_halffield() are used. Both branches do the same two passes:
 *   1. per input site, write eight half spinors (directions +0,-0,...,-3);
 *      the "+" directions are multiplied by the link *U and ka0..ka3,
 *   2. per output site, sum the eight half spinors into rs; the "-"
 *      directions get the inverse link and the conjugate ka factor, and the
 *      last direction writes the result to the output spinor.
 */
void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
  int i,ix;
  su3 * restrict U ALIGN;
  spinor * restrict s ALIGN;
  spinor rs;
  static su3_vector psi, chi, psi2, chi2;
  halfspinor * restrict * phi ALIGN;
  halfspinor32 * restrict * phi32 ALIGN;
#ifdef _KOJAK_INST
#pragma pomp inst begin(hoppingmatrix)
#endif
  /* Only l and k may be declared disjoint: s is assigned k and later l
     below, so listing *s (as was done before) promised the compiler a
     non-aliasing that does not hold. */
#ifdef XLC
#pragma disjoint(*l, *k)
#endif
#ifdef _GAUGE_COPY
  if(g_update_gauge_copy) {
    update_backward_gauge();
  }
#endif
  if(k == l){
    printf("Error in H_psi (simple.c):\n");
    printf("Arguments k and l must be different\n");
    printf("Program aborted\n");
    exit(1);
  }
  s = k;
  if(ieo == 0) {
    U = g_gauge_field_copy[0][0];
  }
  else {
    U = g_gauge_field_copy[1][0];
  }
  if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
    phi32 = NBPointer32[ieo];
    /**************** loop over all lattice sites ****************/
    /* ix advances by 8 per site, U by 4, s by 1 */
    ix=0;
    for(i = 0; i < (VOLUME)/2; i++){
      /* local copy of the input spinor */
      _vector_assign(rs.s0, (*s).s0);
      _vector_assign(rs.s1, (*s).s1);
      _vector_assign(rs.s2, (*s).s2);
      _vector_assign(rs.s3, (*s).s3);
      s++;
      /*********************** direction +0 ************************/
      _vector_add(psi, rs.s0, rs.s2);
      _su3_multiply(chi,(*U),psi);
      _complex_times_vector((*phi32[ix]).s0, ka0, chi);
      _vector_add(psi, rs.s1, rs.s3);
      _su3_multiply(chi,(*U),psi);
      _complex_times_vector((*phi32[ix]).s1, ka0, chi);
      U++;
      ix++;
      /*********************** direction -0 ************************/
      _vector_sub((*phi32[ix]).s0, rs.s0, rs.s2);
      _vector_sub((*phi32[ix]).s1, rs.s1, rs.s3);
      ix++;
      /*********************** direction +1 ************************/
      _vector_i_add(psi, rs.s0, rs.s3);
      _su3_multiply(chi, (*U), psi);
      _complex_times_vector((*phi32[ix]).s0, ka1, chi);
      _vector_i_add(psi, rs.s1, rs.s2);
      _su3_multiply(chi, (*U), psi);
      _complex_times_vector((*phi32[ix]).s1, ka1, chi);
      U++;
      ix++;
      /*********************** direction -1 ************************/
      _vector_i_sub((*phi32[ix]).s0, rs.s0, rs.s3);
      _vector_i_sub((*phi32[ix]).s1, rs.s1, rs.s2);
      ix++;
      /*********************** direction +2 ************************/
      _vector_add(psi, rs.s0, rs.s3);
      _su3_multiply(chi,(*U),psi);
      _complex_times_vector((*phi32[ix]).s0, ka2, chi);
      _vector_sub(psi, rs.s1, rs.s2);
      _su3_multiply(chi,(*U),psi);
      _complex_times_vector((*phi32[ix]).s1, ka2, chi);
      U++;
      ix++;
      /*********************** direction -2 ************************/
      _vector_sub((*phi32[ix]).s0, rs.s0, rs.s3);
      _vector_add((*phi32[ix]).s1, rs.s1, rs.s2);
      ix++;
      /*********************** direction +3 ************************/
      _vector_i_add(psi, rs.s0, rs.s2);
      _su3_multiply(chi, (*U), psi);
      _complex_times_vector((*phi32[ix]).s0, ka3, chi);
      _vector_i_sub(psi, rs.s1, rs.s3);
      _su3_multiply(chi,(*U),psi);
      _complex_times_vector((*phi32[ix]).s1, ka3, chi);
      U++;
      ix++;
      /*********************** direction -3 ************************/
      _vector_i_sub((*phi32[ix]).s0, rs.s0, rs.s2);
      _vector_i_add((*phi32[ix]).s1, rs.s1, rs.s3);
      ix++;
      /************************ end of loop ************************/
    }
# if (defined MPI && !defined _NO_COMM)
    xchange_halffield32();
# endif
    /* second pass: walk the output field with the other link table */
    s = l;
    phi32 = NBPointer32[2 + ieo];
    if(ieo == 0) {
      U = g_gauge_field_copy[1][0];
    }
    else {
      U = g_gauge_field_copy[0][0];
    }
    ix = 0;
    for(i = 0; i < (VOLUME)/2; i++){
      /*********************** direction +0 ************************/
      _vector_assign(rs.s0, (*phi32[ix]).s0);
      _vector_assign(rs.s2, (*phi32[ix]).s0);
      _vector_assign(rs.s1, (*phi32[ix]).s1);
      _vector_assign(rs.s3, (*phi32[ix]).s1);
      ix++;
      /*********************** direction -0 ************************/
      _vector_assign(psi, (*phi32[ix]).s0);
      _su3_inverse_multiply(chi,(*U), psi);
      _complexcjg_times_vector(psi,ka0,chi);
      _vector_add_assign(rs.s0, psi);
      _vector_sub_assign(rs.s2, psi);
      _vector_assign(psi, (*phi32[ix]).s1);
      _su3_inverse_multiply(chi,(*U), psi);
      _complexcjg_times_vector(psi,ka0,chi);
      _vector_add_assign(rs.s1, psi);
      _vector_sub_assign(rs.s3, psi);
      ix++;
      U++;
      /*********************** direction +1 ************************/
      _vector_add_assign(rs.s0, (*phi32[ix]).s0);
      _vector_i_sub_assign(rs.s3, (*phi32[ix]).s0);
      _vector_add_assign(rs.s1, (*phi32[ix]).s1);
      _vector_i_sub_assign(rs.s2, (*phi32[ix]).s1);
      ix++;
      /*********************** direction -1 ************************/
      _vector_assign(psi, (*phi32[ix]).s0);
      _su3_inverse_multiply(chi,(*U), psi);
      _complexcjg_times_vector(psi,ka1,chi);
      _vector_add_assign(rs.s0, psi);
      _vector_i_add_assign(rs.s3, psi);
      _vector_assign(psi, (*phi32[ix]).s1);
      _su3_inverse_multiply(chi,(*U), psi);
      _complexcjg_times_vector(psi,ka1,chi);
      _vector_add_assign(rs.s1, psi);
      _vector_i_add_assign(rs.s2, psi);
      U++;
      ix++;
      /*********************** direction +2 ************************/
      _vector_add_assign(rs.s0, (*phi32[ix]).s0);
      _vector_add_assign(rs.s3, (*phi32[ix]).s0);
      _vector_add_assign(rs.s1, (*phi32[ix]).s1);
      _vector_sub_assign(rs.s2, (*phi32[ix]).s1);
      ix++;
      /*********************** direction -2 ************************/
      _vector_assign(psi, (*phi32[ix]).s0);
      _su3_inverse_multiply(chi,(*U), psi);
      _complexcjg_times_vector(psi,ka2,chi);
      _vector_add_assign(rs.s0, psi);
      _vector_sub_assign(rs.s3, psi);
      _vector_assign(psi, (*phi32[ix]).s1);
      _su3_inverse_multiply(chi, (*U), psi);
      _complexcjg_times_vector(psi,ka2,chi);
      _vector_add_assign(rs.s1, psi);
      _vector_add_assign(rs.s2, psi);
      U++;
      ix++;
      /*********************** direction +3 ************************/
      _vector_add_assign(rs.s0, (*phi32[ix]).s0);
      _vector_i_sub_assign(rs.s2, (*phi32[ix]).s0);
      _vector_add_assign(rs.s1, (*phi32[ix]).s1);
      _vector_i_add_assign(rs.s3, (*phi32[ix]).s1);
      ix++;
      /*********************** direction -3 ************************/
      /* last contribution: write the sums to the output spinor */
      _vector_assign(psi, (*phi32[ix]).s0);
      _su3_inverse_multiply(chi,(*U), psi);
      _complexcjg_times_vector(psi,ka3,chi);
      _vector_add((*s).s0, rs.s0, psi);
      _vector_i_add((*s).s2, rs.s2, psi);
      _vector_assign(psi, (*phi32[ix]).s1);
      _su3_inverse_multiply(chi,(*U), psi);
      _complexcjg_times_vector(psi,ka3,chi);
      _vector_add((*s).s1, rs.s1, psi);
      _vector_i_sub((*s).s3, rs.s3, psi);
      U++;
      ix++;
      s++;
    }
  }
  else {
    phi = NBPointer[ieo];
    /**************** loop over all lattice sites ****************/
    ix=0;
    /* #pragma ivdep*/
    for(i = 0; i < (VOLUME)/2; i++){
      /* local copy of the input spinor */
      _vector_assign(rs.s0, (*s).s0);
      _vector_assign(rs.s1, (*s).s1);
      _vector_assign(rs.s2, (*s).s2);
      _vector_assign(rs.s3, (*s).s3);
      s++;
      /*********************** direction +0 ************************/
      _vector_add(psi, rs.s0, rs.s2);
      _vector_add(psi2, rs.s1, rs.s3);
      _su3_multiply(chi,(*U),psi);
      _su3_multiply(chi2,(*U),psi2);
      _complex_times_vector((*phi[ix]).s0, ka0, chi);
      _complex_times_vector((*phi[ix]).s1, ka0, chi2);
      U++;
      ix++;
      /*********************** direction -0 ************************/
      _vector_sub((*phi[ix]).s0, rs.s0, rs.s2);
      _vector_sub((*phi[ix]).s1, rs.s1, rs.s3);
      ix++;
      /*********************** direction +1 ************************/
      _vector_i_add(psi, rs.s0, rs.s3);
      _vector_i_add(psi2, rs.s1, rs.s2);
      _su3_multiply(chi, (*U), psi);
      _su3_multiply(chi2, (*U), psi2);
      _complex_times_vector((*phi[ix]).s0, ka1, chi);
      _complex_times_vector((*phi[ix]).s1, ka1, chi2);
      U++;
      ix++;
      /*********************** direction -1 ************************/
      _vector_i_sub((*phi[ix]).s0, rs.s0, rs.s3);
      _vector_i_sub((*phi[ix]).s1, rs.s1, rs.s2);
      ix++;
      /*********************** direction +2 ************************/
      _vector_add(psi, rs.s0, rs.s3);
      _vector_sub(psi2, rs.s1, rs.s2);
      _su3_multiply(chi,(*U),psi);
      _su3_multiply(chi2,(*U),psi2);
      _complex_times_vector((*phi[ix]).s0, ka2, chi);
      _complex_times_vector((*phi[ix]).s1, ka2, chi2);
      U++;
      ix++;
      /*********************** direction -2 ************************/
      _vector_sub((*phi[ix]).s0, rs.s0, rs.s3);
      _vector_add((*phi[ix]).s1, rs.s1, rs.s2);
      ix++;
      /*********************** direction +3 ************************/
      _vector_i_add(psi, rs.s0, rs.s2);
      _vector_i_sub(psi2, rs.s1, rs.s3);
      _su3_multiply(chi, (*U), psi);
      _su3_multiply(chi2,(*U),psi2);
      _complex_times_vector((*phi[ix]).s0, ka3, chi);
      _complex_times_vector((*phi[ix]).s1, ka3, chi2);
      U++;
      ix++;
      /*********************** direction -3 ************************/
      _vector_i_sub((*phi[ix]).s0, rs.s0, rs.s2);
      _vector_i_add((*phi[ix]).s1, rs.s1, rs.s3);
      ix++;
      /************************ end of loop ************************/
    }
# if (defined MPI && !defined _NO_COMM)
    xchange_halffield();
# endif
    /* second pass: walk the output field with the other link table */
    s = l;
    phi = NBPointer[2 + ieo];
    if(ieo == 0) {
      U = g_gauge_field_copy[1][0];
    }
    else {
      U = g_gauge_field_copy[0][0];
    }
    ix = 0;
    /* #pragma ivdep */
    for(i = 0; i < (VOLUME)/2; i++){
      /*********************** direction +0 ************************/
      _vector_assign(rs.s0, (*phi[ix]).s0);
      _vector_assign(rs.s2, (*phi[ix]).s0);
      _vector_assign(rs.s1, (*phi[ix]).s1);
      _vector_assign(rs.s3, (*phi[ix]).s1);
      ix++;
      /*********************** direction -0 ************************/
      _su3_inverse_multiply(chi,(*U),(*phi[ix]).s0);
      _su3_inverse_multiply(chi2,(*U),(*phi[ix]).s1);
      _complexcjg_times_vector(psi,ka0,chi);
      _complexcjg_times_vector(psi2,ka0,chi2);
      _vector_add_assign(rs.s0, psi);
      _vector_sub_assign(rs.s2, psi);
      _vector_add_assign(rs.s1, psi2);
      _vector_sub_assign(rs.s3, psi2);
      ix++;
      U++;
      /*********************** direction +1 ************************/
      _vector_add_assign(rs.s0, (*phi[ix]).s0);
      _vector_i_sub_assign(rs.s3, (*phi[ix]).s0);
      _vector_add_assign(rs.s1, (*phi[ix]).s1);
      _vector_i_sub_assign(rs.s2, (*phi[ix]).s1);
      ix++;
      /*********************** direction -1 ************************/
      _su3_inverse_multiply(chi,(*U), (*phi[ix]).s0);
      _su3_inverse_multiply(chi2, (*U), (*phi[ix]).s1);
      _complexcjg_times_vector(psi,ka1,chi);
      _complexcjg_times_vector(psi2,ka1,chi2);
      _vector_add_assign(rs.s0, psi);
      _vector_i_add_assign(rs.s3, psi);
      _vector_add_assign(rs.s1, psi2);
      _vector_i_add_assign(rs.s2, psi2);
      U++;
      ix++;
      /*********************** direction +2 ************************/
      _vector_add_assign(rs.s0, (*phi[ix]).s0);
      _vector_add_assign(rs.s3, (*phi[ix]).s0);
      _vector_add_assign(rs.s1, (*phi[ix]).s1);
      _vector_sub_assign(rs.s2, (*phi[ix]).s1);
      ix++;
      /*********************** direction -2 ************************/
      _su3_inverse_multiply(chi,(*U), (*phi[ix]).s0);
      _su3_inverse_multiply(chi2, (*U), (*phi[ix]).s1);
      _complexcjg_times_vector(psi,ka2,chi);
      _complexcjg_times_vector(psi2,ka2,chi2);
      _vector_add_assign(rs.s0, psi);
      _vector_sub_assign(rs.s3, psi);
      _vector_add_assign(rs.s1, psi2);
      _vector_add_assign(rs.s2, psi2);
      U++;
      ix++;
      /*********************** direction +3 ************************/
      _vector_add_assign(rs.s0, (*phi[ix]).s0);
      _vector_i_sub_assign(rs.s2, (*phi[ix]).s0);
      _vector_add_assign(rs.s1, (*phi[ix]).s1);
      _vector_i_add_assign(rs.s3, (*phi[ix]).s1);
      ix++;
      /*********************** direction -3 ************************/
      /* last contribution: write the sums to the output spinor */
      _su3_inverse_multiply(chi,(*U), (*phi[ix]).s0);
      _su3_inverse_multiply(chi2, (*U), (*phi[ix]).s1);
      _complexcjg_times_vector(psi,ka3,chi);
      _complexcjg_times_vector(psi2,ka3,chi2);
      _vector_add((*s).s0, rs.s0, psi);
      _vector_i_add((*s).s2, rs.s2, psi);
      _vector_add((*s).s1, rs.s1, psi2);
      _vector_i_sub((*s).s3, rs.s3, psi2);
      U++;
      ix++;
      s++;
    }
  }
#ifdef _KOJAK_INST
#pragma pomp inst end(hoppingmatrix)
#endif
}