void tm_times_Hopping_Matrix(const int ieo, spinor * const l, spinor * const k, complex double const cfactor) {
  
#  ifdef _GAUGE_COPY
  if(g_update_gauge_copy) {
    update_backward_gauge(g_gauge_field);
  }
#  endif
  
#  ifdef OMP
#  pragma omp parallel
  {
    su3 * restrict u0 ALIGN;
#  endif

#  define _MUL_G5_CMPLX
#  if (defined BGQ && defined XLC)
    complex double ALIGN bla = cfactor;
    vector4double ALIGN cf = vec_ld2(0, (double*) &bla);
#  elif (defined SSE2 || defined SSE3)
    _Complex double ALIGN cf = cfactor;
#  endif
#  include "operator/halfspinor_body.c"
#  undef _MUL_G5_CMPLX    
#  ifdef OMP
  } /* OpenMP closing brace */
#  endif
  return;
}
void tm_sub_Hopping_Matrix(const int ieo, spinor * const l, spinor * p, spinor * const k, 
			   complex double const cfactor) {
#  ifdef XLC
#    pragma disjoint(*l, *k)
#  endif
#  ifdef _GAUGE_COPY
  if(g_update_gauge_copy) {
    update_backward_gauge(g_gauge_field);
  }
#  endif

#  if (defined TM_USE_MPI)
  xchange_field(k, ieo);
#  endif
  
#  ifdef TM_USE_OMP
#    pragma omp parallel
  {
#  endif
#  define _TM_SUB_HOP
    spinor * pn;
#  if (defined BGQ && defined XLC)
    complex double ALIGN bla = cfactor;
    vector4double ALIGN cf = vec_ld2(0, (double*) &bla);
#  elif (defined SSE2 || defined SSE3)
    _Complex double ALIGN cf = cfactor;
    su3_vector ALIGN psi, psi2;
#  endif
#  include "operator/hopping_body_dbl.c"
#  undef _TM_SUB_HOP
#  ifdef TM_USE_OMP
  } /* OpenMP closing brace */
#  endif
  return;
}
void tm_times_Hopping_Matrix(const int ieo, spinor * const l, spinor * const k, double complex const cfactor) {
#  ifdef XLC
#    pragma disjoint(*l, *k)
#  endif
#  ifdef _GAUGE_COPY
  if(g_update_gauge_copy) {
    update_backward_gauge(g_gauge_field);
  }
#  endif

#  if (defined MPI)
  xchange_field(k, ieo);
#  endif
  
#  ifdef OMP
#    pragma omp parallel
  {
#  endif
#  define _MUL_G5_CMPLX
#  if (defined BGQ && defined XLC)
    complex double ALIGN bla = cfactor;
    vector4double ALIGN cf = vec_ld2(0, (double*) &bla);
#  elif (defined SSE2 || defined SSE3)
    _Complex double ALIGN cf = cfactor;
#  endif
#  include "operator/hopping_body_dbl.c"
#  undef _MUL_G5_CMPLX
#  ifdef OMP
  } /* OpenMP closing brace */
#  endif
  return;
}
Example #4
0
void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k) {
#    ifdef XLC
#      pragma disjoint(*l, *k)
#    endif
#    ifdef _GAUGE_COPY
  if(g_update_gauge_copy) {
    update_backward_gauge(g_gauge_field);
  }
#    endif

#    if (defined TM_USE_MPI && !(defined _NO_COMM))
  xchange_field(k, ieo);
#    endif

#    ifdef TM_USE_OMP
#      pragma omp parallel
  {
#    endif

#    include "operator/hopping_body_dbl.c"

#    ifdef TM_USE_OMP
  } /* OpenMP closing brace */
#    endif
  return;
}
Example #5
0
void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k) {

#ifdef _GAUGE_COPY
  if(g_update_gauge_copy) {
    update_backward_gauge(g_gauge_field);
  }
#endif

#ifdef TM_USE_OMP
#pragma omp parallel
  {
  su3 * restrict u0 ALIGN;
#endif

#  include "operator/halfspinor_body.c"

#  ifdef TM_USE_OMP
  } /* OpenMP closing brace */
#  endif
  return;
}
Example #6
0
int main(int argc,char *argv[]) {
 
  FILE *parameterfile=NULL,*rlxdfile=NULL, *countfile=NULL;
  char * filename = NULL;
  char datafilename[50];
  char parameterfilename[50];
  char gauge_filename[50];
  char * nstore_filename = ".nstore_counter";
  char * input_filename = NULL;
  int rlxd_state[105];
  int j,ix,mu;
  int k;
  struct timeval t1;

  int g_nev, max_iter_ev;
  double stop_prec_ev;


  /* Energy corresponding to the Gauge part */
  double eneg = 0., plaquette_energy = 0., rectangle_energy = 0.;
  /* Acceptance rate */
  int Rate=0;
  /* Do we want to perform reversibility checks */
  /* See also return_check_flag in read_input.h */
  int return_check = 0;
  /* For getopt */
  int c;

  /* For the Polyakov loop: */
  int dir = 2;
  _Complex double pl, pl4;

  verbose = 0;
  g_use_clover_flag = 0;
  g_nr_of_psf = 1;

#ifndef XLC 
  signal(SIGUSR1,&catch_del_sig);
  signal(SIGUSR2,&catch_del_sig);
  signal(SIGTERM,&catch_del_sig);
  signal(SIGXCPU,&catch_del_sig);
#endif

  while ((c = getopt(argc, argv, "h?f:o:")) != -1) {
    switch (c) {
    case 'f': 
      input_filename = calloc(200, sizeof(char));
      strcpy(input_filename,optarg);
      break;
    case 'o':
      filename = calloc(200, sizeof(char));
      strcpy(filename,optarg);
      break;
    case 'h':
    case '?':
    default:
      usage();
      break;
    }
  }
  if(input_filename == NULL){
    input_filename = "hmc.input";
  }
  if(filename == NULL){
    filename = "output";
  } 

  /* Read the input file */
  read_input(input_filename);

  mpi_init(argc, argv);

  if(Nsave == 0){
    Nsave = 1;
  }
  if(nstore == -1) {
    countfile = fopen(nstore_filename, "r");
    if(countfile != NULL) {
      fscanf(countfile, "%d\n", &nstore);
      fclose(countfile);
    }
    else {
      nstore = 0;
    }
  }
  
  if(g_rgi_C1 == 0.) {
    g_dbw2rand = 0;
  }
#ifndef TM_USE_MPI
  g_dbw2rand = 0;
#endif

  /* Reorder the mu parameter and the number of iterations */
  if(g_mu3 > 0.) {
    g_mu = g_mu1;
    g_mu1 = g_mu3;
    g_mu3 = g_mu;

    j = int_n[1];
    int_n[1] = int_n[3];
    int_n[3] = j;

    j = g_csg_N[0];
    g_csg_N[0] = g_csg_N[4];
    g_csg_N[4] = j;
    g_csg_N[6] = j;
    if(fabs(g_mu3) > 0) {
      g_csg_N[6] = 0;
    }

    g_nr_of_psf = 3;
  }
  else if(g_mu2 > 0.) {
    g_mu = g_mu1;
    g_mu1 = g_mu2;
    g_mu2 = g_mu;

    int_n[3] = int_n[1];
    int_n[1] = int_n[2];
    int_n[2] = int_n[3];

    /* For chronological inverter */
    g_csg_N[4] = g_csg_N[0];
    g_csg_N[0] = g_csg_N[2];
    g_csg_N[2] = g_csg_N[4];
    if(fabs(g_mu2) > 0) {
      g_csg_N[4] = 0;
    }
    g_csg_N[6] = 0;

    g_nr_of_psf = 2;
  }
  else {
    g_csg_N[2] = g_csg_N[0];
    if(fabs(g_mu2) > 0) {
      g_csg_N[2] = 0;
    }
    g_csg_N[4] = 0;
    g_csg_N[6] = 0;
  }

  for(j = 0; j < g_nr_of_psf+1; j++) {
    if(int_n[j] == 0) int_n[j] = 1;
  }
  if(g_nr_of_psf == 3) {
    g_eps_sq_force = g_eps_sq_force1;
    g_eps_sq_force1 = g_eps_sq_force3;
    g_eps_sq_force3 = g_eps_sq_force;
    g_eps_sq_acc = g_eps_sq_acc1;
    g_eps_sq_acc1 = g_eps_sq_acc3;
    g_eps_sq_acc3 = g_eps_sq_acc;
  }
  if(g_nr_of_psf == 2) {
    g_eps_sq_force = g_eps_sq_force1;
    g_eps_sq_force1 = g_eps_sq_force2;
    g_eps_sq_force2 = g_eps_sq_force;
    g_eps_sq_acc = g_eps_sq_acc1;
    g_eps_sq_acc1 = g_eps_sq_acc2;
    g_eps_sq_acc2 = g_eps_sq_acc;
  }
  g_mu = g_mu1;
  g_eps_sq_acc = g_eps_sq_acc1;
  g_eps_sq_force = g_eps_sq_force1;


#ifdef _GAUGE_COPY
  j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
#else
  j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
#endif
  if ( j!= 0) {
    fprintf(stderr, "Not enough memory for gauge_fields! Aborting...\n");
    exit(0);
  }
  j = init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand);
  if ( j!= 0) {
    fprintf(stderr, "Not enough memory for geometry_indices! Aborting...\n");
    exit(0);
  }
  j = init_spinor_field(VOLUMEPLUSRAND/2, NO_OF_SPINORFIELDS);
  if ( j!= 0) {
    fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n");
    exit(0);
  }

  j = init_bispinor_field(VOLUME/2, NO_OF_SPINORFIELDS);


  j = init_csg_field(VOLUMEPLUSRAND/2, g_csg_N);
  if ( j!= 0) {
    fprintf(stderr, "Not enough memory for csg fields! Aborting...\n");
    exit(0);
  }
  j = init_moment_field(VOLUME, VOLUMEPLUSRAND);
  if ( j!= 0) {
    fprintf(stderr, "Not enough memory for moment fields! Aborting...\n");
    exit(0);
  }

  zero_spinor_field(g_spinor_field[DUM_DERI+4],VOLUME/2);
  zero_spinor_field(g_spinor_field[DUM_DERI+5],VOLUME/2);
  zero_spinor_field(g_spinor_field[DUM_DERI+6],VOLUME/2);
 

  if(g_proc_id == 0){
    
/*     fscanf(fp6,"%s",filename); */
    /*construct the filenames for the observables and the parameters*/
    strcpy(datafilename,filename);  strcat(datafilename,".data");
    strcpy(parameterfilename,filename);  strcat(parameterfilename,".para");
    
    parameterfile=fopen(parameterfilename, "w");
    printf("# This is the hmc code for twisted Mass Wilson QCD\n\nVersion %s\n", Version);
#ifdef SSE
    printf("# The code was compiled with SSE instructions\n");
#endif
#ifdef SSE2
    printf("# The code was compiled with SSE2 instructions\n");
#endif
#ifdef SSE3
    printf("# The code was compiled with SSE3 instructions\n");
#endif
#ifdef P4
    printf("# The code was compiled for Pentium4\n");
#endif
#ifdef OPTERON
    printf("# The code was compiled for AMD Opteron\n");
#endif
#ifdef _NEW_GEOMETRY
    printf("# The code was compiled with -D_NEW_GEOMETRY\n");
#endif
#ifdef _GAUGE_COPY
    printf("# The code was compiled with -D_GAUGE_COPY\n");
#endif
    printf("# The lattice size is %d x %d x %d x %d\n",
	   (int)(T*g_nproc_t), (int)(LX*g_nproc_x), (int)(LY), (int)(LZ));
    printf("# The local lattice size is %d x %d x %d x %d\n", 
	   (int)(T), (int)(LX), (int)(LY),(int) LZ);
    printf("# beta = %f , kappa= %f\n", g_beta, g_kappa);
    printf("# mus = %f, %f, %f\n", g_mu1, g_mu2, g_mu3);
    printf("# int_n_gauge = %d, int_n_ferm1 = %d, int_n_ferm2 = %d, int_n_ferm3 = %d\n", 
	    int_n[0], int_n[1], int_n[2], int_n[3]);
    printf("# g_rgi_C0 = %f, g_rgi_C1 = %f\n", g_rgi_C0, g_rgi_C1);
    printf("# Number of pseudo-fermion fields: %d\n", g_nr_of_psf);
    printf("# g_eps_sq_force = %e, g_eps_sq_acc = %e\n", g_eps_sq_force, g_eps_sq_acc);
    printf("# Integration scheme: ");
    if(integtyp == 1) printf("leap-frog (single time scale)\n");
    if(integtyp == 2) printf("Sexton-Weingarten (single time scale)\n");
    if(integtyp == 3) printf("leap-frog (multiple time scales)\n");
    if(integtyp == 4) printf("Sexton-Weingarten (multiple time scales)\n");
    if(integtyp == 5) printf("higher order and leap-frog (multiple time scales)\n");
    printf("# Using %s precision for the inversions!\n", 
	   g_relative_precision_flag ? "relative" : "absolute");
    printf("# Using in chronological inverter for spinor_field 1,2,3 a history of %d, %d, %d, respectively\n", 
	   g_csg_N[0], g_csg_N[2], g_csg_N[4]);


    fprintf(parameterfile, "The lattice size is %d x %d x %d x %d\n", (int)(g_nproc_t*T), (int)(g_nproc_x*LX), (int)(LY), (int)(LZ));
    fprintf(parameterfile, "The local lattice size is %d x %d x %d x %d\n", (int)(T), (int)(LX), (int)(LY), (int)(LZ));
    fprintf(parameterfile, "g_beta = %f , g_kappa= %f, c_sw = %f \n",g_beta,g_kappa,g_c_sw);
    fprintf(parameterfile, "boundary of fermion fields (t,x,y,z): %f %f %f %f \n",X0,X1,X2,X3);
    fprintf(parameterfile, "EPS_SQ0=%e, EPS_SQ1=%e EPS_SQ2=%e, EPS_SQ3=%e \n"
	    ,EPS_SQ0,EPS_SQ1,EPS_SQ2,EPS_SQ3);
    fprintf(parameterfile, "g_eps_sq_force = %e, g_eps_sq_acc = %e\n", g_eps_sq_force, g_eps_sq_acc);
    fprintf(parameterfile, "dtau=%f, Nsteps=%d, Nmeas=%d, Nsave=%d, integtyp=%d, nsmall=%d \n",
	    dtau,Nsteps,Nmeas,Nsave,integtyp,nsmall);
    fprintf(parameterfile, "mu = %f, mu2=%f, mu3=%f\n ", g_mu, g_mu2, g_mu3);
    fprintf(parameterfile, "int_n_gauge = %d, int_n_ferm1 = %d, int_n_ferm2 = %d, int_n_ferm3 = %d\n ", 
	    int_n[0], int_n[1], int_n[2], int_n[3]);
    fprintf(parameterfile, "g_rgi_C0 = %f, g_rgi_C1 = %f\n", g_rgi_C0, g_rgi_C1);
    fprintf(parameterfile, "# Number of pseudo-fermion fields: %d\n", g_nr_of_psf);
    fprintf(parameterfile, "# Integration scheme: ");
    if(integtyp == 1) fprintf(parameterfile, "leap-frog (single time scale)\n");
    if(integtyp == 2) fprintf(parameterfile, "Sexton-Weingarten (single time scale)\n");
    if(integtyp == 3) fprintf(parameterfile, "leap-frog (multiple time scales)\n");
    if(integtyp == 4) fprintf(parameterfile, "Sexton-Weingarten (multiple time scales)\n");
    if(integtyp == 5) fprintf(parameterfile, "higher order and leap-frog (multiple time scales)\n");
    fprintf(parameterfile, "Using %s precision for the inversions!\n", 
	   g_relative_precision_flag ? "relative" : "absolute");
    fprintf(parameterfile, "Using in chronological inverter for spinor_field 1,2,3 a history of %d, %d, %d, respectively\n", 
	   g_csg_N[0], g_csg_N[2], g_csg_N[4]);
    fflush(stdout); fflush(parameterfile);
  }

  /* define the geometry */
  geometry();

  /* define the boundary conditions for the fermion fields */
  boundary();

  check_geometry();

  if(g_proc_id == 0) {
#if defined GEOMETRIC
    if(g_proc_id==0) fprintf(parameterfile,"The geometric series is used as solver \n\n");
#else
    if(g_proc_id==0) fprintf(parameterfile,"The BICG_stab is used as solver \n\n");
#endif
    fflush(parameterfile);
  }
  
  /* Continue */
  if(startoption == 3){
    rlxdfile = fopen(rlxd_input_filename,"r");
    if(rlxdfile != NULL) {
      if(g_proc_id == 0) {
	fread(rlxd_state,sizeof(rlxd_state),1,rlxdfile);
      }
    }
    else {
      if(g_proc_id == 0) {
	printf("%s does not exist, switching to restart...\n", rlxd_input_filename);
      }
      startoption = 2;
    }
    fclose(rlxdfile);
    if(startoption != 2) {
      if(g_proc_id == 0) {
	rlxd_reset(rlxd_state);
	printf("Reading Gauge field from file %s\n", gauge_input_filename); fflush(stdout);
      }
      
      read_gauge_field_time_p(gauge_input_filename,g_gauge_field);
    }
  }
  if(startoption != 3){
    /* Initialize random number generator */
    if(g_proc_id == 0) {
      rlxd_init(1, random_seed);
      /* hot */
      if(startoption == 1) {
	random_gauge_field();
      }
      rlxd_get(rlxd_state);
#ifdef TM_USE_MPI
      MPI_Send(&rlxd_state[0], 105, MPI_INT, 1, 99, MPI_COMM_WORLD);
      MPI_Recv(&rlxd_state[0], 105, MPI_INT, g_nproc-1, 99, MPI_COMM_WORLD, &status);
      rlxd_reset(rlxd_state);
#endif
    }
#ifdef TM_USE_MPI
    else {
      MPI_Recv(&rlxd_state[0], 105, MPI_INT, g_proc_id-1, 99, MPI_COMM_WORLD, &status);
      rlxd_reset(rlxd_state);
      /* hot */
      if(startoption == 1) {
	random_gauge_field();
      }
      k=g_proc_id+1; 
      if(k==g_nproc){
	k=0;
      }
      rlxd_get(rlxd_state);
      MPI_Send(&rlxd_state[0], 105, MPI_INT, k, 99, MPI_COMM_WORLD);
    }
#endif

    /* Cold */
    if(startoption == 0) {
      unit_g_gauge_field();
    }
    /* Restart */
    else if(startoption == 2) {
      if (g_proc_id == 0){
	printf("Reading Gauge field from file %s\n", gauge_input_filename); fflush(stdout);
      }
      read_gauge_field_time_p(gauge_input_filename,g_gauge_field);
    }

  }

  /*For parallelization: exchange the gaugefield */
#ifdef TM_USE_MPI
  xchange_gauge(g_gauge_field);
#endif
#ifdef _GAUGE_COPY
  update_backward_gauge();
#endif

  /*compute the energy of the gauge field*/
  plaquette_energy=measure_gauge_action();
  if(g_rgi_C1 > 0. || g_rgi_C1 < 0.) {
    rectangle_energy = measure_rectangles();
    if(g_proc_id==0){
      fprintf(parameterfile,"#First rectangle value: %14.12f \n",rectangle_energy/(12.*VOLUME*g_nproc));
    }
  }
  eneg = g_rgi_C0 * plaquette_energy + g_rgi_C1 * rectangle_energy;
  
  /* Measure and print the Polyakov loop: */
  polyakov_loop(&pl, dir);

  if(g_proc_id==0){
    fprintf(parameterfile,"#First plaquette value: %14.12f \n", plaquette_energy/(6.*VOLUME*g_nproc));
    fprintf(parameterfile,"#First Polyakov loop value in %d-direction |L(%d)|= %14.12f \n",
	    dir, dir, cabs(pl));
  }

  dir=3;
  polyakov_loop(&pl, dir);
  if(g_proc_id==0){
    fprintf(parameterfile,"#First Polyakov loop value in %d-direction |L(%d)|= %14.12f \n",
	    dir, dir, cabs(pl));
    fclose(parameterfile);
  }

  /* set ddummy to zero */
  for(ix = 0; ix < VOLUME+RAND; ix++){
    for(mu=0; mu<4; mu++){
      ddummy[ix][mu].d1=0.;
      ddummy[ix][mu].d2=0.;
      ddummy[ix][mu].d3=0.;
      ddummy[ix][mu].d4=0.;
      ddummy[ix][mu].d5=0.;
      ddummy[ix][mu].d6=0.;
      ddummy[ix][mu].d7=0.;
      ddummy[ix][mu].d8=0.;
    }
  }

  if(g_proc_id == 0) {
    gettimeofday(&t1,NULL);
    countfile = fopen("history_hmc_tm", "a");
    fprintf(countfile, "!!! Timestamp %ld, Nsave = %d, g_mu = %e, g_mu1 = %e, g_mu_2 = %e, g_mu3 = %e, beta = %f, kappa = %f, C1 = %f, int0 = %d, int1 = %d, int2 = %d, int3 = %d, g_eps_sq_force = %e, g_eps_sq_acc = %e, ", 
	    t1.tv_sec, Nsave, g_mu, g_mu1, g_mu2, g_mu3, g_beta, g_kappa, g_rgi_C1, 
	    int_n[0], int_n[1], int_n[2], int_n[3], g_eps_sq_force, g_eps_sq_acc); 
    fprintf(countfile, "Nsteps = %d, dtau = %e, tau = %e, integtyp = %d, rel. prec. = %d\n", 
	    Nsteps, dtau, tau, integtyp, g_relative_precision_flag);
    fclose(countfile);
  }



     /* HERE THE CALLS FOR SOME EIGENVALUES */

  /* for lowest
  g_nev = 10;
  */

  /* for largest
  */
  g_nev = 10;

  max_iter_ev = 1000;
  stop_prec_ev = 1.e-10;

  if(g_proc_id==0) {

  printf(" Values of   mu = %e     mubar = %e     eps = %e     precision = %e  \n \n", g_mu, g_mubar, g_epsbar, stop_prec_ev);

  }

  eigenvalues(&g_nev, operator_flag, max_iter_ev, stop_prec_ev);

  g_nev = 4;

  max_iter_ev = 200;
  stop_prec_ev = 1.e-03;

  max_eigenvalues(&g_nev, operator_flag, max_iter_ev, stop_prec_ev);

  if(g_proc_id==0) {

  printf(" Values of   mu = %e     mubar = %e     eps = %e     precision = %e  \n \n", g_mu, g_mubar, g_epsbar, stop_prec_ev);

  /*
  printf(" Values of   mu = %e     precision = %e  \n \n", g_mu, stop_prec_ev);
  */

  }

   /* END OF EIGENVALUES CALLS */


  if(g_proc_id==0) {
    rlxd_get(rlxd_state);
    rlxdfile=fopen("last_state","w");
    fwrite(rlxd_state,sizeof(rlxd_state),1,rlxdfile);
    fclose(rlxdfile);

    printf("Acceptance Rate was: %e Prozent\n", 100.*(double)Rate/(double)Nmeas);
    fflush(stdout);
    parameterfile = fopen(parameterfilename, "a");
    fprintf(parameterfile, "Acceptance Rate was: %e Prozent\n", 100.*(double)Rate/(double)Nmeas);
    fclose(parameterfile);
  }
#ifdef TM_USE_MPI
  MPI_Finalize();
#endif
  free_gauge_tmp();
  free_gauge_field();
  free_geometry_indices();
  free_spinor_field();
  free_bispinor_field();  
  free_moment_field();
  return(0);
}
Example #7
0
int main(int argc,char *argv[])
{
  int j,j_max,k,k_max = 2;
  paramsXlfInfo *xlfInfo;
  int ix, n, *nn,*mm,i;
  double delta, deltamax;
  spinor rsp;
  int status = 0;
#ifdef MPI
  DUM_DERI = 6;
  DUM_SOLVER = DUM_DERI+2;
  DUM_MATRIX = DUM_SOLVER+6;
  NO_OF_SPINORFIELDS = DUM_MATRIX+2;

  MPI_Init(&argc, &argv);
#endif
  g_rgi_C1 = 1.; 
  
  /* Read the input file */
  read_input("hopping_test.input");
  
  tmlqcd_mpi_init(argc, argv);
  
  if(g_proc_id==0) {
#ifdef SSE
    printf("# The code was compiled with SSE instructions\n");
#endif
#ifdef SSE2
    printf("# The code was compiled with SSE2 instructions\n");
#endif
#ifdef SSE3
    printf("# The code was compiled with SSE3 instructions\n");
#endif
#ifdef P4
    printf("# The code was compiled for Pentium4\n");
#endif
#ifdef OPTERON
    printf("# The code was compiled for AMD Opteron\n");
#endif
#ifdef _GAUGE_COPY
    printf("# The code was compiled with -D_GAUGE_COPY\n");
#endif
#ifdef BGL
    printf("# The code was compiled for Blue Gene/L\n");
#endif
#ifdef BGP
    printf("# The code was compiled for Blue Gene/P\n");
#endif
#ifdef _USE_HALFSPINOR
    printf("# The code was compiled with -D_USE_HALFSPINOR\n");
#endif    
#ifdef _USE_SHMEM
    printf("# the code was compiled with -D_USE_SHMEM\n");
#  ifdef _PERSISTENT
    printf("# the code was compiled for persistent MPI calls (halfspinor only)\n");
#  endif
#endif
#ifdef _INDEX_INDEP_GEOM
    printf("# the code was compiled with index independent geometry\n");
#endif
#ifdef MPI
#  ifdef _NON_BLOCKING
    printf("# the code was compiled for non-blocking MPI calls (spinor and gauge)\n");
#  endif
#  ifdef _USE_TSPLITPAR
    printf("# the code was compiled with tsplit parallelization\n");
#  endif
#endif
    printf("\n");
    fflush(stdout);
  }

  
#ifdef _GAUGE_COPY
  init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
#else
  init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
#endif
  init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand);

  if(even_odd_flag) {
    j = init_spinor_field(VOLUMEPLUSRAND/2, 2*k_max+1);
  }
  else {
    j = init_spinor_field(VOLUMEPLUSRAND, 2*k_max);
  }

  if ( j!= 0) {
    fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n");
    exit(0);
  }
  j = init_moment_field(VOLUME, VOLUMEPLUSRAND);
  if ( j!= 0) {
    fprintf(stderr, "Not enough memory for moment fields! Aborting...\n");
    exit(0);
  }
  
  if(g_proc_id == 0) {
    fprintf(stdout,"The number of processes is %d \n",g_nproc);
    printf("# The lattice size is %d x %d x %d x %d\n",
	   (int)(T*g_nproc_t), (int)(LX*g_nproc_x), (int)(LY*g_nproc_y), (int)(g_nproc_z*LZ));
    printf("# The local lattice size is %d x %d x %d x %d\n", 
	   (int)(T), (int)(LX), (int)(LY),(int) LZ);
    if(even_odd_flag) {
      printf("# testinging the even/odd preconditioned Dirac operator\n");
    }
    else {
      printf("# testinging the standard Dirac operator\n");
    }
    fflush(stdout);
  }
  
  /* define the geometry */
  geometry();
  /* define the boundary conditions for the fermion fields */
  boundary(g_kappa);

#ifdef _USE_HALFSPINOR
  j = init_dirac_halfspinor();
  if ( j!= 0) {
    fprintf(stderr, "Not enough memory for halfspinor fields! Aborting...\n");
    exit(0);
  }
  if(g_sloppy_precision_flag == 1) {
    g_sloppy_precision = 1;
    j = init_dirac_halfspinor32();
    if ( j!= 0) {
      fprintf(stderr, "Not enough memory for 32-Bit halfspinor fields! Aborting...\n");
      exit(0);
    }
  }
#  if (defined _PERSISTENT)
  init_xchange_halffield();
#  endif
#endif  

  status = check_geometry();
  if (status != 0) {
    fprintf(stderr, "Checking of geometry failed. Unable to proceed.\nAborting....\n");
    exit(1);
  }

#if (defined MPI && !(defined _USE_SHMEM))
  check_xchange(); 
#endif

  start_ranlux(1, 123456);

  xlfInfo = construct_paramsXlfInfo(0.5, 0);

  random_gauge_field(reproduce_randomnumber_flag);
  if ( startoption == 2 ) {  /* restart */ 
    write_gauge_field(gauge_input_filename,gauge_precision_write_flag,xlfInfo);
  } else if ( startoption == 0 ) { /* cold */
    unit_g_gauge_field();
  } else if (startoption == 3 ) { /* continue */
    read_gauge_field(gauge_input_filename);
  } else if ( startoption == 1 ) { /* hot */
  }


#ifdef MPI
  /*For parallelization: exchange the gaugefield */
  xchange_gauge();
#endif

#ifdef _GAUGE_COPY
  update_backward_gauge();
#endif

  if(even_odd_flag) {
    /*initialize the pseudo-fermion fields*/
    j_max=1;
    for (k = 0; k < k_max; k++) {
      random_spinor_field(g_spinor_field[k], VOLUME/2, 0);
    }

    if (read_source_flag == 2) { /* save */
      /* even first, odd second */
      write_spinorfield_cm_single(g_spinor_field[0],g_spinor_field[1],SourceInfo.basename);  
    }	else if (read_source_flag == 1) { /* yes */
      /* even first, odd second */
      read_spinorfield_cm_single(g_spinor_field[0],g_spinor_field[1],SourceInfo.basename,-1,0); 
# if (!defined MPI)
      if (write_cp_flag == 1) {
	strcat(SourceInfo.basename,".2");
	read_spinorfield_cm_single(g_spinor_field[2],g_spinor_field[3],SourceInfo.basename,-1,0); 

	nn=(int*)calloc(VOLUME,sizeof(int));
	if((void*)nn == NULL) return(100);
	mm=(int*)calloc(VOLUME,sizeof(int));
	if((void*)mm == NULL) return(100);

	n=0;
	deltamax=0.0;
	for(ix=0;ix<VOLUME/2;ix++){
	  (rsp.s0).c0.re = (g_spinor_field[2][ix].s0).c0.re - (g_spinor_field[0][ix].s0).c0.re;
	  (rsp.s0).c0.im = (g_spinor_field[2][ix].s0).c0.im - (g_spinor_field[0][ix].s0).c0.im;
	  (rsp.s0).c1.re = (g_spinor_field[2][ix].s0).c1.re - (g_spinor_field[0][ix].s0).c1.re;
	  (rsp.s0).c1.im = (g_spinor_field[2][ix].s0).c1.im - (g_spinor_field[0][ix].s0).c1.im;
	  (rsp.s0).c2.re = (g_spinor_field[2][ix].s0).c2.re - (g_spinor_field[0][ix].s0).c2.re;
	  (rsp.s0).c2.im = (g_spinor_field[2][ix].s0).c2.im - (g_spinor_field[0][ix].s0).c2.im;
	  (rsp.s1).c0.re = (g_spinor_field[2][ix].s1).c0.re - (g_spinor_field[0][ix].s1).c0.re;
	  (rsp.s1).c0.im = (g_spinor_field[2][ix].s1).c0.im - (g_spinor_field[0][ix].s1).c0.im;
	  (rsp.s1).c1.re = (g_spinor_field[2][ix].s1).c1.re - (g_spinor_field[0][ix].s1).c1.re;
	  (rsp.s1).c1.im = (g_spinor_field[2][ix].s1).c1.im - (g_spinor_field[0][ix].s1).c1.im;
	  (rsp.s1).c2.re = (g_spinor_field[2][ix].s1).c2.re - (g_spinor_field[0][ix].s1).c2.re;
	  (rsp.s1).c2.im = (g_spinor_field[2][ix].s1).c2.im - (g_spinor_field[0][ix].s1).c2.im;
	  (rsp.s2).c0.re = (g_spinor_field[2][ix].s2).c0.re - (g_spinor_field[0][ix].s2).c0.re;
	  (rsp.s2).c0.im = (g_spinor_field[2][ix].s2).c0.im - (g_spinor_field[0][ix].s2).c0.im;
	  (rsp.s2).c1.re = (g_spinor_field[2][ix].s2).c1.re - (g_spinor_field[0][ix].s2).c1.re;
	  (rsp.s2).c1.im = (g_spinor_field[2][ix].s2).c1.im - (g_spinor_field[0][ix].s2).c1.im;
	  (rsp.s2).c2.re = (g_spinor_field[2][ix].s2).c2.re - (g_spinor_field[0][ix].s2).c2.re;
	  (rsp.s2).c2.im = (g_spinor_field[2][ix].s2).c2.im - (g_spinor_field[0][ix].s2).c2.im;
	  (rsp.s3).c0.re = (g_spinor_field[2][ix].s3).c0.re - (g_spinor_field[0][ix].s3).c0.re;
	  (rsp.s3).c0.im = (g_spinor_field[2][ix].s3).c0.im - (g_spinor_field[0][ix].s3).c0.im;
	  (rsp.s3).c1.re = (g_spinor_field[2][ix].s3).c1.re - (g_spinor_field[0][ix].s3).c1.re;
	  (rsp.s3).c1.im = (g_spinor_field[2][ix].s3).c1.im - (g_spinor_field[0][ix].s3).c1.im;
	  (rsp.s3).c2.re = (g_spinor_field[2][ix].s3).c2.re - (g_spinor_field[0][ix].s3).c2.re;
	  (rsp.s3).c2.im = (g_spinor_field[2][ix].s3).c2.im - (g_spinor_field[0][ix].s3).c2.im;
	  _spinor_norm_sq(delta,rsp);
	  if (delta > 1.0e-12) {
	    nn[n] = g_eo2lexic[ix];
	    mm[n]=ix;
	    n++;	    
	  }
	  if(delta>deltamax) deltamax=delta;
	}
	if (n>0){
	  printf("mismatch in even spincolorfield in %d points:\n",n);
	  for(i=0; i< MIN(n,1000); i++){
	    printf("%d,(%d,%d,%d,%d):%f vs. %f\n",nn[i],g_coord[nn[i]][0],g_coord[nn[i]][1],g_coord[nn[i]][2],g_coord[nn[i]][3],(g_spinor_field[2][mm[i]].s0).c0.re, (g_spinor_field[0][mm[i]].s0).c0.re);fflush(stdout);
	  }
	}
	n = 0;
	for(ix=0;ix<VOLUME/2;ix++){
	  (rsp.s0).c0.re = (g_spinor_field[3][ix].s0).c0.re - (g_spinor_field[1][ix].s0).c0.re;
	  (rsp.s0).c0.im = (g_spinor_field[3][ix].s0).c0.im - (g_spinor_field[1][ix].s0).c0.im;
	  (rsp.s0).c1.re = (g_spinor_field[3][ix].s0).c1.re - (g_spinor_field[1][ix].s0).c1.re;
	  (rsp.s0).c1.im = (g_spinor_field[3][ix].s0).c1.im - (g_spinor_field[1][ix].s0).c1.im;
	  (rsp.s0).c2.re = (g_spinor_field[3][ix].s0).c2.re - (g_spinor_field[1][ix].s0).c2.re;
	  (rsp.s0).c2.im = (g_spinor_field[3][ix].s0).c2.im - (g_spinor_field[1][ix].s0).c2.im;
	  (rsp.s1).c0.re = (g_spinor_field[3][ix].s1).c0.re - (g_spinor_field[1][ix].s1).c0.re;
	  (rsp.s1).c0.im = (g_spinor_field[3][ix].s1).c0.im - (g_spinor_field[1][ix].s1).c0.im;
	  (rsp.s1).c1.re = (g_spinor_field[3][ix].s1).c1.re - (g_spinor_field[1][ix].s1).c1.re;
	  (rsp.s1).c1.im = (g_spinor_field[3][ix].s1).c1.im - (g_spinor_field[1][ix].s1).c1.im;
	  (rsp.s1).c2.re = (g_spinor_field[3][ix].s1).c2.re - (g_spinor_field[1][ix].s1).c2.re;
	  (rsp.s1).c2.im = (g_spinor_field[3][ix].s1).c2.im - (g_spinor_field[1][ix].s1).c2.im;
	  (rsp.s2).c0.re = (g_spinor_field[3][ix].s2).c0.re - (g_spinor_field[1][ix].s2).c0.re;
	  (rsp.s2).c0.im = (g_spinor_field[3][ix].s2).c0.im - (g_spinor_field[1][ix].s2).c0.im;
	  (rsp.s2).c1.re = (g_spinor_field[3][ix].s2).c1.re - (g_spinor_field[1][ix].s2).c1.re;
	  (rsp.s2).c1.im = (g_spinor_field[3][ix].s2).c1.im - (g_spinor_field[1][ix].s2).c1.im;
	  (rsp.s2).c2.re = (g_spinor_field[3][ix].s2).c2.re - (g_spinor_field[1][ix].s2).c2.re;
	  (rsp.s2).c2.im = (g_spinor_field[3][ix].s2).c2.im - (g_spinor_field[1][ix].s2).c2.im;
	  (rsp.s3).c0.re = (g_spinor_field[3][ix].s3).c0.re - (g_spinor_field[1][ix].s3).c0.re;
	  (rsp.s3).c0.im = (g_spinor_field[3][ix].s3).c0.im - (g_spinor_field[1][ix].s3).c0.im;
	  (rsp.s3).c1.re = (g_spinor_field[3][ix].s3).c1.re - (g_spinor_field[1][ix].s3).c1.re;
	  (rsp.s3).c1.im = (g_spinor_field[3][ix].s3).c1.im - (g_spinor_field[1][ix].s3).c1.im;
	  (rsp.s3).c2.re = (g_spinor_field[3][ix].s3).c2.re - (g_spinor_field[1][ix].s3).c2.re;
	  (rsp.s3).c2.im = (g_spinor_field[3][ix].s3).c2.im - (g_spinor_field[1][ix].s3).c2.im;
	  _spinor_norm_sq(delta,rsp);
	  if (delta > 1.0e-12) {
	    nn[n]=g_eo2lexic[ix+(VOLUME+RAND)/2];
	    mm[n]=ix;
	    n++;	    
	  }
	  if(delta>deltamax) deltamax=delta;
	}
	if (n>0){
	  printf("mismatch in odd spincolorfield in %d points:\n",n);
	  for(i=0; i< MIN(n,1000); i++){
	    printf("%d,(%d,%d,%d,%d):%f vs. %f\n",nn[i],g_coord[nn[i]][0],g_coord[nn[i]][1],g_coord[nn[i]][2],g_coord[nn[i]][3],(g_spinor_field[3][mm[i]].s0).c0.re, (g_spinor_field[1][mm[i]].s0).c0.re);fflush(stdout);
	  }
	}
	printf("max delta=%e",deltamax);fflush(stdout);
      }
# endif
    }
    
    if (read_source_flag > 0 && write_cp_flag == 0) { /* read-source yes or nobutsave; checkpoint no */
      /* first spinorial arg is output, the second is input */
      Hopping_Matrix(1, g_spinor_field[1], g_spinor_field[0]);      /*ieo=1 M_{eo}*/
      Hopping_Matrix(0, g_spinor_field[0], g_spinor_field[1]);      /*ieo=0 M_{oe}*/
      strcat(SourceInfo.basename,".out");
      write_spinorfield_cm_single(g_spinor_field[0],g_spinor_field[1],SourceInfo.basename);
      printf("Check-field printed. Exiting...\n");
      fflush(stdout);
    }

#ifdef MPI
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Finalize();
#endif
  }

  free_gauge_field();
  free_geometry_indices();
  free_spinor_field();
  free_moment_field();
  return(0);
}
Example #8
0
/* Serially Checked ! */
void Dtm_psi(spinor * const P, spinor * const Q){

  if(P==Q){
    printf("Error in Dtm_psi (D_psi.c):\n");
    printf("Arguments must be differen spinor fields\n");
    printf("Program aborted\n");
    exit(1);
  }

#ifdef _GAUGE_COPY2
  if(g_update_gauge_copy) {
    update_backward_gauge(g_gauge_field);
  }
#endif

# if defined TM_USE_MPI
  xchange_lexicfield(Q);
# endif

#ifdef TM_USE_OMP
#pragma omp parallel
  {
#endif
    int ix,iy,iz;
    su3 *up,*um;
    spinor *s,*sp,*sm,*rn;
    _Complex double fact1, fact2;
    spinor rs __attribute__ ((aligned (16)));

    fact1 = 1. + g_mu * I;
    fact2 = conj(fact1);

#ifndef TM_USE_OMP
    iy=g_iup[0][0];
    sp=(spinor *) Q + iy;
    up=&g_gauge_field[0][0];
#endif

    /************************ loop over all lattice sites *************************/
#ifdef TM_USE_OMP
#pragma omp for
#endif
    for (ix=0;ix<VOLUME;ix++){
#ifdef TM_USE_OMP
      iy=g_iup[ix][0];
      up=&g_gauge_field[ix][0];
      sp=(spinor *) Q + iy;
#endif
      s=(spinor *) Q + ix;
      _prefetch_spinor(s);

      /******************************* direction +0 *********************************/

      iy=g_idn[ix][0];
      
      sm = (spinor *) Q + iy;
      _prefetch_spinor(sm);       

      _sse_load(sp->s0);
      _sse_load_up(sp->s2);
      _sse_vector_add();

      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_0);
      _sse_store_up(rs.s2);

      // the diagonal bit
      _sse_load_up(s->s0);
      _sse_vector_cmplx_mul(fact1);
      _sse_load(rs.s2);
      _sse_vector_add();
      _sse_store(rs.s0);

      // g5 in the twisted term
      _sse_load_up(s->s2);
      _sse_vector_cmplx_mul(fact2);
      _sse_load(rs.s2);
      _sse_vector_add();
      _sse_store(rs.s2);      
      
      um=&g_gauge_field[iy][0];
      _prefetch_su3(um);
      
      _sse_load(sp->s1);
      _sse_load_up(sp->s3);
      _sse_vector_add();
      
      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_0);
      _sse_store_up(rs.s3);
    
      // the diagonal bit
      _sse_load_up(s->s1);
      _sse_vector_cmplx_mul(fact1);
      _sse_load(rs.s3);
      _sse_vector_add();
      _sse_store(rs.s1);

      // g5 in the twisted term
      _sse_load_up(s->s3);
      _sse_vector_cmplx_mul(fact2);
      _sse_load(rs.s3);
      _sse_vector_add();
      _sse_store(rs.s3); 

      /******************************* direction -0 *********************************/

      iy=g_iup[ix][1];

      sp = (spinor *) Q + iy;
      _prefetch_spinor(sp);

      _sse_load(sm->s0);
      _sse_load_up(sm->s2);
      _sse_vector_sub();
      
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_0);
      _sse_load(rs.s0);
      _sse_vector_add();
      _sse_store(rs.s0);

      _sse_load(rs.s2);
      _sse_vector_sub();
      _sse_store(rs.s2);
      
      up+=1;
      _prefetch_su3(up);
      
      _sse_load(sm->s1);
      _sse_load_up(sm->s3);
      _sse_vector_sub();
      
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_0);
      _sse_load(rs.s1);
      _sse_vector_add();
      _sse_store(rs.s1);

      _sse_load(rs.s3);
      _sse_vector_sub();
      _sse_store(rs.s3);
      
      /******************************* direction +1 *********************************/

      iy=g_idn[ix][1];
      
      sm = (spinor *) Q + iy;
      _prefetch_spinor(sm);

      _sse_load(sp->s0);
      _sse_load_up(sp->s3);
      _sse_vector_i_mul();
      _sse_vector_add();

      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_1);
      _sse_load(rs.s0);
      _sse_vector_add();
      _sse_store(rs.s0);

      _sse_load(rs.s3);
      _sse_vector_i_mul();      
      _sse_vector_sub();
      _sse_store(rs.s3); 
      
      um=&g_gauge_field[iy][1];
      _prefetch_su3(um);

      _sse_load(sp->s1);
      _sse_load_up(sp->s2);
      _sse_vector_i_mul();
      _sse_vector_add();

      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_1);
      _sse_load(rs.s1);
      _sse_vector_add();
      _sse_store(rs.s1);

      _sse_load(rs.s2);
      _sse_vector_i_mul();      
      _sse_vector_sub();
      _sse_store(rs.s2);       

      /******************************* direction -1 *********************************/

      iy=g_iup[ix][2];

      sp = (spinor *) Q + iy;
      _prefetch_spinor(sp);

      _sse_load(sm->s0);
      _sse_load_up(sm->s3);
      _sse_vector_i_mul();
      _sse_vector_sub();
      
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_1);
      _sse_load(rs.s0);
      _sse_vector_add();
      _sse_store(rs.s0);

      _sse_load(rs.s3);
      _sse_vector_i_mul();      
      _sse_vector_add();
      _sse_store(rs.s3);

      up+=1;
      _prefetch_su3(up);

      _sse_load(sm->s1);
      _sse_load_up(sm->s2);
      _sse_vector_i_mul();
      _sse_vector_sub();
      
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_1);
      _sse_load(rs.s1);
      _sse_vector_add();
      _sse_store(rs.s1);

      _sse_load(rs.s2);
      _sse_vector_i_mul();      
      _sse_vector_add();
      _sse_store(rs.s2);

      /******************************* direction +2 *********************************/

      iy=g_idn[ix][2];

      sm = (spinor *) Q + iy;
      _prefetch_spinor(sm);

      _sse_load(sp->s0);
      _sse_load_up(sp->s3);
      _sse_vector_add();

      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_2);
      _sse_load(rs.s0);
      _sse_vector_add();
      _sse_store(rs.s0);

      _sse_load(rs.s3);
      _sse_vector_add();
      _sse_store(rs.s3);
      
      um=&g_gauge_field[iy][2];
      _prefetch_su3(um);

      _sse_load(sp->s1);
      _sse_load_up(sp->s2);
      _sse_vector_sub();

      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_2);
      _sse_load(rs.s1);
      _sse_vector_add();
      _sse_store(rs.s1);

      _sse_load(rs.s2);
      _sse_vector_sub();
      _sse_store(rs.s2);      

      /******************************* direction -2 *********************************/

      iy=g_iup[ix][3];

      sp = (spinor *) Q + iy;
      _prefetch_spinor(sp);

      _sse_load(sm->s0);
      _sse_load_up(sm->s3);
      _sse_vector_sub();
      
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_2);
      _sse_load(rs.s0);
      _sse_vector_add();
      _sse_store(rs.s0);

      _sse_load(rs.s3);
      _sse_vector_sub();
      _sse_store(rs.s3);
      
      up+=1;
      _prefetch_su3(up);

      _sse_load(sm->s1);
      _sse_load_up(sm->s2);
      _sse_vector_add();
      
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_2);
      _sse_load(rs.s1);
      _sse_vector_add();
      _sse_store(rs.s1);

      _sse_load(rs.s2);
      _sse_vector_add();
      _sse_store(rs.s2);      
      
      /******************************* direction +3 *********************************/

      iy=g_idn[ix][3];

      sm = (spinor *) Q + iy;
      _prefetch_spinor(sm);

      _sse_load(sp->s0);
      _sse_load_up(sp->s2);
      _sse_vector_i_mul();
      _sse_vector_add();

      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_3);
      _sse_load(rs.s0);
      _sse_vector_add();
      _sse_store(rs.s0);

      _sse_load(rs.s2);
      _sse_vector_i_mul();      
      _sse_vector_sub();
      _sse_store(rs.s2);
      
      um=&g_gauge_field[iy][3];
      _prefetch_su3(um);

      _sse_load(sp->s1);
      _sse_load_up(sp->s3);
      _sse_vector_i_mul();
      _sse_vector_sub();

      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_3);
      _sse_load(rs.s1);
      _sse_vector_add();
      _sse_store(rs.s1);

      _sse_load(rs.s3);
      _sse_vector_i_mul();      
      _sse_vector_add();
      _sse_store(rs.s3);
      
      /******************************* direction -3 *********************************/

      iz=(ix+1+VOLUME)%VOLUME;

      iy=g_iup[iz][0];
      
      sp = (spinor *) Q + iy;
      _prefetch_spinor(sp);

      _sse_load(sm->s0);
      _sse_load_up(sm->s2);
      _sse_vector_i_mul();
      _sse_vector_sub();
      
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_3);
      rn = (spinor *) P + ix;
      
      _sse_load(rs.s0);
      _sse_vector_add();
      _sse_store_nt(rn->s0);

      _sse_load(rs.s2);
      _sse_vector_i_mul();      
      _sse_vector_add();
      _sse_store_nt(rn->s2);

      up=&g_gauge_field[iz][0];
      _prefetch_su3(up);

      _sse_load(sm->s1);
      _sse_load_up(sm->s3);
      _sse_vector_i_mul();
      _sse_vector_add();
      
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_3);
      _sse_load(rs.s1);
      _sse_vector_add();
      _sse_store_nt(rn->s1);

      _sse_load(rs.s3);
      _sse_vector_i_mul();      
      _sse_vector_sub();
      _sse_store_nt(rn->s3);
      
      /******************************** end of loop *********************************/

    }
#ifdef TM_USE_OMP
  } /* OpenMP closing brace */
#endif
}
Example #9
0
void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
  int ix;
  su3 * restrict ALIGN U;
  spinor * restrict ALIGN s;
  halfspinor * restrict * phi ALIGN;
  halfspinor32 * restrict * phi32 ALIGN;
  /* We have 32 registers available */
  _declare_hregs();

#ifdef _KOJAK_INST
#pragma pomp inst begin(hoppingmatrix)
#endif
#pragma disjoint(*s, *U)

#ifdef _GAUGE_COPY
  if(g_update_gauge_copy) {
    update_backward_gauge(g_gauge_field);
  }
#endif

  __alignx(16, l);
  __alignx(16, k);
  if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
    __alignx(16, HalfSpinor32);
    /* We will run through the source vector now */
    /* instead of the solution vector            */
    s = k;
    _prefetch_spinor(s); 

    /* s contains the source vector */

    if(ieo == 0) {
      U = g_gauge_field_copy[0][0];
    }
    else {
      U = g_gauge_field_copy[1][0];
    }
    phi32 = NBPointer32[ieo];

    _prefetch_su3(U);
    /**************** loop over all lattice sites ******************/
    ix=0;
    for(int i = 0; i < (VOLUME)/2; i++){
      /*********************** direction +0 ************************/
      _hop_t_p_pre32();
      s++; 
      U++;
      ix++;

      /*********************** direction -0 ************************/
      _hop_t_m_pre32();
      ix++;

      /*********************** direction +1 ************************/
      _hop_x_p_pre32();
      ix++;
      U++;

      /*********************** direction -1 ************************/
      _hop_x_m_pre32();
      ix++;

      /*********************** direction +2 ************************/
      _hop_y_p_pre32();

      ix++;
      U++;

      /*********************** direction -2 ************************/
      _hop_y_m_pre32();
      ix++;

      /*********************** direction +3 ************************/
      _hop_z_p_pre32();
      ix++;
      U++;

      /*********************** direction -3 ************************/
      _hop_z_m_pre32();
      ix++;

      /************************ end of loop ************************/
    }

#    if (defined TM_USE_MPI && !defined _NO_COMM)
    xchange_halffield32(); 
#    endif
    s = l;
    phi32 = NBPointer32[2 + ieo];
    if(ieo == 0) {
      U = g_gauge_field_copy[1][0];
    }
    else {
      U = g_gauge_field_copy[0][0];
    }
    //_prefetch_halfspinor(phi32[0]);
    _prefetch_su3(U);
  
    /* Now we sum up and expand to a full spinor */
    ix = 0;
    /*   _prefetch_spinor_for_store(s); */
    for(int i = 0; i < (VOLUME)/2; i++){
      /* This causes a lot of trouble, do we understand this? */
      /*     _prefetch_spinor_for_store(s); */
      //_prefetch_halfspinor(phi32[ix+1]);
      /*********************** direction +0 ************************/
      _hop_t_p_post32();
      ix++;
      /*********************** direction -0 ************************/
      _hop_t_m_post32();
      U++;
      ix++;
      /*********************** direction +1 ************************/
      _hop_x_p_post32();
      ix++;
      /*********************** direction -1 ************************/
      _hop_x_m_post32();
      U++;
      ix++;
      /*********************** direction +2 ************************/
      _hop_y_p_post32();
      ix++;
      /*********************** direction -2 ************************/
      _hop_y_m_post32();
      U++;
      ix++;
      /*********************** direction +3 ************************/
      _hop_z_p_post32();
      ix++;
      /*********************** direction -3 ************************/
      _hop_z_m_post32();
      U++;
      ix++;
      s++;
    }
  }
  else {
    __alignx(16, HalfSpinor);
    /* We will run through the source vector now */
    /* instead of the solution vector            */
    s = k;
    _prefetch_spinor(s); 

    /* s contains the source vector */

    if(ieo == 0) {
      U = g_gauge_field_copy[0][0];
    }
    else {
      U = g_gauge_field_copy[1][0];
    }
    phi = NBPointer[ieo];

    _prefetch_su3(U);
    /**************** loop over all lattice sites ******************/
    ix=0;
    for(int i = 0; i < (VOLUME)/2; i++){
      /*********************** direction +0 ************************/
      _hop_t_p_pre();
      s++; 
      U++;
      ix++;

      /*********************** direction -0 ************************/
      _hop_t_m_pre();
      ix++;

      /*********************** direction +1 ************************/
      _hop_x_p_pre();
      ix++;
      U++;

      /*********************** direction -1 ************************/
      _hop_x_m_pre();
      ix++;


      /*********************** direction +2 ************************/
      _hop_y_p_pre();
      ix++;
      U++;

      /*********************** direction -2 ************************/
      _hop_y_m_pre();
      ix++;

      /*********************** direction +3 ************************/
      _hop_z_p_pre();
      ix++;
      U++;

      /*********************** direction -3 ************************/
      _hop_z_m_pre();
      ix++;

      /************************ end of loop ************************/

    }

#    if (defined TM_USE_MPI && !defined _NO_COMM)
    xchange_halffield(); 
#    endif
    s = l;
    phi = NBPointer[2 + ieo];
    //_prefetch_halfspinor(phi[0]);
    if(ieo == 0) {
      U = g_gauge_field_copy[1][0];
    }
    else {
      U = g_gauge_field_copy[0][0];
    }
    _prefetch_su3(U);
  
    /* Now we sum up and expand to a full spinor */
    ix = 0;
    /*   _prefetch_spinor_for_store(s); */
    for(int i = 0; i < (VOLUME)/2; i++){
      /* This causes a lot of trouble, do we understand this? */
      /*     _prefetch_spinor_for_store(s); */
      //_prefetch_halfspinor(phi[ix+1]);
      /*********************** direction +0 ************************/
      _hop_t_p_post();
      ix++;
      /*********************** direction -0 ************************/
      _hop_t_m_post();
      U++;
      ix++;
      /*********************** direction +1 ************************/
      _hop_x_p_post();
      ix++;
      /*********************** direction -1 ************************/
      _hop_x_m_post();
      U++;
      ix++;
      /*********************** direction +2 ************************/
      _hop_y_p_post();
      ix++;
      /*********************** direction -2 ************************/
      _hop_y_m_post();
      U++;
      ix++;
      /*********************** direction +3 ************************/
      _hop_z_p_post();
      ix++;
      /*********************** direction -3 ************************/
      _hop_z_m_post();
      U++;
      ix++;
      s++;
    }
  }
#ifdef _KOJAK_INST
#pragma pomp inst end(hoppingmatrix)
#endif
}
Example #10
0
/* for ieo=0, k resides on  odd sites and l on even sites */
void Hopping_Matrix(int ieo, spinor * const l, spinor * const k){
  int ix,iy;
  int ioff,ioff2,icx,icy;
  su3 * restrict up, * restrict um;
  spinor * restrict r, * restrict sp, * restrict sm;
  spinor temp;

#ifdef _GAUGE_COPY
  if(g_update_gauge_copy) {
    update_backward_gauge();
  }
#endif

  /* for parallelization */
#    if (defined MPI && !(defined _NO_COMM))
  xchange_field(k, ieo);
#    endif

  if(k == l){
    printf("Error in H_psi (simple.c):\n");
    printf("Arguments k and l must be different\n");
    printf("Program aborted\n");
    exit(1);
  }
  if(ieo == 0){
    ioff = 0;
  } 
  else{
    ioff = (VOLUME+RAND)/2;
  } 
  ioff2 = (VOLUME+RAND)/2-ioff;
  /**************** loop over all lattice sites ****************/

  for (icx = ioff; icx < (VOLUME/2 + ioff); icx++){
    ix=g_eo2lexic[icx];

    r=l+(icx-ioff);

    /*********************** direction +0 ************************/
    iy=g_iup[ix][0]; icy=g_lexic2eosub[iy];


    sp=k+icy;
#    if ((defined _GAUGE_COPY))
    up=&g_gauge_field_copy[icx][0];
#    else
    up=&g_gauge_field[ix][0];
#    endif
      
    _vector_add(psi,(*sp).s0,(*sp).s2);

    _su3_multiply(chi,(*up),psi);
    _complex_times_vector(psi,ka0,chi);
      
    _vector_assign(temp.s0,psi);
    _vector_assign(temp.s2,psi);

    _vector_add(psi,(*sp).s1,(*sp).s3);

    _su3_multiply(chi,(*up),psi);
    _complex_times_vector(psi,ka0,chi);
            
    _vector_assign(temp.s1,psi);
    _vector_assign(temp.s3,psi);

    /*********************** direction -0 ************************/

    iy=g_idn[ix][0]; icy=g_lexic2eosub[iy];

    sm=k+icy;
#    if ((defined _GAUGE_COPY))
    um = up+1;
#    else
    um=&g_gauge_field[iy][0];
#    endif

    _vector_sub(psi,(*sm).s0,(*sm).s2);

    _su3_inverse_multiply(chi,(*um),psi);
    _complexcjg_times_vector(psi,ka0,chi);

    _vector_add_assign(temp.s0,psi);
    _vector_sub_assign(temp.s2,psi);

    _vector_sub(psi,(*sm).s1,(*sm).s3);

    _su3_inverse_multiply(chi,(*um),psi);
    _complexcjg_times_vector(psi,ka0,chi);
      
    _vector_add_assign(temp.s1,psi);
    _vector_sub_assign(temp.s3,psi);

    /*********************** direction +1 ************************/

    iy=g_iup[ix][1]; icy=g_lexic2eosub[iy];

    sp=k+icy;

#    if ((defined _GAUGE_COPY))
    up=um+1;
#    else
    up+=1;
#    endif
      
    _vector_i_add(psi,(*sp).s0,(*sp).s3);

    _su3_multiply(chi,(*up),psi);
    _complex_times_vector(psi,ka1,chi);

    _vector_add_assign(temp.s0,psi);
    _vector_i_sub_assign(temp.s3,psi);

    _vector_i_add(psi,(*sp).s1,(*sp).s2);

    _su3_multiply(chi,(*up),psi);
    _complex_times_vector(psi,ka1,chi);

    _vector_add_assign(temp.s1,psi);
    _vector_i_sub_assign(temp.s2,psi);

    /*********************** direction -1 ************************/

    iy=g_idn[ix][1]; icy=g_lexic2eosub[iy];

    sm=k+icy;
#    ifndef _GAUGE_COPY
    um=&g_gauge_field[iy][1];
#    else
    um=up+1;
#    endif

    _vector_i_sub(psi,(*sm).s0,(*sm).s3);

    _su3_inverse_multiply(chi,(*um),psi);
    _complexcjg_times_vector(psi,ka1,chi);

    _vector_add_assign(temp.s0,psi);
    _vector_i_add_assign(temp.s3,psi);

    _vector_i_sub(psi,(*sm).s1,(*sm).s2);

    _su3_inverse_multiply(chi,(*um),psi);
    _complexcjg_times_vector(psi,ka1,chi);

    _vector_add_assign(temp.s1,psi);
    _vector_i_add_assign(temp.s2,psi);

    /*********************** direction +2 ************************/

    iy=g_iup[ix][2]; icy=g_lexic2eosub[iy];

    sp=k+icy;
#    if ((defined _GAUGE_COPY))
    up=um+1;
#    else
    up+=1;
#    endif 
    _vector_add(psi,(*sp).s0,(*sp).s3);

    _su3_multiply(chi,(*up),psi);
    _complex_times_vector(psi,ka2,chi);

    _vector_add_assign(temp.s0,psi);
    _vector_add_assign(temp.s3,psi);

    _vector_sub(psi,(*sp).s1,(*sp).s2);

    _su3_multiply(chi,(*up),psi);
    _complex_times_vector(psi,ka2,chi);
      
    _vector_add_assign(temp.s1,psi);
    _vector_sub_assign(temp.s2,psi);


    /*********************** direction -2 ************************/

    iy=g_idn[ix][2]; icy=g_lexic2eosub[iy];

    sm=k+icy;
#    ifndef _GAUGE_COPY
    um = &g_gauge_field[iy][2];
#    else
    um = up +1;
#    endif

    _vector_sub(psi,(*sm).s0,(*sm).s3);

    _su3_inverse_multiply(chi,(*um),psi);
    _complexcjg_times_vector(psi,ka2,chi);

    _vector_add_assign(temp.s0,psi);
    _vector_sub_assign(temp.s3,psi);

    _vector_add(psi,(*sm).s1,(*sm).s2);

    _su3_inverse_multiply(chi,(*um),psi);
    _complexcjg_times_vector(psi,ka2,chi);
      
    _vector_add_assign(temp.s1,psi);
    _vector_add_assign(temp.s2,psi);

    /*********************** direction +3 ************************/

    iy=g_iup[ix][3]; icy=g_lexic2eosub[iy];

    sp=k+icy;
#    if ((defined _GAUGE_COPY))
    up=um+1;
#    else
    up+=1;
#    endif 
    _vector_i_add(psi,(*sp).s0,(*sp).s2);
      
    _su3_multiply(chi,(*up),psi);
    _complex_times_vector(psi,ka3,chi);

    _vector_add_assign(temp.s0,psi);
    _vector_i_sub_assign(temp.s2,psi);

    _vector_i_sub(psi,(*sp).s1,(*sp).s3);

    _su3_multiply(chi,(*up),psi);
    _complex_times_vector(psi,ka3,chi);

    _vector_add_assign(temp.s1,psi);
    _vector_i_add_assign(temp.s3,psi);

    /*********************** direction -3 ************************/

    iy=g_idn[ix][3]; icy=g_lexic2eosub[iy];

    sm=k+icy;
#    ifndef _GAUGE_COPY
    um = &g_gauge_field[iy][3];
#    else
    um = up+1;
#    endif

    _vector_i_sub(psi,(*sm).s0,(*sm).s2);

    _su3_inverse_multiply(chi,(*um),psi);
    _complexcjg_times_vector(psi,ka3,chi);
      
    _vector_add((*r).s0, temp.s0, psi);
    _vector_i_add((*r).s2, temp.s2, psi);

    _vector_i_add(psi,(*sm).s1,(*sm).s3);

    _su3_inverse_multiply(chi,(*um),psi);
    _complexcjg_times_vector(psi,ka3,chi);

    _vector_add((*r).s1, temp.s1, psi);
    _vector_i_sub((*r).s3, temp.s3, psi);
    /************************ end of loop ************************/
  }
}
Example #11
0
int main(int argc,char *argv[]) {

  FILE *parameterfile=NULL;
  int c, j, is=0, ic=0;
  int x, X, y, Y, z, Z, t, tt, i, sum;
  char * filename = NULL;
  char datafilename[50];
  char parameterfilename[50];
  char conf_filename[50];
  char * input_filename = NULL;
  double plaquette_energy, nrm;
  double * norm;
  struct stout_parameters params_smear;
  
#ifdef _GAUGE_COPY
  int kb=0;
#endif
#ifdef MPI
  double atime=0., etime=0.;
#endif
#ifdef _KOJAK_INST
#pragma pomp inst init
#pragma pomp inst begin(main)
#endif

  DUM_DERI = 6;
  /* DUM_DERI + 2 is enough (not 7) */
  DUM_SOLVER = DUM_DERI+2;
  DUM_MATRIX = DUM_SOLVER+6;
  /* DUM_MATRIX + 2 is enough (not 6) */
  NO_OF_SPINORFIELDS = DUM_MATRIX+2;

  verbose = 0;
  g_use_clover_flag = 0;
  g_nr_of_psf = 1;

#ifdef MPI
  MPI_Init(&argc, &argv);
#endif

  while ((c = getopt(argc, argv, "h?f:o:")) != -1) {
    switch (c) {
    case 'f': 
      input_filename = calloc(200, sizeof(char));
      strcpy(input_filename,optarg);
      break;
    case 'o':
      filename = calloc(200, sizeof(char));
      strcpy(filename,optarg);
      break;
    case 'h':
    case '?':
    default:
      usage();
      break;
    }
  }
  if(input_filename == NULL){
    input_filename = "hmc.input";
  }
  if(filename == NULL){
    filename = "output";
  } 

  /* Read the input file */
  read_input(input_filename);
  /* here we want no even/odd preconditioning */
  even_odd_flag = 0;

  /* this DBW2 stuff is not needed for the inversion ! */
  g_rgi_C1 = 0;
  if(Nsave == 0){
    Nsave = 1;
  }
  tmlqcd_mpi_init(argc, argv);

  g_dbw2rand = 0;

#ifndef MPI
  g_dbw2rand = 0;
#endif

#ifdef _GAUGE_COPY
  j = init_gauge_field(VOLUMEPLUSRAND, 1);
#else
  j = init_gauge_field(VOLUMEPLUSRAND, 0);
#endif
  if ( j!= 0) {
    fprintf(stderr, "Not enough memory for gauge_fields! Aborting...\n");
    exit(-1);
  }
  j = init_geometry_indices(VOLUMEPLUSRAND);
  if ( j!= 0) {
    fprintf(stderr, "Not enough memory for geometry indices! Aborting...\n");
    exit(-1);
  }
  if(even_odd_flag) {
    j = init_spinor_field(VOLUMEPLUSRAND/2, NO_OF_SPINORFIELDS);
  }
  else {
    j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS);
  }
  if ( j!= 0) {
    fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n");
    exit(-1);
  }

  g_mu = g_mu1; 
  if(g_proc_id == 0){    
    /*construct the filenames for the observables and the parameters*/
    strcpy(datafilename,filename);  strcat(datafilename,".data");
    strcpy(parameterfilename,filename);  strcat(parameterfilename,".para");
    
    parameterfile=fopen(parameterfilename, "w");
    write_first_messages(parameterfile, 0, 1);
  }

  /* define the geometry */
  geometry();

  /* define the boundary conditions for the fermion fields */
  boundary();

#ifdef _USE_HALFSPINOR
  j = init_dirac_halfspinor();
  if ( j!= 0) {
    fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
    exit(-1);
  }
  if(g_sloppy_precision_flag == 1) {
    j = init_dirac_halfspinor32();
    if ( j!= 0) {
      fprintf(stderr, "Not enough memory for 32-Bit halffield! Aborting...\n");
      exit(-1);
    }
  }
#  if (defined _PERSISTENT)
  init_xchange_halffield();
#  endif
#endif
  norm = (double*)calloc(3.*LX/2.+T/2., sizeof(double));

  for(j=0;j<Nmeas; j++) {
    sprintf(conf_filename,"%s.%.4d", gauge_input_filename, nstore);
    if (g_proc_id == 0){
      printf("Reading Gauge field from file %s\n", conf_filename); fflush(stdout);
    }
    read_lime_gauge_field(conf_filename);
    if (g_proc_id == 0){
      printf("done!\n"); fflush(stdout);
    }
#ifdef MPI
    xchange_gauge();
#endif
#ifdef _GAUGE_COPY
    update_backward_gauge();
#endif

    /* Compute minimal eigenvalues, if wanted */
    if(compute_evs != 0) {
      eigenvalues(&no_eigenvalues, 1000, eigenvalue_precision, 0, compute_evs, nstore, even_odd_flag);
    }
    /*compute the energy of the gauge field*/
    plaquette_energy = measure_gauge_action();

    if(g_proc_id == 0) {
      printf("The plaquette value is %e\n", plaquette_energy/(6.*VOLUME*g_nproc)); fflush(stdout);
    }
    if (use_stout_flag == 1){
      params_smear.rho = stout_rho;
      params_smear.iterations = stout_no_iter;
      if (stout_smear((su3_tuple*)(g_gauge_field[0]), &params_smear, (su3_tuple*)(g_gauge_field[0])) != 0)
        exit(1) ;
      g_update_gauge_copy = 1;
      g_update_gauge_energy = 1;
      g_update_rectangle_energy = 1;
      plaquette_energy = measure_gauge_action();

      if (g_proc_id == 0) {
        printf("# The plaquette value after stouting is %e\n", plaquette_energy / (6.*VOLUME*g_nproc));
        fflush(stdout);
      }
    }

    source_spinor_field(g_spinor_field[0], g_spinor_field[1], 0, 0);
    convert_eo_to_lexic(g_spinor_field[DUM_DERI], g_spinor_field[0], g_spinor_field[1]);
    D_psi(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]);
    if(even_odd_flag) {
      i = invert_eo(g_spinor_field[2], g_spinor_field[3], g_spinor_field[0], g_spinor_field[1], 
		    solver_precision, max_solver_iterations, solver_flag, g_relative_precision_flag,
		    sub_evs_cg_flag, even_odd_flag);
      convert_eo_to_lexic(g_spinor_field[DUM_DERI+1], g_spinor_field[2], g_spinor_field[3]);
    }


    for(i = 0; i < 3*LX/2+T/2; i++){
      norm[i] = 0.;
    }
    
    for(x = 0; x < LX; x++){
      if(x > LX/2) X = LX-x;
      else X = x;
      for(y = 0; y < LY; y++){
	if(y > LY/2) Y = LY-y;
	else Y = y;
	for(z = 0; z < LZ; z++){
	  if(z > LZ/2) Z = LZ-z;
	  else Z = z;
	  for(t = 0; t < T; t++){
	    if(t > T/2) tt = T - t;
	    else tt = t;
	    sum = X + Y + Z + tt;
	    _spinor_norm_sq(nrm, g_spinor_field[DUM_DERI+1][ g_ipt[t][x][y][z] ]);
/* 	    _spinor_norm_sq(nrm, qprop[0][0][1][ g_ipt[t][x][y][z] ]); */
 	    printf("%e %e\n", g_spinor_field[DUM_DERI+1][ g_ipt[t][x][y][z] ].s0.c0.re, g_spinor_field[DUM_DERI+1][ g_ipt[t][x][y][z] ].s0.c0.im);
	    nrm = sqrt( nrm );
	    printf("%1.12e\n", nrm);
	    if(nrm > norm[sum]) norm[sum] = nrm;
	  }
	}
      }
    }
    
    for(i = 0; i < 3*L/2+T/2; i++){
      printf("%d %1.12e\n", i, norm[i]);
    }
    printf("\n");
    
    nstore+=Nsave;
  }

#ifdef MPI
  MPI_Finalize();
#endif
  free_gauge_field();
  free_geometry_indices();
  free_spinor_field();
  free_moment_field();
  return(0);
#ifdef _KOJAK_INST
#pragma pomp inst end(main)
#endif
}
Example #12
0
/* input on k; output on l */
void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
  int icx,icz,ioff;
  int ix,iz;
  int x0,icx0,jj;
  su3 *restrict up;
  su3 * restrict um;
  spinor * restrict sp;
  spinor * restrict sm;
  spinor * restrict rn;

# if (defined MPI)
#  ifdef PARALLELX
#   define  REQC 4
#  elif defined PARALLELXY
#   define  REQC 8
#  elif defined PARALLELXYZ
#   define  REQC 12
#  endif
  MPI_Request requests[REQC];
  MPI_Status status[REQC];
# endif

#ifdef _GAUGE_COPY
  if(g_update_gauge_copy) {
    update_backward_gauge(g_gauge_field);
  }
#endif

  if(ieo == 0){ /* even out - odd in */
    ioff = 0;
  } 
  else{ /* odd out - even in */
    ioff = (VOLUME+RAND)/2;
  }

  /* Loop over time direction. This is the outmost loop */
  for(x0=0;x0<T;x0++){

    /* start the communication of the timslice borders (non-blocking send and receive)*/
#    if (defined MPI && !defined _NO_COMM)
   xchange_field_open(k, ieo, x0, requests, status);
#    endif
    

  /* loop over timeslice. At: contribution of timelike links  */
   icx0=g_1st_eot[x0][ieo];
   jj =0;
   um=&g_gauge_field_copyt[icx0][0]-1; /* allowed? */
   for(icx = icx0; icx < icx0+TEOSLICE; icx++){
     rn=l+(icx-ioff);
    /*********************** direction +0 ************************/

    sp=k+g_iup_eo[icx][0]; /* all sp,sm,up,um could be moved up */
    up=um+1;

#if (defined AVX)
    _avx_load(sp->s0);
    _avx_load_up(sp->s2);
    _avx_vector_add();

    _avx_su3_multiply((*up));
    _avx_vector_cmplx_mul(ka0);
    _avx_store_up(rn->s0);
    _avx_store_up(rn->s2);      
      
    _avx_load(sp->s1);
    _avx_load_up(sp->s3);
    _avx_vector_add();
      
    _avx_su3_multiply((*up));
    _avx_vector_cmplx_mul(ka0);
    _avx_store_up(rn->s1);
    _avx_store_up(rn->s3); 

    /*********************** direction -0 ************************/

    sm=k+g_idn_eo[icx][0];
    um=up+1;

    _avx_load(sm->s0);
    _avx_load_up(sm->s2);
    _avx_vector_sub();
      
    _avx_su3_inverse_multiply((*um));
    _avx_vector_cmplxcg_mul(ka0);
      
    _avx_load(rn->s0);
    _avx_vector_add();
    _avx_store(rn->s0);

    _avx_load(rn->s2);
    _avx_vector_sub();
    _avx_store(rn->s2);
      
    _avx_load(sm->s1);
    _avx_load_up(sm->s3);
    _avx_vector_sub();
      
    _avx_su3_inverse_multiply((*um));
    _avx_vector_cmplxcg_mul(ka0);
      
    _avx_load(rn->s1);
    _avx_vector_add();
    _avx_store(rn->s1);

    _avx_load(rn->s3);
    _avx_vector_sub();
    _avx_store(rn->s3);

#elif (defined SSE2 || defined SSE3)
    _sse_load(sp->s0);
    _sse_load_up(sp->s2);
    _sse_vector_add();

    _sse_su3_multiply((*up));
    _sse_vector_cmplx_mul(ka0);
    _sse_store_up(rn->s0);
    _sse_store_up(rn->s2);      
      
    _sse_load(sp->s1);
    _sse_load_up(sp->s3);
    _sse_vector_add();
      
    _sse_su3_multiply((*up));
    _sse_vector_cmplx_mul(ka0);
    _sse_store_up(rn->s1);
    _sse_store_up(rn->s3); 

    /*********************** direction -0 ************************/

    sm=k+g_idn_eo[icx][0];
    um=up+1;

    _sse_load(sm->s0);
    _sse_load_up(sm->s2);
    _sse_vector_sub();
      
    _sse_su3_inverse_multiply((*um));
    _sse_vector_cmplxcg_mul(ka0);
      
    _sse_load(rn->s0);
    _sse_vector_add();
    _sse_store(rn->s0);

    _sse_load(rn->s2);
    _sse_vector_sub();
    _sse_store(rn->s2);
      
    _sse_load(sm->s1);
    _sse_load_up(sm->s3);
    _sse_vector_sub();
      
    _sse_su3_inverse_multiply((*um));
    _sse_vector_cmplxcg_mul(ka0);
      
    _sse_load(rn->s1);
    _sse_vector_add();
    _sse_store(rn->s1);

    _sse_load(rn->s3);
    _sse_vector_sub();
    _sse_store(rn->s3);

#endif
    
    jj++;
   } /* end of loop over timeslice (At)*/

       
   /* complete the communication of the timslice borders (and wait) */
#if (defined MPI && !defined _NO_COMM)
   xchange_field_close(requests, status, REQC); /*    MPI_Waitall */
#endif

   /* loop over timeslice. Bt: contribution of spacelike links  */
   um=&g_gauge_field_copys[icx0][0]-1;
   for(icx = icx0; icx < icx0+TEOSLICE; icx++){
    ix=g_eo2lexic[icx];
    rn=l+(icx-ioff);
    /*********************** direction +1 ************************/

    sp=k+g_iup_eo[icx][1];
    up=um+1;

#if (defined AVX)
    _avx_load(sp->s0);
    _avx_load_up(sp->s3);
    _avx_vector_i_mul();
    _avx_vector_add();

    _avx_su3_multiply((*up));
    _avx_vector_cmplx_mul(ka1);

    _avx_load(rn->s0);
    _avx_vector_add();
    _avx_store(rn->s0);

    _avx_load(rn->s3);
    _avx_vector_i_mul();      
    _avx_vector_sub();
    _avx_store(rn->s3); 
      
    _avx_load(sp->s1);
    _avx_load_up(sp->s2);
    _avx_vector_i_mul();
    _avx_vector_add();

    _avx_su3_multiply((*up));
    _avx_vector_cmplx_mul(ka1);

    _avx_load(rn->s1);
    _avx_vector_add();
    _avx_store(rn->s1);

    _avx_load(rn->s2);
    _avx_vector_i_mul();      
    _avx_vector_sub();
    _avx_store(rn->s2);       

    /*********************** direction -1 ************************/

    sm=k+g_idn_eo[icx][1];
    um=up+1;

    _avx_load(sm->s0);
    _avx_load_up(sm->s3);
    _avx_vector_i_mul();
    _avx_vector_sub();
      
    _avx_su3_inverse_multiply((*um));
    _avx_vector_cmplxcg_mul(ka1);
      
    _avx_load(rn->s0);
    _avx_vector_add();
    _avx_store(rn->s0);

    _avx_load(rn->s3);
    _avx_vector_i_mul();      
    _avx_vector_add();
    _avx_store(rn->s3);

    _avx_load(sm->s1);
    _avx_load_up(sm->s2);
    _avx_vector_i_mul();
    _avx_vector_sub();
      
    _avx_su3_inverse_multiply((*um));
    _avx_vector_cmplxcg_mul(ka1);
      
    _avx_load(rn->s1);
    _avx_vector_add();
    _avx_store(rn->s1);

    _avx_load(rn->s2);
    _avx_vector_i_mul();      
    _avx_vector_add();
    _avx_store(rn->s2);

    /*********************** direction +2 ************************/

    sp=k+g_iup_eo[icx][2];
    up=um+1;

    _avx_load(sp->s0);
    _avx_load_up(sp->s3);
    _avx_vector_add();

    _avx_su3_multiply((*up));
    _avx_vector_cmplx_mul(ka2);

    _avx_load(rn->s0);
    _avx_vector_add();
    _avx_store(rn->s0);

    _avx_load(rn->s3);
    _avx_vector_add();
    _avx_store(rn->s3);
      
    _avx_load(sp->s1);
    _avx_load_up(sp->s2);
    _avx_vector_sub();

    _avx_su3_multiply((*up));
    _avx_vector_cmplx_mul(ka2);

    _avx_load(rn->s1);
    _avx_vector_add();
    _avx_store(rn->s1);

    _avx_load(rn->s2);
    _avx_vector_sub();
    _avx_store(rn->s2);      

    /*********************** direction -2 ************************/

    sm=k+g_idn_eo[icx][2];
    um=up+1;

    _avx_load(sm->s0);
    _avx_load_up(sm->s3);
    _avx_vector_sub();
      
    _avx_su3_inverse_multiply((*um));
    _avx_vector_cmplxcg_mul(ka2);
      
    _avx_load(rn->s0);
    _avx_vector_add();
    _avx_store(rn->s0);

    _avx_load(rn->s3);
    _avx_vector_sub();
    _avx_store(rn->s3);
      
    _avx_load(sm->s1);
    _avx_load_up(sm->s2);
    _avx_vector_add();
      
    _avx_su3_inverse_multiply((*um));
    _avx_vector_cmplxcg_mul(ka2);
      
    _avx_load(rn->s1);
    _avx_vector_add();
    _avx_store(rn->s1);

    _avx_load(rn->s2);
    _avx_vector_add();
    _avx_store(rn->s2);      
      
    /*********************** direction +3 ************************/

    sp=k+g_iup_eo[icx][3];
    up=um+1;

    _avx_load(sp->s0);
    _avx_load_up(sp->s2);
    _avx_vector_i_mul();
    _avx_vector_add();

    _avx_su3_multiply((*up));
    _avx_vector_cmplx_mul(ka3);

    _avx_load(rn->s0);
    _avx_vector_add();
    _avx_store(rn->s0);

    _avx_load(rn->s2);
    _avx_vector_i_mul();      
    _avx_vector_sub();
    _avx_store(rn->s2);
      
    _avx_load(sp->s1);
    _avx_load_up(sp->s3);
    _avx_vector_i_mul();
    _avx_vector_sub();

    _avx_su3_multiply((*up));
    _avx_vector_cmplx_mul(ka3);

    _avx_load(rn->s1);
    _avx_vector_add();
    _avx_store(rn->s1);

    _avx_load(rn->s3);
    _avx_vector_i_mul();      
    _avx_vector_add();
    _avx_store(rn->s3);
      
    /*********************** direction -3 ************************/

    sm=k+g_idn_eo[icx][3];
    um=up+1;

    _avx_load(sm->s0);
    _avx_load_up(sm->s2);
    _avx_vector_i_mul();
    _avx_vector_sub();
      
    _avx_su3_inverse_multiply((*um));
    _avx_vector_cmplxcg_mul(ka3);

    _avx_load(rn->s0);
    _avx_vector_add();
    _avx_store(rn->s0);

    _avx_load(rn->s2);
    _avx_vector_i_mul();
    _avx_vector_add();
    _avx_store(rn->s2);

    _avx_load(sm->s1);
    _avx_load_up(sm->s3);
    _avx_vector_i_mul();
    _avx_vector_add();
      
    _avx_su3_inverse_multiply((*um));
    _avx_vector_cmplxcg_mul(ka3);

    _avx_load(rn->s1);
    _avx_vector_add();
    _avx_store(rn->s1);

    _avx_load(rn->s3);
    _avx_vector_i_mul();      
    _avx_vector_sub();
    _avx_store(rn->s3);
    
#elif (defined SSE2 || defined SSE3)
    _sse_load(sp->s0);
    _sse_load_up(sp->s3);
    _sse_vector_i_mul();
    _sse_vector_add();

    _sse_su3_multiply((*up));
    _sse_vector_cmplx_mul(ka1);

    _sse_load(rn->s0);
    _sse_vector_add();
    _sse_store(rn->s0);

    _sse_load(rn->s3);
    _sse_vector_i_mul();      
    _sse_vector_sub();
    _sse_store(rn->s3); 
      
    _sse_load(sp->s1);
    _sse_load_up(sp->s2);
    _sse_vector_i_mul();
    _sse_vector_add();

    _sse_su3_multiply((*up));
    _sse_vector_cmplx_mul(ka1);

    _sse_load(rn->s1);
    _sse_vector_add();
    _sse_store(rn->s1);

    _sse_load(rn->s2);
    _sse_vector_i_mul();      
    _sse_vector_sub();
    _sse_store(rn->s2);       

    /*********************** direction -1 ************************/

    sm=k+g_idn_eo[icx][1];
    um=up+1;

    _sse_load(sm->s0);
    _sse_load_up(sm->s3);
    _sse_vector_i_mul();
    _sse_vector_sub();
      
    _sse_su3_inverse_multiply((*um));
    _sse_vector_cmplxcg_mul(ka1);
      
    _sse_load(rn->s0);
    _sse_vector_add();
    _sse_store(rn->s0);

    _sse_load(rn->s3);
    _sse_vector_i_mul();      
    _sse_vector_add();
    _sse_store(rn->s3);

    _sse_load(sm->s1);
    _sse_load_up(sm->s2);
    _sse_vector_i_mul();
    _sse_vector_sub();
      
    _sse_su3_inverse_multiply((*um));
    _sse_vector_cmplxcg_mul(ka1);
      
    _sse_load(rn->s1);
    _sse_vector_add();
    _sse_store(rn->s1);

    _sse_load(rn->s2);
    _sse_vector_i_mul();      
    _sse_vector_add();
    _sse_store(rn->s2);

    /*********************** direction +2 ************************/

    sp=k+g_iup_eo[icx][2];
    up=um+1;

    _sse_load(sp->s0);
    _sse_load_up(sp->s3);
    _sse_vector_add();

    _sse_su3_multiply((*up));
    _sse_vector_cmplx_mul(ka2);

    _sse_load(rn->s0);
    _sse_vector_add();
    _sse_store(rn->s0);

    _sse_load(rn->s3);
    _sse_vector_add();
    _sse_store(rn->s3);
      
    _sse_load(sp->s1);
    _sse_load_up(sp->s2);
    _sse_vector_sub();

    _sse_su3_multiply((*up));
    _sse_vector_cmplx_mul(ka2);

    _sse_load(rn->s1);
    _sse_vector_add();
    _sse_store(rn->s1);

    _sse_load(rn->s2);
    _sse_vector_sub();
    _sse_store(rn->s2);      

    /*********************** direction -2 ************************/

    sm=k+g_idn_eo[icx][2];
    um=up+1;

    _sse_load(sm->s0);
    _sse_load_up(sm->s3);
    _sse_vector_sub();
      
    _sse_su3_inverse_multiply((*um));
    _sse_vector_cmplxcg_mul(ka2);
      
    _sse_load(rn->s0);
    _sse_vector_add();
    _sse_store(rn->s0);

    _sse_load(rn->s3);
    _sse_vector_sub();
    _sse_store(rn->s3);
      
    _sse_load(sm->s1);
    _sse_load_up(sm->s2);
    _sse_vector_add();
      
    _sse_su3_inverse_multiply((*um));
    _sse_vector_cmplxcg_mul(ka2);
      
    _sse_load(rn->s1);
    _sse_vector_add();
    _sse_store(rn->s1);

    _sse_load(rn->s2);
    _sse_vector_add();
    _sse_store(rn->s2);      
      
    /*********************** direction +3 ************************/

    sp=k+g_iup_eo[icx][3];
    up=um+1;

    _sse_load(sp->s0);
    _sse_load_up(sp->s2);
    _sse_vector_i_mul();
    _sse_vector_add();

    _sse_su3_multiply((*up));
    _sse_vector_cmplx_mul(ka3);

    _sse_load(rn->s0);
    _sse_vector_add();
    _sse_store(rn->s0);

    _sse_load(rn->s2);
    _sse_vector_i_mul();      
    _sse_vector_sub();
    _sse_store(rn->s2);
      
    _sse_load(sp->s1);
    _sse_load_up(sp->s3);
    _sse_vector_i_mul();
    _sse_vector_sub();

    _sse_su3_multiply((*up));
    _sse_vector_cmplx_mul(ka3);

    _sse_load(rn->s1);
    _sse_vector_add();
    _sse_store(rn->s1);

    _sse_load(rn->s3);
    _sse_vector_i_mul();      
    _sse_vector_add();
    _sse_store(rn->s3);
      
    /*********************** direction -3 ************************/

    sm=k+g_idn_eo[icx][3];
    um=up+1;

    _sse_load(sm->s0);
    _sse_load_up(sm->s2);
    _sse_vector_i_mul();
    _sse_vector_sub();
      
    _sse_su3_inverse_multiply((*um));
    _sse_vector_cmplxcg_mul(ka3);

    _sse_load(rn->s0);
    _sse_vector_add();
    _sse_store(rn->s0);

    _sse_load(rn->s2);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_store(rn->s2);

    _sse_load(sm->s1);
    _sse_load_up(sm->s3);
    _sse_vector_i_mul();
    _sse_vector_add();
      
    _sse_su3_inverse_multiply((*um));
    _sse_vector_cmplxcg_mul(ka3);

    _sse_load(rn->s1);
    _sse_vector_add();
    _sse_store(rn->s1);

    _sse_load(rn->s3);
    _sse_vector_i_mul();      
    _sse_vector_sub();
    _sse_store(rn->s3);

#endif    
   }  /* end of loop over timeslice (Bt)*/
  } /* x0=0; x0<T */
}
Example #13
0
void jdher_bi(int n, int lda, double tau, double tol, 
	      int kmax, int jmax, int jmin, int itmax,
	      int blksize, int blkwise, 
	      int V0dim, complex *V0, 
	      int solver_flag, 
	      int linitmax, double eps_tr, double toldecay,
	      int verbosity,
	      int *k_conv, complex *Q, double *lambda, int *it,
	      int maxmin, const int shift_mode,
	      matrix_mult_bi A_psi){
  
  /****************************************************************************
   *                                                                          *
   * Local variables                                                          *
   *                                                                          *
   ****************************************************************************/
  
  /* constants */

  /* allocatables: 
   * initialize with NULL, so we can free even unallocated ptrs */
  double *s = NULL, *resnrm = NULL, *resnrm_old = NULL, *dtemp = NULL, *rwork = NULL;

  complex *V_ = NULL, *V, *Vtmp = NULL, *U = NULL, *M = NULL, *Z = NULL,
    *Res_ = NULL, *Res,
    *eigwork = NULL, *temp1_ = NULL, *temp1;

  int *idx1 = NULL, *idx2 = NULL, 
    *convind = NULL, *keepind = NULL, *solvestep = NULL, 
    *actcorrits = NULL;

  /* non-allocated ptrs */
  complex *q, *v, *u, *r = NULL;  
  /*   complex *matdummy, *vecdummy; */

  /* scalar vars */
  double theta, alpha, it_tol;

  int i, k, j, actblksize, eigworklen, found, conv, keep, n2, N = n*sizeof(complex)/sizeof(bispinor);
  int act, cnt, idummy, info, CntCorrIts=0, endflag=0;


  /* variables for random number generator */
  int IDIST = 1;
  int ISEED[4] = {2, 3, 5, 7};
  ISEED[0] = g_proc_id;

  /****************************************************************************
   *                                                                          *
   * Of course on the CRAY everything is different :( !!                      *
   * that's why we need something more.
   *                                                                          *
   ****************************************************************************/

#ifdef CRAY
  fupl_u = _cptofcd(cupl_u, strlen(cupl_u));
  fupl_c = _cptofcd(cupl_c, strlen(cupl_c));
  fupl_n = _cptofcd(cupl_n, strlen(cupl_n));
  fupl_a = _cptofcd(cupl_a, strlen(cupl_a));
  fupl_v = _cptofcd(cupl_v, strlen(cupl_v));
  filaenv = _cptofcd(cilaenv, strlen(cilaenv));
  fvu = _cptofcd(cvu, strlen(cvu));
#endif

  /****************************************************************************
   *                                                                          *
   * Execution starts here...                                                 *
   *                                                                          *
   ****************************************************************************/
  /*   NEW PART FOR GAUGE_COPY */
#ifdef _GAUGE_COPY
  update_backward_gauge();
#endif
  /* END NEW PART */

  /* print info header */
  if (verbosity > 1 && g_proc_id == 0) {
    printf("Jacobi-Davidson method for hermitian Matrices\n");
    printf("Solving  A*x = lambda*x \n\n");
    printf("  N=      %10d  ITMAX=%4d\n", n, itmax);
    printf("  KMAX=%3d  JMIN=%3d  JMAX=%3d  V0DIM=%3d\n", 
	   kmax, jmin, jmax, V0dim);
    printf("  BLKSIZE=        %2d  BLKWISE=      %5s\n", 
	   blksize, blkwise ? "TRUE" : "FALSE");
    printf("  TOL=  %11.4e TAU=  %11.4e\n", 
	   tol, tau);
    printf("  LINITMAX=    %5d  EPS_TR=  %10.3e  TOLDECAY=%9.2e\n", 
	   linitmax, eps_tr, toldecay);
    printf("\n Computing %s eigenvalues\n",
	   maxmin ? "maximal" : "minimal");
    printf("\n");
    fflush( stdout );
  }

  /* validate input parameters */
  if(tol <= 0) jderrorhandler(401,"");
  if(kmax <= 0 || kmax > n) jderrorhandler(402,"");
  if(jmax <= 0 || jmax > n) jderrorhandler(403,"");
  if(jmin <= 0 || jmin > jmax) jderrorhandler(404,"");
  if(itmax < 0) jderrorhandler(405,"");
  if(blksize > jmin || blksize > (jmax - jmin)) jderrorhandler(406,"");
  if(blksize <= 0 || blksize > kmax) jderrorhandler(406,"");
  if(blkwise < 0 || blkwise > 1) jderrorhandler(407,"");
  if(V0dim < 0 || V0dim >= jmax) jderrorhandler(408,"");
  if(linitmax < 0) jderrorhandler(409,"");
  if(eps_tr < 0.) jderrorhandler(500,"");
  if(toldecay <= 1.0) jderrorhandler(501,"");
  
  CONE.re=1.; CONE.im=0.;
  CZERO.re=0.; CZERO.im=0.;
  CMONE.re=-1.; CMONE.im=0.;

  /* Get hardware-dependent values:
   * Opt size of workspace for ZHEEV is (NB+1)*j, where NB is the opt.
   * block size... */
  eigworklen = (2 + _FT(ilaenv)(&ONE, filaenv, fvu, &jmax, &MONE, &MONE, &MONE, 6, 2)) * jmax;

  /* Allocating memory for matrices & vectors */ 
  if((void*)(V_ = (complex *)malloc((lda * jmax + 4) * sizeof(complex))) == NULL) {
    errno = 0;
    jderrorhandler(300,"V in jdher_bi");
  }
#if (defined SSE || defined SSE2 || defined SSE3)
  V = (complex*)(((unsigned long int)(V_)+ALIGN_BASE)&~ALIGN_BASE);
#else
  V = V_;
#endif
  if((void*)(U = (complex *)malloc(jmax * jmax * sizeof(complex))) == NULL) {
    jderrorhandler(300,"U in jdher_bi");
  }
  if((void*)(s = (double *)malloc(jmax * sizeof(double))) == NULL) {
    jderrorhandler(300,"s in jdher_bi");
  }
  if((void*)(Res_ = (complex *)malloc((lda * blksize+4) * sizeof(complex))) == NULL) {
    jderrorhandler(300,"Res in jdher_bi");
  }
#if (defined SSE || defined SSE2 || defined SSE3)
  Res = (complex*)(((unsigned long int)(Res_)+ALIGN_BASE)&~ALIGN_BASE);
#else
  Res = Res_;
#endif
  if((void*)(resnrm = (double *)malloc(blksize * sizeof(double))) == NULL) {
    jderrorhandler(300,"resnrm in jdher_bi");
  }
  if((void*)(resnrm_old = (double *)calloc(blksize,sizeof(double))) == NULL) {
    jderrorhandler(300,"resnrm_old in jdher_bi");
  }
  if((void*)(M = (complex *)malloc(jmax * jmax * sizeof(complex))) == NULL) {
    jderrorhandler(300,"M in jdher_bi");
  }
  if((void*)(Vtmp = (complex *)malloc(jmax * jmax * sizeof(complex))) == NULL) {
    jderrorhandler(300,"Vtmp in jdher_bi");
  }
  if((void*)(p_work_bi = (complex *)malloc(lda * sizeof(complex))) == NULL) {
    jderrorhandler(300,"p_work_bi in jdher_bi");
  }

  /* ... */
  if((void*)(idx1 = (int *)malloc(jmax * sizeof(int))) == NULL) {
    jderrorhandler(300,"idx1 in jdher_bi");
  }
  if((void*)(idx2 = (int *)malloc(jmax * sizeof(int))) == NULL) {
    jderrorhandler(300,"idx2 in jdher_bi");
  }

  /* Indices for (non-)converged approximations */
  if((void*)(convind = (int *)malloc(blksize * sizeof(int))) == NULL) {
    jderrorhandler(300,"convind in jdher_bi");
  }
  if((void*)(keepind = (int *)malloc(blksize * sizeof(int))) == NULL) {
    jderrorhandler(300,"keepind in jdher_bi");
  }
  if((void*)(solvestep = (int *)malloc(blksize * sizeof(int))) == NULL) {
    jderrorhandler(300,"solvestep in jdher_bi");
  }
  if((void*)(actcorrits = (int *)malloc(blksize * sizeof(int))) == NULL) {
    jderrorhandler(300,"actcorrits in jdher_bi");
  }

  if((void*)(eigwork = (complex *)malloc(eigworklen * sizeof(complex))) == NULL) {
    jderrorhandler(300,"eigwork in jdher_bi");
  }
  if((void*)(rwork = (double *)malloc(3*jmax * sizeof(double))) == NULL) {
    jderrorhandler(300,"rwork in jdher_bi");
  }
  if((void*)(temp1_ = (complex *)malloc((lda+4) * sizeof(complex))) == NULL) {
    jderrorhandler(300,"temp1 in jdher_bi");
  }
#if (defined SSE || defined SSE2 || defined SSE3)
  temp1 = (complex*)(((unsigned long int)(temp1_)+ALIGN_BASE)&~ALIGN_BASE);
#else
  temp1 = temp1_;
#endif
  if((void*)(dtemp = (double *)malloc(lda * sizeof(complex))) == NULL) {
    jderrorhandler(300,"dtemp in jdher_bi");
  }

  /* Set variables for Projection routines */
  n2 = 2*n;
  p_n = n;
  p_n2 = n2;
  p_Q_bi = Q;
  p_A_psi_bi = A_psi;
  p_lda = lda;
  
  /**************************************************************************
   *                                                                        *
   * Generate initial search subspace V. Vectors are taken from V0 and if   *
   * necessary randomly generated.                                          *
   *                                                                        *
   **************************************************************************/

  /* copy V0 to V */
  _FT(zlacpy)(fupl_a, &n, &V0dim, V0, &lda, V, &lda, 1);
  j = V0dim;
  /* if V0dim < blksize: generate additional random vectors */
  if (V0dim < blksize) {
    idummy = (blksize - V0dim)*n; /* nof random numbers */
    _FT(zlarnv)(&IDIST, ISEED, &idummy, V + V0dim*lda);
    j = blksize;
  }
  for (cnt = 0; cnt < j; cnt ++) {
    ModifiedGS_bi(V + cnt*lda, n, cnt, V, lda);
    alpha = sqrt(square_norm_bi((bispinor*)(V+cnt*lda), N));
    alpha = 1.0 / alpha;
    _FT(dscal)(&n2, &alpha, (double *)(V + cnt*lda), &ONE);
  }
  /* Generate interaction matrix M = V^dagger*A*V. Only the upper triangle
     is computed. */
  for (cnt = 0; cnt < j; cnt++){
    A_psi((bispinor*) temp1, (bispinor*) (V+cnt*lda));
    idummy = cnt+1;
    for(i = 0; i < idummy; i++) {
      M[cnt*jmax+i] = scalar_prod_bi((bispinor*)(V+i*lda), (bispinor*) temp1, N);
    }
  }
  /* Other initializations */
  k = 0; (*it) = 0; 
  if((*k_conv) > 0) {
    k = (*k_conv);
  }

  actblksize = blksize; 
  for(act = 0; act < blksize; act ++){
    solvestep[act] = 1;
  }


  /****************************************************************************
   *                                                                          *
   * Main JD-iteration loop                                                   *
   *                                                                          *
   ****************************************************************************/
  while((*it) < itmax) {
    /****************************************************************************
     *                                                                          *
     * Solving the projected eigenproblem                                       *
     *                                                                          *
     * M*u = V^dagger*A*V*u = s*u                                                     *
     * M is hermitian, only the upper triangle is stored                        *
     *                                                                          *
     ****************************************************************************/
    _FT(zlacpy)(fupl_u, &j, &j, M, &jmax, U, &jmax, 1);
    _FT(zheev)(fupl_v, fupl_u, &j, U, &jmax, s, eigwork, &eigworklen, rwork, &info, 1, 1); 

    if (info != 0) {
      printf("error solving the projected eigenproblem.");
      printf(" zheev: info = %d\n", info);
    }
    if(info != 0) jderrorhandler(502,"problem in zheev for jdher_bi");
  

    /* Reverse order of eigenvalues if maximal value is needed */
    if(maxmin == 1){
      sorteig(j, s, U, jmax, s[j-1], dtemp, idx1, idx2, 0); 
    }
    else{
      sorteig(j, s, U, jmax, 0., dtemp, idx1, idx2, 0); 
    }
    /****************************************************************************
     *                                                                          *
     * Convergence/Restart Check                                                *
     *                                                                          *
     * In case of convergence, strip off a whole block or just the converged    *
     * ones and put 'em into Q.  Update the matrices Q, V, U, s                 *
     *                                                                          *
     * In case of a restart update the V, U and M matrices and recompute the    *
     * Eigenvectors                                                             *
     *                                                                          *
     ****************************************************************************/

    found = 1;
    while(found) {

      /* conv/keep = Number of converged/non-converged Approximations */
      conv = 0; keep = 0;

      for(act=0; act < actblksize; act++){

	/* Setting pointers for single vectors */
	q = Q + (act+k)*lda; 
	u = U + act*jmax; 
	r = Res + act*lda; 
	
	/* Compute Ritz-Vector Q[:,k+cnt1]=V*U[:,cnt1] */
	theta = s[act];
	_FT(zgemv)(fupl_n, &n, &j, &CONE, V, &lda, u, &ONE, &CZERO, q, &ONE, 1);

	/* Compute the residual */
	A_psi((bispinor*) r, (bispinor*) q); 
	theta = -theta;
	_FT(daxpy)(&n2, &theta, (double*) q, &ONE, (double*) r, &ONE);

	/* Compute norm of the residual and update arrays convind/keepind*/
	resnrm_old[act] = resnrm[act];
	resnrm[act] = sqrt(square_norm_bi((bispinor*) r, N));
	if (resnrm[act] < tol){
	  convind[conv] = act; 
	  conv = conv + 1; 
	}
	else{
	  keepind[keep] = act; 
	  keep = keep + 1; 
	}
	
      }  /* for(act = 0; act < actblksize; act ++) */

      /* Check whether the blkwise-mode is chosen and ALL the
	 approximations converged, or whether the strip-off mode is
	 active and SOME of the approximations converged */

      found = ((blkwise==1 && conv==actblksize) || (blkwise==0 && conv!=0)) 
	&& (j > actblksize || k == kmax - actblksize);
      
      /***************************************************************************
       *                                                                        *
       * Convergence Case                                                       *
       *                                                                        *
       * In case of convergence, strip off a whole block or just the converged  *
       * ones and put 'em into Q.  Update the matrices Q, V, U, s               *
       *                                                                        *
       **************************************************************************/

      if (found) {

	/* Store Eigenvalues */
	for(act = 0; act < conv; act++)
	  lambda[k+act] = s[convind[act]];
	 
	/* Re-use non approximated Ritz-Values */
	for(act = 0; act < keep; act++)
	  s[act] = s[keepind[act]];

	/* Shift the others in the right position */
	for(act = 0; act < (j-actblksize); act ++)
	  s[act+keep] = s[act+actblksize];

	/* Update V. Re-use the V-Vectors not looked at yet. */
	idummy = j - actblksize;
	for (act = 0; act < n; act = act + jmax) {
	  cnt = act + jmax > n ? n-act : jmax;
	  _FT(zlacpy)(fupl_a, &cnt, &j, V+act, &lda, Vtmp, &jmax, 1);
	  _FT(zgemm)(fupl_n, fupl_n, &cnt, &idummy, &j, &CONE, Vtmp, 
		     &jmax, U+actblksize*jmax, &jmax, &CZERO, V+act+keep*lda, &lda, 1, 1);
	}

	/* Insert the not converged approximations as first columns in V */
	for(act = 0; act < keep; act++){
	  _FT(zlacpy)(fupl_a,&n,&ONE,Q+(k+keepind[act])*lda,&lda,V+act*lda,&lda,1);
	}

	/* Store Eigenvectors */
	for(act = 0; act < conv; act++){
	  _FT(zlacpy)(fupl_a,&n,&ONE,Q+(k+convind[act])*lda,&lda,Q+(k+act)*lda,&lda,1);
	}

	/* Update SearchSpaceSize j */
	j = j - conv;

	/* Let M become a diagonalmatrix with the Ritzvalues as entries ... */ 
	_FT(zlaset)(fupl_u, &j, &j, &CZERO, &CZERO, M, &jmax, 1);
	for (act = 0; act < j; act++){
	  M[act*jmax + act].re = s[act];
	}
	
	/* ... and U the Identity(jnew,jnew) */
	_FT(zlaset)(fupl_a, &j, &j, &CZERO, &CONE, U, &jmax, 1);

	if(shift_mode == 1){
	  if(maxmin == 0){
	    for(act = 0; act < conv; act ++){
	      if (lambda[k+act] > tau){
		tau = lambda[k+act];
	      }
	    }
	  }
	  else{
	    for(act = 0; act < conv; act ++){
	      if (lambda[k+act] < tau){
		tau = lambda[k+act];
	      }
	    } 
	  }
	}
	 
	/* Update Converged-Eigenpair-counter and Pro_k */
	k = k + conv;

	/* Update the new blocksize */
	actblksize=min(blksize, kmax-k);

	/* Exit main iteration loop when kmax eigenpairs have been
           approximated */
	if (k == kmax){
	  endflag = 1;
	  break;
	}
	/* Counter for the linear-solver-accuracy */
	for(act = 0; act < keep; act++)
	  solvestep[act] = solvestep[keepind[act]];

	/* Now we expect to have the next eigenvalues */
	/* allready with some accuracy                */
	/* So we do not need to start from scratch... */
	for(act = keep; act < blksize; act++)
	  solvestep[act] = 1;

      } /* if(found) */
      if(endflag == 1){
	break;
      }
      /**************************************************************************
       *                                                                        *
       * Restart                                                                *
       *                                                                        *
       * The Eigenvector-Aproximations corresponding to the first jmin          *
       * Petrov-Vectors are kept.  if (j+actblksize > jmax) {                   *
       *                                                                        *
       **************************************************************************/
      if (j+actblksize > jmax) {

	idummy = j; j = jmin;

	for (act = 0; act < n; act = act + jmax) { /* V = V * U(:,1:j) */
	  cnt = act+jmax > n ? n-act : jmax;
	  _FT(zlacpy)(fupl_a, &cnt, &idummy, V+act, &lda, Vtmp, &jmax, 1);
	  _FT(zgemm)(fupl_n, fupl_n, &cnt, &j, &idummy, &CONE, Vtmp, 
		     &jmax, U, &jmax, &CZERO, V+act, &lda, 1, 1);
	}
	  
	_FT(zlaset)(fupl_a, &j, &j, &CZERO, &CONE, U, &jmax, 1);
	_FT(zlaset)(fupl_u, &j, &j, &CZERO, &CZERO, M, &jmax, 1);
	for (act = 0; act < j; act++)
	  M[act*jmax + act].re = s[act];
      }

    } /* while(found) */    

    if(endflag == 1){
      break;
    }

    /****************************************************************************
     *                                                                          *
     * Solving the correction equations                                         *
     *                                                                          *
     *                                                                          *
     ****************************************************************************/

    /* Solve actblksize times the correction equation ... */
    for (act = 0; act < actblksize; act ++) {      

      /* Setting start-value for vector v as zeros(n,1). Guarantees
         orthogonality */
      v = V + j*lda;
      for (cnt = 0; cnt < n; cnt ++){ 
	v[cnt].re = 0.;
	v[cnt].im = 0.;
      }

      /* Adaptive accuracy and shift for the lin.solver. In case the
	 residual is big, we don't need a too precise solution for the
	 correction equation, since even in exact arithmetic the
	 solution wouldn't be too usefull for the Eigenproblem. */
      r = Res + act*lda;

      if (resnrm[act] < eps_tr && resnrm[act] < s[act] && resnrm_old[act] > resnrm[act]){
	p_theta = s[act];
      }
      else{
	p_theta = tau;
      }
      p_k = k + actblksize;

      /* if we are in blockwise mode, we do not want to */
      /* iterate solutions much more, if they have      */
      /* allready the desired precision                 */
      if(blkwise == 1 && resnrm[act] < tol) {
	it_tol = pow(toldecay, (double)(-5));
      }
      else {
	it_tol = pow(toldecay, (double)(-solvestep[act]));
      }
      solvestep[act] = solvestep[act] + 1;


      /* equation and project if necessary */
      ModifiedGS_bi(r, n, k + actblksize, Q, lda);

      /*       for(i=0;i<n;i++){ */
      /* 	r[i].re*=-1.; */
      /* 	r[i].im*=-1.; */
      /*       } */
      g_sloppy_precision = 1;
      /* Solve the correction equation ...  */
      if (solver_flag == BICGSTAB){
	info = bicgstab_complex_bi((bispinor*) v, (bispinor*) r, linitmax, 
				   it_tol*it_tol, g_relative_precision_flag, VOLUME/2, &Proj_A_psi_bi);
      }
      else if(solver_flag == CG){ 
	info = cg_her_bi((bispinor*) v, (bispinor*) r, linitmax, 
			 it_tol*it_tol, g_relative_precision_flag, VOLUME/2, &Proj_A_psi_bi); 
      } 
      else{
	info = bicgstab_complex_bi((bispinor*) v, (bispinor*) r, linitmax, 
				   it_tol*it_tol, g_relative_precision_flag, VOLUME/2, &Proj_A_psi_bi);
      }
      g_sloppy_precision = 0;
      /* Actualizing profiling data */
      if (info == -1){
	CntCorrIts += linitmax;
      }
      else{
	CntCorrIts += info;
      }
      actcorrits[act] = info;

      /* orthonormalize v to Q, cause the implicit
	 orthogonalization in the solvers may be too inaccurate. Then
	 apply "IteratedCGS" to prevent numerical breakdown 
         in order to orthogonalize v to V */

      ModifiedGS_bi(v, n, k+actblksize, Q, lda);
      IteratedClassicalGS_bi(v, &alpha, n, j, V, temp1, lda);

      alpha = 1.0 / alpha;
      _FT(dscal)(&n2, &alpha, (double*) v, &ONE);
      
      /* update interaction matrix M */
      A_psi((bispinor*) temp1, (bispinor*) v);
      idummy = j+1;
      for(i = 0; i < idummy; i++){
 	M[j*jmax+i] = scalar_prod_bi((bispinor*) (V+i*lda), (bispinor*) temp1, N);
      }
      /* Increasing SearchSpaceSize j */
      j ++;
    }   /* for (act = 0;act < actblksize; act ++) */    

    /* Print information line */
    if(g_proc_id == 0) {
      print_status(verbosity, *it, k, j - blksize, kmax, blksize, actblksize, 
		   s, resnrm, actcorrits);
    }

    /* Increase iteration-counter for outer loop  */
    (*it) = (*it) + 1;

  } /* Main iteration loop */
  
  /******************************************************************
   *                                                                *
   * Eigensolutions converged or iteration limit reached            *
   *                                                                *
   * Print statistics. Free memory. Return.                         *
   *                                                                *
   ******************************************************************/

  *k_conv = k;
  if (verbosity >= 1) {
    if(g_proc_id == 0) {
      printf("\nJDHER execution statistics\n\n");
      printf("IT_OUTER=%d   IT_INNER_TOT=%d   IT_INNER_AVG=%8.2f\n",
	     (*it), CntCorrIts, (double)CntCorrIts/(*it));
      printf("\nConverged eigensolutions in order of convergence:\n");
      printf("\n  I              LAMBDA(I)      RES(I)\n");
      printf("---------------------------------------\n");
    }
    
    for (act = 0; act < *k_conv; act ++) {
      /* Compute the residual for solution act */
      q = Q + act*lda;
      theta = -lambda[act];
      A_psi((bispinor*) r, (bispinor*) q);
      _FT(daxpy)(&n2, &theta, (double*) q, &ONE, (double*) r, &ONE);
      alpha = sqrt(square_norm_bi((bispinor*) r, N));
      if(g_proc_id == 0) {
	printf("%3d %22.15e %12.5e\n", act+1, lambda[act],
	       alpha);
      }
    }
    if(g_proc_id == 0) {
      printf("\n");
      fflush( stdout );
    }
  }

  free(V_); free(Vtmp); free(U); 
  free(s); free(Res_); 
  free(resnrm); free(resnrm_old); 
  free(M); free(Z);
  free(eigwork); free(temp1_);
  free(dtemp); free(rwork);
  free(p_work_bi);
  free(idx1); free(idx2); 
  free(convind); free(keepind); free(solvestep); free(actcorrits);
  
} /* jdher(.....) */
Example #14
0
void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
  int ix, i;
  su3 * restrict U ALIGN;
  static spinor rs;
  spinor * restrict s ALIGN;
  halfspinor ** phi ALIGN;
#if defined OPTERON
  const int predist=2;
#else
  const int predist=1;
#endif
#ifdef _KOJAK_INST
#pragma pomp inst begin(hoppingmatrix)
#endif

#ifdef _GAUGE_COPY
  if(g_update_gauge_copy) {
    update_backward_gauge();
  }
#endif
  /* We will run through the source vector now */
  /* instead of the solution vector            */
  s = k;
  _prefetch_spinor(s);

  if(ieo == 0) {
    U = g_gauge_field_copy[0][0];
  }
  else {
    U = g_gauge_field_copy[1][0];
  }
  phi = NBPointer[ieo];

  _prefetch_su3(U);
  /**************** loop over all lattice sites ******************/
  ix=0;
  for(i = 0; i < (VOLUME)/2; i++){

    /*********************** direction +0 ************************/
    _prefetch_su3(U+predist);

    _sse_load((*s).s0);
    _sse_load_up((*s).s2);
    _sse_vector_add();

    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka0);
    _sse_store_nt_up((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s3);
    _sse_vector_add();
      
    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka0);
    _sse_store_nt_up((*phi[ix]).s1);
    U++;
    ix++;
    /*********************** direction -0 ************************/
    _sse_load((*s).s0);
    _sse_load_up((*s).s2);
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s3);
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s1);
    ix++;

    /*********************** direction +1 ************************/
    _prefetch_su3(U+predist);

    _sse_load((*s).s0);
    /*next not needed?*/
    _sse_load_up((*s).s3);
    _sse_vector_i_mul();
    _sse_vector_add();

    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka1);
    _sse_store_nt_up((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s2);
    _sse_vector_i_mul();
    _sse_vector_add();

    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka1);
    _sse_store_nt_up((*phi[ix]).s1);
    ix++;
    U++;

    /*********************** direction -1 ************************/
    _sse_load((*s).s0);
    _sse_load_up((*s).s3);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s2);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s1);
    ix++;

    /*********************** direction +2 ************************/
    _prefetch_su3(U+predist);

    _sse_load((*s).s0);
    _sse_load_up((*s).s3);
    _sse_vector_add();

    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka2);
    _sse_store_nt_up((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s2);
    _sse_vector_sub();

    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka2);
    _sse_store_nt_up((*phi[ix]).s1);
    ix++;
    U++;
    /*********************** direction -2 ************************/
    _sse_load((*s).s0);
    _sse_load_up((*s).s3);
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s2);
    _sse_vector_add();
    _sse_store_nt((*phi[ix]).s1);
    ix++;

    /*********************** direction +3 ************************/
    _prefetch_su3(U+predist);
    _prefetch_spinor(s+1);

    _sse_load((*s).s0);
    _sse_load_up((*s).s2);
    _sse_vector_i_mul();
    _sse_vector_add();

    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka3);
    _sse_store_nt_up((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s3);
    _sse_vector_i_mul();
    _sse_vector_sub();

    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka3);
    _sse_store_nt_up((*phi[ix]).s1);
    ix++;
    U++;

    /*********************** direction -3 ************************/
    _sse_load((*s).s0);
    _sse_load_up((*s).s2);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s3);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_store_nt((*phi[ix]).s1);
    ix++;
    s++;
  }

#    if (defined MPI && !defined _NO_COMM)
  xchange_halffield(); 
#    endif
  s = l;
  phi = NBPointer[2 + ieo];
  if(ieo == 0) {
    U = g_gauge_field_copy[1][0];
  }
  else {
    U = g_gauge_field_copy[0][0];
  }
  _prefetch_su3(U);
  
  /* Now we sum up and expand to a full spinor */
  ix = 0;
  for(i = 0; i < (VOLUME)/2; i++){
    /*********************** direction +0 ************************/
    _vector_assign(rs.s0, (*phi[ix]).s0);
    _vector_assign(rs.s2, (*phi[ix]).s0);
    _vector_assign(rs.s1, (*phi[ix]).s1);
    _vector_assign(rs.s3, (*phi[ix]).s1);
    ix++;

    /*********************** direction -0 ************************/
    _prefetch_su3(U+predist);
      
    _sse_load((*phi[ix]).s0);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka0);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s2);
    _sse_vector_sub();
    _sse_store(rs.s2);

    _sse_load((*phi[ix]).s1);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka0);

    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s3);
    _sse_vector_sub();
    _sse_store(rs.s3);

    ix++;
    U++;
    /*********************** direction +1 ************************/
    _sse_load_up((*phi[ix]).s0);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s3);
    _sse_vector_i_mul();      
    _sse_vector_sub();
    _sse_store(rs.s3); 

    _sse_load_up((*phi[ix]).s1);
    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s2);
    _sse_vector_i_mul();      
    _sse_vector_sub();
    _sse_store(rs.s2);       
    ix++;

    /*********************** direction -1 ************************/

    _prefetch_su3(U+predist);

    _sse_load((*phi[ix]).s0);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka1);

    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s3);
    _sse_vector_i_mul();      
    _sse_vector_add();
    _sse_store(rs.s3);

    _sse_load((*phi[ix]).s1);
      
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka1);

    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s2);
    _sse_vector_i_mul();      
    _sse_vector_add();
    _sse_store(rs.s2);
    ix++;
    U++;

    /*********************** direction +2 ************************/
    _sse_load_up((*phi[ix]).s0);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s3);
    _sse_vector_add();
    _sse_store(rs.s3);

    _sse_load_up((*phi[ix]).s1);
    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s2);
    _sse_vector_sub();
    _sse_store(rs.s2);      
    ix++;

    /*********************** direction -2 ************************/

    _prefetch_su3(U+predist);

    _sse_load((*phi[ix]).s0);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka2);

    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s3);
    _sse_vector_sub();
    _sse_store(rs.s3);

    _sse_load((*phi[ix]).s1);
      
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka2);

    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s2);
    _sse_vector_add();
    _sse_store(rs.s2);      
    ix++;
    U++;
    /*********************** direction +3 ************************/
    _sse_load_up((*phi[ix]).s0);

    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s2);
    _sse_vector_i_mul();      
    _sse_vector_sub();
    _sse_store(rs.s2);

    _sse_load_up((*phi[ix]).s1);

    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s3);
    _sse_vector_i_mul();      
    _sse_vector_add();
    _sse_store(rs.s3);

    ix++;
    /*********************** direction -3 ************************/

    _prefetch_su3(U+predist); 
    _prefetch_spinor(s+1);

    _sse_load((*phi[ix]).s0);
      
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka3);

    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store_nt((*s).s0);

    _sse_load(rs.s2);
    _sse_vector_i_mul();      
    _sse_vector_add();
    _sse_store_nt((*s).s2);

    _sse_load((*phi[ix]).s1);
      
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka3);

    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store_nt((*s).s1);

    _sse_load(rs.s3);
    _sse_vector_i_mul();      
    _sse_vector_sub();
    _sse_store_nt((*s).s3);
    ix++;
    U++;
    s++;
  }
#ifdef _KOJAK_INST
#pragma pomp inst end(hoppingmatrix)
#endif
}
Example #15
0
/* for ieo=0, k resides on  odd sites and l on even sites */
void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
  int i,ix;
  su3 * restrict U ALIGN;
  spinor * restrict s ALIGN;
  spinor rs;
  static su3_vector psi, chi, psi2, chi2;
  halfspinor * restrict * phi ALIGN;
  halfspinor32 * restrict * phi32 ALIGN;
#ifdef _KOJAK_INST
#pragma pomp inst begin(hoppingmatrix)
#endif
#ifdef XLC
#pragma disjoint(*l, *k, *U, *s)
#endif

#ifdef _GAUGE_COPY
  if(g_update_gauge_copy) {
    update_backward_gauge();
  }
#endif

  if(k == l){
    printf("Error in H_psi (simple.c):\n");
    printf("Arguments k and l must be different\n");
    printf("Program aborted\n");
    exit(1);
  }
  s = k;

  if(ieo == 0) {
    U = g_gauge_field_copy[0][0];
  }
  else {
    U = g_gauge_field_copy[1][0];
  }
  if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
    phi32 = NBPointer32[ieo];
      
    /**************** loop over all lattice sites ****************/
    ix=0;
    for(i = 0; i < (VOLUME)/2; i++){
      _vector_assign(rs.s0, (*s).s0);
      _vector_assign(rs.s1, (*s).s1);
      _vector_assign(rs.s2, (*s).s2);
      _vector_assign(rs.s3, (*s).s3);
      s++;
      /*********************** direction +0 ************************/
      
      _vector_add(psi, rs.s0, rs.s2);

      _su3_multiply(chi,(*U),psi);
      _complex_times_vector((*phi32[ix]).s0, ka0, chi);
      
      _vector_add(psi, rs.s1, rs.s3);

      _su3_multiply(chi,(*U),psi);
      _complex_times_vector((*phi32[ix]).s1, ka0, chi);
            
      U++;
      ix++;
    
      /*********************** direction -0 ************************/

      _vector_sub((*phi32[ix]).s0, rs.s0, rs.s2);
      _vector_sub((*phi32[ix]).s1, rs.s1, rs.s3);

      ix++;

      /*********************** direction +1 ************************/

      _vector_i_add(psi, rs.s0, rs.s3);

      _su3_multiply(chi, (*U), psi);
      _complex_times_vector((*phi32[ix]).s0, ka1, chi);

      _vector_i_add(psi, rs.s1, rs.s2);

      _su3_multiply(chi, (*U), psi);
      _complex_times_vector((*phi32[ix]).s1, ka1, chi);

      U++;
      ix++;

      /*********************** direction -1 ************************/

      _vector_i_sub((*phi32[ix]).s0, rs.s0, rs.s3);
      _vector_i_sub((*phi32[ix]).s1, rs.s1, rs.s2);

      ix++;
      /*********************** direction +2 ************************/

      _vector_add(psi, rs.s0, rs.s3);

      _su3_multiply(chi,(*U),psi);
      _complex_times_vector((*phi32[ix]).s0, ka2, chi);

      _vector_sub(psi, rs.s1, rs.s2);

      _su3_multiply(chi,(*U),psi);
      _complex_times_vector((*phi32[ix]).s1, ka2, chi);
      
      U++;
      ix++;

      /*********************** direction -2 ************************/

      _vector_sub((*phi32[ix]).s0, rs.s0, rs.s3);
      _vector_add((*phi32[ix]).s1, rs.s1, rs.s2);
      ix++;

      /*********************** direction +3 ************************/

      _vector_i_add(psi, rs.s0, rs.s2);
      
      _su3_multiply(chi, (*U), psi);
      _complex_times_vector((*phi32[ix]).s0, ka3, chi);


      _vector_i_sub(psi, rs.s1, rs.s3);

      _su3_multiply(chi,(*U),psi);
      _complex_times_vector((*phi32[ix]).s1, ka3, chi);

      U++;
      ix++;
      /*********************** direction -3 ************************/

      _vector_i_sub((*phi32[ix]).s0, rs.s0, rs.s2);
      _vector_i_add((*phi32[ix]).s1, rs.s1, rs.s3);

      ix++;
      /************************ end of loop ************************/
    }
#    if (defined MPI && !defined _NO_COMM)
    xchange_halffield32(); 
#    endif
    s = l;
    phi32 = NBPointer32[2 + ieo];
    if(ieo == 0) {
      U = g_gauge_field_copy[1][0];
    }
    else {
      U = g_gauge_field_copy[0][0];
    }

    ix = 0;
    for(i = 0; i < (VOLUME)/2; i++){
      /*********************** direction +0 ************************/
      _vector_assign(rs.s0, (*phi32[ix]).s0);
      _vector_assign(rs.s2, (*phi32[ix]).s0);
      _vector_assign(rs.s1, (*phi32[ix]).s1);
      _vector_assign(rs.s3, (*phi32[ix]).s1);
      ix++;
      /*********************** direction -0 ************************/
      _vector_assign(psi, (*phi32[ix]).s0);
      _su3_inverse_multiply(chi,(*U), psi);
      _complexcjg_times_vector(psi,ka0,chi);

      _vector_add_assign(rs.s0, psi);
      _vector_sub_assign(rs.s2, psi);

      _vector_assign(psi, (*phi32[ix]).s1);
      _su3_inverse_multiply(chi,(*U), psi);
      _complexcjg_times_vector(psi,ka0,chi);
      
      _vector_add_assign(rs.s1, psi);
      _vector_sub_assign(rs.s3, psi);
      ix++;
      U++;
      /*********************** direction +1 ************************/

      _vector_add_assign(rs.s0, (*phi32[ix]).s0);
      _vector_i_sub_assign(rs.s3, (*phi32[ix]).s0);

      _vector_add_assign(rs.s1, (*phi32[ix]).s1);
      _vector_i_sub_assign(rs.s2, (*phi32[ix]).s1);
    
      ix++;
      /*********************** direction -1 ************************/
      _vector_assign(psi, (*phi32[ix]).s0);
      _su3_inverse_multiply(chi,(*U), psi);
      _complexcjg_times_vector(psi,ka1,chi);

      _vector_add_assign(rs.s0, psi);
      _vector_i_add_assign(rs.s3, psi);

      _vector_assign(psi, (*phi32[ix]).s1);
      _su3_inverse_multiply(chi,(*U), psi);
      _complexcjg_times_vector(psi,ka1,chi);

      _vector_add_assign(rs.s1, psi);
      _vector_i_add_assign(rs.s2, psi);

      U++;
      ix++;

      /*********************** direction +2 ************************/

      _vector_add_assign(rs.s0, (*phi32[ix]).s0);
      _vector_add_assign(rs.s3, (*phi32[ix]).s0);

      _vector_add_assign(rs.s1, (*phi32[ix]).s1);
      _vector_sub_assign(rs.s2, (*phi32[ix]).s1);
    
      ix++;
      /*********************** direction -2 ************************/

      _vector_assign(psi, (*phi32[ix]).s0);
      _su3_inverse_multiply(chi,(*U), psi);
      _complexcjg_times_vector(psi,ka2,chi);

      _vector_add_assign(rs.s0, psi);
      _vector_sub_assign(rs.s3, psi);

      _vector_assign(psi, (*phi32[ix]).s1);
      _su3_inverse_multiply(chi, (*U), psi);
      _complexcjg_times_vector(psi,ka2,chi);
      
      _vector_add_assign(rs.s1, psi);
      _vector_add_assign(rs.s2, psi);

      U++;
      ix++;
      /*********************** direction +3 ************************/

      _vector_add_assign(rs.s0, (*phi32[ix]).s0);
      _vector_i_sub_assign(rs.s2, (*phi32[ix]).s0);

      _vector_add_assign(rs.s1, (*phi32[ix]).s1);
      _vector_i_add_assign(rs.s3, (*phi32[ix]).s1);

      ix++;

      /*********************** direction -3 ************************/

      _vector_assign(psi, (*phi32[ix]).s0);
      _su3_inverse_multiply(chi,(*U), psi);
      _complexcjg_times_vector(psi,ka3,chi);
      
      _vector_add((*s).s0, rs.s0, psi);
      _vector_i_add((*s).s2, rs.s2, psi);

      _vector_assign(psi, (*phi32[ix]).s1);
      _su3_inverse_multiply(chi,(*U), psi);
      _complexcjg_times_vector(psi,ka3,chi);

      _vector_add((*s).s1, rs.s1, psi);
      _vector_i_sub((*s).s3, rs.s3, psi);

      U++;
      ix++;
      s++;
    }
  }
  else {
    phi = NBPointer[ieo];
      
    /**************** loop over all lattice sites ****************/
    ix=0;
    /* #pragma ivdep*/
    for(i = 0; i < (VOLUME)/2; i++){
      _vector_assign(rs.s0, (*s).s0);
      _vector_assign(rs.s1, (*s).s1);
      _vector_assign(rs.s2, (*s).s2);
      _vector_assign(rs.s3, (*s).s3);
      s++;
      /*********************** direction +0 ************************/
      
      _vector_add(psi, rs.s0, rs.s2);
      _vector_add(psi2, rs.s1, rs.s3);
      _su3_multiply(chi,(*U),psi);
      _su3_multiply(chi2,(*U),psi2);
      _complex_times_vector((*phi[ix]).s0, ka0, chi);
      _complex_times_vector((*phi[ix]).s1, ka0, chi2);
            
      U++;
      ix++;
    
      /*********************** direction -0 ************************/

      _vector_sub((*phi[ix]).s0, rs.s0, rs.s2);
      _vector_sub((*phi[ix]).s1, rs.s1, rs.s3);

      ix++;

      /*********************** direction +1 ************************/

      _vector_i_add(psi, rs.s0, rs.s3);
      _vector_i_add(psi2, rs.s1, rs.s2);
      _su3_multiply(chi, (*U), psi);
      _su3_multiply(chi2, (*U), psi2);
      _complex_times_vector((*phi[ix]).s0, ka1, chi);
      _complex_times_vector((*phi[ix]).s1, ka1, chi2);

      U++;
      ix++;

      /*********************** direction -1 ************************/

      _vector_i_sub((*phi[ix]).s0, rs.s0, rs.s3);
      _vector_i_sub((*phi[ix]).s1, rs.s1, rs.s2);

      ix++;
      /*********************** direction +2 ************************/

      _vector_add(psi, rs.s0, rs.s3);
      _vector_sub(psi2, rs.s1, rs.s2);
      _su3_multiply(chi,(*U),psi);
      _su3_multiply(chi2,(*U),psi2);
      _complex_times_vector((*phi[ix]).s0, ka2, chi);
      _complex_times_vector((*phi[ix]).s1, ka2, chi2);
      
      U++;
      ix++;

      /*********************** direction -2 ************************/

      _vector_sub((*phi[ix]).s0, rs.s0, rs.s3);
      _vector_add((*phi[ix]).s1, rs.s1, rs.s2);
      ix++;

      /*********************** direction +3 ************************/

      _vector_i_add(psi, rs.s0, rs.s2);
      _vector_i_sub(psi2, rs.s1, rs.s3);      
      _su3_multiply(chi, (*U), psi);
      _su3_multiply(chi2,(*U),psi2);
      _complex_times_vector((*phi[ix]).s0, ka3, chi);
      _complex_times_vector((*phi[ix]).s1, ka3, chi2);

      U++;
      ix++;
      /*********************** direction -3 ************************/

      _vector_i_sub((*phi[ix]).s0, rs.s0, rs.s2);
      _vector_i_add((*phi[ix]).s1, rs.s1, rs.s3);

      ix++;
      /************************ end of loop ************************/
    }
#    if (defined MPI && !defined _NO_COMM)
    xchange_halffield(); 
#    endif
    s = l;
    phi = NBPointer[2 + ieo];
    if(ieo == 0) {
      U = g_gauge_field_copy[1][0];
    }
    else {
      U = g_gauge_field_copy[0][0];
    }

    ix = 0;
    /* #pragma ivdep */
    for(i = 0; i < (VOLUME)/2; i++){
      /*********************** direction +0 ************************/
      _vector_assign(rs.s0, (*phi[ix]).s0);
      _vector_assign(rs.s2, (*phi[ix]).s0);
      _vector_assign(rs.s1, (*phi[ix]).s1);
      _vector_assign(rs.s3, (*phi[ix]).s1);
      ix++;
      /*********************** direction -0 ************************/
      _su3_inverse_multiply(chi,(*U),(*phi[ix]).s0);
      _su3_inverse_multiply(chi2,(*U),(*phi[ix]).s1);
      _complexcjg_times_vector(psi,ka0,chi);
      _complexcjg_times_vector(psi2,ka0,chi2);
      _vector_add_assign(rs.s0, psi);
      _vector_sub_assign(rs.s2, psi);
      _vector_add_assign(rs.s1, psi2);
      _vector_sub_assign(rs.s3, psi2);
      ix++;
      U++;
      /*********************** direction +1 ************************/

      _vector_add_assign(rs.s0, (*phi[ix]).s0);
      _vector_i_sub_assign(rs.s3, (*phi[ix]).s0);

      _vector_add_assign(rs.s1, (*phi[ix]).s1);
      _vector_i_sub_assign(rs.s2, (*phi[ix]).s1);
    
      ix++;
      /*********************** direction -1 ************************/

      _su3_inverse_multiply(chi,(*U), (*phi[ix]).s0);
      _su3_inverse_multiply(chi2, (*U), (*phi[ix]).s1);
      _complexcjg_times_vector(psi,ka1,chi);
      _complexcjg_times_vector(psi2,ka1,chi2);
      _vector_add_assign(rs.s0, psi);
      _vector_i_add_assign(rs.s3, psi);
      _vector_add_assign(rs.s1, psi2);
      _vector_i_add_assign(rs.s2, psi2);

      U++;
      ix++;

      /*********************** direction +2 ************************/

      _vector_add_assign(rs.s0, (*phi[ix]).s0);
      _vector_add_assign(rs.s3, (*phi[ix]).s0);

      _vector_add_assign(rs.s1, (*phi[ix]).s1);
      _vector_sub_assign(rs.s2, (*phi[ix]).s1);
    
      ix++;
      /*********************** direction -2 ************************/

      _su3_inverse_multiply(chi,(*U), (*phi[ix]).s0);
      _su3_inverse_multiply(chi2, (*U), (*phi[ix]).s1);
      _complexcjg_times_vector(psi,ka2,chi);
      _complexcjg_times_vector(psi2,ka2,chi2);
      _vector_add_assign(rs.s0, psi);
      _vector_sub_assign(rs.s3, psi);
      _vector_add_assign(rs.s1, psi2);
      _vector_add_assign(rs.s2, psi2);

      U++;
      ix++;
      /*********************** direction +3 ************************/

      _vector_add_assign(rs.s0, (*phi[ix]).s0);
      _vector_i_sub_assign(rs.s2, (*phi[ix]).s0);

      _vector_add_assign(rs.s1, (*phi[ix]).s1);
      _vector_i_add_assign(rs.s3, (*phi[ix]).s1);

      ix++;

      /*********************** direction -3 ************************/

      _su3_inverse_multiply(chi,(*U), (*phi[ix]).s0);
      _su3_inverse_multiply(chi2, (*U), (*phi[ix]).s1);
      _complexcjg_times_vector(psi,ka3,chi);
      _complexcjg_times_vector(psi2,ka3,chi2);      
      _vector_add((*s).s0, rs.s0, psi);
      _vector_i_add((*s).s2, rs.s2, psi);
      _vector_add((*s).s1, rs.s1, psi2);
      _vector_i_sub((*s).s3, rs.s3, psi2);

      U++;
      ix++;
      s++;
    }
  }
#ifdef _KOJAK_INST
#pragma pomp inst end(hoppingmatrix)
#endif
}