/* the "full" operators */
void Q_pm_psi_prec(spinor * const l, spinor * const k)
{
  spinorPrecWS *ws = (spinorPrecWS*)g_precWS;
  _Complex double ALIGN alpha = -1.0;

  if(g_prec_sequence_d_dagger_d[0] != 0.0) {
    alpha = g_prec_sequence_d_dagger_d[0];
    spinorPrecondition(l, k, ws, T, L, alpha, 0, 1);
  }
  else assign(l, k, VOLUME);

  g_mu = -g_mu;
  D_psi(g_spinor_field[DUM_MATRIX], l);
  gamma5(l, g_spinor_field[DUM_MATRIX], VOLUME);
  g_mu = -g_mu;

  if(g_prec_sequence_d_dagger_d[1] != 0.0) {
    alpha = g_prec_sequence_d_dagger_d[1];
    spinorPrecondition(l, l, ws, T, L, alpha, 0, 1);
  }

  D_psi(g_spinor_field[DUM_MATRIX], l);
  gamma5(l, g_spinor_field[DUM_MATRIX], VOLUME);

  if(g_prec_sequence_d_dagger_d[2] != 0.0) {
    alpha = g_prec_sequence_d_dagger_d[2];
    spinorPrecondition(l, l, ws, T, L, alpha, 0, 1);
  }
}
/* the "full" operators */
void Q_pm_psi2(spinor * const l, spinor * const k)
{
  /* first application with the twisted mass rescaled to -10*mu */
  g_mu = -10.*g_mu;
  D_psi(l, k);
  gamma5(g_spinor_field[DUM_MATRIX], l, VOLUME);
  /* this restores g_mu = -(-10*mu)/10 = +mu for the second application */
  g_mu = -g_mu/10.;
  D_psi(l, g_spinor_field[DUM_MATRIX]);
  gamma5(l, l, VOLUME);
}
/* This is the version for the GPU with interchanged order
   of gamma5 and D_psi (Florian Burger) */
void Q_pm_psi_gpu(spinor * const l, spinor * const k)
{
  gamma5(k, k, VOLUME);
  g_mu = -g_mu;
  D_psi(l, k);
  gamma5(g_spinor_field[DUM_MATRIX], l, VOLUME);
  g_mu = -g_mu;
  D_psi(l, g_spinor_field[DUM_MATRIX]);
}
void Q_minus_psi(spinor * const l, spinor * const k)
{
  g_mu = -g_mu;
  D_psi(l, k);
  g_mu = -g_mu;
  gamma5(l, l, VOLUME);
}
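/*
 * Operator identities realized by the routines above (a reading of the
 * g_mu sign flips, not stated in the code): writing D(\mu) for the
 * operator applied by D_psi at twisted mass \mu,
 *
 *   Q_{\pm} = \gamma_5 D(\pm\mu),
 *   Q^2 = Q_{+} Q_{-} = \gamma_5 D(+\mu) \gamma_5 D(-\mu).
 *
 * Q_pm_psi2 rescales the rightmost factor to D(-10\mu), and Q_pm_psi_prec
 * wraps the product in spinorPrecondition factors weighted by the entries
 * of g_prec_sequence_d_dagger_d[0..2].
 */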
/* |R> = rnorm^2 Q^2 |S> */
void norm_Q_sqr_psi(spinor * const R, spinor * const S, const double rnorm)
{
  spinor *aux;
  aux = lock_Dov_WS_spinor(1);

  /* Term -1-s is done in D_psi! Does this comment make sense for HMC? */
  /* No, it doesn't; we do have to work on this. */
  /* here we need to set kappa = 1./(2 (-1-s) + 8) */
  D_psi(R, S);
  gamma5(aux, R, VOLUME);
  D_psi(R, aux);
  gamma5(R, R, VOLUME);
  mul_r(R, rnorm*rnorm, R, VOLUME);

  unlock_Dov_WS_spinor(1);
  return;
}
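/*
 * In bra-ket form (with Q = \gamma_5 D, D the kernel applied by D_psi),
 * the routine above computes
 *
 *   |R> = rnorm^2 Q^2 |S> = rnorm^2 \gamma_5 D \gamma_5 D |S>,
 *
 * where, per the comments, the proper kernel normalization would require
 * kappa = 1/(2(-1-s) + 8).
 */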
void D_psi_prec(spinor * const P, spinor * const Q)
{
  /* todo: do preconditioning */
  spinorPrecWS *ws = (spinorPrecWS*)g_precWS;
  static _Complex double alpha = -1.0;

  alpha = -0.5;
  spinorPrecondition(P, Q, ws, T, L, alpha, 0, 1);
  D_psi(g_spinor_field[DUM_MATRIX], P);
  alpha = -0.5;
  spinorPrecondition(P, g_spinor_field[DUM_MATRIX], ws, T, L, alpha, 0, 1);
}
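/*
 * Assuming spinorPrecondition(out, in, ws, T, L, alpha, ...) applies the
 * workspace operator raised to the power alpha (an assumption; only
 * alpha = -0.5 is used above), D_psi_prec realizes the symmetric form
 *
 *   \hat D = A^{-1/2} D A^{-1/2},
 *
 * so solving \hat D \hat x = A^{-1/2} b and setting x = A^{-1/2} \hat x
 * recovers D x = b.
 */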
void Msap(spinor * const P, spinor * const Q, const int Ncy, const int Niter)
{
  int blk, ncy = 0, eo, vol;
  spinor * r, * a, * b;
  double nrm;
  spinor ** solver_field = NULL;
  const int nr_sf = 6;

  /*
   * here it would probably be better to get the working fields as a
   * parameter from the calling function
   */
  init_solver_field(&solver_field, VOLUME, nr_sf);
  r = solver_field[0];
  a = solver_field[1];
  b = solver_field[2];

  for(ncy = 0; ncy < Ncy; ncy++) {
    /* compute the global residue        */
    /* this can be done more efficiently */
    /* here only a naive implementation  */
    for(eo = 0; eo < 2; eo++) {
      D_psi(r, P);
      diff(r, Q, r, VOLUME);
      nrm = square_norm(r, VOLUME, 1);
      if(g_proc_id == 0 && g_debug_level > 2 && eo == 1) { /* GG, was 1 */
        printf("Msap: %d %1.3e\n", ncy, nrm);
        fflush(stdout);
      }
      /* choose the even (odd) blocks */
      /* blk = eolist[eo]; */
      for(blk = 0; blk < nb_blocks; blk++) {
        if(block_list[blk].evenodd == eo) {
          vol = block_list[blk].volume;

          /* get part of r corresponding to block blk into b */
          copy_global_to_block(b, r, blk);
          /* does this work?? i.e. passing solver_field[3] as work space */
          mrblk(a, b, solver_field[3], Niter, 1.e-31, 1, vol, &dummy_Di, blk);
          /* add a up to the full spinor P */
          add_block_to_global(P, a, blk);
        }
      }
    }
  }
  finalize_solver(solver_field, nr_sf);
  return;
}
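/*
 * Msap is one cycle of the multiplicative Schwarz alternating procedure:
 * the blocks carry a two-coloring (block_list[blk].evenodd), and for each
 * color the current global residual r = Q - D P is restricted to every
 * block of that color, approximately solved there (Niter mrblk steps),
 * and prolongated back. Schematically, with R_B/P_B the block
 * restriction/prolongation,
 *
 *   P <- P + \sum_{B in color} P_B \tilde{D}_B^{-1} R_B (Q - D P),
 *
 * applied first to one color, then to the other, Ncy times.
 */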
void addproj_q_invsqrt(spinor * const Q, spinor * const P, const int n, const int N)
{
  int j;
  spinor *aux;
  complex cnorm, lambda;
  static double save_ev[2] = {-1., -1.};
  static int * ev_sign = NULL;

  if(eigenvls[0] != save_ev[0] && eigenvls[1] != save_ev[1]) {
    /* NB: with &&, the signs are only recomputed when both of the first
       two eigenvalues have changed */
    if(g_proc_id == 0 && g_debug_level > 1) {
      printf("# Recomputing eigenvalue signs!\n");
      fflush(stdout);
    }
    for(j = 0; j < 2; j++) {
      save_ev[j] = eigenvls[j];
    }
    free(ev_sign);
    ev_sign = (int*) malloc(n * sizeof(int));

    aux = lock_Dov_WS_spinor(1);
    for(j = 0; j < n; j++) {
      D_psi(aux, &(eigenvectors[j*evlength]));
      gamma5(aux, aux, N);
      lambda = scalar_prod(&(eigenvectors[j*evlength]), aux, N, 1);
      if(lambda.re < 0) {
        ev_sign[j] = -1;
      }
      else {
        ev_sign[j] = 1;
      }
    }
    unlock_Dov_WS_spinor(1);
    /* free(aux_); */
  }

  for(j = 0; j < n; j++) {
    cnorm = scalar_prod(&(eigenvectors[j*evlength]), P, N, 1);
    cnorm.re *= (double)ev_sign[j];
    cnorm.im *= (double)ev_sign[j];
    assign_add_mul(Q, &(eigenvectors[j*evlength]), cnorm, N);
  }
  return;
}
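/*
 * On the stored eigenvectors of Q = \gamma_5 D the sign function is
 * exact, so the routine adds the exactly projected low-mode contribution
 *
 *   |Q> <- |Q> + \sum_{j=0}^{n-1} sign(\lambda_j) <v_j|P> |v_j>,
 *
 * with \gamma_5 D |v_j> = \lambda_j |v_j>; the signs are cached in
 * ev_sign and recomputed when the eigenvalues change.
 */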
/* |R> = rnorm^n Q^n |S> */
void norm_Q_n_psi(spinor * const R, spinor * const S, const int n, const double rnorm)
{
  int i;
  double npar = 1.;
  spinor *aux;
  aux = lock_Dov_WS_spinor(1);

  assign(aux, S, VOLUME);
  for(i = 0; i < n; i++) {
    D_psi(R, aux);
    /* Term -1-s is done in D_psi! does this comment make sense for HMC? */
    gamma5(aux, R, VOLUME);
    npar *= rnorm;
  }
  mul_r(R, npar, aux, VOLUME);

  unlock_Dov_WS_spinor(1);
  return;
}
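/*
 * Each pass of the loop above applies \gamma_5 D once and accumulates one
 * factor of rnorm, so n = 2 should reproduce norm_Q_sqr_psi. A minimal
 * consistency check (hypothetical, not part of the original; assumes two
 * free work fields at DUM_MATRIX):
 */
void check_norm_Q_consistency(spinor * const S, const double rnorm)
{
  spinor * const R1 = g_spinor_field[DUM_MATRIX];     /* work field 1 */
  spinor * const R2 = g_spinor_field[DUM_MATRIX+1];   /* work field 2 */
  double d;

  norm_Q_sqr_psi(R1, S, rnorm);    /* R1 = rnorm^2 (gamma_5 D)^2 S */
  norm_Q_n_psi(R2, S, 2, rnorm);   /* R2 = rnorm^2 (gamma_5 D)^2 S */
  diff(R1, R1, R2, VOLUME);        /* R1 = R1 - R2                 */
  d = square_norm(R1, VOLUME, 1);
  if(g_proc_id == 0) printf("# |Q_sqr - Q_n(2)|^2 = %1.3e\n", d);
}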
int main(int argc, char *argv[])
{
#ifdef _USE_HALFSPINOR
#  undef _USE_HALFSPINOR
  printf("# WARNING: USE_HALFSPINOR will be ignored (not supported here).\n");
#endif
  if(even_odd_flag) {
    even_odd_flag = 0;
    printf("# WARNING: even_odd_flag will be ignored (not supported here).\n");
  }
  int j, j_max, k, k_max = 1;
#ifdef HAVE_LIBLEMON
  paramsXlfInfo *xlfInfo;
#endif
  int status = 0;

  static double t1, t2, dt, sdt, dts, qdt, sqdt;
  double antioptaway = 0.0;

#ifdef MPI
  static double dt2;

  DUM_DERI = 6;
  DUM_SOLVER = DUM_DERI + 2;
  DUM_MATRIX = DUM_SOLVER + 6;
  NO_OF_SPINORFIELDS = DUM_MATRIX + 2;

#  ifdef OMP
  int mpi_thread_provided;
  MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_thread_provided);
#  else
  MPI_Init(&argc, &argv);
#  endif
  MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id);
#else
  g_proc_id = 0;
#endif

  g_rgi_C1 = 1.;

  /* Read the input file */
  if((status = read_input("test_Dslash.input")) != 0) {
    fprintf(stderr, "Could not find input file: test_Dslash.input\nAborting...\n");
    exit(-1);
  }

#ifdef OMP
  init_openmp();
#endif

  tmlqcd_mpi_init(argc, argv);

  if(g_proc_id == 0) {
#ifdef SSE
    printf("# The code was compiled with SSE instructions\n");
#endif
#ifdef SSE2
    printf("# The code was compiled with SSE2 instructions\n");
#endif
#ifdef SSE3
    printf("# The code was compiled with SSE3 instructions\n");
#endif
#ifdef P4
    printf("# The code was compiled for Pentium4\n");
#endif
#ifdef OPTERON
    printf("# The code was compiled for AMD Opteron\n");
#endif
#ifdef _GAUGE_COPY
    printf("# The code was compiled with -D_GAUGE_COPY\n");
#endif
#ifdef BGL
    printf("# The code was compiled for Blue Gene/L\n");
#endif
#ifdef BGP
    printf("# The code was compiled for Blue Gene/P\n");
#endif
#ifdef _USE_HALFSPINOR
    printf("# The code was compiled with -D_USE_HALFSPINOR\n");
#endif
#ifdef _USE_SHMEM
    printf("# The code was compiled with -D_USE_SHMEM\n");
#  ifdef _PERSISTENT
    printf("# The code was compiled for persistent MPI calls (halfspinor only)\n");
#  endif
#endif
#ifdef MPI
#  ifdef _NON_BLOCKING
    printf("# The code was compiled for non-blocking MPI calls (spinor and gauge)\n");
#  endif
#endif
    printf("\n");
    fflush(stdout);
  }

#ifdef _GAUGE_COPY
  init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
#else
  init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
#endif
  init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand);

  if(even_odd_flag) {
    j = init_spinor_field(VOLUMEPLUSRAND/2, 2*k_max + 1);
  }
  else {
    j = init_spinor_field(VOLUMEPLUSRAND, 2*k_max);
  }
  if(j != 0) {
    fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n");
    exit(0);
  }
  j = init_moment_field(VOLUME, VOLUMEPLUSRAND + g_dbw2rand);
  if(j != 0) {
    fprintf(stderr, "Not enough memory for moment fields! Aborting...\n");
    exit(0);
  }

  if(g_proc_id == 0) {
    fprintf(stdout, "# The number of processes is %d \n", g_nproc);
    printf("# The lattice size is %d x %d x %d x %d\n",
           (int)(T*g_nproc_t), (int)(LX*g_nproc_x), (int)(LY*g_nproc_y), (int)(g_nproc_z*LZ));
    printf("# The local lattice size is %d x %d x %d x %d\n",
           (int)(T), (int)(LX), (int)(LY), (int)(LZ));
    /*
    if(even_odd_flag) {
      printf("# benchmarking the even/odd preconditioned Dirac operator\n");
    }
    else {
      printf("# benchmarking the standard Dirac operator\n");
    }
    */
    fflush(stdout);
  }

  /* define the geometry */
  geometry();
  /* define the boundary conditions for the fermion fields */
  boundary(g_kappa);

#ifdef _USE_HALFSPINOR
  j = init_dirac_halfspinor();
  if(j != 0) {
    fprintf(stderr, "Not enough memory for halfspinor fields! Aborting...\n");
    exit(0);
  }
  if(g_sloppy_precision_flag == 1) {
    g_sloppy_precision = 1;
    j = init_dirac_halfspinor32();
    if(j != 0) {
      fprintf(stderr, "Not enough memory for 32-bit halfspinor fields! Aborting...\n");
      exit(0);
    }
  }
#  if (defined _PERSISTENT)
  init_xchange_halffield();
#  endif
#endif

  status = check_geometry();
  if(status != 0) {
    fprintf(stderr, "Checking of geometry failed. Unable to proceed.\nAborting...\n");
    exit(1);
  }

#if (defined MPI && !(defined _USE_SHMEM))
  check_xchange();
#endif

  start_ranlux(1, 123456);
  random_gauge_field(reproduce_randomnumber_flag, g_gauge_field);

#ifdef MPI
  /* for parallelization: exchange the gauge field */
  xchange_gauge(g_gauge_field);
#endif

  /* the non even/odd case now */
  /* initialize the pseudo-fermion fields */
  j_max = 1;
  sdt = 0.;
  for(k = 0; k < k_max; k++) {
    random_spinor_field_lexic(g_spinor_field[k], reproduce_randomnumber_flag, RN_GAUSS);
  }

#ifdef MPI
  MPI_Barrier(MPI_COMM_WORLD);
#endif
  t1 = gettime();

  /* here the actual Dslash application */
  D_psi(g_spinor_field[0], g_spinor_field[1]);

  t2 = gettime();
  dt = t2 - t1;
#ifdef MPI
  MPI_Allreduce(&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
#else
  sdt = dt;
#endif

  if(g_proc_id == 0) {
    printf("# Time for Dslash %e sec.\n", sdt);
    printf("\n");
    fflush(stdout);
  }

#ifdef HAVE_LIBLEMON
  if(g_proc_id == 0) {
    printf("# Performing parallel IO test ...\n");
  }
  xlfInfo = construct_paramsXlfInfo(0.5, 0);
  write_gauge_field("conf.test", 64, xlfInfo);
  free(xlfInfo);
  if(g_proc_id == 0) {
    printf("# done ...\n");
  }
#endif

#ifdef OMP
  free_omp_accumulators();
#endif
  free_gauge_field();
  free_geometry_indices();
  free_spinor_field();
  free_moment_field();
#ifdef MPI
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
#endif
  return(0);
}
int main(int argc, char *argv[])
{
  FILE *parameterfile = NULL;
  int c, j, is = 0, ic = 0;
  int x, X, y, Y, z, Z, t, tt, i, sum;
  char * filename = NULL;
  char datafilename[50];
  char parameterfilename[50];
  char conf_filename[50];
  char * input_filename = NULL;
  double plaquette_energy, nrm;
  double * norm;
  struct stout_parameters params_smear;
#ifdef _GAUGE_COPY
  int kb = 0;
#endif
#ifdef MPI
  double atime = 0., etime = 0.;
#endif
#ifdef _KOJAK_INST
#pragma pomp inst init
#pragma pomp inst begin(main)
#endif

  DUM_DERI = 6;                  /* DUM_DERI + 2 is enough (not 7)   */
  DUM_SOLVER = DUM_DERI + 2;
  DUM_MATRIX = DUM_SOLVER + 6;   /* DUM_MATRIX + 2 is enough (not 6) */
  NO_OF_SPINORFIELDS = DUM_MATRIX + 2;

  verbose = 0;
  g_use_clover_flag = 0;
  g_nr_of_psf = 1;

#ifdef MPI
  MPI_Init(&argc, &argv);
#endif

  while((c = getopt(argc, argv, "h?f:o:")) != -1) {
    switch(c) {
    case 'f':
      input_filename = calloc(200, sizeof(char));
      strcpy(input_filename, optarg);
      break;
    case 'o':
      filename = calloc(200, sizeof(char));
      strcpy(filename, optarg);
      break;
    case 'h':
    case '?':
    default:
      usage();
      break;
    }
  }
  if(input_filename == NULL) {
    input_filename = "hmc.input";
  }
  if(filename == NULL) {
    filename = "output";
  }

  /* Read the input file */
  read_input(input_filename);
  /* here we want no even/odd preconditioning */
  even_odd_flag = 0;

  /* this DBW2 stuff is not needed for the inversion! */
  g_rgi_C1 = 0;
  if(Nsave == 0) {
    Nsave = 1;
  }
  tmlqcd_mpi_init(argc, argv);
  g_dbw2rand = 0;
#ifndef MPI
  g_dbw2rand = 0;
#endif

#ifdef _GAUGE_COPY
  j = init_gauge_field(VOLUMEPLUSRAND, 1);
#else
  j = init_gauge_field(VOLUMEPLUSRAND, 0);
#endif
  if(j != 0) {
    fprintf(stderr, "Not enough memory for gauge_fields! Aborting...\n");
    exit(-1);
  }
  j = init_geometry_indices(VOLUMEPLUSRAND);
  if(j != 0) {
    fprintf(stderr, "Not enough memory for geometry indices! Aborting...\n");
    exit(-1);
  }
  if(even_odd_flag) {
    j = init_spinor_field(VOLUMEPLUSRAND/2, NO_OF_SPINORFIELDS);
  }
  else {
    j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS);
  }
  if(j != 0) {
    fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n");
    exit(-1);
  }

  g_mu = g_mu1;
  if(g_proc_id == 0) {
    /* construct the filenames for the observables and the parameters */
    strcpy(datafilename, filename);
    strcat(datafilename, ".data");
    strcpy(parameterfilename, filename);
    strcat(parameterfilename, ".para");

    parameterfile = fopen(parameterfilename, "w");
    write_first_messages(parameterfile, 0, 1);
  }

  /* define the geometry */
  geometry();
  /* define the boundary conditions for the fermion fields */
  boundary();

#ifdef _USE_HALFSPINOR
  j = init_dirac_halfspinor();
  if(j != 0) {
    fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
    exit(-1);
  }
  if(g_sloppy_precision_flag == 1) {
    j = init_dirac_halfspinor32();
    if(j != 0) {
      fprintf(stderr, "Not enough memory for 32-bit halffield! Aborting...\n");
      exit(-1);
    }
  }
#  if (defined _PERSISTENT)
  init_xchange_halffield();
#  endif
#endif

  /* +1: the distance sum below can reach 3*LX/2 + T/2 inclusive */
  norm = (double*)calloc(3*LX/2 + T/2 + 1, sizeof(double));

  for(j = 0; j < Nmeas; j++) {
    sprintf(conf_filename, "%s.%.4d", gauge_input_filename, nstore);
    if(g_proc_id == 0) {
      printf("Reading gauge field from file %s\n", conf_filename);
      fflush(stdout);
    }
    read_lime_gauge_field(conf_filename);
    if(g_proc_id == 0) {
      printf("done!\n");
      fflush(stdout);
    }
#ifdef MPI
    xchange_gauge();
#endif
#ifdef _GAUGE_COPY
    update_backward_gauge();
#endif

    /* compute minimal eigenvalues, if wanted */
    if(compute_evs != 0) {
      eigenvalues(&no_eigenvalues, 1000, eigenvalue_precision, 0, compute_evs, nstore, even_odd_flag);
    }

    /* compute the energy of the gauge field */
    plaquette_energy = measure_gauge_action();
    if(g_proc_id == 0) {
      printf("The plaquette value is %e\n", plaquette_energy/(6.*VOLUME*g_nproc));
      fflush(stdout);
    }

    if(use_stout_flag == 1) {
      params_smear.rho = stout_rho;
      params_smear.iterations = stout_no_iter;
      if(stout_smear((su3_tuple*)(g_gauge_field[0]), &params_smear, (su3_tuple*)(g_gauge_field[0])) != 0)
        exit(1);
      g_update_gauge_copy = 1;
      g_update_gauge_energy = 1;
      g_update_rectangle_energy = 1;
      plaquette_energy = measure_gauge_action();

      if(g_proc_id == 0) {
        printf("# The plaquette value after stouting is %e\n", plaquette_energy/(6.*VOLUME*g_nproc));
        fflush(stdout);
      }
    }

    source_spinor_field(g_spinor_field[0], g_spinor_field[1], 0, 0);
    convert_eo_to_lexic(g_spinor_field[DUM_DERI], g_spinor_field[0], g_spinor_field[1]);

    D_psi(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]);

    if(even_odd_flag) {
      i = invert_eo(g_spinor_field[2], g_spinor_field[3], g_spinor_field[0], g_spinor_field[1],
                    solver_precision, max_solver_iterations, solver_flag, g_relative_precision_flag,
                    sub_evs_cg_flag, even_odd_flag);
      convert_eo_to_lexic(g_spinor_field[DUM_DERI+1], g_spinor_field[2], g_spinor_field[3]);
    }

    for(i = 0; i < 3*LX/2 + T/2; i++) {
      norm[i] = 0.;
    }

    for(x = 0; x < LX; x++) {
      if(x > LX/2) X = LX - x;
      else X = x;
      for(y = 0; y < LY; y++) {
        if(y > LY/2) Y = LY - y;
        else Y = y;
        for(z = 0; z < LZ; z++) {
          if(z > LZ/2) Z = LZ - z;
          else Z = z;
          for(t = 0; t < T; t++) {
            if(t > T/2) tt = T - t;
            else tt = t;
            sum = X + Y + Z + tt;
            _spinor_norm_sq(nrm, g_spinor_field[DUM_DERI+1][ g_ipt[t][x][y][z] ]);
            /* _spinor_norm_sq(nrm, qprop[0][0][1][ g_ipt[t][x][y][z] ]); */
            printf("%e %e\n", g_spinor_field[DUM_DERI+1][ g_ipt[t][x][y][z] ].s0.c0.re,
                   g_spinor_field[DUM_DERI+1][ g_ipt[t][x][y][z] ].s0.c0.im);
            nrm = sqrt(nrm);
            printf("%1.12e\n", nrm);
            if(nrm > norm[sum]) norm[sum] = nrm;
          }
        }
      }
    }

    for(i = 0; i < 3*L/2 + T/2; i++) {
      printf("%d %1.12e\n", i, norm[i]);
    }
    printf("\n");

    nstore += Nsave;
  }

#ifdef MPI
  MPI_Finalize();
#endif
  free_gauge_field();
  free_geometry_indices();
  free_spinor_field();
  free_moment_field();
  return(0);
#ifdef _KOJAK_INST
#pragma pomp inst end(main)
#endif
}
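/*
 * The coordinate folding in the loops above measures the periodic
 * distance of each site from the source at the origin; factored out as a
 * helper (hypothetical, for illustration only):
 */
static int fold(const int x, const int Lx)
{
  /* distance to the origin on a periodic coordinate: min(x, Lx - x) */
  return (x > Lx/2) ? (Lx - x) : x;
}
/*
 * The loops then record in norm[sum], for each taxicab distance
 * sum = fold(x,LX) + fold(y,LY) + fold(z,LZ) + fold(t,T), the maximal
 * spinor norm found at that distance, which is what the final printout
 * lists against i.
 */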
void Msap_eo_old(spinor * const P, spinor * const Q, const int Ncy, const int Niter)
{
  int blk, ncy = 0, eo, vol, vols;
  spinor * r, * a, * b, * c;
  double nrm;
  double musave = g_mu;
  double kappasave = g_kappa;
  spinor * b_even, * b_odd, * a_even, * a_odd;
  spinor ** solver_field = NULL;
  /* also get space for mrblk! 6 = 3+3: fields 3-5 serve as one
     contiguous pool of three work fields per block for mrblk */
  const int nr_sf = 6;

  if(kappa_dflgen > 0) {
    g_kappa = kappa_dfl;
  }
  if(mu_dflgen > -10) {
    g_mu = mu_dfl;
    /* make sure the sign is correct! */
    if(g_mu * musave < 0) g_mu *= -1.;
  }
  boundary(g_kappa);

  /*
   * here it would probably be better to get the working fields as a
   * parameter from the calling function
   */
  vols = block_list[0].volume/2 + block_list[0].spinpad;
  vol = block_list[0].volume/2;
  init_solver_field(&solver_field, nb_blocks*2*vols, nr_sf);
  r = solver_field[0];
  a = solver_field[1];
  b = solver_field[2];

  for(ncy = 0; ncy < Ncy; ncy++) {
    /* compute the global residue        */
    /* this can be done more efficiently */
    /* here only a naive implementation  */
    for(eo = 0; eo < 2; eo++) {
      D_psi(r, P);
      diff(r, Q, r, VOLUME);
      nrm = square_norm(r, VOLUME, 1);
      if(g_proc_id == 0 && g_debug_level > 2 && eo == 0) {
        printf("Msap_eo: %d %1.3e mu = %e\n", ncy, nrm, g_mu/2./g_kappa);
        fflush(stdout);
      }
      /* choose the even (odd) blocks; rely on nested parallelism */
#ifdef TM_USE_OMP
#  pragma omp parallel for private(a_even, a_odd, b_even, b_odd, c)
#endif
      for(blk = 0; blk < nb_blocks; blk++) {
        b_even = b + blk*2*vols;
        b_odd = b + blk*2*vols + vols;
        a_even = a + blk*2*vols;
        a_odd = a + blk*2*vols + vols;
        c = solver_field[3] + blk*vols;
        if(block_list[blk].evenodd == eo) {
          /* get part of r corresponding to block blk into b_even and b_odd */
          copy_global_to_block_eo(b_even, b_odd, r, blk);
          if(g_c_sw > 0) {
            assign_mul_one_sw_pm_imu_inv_block(EE, a_even, b_even, g_mu, &block_list[blk]);
            Block_H_psi(&block_list[blk], a_odd, a_even, OE);
            /* a_odd = b_odd - a_odd */
            diff(a_odd, b_odd, a_odd, vol);
            mrblk(b_odd, a_odd, solver_field[3] + blk*2*3*vols, Niter, 1.e-31, 1, vol,
                  &Msw_plus_block_psi, blk);
            Block_H_psi(&block_list[blk], b_even, b_odd, EO);
            assign(c, b_even, vol);
            assign_mul_one_sw_pm_imu_inv_block(EE, b_even, c, g_mu, &block_list[blk]);
          }
          else {
            assign_mul_one_pm_imu_inv(a_even, b_even, +1., vol);
            Block_H_psi(&block_list[blk], a_odd, a_even, OE);
            /* a_odd = b_odd - a_odd */
            diff(a_odd, b_odd, a_odd, vol);
            mrblk(b_odd, a_odd, solver_field[3] + blk*2*3*vols, Niter, 1.e-31, 1, vol,
                  &Mtm_plus_block_psi, blk);
            Block_H_psi(&block_list[blk], b_even, b_odd, EO);
            mul_one_pm_imu_inv(b_even, +1., vol);
          }
          /* a_even = a_even - b_even */
          diff(a_even, a_even, b_even, vol);
          /* add even and odd part up to full spinor P */
          add_eo_block_to_global(P, a_even, b_odd, blk);
        }
      }
    }
  }
  finalize_solver(solver_field, nr_sf);
  g_mu = musave;
  g_kappa = kappasave;
  boundary(g_kappa);
  return;
}
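/*
 * The per-block algebra above is the standard even/odd Schur
 * factorization of the block Dirac matrix
 *
 *   D_B = ( D_ee  D_eo )
 *         ( D_oe  D_oo ),
 *
 * assuming Mtm_plus_block_psi / Msw_plus_block_psi apply the odd-odd
 * Schur operator S = D_oo - D_oe D_ee^{-1} D_eo (an assumption from the
 * call structure): with block sources (b_e, b_o),
 *
 *   x_o ~ S^{-1} (b_o - D_oe D_ee^{-1} b_e)   (approximate: Niter mrblk steps),
 *   x_e = D_ee^{-1} (b_e - D_eo x_o),
 *
 * and the pair (a_even, b_odd) handed to add_eo_block_to_global is
 * exactly (x_e, x_o).
 */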
int main(int argc, char *argv[])
{
  int j, j_max, k, k_max = 1;
#ifdef HAVE_LIBLEMON
  paramsXlfInfo *xlfInfo;
#endif
  int status = 0;

  static double t1, t2, dt, sdt, dts, qdt, sqdt;
  double antioptaway = 0.0;

#ifdef MPI
  static double dt2;

  DUM_DERI = 6;
  DUM_SOLVER = DUM_DERI + 2;
  DUM_MATRIX = DUM_SOLVER + 6;
  NO_OF_SPINORFIELDS = DUM_MATRIX + 2;

#  ifdef OMP
  int mpi_thread_provided;
  MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_thread_provided);
#  else
  MPI_Init(&argc, &argv);
#  endif
  MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id);
#else
  g_proc_id = 0;
#endif

  g_rgi_C1 = 1.;

  /* Read the input file */
  if((status = read_input("benchmark.input")) != 0) {
    fprintf(stderr, "Could not find input file: benchmark.input\nAborting...\n");
    exit(-1);
  }

#ifdef OMP
  if(omp_num_threads > 0) {
    omp_set_num_threads(omp_num_threads);
  }
  else {
    if(g_proc_id == 0)
      printf("# No value provided for OmpNumThreads, running in single-threaded mode!\n");
    omp_num_threads = 1;
    omp_set_num_threads(omp_num_threads);
  }
  init_omp_accumulators(omp_num_threads);
#endif

  tmlqcd_mpi_init(argc, argv);

  if(g_proc_id == 0) {
#ifdef SSE
    printf("# The code was compiled with SSE instructions\n");
#endif
#ifdef SSE2
    printf("# The code was compiled with SSE2 instructions\n");
#endif
#ifdef SSE3
    printf("# The code was compiled with SSE3 instructions\n");
#endif
#ifdef P4
    printf("# The code was compiled for Pentium4\n");
#endif
#ifdef OPTERON
    printf("# The code was compiled for AMD Opteron\n");
#endif
#ifdef _GAUGE_COPY
    printf("# The code was compiled with -D_GAUGE_COPY\n");
#endif
#ifdef BGL
    printf("# The code was compiled for Blue Gene/L\n");
#endif
#ifdef BGP
    printf("# The code was compiled for Blue Gene/P\n");
#endif
#ifdef _USE_HALFSPINOR
    printf("# The code was compiled with -D_USE_HALFSPINOR\n");
#endif
#ifdef _USE_SHMEM
    printf("# The code was compiled with -D_USE_SHMEM\n");
#  ifdef _PERSISTENT
    printf("# The code was compiled for persistent MPI calls (halfspinor only)\n");
#  endif
#endif
#ifdef MPI
#  ifdef _NON_BLOCKING
    printf("# The code was compiled for non-blocking MPI calls (spinor and gauge)\n");
#  endif
#endif
    printf("\n");
    fflush(stdout);
  }

#ifdef _GAUGE_COPY
  init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
#else
  init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
#endif
  init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand);

  if(even_odd_flag) {
    j = init_spinor_field(VOLUMEPLUSRAND/2, 2*k_max + 1);
  }
  else {
    j = init_spinor_field(VOLUMEPLUSRAND, 2*k_max);
  }
  if(j != 0) {
    fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n");
    exit(0);
  }
  j = init_moment_field(VOLUME, VOLUMEPLUSRAND + g_dbw2rand);
  if(j != 0) {
    fprintf(stderr, "Not enough memory for moment fields! Aborting...\n");
    exit(0);
  }

  if(g_proc_id == 0) {
    fprintf(stdout, "# The number of processes is %d \n", g_nproc);
    printf("# The lattice size is %d x %d x %d x %d\n",
           (int)(T*g_nproc_t), (int)(LX*g_nproc_x), (int)(LY*g_nproc_y), (int)(g_nproc_z*LZ));
    printf("# The local lattice size is %d x %d x %d x %d\n",
           (int)(T), (int)(LX), (int)(LY), (int)(LZ));
    if(even_odd_flag) {
      printf("# benchmarking the even/odd preconditioned Dirac operator\n");
    }
    else {
      printf("# benchmarking the standard Dirac operator\n");
    }
    fflush(stdout);
  }

  /* define the geometry */
  geometry();
  /* define the boundary conditions for the fermion fields */
  boundary(g_kappa);

#ifdef _USE_HALFSPINOR
  j = init_dirac_halfspinor();
  if(j != 0) {
    fprintf(stderr, "Not enough memory for halfspinor fields! Aborting...\n");
    exit(0);
  }
  if(g_sloppy_precision_flag == 1) {
    g_sloppy_precision = 1;
    j = init_dirac_halfspinor32();
    if(j != 0) {
      fprintf(stderr, "Not enough memory for 32-bit halfspinor fields! Aborting...\n");
      exit(0);
    }
  }
#  if (defined _PERSISTENT)
  init_xchange_halffield();
#  endif
#endif

  status = check_geometry();
  if(status != 0) {
    fprintf(stderr, "Checking of geometry failed. Unable to proceed.\nAborting...\n");
    exit(1);
  }

#if (defined MPI && !(defined _USE_SHMEM))
  check_xchange();
#endif

  start_ranlux(1, 123456);
  random_gauge_field(reproduce_randomnumber_flag);

#ifdef MPI
  /* for parallelization: exchange the gauge field */
  xchange_gauge(g_gauge_field);
#endif

  if(even_odd_flag) {
    /* initialize the pseudo-fermion fields */
    j_max = 2048;
    sdt = 0.;
    for(k = 0; k < k_max; k++) {
      random_spinor_field(g_spinor_field[k], VOLUME/2, 0);
    }

    while(sdt < 30.) {
#ifdef MPI
      MPI_Barrier(MPI_COMM_WORLD);
#endif
      t1 = gettime();
      antioptaway = 0.0;
      for(j = 0; j < j_max; j++) {
        for(k = 0; k < k_max; k++) {
          Hopping_Matrix(0, g_spinor_field[k+k_max], g_spinor_field[k]);
          Hopping_Matrix(1, g_spinor_field[2*k_max], g_spinor_field[k+k_max]);
          antioptaway += creal(g_spinor_field[2*k_max][0].s0.c0);
        }
      }
      t2 = gettime();
      dt = t2 - t1;
#ifdef MPI
      MPI_Allreduce(&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
#else
      sdt = dt;
#endif
      qdt = dt*dt;
#ifdef MPI
      MPI_Allreduce(&qdt, &sqdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
#else
      sqdt = qdt;
#endif
      sdt = sdt/((double)g_nproc);
      sqdt = sqrt(sqdt/g_nproc - sdt*sdt);
      j_max *= 2;
    }
    j_max = j_max/2;
    dts = dt;
    sdt = 1.0e6f*sdt/((double)(k_max*j_max*(VOLUME)));
    sqdt = 1.0e6f*sqdt/((double)(k_max*j_max*(VOLUME)));

    if(g_proc_id == 0) {
      printf("# The following result is just to make sure that the calculation is not optimized away: %e\n", antioptaway);
      printf("# Total compute time %e sec, variance of the time %e sec. (%d iterations).\n",
             sdt, sqdt, j_max);
      printf("# Communication switched on:\n# (%d Mflops [%d bit arithmetic])\n",
             (int)(1608.0f/sdt), (int)sizeof(spinor)/3);
#ifdef OMP
      printf("# Mflops per OpenMP thread ~ %d\n", (int)(1608.0f/(omp_num_threads*sdt)));
#endif
      printf("\n");
      fflush(stdout);
    }

#ifdef MPI
    /* isolated computation */
    t1 = gettime();
    antioptaway = 0.0;
    for(j = 0; j < j_max; j++) {
      for(k = 0; k < k_max; k++) {
        Hopping_Matrix_nocom(0, g_spinor_field[k+k_max], g_spinor_field[k]);
        Hopping_Matrix_nocom(1, g_spinor_field[2*k_max], g_spinor_field[k+k_max]);
        antioptaway += creal(g_spinor_field[2*k_max][0].s0.c0);
      }
    }
    t2 = gettime();
    dt2 = t2 - t1;
    /* compute the bandwidth */
    dt = dts - dt2;
    MPI_Allreduce(&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    sdt = sdt/((double)g_nproc);
    MPI_Allreduce(&dt2, &dt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    dt = dt/((double)g_nproc);
    dt = 1.0e6f*dt/((double)(k_max*j_max*(VOLUME)));

    if(g_proc_id == 0) {
      printf("# The following result is printed just to make sure that the calculation is not optimized away: %e\n", antioptaway);
      printf("# Communication switched off: \n# (%d Mflops [%d bit arithmetic])\n",
             (int)(1608.0f/dt), (int)sizeof(spinor)/3);
#ifdef OMP
      printf("# Mflops per OpenMP thread ~ %d\n", (int)(1608.0f/(omp_num_threads*dt)));
#endif
      printf("\n");
      fflush(stdout);
    }
    sdt = sdt/((double)k_max);
    sdt = sdt/((double)j_max);
    sdt = sdt/((double)(2*SLICE));

    if(g_proc_id == 0) {
      printf("# The size of the package is %d bytes.\n", (SLICE)*192);
#ifdef _USE_HALFSPINOR
      printf("# The bandwidth is %5.2f + %5.2f MB/sec\n",
             192./sdt/1024/1024, 192./sdt/1024./1024);
#else
      printf("# The bandwidth is %5.2f + %5.2f MB/sec\n",
             2.*192./sdt/1024/1024, 2.*192./sdt/1024./1024);
#endif
    }
#endif
    fflush(stdout);
  }
  else {
    /* the non even/odd case now */
    /* initialize the pseudo-fermion fields */
    j_max = 1;
    sdt = 0.;
    for(k = 0; k < k_max; k++) {
      random_spinor_field(g_spinor_field[k], VOLUME, 0);
    }

    while(sdt < 3.) {
#ifdef MPI
      MPI_Barrier(MPI_COMM_WORLD);
#endif
      t1 = gettime();
      for(j = 0; j < j_max; j++) {
        for(k = 0; k < k_max; k++) {
          D_psi(g_spinor_field[k+k_max], g_spinor_field[k]);
          antioptaway += creal(g_spinor_field[k+k_max][0].s0.c0);
        }
      }
      t2 = gettime();
      dt = t2 - t1;
#ifdef MPI
      MPI_Allreduce(&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
#else
      sdt = dt;
#endif
      qdt = dt*dt;
#ifdef MPI
      MPI_Allreduce(&qdt, &sqdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
#else
      sqdt = qdt;
#endif
      sdt = sdt/((double)g_nproc);
      sqdt = sqrt(sqdt/g_nproc - sdt*sdt);
      j_max *= 2;
    }
    j_max = j_max/2;
    dts = dt;
    sdt = 1.0e6f*sdt/((double)(k_max*j_max*(VOLUME)));
    sqdt = 1.0e6f*sqdt/((double)(k_max*j_max*(VOLUME)));

    if(g_proc_id == 0) {
      printf("# The following result is just to make sure that the calculation is not optimized away: %e\n", antioptaway);
      printf("# Total compute time %e sec, variance of the time %e sec. (%d iterations).\n",
             sdt, sqdt, j_max);
      printf("\n# (%d Mflops [%d bit arithmetic])\n",
             (int)(1680.0f/sdt), (int)sizeof(spinor)/3);
#ifdef OMP
      printf("# Mflops per OpenMP thread ~ %d\n", (int)(1680.0f/(omp_num_threads*sdt)));
#endif
      printf("\n");
      fflush(stdout);
    }
  }

#ifdef HAVE_LIBLEMON
  if(g_proc_id == 0) {
    printf("# Performing parallel IO test ...\n");
  }
  xlfInfo = construct_paramsXlfInfo(0.5, 0);
  write_gauge_field("conf.test", 64, xlfInfo);
  free(xlfInfo);
  if(g_proc_id == 0) {
    printf("# done ...\n");
  }
#endif

#ifdef MPI
  MPI_Finalize();
#endif
#ifdef OMP
  free_omp_accumulators();
#endif
  free_gauge_field();
  free_geometry_indices();
  free_spinor_field();
  free_moment_field();
  return(0);
}
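/*
 * Performance accounting used above: after the timing loop, sdt holds the
 * time per lattice site and operator application in microseconds,
 *
 *   t_site = 1e6 * (summed time) / (k_max * j_max * VOLUME),
 *   Mflops = flops_per_site / t_site,
 *
 * with 1608 flops/site hard-coded for the even/odd hopping-matrix pair
 * and 1680 flops/site for the full D_psi. The "(%d bit arithmetic)"
 * field, (int)sizeof(spinor)/3, evaluates to 64 for the 192-byte
 * double-precision spinor.
 */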
void Msap_eo(spinor * const P, spinor * const Q, const int Ncy)
{
  int blk, ncy = 0, eo, vol;
  spinor * r, * a, * b;
  double nrm;
  spinor * b_even, * b_odd, * a_even, * a_odd;
  spinor ** solver_field = NULL;
  const int nr_sf = 3;

  /*
   * here it would probably be better to get the working fields as a
   * parameter from the calling function
   */
  init_solver_field(&solver_field, VOLUME, nr_sf);
  r = solver_field[0];
  a = solver_field[1];
  b = solver_field[2];
  vol = block_list[0].volume/2;
  b_even = b;
  b_odd = b + vol + 1;
  a_even = a;
  a_odd = a + vol + 1;

  for(ncy = 0; ncy < Ncy; ncy++) {
    /* compute the global residue        */
    /* this can be done more efficiently */
    /* here only a naive implementation  */
    for(eo = 0; eo < 2; eo++) {
      D_psi(r, P);
      diff(r, Q, r, VOLUME);
      nrm = square_norm(r, VOLUME, 1);
      if(g_proc_id == 0 && g_debug_level > 1 && eo == 1) {
        printf("Msap: %d %1.3e\n", ncy, nrm);
      }
      /* choose the even (odd) blocks */
      for(blk = 0; blk < nb_blocks; blk++) {
        if(block_list[blk].evenodd == eo) {
          /* get part of r corresponding to block blk into b_even and b_odd */
          copy_global_to_block_eo(b_even, b_odd, r, blk);
          assign_mul_one_pm_imu_inv(a_even, b_even, +1., vol);
          Block_H_psi(&block_list[blk], a_odd, a_even, OE);
          /* a_odd = b_odd - a_odd */
          assign_mul_add_r(a_odd, -1., b_odd, vol);
          mrblk(b_odd, a_odd, 3, 1.e-31, 1, vol, &Mtm_plus_block_psi, blk);
          Block_H_psi(&block_list[blk], b_even, b_odd, EO);
          mul_one_pm_imu_inv(b_even, +1., vol);
          /* a_even = a_even - b_even */
          assign_add_mul_r(a_even, b_even, -1., vol);
          /* add even and odd part up to full spinor P */
          add_eo_block_to_global(P, a_even, b_odd, blk);
        }
      }
    }
  }
  finalize_solver(solver_field, nr_sf);
  return;
}
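/*
 * mrblk presumably performs the textbook minimal-residual iteration on
 * the block system M x = b (an assumption about its internals; only the
 * call signature is visible here):
 *
 *   alpha_k = (M r_k, r_k) / (M r_k, M r_k),
 *   x_{k+1} = x_k + alpha_k r_k,
 *   r_{k+1} = r_k - alpha_k M r_k,
 *
 * stopped after the given number of steps or at the 1.e-31
 * squared-residual threshold.
 */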
void Msap_eo(spinor * const P, spinor * const Q, const int Ncy, const int Niter)
{
  int ncy = 0, vol, vols;
  spinor * r, * a, * b;
  double nrm;
  double musave = g_mu;
  double kappasave = g_kappa;
  spinor ** solver_field = NULL;
  /* also get space for mrblk! 6 = 3+3 */
  const int nr_sf = 6;

  if(kappa_Msap > 0) {
    g_kappa = kappa_Msap;
  }
  if(mu_Msap > -10) {
    g_mu = mu_Msap;
    /* make sure the sign is correct! */
    if(g_mu * musave < 0) g_mu *= -1.;
  }
  boundary(g_kappa);

  /*
   * here it would probably be better to get the working fields as a
   * parameter from the calling function
   */
  vols = block_list[0].volume/2 + block_list[0].spinpad;
  vol = block_list[0].volume/2;
  init_solver_field(&solver_field, nb_blocks*2*vols, nr_sf);
  r = solver_field[0];
  a = solver_field[1];
  b = solver_field[2];

  /* sort the blocks into the two colors once, so that each color can be
     processed with a flat parallel loop */
  int * blk_e_list = malloc(nb_blocks/2*sizeof(int));
  int * blk_o_list = malloc(nb_blocks/2*sizeof(int));
  int iblke = 0, iblko = 0;
  for(int blk = 0; blk < nb_blocks; blk++) {
    if(block_list[blk].evenodd == 0) {
      blk_e_list[iblke] = blk;
      iblke++;
    }
    else {
      blk_o_list[iblko] = blk;
      iblko++;
    }
  }

  for(ncy = 0; ncy < Ncy; ncy++) {
    /* compute the global residue        */
    /* this can be done more efficiently */
    /* here only a naive implementation  */
    for(int eo = 0; eo < 2; eo++) {
      D_psi(r, P);
      diff(r, Q, r, VOLUME);
      nrm = square_norm(r, VOLUME, 1);
      if(g_proc_id == 0 && g_debug_level > 2 && eo == 0) {
        printf("Msap_eo: %d %1.3e mu = %e\n", ncy, nrm, g_mu/2./g_kappa);
        fflush(stdout);
      }
      int * blk_eo_list;
      if(eo == 0) {
        blk_eo_list = blk_e_list;
      }
      else {
        blk_eo_list = blk_o_list;
      }
      /* choose the even (odd) blocks; rely on nested parallelism */
#ifdef TM_USE_OMP
#  pragma omp parallel for
#endif
      for(int iblk = 0; iblk < nb_blocks/2; iblk++) {
        int blk = blk_eo_list[iblk];
        spinor32 * b_even = (spinor32*) (b + blk*2*vols);
        spinor32 * b_odd = (spinor32*) (b + blk*2*vols + vols);
        spinor32 * a_even = (spinor32*) (a + blk*2*vols);
        spinor32 * a_odd = (spinor32*) (a + blk*2*vols + vols);
        /* mrblk needs 3 solver fields which we distribute according to
           the block number */
        spinor32 * c = (spinor32*) (solver_field[3] + blk*2*3*vols);

        /* get part of r corresponding to block blk into b_even and b_odd */
        copy_global_to_block_eo_32(b_even, b_odd, r, blk);
        if(g_c_sw > 0) {
          assign_mul_one_sw_pm_imu_inv_block_32(EE, a_even, b_even, g_mu, &block_list[blk]);
          Block_H_psi_32(&block_list[blk], a_odd, a_even, OE);
          /* a_odd = b_odd - a_odd */
          diff_32(a_odd, b_odd, a_odd, vol);
          mrblk_32(b_odd, a_odd, c, Niter, 1.e-31, 1, vol, &Msw_plus_block_psi_32, blk);
          Block_H_psi_32(&block_list[blk], b_even, b_odd, EO);
          assign_32(c, b_even, vol);
          assign_mul_one_sw_pm_imu_inv_block_32(EE, b_even, c, g_mu, &block_list[blk]);
        }
        else {
          assign_mul_one_pm_imu_inv_32(a_even, b_even, +1., vol);
          Block_H_psi_32(&block_list[blk], a_odd, a_even, OE);
          /* a_odd = b_odd - a_odd */
          diff_32(a_odd, b_odd, a_odd, vol);
          mrblk_32(b_odd, a_odd, c, Niter, 1.e-31, 1, vol, &Mtm_plus_block_psi_32, blk);
          Block_H_psi_32(&block_list[blk], b_even, b_odd, EO);
          mul_one_pm_imu_inv_32(b_even, +1., vol);
        }
        /* a_even = a_even - b_even */
        diff_32(a_even, a_even, b_even, vol);
        /* add even and odd part up to full spinor P */
        add_eo_block_32_to_global(P, a_even, b_odd, blk);
      }
    }
  }
  free(blk_e_list);
  free(blk_o_list);
  finalize_solver(solver_field, nr_sf);
  g_mu = musave;
  g_kappa = kappasave;
  boundary(g_kappa);
  return;
}
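/*
 * Typical use of Msap_eo is as an approximate inverse inside a flexible
 * outer solver. A hypothetical driver (names illustrative; cycle and
 * iteration counts are tuning parameters, not values from the original):
 */
void apply_sap_preconditioner(spinor * const z, spinor * const r)
{
  /* Msap_eo accumulates block corrections onto its first argument, so
     start from z = 0 to obtain z ~ D^{-1} r */
  zero_spinor_field(z, VOLUME);
  Msap_eo(z, r, 4, 5);   /* e.g. 4 SAP cycles with 5 MR steps per block */
}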
void Q_plus_psi(spinor * const l, spinor * const k)
{
  D_psi(l, k);
  gamma5(l, l, VOLUME);
}
void poly_nonherm_precon(spinor * const R, spinor * const S,
                         const double e, const double d, const int n, const int N)
{
  int j;
  double a1, a2, dtmp, dbg;
  static spinor *work, *work_;
  static int initpnH = 0;
  spinor * psi, * chi, * tmp0, * tmp1, * cptmp;

  if(initpnH == 0) {
    work_ = calloc(4*VOLUMEPLUSRAND + 1, sizeof(spinor));
#if (defined SSE || defined SSE2 || defined SSE3)
    work = (spinor *)(((unsigned long int)(work_) + ALIGN_BASE) & ~ALIGN_BASE);
#else
    work = work_;
#endif
    initpnH = 1;
  }
  psi = work;
  chi = &work[VOLUMEPLUSRAND];
  tmp0 = &work[2*VOLUMEPLUSRAND];
  tmp1 = &work[3*VOLUMEPLUSRAND];

  /* signs to be clarified!! */
  /* P_0 * S */
  mul_r(psi, 1./d, S, N);
  /* P_1 * S = a_1 (1 + kappa*H) * S */
  a1 = d/(d*d - e*e/2.);
  boundary(g_kappa/d);
  dtmp = g_mu;   /* save g_mu; dtmp must stay untouched until the restore below */
  g_mu = g_mu/d;
  D_psi(chi, S);
  mul_r(chi, a1, chi, N);
  boundary(g_kappa);
  g_mu = dtmp;
  /* boundary(-g_kappa); */
  /* g_mu = -g_mu; */
  /* D_psi(aux, chi); */
  /* diff(aux, aux, S, N); */
  /* dtmp = square_norm(aux, N, 1); */
  /* printf("1 %1.3e\n", dtmp); */
  /* boundary(-g_kappa); */
  /* g_mu = -g_mu; */
  /* assign(chi, d, N); */

  for(j = 2; j < n+1; j++) {
    /* a_n */
    a2 = 1./(d - a1*e*e/4.);
    /* 1 - a_n */
    a1 = 1. - d*a2;
    /* aux = a_n*S + (1-a_n)*psi */
    mul_add_mul_r(tmp0, S, psi, a2, a1, N);
    /* sv = kappa H chi = (D_psi(-kappa, -2kappamu) - 1) chi */
    D_psi(tmp1, chi);
    /* why is the following sign like this? */
    diff(tmp1, chi, tmp1, N);
    /* psi = aux + a_n * sv */
    mul_add_mul_r(psi, tmp0, tmp1, 1., a2, N);
    cptmp = psi;
    psi = chi;
    chi = cptmp;

    /* boundary(-g_kappa); */
    /* g_mu = -g_mu; */
    if(g_debug_level > 4) {
      /* use dbg here rather than dtmp, which still holds the saved g_mu
         restored at the end of the function */
      D_psi(tmp0, chi);
      diff(tmp0, tmp0, S, N);
      dbg = square_norm(tmp0, N, 1);
      if(g_proc_id == 0) printf("poly %d %1.3e\n", j, dbg);
    }
    /* boundary(-g_kappa); */
    /* g_mu = -g_mu; */
    a1 = a2;
  }
  assign(R, chi, N);
  boundary(g_kappa);
  g_mu = dtmp;
  return;
}
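/*
 * A reading of the recurrence (not documented in the code): with
 * psi_0 = S/d, psi_1 = a_1 (1 + kappa H) S (the first D_psi application
 * at rescaled kappa and mu), a_1 = d/(d^2 - e^2/2) and
 * a_n = 1/(d - a_{n-1} e^2/4), the loop builds
 *
 *   psi_n = a_n ( S + (1 - D) psi_{n-1} ) + (1 - d a_n) psi_{n-2},
 *
 * i.e. a Chebyshev-like semi-iteration approximating D^{-1} on an
 * ellipse with center d and focal parameter e, used here as a
 * non-hermitian polynomial preconditioner.
 */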