Exemple #1
0
/*
 * Benchmark one multi-dimensional complex transform of shape `sz`.
 *
 * The transform is always planned in place (FFTW_IN_PLACE is OR-ed into
 * `flags`).  When `specific` is non-zero the plan is created with
 * fftwnd_create_plan_specific() against the benchmark buffer, otherwise
 * with the generic fftwnd_create_plan().  Planner time, the plan itself,
 * the time per transform / per point and the "mflops" figure are printed
 * according to the verbosity level handled by WHEN_VERBOSE.
 */
void test_speed_nd_aux(struct size sz,
		       fftw_direction dir, int flags, int specific)
{
     fftw_complex *buf;
     fftwnd_plan plan;
     fftw_time t0, t1;
     double elapsed;
     int d;
     int npoints = 1;
     int plan_flags;

     /* only bench in-place multi-dim transforms */
     plan_flags = speed_flag | (flags | FFTW_IN_PLACE)
	  | wisdom_flag | no_vector_flag;

     /* total number of points = product of all dimensions */
     d = 0;
     while (d < sz.rank) {
	  npoints *= sz.narray[d];
	  ++d;
     }

     buf = (fftw_complex *) fftw_malloc(npoints * howmany_fields *
					sizeof(fftw_complex));

     /* time the planner */
     t0 = fftw_get_time();
     plan = specific
	  ? fftwnd_create_plan_specific(sz.rank, sz.narray, dir,
					plan_flags,
					buf, howmany_fields, 0, 1)
	  : fftwnd_create_plan(sz.rank, sz.narray, dir, plan_flags);
     t1 = fftw_get_time();
     CHECK(plan != NULL, "can't create plan");

     elapsed = fftw_time_to_sec(fftw_time_diff(t1, t0));
     WHEN_VERBOSE(2, printf("time for planner: %f s\n", elapsed));

     WHEN_VERBOSE(2, printf("\n"));
     WHEN_VERBOSE(2, (fftwnd_print_plan(plan)));
     WHEN_VERBOSE(2, printf("\n"));

     /* time the transform itself; `elapsed` receives the result */
     FFTW_TIME_FFT(fftwnd(plan, howmany_fields,
			  buf, howmany_fields, 1, 0, 0, 0),
		   buf, npoints * howmany_fields, elapsed);

     fftwnd_destroy_plan(plan);

     /* two separate printf calls are kept on purpose: each formats one
        smart_sprint_time() result before the next call is made */
     WHEN_VERBOSE(1, printf("time for one fft: %s",
			    smart_sprint_time(elapsed)));
     WHEN_VERBOSE(1, printf(" (%s/point)\n",
			    smart_sprint_time(elapsed / npoints)));
     WHEN_VERBOSE(1, printf("\"mflops\" = 5 (N log2 N) / (t in microseconds)"
			    " = %f\n",
			    howmany_fields * mflops(elapsed, npoints)));

     fftw_free(buf);

     WHEN_VERBOSE(1, printf("\n"));
}
Exemple #2
0
/*
 * Two-dimensional convenience wrapper: packs (nx, ny) into a rank-2
 * dimension array and forwards every other argument unchanged to
 * fftwnd_create_plan_specific().  Returns whatever that call returns.
 */
fftwnd_plan fftw2d_create_plan_specific(int nx, int ny,
					fftw_direction dir, int flags,
					fftw_complex *in, int istride,
					fftw_complex *out, int ostride)
{
     enum { RANK_2D = 2 };
     int dims[RANK_2D];
     fftwnd_plan plan;

     dims[0] = nx;
     dims[1] = ny;

     plan = fftwnd_create_plan_specific(RANK_2D, dims, dir, flags,
					in, istride, out, ostride);
     return plan;
}
Exemple #3
0
/*
 * Class:     jfftw_complex_nd_Plan
 * Method:    createPlanSpecific
 * Signature: ([III[DI[DI)V
 *
 * Creates an n-dimensional FFTW plan specialised for the given Java
 * arrays and stores the raw fftwnd_plan handle, byte for byte, in a fresh
 * byte[] that is assigned to the "plan" field of `obj`.
 *
 *   dim          dimensions of the transform; its length is the rank
 *   dir, flags   passed straight through to fftwnd_create_plan_specific
 *   in,  idist   input array and the stride argument used for it
 *   out, odist   output array and the stride argument used for it
 *
 * On any JNI failure (missing field, allocation failure, ...) the function
 * returns with the Java exception raised by the failing JNI call still
 * pending and leaves the "plan" field untouched.
 */
JNIEXPORT void JNICALL Java_jfftw_complex_nd_Plan_createPlanSpecific( JNIEnv *env, jobject obj, jintArray dim, jint dir, jint flags, jdoubleArray in, jint idist, jdoubleArray out, jint odist )
{
	jclass clazz;
	jclass lock;
	jfieldID id;
	jbyteArray arr;
	jbyte *carr = NULL;
	jint *cdim = NULL;
	jdouble *cin = NULL;
	jdouble *cout = NULL;
	int rank;
	int created = 0;

	/* the Java double[] buffers are reinterpreted as fftw_complex below */
	if( sizeof( jdouble ) != sizeof( fftw_real ) )
	{
		(*env)->ThrowNew( env, (*env)->FindClass( env, "java/lang/RuntimeException" ), "jdouble and fftw_real are incompatible" );
		return;
	}
	/* the Java int[] is handed to FFTW as an int array below */
	if( sizeof( jint ) != sizeof( int ) )
	{
		(*env)->ThrowNew( env, (*env)->FindClass( env, "java/lang/RuntimeException" ), "jint and int are incompatible" );
		return;
	}

	clazz = (*env)->GetObjectClass( env, obj );
	id    = (*env)->GetFieldID( env, clazz, "plan", "[B" );
	if( id == NULL ) return;

	/* class whose monitor serialises plan creation; looked up once and
	   reused for both MonitorEnter and MonitorExit */
	lock  = (*env)->FindClass( env, "jfftw/Plan" );
	if( lock == NULL ) return;

	arr   = (*env)->NewByteArray( env, sizeof( fftwnd_plan ) );
	if( arr == NULL ) return;

	rank  = (*env)->GetArrayLength( env, dim );

	/* acquire the native views one by one; stop at the first failure
	   because an exception is pending from that point on */
	carr  = (*env)->GetByteArrayElements( env, arr, 0 );
	if( carr == NULL ) goto cleanup;
	cdim  = (*env)->GetIntArrayElements( env, dim, 0 );
	if( cdim == NULL ) goto cleanup;
	cin   = (*env)->GetDoubleArrayElements( env, in, 0 );
	if( cin == NULL ) goto cleanup;
	cout  = (*env)->GetDoubleArrayElements( env, out, 0 );
	if( cout == NULL ) goto cleanup;

	/* NOTE(review): the lock is presumably there because FFTW planning is
	   not thread-safe -- confirm against the Java side of jfftw.Plan */
	if( (*env)->MonitorEnter( env, lock ) != 0 ) goto cleanup;

	*(fftwnd_plan*)carr = fftwnd_create_plan_specific( rank, (int*)cdim, dir, flags, (fftw_complex*)cin, idist, (fftw_complex*)cout, odist );

	(*env)->MonitorExit( env, lock );
	created = 1;

cleanup:
	if( cin != NULL )  (*env)->ReleaseDoubleArrayElements( env, in, cin, 0 );
	if( cout != NULL ) (*env)->ReleaseDoubleArrayElements( env, out, cout, 0 );
	/* the dimension array was never released before, leaking the pinned
	   or copied elements on every call */
	if( cdim != NULL ) (*env)->ReleaseIntArrayElements( env, dim, cdim, 0 );
	/* mode 0 copies the plan bytes back into the Java byte[] */
	if( carr != NULL ) (*env)->ReleaseByteArrayElements( env, arr, carr, 0 );

	if( created )
	{
		(*env)->SetObjectField( env, obj, id, arr );
	}
}
Exemple #4
0
/*
 * delta_pp_2pt_v4: timeslice-by-timeslice contraction of a two-point
 * function built from n_s*n_c = 12 up-type propagator components.
 *
 * Outline:
 *   1. parse the command line (-f input file, -a ascii output, -F fermion
 *      type "Wilson"/"tm", -g gauge transform, -v verbose, -h/-? usage)
 *      and read the input file;
 *   2. allocate one 3-d volume (VOL3 = LX*LY*LZ) worth of gauge field,
 *      spinor fields, contraction buffer connq and FFT work buffer;
 *   3. for every timeslice: optionally read and smear the gauge field,
 *      read (and optionally Jacobi-smear) the propagators, perform the six
 *      contractions per gamma component, apply the boundary-condition
 *      factor, Fourier transform in 3-d, multiply by the source-location
 *      phase, write the result and accumulate the two projections in connt;
 *   4. write connt to "<prefix>...fw" and "<prefix>...bw" and free memory.
 *
 * connt layout (2 doubles = re,im per entry):
 *   connt[2*(icomp*T + t)]                    first ("fwd") projection
 *   connt[2*(icomp*T + t + num_component*T)]  second ("bwd") projection
 *
 * Returns 0 on success; all error paths terminate via exit().
 */
int main(int argc, char **argv) {
  
  const int n_c=3;
  const int n_s=4;
  const char outfile_prefix[] = "delta_pp_2pt_v4";

  int c, i, icomp;
  int filename_set = 0;
  int append, status;
  int l_LX_at, l_LXstart_at;
  int ix, it, iix, x1,x2,x3;
  int ir, ir2, is;
  int VOL3;
  int do_gt=0;
  int dims[3];
  double *connt=NULL;
  spinor_propagator_type *connq=NULL;
  int verbose = 0;
  int sx0, sx1, sx2, sx3;
  int write_ascii=0;
  int fermion_type = 1;  // Wilson fermion type
  int pos;
  char filename[200], contype[200], gauge_field_filename[200];
  double ratime, retime;
  //double plaq_m, plaq_r;
  double *work=NULL;
  fermion_propagator_type *fp1=NULL, *fp2=NULL, *fp3=NULL, *uprop=NULL, *dprop=NULL, *fpaux=NULL;
  spinor_propagator_type *sp1=NULL, *sp2=NULL;
  double q[3], phase, *gauge_trafo=NULL;
  complex w, w1;
  size_t items, bytes;
  FILE *ofs;
  int timeslice;
  DML_Checksum ildg_gauge_field_checksum, *spinor_field_checksum=NULL, connq_checksum;
  uint32_t nersc_gauge_field_checksum;
  int threadid, nthreads;

/*******************************************************************
 * Gamma components for the Delta:
 *                                                                 */
  const int num_component = 4;
  int gamma_component[2][4] = { {0, 1, 2, 3},
                                {0, 1, 2, 3} };
  double gamma_component_sign[4] = {+1.,+1.,-1.,+1.};
/*
 *******************************************************************/
  fftw_complex *in=NULL;
#ifdef MPI
   fftwnd_mpi_plan plan_p;
#else
   fftwnd_plan plan_p;
#endif 

  /* A local "MPI_Status status" used to be declared here under MPI; it
   * clashed with "int status" above (same scope, different type) and was
   * never used, so it has been dropped.
   * NOTE(review): with MPI defined plan_p is an fftwnd_mpi_plan but is
   * still filled by fftwnd_create_plan_specific() below -- the MPI build
   * of this program needs a closer look. */

#ifdef MPI
  MPI_Init(&argc, &argv);
#endif

  while ((c = getopt(argc, argv, "ah?vgf:F:")) != -1) {
    switch (c) {
    case 'v':
      verbose = 1;
      break;
    case 'f':
      /* filename is a fixed 200-byte buffer; refuse anything longer
       * instead of overflowing it */
      if(strlen(optarg) >= sizeof(filename)) {
        fprintf(stderr, "[] Error, input filename too long\n");
        exit(146);
      }
      strcpy(filename, optarg);
      filename_set=1;
      break;
    case 'a':
      write_ascii = 1;
      fprintf(stdout, "# [] will write in ascii format\n");
      break;
    case 'F':
      if(strcmp(optarg, "Wilson") == 0) {
        fermion_type = _WILSON_FERMION;
      } else if(strcmp(optarg, "tm") == 0) {
        fermion_type = _TM_FERMION;
      } else {
        fprintf(stderr, "[] Error, unrecognized fermion type\n");
        exit(145);
      }
      fprintf(stdout, "# [] will use fermion type %s ---> no. %d\n", optarg, fermion_type);
      break;
    case 'g':
      do_gt = 1;
      fprintf(stdout, "# [] will perform gauge transform\n");
      break;
    case 'h':
    case '?':
    default:
      usage();
      break;
    }
  }

  /* set the default values */
  if(filename_set==0) strcpy(filename, "cvc.input");
  fprintf(stdout, "# reading input from file %s\n", filename);
  read_input_parser(filename);

  /* some checks on the input data */
  if((T_global == 0) || (LX==0) || (LY==0) || (LZ==0)) {
    if(g_proc_id==0) fprintf(stdout, "T and L's must be set\n");
    usage();
  }
  if(g_kappa == 0.) {
    /* the message used to end in a literal 'n' instead of a newline */
    if(g_proc_id==0) fprintf(stdout, "kappa should be > 0.\n");
    usage();
  }

#ifdef OPENMP
  omp_set_num_threads(g_num_threads);
#else
  fprintf(stdout, "[delta_pp_2pt_v4] Warning, resetting global thread number to 1\n");
  g_num_threads = 1;
#endif

  /* initialize MPI parameters */
  mpi_init(argc, argv);

#ifdef OPENMP
  status = fftw_threads_init();
  if(status != 0) {
    fprintf(stderr, "\n[] Error from fftw_init_threads; status was %d\n", status);
    exit(120);
  }
#endif

  /******************************************************
   *
   ******************************************************/
  VOL3 = LX*LY*LZ;
  l_LX_at      = LX;
  l_LXstart_at = 0;
  FFTW_LOC_VOLUME = T*LX*LY*LZ;
  fprintf(stdout, "# [%2d] parameters:\n"\
		  "# [%2d] l_LX_at      = %3d\n"\
		  "# [%2d] l_LXstart_at = %3d\n"\
		  "# [%2d] FFTW_LOC_VOLUME = %3d\n", 
		  g_cart_id, g_cart_id, l_LX_at,
		  g_cart_id, l_LXstart_at, g_cart_id, FFTW_LOC_VOLUME);

  if(init_geometry() != 0) {
    fprintf(stderr, "ERROR from init_geometry\n");
    exit(1);
  }

  geometry();

  /* the gauge field is only needed when smearing is requested */
  if(N_Jacobi>0) {

    // alloc the gauge field
    alloc_gauge_field(&g_gauge_field, VOL3);
    switch(g_gauge_file_format) {
      case 0:
        snprintf(gauge_field_filename, sizeof(gauge_field_filename), "%s.%.4d", gaugefilename_prefix, Nconf);
        break;
      case 1:
        snprintf(gauge_field_filename, sizeof(gauge_field_filename), "%s.%.5d", gaugefilename_prefix, Nconf);
        break;
      default:
        /* without this the filename stayed uninitialised and the read
         * status checked in the timeslice loop was never set */
        fprintf(stderr, "[] Error, unrecognized gauge file format %d\n", g_gauge_file_format);
        exit(22);
    }
  } else {
    g_gauge_field = NULL;
  }


  /*********************************************************************
   * gauge transformation
   *********************************************************************/
  if(do_gt) { init_gauge_trafo(&gauge_trafo, 1.); }

  // determine the source location
  sx0 = g_source_location/(LX*LY*LZ)-Tstart;
  sx1 = (g_source_location%(LX*LY*LZ)) / (LY*LZ);
  sx2 = (g_source_location%(LY*LZ)) / LZ;
  sx3 = (g_source_location%LZ);
//  g_source_time_slice = sx0;
  fprintf(stdout, "# [] source location %d = (%d,%d,%d,%d)\n", g_source_location, sx0, sx1, sx2, sx3);

  // allocate memory for the spinor fields
  g_spinor_field = NULL;
  no_fields = n_s*n_c;
//  if(fermion_type == _TM_FERMION) {
//    no_fields *= 2;
//  }
  /* one extra field serves as smearing work space */
  if(N_Jacobi>0) no_fields++;
  g_spinor_field = (double**)calloc(no_fields, sizeof(double*));
  if(g_spinor_field == NULL) {
    fprintf(stderr, "[] Error, could not alloc spinor field pointers\n");
    exit(74);
  }
  for(i=0; i<no_fields-1; i++) alloc_spinor_field(&g_spinor_field[i], VOL3);
  alloc_spinor_field(&g_spinor_field[no_fields-1], VOL3);
  /* work aliases the last field; it is only passed to the Jacobi smearing
   * routines, which are called for N_Jacobi > 0 only */
  work = g_spinor_field[no_fields-1];

  spinor_field_checksum = (DML_Checksum*)malloc(no_fields * sizeof(DML_Checksum) );
  if(spinor_field_checksum == NULL ) {
    fprintf(stderr, "[] Error, could not alloc checksums for spinor fields\n");
    exit(73);
  }

  // allocate memory for the contractions
  /* 2 (re,im) x 2 (fwd,bwd) x num_component x T doubles */
  items = 4* num_component*T;
  bytes = sizeof(double);
  connt = (double*)malloc(items*bytes);
  if(connt == NULL) {
    fprintf(stderr, "\n[] Error, could not alloc connt\n");
    exit(2);
  }
  for(ix=0; ix<items; ix++) connt[ix] = 0.;

  items = num_component * (size_t)VOL3;
  connq = create_sp_field( items );
  if(connq == NULL) {
    fprintf(stderr, "\n[] Error, could not alloc connq\n");
    exit(2);
  }


  /******************************************************
   * initialize FFTW
   ******************************************************/
  items = 2 * num_component * g_sv_dim * g_sv_dim * VOL3;
  bytes = sizeof(double);
  in  = (fftw_complex*)malloc(num_component*g_sv_dim*g_sv_dim*VOL3*sizeof(fftw_complex));
  if(in == NULL) {
    fprintf(stderr, "[] Error, could not malloc in for FFTW\n");
    exit(155);
  }
  dims[0]=LX; dims[1]=LY; dims[2]=LZ;
  //plan_p = fftwnd_create_plan(3, dims, FFTW_FORWARD, FFTW_MEASURE | FFTW_IN_PLACE);
  /* out-of-place plan from in to connq; the stride is the number of
   * complex entries stored per lattice site */
  plan_p = fftwnd_create_plan_specific(3, dims, FFTW_FORWARD, FFTW_MEASURE, in, num_component*g_sv_dim*g_sv_dim, (fftw_complex*)( connq[0][0] ), num_component*g_sv_dim*g_sv_dim);

  /* per-thread scratch propagators, indexed by threadid below */
  uprop = (fermion_propagator_type*)malloc(g_num_threads * sizeof(fermion_propagator_type) );
  fp1   = (fermion_propagator_type*)malloc(g_num_threads * sizeof(fermion_propagator_type) );
  fp2   = (fermion_propagator_type*)malloc(g_num_threads * sizeof(fermion_propagator_type) );
  fp3   = (fermion_propagator_type*)malloc(g_num_threads * sizeof(fermion_propagator_type) );
  fpaux = (fermion_propagator_type*)malloc(g_num_threads * sizeof(fermion_propagator_type) );
  if(uprop==NULL || fp1==NULL || fp2==NULL || fp3==NULL || fpaux==NULL ) {
    fprintf(stderr, "[] Error, could not alloc fermion propagator points\n");
    exit(57);
  }
  sp1 = (spinor_propagator_type*)malloc(g_num_threads * sizeof(spinor_propagator_type) ); 
  sp2 = (spinor_propagator_type*)malloc(g_num_threads * sizeof(spinor_propagator_type) ); 
  if(sp1==NULL || sp2==NULL) {
    fprintf(stderr, "[] Error, could not alloc spinor propagator points\n");
    exit(59);
  }
  for(i=0;i<g_num_threads;i++) { create_fp(uprop+i); }
  for(i=0;i<g_num_threads;i++) { create_fp(fp1+i); }
  for(i=0;i<g_num_threads;i++) { create_fp(fp2+i); }
  for(i=0;i<g_num_threads;i++) { create_fp(fp3+i); }
  for(i=0;i<g_num_threads;i++) { create_fp(fpaux+i); }
  for(i=0;i<g_num_threads;i++) { create_sp(sp1+i); }
  for(i=0;i<g_num_threads;i++) { create_sp(sp2+i); }

  /******************************************************
   * loop on timeslices
   ******************************************************/
  for(timeslice=0; timeslice<T; timeslice++) {
    /* ascii output is created on the first timeslice, appended afterwards */
    append = (int)( timeslice != 0 );

    // read timeslice of the gauge field
    if( N_Jacobi>0) {
      switch(g_gauge_file_format) {
        case 0:
          status = read_lime_gauge_field_doubleprec_timeslice(g_gauge_field, gauge_field_filename, timeslice, &ildg_gauge_field_checksum);
          break;
        case 1:
          status = read_nersc_gauge_field_timeslice(g_gauge_field, gauge_field_filename, timeslice, &nersc_gauge_field_checksum);
          break;
        default:
          /* never leave status unset for the check below */
          status = 1;
          break;
      }
      if(status != 0) {
        fprintf(stderr, "[] Error, could not read gauge field\n");
        exit(21);
      }

      /* NOTE(review): the status returned by the APE smearing calls is
       * not checked -- confirm whether they can fail */
#ifdef OPENMP
      status = APE_Smearing_Step_Timeslice_threads(g_gauge_field, N_ape, alpha_ape);
#else
      for(i=0; i<N_ape; i++) { status = APE_Smearing_Step_Timeslice(g_gauge_field, alpha_ape); }
#endif

    }

    // read timeslice of the 12 up-type propagators and smear them
    for(is=0;is<n_s*n_c;is++) {
      if(do_gt == 0) {
        snprintf(filename, sizeof(filename), "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.inverted", filename_prefix, Nconf, sx0, sx1, sx2, sx3, is);
        status = read_lime_spinor_timeslice(g_spinor_field[is], timeslice, filename, 0, spinor_field_checksum+is);
        if(status != 0) {
          fprintf(stderr, "[] Error, could not read propagator from file %s\n", filename);
          exit(102);
        }
        if(N_Jacobi > 0) {
          fprintf(stdout, "# [] Jacobi smearing propagator no. %d with paramters N_Jacobi=%d, kappa_Jacobi=%f\n",
              is, N_Jacobi, kappa_Jacobi);
#ifdef OPENMP
          Jacobi_Smearing_Step_one_Timeslice_threads(g_gauge_field, g_spinor_field[is], work, N_Jacobi, kappa_Jacobi);
#else
          for(c=0; c<N_Jacobi; c++) {
            Jacobi_Smearing_Step_one_Timeslice(g_gauge_field, g_spinor_field[is], work, kappa_Jacobi);
          }
#endif
        }
      } else {  // of if do_gt == 0
        // apply gt
        apply_gt_prop(gauge_trafo, g_spinor_field[is], is/n_c, is%n_c, 4, filename_prefix, g_source_location);
      } // of if do_gt == 0
    }

    /******************************************************
     * contractions
     *
     * Each thread handles the sites ix = threadid,
     * threadid + g_num_threads, ... and only touches its own
     * scratch slot [threadid] and connq[ix*num_component+icomp].
     ******************************************************/
#ifdef OPENMP
  omp_set_num_threads(g_num_threads);
#pragma omp parallel private (ix,icomp,threadid) \
    firstprivate (fermion_type,gamma_component,num_component,connq,\
        gamma_component_sign,VOL3,g_spinor_field,fp1,fp2,fp3,fpaux,uprop,sp1,sp2)
{
    threadid = omp_get_thread_num();
#else
    threadid = 0;
#endif
    for(ix=threadid; ix<VOL3; ix+=g_num_threads)
    {
      // assign the propagators
      _assign_fp_point_from_field(uprop[threadid], g_spinor_field, ix);
      if(fermion_type == _TM_FERMION) {
        _fp_eq_rot_ti_fp(fp1[threadid], uprop[threadid], +1, fermion_type, fp2[threadid]);
        _fp_eq_fp_ti_rot(uprop[threadid], fp1[threadid], +1, fermion_type, fp2[threadid]);
      }

      for(icomp=0; icomp<num_component; icomp++) {

        _sp_eq_zero( connq[ix*num_component+icomp]);

        /******************************************************
         * prepare propagators
         ******************************************************/
        // fp1[threadid] = C Gamma_1 x S_u = g0 g2 Gamma_1 S_u
        _fp_eq_zero(fp1[threadid]);
        _fp_eq_zero(fpaux[threadid]);
        _fp_eq_gamma_ti_fp(fp1[threadid], gamma_component[0][icomp], uprop[threadid]);
        _fp_eq_gamma_ti_fp(fpaux[threadid], 2, fp1[threadid]);
        _fp_eq_gamma_ti_fp(fp1[threadid], 0, fpaux[threadid]);
        // fp2[threadid] = C Gamma_1 x S_u x C Gamma_2
        _fp_eq_zero(fp2[threadid]);
        _fp_eq_zero(fpaux[threadid]);
        _fp_eq_fp_ti_gamma(fp2[threadid], 0, fp1[threadid]);
        _fp_eq_fp_ti_gamma(fpaux[threadid], 2, fp2[threadid]);
        _fp_eq_fp_ti_gamma(fp2[threadid], gamma_component[1][icomp], fpaux[threadid]);
        // fp3[threadid] = S_u x C Gamma_2 = S_u g0 g2 Gamma_2
        _fp_eq_zero(fp3[threadid]);
        _fp_eq_zero(fpaux[threadid]);
        _fp_eq_fp_ti_gamma(fp3[threadid], 0, uprop[threadid]);
        _fp_eq_fp_ti_gamma(fpaux[threadid], 2, fp3[threadid]);
        _fp_eq_fp_ti_gamma(fp3[threadid], gamma_component[1][icomp], fpaux[threadid]);


        /******************************************************
         * contractions
         ******************************************************/
        // (1)
        // reduce
        _fp_eq_zero(fpaux[threadid]);
        _fp_eq_fp_eps_contract13_fp(fpaux[threadid], fp1[threadid], uprop[threadid]);
        // reduce to spin propagator
        _sp_eq_zero( sp1[threadid] );
        _sp_eq_fp_del_contract23_fp(sp1[threadid], fp3[threadid], fpaux[threadid]);
        // (2)
        // reduce to spin propagator
        _sp_eq_zero( sp2[threadid] );
        _sp_eq_fp_del_contract24_fp(sp2[threadid], fp3[threadid], fpaux[threadid]);
        // add and assign
        _sp_pl_eq_sp(sp1[threadid], sp2[threadid]);
        _sp_eq_sp_ti_re(sp2[threadid], sp1[threadid], -gamma_component_sign[icomp]);
        _sp_eq_sp( connq[ix*num_component+icomp], sp2[threadid]);

        // (3)
        // reduce
        _fp_eq_zero(fpaux[threadid]);
        _fp_eq_fp_eps_contract13_fp(fpaux[threadid], fp2[threadid], uprop[threadid]);
        // reduce to spin propagator
        _sp_eq_zero( sp1[threadid] );
        _sp_eq_fp_del_contract23_fp(sp1[threadid], uprop[threadid], fpaux[threadid]);
        // (4)
        // reduce
        _fp_eq_zero(fpaux[threadid]);
        _fp_eq_fp_eps_contract13_fp(fpaux[threadid], fp1[threadid], fp3[threadid]);
        // reduce to spin propagator
        _sp_eq_zero( sp2[threadid] );
        _sp_eq_fp_del_contract24_fp(sp2[threadid], uprop[threadid], fpaux[threadid]);
        // add and assign
        _sp_pl_eq_sp(sp1[threadid], sp2[threadid]);
        _sp_eq_sp_ti_re(sp2[threadid], sp1[threadid], -gamma_component_sign[icomp]);
        _sp_pl_eq_sp( connq[ix*num_component+icomp], sp2[threadid]);

        // (5)
        // reduce
        _fp_eq_zero(fpaux[threadid]);
        _fp_eq_fp_eps_contract13_fp(fpaux[threadid], fp2[threadid], uprop[threadid]);
        // reduce to spin propagator
        _sp_eq_zero( sp1[threadid] );
        _sp_eq_fp_del_contract34_fp(sp1[threadid], uprop[threadid], fpaux[threadid]);
        // (6)
        // reduce
        _fp_eq_zero(fpaux[threadid]);
        _fp_eq_fp_eps_contract13_fp(fpaux[threadid], fp1[threadid], fp3[threadid]);
        // reduce to spin propagator
        _sp_eq_zero( sp2[threadid] );
        _sp_eq_fp_del_contract34_fp(sp2[threadid], uprop[threadid], fpaux[threadid]);
        // add and assign
        _sp_pl_eq_sp(sp1[threadid], sp2[threadid]);
        _sp_eq_sp_ti_re(sp2[threadid], sp1[threadid], -gamma_component_sign[icomp]);
        _sp_pl_eq_sp( connq[ix*num_component+icomp], sp2[threadid]);
      }  // of icomp

    }    // of ix
#ifdef OPENMP
}
#endif

    /***********************************************
     * finish calculation of connq
     ***********************************************/
    if(g_propagator_bc_type == 0) {
      // multiply with phase factor
      fprintf(stdout, "# [] multiplying timeslice %d with boundary phase factor\n", timeslice);
      /* time distance from the source, wrapped into [0, T_global) */
      ir = (timeslice - sx0 + T_global) % T_global;
      w1.re = cos( 3. * M_PI*(double)ir / (double)T_global );
      w1.im = sin( 3. * M_PI*(double)ir / (double)T_global );
      for(ix=0;ix<num_component*VOL3;ix++) {
        _sp_eq_sp(sp1[0], connq[ix] );
        _sp_eq_sp_ti_co( connq[ix], sp1[0], w1);
      }
    } else if (g_propagator_bc_type == 1) {
      // multiply with step function
      if(timeslice < sx0) {
        fprintf(stdout, "# [] multiplying timeslice %d with boundary step function\n", timeslice);
        for(ix=0;ix<num_component*VOL3;ix++) {
          _sp_eq_sp(sp1[0], connq[ix] );
          _sp_eq_sp_ti_re( connq[ix], sp1[0], -1.);
        }
      }
    }
  
    if(write_ascii) {
      sprintf(filename, "%s_x.%.4d.t%.2dx%.2dy%.2dz%.2d.ascii", outfile_prefix, Nconf, sx0, sx1, sx2, sx3);
      write_contraction2( connq[0][0], filename, num_component*g_sv_dim*g_sv_dim, VOL3, 1, append);
    }

    /******************************************************************
     * Fourier transform
     ******************************************************************/
    items =  2 * num_component * g_sv_dim * g_sv_dim * VOL3;
    bytes = sizeof(double);

    /* copy connq into the FFT input buffer, transform back into connq */
    memcpy(in, connq[0][0], items * bytes);
    ir = num_component * g_sv_dim * g_sv_dim;
#ifdef OPENMP
    fftwnd_threads(g_num_threads, plan_p, ir, in, ir, 1, (fftw_complex*)(connq[0][0]), ir, 1);
#else
    fftwnd(plan_p, ir, in, ir, 1, (fftw_complex*)(connq[0][0]), ir, 1);
#endif

    // add phase factor from the source location
    iix = 0;
    for(x1=0;x1<LX;x1++) {
      q[0] = (double)x1 / (double)LX;
    for(x2=0;x2<LY;x2++) {
      q[1] = (double)x2 / (double)LY;
    for(x3=0;x3<LZ;x3++) {
      q[2] = (double)x3 / (double)LZ;
      phase = 2. * M_PI * ( q[0]*sx1 + q[1]*sx2 + q[2]*sx3 );
      w1.re = cos(phase);
      w1.im = sin(phase);

      /* iix runs over (site, component) in storage order */
      for(icomp=0; icomp<num_component; icomp++) {
        _sp_eq_sp(sp1[0], connq[iix] );
        _sp_eq_sp_ti_co( connq[iix], sp1[0], w1) ;
        iix++; 
      }
    }}}  // of x3, x2, x1

    // write to file
    sprintf(filename, "%s_q.%.4d.t%.2dx%.2dy%.2dz%.2d", outfile_prefix, Nconf, sx0, sx1, sx2, sx3);
    sprintf(contype, "2-pt. function, (t,q_1,q_2,q_3)-dependent, source_timeslice = %d", sx0);
    write_lime_contraction_timeslice(connq[0][0], filename, 64, num_component*g_sv_dim*g_sv_dim, contype, Nconf, 0, &connq_checksum, timeslice);

    if(write_ascii) {
      strcat(filename, ".ascii");
      write_contraction2(connq[0][0],filename, num_component*g_sv_dim*g_sv_dim, VOL3, 1, append);
    }


    /***********************************************
     * calculate connt
     *
     * uses connq[icomp], i.e. the first lattice site
     * of the transformed field, for each component
     ***********************************************/
    for(icomp=0;icomp<num_component; icomp++) {
      // fwd
      _sp_eq_sp(sp1[0], connq[icomp]);
      _sp_eq_gamma_ti_sp(sp2[0], 0, sp1[0]);
      _sp_pl_eq_sp(sp1[0], sp2[0]);
      _co_eq_tr_sp(&w, sp1[0]);
      connt[2*(icomp*T + timeslice)  ] = w.re * 0.25;
      connt[2*(icomp*T + timeslice)+1] = w.im * 0.25;
      // bwd
      _sp_eq_sp(sp1[0], connq[icomp]);
      _sp_eq_gamma_ti_sp(sp2[0], 0, sp1[0]);
      _sp_mi_eq_sp(sp1[0], sp2[0]);
      _co_eq_tr_sp(&w, sp1[0]);
      connt[2*(icomp*T+timeslice + num_component*T)  ] = w.re * 0.25;
      connt[2*(icomp*T+timeslice + num_component*T)+1] = w.im * 0.25;
    }

  }  // of loop on timeslice



  // write connt
  /* Each output line pairs the real part at time distance +it from the
   * source with the real part at distance -it; the it=0 line and the last
   * line (it=T/2 after the loop) have no partner and print 0. instead. */
  sprintf(filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.fw", outfile_prefix, Nconf, sx0, sx1, sx2, sx3);
  ofs = fopen(filename, "w");
  if(ofs == NULL) {
    fprintf(stderr, "[] Error, could not open file %s for writing\n", filename);
    exit(3);
  }
  fprintf(ofs, "#%12.8f%3d%3d%3d%3d%8.4f%6d\n", g_kappa, T_global, LX, LY, LZ, g_mu, Nconf);

  for(icomp=0; icomp<num_component; icomp++) {
    ir = sx0;
    fprintf(ofs, "%3d%3d%3d%16.7e%16.7e%6d\n", gamma_component[0][icomp], gamma_component[1][icomp], 0, connt[2*(icomp*T+ir)], 0., Nconf);
    for(it=1;it<T/2;it++) {
      ir  = ( it + sx0 ) % T_global;
      ir2 = ( (T_global - it) + sx0 ) % T_global;
      fprintf(ofs, "%3d%3d%3d%16.7e%16.7e%6d\n", gamma_component[0][icomp], gamma_component[1][icomp], it, connt[2*(icomp*T+ir)], connt[2*(icomp*T+ir2)], Nconf);
    }
    ir = ( it + sx0 ) % T_global;
    fprintf(ofs, "%3d%3d%3d%16.7e%16.7e%6d\n", gamma_component[0][icomp], gamma_component[1][icomp], it, connt[2*(icomp*T+ir)], 0., Nconf);
  }
  fclose(ofs);

  sprintf(filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.bw", outfile_prefix, Nconf, sx0, sx1, sx2, sx3);
  ofs = fopen(filename, "w");
  if(ofs == NULL) {
    fprintf(stderr, "[] Error, could not open file %s for writing\n", filename);
    exit(3);
  }
  fprintf(ofs, "#%12.8f%3d%3d%3d%3d%8.4f%6d\n", g_kappa, T_global, LX, LY, LZ, g_mu, Nconf);

  for(icomp=0; icomp<num_component; icomp++) {
    ir = sx0;
    fprintf(ofs, "%3d%3d%3d%16.7e%16.7e%6d\n", gamma_component[0][icomp], gamma_component[1][icomp], 0, connt[2*(num_component*T+icomp*T+ir)], 0., Nconf);
    for(it=1;it<T/2;it++) {
      ir  = ( it + sx0 ) % T_global;
      ir2 = ( (T_global - it) + sx0 ) % T_global;
      fprintf(ofs, "%3d%3d%3d%16.7e%16.7e%6d\n", gamma_component[0][icomp], gamma_component[1][icomp], it, connt[2*(num_component*T+icomp*T+ir)], connt[2*(num_component*T+icomp*T+ir2)], Nconf);
    }
    ir = ( it + sx0 ) % T_global;
    fprintf(ofs, "%3d%3d%3d%16.7e%16.7e%6d\n", gamma_component[0][icomp], gamma_component[1][icomp], it, connt[2*(num_component*T+icomp*T+ir)], 0., Nconf);
  }
  fclose(ofs);

  /***********************************************
   * free the allocated memory, finalize
   ***********************************************/
  free_geometry();
  if(connt!= NULL) free(connt);
  /* NOTE(review): connq comes from create_sp_field(); confirm that a plain
   * free() is the matching deallocator */
  if(connq!= NULL) free(connq);
  if(gauge_trafo != NULL) free(gauge_trafo);

  if(g_spinor_field!=NULL) {
    for(i=0; i<no_fields; i++) free(g_spinor_field[i]);
    free(g_spinor_field); g_spinor_field=(double**)NULL;
  }
  if(spinor_field_checksum !=NULL) free(spinor_field_checksum);
  if(g_gauge_field != NULL) free(g_gauge_field);

  for(i=0;i<g_num_threads;i++) { free_fp(uprop+i); }
  for(i=0;i<g_num_threads;i++) { free_fp(fp1+i); }
  for(i=0;i<g_num_threads;i++) { free_fp(fp2+i); }
  for(i=0;i<g_num_threads;i++) { free_fp(fp3+i); }
  for(i=0;i<g_num_threads;i++) { free_fp(fpaux+i); }
  for(i=0;i<g_num_threads;i++) { free_sp(sp1+i); }
  for(i=0;i<g_num_threads;i++) { free_sp(sp2+i); }
  if(uprop!=NULL) free(uprop);
  if(fp1!=NULL) free(fp1);
  if(fp2!=NULL) free(fp2);
  if(fp3!=NULL) free(fp3);
  if(fpaux!=NULL) free(fpaux);
  if(sp1!=NULL) free(sp1);
  if(sp2!=NULL) free(sp2);

  free(in);
  fftwnd_destroy_plan(plan_p);

  g_the_time = time(NULL);
  fprintf(stdout, "# [] %s# [] end fo run\n", ctime(&g_the_time));
  fflush(stdout);
  fprintf(stderr, "# [] %s# [] end fo run\n", ctime(&g_the_time));
  fflush(stderr);

#ifdef MPI
  MPI_Finalize();
#endif
  return(0);
}
int main(int argc, char **argv) {
  
  const int n_c=3;
  const int n_s=4;
  const char outfile_prefix[] = "delta_pp_2pt_v3";

  int c, i, icomp;
  int filename_set = 0;
  int append, status;
  int l_LX_at, l_LXstart_at;
  int ix, it, iix, x1,x2,x3;
  int ir, ir2, is;
  int VOL3;
  int do_gt=0;
  int dims[3];
  double *connt=NULL;
  spinor_propagator_type *connq=NULL;
  int verbose = 0;
  int sx0, sx1, sx2, sx3;
  int write_ascii=0;
  int fermion_type = 1;  // Wilson fermion type
  int num_threads=1;
  int pos;
  char filename[200], contype[200], gauge_field_filename[200];
  double ratime, retime;
  //double plaq_m, plaq_r;
  double *work=NULL;
  fermion_propagator_type fp1=NULL, fp2=NULL, fp3=NULL, fp4=NULL, fpaux=NULL, uprop=NULL, dprop=NULL, *stochastic_fp=NULL;
  spinor_propagator_type sp1, sp2;
  double q[3], phase, *gauge_trafo=NULL;
  double *stochastic_source=NULL, *stochastic_prop=NULL;
  complex w, w1;
  size_t items, bytes;
  FILE *ofs;
  int timeslice;
  DML_Checksum ildg_gauge_field_checksum, *spinor_field_checksum=NULL, connq_checksum;
  uint32_t nersc_gauge_field_checksum;

/***********************************************************/
  int *qlatt_id=NULL, *qlatt_count=NULL, **qlatt_rep=NULL, **qlatt_map=NULL, qlatt_nclass=0;
  int use_lattice_momenta = 0;
  double **qlatt_list=NULL;
/***********************************************************/

/***********************************************************/
  int rel_momentum_filename_set = 0, rel_momentum_no=0;
  int **rel_momentum_list=NULL;
  char rel_momentum_filename[200];
/***********************************************************/

/***********************************************************/
  int snk_momentum_no = 1;
  int **snk_momentum_list = NULL;
  int snk_momentum_filename_set = 0;
  char snk_momentum_filename[200];
/***********************************************************/

/*******************************************************************
 * Gamma components for the Delta:
 */
  //const int num_component = 16;
  //int gamma_component[2][16] = { {0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3}, \
  //                               {0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3}};
  //double gamma_component_sign[16] = {1., 1.,-1., 1., 1., 1.,-1., 1.,-1.,-1., 1.,-1., 1., 1.,-1., 1.};
  const int num_component = 4;
  int gamma_component[2][4] = { {0, 1, 2, 3},
                                {0, 1, 2, 3} };
  double gamma_component_sign[4] = {+1.,+1.,+1.,+1.};
/*
 *******************************************************************/
  fftw_complex *in=NULL;
#ifdef MPI
   fftwnd_mpi_plan plan_p;
#else
   fftwnd_plan plan_p;
#endif 

#ifdef MPI
  MPI_Status status;
#endif

#ifdef MPI
  MPI_Init(&argc, &argv);
#endif

  while ((c = getopt(argc, argv, "ah?vgf:t:F:p:P:")) != -1) {
    switch (c) {
    case 'v':
      verbose = 1;
      break;
    case 'f':
      strcpy(filename, optarg);
      filename_set=1;
      break;
    case 'a':
      write_ascii = 1;
      fprintf(stdout, "# [] will write in ascii format\n");
      break;
    case 'F':
      if(strcmp(optarg, "Wilson") == 0) {
        fermion_type = _WILSON_FERMION;
      } else if(strcmp(optarg, "tm") == 0) {
        fermion_type = _TM_FERMION;
      } else {
        fprintf(stderr, "[] Error, unrecognized fermion type\n");
        exit(145);
      }
      fprintf(stdout, "# [] will use fermion type %s ---> no. %d\n", optarg, fermion_type);
      break;
    case 't':
      num_threads = atoi(optarg);
      fprintf(stdout, "# [] number of threads set to %d\n", num_threads);
      break;
    case 's':
      use_lattice_momenta = 1;
      fprintf(stdout, "# [] will use lattice momenta\n");
      break;
    case 'p':
      rel_momentum_filename_set = 1;
      strcpy(rel_momentum_filename, optarg);
      fprintf(stdout, "# [] will use current momentum file %s\n", rel_momentum_filename);
      break;
    case 'P':
      snk_momentum_filename_set = 1;
      strcpy(snk_momentum_filename, optarg);
      fprintf(stdout, "# [] will use nucleon momentum file %s\n", snk_momentum_filename);
      break;
    case 'g':
      do_gt = 1;
      fprintf(stdout, "# [] will perform gauge transform\n");
      break;
    case 'h':
    case '?':
    default:
      usage();
      break;
    }
  }

#ifdef OPENMP
  omp_set_num_threads(num_threads);
#endif

  /* set the default values */
  if(filename_set==0) strcpy(filename, "cvc.input");
  fprintf(stdout, "# reading input from file %s\n", filename);
  read_input_parser(filename);

  /* some checks on the input data */
  if((T_global == 0) || (LX==0) || (LY==0) || (LZ==0)) {
    if(g_proc_id==0) fprintf(stdout, "T and L's must be set\n");
    usage();
  }
  if(g_kappa == 0.) {
    if(g_proc_id==0) fprintf(stdout, "kappa should be > 0.n");
    usage();
  }

  /* initialize MPI parameters */
  mpi_init(argc, argv);

#ifdef OPENMP
  status = fftw_threads_init();
  if(status != 0) {
    fprintf(stderr, "\n[] Error from fftw_init_threads; status was %d\n", status);
    exit(120);
  }
#endif

  /******************************************************
   *
   ******************************************************/
  VOL3 = LX*LY*LZ;
  l_LX_at      = LX;
  l_LXstart_at = 0;
  FFTW_LOC_VOLUME = T*LX*LY*LZ;
  fprintf(stdout, "# [%2d] parameters:\n"\
		  "# [%2d] l_LX_at      = %3d\n"\
		  "# [%2d] l_LXstart_at = %3d\n"\
		  "# [%2d] FFTW_LOC_VOLUME = %3d\n", 
		  g_cart_id, g_cart_id, l_LX_at,
		  g_cart_id, l_LXstart_at, g_cart_id, FFTW_LOC_VOLUME);

  if(init_geometry() != 0) {
    fprintf(stderr, "ERROR from init_geometry\n");
    exit(1);
  }

  geometry();

  if(N_Jacobi>0) {

    // alloc the gauge field
    alloc_gauge_field(&g_gauge_field, VOL3);
    switch(g_gauge_file_format) {
      case 0:
        sprintf(gauge_field_filename, "%s.%.4d", gaugefilename_prefix, Nconf);
        break;
      case 1:
        sprintf(gauge_field_filename, "%s.%.5d", gaugefilename_prefix, Nconf);
        break;
    }
  } else {
    g_gauge_field = NULL;
  }


  /*********************************************************************
   * gauge transformation
   *********************************************************************/
  if(do_gt) { init_gauge_trafo(&gauge_trafo, 1.); }

  // determine the source location
  sx0 = g_source_location/(LX*LY*LZ)-Tstart;
  sx1 = (g_source_location%(LX*LY*LZ)) / (LY*LZ);
  sx2 = (g_source_location%(LY*LZ)) / LZ;
  sx3 = (g_source_location%LZ);
//  g_source_time_slice = sx0;
  fprintf(stdout, "# [] source location %d = (%d,%d,%d,%d)\n", g_source_location, sx0, sx1, sx2, sx3);
  source_timeslice = sx0;


  if(!use_lattice_momenta) {
    status = make_qcont_orbits_3d_parity_avg(&qlatt_id, &qlatt_count, &qlatt_list, &qlatt_nclass, &qlatt_rep, &qlatt_map);
  } else {
    status = make_qlatt_orbits_3d_parity_avg(&qlatt_id, &qlatt_count, &qlatt_list, &qlatt_nclass, &qlatt_rep, &qlatt_map);
  }
  if(status != 0) {
    fprintf(stderr, "\n[] Error while creating h4-lists\n");
    exit(4);
  }
  fprintf(stdout, "# [] number of classes = %d\n", qlatt_nclass);


  /***************************************************************************
   * read the relative momenta q to be used
   ***************************************************************************/
/*
  ofs = fopen(rel_momentum_filename, "r");
  if(ofs == NULL) {
    fprintf(stderr, "[] Error, could not open file %s for reading\n", rel_momentum_filename);
    exit(6);
  }
  rel_momentum_no = 0;
  while( fgets(line, 199, ofs) != NULL) {
    if(line[0] != '#') {
      rel_momentum_no++;
    }
  }
  if(rel_momentum_no == 0) {
    fprintf(stderr, "[] Error, number of momenta is zero\n");
    exit(7);
  } else {
    fprintf(stdout, "# [] number of current momenta = %d\n", rel_momentum_no);
  }
  rewind(ofs);
  rel_momentum_list = (int**)malloc(rel_momentum_no * sizeof(int*));
  rel_momentum_list[0] = (int*)malloc(3*rel_momentum_no * sizeof(int));
  for(i=1;i<rel_momentum_no;i++) { rel_momentum_list[i] = rel_momentum_list[i-1] + 3; }
  count=0;
  while( fgets(line, 199, ofs) != NULL) {
    if(line[0] != '#') {
      sscanf(line, "%d%d%d", rel_momentum_list[count], rel_momentum_list[count]+1, rel_momentum_list[count]+2);
      count++;
    }
  }
  fclose(ofs);
  fprintf(stdout, "# [] current momentum list:\n");
  for(i=0;i<rel_momentum_no;i++) {
    fprintf(stdout, "\t%3d%3d%3d%3d\n", i, rel_momentum_list[i][0], rel_momentum_list[i][1], rel_momentum_list[i][2]);
  }
*/

  /***************************************************************************
   * read the nucleon final momenta to be used
   ***************************************************************************/
  ofs = fopen(snk_momentum_filename, "r");
  if(ofs == NULL) {
    fprintf(stderr, "[] Error, could not open file %s for reading\n", snk_momentum_filename);
    exit(6);
  }
  snk_momentum_no = 0;
  while( fgets(line, 199, ofs) != NULL) {
    if(line[0] != '#') {
      snk_momentum_no++;
    }
  }
  if(snk_momentum_no == 0) {
    fprintf(stderr, "[] Error, number of momenta is zero\n");
    exit(7);
  } else {
    fprintf(stdout, "# [] number of nucleon final momenta = %d\n", snk_momentum_no);
  }
  rewind(ofs);
  snk_momentum_list = (int**)malloc(snk_momentum_no * sizeof(int*));
  snk_momentum_list[0] = (int*)malloc(3*snk_momentum_no * sizeof(int));
  for(i=1;i<snk_momentum_no;i++) { snk_momentum_list[i] = snk_momentum_list[i-1] + 3; }
  count=0;
  while( fgets(line, 199, ofs) != NULL) {
    if(line[0] != '#') {
      sscanf(line, "%d%d%d", snk_momentum_list[count], snk_momentum_list[count]+1, snk_momentum_list[count]+2);
      count++;
    }
  }
  fclose(ofs);
  fprintf(stdout, "# [] the nucleon final momentum list:\n");
  for(i=0;i<snk_momentum_no;i++) {
    fprintf(stdout, "\t%3d%3d%3d%3d\n", i, snk_momentum_list[i][0], snk_momentum_list[i][1], snk_momentum_list[i][1], snk_momentum_list[i][2]);
  }



  /***********************************************************
   * allocate memory for the spinor fields
   ***********************************************************/
  g_spinor_field = NULL;
  if(fermion_type == _TM_FERMION) {
    no_fields = 2*n_s*n_c+3;
  } else {
    no_fields =   n_s*n_c+3;
  }
  if(N_Jacobi>0) no_fields++;

  g_spinor_field = (double**)calloc(no_fields, sizeof(double*));
  for(i=0; i<no_fields-2; i++) alloc_spinor_field(&g_spinor_field[i], VOL3);
  // work
  if(N_Jacobi>0) work = g_spinor_field[no_fields-4];
  // stochastic_fv
  stochastic_fv = g_spinor_field[no_fields-3];
  // stochastic source and propagator
  alloc_spinor_field(&g_spinor_field[no_fields-2], VOLUME);
  stochastic_source = g_spinor_field[no_fields-2];
  alloc_spinor_field(&g_spinor_field[no_fields-1], VOLUME);
  stochastic_prop   = g_spinor_field[no_fields-1];


  spinor_field_checksum = (DML_Checksum*)malloc(no_fields * sizeof(DML_Checksum) );
  if(spinor_field_checksum == NULL ) {
    fprintf(stderr, "[] Error, could not alloc checksums for spinor fields\n");
    exit(73);
  }
  
  /*************************************************
   * allocate memory for the contractions
   *************************************************/
  items = 4* num_component*T;
  bytes = sizeof(double);
  connt = (double*)malloc(items*bytes);
  if(connt == NULL) {
    fprintf(stderr, "\n[] Error, could not alloc connt\n");
    exit(2);
  }
  for(ix=0; ix<items; ix++) connt[ix] = 0.;

  items = num_component * (size_t)VOL3;
  connq = create_sp_field( items );
  if(connq == NULL) {
    fprintf(stderr, "\n[] Error, could not alloc connq\n");
    exit(2);
  }

  items = (size_t)VOL3;
  stochastic_fp = create_sp_field( items );
  if(stochastic_fp== NULL) {
    fprintf(stderr, "\n[] Error, could not alloc stochastic_fp\n");
    exit(22);
  }

  /******************************************************
   * initialize FFTW
   ******************************************************/
  items = g_fv_dim * (size_t)VOL3;
  bytes = sizeof(fftw_complex);
  in  = (fftw_complex*)malloc( items * bytes );
  if(in == NULL) {
    fprintf(stderr, "[] Error, could not malloc in for FFTW\n");
    exit(155);
  }
  dims[0]=LX; dims[1]=LY; dims[2]=LZ;
  //plan_p = fftwnd_create_plan(3, dims, FFTW_FORWARD, FFTW_MEASURE | FFTW_IN_PLACE);
  plan_p = fftwnd_create_plan_specific(3, dims, FFTW_FORWARD, FFTW_MEASURE, in, g_fv_dim, (fftw_complex*)( stochastic_fv ), g_fv_dim);

  // create the fermion propagator points
  create_fp(&uprop);
  create_fp(&dprop);
  create_fp(&fp1);
  create_fp(&fp2);
  create_fp(&fp3);
  create_fp(&stochastic_fp);
  create_sp(&sp1);
  create_sp(&sp2);


  // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  // !! implement twisting for _TM_FERMION
  // !!
  // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#ifdef OPENMP
#pragma omp parallel for private(ix) shared(stochastic_prop)
#endif
  for(ix=0;ix<VOLUME;ix++) { _fv_eq_zero(stochastic_prop+_GSI(ix)); }

  for(sid=g_sourceid; sid<=g_sourceid2;sid+=g_sourceid_step) {
    switch(g_soruce_type) {
      case 2:  // timeslice source
        sprintf(filename, "%s.%.4d.%.2d.%.5d.inverted", filename_prefix, Nconf, source_timeslice, sid);
        break;
      default:
        fprintf(stderr, "# [] source type %d not implented; exit\n", g_source_type);
        exit(100);
    }
    fprintf(stdout, "# [] trying to read sample up-prop. from file %s\n", filename);
    read_lime_spinor(stochastic_source, filename, 0);
#ifdef OPENMP
#pragma omp parallel for private(ix) shared(stochastic_prop, stochastic_source)
#endif
    for(ix=0;ix<VOLUME;ix++) { _fv_pl_eq_fv(stochastic_prop+_GSI(ix), stochastic_source+_GSI(ix)); }
  }
#ifdef OPENMP
#pragma omp parallel for private(ix) shared(stochastic_prop, stochastic_source)
#endif
  fnorm = 1. / ( (double)(g_sourceid2 - g_sourceid + 1) * g_prop_normsqr );
  for(ix=0;ix<VOLUME;ix++) { _fv_ti_eq_re(stochastic_prop+_GSI(ix), fnorm); }
  //  calculate the source
  if(fermion_type && g_propagator_bc_type == 1) {
    Q_Wilson_phi(stochastic_source, stochastic_prop);
  } else {
    Q_phi_tbc(stochastic_source, stochastic_prop);
  }

  /******************************************************
   * prepare the stochastic fermion field
   ******************************************************/
  // read timeslice of the gauge field
  if( N_Jacobi>0) {
    switch(g_gauge_file_format) {
      case 0:
        status = read_lime_gauge_field_doubleprec_timeslice(g_gauge_field, gauge_field_filename, source_timeslice, &ildg_gauge_field_checksum);
        break;
      case 1:
        status = read_nersc_gauge_field_timeslice(g_gauge_field, gauge_field_filename, source_timeslice, &nersc_gauge_field_checksum);
        break;
    }
    if(status != 0) {
      fprintf(stderr, "[] Error, could not read gauge field\n");
      exit(21);
    }
    for(i=0; i<N_ape; i++) {
#ifdef OPENMP
      status = APE_Smearing_Step_Timeslice_threads(g_gauge_field, alpha_ape);
#else
      status = APE_Smearing_Step_Timeslice(g_gauge_field, alpha_ape);
#endif
    }
  }
  // read timeslice of the 12 up-type propagators and smear them
  //
  // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  // !! implement twisting for _TM_FERMION
  // !!
  // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  for(is=0;is<n_s*n_c;is++) {
    if(fermion_type != _TM_FERMION) {
      sprintf(filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.inverted", filename_prefix, Nconf, sx0, sx1, sx2, sx3, is);
    } else {
      sprintf(filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.inverted", filename_prefix2, Nconf, sx0, sx1, sx2, sx3, is);
    }
    status = read_lime_spinor_timeslice(g_spinor_field[is], source_timeslice, filename, 0, spinor_field_checksum+is);
    if(status != 0) {
      fprintf(stderr, "[] Error, could not read propagator from file %s\n", filename);
      exit(102);
    }
    if(N_Jacobi > 0) {
      fprintf(stdout, "# [] Jacobi smearing propagator no. %d with paramters N_Jacobi=%d, kappa_Jacobi=%f\n",
          is, N_Jacobi, kappa_Jacobi);
      for(c=0; c<N_Jacobi; c++) {
#ifdef OPENMP
        Jacobi_Smearing_Step_one_Timeslice_threads(g_gauge_field, g_spinor_field[is], work, kappa_Jacobi);
#else
        Jacobi_Smearing_Step_one_Timeslice(g_gauge_field, g_spinor_field[is], work, kappa_Jacobi);
#endif
      }
    }
  }
  for(is=0;is<g_fv_dim;is++) {
    for(ix=0;ix<VOL3;ix++) {
      iix = source_timeslice * VOL3 + ix;
      _fv_eq_gamma_ti_fv(spinor1, 5, g_spinor_field[is]+_GSI(iix));
      _co_eq_fv_dagger_ti_fv(&w, stochastic_source+_GSI(ix), spinor1);
      stochastic_fv[_GSI(ix)+2*is  ] = w.re;
      stochastic_fv[_GSI(ix)+2*is+1] = w.im;
    }
  }
  // Fourier transform
  items = g_fv_dim * (size_t)VOL3;
  bytes = sizeof(double);
  memcpy(in, stochastic_fv, items*bytes );
#ifdef OPENMP
  fftwnd_threads(num_threads, plan_p, g_fv_dim, in, g_fv_dim, 1, (fftw_complex*)(stochastic_fv), g_fv_dim, 1);
#else
  fftwnd(plan_p, g_fv_dim, in, g_fv_dim, 1, (fftw_complex*)(stochastic_fv), g_fv_dim, 1);
#endif


  /******************************************************
   * loop on sink momenta (most likely only one: Q=(0,0,0))
   ******************************************************/
  for(imom_snk=0;imom_snk<snk_momentum_no; imom_snk++) {

    // create Phi_tilde
    _fv_eq_zero( spinor2 );
    for(ix=0;ix<LX;ix++) {
    for(iy=0;iy<LY;iy++) {
    for(iz=0;iz<LZ;iz++) {
      iix = timeslice * VOL3 + ix;
      phase = -2.*M_PI*( (ix-sx1) * snk_momentum_list[imom_snk][0] / (double)LX 
                       + (iy-sx2) * snk_momentum_list[imom_snk][1] / (double)LY 
                       + (iz-sx3) * snk_momentum_list[imom_snk][2] / (double)LZ);
      w.re = cos(phase);
      w.im = sin(phase);
      _fv_eq_fv_ti_co(spinor1, stochastic_prop + _GSI(iix), &w);
      _fv_pl_eq_fv(spinor2, spinor);
    }}}
    // create Theta
    for(ir=0;ir<g_fv_dim;ir++) {
    for(is=0;is<g_fv_dim;is++) {
      _co_eq_co_ti_co( &(stochastic_fp[ix][ir][2*is]), &(spinor2[2*ir]), &(stochastic_fv[_GSI(ix)+2*is]) );
    }}

    /******************************************************
     * loop on timeslices
     ******************************************************/
    for(timeslice=0; timeslice<T; timeslice++) {
      append = (int)( timeslice != 0 );

      // read timeslice of the gauge field
      if( N_Jacobi>0) {
        switch(g_gauge_file_format) {
          case 0:
            status = read_lime_gauge_field_doubleprec_timeslice(g_gauge_field, gauge_field_filename, timeslice, &ildg_gauge_field_checksum);
            break;
          case 1:
            status = read_nersc_gauge_field_timeslice(g_gauge_field, gauge_field_filename, timeslice, &nersc_gauge_field_checksum);
            break;
        }
        if(status != 0) {
          fprintf(stderr, "[] Error, could not read gauge field\n");
          exit(21);
        }

        for(i=0; i<N_ape; i++) {
#ifdef OPENMP
          status = APE_Smearing_Step_Timeslice_threads(g_gauge_field, alpha_ape);
#else
          status = APE_Smearing_Step_Timeslice(g_gauge_field, alpha_ape);
#endif
        }

      }

      // read timeslice of the 12 up-type propagators and smear them
      for(is=0;is<n_s*n_c;is++) {
          sprintf(filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.inverted", filename_prefix, Nconf, sx0, sx1, sx2, sx3, is);
          status = read_lime_spinor_timeslice(g_spinor_field[is], timeslice, filename, 0, spinor_field_checksum+is);
          if(status != 0) {
            fprintf(stderr, "[] Error, could not read propagator from file %s\n", filename);
            exit(102);
          }
          if(N_Jacobi > 0) {
            fprintf(stdout, "# [] Jacobi smearing propagator no. %d with paramters N_Jacobi=%d, kappa_Jacobi=%f\n",
                is, N_Jacobi, kappa_Jacobi);
            for(c=0; c<N_Jacobi; c++) {
#ifdef OPENMP
              Jacobi_Smearing_Step_one_Timeslice_threads(g_gauge_field, g_spinor_field[is], work, kappa_Jacobi);
#else
              Jacobi_Smearing_Step_one_Timeslice(g_gauge_field, g_spinor_field[is], work, kappa_Jacobi);
#endif
            }
          }
      }

      if(fermion_type == _TM_FERMION) {
        // read timeslice of the 12 down-type propagators, smear them
        for(is=0;is<n_s*n_c;is++) {
          if(do_gt == 0) {
            sprintf(filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.inverted", filename_prefix2, Nconf, sx0, sx1, sx2, sx3, is);
            status = read_lime_spinor_timeslice(g_spinor_field[n_s*n_c+is], timeslice, filename, 0, spinor_field_checksum+n_s*n_c+is);
            if(status != 0) {
              fprintf(stderr, "[] Error, could not read propagator from file %s\n", filename);
              exit(102);
            }
            if(N_Jacobi > 0) {
              fprintf(stdout, "# [] Jacobi smearing propagator no. %d with paramters N_Jacobi=%d, kappa_Jacobi=%f\n",
                   is, N_Jacobi, kappa_Jacobi);
              for(c=0; c<N_Jacobi; c++) {
#ifdef OPENMP
                Jacobi_Smearing_Step_one_Timeslice_threads(g_gauge_field, g_spinor_field[n_s*n_c+is], work, kappa_Jacobi);
#else
                Jacobi_Smearing_Step_one_Timeslice(g_gauge_field, g_spinor_field[n_s*n_c+is], work, kappa_Jacobi);
#endif
              }
            }
        }
      }

  
      /******************************************************
       * contractions
       ******************************************************/
      for(ix=0;ix<VOL3;ix++) 
      //for(ix=0;ix<1;ix++) 
      {
  
        // assign the propagators
        _assign_fp_point_from_field(uprop, g_spinor_field, ix);
        if(fermion_type==_TM_FERMION) {
          _assign_fp_point_from_field(dprop, g_spinor_field+n_s*n_c, ix);
        } else {
          _fp_eq_fp(dprop, uprop);
        }
        flavor rotation for twisted mass fermions
        if(fermion_type == _TM_FERMION) {
          _fp_eq_rot_ti_fp(fp1, uprop, +1, fermion_type, fp2);
          _fp_eq_fp_ti_rot(uprop, fp1, +1, fermion_type, fp2);
  //        _fp_eq_rot_ti_fp(fp1, dprop, -1, fermion_type, fp2);
  //        _fp_eq_fp_ti_rot(dprop, fp1, -1, fermion_type, fp2);
        }
  
        // test: print fermion propagator point
        //printf_fp(uprop, stdout);
  
  
        for(icomp=0; icomp<num_component; icomp++) {
  
          _sp_eq_zero( connq[ix*num_component+icomp]);
  
          /******************************************************
           * first contribution
           ******************************************************/
          _fp_eq_zero(fp1);
          _fp_eq_zero(fp2);
          _fp_eq_zero(fp3);
          // C Gamma_1 x S_u = g0 g2 Gamma_1 S_u
          _fp_eq_gamma_ti_fp(fp1, gamma_component[0][icomp], uprop);
          _fp_eq_gamma_ti_fp(fp3, 2, fp1);
          _fp_eq_gamma_ti_fp(fp1, 0, fp3);
  
          // S_u x C Gamma_2 = S_u x g0 g2 Gamma_2
          _fp_eq_fp_ti_gamma(fp2, 0, uprop);
          _fp_eq_fp_ti_gamma(fp3, 2, fp2);
          _fp_eq_fp_ti_gamma(fp2, gamma_component[1][icomp], fp3);
    
          // first part
          // reduce
          _fp_eq_zero(fp3);
          _fp_eq_fp_eps_contract13_fp(fp3, fp1, uprop);
          // reduce to spin propagator
          _sp_eq_zero( sp1 );
          _sp_eq_fp_del_contract23_fp(sp1, fp2, fp3);
          // second part
          // reduce to spin propagator
          _sp_eq_zero( sp2 );
          _sp_eq_fp_del_contract24_fp(sp2, fp2, fp3);
          // add and assign
          _sp_pl_eq_sp(sp1, sp2);
          _sp_eq_sp_ti_re(sp2, sp1, -gamma_component_sign[icomp]);
          _sp_eq_sp( connq[ix*num_component+icomp], sp2);
  
          /******************************************************
           * second contribution
           ******************************************************/
          _fp_eq_zero(fp1);
          _fp_eq_zero(fp2);
          _fp_eq_zero(fp3);
          // first part
          // C Gamma_1 x S_u = g0 g2 Gamma_1 S_u 
          _fp_eq_gamma_ti_fp(fp1, gamma_component[0][icomp], uprop);
          _fp_eq_gamma_ti_fp(fp3, 2, fp1);
          _fp_eq_gamma_ti_fp(fp1, 0, fp3);
          // S_u x C Gamma_2 = S_u g0 g2 Gamma_2 (same S_u as above)
          _fp_eq_fp_ti_gamma(fp2, 0, fp1);
          _fp_eq_fp_ti_gamma(fp3, 2, fp2);
          _fp_eq_fp_ti_gamma(fp1, gamma_component[1][icomp], fp3);
          // reduce
          _fp_eq_zero(fp3);
          _fp_eq_fp_eps_contract13_fp(fp3, fp1, uprop);
          // reduce to spin propagator
          _sp_eq_zero( sp1 );
          _sp_eq_fp_del_contract23_fp(sp1, uprop, fp3);
          // second part
          // C Gamma_1 x S_u = g0 g2 Gamma_1 S_u
          _fp_eq_gamma_ti_fp(fp1, gamma_component[0][icomp], uprop);
          _fp_eq_gamma_ti_fp(fp3, 2, fp1);
          _fp_eq_gamma_ti_fp(fp1, 0, fp3);
          // S_u x C Gamma_2 = S_u g0 g2 Gamma_2
          _fp_eq_fp_ti_gamma(fp2, 0, uprop);
          _fp_eq_fp_ti_gamma(fp3, 2, fp2);
          _fp_eq_fp_ti_gamma(fp2, gamma_component[1][icomp], fp3);
          // reduce
          _fp_eq_zero(fp3);
          _fp_eq_fp_eps_contract13_fp(fp3, fp1, fp2);
          // reduce to spin propagator
          _sp_eq_zero( sp2 );
          _sp_eq_fp_del_contract24_fp(sp2, uprop, fp3);
          // add and assign
          _sp_pl_eq_sp(sp1, sp2);
          _sp_eq_sp_ti_re(sp2, sp1, -gamma_component_sign[icomp]);
          _sp_pl_eq_sp( connq[ix*num_component+icomp], sp2);
  
          /******************************************************
           * third contribution
           ******************************************************/
          _fp_eq_zero(fp1);
          _fp_eq_zero(fp2);
          _fp_eq_zero(fp3);
          // first part
          // C Gamma_1 x S_u = g0 g2 Gamma_1 S_u
          _fp_eq_gamma_ti_fp(fp1, gamma_component[0][icomp], uprop);
          _fp_eq_gamma_ti_fp(fp3, 2, fp1);
          _fp_eq_gamma_ti_fp(fp1, 0, fp3);
          // S_u x C Gamma_2 = S_u g0 g2 Gamma_2
          _fp_eq_fp_ti_gamma(fp2, 0, fp1);
          _fp_eq_fp_ti_gamma(fp3, 2, fp2);
          _fp_eq_fp_ti_gamma(fp1, gamma_component[1][icomp], fp3);
          // reduce
          _fp_eq_zero(fp3);
          _fp_eq_fp_eps_contract13_fp(fp3, fp1, uprop);
          // reduce to spin propagator
          _sp_eq_zero( sp1 );
          _sp_eq_fp_del_contract34_fp(sp1, uprop, fp3);
          // second part
          // C Gamma_1 x S_u = g0 g2 Gamma_1 S_u
          _fp_eq_gamma_ti_fp(fp1, gamma_component[0][icomp], uprop);
          _fp_eq_gamma_ti_fp(fp3, 2, fp1);
          _fp_eq_gamma_ti_fp(fp1, 0, fp3);
          // S_u x C Gamma_2 = S_u g0 g2 Gamma_2
          _fp_eq_fp_ti_gamma(fp2, 0, uprop);
          _fp_eq_fp_ti_gamma(fp3, 2, fp2);
          _fp_eq_fp_ti_gamma(fp2, gamma_component[1][icomp], fp3);
          // reduce
          _fp_eq_zero(fp3);
          _fp_eq_fp_eps_contract13_fp(fp3, fp1, fp2);
          // reduce to spin propagator
          _sp_eq_zero( sp2 );
          _sp_eq_fp_del_contract34_fp(sp2, uprop, fp3);
          // add and assign
          _sp_pl_eq_sp(sp1, sp2);
          _sp_eq_sp_ti_re(sp2, sp1, -gamma_component_sign[icomp]);
          _sp_pl_eq_sp( connq[ix*num_component+icomp], sp2);
  
        }  // of icomp
  
      }    // of ix
  
      /***********************************************
       * finish calculation of connq
       ***********************************************/
      if(g_propagator_bc_type == 0) {
        // multiply with phase factor
        fprintf(stdout, "# [] multiplying timeslice %d with boundary phase factor\n", timeslice);
        ir = (timeslice - sx0 + T_global) % T_global;
        w1.re = cos( 3. * M_PI*(double)ir / (double)T_global );
        w1.im = sin( 3. * M_PI*(double)ir / (double)T_global );
        for(ix=0;ix<num_component*VOL3;ix++) {
          _sp_eq_sp(sp1, connq[ix] );
          _sp_eq_sp_ti_co( connq[ix], sp1, w1);
        }
      } else if (g_propagator_bc_type == 1) {
        // multiply with step function
        if(timeslice < sx0) {
          fprintf(stdout, "# [] multiplying timeslice %d with boundary step function\n", timeslice);
          for(ix=0;ix<num_component*VOL3;ix++) {
            _sp_eq_sp(sp1, connq[ix] );
            _sp_eq_sp_ti_re( connq[ix], sp1, -1.);
          }
        }
      }
    
      if(write_ascii) {
        sprintf(filename, "%s_x.%.4d.t%.2dx%.2dy%.2dz%.2d.ascii", outfile_prefix, Nconf, sx0, sx1, sx2, sx3);
        write_contraction2( connq[0][0], filename, num_component*g_sv_dim*g_sv_dim, VOL3, 1, append);
      }
  
      /******************************************************************
       * Fourier transform
       ******************************************************************/
      items =  2 * num_component * g_sv_dim * g_sv_dim * VOL3;
      bytes = sizeof(double);
  
      memcpy(in, connq[0][0], items * bytes);
      ir = num_component * g_sv_dim * g_sv_dim;
  #ifdef OPENMP
      fftwnd_threads(num_threads, plan_p, ir, in, ir, 1, (fftw_complex*)(connq[0][0]), ir, 1);
  #else
      fftwnd(plan_p, ir, in, ir, 1, (fftw_complex*)(connq[0][0]), ir, 1);
  #endif
  
      // add phase factor from the source location
      iix = 0;
      for(x1=0;x1<LX;x1++) {
        q[0] = (double)x1 / (double)LX;
      for(x2=0;x2<LY;x2++) {
        q[1] = (double)x2 / (double)LY;
      for(x3=0;x3<LZ;x3++) {
        q[2] = (double)x3 / (double)LZ;
        phase = 2. * M_PI * ( q[0]*sx1 + q[1]*sx2 + q[2]*sx3 );
        w1.re = cos(phase);
        w1.im = sin(phase);
  
        for(icomp=0; icomp<num_component; icomp++) {
          _sp_eq_sp(sp1, connq[iix] );
          _sp_eq_sp_ti_co( connq[iix], sp1, w1) ;
          iix++; 
        }
      }}}  // of x3, x2, x1
  
      // write to file
      sprintf(filename, "%s_q.%.4d.t%.2dx%.2dy%.2dz%.2d.Qx%.2dQy%.2dQz%.2d.%.5d", outfile_prefix, Nconf, sx0, sx1, sx2, sx3,
         qlatt_rep[snk_momentum_list[imom_snk]][1],qlatt_rep[snk_momentum_list[imom_snk]][2],qlatt_rep[snk_momentum_list[imom_snk]][3],
         g_sourceid2-g_sourceid+1);
      sprintf(contype, "2-pt. function, (t,q_1,q_2,q_3)-dependent, source_timeslice = %d", sx0);
      write_lime_contraction_timeslice(connq[0][0], filename, 64, num_component*g_sv_dim*g_sv_dim, contype, Nconf, 0, &connq_checksum, timeslice);
  
      if(write_ascii) {
        strcat(filename, ".ascii");
        write_contraction2(connq[0][0],filename, num_component*g_sv_dim*g_sv_dim, VOL3, 1, append);
      }
  
  
      /***********************************************
       * calculate connt
       ***********************************************/
      for(icomp=0;icomp<num_component; icomp++) {
        // fwd
        _sp_eq_sp(sp1, connq[icomp]);
        _sp_eq_gamma_ti_sp(sp2, 0, sp1);
        _sp_pl_eq_sp(sp1, sp2);
        _co_eq_tr_sp(&w, sp1);
        connt[2*(icomp*T + timeslice)  ] = w.re * 0.25;
        connt[2*(icomp*T + timeslice)+1] = w.im * 0.25;
        // bwd
        _sp_eq_sp(sp1, connq[icomp]);
        _sp_eq_gamma_ti_sp(sp2, 0, sp1);
        _sp_mi_eq_sp(sp1, sp2);
        _co_eq_tr_sp(&w, sp1);
        connt[2*(icomp*T+timeslice + num_component*T)  ] = w.re * 0.25;
        connt[2*(icomp*T+timeslice + num_component*T)+1] = w.im * 0.25;
      }
  
    }  // of loop on timeslice

    // write connt
    sprintf(filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.fw", outfile_prefix, Nconf, sx0, sx1, sx2, sx3);
    ofs = fopen(filename, "w");
    if(ofs == NULL) {
      fprintf(stderr, "[] Error, could not open file %s for writing\n", filename);
      exit(3);
    }
    fprintf(ofs, "#%12.8f%3d%3d%3d%3d%8.4f%6d\n", g_kappa, T_global, LX, LY, LZ, g_mu, Nconf);
  
    for(icomp=0; icomp<num_component; icomp++) {
      ir = sx0;
      fprintf(ofs, "%3d%3d%3d%16.7e%16.7e%6d\n", gamma_component[0][icomp], gamma_component[1][icomp], 0, connt[2*(icomp*T+ir)], 0., Nconf);
      for(it=1;it<T/2;it++) {
        ir  = ( it + sx0 ) % T_global;
        ir2 = ( (T_global - it) + sx0 ) % T_global;
        fprintf(ofs, "%3d%3d%3d%16.7e%16.7e%6d\n", gamma_component[0][icomp], gamma_component[1][icomp], it, connt[2*(icomp*T+ir)], connt[2*(icomp*T+ir2)], Nconf);
      }
      ir = ( it + sx0 ) % T_global;
      fprintf(ofs, "%3d%3d%3d%16.7e%16.7e%6d\n", gamma_component[0][icomp], gamma_component[1][icomp], it, connt[2*(icomp*T+ir)], 0., Nconf);
    }
    fclose(ofs);
  
    sprintf(filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.bw", outfile_prefix, Nconf, sx0, sx1, sx2, sx3);
    ofs = fopen(filename, "w");
    if(ofs == NULL) {
      fprintf(stderr, "[] Error, could not open file %s for writing\n", filename);
      exit(3);
    }
    fprintf(ofs, "#%12.8f%3d%3d%3d%3d%8.4f%6d\n", g_kappa, T_global, LX, LY, LZ, g_mu, Nconf);
  
    for(icomp=0; icomp<num_component; icomp++) {
      ir = sx0;
      fprintf(ofs, "%3d%3d%3d%16.7e%16.7e%6d\n", gamma_component[0][icomp], gamma_component[1][icomp], 0, connt[2*(num_component*T+icomp*T+ir)], 0., Nconf);
      for(it=1;it<T/2;it++) {
        ir  = ( it + sx0 ) % T_global;
        ir2 = ( (T_global - it) + sx0 ) % T_global;
        fprintf(ofs, "%3d%3d%3d%16.7e%16.7e%6d\n", gamma_component[0][icomp], gamma_component[1][icomp], it, connt[2*(num_component*T+icomp*T+ir)], connt[2*(num_component*T+icomp*T+ir2)], Nconf);
      }
      ir = ( it + sx0 ) % T_global;
      fprintf(ofs, "%3d%3d%3d%16.7e%16.7e%6d\n", gamma_component[0][icomp], gamma_component[1][icomp], it, connt[2*(num_component*T+icomp*T+ir)], 0., Nconf);
    }
    fclose(ofs);

  }  // of loop on sink momentum ( = Delta^++ momentum, Qvec)

  /***********************************************
   * free the allocated memory, finalize
   ***********************************************/
  free_geometry();
  if(connt!= NULL) free(connt);
  if(connq!= NULL) free(connq);
  if(gauge_trafo != NULL) free(gauge_trafo);

  if(g_spinor_field!=NULL) {
    for(i=0; i<no_fields; i++) free(g_spinor_field[i]);
    free(g_spinor_field); g_spinor_field=(double**)NULL;
  }
  if(spinor_field_checksum !=NULL) free(spinor_field_checksum);
  if(g_gauge_field != NULL) free(g_gauge_field);

  if(snk_momemtum_list != NULL) {
    if(snk_momentum_list[0] != NULL) free(snk_momentum_list[0]);
    free(snk_momentum_list);
  }
  if(rel_momemtum_list != NULL) {
    if(rel_momentum_list[0] != NULL) free(rel_momentum_list[0]);
    free(rel_momentum_list);
  }

  // free the fermion propagator points
  free_fp( &uprop );
  free_fp( &dprop );
  free_fp( &fp1 );
  free_fp( &fp2 );
  free_fp( &fp3 );
  free_sp( &sp1 );
  free_sp( &sp2 );

  free(in);
  fftwnd_destroy_plan(plan_p);

  g_the_time = time(NULL);
  fprintf(stdout, "# [] %s# [] end fo run\n", ctime(&g_the_time));
  fflush(stdout);
  fprintf(stderr, "# [] %s# [] end fo run\n", ctime(&g_the_time));
  fflush(stderr);

#ifdef MPI
  MPI_Finalize();
#endif
  return(0);
}
Exemple #6
0
/*
 * Create an N-dimensional plan without tying it to particular arrays.
 *
 * This simply forwards to fftwnd_create_plan_specific, passing 0 for both
 * the input and the output array and 1 for both strides.
 *
 * rank   number of dimensions
 * n      array of `rank` sizes, one per dimension
 * dir    transform direction
 * flags  planner flags, handed through unchanged
 *
 * Returns whatever fftwnd_create_plan_specific returns.
 */
fftwnd_plan fftwnd_create_plan(int rank, const int *n,
			       fftw_direction dir, int flags)
{
     fftwnd_plan generic_plan;

     generic_plan = fftwnd_create_plan_specific(rank, n, dir, flags,
						0, 1,	/* in, istride */
						0, 1);	/* out, ostride */
     return generic_plan;
}
Exemple #7
0
/*
 * Check an in-place N-dimensional complex transform against a reference plan.
 *
 * rank, n         number of dimensions and the size along each of them
 * dir             transform direction handed to the planner
 * validated_plan  plan used to compute the reference result from in2 into out2
 * alternate_api   when nonzero and rank is 2 or 3, build the plan through the
 *                 fftw2d_ / fftw3d_ constructors instead of the fftwnd_ ones
 * specific        when nonzero, use the *_create_plan_specific constructors,
 *                 planning on in1 with stride 1
 * force_buffered  when nonzero, add FFTWND_FORCE_BUFFERED to the plan flags
 *
 * For every istride in 1..MAX_STRIDE the same random data is replicated into
 * istride interleaved arrays inside in1, transformed in place, and each of the
 * interleaved results is compared with the reference output via CHECK.
 */
void testnd_in_place(int rank, int *n, fftw_direction dir,
		     fftwnd_plan validated_plan,
		     int alternate_api, int specific, int force_buffered)
{
     int istride;
     int N, dim, i;
     fftw_complex *in1, *in2, *out2;
     fftwnd_plan p;
     int flags = measure_flag | wisdom_flag | FFTW_IN_PLACE;

     /* randomly exercise the thread-safe flag as well */
     if (coinflip())
	  flags |= FFTW_THREADSAFE;

     if (force_buffered)
	  flags |= FFTWND_FORCE_BUFFERED;

     /* total number of points in one transform */
     N = 1;
     for (dim = 0; dim < rank; ++dim)
	  N *= n[dim];

     /* in1 holds up to MAX_STRIDE interleaved copies of the input */
     in1 = (fftw_complex *) fftw_malloc(N * MAX_STRIDE * sizeof(fftw_complex));
     in2 = (fftw_complex *) fftw_malloc(N * sizeof(fftw_complex));
     out2 = (fftw_complex *) fftw_malloc(N * sizeof(fftw_complex));

     if (!specific) {
	  if (alternate_api && (rank == 2 || rank == 3)) {
	       if (rank == 2)
		    p = fftw2d_create_plan(n[0], n[1], dir, flags);
	       else
		    p = fftw3d_create_plan(n[0], n[1], n[2], dir, flags);
	  } else		/* standard api */
	       p = fftwnd_create_plan(rank, n, dir, flags);
     } else {			/* specific plan creation */
	  if (alternate_api && (rank == 2 || rank == 3)) {
	       if (rank == 2)
		    p = fftw2d_create_plan_specific(n[0], n[1], dir, flags,
						    in1, 1,
					       (fftw_complex *) NULL, 1);
	       else
		    p = fftw3d_create_plan_specific(n[0], n[1], n[2], dir, flags,
						    in1, 1,
					       (fftw_complex *) NULL, 1);
	  } else		/* standard api */
	       p = fftwnd_create_plan_specific(rank, n, dir, flags,
					       in1, 1,
					       (fftw_complex *) NULL, 1);

     }

     /* fail with a message instead of handing a null plan to fftwnd below */
     CHECK(p != NULL, "testnd_in_place: can't create plan");

     for (istride = 1; istride <= MAX_STRIDE; ++istride) {
	  /* 
	   * generate random inputs: every one of the istride interleaved
	   * arrays in in1 receives the same data as in2 */
	  for (i = 0; i < N; ++i) {
	       int j;
	       c_re(in2[i]) = DRAND();
	       c_im(in2[i]) = DRAND();
	       for (j = 0; j < istride; ++j) {
		    c_re(in1[i * istride + j]) = c_re(in2[i]);
		    c_im(in1[i * istride + j]) = c_im(in2[i]);
	       }
	  }

	  /*
	   * fftwnd_one is called without stride arguments, so it is only
	   * an option when istride == 1; in that case coinflip() chooses
	   * between the two entry points.
	   */
	  if (istride != 1 || coinflip())
	       fftwnd(p, istride, in1, istride, 1, (fftw_complex *) NULL, 1, 1);
	  else
	       fftwnd_one(p, in1, NULL);

	  /* reference result, computed from in2 into out2 */
	  fftwnd(validated_plan, 1, in2, 1, 1, out2, 1, 1);

	  /* each interleaved array must match the reference output */
	  for (i = 0; i < istride; ++i)
	       CHECK(compute_error_complex(in1 + i, istride, out2, 1, N) < TOLERANCE,
		     "testnd_in_place: wrong answer");
     }

     fftwnd_destroy_plan(p);

     fftw_free(out2);
     fftw_free(in2);
     fftw_free(in1);
}
Exemple #8
0
/*
 * Allocate and initialize a maxwell_data structure for an nx x ny x nz grid.
 *
 * The transform rank is derived from the grid: 1 if ny == nz == 1, 2 if only
 * nz == 1, otherwise 3.  Which FFT setup is compiled is selected by
 * HAVE_FFTW3 / HAVE_FFTW (FFTW2), HAVE_MPI and SCALAR_COMPLEX.
 *
 * Parameters:
 *   nx, ny, nz     grid dimensions; all must be positive.
 *   local_N        (out) number of grid points handled locally
 *                  (nx*ny*nz without MPI, local_nx*ny*nz with MPI).
 *   N_start        (out) index of the first local grid point
 *                  (0 without MPI, local_x_start*ny*nz with MPI).
 *   alloc_N        (out) size the caller should allocate per field;
 *                  equals *local_N except in the FFTW3+MPI build, where it
 *                  comes from mpi_local_size_transposed (doubled for real
 *                  scalars).
 *   num_bands      passed to maxwell_set_num_bands(); also an upper bound
 *                  for the number of bands transformed at once.
 *   max_fft_bands  requested cap on bands per FFT; the stored cap is
 *                  MIN2(num_bands, max_fft_bands).
 *
 * Returns the newly allocated structure.  Allocation and sanity failures go
 * through CHECK / CHK_MALLOC (presumably these abort -- their definitions
 * are not visible here), so no NULL return is produced by this function.
 */
maxwell_data *create_maxwell_data(int nx, int ny, int nz,
                                  int *local_N, int *N_start, int *alloc_N,
                                  int num_bands,
                                  int max_fft_bands)
{
     int n[3], rank = (nz == 1) ? (ny == 1 ? 1 : 2) : 3;
     maxwell_data *d = 0;
     int fft_data_size;

     /* A zero last dimension would make "*local_N / d->last_dim" below a
        division by zero, and negative sizes yield negative allocation
        counts; reject both before doing any work. */
     CHECK(nx > 0 && ny > 0 && nz > 0,
           "grid dimensions must be positive");

     n[0] = nx;
     n[1] = ny;
     n[2] = nz;

#if !defined(HAVE_FFTW) && !defined(HAVE_FFTW3)
#  error Non-FFTW FFTs are not currently supported.
#endif


#if defined(HAVE_FFTW)
     CHECK(sizeof(fftw_real) == sizeof(real),
           "floating-point type is inconsistent with FFTW!");
#endif

     CHK_MALLOC(d, maxwell_data, 1);

     d->nx = nx;
     d->ny = ny;
     d->nz = nz;

     d->max_fft_bands = MIN2(num_bands, max_fft_bands);
     maxwell_set_num_bands(d, num_bands);

     d->current_k[0] = d->current_k[1] = d->current_k[2] = 0.0;
     d->parity = NO_PARITY;

     /* last_dim is the size of the last transformed dimension; last_dim_size
        starts equal to it and is enlarged below for real-scalar builds. */
     d->last_dim_size = d->last_dim = n[rank - 1];

     /* ----------------------------------------------------- */
     d->nplans = 1;
#ifndef HAVE_MPI
     /* Serial build: the whole grid is local. */
     d->local_nx = nx; d->local_ny = ny;
     d->local_x_start = d->local_y_start = 0;
     *local_N = *alloc_N = nx * ny * nz;
     *N_start = 0;
     d->other_dims = *local_N / d->last_dim;

     d->fft_data = 0;  /* initialize it here for use in specific planner? */

#  if defined(HAVE_FFTW3)
     d->nplans = 0; /* plans will be created as needed */
#    ifdef SCALAR_COMPLEX
     d->fft_output_size = fft_data_size = nx * ny * nz;
#    else
     /* real scalars: last dimension padded to 2*(last_dim/2 + 1) reals;
        the output size is counted in complex values, hence the /2 */
     d->last_dim_size = 2 * (d->last_dim / 2 + 1);
     d->fft_output_size = (fft_data_size = d->other_dims * d->last_dim_size)/2;
#    endif

#  elif defined(HAVE_FFTW)
     /* FFTW2: create one backward plan (plans[0]) and one forward plan
        (iplans[0]), both in-place with stride 3 * num_fft_bands.
        NOTE(review): d->fft_data is still null when passed to the specific
        planner here -- presumably acceptable with FFTW_ESTIMATE; confirm. */
#    ifdef SCALAR_COMPLEX
     d->fft_output_size = fft_data_size = nx * ny * nz;
     d->plans[0] = fftwnd_create_plan_specific(rank, n, FFTW_BACKWARD,
                                           FFTW_ESTIMATE | FFTW_IN_PLACE,
                                           (fftw_complex*) d->fft_data,
                                           3 * d->num_fft_bands,
                                           (fftw_complex*) d->fft_data,
                                           3 * d->num_fft_bands);
     d->iplans[0] = fftwnd_create_plan_specific(rank, n, FFTW_FORWARD,
                                            FFTW_ESTIMATE | FFTW_IN_PLACE,
                                            (fftw_complex*) d->fft_data,
                                            3 * d->num_fft_bands,
                                            (fftw_complex*) d->fft_data,
                                            3 * d->num_fft_bands);
#    else /* not SCALAR_COMPLEX */
     d->last_dim_size = 2 * (d->last_dim / 2 + 1);
     d->fft_output_size = (fft_data_size = d->other_dims * d->last_dim_size)/2;
     d->plans[0] = rfftwnd_create_plan_specific(rank, n, FFTW_COMPLEX_TO_REAL,
                                            FFTW_ESTIMATE | FFTW_IN_PLACE,
                                            (fftw_real*) d->fft_data,
                                            3 * d->num_fft_bands,
                                            (fftw_real*) d->fft_data,
                                            3 * d->num_fft_bands);
     d->iplans[0] = rfftwnd_create_plan_specific(rank, n, FFTW_REAL_TO_COMPLEX,
                                             FFTW_ESTIMATE | FFTW_IN_PLACE,
                                             (fftw_real*) d->fft_data,
                                             3 * d->num_fft_bands,
                                             (fftw_real*) d->fft_data,
                                             3 * d->num_fft_bands);
#    endif /* not SCALAR_COMPLEX */
#  endif /* HAVE_FFTW */

#else /* HAVE_MPI */
     /* ----------------------------------------------------- */

#  if defined(HAVE_FFTW3)
{
     int i;
     ptrdiff_t np[3], local_nx, local_ny, local_x_start, local_y_start;

     CHECK(rank > 1, "rank < 2 MPI computations are not supported");

     d->nplans = 0; /* plans will be created as needed */

     for (i = 0; i < rank; ++i) np[i] = n[i];

#    ifndef SCALAR_COMPLEX
     /* real scalars: the last dimension handed to FFTW is the number of
        complex values, last_dim/2 + 1 */
     d->last_dim_size = 2 * (np[rank-1] = d->last_dim / 2 + 1);
#    endif

     fft_data_size = *alloc_N
          = FFTW(mpi_local_size_transposed)(rank, np, MPI_COMM_WORLD,
                                            &local_nx, &local_x_start,
                                            &local_ny, &local_y_start);
#    ifndef SCALAR_COMPLEX
     fft_data_size = (*alloc_N *= 2); // convert to # of real scalars
#    endif

     d->local_nx = local_nx;
     d->local_x_start = local_x_start;
     d->local_ny = local_ny;
     d->local_y_start = local_y_start;

     d->fft_output_size = nx * d->local_ny * (rank==3 ? np[2] : nz);
     *local_N = d->local_nx * ny * nz;
     *N_start = d->local_x_start * ny * nz;
     d->other_dims = *local_N / d->last_dim;
}
#  elif defined(HAVE_FFTW)

     /* This check covers both the complex and the real FFTW2+MPI variants
        below, so it is not repeated inside them. */
     CHECK(rank > 1, "rank < 2 MPI computations are not supported");

#    ifdef SCALAR_COMPLEX
     d->iplans[0] = fftwnd_mpi_create_plan(MPI_COMM_WORLD, rank, n,
                                       FFTW_FORWARD,
                                       FFTW_ESTIMATE | FFTW_IN_PLACE);
     {
          int nt[3]; /* transposed dimensions for reverse FFT */
          nt[0] = n[1]; nt[1] = n[0]; nt[2] = n[2];
          d->plans[0] = fftwnd_mpi_create_plan(MPI_COMM_WORLD, rank, nt,
                                           FFTW_BACKWARD,
                                           FFTW_ESTIMATE | FFTW_IN_PLACE);
     }

     fftwnd_mpi_local_sizes(d->iplans[0], &d->local_nx, &d->local_x_start,
                            &d->local_ny, &d->local_y_start,
                            &fft_data_size);

     d->fft_output_size = nx * d->local_ny * nz;

#    else /* not SCALAR_COMPLEX */

     d->iplans[0] = rfftwnd_mpi_create_plan(MPI_COMM_WORLD, rank, n,
                                        FFTW_REAL_TO_COMPLEX,
                                        FFTW_ESTIMATE | FFTW_IN_PLACE);

     /* Unlike fftwnd_mpi, we do *not* pass transposed dimensions for
        the reverse transform here--we always pass the dimensions of the
        original real array, and rfftwnd_mpi assumes that if one
        transform is transposed, then the other is as well. */
     d->plans[0] = rfftwnd_mpi_create_plan(MPI_COMM_WORLD, rank, n,
                                       FFTW_COMPLEX_TO_REAL,
                                       FFTW_ESTIMATE | FFTW_IN_PLACE);

     rfftwnd_mpi_local_sizes(d->iplans[0], &d->local_nx, &d->local_x_start,
                             &d->local_ny, &d->local_y_start,
                             &fft_data_size);

     d->last_dim_size = 2 * (d->last_dim / 2 + 1);
     if (rank == 2)
          d->fft_output_size = nx * d->local_ny * nz;
     else
          d->fft_output_size = nx * d->local_ny * (d->last_dim_size / 2);

#    endif /* not SCALAR_COMPLEX */

     *local_N = d->local_nx * ny * nz;
     *N_start = d->local_x_start * ny * nz;
     *alloc_N = *local_N;
     d->other_dims = *local_N / d->last_dim;

#  endif /* HAVE_FFTW */

#endif /* HAVE_MPI */
     /* ----------------------------------------------------- */

#ifdef HAVE_FFTW
     /* FFTW2 plans were created eagerly above; make sure both exist. */
     CHECK(d->plans[0] && d->iplans[0], "FFTW plan creation failed");
#endif

     CHK_MALLOC(d->eps_inv, symmetric_matrix, d->fft_output_size);

     /* A scratch output array is required because the "ordinary" arrays
        are not in a cartesian basis (or even a constant basis). */
     fft_data_size *= d->max_fft_bands;
#if defined(HAVE_FFTW3)
     /* FFTW3 build uses FFTW's own allocator for the scratch array. */
     d->fft_data = (scalar *) FFTW(malloc)(sizeof(scalar) * 3 * fft_data_size);
     CHECK(d->fft_data, "out of memory!");
     d->fft_data2 = d->fft_data; /* works in-place */
#else
     CHK_MALLOC(d->fft_data, scalar, 3 * fft_data_size);
     d->fft_data2 = d->fft_data; /* works in-place */
#endif

     CHK_MALLOC(d->k_plus_G, k_data, *local_N);
     CHK_MALLOC(d->k_plus_G_normsqr, real, *local_N);

     d->eps_inv_mean = 1.0;

     /* Mirror the values reported to the caller inside the structure. */
     d->local_N = *local_N;
     d->N_start = *N_start;
     d->alloc_N = *alloc_N;
     d->N = nx * ny * nz;

     return d;
}
int main(int argc, char **argv) {
  
  const int n_c=3;
  const int n_s=4;
  const char outfile_prefix[] = "deltapp2piN";

  int c, i, icomp, imom, count;
  int filename_set = 0;
  int append, status;
  int l_LX_at, l_LXstart_at;
  int ix, it, iix, x1,x2,x3;
  int ir, ir2, is;
  int VOL3;
  int do_gt=0;
  int dims[3];
  double *connt=NULL;
  spinor_propagator_type *connq=NULL;
  int verbose = 0;
  int sx0, sx1, sx2, sx3;
  int write_ascii=0;
  int fermion_type = _WILSON_FERMION;  // Wilson fermion type
  int pos;
  char filename[200], contype[200], gauge_field_filename[200], line[200];
  double ratime, retime;
  //double plaq_m, plaq_r;
  int mode = -1;
  double *work=NULL;
  fermion_propagator_type fp1=NULL, fp2=NULL, fp3=NULL, fp4=NULL, fpaux=NULL, uprop=NULL, dprop=NULL;
  spinor_propagator_type sp1=NULL, sp2=NULL;
  double q[3], phase, *gauge_trafo=NULL, spinor1[24];
  complex w, w1;
  size_t items, bytes;
  FILE *ofs;
  int timeslice;
  DML_Checksum ildg_gauge_field_checksum, *spinor_field_checksum=NULL, connq_checksum, *seq_spinor_field_checksum=NULL;
  uint32_t nersc_gauge_field_checksum;
  int gamma_proj_sign[] = {1,1,1,1,1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1};

/***********************************************************/
  int *qlatt_id=NULL, *qlatt_count=NULL, **qlatt_rep=NULL, **qlatt_map=NULL, qlatt_nclass=0;
  int use_lattice_momenta = 0;
  double **qlatt_list=NULL;
/***********************************************************/

/***********************************************************/
  int rel_momentum_filename_set = 0, rel_momentum_no=0;
  int **rel_momentum_list=NULL;
  char rel_momentum_filename[200];
/***********************************************************/

/***********************************************************/
  int snk_momentum_no = 1;
  int **snk_momentum_list = NULL;
  int snk_momentum_filename_set = 0;
  char snk_momentum_filename[200];
/***********************************************************/


/*******************************************************************
 * Gamma components for the Delta:
 */
  const int num_component = 4;
  int gamma_component[2][4] = { {0, 1, 2, 3},
                                {5, 5, 5, 5} };
  double gamma_component_sign[4] = {+1., +1., +1., +1.};
/*
 *******************************************************************/
  fftw_complex *in=NULL;
#ifdef MPI
   fftwnd_mpi_plan plan_p;
#else
   fftwnd_plan plan_p;
#endif 

#ifdef MPI
  MPI_Status status;
#endif

#ifdef MPI
  MPI_Init(&argc, &argv);
#endif

  while ((c = getopt(argc, argv, "ah?vgf:F:p:P:s:m:")) != -1) {
    switch (c) {
    case 'v':
      verbose = 1;
      break;
    case 'f':
      strcpy(filename, optarg);
      filename_set=1;
      break;
    case 'a':
      write_ascii = 1;
      fprintf(stdout, "# [] will write in ascii format\n");
      break;
    case 'F':
      if(strcmp(optarg, "Wilson") == 0) {
        fermion_type = _WILSON_FERMION;
      } else if(strcmp(optarg, "tm") == 0) {
        fermion_type = _TM_FERMION;
      } else {
        fprintf(stderr, "[] Error, unrecognized fermion type\n");
        exit(145);
      }
      fprintf(stdout, "# [] will use fermion type %s ---> no. %d\n", optarg, fermion_type);
      break;
    case 'g':
      do_gt = 1;
      fprintf(stdout, "# [] will perform gauge transform\n");
      break;
    case 's':
      use_lattice_momenta = 1;
      fprintf(stdout, "# [] will use lattice momenta\n");
      break;
    case 'p':
      rel_momentum_filename_set = 1;
      strcpy(rel_momentum_filename, optarg);
      fprintf(stdout, "# [] will use current momentum file %s\n", rel_momentum_filename);
      break;
    case 'P':
      snk_momentum_filename_set = 1;
      strcpy(snk_momentum_filename, optarg);
      fprintf(stdout, "# [] will use nucleon momentum file %s\n", snk_momentum_filename);
      break;
    case 'm':
      if(strcmp(optarg, "sequential")==0) {
        mode = 1;
      } else if(strcmp(optarg, "contract")==0) {
        mode = 2;
      }
      fprintf(stdout, "# [] will use mode %d\n", mode);
      break;
    case 'h':
    case '?':
    default:
      usage();
      break;
    }
  }

  /* set the default values */
  if(filename_set==0) strcpy(filename, "cvc.input");
  fprintf(stdout, "# reading input from file %s\n", filename);
  read_input_parser(filename);

  /* some checks on the input data */
  if((T_global == 0) || (LX==0) || (LY==0) || (LZ==0)) {
    if(g_proc_id==0) fprintf(stdout, "T and L's must be set\n");
    usage();
  }
  if(g_kappa == 0.) {
    if(g_proc_id==0) fprintf(stdout, "kappa should be > 0.n");
    usage();
  }

#ifdef OPENMP
 omp_set_num_threads(g_num_threads);
#else
 fprintf(stdout, "[delta_pp_2_pi_N_sequential] Warning, resetting global thread number to 1\n");
 g_num_threads = 1;
#endif

  /* initialize MPI parameters */
  mpi_init(argc, argv);

#ifdef OPENMP
  status = fftw_threads_init();
  if(status != 0) {
    fprintf(stderr, "\n[] Error from fftw_init_threads; status was %d\n", status);
    exit(120);
  }
#endif

  /******************************************************
   *
   ******************************************************/
  VOL3 = LX*LY*LZ;
  l_LX_at      = LX;
  l_LXstart_at = 0;
  FFTW_LOC_VOLUME = T*LX*LY*LZ;
  fprintf(stdout, "# [%2d] parameters:\n"\
		  "# [%2d] l_LX_at      = %3d\n"\
		  "# [%2d] l_LXstart_at = %3d\n"\
		  "# [%2d] FFTW_LOC_VOLUME = %3d\n", 
		  g_cart_id, g_cart_id, l_LX_at,
		  g_cart_id, l_LXstart_at, g_cart_id, FFTW_LOC_VOLUME);

  if(init_geometry() != 0) {
    fprintf(stderr, "ERROR from init_geometry\n");
    exit(1);
  }

  geometry();

  if(N_Jacobi>0) {

    // alloc the gauge field
    alloc_gauge_field(&g_gauge_field, VOL3);
    switch(g_gauge_file_format) {
      case 0:
        sprintf(gauge_field_filename, "%s.%.4d", gaugefilename_prefix, Nconf);
        break;
      case 1:
        sprintf(gauge_field_filename, "%s.%.5d", gaugefilename_prefix, Nconf);
        break;
    }
  } else {
    g_gauge_field = NULL;
  }


  /*********************************************************************
   * gauge transformation
   *********************************************************************/
  if(do_gt) { init_gauge_trafo(&gauge_trafo, 1.); }

  // determine the source location
  sx0 = g_source_location/(LX*LY*LZ)-Tstart;
  sx1 = (g_source_location%(LX*LY*LZ)) / (LY*LZ);
  sx2 = (g_source_location%(LY*LZ)) / LZ;
  sx3 = (g_source_location%LZ);
//  g_source_time_slice = sx0;
  fprintf(stdout, "# [] source location %d = (%d,%d,%d,%d)\n", g_source_location, sx0, sx1, sx2, sx3);

if(mode == 1 || mode == 2) {
  /***************************************************************************
   * read the relative momenta q to be used
   ***************************************************************************/
  ofs = fopen(rel_momentum_filename, "r");
  if(ofs == NULL) {
    fprintf(stderr, "[] Error, could not open file %s for reading\n", rel_momentum_filename);
    exit(6);
  }
  rel_momentum_no = 0;
  while( fgets(line, 199, ofs) != NULL) {
    if(line[0] != '#') {
      rel_momentum_no++;
    }
  }
  if(rel_momentum_no == 0) {
    fprintf(stderr, "[] Error, number of momenta is zero\n");
    exit(7);
  } else {
    fprintf(stdout, "# [] number of current momenta = %d\n", rel_momentum_no);
  }
  rewind(ofs);
  rel_momentum_list = (int**)malloc(rel_momentum_no * sizeof(int*));
  rel_momentum_list[0] = (int*)malloc(3*rel_momentum_no * sizeof(int));
  for(i=1;i<rel_momentum_no;i++) { rel_momentum_list[i] = rel_momentum_list[i-1] + 3; }
  count=0;
  while( fgets(line, 199, ofs) != NULL) {
    if(line[0] != '#') {
      sscanf(line, "%d%d%d", rel_momentum_list[count], rel_momentum_list[count]+1, rel_momentum_list[count]+2);
      count++;
    }
  }
  fclose(ofs);
  fprintf(stdout, "# [] current momentum list:\n");
  for(i=0;i<rel_momentum_no;i++) {
    fprintf(stdout, "\t%3d%3d%3d%3d\n", i, rel_momentum_list[i][0], rel_momentum_list[i][1], rel_momentum_list[i][2]);
  }
}  // of if mode == 1

if(mode == 2) {
  /***************************************************************************
   * read the nucleon final momenta to be used
   ***************************************************************************/
  ofs = fopen(snk_momentum_filename, "r");
  if(ofs == NULL) {
    fprintf(stderr, "[] Error, could not open file %s for reading\n", snk_momentum_filename);
    exit(6);
  }
  snk_momentum_no = 0;
  while( fgets(line, 199, ofs) != NULL) {
    if(line[0] != '#') {
      snk_momentum_no++;
    }
  }
  if(snk_momentum_no == 0) {
    fprintf(stderr, "[] Error, number of momenta is zero\n");
    exit(7);
  } else {
    fprintf(stdout, "# [] number of nucleon final momenta = %d\n", snk_momentum_no);
  }
  rewind(ofs);
  snk_momentum_list = (int**)malloc(snk_momentum_no * sizeof(int*));
  snk_momentum_list[0] = (int*)malloc(3*snk_momentum_no * sizeof(int));
  for(i=1;i<snk_momentum_no;i++) { snk_momentum_list[i] = snk_momentum_list[i-1] + 3; }
  count=0;
  while( fgets(line, 199, ofs) != NULL) {
    if(line[0] != '#') {
      sscanf(line, "%d%d%d", snk_momentum_list[count], snk_momentum_list[count]+1, snk_momentum_list[count]+2);
      count++;
    }
  }
  fclose(ofs);
  fprintf(stdout, "# [] the nucleon final momentum list:\n");
  for(i=0;i<snk_momentum_no;i++) {
    fprintf(stdout, "\t%3d%3d%3d%3d\n", i, snk_momentum_list[i][0], snk_momentum_list[i][1], snk_momentum_list[i][2]);
  }
}  // of if mode == 2

  // allocate memory for the spinor fields
  g_spinor_field = NULL;
  if(mode == 1) {
    no_fields = 3;
    g_spinor_field = (double**)calloc(no_fields, sizeof(double*));
    for(i=0; i<no_fields-1; i++) alloc_spinor_field(&g_spinor_field[i], VOL3);
    alloc_spinor_field(&g_spinor_field[no_fields-1], VOLUME);
    if(N_Jacobi>0) work = g_spinor_field[1];
  } else if(mode == 2) {
    no_fields = 2*n_s*n_c;
    if(N_Jacobi>0) no_fields++;
    g_spinor_field = (double**)calloc(no_fields, sizeof(double*));
    for(i=0; i<no_fields; i++) alloc_spinor_field(&g_spinor_field[i], VOL3);
    if(N_Jacobi>0) work = g_spinor_field[no_fields-1];
  }

  spinor_field_checksum = (DML_Checksum*)malloc(n_s*n_c * sizeof(DML_Checksum) );
  if(spinor_field_checksum == NULL ) {
    fprintf(stderr, "[] Error, could not alloc checksums for spinor fields\n");
    exit(73);
  }

  seq_spinor_field_checksum = (DML_Checksum*)malloc(rel_momentum_no*n_s*n_c * sizeof(DML_Checksum) );
  if(seq_spinor_field_checksum == NULL ) {
    fprintf(stderr, "[] Error, could not alloc checksums for seq. spinor fields\n");
    exit(73);
  }

if(mode == 1) {
  
  /*************************************************************************
   * sequential source
   *************************************************************************/

    // (1) read the prop., smear, multiply with gamma_5, save as source 

    // read timeslice of the gauge field
    if( N_Jacobi>0) {
      switch(g_gauge_file_format) {
        case 0:
          status = read_lime_gauge_field_doubleprec_timeslice(g_gauge_field, gauge_field_filename, sx0, &ildg_gauge_field_checksum);
          break;
        case 1:
          status = read_nersc_gauge_field_timeslice(g_gauge_field, gauge_field_filename, sx0, &nersc_gauge_field_checksum);
          break;
      }
      if(status != 0) {
        fprintf(stderr, "[] Error, could not read gauge field\n");
        exit(21);
      }
#ifdef OPENMP
        status = APE_Smearing_Step_Timeslice_threads(g_gauge_field, N_ape, alpha_ape);
#else
      for(i=0; i<N_ape; i++) {
        status = APE_Smearing_Step_Timeslice(g_gauge_field, alpha_ape);
      }
#endif
    }
    // read timeslice of the 12 down-type propagators and smear them
    for(is=0;is<n_s*n_c;is++) {
      if(fermion_type != _TM_FERMION) {
        sprintf(filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.inverted", filename_prefix, Nconf, sx0, sx1, sx2, sx3, is);
      } else {
        sprintf(filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.inverted", filename_prefix2, Nconf, sx0, sx1, sx2, sx3, is);
      }
      status = read_lime_spinor_timeslice(g_spinor_field[0], sx0, filename, 0, spinor_field_checksum+is);
      if(status != 0) {
        fprintf(stderr, "[] Error, could not read propagator from file %s\n", filename);
        exit(102);
      }
      if(N_Jacobi > 0) {
        fprintf(stdout, "# [] Jacobi smearing propagator no. %d with paramters N_Jacobi=%d, kappa_Jacobi=%f\n",
            is, N_Jacobi, kappa_Jacobi);
#ifdef OPENMP
        Jacobi_Smearing_Step_one_Timeslice_threads(g_gauge_field, g_spinor_field[0], work, N_Jacobi, kappa_Jacobi);
#else
        for(c=0; c<N_Jacobi; c++) {
          Jacobi_Smearing_Step_one_Timeslice(g_gauge_field, g_spinor_field[0], work, kappa_Jacobi);
        }
#endif
      }

      for(imom=0;imom<rel_momentum_no;imom++) {
        for(ix=0;ix<VOLUME;ix++) { _fv_eq_zero(g_spinor_field[2]+_GSI(ix)); }
        ix = 0;
        iix = sx0 * VOL3;
        for(x1=0;x1<LX;x1++) {
        for(x2=0;x2<LY;x2++) {
        for(x3=0;x3<LZ;x3++) {
          phase = 2. * M_PI * ( (x1-sx1) * rel_momentum_list[imom][0] / (double)LX
                              + (x2-sx2) * rel_momentum_list[imom][1] / (double)LY
                              + (x3-sx3) * rel_momentum_list[imom][2] / (double)LZ );
          w.re =  cos(phase);
          w.im = -sin(phase);
          _fv_eq_gamma_ti_fv(spinor1, 5, g_spinor_field[0] + _GSI(ix));
          _fv_eq_fv_ti_co(g_spinor_field[2]+_GSI(iix), spinor1, &w);
          ix++;
          iix++;
        }}}

        // save the sourceg_spinor_field[2]
        sprintf(filename, "seq_%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, sx0, sx1, sx2, sx3, is,
           rel_momentum_list[imom][0],rel_momentum_list[imom][1],rel_momentum_list[imom][2]);
        status = write_lime_spinor(g_spinor_field[2], filename, 0, g_propagator_precision);
/*
        fprintf(stdout, "# [] the sequential source:\n");
        for(ix=0;ix<VOLUME;ix++) {
          for(i=0;i<12;i++) {
            fprintf(stdout, "\t%6d%3d%25.16e%25.16e\n", ix, i, g_spinor_field[2][_GSI(ix)+2*i], g_spinor_field[2][_GSI(ix)+2*i+1]);
          }
        }
*/

      }  // of imom
    }    // of is
}  // of if mode == 1

if(mode == 2) {

  /*************************************************************************
   * contractions
   *************************************************************************/

  // allocate memory for the contractions
  items = 4 * rel_momentum_no * num_component * T;
  bytes = sizeof(double);
  connt = (double*)malloc(items*bytes);
  if(connt == NULL) {
    fprintf(stderr, "\n[] Error, could not alloc connt\n");
    exit(2);
  }
  for(ix=0; ix<items; ix++) connt[ix] = 0.;

  items = num_component * (size_t)VOL3;
  connq = create_sp_field( items );
  if(connq == NULL) {
    fprintf(stderr, "\n[] Error, could not alloc connq\n");
    exit(2);
  }


  // initialize FFTW
  items = 2 * num_component * g_sv_dim * g_sv_dim * VOL3;
  bytes = sizeof(double);
  in  = (fftw_complex*)malloc(num_component*g_sv_dim*g_sv_dim*VOL3*sizeof(fftw_complex));
  if(in == NULL) {
    fprintf(stderr, "[] Error, could not malloc in for FFTW\n");
    exit(155);
  }
  dims[0]=LX; dims[1]=LY; dims[2]=LZ;
  //plan_p = fftwnd_create_plan(3, dims, FFTW_FORWARD, FFTW_MEASURE | FFTW_IN_PLACE);
  plan_p = fftwnd_create_plan_specific(3, dims, FFTW_FORWARD, FFTW_MEASURE, in, num_component*g_sv_dim*g_sv_dim, (fftw_complex*)( connq[0][0] ), num_component*g_sv_dim*g_sv_dim);

  // create the fermion propagator points
  create_fp(&uprop);
  create_fp(&dprop);
  create_fp(&fp1);
  create_fp(&fp2);
  create_fp(&fp3);
  create_fp(&fp4);
  create_fp(&fpaux);
  create_sp(&sp1);
  create_sp(&sp2);


  /******************************************************
   * loop on timeslices
   ******************************************************/
  for(timeslice=0; timeslice<T; timeslice++)
  // for(timeslice=1; timeslice<2; timeslice++)
  {
    append = (int)( timeslice != 0 );

    // read timeslice of the gauge field
    if( N_Jacobi>0) {
      switch(g_gauge_file_format) {
        case 0:
          status = read_lime_gauge_field_doubleprec_timeslice(g_gauge_field, gauge_field_filename, timeslice, &ildg_gauge_field_checksum);
          break;
        case 1:
          status = read_nersc_gauge_field_timeslice(g_gauge_field, gauge_field_filename, timeslice, &nersc_gauge_field_checksum);
          break;
      }
      if(status != 0) {
        fprintf(stderr, "[] Error, could not read gauge field\n");
        exit(21);
      }
#ifdef OPENMP
      status = APE_Smearing_Step_Timeslice_threads(g_gauge_field, N_ape, alpha_ape);
#else
      for(i=0; i<N_ape; i++) {
        status = APE_Smearing_Step_Timeslice(g_gauge_field, alpha_ape);
      }
#endif
    }

    // read timeslice of the 12 up-type propagators and smear them
    for(is=0;is<n_s*n_c;is++) {
//      if(do_gt == 0) {
        sprintf(filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.inverted", filename_prefix, Nconf, sx0, sx1, sx2, sx3, is);
        status = read_lime_spinor_timeslice(g_spinor_field[is], timeslice, filename, 0, spinor_field_checksum+is);
        if(status != 0) {
          fprintf(stderr, "[] Error, could not read propagator from file %s\n", filename);
          exit(102);
        }
        if(N_Jacobi > 0) {
          fprintf(stdout, "# [] Jacobi smearing propagator no. %d with paramters N_Jacobi=%d, kappa_Jacobi=%f\n",
              is, N_Jacobi, kappa_Jacobi);
#ifdef OPENMP
          Jacobi_Smearing_Step_one_Timeslice_threads(g_gauge_field, g_spinor_field[is], work, N_Jacobi, kappa_Jacobi);
#else
          for(c=0; c<N_Jacobi; c++) {
            Jacobi_Smearing_Step_one_Timeslice(g_gauge_field, g_spinor_field[is], work, kappa_Jacobi);
          }
#endif
        }
//      } else {  // of if do_gt == 0
//        // apply gt
//        apply_gt_prop(gauge_trafo, g_spinor_field[is], is/n_c, is%n_c, 4, filename_prefix, g_source_location);
//      } // of if do_gt == 0

    }


    /******************************************************
     * loop on relative momenta
     ******************************************************/
    for(imom=0;imom<rel_momentum_no; imom++) {

      // read 12 sequential propagators
      for(is=0;is<n_s*n_c;is++) {
//        if(do_gt == 0) {
          sprintf(filename, "seq_%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.qx%.2dqy%.2dqz%.2d.inverted", filename_prefix, Nconf, sx0, sx1, sx2, sx3, is,
            rel_momentum_list[imom][0],rel_momentum_list[imom][1],rel_momentum_list[imom][2]);
          status = read_lime_spinor_timeslice(g_spinor_field[n_s*n_c+is], timeslice, filename, 0, seq_spinor_field_checksum+imom*n_s*n_c+is);
          if(status != 0) {
            fprintf(stderr, "[] Error, could not read propagator from file %s\n", filename);
            exit(102);
          }
          if(N_Jacobi > 0) {
            fprintf(stdout, "# [] Jacobi smearing propagator no. %d with paramters N_Jacobi=%d, kappa_Jacobi=%f\n",
                 is, N_Jacobi, kappa_Jacobi);
#ifdef OPENMP
            Jacobi_Smearing_Step_one_Timeslice_threads(g_gauge_field, g_spinor_field[n_s*n_c+is], work, N_Jacobi, kappa_Jacobi);
#else
            for(c=0; c<N_Jacobi; c++) {
              Jacobi_Smearing_Step_one_Timeslice(g_gauge_field, g_spinor_field[n_s*n_c+is], work, kappa_Jacobi);
            }
#endif
          }
//        } else {  // of if do_gt == 0
//          // apply gt
//          apply_gt_prop(gauge_trafo, g_spinor_field[n_s*n_c+is], is/n_c, is%n_c, 4, filename_prefix, g_source_location);
//        } // of if do_gt == 0
      }
  
  
      /******************************************************
       * contractions
       *
       * REMEMBER:
       *
       *   uprop = S_u
       *   dprop = S_seq
       *   fp1   = C Gamma_1 S_u
       *   fp2   = C Gamma_1 S_u C Gamma_2
       *   fp3   =           S_u C Gamma_2
       *   fp4   = C Gamma_1 S_seq
       *   Gamma_1 = gamma_mu (always multiplied from the left)
       *   Gamma_2 = gamma-5  (always multiplied from the right)
       ******************************************************/
      for(ix=0;ix<VOL3;ix++) 
//      for(ix=0;ix<1;ix++) 
      {
  
        // assign the propagators
        _assign_fp_point_from_field(uprop, g_spinor_field, ix);
        _assign_fp_point_from_field(dprop, g_spinor_field+n_s*n_c, ix);
        // flavor rotation for twisted mass fermions
        if(fermion_type == _TM_FERMION) {
          _fp_eq_rot_ti_fp(fp1, uprop, +1, fermion_type, fp2);
          _fp_eq_fp_ti_rot(uprop, fp1, +1, fermion_type, fp2);
          _fp_eq_rot_ti_fp(fp1, dprop, +1, fermion_type, fp2);
          _fp_eq_fp_ti_rot(dprop, fp1, -1, fermion_type, fp2);
        }

        if(do_gt) {
          // up propagator
          _fp_eq_cm_ti_fp(fp1, gauge_trafo+18*(timeslice*VOL3+ix), uprop);
          _fp_eq_fp_ti_cm_dagger(uprop, gauge_trafo+18*(timeslice*VOL3+ix), fp1);
          // sequential propagator
          _fp_eq_cm_ti_fp(fp1, gauge_trafo+18*(timeslice*VOL3+ix), dprop);
          _fp_eq_fp_ti_cm_dagger(dprop, gauge_trafo+18*(timeslice*VOL3+ix), fp1);
        }
  
        // test: print fermion propagator point
/*
        fprintf(stdout, "# uprop:\n");
        printf_fp(uprop, "uprop", stdout);
        fprintf(stdout, "# dprop:\n");
        printf_fp(dprop, "dprop", stdout);
*/
/*
        double fp_in_base[32];
        int mu;
//        _project_fp_to_basis(fp_in_base, uprop, 0);
        _project_fp_to_basis(fp_in_base, dprop, 0);
        fprintf(stdout, "# [] t=%3d; ix=%6d\n", timeslice, ix);
        for(mu=0;mu<16;mu++) {
          fprintf(stdout, "\t%3d%16.7e%16.7e\n", mu, fp_in_base[2*mu], fp_in_base[2*mu+1]);
        }
*/
  
        for(icomp=0; icomp<num_component; icomp++) {
  
          _sp_eq_zero( connq[ix*num_component+icomp]);
  
          /******************************************************
           * prepare fermion propagators
           ******************************************************/
          _fp_eq_zero(fp1);
          _fp_eq_zero(fp2);
          _fp_eq_zero(fp3);
          _fp_eq_zero(fp4);
          _fp_eq_zero(fpaux);
          // fp1 = C Gamma_1 x S_u = g0 g2 Gamma_1 S_u
          _fp_eq_gamma_ti_fp(fp1, gamma_component[0][icomp], uprop);
          _fp_eq_gamma_ti_fp(fpaux, 2, fp1);
          _fp_eq_gamma_ti_fp(fp1,   0, fpaux);
  
          // fp2 = C Gamma_1 x S_u x C Gamma_2 = fp1 x g0 g2 Gamma_2
          _fp_eq_fp_ti_gamma(fp2, 0, fp1);
          _fp_eq_fp_ti_gamma(fpaux, 2, fp2);
          _fp_eq_fp_ti_gamma(fp2, gamma_component[1][icomp], fpaux);
   
          // fp3 = S_u x C Gamma_2 = uprop x g0 g2 Gamma_2
          _fp_eq_fp_ti_gamma(fp3,   0, uprop);
          _fp_eq_fp_ti_gamma(fpaux, 2, fp3);
          _fp_eq_fp_ti_gamma(fp3, gamma_component[1][icomp], fpaux);
   
          // fp4 = C Gamma_1 x S_seq = g0 g2 Gamma_1 dprop 
          _fp_eq_gamma_ti_fp(fp4, gamma_component[0][icomp], dprop);
          _fp_eq_gamma_ti_fp(fpaux, 2, fp4);
          _fp_eq_gamma_ti_fp(fp4,   0, fpaux);
/*          
        fprintf(stdout, "# fp1:\n");
        printf_fp(fp1, "fp1",stdout);
        fprintf(stdout, "# fp2:\n");
        printf_fp(fp2, "fp2",stdout);
        fprintf(stdout, "# fp3:\n");
        printf_fp(fp3, "fp3",stdout);
        fprintf(stdout, "# fp4:\n");
        printf_fp(fp4, "fp4",stdout);
*/
/*
        double fp_in_base[4][32];
        int mu;
        _project_fp_to_basis(fp_in_base[0], fp1, 0);
        _project_fp_to_basis(fp_in_base[1], fp2, 0);
        _project_fp_to_basis(fp_in_base[2], fp3, 0);
        _project_fp_to_basis(fp_in_base[3], fp4, 0);
        fprintf(stdout, "# [] t=%3d; ix=%6d\n", timeslice, ix);
        for(mu=0;mu<16;mu++) {
          fprintf(stdout, "\t%3d%16.7e%16.7e%16.7e%16.7e%16.7e%16.7e%16.7e%16.7e\n", mu,
              fp_in_base[0][2*mu], fp_in_base[0][2*mu+1],
              fp_in_base[1][2*mu], fp_in_base[1][2*mu+1],
              fp_in_base[2][2*mu], fp_in_base[2][2*mu+1],
              fp_in_base[3][2*mu], fp_in_base[3][2*mu+1]);
        }
*/

          // (1)
          // reduce
          _fp_eq_zero(fpaux);
          _fp_eq_fp_eps_contract13_fp(fpaux, fp2, uprop);
          // reduce to spin propagator
          _sp_eq_zero( sp1 );
          _sp_eq_fp_del_contract23_fp(sp1, dprop, fpaux);
          // (2)
          // reduce
          _fp_eq_zero(fpaux);
          _fp_eq_fp_eps_contract13_fp(fpaux, fp1, fp3);
          // reduce to spin propagator
          _sp_eq_zero( sp2 );
          _sp_eq_fp_del_contract24_fp(sp2, dprop, fpaux);
          // add and assign
          _sp_pl_eq_sp(sp1, sp2);
          _sp_eq_sp_ti_re(sp2, sp1, -gamma_component_sign[icomp]);
          _sp_pl_eq_sp( connq[ix*num_component+icomp], sp2);

          // (3)
          // reduce
          _fp_eq_zero(fpaux);
          _fp_eq_fp_eps_contract13_fp(fpaux, fp4, uprop);
          // reduce to spin propagator
          _sp_eq_zero( sp1 );
          _sp_eq_fp_del_contract23_fp(sp1, fp3, fpaux);
          // (4)
          // reduce
          _fp_eq_zero(fpaux);
          _fp_eq_fp_eps_contract13_fp(fpaux, fp1, dprop);
          // reduce to spin propagator
          _sp_eq_zero( sp2 );
          _sp_eq_fp_del_contract24_fp(sp2, fp3, fpaux);
          // add and assign
          _sp_pl_eq_sp(sp1, sp2);
          _sp_eq_sp_ti_re(sp2, sp1, -gamma_component_sign[icomp]);
          _sp_pl_eq_sp( connq[ix*num_component+icomp], sp2);

          // (5)
          // reduce
          _fp_eq_zero(fpaux);
          _fp_eq_fp_eps_contract13_fp(fpaux, fp4, fp3);
          // reduce to spin propagator
          _sp_eq_zero( sp1 );
          _sp_eq_fp_del_contract34_fp(sp1, uprop, fpaux);
          //fprintf(stdout, "# sp1:\n");
          //printf_sp(sp1, "sp1",stdout);
          // (6)
          // reduce
          _fp_eq_zero(fpaux);
          _fp_eq_fp_eps_contract13_fp(fpaux, fp2, dprop);
          // reduce to spin propagator
          _sp_eq_zero( sp2 );
          _sp_eq_fp_del_contract34_fp(sp2, uprop, fpaux);
          //fprintf(stdout, "# sp2:\n");
          //printf_sp(sp2, "sp2",stdout);
          // add and assign
          _sp_pl_eq_sp(sp1, sp2);
          _sp_eq_sp_ti_re(sp2, sp1, -gamma_component_sign[icomp]);
          _sp_pl_eq_sp( connq[ix*num_component+icomp], sp2);

  
        }  // of icomp
  
      }    // of ix
  
      /***********************************************
       * finish calculation of connq
       ***********************************************/
      if(g_propagator_bc_type == 0) {
        // multiply with phase factor
        fprintf(stdout, "# [] multiplying timeslice %d with boundary phase factor\n", timeslice);
        ir = (timeslice - sx0 + T_global) % T_global;
        w1.re = cos( 3. * M_PI*(double)ir / (double)T_global );
        w1.im = sin( 3. * M_PI*(double)ir / (double)T_global );
        for(ix=0;ix<num_component*VOL3;ix++) {
          _sp_eq_sp(sp1, connq[ix] );
          _sp_eq_sp_ti_co( connq[ix], sp1, w1);
        }
      } else if (g_propagator_bc_type == 1) {
        // multiply with step function
        if(timeslice < sx0) {
          fprintf(stdout, "# [] multiplying timeslice %d with boundary step function\n", timeslice);
          for(ix=0;ix<num_component*VOL3;ix++) {
            _sp_eq_sp(sp1, connq[ix] );
            _sp_eq_sp_ti_re( connq[ix], sp1, -1.);
          }
        }
      }
    
      if(write_ascii) {
        sprintf(filename, "%s_x.%.4d.t%.2dx%.2dy%.2dz.qx%.2dqy%.2dqz%.2d%.2d.ascii", outfile_prefix, Nconf, sx0, sx1, sx2, sx3,
            rel_momentum_list[imom][0],rel_momentum_list[imom][1],rel_momentum_list[imom][2]);
        write_contraction2( connq[0][0], filename, num_component*g_sv_dim*g_sv_dim, VOL3, 1, append);
      }
  
      /******************************************************************
       * Fourier transform
       ******************************************************************/
      items =  2 * num_component * g_sv_dim * g_sv_dim * VOL3;
      bytes = sizeof(double);
  
      memcpy(in, connq[0][0], items * bytes);
      ir = num_component * g_sv_dim * g_sv_dim;
  #ifdef OPENMP
      fftwnd_threads(g_num_threads, plan_p, ir, in, ir, 1, (fftw_complex*)(connq[0][0]), ir, 1);
  #else
      fftwnd(plan_p, ir, in, ir, 1, (fftw_complex*)(connq[0][0]), ir, 1);
  #endif
  
      // add phase factor from the source location
      iix = 0;
      for(x1=0;x1<LX;x1++) {
        q[0] = (double)x1 / (double)LX;
      for(x2=0;x2<LY;x2++) {
        q[1] = (double)x2 / (double)LY;
      for(x3=0;x3<LZ;x3++) {
        q[2] = (double)x3 / (double)LZ;
        phase = 2. * M_PI * ( q[0]*sx1 + q[1]*sx2 + q[2]*sx3 );
        w1.re = cos(phase);
        w1.im = sin(phase);
  
        for(icomp=0; icomp<num_component; icomp++) {
          _sp_eq_sp(sp1, connq[iix] );
          _sp_eq_sp_ti_co( connq[iix], sp1, w1) ;
          iix++; 
        }
      }}}  // of x3, x2, x1
  
      // write to file
      sprintf(filename, "%s_q.%.4d.t%.2dx%.2dy%.2dz%.2d.qx%.2dqy%.2dqz%.2d", outfile_prefix, Nconf, sx0, sx1, sx2, sx3,
          rel_momentum_list[imom][0],rel_momentum_list[imom][1],rel_momentum_list[imom][2]);
      sprintf(contype, "2-pt. function, (t,Q_1,Q_2,Q_3)-dependent, source_timeslice = %d, rel. momentum = (%d, %d. %d)", sx0,
          rel_momentum_list[imom][0],rel_momentum_list[imom][1],rel_momentum_list[imom][2]);
      write_lime_contraction_timeslice(connq[0][0], filename, 64, num_component*g_sv_dim*g_sv_dim, contype, Nconf, 0, &connq_checksum, timeslice);
  
      if(write_ascii) {
        strcat(filename, ".ascii");
        write_contraction2(connq[0][0],filename, num_component*g_sv_dim*g_sv_dim, VOL3, 1, append);
      }
  
  
      /***********************************************
       * calculate connt
       ***********************************************/
      for(icomp=0;icomp<num_component; icomp++) {
        // fwd
        _sp_eq_sp(sp1, connq[icomp]);
        _sp_eq_gamma_ti_sp(sp2, 0, sp1);
        _sp_pl_eq_sp(sp1, sp2);
        _co_eq_tr_sp(&w, sp1);
        connt[2*( (imom*2*num_component + icomp) * T + timeslice)  ] = w.re * 0.25;
        connt[2*( (imom*2*num_component + icomp) * T + timeslice)+1] = w.im * 0.25;
        // bwd
        _sp_eq_sp(sp1, connq[icomp]);
        _sp_eq_gamma_ti_sp(sp2, 0, sp1);
        _sp_mi_eq_sp(sp1, sp2);
        _co_eq_tr_sp(&w, sp1);
        connt[2*( (imom*2*num_component + icomp + num_component ) * T + timeslice)  ] = w.re * 0.25;
        connt[2*( (imom*2*num_component + icomp + num_component ) * T + timeslice)+1] = w.im * 0.25;
      }

    }  // of loop on relative momenta

  }  // of loop on timeslice

  // write connt
  sprintf(filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.fw", outfile_prefix, Nconf, sx0, sx1, sx2, sx3);
  ofs = fopen(filename, "w");
  if(ofs == NULL) {
    fprintf(stderr, "[] Error, could not open file %s for writing\n", filename);
    exit(3);
  }
 
  for(imom=0;imom<rel_momentum_no;imom++) {
    fprintf(ofs, "#%12.8f%3d%3d%3d%3d%8.4f%6d%3d%3d%3d\n", g_kappa, T_global, LX, LY, LZ, g_mu, Nconf,
        rel_momentum_list[imom][0],rel_momentum_list[imom][1],rel_momentum_list[imom][2]);

    for(icomp=0; icomp<num_component; icomp++) {
//      ir = sx0;
//      fprintf(ofs, "%3d%3d%3d%16.7e%16.7e%6d%3d%3d%3d\n", gamma_component[0][icomp], gamma_component[1][icomp], 0, connt[2*((imom*2*num_component+icomp)*T+ir)], 0., Nconf,
//          rel_momentum_list[imom][0],rel_momentum_list[imom][1],rel_momentum_list[imom][2]);
//      for(it=1;it<T/2;it++) {
//        ir  = ( it + sx0 ) % T_global;
//       ir2 = ( (T_global - it) + sx0 ) % T_global;
//        fprintf(ofs, "%3d%3d%3d%16.7e%16.7e%6d%3d%3d%3d\n", gamma_component[0][icomp], gamma_component[1][icomp], it,
//            connt[2*((imom*2*num_component+icomp)*T+ir)], connt[2*((imom*2*num_component+icomp)*T+ir2)], Nconf,
//            rel_momentum_list[imom][0],rel_momentum_list[imom][1],rel_momentum_list[imom][2]);
//      }
//      ir = ( it + sx0 ) % T_global;
//      fprintf(ofs, "%3d%3d%3d%16.7e%16.7e%6d%3d%3d%3d\n", gamma_component[0][icomp], gamma_component[1][icomp], it, connt[2*((imom*2*num_component+icomp)*T+ir)], 0., Nconf,
//          rel_momentum_list[imom][0],rel_momentum_list[imom][1],rel_momentum_list[imom][2]);
      for(it=0;it<T;it++) {
        ir  = ( it + sx0 ) % T_global;
        fprintf(ofs, "%3d%3d%3d%16.7e%16.7e%6d%3d%3d%3d\n", gamma_component[0][icomp], gamma_component[1][icomp], it,
            connt[2*((imom*2*num_component+icomp)*T+ir)], connt[2*((imom*2*num_component+icomp)*T+ir)+1], Nconf,
            rel_momentum_list[imom][0],rel_momentum_list[imom][1],rel_momentum_list[imom][2]);
      }
    }
  }
  fclose(ofs);
  
  sprintf(filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.bw", outfile_prefix, Nconf, sx0, sx1, sx2, sx3);
  ofs = fopen(filename, "w");
  if(ofs == NULL) {
    fprintf(stderr, "[] Error, could not open file %s for writing\n", filename);
    exit(3);
  }

  for(imom=0;imom<rel_momentum_no;imom++) {
    fprintf(ofs, "#%12.8f%3d%3d%3d%3d%8.4f%6d%3d%3d%3d\n", g_kappa, T_global, LX, LY, LZ, g_mu, Nconf,
        rel_momentum_list[imom][0],rel_momentum_list[imom][1],rel_momentum_list[imom][2]);
  
    for(icomp=0; icomp<num_component; icomp++) {
      ir = sx0;
      fprintf(ofs, "%3d%3d%3d%16.7e%16.7e%6d%3d%3d%3d\n", gamma_component[0][icomp], gamma_component[1][icomp], 0,
          connt[2*((imom*2*num_component+num_component+icomp)*T+ir)], 0., Nconf,
          rel_momentum_list[imom][0],rel_momentum_list[imom][1],rel_momentum_list[imom][2]);
      for(it=1;it<T/2;it++) {
        ir  = ( it + sx0 ) % T_global;
        ir2 = ( (T_global - it) + sx0 ) % T_global;
        fprintf(ofs, "%3d%3d%3d%16.7e%16.7e%6d%3d%3d%3d\n", gamma_component[0][icomp], gamma_component[1][icomp], it,
            connt[2*((imom*2*num_component+num_component+icomp)*T+ir)], connt[2*((imom*2*num_component+num_component+icomp)*T+ir2)], Nconf,
            rel_momentum_list[imom][0],rel_momentum_list[imom][1],rel_momentum_list[imom][2]);
      }
      ir = ( it + sx0 ) % T_global;
      fprintf(ofs, "%3d%3d%3d%16.7e%16.7e%6d%3d%3d%3d\n", gamma_component[0][icomp], gamma_component[1][icomp], it,
          connt[2*((imom*2*num_component+num_component+icomp)*T+ir)], 0., Nconf,
          rel_momentum_list[imom][0],rel_momentum_list[imom][1],rel_momentum_list[imom][2]);
    }
  }
  fclose(ofs);

  if(in!=NULL) free(in);
  fftwnd_destroy_plan(plan_p);

}  // of if mode == 2


  /***********************************************
   * free the allocated memory, finalize
   ***********************************************/
  free_geometry();
  if(connt!= NULL) free(connt);
  if(connq!= NULL) free(connq);
  if(gauge_trafo != NULL) free(gauge_trafo);

  if(g_spinor_field!=NULL) {
    for(i=0; i<no_fields; i++) free(g_spinor_field[i]);
    free(g_spinor_field); g_spinor_field=(double**)NULL;
  }
  if(spinor_field_checksum !=NULL) free(spinor_field_checksum);
  if(seq_spinor_field_checksum !=NULL) free(seq_spinor_field_checksum);
  if(g_gauge_field != NULL) free(g_gauge_field);

  // free the fermion propagator points and the spin propagators
  free_fp( &uprop );
  free_fp( &dprop );
  free_fp( &fp1 );
  free_fp( &fp2 );
  free_fp( &fp3 );
  free_fp( &fp4 );
  free_fp( &fpaux );
  free_sp( &sp1 );
  free_sp( &sp2 );

  if(rel_momentum_list!=NULL) {
    if(rel_momentum_list[0]!=NULL) free(rel_momentum_list[0]);
    free(rel_momentum_list);
  }
  if(snk_momentum_list!=NULL) {
    if(snk_momentum_list[0]!=NULL) free(snk_momentum_list[0]);
    free(snk_momentum_list);
  }

  g_the_time = time(NULL);
  fprintf(stdout, "# [] %s# [] end fo run\n", ctime(&g_the_time));
  fflush(stdout);
  fprintf(stderr, "# [] %s# [] end fo run\n", ctime(&g_the_time));
  fflush(stderr);

#ifdef MPI
  MPI_Finalize();
#endif
  return(0);
}