Beispiel #1
0
// Sliding-window correlation between a template and a data trace.
//
// For every window start n in [0, data.size()-templ.size()) the template is
// first fitted to the window data[n .. n+templ.size()-1] by linear least
// squares (optimal scale and offset); the correlation between the window and
// that fitted template is then stored in the returned vector at index n.
//
// Parameters:
//   data    - trace to be scanned; must be non-empty and at least as long as
//             the template.
//   templ   - template waveform; must be non-empty.
//   progDlg - progress sink; Update() is called roughly once per percent of
//             the windows and may request cancellation via its third argument.
//
// Returns a vector of data.size()-templ.size() coefficients, or an empty
// vector if the progress dialog reported that the user skipped the operation.
//
// Throws std::runtime_error if the template is longer than the data or if
// either input is empty.
Vector_double
stf::linCorr(const Vector_double& data, const Vector_double& templ, stfio::ProgressInfo& progDlg)
{
    bool skipped = false;
    // the template has to be smaller than the data waveform:
    if (data.size()<templ.size()) {
        throw std::runtime_error("Template larger than data in stf::linCorr");
    }
    if (data.size()==0 || templ.size()==0) {
        throw std::runtime_error("Array of size 0 in stf::linCorr");
    }

    const std::size_t n_templ_pts = templ.size();
    const std::size_t n_windows = data.size()-n_templ_pts;
    const double N = static_cast<double>(n_templ_pts);
    Vector_double Corr(n_windows);

    // Sums over the template are the same for every window; the sums over the
    // data are initialised for the first window and updated incrementally.
    double sum_templ_data=0.0, sum_templ=0.0, sum_templ_sqr=0.0, sum_data=0.0;
    for (std::size_t k=0; k<n_templ_pts; ++k) {
        sum_templ_data+=templ[k]*data[k];
        sum_data+=data[k];
        sum_templ+=templ[k];
        sum_templ_sqr+=templ[k]*templ[k];
    }
    // Denominator of the least-squares slope; depends on the template only.
    const double templ_sq_dev=sum_templ_sqr-sum_templ*sum_templ/N;

    // First sample of the previous window (leaves the running sum next step).
    double y_old=0.0;
    int progCounter=0;
    // Number of windows per percent. Must be a floating-point division:
    // an integer division yields 0 for fewer than 100 windows, which made
    // the progress test below fire on every single iteration.
    const double progFraction=n_windows/100.0;
    for (std::size_t n_data=0; n_data<n_windows; ++n_data) {
        if (n_data/progFraction>progCounter) {
            progDlg.Update( static_cast<int>(static_cast<double>(n_data)/static_cast<double>(n_windows)*100.0),
                            "Calculating correlation coefficient", &skipped );
            if (skipped) {
                Corr.resize(0);
                return Corr;
            }
            progCounter++;
        }
        if (n_data!=0) {
            // The cross product has to be recomputed in full length:
            sum_templ_data=0.0;
            for (std::size_t k=0; k<n_templ_pts; ++k) {
                sum_templ_data+=templ[k]*data[n_data+k];
            }
            // Slide the data sum: add the sample entering the window and
            // drop the one that left it.
            const double y_new=data[n_data+n_templ_pts-1];
            sum_data+=y_new-y_old;
        }
        // The first value of this window will have to be subtracted during
        // the next iteration:
        y_old=data[n_data];

        // Optimal scaling & offset of the template for this window:
        const double scale=(sum_templ_data-sum_templ*sum_data/N)/templ_sq_dev;
        const double offset=(sum_data-scale*sum_templ)/N;

        // Now that the optimal template has been found,
        // compute the correlation between data and optimal template.
        // Deviations from the means are accumulated explicitly rather than
        // derived from running sums of squares, to avoid numerical
        // instability.
        const double mean_data=sum_data/N;
        const double sum_optTempl=sum_templ*scale+offset*N;
        const double mean_optTempl=sum_optTempl/N;

        double sd_data=0.0;
        double sd_templ=0.0;
        double r=0.0;
        for (std::size_t i=0; i<n_templ_pts; ++i) {
            const double dev_data=data[i+n_data]-mean_data;
            const double dev_templ=templ[i]*scale+offset-mean_optTempl;
            sd_data+=dev_data*dev_data;
            sd_templ+=dev_templ*dev_templ;
            r+=dev_data*dev_templ;
        }
        sd_data=sqrt(sd_data/N);
        sd_templ=sqrt(sd_templ/N);

        // NOTE(review): the SDs above are normalised by N while the
        // covariance is normalised by N-1, so the stored value is N/(N-1)
        // times the Pearson coefficient (and is not finite for a
        // single-point template). Kept as is because callers may rely on
        // the current values - confirm before changing.
        r/=(static_cast<double>(n_templ_pts-1)*sd_data*sd_templ);
        Corr[n_data]=r;
    }
    return Corr;
}
Beispiel #2
0
int main (int ac, char* av[]) {
  // ***************************************************************************
  // ***************************************************************************
  // initialization ************************************************************
  // ***************************************************************************
  // ***************************************************************************

  // initialization of eigen OMP paralization
  Eigen::initParallel();
  // set numer of threads used by eigen
  // first line sets number of threads directly
  // second line lets OMP decide on the number of threads,
  // e. g. via OMP_NUM_THREADS
  //Eigen::setNbThreads(4);
  Eigen::setNbThreads(0);

  //check the number of threads used
  const int nthreads = Eigen::nbThreads();
  std::cout << "contraction code for stochastic dilution" << std::endl;
  std::cout << "using " << nthreads << " threads for eigen\n" << std::endl;

  // reading in global parameters from input file
  GlobalData* global_data = GlobalData::Instance();
  global_data->read_parameters(ac, av);

  // reading in of data
  ReadWrite* rewr = new ReadWrite;

  // everything for operator handling
  BasicOperator* basic = new BasicOperator();

  // global variables from input file needed in main function
  const int Lt = global_data->get_Lt();
  const int end_config = global_data->get_end_config();
  const int delta_config = global_data->get_delta_config();
  const int start_config = global_data->get_start_config();
  const int number_of_eigen_vec = global_data->get_number_of_eigen_vec();

  const int number_of_max_mom = global_data->get_number_of_max_mom();
  const int max_mom_squared = number_of_max_mom * number_of_max_mom;
  const int number_of_momenta = global_data->get_number_of_momenta();
  const std::vector<int> mom_squared = global_data->get_momentum_squared();

  const std::vector<quark> quarks = global_data->get_quarks();
  const int number_of_rnd_vec = quarks[0].number_of_rnd_vec;

  const int dirac_min = global_data->get_dirac_min();
  const int dirac_max = global_data->get_dirac_max();
  const int number_of_dirac = dirac_max - dirac_min + 1;

  const int displ_min = global_data->get_displ_min();
  const int displ_max = global_data->get_displ_max();
  const int number_of_displ = displ_max - displ_min + 1;

  const int p_min = 0; //number_of_momenta/2;
  const int p_max = number_of_momenta;

  // TODO decide on path
  std::string outpath = global_data->get_output_path() + "/";

  // other variables
  clock_t time;

  const std::complex<double> I(0.0, 1.0);

  char outfile[400];
  FILE *fp = NULL;

  // ***************************************************************************
  // ***************************************************************************
  // memory allocation *********************************************************
  // ***************************************************************************
  // ***************************************************************************

  // abbreviations for clearer memory allocation. Wont be used in loops and 
  // when building the contractions
  // CJ: but it is a little bit ugly...

  const size_t nmom = number_of_momenta;
  const size_t nrnd = number_of_rnd_vec;
  const size_t ndir = number_of_dirac;
  const size_t ndis = number_of_displ;

  // memory for the correlation function
  array_cd_d7 C2_mes(boost::extents[nmom][nmom][ndir][ndir][ndis][ndis][Lt]);
  //TODO: dont need the memory for p_u^2 > p_d^2
  array_cd_d10 Corr(boost::extents[nmom][nmom][ndir][ndir][ndis][ndis][Lt][Lt][nrnd][nrnd]);

  int norm = 0;
  for(int rnd1 = 0; rnd1 < number_of_rnd_vec; ++rnd1){
    for(int rnd3 = rnd1 + 1; rnd3 < number_of_rnd_vec; ++rnd3){
      for(int rnd2 = 0; rnd2 < number_of_rnd_vec; ++rnd2){
        if((rnd2 != rnd1) && (rnd2 != rnd3)){
          for(int rnd4 = rnd2 + 1; rnd4 < number_of_rnd_vec; ++rnd4){
            if((rnd4 != rnd1) && (rnd4 != rnd3)){
              norm++;
              //std::cout << "\n\nnorm: " << norm << rnd1 << rnd3 << rnd2 << rnd4 << std::endl;
            }
          }
        }
      }
    }
  }

  std::cout << "\n\tNumber of contraction combinations: " << norm << std::endl;
//  const double norm1 = Lt * norm;

  // Memory for propagation matrices (is that a word?) from t_source to t_sink
  // (op_1) and vice versa (op_2)
  // additional t_source to t_sink (op_3) and t_sink to t_source (op_4) for
  // 4-point functions
  // 1, 3 -> u-quarks; 2, 4 -> d-quarks; 5, 6 -> u quarks for neutral particle

  array_Xcd_d2_eigen op_1(boost::extents[nrnd][nrnd]);
  array_Xcd_d2_eigen op_3(boost::extents[nrnd][nrnd]);
  vec_Xcd_eigen op_2(number_of_rnd_vec);
  vec_Xcd_eigen op_4(number_of_rnd_vec);
  vec_Xcd_eigen op_5(number_of_rnd_vec);
  vec_Xcd_eigen op_6(number_of_rnd_vec);

  for(int rnd_i = 0; rnd_i < number_of_rnd_vec; ++rnd_i){
    for(int rnd_j = 0; rnd_j < number_of_rnd_vec; ++rnd_j){

      op_1[rnd_i][rnd_j] = Eigen::MatrixXcd(4 * number_of_eigen_vec, 
          4 * quarks[0].number_of_dilution_E);
       }

    op_2[rnd_i] = Eigen::MatrixXcd(4 * quarks[0].number_of_dilution_E, 
        4 * number_of_eigen_vec);
    
  }

  // ***************************************************************************
  // ***************************************************************************
  // Loop over all configurations **********************************************
  // ***************************************************************************
  // ***************************************************************************

  for(int config_i = start_config; config_i <= end_config; config_i +=
      delta_config){

    std::cout << "\nprocessing configuration: " << config_i << "\n\n";

    rewr->read_perambulators_from_file(config_i);
    rewr->read_rnd_vectors_from_file(config_i);
//    rewr->read_eigenvectors_from_file(config_i);
    rewr->read_lime_gauge_field_doubleprec_timeslices(config_i);
    rewr->build_source_matrix(config_i);


    // *************************************************************************
    // TWO PT CONTRACTION 1 ****************************************************
    // *************************************************************************

    // setting the correlation function to zero
    std::cout << "\n\tcomputing the connected contribution of pi_+/-:\n";
    time = clock();

    // setting the correlation function to zero
    for(int p1 = 0; p1 < number_of_momenta; ++p1)
      for(int p2 = 0; p2 < number_of_momenta; ++p2)
        for(int dirac1 = 0; dirac1 < number_of_dirac; ++dirac1)
          for(int dirac2 = 0; dirac2 < number_of_dirac; ++dirac2)
            for(int displ1 = 0; displ1 < number_of_displ; ++displ1)
              for(int displ2 = 0; displ2 < number_of_displ; ++displ2)
                for(int t1 = 0; t1 < Lt; ++t1)
                   for(int t1 = 0; t1 < Lt; ++t1)
                     C2_mes[p1][p2][dirac1][dirac2][displ1][displ2][t1] = std::complex<double>(0.0, 0.0);

    for(int p1 = 0; p1 < number_of_momenta; ++p1)
      for(int p2 = 0; p2 < number_of_momenta; ++p2)
        for(int dirac1 = 0; dirac1 < number_of_dirac; ++dirac1)
          for(int dirac2 = 0; dirac2 < number_of_dirac; ++dirac2)
            for(int displ1 = 0; displ1 < number_of_displ; ++displ1)
              for(int displ2 = 0; displ2 < number_of_displ; ++displ2)
                for(int t1 = 0; t1 < Lt; ++t1)
                  for(int t2 = 0; t2 < Lt; ++t2)
                    for(int rnd1 = 0; rnd1 < number_of_rnd_vec; rnd1++)
                      for(int rnd2 = 0; rnd2 < number_of_rnd_vec; rnd2++)
                        Corr[p1][p2][dirac1][dirac2][displ1][displ2][t1][t2][rnd1][rnd2] = 
                            std::complex<double>(0.0, 0.0);

#if 1 // PI^+/-
    // initializing of Corr: calculate all two-operator traces of the form tr(u \Gamma \bar{d})
    // build all combinations of momenta, dirac_structures and displacements as specified in
    // infile

    for(int displ_u = 0; displ_u < number_of_displ; displ_u++){
      for(int displ_d = 0; displ_d < number_of_displ; displ_d++){

          for(int t_source = 0; t_source < Lt; ++t_source){
            for(int t_sink = 0; t_sink < Lt; ++t_sink){
    
              for(int p = p_min; p < p_max; ++p) {
                // initialize contraction[rnd_i] = perambulator * basicoperator
                // = D_u^-1
                // choose 'i' for interlace or 'b' for block time dilution scheme
                // TODO: get that from input file
                // choose 'c' for charged or 'u' for uncharged particles
                basic->init_operator_u(0, t_source, t_sink, rewr, 'b', p, displ_min + displ_u);
                basic->init_operator_d(0, t_source, t_sink, rewr, 'b', p, displ_min + displ_d);
              }
    
              for(int dirac_u = 0; dirac_u < number_of_dirac; ++dirac_u){
                for(int p_u = p_min; p_u < p_max; ++p_u) {
                  // code for pi+-
    
                  // "multiply contraction[rnd_i] with gamma structure"
                  // contraction[rnd_i] are the columns of D_u^-1 which get
                  // reordered by gamma multiplication. No actual multiplication
                  // is carried out
                  basic->get_operator_charged(op_1, 0, t_sink, rewr, dirac_min + dirac_u, p_u);
    
                  for(int dirac_d = 0; dirac_d < number_of_dirac; ++dirac_d){
                    for(int p_d = p_min; p_d < p_max; ++p_d) {
                      if(mom_squared[p_u] <= mom_squared[p_d]){
      
                        // same as get_operator but with gamma_5 trick. D_u^-1 is
                        // daggered and multipied with gamma_5 from left and right
                        // the momentum is changed to reflect the switched sign in
                        // the momentum exponential for pi_+-
                        basic->get_operator_g5(op_2, 0, dirac_min + dirac_d, p_d);
           
                        for(int rnd1 = 0; rnd1 < number_of_rnd_vec; ++rnd1){
                          for(int rnd2 = rnd1 + 1; rnd2 < number_of_rnd_vec; ++rnd2){

                            // build all 2pt traces leading to C2_mes
                            // Corr = tr(D_d^-1(t_sink) Gamma 
                            //     D_u^-1(t_source) Gamma)

                            Corr[p_u][p_d][dirac_u][dirac_d][displ_u][displ_d]
                                [t_source][t_sink][rnd1][rnd2] = 
                              (op_2[rnd2] * op_1[rnd1][rnd2]).trace();

//                            std::cout << "p" << p_u << p_d << "dirac" << dirac_u << dirac_d << "\nCorr "
//                                << Corr[p_u][p_d][dirac_u][dirac_d][displ_u][displ_d]
//                                [t_source][t_sink][rnd1][rnd2] << std::endl;
          
          
                          }
                        }   
      
                      }
                    }
                  }
    
                }
              }
    
            }
          }
        
        }
      }

    // build 2pt-function C2_mes for pi^+ from Corr. Equivalent two just summing
    // up traces with same time difference between source and sink (all to all)
    // for every dirac structure, momentum, displacement

    // build 2pt-function C2_mes for pi^+ from Corr. Equivalent two just summing
    // up traces with same time difference between source and sink (all to all)
    // for every dirac structure, momentum, displacement

    for(int t_source = 0; t_source < Lt; ++t_source){
      for(int t_sink = 0; t_sink < Lt; ++t_sink){

         for(int p_u = p_min; p_u < p_max; ++p_u) {
           for(int p_d = p_min; p_d < p_max; ++p_d) {
            if(mom_squared[p_u] <= mom_squared[p_d]){
              for(int dirac_u = 0; dirac_u < number_of_dirac; ++dirac_u){
                for(int dirac_d = 0; dirac_d < number_of_dirac; ++dirac_d){
                  for(int displ_u = 0; displ_u < number_of_displ; displ_u++){
                    for(int displ_d = 0; displ_d < number_of_displ; displ_d++){
            
                      for(int rnd1 = 0; rnd1 < number_of_rnd_vec; ++rnd1){
                        for(int rnd2 = rnd1 + 1; rnd2 < number_of_rnd_vec; ++rnd2){
              
                              // building Correlation function 
                              // C2 = tr(D_d^-1 Gamma D_u^-1 Gamma)
                              // TODO: find signflip of imaginary part
                              // TODO: is C2_mes[dirac][p] better?

                          C2_mes[p_u][p_d][dirac_u][dirac_d][displ_u][displ_d]
                              [abs((t_sink - t_source - Lt) % Lt)] += 
                            Corr[p_u][number_of_momenta - p_d - 1]
                              [dirac_u][dirac_d][displ_u][displ_d]
                              [t_source][t_sink][rnd1][rnd2];
                        }
                      }
              
                    }
                  }
                }
              }
            }
          }
        }

      }
    }

    // normalization of correlation function
    double norm3 = Lt * number_of_rnd_vec * (number_of_rnd_vec - 1) * 0.5;
    for(int p_u = p_min; p_u < p_max; ++p_u) {
      for(int p_d = p_min; p_d < p_max; ++p_d) {
        if(mom_squared[p_u] <= mom_squared[p_d]){
          for(int dirac_u = 0; dirac_u < number_of_dirac; ++dirac_u){
            for(int dirac_d = 0; dirac_d < number_of_dirac; ++dirac_d){
              for(int displ_u = 0; displ_u < number_of_displ; ++displ_u){
                for(int displ_d = 0; displ_d < number_of_displ; ++displ_d){
                  for(int t = 0; t < Lt; ++t){
                    C2_mes[p_u][p_d][dirac_u][dirac_d][displ_u][displ_d][t] /= norm3;
                  }
                }
              }
            }
          }
        }
      }
    }
#endif


    // output to binary file
    // to build a GEVP, the correlators are written into a seperate folder
    // for every dirac structure, momentum, displacement (entry of the GEVP
    // matrix). In the folders a file is created for every configuration which
    // contains all momentum combinations with same momentum squared
    // The folders are created when running create_runs.sh. If they dont exist, 
    // a segmentation fault will occur
    // TODO: implement check for existence of folders

    for(int dirac_u = 0; dirac_u < number_of_dirac; ++dirac_u){
      for(int dirac_d = 0; dirac_d < number_of_dirac; ++dirac_d){
        for(int p = 0; p <= max_mom_squared; p++){
          for(int displ_u = 0; displ_u < number_of_displ; ++displ_u){
            for(int displ_d = 0; displ_d < number_of_displ; ++displ_d){

              sprintf(outfile, 
                  "%s/dirac_%02d_%02d_p_%01d_%01d_displ_%01d_%01d_unsuppressed/"
                  "C2_pi+-_conf%04d.dat", 
                  outpath.c_str(), dirac_min + dirac_u, dirac_min + dirac_d, p, p, 
                  displ_min, displ_max, config_i);
              if((fp = fopen(outfile, "wb")) == NULL)
                std::cout << "fail to open outputfile" << std::endl;

              for(int p_u = p_min; p_u < p_max; ++p_u){
                if(rewr.mom_squared[p_u] == p){

              		fwrite((double*) &(C2_mes[p_u][p_u][dirac_u][dirac_d]
                      [displ_u][displ_d][0]), 
                      sizeof(double), 2 * Lt, fp);
                }
              }

		          fclose(fp);

            }
          }
        }
      }
    }

    for(int dirac_u = 0; dirac_u < number_of_dirac; ++dirac_u){
      for(int dirac_d = 0; dirac_d < number_of_dirac; ++dirac_d){
        for(int p1 = 0; p1 <= max_mom_squared; p1++){
         for(int p2 = p1; p2 <= max_mom_squared; p2++){
           for(int displ_u = 0; displ_u < number_of_displ; ++displ_u){
              for(int displ_d = 0; displ_d < number_of_displ; ++displ_d){

                printf("Writing to file: ");
//                sprintf(outfile, 
//                    "%s/dirac_%02d_%02d_p_%01d_%01d_displ_%01d_%01d/"
//                    "C2_pi+-_conf%04d.dat", 
//                    outpath.c_str(), dirac_min + dirac_u, dirac_min + dirac_d, 
//                    p1, p2, displ_min + displ_u, displ_min + displ_d, config_i);
                sprintf(outfile, "%s/C2_pi+-_conf%04d.dat", outpath.c_str(), config_i);
                printf("%s\n", outfile);
                if((fp = fopen(outfile, "wb")) == NULL)
                  std::cout << "fail to open outputfile" << std::endl;

                for(int p_u = p_min; p_u < p_max; ++p_u){
                  if(mom_squared[p_u] == p1){
                    for(int p_d = p_min; p_d < p_max; ++p_d){
                      if(mom_squared[p_d] == p2){

                        fwrite((double*) &(C2_mes[p_u][p_d][dirac_u][dirac_d][displ_u][displ_d][0]), 
                            sizeof(double), 2 * Lt, fp);
                      }
                    }
                  }
                }

                fclose(fp);
  
              }
            }
          }
        }
      }
    }



#if 0 // (old?) output routine and output to terminal
    sprintf(outfile, 
        "%s/dirac_%02d_%02d_p_0_%01d_displ_%01d_%01d/C2_pi+-_conf%04d.dat", 
        outpath.c_str(), dirac_min, dirac_max, 0, 
        displ_min, displ_max, config_i);
    if((fp = fopen(outfile, "wb")) == NULL)
      std::cout << "fail to open outputfile" << std::endl;
    for(int dirac = dirac_min; dirac < dirac_max + 1; ++dirac)
      fwrite((double*) C2_mes[number_of_momenta/2]
          [number_of_momenta/2][dirac], sizeof(double), 2 * Lt, fp);
    fclose(fp);

    for(int rnd_i = 0; rnd_i < number_of_rnd_vec; ++rnd_i) {
      sprintf(outfile, 
          "%s/dirac_%02d_%02d_p_0_%01d_displ_%01d_%01d/C2_dis_u_rnd%02d_conf%04d.dat", 
          outpath.c_str(), dirac_min, dirac_max, number_of_max_mom, displ_min, 
          displ_max, rnd_i, config_i);
      if((fp = fopen(outfile, "wb")) == NULL)
        std::cout << "fail to open outputfile" << std::endl;
      for(int dirac = dirac_min; dirac < dirac_max + 1; ++dirac)
        for(int p = 0; p < number_of_momenta; ++p)
          fwrite((double*) C2_dis[p][dirac][rnd_i], sizeof(double), 2 * Lt, fp);
      fclose(fp);
    }

    // output to terminal
//    printf("\n");
//    for(int dirac = dirac_min; dirac < dirac_max + 1; ++dirac){
//      printf("\tdirac    = %02d\n", dirac);
//      for(int p = 0; p <= max_mom_squared; p++){
//        printf("\tmomentum_u = %02d\n", p);
//        printf("\tmomentum_d = %02d\n", p);
//        for(int p_u = p_min; p_u < p_max; ++p_u){
//          if((mom_squared[p_u] == p)){
//            //printf(
//            //    "\t t\tRe(C2_con)\tIm(C2_con)\n\t----------------------------------\n");
////                  for(int t1 = 0; t1 < Lt; ++t1){
////                    printf("\t%02d\t%.5e\t%.5e\n", t1, real(C2_mes[p_u][p_u][dirac][t1]),
////                        imag(C2_mes[p_u][p_u][dirac][t1]));
////                  }
//            printf("\n");
//            printf("p_u = %02d\n", p_u);
//          }
//        }
//      }
//
//      for(int p = 1; p <= max_mom_squared; p++){
//        printf("\tmomentum_u = %02d\n", 0);
//        printf("\tmomentum_d = %02d\n", p);
//        for(int p_u = p_min; p_u < p_max; ++p_u){
//          if((mom_squared[p_u] == p)){
//            //printf(
//            //    "\t t\tRe(C2_con)\tIm(C2_con)\n\t----------------------------------\n");
////                  for(int t1 = 0; t1 < Lt; ++t1){
////                    printf("\t%02d\t%.5e\t%.5e\n", t1, real(C2_mes[p_u][p_u][dirac][t1]),
////                        imag(C2_mes[p_u][p_u][dirac][t1]));
////                  }
//            printf("\n");
//            printf("p_u = %02d\n", p_u);
//          }
//        }
//      }
//
//    }

#endif // (old?) output routine and output to terminal
    time = clock() - time;
    std::cout << "\t\tSUCCESS - " << std::fixed << std::setprecision(1)
      << ((float) time)/CLOCKS_PER_SEC << " seconds" << std::endl;

    // *************************************************************************
    // FOUR PT CONTRACTION 1 ***************************************************
    // *************************************************************************
#if 0 //4-point contraction 1
    // setting the correlation function to zero
    std::cout << "\n\tcomputing the connected contribution of C4_1:\n";
    time = clock();

    // displacement not supported for 4pt functions atm
    displ_min = 0;
    displ_max = 0;

    std::cout << "\n\tcomputing the connected contribution of C4_1:\n";
    time = clock();

    // setting the correlation function to zero

    for(int p_u = 0; p_u < number_of_momenta; ++p_u)
      for(int p_d = 0; p_d < number_of_momenta; ++p_d)
        for(int dirac_u = 0; dirac_u < number_of_dirac; ++dirac_u)
          for(int dirac_d = 0; dirac_d < number_of_dirac; ++dirac_d)
            for(int t1 = 0; t1 < Lt; ++t1)
              C4_mes[p_u][p_d][dirac_u][dirac_d][t1] = 
                  std::complex<double>(0.0, 0.0);

    for(int t_source = 0; t_source < Lt; ++t_source){
      for(int t_sink = 0; t_sink < Lt; ++t_sink){

        int t_source_1 = (t_source + 1) % Lt;
        int t_sink_1 = (t_sink + 1) % Lt;

        for(int dirac_u = 0; dirac_u < number_of_dirac; ++dirac_u){     
          for(int dirac_d = 0; dirac_d < number_of_dirac; ++dirac_d){
             for(int p_u = p_min; p_u < p_max; ++p_u) {
               for(int p_d = p_min; p_d < p_max; ++p_d) {
                if(mom_squared[p_u] <= mom_squared[p_d]){
          
                  // complete diagramm
                  // every quark line must have its own random vec

                  for(int rnd1 = 0; rnd1 < number_of_rnd_vec; ++rnd1){
                    for(int rnd3 = rnd1 + 1; rnd3 < number_of_rnd_vec; ++rnd3){
                      for(int rnd2 = 0; rnd2 < number_of_rnd_vec; ++rnd2){      
                        if((rnd2 != rnd1) && (rnd2 != rnd3)){
                          for(int rnd4 = rnd2 + 1; rnd4 < number_of_rnd_vec; ++rnd4){
                            if((rnd4 != rnd1) && (rnd4 != rnd3)){

                              C4_mes[p_u][p_d][dirac_u][dirac_d]
                                  [abs((t_sink - t_source - Lt) % Lt)] +=
                                (Corr[p_u]
                                  [number_of_momenta - p_d - 1]
                                  [dirac_u][dirac_d][0][0]
                                  [t_source_1][t_sink_1][rnd1][rnd3]) *
                                (Corr[number_of_momenta - p_u - 1]
                                  [p_d][dirac_u][dirac_d][0][0]
                                  [t_source][t_sink][rnd2][rnd4]);
                            }
                          }
                        }
                      }
                    }
                  }

                }
              }
            }    
          }
        }

      }
    }

    // Normalization of 4pt-function. Accounts for all rnd-number combinations

    for(int t = 0; t < Lt; ++t){
      for(int dirac_u = 0; dirac_u < number_of_dirac; ++dirac_u){     
        for(int dirac_d = 0; dirac_d < number_of_dirac; ++dirac_d){
           for(int p_u = p_min; p_u < p_max; ++p_u) {
             for(int p_d = p_min; p_d < p_max; ++p_d) {
              if(mom_squared[p_u] <= mom_squared[p_d]){
                C4_mes[p_u][p_d][dirac_u][dirac_d][t] /= norm1;
              }
            }
          }
        }
      }
    }


    // output to binary file
    // see output to binary file for C2. 
    // write into folders with suffix "_unsuppressed". These only include
    // correlators of the diagonal matrix elements of the GEVP for which
    // the three-momentum remains unchanged for both quarks. Because the
    // quarks have to be back-to-back, for the offdiagonal elements this
    // cannot occur. The suppression can be interpreted as Zweig-suppressed
    // gluon exchange


    for(int dirac_u = 0; dirac_u < number_of_dirac; ++dirac_u){
      for(int dirac_d = 0; dirac_d < number_of_dirac; ++dirac_d){
        for(int p = 0; p <= max_mom_squared; p++){

          sprintf(outfile, 
              "%s/dirac_%02d_%02d_p_%01d_%01d_displ_%01d_%01d_unsuppressed/"
              "C4_1_conf%04d.dat", 
              outpath.c_str(), dirac_min + dirac_u, dirac_min + dirac_d, p, p, 
              displ_min, displ_max, config_i);
          if((fp = fopen(outfile, "wb")) == NULL)
            std::cout << "fail to open outputfile" << std::endl;

          for(int p_u = p_min; p_u < p_max; ++p_u){
            if(mom_squared[p_u] == p){

              fwrite((double*) &(C4_mes[p_u][p_u][dirac_u][dirac_d][0]), 
                  sizeof(double), 2 * Lt, fp);
            }
          }

          fclose(fp);

        }
      }
    }

    // to build a GEVP, the correlators are written into a seperate folder
    // for every dirac structure, momentum, (entry of the GEVP matrix).
    // displacement is not supported at the moment

    for(int dirac_u = 0; dirac_u < number_of_dirac; ++dirac_u){
      for(int dirac_d = 0; dirac_d < number_of_dirac; ++dirac_d){
        for(int p1 = 0; p1 <= max_mom_squared; p1++){
          for(int p2 = p1; p2 <= max_mom_squared; p2++){

            sprintf(outfile, 
               "%s/dirac_%02d_%02d_p_%01d_%01d_displ_%01d_%01d/"
               "C4_1_conf%04d.dat", 
               outpath.c_str(), dirac_min + dirac_u, dirac_min + dirac_d, 
               p1, p2, displ_min, displ_max, config_i);
           if((fp = fopen(outfile, "wb")) == NULL)
             std::cout << "fail to open outputfile" << std::endl;

           for(int p_u = p_min; p_u < p_max; ++p_u){
              if(mom_squared[p_u] == p1){
                for(int p_d = p_min; p_d < p_max; ++p_d){
                  if(mom_squared[p_d] == p2){

                    fwrite((double*) &(C4_mes[p_u][p_d][dirac_u][dirac_d][0]), 
                        sizeof(double), 2 * Lt, fp);
                  }
                }
              }
            }

            fclose(fp);

          }
        }
      }
    }

    // output to terminal
//		printf("\n");
//    for(int dirac = dirac_min; dirac < dirac_max + 1; ++dirac){
//		  printf("\tdirac    = %02d\n", dirac);
//      for(int offset = 0; offset <= max_mom_squared; offset++){
//        for(int p = 0; p <= max_mom_squared; p++){
//          if((p + offset) <= max_mom_squared){
//            for(int p_u = p_min; p_u < p_max; ++p_u){
//              if((rewr.mom_squared[p_u] == p) && ((p + offset) <= max_mom_squared)){
//                for(int p_d = p_min; p_d < p_max; ++p_d){
//                  if(rewr.mom_squared[p_d] == (p + offset)){
//            			  //printf(
//            				//  	"\t t\tRe(C4_1_con)\tIm(C4_1_con)\n\t----------------------------------\n");
////            			  for(int t1 = 0; t1 < Lt; ++t1){
////            				  printf("\t%02d\t%.5e\t%.5e\n", t1, real(C4_mes[p_u][p_d][dirac][dirac][t1]),
////            				      imag(C4_mes[p_u][p_d][dirac][dirac][t1]));
////            			  }
//            			printf("\n");
//                  printf("p_u = %02d\tp_d = %02d\n", p_u, p_d);
//            		  }
//                }
//              }
//            }
//          }
//        printf("\n");
//        }
//      }
//    printf("\n");
//    }

    time = clock() - time;
    printf("\t\tSUCCESS - %.1f seconds\n", ((float) time)/CLOCKS_PER_SEC);
#endif // 4-point contraction 1
    // *************************************************************************
    // FOUR PT CONTRACTION 2 ***************************************************
    // *************************************************************************
#if 0 // 4-point contraction 2
    // setting the correlation function to zero
    std::cout << "\n\tcomputing the connected contribution of C4_2:\n";
    time = clock();

    for(int p_u = 0; p_u < number_of_momenta; ++p_u)
      for(int p_d = 0; p_d < number_of_momenta; ++p_d)
        for(int dirac_u = 0; dirac_u < number_of_dirac; ++dirac_u)
          for(int dirac_d = 0; dirac_d < number_of_dirac; ++dirac_d)
            for(int t1 = 0; t1 < Lt; ++t1)
              C4_mes[p_u][p_d][dirac_u][dirac_d][t1] = 
                  std::complex<double>(0.0, 0.0);

    for(int t_source = 0; t_source < Lt; ++t_source){
      for(int t_sink = 0; t_sink < Lt - 1; ++t_sink){

        int t_source_1 = (t_source + 1) % Lt;
        int t_sink_1 = (t_sink + 1) % Lt;

        for(int dirac_u = 0; dirac_u < number_of_dirac; ++dirac_u){     
          for(int dirac_d = 0; dirac_d < number_of_dirac; ++dirac_d){
             for(int p_u = p_min; p_u < p_max; ++p_u) {
               for(int p_d = p_min; p_d < p_max; ++p_d) {
                if(mom_squared[p_u] <= mom_squared[p_d]){

                  // complete diagramm
                  // every quark line must have its own random vec

                  for(int rnd1 = 0; rnd1 < number_of_rnd_vec; ++rnd1){
                    for(int rnd3 = rnd1 + 1; rnd3 < number_of_rnd_vec; ++rnd3){
                      for(int rnd2 = 0; rnd2 < number_of_rnd_vec; ++rnd2){      
                        if((rnd2 != rnd1) && (rnd2 != rnd3)){
                          for(int rnd4 = rnd2 + 1; rnd4 < number_of_rnd_vec; ++rnd4){
                            if((rnd4 != rnd1) && (rnd4 != rnd3)){

                              C4_mes[p_u][p_d][dirac_u][dirac_d]
                                  [abs((t_sink - t_source - Lt) % Lt)] +=
                                (Corr[p_u][number_of_momenta - p_d - 1]
                                  [dirac_u][dirac_d][0][0][t_source_1][t_sink]
                                  [rnd1][rnd3]) *
                                (Corr[number_of_momenta - p_u - 1][p_d]
                                  [dirac_u][dirac_d][0][0][t_source][t_sink_1]
                                  [rnd2][rnd4]);
                            }
                          }
                        }
                      }
                    }
                  }

                }
              }
            }
          }
        }

      }
    }

    // Normalization of 4pt-function. Accounts for all rnd-number combinations
    for(int t = 0; t < Lt; ++t){
      for(int dirac_u = 0; dirac_u < number_of_dirac; ++dirac_u){     
        for(int dirac_d = 0; dirac_d < number_of_dirac; ++dirac_d){
           for(int p_u = p_min; p_u < p_max; ++p_u) {
             for(int p_d = p_min; p_d < p_max; ++p_d) {
              if(mom_squared[p_u] <= mom_squared[p_d]){
                C4_mes[p_u][p_d][dirac_u][dirac_d][t] /= norm1;
              }
            }
          }
        }
      }
    }

    // output to binary file
    // see output to binary file for C2. 
    // write into folders with suffix "_unsuppressed". These only include
    // correlators of the diagonal matrix elements of the GEVP for which
    // the three-momentum remains unchanged for both quarks. Because the
    // quarks have to be back-to-back, for the offdiagonal elements this
    // cannot occur. The suppression can be interpreted as Zweig-suppressed
    // gluon exchange

    for(int dirac_u = 0; dirac_u < number_of_dirac; ++dirac_u){
      for(int dirac_d = 0; dirac_d < number_of_dirac; ++dirac_d){
        for(int p = 0; p <= max_mom_squared; p++){

          sprintf(outfile, 
              "%s/dirac_%02d_%02d_p_%01d_%01d_displ_%01d_%01d_unsuppressed/"
              "C4_2_conf%04d.dat", 
              outpath.c_str(), dirac_min + dirac_u, dirac_min + dirac_d, p, p, 
              displ_min, displ_max, config_i);
          if((fp = fopen(outfile, "wb")) == NULL)
            std::cout << "fail to open outputfile" << std::endl;

          for(int p_u = p_min; p_u < p_max; ++p_u){
            if(mom_squared[p_u] == p){

              fwrite((double*) &(C4_mes[p_u][p_u][dirac_u][dirac_d][0]), 
                  sizeof(double), 2 * Lt, fp);
            }
          }

          fclose(fp);

        }
      }
    }

    // to build a GEVP, the correlators are written into a seperate folder
    // for every dirac structure, momentum, (entry of the GEVP matrix).
    // displacement is not supported at the moment

    for(int dirac_u = 0; dirac_u < number_of_dirac; ++dirac_u){
      for(int dirac_d = 0; dirac_d < number_of_dirac; ++dirac_d){
        for(int p1 = 0; p1 <= max_mom_squared; p1++){
          for(int p2 = p1; p2 <= max_mom_squared; p2++){

            sprintf(outfile, 
               "%s/dirac_%02d_%02d_p_%01d_%01d_displ_%01d_%01d/"
               "C4_2_conf%04d.dat", 
               outpath.c_str(), dirac_min + dirac_u, dirac_min + dirac_d, 
               p1, p2, displ_min, displ_max, config_i);
           if((fp = fopen(outfile, "wb")) == NULL)
             std::cout << "fail to open outputfile" << std::endl;

           for(int p_u = p_min; p_u < p_max; ++p_u){
              if(mom_squared[p_u] == p1){
                for(int p_d = p_min; p_d < p_max; ++p_d){
                  if(mom_squared[p_d] == p2){

                    fwrite((double*) &(C4_mes[p_u][p_d][dirac_u][dirac_d][0]), 
                        sizeof(double), 2 * Lt, fp);
                  }
                }
              }
            }

            fclose(fp);

          }
        }
      }
    }

    // output to terminal
//		printf("\n");
//    for(int dirac = dirac_min; dirac < dirac_max + 1; ++dirac){
//		  printf("\tdirac    = %02d\n", dirac);
//      for(int offset = 0; offset <= max_mom_squared; offset++){
//        for(int p = 0; p <= max_mom_squared; p++){
//          if((p + offset) <= max_mom_squared){
//            for(int p_u = p_min; p_u < p_max; ++p_u){
//              if((rewr.mom_squared[p_u] == p) && ((p + offset) <= max_mom_squared)){
//                for(int p_d = p_min; p_d < p_max; ++p_d){
//                  if(rewr.mom_squared[p_d] == (p + offset)){
//            			  //printf(
//            				//  	"\t t\tRe(C4_2_con)\tIm(C4_2_con)\n\t----------------------------------\n");
////            			  for(int t1 = 0; t1 < Lt; ++t1){
////            				  printf("\t%02d\t%.5e\t%.5e\n", t1, real(C4_mes[p][p][dirac][dirac][t1]),
////            				      imag(C4_mes[p][p][dirac][dirac][t1]));
////                    }
//            			  printf("\n");
//                    printf("p_u = %02d\tp_d = %02d\n", p_u, p_d);
//            		  }
//                }
//              }
//            }
//          }
//          printf("\n");
//        }
//      }
//      printf("\n");
//    }

    time = clock() - time;
    printf("\t\tSUCCESS - %.1f seconds\n", ((float) time)/CLOCKS_PER_SEC);

#endif // 4-point contraction 2
    // *************************************************************************
    // FOUR PT CONTRACTION 3 ***************************************************
    // *************************************************************************

    // TODO: check dirac indices. maybe dirac(t_source) and dirac(t_sink) have
    // to be equal or there may be four different structures rather than u- and
    // d-quark always having the same dirac structure
    // doesn't matter as long as all used dirac structures are equal

#if 0 // 4-point contraction 3
    std::cout << "\n\tcomputing the connected contribution of C4_3:\n";
    time = clock();

    // setting the correlation function to zero

    for(int p_u = 0; p_u < number_of_momenta; ++p_u)
      for(int p_d = 0; p_d < number_of_momenta; ++p_d)
        for(int dirac_u = 0; dirac_u < number_of_dirac; ++dirac_u)
          for(int dirac_d = 0; dirac_d < number_of_dirac; ++dirac_d)
            for(int t1 = 0; t1 < Lt; ++t1)
              C4_mes[p_u][p_d][dirac_u][dirac_d][t1] = 
                  std::complex<double>(0.0, 0.0);

    for(int t_source = 0; t_source < Lt; ++t_source){
      for(int t_sink = 0; t_sink < Lt; ++t_sink){

        int t_source_1 = (t_source + 1) % Lt;
        int t_sink_1 = (t_sink + 1) % Lt;

        // initialize basic->contraction[]
        // p_u = number_of_momenta/2 and the break; statement arrange
        // for one-to-all calculation in momentum space. (only one source
        // momentum is used. the first five are {(0,0,0), (0,0,1), 
        // (0,1,-1), (1,-1,-1), (0,0,2)}

        for(int dirac_u = 0; dirac_u < number_of_dirac; ++dirac_u){
          for(int p = 0; p <= max_mom_squared; p++){
            for(int p_u = number_of_momenta/2; p_u < p_max; ++p_u){
              if(mom_squared[p_u] == p){
                basic->init_operator_u(0, t_source, t_sink, rewr, 'b', p_u, 0);
                basic->init_operator_u(1, t_source_1, t_sink_1, rewr, 'b', 
                    number_of_momenta - p_u - 1, 0);
                break;
              }
            }
          }
        }

        // initialize basic->contraction_dagger[]
        // build all momenta for sinks

        for(int dirac_d = 0; dirac_d < number_of_dirac; ++dirac_d){
          for(int p_d = p_min; p_d < p_max; ++p_d){

            basic->init_operator_d(0, t_source_1, t_sink, rewr, 'b', p_d, 0);
            basic->init_operator_d(1, t_source, t_sink_1, rewr, 'b', 
                number_of_momenta - p_d - 1, 0);
          }
        }

        // build 4pt-function C4_mes for pi^+pi^+ Equivalent two just summing
        // up the four-trace with same time difference between source and sink 
        // (all to all) for every dirac structure, momentum
        // displacement not supported at the moment
        // to build the trace with four matrices, build combinations 
        // X = D_d^-1(t_sink | t_source + 1) 
        //     Gamma D_u^-1(t_source + 1 | t_sink + 1) Gamma
        // Y = D_d^-1(t_sink + 1| t_source) 
        //     Gamma D_u^-1(t_source| t_sink) Gamma
        // these have dimension
        // (4 * quarks[0].number_of_dilution_E) x (4 * 
        //     quarks[0].number_of_dilution_E)
        // thus the multiplication in this order is fastest

        for(int dirac_u = 0; dirac_u < number_of_dirac; ++dirac_u){     
          for(int p = 0; p <= max_mom_squared; p++){
             for(int p_u = number_of_momenta / 2; p_u < p_max; ++p_u) {
              if(mom_squared[p_u] == p){
                for(int dirac_d = 0; dirac_d < number_of_dirac; ++dirac_d){
                   for(int p_d = p_min; p_d < p_max; ++p_d) {
                    if(mom_squared[p_u] <= mom_squared[p_d]){

                      // initialisation of X. rnd loops and if-statements rule
                      // forbidden randomvector combinations (to improve 
                      // statistical error never use the same randomvector
                      // for different indices
    
                      basic->get_operator_g5(op_2, 0, dirac_min + dirac_d, 
                          number_of_momenta - p_d - 1);
                      basic->get_operator_charged(op_3, 1, t_sink_1, &rewr, 
                          dirac_min + dirac_u, number_of_momenta - p_u - 1);
          
                      // second u quark: t_source_1 -> t_sink_1

                      for(int rnd3 = 1; rnd3 < number_of_rnd_vec; ++rnd3){
                        for(int rnd2 = 0; rnd2 < number_of_rnd_vec; ++rnd2){
                          if(rnd2 != rnd3){

                            // first d quark: t_sink_1 -> t_source

                            for(int rnd4 = rnd2 + 1; rnd4 < number_of_rnd_vec; ++rnd4){
                              if(rnd4 != rnd3){

                                X[rnd3][rnd2][rnd4] = op_2[rnd3] * 
                                    op_3[rnd2][rnd4] ;
                              }
                            }
                          }
                        }
                      }

                      // initialisation of Y. see initialisation of X
    
                      basic->get_operator_g5(op_4, 1, dirac_min + dirac_d, p_d);
                      basic->get_operator_charged(op_1, 0, t_sink, rewr, 
                          dirac_min + dirac_u, p_u);
    
                      // first u quark: t_source -> t_sink

                      for(int rnd1 = 0; rnd1 < number_of_rnd_vec; ++rnd1){
                        for(int rnd3 = rnd1 + 1; rnd3 < number_of_rnd_vec; ++rnd3){      

                          // second d quark: t_sink -> t_source_1

                          for(int rnd4 = 1; rnd4 < number_of_rnd_vec; ++rnd4){
                            if((rnd4 != rnd1) && (rnd4 != rnd3)){

                              Y[rnd4][rnd1][rnd3] = op_4[rnd4] * 
                                  op_1[rnd1][rnd3];
                            }
                          }
                        }
                      }
              
                      // complete diagramm. combine X and Y to four-trace
                      // C4_mes = tr(D_u^-1(t_source| t_sink) Gamma 
                      //     D_d^-1(t_sink | t_source + 1) Gamma 
                      //     D_u^-1(t_source + 1 | t_sink + 1) Gamma 
                      //     D_d^-1(t_sink + 1| t_source) Gamma)
                      // every quark line must have its own random vec

                      for(int rnd1 = 0; rnd1 < number_of_rnd_vec; ++rnd1){
                        for(int rnd3 = rnd1 + 1; rnd3 < number_of_rnd_vec; ++rnd3){
                          for(int rnd2 = 0; rnd2 < number_of_rnd_vec; ++rnd2){      
                            if((rnd2 != rnd1) && (rnd2 != rnd3)){
                              for(int rnd4 = rnd2 + 1; rnd4 < number_of_rnd_vec; ++rnd4){
                                if((rnd4 != rnd1) && (rnd4 != rnd3)){

                                  C4_mes[p_u][p_d][dirac_u][dirac_d]
                                      [abs((t_sink - t_source - Lt) % Lt)] += 
                                    ((X[rnd3][rnd2][rnd4] * 
                                      Y[rnd4][rnd1][rnd3]).trace());
                                }
                              }
                            }
                          }
                        }
                      }
    
                    }
                  }
                }

                break;

              }
            }
          }
        }

      }
    }

    // Normalization of 4pt-function. Accounts for all rnd-number combinations

    for(int p1 = 0; p1 < number_of_momenta; ++p1)
      for(int p2 = 0; p2 < number_of_momenta; ++p2)
        for(int dirac_u = 0; dirac_u < number_of_dirac; ++dirac_u)
          for(int dirac_d = 0; dirac_d < number_of_dirac; ++dirac_d)
            for(int t = 0; t < Lt; ++t)
              C4_mes[p1][p2][dirac_u][dirac_d][t] /= norm1;


    // output to binary file

    // see output to binary file for C2. 
    // for the C4_3 diagram the four propagators are connected in the same
    // trace. Thus there are no gluon lines which could be cut to create a
    // disconnected diagrams and thus no Zweig suppression.
    // To build a GEVP, the correlators are written into a seperate folder
    // for every dirac structure, momentum, (entry of the GEVP matrix).
    // displacement is not supported at the moment

    for(int dirac_u = 0; dirac_u < number_of_dirac; ++dirac_u){
      for(int dirac_d = 0; dirac_d < number_of_dirac; ++dirac_d){
        for(int p1 = 0; p1 <= max_mom_squared; p1++){
          for(int p2 = p1; p2 <= max_mom_squared; p2++){

            sprintf(outfile, 
                "%s/dirac_%02d_%02d_p_%01d_%01d_displ_%01d_%01d/"
                "C4_3_conf%04d.dat", 
                outpath.c_str(), dirac_min + dirac_u, dirac_min + dirac_d, 
                p1, p2, displ_min, displ_max, config_i);
            if((fp = fopen(outfile, "wb")) == NULL)
              std::cout << "fail to open outputfile" << std::endl;

            for(int p_u = number_of_momenta / 2; p_u < p_max; ++p_u){
              if(mom_squared[p_u] == p1){
                for(int p_d = p_min; p_d < p_max; ++p_d){
                  if(mom_squared[p_d] == p2){

                    fwrite((double*) &(C4_mes[p_u][p_d][dirac_u][dirac_d][0]), 
                        sizeof(double), 2 * Lt, fp);
                  }
                }

                break;

              }
            }

            fclose(fp);

          }
        }
      }
    }

#if 0
    sprintf(outfile, 
        "%s/dirac_%02d_%02d_p_0_%01d_displ_%01d_%01d/C4_3_conf%04d.dat", 
        outpath.c_str(), dirac_min, dirac_max, 0, displ_min, 
        displ_max, config_i);
    if((fp = fopen(outfile, "wb")) == NULL)
      std::cout << "fail to open outputfile" << std::endl;
    for(int dirac = dirac_min; dirac < dirac_max + 1; ++dirac)
      fwrite((double*) C4_mes[number_of_momenta/2]
          [number_of_momenta/2][dirac][dirac], sizeof(double), 2 * Lt, fp);
    fclose(fp);
#endif

    // output to terminal
//    printf("\n");
//    for(int dirac = dirac_min; dirac < dirac_max + 1; ++dirac){
//      printf("\tdirac    = %02d\n", dirac);
//      for(int p = p_min; p < p_max; ++p) {
//        printf("\tmomentum = %02d\n", p);
//        //printf(
//        //    "\t t\tRe(C4_3_con)\tIm(C4_3_con)\n\t----------------------------------\n");
//        for(int t1 = 0; t1 < Lt; ++t1){
//          printf("\t%02d\t%.5e\t%.5e\n", t1, real(C4_mes[p][p][dirac][dirac][t1]),
//              imag(C4_mes[p][p][dirac][dirac][t1]));
//        }
//        printf("\n");
//      }
//      printf("\n");
//    }


    time = clock() - time;
    printf("\t\tSUCCESS - %.1f seconds\n", ((float) time)/CLOCKS_PER_SEC);
#endif // 4-point contraction 3
    // *************************************************************************
    // FOUR PT CONTRACTION 4 ***************************************************
    // *************************************************************************

    // identical to FOUR PT CONTRACTION 3

  } // loop over configs ends here

  // TODO: freeing all memory!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

  delete rewr;
  delete basic;
}
/**
 * Compute the four-point function of a single HEALPix map for every
 * quadrilateral list file found under a common prefix.
 *
 * Command line: <map FITS file> <quadrilateral list prefix> [<mask FITS file>]
 *
 * For every bin one line is written to standard output: the bin value
 * multiplied by pi/180, the cosine of that angle, and the four-point
 * function of the bin. The index of each bin is echoed to standard error
 * while it is being processed.
 *
 * The process terminates with exit status 1 if a quadrilateral list cannot
 * be initialised or if its Nside differs from that of the map or the mask.
 */
int main (int argc, char *argv[])
{
  if ((argc < 3) || (argc > 4)) usage (argv[0]);
  std::string mapfile = argv[1];
  std::string quad_list_prefix = argv[2];

  Healpix_Map<double> map;
  read_Healpix_map_from_fits (mapfile, map);
  bool have_mask = false;
  Healpix_Map<double> mask;
  if (argc == 4) {
    read_Healpix_map_from_fits (argv[3], mask);
    have_mask = true;
  }

  // Figure out how many bins there are by trying to open files.
  std::vector<std::string> quad_list_files
    = Npoint_Functions::get_range_file_list(quad_list_prefix, 0, 180);

  // One entry per quadrilateral list file; each loop iteration below writes
  // only its own index k, so these vectors need no locking.
  std::vector<double> bin_list(quad_list_files.size());
  std::vector<double> Corr(quad_list_files.size());

#pragma omp parallel shared(Corr, bin_list, quad_list_files)
  {
    // Each thread owns its own list reader.
    Npoint_Functions::Quadrilateral_List_File<int> qlf;

#pragma omp for schedule(dynamic,2)
    for (size_t k=0; k < quad_list_files.size(); ++k) {

      if (! qlf.initialize (quad_list_files[k])) {
        std::cerr << "Error initializing quadrilateral list from "
                  << quad_list_files[k] << std::endl;
        std::exit(1);
      }
      if (static_cast<size_t>(map.Nside()) != qlf.Nside()) {
        std::cerr << "Map has Nside = " << map.Nside()
                  << " but quad list has Nside = " << qlf.Nside()
                  << "\nGiving up!\n";
        std::exit(1);
      }
      if (have_mask
          && (static_cast<size_t>(mask.Nside()) != qlf.Nside())) {
        std::cerr << "Mask and quadrilateral lists do not have"
                  << " the same Nside: " << mask.Nside() 
                  << " != " << qlf.Nside() << std::endl;
        std::exit(1);
      }

      // map and mask are shared by all threads. The check-and-swap has to be
      // serialised, otherwise two threads can both see a mismatching scheme
      // and both swap (or swap concurrently).
      // NOTE(review): this assumes all quad list files use the same scheme;
      // if they do not, a swap could still happen while another thread is
      // reading the map -- confirm against the data.
#pragma omp critical (scheme_swap)
      {
        if (map.Scheme() != qlf.Scheme()) map.swap_scheme();
        if (have_mask && (mask.Scheme() != qlf.Scheme())) mask.swap_scheme();
      }

      // Progress report; serialised so that lines are not interleaved.
#pragma omp critical
      {
        std::cerr 
#ifdef OMP
          << omp_get_thread_num() << " "
#endif       
          << k << std::endl;
      }

      bin_list[k] = qlf.bin_value();
      if (have_mask) {
        Corr[k] = calculate_masked_fourpoint_function (map, mask, qlf);
      } else {
        Corr[k] = calculate_fourpoint_function (map, qlf);
      }
    }
  }
  
  for (size_t k=0; k < bin_list.size(); ++k) {
    // Same format as spice
    std::cout << bin_list[k]*M_PI/180 << " " 
              << cos(bin_list[k]*M_PI/180) << " "
              << Corr[k] << std::endl;
  }

  return 0;
}
/**
 * @brief Run the K-SVD iterations on a set of patches and export the
 *        resulting sparse coefficients.
 *
 * Every iteration normalises the dictionary, sparse-codes all patches with
 * ormp_process(), and then updates each atom in turn from svd_trunc()
 * applied to the residual of the patches that use this atom. After the last
 * iteration the coefficients of the final sparse code are written to gamma.
 *
 * @param patches : matrix of patches, one patch per row; every row is
 *                  expected to have the length of patches[0]. Read only.
 *                  If it is empty the function returns immediately;
 * @param dictionary : initial dictionary, one atom per row (at least N2
 *                     rows of patch length), updated in place in each
 *                     iteration of the algo;
 * @param gamma : output coefficients, indexed as gamma[patch][atom]. It
 *                must already hold at least patches.size() rows (the row
 *                index is not bounds-checked) of at least N2 entries each
 *                (checked with at(), which throws std::out_of_range).
 *                Only the entries selected by the final sparse coding are
 *                written; all other entries keep their previous value;
 * @param N1 : size of the features; it sets the ORMP threshold
 *             eps = N1 * C * C and the extent of the correction loops
 *             (presumably equal to the patch length -- TODO confirm);
 * @param N2 : number of atoms in the dictionary;
 * @param N_iter : number of iteration;
 * @param C : coefficient used for the stopping criteria of
 *            the ORMP.
 *
 * @return none.
 **/
void ksvd_process(matD_t        &patches,
                  matD_t        &dictionary,
                  matD_t		&gamma,
                  const unsigned N1, // size of features (i.e. 324)
                  const unsigned N2, // size of the dictionary (i.e. 1000)
                  const unsigned N_iter, // i.e. 40
                  const double   C)
{
	//! Nothing to learn from (and patches[0] below would be invalid)
	if (patches.empty())
		return;

	//! Declarations
	const unsigned N1_2 = N1;
	//! The colour correction is disabled: with corr == 0 the correcting
	//! matrix built below stays the identity.
	const double   corr = 0; //(sqrtl(1.0l + gamma) - 1.0l) / ((double) N1_2);
	const unsigned chnls = 1;
	const double   eps  = ((double) (N1_2)) * C * C;
	const unsigned h_p  = patches[0].size(); // length of one patch
	const unsigned w_p  = patches.size();    // number of patches

	//! Mat & Vec initializations
	matD_t dict_ormp   (N2 , vecD_t(h_p, 0.0l));
	matD_t patches_ormp(w_p, vecD_t(h_p, 0.0l));
	matD_t tmp         (h_p, vecD_t(N2, 0.0l));
	vecD_t normCol     (N2);
	matD_t Corr        (h_p, vecD_t(h_p, 0.0l));
	vecD_t U           (h_p);
	vecD_t V;
	matD_t E           (w_p, vecD_t(h_p));

	//! Vector for ORMP
	matD_t ormp_val        (w_p, vecD_t ());
	matU_t ormp_ind        (w_p, vecU_t ());
	matD_t res_ormp        (N2, vecD_t (w_p));
	matU_t omega_table     (N2, vecU_t ());
	vecU_t omega_size_table(N2, 0);
	matD_t alpha           (N2, vecD_t ()); // exported through gamma at the end

	//! To avoid reallocation of memory
	for (unsigned k = 0; k < w_p; k++)
	{
		ormp_val[k].reserve(N2);
		ormp_ind[k].reserve(N2);
	}

	for (matU_t::iterator it = omega_table.begin(); it < omega_table.end(); it++)
		it->reserve(w_p);

	V.reserve(w_p);

	//! Correcting matrix: identity plus corr on the leading N1 x N1 block
	for (unsigned i = 0; i < h_p; i++)
		Corr[i][i] = 1.0l;

	for (unsigned c = 0; c < 1; c++)
	{
		matD_t::iterator it_Corr = Corr.begin() + N1_2 * c;
		for (unsigned i = 0; i < N1_2; i++, it_Corr++)
		{
			iterD_t it = it_Corr->begin() + N1_2 * c;
			for (unsigned j = 0; j < N1_2; j++, it++)
				(*it) += corr;
		}
	}

	//! Corrected copy of the patches, used as input of the ORMP.
	//! The loop index stays a signed int (as in the original OpenMP loop);
	//! the bound is converted once so that signed and unsigned are not mixed.
	const int n_patches = static_cast<int>(w_p);
	#pragma omp parallel for
	for (int j = 0; j < n_patches; j++)
	{
		for (unsigned c = 0; c < chnls; c++)
		{
			iterD_t it_ormp = patches_ormp[j].begin() + c * N1_2;
			iterD_t it = patches[j].begin() + c * N1_2;
			for (unsigned i = 0; i < N1_2; i++, it++, it_ormp++)
			{
				double val = 0.0l;
				iterD_t it_tmp = patches[j].begin() + c * N1_2;
				for (unsigned k = 0; k < N1_2; k++, it_tmp++)
					val += corr * (*it_tmp);
				(*it_ormp) = val + (*it);
			}
		}
	}

	//! Big loop
	for (unsigned iter = 0; iter < N_iter; iter++)
	{
		std::cout << "Step " << iter + 1 << ":" << std::endl;
		std::cout << " - Sparse coding" << std::endl;

		//! tmp[i][j] = square of entry i of (Corr * atom j)
		for (unsigned i = 0; i < h_p; i++)
		{
			iterD_t it_tmp = tmp[i].begin();
			for (unsigned j = 0; j < N2; j++, it_tmp++)
			{
				double val = 0.0l;
				iterD_t it_corr_i = Corr[i].begin();
				iterD_t it_dict_j = dictionary[j].begin();
				for (unsigned k = 0; k < h_p; k++, it_corr_i++, it_dict_j++)
					val += (*it_corr_i) * (*it_dict_j);
				(*it_tmp) = val * val;
			}
		}

		//! normCol[j] = 1 / (Euclidean norm of the corrected atom j)
		iterD_t it_normCol = normCol.begin();
		for (unsigned j = 0; j < N2; j++, it_normCol++)
		{
			double val = 0.0l;
			for (unsigned i = 0; i < h_p; i++)
				val += tmp[i][j];
			(*it_normCol) = 1.0l / sqrtl(val);
		}

		//! dict_ormp = corrected dictionary with unit-norm atoms
		for (unsigned i = 0; i < h_p; i++)
		{
			iterD_t it_normCol_j = normCol.begin();
			for (unsigned j = 0; j < N2; j++, it_normCol_j++)
			{
				double val = 0.0l;
				iterD_t it_corr_i  = Corr[i].begin();
				iterD_t it_dict_j = dictionary[j].begin();
				for (unsigned k = 0; k < h_p; k++, it_corr_i++, it_dict_j++)
					val += (*it_corr_i) * (*it_dict_j);
				dict_ormp[j][i] = val * (*it_normCol_j);
			}
		}

		//! ORMP process
		std::cout << " - ORMP process" << std::endl;
		ormp_process(patches_ormp, dict_ormp, ormp_ind, ormp_val, N2, eps);

		//! Rescale the coefficients by the normalisation factor of their atom
		for (unsigned i = 0; i < w_p; i++)
		{
			iterU_t it_ind = ormp_ind[i].begin();
			iterD_t it_val = ormp_val[i].begin();
			const unsigned size = ormp_val[i].size();
			for (unsigned j = 0; j < size; j++, it_ind++, it_val++)
				(*it_val) *= normCol[*it_ind];
		}

		//! Residus: reset the per-atom tables before refilling them
		for (unsigned i = 0; i < N2; i++)
		{
			omega_size_table[i] = 0;
			omega_table[i].clear();
			alpha[i].clear();
			for (iterD_t it = res_ormp[i].begin(); it < res_ormp[i].end(); it++)
				*it = 0.0l;
		}

		//! For every atom record which patches use it (omega_table), how
		//! many (omega_size_table) and with which coefficient (alpha);
		//! res_ormp holds the same coefficients as a dense N2 x w_p matrix.
		for (unsigned i = 0; i < w_p; i++)
		{
			iterU_t it_ind = ormp_ind[i].begin();
			iterD_t it_val = ormp_val[i].begin();
			for (unsigned j = 0; j < ormp_val[i].size(); j++, it_ind++, it_val++)
			{
				omega_table[*it_ind].push_back(i);
				omega_size_table[*it_ind]++;
				alpha[*it_ind].push_back(*it_val);
				res_ormp[*it_ind][i] = *it_val;
			}
		}

		//! Dictionary update
		std::cout << " - Dictionary update" << std::endl;
		for (unsigned l = 0; l < N2; l++)
		{
			//! Initializations
			const unsigned omega_size = omega_size_table[l];
			iterD_t it_dict_l = dictionary[l].begin();
			iterD_t it_alpha_l = alpha[l].begin();
			iterU_t it_omega_l = omega_table[l].begin();
			U.assign(U.size(), 0.0l);

			//! Atoms that no patch uses are left untouched
			if (omega_size > 0)
			{
				//! E[j] = patch + contribution of atom l; the contribution
				//! of all atoms (l included) is subtracted just below, so
				//! E ends up as the residual without atom l.
				iterD_t it_a = it_alpha_l;
				iterU_t it_o = it_omega_l;
				for (unsigned j = 0; j < omega_size; j++, it_a++, it_o++)
				{
					iterD_t it_d = it_dict_l;
					iterD_t it_e = E[j].begin();
					iterD_t it_p = patches[*it_o].begin();
					for (unsigned i = 0; i < h_p; i++, it_d++, it_e++, it_p++)
						(*it_e) = (*it_p) + (*it_d) * (*it_a);
				}

				matD_t::iterator it_res = res_ormp.begin();
				for (unsigned k = 0; k < N2; k++, it_res++)
				{
					iterU_t it_o = it_omega_l;
					iterD_t it_dict_k = dictionary[k].begin();
					for (unsigned j = 0; j < omega_size; j++, it_o++)
					{
						const double val = (*it_res)[*it_o];
						if (fabs(val) > 0.0l)
						{
							iterD_t it_d = it_dict_k;
							iterD_t it_e = E[j].begin();
							for (unsigned i = 0; i < h_p; i++, it_d++, it_e++)
								(*it_e) -= (*it_d) * val;
						}
					}
				}

				//! SVD truncated
				V.resize(omega_size);
				double S = svd_trunc(E, U, V);

				//! U becomes the new atom, S * V its new coefficients
				dictionary[l] = U;

				it_a = it_alpha_l;
				iterD_t it_v = V.begin();
				it_o = it_omega_l;
				for (unsigned j = 0; j < omega_size; j++, it_a++, it_v++, it_o++)
					res_ormp[l][*it_o] = (*it_a) = (*it_v) * S;
			}
		}
		std::cout << " - done." << std::endl;
	}


	// Export the sparse code: for every atom i, copy the coefficient of each
	// patch that uses it into gamma[patch][i]. The coefficient is kept in
	// double precision (it used to be squeezed through a float temporary).
	for(unsigned i = 0; i < N2; i++)
	{
		for(unsigned j = 0; j < omega_size_table[i]; j++)
		{
			const unsigned pI = omega_table[i].at(j);
			const double alphaV = alpha[i].at(j);
			gamma[pI].at(i) = alphaV;
		}
	}
}
/**
 * Compute the four-point function of a sequence of maps synthesised from
 * a_lm files, for every quadrilateral list file found under a common prefix.
 *
 * Command line:
 *   <quadrilateral list prefix> <alm directory> <Nstart> <Nend> [<mask FITS>]
 *
 * Maps are built from the a_lm files numbered Nstart .. Nend-1. The output
 * on standard output is two comment lines, one line with the bin values,
 * and then one line per map with its four-point function in every bin.
 */
int main (int argc, char *argv[])
{
  if ((argc < 5) || (argc > 6)) usage (argv[0]);
  std::string quad_list_prefix = argv[1];
  std::string alm_dir = argv[2];
  size_t Nstart, Nend;
  if (! Npoint_Functions::from_string (argv[3], Nstart)) {
    std::cerr << "Could not parse Nstart\n";
    usage (argv[0]);
  }
  if (! Npoint_Functions::from_string (argv[4], Nend)) {
    std::cerr << "Could not parse Nend\n";
    usage (argv[0]);
  }
  // Nend - Nstart is computed in size_t below; a reversed range would wrap
  // around to a huge number of maps.
  if (Nend < Nstart) {
    std::cerr << "Nend (" << Nend << ") must not be smaller than Nstart ("
              << Nstart << ")\n";
    std::exit(1);
  }
  bool have_mask = false;
  Healpix_Map<double> mask;
  if (argc == 6) {
    read_Healpix_map_from_fits (argv[5], mask);
    have_mask = true;
  }

  // Figure out how many bins there are by trying to open files.
  std::vector<std::string> quad_list_files
    = Npoint_Functions::get_range_file_list(quad_list_prefix, 0, 400);
  if (quad_list_files.size() == 0) {
    std::cerr << "No quad list files found!\n";
    usage (argv[0]);
  }

  int Lmax;
  std::vector<Healpix_Map<double> > maps (Nend-Nstart);
  // Make maps
  {
    // The first quad list provides the Nside and scheme for all maps.
    Npoint_Functions::Quadrilateral_List_File<int> qlf;
    if (! qlf.initialize (quad_list_files[0])) {
      std::cerr << "Error initializing quadrilateral list from "
                << quad_list_files[0] << std::endl;
      std::exit(1);
    }
    if (have_mask) {
      if (static_cast<size_t>(mask.Nside()) != qlf.Nside()) {
        std::cerr << "Mask and quadrilateral lists do not have"
                  << " the same Nside: " << mask.Nside() 
                  << " != " << qlf.Nside() << std::endl;
        std::exit(1);
      }
      if (mask.Scheme() != qlf.Scheme()) mask.swap_scheme();
    }
    // Band limit: 4*Nside+1, capped at 200. The explicit size_t keeps
    // std::min well-formed on platforms where size_t is not unsigned long.
    Lmax = static_cast<int>(std::min<size_t>(200, 4*qlf.Nside()+1));
    //#pragma omp parallel shared(qlf, maps)
    {
      Alm<xcomplex<double> > alm (Lmax, Lmax);
      //#pragma omp for schedule(static)
      for (size_t k=0; k < maps.size(); ++k) {
        read_Alm_from_fits (dirtree::filename(alm_dir, "alm_T_", ".fits",
                                              k+Nstart),
                            alm, Lmax, Lmax);
        maps[k].SetNside (qlf.Nside(), RING);
        alm2map (alm, maps[k]);
        if (maps[k].Scheme() != qlf.Scheme()) maps[k].swap_scheme();
      }
    }
  }

  std::vector<double> bin_list(quad_list_files.size());
  /* We will generate this by bin for each map so make the bin number the
   * first index. */
  std::vector<std::vector<double> > Corr(quad_list_files.size());

  // Each iteration writes only bin_list[k] and Corr[k]; maps and mask are
  // passed to the calculation routines unchanged from here on.
#pragma omp parallel shared(Corr, bin_list, quad_list_files, maps, mask)
  {
    Npoint_Functions::Quadrilateral_List_File<int> qlf;
    
#pragma omp for schedule(dynamic,2)
    for (size_t k=0; k < quad_list_files.size(); ++k) {
      if (! qlf.initialize (quad_list_files[k])) {
        std::cerr << "Error initializing quadrilateral list from "
                  << quad_list_files[k] << std::endl;
        std::exit(1);
      }

      bin_list[k] = qlf.bin_value();
      if (have_mask) {
        Npoint_Functions::calculate_masked_fourpoint_function_list
          (maps, mask, qlf, Corr[k]);
      } else {
        Npoint_Functions::calculate_fourpoint_function_list
          (maps, qlf, Corr[k]);
      }
    }
  }
  
  std::cout << "# LCDM four point function from " << quad_list_prefix
            << std::endl;
  std::cout << "# First line is bin values, rest are the four point function.\n";
  for (size_t k=0; k < bin_list.size(); ++k) {
    std::cout << bin_list[k] << " ";
  }
  std::cout << std::endl;

  // One output line per map.
  // NOTE(review): assumes the list routines above fill Corr[k] with one
  // value per map -- confirm against their implementation.
  for (size_t j=0; j < maps.size(); ++j) {
    for (size_t k=0; k < bin_list.size(); ++k) {
      std::cout << Corr[k][j] << " ";
    }
    std::cout << std::endl;
  }

  return 0;
}