Beispiel #1
0
void SendTime(ArgStruct *p, double *t)
{
   *pTime=*t;

   shmem_double_put(pTime,pTime,1,p->prot.nbor);
   shmem_int_put(p->prot.flag,p->prot.flag,1,p->prot.nbor);
}
Beispiel #2
0
/* 1. */
void xchange_halffield() {

#ifdef _KOJAK_INST
#pragma pomp inst begin(xchangehalf)
#endif
#  ifdef MPI

  shmem_barrier_all();

  shmem_double_put((double*)(HalfSpinor + 4*VOLUME + RAND/2 + LX*LY*LZ/2), 
		   (double*)(HalfSpinor + 4*VOLUME),
                   (LX*LY*LZ*6), g_nb_t_up);
  shmem_double_put((double*)(HalfSpinor + 4*VOLUME + RAND/2), 
		   (double*)(HalfSpinor + 4*VOLUME + LX*LY*LZ/2),
                   (LX*LY*LZ*6), g_nb_t_dn);

#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
  shmem_double_put((double*)(HalfSpinor + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ/2), 
		   (double*)(HalfSpinor + 4*VOLUME + LX*LY*LZ),
                   (T*LY*LZ*6), g_nb_x_up);
  shmem_double_put((double*)(HalfSpinor + 4*VOLUME + RAND/2 + LX*LY*LZ),
		   (double*)(HalfSpinor + 4*VOLUME + LX*LY*LZ + T*LY*LZ/2),
                   (T*LY*LZ*6), g_nb_x_dn);

#    endif

#    if (defined PARALLELXYT || defined PARALLELXYZT)
  shmem_double_put((double*)(HalfSpinor + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ + T*LX*LZ/2),
		   (double*)(HalfSpinor + 4*VOLUME + LX*LY*LZ + T*LY*LZ),
                   (T*LX*LZ*6), g_nb_y_up);
  shmem_double_put((double*)(HalfSpinor + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ),
		   (double*)(HalfSpinor + 4*VOLUME + LX*LY*LZ + T*LY*LZ + T*LX*LZ/2),
                   (T*LX*LZ*6), g_nb_y_dn);

#    endif

#    if (defined PARALLELXYZT)
  shmem_double_put((double*)(HalfSpinor + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ + T*LX*LZ + T*LX*LY/2),
		   (double*)(HalfSpinor + 4*VOLUME + LX*LY*LZ + T*LY*LZ + T*LX*LZ),
                   (T*LX*LY*6), g_nb_z_up);
  shmem_double_put((double*)(HalfSpinor + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ + T*LX*LZ),
		   (double*)(HalfSpinor + 4*VOLUME + LX*LY*LZ + T*LY*LZ + T*LX*LZ + T*LX*LY/2),
                   (T*LX*LY*6), g_nb_z_dn);

#    endif

  shmem_barrier_all();
#  endif /* MPI */
  return;
#ifdef _KOJAK_INST
#pragma pomp inst end(xchangehalf)
#endif
}
Beispiel #3
0
int main(int argc, char *argv[])
{
  int size, rank, world_rank, my_group;
  int num_lsms; // number of parallel LSMS instances
  int size_lsms; // number of atoms in a lsms instance
  int num_steps; // number of energy calculations
  int initial_steps; // number of steps before sampling starts
  int stepCount=0; // count the Monte Carlo steps executed
  double max_time; // maximum walltime for this run in seconds
  bool restrict_time = false;       // was the maximum time specified?
  bool restrict_steps = false; // or the max. numer of steps?
  int align; // alignment of lsms_instances
  
  double magnetization;
  double energy_accumulator; // accumulates the enegy to calculate the mean
  int energies_accumulated;


  int new_peid,new_root;
  static int op,flag;
  double *evec,*r_values;
  evec=(double *)shmalloc(sizeof(double)*3*size_lsms);
  r_values=(double *)shmalloc(sizeof(double)*(R_VALUE_OFFSET+3*(size_lsms+1)));




  energy_accumulator=0.0;
  energies_accumulated=0;

  double walltime_0,walltime;

  double restartWriteFrequency=30.0*60.0;
  double nextWriteTime=restartWriteFrequency;

  MPI_Comm local_comm;
  int *lsms_rank0;
  MPI_Status status;

  char prefix[40];
  char i_lsms_name[64];
  char gWL_in_name[64], gWL_out_name[64];
  char mode_name[64];
  char energy_calculation_name[64];
  char stupid[37];

  char step_out_name[64];
  char wl_step_out_name[128];
  char *wl_stepf=NULL;
  bool step_out_flag=false;
  std::ofstream step_out_file;
  typedef enum {Constant, Random, WangLandau_1d, ExhaustiveIsing, WangLandau_2d} EvecGenerationMode;
  typedef enum {MagneticMoment, MagneticMomentZ, MagneticMomentX, MagneticMomentY} SecondDimension;

  EvecGenerationMode evec_generation_mode = Constant;
  SecondDimension second_dimension = MagneticMoment;
  double ev0[3];

  bool return_moments_flag=true; // true-> return all magnetic moments from lsms run at each step.
  bool generator_needs_moment=false;

  typedef enum {OneStepEnergy, MultiStepEnergy, ScfEnergy} EnergyCalculationMode;
  EnergyCalculationMode energyCalculationMode = OneStepEnergy;
  int energyIndex=1; // index for the return value to use for the MC step (0: total energy, 1: band energy)

  ev0[0]=ev0[1]=0.0; ev0[2]=1.0;
  // size has to be align + size_lsms*num_lsms
  align=1;
  num_lsms=1;
  size_lsms=-1;
  my_group=-1;
  num_steps=1;
  initial_steps=0;

  sprintf(i_lsms_name,"i_lsms");
  gWL_in_name[0]=gWL_out_name[0]=0;
  mode_name[0]=0;
  energy_calculation_name[0]=0;

  // check command line arguments
  for(int i=0; i<argc; i++)
  {
    if(!strcmp("-num_lsms",argv[i])) num_lsms=atoi(argv[++i]);
    if(!strcmp("-size_lsms",argv[i])) size_lsms=atoi(argv[++i]);
    if(!strcmp("-align",argv[i])) align=atoi(argv[++i]);
    if(!strcmp("-num_steps",argv[i])) {num_steps=atoi(argv[++i]); restrict_steps=true;}
    if(!strcmp("-initial_steps",argv[i])) initial_steps=atoi(argv[++i]); 
    if(!strcmp("-walltime",argv[i])) {max_time=60.0*atof(argv[++i]); restrict_time=true;}
    if(!strcmp("-i",argv[i])) strncpy(i_lsms_name,argv[++i],64);
    if(!strcmp("-random_dir",argv[i])) {evec_generation_mode = Random;}
    if(!strcmp("-step_out",argv[i]))
    {strncpy(step_out_name,argv[++i],64); step_out_flag=true;
      return_moments_flag=true;}
    if(!strcmp("-wl_out", argv[i])) strncpy(gWL_out_name,argv[++i],64);
    if(!strcmp("-wl_in", argv[i])) strncpy(gWL_in_name,argv[++i],64);
    if(!strcmp("-mode", argv[i])) strncpy(mode_name,argv[++i],64);
    if(!strcmp("-energy_calculation",argv[i])) strncpy(energy_calculation_name,argv[++i],64);
  }

  if(!(restrict_steps || restrict_time)) restrict_steps=true;

  if(mode_name[0]!=0)
  {
    if(!strcmp("constant",mode_name)) evec_generation_mode = Constant;
    if(!strcmp("random",mode_name)) evec_generation_mode = Random;
    if(!strcmp("1d",mode_name)) evec_generation_mode = WangLandau_1d;
    if(!strcmp("ising",mode_name)) evec_generation_mode = ExhaustiveIsing;
    if(!strcmp("2d",mode_name)) evec_generation_mode = WangLandau_2d;
    if(!strcmp("2d-m",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMoment;}
    if(!strcmp("2d-x",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMomentX;}
    if(!strcmp("2d-y",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMomentY;}
    if(!strcmp("2d-z",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMomentZ;}
  }

  if(energy_calculation_name[0]!=0)
  {
    if(energy_calculation_name[0]=='o') { energyCalculationMode = OneStepEnergy; energyIndex=1; }
    if(energy_calculation_name[0]=='m') { energyCalculationMode = MultiStepEnergy; energyIndex=1; }
    if(energy_calculation_name[0]=='s') { energyCalculationMode = ScfEnergy; energyIndex=0; }
  }

#ifdef USE_PAPI
#define NUM_PAPI_EVENTS 4
  int hw_counters = PAPI_num_counters();
  if(hw_counters>NUM_PAPI_EVENTS) hw_counters=NUM_PAPI_EVENTS;
  int papi_events[NUM_PAPI_EVENTS]; // = {PAPI_TOT_INS,PAPI_TOT_CYC,PAPI_FP_OPS,PAPI_VEC_INS};
  char *papi_event_name[] = {"PAPI_TOT_INS","PAPI_FP_OPS",
                             "RETIRED_SSE_OPERATIONS:DOUBLE_ADD_SUB_OPS:DOUBLE_MUL_OPS:DOUBLE_DIV_OPS:OP_TYPE",
                             "RETIRED_SSE_OPERATIONS:SINGLE_ADD_SUB_OPS:SINGLE_MUL_OPS:SINGLE_DIV_OPS:OP_TYPE"};
  // "RETIRED_INSTRUCTIONS",
  // "RETIRED_MMX_AND_FP_INSTRUCTIONS:PACKED_SSE_AND_SSE2",
  // "RETIRED_SSE_OPERATIONS:DOUBLE_ADD_SUB_OPS:DOUBLE_MUL_OPS:DOUBLE_DIV_OPS:1",
  // "RETIRED_SSE_OPERATIONS:SINGLE_ADD_SUB_OPS:SINGLE_MUL_OPS:SINGLE_DIV_OPS:1"
  // get events from names:
  for(int i=0; i<NUM_PAPI_EVENTS; i++)
  {
    if(PAPI_event_name_to_code(papi_event_name[i],&papi_events[i]) != PAPI_OK)
    {
      // printline("Error in obtaining PAPI event code for: "+ttos(papi_event_name[i]),
      //           std::cerr,parameters.myrankWorld);
      // printline("Skipping all following events",
      //           std::cerr,parameters.myrankWorld);
      if(hw_counters>i) hw_counters=i;
    }
  }
  long long papi_values[NUM_PAPI_EVENTS+4];
  // printline("PAPI: "+ttos(hw_counters)+" counters available",std::cout,parameters.myrankWorld);
  if(hw_counters>NUM_PAPI_EVENTS) hw_counters=NUM_PAPI_EVENTS;
  long long papi_real_cyc_0 = PAPI_get_real_cyc();
  long long papi_real_usec_0 = PAPI_get_real_usec();
  long long papi_virt_cyc_0 = PAPI_get_virt_cyc();
  long long papi_virt_usec_0 = PAPI_get_virt_usec();
  PAPI_start_counters(papi_events,hw_counters);
#endif


  lsms_rank0=(int *)malloc(sizeof(int)*(num_lsms+1));

  // initialize MPI:
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  world_rank=rank;
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  walltime_0 = get_rtc();

#ifndef SVN_REV
#define SVN_REV "unknown"
#endif

// make sure 'return_moments_flag' is set correctly
  switch(evec_generation_mode)
  {
  case Constant : break;
  case Random : break;
  case WangLandau_1d :
    return_moments_flag = true;
    generator_needs_moment = true;
    break;
  case ExhaustiveIsing : break;
  case WangLandau_2d :
    return_moments_flag = true;
    generator_needs_moment = true;
    break;
  default: std::cout<<" ERROR: UNKNOWN EVEC GENERATION MODE\n"; exit(1);
  }

  if(rank==0)
  {
    std::cout<<"LSMS_3"<<std::endl;
    std::cout<<" SVN revision "<<SVN_REV<<std::endl<<std::endl;
#ifdef USE_PAPI
    std::cout<<" Using Papi counters"<<std::endl<<std::endl; 
#endif
    std::cout<<" Size of LSMS instances = "<<size_lsms<<" atoms\n";
    std::cout<<" Number of LSMS instances = "<<num_lsms<<std::endl;
    std::cout<<" LSMS Energy calculated using ";
    switch(energyCalculationMode)
    {
    case OneStepEnergy: std::cout<<"oneStepEnergy [frozen potential band energy]"<<std::endl; break;
    case MultiStepEnergy: std::cout<<"multiStepEnergy [frozen potential band energy with converged Fermi energy]"<<std::endl; break;
    case ScfEnergy: std::cout<<"scfEnergy [self-consistent total energy]"<<std::endl; break;
    default: std::cout<<"UNKNOWN ENERGY CALCULATION METHOD"<<std::endl; exit(1);
    }
    if(restrict_steps) std::cout<<" Number of gWL steps = "<<num_steps<<std::endl;
    if(restrict_time) std::cout<<" Maximum walltime = "<<max_time<<"s\n";
    std::cout<<" Processor alignment (process allocation quantization) = "<<align<<std::endl;
    switch(evec_generation_mode)
    {
    case Constant : std::cout<<" Constant moments direction along "
                             <<ev0[0]<<" "<<ev0[1]<<" "<<ev0[2]<<std::endl;
      break;
    case Random : std::cout<<" Random distribution of moments (no Wang-Landau)"<<std::endl;
      break;
    case WangLandau_1d : std::cout<<" Wang-Landau for one continuous variable (energy)"<<std::endl;
//      return_moments_flag = true;
//      generator_needs_moment = true;
      break;
    case ExhaustiveIsing : std::cout<<" Exhaustive Ising sampling"<<std::endl; break;
    case WangLandau_2d : std::cout<<" Wang-Landau for two continuous variable (energy, ";
      switch(second_dimension)
      {
      case MagneticMoment  : std::cout<<"magnitude of magnetization)"; break;
      case MagneticMomentX : std::cout<<"x component of magnetization)"; break;
      case MagneticMomentY : std::cout<<"y component of magnetization)"; break;
      case MagneticMomentZ : std::cout<<"z component of magnetization)"; break;
      }
      std::cout<<std::endl;
//      return_moments_flag = true;
//      generator_needs_moment = true;
      break;
    default: std::cout<<" ERROR: UNKNOWN EVEC GENERATION MODE\n"; exit(1);
    }
    if(step_out_flag) std::cout<<" Step output written to: "<<step_out_name<<std::endl;
    std::cout<<std::endl;

    if(step_out_flag && (evec_generation_mode==WangLandau_1d))
    {
      // step_out_flag=false;
      snprintf(wl_step_out_name,127,"wl1d_%s",step_out_name);
      wl_stepf=wl_step_out_name;
    }

    if(step_out_flag)
    {
      step_out_file.open(step_out_name);
      step_out_file<<"#";
      for(int i=0; i<argc; i++) step_out_file<<" "<<argv[i];
      step_out_file<<std::endl<<size_lsms<<std::endl;
    }
  }

  if(generator_needs_moment) return_moments_flag=true;

  if(num_lsms==1)
  {
    SHMEM_activeset local_comm;
    local_comm.rank=shmem_my_pe();
    local_comm.size=shmem_n_pes();
    local_comm.start_pe=0;
    local_comm.logPE_stride=0;
    LSMS lsms_calc(local_comm,i_lsms_name,"1_");
      
    if(rank==0)
    {
      std::cout<<"executing LSMS(C++) for "<<lsms_calc.numSpins()<<" atoms\n";
      std::cout<<"  LSMS version = "<<lsms_calc.version()<<std::endl;
    }

    if(energyCalculationMode==OneStepEnergy)
      std::cout<<"one step Energy = "<<lsms_calc.oneStepEnergy()<<std::endl;
    else if(energyCalculationMode==MultiStepEnergy)
      std::cout<<"multi-step Energy = "<<lsms_calc.multiStepEnergy()<<std::endl;
    else if(energyCalculationMode==ScfEnergy)
      std::cout<<"self-consistent Energy = "<<lsms_calc.scfEnergy()<<std::endl;
    else
    {
      printf("ERROR: Unknown energy calculation mode for lsms_calc in wl-lsms main!\n");
     // MPI_Abort(MPI_COMM_WORLD,5);
      exit(5);
    }
  }
  else
  {
    // build the communicators
    //int color=MPI_UNDEFINED;
    //Assuming user passes a power of two while using "-align"
    int s = align;
    int comm_size=(size-align)/num_lsms;
    int world_rank;
    for(int i=0; i<num_lsms; i++)
    {
      if((world_rank>=s) && (world_rank<s+comm_size)) 
      { 
        my_group=i; 
        //color=i; 
        new_peid=world_rank-s;
        new_root=s;
      }
      lsms_rank0[i]=s;
      s+=comm_size;
    }
    if(world_rank==0){ 
      //color=num_lsms;
      new_peid=0;
      comm_size=1;
      new_root=0;
    }

    //MPI_Comm_split(MPI_COMM_WORLD, color, 0, &local_comm);
    SHMEM_activeset local_comm;
    local_comm.rank=new_peid;
    local_comm.size=comm_size;
    local_comm.start_pe=new_root;
    local_comm.logPE_stride=0;

    std::cout<<"world_rank="<<world_rank<<" -> group="<<my_group<<std::endl;

      
    snprintf(prefix,38,"Group %4d: ",my_group);

    // now we get ready to do some calculations...

    if(my_group>=0)
    {
      double energy;
      double band_energy;
      int static i_values[10];
      double static r_values[10];
      static int op;


      //MPI_Comm_rank(local_comm, &rank);
      rank = local_comm.rank;
      snprintf(prefix,38,"%d_",my_group);
      // to use the ramdisk on jaguarpf:
      // snprintf(prefix,38,"/tmp/ompi/%d_",my_group);
      LSMS lsms_calc(local_comm,i_lsms_name,prefix);
      snprintf(prefix,38,"Group %4d: ",my_group);

      if(rank==0 && my_group==0)
      {
        std::cout<<prefix<<"executing LSMS(C++) for "<<lsms_calc.numSpins()<<" atoms\n";
        std::cout<<prefix<<"  LSMS version = "<<lsms_calc.version()<<std::endl;
      }

      // wait for commands from master
      bool finished=false;
      while(!finished)
      {
        if(rank==0)
        {
          //MPI_Recv(evec,3*size_lsms,MPI_DOUBLE,0,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
          //op =status.MPI_TAG;
          if (lsms_rank0[0]==world_rank)
                shmem_barrier(0, lsms_rank0[0], 2, pSync1);

        }
        //MPI_Bcast(&op,1,MPI_INT,0,local_comm);
        shmem_broadcast32(&op, &op, 1, local_comm.start_pe, local_comm.start_pe, local_comm.logPE_stride, local_comm.size, pSync2); 

/* recognized opcodes:
   5: calculate energy

   recognized energy calculation modes:
   OneStepEnergy : calclulate frozen potential band energy in one step (don't converge Ef)
   use only if the Fermi energy will not change due to MC steps!
   The only method available in LSMS_1.9
   MultiStepEnergy : calculate frozen potential band energy after converging Fermi energy
   This should be the new default method. If the Fermi energy doesn't change
   multiStepEnergy only performs one step and should be equivalent to oneStepEnergy
   The tolerance for Ef convergence can be set with LSMS::setEfTol(Real).
   The default tolerance is set in the LSMS::LSMS constructor (currently 1.0e-6).
   The maximum number of steps is read from the LSMS input file 'nscf' parameter.
   ScfEnergy : this will calculate the selfconsistent total energy.
   The maximum number of steps is read from the LSMS input file 'nscf' parameter.
   NOT IMPLEMENTED YET!!!

   10: get number of sites
*/

        if(op==5)
        {
          lsms_calc.setEvec(evec);
          if(energyCalculationMode==OneStepEnergy)
            energy=lsms_calc.oneStepEnergy(&band_energy);
          else if(energyCalculationMode==MultiStepEnergy)
            band_energy=energy=lsms_calc.multiStepEnergy();
          else if(energyCalculationMode==ScfEnergy)
            energy=lsms_calc.scfEnergy(&band_energy);
          else
          {
            printf("ERROR: Unknown energy calculation mode for lsms_calc in wl-lsms main!\n");
            //MPI_Abort(MPI_COMM_WORLD,5);
            exit(5);
          }
          r_values[0]=energy;
          r_values[1]=band_energy;
          if(return_moments_flag)
          {
            lsms_calc.getMag(&r_values[R_VALUE_OFFSET]);
          }
          if(rank==0)
          {
            if(return_moments_flag)
            {
              //MPI_Send(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,0,1005,MPI_COMM_WORLD);
              shmem_double_put(r_values, r_values, R_VALUE_OFFSET+3*size_lsms, 0);

            } else {
              //MPI_Send(r_values,R_VALUE_OFFSET,MPI_DOUBLE,0,1005,MPI_COMM_WORLD);
              shmem_double_put(r_values, r_values, R_VALUE_OFFSET, 0);
            }
            shmem_fence();
            shmem_int_swap(&flag, world_rank, 0);

          }
              
        } else if(op==10) {
          i_values[0]=lsms_calc.numSpins();
          //MPI_Send(i_values,10,MPI_INT,0,1010,MPI_COMM_WORLD);
          shmem_int_put(i_values, i_values, 10, 0);
        } else {
          // printf("world rank %d: recieved exit\n",world_rank); 
          finished=true;
        }
      }

      shfree(evec);
      //shfree(r_values);
    }
    else if(world_rank==0)
    {
      int running;
      double **evecs;
      //double *r_values;
      //int i_values[10];
      int *init_steps;
      int total_init_steps;
      bool accepted;
        
      char *wl_inf=NULL;
      char *wl_outf=NULL;
      if(gWL_in_name) wl_inf=gWL_in_name;
      if(gWL_out_name) wl_outf=gWL_out_name;
        
      EvecGenerator *generator;

/*
      // get number of spins from first LSMS instance
      // temp r_values:
      r_values=(double *)malloc(sizeof(double)*10);
      MPI_Send(r_values,1,MPI_DOUBLE, lsms_rank0[0], 10, MPI_COMM_WORLD);
      free(r_values);
      MPI_Recv(i_values,10,MPI_INT,lsms_rank0[0],1010,MPI_COMM_WORLD,&status);
      if(i_values[0]!=size_lsms)
      {
        printf("Size specified for Wang-Landau and in LSMS input file don't match!\n");
        size_lsms=i_values[0];
      }
*/

      evecs=(double **)shmalloc(sizeof(double *)*num_lsms);
      init_steps=(int *)shmalloc(sizeof(int)*num_lsms);
      for(int i=0; i<num_lsms; i++)
      {
        evecs[i]=(double *)shmalloc(sizeof(double)*3*size_lsms);
        init_steps[i]=initial_steps;
      }
      total_init_steps=num_lsms*initial_steps;
        

      // Initialize the correct evec generator
      switch(evec_generation_mode)
      {
      case Random :  generator = new RandomEvecGenerator(size_lsms);
        break;
      case Constant: generator = new ConstantEvecGenerator(size_lsms, ev0, num_lsms);
        break;
     //case WangLandau_1d : generator = new WL1dEvecGenerator<std::mt19937>(size_lsms, num_lsms,
     //                                                                      evecs, wl_inf, wl_outf, wl_stepf);
     case WangLandau_1d : generator = new WL1dEvecGenerator<boost::mt19937>(size_lsms, num_lsms,
                                                                           evecs, wl_inf, wl_outf, wl_stepf);
        break;
      case ExhaustiveIsing : generator = new ExhaustiveIsing1dEvecGenerator(size_lsms, num_lsms,
                                                                            evecs, wl_inf, wl_outf);
        break;
      //case WangLandau_2d : generator = new WL2dEvecGenerator<std::mt19937>(size_lsms, num_lsms,
      //                                                                     evecs, wl_inf, wl_outf, wl_stepf);
      case WangLandau_2d : generator = new WL2dEvecGenerator<boost::mt19937>(size_lsms, num_lsms,
                                                                           evecs, wl_inf, wl_outf, wl_stepf);
        break;
      default: std::cerr<<"The code should never arrive here: UNKNOWN EVEC GENERATION MODE\n";
        exit(1);
      }

      for(int i=0; i<num_lsms; i++)
      {
        generator->initializeEvec(i,evecs[i]);
      }
      std::cout<<"This is the master node\n";
      // issue initial commands to all LSMS instances
      running=0;
      bool more_work=true;
      if(total_init_steps>0)
      {
        for(int i=0; i<num_lsms; i++)
        {
          std::cout<<"starting initial calculation in group "<<i<<std::endl;
          //MPI_Send(evecs[i], 3*size_lsms, MPI_DOUBLE, lsms_rank0[i], 5, MPI_COMM_WORLD);
          shmem_double_put(evec, evecs[i], 3*size_lsms, lsms_rank0[i]);
          shmem_int_p(&op, 5, lsms_rank0[i]);
          shmem_fence();


          num_steps--; running++; stepCount++;
          if(restrict_steps) std::cout<<"      "<<num_steps<<" steps remaining\n";
        }
        shmem_barrier(0, lsms_rank0[0], 2, pSync1);
        // first deal with the initial steps:
        while(running>0)
        {
          //if(return_moments_flag)
          //  MPI_Recv(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
          //else
          //  MPI_Recv(r_values,R_VALUE_OFFSET,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
          
          shmem_int_wait(&flag,-1);

          running--;
          // std::cout<<"received energy E_tot ="<<r_values[0]<<std::endl;
          // std::cout<<"    band energy E_band="<<r_values[1]<<std::endl;
          if(total_init_steps>0)
          {
            //int r_group=(status.MPI_SOURCE-align)/comm_size;
            int r_group=(flag-align)/comm_size;
            std::cout<<"starting additional calculation in group "<<r_group<<std::endl;

            if(init_steps[r_group]>0)
            {
              more_work = !(generator->generateUnsampledEvec(r_group,evecs[r_group],r_values[energyIndex]));
              init_steps[r_group]--; total_init_steps--;
            }
                
            //MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 5, MPI_COMM_WORLD);
            shmem_double_put(r_values, evecs[r_group],  3*size_lsms, lsms_rank0[r_group]); //TODO check this
            shmem_fence();
                
            num_steps--; running++; stepCount++;
            if(restrict_steps && num_steps<=0) more_work=false;
            if(restrict_steps) std::cout<<"      "<<num_steps<<" steps remaining\n";
            walltime = get_rtc() - walltime_0;
            if(restrict_time && walltime>=max_time) more_work=false;
            if(restrict_time) std::cout<<"      "<<max_time-walltime<<" seconds remaining\n";
          }
              
        }
      }
      more_work=true;
      running=0;
      for(int i=0; i<num_lsms; i++)
      {
        std::cout<<"starting main calculation in group "<<i<<std::endl;
        //MPI_Send(evecs[i], 3*size_lsms, MPI_DOUBLE, lsms_rank0[i], 5, MPI_COMM_WORLD);
        shmem_double_put(evec, evecs[i], 3*size_lsms, lsms_rank0[i]);
        shmem_int_p(&op, 5, lsms_rank0[i]);
        shmem_fence();
        num_steps--; running++; stepCount++;
        if(restrict_steps) std::cout<<"      "<<num_steps<<" steps remaining\n";
      }
      shmem_barrier(0, lsms_rank0[0], 2, pSync1);
        
      generator->startSampling();
      // wait for results and issue new commands or wind down
      while(running>0)
      {
        //MPI_Recv(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
        shmem_int_wait(&flag,-1);

        running--;
        std::cout<<"received energy E_tot ="<<r_values[0]<<std::endl;
        std::cout<<"    band energy E_band="<<r_values[1]<<std::endl;
        // printf("from status.MPI_SOURCE=%d\n",status.MPI_SOURCE);
        energy_accumulator+=r_values[0]; energies_accumulated++;
        if(more_work)
        {
          int r_group=(status.MPI_SOURCE-align)/comm_size;
          std::cout<<"starting additional calculation in group "<<r_group<<std::endl;
              
          if(generator_needs_moment)
          {
            double m0,m1,m2;
            m0=0.0; m1=0.0; m2=0.0;
            for(int i=0; i<3*size_lsms; i+=3)
            {
              m0+=r_values[R_VALUE_OFFSET+i];
              m1+=r_values[R_VALUE_OFFSET+i+1];
              m2+=r_values[R_VALUE_OFFSET+i+2];
            }
            switch(second_dimension)
            {
            case  MagneticMoment : magnetization=std::sqrt(m0*m0+m1*m1+m2*m2); break;
            case  MagneticMomentX : magnetization=m0; break;
            case  MagneticMomentY : magnetization=m1; break;
            case  MagneticMomentZ : magnetization=m2; break;
            }
            if(generator->generateEvec(r_group,evecs[r_group],r_values[energyIndex],magnetization, &accepted))
              more_work=false;
          } else {
            if(generator->generateEvec(r_group,evecs[r_group],r_values[energyIndex], &accepted)) more_work=false;
          }

          //MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 5, MPI_COMM_WORLD);
          shmem_double_put(r_values, evecs[r_group],  3*size_lsms, lsms_rank0[r_group]); //TODO check this
          shmem_fence();

          num_steps--; running++; stepCount++;
          if(restrict_steps && num_steps<=0) more_work=false;
          if(restrict_steps) std::cout<<"      "<<num_steps<<" steps remaining\n";
          walltime = get_rtc() - walltime_0;
          if(restrict_time && walltime>=max_time) more_work=false;
          if(restrict_time) std::cout<<"      "<<max_time-walltime<<" seconds remaining\n";
        }
        else
        {
          // send an exit message to this instance of LSMS
          int r_group=(status.MPI_SOURCE-align)/comm_size;

          MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 2, MPI_COMM_WORLD);
        }

        if(step_out_flag && accepted)
        {
          step_out_file<<"# iteration "<<energies_accumulated<<std::endl;
          step_out_file.precision(15);
          step_out_file<<energies_accumulated<<std::endl;
          step_out_file<<r_values[0]<<"  "<<r_values[1]<<std::endl;
          for(int j=0; j<3*size_lsms; j+=3)
          {
            step_out_file<<r_values[j+R_VALUE_OFFSET]<<"  "<<r_values[j+R_VALUE_OFFSET+1]
                         <<"  "<<r_values[j+R_VALUE_OFFSET+2]<<std::endl;
          }
        }
        // write restart file every restartWriteFrequency seconds
        if(walltime>nextWriteTime)
        {
          generator->writeState("WLrestart.jsn");
          nextWriteTime+=restartWriteFrequency;
        }

      }
      generator->writeState("WLrestart.jsn");
/*
  if(evec_generation_mode==WangLandau_1d)
  (static_cast<WL1dEvecGenerator<std::mt19937> *>(generator))->writeState("WLrestart.state");
  if(evec_generation_mode==ExhaustiveIsing)
  (static_cast<ExhaustiveIsing1dEvecGenerator *>(generator))->writeState("WLrestart.state");
*/
      for(int i=0; i<num_lsms; i++) free(evecs[i]);
      shfree(evecs);
      //shfree(r_values);
    }
  }

  if(world_rank==0)
  {
    if(step_out_flag)
    {
      step_out_file<<"# end\n-1\n"
                   <<energy_accumulator/double(energies_accumulated)<<std::endl;
      step_out_file.close();
    }
    std::cout<<"Finished all scheduled calculations. Freeing resources.\n";
    std::cout<<"Energy mean = "<<energy_accumulator/double(energies_accumulated)<<"Ry\n";
  }


  if(num_lsms>1)
  {
    // make sure averyone arrives here:
    MPI_Bcast(stupid,37,MPI_CHAR,0,MPI_COMM_WORLD);

    if(world_rank==0)
    {
      MPI_Comm_free(&local_comm);
    }
    else if(my_group>=0)
    {
      MPI_Comm_free(&local_comm);
    }
  }



  if(world_rank==0)
  {
    double walltime = get_rtc() - walltime_0;
    std::cout<<" WL-LSMS finished in "<<walltime<<" seconds.\n";
    std::cout<<" Monte-Carlo steps / walltime = "
             <<double(stepCount)/walltime<<"/sec\n";
  }

#ifdef USE_PAPI
  PAPI_stop_counters(papi_values,hw_counters);
  papi_values[hw_counters  ] = PAPI_get_real_cyc()-papi_real_cyc_0;
  papi_values[hw_counters+1] = PAPI_get_real_usec()-papi_real_usec_0;
  papi_values[hw_counters+2] = PAPI_get_virt_cyc()-papi_virt_cyc_0;
  papi_values[hw_counters+3] = PAPI_get_virt_usec()-papi_virt_usec_0;
  long long accumulated_counters[NUM_PAPI_EVENTS+4];
/*
  for(int i=0; i<hw_counters; i++)
  {
  printline(ttos(papi_event_name[i])+" = "+ttos(papi_values[i]),
  std::cout,parameters.myrankWorld);
  }
  printline("PAPI real cycles : "+ttos(papi_values[hw_counters]),
  std::cout,parameters.myrankWorld);
  printline("PAPI real usecs : "+ttos(papi_values[hw_counters+1]),
  std::cout,parameters.myrankWorld);
  printline("PAPI user cycles : "+ttos(papi_values[hw_counters+2]),
  std::cout,parameters.myrankWorld);
  printline("PAPI user usecs : "+ttos(papi_values[hw_counters+3]),
  std::cout,parameters.myrankWorld);
*/
  
  //MPI_Reduce(papi_values,accumulated_counters,hw_counters+4,
  //           MPI_LONG,MPI_SUM,0,MPI_COMM_WORLD);

  shmem_long_sum_to_all(accumulated_counters, papi_values, hw_counters+4,
      comm.pestart, comm.logPE_stride, comm.size, pWrk_i, pSync2);



  if(world_rank==0)
  {
    for(int i=0; i<hw_counters; i++)
    {
      std::cout<<"Accumulated: "<<(papi_event_name[i])<<" = "<<(accumulated_counters[i])<<"\n";
    }
    std::cout<<"PAPI accumulated real cycles : "<<(accumulated_counters[hw_counters])<<"\n";
    std::cout<<"PAPI accumulated user cycles : "<<(accumulated_counters[hw_counters+2])<<"\n";
    double gflops_papi = ((double)accumulated_counters[1])/
      (1000.0*(double)papi_values[hw_counters+1]);
    double gflops_hw_double = ((double)accumulated_counters[2])/
      (1000.0*(double)papi_values[hw_counters+1]);
    double gflops_hw_single = ((double)accumulated_counters[3])/
      (1000.0*(double)papi_values[hw_counters+1]);
    double gips = ((double)accumulated_counters[0])/(1000.0*(double)papi_values[hw_counters+1]);
    std::cout<<"PAPI_FP_OPS real GFLOP/s : "<<(gflops_papi)<<"\n";
    std::cout<<"PAPI hw double real GFLOP/s : "<<(gflops_hw_double)<<"\n";
    std::cout<<"PAPI hw single real GFLOP/s : "<<(gflops_hw_single)<<"\n";
    std::cout<<"PAPI real GINST/s : "<<(gips)<<"\n";
  }
#endif


  //MPI_Finalize();
  return 0;
}
int
main (int argc, char **argv)
{
    int i;
    int nextpe;
    int me, npes;
    int success1, success2, success3, success4, success5, success6, success7,
        success8;

    short src1[N];
    int src2[N];
    long src3[N];
    long double src4[N];
    long long src5[N];
    double src6[N];
    float src7[N];
    char *src8;
    short src9;
    int src10;
    long src11;
    double src12;
    float src13;

    short *dest1;
    int *dest2;
    long *dest3;
    long double *dest4;
    long long *dest5;
    double *dest6;
    float *dest7;
    char *dest8;
    short *dest9;
    int *dest10;
    long *dest11;
    double *dest12;
    float *dest13;


    shmem_init ();
    me = shmem_my_pe ();
    npes = shmem_n_pes ();

    if (npes > 1) {

        success1 = 0;
        success2 = 0;
        success3 = 0;
        success4 = 0;
        success5 = 0;
        success6 = 0;
        success7 = 0;
        success8 = 0;
        src8 = (char *) malloc (N * sizeof (char));

        for (i = 0; i < N; i += 1) {
            src1[i] = (short) me;
            src2[i] = me;
            src3[i] = (long) me;
            src4[i] = (long double) me;
            src5[i] = (long long) me;
            src6[i] = (double) me;
            src7[i] = (float) me;
            src8[i] = (char) me;
        }
        src9 = (short) me;
        src10 = me;
        src11 = (long) me;
        src12 = (double) me;
        src13 = (float) me;


        dest1 = (short *) shmem_malloc (N * sizeof (*dest1));
        dest2 = (int *) shmem_malloc (N * sizeof (*dest2));
        dest3 = (long *) shmem_malloc (N * sizeof (*dest3));
        dest4 = (long double *) shmem_malloc (N * sizeof (*dest4));
        dest5 = (long long *) shmem_malloc (N * sizeof (*dest5));
        dest6 = (double *) shmem_malloc (N * sizeof (*dest6));
        dest7 = (float *) shmem_malloc (N * sizeof (*dest7));
        dest8 = (char *) shmem_malloc (4 * sizeof (*dest8));
        dest9 = (short *) shmem_malloc (sizeof (*dest9));
        dest10 = (int *) shmem_malloc (sizeof (*dest10));
        dest11 = (long *) shmem_malloc (sizeof (*dest11));
        dest12 = (double *) shmem_malloc (sizeof (*dest12));
        dest13 = (float *) shmem_malloc (sizeof (*dest13));

        for (i = 0; i < N; i += 1) {
            dest1[i] = -9;
            dest2[i] = -9;
            dest3[i] = -9;
            dest4[i] = -9;
            dest5[i] = -9;
            dest6[i] = -9;
            dest7[i] = -9.0;
            dest8[i] = -9;
        }
        *dest9 = -9;
        *dest10 = -9;
        *dest11 = -9;
        *dest12 = -9;
        *dest13 = -9.0;

        nextpe = (me + 1) % npes;

        /* Testing shmem_short_put, shmem_int_put, shmem_long_put,
           shmem_longdouble_put, shmem_longlong_put, shmem_double_put,
           shmem_float_put, shmem_putmem */
        shmem_barrier_all ();

        shmem_short_put (dest1, src1, N, nextpe);
        shmem_int_put (dest2, src2, N, nextpe);
        shmem_long_put (dest3, src3, N, nextpe);
        shmem_longdouble_put (dest4, src4, N, nextpe);
        shmem_longlong_put (dest5, src5, N, nextpe);
        shmem_double_put (dest6, src6, N, nextpe);
        shmem_float_put (dest7, src7, N, nextpe);
        shmem_putmem (dest8, src8, N * sizeof (char), nextpe);

        shmem_barrier_all ();

        if (me == 0) {
            for (i = 0; i < N; i += 1) {
                if (dest1[i] != (npes - 1)) {
                    success1 = 1;
                }
                if (dest2[i] != (npes - 1)) {
                    success2 = 1;
                }
                if (dest3[i] != (npes - 1)) {
                    success3 = 1;
                }
                if (dest4[i] != (npes - 1)) {
                    success4 = 1;
                }
                if (dest5[i] != (npes - 1)) {
                    success5 = 1;
                }
                if (dest6[i] != (npes - 1)) {
                    success6 = 1;
                }
                if (dest7[i] != (npes - 1)) {
                    success7 = 1;
                }
                if (dest8[i] != (npes - 1)) {
                    success8 = 1;
                }
            }

            if (success1 == 0)
                printf ("Test shmem_short_put: Passed\n");
            else
                printf ("Test shmem_short_put: Failed\n");
            if (success2 == 0)
                printf ("Test shmem_int_put: Passed\n");
            else
                printf ("Test shmem_int_put: Failed\n");
            if (success3 == 0)
                printf ("Test shmem_long_put: Passed\n");
            else
                printf ("Test shmem_long_put: Failed\n");
            if (success4 == 0)
                printf ("Test shmem_longdouble_put: Passed\n");
            else
                printf ("Test shmem_longdouble_put: Failed\n");
            if (success5 == 0)
                printf ("Test shmem_longlong_put: Passed\n");
            else
                printf ("Test shmem_longlong_put: Failed\n");
            if (success6 == 0)
                printf ("Test shmem_double_put: Passed\n");
            else
                printf ("Test shmem_double_put: Failed\n");
            if (success7 == 0)
                printf ("Test shmem_float_put: Passed\n");
            else
                printf ("Test shmem_float_put: Failed\n");
            if (success8 == 0)
                printf ("Test shmem_putmem: Passed\n");
            else
                printf ("Test shmem_putmem: Failed\n");

        }
        shmem_barrier_all ();

        /* Testing shmem_put32, shmem_put64, shmem_put128 */
        if (sizeof (int) == 4) {
            for (i = 0; i < N; i += 1) {
                dest2[i] = -9;
                dest3[i] = -9;
                dest4[i] = -9;
            }
            success2 = 0;
            success3 = 0;
            success4 = 0;

            shmem_barrier_all ();

            shmem_put32 (dest2, src2, N, nextpe);
            shmem_put64 (dest3, src3, N, nextpe);
            shmem_put128 (dest4, src4, N, nextpe);

            shmem_barrier_all ();

            if (me == 0) {
                for (i = 0; i < N; i += 1) {
                    if (dest2[i] != (npes - 1)) {
                        success2 = 1;
                    }
                    if (dest3[i] != (npes - 1)) {
                        success3 = 1;
                    }
                    if (dest4[i] != (npes - 1)) {
                        success4 = 1;
                    }
                }
                if (success2 == 0)
                    printf ("Test shmem_put32: Passed\n");
                else
                    printf ("Test shmem_put32: Failed\n");

                if (success3 == 0)
                    printf ("Test shmem_put64: Passed\n");
                else
                    printf ("Test shmem_put64: Failed\n");

                if (success4 == 0)
                    printf ("Test shmem_put128: Passed\n");
                else
                    printf ("Test shmem_put128: Failed\n");
            }
        }
        else if (sizeof (int) == 8) {
            for (i = 0; i < N; i += 1) {
                dest1[i] = -9;
                dest2[i] = -9;
                dest3[i] = -9;
            }
            success1 = 0;
            success2 = 0;
            success3 = 0;

            shmem_barrier_all ();

            shmem_put32 (dest1, src1, N, nextpe);
            shmem_put64 (dest2, src2, N, nextpe);
            shmem_put128 (dest3, src3, N, nextpe);

            shmem_barrier_all ();

            if (me == 0) {
                for (i = 0; i < N; i += 1) {
                    if (dest1[i] != (npes - 1)) {
                        success1 = 1;
                    }
                    if (dest2[i] != (npes - 1)) {
                        success2 = 1;
                    }
                    if (dest3[i] != (npes - 1)) {
                        success3 = 1;
                    }

                }
                if (success1 == 0)
                    printf ("Test shmem_put32: Passed\n");
                else
                    printf ("Test shmem_put32: Failed\n");
                if (success2 == 0)
                    printf ("Test shmem_put64: Passed\n");
                else
                    printf ("Test shmem_put64: Failed\n");

                if (success3 == 0)
                    printf ("Test shmem_put128: Passed\n");
                else
                    printf ("Test shmem_put128: Failed\n");
            }
        }

        /* Testing shmem_iput32, shmem_iput64, shmem_iput128 */
        shmem_barrier_all ();
        if (sizeof (int) == 4) {
            for (i = 0; i < N; i += 1) {
                dest2[i] = -9;
                dest3[i] = -9;
                dest4[i] = -9;
            }
            success2 = 0;
            success3 = 0;
            success4 = 0;

            shmem_barrier_all ();

            shmem_iput32 (dest2, src2, 1, 2, N, nextpe);
            shmem_iput64 (dest3, src3, 1, 2, N, nextpe);
            shmem_iput128 (dest4, src4, 1, 2, N, nextpe);

            shmem_barrier_all ();

            if (me == 0) {
                for (i = 0; i < N / 2; i += 1) {
                    if (dest2[i] != (npes - 1)) {
                        success2 = 1;
                    }
                    if (dest3[i] != (npes - 1)) {
                        success3 = 1;
                    }
                    if (dest4[i] != (npes - 1)) {
                        success4 = 1;
                    }
                }
                if (success2 == 0)
                    printf ("Test shmem_iput32: Passed\n");
                else
                    printf ("Test shmem_iput32: Failed\n");

                if (success3 == 0)
                    printf ("Test shmem_iput64: Passed\n");
                else
                    printf ("Test shmem_iput64: Failed\n");

                if (success4 == 0)
                    printf ("Test shmem_iput128: Passed\n");
                else
                    printf ("Test shmem_iput128: Failed\n");
            }
        }
        else if (sizeof (int) == 8) {
            for (i = 0; i < N; i += 1) {
                dest1[i] = -9;
                dest2[i] = -9;
                dest3[i] = -9;
            }
            success1 = 0;
            success2 = 0;
            success3 = 0;

            shmem_barrier_all ();

            shmem_iput32 (dest1, src1, 1, 2, N, nextpe);
            shmem_iput64 (dest2, src2, 1, 2, N, nextpe);
            shmem_iput128 (dest3, src3, 1, 2, N, nextpe);

            shmem_barrier_all ();

            if (me == 0) {
                for (i = 0; i < N / 2; i += 1) {
                    if (dest1[i] != (npes - 1)) {
                        success1 = 1;
                    }
                    if (dest2[i] != (npes - 1)) {
                        success2 = 1;
                    }
                    if (dest3[i] != (npes - 1)) {
                        success3 = 1;
                    }

                }
                if (success1 == 0)
                    printf ("Test shmem_iput32: Passed\n");
                else
                    printf ("Test shmem_iput32: Failed\n");
                if (success2 == 0)
                    printf ("Test shmem_iput64: Passed\n");
                else
                    printf ("Test shmem_iput64: Failed\n");

                if (success3 == 0)
                    printf ("Test shmem_iput128: Passed\n");
                else
                    printf ("Test shmem_iput128: Failed\n");
            }
        }

        /* Testing shmem_short_iput, shmem_int_iput, shmem_long_iput,
           shmem_double_iput, shmem_float_iput */
        for (i = 0; i < N; i += 1) {
            dest1[i] = -9;
            dest2[i] = -9;
            dest3[i] = -9;
            dest6[i] = -9;
            dest7[i] = -9;
        }
        success1 = 0;
        success2 = 0;
        success3 = 0;
        success6 = 0;
        success7 = 0;

        shmem_barrier_all ();

        shmem_short_iput (dest1, src1, 1, 2, N, nextpe);
        shmem_int_iput (dest2, src2, 1, 2, N, nextpe);
        shmem_long_iput (dest3, src3, 1, 2, N, nextpe);
        shmem_double_iput (dest6, src6, 1, 2, N, nextpe);
        shmem_float_iput (dest7, src7, 1, 2, N, nextpe);

        shmem_barrier_all ();

        if (me == 0) {
            for (i = 0; i < N / 2; i += 1) {
                if (dest1[i] != (npes - 1)) {
                    success1 = 1;
                }
                if (dest2[i] != (npes - 1)) {
                    success2 = 1;
                }
                if (dest3[i] != (npes - 1)) {
                    success3 = 1;
                }
                if (dest6[i] != (npes - 1)) {
                    success6 = 1;
                }
                if (dest7[i] != (npes - 1)) {
                    success7 = 1;
                }
            }

            if (success1 == 0)
                printf ("Test shmem_short_iput: Passed\n");
            else
                printf ("Test shmem_short_iput: Failed\n");
            if (success2 == 0)
                printf ("Test shmem_int_iput: Passed\n");
            else
                printf ("Test shmem_int_iput: Failed\n");
            if (success3 == 0)
                printf ("Test shmem_long_iput: Passed\n");
            else
                printf ("Test shmem_long_iput: Failed\n");
            if (success6 == 0)
                printf ("Test shmem_double_iput: Passed\n");
            else
                printf ("Test shmem_double_iput: Failed\n");
            if (success7 == 0)
                printf ("Test shmem_float_iput: Passed\n");
            else
                printf ("Test shmem_float_iput: Failed\n");

        }


        /* Testing shmem_double_p, shmem_float_p, shmem_int_p, shmem_long_p,
           shmem_short_p */
        shmem_barrier_all ();

        shmem_short_p (dest9, src9, nextpe);
        shmem_int_p (dest10, src10, nextpe);
        shmem_long_p (dest11, src11, nextpe);
        shmem_double_p (dest12, src12, nextpe);
        shmem_float_p (dest13, src13, nextpe);

        shmem_barrier_all ();

        if (me == 0) {
            if (*dest9 == (npes - 1))
                printf ("Test shmem_short_p: Passed\n");
            else
                printf ("Test shmem_short_p: Failed\n");
            if (*dest10 == (npes - 1))
                printf ("Test shmem_int_p: Passed\n");
            else
                printf ("Test shmem_int_p: Failed\n");
            if (*dest11 == (npes - 1))
                printf ("Test shmem_long_p: Passed\n");
            else
                printf ("Test shmem_long_p: Failed\n");
            if (*dest12 == (npes - 1))
                printf ("Test shmem_double_p: Passed\n");
            else
                printf ("Test shmem_double_p: Failed\n");
            if (*dest13 == (npes - 1))
                printf ("Test shmem_float_p: Passed\n");
            else
                printf ("Test shmem_float_p: Failed\n");


        }

        shmem_barrier_all ();

        shmem_free (dest1);
        shmem_free (dest2);
        shmem_free (dest3);
        shmem_free (dest4);
        shmem_free (dest5);
        shmem_free (dest6);
        shmem_free (dest7);
        shmem_free (dest8);
        shmem_free (dest9);
        shmem_free (dest10);
        shmem_free (dest11);
        shmem_free (dest12);
        shmem_free (dest13);

    }
    else {
        printf ("Number of PEs must be > 1 to test shmem put, test skipped\n");
    }

    shmem_finalize ();

    return 0;
}
Beispiel #5
0
int
main(int argc, char **argv)
{
    int loops=DFLT_LOOPS;
    char *pgm;
    int *Target;
    int *Source;
    int i, me, npes;
    int target_PE;
    long bytes;
    double start_time, *total_time;

    shmem_init();
    me = shmem_my_pe();
    npes = shmem_n_pes();

    if ((pgm=strrchr(argv[0],'/')))
        pgm++;
    else
        pgm = argv[0];

    while ((i = getopt (argc, argv, "hve:l:st")) != EOF) {
        switch (i)
        {
            case 'v':
                Verbose++;
                break;
            case 'e':
                if ((elements = atoi_scaled(optarg)) <= 0) {
                    fprintf(stderr,"ERR: Bad elements count %d\n",elements);
                    shmem_finalize();
                    return 1;
                }
                break;
            case 'l':
                if ((loops = atoi_scaled(optarg)) <= 0) {
                    fprintf(stderr,"ERR: Bad loop count %d\n",loops);
                    shmem_finalize();
                    return 1;
                }
                break;
            case 's':
                Sync++;
                break;
            case 't':
                Track++;
                break;
            case 'h':
                if (me == 0)
                    usage(pgm);
                return 0;
            default:
                if (me == 0) {
                    fprintf(stderr,"%s: unknown switch '-%c'?\n",pgm,i);
                    usage(pgm);
                }
                shmem_finalize();
                return 1;
        }
    }

    for(i=0; i < SHMEM_REDUCE_SYNC_SIZE; i++)
        pSync[i] = SHMEM_SYNC_VALUE;

    target_PE = (me+1) % npes;

    total_time = (double *) shmem_malloc( npes * sizeof(double) );
    if (!total_time) {
        fprintf(stderr,"ERR: bad total_time shmem_malloc(%ld)\n",
                (elements * sizeof(double)));
        shmem_global_exit(1);
    }
    for(i=0; i < npes; i++)
        total_time[i] = -1.0;

    Source = (int *) shmem_malloc( elements * sizeof(*Source) );
    if (!Source) {
        fprintf(stderr,"ERR: bad Source shmem_malloc(%ld)\n",
                (elements * sizeof(*Target)));
        shmem_free(total_time);
        shmem_global_exit(1);
    }

    Target = (int *) shmem_malloc( elements * sizeof(*Target) );
    if (!Target) {
        fprintf(stderr,"ERR: bad Target shmem_malloc(%ld)\n",
                (elements * sizeof(*Target)));
        shmem_free(Source);
        shmem_free(total_time);
        shmem_global_exit(1);
    }

    for (i = 0; i < elements; i++) {
        Target[i] = -90;
        Source[i] = i + 1;
    }

    bytes = loops * sizeof(int) * elements;

    if (Verbose && me==0) {
        fprintf(stderr,
                "%s: INFO - %d loops, put %d (int) elements to PE+1 Max put ??\n",
                pgm, loops, elements);
    }
    shmem_barrier_all();

    for(i=0; i < loops; i++) {

        start_time = shmemx_wtime();

        shmem_int_put(Target, Source, elements, target_PE);

        time_taken += (shmemx_wtime() - start_time);

        if (me==0) {
            if ( Track && i > 0 && ((i % 200) == 0))
                fprintf(stderr,".%d",i);
        }
        if (Sync)
            shmem_barrier_all();
    }

    // collect time per node.
    shmem_double_put( &total_time[me], &time_taken, 1, 0 );
    shmem_double_sum_to_all(&sum_time, &time_taken, 1, 0, 0, npes, pWrk, pSync);

    shmem_barrier_all();

    for (i = 0; i < elements; i++) {
        if (Target[i] != i + 1) {
            printf("%d: Error Target[%d] = %d, expected %d\n",
                   me, i, Target[i], i + 1);
            shmem_global_exit(1);
        }
    }

    if ( Track && me == 0 ) fprintf(stderr,"\n");

    if(Verbose && me == 0) {
        double rate, comp_time;

        if (Verbose > 1)
            fprintf(stdout,"Individule PE times: (seconds)\n");
        for(i=0,comp_time=0.0; i < npes; i++) {
            comp_time += total_time[i];
            if (Verbose > 1)
                fprintf(stdout,"  PE[%d] %8.6f\n",i,total_time[i]);
        }

        sum_time /= (double)npes;
        comp_time /= (double)npes;
        if (sum_time != comp_time)
            printf("%s: computed_time %7.5f != sum_to_all_time %7.5f)\n",
                   pgm, comp_time, sum_time );

        rate = ((double)bytes/(1024.0*1024.0)) / comp_time;
        printf("%s: shmem_int_put() %7.4f MB/sec (bytes %ld secs %7.4f)\n",
               pgm, rate, bytes, sum_time);
    }

    shmem_free(total_time);
    shmem_free(Target);
    shmem_free(Source);

    shmem_finalize();

    return 0;
}
Beispiel #6
0
int main(int argc, char ** argv)
{
  long Block_order;        /* number of columns owned by rank       */
  int Block_size;          /* size of a single block                */
  int Colblock_size;       /* size of column block                  */
  int Tile_order=32;       /* default Tile order                    */
  int tiling;              /* boolean: true if tiling is used       */
  int Num_procs;           /* number of ranks                       */
  int order;               /* order of overall matrix               */
  int bufferCount;         /* number of input buffers               */
  int targetBuffer;        /* buffer with which to communicate      */
  int send_to, recv_from;  /* ranks with which to communicate       */
  long bytes;              /* combined size of matrices             */
  int my_ID;               /* rank                                  */
  int root=0;              /* rank of root                          */
  int iterations;          /* number of times to do the transpose   */
  long i, j, it, jt, istart;/* dummies                              */
  int iter;                /* index of iteration                    */
  int phase;               /* phase inside staged communication     */
  int colstart;            /* starting column for owning rank       */
  int error;               /* error flag                            */
  double *A_p;             /* original matrix column block          */
  double *B_p;             /* transposed matrix column block        */
  double **Work_in_p;      /* workspace for the transpose function  */
  double *Work_out_p;      /* workspace for the transpose function  */
  double epsilon = 1.e-8;  /* error tolerance                       */
  double avgtime;          /* timing parameters                     */
  long   *pSync_bcast;     /* work space for collectives            */
  long   *pSync_reduce;    /* work space for collectives            */
  double *pWrk;            /* work space for SHMEM collectives      */
  double *local_trans_time, 
         *trans_time;      /* timing parameters                     */
  double *abserr, 
         *abserr_tot;      /* local and aggregate error             */
  int    *send_flag,
         *recv_flag;       /* synchronization flags                 */
  int    *arguments;       /* command line arguments                */

/*********************************************************************
** Initialize the SHMEM environment
*********************************************************************/

  prk_shmem_init();
  my_ID=prk_shmem_my_pe();
  Num_procs=prk_shmem_n_pes();

  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("SHMEM matrix transpose: B = A^T\n");
  }

// initialize sync variables for error checks
  pSync_bcast      = (long *)   prk_shmem_align(prk_get_alignment(),PRK_SHMEM_BCAST_SYNC_SIZE*sizeof(long));
  pSync_reduce     = (long *)   prk_shmem_align(prk_get_alignment(),PRK_SHMEM_REDUCE_SYNC_SIZE*sizeof(long));
  pWrk             = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double) * PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE);
  local_trans_time = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double));
  trans_time       = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double));
  arguments        = (int *)    prk_shmem_align(prk_get_alignment(),4*sizeof(int));
  abserr           = (double *) prk_shmem_align(prk_get_alignment(),2*sizeof(double));
  abserr_tot       = abserr + 1;
  if (!pSync_bcast || !pSync_reduce || !pWrk || !local_trans_time ||
      !trans_time || !arguments || !abserr) {
    printf("Rank %d could not allocate scalar work space on symm heap\n", my_ID);
    error = 1;
    goto ENDOFTESTS;
  }

  for(i=0;i<PRK_SHMEM_BCAST_SYNC_SIZE;i++)
    pSync_bcast[i]=PRK_SHMEM_SYNC_VALUE;

  for(i=0;i<PRK_SHMEM_REDUCE_SYNC_SIZE;i++)
    pSync_reduce[i]=PRK_SHMEM_SYNC_VALUE;

/*********************************************************************
** process, test and broadcast input parameters
*********************************************************************/
  error = 0;
  if (my_ID == root) {
    if (argc != 4 && argc != 5){
      printf("Usage: %s <# iterations> <matrix order> <# buffers> [Tile size]\n",
                                                               *argv);
      error = 1; goto ENDOFTESTS;
    }

    iterations  = atoi(*++argv);
    arguments[0]=iterations;
    if(iterations < 1){
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
      error = 1; goto ENDOFTESTS;
    }

    order = atoi(*++argv);
    arguments[1]=order;
    if (order < Num_procs) {
      printf("ERROR: matrix order %d should at least # procs %d\n", 
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }
    if (order%Num_procs) {
      printf("ERROR: matrix order %d should be divisible by # procs %d\n",
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }

    bufferCount = atoi(*++argv);
    arguments[2]=bufferCount;
    if (Num_procs > 1) {
      if ((bufferCount < 1) || (bufferCount >= Num_procs)) {
        printf("ERROR: bufferCount must be >= 1 and < # procs : %d\n", bufferCount);
        error = 1; goto ENDOFTESTS;
      }
    }

    if (argc == 5) Tile_order = atoi(*++argv);
    arguments[3]=Tile_order;

    ENDOFTESTS:;
  }
  bail_out(error);

  if (my_ID == root) {
    printf("Number of ranks      = %d\n", Num_procs);
    printf("Matrix order         = %d\n", order);
    printf("Number of iterations = %d\n", iterations);
    printf("Number of buffers    = %d\n", bufferCount);
    if ((Tile_order > 0) && (Tile_order < order))
          printf("Tile size            = %d\n", Tile_order);
    else  printf("Untiled\n");
  }
  
  shmem_barrier_all();

  /*  Broadcast input data to all ranks */
  shmem_broadcast32(&arguments[0], &arguments[0], 4, root, 0, 0, Num_procs, pSync_bcast);

  iterations=arguments[0];
  order=arguments[1];
  bufferCount=arguments[2];
  Tile_order=arguments[3];

  shmem_barrier_all();
  prk_shmem_free(arguments);

  /* a non-positive tile size means no tiling of the local transpose */
  tiling = (Tile_order > 0) && (Tile_order < order);
  bytes = 2 * sizeof(double) * order * order;

/*********************************************************************
** The matrix is broken up into column blocks that are mapped one to a 
** rank.  Each column block is made up of Num_procs smaller square 
** blocks of order block_order.
*********************************************************************/

  Block_order    = order/Num_procs;
  colstart       = Block_order * my_ID;
  Colblock_size  = order * Block_order;
  Block_size     = Block_order * Block_order;

/*********************************************************************
** Create the column block of the test matrix, the row block of the 
** transposed matrix, and workspace (workspace only if #procs>1)
*********************************************************************/
  A_p = (double *)prk_malloc(Colblock_size*sizeof(double));
  if (A_p == NULL){
    printf(" Error allocating space for original matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);

  B_p = (double *)prk_malloc(Colblock_size*sizeof(double));
  if (B_p == NULL){
    printf(" Error allocating space for transpose matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);

  if (Num_procs>1) {
    Work_in_p   = (double**)prk_malloc(bufferCount*sizeof(double));
    Work_out_p = (double *) prk_malloc(Block_size*sizeof(double));
    recv_flag  = (int*)     prk_shmem_align(prk_get_alignment(),bufferCount*sizeof(int));
    if ((Work_in_p == NULL)||(Work_out_p==NULL) || (recv_flag == NULL)){
      printf(" Error allocating space for work or flags on node %d\n",my_ID);
      error = 1;
    }

    if (bufferCount < (Num_procs - 1)) {
      send_flag = (int*) prk_shmem_align(prk_get_alignment(), (Num_procs-1) * sizeof(int));

      if (send_flag == NULL) {
	printf("Error allocating space for flags on node %d\n", my_ID);
	error = 1;
      }
    }

    bail_out(error);

    for(i=0;i<bufferCount;i++) {
      Work_in_p[i]=(double *) prk_shmem_align(prk_get_alignment(),Block_size*sizeof(double));
      if (Work_in_p[i] == NULL) {
        printf(" Error allocating space for work on node %d\n",my_ID);
        error = 1;
      }
      bail_out(error);
    }

    if (bufferCount < (Num_procs - 1)) {
      for(i=0;i<(Num_procs-1);i++)
        send_flag[i]=0;
    }

    for(i=0;i<bufferCount;i++)
      recv_flag[i]=0;
  }
  
  /* Fill the original column matrices                                              */
  istart = 0;  
  for (j=0;j<Block_order;j++) 
    for (i=0;i<order; i++)  {
      A(i,j) = (double) (order*(j+colstart) + i);
      B(i,j) = 0.0;
  }

  shmem_barrier_all();

  if (bufferCount < (Num_procs - 1)) {
    if (Num_procs > 1) {
      for ( i = 0; i < bufferCount; i++) {
        recv_from = (my_ID + i + 1)%Num_procs;
        shmem_int_inc(&send_flag[i], recv_from);
      }
    }
  }

  shmem_barrier_all();

  for (iter = 0; iter<=iterations; iter++){

    /* start timer after a warmup iteration                                        */
    if (iter == 1) { 
      shmem_barrier_all();
      local_trans_time[0] = wtime();
    }

    /* do the local transpose                                                     */
    istart = colstart; 
    if (!tiling) {
      for (i=0; i<Block_order; i++) 
        for (j=0; j<Block_order; j++) {
          B(j,i) += A(i,j);
          A(i,j) += 1.0;
	}
    }
    else {
      for (i=0; i<Block_order; i+=Tile_order) 
        for (j=0; j<Block_order; j+=Tile_order) 
          for (it=i; it<MIN(Block_order,i+Tile_order); it++)
            for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
              B(jt,it) += A(it,jt); 
              A(it,jt) += 1.0;
            }
    }

    for (phase=1; phase<Num_procs; phase++){
      recv_from = (my_ID + phase            )%Num_procs;
      send_to   = (my_ID - phase + Num_procs)%Num_procs;

      targetBuffer = (iter * (Num_procs - 1) + (phase - 1)) % bufferCount;

      istart = send_to*Block_order; 
      if (!tiling) {
        for (i=0; i<Block_order; i++) 
          for (j=0; j<Block_order; j++){
	    Work_out(j,i) = A(i,j);
            A(i,j) += 1.0;
	  }
      }
      else {
        for (i=0; i<Block_order; i+=Tile_order) 
          for (j=0; j<Block_order; j+=Tile_order) 
            for (it=i; it<MIN(Block_order,i+Tile_order); it++)
              for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
                Work_out(jt,it) = A(it,jt); 
                A(it,jt) += 1.0;
	      }
      }

      if (bufferCount < (Num_procs - 1))
        shmem_int_wait_until(&send_flag[phase-1], SHMEM_CMP_EQ, iter+1);

      shmem_double_put(&Work_in_p[targetBuffer][0], &Work_out_p[0], Block_size, send_to);
      shmem_fence();
      shmem_int_inc(&recv_flag[targetBuffer], send_to);

      i = (iter * (Num_procs - 1) + phase) / bufferCount;

      if ((iter * (Num_procs - 1) + phase) % bufferCount)
	i++;

      shmem_int_wait_until(&recv_flag[targetBuffer], SHMEM_CMP_EQ, i);

      istart = recv_from*Block_order; 
      /* scatter received block to transposed matrix; no need to tile */
      for (j=0; j<Block_order; j++)
        for (i=0; i<Block_order; i++) 
          B(i,j) += Work_in(targetBuffer, i,j);

      if (bufferCount < (Num_procs - 1)) {
        if ((phase + bufferCount) < Num_procs)
	  recv_from = (my_ID + phase + bufferCount) % Num_procs;
        else
	  recv_from = (my_ID + phase + bufferCount + 1 - Num_procs) % Num_procs;

        shmem_int_inc(&send_flag[(phase+bufferCount-1)%(Num_procs-1)], recv_from);
      }
    }  /* end of phase loop  */
  } /* end of iterations */

  local_trans_time[0] = wtime() - local_trans_time[0];

  shmem_barrier_all();
  shmem_double_max_to_all(trans_time, local_trans_time, 1, 0, 0, Num_procs, pWrk, pSync_reduce);

  abserr[0] = 0.0;
  istart = 0;
  double addit = ((double)(iterations+1) * (double) (iterations))/2.0;
  for (j=0;j<Block_order;j++) for (i=0;i<order; i++) {
      abserr[0] += ABS(B(i,j) - (double)((order*i + j+colstart)*(iterations+1)+addit));
  }

  shmem_barrier_all();
  shmem_double_sum_to_all(abserr_tot, abserr, 1, 0, 0, Num_procs, pWrk, pSync_reduce);

  if (my_ID == root) {
    if (abserr_tot[0] <= epsilon) {
      printf("Solution validates\n");
      avgtime = trans_time[0]/(double)iterations;
      printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime);
#ifdef VERBOSE
      printf("Summed errors: %f \n", abserr[0]);
#endif
    }
    else {
      printf("ERROR: Aggregate squared error %e exceeds threshold %e\n", abserr[0], epsilon);
      error = 1;
    }
  }

  bail_out(error);

  if (Num_procs>1) 
  {
    if (bufferCount < (Num_procs - 1))
      prk_shmem_free(send_flag);

    prk_shmem_free(recv_flag);
    prk_free(Work_out_p);

    for(i=0;i<bufferCount;i++)
      prk_shmem_free(Work_in_p[i]);

    prk_free(Work_in_p);
  }

  prk_shmem_free(pSync_bcast);
  prk_shmem_free(pSync_reduce);
  prk_shmem_free(pWrk);

  prk_shmem_finalize();
  exit(EXIT_SUCCESS);

}  /* end of main */
Beispiel #7
0
int
main(int argc, char **argv)
{
    int loops=DFLT_LOOPS;
    char *pgm;
    int *Target;
    int *Source;
    int i, me, npes;
    int target_pe;
    long bytes;
    double time_taken=0.0, start_time;

    shmem_init();
    me = shmem_my_pe();
    npes = shmem_n_pes();

    if ((pgm=strrchr(argv[0],'/')))
        pgm++;
    else
        pgm = argv[0];

    while ((i = getopt (argc, argv, "hve:l:st")) != EOF) {
        switch (i)
        {
            case 'v':
                Verbose++;
                break;
            case 'e':
                if ((elements = atoi_scaled(optarg)) <= 0) {
                    fprintf(stderr,"ERR: Bad elements count %d\n",elements);
                    shmem_finalize();
                    return 1;
                }
                break;
            case 'l':
                if ((loops = atoi_scaled(optarg)) <= 0) {
                    fprintf(stderr,"ERR: Bad loop count %d\n",loops);
                    shmem_finalize();
                    return 1;
                }
                break;
            case 's':
                Sync++;
                break;
            case 't':
                Track++;
                break;
            case 'h':
                if (me == 0)
                    usage(pgm);
                return 0;
            default:
                if (me == 0) {
                    fprintf(stderr,"%s: unknown switch '-%c'?\n",pgm,i);
                    usage(pgm);
                }
                shmem_finalize();
                return 1;
        }
    }

    target_pe = (me+1) % npes;

    total_time = (double *) shmem_malloc( npes * sizeof(double) );
    if (!total_time) {
        fprintf(stderr,"ERR: bad total_time shmem_malloc(%ld)\n",
                (elements * sizeof(double)));
        shmem_global_exit(1);
    }

    Source = (int *) shmem_malloc( elements * sizeof(*Source) );
    if (!Source) {
        fprintf(stderr,"ERR: bad Source shmem_malloc(%ld)\n",
                (elements * sizeof(*Target)));
        shmem_free(total_time);
        shmem_global_exit(1);
    }

    Target = (int *) shmem_malloc( elements * sizeof(*Target) );
    if (!Target) {
        fprintf(stderr,"ERR: bad Target shmem_malloc(%ld)\n",
                (elements * sizeof(*Target)));
        shmem_free(Source);
        shmem_free(total_time);
        shmem_global_exit(1);
    }

    for (i = 0; i < elements; i++) {
        Target[i] = -90;
        Source[i] = i + 1;
    }

    bytes = loops * sizeof(int) * elements;

    if (Verbose && me==0)
        fprintf(stderr, "%s: INFO - %d loops, get %d (int) elements from PE+1\n",
                pgm, loops, elements);

    shmem_barrier_all();

    for(i=0; i < loops; i++) {

        start_time = shmemx_wtime();

        shmem_int_get( Target, Source, elements, target_pe );

        time_taken += shmemx_wtime() - start_time;

        if (me==0) {
            if ( Track && i > 0 && ((i % 200) == 0))
                fprintf(stderr,".%d",i);
        }
        if (Sync)
            shmem_barrier_all();
    }

    // collect time per node elapsed time.
    shmem_double_put( &total_time[me], &time_taken, 1, 0 );

    shmem_barrier_all();

    for (i = 0; i < elements; i++) {
        if (Target[i] != i + 1) {
            printf("%d: Error Target[%d] = %d, expected %d\n",
                   me, i, Target[i], i + 1);
            shmem_global_exit(1);
        }
    }

    if ( Track && me == 0 )
        fprintf(stderr,"\n");

    if (Verbose && me == 0) {
        double rate,secs;

        // average time
        for(i=0,secs=0.0; i < npes; i++)
            secs += total_time[i];
        secs /= (double)npes;
        rate = ((double)bytes/(1024.0*1024.0)) / secs;
        printf("%s: ave %5.3f MB/sec (bytes %ld secs %5.3f)\n",
               pgm, rate, bytes, secs);
    }

    shmem_free(total_time);
    shmem_free(Target);
    shmem_free(Source);

    shmem_finalize();

    return 0;
}
Beispiel #8
0
/* exchanges the field  l */
void xchange_field(spinor * const l, const int ieo) {

#  ifdef MPI
  int i,ix, mu, x0, x1, x2, x3, k;

#ifdef _KOJAK_INST
#pragma pomp inst begin(xchangefield)
#endif

  shmem_barrier_all();

  shmem_double_put((double*)(l+T*LX*LY*LZ/2), (double*)l,
                   (LX*LY*LZ*12), g_nb_t_dn);
  shmem_double_put((double*)(l+(T+1)*LX*LY*LZ/2), (double*)(l+(T-1)*LX*LY*LZ/2),
                   (LX*LY*LZ*12), g_nb_t_up);

#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
  k = (T+2)*LX*LY*LZ/2;
  for(x0 = 0; x0 < T; x0++) {
    shmem_double_put((double*)(l + k),
                     (double*)(l + g_lexic2eo[g_ipt[x0][0][0][0]]),
                     12*LZ*LY, g_nb_x_dn);
    k+=LZ*LY;
  }
  k = ((T+2)*LX*LY*LZ + T*LY*LZ)/2;
  for(x0 = 0; x0 < T; x0++) {
    shmem_double_put((double*)(l + k),
                     (double*)(l + g_lexic2eo[g_ipt[x0][LX-1][0][0]]),
                     12*LZ*LY, g_nb_x_up);
    k+=LZ*LY;
  }
#    endif

#    if (defined PARALLELXYT || defined PARALLELXYZT)
  k = ((T+2)*LX*LY*LZ + 2*T*LY*LZ)/2;
  for(x0 = 0; x0 < T; x0++) {
    for(x1 = 0; x1 < LX; x1++) {
      shmem_double_put((double*)(l + k),
                       (double*)(l + g_lexic2eo[g_ipt[x0][x1][0][0]]),
                       12*LZ, g_nb_y_dn);
      k+=LZ;
    }
  }
  k = ((T+2)*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ)/2;
  for(x0 = 0; x0 < T; x0++) {
    for(x1 = 0; x1 < LX; x1++) {
      shmem_double_put((double*)(l + k),
                       (double*)(l + g_lexic2eo[g_ipt[x0][x1][LY-1][0]]),
                       12*LZ, g_nb_y_up);
      k+=LZ;
    }
  }
#    endif

#    if (defined PARALLELXYZT)
  x0 = (VOLUME/2 + LX*LY*LZ + T*LY*LZ +T*LX*LZ);
  if(ieo == 1) {
    for(k = 0; k < T*LX*LY/2; k++) {
      shmem_double_put((double*)(l + x0),
                       (double*)(l + g_field_z_ipt_even[k]),
                       24, g_nb_z_dn);
      x0++;
    }
  }
  else {
    for(k = 0; k < T*LX*LY/2; k++) {
      shmem_double_put((double*)(l + x0),
                       (double*)(l + g_field_z_ipt_odd[k]),
                       24, g_nb_z_dn);
      x0++;
    }
  }
  x0 = (VOLUME/2 + LX*LY*LZ + T*LY*LZ + T*LX*LZ + T*LX*LY/2);
  if(ieo == 1) {
    for(k = T*LX*LY/2; k < T*LX*LY; k++) {
      shmem_double_put((double*)(l + x0),
                       (double*)(l + g_field_z_ipt_even[k]),
                       24, g_nb_z_up);
      x0++;
    }
  }
  else {
    for(k = T*LX*LY/2; k < T*LX*LY; k++) {
      shmem_double_put((double*)(l + x0),
                       (double*)(l + g_field_z_ipt_even[k]),
                       24, g_nb_z_up);
      x0++;
    }
  }
#    endif

  shmem_barrier_all();
#  endif // MPI
  return;
#ifdef _KOJAK_INST
#pragma pomp inst end(xchangefield)
#endif
}