コード例 #1
0
ファイル: comm_qmp.cpp プロジェクト: ckallidonis/quda
void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
{
  if ( QMP_is_initialized() != QMP_TRUE ) {
    errorQuda("QMP has not been initialized");
  }

  int grid_size = 1;
  for (int i = 0; i < ndim; i++) {
    grid_size *= dims[i];
  }
  if (grid_size != QMP_get_number_of_nodes()) {
    errorQuda("Communication grid size declared via initCommsGridQuda() does not match"
              " total number of QMP nodes (%d != %d)", grid_size, QMP_get_number_of_nodes());
  }

  Topology *topo = comm_create_topology(ndim, dims, rank_from_coords, map_data);
  comm_set_default_topology(topo);

  // determine which GPU this process will use (FIXME: adopt the scheme in comm_mpi.cpp)

  int device_count;
  cudaGetDeviceCount(&device_count);
  if (device_count == 0) {
    errorQuda("No CUDA devices found");
  }

  gpuid = (comm_rank() % device_count);
}
コード例 #2
0
ファイル: su3_test.c プロジェクト: adenbley/quda
/*
 * Test driver: optionally brings up QMP (when built with QMP_COMMS), runs
 * SU3Test(), then shuts QMP down again.
 *
 * With QMP_COMMS the logical topology is declared as 1 x 1 x 1 x N, where N
 * is the number of QMP nodes.
 *
 * Returns 0 on success and 1 when QMP could not be initialized or the
 * logical topology could not be declared.
 */
int main(int argc, char **argv) {

#ifdef QMP_COMMS
  int ndim=4, dims[4];
  QMP_thread_level_t tl;

  /* Running the test on top of a failed initialization is meaningless, so
     bail out.  No message is printed here because it is unclear whether the
     QMP reporting helpers are usable before initialization succeeds. */
  if (QMP_init_msg_passing(&argc, &argv, QMP_THREAD_SINGLE, &tl) != QMP_SUCCESS) {
    return 1;
  }

  dims[0] = dims[1] = dims[2] = 1;
  dims[3] = QMP_get_number_of_nodes();
  if (QMP_declare_logical_topology(dims, ndim) != QMP_SUCCESS) {
    QMP_error("Failed to declare logical topology\n");
    QMP_finalize_msg_passing();
    return 1;
  }
#endif

  SU3Test();

#ifdef QMP_COMMS
  QMP_finalize_msg_passing();
#endif

  return 0;
}
コード例 #3
0
ファイル: QMP_broadcast.c プロジェクト: 6twirl9/qmp
/*
 * Naive broadcast: the primary node sends the buffer to every other node,
 * one node at a time, and each non-primary node receives it from node 0.
 *
 * send_buf  buffer that is sent by the primary node and overwritten on the
 *           receiving nodes
 * count     size passed to QMP_declare_msgmem() for send_buf
 *
 * Aborts through QMP_abort_string() when starting or completing a transfer
 * fails.
 */
void
stupid_broadcast(void *send_buf, int count)
{
  int node;
  /* These do not change during the call, so query them once. */
  const int num_nodes = QMP_get_number_of_nodes();
  const int this_node = QMP_get_node_number();
  const int is_primary = (QMP_is_primary_node() != 0);
  QMP_msgmem_t request_msg = QMP_declare_msgmem(send_buf, count);
  QMP_msghandle_t request_mh;

  /* Send to each node in turn */
  for(node=1; node < num_nodes; ++node)
  {
    if (this_node == node)
    {
      /* It is this node's turn: receive the buffer from node 0 */
      request_mh = QMP_declare_receive_from(request_msg, 0, 0);

      if (QMP_start(request_mh) != QMP_SUCCESS)
        QMP_abort_string(1, "recvFromWait failed\n");

      /* A failed wait means the buffer contents cannot be trusted */
      if (QMP_wait(request_mh) != QMP_SUCCESS)
        QMP_abort_string(1, "broadcast receive wait failed\n");

      QMP_free_msghandle(request_mh);
    }

    if (is_primary)
    {
      request_mh = QMP_declare_send_to(request_msg, node, 0);

      if (QMP_start(request_mh) != QMP_SUCCESS)
        QMP_abort_string(1, "sendToWait failed\n");

      if (QMP_wait(request_mh) != QMP_SUCCESS)
        QMP_abort_string(1, "broadcast send wait failed\n");

      QMP_free_msghandle(request_mh);
    }
  }

  QMP_free_msgmem(request_msg);
}
コード例 #4
0
ファイル: comm_qmp.cpp プロジェクト: ckallidonis/quda
/**
 * Number of processes taking part in the communication layer.
 *
 * @return the value reported by QMP_get_number_of_nodes()
 */
int comm_size(void)
{
  const int node_count = QMP_get_number_of_nodes();
  return node_count;
}
コード例 #5
0
ファイル: interface_quda.cpp プロジェクト: witzel/quda
void initQuda(int dev)
{
  static int initialized = 0;
  if (initialized) {
    return;
  }
  initialized = 1;

#if (CUDA_VERSION >= 4000) && defined(MULTI_GPU)
  //check if CUDA_NIC_INTEROP is set to 1 in the enviroment
  char* cni_str = getenv("CUDA_NIC_INTEROP");
  if(cni_str == NULL){
    errorQuda("Environment variable CUDA_NIC_INTEROP is not set\n");
  }
  int cni_int = atoi(cni_str);
  if (cni_int != 1){
    errorQuda("Environment variable CUDA_NIC_INTEROP is not set to 1\n");    
  }
#endif

  int deviceCount;
  cudaGetDeviceCount(&deviceCount);
  if (deviceCount == 0) {
    errorQuda("No devices supporting CUDA");
  }

  for(int i=0; i<deviceCount; i++) {
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, i);
    printfQuda("QUDA: Found device %d: %s\n", i, deviceProp.name);
  }

#ifdef QMP_COMMS
  int ndim;
  const int *dim;

  if ( QMP_is_initialized() != QMP_TRUE ) {
    errorQuda("QMP is not initialized");
  }
  num_QMP=QMP_get_number_of_nodes();
  rank_QMP=QMP_get_node_number();
  
  dev += rank_QMP % deviceCount;
  ndim = QMP_get_logical_number_of_dimensions();
  dim = QMP_get_logical_dimensions();

#elif defined(MPI_COMMS)

  comm_init();
  dev=comm_gpuid();

#else
  if (dev < 0) dev = deviceCount - 1;
#endif
  
  // Used for applying the gauge field boundary condition
  if( commCoords(3) == 0 ) qudaPt0=true;
  else qudaPt0=false;

  if( commCoords(3) == commDim(3)-1 ) qudaPtNm1=true;
  else qudaPtNm1=false;

  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, dev);
  if (deviceProp.major < 1) {
    errorQuda("Device %d does not support CUDA", dev);
  }

  
  printfQuda("QUDA: Using device %d: %s\n", dev, deviceProp.name);

  cudaSetDevice(dev);
#ifdef HAVE_NUMA
  if(numa_config_set){
    if(gpu_affinity[dev] >=0){
      printfQuda("Numa setting to cpu node %d\n", gpu_affinity[dev]);
      if(numa_run_on_node(gpu_affinity[dev]) != 0){
        printfQuda("Warning: Setting numa to cpu node %d failed\n", gpu_affinity[dev]);
      }
    }

  }
#endif

  initCache();
  quda::initBlas();
}
コード例 #6
0
/*
 * Round-trip test of the QIO writer and reader.
 *
 * Starts QMP, sets up an 8x4x4x4 lattice layout, writes a test file
 * ("binary_test") and reads it back, then compares what was read with what
 * was written.  Unless ildgstyle selects the ILDG test (i.e. when
 * ildgstyle == QIO_ILDGNO) the file holds a real field, a subset of that
 * field and a real global array in addition to the suN field; otherwise only
 * the suN field is written and read.
 *
 * output_volfmt, output_serpar, ildgstyle  forwarded to open_test_output()
 * input_volfmt, input_serpar               forwarded to open_test_input()
 * argc, argv                               forwarded to QMP_init_msg_passing()
 *
 * Returns 0 when every comparison gives a zero difference, 1 when a
 * difference is found or a file cannot be opened / the layout cannot be
 * set up, and otherwise the non-zero status of the helper that failed.
 *
 * NOTE(review): the early error returns leave QMP running and do not free
 * the fields created so far.
 */
int qio_test(int output_volfmt, int output_serpar, int ildgstyle, 
             int input_volfmt, int input_serpar, int argc, char *argv[]){

  float array_in[NARRAY], array_out[NARRAY];
  float *field_in[NREAL], *subset_in[NREAL], 
    *field_out[NREAL], *subset_out[NREAL];
  suN_matrix *field_su3_out[NMATRIX], *field_su3_in[NMATRIX];
  QIO_Writer *outfile;
  QIO_Reader *infile;
  float diff_field = 0, diff_array = 0, diff_su3 = 0, diff_subset = 0;
  QMP_thread_level_t provided;
  int status;
  int sites_on_node = 0;
  int i,volume;
  char filename[] = "binary_test";
  int dim = 4;
  /* Corners of the hypercube used for the subset write/compare */
  int lower[4] = {1, 0, 0, 2};
  int upper[4] = {2, 3, 3, 2};
  char myname[] = "qio_test";
  
  /* Start message passing */
  QMP_init_msg_passing(&argc, &argv, QMP_THREAD_SINGLE, &provided);

  this_node = mynode();
  printf("%s(%d) QMP_init_msg_passing done\n",myname,this_node);

  /* Lattice dimensions */
  lattice_dim = 4;
  lattice_size[0] = 8;
  lattice_size[1] = 4;
  lattice_size[2] = 4;
  lattice_size[3] = 4;

  volume = 1;
  for(i = 0; i < lattice_dim; i++){
    volume *= lattice_size[i];
  }

  /* Set the mapping of coordinates to nodes */
  if(setup_layout(lattice_size, 4, QMP_get_number_of_nodes())!=0)
    return 1;
  printf("%s(%d) layout set for %d nodes\n",myname,this_node,
         QMP_get_number_of_nodes());
  sites_on_node = num_sites(this_node);

  /* Build the layout structure */
  layout.node_number     = node_number;
  layout.node_index      = node_index;
  layout.get_coords      = get_coords;
  layout.num_sites       = num_sites;
  layout.latsize         = lattice_size;
  layout.latdim          = lattice_dim;
  layout.volume          = volume;
  layout.sites_on_node   = sites_on_node;
  layout.this_node       = this_node;
  layout.number_of_nodes = QMP_get_number_of_nodes();

  /* Open the test output file */
  outfile = open_test_output(filename, output_volfmt, output_serpar, 
                             ildgstyle, myname);
  if(outfile == NULL)return 1;

  /* If this is not the ILDG file test */
  if(ildgstyle == QIO_ILDGNO){
    /* Create the test output field */
    status = vcreate_R(field_out, NREAL);
    if(status)return status;
    
    /* Set some values for the field */
    vset_R(field_out, NREAL);
    
    /* Write the real test field */
    status = write_real_field(outfile, NREAL, field_out, myname);
    if(status)return status;
    
    /* Write a subset of the real test field */
    status = write_real_field_subset(outfile, NREAL, field_out, 
                                     lower, upper, dim, myname);
    if(status)return status;
    
    /* Set some values for the global array */
    for(i = 0; i < NARRAY; i++)
      array_out[i] = i;
    
    /* Write the real global array */
    status = write_real_global(outfile, NARRAY, array_out, myname);
    if(status)return status;
  }

  /* Create the test output su3 field */
  status = vcreate_M(field_su3_out, NMATRIX);
  if(status)return status;

  /* Set some values for the su3 field */
  vset_M(field_su3_out, NMATRIX);

  /* Write the su3 test field */
  status = write_su3_field(outfile, NMATRIX, field_su3_out, myname);
  if(status)return status;

  /* Close the file */
  QIO_close_write(outfile);
  printf("%s(%d): Closed file for writing\n",myname,this_node);

  /* Set up a dummy input field */
  status = vcreate_R(field_in, NREAL);
  if(status)return status;
    
  /* Set up a dummy input field for subset */
  status = vcreate_R(subset_in, NREAL);
  if(status)return status;
    
  /* Set up a dummy input SU(N) field */
  status = vcreate_M(field_su3_in, NMATRIX);
  if(status)return status;

  /* Open the test file for reading */
  infile = open_test_input(filename, input_volfmt, input_serpar, myname);
  if(infile == NULL)return 1;

  if(ildgstyle == QIO_ILDGNO){
    /* Peek at the field record */
    status = peek_record_info(infile, myname);
    if(status != QIO_SUCCESS)return status;

    /* The disabled branch skips the record instead of reading it */
#if(0)
    
    /* Skip the field */
    status = QIO_next_record(infile);
    if(status != QIO_SUCCESS)return status;
    
#else
    
    /* Read the field record */
    printf("%s(%d) reading real field\n",myname,this_node); fflush(stdout);
    status = read_real_field(infile, NREAL, field_in, myname);
    if(status)return status;
    
#endif

    /* Read the subset of the field */
    printf("%s(%d) reading subset of real field\n",
           myname,this_node); fflush(stdout);
    status = read_real_field_subset(infile, NREAL, subset_in, myname);
    if(status)return status;
    
    /* Read the global array record */
    printf("%s(%d) reading global field\n",myname,this_node); fflush(stdout);
    status = read_real_global(infile, NARRAY, array_in, myname);
    if(status)return status;

  }    

  /* Read the su3 field record */
  printf("%s(%d) reading su3 field\n",myname,this_node); fflush(stdout);
  status = read_su3_field(infile, NMATRIX, field_su3_in, myname);
  if(status)return status;

  /* Close the file */
  QIO_close_read(infile);
  printf("%s(%d): Closed file for reading\n",myname,this_node);

  if(ildgstyle == QIO_ILDGNO){

    /* Compare the input and output fields */
    diff_field = vcompare_R(field_out, field_in, NREAL);
    if(this_node == 0){
      printf("%s(%d): Comparison of in and out real fields |in - out|^2 = %e\n",
             myname,this_node,diff_field);
    }
    
    /* Create the subset output field */
    status = vcreate_R(subset_out, NREAL);
    if(status)return status;

    /* Copy the subset */
    vsubset_R(subset_out, field_out, lower, upper, NREAL);
    
    /* Compare the input and output subsets */
    diff_subset = vcompare_R(subset_out, subset_in, NREAL);
    if(this_node == 0){
      printf("%s(%d): Comparison of subsets of in and out real fields |in - out|^2 = %e\n",
             myname,this_node,diff_subset);
    }
    
    /* Compare the input and output global arrays.  Both arrays are declared,
       written and read with NARRAY elements, so compare all NARRAY of them. */
    diff_array = vcompare_r(array_out, array_in, NARRAY);
    if(this_node == 0){
      printf("%s(%d): Comparison of in and out real global arrays |in - out|^2 = %e\n",
             myname, this_node, diff_array);
    }
  }

  /* Compare the input and output suN fields */
  diff_su3 = vcompare_M(field_su3_out, field_su3_in, NMATRIX);
  if(this_node == 0){
    /* Report the suN difference itself, not the real-field difference */
    printf("%s(%d): Comparison of in and out suN fields |in - out|^2 = %e\n",
           myname, this_node, diff_su3);
  }

  /* Clean up */
  if(ildgstyle == QIO_ILDGNO){
    vdestroy_R(field_out, NREAL);
    vdestroy_R(field_in, NREAL);
    vdestroy_R(subset_in, NREAL);
    vdestroy_R(subset_out, NREAL);
  }
  vdestroy_M(field_su3_in, NMATRIX);
  vdestroy_M(field_su3_out, NMATRIX);

  /* Shut down QMP */
  QMP_finalize_msg_passing();

  /* Report result */
  if(diff_field + diff_subset + diff_su3 + diff_array > 0){
    printf("%s(%d): Test failed\n",myname,this_node);
    return 1;
  }
  printf("%s(%d): Test passed\n",myname,this_node);

  return 0;
}
コード例 #7
0
ファイル: qlayout.c プロジェクト: usqcd-software/qlua
/*
 * Layout setup callback: attaches per-lattice parameters to `lat`, fills in
 * the node grid S->net[] and records this node's neighbours in every
 * direction.
 *
 *   lat   lattice that receives the parameter block
 *   args  pointer to the mLattice describing the lattice (rank, dim, net,
 *         node, neighbor_up, neighbor_down are accessed here)
 *
 * The node grid is taken from the dimensions reported by QMP unless the
 * message passing type is QMP_SWITCH, in which case it is built by
 * factoring the number of nodes into primes and assigning each factor to a
 * lattice direction.
 */
static void
eo_setup(QDP_Lattice *lat, void *args)
{
    mLattice *S = args;
    QDP_allocate_lattice_params(lat, sizeof (params));
    params *p = QDP_get_lattice_params(lat);

    p->S = S;

    if (QMP_get_msg_passing_type() != QMP_SWITCH) {
        /* Use the machine dimensions reported by QMP; directions beyond
           those reported get a single node. */
        int nd2 = QMP_get_allocated_number_of_dimensions();
        const int *nsquares2 = QMP_get_allocated_dimensions();
        int i;

        for (i = 0; i < S->rank; i++) {
            S->net[i] = (i < nd2) ? nsquares2[i] : 1;
        }
    } else { /* QMP_SWITCH: build the node grid ourselves */
        /* squaresize[i]   : lattice extent left per node in direction i
           extrafactors[i] : factors assigned to direction i that did not
                             divide squaresize[i] */
        int squaresize[QLUA_MAX_LATTICE_RANK];
        int extrafactors[QLUA_MAX_LATTICE_RANK];
        int i;
        for (i = 0; i < S->rank; i++) {
            squaresize[i] = S->dim[i];
            extrafactors[i] = 1;
            S->net[i] = 1;
        }

        /* Figure out dimensions of rectangle */
        int n = QMP_get_number_of_nodes();   /* nodes to factor */
        int k = MAXPRIMES-1;
        while (n > 1) {
            /* figure out which prime to divide by starting with largest */
            /* if no factor found, assume n is prime */
            while ((k >= 0) && (n % prime[k] != 0)) --k;
            int pfac = (k>=0) ? prime[k] : n;

            /* figure out which direction to divide */
            /* find largest divisible dimension of h-cubes */
            /* if one direction with largest dimension has already been
               divided, divide it again.  Otherwise divide first direction
               with largest dimension. */
            /* The comparisons below cross-multiply squaresize by the other
               direction's extrafactors instead of dividing. */
            int j = -1;
            int i;
            for (i = 0; i < S->rank; i++) {
                if (squaresize[i] % pfac == 0) {
                    if ((j<0) ||
                        (extrafactors[j] * squaresize[i] > 
                         extrafactors[i] * squaresize[j])) {
                        j = i;
                    } else if (extrafactors[j] * squaresize[i] == 
                               extrafactors[i] * squaresize[j]) {
                        /* tie: switch to direction i unless j has already
                           been divided and i has not */
                        if ((S->net[j] == 1) || (S->net[i] != 1))
                            j = i;
                    }
                }
            }

            /* This can fail if we run out of prime factors in the dimensions */
            /* then just choose largest dimension */
            if (j < 0) {
                /* same selection as above, without the divisibility test */
                int i;
                for (i = 0; i < S->rank; i++) {
                    if ((j<0) ||
                        (extrafactors[j] * squaresize[i] >
                         extrafactors[i] * squaresize[j]) ) {
                        j = i;
                    } else if (extrafactors[j] * squaresize[i] ==
                               extrafactors[i] * squaresize[j]) {
                        if((S->net[j] == 1) || (S->net[i] != 1))
                            j = i;
                    }
                }
                /* pfac does not divide squaresize[j]: record it as an
                   extra factor and leave squaresize[j] unchanged */
                n /= pfac;
                extrafactors[j] *= pfac;
                S->net[j] *= pfac;
            } else {
                /* pfac divides squaresize[j]: split that direction */
                n /= pfac;
                squaresize[j] /= pfac;
                S->net[j] *= pfac;
            }
        }
    } /* end of QMP_SWITCH branch */

    /* mc holds this node's coordinates in the node grid */
    int mc[QLUA_MAX_LATTICE_RANK];
    int i;

    S->node = QDP_this_node;
    node2coord(mc, QDP_this_node, S);

    /* For each direction, step one node up and one node down with
       wrap-around, record the neighbour, then restore the coordinate. */
    for (i = 0; i < S->rank; i++) {
        int x = mc[i];

        mc[i] = x + 1;
        if (mc[i] == S->net[i])
            mc[i] = 0;
        S->neighbor_up[i] = coord2node(mc, S);

        mc[i] = x - 1;
        if (mc[i] < 0)
            mc[i] = S->net[i] - 1;
        S->neighbor_down[i] = coord2node(mc, S);

        mc[i] = x;
    }
}
コード例 #8
0
ファイル: sysfunc.C プロジェクト: DeanHowarth/QUDA-CPS
// Initialize QMP message passing and fill in the file-level machine
// description (peRank, peNum, NDIM, peGrid, pePos), then declare the logical
// topology and mark the layer as initialized.
//
//   argc, argv  pointers to the program arguments; forwarded to
//               QMP_init_msg_passing()
//
// When UNIFORM_SEED_NO_COMMS is defined no QMP call is made and a single
// node with NDIM = 4 is assumed instead.
void init_qmp(int * argc, char ***argv) {

#if 0
  printf("init_qmp(%d %p)\n",*argc,*argv);
  for(int i = 0; i<*argc;i++){
    printf("argv[%d](before)=%s\n",i,(*argv)[i]); 
  }
#endif

#if 0
   spi_init();
#endif
  
    QMP_thread_level_t prv;
#ifndef UNIFORM_SEED_NO_COMMS
    QMP_status_t init_status = QMP_init_msg_passing(argc, argv, QMP_THREAD_SINGLE, &prv);
    // NOTE(review): the status is printed here on failure and again by rank 0
    // below, and the rank/size queries run before the status is checked.
    if (init_status) printf("QMP_init_msg_passing returned %d\n",init_status);
    peRank = QMP_get_node_number();
    peNum = QMP_get_number_of_nodes();
    if(!peRank)printf("QMP_init_msg_passing returned %d\n",init_status);

    if (init_status != QMP_SUCCESS) {
      QMP_error("%s\n",QMP_error_string(init_status));
    }

    // check QMP thread level
    // Added by Hantao
    // Only rank 0 reports the thread level that QMP provided.
    if(peRank == 0) {
        switch(prv) {
        case QMP_THREAD_SINGLE:
            printf("QMP thread level = QMP_THREAD_SINGLE\n");
            break;
        case QMP_THREAD_FUNNELED:
            printf("QMP thread level = QMP_THREAD_FUNNELED\n");
            break;
        case QMP_THREAD_SERIALIZED:
            printf("QMP thread level = QMP_THREAD_SERIALIZED\n");
            break;
        case QMP_THREAD_MULTIPLE:
            printf("QMP thread level = QMP_THREAD_MULTIPLE\n");
            break;
        default:
            printf("QMP thread level = no idea what this is, boom!\n");
        }
    }

    //Check to make sure that this machine is a GRID machine
    //Exit if not GRID machine
    // NOTE(review): the check itself is disabled (#if 0 further down), so
    // qmp_type is currently only read inside disabled code.
    QMP_ictype qmp_type = QMP_get_msg_passing_type();

    //Get information about the allocated machine
    peNum = QMP_get_number_of_nodes();
    NDIM = QMP_get_allocated_number_of_dimensions();
    peGrid = QMP_get_allocated_dimensions();
    pePos = QMP_get_allocated_coordinates();

    if(peRank==0){
      for(int i = 0; i<*argc;i++){
        printf("argv[%d])(after)=%s\n",i,(*argv)[i]); 
      }
    }
#else
    // No-communication build: pretend to be a single node.
    // NOTE(review): peGrid and pePos are not assigned in this branch but are
    // read below - confirm they are set up elsewhere.
    QMP_status_t init_status = QMP_SUCCESS;
    peRank=0;
    peNum=1;
    NDIM=4;
#endif

//#if (TARGET == BGL) || (TARGET == BGP)
  // With more than five machine dimensions, only the first five count:
  // peNum becomes the product of their extents and peRank is reduced
  // modulo that product.
  if (NDIM>5){
    peNum = 1;
    for(int i = 0;i<5;i++)
	peNum *= peGrid[i];
    peRank = peRank % peNum;
  }
  // Print the summary only on nodes whose coordinates are all below 2.
  int if_print=1;
  for(int i = 0;i<NDIM;i++)
  if (pePos[i]>=2) if_print=0;

  if (if_print){
      printf("Rank=%d Num=%d NDIM=%d\n",peRank,peNum,NDIM);
      printf("dim:");
      for(int i = 0;i<NDIM;i++)
        printf(" %d",peGrid[i]);
      printf("\n");
      printf("pos:");
      for(int i = 0;i<NDIM;i++)
        printf(" %d",pePos[i]);
      printf("\n");

#if 0
    int rc;
    BGLPersonality pers;
    rts_get_personality(&pers, sizeof(pers));
    printf("from personality: %d %d %d %d\n",pers.xCoord,pers.yCoord,pers.zCoord,rts_get_processor_id());
#endif
  }


//     printf("from personality:\n");

#if 0
    if ( (qmp_type!= QMP_GRID) && (qmp_type !=QMP_MESH)  ) {
      QMP_error("CPS on QMP only implemented for GRID or MESH, not (%d) machines\n",qmp_type);
    }
#endif

//     printf("QMP_declare_logical_topology(peGrid, NDIM)\n");
#ifndef UNIFORM_SEED_NO_COMMS
    //Declare the logical topology (Redundant for GRID machines)
    if (QMP_declare_logical_topology(peGrid, NDIM) != QMP_SUCCESS) {
      QMP_error("Node %d: Failed to declare logical topology\n",peRank);
      exit(-4);
    }
#endif
    // Record that the communication layer is ready (file-level flag).
    initialized = true;
  printf("Rank=%d init_qmp() done\n",peRank);
    
  }
コード例 #9
0
ファイル: QMP_perf.c プロジェクト: 6twirl9/qmp
/*
 * QMP point-to-point performance test driver.
 *
 * Initializes QMP, parses the command line into a perf_argv structure,
 * declares a logical grid built from the number of nodes (which must be a
 * power of two), broadcasts the parameters, and then runs the selected
 * tests (simultaneous, ping-pong, one-way) once for every message size
 * from pargv.minsize to pargv.maxsize, multiplying the size by
 * pargv.facsize after each pass.
 *
 * Returns 0 on success and -1 when QMP initialization or the topology
 * declaration fails; other failures terminate through exit() or
 * QMP_abort().
 *
 * NOTE(review): the size loops do not terminate when pargv.facsize <= 1 or
 * pargv.minsize <= 0 - presumably parse_options() rules this out; confirm.
 */
int
main (int argc, char** argv)
{
  int             i, nc;
  QMP_status_t      status;
  int       **smem, **rmem;
  QMP_msgmem_t    *recvmem;
  QMP_msghandle_t *recvh;
  QMP_msgmem_t    *sendmem;
  QMP_msghandle_t *sendh;
  struct perf_argv pargv;
  QMP_thread_level_t req, prv;

  /** 
   * Simple point to point topology 
   */
  int dims[4] = {2,2,2,2};
  int ndims = 1;

  req = QMP_THREAD_SINGLE;
  status = QMP_init_msg_passing (&argc, &argv, req, &prv);
  if (status != QMP_SUCCESS) {
    fprintf (stderr, "QMP_init failed\n");
    return -1;
  }
  if(QMP_get_node_number()==0)
    printf("finished init\n");
  fflush(stdout);

  if (parse_options (argc, argv, &pargv) == -1) {
    if(QMP_get_node_number()==0)
      usage (argv[0]);
    exit (1);
  }

  {
    /* Split the node count into factors of two: the first maxdims factors
       each open a new dimension of extent 2, any further factor doubles
       the existing dimensions in round-robin order. */
    int maxdims = 4;
    int k=0;
    int nodes = QMP_get_number_of_nodes();
    ndims = 0;
    while( (nodes&1) == 0 ) {
      if(ndims<maxdims) ndims++;
      else {
        dims[k] *= 2;
        k++;
        if(k>=maxdims) k = 0;
      }
      nodes /= 2;
    }
    if(nodes != 1) {
      QMP_error("invalid number of nodes %i", QMP_get_number_of_nodes());
      QMP_error(" must power of 2");
      QMP_abort(1);
    }
    pargv.ndims = ndims;
  }

  status = QMP_declare_logical_topology (dims, ndims);
  if (status != QMP_SUCCESS) {
    fprintf (stderr, "Cannot declare logical grid\n");
    return -1;
  }

  /* do a broadcast of parameter */
  if (QMP_broadcast (&pargv, sizeof (pargv)) != QMP_SUCCESS) {
    QMP_printf ("Broadcast parameter failed\n");
    exit (1);
  }

  {
    /* Sender/receiver role from the parity of (1 + sum of the logical
       coordinates). */
    int k=1;
    const int *lc = QMP_get_logical_coordinates();
    for(i=0; i<ndims; i++) k += lc[i];
    pargv.sender = k&1;
  }

  QMP_printf("%s options: num_channels[%d] verify[%d] option[%d] datasize[%d] numloops[%d] sender[%d] strided_send[%i] strided_recv[%i] strided_array_send[%i] ",
             argv[0], pargv.num_channels, pargv.verify, 
             pargv.option, pargv.size, pargv.loops, pargv.sender,
             strided_send, strided_recv, strided_array_send);
  fflush(stdout);


  /**
   * Create memory
   */
  nc = pargv.num_channels;
  smem = (int **)malloc(nc*sizeof (int *));
  rmem = (int **)malloc(nc*sizeof (int *));
  sendmem = (QMP_msgmem_t *)malloc(ndims*nc*sizeof (QMP_msgmem_t));
  recvmem = (QMP_msgmem_t *)malloc(ndims*nc*sizeof (QMP_msgmem_t));
  sendh = (QMP_msghandle_t *)malloc(nc*sizeof (QMP_msghandle_t));
  recvh = (QMP_msghandle_t *)malloc(nc*sizeof (QMP_msghandle_t));

  {
    /* malloc may legitimately return NULL for a zero-byte request, so a
       NULL result is only an error when a non-zero size was asked for. */
    int per_channel = (nc != 0);
    int per_dim_channel = (nc != 0) && (ndims != 0);
    if ((per_channel &&
         (smem == NULL || rmem == NULL || sendh == NULL || recvh == NULL)) ||
        (per_dim_channel && (sendmem == NULL || recvmem == NULL))) {
      fprintf (stderr, "Cannot allocate message buffers\n");
      QMP_abort (1);
    }
  }

  QMP_barrier();
  if(QMP_get_node_number()==0)
    printf("\n");
  fflush(stdout);

  if(pargv.option & TEST_SIMUL) {
    /* run with only this test selected, then restore the option mask */
    int opts = pargv.option;
    pargv.option = TEST_SIMUL;
    if(QMP_get_node_number()==0)
      QMP_printf("starting simultaneous sends");
    fflush(stdout);
    for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) {
      pargv.size = i;
      create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv);
      test_simultaneous_send (smem, rmem, sendh, recvh, &pargv);
      check_mem(rmem, ndims, nc, i);
      free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc);
    }
    if(QMP_get_node_number()==0)
      QMP_printf("finished simultaneous sends\n");
    fflush(stdout);
    pargv.option = opts;
  }

  if(pargv.option & TEST_PINGPONG) {
    int opts = pargv.option;
    pargv.option = TEST_PINGPONG;
    if(QMP_get_node_number()==0)
      QMP_printf("starting ping pong sends");
    fflush(stdout);
    for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) {
      pargv.size = i;
      create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv);
      if(pargv.verify)
        test_pingpong_verify(smem, rmem, sendh, recvh, &pargv);
      else
        test_pingpong(smem, rmem, sendh, recvh, &pargv);
      check_mem(rmem, ndims, nc, i);
      free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc);
    }
    if(QMP_get_node_number()==0)
      QMP_printf("finished ping pong sends\n");
    fflush(stdout);
    pargv.option = opts;
  }

  if(pargv.option & TEST_ONEWAY) {
    int opts = pargv.option;
    pargv.option = TEST_ONEWAY;
    if(QMP_get_node_number()==0)
      QMP_printf("starting one way sends");
    fflush(stdout);
    for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) {
      pargv.size = i;
      create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv);
      test_oneway (smem, rmem, sendh, recvh, &pargv);
      /* only the receiving side has data to verify */
      if(!pargv.sender) check_mem(rmem, ndims, nc, i);
      free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc);
    }
    if(QMP_get_node_number()==0)
      QMP_printf("finished one way sends");
    fflush(stdout);
    pargv.option = opts;
  }


  /**
   * Free memory 
   */
  free (smem);
  free (rmem);

  free (sendh);
  free (recvh);

  free (sendmem);
  free (recvmem);

  QMP_finalize_msg_passing ();

  return 0;
}