// This is not needed as it can be done transitively:
  // ie lookup the QDP index and then lookup the coord with that 
  //
  // mySiteCoords3D: writes four coordinates into gcoords[] for the site
  // index `linearsite`.  Dimension 0 is treated as checkerboarded: its
  // extent is halved before decoding and its low bit is restored from a
  // parity sum at the end.
  //   gcoords    - output; elements 0..3 are overwritten
  //   node       - not referenced in the visible body
  //   linearsite - site index; reduced modulo total_vol_3d_cb before decoding
  // NOTE(review): brace-only lines (and possibly whole statements) are
  // missing from this excerpt, so loop extents below are inferred from
  // indentation only.
  static void mySiteCoords3D(int gcoords[], int node, int linearsite)
    int mu;
    int subgrid_cb_nrow[4];
    int tmp_coord[4];
    int cb3,cbb3;
    /* my_node is assigned here but not used in the visible lines. */
    int my_node = QMP_get_node_number();

    /* Start from the sizes returned by getLattSize(), then halve dim 0. */
    for(mu=0; mu < 4; mu++) { 
      subgrid_cb_nrow[mu] = getLattSize()[mu];
    subgrid_cb_nrow[0] /=2;  /* Checkerboarding */

    /* Single processor -- all coords 0 */
    for(mu=0; mu < 4; mu++) { 
      gcoords[mu] = 0;

    /* Decode the index (mod total_vol_3d_cb) into tmp_coord using the
       halved extents. */
    crtesn3d(linearsite % total_vol_3d_cb, subgrid_cb_nrow, tmp_coord);

    // Add on position within the node
    // NOTE: the cb for the x-coord is not yet determined
    gcoords[0] += 2*tmp_coord[0];
    for(mu=1; mu < 4; ++mu) {
      gcoords[mu] += tmp_coord[mu];

    /* NOTE(review): cb3 is read here but never assigned in the visible
       lines -- presumably the statement computing it was lost; confirm. */
    cbb3 = cb3;
    /* Parity sum covers mu = 1 and 2 only (mu < 3). */
    for(mu=1; mu < 3; ++mu) {
      cbb3 += gcoords[mu];
    /* The low bit of the parity sum selects the odd/even x position. */
    gcoords[0] += (cbb3 & 1);
/*
 * Minimal QMP program: initialise message passing, query this process's
 * node number, then shut QMP down and return 0.
 * NOTE(review): brace-only lines and the statement under "Print the
 * result" are missing from this excerpt.
 */
main (int argc, char** argv)
  QMP_status_t status;
  int this_node;
  /* NOTE(review): req is passed by value to QMP_init_msg_passing below
     without any visible initialisation -- confirm a value is assigned. */
  QMP_thread_level_t req, prv;

  /* Start QMP */
  status = QMP_init_msg_passing (&argc, &argv, req, &prv);

  /* Report the failure text; no early return is visible after this. */
  if (status != QMP_SUCCESS) {
    QMP_error ("QMP_init failed: %s\n", QMP_error_string(status));

  /* Get my logical node number */
  this_node = QMP_get_node_number();

  /* Print the result */

  /* Quit */
  QMP_finalize_msg_passing ();

  return 0;
/* Example #3 */
/*
 * Times nloops rounds in which every channel's receive and send handle is
 * started and then waited on, and reports a bandwidth figure on node 0.
 *   smem, rmem - not referenced in the visible body
 *   sendh      - send handles, indexed 0 .. num_channels-1
 *   recvh      - receive handles, indexed 0 .. num_channels-1
 *   pargv      - supplies num_channels, loops, size and ndims
 * NOTE(review): brace-only lines are missing from this excerpt; the j loops
 * are presumably nested inside the i loop -- confirm against the original.
 */
static void
test_simultaneous_send (int** smem,
			int** rmem,
			QMP_msghandle_t* sendh,
			QMP_msghandle_t* recvh,
			struct perf_argv* pargv)
  double it, ft, dt, bwval;
  int    i, j;
  QMP_status_t err;
  int nc, ndims;
  int nloops;
  int dsize;

  nc = pargv->num_channels;
  nloops = pargv->loops;
  dsize = pargv->size;
  ndims = pargv->ndims;

  /* do a test for nloops */
  it = get_current_time ();
  for (i = 0; i < nloops; i++) {
    /* Receives are started before the sends. */
    for (j = 0; j < nc; j++) {
      /* receive operation */
      if ((err = QMP_start (recvh[j])) != QMP_SUCCESS)
	QMP_printf ("Start receiving failed: %s\n", QMP_error_string(err));

    for (j = 0; j < nc; j++) {
      /* Send operation */
      if ((err = QMP_start (sendh[j])) != QMP_SUCCESS)
	QMP_printf ("Start sending failed: %s\n", QMP_error_string(err));

    /* Wait on every send handle, then on every receive handle; failures
       are only printed. */
    for (j = 0; j < nc; j++) {
      if (QMP_wait (sendh[j]) != QMP_SUCCESS)
	QMP_printf ("Error in sending %d\n", j );

    for (j = 0; j < nc; j++) {
      if (QMP_wait (recvh[j]) != QMP_SUCCESS)
	QMP_printf ("Error in receiving %d\n", j);

  ft = get_current_time ();

  /* Only node 0 prints the result. */
  if(QMP_get_node_number()==0) {
    dt = (ft - it); /* in milli seconds */

    /* dsize is scaled by 4 both here and in the message below, presumably
       because it counts 4-byte elements -- TODO confirm. */
    bwval = dsize/(double)1000.0 * 4.0 * nloops * nc*ndims/dt;

    QMP_printf ("Simultaneous send B/W for datasize %d is %g (MB/s)", dsize * 4, bwval);

    QMP_printf ("Time difference is %lf micro seconds", dt*1000.0/nloops);
/* Example #4 */
/*
 * When the message-passing type is not QMP_GRID, the I/O node sends the
 * contents of scratch_buf to new_node (the source calls it a junk message).
 * Nothing is sent when new_node is the I/O node itself.
 *   scratch_buf - buffer handed to DML_send_bytes
 *   size        - byte count handed to DML_send_bytes
 *   my_io_node  - node that performs the send
 *   new_node    - node the message is addressed to
 * Returns 0 in every visible path.
 * NOTE(review): brace-only lines and the body of the second `if` are
 * missing from this excerpt (presumably the matching receive on new_node).
 */
int DML_clear_to_send(char *scratch_buf, size_t size, 
		      int my_io_node, int new_node) 
  if (QMP_get_msg_passing_type() != QMP_GRID)
    int this_node = QMP_get_node_number();

    /* Sender side: only the I/O node sends, and never to itself. */
    if(this_node == my_io_node && new_node != my_io_node)
      DML_send_bytes(scratch_buf,size,new_node); /* junk message */

    /* Receiver side: the statement for this condition is not visible. */
    if(this_node == new_node && new_node != my_io_node)

  return 0;
/* Example #5 */
/*
 * Moves `size` bytes of `buf` from node `fromnode` to node `tonode`.
 * With the QMP_GRID message-passing type the work is delegated to
 * DML_grid_route(); the other path compares this node's number with
 * tonode and fromnode.
 * Returns 0 in every visible path.
 * NOTE(review): brace-only lines, the `else`, and the bodies of both
 * node checks are missing from this excerpt.
 */
int DML_route_bytes(char *buf, size_t size, int fromnode, int tonode) 
  if (QMP_get_msg_passing_type() == QMP_GRID)
    DML_grid_route(buf, size, fromnode, tonode);
    int this_node = QMP_get_node_number();

    /* Destination node: statement not visible (presumably a receive). */
    if (this_node == tonode)

    /* Source node: statement not visible (presumably a send). */
    if (this_node == fromnode)

  return 0;
/* Example #6 */
/*
 * Compares this node's checksum of float_p[0..len) with the checksum held
 * by the node at coordinates (0,0,0,0) and reports a mismatch on stderr.
 *   float_p - data passed to local_checksum()
 *   len     - length passed to local_checksum()
 * Returns bosssum, the value left after the global reduction.
 * NOTE(review): brace-only lines are missing from this excerpt, and the
 * `#if 0` below has no visible `#endif`.
 */
unsigned int test_checksum(Float * float_p, int len) {
  static int gcsum_count = 1;
  static int peRank = 0;
  /* Look up the node number while gcsum_count is still 1; no update of
     gcsum_count is visible in this excerpt. */
  if (gcsum_count==1){
    peRank = QMP_get_node_number();
//    fprintf(stderr,"peRank=%d\n",peRank);
  unsigned int locsum = local_checksum(float_p, len);
  unsigned int bosssum;

  /* Only the origin node contributes its checksum; every other node
     contributes 0 to the reduction. */
//  if(UniqueID() == 0) bosssum = locsum;
  if(CoorX()==0 &&
  CoorY()==0 &&
  CoorZ()==0 &&
  CoorT()==0 ) bosssum = locsum;
  else                bosssum = 0;
  glb_sum_internal2(&bosssum,4,0); // 0 for XOR
  if(bosssum != locsum) {
      /* NOTE(review): the format string has nine conversions but only two
         arguments are visible -- argument lines were presumably lost. */
      fprintf( stderr,
               "GCheckSum %d : Node %d : Oops I did it again: me (%d,%d,%d,%d,%d) %u != boss %u\n",
               locsum,  bosssum );
#if 0
  if(gcsum_count>2 && peRank==0){
    fprintf(stderr,"peRank=%d exiting\n",peRank);

  return bosssum;
// This is not needed as it can be done transitively:
// ie lookup the QDP index and then lookup the coord with that
//
// mySiteCoords4D: writes four coordinates into gcoords[] for site index
// `linearsite` on node `node`.  The node's logical coordinates scaled by the
// subgrid size give the origin; the position decoded from the site index is
// added on top.  Dimension 0 is checkerboarded: its extent is halved before
// decoding and its low bit is restored from a parity sum at the end.
//   gcoords    - output; elements 0..3 are overwritten
//   node       - node whose logical coordinates are looked up
//   linearsite - site index; reduced modulo subgrid_vol_cb before decoding
// NOTE(review): brace-only lines (and possibly whole statements) are missing
// from this excerpt, so loop extents are inferred from indentation only.
static void mySiteCoords4D(int gcoords[], int node, int linearsite)
    int mu;
    int subgrid_cb_nrow[4];
    int tmp_coord[4];
    int cb,cbb;
    int* log_coords=QMP_get_logical_coordinates_from(node);
    /* my_node is assigned here but not used in the visible lines. */
    int my_node = QMP_get_node_number();

    /* Per-node extents, with dimension 0 halved. */
    for(mu=0; mu < 4; mu++) {
        subgrid_cb_nrow[mu] = getSubgridSize()[mu];
    subgrid_cb_nrow[0] /=2;  /* Checkerboarding */

    /* Origin of this node's subgrid. */
    for(mu=0; mu < 4; mu++) {
        gcoords[mu] = log_coords[mu]*getSubgridSize()[mu];



    /* Decode the index (mod subgrid_vol_cb) into tmp_coord using the
       halved extents. */
    crtesn4d(linearsite % subgrid_vol_cb, subgrid_cb_nrow, tmp_coord);

    // Add on position within the node
    // NOTE: the cb for the x-coord is not yet determined
    gcoords[0] += 2*tmp_coord[0];
    for(mu=1; mu < 4; ++mu) {
        gcoords[mu] += tmp_coord[mu];

    /* NOTE(review): cb is read here but never assigned in the visible
       lines -- presumably the statement computing it was lost; confirm. */
    cbb = cb;
    /* Parity sum covers mu = 1, 2 and 3. */
    for(mu=1; mu < 4; ++mu) {
        cbb += gcoords[mu];
    /* The low bit of the parity sum selects the odd/even x position. */
    gcoords[0] += (cbb & 1);
/* Example #8 */
/*
 * Broadcast built from point-to-point messages: for every node number from
 * 1 upward, that node declares and starts a receive from node 0 while the
 * primary node declares and starts a send to it.
 *   send_buf - buffer registered with QMP_declare_msgmem
 *   count    - size passed to QMP_declare_msgmem
 * NOTE(review): the return type, brace-only lines, and any QMP_wait or
 * handle/memory release calls are not visible in this excerpt.
 */
stupid_broadcast(void *send_buf, int count)
  int node;
  int num_nodes = QMP_get_number_of_nodes();
  /* One message-memory descriptor is used for both sending and receiving. */
  QMP_msgmem_t request_msg = QMP_declare_msgmem(send_buf, count);
  QMP_msghandle_t request_mh;

  // Send to each node
  for(node=1; node < num_nodes; ++node)
    /* Receiving side: only the node whose number equals `node`. */
    if (QMP_get_node_number() == node)
      request_mh = QMP_declare_receive_from(request_msg, 0, 0);

      if (QMP_start(request_mh) != QMP_SUCCESS)
	QMP_abort_string(1, "recvFromWait failed\n");


    /* Sending side: only the primary node. */
    if (QMP_is_primary_node())
      request_mh = QMP_declare_send_to(request_msg, node, 0);

      if (QMP_start(request_mh) != QMP_SUCCESS)
	QMP_abort_string(1, "sendToWait failed\n");


/* Example #9 */
/* Test one-way blast send */
/*
 * One-directional bandwidth test.  A sender repeatedly starts and waits on
 * all of its send handles; a non-sender does the same with its receive
 * handles.  Each side times its own loop; node 0 prints on the sender path
 * and node 1 on the receiver path.
 *   smem, rmem - not referenced in the visible body
 *   sendh      - send handles, indexed 0 .. num_channels-1
 *   recvh      - receive handles, indexed 0 .. num_channels-1
 *   pargv      - supplies num_channels, loops, size, sender and ndims
 * NOTE(review): brace-only lines are missing from this excerpt; the j loops
 * are presumably nested inside the i loops -- confirm against the original.
 */
static void
test_oneway (int** smem,
	     int** rmem,
	     QMP_msghandle_t* sendh,
	     QMP_msghandle_t* recvh,
	     struct perf_argv* pargv)
  double it, ft, dt, bwval;
  int    i, j;
  QMP_status_t err;
  int nc, ndims;
  int nloops;
  int dsize;
  QMP_bool_t sender;

  nc = pargv->num_channels;
  nloops = pargv->loops;
  dsize = pargv->size;
  sender = pargv->sender;
  ndims = pargv->ndims;

  /* Sending side. */
  if (sender) {
    it = get_current_time ();
    for (i = 0; i < nloops; i++) {
      for (j = 0; j < nc; j++) {
	/* Send operation */
	if ((err = QMP_start (sendh[j])) != QMP_SUCCESS)
	  QMP_printf ("Start sending failed: %s\n", QMP_error_string(err));

      /* Failures from QMP_wait are only printed. */
      for (j = 0; j < nc; j++) {
	if (QMP_wait (sendh[j]) != QMP_SUCCESS)
	  QMP_printf ("Error in sending %d\n", j );
    ft = get_current_time (); /* In milli seconds */

    dt = (ft - it); /* actual send time milli seconds */

    /* dsize is scaled by 4 both here and in the message below, presumably
       because it counts 4-byte elements -- TODO confirm. */
    bwval = dsize/(double)1000.0 * 4 * nloops*nc*ndims/dt;

    if(QMP_get_node_number()==0) {
      QMP_printf ("Oneway Bandwidth for datasize %d is %g (MB/s)", 
		  dsize * 4, bwval);
      QMP_printf ("time is %lf micro seconds", dt*1000.0/nloops);
  /* Receiving side: same structure, using the receive handles. */
  else { 
    it = get_current_time ();
    for (i = 0; i < nloops; i++) {
      for (j = 0; j < nc; j++) {
	/* receive operation */
	if ((err = QMP_start (recvh[j])) != QMP_SUCCESS)
	  QMP_printf ("Start receiving failed: %s\n", QMP_error_string(err));

      for (j = 0; j < nc; j++) {
	if (QMP_wait (recvh[j]) != QMP_SUCCESS)
	  QMP_printf ("Error in receiving %d\n", j);
    ft = get_current_time (); /* In milli seconds */

    dt = (ft - it); /* actual send time milli seconds */

    bwval = dsize/(double)1000.0 * 4 * nloops*nc*ndims/dt;

    /* On this path the report comes from node 1. */
    if(QMP_get_node_number()==1) {
      QMP_printf ("Oneway Bandwidth for datasize %d is %g (MB/s)", 
		  dsize * 4, bwval);
      QMP_printf ("time is %lf micro seconds", dt*1000.0/nloops);
/* Example #10 */
/*
 * Test driver for the QOP_CLOVER solver.  It reads the lattice extents and
 * solver parameters from argv, fills gauge and fermion fields with Gaussian
 * random values, applies QOP_CLOVER_D_operator, runs QOP_CLOVER_D_CG on the
 * result, and prints the CG status, performance counters, the plaquette and
 * the field norms.
 * Expected arguments (from the usage text):
 *   Lx ... seed kappa c_sw iter eps log?
 * Returns `status`, which starts at 1 and is set to 0 just before the
 * final return.
 * NOTE(review): brace-only lines, the `end:` label targeted by the gotos,
 * and the tails of several calls are missing from this excerpt.
 */
main(int argc, char *argv[])
    const char *msg;
    int status = 1;
    int mu, i;
    struct QOP_CLOVER_State *clover_state;
    QDP_Int *I_seed;
    int i_seed;
    QDP_RandomState *state;
    QLA_Real plaq;
    QLA_Real n[NELEMS(F)];
    struct QOP_CLOVER_Gauge *c_g;
    struct QOP_CLOVER_Fermion *c_f[NELEMS(F)];
    double kappa;
    double c_sw;
    double in_eps;
    int in_iter;
    int log_flag;
    double out_eps;
    int out_iter;
    int cg_status;
    double run_time;
    long long flops, sent, received;
    /* start QDP */
    QDP_initialize(&argc, &argv);

    /* Program name + NDIM lattice extents + 6 further parameters. */
    if (argc != 1 + NDIM + 6) {
        /* NOTE(review): the remaining argument(s) and closing parenthesis
           of this call are not visible. */
        printf0("ERROR: usage: %s Lx ... seed kappa c_sw iter eps log?\n",
        goto end;

    /* argv[1..NDIM] are the lattice extents; the six parameters follow. */
    for (mu = 0; mu < NDIM; mu++) {
        lattice[mu] = atoi(argv[1 + mu]);
    i_seed = atoi(argv[1 + NDIM]);
    kappa = atof(argv[2 + NDIM]);
    c_sw = atof(argv[3 + NDIM]);
    in_iter = atoi(argv[4 + NDIM]);
    in_eps = atof(argv[5 + NDIM]);
    /* Any non-zero value turns on QOP_CLOVER_LOG_EVERYTHING. */
    log_flag = atoi(argv[6 + NDIM]) == 0? 0: QOP_CLOVER_LOG_EVERYTHING;

    /* set lattice size and create layout */
    QDP_set_latsize(NDIM, lattice);

    primary = QMP_is_primary_node();
    self = QMP_get_node_number();
    /* NOTE(review): both get_vector calls are cut off after their third
       argument in this excerpt. */
    get_vector(network, 1, QMP_get_logical_number_of_dimensions(),
    get_vector(node, 0, QMP_get_logical_number_of_dimensions(),
    printf0("network: ");
    for (i = 0; i < NDIM; i++)
        printf0(" %d", network[i]);

    printf0("node: ");
    for (i = 0; i < NDIM; i++)
        printf0(" %d", node[i]);

    printf0("kappa: %20.15f\n", kappa);
    printf0("c_sw:  %20.15f\n", c_sw);

    printf0("in_iter: %d\n", in_iter);
    printf0("in_eps: %15.2e\n", in_eps);

    /* allocate the gauge field */
    create_Mvector(U, NELEMS(U));
    create_Mvector(C, NELEMS(C));
    create_Dvector(F, NELEMS(F));
    /* Seed the random state from i_seed and the per-site integer field
       produced by icoord. */
    I_seed = QDP_create_I();
    QDP_I_eq_funci(I_seed, icoord, QDP_all);
    state = QDP_create_S();
    QDP_S_eq_seed_i_I(state, i_seed, I_seed, QDP_all);
    /* Gaussian-fill every gauge link and every fermion field. */
    for (mu = 0; mu < NELEMS(U); mu++) {
        QDP_M_eq_gaussian_S(U[mu], state, QDP_all);
    for (i = 0; i < NELEMS(F); i++) {
        QDP_D_eq_gaussian_S(F[i], state, QDP_all);

    /* build the clovers */
    clover(C, U);

    /* initialize CLOVER */
    if (QOP_CLOVER_init(&clover_state, lattice, network, node, primary,
                        sublattice, NULL)) {
        printf0("CLOVER_init() failed\n");
        goto end;

    /* c_f[0] is imported from F[0]; c_f[1..3] are freshly allocated. */
    if (QOP_CLOVER_import_fermion(&c_f[0], clover_state, f_reader, F[0])) {
        printf0("CLOVER_import_fermion(0) failed\n");
        goto end;

    if (QOP_CLOVER_allocate_fermion(&c_f[1], clover_state)) {
        printf0("CLOVER_allocate_fermion(1) failed\n");
        goto end;

    if (QOP_CLOVER_allocate_fermion(&c_f[2], clover_state)) {
        printf0("CLOVER_allocate_fermion(2) failed\n");
        goto end;

    if (QOP_CLOVER_allocate_fermion(&c_f[3], clover_state)) {
        printf0("CLOVER_allocate_fermion(3) failed\n");
        goto end;

    if (QOP_CLOVER_import_gauge(&c_g, clover_state, kappa, c_sw,
                                u_reader, c_reader, NULL)) {
        /* NOTE(review): this branch calls printf, whereas the sibling
           failure branches call printf0. */
        printf("CLOVER_import_gauge() failed\n");
        goto end;

    /* c_f[2] receives the operator applied to c_f[0]; the CG call then
       takes c_f[2] in two argument positions and writes c_f[3]. */
    QOP_CLOVER_D_operator(c_f[2], c_g, c_f[0]);
    /* NOTE(review): the trailing arguments of this call (presumably
       including log_flag) are not visible. */
    cg_status = QOP_CLOVER_D_CG(c_f[3], &out_iter, &out_eps,
                                c_f[2], c_g, c_f[2], in_iter, in_eps,

    msg = QOP_CLOVER_error(clover_state);

    QOP_CLOVER_performance(&run_time, &flops, &sent, &received, clover_state);

    /* Copy the solver output into F[3]. */
    QOP_CLOVER_export_fermion(f_writer, F[3], c_f[3]);

    printf0("CG status: %d\n", cg_status);
    printf0("CG error message: %s\n", msg? msg: "<NONE>");
    printf0("CG iter: %d\n", out_iter);
    printf0("CG eps: %20.10e\n", out_eps);
    printf0("CG performance: runtime %e sec\n", run_time);
    /* Counters are scaled by 1e-6 and divided by run_time; the raw count
       is printed in parentheses. */
    printf0("CG performance: flops  %.3e MFlop/s (%lld)\n",
            flops * 1e-6 / run_time, flops);
    printf0("CG performance: snd    %.3e MB/s (%lld)\n",
            sent * 1e-6 / run_time, sent);
    printf0("CG performance: rcv    %.3e MB (%lld)/s\n",
            received * 1e-6 / run_time, received);

    /* free CLOVER */
    /* NOTE(review): the loop body and any further release calls are not
       visible in this excerpt. */
    for (i = 0; i < NELEMS(c_f); i++)


    /* Compute plaquette */
    plaq = plaquette(U);

    /* field norms */
    for (i = 0; i < NELEMS(F); i++)
        QDP_r_eq_norm2_D(&n[i], F[i], QDP_all);
    /* Display the values */
    printf0("plaquette = %g\n",
            plaq / (QDP_volume() * QDP_Nc * NDIM * (NDIM - 1) / 2 ));
    for (i = 0; i < NELEMS(F); i++)
        printf0(" |f|^2 [%d] = %20.10e\n", i, (double)(n[i]));

    /* Compute and display <f[1] f[0]> */
    show_dot("1|orig", F[1], F[0]);
    /* Compute and display <f[1] f[3]> */
    show_dot("1|solv", F[1], F[3]);

    destroy_Mvector(U, NELEMS(U));
    destroy_Mvector(C, NELEMS(C));
    destroy_Dvector(F, NELEMS(F));

    status = 0;
    /* shutdown QDP */
    return status;
/* Example #11 */
/*
 * Initialisation test for QOP_MDWF.  It starts QMP, fills the b5/c5
 * coefficient tables, parses the network, local-lattice and vx vectors from
 * argv, and calls QOP_MDWF_init.
 * Argument layout (argc must be 14):
 *   argv[1..4]   network extents
 *   argv[5..8]   local lattice extents for dimensions 0..3
 *   argv[9]      extent of dimension 4, used for both local and global
 *   argv[10..13] vx[0..3], passed to getv()
 * Returns 1 on the visible early-exit paths, otherwise `status`.
 * NOTE(review): brace-only lines, the `end:` label, and any shutdown calls
 * are missing from this excerpt.
 */
main(int argc, char *argv[])
  struct QOP_MDWF_State *mdwf_state = NULL;
  /* qt is used both as the requested and as the provided thread level. */
  QMP_thread_level_t qt = QMP_THREAD_SINGLE;
  int status = 1;
  int vx[4];
  int i;

  if (QMP_init_msg_passing(&argc, &argv, qt, &qt) != QMP_SUCCESS) {
    fprintf(stderr, "QMP_init() failed\n");
    return 1;

  /* Fill the coefficient tables with values that depend only on the
     index i and the table length. */
  for (i = 0; i < NELEM(b5); i++) {
    b5[i] = 0.1 * i * (NELEM(b5) - i);
    c5[i] = 0.1 * i * i * (NELEM(b5) - i);

  self = QMP_get_node_number();
  primary = QMP_is_primary_node();
  /* Echo every argument. */
  for (i = 0; i < argc; i++)
    zprint("arg[%d]=%s", i, argv[i]);
  if (argc != 14) {
    zprint("14 arguments expected, found %d", argc);
    return 1;

  /* The global extent in each of the first four dimensions is
     local * network. */
  for (i = 0; i < 4; i++) {
    mynetwork[i] = atoi(argv[i+1]);
    mylocal[i] = atoi(argv[i+5]);
    vx[i] = atoi(argv[i+10]);
    mylattice[i] = mylocal[i] * mynetwork[i];
  /* Dimension 4 has the same extent locally and globally. */
  mylocal[4] = mylattice[4] = atoi(argv[9]);

  zshowv4("network", mynetwork);
  zshowv5("local lattice", mylocal);
  zshowv5("lattice", mylattice);

  getv(mynode, 0, 4, vx);

  xshowv("node", mynode);

  /* A non-zero result from QOP_MDWF_init is treated as failure. */
  if (QOP_MDWF_init(&mdwf_state,
		    mylattice, mynetwork, mynode, primary,
		    getsub, NULL)) {
    zprint("MDWF_init() failed");
    goto end;

  zprint("MDWF_init() done");



  status = 0;
  return status;
/* Example #12 */
// PT::mat: for each of the n requested directions, combines the input
// field min[i] with gauge data and writes mout[i].  The visible code has
// four timed phases, accumulated into the static counters setup, localt,
// qmp and nonlocal:
//   1. setup    - declare QMP message memory and send/receive handles for
//                 every direction whose local[] flag is false
//   2. local    - on-node computation (the calls themselves are not visible)
//   3. qmp      - wait for the combined message handle
//   4. nonlocal - partrans_cmm_agg on the received buffers
//   n    - number of directions; dir[0..n) is copied into wire[]
//   mout - output fields, one per direction
//   min  - input fields, one per direction
//   dir  - direction indices; wire[i]/2 and wire[i]%2 are used separately,
//          presumably as axis and sign -- TODO confirm
// NOTE(review): this excerpt is heavily truncated.  Brace-only lines, loop
// headers, #else/#endif lines and several statements are missing, so the
// preprocessor conditionals below are unbalanced as shown.
void PT::mat(int n, matrix **mout, matrix **min, const int *dir){
  int wire[MAX_DIR];
  int i;
  QMP_msgmem_t msg_mem_p[2*MAX_DIR];
  QMP_msghandle_t msg_handle_p[2*MAX_DIR];
  QMP_msghandle_t multiple;
  // Timing accumulators and call counter persist across calls.
  static double setup=0.,qmp=0.,localt=0.,nonlocal=0.;
  static int call_num = 0;

//  char *fname="pt_mat()";
//  VRB.Func("",fname);
//  if (call_num%100==1) printf("PT:mat()\n");

  for(i=0;i<n;i++) wire[i] = dir[i]; 
#ifdef PROFILE
  Float dtime2  = - dclock();

  // dtime follows the pattern "dtime = -dclock(); ...; dtime += dclock();"
  // so that it holds the elapsed time of the phase in between.
  double dtime = -dclock();

  int non_local_dir=0;
  // NOTE(review): the loop over i that should enclose this test, and the
  // increment of non_local_dir, are not visible.
  if (!local[wire[i]/2]) {
    //Calculate the address for transfer in a particular direction
    Float * addr = ((Float *)min[i]+GAUGE_LEN*offset[wire[i]]);
    // Slot 2*k holds the receive side and slot 2*k+1 the strided send side.
    msg_mem_p[2*non_local_dir] = QMP_declare_msgmem((void *)rcv_buf[wire[i]], 3*non_local_chi[wire[i]]*VECT_LEN*sizeof(IFloat));
    msg_mem_p[2*non_local_dir+1] = QMP_declare_strided_msgmem((void *)addr, (size_t)(3*blklen[wire[i]]), numblk[wire[i]], (ptrdiff_t)(3*stride[wire[i]]+3*blklen[wire[i]]));
    // The receive and the send use opposite relative directions on the
    // same axis.
    msg_handle_p[2*non_local_dir] = QMP_declare_receive_relative(msg_mem_p[2*non_local_dir], wire[i]/2, 1-2*(wire[i]%2), 0);
    msg_handle_p[2*non_local_dir+1] = QMP_declare_send_relative(msg_mem_p[2*non_local_dir+1], wire[i]/2, 2*(wire[i]%2)-1, 0);

  if (call_num==1 && !QMP_get_node_number())

  // Bundle all declared handles into one; the statement that starts the
  // transfer is not visible.
  if(non_local_dir) {
    multiple = QMP_declare_multiple(msg_handle_p, 2*non_local_dir);

  dtime += dclock();
  setup +=dtime;
  dtime = -dclock();
  int if_print = 0;
//  if ( (call_num%10000==1) && (!QMP_get_node_number()) ) if_print=1;

#define USE_TEST2
#ifdef USE_TEST2
//assume nt > n!
    static char *cname="mat()";
#pragma omp parallel default(shared)
  // Thread partitioning: the nt threads are split into groups of nt_dir,
  // one group per direction; n_t is the direction index of this thread and
  // i_t its rank inside the group.
  int iam,nt,ipoints,istart,offset;
  iam = omp_get_thread_num();
  nt = omp_get_num_threads();
  int nt_dir = nt/n;
  int n_t = iam/nt_dir;
  int i_t = iam%nt_dir;
  // Threads beyond n full groups are folded into the last direction.
  if (n_t >= n ){  n_t = n-1;
    i_t = iam - (n-1)*nt_dir;
    nt_dir = nt -(n-1)*nt_dir;
  int w_t = wire[n_t];
  // Even split of local_chi[w_t]/2 points; the last thread of the group
  // takes the remainder.
  ipoints = (local_chi[w_t]/2)/nt_dir;
  offset = ipoints*i_t;
  if (i_t == (nt_dir-1)) ipoints = (local_chi[w_t]/2)-offset;
    if ( if_print )
      printf("thread %d of %d nt_dir n_t i_t ipoints offset= %d %d %d %d %d\n",iam,nt,nt_dir,n_t,i_t,ipoints,offset);
  //Interleaving of local computation of matrix multiplication
    if ( if_print )
      printf("thread %d of %d done\n",iam,nt);
  //Interleaving of local computation of matrix multiplication
#pragma omp parallel for default(shared)

  dtime += dclock();
  localt +=dtime;
  dtime = -dclock();
//#pragma omp barrier
//#pragma omp master 
  // Wait for all transfers bundled in `multiple`.
  if(non_local_dir) {
    QMP_status_t qmp_complete_status = QMP_wait(multiple);
    if (qmp_complete_status != QMP_SUCCESS)
          QMP_error("Send failed in vec_cb_norm: %s\n", QMP_error_string(qmp_complete_status));
    // NOTE(review): the body of this loop (presumably releasing the
    // message memory) is not visible.
    for(int i = 0; i < 2*non_local_dir; i++)
//    Free(msg_handle_p);
//    Free(msg_mem_p);
} //#pragma omp master {
  dtime += dclock();
  qmp +=dtime;
  dtime = -dclock();

  //Do non-local computations
#ifdef USE_TEST2
//assume nt > n!
#pragma omp parallel default(shared)
  // Same thread partitioning as the local phase, sized by non_local_chi.
  int iam,nt,ipoints,istart,offset;
  iam = omp_get_thread_num();
  nt = omp_get_num_threads();
  int nt_dir = nt/n;
  int n_t = iam/nt_dir;
  int i_t = iam%nt_dir;
  if (n_t >= n ){  n_t = n-1;
    i_t = iam - (n-1)*nt_dir;
    nt_dir = nt -(n-1)*nt_dir;
  int w_t = wire[n_t];
  ipoints = (non_local_chi[w_t]/2)/nt_dir;
  offset = ipoints*i_t;
  if (i_t == (nt_dir-1)) ipoints = (non_local_chi[w_t]/2)-offset;
    if ( if_print )
      printf("thread %d of %d nt_dir n_t i_t ipoints offset= %d %d %d %d %d\n",iam,nt,nt_dir,n_t,i_t,ipoints,offset);
  //Non-local computation
  // NOTE(review): the gauge pointer is advanced by offset*2 but rcv_buf and
  // mout are passed without an offset -- confirm this is intended.
  if (ipoints>0)
  partrans_cmm_agg((uc_nl[w_t]+offset*2),(matrix *)rcv_buf[w_t],mout[n_t],ipoints);
    if ( if_print )
      printf("thread %d of %d done\n",iam,nt);
#pragma omp parallel for
  // Alternative path (presumably the #else branch): one call per
  // non-local direction covering all of its points.
  if (!local[wire[i]/2]) {
#ifdef USE_OMP
    if (call_num%10000==1 && !QMP_get_node_number() ) 
      printf("thread %d of %d i=%d\n",omp_get_thread_num(),omp_get_num_threads(),i);
    partrans_cmm_agg(uc_nl[wire[i]],(matrix *)rcv_buf[wire[i]],mout[i],non_local_chi[wire[i]]/2);

}//#pragma omp parallel

  dtime += dclock();
  nonlocal +=dtime;

  // Every 100th call, on node 0 only; the statements that report the
  // accumulated timings are not visible.
  if (call_num%100==0){
    static char *cname="mat()";
    if (!QMP_get_node_number() ) {

#ifdef PROFILE
  dtime2 +=dclock();
//  ParTrans::PTflops +=198*n*vol;
/* Example #13 */
/*
 * QMP point-to-point performance test driver.  It initialises QMP, parses
 * the options into a perf_argv, derives a logical topology from the node
 * count, broadcasts the parameters, and then runs whichever of the
 * simultaneous-send, ping-pong and one-way tests are selected in
 * pargv.option, once for each message size from minsize to maxsize
 * (multiplying by facsize each step).
 * Returns 0 at the end, or -1 on the visible initialisation and topology
 * failure paths.
 * NOTE(review): brace-only lines, comment delimiters and some statements
 * are missing from this excerpt.
 */
main (int argc, char** argv)
  int             i, nc;
  QMP_status_t      status;
  int       **smem, **rmem;
  QMP_msgmem_t    *recvmem;
  QMP_msghandle_t *recvh;
  QMP_msgmem_t    *sendmem;
  QMP_msghandle_t *sendh;
  struct perf_argv pargv;
  /* NOTE(review): req is passed by value to QMP_init_msg_passing below
     without any visible initialisation -- confirm a value is assigned. */
  QMP_thread_level_t req, prv;

   * Simple point to point topology 
  int dims[4] = {2,2,2,2};
  int ndims = 1;

  //printf("starting init\n"); fflush(stdout);
  status = QMP_init_msg_passing (&argc, &argv, req, &prv);
  if (status != QMP_SUCCESS) {
    fprintf (stderr, "QMP_init failed\n");
    return -1;
    printf("finished init\n"); fflush(stdout);

  if (parse_options (argc, argv, &pargv) == -1) {
      usage (argv[0]);
    exit (1);

    /* Factor the node count into powers of two: the first maxdims factors
       each add a dimension of extent 2, later ones double dims[k]. */
    int maxdims = 4;
    int k=0;
    int nodes = QMP_get_number_of_nodes();
    ndims = 0;
    while( (nodes&1) == 0 ) {
      if(ndims<maxdims) ndims++;
      else {
	/* NOTE(review): no increment of k is visible between these two
	   lines -- presumably a `k++` was lost; confirm. */
	dims[k] *= 2;
	if(k>=maxdims) k = 0;
      nodes /= 2;
    /* Anything left over means the node count was not a power of two. */
    if(nodes != 1) {
      QMP_error("invalid number of nodes %i", QMP_get_number_of_nodes());
      QMP_error(" must power of 2");
    pargv.ndims = ndims;

  status = QMP_declare_logical_topology (dims, ndims);
  if (status != QMP_SUCCESS) {
    fprintf (stderr, "Cannot declare logical grid\n");
    return -1;

  /* do a broadcast of parameter */
  if (QMP_broadcast (&pargv, sizeof (pargv)) != QMP_SUCCESS) {
    QMP_printf ("Broadcast parameter failed\n");
    exit (1);

    /* The sender flag is the low bit of 1 + the sum of this node's logical
       coordinates, so nodes one step apart get opposite roles. */
    int k=1;
    const int *lc = QMP_get_logical_coordinates();
    for(i=0; i<ndims; i++) k += lc[i];
    pargv.sender = k&1;

  QMP_printf("%s options: num_channels[%d] verify[%d] option[%d] datasize[%d] numloops[%d] sender[%d] strided_send[%i] strided_recv[%i] strided_array_send[%i] ",
	     argv[0], pargv.num_channels, pargv.verify, 
	     pargv.option, pargv.size, pargv.loops, pargv.sender,
	     strided_send, strided_recv, strided_array_send);

   * Create memory
  /* The msgmem arrays are sized ndims*nc, the others nc.  No check of the
     malloc results is visible. */
  nc = pargv.num_channels;
  smem = (int **)malloc(nc*sizeof (int *));
  rmem = (int **)malloc(nc*sizeof (int *));
  sendmem = (QMP_msgmem_t *)malloc(ndims*nc*sizeof (QMP_msgmem_t));
  recvmem = (QMP_msgmem_t *)malloc(ndims*nc*sizeof (QMP_msgmem_t));
  sendh = (QMP_msghandle_t *)malloc(nc*sizeof (QMP_msghandle_t));
  recvh = (QMP_msghandle_t *)malloc(nc*sizeof (QMP_msghandle_t));

  /* Only the printf is conditional; the fflush is a separate statement
     and runs on every node. */
  if(QMP_get_node_number()==0) printf("\n"); fflush(stdout);
  /* Each test section narrows pargv.option to its own flag for the
     duration of the run and restores the saved value afterwards. */
  if(pargv.option & TEST_SIMUL) {
    int opts = pargv.option;
    pargv.option = TEST_SIMUL;
      QMP_printf("starting simultaneous sends"); fflush(stdout);
    for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) {
      pargv.size = i;
      create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv);
      test_simultaneous_send (smem, rmem, sendh, recvh, &pargv);
      check_mem(rmem, ndims, nc, i);
      free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc);
      QMP_printf("finished simultaneous sends\n"); fflush(stdout);
    pargv.option = opts;

  if(pargv.option & TEST_PINGPONG) {
    int opts = pargv.option;
    pargv.option = TEST_PINGPONG;
      QMP_printf("starting ping pong sends"); fflush(stdout);
    for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) {
      pargv.size = i;
      create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv);
	/* NOTE(review): the condition choosing between these two calls
	   (presumably pargv.verify) is not visible. */
	test_pingpong_verify(smem, rmem, sendh, recvh, &pargv);
	test_pingpong(smem, rmem, sendh, recvh, &pargv);
      check_mem(rmem, ndims, nc, i);
      free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc);
      QMP_printf("finished ping pong sends\n"); fflush(stdout);
    pargv.option = opts;

  if(pargv.option & TEST_ONEWAY) {
    int opts = pargv.option;
    pargv.option = TEST_ONEWAY;
      QMP_printf("starting one way sends"); fflush(stdout);
    for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) {
      pargv.size = i;
      create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv);
      test_oneway (smem, rmem, sendh, recvh, &pargv);
      /* Only the receiving side checks its buffers. */
      if(!pargv.sender) check_mem(rmem, ndims, nc, i);
      free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc);
      QMP_printf("finished one way sends"); fflush(stdout);
    pargv.option = opts;

   * Free memory 
  free (smem);
  free (rmem);

  free (sendh);
  free (recvh);

  free (sendmem);
  free (recvmem);

  QMP_finalize_msg_passing ();

  return 0;
void make_shift_tables(int bound[2][4][4], halfspinor_array* chi1,
                       halfspinor_array* chi2,

                       halfspinor_array* recv_bufs[2][4],
                       halfspinor_array* send_bufs[2][4],

                       void (*QDP_getSiteCoords)(int coord[], int node, int linearsite),
                       int (*QDP_getLinearSiteIndex)(const int coord[]),

                       int (*QDP_getNodeNumber)(const int coord[]))
    volatile int dir,i;
    const int my_node = QMP_get_node_number();
    int coord[4];
    int gcoord[4];
    int gcoord2[4];

    int linear;
    int **shift_table;
    int x,y,z,t;
    int *subgrid_size = getSubgridSize();
    int mu;
    int offset;

    int cb;
    const int *node_coord  = QMP_get_logical_coordinates();
    int p;
    int site, index;

    InvTab4 *xinvtab;
    InvTab4 *invtab;

    int qdp_index;
    int my_index;
    int num;
    int offsite_found;

    /* Setup the subgrid volume for ever after */
    subgrid_vol = 1;
    for(i=0; i < getNumDim(); ++i) {
        subgrid_vol *= getSubgridSize()[i];

    /* Get the checkerboard size for ever after */
    subgrid_vol_cb = subgrid_vol / 2;

    /* Now I want to build the site table */
    /* I want it cache line aligned? */
    xsite_table = (int *)malloc(sizeof(int)*subgrid_vol+63L);
    if(xsite_table == 0x0 ) {
        QMP_error("Couldnt allocate site table");

    site_table = (int *)((((ptrdiff_t)(xsite_table))+63L)&(-64L));

    xinvtab = (InvTab4 *)malloc(sizeof(InvTab4)*subgrid_vol + 63L);
    if(xinvtab == 0x0 ) {
        QMP_error("Couldnt allocate site table");
    invtab = (InvTab4 *)((((ptrdiff_t)(xinvtab))+63L)&(-64L));

    /* Inversity of functions check:
       Check that myLinearSiteIndex3D is in fact the inverse
       of mySiteCoords3D, and that QDP_getSiteCoords is the
       inverse of QDP_linearSiteIndex()
    for(p=0; p < 2; p++) {
        for(site=0; site < subgrid_vol_cb; site++) {

            /* Linear site index */
            my_index = site + subgrid_vol_cb*p;
            QDP_getSiteCoords(gcoord, my_node, my_index);

            if( linear != my_index ) {
                printf("P%d cb=%d site=%d : QDP_getSiteCoords not inverse of QDP_getLinearSiteIndex(): my_index=%d linear=%d\n", my_node, p,site, my_index,linear);

            mySiteCoords4D(gcoord, my_node, my_index);

            if( linear != my_index ) {
                printf("P%d cb=%d site=%d : mySiteCoords3D not inverse of myLinearSiteIndex3D(): my_index=%d linear=%d\n", my_node, p,site, my_index,linear);

    /* Loop through sites - you can choose your path below */
    /* This is a checkerboarded order which is identical hopefully
       to QDP++'s rb2 subset when QDP++ is in a CB2 layout */
    for(p=0; p < 2; p++) {
        for(t=0; t < subgrid_size[3]; t++) {
            for(z=0; z < subgrid_size[2]; z++) {
                for(y=0; y < subgrid_size[1]; y++) {
                    for(x=0; x < subgrid_size[0]/2; x++) {

                        coord[0] = 2*x + p;
                        coord[1] = y;
                        coord[2] = z;
                        coord[3] = t;

                        /* Make global */
                        for(i=0; i < 4; i++) {
                            coord[i] += subgrid_size[i]*node_coord[i];

                        /* Index of coordinate -- NB this is not lexicographic
                           but takes into account checkerboarding in QDP++ */
                        qdp_index = QDP_getLinearSiteIndex(coord);

                        /* Index of coordinate in my layout. -- NB this is not lexicographic
                           but takes into account my 3D checkerbaording */
                        my_index = myLinearSiteIndex4D(coord);
                        site_table[my_index] = qdp_index;

                        linear = my_index%subgrid_vol_cb;


    /* Site table transitivity check:
       for each site, convert to index in cb3d, convert to qdp index
       convert qdp_index to coordinate
       convert coordinate to back index in cb3d
       Check that your cb3d at the end is the same as you
       started with */
    for(p=0; p < 2; p++) {
        for(site=0; site < subgrid_vol_cb; site++) {

            /* My local index */
            my_index = site + subgrid_vol_cb*p;

            /* Convert to QDP index */
            qdp_index = site_table[ my_index ];

            /* Switch QDP index to coordinates */
            QDP_getSiteCoords(gcoord, my_node,qdp_index);

            /* Convert back to cb3d index */
            linear = myLinearSiteIndex4D(gcoord);

            /* Check new cb,cbsite index matches the old cb index */
            if (linear != my_index) {
                printf("P%d The Circle is broken. My index=%d qdp_index=%d coords=%d,%d,%d,%d linear(=my_index?)=%d\n", my_node, my_index, qdp_index, gcoord[0],gcoord[1],gcoord[2],gcoord[3],linear);

    /* Consistency check 2: Test mySiteCoords 3D
       for all 3d cb,cb3index convert to
       cb3d linear index (my_index)
       convert to qdp_index (lookup in site table)

       Now convert qdp_index and my_index both to
       coordinates. They should produce the same coordinates
    for(p=0; p < 2; p++) {
        for(site=0; site < subgrid_vol_cb; site++) {

            /* My local index */
            my_index = site + subgrid_vol_cb*p;
            mySiteCoords4D(gcoord, my_node, my_index);

            qdp_index = site_table[ my_index ];
            QDP_getSiteCoords(gcoord2, my_node,qdp_index);

            for(mu=0 ; mu < 4; mu++) {
                if( gcoord2[mu] != gcoord[mu] ) {
                    printf("P%d: my_index=%d qdp_index=%d mySiteCoords=(%d,%d,%d,%d) QDPsiteCoords=(%d,%d,%d,%d)\n", my_node, my_index, qdp_index, gcoord[0], gcoord[1], gcoord[2], gcoord[3], gcoord2[0], gcoord2[1], gcoord2[2], gcoord2[3]);


    /* Allocate the shift table */
    /* The structure is as follows: There are 4 shift tables in order:

      [ Table 1 | Table 2 | Table 3 | Table 4 ]
      Table 1: decomp_scatter_index[mu][site]
      Table 2: decomp_hvv_scatter_index[mu][site]
      Table 3: recons_mvv_gather_index[mu][site]
      Table 4: recons_gather_index[mu][site]


    /* This 4 is for the 4 tables: Table 1-4*/
    if ((shift_table = (int **)malloc(4*sizeof(int*))) == 0 ) {
        QMP_error("init_wnxtsu3dslash: could not initialize shift_table");


    for(i=0; i < 4; i++) {
        /* This 4 is for the 4 comms dierctions: */
        if ((shift_table[i] = (int *)malloc(4*subgrid_vol*sizeof(int))) == 0) {
            QMP_error("init_wnxtsu3dslash: could not initialize shift_table");

    /* Initialize the boundary counters */
    for(cb=0; cb < 2; cb++) {
        for(dir=0; dir < 4; dir++) {
            bound[cb][0][dir] = 0;
            bound[cb][1][dir] = 0;
            bound[cb][2][dir] = 0;
            bound[cb][3][dir] = 0;

    for(cb=0; cb < 2; cb++) {
        for(site=0; site < subgrid_vol_cb; ++site) {

            index = cb*subgrid_vol_cb + site;

            /* Fetch site from site table */
            qdp_index = site_table[index];

            /* Get its coords */
            QDP_getSiteCoords(coord, my_node, qdp_index);

            /* Loop over directions building up shift tables */
            for(dir=0; dir < 4; dir++) {

                int fcoord[4], bcoord[4];
                int fnode, bnode;
                int blinear, flinear;

                /* Backwards displacement*/
                offs(bcoord, coord, dir, -1);
                bnode   = QDP_getNodeNumber(bcoord);
                blinear = QDP_getLinearSiteIndex(bcoord);

                /* Forward displacement */
                offs(fcoord, coord, dir, +1);
                fnode   = QDP_getNodeNumber(fcoord);
                flinear = QDP_getLinearSiteIndex(fcoord);

                /* Scatter:  decomp_{plus,minus} */
                /* Operation: a^F(shift(x,type=0),dir) <- decomp(psi(x),dir) */
                /* Send backwards - also called a receive from forward */
                if (bnode != my_node) {
                    /* Offnode */
                    /* Append to Tail 1, increase boundary count */
                    /* This is the correct code */
                        = subgrid_vol_cb + bound[1-cb][DECOMP_SCATTER][dir];


                else {
                    /* On node. Note the linear part of its (cb3, linear) bit,
                       using a reverse lookup */
                    shift_table[DECOMP_SCATTER][dir+4*index] =

                /* Scatter:  decomp_hvv_{plus,minus} */
                /* Operation:  a^B(shift(x,type=1),dir) <- U^dag(x,dir)*decomp(psi(x),dir) */
                /* Send forwards - also called a receive from backward */
                if (fnode != my_node) {
                    /* Offnode */
                    /* Append to Tail 1, increase boundary count */
                        = subgrid_vol_cb + bound[1-cb][DECOMP_HVV_SCATTER][dir];


                else {
                    /* On node. Note the linear part of its (cb3, linear) bit,
                       using a reverse lookup */
                    shift_table[DECOMP_HVV_SCATTER][dir+4*index]           /* Onnode */
                        = invtab[flinear].linearcb ;

                /* Gather:  mvv_recons_{plus,minus} */
                /* Operation:  chi(x) <-  \sum_dir U(x,dir)*a^F(shift(x,type=2),dir) */
                /* Receive from forward */
                if (fnode != my_node) {
                    /* Offnode */
                    /* Append to Tail 2, increase boundary count */

                    shift_table[RECONS_MVV_GATHER][dir+4*index] =
                        2*subgrid_vol_cb + (bound[cb][RECONS_MVV_GATHER][dir]);


                else {
                    /* On node. Note the linear part of its (cb3, linear) bit,
                       using a reverse lookup. Note this is a recons post shift,
                       so the linear coordinate to invert is mine rather than the neighbours */
                    shift_table[RECONS_MVV_GATHER][dir+4*index] =
                        invtab[qdp_index].linearcb ;

                /* Gather:  recons_{plus,minus} */
                /* Operation:  chi(x) +=  \sum_dir recons(a^B(shift(x,type=3),dir),dir) */
                /* Receive from backward */
                if (bnode != my_node) {

                    shift_table[RECONS_GATHER][dir+4*index] =
                        2*subgrid_vol_cb + bound[cb][RECONS_GATHER][dir];


                else {
                    /* On node. Note the linear part of its (cb3, linear) bit,
                       using a reverse lookup. Note this is a recons post shift,
                       so the linear coordinate to invert is mine rather than the neighbours */

                    shift_table[RECONS_GATHER][dir+4*index] =
                        invtab[qdp_index].linearcb ;

    /* Sanity check - make sure the sending and receiving counters match */
    for(cb=0; cb < 2; cb++) {
        for(dir=0; dir < 4; dir++) {

            /* Sanity 1: Must have same number of boundary sites on each cb for
            a given operation */
            for(i = 0; i < 4; i++) {
                if (bound[1-cb][i][dir] != bound[cb][i][dir]) {

                    QMP_error("SSE Wilson dslash - make_shift_tables: type 0 diff. cb send/recv counts do not match: %d %d",


    /* Now I want to make the offset table into the half spinor temporaries */
    /* The half spinor temporaries will look like this:

       dir=0 [ Body Half Spinors ][ Tail 1 Half Spinors ][ Tail 2 Half Spinors ]
       dir=1 [ Body Half Spinors ][ Tail 1 Half Spinors ][ Tail 2 Half Spinors ]

       And each of these blocks of half spinors will be sized to vol_cb
       sites (ie half volume only).  The shift_table() for a given site and
       direction indexes into one of these lines. So the offset table essentially
       delineates which line one picks, by adding an offset of
       To the shift. The result from offset table, can be used directly as a
       pointer displacement on the temporaries.

       Perhaps the best way to understand this is to consider a value
       of shift_table[type][dir/site] that lands in the body. The
       shift table merely gives me a site index. But the data needs
       to be different for each direction for that site index. Hence
       we need to replicate the body, for each dir. The 3xsubgrid_vol_cb
       is just there to take care of the buffers.

       Or another way to think of it is that there is a 'body element' index
       specified by the shift table lookup, and that dir is just the slowest
       varying index.


    /* 4 dims, 4 types, rest of the magic is to align the thingie */
    xoffset_table = (halfspinor_array **)malloc(4*4*subgrid_vol*sizeof(halfspinor_array*)+63L);
    if( xoffset_table == 0 ) {
        QMP_error("init_wnxtsu3dslash: could not initialize offset_table[i]");
    /* This is the bit what aligns straight from AMD Manual */
    offset_table = (halfspinor_array**)((((ptrdiff_t)(xoffset_table)) + 63L) & (-64L));

    /* Walk through the shift_table and remap the offsets into actual
       pointers */

    for(dir =0; dir < Nd; dir++) {

        /* Loop through all the sites. Remap the offsets either to local
           arrays or pointers */
        for(site=0; site < subgrid_vol; site++) {
            offset = shift_table[DECOMP_SCATTER][dir+4*site];
            if( offset >= subgrid_vol_cb ) {
                /* Found an offsite guy. It's address must be to the send back buffer */
                /* send to back index = recv from forward index = 0  */
                offset_table[ dir + 4*(site + subgrid_vol*DECOMP_SCATTER) ] =
                    send_bufs[0][num]+(offset - subgrid_vol_cb);
            else {
                /* Guy is onsite: This is DECOMP_SCATTER so offset to chi1 */
                offset_table[ dir + 4*(site + subgrid_vol*DECOMP_SCATTER) ] =

        if( offsite_found > 0 ) {
            /* If we found an offsite guy, next direction has to
            go into the next dir part of the send bufs */

    /* Restart num-s */
    for(dir =0; dir <Nd; dir++) {
        for(site=0; site < subgrid_vol; site++) {
            offset = shift_table[DECOMP_HVV_SCATTER][dir+4*site];
            if( offset >= subgrid_vol_cb ) {
                /* Found an offsite guy. It's address must be to the send forw buffer */
                /* send to forward / receive from backward index = 1 */

                offset_table[ dir + 4*(site + subgrid_vol*DECOMP_HVV_SCATTER) ] =
                    send_bufs[1][num]+(offset - subgrid_vol_cb);
            else {
                /* Guy is onsite. This is DECOMP_HVV_SCATTER so offset to chi2 */
                offset_table[ dir + 4*(site + subgrid_vol*DECOMP_HVV_SCATTER) ] =
                    chi2+shift_table[DECOMP_HVV_SCATTER][dir+4*site ]+subgrid_vol_cb*dir;
        if( offsite_found > 0 ) {

    for(dir =0; dir <Nd; dir++) {
        for(site=0; site < subgrid_vol; site++) {
            offset = shift_table[RECONS_MVV_GATHER][dir+4*site];
            if( offset >= 2*subgrid_vol_cb ) {
                /* Found an offsite guy. It's address must be to the recv from front buffer */
                /* recv_from front index = send to back index = 0 */
                offset_table[ dir + 4*(site + subgrid_vol*RECONS_MVV_GATHER) ] =
                    recv_bufs[0][num]+(offset - 2*subgrid_vol_cb);
            else {
                /* Guy is onsite */
                /* This is RECONS_MVV_GATHER so offset with respect to chi1 */
                offset_table[ dir + 4*(site + subgrid_vol*RECONS_MVV_GATHER) ] =
                    chi1+shift_table[RECONS_MVV_GATHER][dir+4*site ]+subgrid_vol_cb*dir;
        if( offsite_found > 0 ) {

    for(dir =0; dir <Nd; dir++) {
        for(site=0; site < subgrid_vol; site++) {
            offset = shift_table[RECONS_GATHER][dir+4*site];
            if( offset >= 2*subgrid_vol_cb ) {
                /* Found an offsite guy. It's address must be to the recv from back buffer */
                /* receive from back = send to forward index =  1*/
                offset_table[ dir + 4*(site + subgrid_vol*RECONS_GATHER) ] =
                    recv_bufs[1][num]+(offset - 2*subgrid_vol_cb);
            else {
                /* Guy is onsite */
                /* This is RECONS_GATHER so offset with respect to chi2 */
                offset_table[ dir + 4*(site + subgrid_vol*RECONS_GATHER ) ] =
                    chi2+shift_table[RECONS_GATHER][dir+4*site ]+subgrid_vol_cb*dir;
        if( offsite_found > 0 ) {

    /* Free shift table - it is no longer needed. We deal solely with offsets */
    for(i=0; i < 4; i++) {
        free( (shift_table)[i] );
    free( shift_table );

    free( xinvtab );

Beispiel #15
// Picks and reports the CUDA device QUDA will use, starting from the
// caller-supplied device index `dev`, and sets the qudaPt0 / qudaPtNm1 flags.
// NOTE(review): this copy of the function has no braces and appears to be
// missing statements (for example, nothing visible assigns deviceCount before
// it is read) -- confirm every step against the complete source.
void initQuda(int dev)
  // Function-local static flag; it is set to 1 right after the check below.
  static int initialized = 0;
  if (initialized) {
  initialized = 1;

#if (CUDA_VERSION >= 4000) && defined(MULTI_GPU)
  //check if CUDA_NIC_INTEROP is set to 1 in the environment
  // Both an unset variable and a value other than 1 are reported via errorQuda.
  char* cni_str = getenv("CUDA_NIC_INTEROP");
  if(cni_str == NULL){
    errorQuda("Environment variable CUDA_NIC_INTEROP is not set\n");
  int cni_int = atoi(cni_str);
  if (cni_int != 1){
    errorQuda("Environment variable CUDA_NIC_INTEROP is not set to 1\n");    

  // NOTE(review): deviceCount is read below without a visible assignment.
  int deviceCount;
  if (deviceCount == 0) {
    errorQuda("No devices supporting CUDA");

  // Print the name of every device index in [0, deviceCount).
  for(int i=0; i<deviceCount; i++) {
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, i);
    printfQuda("QUDA: Found device %d: %s\n", i, deviceProp.name);

#ifdef QMP_COMMS
  int ndim;
  const int *dim;

  // QMP must already be initialized before this point.
  if ( QMP_is_initialized() != QMP_TRUE ) {
    errorQuda("QMP is not initialized");
  // Shift the requested device by rank_QMP modulo the device count.
  // presumably rank_QMP is this process's QMP rank -- it is not defined in this excerpt.
  dev += rank_QMP % deviceCount;
  ndim = QMP_get_logical_number_of_dimensions();
  dim = QMP_get_logical_dimensions();

#elif defined(MPI_COMMS)


  // A negative device index selects the last device.
  if (dev < 0) dev = deviceCount - 1;
  // Used for applying the gauge field boundary condition
  // qudaPt0 is true when this process has coordinate 0 along comm dimension 3.
  if( commCoords(3) == 0 ) qudaPt0=true;
  else qudaPt0=false;

  // qudaPtNm1 is true when this process has the last coordinate along comm dimension 3.
  if( commCoords(3) == commDim(3)-1 ) qudaPtNm1=true;
  else qudaPtNm1=false;

  // Query the chosen device; a reported major version below 1 is an error.
  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, dev);
  if (deviceProp.major < 1) {
    errorQuda("Device %d does not support CUDA", dev);

  printfQuda("QUDA: Using device %d: %s\n", dev, deviceProp.name);

#ifdef HAVE_NUMA
    // Only attempt NUMA binding when gpu_affinity[dev] is non-negative;
    // a non-zero return from numa_run_on_node is reported as a warning only.
    if(gpu_affinity[dev] >=0){
      printfQuda("Numa setting to cpu node %d\n", gpu_affinity[dev]);
      if(numa_run_on_node(gpu_affinity[dev]) != 0){
        printfQuda("Warning: Setting numa to cpu node %d failed\n", gpu_affinity[dev]);


Beispiel #16
/* Driver for the MDWF "localheat" test: starts QMP, reads the lattice
 * extents and requested runtime from the command line, initializes MDWF,
 * sets its parameters and calls do_run().
 * Returns `status`, which starts at 1 and is set to 0 only after
 * "Heater test finished" is printed.
 * NOTE(review): this copy has lost its braces, the `end` label and the
 * #else/#endif of the `#if 0` below -- confirm against the complete source. */
main(int argc, char *argv[])
  struct QOP_MDWF_State *mdwf_state = NULL;
  struct QOP_MDWF_Parameters *mdwf_params = NULL;
  QMP_thread_level_t qt = QMP_THREAD_SINGLE;
  int status = 1;
  int i;

  /* The same variable qt is passed as both the requested and the provided thread level. */
  if (QMP_init_msg_passing(&argc, &argv, qt, &qt) != QMP_SUCCESS) {
    fprintf(stderr, "QMP_init() failed\n");
    return 1;

  /* Fill the b5/c5 coefficient arrays with values computed from the index i. */
  for (i = 0; i < NELEM(b5); i++) {
    b5[i] = 0.1 * i * (NELEM(b5) - i);
    c5[i] = 0.1 * i * i * (NELEM(b5) - i);

  self = QMP_get_node_number();
  primary = QMP_is_primary_node();
  /* argc includes the program name, so six user arguments are required. */
  if (argc != 7) {
    zprint("7 arguments expected, found %d", argc);
    zprint("usage: localheat Lx Ly Lz Lt Ls time");
    return 1;

  /* Network extent is 1 in each of the four directions, so the lattice
     extent equals the local extent taken from argv[1..4]. */
  for (i = 0; i < 4; i++) {
    mynetwork[i] = 1;
    mylocal[i] = atoi(argv[i+1]);
    mylattice[i] = mylocal[i] * mynetwork[i];
  /* Fifth extent (Ls in the usage text) comes from argv[5]; runtime from argv[6]. */
  mylocal[4] = mylattice[4] = atoi(argv[5]);
  total_sec = atoi(argv[6]);

  zshowv4("network", mynetwork);
  zshowv5("local lattice", mylocal);
  zshowv5("lattice", mylattice);
  /* NOTE(review): total_sec is assigned from atoi() but printed with %.0f;
     its declared type is not visible here -- confirm. */
  zprint("total requested runtime %.0f sec", total_sec);

#if 0
  if (QMP_declare_logical_topology(mynetwork, 4) != QMP_SUCCESS) {
    zprint("declare_logical_top failed");
    goto end;

  getv(mynode, 0, QMP_get_logical_number_of_dimensions(),
  { int i;
    /* Set all four node coordinates to zero. */
    for (i = 0; i < 4; i++)
       mynode[i] = 0;

  /* A non-zero return from QOP_MDWF_init is treated as failure. */
  if (QOP_MDWF_init(&mdwf_state,
		    mylattice, mynetwork, mynode, primary,
		    getsub, NULL)) {
    zprint("MDWF_init() failed");
    goto end;

  zprint("MDWF_init() done");

  /* Build the parameter set from b5/c5 and the two literal constants 0.123 and 0.05. */
  if (QOP_MDWF_set_generic(&mdwf_params, mdwf_state, b5, c5, 0.123, 0.05)) {
    zprint("MDW_set_generic() failed");
    goto end;
  zprint("MDWF_set_generic() done");

  /* A non-zero return from do_run is treated as failure. */
  if (do_run(mdwf_state, mdwf_params)) {
    zprint("float test failed");
    goto end;


  zprint("Heater test finished");
  status = 0;
  return status;
Beispiel #17
// Sets the QMP-related globals assigned below (peRank, peNum, NDIM, peGrid,
// pePos), declares the logical topology, and sets `initialized` to true.
// NOTE(review): several `#if 0` openers appear here without a visible
// #else/#endif, and braces and switch labels are missing, so which statements
// are actually compiled cannot be determined from this excerpt.
void init_qmp(int * argc, char ***argv) {

#if 0
  printf("init_qmp(%d %p)\n",*argc,*argv);
  for(int i = 0; i<*argc;i++){

#if 0
    // Start QMP requesting QMP_THREAD_SINGLE; the provided level is stored in prv.
    QMP_thread_level_t prv;
    QMP_status_t init_status = QMP_init_msg_passing(argc, argv, QMP_THREAD_SINGLE, &prv);
    if (init_status) printf("QMP_init_msg_passing returned %d\n",init_status);
    peRank = QMP_get_node_number();
    peNum = QMP_get_number_of_nodes();
    // Only the process with peRank == 0 prints here.
    if(!peRank)printf("QMP_init_msg_passing returned %d\n",init_status);

    if (init_status != QMP_SUCCESS) {

    // check QMP thread level
    // Added by Hantao
    if(peRank == 0) {
        switch(prv) {
        case QMP_THREAD_SINGLE:
            printf("QMP thread level = QMP_THREAD_SINGLE\n");
            printf("QMP thread level = QMP_THREAD_FUNNELED\n");
            printf("QMP thread level = QMP_THREAD_SERIALIZED\n");
            printf("QMP thread level = QMP_THREAD_MULTIPLE\n");
            printf("QMP thread level = no idea what this is, boom!\n");

    //Check to make sure that this machine is a GRID machine
    //Exit if not GRID machine
    QMP_ictype qmp_type = QMP_get_msg_passing_type();

    //Get information about the allocated machine
    peNum = QMP_get_number_of_nodes();
    NDIM = QMP_get_allocated_number_of_dimensions();
    peGrid = QMP_get_allocated_dimensions();
    pePos = QMP_get_allocated_coordinates();

      for(int i = 0; i<*argc;i++){
    QMP_status_t init_status = QMP_SUCCESS;

//#if (TARGET == BGL) || (TARGET == BGP)
  // With more than five allocated dimensions, peNum becomes the product of the
  // first five grid extents and peRank is reduced modulo that product.
  if (NDIM>5){
    peNum = 1;
    for(int i = 0;i<5;i++)
	peNum *= peGrid[i];
    peRank = peRank % peNum;
  // if_print stays 1 only when every coordinate in pePos is below 2.
  int if_print=1;
  for(int i = 0;i<NDIM;i++)
  if (pePos[i]>=2) if_print=0;

  // Print rank, node count, dimension count, then grid extents and coordinates.
  if (if_print){
      printf("Rank=%d Num=%d NDIM=%d\n",peRank,peNum,NDIM);
      for(int i = 0;i<NDIM;i++)
        printf(" %d",peGrid[i]);
      for(int i = 0;i<NDIM;i++)
        printf(" %d",pePos[i]);

#if 0
    int rc;
    BGLPersonality pers;
    rts_get_personality(&pers, sizeof(pers));
    printf("from personality: %d %d %d %d\n",pers.xCoord,pers.yCoord,pers.zCoord,rts_get_processor_id());

//     printf("from personality:\n");

#if 0
    if ( (qmp_type!= QMP_GRID) && (qmp_type !=QMP_MESH)  ) {
      QMP_error("CPS on QMP only implemented for GRID or MESH, not (%d) machines\n",qmp_type);

//     printf("QMP_declare_logical_topology(peGrid, NDIM)\n");
    //Declare the logical topology (Redundant for GRID machines)
    // The logical topology is declared with the allocated grid (peGrid, NDIM).
    if (QMP_declare_logical_topology(peGrid, NDIM) != QMP_SUCCESS) {
      QMP_error("Node %d: Failed to declare logical topology\n",peRank);
    initialized = true;
  printf("Rank=%d init_qmp() done\n",peRank);
Beispiel #18
/* Returns the value of QMP_get_node_number(). */
int comm_rank(void)
  return QMP_get_node_number();
Beispiel #19
/* Driver for a CLOVER operator test: reads lattice extents, seed, kappa and
 * c_sw from the command line, fills gauge and fermion fields with gaussian
 * random values, applies the CLOVER D operator and its conjugate, and prints
 * the plaquette, field norms and two dot products.
 * Returns `status`, which starts at 1 and is set to 0 only near the end.
 * NOTE(review): this copy has lost its braces, the `end` label, and parts of
 * several statements (the get_vector calls, the body of the c_f cleanup loop)
 * -- confirm against the complete source. */
main(int argc, char *argv[])
    int status = 1;
    int mu, i;
    struct QOP_CLOVER_State *clover_state;
    QDP_Int *I_seed;
    int i_seed;
    QDP_RandomState *state;
    QLA_Real plaq;
    QLA_Real n[NELEMS(F)];
    struct QOP_CLOVER_Gauge *c_g;
    struct QOP_CLOVER_Fermion *c_f[NELEMS(F)];
    double kappa;
    double c_sw;

    /* start QDP */
    QDP_initialize(&argc, &argv);

    /* Expected arguments: program name, NDIM lattice extents, seed, kappa, c_sw. */
    if (argc != 1 + NDIM + 3) {
        printf0("ERROR: usage: %s Lx ... seed kappa c_sw\n", argv[0]);
        goto end;

    /* argv[1..NDIM] are lattice extents; the next three are seed, kappa, c_sw. */
    for (mu = 0; mu < NDIM; mu++) {
        lattice[mu] = atoi(argv[1 + mu]);
    i_seed = atoi(argv[1 + NDIM]);
    kappa = atof(argv[2 + NDIM]);
    c_sw = atof(argv[3 + NDIM]);
    /* set lattice size and create layout */
    QDP_set_latsize(NDIM, lattice);

    primary = QMP_is_primary_node();
    self = QMP_get_node_number();
    /* NOTE(review): both get_vector calls are cut off after their third argument. */
    get_vector(network, 1, QMP_get_logical_number_of_dimensions(),
    get_vector(node, 0, QMP_get_logical_number_of_dimensions(),
    printf0("network: ");
    for (i = 0; i < NDIM; i++)
        printf0(" %d", network[i]);

    printf0("node: ");
    for (i = 0; i < NDIM; i++)
        printf0(" %d", node[i]);

    printf0("kappa: %20.15f\n", kappa);
    printf0("c_sw:  %20.15f\n", c_sw);

    /* allocate the gauge field */
    create_Mvector(U, NELEMS(U));
    create_Mvector(C, NELEMS(C));
    create_Dvector(F, NELEMS(F));
    /* Seed the random state from i_seed and an integer field filled via icoord,
       then fill every U[mu] and F[i] with gaussian values from that state. */
    I_seed = QDP_create_I();
    QDP_I_eq_funci(I_seed, icoord, QDP_all);
    state = QDP_create_S();
    QDP_S_eq_seed_i_I(state, i_seed, I_seed, QDP_all);
    for (mu = 0; mu < NELEMS(U); mu++) {
        QDP_M_eq_gaussian_S(U[mu], state, QDP_all);
    for (i = 0; i < NELEMS(F); i++) {
        QDP_D_eq_gaussian_S(F[i], state, QDP_all);

    /* build the clovers */
    clover(C, U);

    /* initialize CLOVER */
    if (QOP_CLOVER_init(&clover_state, lattice, network, node, primary,
                        sublattice, NULL)) {
        printf0("CLOVER_init() failed\n");
        goto end;

    /* c_f[0] and c_f[1] are imported from F[0] and F[1]; c_f[2] and c_f[3]
       are allocated to hold results. */
    if (QOP_CLOVER_import_fermion(&c_f[0], clover_state, f_reader, F[0])) {
        printf0("CLOVER_import_fermion(0) failed\n");
        goto end;

    if (QOP_CLOVER_import_fermion(&c_f[1], clover_state, f_reader, F[1])) {
        printf0("CLOVER_import_fermion(1) failed\n");
        goto end;

    if (QOP_CLOVER_allocate_fermion(&c_f[2], clover_state)) {
        printf0("CLOVER_allocate_fermion(2) failed\n");
        goto end;

    if (QOP_CLOVER_allocate_fermion(&c_f[3], clover_state)) {
        printf0("CLOVER_allocate_fermion(3) failed\n");
        goto end;

    /* NOTE(review): this failure path uses printf while the others use printf0. */
    if (QOP_CLOVER_import_gauge(&c_g, clover_state, kappa, c_sw,
                                u_reader, c_reader, NULL)) {
        printf("CLOVER_import_gauge() failed\n");
        goto end;

    /* c_f[2] = D c_f[0], exported to F[2]. */
    QOP_CLOVER_D_operator(c_f[2], c_g, c_f[0]);
    QOP_CLOVER_export_fermion(f_writer, F[2], c_f[2]);

    /* c_f[3] = conjugated D applied to c_f[1], exported to F[3]. */
    QOP_CLOVER_D_operator_conjugated(c_f[3], c_g, c_f[1]);
    QOP_CLOVER_export_fermion(f_writer, F[3], c_f[3]);
    /* free CLOVER */
    /* NOTE(review): the body of this loop is not visible in this excerpt. */
    for (i = 0; i < NELEMS(c_f); i++)


    /* Compute plaquette */
    plaq = plaquette(U);

    /* field norms */
    for (i = 0; i < NELEMS(F); i++)
        QDP_r_eq_norm2_D(&n[i], F[i], QDP_all);

    /* Display the values */
    /* The plaquette is divided by volume * Nc * NDIM * (NDIM - 1) / 2 before printing. */
    printf0("plaquette = %g\n",
            plaq / (QDP_volume() * QDP_Nc * NDIM * (NDIM - 1) / 2 ));
    for (i = 0; i < NELEMS(F); i++)
        printf0(" |f|^2 [%d] = %20.10e\n", i, (double)(n[i]));

    /* Compute and display <f[1] f[2]> */
    show_dot("1|D0", F[1], F[2]);
    /* Compute and display <f[3] f[0]> */
    show_dot("X1|0", F[3], F[0]);

    /* Release the gauge, clover and fermion field vectors created above. */
    destroy_Mvector(U, NELEMS(U));
    destroy_Mvector(C, NELEMS(C));
    destroy_Dvector(F, NELEMS(F));

    status = 0;
    /* shutdown QDP */
    return status;