C++ (Cpp) GA_Nodeid 예제들

예제 #1

0

파일 보기

파일: rtdb.c 프로젝트: wadejong/NWChem-Json

/**
  \brief Clone the RTDB creating a new file

  Take the filename from the specified RTDB and create a new filename with
  the specified suffix. Then copy the current RTDB to the new file.

  \param handle [Input] the RTDB handle
  \param suffix [Input] the suffix for the new RTDB file

  \return the status of the file copy operation
*/
int rtdb_clone(const int handle, const char *suffix)
{
  int status;
#ifdef GAGROUPS
  int me = GA_Nodeid();
#else
  me = GA_Nodeid();
#endif

  if (!verify_parallel_access()) return 0;
  if (handle < 0 || handle >= MAX_RTDB) {
    (void) fprintf(stderr, "rtdb_clone: handle out of range %d\n", handle);
    (void) fflush(stderr);
    return 0;
  }
  if (par_mode[handle] == INACTIVE) {
    (void) fprintf(stderr, "rtdb_clone: handle not active %d\n",handle);
    (void) fflush(stderr);
    return 0;
  }
  if (parallel_mode != par_mode[handle]) {
    (void) fprintf(stderr, "rtdb_clone: mode of open and copy mismatch\n");
    (void) fflush(stderr);
    return 0;
  }

  if (parallel_mode == RTDB_SEQ_MODE || me == 0)
    status = rtdb_seq_copy(handle, suffix);

  if (parallel_mode == RTDB_PAR_MODE) 
    rtdb_broadcast(TYPE_RTDB_STATUS, MT_INT, 1, (void *) &status);

  return status;
}

예제 #2

0

파일 보기

파일: rate.c 프로젝트: jeffhammond/ga

int main(int argc, char **argv)
{
    int status, me;
    int max_arrays = 10;
    double max_sz = 1e8, max_disk = 1e10, max_mem = 1e6;
    int stack = 120000, heap = 3200000;
    int numfiles, numioprocs;
    int total, nproc;

    MP_INIT(argc,argv);
    GA_Initialize();
    me = GA_Nodeid();
    nproc = GA_Nnodes();
    total = pow(SIZE,NDIM)*sizeof(double);
    if (!GA_Uses_ma()) {
        if (GA_Nodeid() == 0) {
            printf("GA is not using MA\n");
        }
        stack = 100000;
        heap  = (int)(2.2*(float)(total));
    }

    if (MA_init(MT_F_DBL, stack, heap) ) {
        if (DRA_Init(max_arrays, max_sz, max_disk, max_mem) != 0)
            GA_Error("DRA_Init failed: ",0);
        if (USER_CONFIG == 0) {
            numfiles = -1;
            numioprocs = -1;
        } else if (USER_CONFIG == 1) {
            numfiles = 1;
            numioprocs = GA_Cluster_nnodes();
        } else if (USER_CONFIG == 2) {
            numfiles = GA_Cluster_nnodes();
            numioprocs = GA_Cluster_nnodes();
        } else {
            numfiles = 1;
            numioprocs = 1;
        }
        if (me==0) {
            printf("Disk resident arrays configured as:\n");
            printf("    Number of files: %d\n",numfiles);
            printf("    Number of I/O processors: %d\n",numioprocs);
        }
        DRA_Set_default_config(numfiles,numioprocs);
        if (me == 0) printf("\n");
        if (me == 0) printf("TESTING PERFORMANCE OF DISK ARRAYS\n");
        if (me == 0) printf("\n");
        test_io_dbl();
        status = DRA_Terminate();
        GA_Terminate();
    } else {
        printf("MA_init failed\n");
    }
    if(me == 0) printf("all done ...\n");
    MP_FINALIZE();
    return 0;
}

예제 #3

0

파일 보기

파일: gada.c 프로젝트: adrielb/DCell

PetscErrorCode testCreate2D()
{
  int ga;
  DA da;
  DALocalInfo info;
  Vec vec;
  PetscErrorCode ierr;
  
  PetscFunctionBegin;
  int d1 = 1453, d2 = 1451;
  ierr = DACreate2d(PETSC_COMM_WORLD,DA_NONPERIODIC,DA_STENCIL_STAR,
              d1,d2,PETSC_DECIDE,PETSC_DECIDE,1,1,0,0, &da); CHKERRQ(ierr);
  ierr = DAGetLocalInfo(da,&info); CHKERRQ(ierr);
  ierr = DACreateGlobalArray( da, &ga, &vec); CHKERRQ(ierr);
  
  PetscReal **v;
  ierr = DAVecGetArray(da,vec,&v); CHKERRQ(ierr);
  int xe = info.xs+info.xm,
      ye = info.ys+info.ym;
  for (int j = info.ys; j < ye; ++j) {
    for (int i = info.xs; i < xe; ++i) {
      v[j][i] = 1.*i + d1 * j;
    }
  }
  ierr = DAVecRestoreArray(da,vec,&v); CHKERRQ(ierr);
  PetscPrintf(PETSC_COMM_WORLD,"Updated local portion with DAVec\n");
  PetscBarrier(0);
  {
    double *da_ptr;
  VecGetArray(vec, &da_ptr);
  double *ptr;
  int low[2],hi[2],ld;
  NGA_Distribution(ga,GA_Nodeid(),low,hi);
  NGA_Access(ga,low,hi,&ptr,&ld);
  printf("[%d] ga:%p\tda:%p\tdiff:%p\n", GA_Nodeid(), ptr, da_ptr, (ptr-da_ptr) );
  NGA_Release_update(ga,low,hi);
  }
  
  int lo[2],ld;
  double val;
  for (int j = 0; j < d2; ++j) {
    for (int i = 0; i < d1; ++i) {
      lo[0] = j;
      lo[1] = i;
      NGA_Get(ga,lo,lo,&val,&ld);
      if( PetscAbs( i + d1*j - val) > .1 )
        printf(".");
//        printf("[%d] (%3.0f,%3.0f)\n", GA_Nodeid(), 1.*i + d1*j, val);
    }
  }
  GA_Print_stats();
  ierr = VecDestroy(vec); CHKERRQ(ierr);
  GA_Destroy(ga);
  PetscFunctionReturn(0);
}

예제 #4

0

파일 보기

파일: nw_inp_from_string.c 프로젝트: dmlb2000/nwchem-cml

int nw_inp_from_string(Integer rtdb, const char *input)
{
    char filename[30];
    FILE *file;
#if defined(USE_FCD) || defined(CRAY_T3E) || defined(WIN32)
    _fcd fstring;
#else
    char fstring[255];
#endif
    int status;
    const char base[] = "temp";
    const char ending[] = ".nw";
    int number ;

// This is bad, not 100% sure to be unique, since could be subgroup
    if (GA_Pgroup_get_world() != GA_Pgroup_get_default()) {
       number = (int) util_sgroup_mygroup_() ;
    } else {
       number = 0 ;
    }
    sprintf(filename, "%s%d%s", base,number,ending);
    if (GA_Nodeid() == 0) {
      if (!(file = fopen(filename,"w"))) {
        GA_Error("nw_inp_from_string: failed to open temp.nw\n",0);
      }
      if (fwrite(input, 1, strlen(input), file) != strlen(input)) {
        GA_Error("nw_inp_from_string: failed to write to temp.nw\n",0);
      }
      if (fwrite("\n", 1, 1, file) != 1) {
        GA_Error("nw_inp_from_string: failed to write to temp.nw\n",0);
      }
      (void) fclose(file);
    }

#if defined(CRAY_T3E)
      fstring = _cptofcd(filename, strlen(filename));
      status = nw_inp_from_file_(&rtdb, fstring);
#elif defined(WIN32)
    fstring.string = filename;
    fstring.len = strlen(filename);
    status = nw_inp_from_file_(&rtdb, fstring);
#elif defined(USE_FCD)
#error Do something about _fcd
#else
    status = nw_inp_from_file_(&rtdb, filename, strlen(filename));
#endif


    if (GA_Nodeid() == 0) (void) unlink(filename);

    return status;
}

예제 #5

0

파일 보기

파일: spiral-mm.c 프로젝트: pghosh2/GA-Matrix-Multiplication-Benchmark

void verify(int g_a, int g_b, int g_c, int *lo, int *hi, int *ld, int N) 
{

	double rchk, alpha=1.0, beta=0.0;
	int g_chk, me=GA_Nodeid();

	g_chk = GA_Duplicate(g_a, "array Check");
	if(!g_chk) GA_Error("duplicate failed",NDIMS);
	GA_Sync();

	GA_Dgemm('n', 'n', N, N, N, 1.0, g_a,
			g_b, 0.0, g_chk);

	GA_Sync();

	alpha=1.0, beta=-1.0;
	GA_Add(&alpha, g_c, &beta, g_chk, g_chk);
	rchk = GA_Ddot(g_chk, g_chk);

	if (me==0) {
		printf("Normed difference in matrices: %12.4e\n", rchk);
		if(rchk < -TOLERANCE || rchk > TOLERANCE)
			GA_Error("Matrix multiply verify failed",0);
		else
			printf("Matrix Mutiply OK\n");
	}

	GA_Destroy(g_chk);
}

예제 #6

0

파일 보기

파일: ga_asymmetr.c 프로젝트: dmlb2000/nwchem-cml

void FATR 
ga_antisymmetrize_(Integer *g_a) {
  
  DoublePrecision alpha = 0.5;
  int i, me = GA_Nodeid();
  extern void * FATR ga_malloc(Integer nelem, int type, char *name);
  extern void FATR ga_free(void *ptr);
  void FATR gai_subtr(int *lo, int *hi, void *a, void *b, DoublePrecision alpha,
                      int type, Integer nelem, int ndim);

  int alo[GA_MAX_DIM], ahi[GA_MAX_DIM], lda[GA_MAX_DIM];
  int blo[GA_MAX_DIM], bhi[GA_MAX_DIM], ldb[GA_MAX_DIM];
  int ndim, dims[GA_MAX_DIM], type;
  Integer nelem=1;
  Logical have_data;
  void *a_ptr, *b_ptr;

  GA_Sync();


  
  NGA_Inquire((int)(*g_a), &type, &ndim, dims);
  
  if (dims[0] != dims[1]) 
    GA_Error("ga_sym: can only sym square matrix", 0L);
  
  /* Find the local distribution */
  NGA_Distribution((int)(*g_a), me, alo, ahi);
 
 
  have_data = ahi[0]>=0;
  for(i=1; i<ndim; i++) have_data = have_data && ahi[i]>=0;
  
  if(have_data) {
    NGA_Access((int)(*g_a), alo, ahi, &a_ptr, lda); 
    
    for(i=0; i<ndim; i++) nelem *= ahi[i]-alo[i] +1;
    b_ptr = (void *) ga_malloc(nelem, MT_C_DBL, "v");
    
    for(i=2; i<ndim; i++) {bhi[i]=ahi[i]; blo[i]=alo[i]; }
    
    /* switch rows and cols */
    blo[1]=alo[0];
    bhi[1]=ahi[0];
    blo[0]=alo[1];
    bhi[0]=ahi[1];

    for (i=0; i < ndim-1; i++) 
      ldb[i] = bhi[i+1] - blo[i+1] + 1; 
    NGA_Get((int)(*g_a), blo, bhi, b_ptr, ldb);
  }
  GA_Sync(); 

  if(have_data) {
    gai_subtr(alo, ahi, a_ptr, b_ptr, alpha, type, nelem, ndim);
    NGA_Release_update((int)(*g_a), alo, ahi);
    ga_free(b_ptr);
  }
  GA_Sync();
}

예제 #7

0

파일 보기

파일: sched_groups.c 프로젝트: 000Justin000/nwchem

/*server signals all clients to terminate*/
void signal_termination(int server_id, int msg_src_mpi) {
  int i, v=TERM_CLIENT;
  const int rank = GA_Nodeid();
  const int size = GA_Nnodes(); 
  const int default_grp = ga_pgroup_get_default_();

/*   assert(server_id == rank);  /\*call with your id, not someone else's*\/ */
  assert(server_id == SVR); /*Only server may invoke this method*/
#ifdef LEADER_BCAST
  {
    int *pid_list = (int *)alloca(size*sizeof(int));
    int root = GA_Pgroup_absolute_id(default_grp, SVR);
/*     int src = GA_Pgroup_absolute_id(default_grp, rank); */
    for(i=0; i<size; i++) {
      pid_list[i] = GA_Pgroup_absolute_id(default_grp,i);
    }
    broadcast(size,pid_list,root,msg_src_mpi,&v,sizeof(int));
  }
#else
  for(i=0; i<size; i++) {
    if(i != SVR) {
      MPI_Send(&v, 1, MPI_INT, GA_Pgroup_absolute_id(default_grp,i), 
	       SIGNAL_TAG, MPI_COMM_WORLD);
    }
  }
#endif
}

예제 #8

0

파일 보기

파일: linux_printaff.c 프로젝트: ryancoleman/lotsofcoresbook1code

int linux_printaff_(){
  mypid=getpid();
#ifdef MPI
  MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
#else
  myrank=GA_Nodeid();
#endif
  CPU_ZERO(&mycpuid);
  if (sched_getaffinity(mypid, sizeof(mycpuid), &mycpuid) < 0) {
    perror("sched_getaffinity");
    return -1;
  }
  for (i = 0; i < MXCPUS; i++){
    if (CPU_ISSET(i, &mycpuid)) {
     caff[numaff]=i;  numaff+=1;
    }
  }
  if(numaff>0) {
    printf("rank %d pid %d bind to %d CPUs:", myrank, (int) mypid, numaff);
    for (i = 0; i < numaff; i++){
      printf(" %i ", caff[i]);
    }
    printf(" \n");
    fflush(stdout);
  }
  return 0;
}

예제 #9

0

파일 보기

파일: transpose.c 프로젝트: timasjov/parallel-computing-ga-mpi

int main(int argc, char **argv) {
    int heap=300000, stack=300000;
    int me, nprocs, i;
    double start, end;
	
    /* Initialize Message Passing library */
    MPI_Init(&argc, &argv);

    /* Initialize GA */
    GA_Initialize();
    
    /* Initialize Memory Allocator (MA) */
    if(! MA_init(C_DBL, stack, heap) ) GA_Error("MA_init failed",stack+heap);

    me     = GA_Nodeid();
    nprocs = GA_Nnodes();
    if(me==0) {
       printf("\nUsing %d processes\n\n", nprocs); fflush(stdout);
    }
     
	start = MPI_Wtime();
	for(i =0; i< 1000; i++) {
		TRANSPOSE1D();
	}
	end = MPI_Wtime();
	if(me==0) printf("  Time=%2.5e secs\n\n",end-start); 
    
    if(me==0)printf("\nTerminating ..\n");
    GA_Terminate();
    MPI_Finalize();    
}

예제 #10

0

파일 보기

파일: rtdb.c 프로젝트: wadejong/NWChem-Json

/**
  \brief Delete the data associated with a key from the RTDB

  \param handle [Input] the RTDB handle
  \param name   [Input] the key 

  \return the status of the delete operation
*/
int rtdb_delete(const int handle, const char *name)
{
  int status;
#ifdef GAGROUPS
  int me = GA_Nodeid();
#endif

  if (!verify_parallel_access()) return 0;

  if (handle < 0 || handle >= MAX_RTDB) {
    (void) fprintf(stderr, "rtdb_delete: handle out of range %d\n", handle);
    (void) fflush(stderr);
    return 0;
  }
  if (par_mode[handle] == INACTIVE) {
    (void) fprintf(stderr, "rtdb_delete: handle not active %d\n",handle);
    (void) fflush(stderr);
    return 0;
  }

  if (par_mode[handle] == RTDB_SEQ_MODE && parallel_mode == RTDB_PAR_MODE) {
    (void) fprintf(stderr, "rtdb_delete: seq. open and par. delete\n");
    (void) fflush(stderr);
    return 0;
  }

  if (parallel_mode == RTDB_SEQ_MODE || me == 0)
    status = rtdb_seq_delete(handle, name);

  if (parallel_mode == RTDB_PAR_MODE)
    rtdb_broadcast(TYPE_RTDB_STATUS, MT_INT, 1, (void *) &status);

  return status;
}

예제 #11

0

파일 보기

파일: ga_zero.c 프로젝트: jeffhammond/ga

int main(int argc, char **argv)
{
    TEST_SETUP;

    int shape_idx=0, type_idx=0, dist_idx=0;
    int return_code=0;

    for (shape_idx=0; shape_idx < NUM_SHAPES; ++shape_idx) {
        for (type_idx=0; type_idx < NUM_TYPES; ++type_idx) {
            for (dist_idx=0; dist_idx < NUM_DISTS; ++dist_idx) {
                if (0 == GA_Nodeid()) {
                    printf("%s\t%s\t%s\n",
                            SHAPE_NAMES[shape_idx],
                            TYPE_NAMES[type_idx],
                            DIST_NAMES[dist_idx]
                            );
                }
                GA_Sync();
                return_code = test(shape_idx, type_idx, dist_idx);
                if (0 != return_code) {
                    break;
                }
            }
            if (0 != return_code) {
                break;
            }
        }
        if (0 != return_code) {
            break;
        }
    }

    TEST_TEARDOWN;
    return return_code;
}

예제 #12

0

파일 보기

파일: nga_access.c 프로젝트: niuqingpeng/microbenchmark

int main(int argc, char** argv) 
{ 
    int nprocs,myid,nprocssq; 
    int dims[2],chunk[2]; 
    int i,j,k; 
    int stack = 100000, heap = 100000; 
    MPI_Init(&argc,&argv); 
    GA_Initialize(); 
    MA_init(C_DBL,stack,heap); 
    nprocssq = GA_Nnodes(); 
    nprocs = sqrt(nprocssq); 
    myid = GA_Nodeid(); 
    dims[0] = N; dims[1] = N; 
    chunk[0] = N/nprocs; 
    chunk[1] = N/nprocs; 
    int g_a = NGA_Create(C_DBL,2,dims,"Array A",chunk); 
    int lo[2],hi[2]; 
    NGA_Distribution(g_a,myid,lo,hi); 
    int ld[1] = {N/nprocs}; 
    void *ptr; 
    double *local; 
    printf("Myid = %d, lo = [%d,%d] , hi = [%d,%d] , ld = %d \n",myid,lo[0],lo[1],hi[0],hi[1],ld[0]); 
    NGA_Access(g_a,lo,hi,&ptr,ld); 
    local = (double*) ptr; 
    printf("Myid = %d , local[0][0] = %f\n",*local); 
    GA_Sync(); 
    GA_Destroy(g_a); 
    GA_Terminate(); 
    MPI_Finalize(); 
    return 0; 
}

예제 #13

0

파일 보기

파일: util_gnxtval.c 프로젝트: jnafziger/pdft-nwchem

Integer util_gnxtval_(Integer *val) {

    if(*val > 0) {
       if(!initialized) ga_error("nxtval: not yet initialized", 0L);
       return (Integer) NGA_Read_inc(g_T, &subscript, 1);
    }
    else if(*val==0) {
       int n = 1;
       initialized=1;

       /* create task array */
       g_T = NGA_Create(C_LONG, 1, &n,"Atomic Task", NULL);
       
       /* Initialize the task array */
       if(GA_Nodeid()==0) {
	  int lo=0, hi=0;
	  NGA_Put (g_T, &lo, &hi, &initval, &hi);
	  initval=0;
       }
              GA_Sync();
       return 0;
    }
    else if (*val < 0) { GA_Destroy(g_T); initialized=0; initval=0; return 0;}
    
    ga_error("nxtval: invalid value passed", 0L);
    return -1;
}

예제 #14

0

파일 보기

파일: rtdb.c 프로젝트: wadejong/NWChem-Json

void rtdb_print_usage()
{
#ifdef USE_HDBM
#ifdef GAGROUPS
  int me = GA_Nodeid();
#endif
  if (me == 0)
    hdbm_print_usage();
#endif
}

예제 #15

0

파일 보기

파일: print.c 프로젝트: jeffhammond/ga

int main(int argc, char **argv)
{
    int size_dst = 15;
    int g_a = 0;
    int I_NEG_ONE = -1;
    long L_NEG_ONE = -1;
    long long LL_NEG_ONE = -1;
    int FIVE = 5;
    int TEN = 10;
    int lo;
    int hi;
    int *ptr;
    int i;

    MP_INIT(argc,argv);

    GA_INIT(argc,argv);

    for (i=0; i<3; ++i) {
        if (0 == i) {
            g_a = NGA_Create(C_INT, 1, &size_dst, "dst", NULL);
            GA_Fill(g_a, &I_NEG_ONE);
        } else if (1 == i) {
            g_a = NGA_Create(C_LONG, 1, &size_dst, "dst", NULL);
            GA_Fill(g_a, &L_NEG_ONE);
        } else if (2 == i) {
            g_a = NGA_Create(C_LONGLONG, 1, &size_dst, "dst", NULL);
            GA_Fill(g_a, &LL_NEG_ONE);
        }
        GA_Sync();
        GA_Print(g_a);
        NGA_Print_patch(g_a, &FIVE, &TEN, 0);
        NGA_Print_patch(g_a, &FIVE, &TEN, 1);
        NGA_Distribution(g_a, GA_Nodeid(), &lo, &hi);
        NGA_Access(g_a, &lo, &hi, &ptr, NULL);
        printf("[%d] (%d)=%d\n", GA_Nodeid(), lo, *ptr);
        NGA_Release(g_a, &lo, &hi);
    }

    GA_Terminate();
    MP_FINALIZE();
    exit(EXIT_SUCCESS);
}

예제 #16

0

파일 보기

파일: gemmtest.c 프로젝트: jeffhammond/ga

void
do_work() {
  int i;
  int me = GA_Nodeid();

  test(C_FLOAT);
  test(C_DBL);
  test(C_DCPL);
  test(C_SCPL);
  /*
  */
  if(me == 0) printf("\n\n");
}

예제 #17

0

파일 보기

파일: util_mic_support.c 프로젝트: 000Justin000/nwchem

int util_mic_get_device_() {
        int count;
	if (!offload_master_()) {
		fprintf(stdout, "%02d: need to be offload master\n", GA_Nodeid());
		GA_Error("util_mic_get_device error", 0L);
	}
        count = util_cgetppn()/util_mic_get_num_devices_();
        if (count > 0) {
          count=util_my_smp_index()/util_nwc_ranks_per_device_();
        }
	return count;


}

예제 #18

0

파일 보기

파일: rtdb.c 프로젝트: wadejong/NWChem-Json

/**
  \brief Open an RTDB stored on a given file

  The RTDB is stored on a file. Before the RTDB can be accessed in the program
  the file needs to be opened. To access the RTDB a handle is associated with
  with the file and this handle is used in the actual access routines.

  \param filename [Input] the name of the file holding the RTDB
  \param mode     [Input] the initial access mode of the RTDB
  \param handle   [Output] the RTBD handle

  \return The return value of the file open command
*/
int rtdb_open(const char *filename, const char *mode, int *handle)
{
  int status;
#ifdef GAGROUPS
  int me = GA_Nodeid();
#else
  me = GA_Nodeid();
#endif

  if (!verify_parallel_access()) return 0;

  if (parallel_mode == RTDB_SEQ_MODE || me == 0)
    status = rtdb_seq_open(filename, mode, handle);

  if (parallel_mode == RTDB_PAR_MODE) {
    rtdb_broadcast(TYPE_RTDB_HANDLE, MT_INT, 1, (void *) handle);
    rtdb_broadcast(TYPE_RTDB_STATUS, MT_INT, 1, (void *) &status);
  }

  if (status)
    par_mode[*handle] = parallel_mode;

  return status;
}

예제 #19

0

파일 보기

파일: util_mic_support.c 프로젝트: 000Justin000/nwchem

void FATR util_mic_set_affinity_() {
	char affinity[BUFSZ];
	char num_threads[BUFSZ];
	int pos;

	int micdev;
	int nprocs;
	int ranks_per_dev;
	int rank_on_dev;
	int nthreads;
	int ppn;
	int ranks_per_device=util_getenv_nwc_ranks_per_device_();

	if (ranks_per_device == 0) {
	  return ;
	} else if (ranks_per_device < 0){
	  ranks_per_device = RANKS_PER_DEVICE;
	}
	
	pos=snprintf(affinity, BUFSZ, "KMP_PLACE_THREADS=");
	micdev=util_mic_get_device_();
	ppn=util_cgetppn();
#pragma offload target(mic:micdev) out(nprocs)
	{
		/* do one offload to query the coprocessor for the number of cores */
		nprocs = ((int) sysconf(_SC_NPROCESSORS_ONLN) / 4) - 1;
	}

	rank_on_dev = util_my_smp_index() % util_nwc_ranks_per_device_();
	
	nthreads = nprocs / ranks_per_device * DEFAULT_OFFLOAD_THREAD_MULTIPLIER;
	
	pos+=snprintf(affinity+pos, BUFSZ-pos, "%dc,%dt,%do",
	              nprocs / ranks_per_device, DEFAULT_OFFLOAD_THREAD_MULTIPLIER,
				  rank_on_dev * (nprocs / ranks_per_device));
	snprintf(num_threads, BUFSZ, "OMP_NUM_THREADS=%d", nthreads);
	
	printf("%02d: micdev=%d nprocs=%d rank_on_dev=%d ranks_per_device=%d affinity='%s' pos=%d\n", 
	       GA_Nodeid(), micdev, nprocs, rank_on_dev, ranks_per_device, affinity, pos);
	fflush(stdout);
#pragma offload target(mic:micdev) in(affinity) in(num_threads)
	{
		/* set the affinity masks and the number of offloaded OpenMP threads */
		kmp_set_defaults("KMP_AFFINITY=compact");
		kmp_set_defaults(affinity);
		kmp_set_defaults(num_threads);
	}
}

예제 #20

0

파일 보기

파일: rtdb.c 프로젝트: wadejong/NWChem-Json

static int verify_parallel_access()
/*
  Return true if access mode / processor values are sensible
*/
{
#ifdef GAGROUPS
  int me = GA_Nodeid();
#endif
  if ((parallel_mode == RTDB_SEQ_MODE) && (me != 0)) {
    (void) fflush(stdout);
    (void) fprintf(stderr,"rtdb: sequential access only possible for process 0\n");
    (void) fflush(stderr);
    return 0;
  }
  else
    return 1;
}

예제 #21

0

파일 보기

파일: rtdb.c 프로젝트: wadejong/NWChem-Json

/**
  \brief Retrieve data from the RTDB

  Retrieve the data associated with the specified key from the RTDB and 
  return it in the provided array.

  \param handle  [Input] the RTDB handle
  \param name    [Input] the key for the data
  \param ma_type [Input] the type of the data specified by one of the MA data types (see mafdecls.fh)
  \param nelem   [Input] the number of elements of the specified type
  \param array   [Output] the actual data retrieved

  \return the status of the read operation
*/
int rtdb_get(const int handle, const char *name, const int ma_type,
		 const int nelem, void *array)
{
  int status;
#ifdef GAGROUPS
  int me = GA_Nodeid();
#endif

  if (!verify_parallel_access()) return 0;

  if (handle < 0 || handle >= MAX_RTDB) {
    (void) fprintf(stderr, "rtdb_get: handle out of range %d\n", handle);
    (void) fflush(stderr);
    return 0;
  }
  if (par_mode[handle] == INACTIVE) {
    (void) fprintf(stderr, "rtdb_get: handle not active %d\n",handle);
    (void) fflush(stderr);
    return 0;
  }

  if (par_mode[handle] == RTDB_SEQ_MODE && parallel_mode == RTDB_PAR_MODE) {
    (void) fprintf(stderr, "rtdb_get: seq. open and par. get\n");
    (void) fflush(stderr);
    return 0;
  }

  if (parallel_mode == RTDB_SEQ_MODE || me == 0)
    status = rtdb_seq_get(handle, name, ma_type, nelem, array);

  /* Implicit assumption that all processes call this with nelem
     greater than or equal to value used on process 0 */

  if (parallel_mode == RTDB_PAR_MODE) {
    rtdb_broadcast(TYPE_RTDB_STATUS, MT_INT, 1, (void *) &status);
    
    if (status) {
      rtdb_broadcast(TYPE_RTDB_NELEM, MT_INT, 1, (void *) &nelem);
      rtdb_broadcast(TYPE_RTDB_ARRAY, ma_type, nelem, (void *) array);
    }
  }

  return status;
}

예제 #22

0

파일 보기

파일: spiral-mm.c 프로젝트: pghosh2/GA-Matrix-Multiplication-Benchmark

int main(int argc, char **argv) 
{
	int proc, nprocs;
	int M, N, K; /* */
	int blockX_len, blockY_len;
	int heap=3000000, stack=3000000;

	if (argc == 6) {
		M = atoi(argv[1]);
		N = atoi(argv[2]);
		K = atoi(argv[3]);
		blockX_len = atoi(argv[4]);
		blockY_len = atoi(argv[5]);
	}
	else {
		printf("Please enter ./a.out <M> <N> <K> <BLOCK-X-LEN> <BLOCK-Y-LEN>");
		exit(-1);
	}

	MPI_Init(&argc, &argv);

	GA_Initialize();

	if(! MA_init(C_DBL, stack, heap) ) GA_Error("MA_init failed",stack+heap);

	proc   = GA_Nodeid();
	nprocs = GA_Nnodes();

	if(proc == 0) {
		printf("Using %d processes\n", nprocs); 
		fflush(stdout);
	}

	matrix_multiply(M, N, K, blockX_len, blockY_len);

	if(proc == 0)
		printf("\nTerminating ..\n");

	GA_Terminate();

	MPI_Finalize();    

	return 0;
}

예제 #23

0

파일 보기

파일: rtdb.c 프로젝트: wadejong/NWChem-Json

int rtdb_next(const int handle, const int namelen, char *name)
{
  int status;
#ifdef GAGROUPS
  int me = GA_Nodeid();
#endif

  if (!verify_parallel_access()) return 0;

  if (handle < 0 || handle >= MAX_RTDB) {
    (void) fprintf(stderr, "rtdb_next: handle out of range %d\n", handle);
    (void) fflush(stderr);
    return 0;
  }
  if (par_mode[handle] == INACTIVE) {
    (void) fprintf(stderr, "rtdb_next: handle not active %d\n",handle);
    (void) fflush(stderr);
    return 0;
  }

  if (par_mode[handle] == RTDB_SEQ_MODE && parallel_mode == RTDB_PAR_MODE) {
    (void) fprintf(stderr, "rtdb_next: seq. open and par. next\n");
    (void) fflush(stderr);
    return 0;
  }

  if (parallel_mode == RTDB_SEQ_MODE || me == 0)
    status = rtdb_seq_next(handle, namelen, name);
  
  if (parallel_mode == RTDB_PAR_MODE) {
    rtdb_broadcast(TYPE_RTDB_STATUS, MT_INT, 1, (void *) &status);
    
    if (status) {
      int len;
      if (me == 0)len = strlen(name)+1;
      rtdb_broadcast(TYPE_RTDB_LEN, MT_INT, 1, (void *) &len);
      rtdb_broadcast(TYPE_RTDB_NAME, MT_CHAR, len, (void *) name);
    }
    
  }

  return status;
}

예제 #24

0

파일 보기

파일: rtdb.c 프로젝트: wadejong/NWChem-Json

int rtdb_get_info(const int handle,
		  const char *name, int *ma_type, int *nelem, char date[26])
{
  int status;
#ifdef GAGROUPS
  int me = GA_Nodeid();
#endif

  if (!verify_parallel_access()) return 0;

  if (handle < 0 || handle >= MAX_RTDB) {
    (void) fprintf(stderr, "rtdb_get_info: handle out of range %d\n", handle);
    (void) fflush(stderr);
    return 0;
  }
  if (par_mode[handle] == INACTIVE) {
    (void) fprintf(stderr, "rtdb_get_info: handle not active %d\n",handle);
    (void) fflush(stderr);
    return 0;
  }

  if (par_mode[handle] == RTDB_SEQ_MODE && parallel_mode == RTDB_PAR_MODE) {
    (void) fprintf(stderr, "rtdb_get_info: seq. open and par. get\n");
    (void) fflush(stderr);
    return 0;
  }

  if (parallel_mode == RTDB_SEQ_MODE || me == 0)
    status = rtdb_seq_get_info(handle, name, ma_type, nelem, date);

  if (parallel_mode == RTDB_PAR_MODE) {
    rtdb_broadcast(TYPE_RTDB_STATUS, MT_INT, 1, (void *) &status);
    
    if (status) {
      rtdb_broadcast(TYPE_RTDB_NELEM, MT_INT,  1,   (void *) nelem);
      rtdb_broadcast(TYPE_RTDB_TYPE,  MT_INT,  1,   (void *) ma_type);
      rtdb_broadcast(TYPE_RTDB_DATE,  MT_CHAR, 26,  (void *) date);
    }
  }

  return status;
}

예제 #25

0

파일 보기

파일: transp1D.c 프로젝트: MihaiLupoiu/TPP

int main(int argc, char **argv) {
    int heap=300000, stack=300000;
    int me, nprocs;
    
    /* Step1: Initialize Message Passing library */
#ifdef MPI
    MPI_Init(&argc, &argv);   /* initialize MPI */
#else
    PBEGIN_(argc, argv);      /* initialize TCGMSG */
#endif

    /* Step2: Initialize GA */
    /* ### intialize the GA library */
    GA_Initialize();

    
    /* Step3: Initialize Memory Allocator (MA) */
    if(! MA_init(C_DBL, stack, heap) ) GA_Error("MA_init failed",stack+heap);

    /* ### assign the local processor ID to the int variable "me"
     * ### and the total number of processors to the int variable
     * ### "nprocs" */
    me     = GA_Nodeid();
    nprocs = GA_Nnodes();
    if(me==0) {
       printf("\nUsing %d processes\n\n", nprocs); fflush(stdout);
    }
    
       
    TRANSPOSE1D();
    
    if(me==0)printf("\nTerminating ..\n");
    /* ### terminate the GA library */
    GA_Terminate();

#ifdef MPI
    MPI_Finalize();    
#else
    PEND_();
#endif
}

예제 #26

0

파일 보기

파일: goss_utils.cpp 프로젝트: GridOPTICS/GridPACK

/**
 * Shut down GOSS communication
 */
void gridpack::goss::GOSSUtils::terminateGOSS()
{
  if (GA_Nodeid()==0) {
    char topic[128];

#ifndef GOSS_DEBUG
    Connection *connection;
    Session *session;
    Destination *destination;
    MessageProducer *producer;


    std::auto_ptr<ActiveMQConnectionFactory>
      connectionFactory(new ActiveMQConnectionFactory(p_URI)) ;
    // Create a Connection
    connection = connectionFactory->createConnection(p_username, p_passwd);
    connection->start();

    // Create a Session
    session = connection->createSession(Session::AUTO_ACKNOWLEDGE);

    // Create the destination (Topic or Queue)
    sprintf(topic,"topic/goss/gridpack/close_goss");
    destination = session->createTopic(topic);

    // Create a MessageProducer from the Session to the Topic
    producer = session->createProducer(destination);
    producer->setDeliveryMode(DeliveryMode::NON_PERSISTENT);

    // Send final message indicating that channel is being close
    std::string buf = "Closing GOSS";
    std::auto_ptr<TextMessage>
      end_message(session->createTextMessage(buf));
    producer->send(end_message.get());
    if (connection) delete connection;
    if (session) delete session;
    if (destination) delete destination;
    if (producer) delete producer;
#endif
  }
}

예제 #27

0

파일 보기

파일: sched_groups.c 프로젝트: 000Justin000/nwchem

void plist_init(proc_list_t *plist) {
  int i, ctr;
  const int me = GA_Nodeid();
  const int nproc = GA_Nnodes();
  const int default_grp = ga_pgroup_get_default_();
  const char *pname = "plist_init";

/*   fprintf(stderr, "%d:: 1 %s\n", me,pname); */

  plist->buf = (proc_t *)malloc((nproc-1)*sizeof(proc_t));
  assert(plist->buf != NULL);
  for(i=0, ctr=0; i<nproc; i++) {
    if(i != SVR) {
      plist->buf[ctr].procid = GA_Pgroup_absolute_id(default_grp,i);
      plist->buf[ctr].next = &plist->buf[ctr+1];
      ctr+=1;
    }
  }
/*   fprintf(stderr, "%d:: 2 %s\n", me,pname); */
  plist->buf[nproc-2].next = NULL;
  plist->idle = &plist->buf[0];
  plist->nidle = nproc-1;
}

예제 #28

0

파일 보기

파일: ga_matrix.cpp 프로젝트: Anastien/GridPACK

// -------------------------------------------------------------
// MPIComm2GApgroup
// -------------------------------------------------------------
static
PetscErrorCode
MPIComm2GApgroup(MPI_Comm comm, int *pGrpHandle)
{
  PetscErrorCode ierr = 0;
  int nproc;
  int me, myGlobalRank;
  int *proclist;
  int p;

  ierr = MPI_Comm_size(comm, &nproc); CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm, &me); CHKERRQ(ierr);
  myGlobalRank = GA_Nodeid();
  ierr = PetscMalloc(nproc*sizeof(int), &proclist); CHKERRQ(ierr);
  for (p = 0; p < nproc; ++p) {
    proclist[p] = 0;
  }
  proclist[me] = myGlobalRank;
  ierr = MPI_Allreduce(MPI_IN_PLACE, &proclist[0], nproc, MPI_INT, MPI_SUM, comm); CHKERRQ(ierr);
  *pGrpHandle = GA_Pgroup_create(&proclist[0], nproc); 
  ierr = PetscFree(proclist); CHKERRQ(ierr);
  return ierr;
}

예제 #29

0

파일 보기

파일: gemmtest.c 프로젝트: jeffhammond/ga

int 
main(int argc, char **argv) {

Integer heap=9000000, stack=9000000;
int me, nproc;
DoublePrecision time;

    MP_INIT(argc,argv);

    GA_INIT(argc,argv);                           /* initialize GA */

    nproc = GA_Nnodes();
    me = GA_Nodeid();

    if(me==0) printf("Using %d processes\n\n",nproc);

    if (me==0) printf ("Matrix size is %d X %d\n",N,N);

#ifdef USE_REGULAR
    if (me == 0) printf("\nUsing regular data distribution\n\n");
#endif
#ifdef USE_SIMPLE_CYCLIC
    if (me == 0) printf("\nUsing simple block-cyclic data distribution\n\n");
#endif
#ifdef USE_SCALAPACK
    if (me == 0) printf("\nUsing ScaLAPACK data distribution\n\n");
#endif
#ifdef USE_TILED
    if (me == 0) printf("\nUsing tiled data distribution\n\n");
#endif

    if(!MA_init((Integer)MT_F_DBL, stack/nproc, heap/nproc))
       GA_Error("MA_init failed bytes= %d",stack+heap);   

#ifdef PERMUTE
      {
        int i, *list = (int*)malloc(nproc*sizeof(int));
        if(!list)GA_Error("malloc failed",nproc);

        for(i=0; i<nproc;i++)list[i]=nproc-1-i;

        GA_Register_proclist(list, nproc);
        free(list);
      }
#endif

    if(GA_Uses_fapi())GA_Error("Program runs with C API only",1);

    time = MP_TIMER();
    do_work();
    /*    printf("%d: Total Time = %lf\n", me, MP_TIMER()-time);
      printf("%d: GEMM Total Time = %lf\n", me, gTime);
    */

    if(me==0)printf("\nSuccess\n\n");
    GA_Terminate();

    MP_FINALIZE();

    return 0;
}

예제 #30

0

파일 보기

파일: gemmtest.c 프로젝트: jeffhammond/ga

void
test(int data_type) {
  int me=GA_Nodeid();
  int nproc = GA_Nnodes();
  int g_a, g_b, g_c;
  int ndim = 2;
  int dims[2]={N,N};
  int lo[2]={0,0};
  int hi[2]={N-1,N-1};
  int block_size[2]={NB,NB-1};
  int proc_grid[2];
  int i,j,l,k,m,n, ld;

  double alpha_dbl = 1.0, beta_dbl = 0.0;
  double dzero = 0.0;
  double ddiff;

  float alpha_flt = 1.0, beta_flt = 0.0;
  float fzero = 0.0;
  float fdiff;
  float ftmp;
  double dtmp;
  SingleComplex ctmp;
  DoubleComplex ztmp;

  DoubleComplex alpha_dcpl = {1.0, 0.0} , beta_dcpl = {0.0, 0.0}; 
  DoubleComplex zzero = {0.0,0.0};
  DoubleComplex zdiff;

  SingleComplex alpha_scpl = {1.0, 0.0} , beta_scpl = {0.0, 0.0}; 
  SingleComplex czero = {0.0,0.0};
  SingleComplex cdiff;

  void *alpha=NULL, *beta=NULL;
  void *abuf=NULL, *bbuf=NULL, *cbuf=NULL, *c_ptr=NULL;

  switch (data_type) {
  case C_FLOAT:
    alpha  = (void *)&alpha_flt;
    beta   = (void *)&beta_flt;
    abuf = (void*)malloc(N*N*sizeof(float));
    bbuf = (void*)malloc(N*N*sizeof(float));
    cbuf = (void*)malloc(N*N*sizeof(float));
    if(me==0) printf("Single Precision: Testing GA_Sgemm,NGA_Matmul_patch for %d-Dimension", ndim);
    break;      
  case C_DBL:
    alpha  = (void *)&alpha_dbl;
    beta   = (void *)&beta_dbl;
    abuf = (void*)malloc(N*N*sizeof(double));
    bbuf = (void*)malloc(N*N*sizeof(double));
    cbuf = (void*)malloc(N*N*sizeof(double));
    if(me==0) printf("Double Precision: Testing GA_Dgemm,NGA_Matmul_patch for %d-Dimension", ndim); 
    break;    
  case C_DCPL:
    alpha  = (void *)&alpha_dcpl;
    beta   = (void *)&beta_dcpl;
    abuf = (void*)malloc(N*N*sizeof(DoubleComplex));
    bbuf = (void*)malloc(N*N*sizeof(DoubleComplex));
    cbuf = (void*)malloc(N*N*sizeof(DoubleComplex));
    if(me==0) printf("Double Complex:   Testing GA_Zgemm,NGA_Matmul_patch for %d-Dimension", ndim);
    break;
  case C_SCPL:
    alpha  = (void *)&alpha_scpl;
    beta   = (void *)&beta_scpl;
    abuf = (void*)malloc(N*N*sizeof(SingleComplex));
    bbuf = (void*)malloc(N*N*sizeof(SingleComplex));
    cbuf = (void*)malloc(N*N*sizeof(SingleComplex));
    if(me==0) printf("Single Complex:   Testing GA_Cgemm,NGA_Matmul_patch for %d-Dimension", ndim);
    break;
  default:
    GA_Error("wrong data type", data_type);
  }

  if (me==0) printf("\nCreate A, B, C\n");
#ifdef USE_REGULAR
  g_a = NGA_Create(data_type, ndim, dims, "array A", NULL);
#endif
#ifdef USE_SIMPLE_CYCLIC
  g_a = NGA_Create_handle();
  NGA_Set_data(g_a,ndim,dims,data_type);
  NGA_Set_array_name(g_a,"array A");
  NGA_Set_block_cyclic(g_a,block_size);
  if (!GA_Allocate(g_a)) {
    GA_Error("Failed: create: g_a",40);
  }
#endif
#ifdef USE_SCALAPACK
  g_a = NGA_Create_handle();
  NGA_Set_data(g_a,ndim,dims,data_type);
  NGA_Set_array_name(g_a,"array A");
  grid_factor(nproc,&i,&j);
  proc_grid[0] = i;
  proc_grid[1] = j;
  NGA_Set_block_cyclic_proc_grid(g_a,block_size,proc_grid);
  if (!GA_Allocate(g_a)) {
    GA_Error("Failed: create: g_a",40);
  }
#endif
#ifdef USE_TILED
  g_a = NGA_Create_handle();
  NGA_Set_data(g_a,ndim,dims,data_type);
  NGA_Set_array_name(g_a,"array A");
  grid_factor(nproc,&i,&j);
  proc_grid[0] = i;
  proc_grid[1] = j;
  NGA_Set_tiled_proc_grid(g_a,block_size,proc_grid);
  if (!GA_Allocate(g_a)) {
    GA_Error("Failed: create: g_a",40);
  }
#endif
  g_b = GA_Duplicate(g_a, "array B");  
  g_c = GA_Duplicate(g_a, "array C");
  if(!g_a || !g_b || !g_c) GA_Error("Create failed: a, b or c",1);

  ld = N;
  if (me==0) printf("\nInitialize A\n");
  /* Set up matrix A */
  if (me == 0) {
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)abuf)[i*N+j] = (float)(i*N+j);
            break;
          case C_DBL:
            ((double*)abuf)[i*N+j] = (double)(i*N+j);
            break;
          case C_DCPL:
            ((DoubleComplex*)abuf)[i*N+j].real = (double)(i*N+j);
            ((DoubleComplex*)abuf)[i*N+j].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)abuf)[i*N+j].real = (float)(i*N+j);
            ((SingleComplex*)abuf)[i*N+j].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_a,lo,hi,abuf,&ld);
  }
  GA_Sync();

  if (me==0) printf("\nInitialize B\n");
  /* Set up matrix B */
  if (me == 0) {
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)bbuf)[i*N+j] = (float)(j*N+i);
            break;
          case C_DBL:
            ((double*)bbuf)[i*N+j] = (double)(j*N+i);
            break;
          case C_DCPL:
            ((DoubleComplex*)bbuf)[i*N+j].real = (double)(j*N+i);
            ((DoubleComplex*)bbuf)[i*N+j].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)bbuf)[i*N+j].real = (float)(j*N+i);
            ((SingleComplex*)bbuf)[i*N+j].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_b,lo,hi,bbuf,&ld);
  }
  GA_Sync();

  if (me==0) printf("\nPerform matrix multiply\n");
  switch (data_type) {
    case C_FLOAT:
      NGA_Matmul_patch('N','N',&alpha_flt,&beta_flt,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DBL:
      NGA_Matmul_patch('N','N',&alpha_dbl,&beta_dbl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_SCPL:
      NGA_Matmul_patch('N','N',&alpha_scpl,&beta_scpl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DCPL:
      NGA_Matmul_patch('N','N',&alpha_dcpl,&beta_dcpl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  GA_Sync();
#if 0
  if (me==0) printf("\nCheck answer\n");
  /*
  GA_Print(g_a);
  if (me == 0) printf("\n\n\n\n");
  GA_Print(g_b);
  if (me == 0) printf("\n\n\n\n");
  GA_Print(g_c); 
  */

  /* Check answer */
  NGA_Get(g_a,lo,hi,abuf,&ld);
  NGA_Get(g_b,lo,hi,bbuf,&ld);
  for (i=0; i<N; i++) {
    for (j=0; j<N; j++) {
      switch (data_type) {
        case C_FLOAT:
          ((float*)cbuf)[i*N+j] = fzero;
          break;
        case C_DBL:
          ((double*)cbuf)[i*N+j] = dzero;
          break;
        case C_DCPL:
          ((DoubleComplex*)cbuf)[i*N+j] = zzero;
          break;
        case C_SCPL:
          ((SingleComplex*)cbuf)[i*N+j] = czero;
          break;
        default:
          GA_Error("wrong data type", data_type);
      }
      for (k=0; k<N; k++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)cbuf)[i*N+j] += ((float*)abuf)[i*N+k]
              *((float*)bbuf)[k*N+j];
            break;
          case C_DBL:
            ((double*)cbuf)[i*N+j] += ((double*)abuf)[i*N+k]
              *((double*)bbuf)[k*N+j];
            break;
          case C_DCPL:
            ((DoubleComplex*)cbuf)[i*N+j].real +=
              (((DoubleComplex*)abuf)[i*N+k].real
               *((DoubleComplex*)bbuf)[k*N+j].real
               -(((DoubleComplex*)abuf)[i*N+k].imag
                 *((DoubleComplex*)bbuf)[k*N+j].imag));
            ((DoubleComplex*)cbuf)[i*N+j].imag +=
              (((DoubleComplex*)abuf)[i*N+k].real
               *((DoubleComplex*)bbuf)[k*N+j].imag
               +(((DoubleComplex*)abuf)[i*N+k].imag
                 *((DoubleComplex*)bbuf)[k*N+j].real));
            break;
          case C_SCPL:
            ((SingleComplex*)cbuf)[i*N+j].real +=
              (((SingleComplex*)abuf)[i*N+k].real
               *((SingleComplex*)bbuf)[k*N+j].real
               -(((SingleComplex*)abuf)[i*N+k].imag
                 *((SingleComplex*)bbuf)[k*N+j].imag));
            ((SingleComplex*)cbuf)[i*N+j].imag +=
              (((SingleComplex*)abuf)[i*N+k].real
               *((SingleComplex*)bbuf)[k*N+j].imag
               +(((SingleComplex*)abuf)[i*N+k].imag
                 *((SingleComplex*)bbuf)[k*N+j].real));
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
  }
  GA_Sync();
  if (me == 0) {
    NGA_Get(g_c,lo,hi,abuf,&ld);
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        switch (data_type) {
          case C_FLOAT:
            fdiff = ((float*)abuf)[i*N+j]-((float*)cbuf)[i*N+j];
            if (((float*)abuf)[i*N+j] != 0.0) {
              fdiff /= ((float*)abuf)[i*N+j];
            }
            if (fabs(fdiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: %f Expected: %f\n",me,i,j,
                  ((float*)abuf)[i*N+j],((float*)cbuf)[i*N+j]);
            }
            break;
          case C_DBL:
            ddiff = ((double*)abuf)[i*N+j]-((double*)cbuf)[i*N+j];
            if (((double*)abuf)[i*N+j] != 0.0) {
              ddiff /= ((double*)abuf)[i*N+j];
            }
            if (fabs(ddiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: %f Expected: %f\n",me,i,j,
                  ((double*)abuf)[i*N+j],((double*)cbuf)[i*N+j]);
            }
            break;
          case C_DCPL:
            zdiff.real = ((DoubleComplex*)abuf)[i*N+j].real
              -((DoubleComplex*)cbuf)[i*N+j].real;
            zdiff.imag = ((DoubleComplex*)abuf)[i*N+j].imag
              -((DoubleComplex*)cbuf)[i*N+j].imag;
            if (((DoubleComplex*)abuf)[i*N+j].real != 0.0 ||
                ((DoubleComplex*)abuf)[i*N+j].imag != 0.0) {
              ztmp = ((DoubleComplex*)abuf)[i*N+j];
              ddiff = sqrt((zdiff.real*zdiff.real+zdiff.imag*zdiff.imag)
                  /(ztmp.real*ztmp.real+ztmp.imag*ztmp.imag));
            } else {
              ddiff = sqrt(zdiff.real*zdiff.real+zdiff.imag*zdiff.imag);
            }
            if (fabs(ddiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: (%f,%f) Expected: (%f,%f)\n",me,i,j,
                  ((DoubleComplex*)abuf)[i*N+j].real,
                  ((DoubleComplex*)abuf)[i*N+j].imag,
                  ((DoubleComplex*)cbuf)[i*N+j].real,
                  ((DoubleComplex*)cbuf)[i*N+j].imag);
            }
            break;
          case C_SCPL:
            cdiff.real = ((SingleComplex*)abuf)[i*N+j].real
              -((SingleComplex*)cbuf)[i*N+j].real;
            cdiff.imag = ((SingleComplex*)abuf)[i*N+j].imag
              -((SingleComplex*)cbuf)[i*N+j].imag;
            if (((SingleComplex*)abuf)[i*N+j].real != 0.0 ||
                ((SingleComplex*)abuf)[i*N+j].imag != 0.0) {
              ctmp = ((SingleComplex*)abuf)[i*N+j];
              fdiff = sqrt((cdiff.real*cdiff.real+cdiff.imag*cdiff.imag)
                  /(ctmp.real*ctmp.real+ctmp.imag*ctmp.imag));
            } else {
              fdiff = sqrt(cdiff.real*cdiff.real+cdiff.imag*cdiff.imag);
            }
            if (fabs(fdiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: (%f,%f) Expected: (%f,%f)\n",me,i,j,
                  ((SingleComplex*)abuf)[i*N+j].real,
                  ((SingleComplex*)abuf)[i*N+j].imag,
                  ((SingleComplex*)cbuf)[i*N+j].real,
                  ((SingleComplex*)cbuf)[i*N+j].imag);
            }
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
  }
  GA_Sync();

  /* copy cbuf back to g_a */
  if (me == 0) {
    NGA_Put(g_a,lo,hi,cbuf,&ld);
  }
  GA_Sync();

  /* Get norm of g_a */
  switch (data_type) {
    case C_FLOAT:
      ftmp = GA_Fdot(g_a,g_a);
      break;
    case C_DBL:
      dtmp = GA_Ddot(g_a,g_a);
      break;
    case C_DCPL:
      ztmp = GA_Zdot(g_a,g_a);
      break;
    case C_SCPL:
      ctmp = GA_Cdot(g_a,g_a);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  /* subtract C from A and put the results in B */
  beta_flt = -1.0;
  beta_dbl = -1.0;
  beta_scpl.real = -1.0;
  beta_dcpl.real = -1.0;
  GA_Zero(g_b);
  GA_Add(alpha,g_a,beta,g_c,g_b);
  /* evaluate the norm of the difference between the two matrices */
  switch (data_type) {
    case C_FLOAT:
      fdiff = GA_Fdot(g_b, g_b);
      if (ftmp != 0.0) {
        fdiff /= ftmp;
      }
      if(fabs(fdiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(fdiff), TOLERANCE);
        GA_Error("GA_Sgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Sgemm OK\n\n");
      }
      break;
    case C_DBL:
      ddiff = GA_Ddot(g_b, g_b);
      if (dtmp != 0.0) {
        ddiff /= dtmp;
      }
      if(fabs(ddiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(ddiff), TOLERANCE);
        GA_Error("GA_Dgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Dgemm OK\n\n");
      }
      break;
    case C_DCPL:
      zdiff = GA_Zdot(g_b, g_b);
      if (ztmp.real != 0.0 || ztmp.imag != 0.0) {
        ddiff = sqrt((zdiff.real*zdiff.real+zdiff.imag*zdiff.imag)
            /(ztmp.real*ztmp.real+ztmp.imag*ztmp.imag));
      } else {
        ddiff = sqrt(zdiff.real*zdiff.real+zdiff.imag*zdiff.imag);
      }
      if(fabs(ddiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(zdiff.real), TOLERANCE);
        GA_Error("GA_Zgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Zgemm OK\n\n");
      }
      break;
    case C_SCPL:
      cdiff = GA_Cdot(g_b, g_b);
      if (ctmp.real != 0.0 || ctmp.imag != 0.0) {
        fdiff = sqrt((cdiff.real*cdiff.real+cdiff.imag*cdiff.imag)
            /(ctmp.real*ctmp.real+ctmp.imag*ctmp.imag));
      } else {
        fdiff = sqrt(cdiff.real*cdiff.real+cdiff.imag*cdiff.imag);
      }
      if(fabs(fdiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(cdiff.real), TOLERANCE);
        GA_Error("GA_Cgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Cgemm OK\n\n");
      }
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
#endif

  free(abuf);
  free(bbuf);
  free(cbuf);

  switch (data_type) {
  case C_FLOAT:
    abuf = (void*)malloc(N*N*sizeof(float)/4);
    bbuf = (void*)malloc(N*N*sizeof(float)/4);
    cbuf = (void*)malloc(N*N*sizeof(float)/4);
    break;      
  case C_DBL:
    abuf = (void*)malloc(N*N*sizeof(double)/4);
    bbuf = (void*)malloc(N*N*sizeof(double)/4);
    cbuf = (void*)malloc(N*N*sizeof(double)/4);
    break;    
  case C_DCPL:
    abuf = (void*)malloc(N*N*sizeof(DoubleComplex)/4);
    bbuf = (void*)malloc(N*N*sizeof(DoubleComplex)/4);
    cbuf = (void*)malloc(N*N*sizeof(DoubleComplex)/4);
    break;
  case C_SCPL:
    abuf = (void*)malloc(N*N*sizeof(SingleComplex)/4);
    bbuf = (void*)malloc(N*N*sizeof(SingleComplex)/4);
    cbuf = (void*)malloc(N*N*sizeof(SingleComplex)/4);
    break;
  default:
    GA_Error("wrong data type", data_type);
  }

  /* Test multiply on a fraction of matrix. Start by reinitializing
   * A and B */
  GA_Zero(g_a);
  GA_Zero(g_b);
  GA_Zero(g_c);

  if (me==0) printf("\nTest patch multiply\n");

  lo[0] = N/4;
  lo[1] = N/4;
  hi[0] = 3*N/4-1;
  hi[1] = 3*N/4-1;
  ld = N/2;

  /* Set up matrix A */
  if (me==0) printf("\nInitialize A\n");
  if (me == 0) {
    for (i=N/4; i<3*N/4; i++) {
      for (j=N/4; j<3*N/4; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)abuf)[(i-N/4)*N/2+(j-N/4)] = (float)(i*N+j);
            break;
          case C_DBL:
            ((double*)abuf)[(i-N/4)*N/2+(j-N/4)] = (double)(i*N+j);
            break;
          case C_DCPL:
            ((DoubleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].real = (double)(i*N+j);
            ((DoubleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].real = (float)(i*N+j);
            ((SingleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_a,lo,hi,abuf,&ld);
  }
  GA_Sync();

  if (me==0) printf("\nInitialize B\n");
  /* Set up matrix B */
  if (me == 0) {
    for (i=N/4; i<3*N/4; i++) {
      for (j=N/4; j<3*N/4; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)bbuf)[(i-N/4)*N/2+(j-N/4)] = (float)(j*N+i);
            break;
          case C_DBL:
            ((double*)bbuf)[(i-N/4)*N/2+(j-N/4)] = (double)(j*N+i);
            break;
          case C_DCPL:
            ((DoubleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].real = (double)(j*N+i);
            ((DoubleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].real = (float)(j*N+i);
            ((SingleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_b,lo,hi,bbuf,&ld);
  }
  GA_Sync();

  beta_flt = 0.0;
  beta_dbl = 0.0;
  beta_scpl.real = 0.0;
  beta_dcpl.real = 0.0;
  if (me==0) printf("\nPerform matrix multiply on sub-blocks\n");
  switch (data_type) {
    case C_FLOAT:
      NGA_Matmul_patch('N','N',&alpha_flt,&beta_flt,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DBL:
      NGA_Matmul_patch('N','N',&alpha_dbl,&beta_dbl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_SCPL:
      NGA_Matmul_patch('N','N',&alpha_scpl,&beta_scpl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DCPL:
      NGA_Matmul_patch('N','N',&alpha_dcpl,&beta_dcpl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  GA_Sync();
#if 0
  if (0) {
  /*
  if (data_type != C_SCPL && data_type != C_DCPL) {
  */

  if (me==0) printf("\nCheck answer\n");

  /* Multiply buffers by hand */
  if (me == 0) {
    for (i=0; i<N/2; i++) {
      for (j=0; j<N/2; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)cbuf)[i*N/2+j] = fzero;
            break;
          case C_DBL:
            ((double*)cbuf)[i*N/2+j] = dzero;
            break;
          case C_DCPL:
            ((DoubleComplex*)cbuf)[i*N/2+j] = zzero;
            break;
          case C_SCPL:
            ((SingleComplex*)cbuf)[i*N/2+j] = czero;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
        for (k=0; k<N/2; k++) {
          switch (data_type) {
            case C_FLOAT:
              ((float*)cbuf)[i*N/2+j] += ((float*)abuf)[i*N/2+k]
                *((float*)bbuf)[k*N/2+j];
              break;
            case C_DBL:
              ((double*)cbuf)[i*N/2+j] += ((double*)abuf)[i*N/2+k]
                *((double*)bbuf)[k*N/2+j];
              break;
            case C_DCPL:
              ((DoubleComplex*)cbuf)[i*N/2+j].real +=
                (((DoubleComplex*)abuf)[i*N/2+k].real
                 *((DoubleComplex*)bbuf)[k*N/2+j].real
                 -(((DoubleComplex*)abuf)[i*N/2+k].imag
                   *((DoubleComplex*)bbuf)[k*N/2+j].imag));
              ((DoubleComplex*)cbuf)[i*N/2+j].imag +=
                (((DoubleComplex*)abuf)[i*N/2+k].real
                 *((DoubleComplex*)bbuf)[k*N/2+j].imag
                 +(((DoubleComplex*)abuf)[i*N/2+k].imag
                   *((DoubleComplex*)bbuf)[k*N/2+j].real));
              break;
            case C_SCPL:
              ((SingleComplex*)cbuf)[i*N/2+j].real +=
                (((SingleComplex*)abuf)[i*N/2+k].real
                 *((SingleComplex*)bbuf)[k*N/2+j].real
                 -(((SingleComplex*)abuf)[i*N/2+k].imag
                   *((SingleComplex*)bbuf)[k*N/2+j].imag));
              ((SingleComplex*)cbuf)[i*N/2+j].imag +=
                (((SingleComplex*)abuf)[i*N/2+k].real
                 *((SingleComplex*)bbuf)[k*N/2+j].imag
                 +(((SingleComplex*)abuf)[i*N/2+k].imag
                   *((SingleComplex*)bbuf)[k*N/2+j].real));
              break;
            default:
              GA_Error("wrong data type", data_type);
          }
        }
      }
    }
    NGA_Put(g_a,lo,hi,cbuf,&ld);
  }
  if (me == 0) printf("\n\n\n\n");

  /* Get norm of g_a */
  switch (data_type) {
    case C_FLOAT:
      ftmp = NGA_Fdot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
      break;
    case C_DBL:
      dtmp = NGA_Ddot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
      break;
    case C_DCPL:
      ztmp = NGA_Zdot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
      break;
    case C_SCPL:
      ctmp = NGA_Cdot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  /* subtract C from A and put the results in B */
  beta_flt = -1.0;
  beta_dbl = -1.0;
  beta_scpl.real = -1.0;
  beta_dcpl.real = -1.0;
  NGA_Zero_patch(g_b,lo,hi);
  NGA_Add_patch(alpha,g_a,lo,hi,beta,g_c,lo,hi,g_b,lo,hi);
  /* evaluate the norm of the difference between the two matrices */
  switch (data_type) {
    case C_FLOAT:
      fdiff = NGA_Fdot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
      if (ftmp != 0.0) {
        fdiff /= ftmp;
      }
      if(fabs(fdiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(fdiff), TOLERANCE);
        GA_Error("GA_Sgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Sgemm OK\n\n");
      }
      break;
    case C_DBL:
      ddiff = NGA_Ddot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
      if (dtmp != 0.0) {
        ddiff /= dtmp;
      }
      if(fabs(ddiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(ddiff), TOLERANCE);
        GA_Error("GA_Dgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Dgemm OK\n\n");
      }
      break;
    case C_DCPL:
      zdiff = NGA_Zdot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
      if (ztmp.real != 0.0 || ztmp.imag != 0.0) {
        ddiff = sqrt((zdiff.real*zdiff.real+zdiff.imag*zdiff.imag)
            /(ztmp.real*ztmp.real+ztmp.imag*ztmp.imag));
      } else {
        ddiff = sqrt(zdiff.real*zdiff.real+zdiff.imag*zdiff.imag);
      }
      if(fabs(ddiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(zdiff.real), TOLERANCE);
        GA_Error("GA_Zgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Zgemm OK\n\n");
      }
      break;
    case C_SCPL:
      cdiff = NGA_Cdot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
      if (ctmp.real != 0.0 || ctmp.imag != 0.0) {
        fdiff = sqrt((cdiff.real*cdiff.real+cdiff.imag*cdiff.imag)
            /(ctmp.real*ctmp.real+ctmp.imag*ctmp.imag));
      } else {
        fdiff = sqrt(cdiff.real*cdiff.real+cdiff.imag*cdiff.imag);
      }
      if(fabs(fdiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(cdiff.real), TOLERANCE);
        GA_Error("GA_Cgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Cgemm OK\n\n");
      }
      break;
    default:
      GA_Error("wrong data type", data_type);
  }

  }
#endif
  free(abuf);
  free(bbuf);
  free(cbuf);

  GA_Destroy(g_a);
  GA_Destroy(g_b);
  GA_Destroy(g_c);
}