Exemplo n.º 1
0
int main(int argc, char **argv)
{
    int size_dst = 15;
    int g_a = 0;
    int I_NEG_ONE = -1;
    long L_NEG_ONE = -1;
    long long LL_NEG_ONE = -1;
    int FIVE = 5;
    int TEN = 10;
    int lo;
    int hi;
    int *ptr;
    int i;

    MP_INIT(argc,argv);

    GA_INIT(argc,argv);

    for (i=0; i<3; ++i) {
        if (0 == i) {
            g_a = NGA_Create(C_INT, 1, &size_dst, "dst", NULL);
            GA_Fill(g_a, &I_NEG_ONE);
        } else if (1 == i) {
            g_a = NGA_Create(C_LONG, 1, &size_dst, "dst", NULL);
            GA_Fill(g_a, &L_NEG_ONE);
        } else if (2 == i) {
            g_a = NGA_Create(C_LONGLONG, 1, &size_dst, "dst", NULL);
            GA_Fill(g_a, &LL_NEG_ONE);
        }
        GA_Sync();
        GA_Print(g_a);
        NGA_Print_patch(g_a, &FIVE, &TEN, 0);
        NGA_Print_patch(g_a, &FIVE, &TEN, 1);
        NGA_Distribution(g_a, GA_Nodeid(), &lo, &hi);
        NGA_Access(g_a, &lo, &hi, &ptr, NULL);
        printf("[%d] (%d)=%d\n", GA_Nodeid(), lo, *ptr);
        NGA_Release(g_a, &lo, &hi);
    }

    GA_Terminate();
    MP_FINALIZE();
    exit(EXIT_SUCCESS);
}
Exemplo n.º 2
0
int 
main(int argc, char **argv) {

Integer heap=9000000, stack=9000000;
int me, nproc;
DoublePrecision time;

    MP_INIT(argc,argv);

    GA_INIT(argc,argv);                           /* initialize GA */

    nproc = GA_Nnodes();
    me = GA_Nodeid();

    if(me==0) printf("Using %d processes\n\n",nproc);

    if (me==0) printf ("Matrix size is %d X %d\n",N,N);

#ifdef USE_REGULAR
    if (me == 0) printf("\nUsing regular data distribution\n\n");
#endif
#ifdef USE_SIMPLE_CYCLIC
    if (me == 0) printf("\nUsing simple block-cyclic data distribution\n\n");
#endif
#ifdef USE_SCALAPACK
    if (me == 0) printf("\nUsing ScaLAPACK data distribution\n\n");
#endif
#ifdef USE_TILED
    if (me == 0) printf("\nUsing tiled data distribution\n\n");
#endif

    if(!MA_init((Integer)MT_F_DBL, stack/nproc, heap/nproc))
       GA_Error("MA_init failed bytes= %d",stack+heap);   

#ifdef PERMUTE
      {
        int i, *list = (int*)malloc(nproc*sizeof(int));
        if(!list)GA_Error("malloc failed",nproc);

        for(i=0; i<nproc;i++)list[i]=nproc-1-i;

        GA_Register_proclist(list, nproc);
        free(list);
      }
#endif

    if(GA_Uses_fapi())GA_Error("Program runs with C API only",1);

    time = MP_TIMER();
    do_work();
    /*    printf("%d: Total Time = %lf\n", me, MP_TIMER()-time);
      printf("%d: GEMM Total Time = %lf\n", me, gTime);
    */

    if(me==0)printf("\nSuccess\n\n");
    GA_Terminate();

    MP_FINALIZE();

    return 0;
}
Exemplo n.º 3
0
int main( int argc, char **argv ) {
  int g_a, g_b, i, j, size, size_me;
  int icnt, idx, jdx, ld;
  int n=N, type=MT_C_INT, one;
  int *values, *ptr;
  int **indices;
  int dims[2]={N,N};
  int lo[2], hi[2];

  int heap=3000000, stack=2000000;
  int me, nproc;

  int datatype, elements;
  double *prealloc_mem;
  MP_INIT(argc,argv);

#if 1
  GA_INIT(argc,argv);                            /* initialize GA */
  me=GA_Nodeid(); 
  nproc=GA_Nnodes();
  if(me==0) {
    if(GA_Uses_fapi())GA_Error("Program runs with C array API only",1);
    printf("\nUsing %ld processes\n",(long)nproc);
    fflush(stdout);
  }

  heap /= nproc;
  stack /= nproc;
  if(! MA_init(MT_F_DBL, stack, heap)) 
    GA_Error("MA_init failed",stack+heap);  /* initialize memory allocator*/ 

  /* Create a regular matrix. */
  if(me==0)printf("\nCreating matrix A of size %d x %d\n",N,N);
  g_a = NGA_Create(type, 2, dims, "A", NULL);
  if(!g_a) GA_Error("create failed: A",n); 

  /* Fill matrix using scatter routines */
  size = N*N;
  if (size%nproc == 0) {
    size_me = size/nproc;
  } else {
    i = size - size%nproc;
    size_me = i/nproc;
    if (me < size%nproc) size_me++;
  }

  /* Check that sizes are all okay */
  i = size_me;
  GA_Igop(&i,1,"+");
  if (i != size) {
    GA_Error("Sizes don't add up correctly: ",i);
  } else if (me==0) {
    printf("\nSizes add up correctly\n");
  }

  /* Allocate index and value arrays */
  indices = (int**)malloc(size_me*sizeof(int*));
  values = (int*)malloc(size_me*sizeof(int));
  icnt = me;
  for (i=0; i<size_me; i++) {
    values[i] = icnt;
    idx = icnt%N; 
    jdx = (icnt-idx)/N;
    if (idx >= N || idx < 0) {
      printf("p[%d] Bogus index i: %d\n",me,idx);
    }
    if (jdx >= N || jdx < 0) {
      printf("p[%d] Bogus index j: %d\n",me,jdx);
    }
    indices[i] = (int*)malloc(2*sizeof(int));
    (indices[i])[0] = idx;
    (indices[i])[1] = jdx;
    icnt += nproc;
  }

  /* Scatter values into g_a */
  NGA_Scatter(g_a, values, indices, size_me);
  GA_Sync();

  /* Check to see if contents of g_a are correct */
  NGA_Distribution( g_a, me, lo, hi );
  NGA_Access(g_a, lo, hi, &ptr, &ld);
  for (i=lo[0]; i<hi[0]; i++) {
    idx = i-lo[0];
    for (j=lo[1]; j<hi[1]; j++) {
      jdx = j-lo[1];
      if (ptr[idx*ld+jdx] != j*N+i) {
        printf("p[%d] (Scatter) expected: %d actual: %d\n",me,j*N+i,ptr[idx*ld+jdx]);
      }
    }
  }
  if (me==0) printf("\nCompleted test of NGA_Scatter\n");

  for (i=0; i<size_me; i++) {
    values[i] = 0;
  }
  GA_Sync();
  NGA_Gather(g_a, values, indices, size_me);
  icnt = me;
  for (i=0; i<size_me; i++) {
    if (icnt != values[i]) {
      printf("p[%d] (Gather) expected: %d actual: %d\n",me,icnt,values[i]);
    }
    icnt += nproc;
  }
  if (me==0) printf("\nCompleted test of NGA_Gather\n");
  GA_Sync();

  /* Scatter-accumulate values back into GA*/
  one = 1;
  NGA_Scatter_acc(g_a, values, indices, size_me, &one);
  GA_Sync();

  /* Check to see if contents of g_a are correct */
  for (i=lo[0]; i<hi[0]; i++) {
    idx = i-lo[0];
    for (j=lo[1]; j<hi[1]; j++) {
      jdx = j-lo[1];
      if (ptr[idx*ld+jdx] != 2*(j*N+i)) {
        printf("p[%d] (Scatter_acc) expected: %d actual: %d\n",me,2*(j*N+i),ptr[idx*ld+jdx]);
      }
    }
  }
  if (me==0) printf("\nCompleted test of NGA_Scatter_acc\n");
  NGA_Release(g_a, lo, hi);

  /* Test fixed buffer size */
  NGA_Alloc_gatscat_buf(size_me);

  /* Scatter-accumulate values back into GA*/
  GA_Sync();
  NGA_Scatter_acc(g_a, values, indices, size_me, &one);
  GA_Sync();

  /* Check to see if contents of g_a are correct */
  for (i=lo[0]; i<hi[0]; i++) {
    idx = i-lo[0];
    for (j=lo[1]; j<hi[1]; j++) {
      jdx = j-lo[1];
      if (ptr[idx*ld+jdx] != 3*(j*N+i)) {
        printf("p[%d] (Scatter_acc) expected: %d actual: %d\n",me,3*(j*N+i),ptr[idx*ld+jdx]);
      }
    }
  }
  if (me==0) printf("\nCompleted test of NGA_Scatter_acc using fixed buffers\n");
  NGA_Release(g_a, lo, hi);
  NGA_Free_gatscat_buf();

  GA_Destroy(g_a);
  if(me==0)printf("\nSuccess\n");
  GA_Terminate();
#endif

  MP_FINALIZE();

 return 0;
}
Exemplo n.º 4
0
int main(int argc, char **argv)
{
    int me;
    int nproc;
    int status;
    int g_a;
    int dims[NDIM];
    int chunk[NDIM];
    int pg_world;
    size_t num = 10;
    double *p1 = NULL;
    double *p2 = NULL;
    size_t i;
    int num_mutex;
    int lo[1];
    int hi[1];
    int ld[1]={1};
    MPI_Comm comm;

    MP_INIT(argc,argv);
    GA_INIT(argc,argv);

    me = GA_Nodeid();
    nproc = GA_Nnodes();
    comm = GA_MPI_Comm_pgroup_default();

    printf("%d: Hello world!\n",me);

    if (me==0) printf("%d: GA_Initialize\n",me);
    /*if (me==0) printf("%d: ARMCI_Init\n",me);*/
    /*ARMCI_Init();*/
    /*if (me==0) printf("%d: MA_Init\n",me);*/
    /*MA_init(MT_DBL, 8*1024*1024, 2*1024*1024);*/

    if (me==0) printf("%d: GA_Create_handle\n",me);
    g_a = GA_Create_handle();

    if (me==0) printf("%d: GA_Set_array_name\n",me);
    GA_Set_array_name(g_a,"test array A");

    dims[0] = 30;
    if (me==0) printf("%d: GA_Set_data\n",me);
    GA_Set_data(g_a,NDIM,dims,MT_DBL);

    chunk[0] = -1;
    if (me==0) printf("%d: GA_Set_chunk\n",me);
    GA_Set_chunk(g_a,chunk);

    if (me==0) printf("%d: GA_Pgroup_get_world\n",me);
    pg_world = GA_Pgroup_get_world();
    if (me==0) printf("%d: GA_Set_pgroup\n",me);
    GA_Set_pgroup(g_a,pg_world);

    if (me==0) printf("%d: GA_Allocate\n",me);
    status = GA_Allocate(g_a);
    if(0 == status) MPI_Abort(comm,100);

    if (me==0) printf("%d: GA_Zero\n",me);
    GA_Zero(g_a);

    if (me==0) printf("%d: GA_Sync\n",me);
    GA_Sync();

    num = 10;
    p1 = malloc(num*sizeof(double));
    /*double* p1 = ARMCI_Malloc_local(num*sizeof(double));*/
    if (p1==NULL) MPI_Abort(comm,1000);
    p2 = malloc(num*sizeof(double));
    /*double* p2 = ARMCI_Malloc_local(num*sizeof(double));*/
    if (p2==NULL) MPI_Abort(comm,2000);

    for ( i=0 ; i<num ; i++ ) p1[i] = 7.0;
    for ( i=0 ; i<num ; i++ ) p2[i] = 3.0;

    num_mutex = 17;
    status = GA_Create_mutexes(num_mutex);
    if (me==0) printf("%d: GA_Create_mutexes = %d\n",me,status);

/***************************************************************/
    if (me==0) {
        printf("%d: before GA_Lock\n",me);
        GA_Lock(0);
        lo[0] = 0;
        hi[0] = num-1;
        GA_Init_fence();
        NGA_Put(g_a,lo,hi,p1,ld);
        GA_Fence();
        GA_Unlock(0);
        printf("%d: after GA_Unlock\n",me);
    } 
    GA_Print(g_a);
    if (me==1) {
        printf("%d: before GA_Lock\n",me);
        GA_Lock(0);
        lo[0] = 0;
        hi[0] = num-1;
        GA_Init_fence();
        NGA_Get(g_a,lo,hi,p2,ld);
        GA_Fence();
        GA_Unlock(0);
        printf("%d: after GA_Unlock\n",me);
        for ( i=0 ; i<num ; i++ ) printf("p2[%2lu] = %20.10f\n",
                (long unsigned)i,p2[i]);
    }
/***************************************************************/



    status = GA_Destroy_mutexes();
    if (me==0) printf("%d: GA_Destroy_mutexes = %d\n",me,status);

    /*ARMCI_Free(p2);*/
    /*ARMCI_Free(p1);*/
    free(p2);
    free(p1);

    if (me==0) printf("%d: GA_Destroy\n",me);
    GA_Destroy(g_a);

    /*if (me==0) printf("%d: ARMCI_Finalize\n",me);*/
    /*ARMCI_Finalize();*/
    if (me==0) printf("%d: GA_Terminate\n",me);
    GA_Terminate();
    if (me==0) printf("%d: MPI_Finalize\n",me);
    MPI_Finalize();

    return(0);
}
Exemplo n.º 5
0
/*
 * test ga_dgemm
 * Note: - change nummax for large arrays
 *       - turn off "dgemm_verify" for large arrays due to memory 
 *         limitations, as dgemm_verify=1 for large arrays produces 
 *         segfault, dumps core,or any crap.
 */
int main(int argc, char **argv)
{
    int num_m;
    int num_n;
    int num_k;
    int i;
    int ii;
    double *h0;
    int g_c;
    int g_b;
    int g_a;
    double a;
    double t1;
    double mf;
    double avg_t[ntrans];
    double avg_mf[ntrans];
    int itime;
    int ntimes;
    int nums_m[/*howmany*/] = {512,1024};
    int nums_n[/*howmany*/] = {512,1024};
    int nums_k[/*howmany*/] = {512,1024};
    char transa[/*ntrans*/] = "ntnt";
    char transb[/*ntrans*/] = "nntt";
    char ta;
    char tb;
    double *tmpa;
    double *tmpb;
    double *tmpc;
    int ndim;
    int dims[2];
#ifdef BLOCK_CYCLIC
    int block_size[2];
#endif

#if defined(USE_ELEMENTAL)
    // initialize Elemental (which will initialize MPI)
    ElInitialize( &argc, &argv );
    ElMPICommRank( MPI_COMM_WORLD, &me );
    ElMPICommSize( MPI_COMM_WORLD, &nproc );
    // instantiate el::global array
    ElGlobalArraysConstruct_d( &eldga );
    // initialize global arrays
    ElGlobalArraysInitialize_d( eldga );
#else
    MP_INIT(argc,argv);
    if (!MA_init(MT_DBL,1,20000000)) {
        GA_Error("failed: ma_init(MT_DBL,1,20000000)",10);
    }
    GA_INIT(argc,argv);
    me = GA_Nodeid();
#endif

    h0 = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpa = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpb = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpc = (double*)malloc(sizeof(double) * nummax*nummax);

    ii = 0;
    for (i=0; i<nummax*nummax; i++) {
        ii = ii + 1;
        if (ii > nummax) {
            ii = 0;
        }
        h0[i] = ii;
    }

    /* Compute times assuming 500 mflops and 5 second target time */
    /* ntimes = max(3.0d0,5.0d0/(4.0d-9*num**3)); */
    ntimes = 5;

    for (ii=0; ii<howmany; ii++) {
        num_m = nums_m[ii];
        num_n = nums_n[ii];
        num_k = nums_k[ii];
        a = 0.5/(num_m*num_n);
        if (num_m > nummax || num_n > nummax || num_k > nummax) {
            GA_Error("Insufficient memory: check nummax", 1);
        }

#ifndef BLOCK_CYCLIC
        ndim = 2;

	/*
        dims[0] = num_m;
        dims[1] = num_n;
	*/
        dims[1] = num_m;
        dims[0] = num_n;

#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_c", NULL, &g_c );
#else
        if (!((g_c = NGA_Create(MT_DBL,ndim,dims,"g_c",NULL)))) {
            GA_Error("failed: create g_c",20);
        }
#endif
	/*
        dims[0] = num_k;
        dims[1] = num_n;
	*/
        dims[1] = num_k;
        dims[0] = num_n;
#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_b", NULL, &g_b );
#else
        if (!((g_b = NGA_Create(MT_DBL,ndim,dims,"g_b",NULL)))) {
            GA_Error("failed: create g_b",30);
        }
#endif
	/*
        dims[0] = num_m;
        dims[1] = num_k;
	*/
        dims[1] = num_m;
        dims[0] = num_k;
#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_a", NULL, &g_a );
#else
        if (!((g_a = NGA_Create(MT_DBL,ndim,dims,"g_a",NULL)))) {
            GA_Error("failed: create g_a",40);
        }
#endif
#else
        ndim = 2;
        block_size[0] = 128;
        block_size[1] = 128;

        dims[0] = num_m;
        dims[1] = num_n;
        g_c = GA_Create_handle();
        GA_Set_data(g_c,ndim,dims,MT_DBL);
        GA_Set_array_name(g_c,"g_c");
        GA_Set_block_cyclic(g_c,block_size);
        if (!GA_Allocate(g_c)) {
            GA_Error("failed: create g_c",40);
        }

        dims[0] = num_k;
        dims[1] = num_n;
        g_b = GA_Create_handle();
        GA_Set_data(g_b,ndim,dims,MT_DBL);
        GA_Set_array_name(g_b,"g_b");
        GA_Set_block_cyclic(g_b,block_size);
        if (!ga_allocate(g_b)) {
            GA_Error("failed: create g_b",40);
        }

        dims[0] = num_m;
        dims[1] = num_k;
        g_a = GA_Create_handle();
        GA_Set_data(g_a,ndim,dims,MT_DBL);
        GA_Set_array_name(g_a,"g_a");
        GA_Set_block_cyclic(g_a,block_size);
        if (!ga_allocate(g_a)) {
            GA_Error('failed: create g_a',40);
        }
#endif         

        /* Initialize matrices A and B */
        if (me == 0) { 
            load_ga(g_a, h0, num_m, num_k);
            load_ga(g_b, h0, num_k, num_n);
        }
#if defined(USE_ELEMENTAL)
        double zero = 0.0;
        ElGlobalArraysFill_d( eldga, g_c, &zero );
	ElGlobalArraysSync_d( eldga );
#else
        GA_Zero(g_c);
        GA_Sync();
#endif
#if defined(USE_ELEMENTAL)
        if (me == 0) {
#else
        if (GA_Nodeid() == 0) {
#endif
            printf("\nMatrix Multiplication on C = A[%ld,%ld]xB[%ld,%ld]\n",
                    (long)num_m, (long)num_k, (long)num_k, (long)num_n);
            fflush(stdout);
        }

        for (i=0; i<ntrans; i++) {
            avg_t[i]  = 0.0;
            avg_mf[i] = 0.0;
        }

        for (itime=0; itime<ntimes; itime++) {
            for (i=0; i<ntrans; i++) {
#if defined(USE_ELEMENTAL)
	        ElGlobalArraysSync_d( eldga );
#else
                GA_Sync();
#endif
                ta = transa[i];
                tb = transb[i];
                t1 = MP_TIMER();
#if defined(USE_ELEMENTAL)
		ElGlobalArraysDgemm_d( eldga, ta, tb, num_m, num_n, num_k, 1.0, g_a, g_b, 0.0, g_c );
#else
                GA_Dgemm(ta,tb,num_m,num_n,num_k,1.0, g_a, g_b, 0.0, g_c);
#endif
                t1 = MP_TIMER() - t1;
#if defined(USE_ELEMENTAL)
                if (me == 0) {
#else
                if (GA_Nodeid() == 0) {
#endif
#if defined(USE_ELEMENTAL)
                    mf = 2e0*num_m*num_n*num_k/t1*1e-6/nproc;
#else
                    mf = 2e0*num_m*num_n*num_k/t1*1e-6/GA_Nnodes();
#endif
                    avg_t[i]  = avg_t[i]+t1;
                    avg_mf[i] = avg_mf[i] + mf;
                    printf("%15s%2d: %12.4f seconds %12.1f mflops/proc  %c %c\n",
                            "Run#", itime, t1, mf, ta, tb);
                    fflush(stdout);
                    if (dgemm_verify && itime == 0) {
                        /* recall the C API swaps the matrix order */
                        /* we swap it here for the Fortran-based verify */
                        verify_ga_dgemm(tb, ta, num_n, num_m, num_k, 1.0,
                                g_b, g_a, 0.0, g_c, tmpb, tmpa, tmpc);
                    }
                }
            }
        }
#if defined(USE_ELEMENTAL)
        if (me == 0) {
#else
        if (GA_Nodeid() == 0) {
#endif
            printf("\n");
            for (i=0; i<ntrans; i++) {
                printf("%17s: %12.4f seconds %12.1f mflops/proc  %c %c\n",
                        "Average", avg_t[i]/ntimes, avg_mf[i]/ntimes,
                        transa[i], transb[i]);
            }
            if(dgemm_verify) {
                printf("All GA_Dgemms are verified...O.K.\n");
            }
            fflush(stdout);
        }

        /*
           GA_Print(g_a);
           GA_Print(g_b);
           GA_Print(g_c);
           */
#if defined(USE_ELEMENTAL)
        ElGlobalArraysDestroy_d( eldga, g_a );
        ElGlobalArraysDestroy_d( eldga, g_b );
        ElGlobalArraysDestroy_d( eldga, g_c );
#else
        GA_Destroy(g_c);
        GA_Destroy(g_b);
        GA_Destroy(g_a);
#endif
    }

    /* ???
       format(a15, i2, ': ', e12.4, ' seconds ',f12.1, 
       .     ' mflops/proc ', 3a2)
       */
#if defined(USE_ELEMENTAL)
    if (me == 0) {
#else
    if (GA_Nodeid() == 0) {
#endif
        printf("All tests successful\n");
    }

    free(h0);
    free(tmpa);
    free(tmpb);
    free(tmpc);
#if defined(USE_ELEMENTAL)
    // call el::global arrays destructor
    ElGlobalArraysTerminate_d( eldga );
    ElGlobalArraysDestruct_d( eldga );
    ElFinalize();
#else
    GA_Terminate();
    MP_FINALIZE();
#endif
    return 0;
}


/*
 * Verify for correctness. Process 0 computes BLAS dgemm 
 * locally. For larger arrays, disbale this test as memory
 * might not be sufficient
 */
void verify_ga_dgemm(char xt1, char xt2, int num_m, int num_n, int num_k,
        double alpha, int g_a, int g_b, double beta, int g_c,
        double *tmpa, double *tmpb, double *tmpc)
{
    int i,j,type,ndim,dims[2],lo[2],hi[2];
    double abs_value;

    for (i=0; i<num_n; i++) {
        for (j=0; j<num_m; j++) {
            tmpc[j+i*num_m] = -1.0;
            tmpa[j+i*num_m] = -2.0;
        }
    }

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_a, lo, hi, tmpa, &dims[1] );
#else
    NGA_Get(g_a, lo, hi, tmpa, &dims[1]);
#endif

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_b, lo, hi, tmpb, &dims[1] );
#else
    NGA_Get(g_b, lo, hi, tmpb, &dims[1]);
#endif

    /* compute dgemm sequentially */
#if defined(USE_ELEMENTAL)
    cblas_dgemm ( CblasRowMajor, ( xt1 == 'n'? CblasNoTrans: CblasTrans ), 
	    ( xt2 == 'n'? CblasNoTrans: CblasTrans ), 
	    num_m /* M */, num_n /* N */, num_k /* K */, 
	    alpha, tmpa, num_m, /* lda */ 
	    tmpb, num_k, /* ldb */ beta, 
	    tmpc, num_m /* ldc */);
#else
    xb_dgemm(&xt1, &xt2, &num_m, &num_n, &num_k,
            &alpha, tmpa, &num_m,
            tmpb, &num_k, &beta,
            tmpc, &num_m);
#endif

    /* after computing c locally, verify it with the values in g_c */

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_c, lo, hi, tmpa, &dims[1] );
#else
    NGA_Get(g_c, lo, hi, tmpa, &dims[1]);
#endif

    for (i=0; i<num_n; i++) {
        for (j=0; j<num_m; j++) {
            abs_value = fabs(tmpc[j+i*num_m]-tmpa[j+i*num_m]);
            if(abs_value > 1.0 || abs_value < -1.0) {
                printf("Values are = %f %f\n",
                        tmpc[j+i*num_m], tmpa[j+i*num_m]);
                printf("Values are = %f %f\n", 
                        fabs(tmpc[j+i*num_m]-tmpa[j*i*num_m]), abs_value);
                fflush(stdout);
                GA_Error("verify ga_dgemm failed", 1);
            }
        }
    }
}

/**
 * called by process '0' (or your master process )
 */
void load_ga(int handle, double *f, int dim1, int dim2)
{
      int lo[2], hi[2];
      
      if (dim1 < 0 || dim2 < 0) {
          return;
      }

      lo[0] = 0;
      lo[1] = 0;
      hi[0] = dim1-1;
      hi[1] = dim2-1;
#if defined(USE_ELEMENTAL)
      ElGlobalArraysPut_d( eldga, handle, lo, hi, f, &dim1 );
#else
      NGA_Put(handle, lo, hi, f, &dim1);
#endif
}