Beispiel #1
0
Datei: scf.c Projekt: sg0/gtfock
/// main for SCF
int main (int argc, char **argv)
{
    // init MPI
    int myrank;
    int nprocs;
    int provided;
#if defined (USE_ELEMENTAL)
    ElInitialize( &argc, &argv );
    ElMPICommRank( MPI_COMM_WORLD, &myrank );
    ElMPICommSize( MPI_COMM_WORLD, &nprocs );
    MPI_Query_thread(&provided);
#else
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
#endif
    if (myrank == 0)  {
        printf("MPI thread support: %s\n", MPI_THREAD_STRING(provided));
    }
#if 0
    char hostname[1024];
    gethostname (hostname, 1024);
    printf ("Rank %d of %d running on node %s\n", myrank, nprocs, hostname);
#endif

    // create basis set    
    BasisSet_t basis;
    CInt_createBasisSet(&basis);

    // input parameters and load basis set
    int nprow_fock;
    int npcol_fock;
    int nblks_fock;
    int nprow_purif;
    int nshells;
    int natoms;
    int nfunctions;
    int niters;
    if (myrank == 0) {
        if (argc != 8) {
            usage(argv[0]);
            MPI_Finalize();
            exit(0);
        }
        // init parameters
        nprow_fock = atoi(argv[3]);
        npcol_fock = atoi(argv[4]);
        nprow_purif = atoi(argv[5]);
        nblks_fock = atoi(argv[6]);
        niters = atoi(argv[7]);
        assert(nprow_fock * npcol_fock == nprocs);
        assert(nprow_purif * nprow_purif * nprow_purif  <= nprocs);
        assert(niters > 0);       
        CInt_loadBasisSet(basis, argv[1], argv[2]);
        nshells = CInt_getNumShells(basis);
        natoms = CInt_getNumAtoms(basis);
        nfunctions = CInt_getNumFuncs(basis);
        assert(nprow_fock <= nshells && npcol_fock <= nshells);
        assert(nprow_purif <= nfunctions && nprow_purif <= nfunctions);
        printf("Job information:\n");
        char *fname;
        fname = basename(argv[2]);
        printf("  molecule:  %s\n", fname);
        fname = basename(argv[1]);
        printf("  basisset:  %s\n", fname);
        printf("  charge     = %d\n", CInt_getTotalCharge(basis));
        printf("  #atoms     = %d\n", natoms);
        printf("  #shells    = %d\n", nshells);
        printf("  #functions = %d\n", nfunctions);
        printf("  fock build uses   %d (%dx%d) nodes\n",
               nprow_fock * npcol_fock, nprow_fock, npcol_fock);
        printf("  purification uses %d (%dx%dx%d) nodes\n",
               nprow_purif * nprow_purif * nprow_purif,
               nprow_purif, nprow_purif, nprow_purif);
        printf("  #tasks = %d (%dx%d)\n",
               nblks_fock * nblks_fock * nprow_fock * nprow_fock,
               nblks_fock * nprow_fock, nblks_fock * nprow_fock);
        int nthreads = omp_get_max_threads();
        printf("  #nthreads_cpu = %d\n", nthreads);   
    }
    int btmp[8];
    btmp[0] = nprow_fock;
    btmp[1] = npcol_fock;
    btmp[2] = nprow_purif;
    btmp[3] = nblks_fock;
    btmp[4] = niters;
    btmp[5] = natoms;
    btmp[6] = nshells;
    btmp[7] = nfunctions;
    MPI_Bcast(btmp, 8, MPI_INT, 0, MPI_COMM_WORLD);
    nprow_fock = btmp[0];
    npcol_fock = btmp[1];
    nprow_purif = btmp[2];
    nblks_fock = btmp[3];
    niters = btmp[4];
    natoms = btmp[5];
    nshells = btmp[6];
    nfunctions = btmp[7];

    // broadcast basis set
    void *bsbuf;
    int bsbufsize;
    if (myrank == 0) {
        CInt_packBasisSet(basis, &bsbuf, &bsbufsize);
        MPI_Bcast(&bsbufsize, 1, MPI_INT, 0, MPI_COMM_WORLD);
        MPI_Bcast(bsbuf, bsbufsize, MPI_CHAR, 0, MPI_COMM_WORLD);
    }
    else {
        MPI_Bcast(&bsbufsize, 1, MPI_INT, 0, MPI_COMM_WORLD);
        bsbuf = (void *)malloc(bsbufsize);
        assert(bsbuf != NULL);
        MPI_Bcast(bsbuf, bsbufsize, MPI_CHAR, 0, MPI_COMM_WORLD);
        CInt_unpackBasisSet(basis, bsbuf);  
        free(bsbuf);
    }

    // init PFock
    if (myrank == 0) {
        printf("Initializing pfock ...\n");
    }
    PFock_t pfock;
    PFock_create(basis, nprow_fock, npcol_fock, nblks_fock, 1e-11,
                 MAX_NUM_D, IS_SYMM, &pfock);
    if (myrank == 0) {
        double mem_cpu;
        PFock_getMemorySize(pfock, &mem_cpu);
        printf("  CPU uses %.3f MB\n", mem_cpu / 1024.0 / 1024.0);
        printf("  Done\n");
    }

    // init purif
    purif_t *purif = create_purif(basis, nprow_purif, nprow_purif, nprow_purif);
    init_oedmat(basis, pfock, purif, nprow_fock, npcol_fock);

    // compute SCF
    if (myrank == 0) {
        printf("Computing SCF ...\n");
    }
    int rowstart = purif->srow_purif;
    int rowend = purif->nrows_purif + rowstart - 1;
    int colstart = purif->scol_purif;
    int colend = purif->ncols_purif + colstart - 1;
    double energy0 = -1.0;
    double totaltime = 0.0;
    double purif_flops = 2.0 * nfunctions * nfunctions * nfunctions;
    double diis_flops;

    // set initial guess
    if (myrank == 0) {
        printf("  initialing D ...\n");
    }
    PFock_setNumDenMat(NUM_D, pfock);
    initial_guess(pfock, basis, purif->runpurif,
                  rowstart, rowend, colstart, colend,
                  purif->D_block, purif->ldx);

    MPI_Barrier(MPI_COMM_WORLD);

    // compute nuc energy
    double ene_nuc = CInt_getNucEnergy(basis);
    if (myrank == 0) {
        printf("  nuc energy = %.10f\n", ene_nuc);
    }

    MPI_Barrier(MPI_COMM_WORLD);
    
    // main loop
    double t1, t2, t3, t4;
    for (int iter = 0; iter < niters; iter++) {
        if (myrank == 0) {
            printf("  iter %d\n", iter);
        }
        t3 = MPI_Wtime();

        // fock matrix construction
        t1 = MPI_Wtime();
        fock_build(pfock, basis, purif->runpurif,
                   rowstart, rowend, colstart, colend,
                   purif->ldx, purif->D_block, purif->F_block);
        if (myrank == 0) {
            printf("After fock build \n");
        }

        // compute energy
        double energy = compute_energy(purif, purif->F_block, purif->D_block);

        t2 = MPI_Wtime();
        if (myrank == 0) {
            printf("    fock build takes %.3f secs\n", t2 - t1);
            if (iter > 0) {
                printf("    energy %.10f (%.10f), %le\n",
                       energy + ene_nuc, energy, fabs (energy - energy0));
            }
            else {
                printf("    energy %.10f (%.10f)\n", energy + ene_nuc,
                       energy);
            }
        }
        if (iter > 0 && fabs (energy - energy0) < 1e-11) {
            niters = iter + 1;
            break;
        }
        energy0 = energy;

        // compute DIIS
        t1 = MPI_Wtime();
        compute_diis(pfock, purif, purif->D_block, purif->F_block, iter);
        t2 = MPI_Wtime();

        if (myrank == 0) {
            if (iter > 1) {
                diis_flops = purif_flops * 6.0;
            } else {
                diis_flops = purif_flops * 2.0;
            }
            printf("    diis takes %.3f secs, %.3lf Gflops\n",
                   t2 - t1, diis_flops / (t2 - t1) / 1e9);
        }
        
    #ifdef __SCF_OUT__
        if (myrank == 0) {
            double outbuf[nfunctions];
            char fname[1024];
            sprintf(fname, "XFX_%d_%d.dat", nfunctions, iter);
            FILE *fp = fopen(fname, "w+");
            assert(fp != NULL);
            for (int i = 0; i < nfunctions; i++) {
                PFock_getMat(pfock, PFOCK_MAT_TYPE_F, USE_D_ID,
                             i, i, USE_D_ID, nfunctions - 1,
                             outbuf, nfunctions);
                for (int j = 0; j < nfunctions; j++) {
                    fprintf(fp, "%.10e\n", outbuf[j]);
                }
            }
            fclose(fp);
        }
    #endif
    
        // purification
        MPI_Barrier(MPI_COMM_WORLD);
        t1 = MPI_Wtime();
        int it = compute_purification(purif, purif->F_block, purif->D_block);
        t2 = MPI_Wtime();
        MPI_Barrier(MPI_COMM_WORLD);
        if (myrank == 0) {
            printf("    purification takes %.3f secs,"
                   " %d iterations, %.3f Gflops\n",
                   t2 - t1, it,
                   (it * 2.0 + 4.0) * purif_flops / (t2 - t1) / 1e9);
        }
	/*
#if defined(USE_ELEMENTAL)
    ElGlobalArraysPrint_d( eldga, pfock->ga_D[USE_D_ID] );
#else
    GA_Print (pfock->ga_D[USE_D_ID]);
#endif
*/
        t4 = MPI_Wtime ();
        totaltime += t4 - t3;

#ifdef __SCF_TIMING__
        PFock_getStatistics(pfock);
        double purif_timedgemm;
        double purif_timepdgemm;
        double purif_timepass;
        double purif_timetr;
        MPI_Reduce(&purif->timedgemm, &purif_timedgemm,
                   1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        MPI_Reduce(&purif->timepdgemm, &purif_timepdgemm,
                   1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        MPI_Reduce(&purif->timepass, &purif_timepass,
                   1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        MPI_Reduce(&purif->timetr, &purif_timetr,
                   1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        if (myrank == 0) {
            printf("    Purification Statistics:\n");
            printf("      average totaltime  = %.3f\n"
                   "      average timetr     = %.3f\n"
                   "      average timedgemm  = %.3f, %.3f Gflops\n"
                   "      average timepdgemm = %.3f, %.3f Gflops\n",
                   purif_timepass / purif->np_purif,
                   purif_timetr / purif->np_purif,
                   purif_timedgemm / purif->np_purif,
                   (it * 2.0 + 4.0) *
                   purif_flops / (purif_timedgemm / purif->np_purif) / 1e9,
                   purif_timepdgemm / purif->np_purif,
                   (it * 2.0 + 4.0) *
                   purif_flops / (purif_timepdgemm / purif->np_purif) / 1e9);
        }
#endif
    } /* for (iter = 0; iter < NITERATIONS; iter++) */

    if (myrank == 0) {
        printf("  totally takes %.3f secs: %.3f secs/iters\n",
               totaltime, totaltime / niters);
        printf("  Done\n");
    }

    destroy_purif(purif);
    PFock_destroy(pfock);
    CInt_destroyBasisSet(basis);

    MPI_Finalize();

    return 0;
}
Beispiel #2
0
// note: Sayan: brings down memory requirement to about 268 MB
int main(int argc, char **argv)
{
    int me, nproc, g_a = -1, i, j;

#if defined(USE_ELEMENTAL)
    int ndim=2, dims[2]= {N1,N2};
#else
    int ndim=2, type=MT_F_DBL, dims[2]= {N1,N2};
#endif

    double *buf;

    int lo[2], hi[2], ld[1];
    double alpha = 1.0;

#if defined(USE_ELEMENTAL)
    // initialize Elemental (which will initialize MPI)
    ElInitialize( &argc, &argv );
    ElMPICommRank( MPI_COMM_WORLD, &me );
    ElMPICommSize( MPI_COMM_WORLD, &nproc );

    ElGlobalArrays_d eldga;

    // instantiate el::global array
    ElGlobalArraysConstruct_d( &eldga );
    // initialize global arrays
    ElGlobalArraysInitialize_d( eldga );
    printf ("INITIALIZED elemental global array...\n");
#else
    MP_INIT(argc,argv);
    GA_Initialize_ltd(-1);

    me=GA_Nodeid();
    nproc=GA_Nnodes();
#endif

    if(me==0) printf("Using %ld processes\n",(long)nproc);
    if(me==0) printf("memory = %ld bytes\n",((long)N1)*((long)N2)*8);

#if defined(USE_ELEMENTAL)
    // create and allocate a global array
    printf ("ndim = %d\n", ndim);
    printf ("dim[0] = %d and dim[1] = %d\n", dims[0], dims[1]);
    ElGlobalArraysCreate_d( eldga, ndim, dims, "A", &g_a);
    printf ("CREATED elemental global array...\n");
    // print distribution
    ElGlobalArraysPrint_d( eldga, g_a );
#else
    g_a = NGA_Create(type, ndim, dims, "A", NULL);

    GA_Zero(g_a);   /* zero the matrix */

    GA_Print_distribution(g_a);
#endif

    if(me == 0) {
//        buf = (double*)(malloc(N1*1024*sizeof(double)));
        buf = (double*)(malloc(N1*128*sizeof(double)));
//        for(j = 0; j < N1*1024; ++j) buf[j] = 1.0;
//        for(i = 0; i < N2/1024; ++i) {
        for(j = 0; j < N1*128; ++j) buf[j] = 1.0;
        for(i = 0; i < N2/128; ++i) {

            lo[0] = 0;
            hi[0] = lo[0] + N1   -1;
            /*
                lo[1] = i*1024;
                hi[1] = lo[1] + 1024 -1;
                ld[0] = 1024;
            */
            lo[1] = i*128;
            hi[1] = lo[1] + 128 -1;
            ld[0] = 128;
            printf("NGA_Acc.%d:  %d:%d %d:%d\n",i,lo[0],hi[0],lo[1],hi[1]);

#if defined(USE_ELEMENTAL)
            ElGlobalArraysAccumulate_d( eldga, g_a, lo, hi, buf, ld, &alpha );
            // there is an explicit flush in NGA_Acc/Put, so when it returns, the buffer
            // can be reused and data has reached the destination
#else
            NGA_Init_fence();
            NGA_Acc(g_a, lo, hi, buf, ld, &alpha);
            NGA_Fence();
#endif
        }
    }

#if defined(USE_ELEMENTAL)
    ElGlobalArraysSync_d( eldga );
    ElGlobalArraysDestroy_d( eldga, g_a );
    ElGlobalArraysTerminate_d( eldga );
    // call el::global arrays destructor
    ElGlobalArraysDestruct_d( eldga );
    ElFinalize();
#else
    GA_Sync();

    GA_Destroy(g_a);

    GA_Terminate();
    MP_FINALIZE();
#endif

    return 0;
}
Beispiel #3
0
int main(int argc, char **argv)
{
  int rank, nprocs;
  int g_A;
  int *local_A=NULL, *local_B=NULL, *output_A=NULL;
  int dims[DIM]={SIZE,SIZE}, dims2[DIM], lo[DIM]={SIZE-SIZE,SIZE-SIZE}, hi[DIM]={SIZE-1,SIZE-1}, ld=SIZE;
  int value=SIZE;

#if defined(USE_ELEMENTAL)
  // initialize Elemental (which will initialize MPI)
  ElInitialize( &argc, &argv );
  ElMPICommRank( MPI_COMM_WORLD, &rank );
  ElMPICommSize( MPI_COMM_WORLD, &nprocs );
  // instantiate el::global array
  ElGlobalArraysConstruct_i( &eliga );
  // initialize global arrays
  ElGlobalArraysInitialize_i( eliga );
#else
  MPI_Init(&argc, &argv);

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  MA_init(C_INT, 1000, 1000);

  GA_Initialize();
#endif

  local_A=(int*)malloc(SIZE*SIZE*sizeof(int));
  output_A=(int*)malloc(SIZE*SIZE*sizeof(int));
  memset (output_A, 0, SIZE*SIZE*sizeof(int));
  for(int j=0; j<SIZE; j++)
      for(int i=0; i<SIZE; i++) local_A[i+j*ld]=(i + j);
      //for(int i=0; i<SIZE; i++) local_A[i+j*ld]=(rand()%10);

  local_B=(int*)malloc(SIZE*SIZE*sizeof(int));
  memset (local_B, 0, SIZE*SIZE*sizeof(int));

  // nb handle
#if defined(USE_ELEMENTAL)
  typedef ElInt ga_nbhdl_t;
#endif
  ga_nbhdl_t nbnb;

#if defined(USE_ELEMENTAL)
  ElGlobalArraysCreate_i( eliga, DIM, dims, "array_A", NULL, &g_A );
  ElGlobalArraysFill_i( eliga, g_A, &value );
#else
  g_A = NGA_Create(C_INT, DIM, dims, "array_A", NULL);
  GA_Fill(g_A, &value);
#endif

  if (rank == 0) printf ("Initial global array:\n");
#if defined(USE_ELEMENTAL)
  ElGlobalArraysPrint_i( eliga, g_A );
#else
  GA_Print(g_A);
#endif

  for (int i = 0; i < NITERS; i++)
  {
      // acc data
#if defined(USE_ELEMENTAL)
      ElGlobalArraysNBAccumulate_i( eliga, g_A, lo, hi, local_A, &ld, &value, &nbnb );
#else
      NGA_NbAcc(g_A, lo, hi, local_A, &ld, &value, &nbnb);
#endif
      
      // updated output
      MPI_Reduce (local_A, output_A, SIZE*SIZE, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

#if defined(USE_ELEMENTAL)
      ElGlobalArraysNBWait_i( eliga, &nbnb );
#else
      NGA_NbWait (&nbnb);
#endif 

      // get
      if (rank == 0) printf ("Get in iter #%d\n", i);
#if defined(USE_ELEMENTAL)
      ElGlobalArraysSync_i( eliga );
      ElGlobalArraysGet_i( eliga, g_A, lo, hi, local_B, &ld );
      ElGlobalArraysPrint_i( eliga, g_A );
#else
      GA_Sync();
      NGA_Get(g_A, lo, hi, local_B, &ld);
      GA_Print(g_A);
#endif
  } // end of iters

  if(rank==0)
    {
      printf(" Alpha (multiplier): %d\n", value);
      printf(" Original local buffer (before accumulation): \n");

      for(int i=0; i<SIZE; i++)
	{
	  for(int j=0; j<SIZE; j++)
	    printf("%d ", local_A[i*ld+j]);
	  printf("\n");
	}
      printf("\n");
      printf(" Get returns: \n");
      for(int i=0; i<SIZE; i++)
	{
	  for(int j=0; j<SIZE; j++)
	    printf("%d ", local_B[i*ld + j]);
	  printf("\n");
	}

      printf("\n");
      for(int i=0; i<SIZE; i++)
	{
	  for(int j=0; j<SIZE; j++)
	    {
	      if(local_B[i*ld+j]!=(value + (NITERS * value * (output_A[i*ld+j]))))
		  GA_Error("ERROR", -99);
	    }
	}
    }
#if defined(USE_ELEMENTAL)
  ElGlobalArraysDestroy_i( eliga, g_A );
#else
  GA_Destroy(g_A);
#endif
  if(rank == 0)
    printf ("OK. Test passed\n");

    free (local_A);
    free (local_B);
    free (output_A);

#if defined(USE_ELEMENTAL)
    ElGlobalArraysTerminate_i( eliga );
    // call el::global arrays destructor
    ElGlobalArraysDestruct_i( eliga );
    ElFinalize();
#else
    GA_Terminate();
    MPI_Finalize();
#endif
}
Beispiel #4
0
/*
 * test ga_dgemm
 * Note: - change nummax for large arrays
 *       - turn off "dgemm_verify" for large arrays due to memory 
 *         limitations, as dgemm_verify=1 for large arrays produces 
 *         segfault, dumps core,or any crap.
 */
int main(int argc, char **argv)
{
    int num_m;
    int num_n;
    int num_k;
    int i;
    int ii;
    double *h0;
    int g_c;
    int g_b;
    int g_a;
    double a;
    double t1;
    double mf;
    double avg_t[ntrans];
    double avg_mf[ntrans];
    int itime;
    int ntimes;
    int nums_m[/*howmany*/] = {512,1024};
    int nums_n[/*howmany*/] = {512,1024};
    int nums_k[/*howmany*/] = {512,1024};
    char transa[/*ntrans*/] = "ntnt";
    char transb[/*ntrans*/] = "nntt";
    char ta;
    char tb;
    double *tmpa;
    double *tmpb;
    double *tmpc;
    int ndim;
    int dims[2];
#ifdef BLOCK_CYCLIC
    int block_size[2];
#endif

#if defined(USE_ELEMENTAL)
    // initialize Elemental (which will initialize MPI)
    ElInitialize( &argc, &argv );
    ElMPICommRank( MPI_COMM_WORLD, &me );
    ElMPICommSize( MPI_COMM_WORLD, &nproc );
    // instantiate el::global array
    ElGlobalArraysConstruct_d( &eldga );
    // initialize global arrays
    ElGlobalArraysInitialize_d( eldga );
#else
    MP_INIT(argc,argv);
    if (!MA_init(MT_DBL,1,20000000)) {
        GA_Error("failed: ma_init(MT_DBL,1,20000000)",10);
    }
    GA_INIT(argc,argv);
    me = GA_Nodeid();
#endif

    h0 = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpa = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpb = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpc = (double*)malloc(sizeof(double) * nummax*nummax);

    ii = 0;
    for (i=0; i<nummax*nummax; i++) {
        ii = ii + 1;
        if (ii > nummax) {
            ii = 0;
        }
        h0[i] = ii;
    }

    /* Compute times assuming 500 mflops and 5 second target time */
    /* ntimes = max(3.0d0,5.0d0/(4.0d-9*num**3)); */
    ntimes = 5;

    for (ii=0; ii<howmany; ii++) {
        num_m = nums_m[ii];
        num_n = nums_n[ii];
        num_k = nums_k[ii];
        a = 0.5/(num_m*num_n);
        if (num_m > nummax || num_n > nummax || num_k > nummax) {
            GA_Error("Insufficient memory: check nummax", 1);
        }

#ifndef BLOCK_CYCLIC
        ndim = 2;

	/*
        dims[0] = num_m;
        dims[1] = num_n;
	*/
        dims[1] = num_m;
        dims[0] = num_n;

#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_c", NULL, &g_c );
#else
        if (!((g_c = NGA_Create(MT_DBL,ndim,dims,"g_c",NULL)))) {
            GA_Error("failed: create g_c",20);
        }
#endif
	/*
        dims[0] = num_k;
        dims[1] = num_n;
	*/
        dims[1] = num_k;
        dims[0] = num_n;
#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_b", NULL, &g_b );
#else
        if (!((g_b = NGA_Create(MT_DBL,ndim,dims,"g_b",NULL)))) {
            GA_Error("failed: create g_b",30);
        }
#endif
	/*
        dims[0] = num_m;
        dims[1] = num_k;
	*/
        dims[1] = num_m;
        dims[0] = num_k;
#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_a", NULL, &g_a );
#else
        if (!((g_a = NGA_Create(MT_DBL,ndim,dims,"g_a",NULL)))) {
            GA_Error("failed: create g_a",40);
        }
#endif
#else
        ndim = 2;
        block_size[0] = 128;
        block_size[1] = 128;

        dims[0] = num_m;
        dims[1] = num_n;
        g_c = GA_Create_handle();
        GA_Set_data(g_c,ndim,dims,MT_DBL);
        GA_Set_array_name(g_c,"g_c");
        GA_Set_block_cyclic(g_c,block_size);
        if (!GA_Allocate(g_c)) {
            GA_Error("failed: create g_c",40);
        }

        dims[0] = num_k;
        dims[1] = num_n;
        g_b = GA_Create_handle();
        GA_Set_data(g_b,ndim,dims,MT_DBL);
        GA_Set_array_name(g_b,"g_b");
        GA_Set_block_cyclic(g_b,block_size);
        if (!ga_allocate(g_b)) {
            GA_Error("failed: create g_b",40);
        }

        dims[0] = num_m;
        dims[1] = num_k;
        g_a = GA_Create_handle();
        GA_Set_data(g_a,ndim,dims,MT_DBL);
        GA_Set_array_name(g_a,"g_a");
        GA_Set_block_cyclic(g_a,block_size);
        if (!ga_allocate(g_a)) {
            GA_Error('failed: create g_a',40);
        }
#endif         

        /* Initialize matrices A and B */
        if (me == 0) { 
            load_ga(g_a, h0, num_m, num_k);
            load_ga(g_b, h0, num_k, num_n);
        }
#if defined(USE_ELEMENTAL)
        double zero = 0.0;
        ElGlobalArraysFill_d( eldga, g_c, &zero );
	ElGlobalArraysSync_d( eldga );
#else
        GA_Zero(g_c);
        GA_Sync();
#endif
#if defined(USE_ELEMENTAL)
        if (me == 0) {
#else
        if (GA_Nodeid() == 0) {
#endif
            printf("\nMatrix Multiplication on C = A[%ld,%ld]xB[%ld,%ld]\n",
                    (long)num_m, (long)num_k, (long)num_k, (long)num_n);
            fflush(stdout);
        }

        for (i=0; i<ntrans; i++) {
            avg_t[i]  = 0.0;
            avg_mf[i] = 0.0;
        }

        for (itime=0; itime<ntimes; itime++) {
            for (i=0; i<ntrans; i++) {
#if defined(USE_ELEMENTAL)
	        ElGlobalArraysSync_d( eldga );
#else
                GA_Sync();
#endif
                ta = transa[i];
                tb = transb[i];
                t1 = MP_TIMER();
#if defined(USE_ELEMENTAL)
		ElGlobalArraysDgemm_d( eldga, ta, tb, num_m, num_n, num_k, 1.0, g_a, g_b, 0.0, g_c );
#else
                GA_Dgemm(ta,tb,num_m,num_n,num_k,1.0, g_a, g_b, 0.0, g_c);
#endif
                t1 = MP_TIMER() - t1;
#if defined(USE_ELEMENTAL)
                if (me == 0) {
#else
                if (GA_Nodeid() == 0) {
#endif
#if defined(USE_ELEMENTAL)
                    mf = 2e0*num_m*num_n*num_k/t1*1e-6/nproc;
#else
                    mf = 2e0*num_m*num_n*num_k/t1*1e-6/GA_Nnodes();
#endif
                    avg_t[i]  = avg_t[i]+t1;
                    avg_mf[i] = avg_mf[i] + mf;
                    printf("%15s%2d: %12.4f seconds %12.1f mflops/proc  %c %c\n",
                            "Run#", itime, t1, mf, ta, tb);
                    fflush(stdout);
                    if (dgemm_verify && itime == 0) {
                        /* recall the C API swaps the matrix order */
                        /* we swap it here for the Fortran-based verify */
                        verify_ga_dgemm(tb, ta, num_n, num_m, num_k, 1.0,
                                g_b, g_a, 0.0, g_c, tmpb, tmpa, tmpc);
                    }
                }
            }
        }
#if defined(USE_ELEMENTAL)
        if (me == 0) {
#else
        if (GA_Nodeid() == 0) {
#endif
            printf("\n");
            for (i=0; i<ntrans; i++) {
                printf("%17s: %12.4f seconds %12.1f mflops/proc  %c %c\n",
                        "Average", avg_t[i]/ntimes, avg_mf[i]/ntimes,
                        transa[i], transb[i]);
            }
            if(dgemm_verify) {
                printf("All GA_Dgemms are verified...O.K.\n");
            }
            fflush(stdout);
        }

        /*
           GA_Print(g_a);
           GA_Print(g_b);
           GA_Print(g_c);
           */
#if defined(USE_ELEMENTAL)
        ElGlobalArraysDestroy_d( eldga, g_a );
        ElGlobalArraysDestroy_d( eldga, g_b );
        ElGlobalArraysDestroy_d( eldga, g_c );
#else
        GA_Destroy(g_c);
        GA_Destroy(g_b);
        GA_Destroy(g_a);
#endif
    }

    /* ???
       format(a15, i2, ': ', e12.4, ' seconds ',f12.1, 
       .     ' mflops/proc ', 3a2)
       */
#if defined(USE_ELEMENTAL)
    if (me == 0) {
#else
    if (GA_Nodeid() == 0) {
#endif
        printf("All tests successful\n");
    }

    free(h0);
    free(tmpa);
    free(tmpb);
    free(tmpc);
#if defined(USE_ELEMENTAL)
    // call el::global arrays destructor
    ElGlobalArraysTerminate_d( eldga );
    ElGlobalArraysDestruct_d( eldga );
    ElFinalize();
#else
    GA_Terminate();
    MP_FINALIZE();
#endif
    return 0;
}


/*
 * Verify for correctness. Process 0 computes BLAS dgemm 
 * locally. For larger arrays, disbale this test as memory
 * might not be sufficient
 */
void verify_ga_dgemm(char xt1, char xt2, int num_m, int num_n, int num_k,
        double alpha, int g_a, int g_b, double beta, int g_c,
        double *tmpa, double *tmpb, double *tmpc)
{
    int i,j,type,ndim,dims[2],lo[2],hi[2];
    double abs_value;

    for (i=0; i<num_n; i++) {
        for (j=0; j<num_m; j++) {
            tmpc[j+i*num_m] = -1.0;
            tmpa[j+i*num_m] = -2.0;
        }
    }

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_a, lo, hi, tmpa, &dims[1] );
#else
    NGA_Get(g_a, lo, hi, tmpa, &dims[1]);
#endif

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_b, lo, hi, tmpb, &dims[1] );
#else
    NGA_Get(g_b, lo, hi, tmpb, &dims[1]);
#endif

    /* compute dgemm sequentially */
#if defined(USE_ELEMENTAL)
    cblas_dgemm ( CblasRowMajor, ( xt1 == 'n'? CblasNoTrans: CblasTrans ), 
	    ( xt2 == 'n'? CblasNoTrans: CblasTrans ), 
	    num_m /* M */, num_n /* N */, num_k /* K */, 
	    alpha, tmpa, num_m, /* lda */ 
	    tmpb, num_k, /* ldb */ beta, 
	    tmpc, num_m /* ldc */);
#else
    xb_dgemm(&xt1, &xt2, &num_m, &num_n, &num_k,
            &alpha, tmpa, &num_m,
            tmpb, &num_k, &beta,
            tmpc, &num_m);
#endif

    /* after computing c locally, verify it with the values in g_c */

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_c, lo, hi, tmpa, &dims[1] );
#else
    NGA_Get(g_c, lo, hi, tmpa, &dims[1]);
#endif

    for (i=0; i<num_n; i++) {
        for (j=0; j<num_m; j++) {
            abs_value = fabs(tmpc[j+i*num_m]-tmpa[j+i*num_m]);
            if(abs_value > 1.0 || abs_value < -1.0) {
                printf("Values are = %f %f\n",
                        tmpc[j+i*num_m], tmpa[j+i*num_m]);
                printf("Values are = %f %f\n", 
                        fabs(tmpc[j+i*num_m]-tmpa[j*i*num_m]), abs_value);
                fflush(stdout);
                GA_Error("verify ga_dgemm failed", 1);
            }
        }
    }
}

/**
 * called by process '0' (or your master process )
 */
void load_ga(int handle, double *f, int dim1, int dim2)
{
      int lo[2], hi[2];
      
      if (dim1 < 0 || dim2 < 0) {
          return;
      }

      lo[0] = 0;
      lo[1] = 0;
      hi[0] = dim1-1;
      hi[1] = dim2-1;
#if defined(USE_ELEMENTAL)
      ElGlobalArraysPut_d( eldga, handle, lo, hi, f, &dim1 );
#else
      NGA_Put(handle, lo, hi, f, &dim1);
#endif
}
Beispiel #5
0
int main(int argc, char **argv)
{
  int rank, nprocs;
  int g_A;
  int *local_A=NULL, *local_B=NULL, *output_A=NULL;
  int dims[DIM]={SIZE,SIZE}, dims2[DIM], lo[DIM]={SIZE-SIZE,SIZE-SIZE}, hi[DIM]={SIZE-1,SIZE-1}, ld=SIZE;
  int value=SIZE;
  //int value=0;

#if defined(USE_ELEMENTAL)
  // initialize Elemental (which will initialize MPI)
  ElInitialize( &argc, &argv );
  ElMPICommRank( MPI_COMM_WORLD, &rank );
  ElMPICommSize( MPI_COMM_WORLD, &nprocs );
  // instantiate el::global array
  ElGlobalArraysConstruct_i( &eliga );
  // initialize global arrays
  ElGlobalArraysInitialize_i( eliga );
#else
  MPI_Init(&argc, &argv);

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  MA_init(C_INT, 1000, 1000);

  GA_Initialize();
#endif

  local_A=(int*)malloc(SIZE*SIZE*sizeof(int));
  output_A=(int*)malloc(SIZE*SIZE*sizeof(int));
  memset (output_A, 0, SIZE*SIZE*sizeof(int));
  for(int j=0; j<SIZE; j++)
      for(int i=0; i<SIZE; i++) local_A[i+j*ld]=(i + j);

  local_B=(int*)malloc(SIZE*SIZE*sizeof(int));
  memset (local_B, 0, SIZE*SIZE*sizeof(int));

#if defined(USE_ELEMENTAL)
  ElGlobalArraysCreate_i( eliga, DIM, dims, "array_A", NULL, NULL, &g_A );
  ElGlobalArraysFill_i( eliga, g_A, &value );
  ElGlobalArraysPrint_i( eliga, g_A );
  // acc data
  ElGlobalArraysPut_i( eliga, g_A, lo, hi, local_A, &ld );
  ElGlobalArraysSync_i( eliga );
  // get
  ElGlobalArraysGet_i( eliga, g_A, lo, hi, local_B, &ld );
  ElGlobalArraysSync_i( eliga );
  ElGlobalArraysPrint_i( eliga, g_A );
#else
  g_A = NGA_Create(C_INT, DIM, dims, "array_A", NULL);
  GA_Fill(g_A, &value);
  GA_Print(g_A);

  NGA_Put(g_A, lo, hi, local_A, &ld);
  
  GA_Sync();
  
  NGA_Get(g_A, lo, hi, local_B, &ld);

  GA_Sync();
  
  GA_Print(g_A);
#endif

  // updated output
  MPI_Reduce (local_A, output_A, SIZE*SIZE, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);

  if(rank==0)
    {
      printf(" Original local buffer to be accumulated: \n");

      for(int i=0; i<SIZE; i++)
	{
	  for(int j=0; j<SIZE; j++)
	    printf("%d ", local_A[i*ld+j]);
	  printf("\n");
	}
      printf("\n");
      printf(" Get returns: \n");
      for(int i=0; i<SIZE; i++)
	{
	  for(int j=0; j<SIZE; j++)
	    printf("%d ", local_B[i*ld + j]);
	  printf("\n");
	}

      printf("\n");
      for(int i=0; i<SIZE; i++)
	{
	  for(int j=0; j<SIZE; j++)
	    {
	      if(local_B[i*ld+j]!=output_A[i*ld+j])
		  GA_Error("ERROR", -99);
	    }
	}
    }
#if defined(USE_ELEMENTAL)
  ElGlobalArraysDestroy_i( eliga, g_A );
#else
  GA_Destroy(g_A);
#endif
  if(rank == 0)
    printf ("OK. Test passed\n");

    free (local_A);
    free (local_B);
    free (output_A);

#if defined(USE_ELEMENTAL)
    ElGlobalArraysTerminate_i( eliga );
    // call el::global arrays destructor
    ElGlobalArraysDestruct_i( eliga );
    ElFinalize();
#else
    GA_Terminate();
    MPI_Finalize();
#endif
}
Beispiel #6
0
int
main( int argc, char* argv[] )
{
    ElError error = ElInitialize( &argc, &argv );
    if( error != EL_SUCCESS )
        MPI_Abort( MPI_COMM_WORLD, 1 );

    ElInt m, n, mSub, nSub;
    bool print, display;
    error = ElInput_I("--m","matrix height",100,&m);
    EL_ABORT_ON_ERROR( error );
    error = ElInput_I("--n","matrix width",100,&n);
    EL_ABORT_ON_ERROR( error );
    error = ElInput_b("--print","print matrix?",true,&print);
    EL_ABORT_ON_ERROR( error );
    error = ElInput_b("--display","display matrix?",false,&display);
    error = ElProcessInput();
    EL_ABORT_ON_ERROR( error );
    error = ElPrintInputReport();
    EL_ABORT_ON_ERROR( error );

    ElGrid grid;
    error = ElGridCreate( MPI_COMM_WORLD, EL_COLUMN_MAJOR, &grid );
    EL_ABORT_ON_ERROR( error );;

    ElDistMatrix_z A;
    error = ElDistMatrixCreateSpecific_z( EL_MR, EL_MC, grid, &A );
    EL_ABORT_ON_ERROR( error );
    error = ElDistMatrixResize_z( A, m, n );
    EL_ABORT_ON_ERROR( error );

    ElInt i, j;
    for( j=0; j<n; ++j )
    {
        for( i=0; i<m; ++i )
        {
            error = ElDistMatrixSet_z( A, i, j, i+j );
            EL_ABORT_ON_ERROR( error );
        }
    }
    if( print )
    {
        error = ElPrintDist_z( A, "A" );
        EL_ABORT_ON_ERROR( error );
    }
    if( display )
    {
        error = ElDisplayDist_z( A, "A" );
        EL_ABORT_ON_ERROR( error );
    }

    ElEntrywiseMapDist_z( A, &MapFunc );
    if( print )
    {
        error = ElPrintDist_z( A, "ASquared" );
        EL_ABORT_ON_ERROR( error );
    }
    if( display )
    {
        error = ElDisplayDist_z( A, "ASquared" );
        EL_ABORT_ON_ERROR( error );
    }

    error = ElFinalize();
    if( error != EL_SUCCESS )
        MPI_Abort( MPI_COMM_WORLD, 1 );
    return 0;
}
Beispiel #7
0
int main(int argc, char **argv)
{
  int rank, nprocs, i, j;
  int g_A, g_B, dims[DIM]={SIZE,SIZE}, val1=5, val2=4;
  int lo[DIM]={SIZE-SIZE,SIZE-SIZE}, hi[DIM]={SIZE-1,SIZE-1}, ld=SIZE;

#if defined(USE_ELEMENTAL)
  // initialize Elemental (which will initialize MPI)
  ElInitialize( &argc, &argv );
  ElMPICommRank( MPI_COMM_WORLD, &rank );
  ElMPICommSize( MPI_COMM_WORLD, &nprocs );
  // instantiate el::global array
  ElGlobalArraysConstruct_i( &eliga );
  // initialize global arrays
  ElGlobalArraysInitialize_i( eliga );
#else
  MPI_Init(&argc, &argv);

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  MA_init(C_INT, 1000, 1000);

  GA_Initialize();
#endif

  // create global arrays
#if defined(USE_ELEMENTAL)
  ElGlobalArraysCreate_i( eliga, DIM, dims, "array_A", NULL, &g_A );
#else
  g_A = NGA_Create(C_INT, DIM, dims, "array_A", NULL);
#endif

#if defined(USE_ELEMENTAL)
  ElGlobalArraysDuplicate_i( eliga, g_A, "array_B", &g_B );
#else  
  g_B = GA_Duplicate(g_A, "array_B");
#endif

#if defined(USE_ELEMENTAL)
  ElGlobalArraysFill_i( eliga, g_A, &val1 );
  ElGlobalArraysFill_i( eliga, g_B, &val2 );
#else
  GA_Fill(g_A, &val1);
  GA_Fill(g_B, &val2);
#endif

  int dot_AB = -99;
#if defined(USE_ELEMENTAL)
  ElGlobalArraysDot_i( eliga, g_A, g_B, &dot_AB );
#else
  dot_AB = GA_Idot(g_A, g_B);
#endif

#if defined(USE_ELEMENTAL)
  ElGlobalArraysSync_i( eliga );
#else
  GA_Sync();
#endif

#if defined(USE_ELEMENTAL)
  ElGlobalArraysPrint_i( eliga, g_A );
  ElGlobalArraysPrint_i( eliga, g_B );
#else 
  GA_Print(g_A);
  GA_Print(g_B);
#endif
 
  // Check
#if defined(USE_ELEMENTAL)
  ElGlobalArraysSync_i( eliga );
#else
  GA_Sync();
#endif

  if(rank==0)
      printf ("Integer dot product of g_A and g_B: %d\n", dot_AB);

#if defined(USE_ELEMENTAL)
  ElGlobalArraysSync_i( eliga );
#else
  GA_Sync();
#endif

  if(rank==0)
    printf("Test Completed \n");

#if defined(USE_ELEMENTAL)
  ElGlobalArraysDestroy_i( eliga, g_A );
  ElGlobalArraysDestroy_i( eliga, g_B );
#else
  GA_Destroy(g_A);
  GA_Destroy(g_B);
#endif

#if defined(USE_ELEMENTAL)
  ElGlobalArraysTerminate_i( eliga );
  // call el::global arrays destructor
  ElGlobalArraysDestruct_i( eliga );
  ElFinalize();
#else
  GA_Terminate();
  MPI_Finalize();
#endif

  return 0;
}
Beispiel #8
0
int main(int argc, char **argv)
{
  /* initialize GA */
#if defined(USE_ELEMENTAL)
  // initialize Elemental (which will initialize MPI)
  ElInitialize( &argc, &argv );
  ElMPICommRank( MPI_COMM_WORLD, &me );
  ElMPICommSize( MPI_COMM_WORLD, &nproc );
  // instantiate el::global array
  ElGlobalArraysConstruct_d( &eldga );
  // initialize global arrays
  ElGlobalArraysInitialize_d( eldga );
#else
  MP_INIT(argc,argv);
  GA_Initialize_args(&argc, &argv);

  me = GA_Nodeid();
  nproc = GA_Nnodes();
#endif

  if (nproc < 2) {
    if (me == 0) {
      fprintf(stderr, "USAGE: 2 <= processes - got %d\n", nproc);
    }
#if defined(USE_ELEMENTAL)
    ElGlobalArraysTerminate_d( eldga );
    // call el::global arrays destructor
    ElGlobalArraysDestruct_d( eldga );
    ElFinalize();
#else
    GA_Terminate();
    MP_FINALIZE();
#endif
    exit(0);
  }

  if (!me) {
    printf("\n             Performance of Basic Blocking Communication Operations\n");
  }

#if defined(USE_ELEMENTAL)
  ElGlobalArraysSync_d( eldga );
#else
  GA_Sync();
#endif

  /* test 1 dimension array */
  /*
  if (!me) {
    printf("\n\t\t\tContiguous Data Transfer\n");
  }
  test_1D();
  */

  /* test 2 dimension array */
  if (!me) {
    printf("\n\t\t\tStrided Data Transfer\n");
  }
  test_2D();
#if 0
  if (me == 0) {
    if (warn_accuracy) {
      printf("\nWARNING: Your timer does not have sufficient accuracy for this test (%d)\n", warn_accuracy);
    }
    printf("\n\n------------ Now we test the same data transfer for correctness ----------\n");
    fflush(stdout);
  }
#endif
#if defined(USE_ELEMENTAL)
  ElGlobalArraysTerminate_d( eldga );
  // call el::global arrays destructor
  ElGlobalArraysDestruct_d( eldga );
  ElFinalize();
#else
  GA_Terminate();
  MP_FINALIZE();
#endif
  return(0);
}