C++ (Cpp) GA_Nnodes примеры использования

Пример #1

0

Показать файл

Файл: transpose.c Проект: timasjov/parallel-computing-ga-mpi

int main(int argc, char **argv) {
    int heap=300000, stack=300000;
    int me, nprocs, i;
    double start, end;
	
    /* Initialize Message Passing library */
    MPI_Init(&argc, &argv);

    /* Initialize GA */
    GA_Initialize();
    
    /* Initialize Memory Allocator (MA) */
    if(! MA_init(C_DBL, stack, heap) ) GA_Error("MA_init failed",stack+heap);

    me     = GA_Nodeid();
    nprocs = GA_Nnodes();
    if(me==0) {
       printf("\nUsing %d processes\n\n", nprocs); fflush(stdout);
    }
     
	start = MPI_Wtime();
	for(i =0; i< 1000; i++) {
		TRANSPOSE1D();
	}
	end = MPI_Wtime();
	if(me==0) printf("  Time=%2.5e secs\n\n",end-start); 
    
    if(me==0)printf("\nTerminating ..\n");
    GA_Terminate();
    MPI_Finalize();    
}

Пример #2

0

Показать файл

Файл: sched_groups.c Проект: 000Justin000/nwchem

/*server signals all clients to terminate*/
void signal_termination(int server_id, int msg_src_mpi) {
  int i, v=TERM_CLIENT;
  const int rank = GA_Nodeid();
  const int size = GA_Nnodes(); 
  const int default_grp = ga_pgroup_get_default_();

/*   assert(server_id == rank);  /\*call with your id, not someone else's*\/ */
  assert(server_id == SVR); /*Only server may invoke this method*/
#ifdef LEADER_BCAST
  {
    int *pid_list = (int *)alloca(size*sizeof(int));
    int root = GA_Pgroup_absolute_id(default_grp, SVR);
/*     int src = GA_Pgroup_absolute_id(default_grp, rank); */
    for(i=0; i<size; i++) {
      pid_list[i] = GA_Pgroup_absolute_id(default_grp,i);
    }
    broadcast(size,pid_list,root,msg_src_mpi,&v,sizeof(int));
  }
#else
  for(i=0; i<size; i++) {
    if(i != SVR) {
      MPI_Send(&v, 1, MPI_INT, GA_Pgroup_absolute_id(default_grp,i), 
	       SIGNAL_TAG, MPI_COMM_WORLD);
    }
  }
#endif
}

Пример #3

0

Показать файл

Файл: nga_access.c Проект: niuqingpeng/microbenchmark

int main(int argc, char** argv) 
{ 
    int nprocs,myid,nprocssq; 
    int dims[2],chunk[2]; 
    int i,j,k; 
    int stack = 100000, heap = 100000; 
    MPI_Init(&argc,&argv); 
    GA_Initialize(); 
    MA_init(C_DBL,stack,heap); 
    nprocssq = GA_Nnodes(); 
    nprocs = sqrt(nprocssq); 
    myid = GA_Nodeid(); 
    dims[0] = N; dims[1] = N; 
    chunk[0] = N/nprocs; 
    chunk[1] = N/nprocs; 
    int g_a = NGA_Create(C_DBL,2,dims,"Array A",chunk); 
    int lo[2],hi[2]; 
    NGA_Distribution(g_a,myid,lo,hi); 
    int ld[1] = {N/nprocs}; 
    void *ptr; 
    double *local; 
    printf("Myid = %d, lo = [%d,%d] , hi = [%d,%d] , ld = %d \n",myid,lo[0],lo[1],hi[0],hi[1],ld[0]); 
    NGA_Access(g_a,lo,hi,&ptr,ld); 
    local = (double*) ptr; 
    printf("Myid = %d , local[0][0] = %f\n",*local); 
    GA_Sync(); 
    GA_Destroy(g_a); 
    GA_Terminate(); 
    MPI_Finalize(); 
    return 0; 
}

Пример #4

0

Показать файл

Файл: rate.c Проект: jeffhammond/ga

int main(int argc, char **argv)
{
    int status, me;
    int max_arrays = 10;
    double max_sz = 1e8, max_disk = 1e10, max_mem = 1e6;
    int stack = 120000, heap = 3200000;
    int numfiles, numioprocs;
    int total, nproc;

    MP_INIT(argc,argv);
    GA_Initialize();
    me = GA_Nodeid();
    nproc = GA_Nnodes();
    total = pow(SIZE,NDIM)*sizeof(double);
    if (!GA_Uses_ma()) {
        if (GA_Nodeid() == 0) {
            printf("GA is not using MA\n");
        }
        stack = 100000;
        heap  = (int)(2.2*(float)(total));
    }

    if (MA_init(MT_F_DBL, stack, heap) ) {
        if (DRA_Init(max_arrays, max_sz, max_disk, max_mem) != 0)
            GA_Error("DRA_Init failed: ",0);
        if (USER_CONFIG == 0) {
            numfiles = -1;
            numioprocs = -1;
        } else if (USER_CONFIG == 1) {
            numfiles = 1;
            numioprocs = GA_Cluster_nnodes();
        } else if (USER_CONFIG == 2) {
            numfiles = GA_Cluster_nnodes();
            numioprocs = GA_Cluster_nnodes();
        } else {
            numfiles = 1;
            numioprocs = 1;
        }
        if (me==0) {
            printf("Disk resident arrays configured as:\n");
            printf("    Number of files: %d\n",numfiles);
            printf("    Number of I/O processors: %d\n",numioprocs);
        }
        DRA_Set_default_config(numfiles,numioprocs);
        if (me == 0) printf("\n");
        if (me == 0) printf("TESTING PERFORMANCE OF DISK ARRAYS\n");
        if (me == 0) printf("\n");
        test_io_dbl();
        status = DRA_Terminate();
        GA_Terminate();
    } else {
        printf("MA_init failed\n");
    }
    if(me == 0) printf("all done ...\n");
    MP_FINALIZE();
    return 0;
}

Пример #5

0

Показать файл

Файл: sched_groups.c Проект: 000Justin000/nwchem

void task_insert(task_list_t *tlist, int tskid, int nproc) {
  task_t *task;
  const int maxproc = GA_Nnodes();

  assert(is_tskid_unique(tlist, tskid));
  assert(nproc > 0 && nproc < maxproc);

  task = (task_t *)malloc(sizeof(task_t));
  assert(task != NULL);
  
  task->tskid = tskid;
  task->proc_list = NULL;
  task->nproc = nproc;
  task->pos = tlist->ntodo;

  task_ins(tlist, task, &tlist->todo);
  tlist->ntodo += 1;
}

Пример #6

0

Показать файл

Файл: spiral-mm.c Проект: pghosh2/GA-Matrix-Multiplication-Benchmark

int main(int argc, char **argv) 
{
	int proc, nprocs;
	int M, N, K; /* */
	int blockX_len, blockY_len;
	int heap=3000000, stack=3000000;

	if (argc == 6) {
		M = atoi(argv[1]);
		N = atoi(argv[2]);
		K = atoi(argv[3]);
		blockX_len = atoi(argv[4]);
		blockY_len = atoi(argv[5]);
	}
	else {
		printf("Please enter ./a.out <M> <N> <K> <BLOCK-X-LEN> <BLOCK-Y-LEN>");
		exit(-1);
	}

	MPI_Init(&argc, &argv);

	GA_Initialize();

	if(! MA_init(C_DBL, stack, heap) ) GA_Error("MA_init failed",stack+heap);

	proc   = GA_Nodeid();
	nprocs = GA_Nnodes();

	if(proc == 0) {
		printf("Using %d processes\n", nprocs); 
		fflush(stdout);
	}

	matrix_multiply(M, N, K, blockX_len, blockY_len);

	if(proc == 0)
		printf("\nTerminating ..\n");

	GA_Terminate();

	MPI_Finalize();    

	return 0;
}

Пример #7

0

Показать файл

Файл: transp1D.c Проект: MihaiLupoiu/TPP

int main(int argc, char **argv) {
    int heap=300000, stack=300000;
    int me, nprocs;
    
    /* Step1: Initialize Message Passing library */
#ifdef MPI
    MPI_Init(&argc, &argv);   /* initialize MPI */
#else
    PBEGIN_(argc, argv);      /* initialize TCGMSG */
#endif

    /* Step2: Initialize GA */
    /* ### intialize the GA library */
    GA_Initialize();

    
    /* Step3: Initialize Memory Allocator (MA) */
    if(! MA_init(C_DBL, stack, heap) ) GA_Error("MA_init failed",stack+heap);

    /* ### assign the local processor ID to the int variable "me"
     * ### and the total number of processors to the int variable
     * ### "nprocs" */
    me     = GA_Nodeid();
    nprocs = GA_Nnodes();
    if(me==0) {
       printf("\nUsing %d processes\n\n", nprocs); fflush(stdout);
    }
    
       
    TRANSPOSE1D();
    
    if(me==0)printf("\nTerminating ..\n");
    /* ### terminate the GA library */
    GA_Terminate();

#ifdef MPI
    MPI_Finalize();    
#else
    PEND_();
#endif
}

Пример #8

0

Показать файл

Файл: sched_groups.c Проект: 000Justin000/nwchem

void plist_init(proc_list_t *plist) {
  int i, ctr;
  const int me = GA_Nodeid();
  const int nproc = GA_Nnodes();
  const int default_grp = ga_pgroup_get_default_();
  const char *pname = "plist_init";

/*   fprintf(stderr, "%d:: 1 %s\n", me,pname); */

  plist->buf = (proc_t *)malloc((nproc-1)*sizeof(proc_t));
  assert(plist->buf != NULL);
  for(i=0, ctr=0; i<nproc; i++) {
    if(i != SVR) {
      plist->buf[ctr].procid = GA_Pgroup_absolute_id(default_grp,i);
      plist->buf[ctr].next = &plist->buf[ctr+1];
      ctr+=1;
    }
  }
/*   fprintf(stderr, "%d:: 2 %s\n", me,pname); */
  plist->buf[nproc-2].next = NULL;
  plist->idle = &plist->buf[0];
  plist->nidle = nproc-1;
}

Пример #9

0

Показать файл

Файл: sched_groups.c Проект: 000Justin000/nwchem

void req_list_init(req_list_t *rlist, task_list_t *tlist) {
  const int nprocs=GA_Nnodes();
  const int nreqs = MIN(nprocs, tlist->ntodo);
  int i;

  rlist->next_ids=rlist->prev_ids=NULL;
  rlist->reqs = NULL;
  rlist->tasks = NULL;
  rlist->idle_id = 0;
  rlist->nrunning = 0;
  rlist->nidle = nreqs;
  rlist->nreqs = nreqs;
  
  if(nreqs) {
    rlist->next_ids = (int *)malloc(nreqs*sizeof(int));
    rlist->prev_ids = (int *)malloc(nreqs*sizeof(int));
    rlist->reqs = (MPI_Request *)malloc(nreqs*sizeof(MPI_Request));
    rlist->tasks = (task_t **)malloc(nreqs*sizeof(task_t *));
    assert(rlist->next_ids!=NULL);
    assert(rlist->prev_ids!=NULL);
    assert(rlist->reqs!=NULL);
    assert(rlist->tasks!=NULL);
    
    rlist->prev_ids[0] = ID_NULL;
    for(i=1; i<nreqs; i++) {
      rlist->next_ids[i-1] = i;
      rlist->prev_ids[i] = i-1;
    }
    rlist->next_ids[nreqs-1] = ID_NULL;

    for(i=0; i<nreqs; i++) {
      rlist->tasks[i] = NULL;
      rlist->reqs[i] = MPI_REQUEST_NULL;
    }
  }
}

Пример #10

0

Показать файл

Файл: transpose.c Проект: timasjov/parallel-computing-ga-mpi

void TRANSPOSE1D() {
    
    int ndim, dims[1], chunk[1], ld[1], lo[1], hi[1];
    int lo1[1], hi1[1], lo2[1], hi2[1];
    int g_a, g_b, a[MAXPROC*TOTALELEMS],b[MAXPROC*TOTALELEMS];
    int nelem, i, q;    
    int me, nprocs;
    
    /* Find local processor ID and number of processors */
    me = GA_Nodeid();
    nprocs = GA_Nnodes();
	
	/* Configure array dimensions. Force an unequal data distribution */
	ndim     = 1; /* 1-d transpose */
	dims[0]  = nprocs*TOTALELEMS + nprocs/2;
	ld[0]    = dims[0];
	chunk[0] = TOTALELEMS; /* minimum data on each process */
 
	/* create a global array g_a and duplicate it to get g_b */
	g_a = NGA_Create(C_INT, 1, dims, "array A", chunk);
	if (!g_a) GA_Error("create failed: A", 0);
	
	g_b = GA_Duplicate(g_a, "array B");
	if (! g_b) GA_Error("duplicate failed", 0);
	
	/* initialize data in g_a */
	if (me==0) {
	   for(i=0; i<dims[0]; i++) a[i] = i;
	   lo[0]  = 0;
	   hi[0] = dims[0]-1;
	   NGA_Put(g_a, lo, hi, a, ld);
	}

	/* Synchronize all processors to guarantee that everyone has data
	   before proceeding to the next step. */
	GA_Sync();

	/* Start initial phase of inversion by inverting the data held locally on
	   each processor. Start by finding out which data each processor owns. */
	NGA_Distribution(g_a, me, lo1, hi1);

	/* Get locally held data and copy it into local buffer a  */
	NGA_Get(g_a, lo1, hi1, a, ld);
	
	/* Invert data locally */
	nelem = hi1[0] - lo1[0] + 1;
	for (i=0; i<nelem; i++) b[i] = a[nelem-1-i];
	
	/* Invert data globally by copying locally inverted blocks into
	 * their inverted positions in the GA */
	lo2[0] = dims[0] - hi1[0] -1;
	hi2[0] = dims[0] - lo1[0] -1;
	NGA_Put(g_b,lo2,hi2,b,ld);

	/* Synchronize all processors to make sure inversion is complete */
	GA_Sync();
	
	
	
	/* Deallocate arrays */
	GA_Destroy(g_a);
	GA_Destroy(g_b);

}

Пример #11

0

Показать файл

Файл: rate.c Проект: jeffhammond/ga

void test_io_dbl()
{
    int n, ndim = NDIM;
    double err, tt0, tt1, mbytes;
    int g_a, g_b, d_a;
    int i, itmp, j, req, loop;
    int glo[MAXDIM],ghi[MAXDIM];
    dra_size_t dlo[MAXDIM],dhi[MAXDIM];
    dra_size_t ddims[MAXDIM],reqdims[MAXDIM];
    dra_size_t m;
    int index[MAXDIM], dims[MAXDIM];
    int me, nproc, isize;
    double *ptr;
    double plus, minus;
    int ld[MAXDIM], chunk[MAXDIM];
    char filename[80];
    FILE *fd;

    n = SIZE;
    m = ((dra_size_t)NFACTOR)*((dra_size_t)SIZE);

    loop  = 1;
    for (i=0; i<ndim; i++) loop *= NFACTOR;
    req = -1;
    nproc = GA_Nnodes();
    me    = GA_Nodeid();

    if (me == 0) {
        printf("Creating temporary global arrays %d",n);
        for (i=1; i<ndim; i++) {
            printf(" x %d",n);
        }
        printf("\n");
    }
    if (me == 0) fflush(stdout);
    GA_Sync();
    for (i=0; i<ndim; i++) {
        dims[i] = n;
        chunk[i] = 1;
    }

    g_a = NGA_Create(MT_DBL, ndim, dims, "a", chunk);
    if (!g_a) GA_Error("NGA_Create failed: a", 0);
    g_b = NGA_Create(MT_DBL, ndim, dims, "b", chunk);
    if (!g_b) GA_Error("NGA_Create failed: b", 0);
    if (me == 0) printf("done\n");
    if (me == 0) fflush(stdout);

    /*     initialize g_a, g_b with random values
           ... use ga_access to avoid allocating local buffers for ga_put */

    GA_Sync();
    NGA_Distribution(g_a, me, glo, ghi);
    NGA_Access(g_a, glo, ghi, &ptr, ld);
    isize = 1;
    for (i=0; i<ndim; i++) isize *= (ghi[i]-glo[i]+1);
    fill_random(ptr, isize);
    GA_Sync();
    GA_Zero(g_b);


    /*.......................................................................*/
    if (me == 0) {
        printf("Creating Disk array %ld",m);
        for (i=1; i<ndim; i++) {
            printf(" x %ld",m);
        }
        printf("\n");
    }
    if (me == 0) fflush(stdout);
    for (i=0; i<ndim; i++) {
        ddims[i] = m;
        reqdims[i] = (dra_size_t)n;
    }
    GA_Sync();
    strcpy(filename,FNAME);
    if (! (fd = fopen(filename, "w"))) {
        strcpy(filename,FNAME_ALT);
        if (! (fd = fopen(filename, "w"))) {
            GA_Error("open failed",0);
        }
    }
    fclose(fd);
    if (NDRA_Create(MT_DBL, ndim, ddims, "A", filename, DRA_RW,
                reqdims, &d_a) != 0) {
        GA_Error("NDRA_Create failed(d_a): ",0);
    }
    if (me == 0) printf("testing write\n");
    fflush(stdout);
    tt1 = 0.0;
    for (i=0; i<loop; i++) {
        itmp=i;
        for (j=0; j<ndim; j++) {
            index[j] = itmp%NFACTOR;
            itmp = (itmp - index[j])/NFACTOR;
        }
        for (j=0; j<ndim; j++) {
            glo[j] = 0;
            ghi[j] = SIZE - 1;
            dlo[j] = ((dra_size_t)index[j])*((dra_size_t)SIZE);
            dhi[j] = (((dra_size_t)index[j])+(dra_size_t)1)
                * ((dra_size_t)SIZE) - (dra_size_t)1;
        }
        tt0 = MP_TIMER();
        if (NDRA_Write_section(FALSE, g_a, glo, ghi,
                    d_a, dlo, dhi, &req) != 0) {
            GA_Error("ndra_write_section failed:",0);
        }
        if (DRA_Wait(req) != 0) {
            GA_Error("DRA_Wait failed(d_a): ",req);
        }
        tt1 += (MP_TIMER() - tt0);
    }
    GA_Dgop(&tt1,1,"+");
    tt1 = tt1/((double)nproc);
    mbytes = 1.e-6 * (double)(pow(m,ndim)*sizeof(double));
    if (me == 0) {
        printf("%11.2f MB  time = %11.2f rate = %11.3f MB/s\n",
                mbytes,tt1,mbytes/tt1);
    }

    if (DRA_Close(d_a) != 0) {
        GA_Error("DRA_Close failed(d_a): ",d_a);
    }

    if (me == 0) printf("\n");
    if (me == 0) printf("disk array closed\n");
    if (me == 0) fflush(stdout);

    /*..........................................................*/

    if (me == 0) printf("\n");
    if (me == 0) printf("opening disk array\n");
    if (DRA_Open(filename, DRA_R, &d_a) != 0) {
        GA_Error("DRA_Open failed",0);
    }
    if (me == 0) printf("testing read\n");
    /*  printf("testing read on proc %d\n",me); */
    if (me == 0) fflush(stdout);
    tt1 = 0.0;
    for (i=0; i<loop; i++) {
        itmp=i;
        for (j=0; j<ndim; j++) {
            index[j] = itmp%NFACTOR;
            itmp = (itmp - index[j])/NFACTOR;
        }
        for (j=0; j<ndim; j++) {
            glo[j] = 0;
            ghi[j] = SIZE - 1;
            dlo[j] = ((dra_size_t)index[j])*((dra_size_t)SIZE);
            dhi[j] = (((dra_size_t)index[j])+(dra_size_t)1)
                * ((dra_size_t)SIZE) - (dra_size_t)1;
        }
        tt0 = MP_TIMER();
        if (NDRA_Read_section(FALSE, g_b, glo, ghi,
                    d_a, dlo, dhi, &req) != 0) {
            GA_Error("ndra_read_section failed:",0);
        }
        if (DRA_Wait(req) != 0) {
            GA_Error("DRA_Wait failed(d_a): ",req);
        }
        tt1 += (MP_TIMER() - tt0);
        plus = 1.0;
        minus = -1.0;
        GA_Add(&plus, g_a, &minus, g_b, g_b);
        err = GA_Ddot(g_b, g_b);
        if (err != 0) {
            if (me == 0) {
                printf("BTW, we have error = %f on loop value %d\n",
                        err,i);
            }
            GA_Error(" bye",0);
        }
    }
    GA_Dgop(&tt1,1,"+");
    tt1 = tt1/((double)nproc);
    if (me == 0) {
        printf("%11.2f MB  time = %11.2f rate = %11.3f MB/s\n",
                mbytes,tt1,mbytes/tt1);
    }
    if (DRA_Delete(d_a) != 0) GA_Error("DRA_Delete failed",0);
    /*.......................................................................*/
    GA_Destroy(g_a);
    GA_Destroy(g_b);
}

Пример #12

0

Показать файл

Файл: lennard-jones_ga2.c Проект: fuentesdt/tao-1.10.1-p3

int main (int argc, char **argv)
{
  double startTime;
  int info;			/* used to check for functions returning nonzeros */
  GAVec ga_x;        		/* solution vector */
  TAO_SOLVER tao;		/* TAO_SOLVER solver context */
  TAO_GA_APPLICATION taoapp;	/* TAO application context */
  TaoTerminateReason reason;
  AppCtx user;			/* user-defined application context */

  /*initialize GA and MPI */
  int heap = 400000, stack = 400000;
  MPI_Init (&argc, &argv);	/* initialize MPI */
  GA_Initialize ();		/* initialize GA */
  user.me = GA_Nodeid ();
  user.nproc = GA_Nnodes ();
  startTime = MPI_Wtime();
  
  if (user.me == 0) {
    if (GA_Uses_fapi ())
      GA_Error ("Program runs with C array API only", 0);
    printf ("Using %ld processes\n", (long) user.nproc);
    fflush (stdout);
  }
  heap /= user.nproc;
  stack /= user.nproc;
  if (!MA_init (MT_F_DBL, stack, heap))
    GA_Error ("MA_init failed", stack + heap);	/* initialize memory allocator */
  
  /* Initialize TAO */
  TaoInitialize (&argc, &argv, (char *) 0, help);

  /* Initialize problem parameters */
  user.ndim = NDIM;
  user.natoms = NATOMS;
  user.BlockSize = BLOCKSIZE;


  /* Allocate vectors for the solution and gradient */
  int dims[2];
  dims[0] = user.ndim*user.natoms;
  ga_x = NGA_Create (C_DBL, 1, dims, "GA_X", NULL);
  if (!ga_x) GA_Error ("lennard-jones.main::NGA_Create ga_x", ga_x);

  /* Set up structures for data distribution */
  info = SetupBlocks(&user); CHKERRQ(info);


  /* The TAO code begins here */
  /* Create TAO solver with desired solution method */
  info = TaoCreate (MPI_COMM_WORLD, "tao_lmvm", &tao); CHKERRQ(info);
  info = TaoGAApplicationCreate (MPI_COMM_WORLD, &taoapp); CHKERRQ(info);

  /* Set the initial solution */
  info = InitializeVariables(ga_x, &user); CHKERRQ(info);
  info = TaoGAAppSetInitialSolutionVec(taoapp, ga_x); CHKERRQ(info);

  /* Set routines for function, gradient */
  info = TaoGAAppSetObjectiveAndGradientRoutine (taoapp, FormFunctionGradient, 
					      (void *) &user); CHKERRQ(info);

  /* Check for TAO command line options */
  info = TaoSetFromOptions (tao); CHKERRQ(info);


  /* SOLVE THE APPLICATION */
  info = TaoSolveGAApplication (taoapp, tao); CHKERRQ(info);

  /*  To View TAO solver information use */
  info = TaoView(tao); CHKERRQ(info);

  /* Get termination information */
  info = TaoGetTerminationReason (tao, &reason);
  if(info) GA_Error("lennard-jones.main.TaoGetTerminationReason",info);
  if (user.me == 0) {
    if (reason <= 0)
      printf("Try a different TAO method, adjust some parameters, or check the function evaluation routines\n");
    
    printf("WALL TIME TAKEN = %lf\n", MPI_Wtime()-startTime);
    /*output the solutions */ 

    printf ("The solution is :\n");
  }
  GA_Print (ga_x);




  /* Free TAO data structures */
  info = TaoDestroy (tao); CHKERRQ(info);
  info = TaoGAAppDestroy (taoapp); CHKERRQ(info);

  /* Free GA data structures */
  GA_Destroy (ga_x);
  if (!MA_pop_stack(user.memHandle)) 
    ga_error("Main::MA_pop_stack for memHandle failed",0);

  /* Finalize TAO, GA, and MPI */
  TaoFinalize ();
  GA_Terminate ();
  MPI_Finalize ();

  return 0;
}

Пример #13

0

Показать файл

Файл: ga_lu.c Проект: dmlb2000/nwchem-cml

int main(int argc, char **argv) {
    int i, matrix_size;
    int heap=20000000, stack=200000000;
    double *A=NULL;
    
    if(argc != 2) 
    {
       printf("Usage Error\n\t Usage: <program> <matrix_size>\n");
       exit(0);
    }

    matrix_size = atoi(argv[1]);
    
    if(matrix_size <= 0) 
    {
       printf("Error: matrix size (%d) should be > 0\n",
              matrix_size);
       GA_Error("matrix size should be >0", 1);
    }
    
    /* *****************************************************************
     * Initialize MPI/TCGMSG-MPI, GA and MA
     * *****************************************************************/
#ifdef MPI
#ifdef DCMF
    int desired = MPI_THREAD_MULTIPLE;
    int provided;
    MPI_Init_thread(&argc, &argv, desired, &provided);
    if ( provided != MPI_THREAD_MULTIPLE ) printf("provided != MPI_THREAD_MULTIPLE\n");
#else
  MPI_Init (&argc, &argv);	/* initialize MPI */
#endif
#else
    PBEGIN_(argc, argv);    /* initialize TCGMSG-MPI */
#endif
    
    GA_Initialize();        /* initialize GA */

    me     = GA_Nodeid();
    nprocs = GA_Nnodes();

    heap /= nprocs;
    stack /= nprocs;

    if(! MA_init(MT_F_DBL, stack, heap)) /* initialize MA */
    {
       GA_Error("MA_init failed",stack+heap);
    }

    /* create/initialize the matrix */    
    if((A = (double*)malloc(matrix_size*matrix_size*sizeof(double))) == NULL) 
    {
       GA_Error("malloc failed", matrix_size*matrix_size*sizeof(double));
    }

    for(i=0; i<2; i++) /* 5 runs */
    {
       init_array(A, matrix_size);
#if DEBUG
       if(me==0) print_array(A, matrix_size);
#endif
       /* *****************************************************************
        * Perform LU Factorization
        * *****************************************************************/
       ga_lu(A, matrix_size);
    }
    
    free(A);
    
    /* *****************************************************************
     * Terminate MPI/TCGMSG-MPI, GA and MA
     * *****************************************************************/
    if(me==0)printf("Terminating ..\n");
    GA_Terminate();

#ifdef MPI
    MPI_Finalize();
#else
    PEND_();
#endif
    
    return 0;
}

Пример #14

0

Показать файл

Файл: sched_groups.c Проект: 000Justin000/nwchem

/** Client code. Receives signals from the server to process a task or
    terminate processing and return*/
void client_code() {
  int *buf = NULL, buf_size;
  int flag;
  MPI_Status status;
  Integer p_handle;
  int ntsks=0, src;
  const char *pname = "client_code";
  double e1, e2, e3, e4, e5, f1, f2, f3, f4,f5,f6,f7,f8;
  double t_prepar=0, t_wait_start=0, t_grp=0,t_sync=0,t_compl=0,t_dest=0;
/*   double get_doit_time_(); */
/*   double get_esp_time_(); */
/*   double get_gm_crt_time_(); */
/*   double get_chrg_set_time_(); */
/*   double get_gm_push_time_(); */
  const int server = GA_Pgroup_absolute_id(ga_pgroup_get_default_(),SVR);
  const int default_grp = ga_pgroup_get_default_();; /*default GA group for this dispatcher instance*/
  const int world_me = GA_Nodeid();
  const int nproc = GA_Nnodes();

  t_ptask = 0.0;
/*   fprintf(stderr, "%d: 0 server=%d %s\n", GA_Nodeid(), server,pname); */

  e1 = util_wallsec_();
/*   fprintf(stderr, "%d: 0 %s\n", GA_Nodeid(), pname); */

/*   GA_Pgroup_set_default(GA_Pgroup_get_world()); */

/*   fprintf(stderr, "%d: 1 %s\n", world_me, pname); */

  buf_size = 1+ /*action to perform*/
    1+ /*task id - if TASK_SIGNAL*/
    nproc /*process group info*/
    ;

/*   buf = (int *)malloc(buf_size*sizeof(int)); */
  buf = (int *)alloca(buf_size*sizeof(int));
  assert(buf != NULL);

/*   fprintf(stderr, "%d: 2 %s\n", world_me, pname); */

  e2 = util_wallsec_();
  while(1) {
    int nelem, grp_me;
    Integer tskid;

    f1 = util_wallsec_();
/*     fprintf(stderr, "%d:: Waiting for work\n", world_me); */
    MPI_Recv(buf, buf_size, MPI_INT, MPI_ANY_SOURCE, SIGNAL_TAG, MPI_COMM_WORLD, &status);
    f2 = util_wallsec_();
    t_wait_start += (f2-f1);
/*     fprintf(stderr, "%d:: Client got msg from %d\n", world_me, status.MPI_SOURCE); */

    MPI_Get_elements(&status, MPI_INT, &nelem);
    assert(nelem >= 1);
      
    if(buf[0] == TERM_CLIENT) {
      /*process termination and return*/
/*        fprintf(stderr, "%d:: Recv-ed term signal\n", GA_Nodeid()); */
/*       free(buf); */
/*       fprintf(stderr, "%d:: Terminating client\n", GA_Nodeid()); */
#ifdef LEADER_BCAST
      signal_termination(SVR,status.MPI_SOURCE);
#endif
      break;
    }
/*     fprintf(stderr, "%d:: got a task to process\n", world_me); */
    /*Got a task to process*/
    assert(buf[0] == TASK_START);
    ntsks += 1;

    if(status.MPI_SOURCE == server) {
      qsort(buf+2, nelem-2, sizeof(int), int_compare);
    }
    f3  = util_wallsec_();
    t_prepar += (f3-f2);

#if LEADER_BCAST
    src = (server==status.MPI_SOURCE)?buf[2]:status.MPI_SOURCE;
    broadcast(nelem-2,buf+2,buf[2],src,buf,nelem*sizeof(int));
#endif

    /*The proc ids are in world group. So create sub-group of world group*/
    GA_Pgroup_set_default(GA_Pgroup_get_world());
    p_handle = GA_Pgroup_create(&buf[2], nelem-2);
    GA_Pgroup_set_default(p_handle);
/*     GA_Pgroup_sync(p_handle); */
    f4 = MPI_Wtime();
    t_grp += (f4-f3);

    tskid = buf[1];
/*     fprintf(stderr, "%d(%d):: Invoking process task tskid=%d\n", grp_me, world_me, tskid); */
    process_task_(&tskid, &p_handle);
    f5 = MPI_Wtime();
    t_ptask += (f5-f4);
    
    GA_Pgroup_sync(p_handle);
    grp_me = GA_Nodeid();
    f6 = util_wallsec_();
    t_sync += (f6-f5);

    if(grp_me == 0) {
      int v[2] = {TASK_DONE, tskid};
/*        fprintf(stderr, "%d(%d):: Sending ack for task %d to %d\n", */
/*  	      grp_me, world_me, tskid, SERVER); */
      MPI_Send(v, 2, MPI_INT, server, SIGNAL_TAG, MPI_COMM_WORLD);
    }
    f7 = util_wallsec_();
    t_compl += (f7-f6);
/*     GA_Pgroup_sync(p_handle); */
    GA_Pgroup_destroy(p_handle);
    GA_Pgroup_set_default(default_grp);
    f8 = util_wallsec_();
    t_dest += (f8-f7);
  }
  e3 = util_wallsec_();
/*   fprintf(stderr, "%d:: CLIENT total time=%lf\n", ga_nodeid_(), e3-e1); */
/*   fprintf(stderr, "%d:: CLIENT ntsks=%d\n", ga_nodeid_(), ntsks); */
/*   fprintf(stderr, "%d:: CLIENT loop time=%lf\n", ga_nodeid_(), e3-e2); */
/*   fprintf(stderr, "%d:: CLIENT wait start time=%lf\n", ga_nodeid_(),t_wait_start); */
/*   fprintf(stderr, "%d:: CLIENT prepare time=%lf\n", ga_nodeid_(),t_prepar); */
/*   fprintf(stderr, "%d:: CLIENT grp crt time=%lf\n", ga_nodeid_(), t_grp); */
/*   fprintf(stderr, "%d:: CLIENT ptask time=%lf\n", ga_nodeid_(), t_ptask); */
/*   fprintf(stderr, "%d:: CLIENT sync time=%lf\n", ga_nodeid_(), t_sync); */
/*   fprintf(stderr, "%d:: CLIENT compl time=%lf\n", ga_nodeid_(), t_compl); */
/*   fprintf(stderr, "%d:: CLIENT grp dstry time=%lf\n", ga_nodeid_(), t_dest); */
/*   fflush(stdout); */
/*   fprintf(stderr, "%d:: CLIENT doit time=%lf\n",ga_nodeid_(),get_doit_time_()); */
/*   fprintf(stderr, "%d:: CLIENT esp time=%lf\n",ga_nodeid_(),get_esp_time_()); */
/*   fprintf(stderr, "%d:: CLIENT chrg_set time=%lf\n",ga_nodeid_(),get_chrg_set_time_()); */
/*   fprintf(stderr, "%d:: CLIENT gm_crt time=%lf\n",ga_nodeid_(),get_gm_crt_time_()); */
/*   fprintf(stderr, "%d:: CLIENT gm_push time=%lf\n",ga_nodeid_(),get_gm_push_time_()); */
}

Пример #15

0

Показать файл

Файл: ga-mpi.c Проект: jeffhammond/ga

void do_work()
{
int ZERO=0;   /* useful constants */
int g_a, g_b;
int n=N, ndim=2,type=MT_F_DBL,dims[2]={N,N},coord[2];
int me=GA_Nodeid(), nproc=GA_Nnodes();
int row, i, j;
 int lo[2], hi[2];

/* Note: on all current platforms DoublePrecision = double */
DoublePrecision buf[N], *max_row=NULL;

MPI_Comm WORLD_COMM;
MPI_Comm ROW_COMM;
int ilo,ihi, jlo,jhi, ld, prow, pcol;
int root=0, grp_me=-1;

     WORLD_COMM = GA_MPI_Comm_pgroup_default();

     if(me==0)printf("Creating matrix A\n");
     dims[0]=n; dims[1]=n;
     g_a = NGA_Create(type, ndim, dims, "A", NULL);
     if(!g_a) GA_Error("create failed: A",n); 
     if(me==0)printf("OK\n");
     
     if(me==0)printf("Creating matrix B\n");
     dims[0]=n;
     g_b = NGA_Create(type, 1, dims, "B", NULL);
     if(!g_b) GA_Error("create failed: B",n); 
     if(me==0)printf("OK\n");
     
     GA_Zero(g_a);   /* zero the matrix */
     
     if(me==0)printf("Initializing matrix A\n");
     /* fill in matrix A with values: A(i,j) = (i+j) */ 
     for(row=me; row<n; row+= nproc){
    /**
     * simple load balancing: 
     * each process works on a different row in MIMD style 
     */ 
    for(i=0; i<n; i++) buf[i]=(DoublePrecision)(i+row+1); 
    lo[0]=hi[0]=row;
    lo[1]=ZERO;  hi[1]=n-1; 
    NGA_Put(g_a, lo, hi, buf, &n); 
     }
     
     /* GA_print(&g_a);*/
     NGA_Distribution(g_a, me, lo, hi);
     ilo=lo[0]; ihi=hi[0];
     jlo=lo[1]; jhi=hi[1];
     
     GA_Sync(); 
     if(ihi-ilo+1 >0){
        max_row=(DoublePrecision*)malloc(sizeof(DoublePrecision)*(ihi-ilo+1));
        if (!max_row) GA_Error("malloc 3 failed",(ihi-ilo+1));
        for (i=0; i<(ihi-ilo+1); i++) {
            max_row[i] = 0.0;
        }
     }
     NGA_Proc_topology(g_a, me, coord);  /* block coordinates */
     prow = coord[0];
     pcol = coord[1];

     if(me==0)printf("Splitting comm according to distribution of A\n");
     
     /* GA on SP1 requires synchronization before & after message-passing !!*/
     GA_Sync(); 
     
     if(me==0)printf("Computing max row elements\n");
     /* create communicator for processes that 'own' A[:,jlo:jhi] */
     MPI_Barrier(WORLD_COMM);
     if(pcol < 0 || prow <0)
    MPI_Comm_split(WORLD_COMM,MPI_UNDEFINED,MPI_UNDEFINED, &ROW_COMM);
     else
    MPI_Comm_split(WORLD_COMM, (int)pcol, (int)prow, &ROW_COMM);
     
     if(ROW_COMM != MPI_COMM_NULL){
    double *ptr;
    MPI_Comm_rank(ROW_COMM, &grp_me);
    
    /* each process computes max elements in the block it 'owns' */
    lo[0]=ilo; hi[0]=ihi;
    lo[1]=jlo; hi[1]=jhi;
    NGA_Access(g_a, lo, hi, &ptr, &ld);
    for(i=0; i<ihi-ilo+1; i++){
       for(j=0; j<jhi-jlo+1; j++)
          if(max_row[i] < ptr[i*ld + j]){
         max_row[i] = ptr[i*ld + j];
          }
    }
    MPI_Reduce(max_row, buf, ihi-ilo+1, MPI_DOUBLE, MPI_MAX,
           root, ROW_COMM);
    
     }else fprintf(stderr,"process %d not participating\n",me);
     GA_Sync(); 
     
     /* processes with rank=root in ROW_COMM put results into g_b */
     ld = 1;
     if(grp_me == root) {
    lo[0]=ilo;  hi[0]=ihi;
    NGA_Put(g_b, lo, hi, buf, &ld); 
     }
        
     GA_Sync();

     if(me==0)printf("Checking the result\n");
     if(me==0){
    lo[0]=ZERO; hi[0]=n-1;
        NGA_Get(g_b, lo, hi, buf, &n); 
        for(i=0; i< n; i++)if(buf[i] != (double)n+i){
            fprintf(stderr,"error:%d max=%f should be:%d\n",i,buf[i],n+i);
            GA_Error("terminating...",1);
        }
     }
     
     if(me==0)printf("OK\n");

     GA_Destroy(g_a);
     GA_Destroy(g_b);
}

Пример #16

0

Показать файл

Файл: gatscat.c Проект: jeffhammond/ga

int main( int argc, char **argv ) {
  int g_a, g_b, i, j, size, size_me;
  int icnt, idx, jdx, ld;
  int n=N, type=MT_C_INT, one;
  int *values, *ptr;
  int **indices;
  int dims[2]={N,N};
  int lo[2], hi[2];

  int heap=3000000, stack=2000000;
  int me, nproc;

  int datatype, elements;
  double *prealloc_mem;
  MP_INIT(argc,argv);

#if 1
  GA_INIT(argc,argv);                            /* initialize GA */
  me=GA_Nodeid(); 
  nproc=GA_Nnodes();
  if(me==0) {
    if(GA_Uses_fapi())GA_Error("Program runs with C array API only",1);
    printf("\nUsing %ld processes\n",(long)nproc);
    fflush(stdout);
  }

  heap /= nproc;
  stack /= nproc;
  if(! MA_init(MT_F_DBL, stack, heap)) 
    GA_Error("MA_init failed",stack+heap);  /* initialize memory allocator*/ 

  /* Create a regular matrix. */
  if(me==0)printf("\nCreating matrix A of size %d x %d\n",N,N);
  g_a = NGA_Create(type, 2, dims, "A", NULL);
  if(!g_a) GA_Error("create failed: A",n); 

  /* Fill matrix using scatter routines */
  size = N*N;
  if (size%nproc == 0) {
    size_me = size/nproc;
  } else {
    i = size - size%nproc;
    size_me = i/nproc;
    if (me < size%nproc) size_me++;
  }

  /* Check that sizes are all okay */
  i = size_me;
  GA_Igop(&i,1,"+");
  if (i != size) {
    GA_Error("Sizes don't add up correctly: ",i);
  } else if (me==0) {
    printf("\nSizes add up correctly\n");
  }

  /* Allocate index and value arrays */
  indices = (int**)malloc(size_me*sizeof(int*));
  values = (int*)malloc(size_me*sizeof(int));
  icnt = me;
  for (i=0; i<size_me; i++) {
    values[i] = icnt;
    idx = icnt%N; 
    jdx = (icnt-idx)/N;
    if (idx >= N || idx < 0) {
      printf("p[%d] Bogus index i: %d\n",me,idx);
    }
    if (jdx >= N || jdx < 0) {
      printf("p[%d] Bogus index j: %d\n",me,jdx);
    }
    indices[i] = (int*)malloc(2*sizeof(int));
    (indices[i])[0] = idx;
    (indices[i])[1] = jdx;
    icnt += nproc;
  }

  /* Scatter values into g_a */
  NGA_Scatter(g_a, values, indices, size_me);
  GA_Sync();

  /* Check to see if contents of g_a are correct */
  NGA_Distribution( g_a, me, lo, hi );
  NGA_Access(g_a, lo, hi, &ptr, &ld);
  for (i=lo[0]; i<hi[0]; i++) {
    idx = i-lo[0];
    for (j=lo[1]; j<hi[1]; j++) {
      jdx = j-lo[1];
      if (ptr[idx*ld+jdx] != j*N+i) {
        printf("p[%d] (Scatter) expected: %d actual: %d\n",me,j*N+i,ptr[idx*ld+jdx]);
      }
    }
  }
  if (me==0) printf("\nCompleted test of NGA_Scatter\n");

  for (i=0; i<size_me; i++) {
    values[i] = 0;
  }
  GA_Sync();
  NGA_Gather(g_a, values, indices, size_me);
  icnt = me;
  for (i=0; i<size_me; i++) {
    if (icnt != values[i]) {
      printf("p[%d] (Gather) expected: %d actual: %d\n",me,icnt,values[i]);
    }
    icnt += nproc;
  }
  if (me==0) printf("\nCompleted test of NGA_Gather\n");
  GA_Sync();

  /* Scatter-accumulate values back into GA*/
  one = 1;
  NGA_Scatter_acc(g_a, values, indices, size_me, &one);
  GA_Sync();

  /* Check to see if contents of g_a are correct */
  for (i=lo[0]; i<hi[0]; i++) {
    idx = i-lo[0];
    for (j=lo[1]; j<hi[1]; j++) {
      jdx = j-lo[1];
      if (ptr[idx*ld+jdx] != 2*(j*N+i)) {
        printf("p[%d] (Scatter_acc) expected: %d actual: %d\n",me,2*(j*N+i),ptr[idx*ld+jdx]);
      }
    }
  }
  if (me==0) printf("\nCompleted test of NGA_Scatter_acc\n");
  NGA_Release(g_a, lo, hi);

  /* Test fixed buffer size */
  NGA_Alloc_gatscat_buf(size_me);

  /* Scatter-accumulate values back into GA*/
  GA_Sync();
  NGA_Scatter_acc(g_a, values, indices, size_me, &one);
  GA_Sync();

  /* Check to see if contents of g_a are correct */
  for (i=lo[0]; i<hi[0]; i++) {
    idx = i-lo[0];
    for (j=lo[1]; j<hi[1]; j++) {
      jdx = j-lo[1];
      if (ptr[idx*ld+jdx] != 3*(j*N+i)) {
        printf("p[%d] (Scatter_acc) expected: %d actual: %d\n",me,3*(j*N+i),ptr[idx*ld+jdx]);
      }
    }
  }
  if (me==0) printf("\nCompleted test of NGA_Scatter_acc using fixed buffers\n");
  NGA_Release(g_a, lo, hi);
  NGA_Free_gatscat_buf();

  GA_Destroy(g_a);
  if(me==0)printf("\nSuccess\n");
  GA_Terminate();
#endif

  MP_FINALIZE();

 return 0;
}

Пример #17

0

Показать файл

Файл: gemmtest.c Проект: jeffhammond/ga

void
test(int data_type) {
  int me=GA_Nodeid();
  int nproc = GA_Nnodes();
  int g_a, g_b, g_c;
  int ndim = 2;
  int dims[2]={N,N};
  int lo[2]={0,0};
  int hi[2]={N-1,N-1};
  int block_size[2]={NB,NB-1};
  int proc_grid[2];
  int i,j,l,k,m,n, ld;

  double alpha_dbl = 1.0, beta_dbl = 0.0;
  double dzero = 0.0;
  double ddiff;

  float alpha_flt = 1.0, beta_flt = 0.0;
  float fzero = 0.0;
  float fdiff;
  float ftmp;
  double dtmp;
  SingleComplex ctmp;
  DoubleComplex ztmp;

  DoubleComplex alpha_dcpl = {1.0, 0.0} , beta_dcpl = {0.0, 0.0}; 
  DoubleComplex zzero = {0.0,0.0};
  DoubleComplex zdiff;

  SingleComplex alpha_scpl = {1.0, 0.0} , beta_scpl = {0.0, 0.0}; 
  SingleComplex czero = {0.0,0.0};
  SingleComplex cdiff;

  void *alpha=NULL, *beta=NULL;
  void *abuf=NULL, *bbuf=NULL, *cbuf=NULL, *c_ptr=NULL;

  switch (data_type) {
  case C_FLOAT:
    alpha  = (void *)&alpha_flt;
    beta   = (void *)&beta_flt;
    abuf = (void*)malloc(N*N*sizeof(float));
    bbuf = (void*)malloc(N*N*sizeof(float));
    cbuf = (void*)malloc(N*N*sizeof(float));
    if(me==0) printf("Single Precision: Testing GA_Sgemm,NGA_Matmul_patch for %d-Dimension", ndim);
    break;      
  case C_DBL:
    alpha  = (void *)&alpha_dbl;
    beta   = (void *)&beta_dbl;
    abuf = (void*)malloc(N*N*sizeof(double));
    bbuf = (void*)malloc(N*N*sizeof(double));
    cbuf = (void*)malloc(N*N*sizeof(double));
    if(me==0) printf("Double Precision: Testing GA_Dgemm,NGA_Matmul_patch for %d-Dimension", ndim); 
    break;    
  case C_DCPL:
    alpha  = (void *)&alpha_dcpl;
    beta   = (void *)&beta_dcpl;
    abuf = (void*)malloc(N*N*sizeof(DoubleComplex));
    bbuf = (void*)malloc(N*N*sizeof(DoubleComplex));
    cbuf = (void*)malloc(N*N*sizeof(DoubleComplex));
    if(me==0) printf("Double Complex:   Testing GA_Zgemm,NGA_Matmul_patch for %d-Dimension", ndim);
    break;
  case C_SCPL:
    alpha  = (void *)&alpha_scpl;
    beta   = (void *)&beta_scpl;
    abuf = (void*)malloc(N*N*sizeof(SingleComplex));
    bbuf = (void*)malloc(N*N*sizeof(SingleComplex));
    cbuf = (void*)malloc(N*N*sizeof(SingleComplex));
    if(me==0) printf("Single Complex:   Testing GA_Cgemm,NGA_Matmul_patch for %d-Dimension", ndim);
    break;
  default:
    GA_Error("wrong data type", data_type);
  }

  if (me==0) printf("\nCreate A, B, C\n");
#ifdef USE_REGULAR
  g_a = NGA_Create(data_type, ndim, dims, "array A", NULL);
#endif
#ifdef USE_SIMPLE_CYCLIC
  g_a = NGA_Create_handle();
  NGA_Set_data(g_a,ndim,dims,data_type);
  NGA_Set_array_name(g_a,"array A");
  NGA_Set_block_cyclic(g_a,block_size);
  if (!GA_Allocate(g_a)) {
    GA_Error("Failed: create: g_a",40);
  }
#endif
#ifdef USE_SCALAPACK
  g_a = NGA_Create_handle();
  NGA_Set_data(g_a,ndim,dims,data_type);
  NGA_Set_array_name(g_a,"array A");
  grid_factor(nproc,&i,&j);
  proc_grid[0] = i;
  proc_grid[1] = j;
  NGA_Set_block_cyclic_proc_grid(g_a,block_size,proc_grid);
  if (!GA_Allocate(g_a)) {
    GA_Error("Failed: create: g_a",40);
  }
#endif
#ifdef USE_TILED
  g_a = NGA_Create_handle();
  NGA_Set_data(g_a,ndim,dims,data_type);
  NGA_Set_array_name(g_a,"array A");
  grid_factor(nproc,&i,&j);
  proc_grid[0] = i;
  proc_grid[1] = j;
  NGA_Set_tiled_proc_grid(g_a,block_size,proc_grid);
  if (!GA_Allocate(g_a)) {
    GA_Error("Failed: create: g_a",40);
  }
#endif
  g_b = GA_Duplicate(g_a, "array B");  
  g_c = GA_Duplicate(g_a, "array C");
  if(!g_a || !g_b || !g_c) GA_Error("Create failed: a, b or c",1);

  ld = N;
  if (me==0) printf("\nInitialize A\n");
  /* Set up matrix A */
  if (me == 0) {
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)abuf)[i*N+j] = (float)(i*N+j);
            break;
          case C_DBL:
            ((double*)abuf)[i*N+j] = (double)(i*N+j);
            break;
          case C_DCPL:
            ((DoubleComplex*)abuf)[i*N+j].real = (double)(i*N+j);
            ((DoubleComplex*)abuf)[i*N+j].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)abuf)[i*N+j].real = (float)(i*N+j);
            ((SingleComplex*)abuf)[i*N+j].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_a,lo,hi,abuf,&ld);
  }
  GA_Sync();

  if (me==0) printf("\nInitialize B\n");
  /* Set up matrix B */
  if (me == 0) {
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)bbuf)[i*N+j] = (float)(j*N+i);
            break;
          case C_DBL:
            ((double*)bbuf)[i*N+j] = (double)(j*N+i);
            break;
          case C_DCPL:
            ((DoubleComplex*)bbuf)[i*N+j].real = (double)(j*N+i);
            ((DoubleComplex*)bbuf)[i*N+j].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)bbuf)[i*N+j].real = (float)(j*N+i);
            ((SingleComplex*)bbuf)[i*N+j].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_b,lo,hi,bbuf,&ld);
  }
  GA_Sync();

  if (me==0) printf("\nPerform matrix multiply\n");
  switch (data_type) {
    case C_FLOAT:
      NGA_Matmul_patch('N','N',&alpha_flt,&beta_flt,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DBL:
      NGA_Matmul_patch('N','N',&alpha_dbl,&beta_dbl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_SCPL:
      NGA_Matmul_patch('N','N',&alpha_scpl,&beta_scpl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DCPL:
      NGA_Matmul_patch('N','N',&alpha_dcpl,&beta_dcpl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  GA_Sync();
#if 0
  if (me==0) printf("\nCheck answer\n");
  /*
  GA_Print(g_a);
  if (me == 0) printf("\n\n\n\n");
  GA_Print(g_b);
  if (me == 0) printf("\n\n\n\n");
  GA_Print(g_c); 
  */

  /* Check answer */
  NGA_Get(g_a,lo,hi,abuf,&ld);
  NGA_Get(g_b,lo,hi,bbuf,&ld);
  for (i=0; i<N; i++) {
    for (j=0; j<N; j++) {
      switch (data_type) {
        case C_FLOAT:
          ((float*)cbuf)[i*N+j] = fzero;
          break;
        case C_DBL:
          ((double*)cbuf)[i*N+j] = dzero;
          break;
        case C_DCPL:
          ((DoubleComplex*)cbuf)[i*N+j] = zzero;
          break;
        case C_SCPL:
          ((SingleComplex*)cbuf)[i*N+j] = czero;
          break;
        default:
          GA_Error("wrong data type", data_type);
      }
      for (k=0; k<N; k++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)cbuf)[i*N+j] += ((float*)abuf)[i*N+k]
              *((float*)bbuf)[k*N+j];
            break;
          case C_DBL:
            ((double*)cbuf)[i*N+j] += ((double*)abuf)[i*N+k]
              *((double*)bbuf)[k*N+j];
            break;
          case C_DCPL:
            ((DoubleComplex*)cbuf)[i*N+j].real +=
              (((DoubleComplex*)abuf)[i*N+k].real
               *((DoubleComplex*)bbuf)[k*N+j].real
               -(((DoubleComplex*)abuf)[i*N+k].imag
                 *((DoubleComplex*)bbuf)[k*N+j].imag));
            ((DoubleComplex*)cbuf)[i*N+j].imag +=
              (((DoubleComplex*)abuf)[i*N+k].real
               *((DoubleComplex*)bbuf)[k*N+j].imag
               +(((DoubleComplex*)abuf)[i*N+k].imag
                 *((DoubleComplex*)bbuf)[k*N+j].real));
            break;
          case C_SCPL:
            ((SingleComplex*)cbuf)[i*N+j].real +=
              (((SingleComplex*)abuf)[i*N+k].real
               *((SingleComplex*)bbuf)[k*N+j].real
               -(((SingleComplex*)abuf)[i*N+k].imag
                 *((SingleComplex*)bbuf)[k*N+j].imag));
            ((SingleComplex*)cbuf)[i*N+j].imag +=
              (((SingleComplex*)abuf)[i*N+k].real
               *((SingleComplex*)bbuf)[k*N+j].imag
               +(((SingleComplex*)abuf)[i*N+k].imag
                 *((SingleComplex*)bbuf)[k*N+j].real));
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
  }
  GA_Sync();
  if (me == 0) {
    NGA_Get(g_c,lo,hi,abuf,&ld);
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        switch (data_type) {
          case C_FLOAT:
            fdiff = ((float*)abuf)[i*N+j]-((float*)cbuf)[i*N+j];
            if (((float*)abuf)[i*N+j] != 0.0) {
              fdiff /= ((float*)abuf)[i*N+j];
            }
            if (fabs(fdiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: %f Expected: %f\n",me,i,j,
                  ((float*)abuf)[i*N+j],((float*)cbuf)[i*N+j]);
            }
            break;
          case C_DBL:
            ddiff = ((double*)abuf)[i*N+j]-((double*)cbuf)[i*N+j];
            if (((double*)abuf)[i*N+j] != 0.0) {
              ddiff /= ((double*)abuf)[i*N+j];
            }
            if (fabs(ddiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: %f Expected: %f\n",me,i,j,
                  ((double*)abuf)[i*N+j],((double*)cbuf)[i*N+j]);
            }
            break;
          case C_DCPL:
            zdiff.real = ((DoubleComplex*)abuf)[i*N+j].real
              -((DoubleComplex*)cbuf)[i*N+j].real;
            zdiff.imag = ((DoubleComplex*)abuf)[i*N+j].imag
              -((DoubleComplex*)cbuf)[i*N+j].imag;
            if (((DoubleComplex*)abuf)[i*N+j].real != 0.0 ||
                ((DoubleComplex*)abuf)[i*N+j].imag != 0.0) {
              ztmp = ((DoubleComplex*)abuf)[i*N+j];
              ddiff = sqrt((zdiff.real*zdiff.real+zdiff.imag*zdiff.imag)
                  /(ztmp.real*ztmp.real+ztmp.imag*ztmp.imag));
            } else {
              ddiff = sqrt(zdiff.real*zdiff.real+zdiff.imag*zdiff.imag);
            }
            if (fabs(ddiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: (%f,%f) Expected: (%f,%f)\n",me,i,j,
                  ((DoubleComplex*)abuf)[i*N+j].real,
                  ((DoubleComplex*)abuf)[i*N+j].imag,
                  ((DoubleComplex*)cbuf)[i*N+j].real,
                  ((DoubleComplex*)cbuf)[i*N+j].imag);
            }
            break;
          case C_SCPL:
            cdiff.real = ((SingleComplex*)abuf)[i*N+j].real
              -((SingleComplex*)cbuf)[i*N+j].real;
            cdiff.imag = ((SingleComplex*)abuf)[i*N+j].imag
              -((SingleComplex*)cbuf)[i*N+j].imag;
            if (((SingleComplex*)abuf)[i*N+j].real != 0.0 ||
                ((SingleComplex*)abuf)[i*N+j].imag != 0.0) {
              ctmp = ((SingleComplex*)abuf)[i*N+j];
              fdiff = sqrt((cdiff.real*cdiff.real+cdiff.imag*cdiff.imag)
                  /(ctmp.real*ctmp.real+ctmp.imag*ctmp.imag));
            } else {
              fdiff = sqrt(cdiff.real*cdiff.real+cdiff.imag*cdiff.imag);
            }
            if (fabs(fdiff) > TOLERANCE) {
              printf("p[%d] [%d,%d] Actual: (%f,%f) Expected: (%f,%f)\n",me,i,j,
                  ((SingleComplex*)abuf)[i*N+j].real,
                  ((SingleComplex*)abuf)[i*N+j].imag,
                  ((SingleComplex*)cbuf)[i*N+j].real,
                  ((SingleComplex*)cbuf)[i*N+j].imag);
            }
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
  }
  GA_Sync();

  /* copy cbuf back to g_a */
  if (me == 0) {
    NGA_Put(g_a,lo,hi,cbuf,&ld);
  }
  GA_Sync();

  /* Get norm of g_a */
  switch (data_type) {
    case C_FLOAT:
      ftmp = GA_Fdot(g_a,g_a);
      break;
    case C_DBL:
      dtmp = GA_Ddot(g_a,g_a);
      break;
    case C_DCPL:
      ztmp = GA_Zdot(g_a,g_a);
      break;
    case C_SCPL:
      ctmp = GA_Cdot(g_a,g_a);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  /* subtract C from A and put the results in B */
  beta_flt = -1.0;
  beta_dbl = -1.0;
  beta_scpl.real = -1.0;
  beta_dcpl.real = -1.0;
  GA_Zero(g_b);
  GA_Add(alpha,g_a,beta,g_c,g_b);
  /* evaluate the norm of the difference between the two matrices */
  switch (data_type) {
    case C_FLOAT:
      fdiff = GA_Fdot(g_b, g_b);
      if (ftmp != 0.0) {
        fdiff /= ftmp;
      }
      if(fabs(fdiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(fdiff), TOLERANCE);
        GA_Error("GA_Sgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Sgemm OK\n\n");
      }
      break;
    case C_DBL:
      ddiff = GA_Ddot(g_b, g_b);
      if (dtmp != 0.0) {
        ddiff /= dtmp;
      }
      if(fabs(ddiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(ddiff), TOLERANCE);
        GA_Error("GA_Dgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Dgemm OK\n\n");
      }
      break;
    case C_DCPL:
      zdiff = GA_Zdot(g_b, g_b);
      if (ztmp.real != 0.0 || ztmp.imag != 0.0) {
        ddiff = sqrt((zdiff.real*zdiff.real+zdiff.imag*zdiff.imag)
            /(ztmp.real*ztmp.real+ztmp.imag*ztmp.imag));
      } else {
        ddiff = sqrt(zdiff.real*zdiff.real+zdiff.imag*zdiff.imag);
      }
      if(fabs(ddiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(zdiff.real), TOLERANCE);
        GA_Error("GA_Zgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Zgemm OK\n\n");
      }
      break;
    case C_SCPL:
      cdiff = GA_Cdot(g_b, g_b);
      if (ctmp.real != 0.0 || ctmp.imag != 0.0) {
        fdiff = sqrt((cdiff.real*cdiff.real+cdiff.imag*cdiff.imag)
            /(ctmp.real*ctmp.real+ctmp.imag*ctmp.imag));
      } else {
        fdiff = sqrt(cdiff.real*cdiff.real+cdiff.imag*cdiff.imag);
      }
      if(fabs(fdiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(cdiff.real), TOLERANCE);
        GA_Error("GA_Cgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Cgemm OK\n\n");
      }
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
#endif

  free(abuf);
  free(bbuf);
  free(cbuf);

  switch (data_type) {
  case C_FLOAT:
    abuf = (void*)malloc(N*N*sizeof(float)/4);
    bbuf = (void*)malloc(N*N*sizeof(float)/4);
    cbuf = (void*)malloc(N*N*sizeof(float)/4);
    break;      
  case C_DBL:
    abuf = (void*)malloc(N*N*sizeof(double)/4);
    bbuf = (void*)malloc(N*N*sizeof(double)/4);
    cbuf = (void*)malloc(N*N*sizeof(double)/4);
    break;    
  case C_DCPL:
    abuf = (void*)malloc(N*N*sizeof(DoubleComplex)/4);
    bbuf = (void*)malloc(N*N*sizeof(DoubleComplex)/4);
    cbuf = (void*)malloc(N*N*sizeof(DoubleComplex)/4);
    break;
  case C_SCPL:
    abuf = (void*)malloc(N*N*sizeof(SingleComplex)/4);
    bbuf = (void*)malloc(N*N*sizeof(SingleComplex)/4);
    cbuf = (void*)malloc(N*N*sizeof(SingleComplex)/4);
    break;
  default:
    GA_Error("wrong data type", data_type);
  }

  /* Test multiply on a fraction of matrix. Start by reinitializing
   * A and B */
  GA_Zero(g_a);
  GA_Zero(g_b);
  GA_Zero(g_c);

  if (me==0) printf("\nTest patch multiply\n");

  lo[0] = N/4;
  lo[1] = N/4;
  hi[0] = 3*N/4-1;
  hi[1] = 3*N/4-1;
  ld = N/2;

  /* Set up matrix A */
  if (me==0) printf("\nInitialize A\n");
  if (me == 0) {
    for (i=N/4; i<3*N/4; i++) {
      for (j=N/4; j<3*N/4; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)abuf)[(i-N/4)*N/2+(j-N/4)] = (float)(i*N+j);
            break;
          case C_DBL:
            ((double*)abuf)[(i-N/4)*N/2+(j-N/4)] = (double)(i*N+j);
            break;
          case C_DCPL:
            ((DoubleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].real = (double)(i*N+j);
            ((DoubleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].real = (float)(i*N+j);
            ((SingleComplex*)abuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_a,lo,hi,abuf,&ld);
  }
  GA_Sync();

  if (me==0) printf("\nInitialize B\n");
  /* Set up matrix B */
  if (me == 0) {
    for (i=N/4; i<3*N/4; i++) {
      for (j=N/4; j<3*N/4; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)bbuf)[(i-N/4)*N/2+(j-N/4)] = (float)(j*N+i);
            break;
          case C_DBL:
            ((double*)bbuf)[(i-N/4)*N/2+(j-N/4)] = (double)(j*N+i);
            break;
          case C_DCPL:
            ((DoubleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].real = (double)(j*N+i);
            ((DoubleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          case C_SCPL:
            ((SingleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].real = (float)(j*N+i);
            ((SingleComplex*)bbuf)[(i-N/4)*N/2+(j-N/4)].imag = 1.0;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
      }
    }
    NGA_Put(g_b,lo,hi,bbuf,&ld);
  }
  GA_Sync();

  beta_flt = 0.0;
  beta_dbl = 0.0;
  beta_scpl.real = 0.0;
  beta_dcpl.real = 0.0;
  if (me==0) printf("\nPerform matrix multiply on sub-blocks\n");
  switch (data_type) {
    case C_FLOAT:
      NGA_Matmul_patch('N','N',&alpha_flt,&beta_flt,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DBL:
      NGA_Matmul_patch('N','N',&alpha_dbl,&beta_dbl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_SCPL:
      NGA_Matmul_patch('N','N',&alpha_scpl,&beta_scpl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    case C_DCPL:
      NGA_Matmul_patch('N','N',&alpha_dcpl,&beta_dcpl,g_a,lo,hi,
        g_b,lo,hi,g_c,lo,hi);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  GA_Sync();
#if 0
  if (0) {
  /*
  if (data_type != C_SCPL && data_type != C_DCPL) {
  */

  if (me==0) printf("\nCheck answer\n");

  /* Multiply buffers by hand */
  if (me == 0) {
    for (i=0; i<N/2; i++) {
      for (j=0; j<N/2; j++) {
        switch (data_type) {
          case C_FLOAT:
            ((float*)cbuf)[i*N/2+j] = fzero;
            break;
          case C_DBL:
            ((double*)cbuf)[i*N/2+j] = dzero;
            break;
          case C_DCPL:
            ((DoubleComplex*)cbuf)[i*N/2+j] = zzero;
            break;
          case C_SCPL:
            ((SingleComplex*)cbuf)[i*N/2+j] = czero;
            break;
          default:
            GA_Error("wrong data type", data_type);
        }
        for (k=0; k<N/2; k++) {
          switch (data_type) {
            case C_FLOAT:
              ((float*)cbuf)[i*N/2+j] += ((float*)abuf)[i*N/2+k]
                *((float*)bbuf)[k*N/2+j];
              break;
            case C_DBL:
              ((double*)cbuf)[i*N/2+j] += ((double*)abuf)[i*N/2+k]
                *((double*)bbuf)[k*N/2+j];
              break;
            case C_DCPL:
              ((DoubleComplex*)cbuf)[i*N/2+j].real +=
                (((DoubleComplex*)abuf)[i*N/2+k].real
                 *((DoubleComplex*)bbuf)[k*N/2+j].real
                 -(((DoubleComplex*)abuf)[i*N/2+k].imag
                   *((DoubleComplex*)bbuf)[k*N/2+j].imag));
              ((DoubleComplex*)cbuf)[i*N/2+j].imag +=
                (((DoubleComplex*)abuf)[i*N/2+k].real
                 *((DoubleComplex*)bbuf)[k*N/2+j].imag
                 +(((DoubleComplex*)abuf)[i*N/2+k].imag
                   *((DoubleComplex*)bbuf)[k*N/2+j].real));
              break;
            case C_SCPL:
              ((SingleComplex*)cbuf)[i*N/2+j].real +=
                (((SingleComplex*)abuf)[i*N/2+k].real
                 *((SingleComplex*)bbuf)[k*N/2+j].real
                 -(((SingleComplex*)abuf)[i*N/2+k].imag
                   *((SingleComplex*)bbuf)[k*N/2+j].imag));
              ((SingleComplex*)cbuf)[i*N/2+j].imag +=
                (((SingleComplex*)abuf)[i*N/2+k].real
                 *((SingleComplex*)bbuf)[k*N/2+j].imag
                 +(((SingleComplex*)abuf)[i*N/2+k].imag
                   *((SingleComplex*)bbuf)[k*N/2+j].real));
              break;
            default:
              GA_Error("wrong data type", data_type);
          }
        }
      }
    }
    NGA_Put(g_a,lo,hi,cbuf,&ld);
  }
  if (me == 0) printf("\n\n\n\n");

  /* Get norm of g_a */
  switch (data_type) {
    case C_FLOAT:
      ftmp = NGA_Fdot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
      break;
    case C_DBL:
      dtmp = NGA_Ddot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
      break;
    case C_DCPL:
      ztmp = NGA_Zdot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
      break;
    case C_SCPL:
      ctmp = NGA_Cdot_patch(g_a,'N',lo,hi,g_a,'N',lo,hi);
      break;
    default:
      GA_Error("wrong data type", data_type);
  }
  /* subtract C from A and put the results in B */
  beta_flt = -1.0;
  beta_dbl = -1.0;
  beta_scpl.real = -1.0;
  beta_dcpl.real = -1.0;
  NGA_Zero_patch(g_b,lo,hi);
  NGA_Add_patch(alpha,g_a,lo,hi,beta,g_c,lo,hi,g_b,lo,hi);
  /* evaluate the norm of the difference between the two matrices */
  switch (data_type) {
    case C_FLOAT:
      fdiff = NGA_Fdot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
      if (ftmp != 0.0) {
        fdiff /= ftmp;
      }
      if(fabs(fdiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(fdiff), TOLERANCE);
        GA_Error("GA_Sgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Sgemm OK\n\n");
      }
      break;
    case C_DBL:
      ddiff = NGA_Ddot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
      if (dtmp != 0.0) {
        ddiff /= dtmp;
      }
      if(fabs(ddiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(ddiff), TOLERANCE);
        GA_Error("GA_Dgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Dgemm OK\n\n");
      }
      break;
    case C_DCPL:
      zdiff = NGA_Zdot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
      if (ztmp.real != 0.0 || ztmp.imag != 0.0) {
        ddiff = sqrt((zdiff.real*zdiff.real+zdiff.imag*zdiff.imag)
            /(ztmp.real*ztmp.real+ztmp.imag*ztmp.imag));
      } else {
        ddiff = sqrt(zdiff.real*zdiff.real+zdiff.imag*zdiff.imag);
      }
      if(fabs(ddiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(zdiff.real), TOLERANCE);
        GA_Error("GA_Zgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Zgemm OK\n\n");
      }
      break;
    case C_SCPL:
      cdiff = NGA_Cdot_patch(g_b,'N',lo,hi,g_b,'N',lo,hi);
      if (ctmp.real != 0.0 || ctmp.imag != 0.0) {
        fdiff = sqrt((cdiff.real*cdiff.real+cdiff.imag*cdiff.imag)
            /(ctmp.real*ctmp.real+ctmp.imag*ctmp.imag));
      } else {
        fdiff = sqrt(cdiff.real*cdiff.real+cdiff.imag*cdiff.imag);
      }
      if(fabs(fdiff) > TOLERANCE) {
        printf("\nabs(result) = %f > %f\n", fabsf(cdiff.real), TOLERANCE);
        GA_Error("GA_Cgemm Failed", 1);
      } else if (me == 0) {
        printf("\nGA_Cgemm OK\n\n");
      }
      break;
    default:
      GA_Error("wrong data type", data_type);
  }

  }
#endif
  free(abuf);
  free(bbuf);
  free(cbuf);

  GA_Destroy(g_a);
  GA_Destroy(g_b);
  GA_Destroy(g_c);
}

Пример #18

0

Показать файл

Файл: testmatmultc.c Проект: sg0/Elemental

/*
 * test ga_dgemm
 * Note: - change nummax for large arrays
 *       - turn off "dgemm_verify" for large arrays due to memory 
 *         limitations, as dgemm_verify=1 for large arrays produces 
 *         segfault, dumps core,or any crap.
 */
int main(int argc, char **argv)
{
    int num_m;
    int num_n;
    int num_k;
    int i;
    int ii;
    double *h0;
    int g_c;
    int g_b;
    int g_a;
    double a;
    double t1;
    double mf;
    double avg_t[ntrans];
    double avg_mf[ntrans];
    int itime;
    int ntimes;
    int nums_m[/*howmany*/] = {512,1024};
    int nums_n[/*howmany*/] = {512,1024};
    int nums_k[/*howmany*/] = {512,1024};
    char transa[/*ntrans*/] = "ntnt";
    char transb[/*ntrans*/] = "nntt";
    char ta;
    char tb;
    double *tmpa;
    double *tmpb;
    double *tmpc;
    int ndim;
    int dims[2];
#ifdef BLOCK_CYCLIC
    int block_size[2];
#endif

#if defined(USE_ELEMENTAL)
    // initialize Elemental (which will initialize MPI)
    ElInitialize( &argc, &argv );
    ElMPICommRank( MPI_COMM_WORLD, &me );
    ElMPICommSize( MPI_COMM_WORLD, &nproc );
    // instantiate el::global array
    ElGlobalArraysConstruct_d( &eldga );
    // initialize global arrays
    ElGlobalArraysInitialize_d( eldga );
#else
    MP_INIT(argc,argv);
    if (!MA_init(MT_DBL,1,20000000)) {
        GA_Error("failed: ma_init(MT_DBL,1,20000000)",10);
    }
    GA_INIT(argc,argv);
    me = GA_Nodeid();
#endif

    h0 = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpa = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpb = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpc = (double*)malloc(sizeof(double) * nummax*nummax);

    ii = 0;
    for (i=0; i<nummax*nummax; i++) {
        ii = ii + 1;
        if (ii > nummax) {
            ii = 0;
        }
        h0[i] = ii;
    }

    /* Compute times assuming 500 mflops and 5 second target time */
    /* ntimes = max(3.0d0,5.0d0/(4.0d-9*num**3)); */
    ntimes = 5;

    for (ii=0; ii<howmany; ii++) {
        num_m = nums_m[ii];
        num_n = nums_n[ii];
        num_k = nums_k[ii];
        a = 0.5/(num_m*num_n);
        if (num_m > nummax || num_n > nummax || num_k > nummax) {
            GA_Error("Insufficient memory: check nummax", 1);
        }

#ifndef BLOCK_CYCLIC
        ndim = 2;

	/*
        dims[0] = num_m;
        dims[1] = num_n;
	*/
        dims[1] = num_m;
        dims[0] = num_n;

#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_c", NULL, &g_c );
#else
        if (!((g_c = NGA_Create(MT_DBL,ndim,dims,"g_c",NULL)))) {
            GA_Error("failed: create g_c",20);
        }
#endif
	/*
        dims[0] = num_k;
        dims[1] = num_n;
	*/
        dims[1] = num_k;
        dims[0] = num_n;
#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_b", NULL, &g_b );
#else
        if (!((g_b = NGA_Create(MT_DBL,ndim,dims,"g_b",NULL)))) {
            GA_Error("failed: create g_b",30);
        }
#endif
	/*
        dims[0] = num_m;
        dims[1] = num_k;
	*/
        dims[1] = num_m;
        dims[0] = num_k;
#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_a", NULL, &g_a );
#else
        if (!((g_a = NGA_Create(MT_DBL,ndim,dims,"g_a",NULL)))) {
            GA_Error("failed: create g_a",40);
        }
#endif
#else
        ndim = 2;
        block_size[0] = 128;
        block_size[1] = 128;

        dims[0] = num_m;
        dims[1] = num_n;
        g_c = GA_Create_handle();
        GA_Set_data(g_c,ndim,dims,MT_DBL);
        GA_Set_array_name(g_c,"g_c");
        GA_Set_block_cyclic(g_c,block_size);
        if (!GA_Allocate(g_c)) {
            GA_Error("failed: create g_c",40);
        }

        dims[0] = num_k;
        dims[1] = num_n;
        g_b = GA_Create_handle();
        GA_Set_data(g_b,ndim,dims,MT_DBL);
        GA_Set_array_name(g_b,"g_b");
        GA_Set_block_cyclic(g_b,block_size);
        if (!ga_allocate(g_b)) {
            GA_Error("failed: create g_b",40);
        }

        dims[0] = num_m;
        dims[1] = num_k;
        g_a = GA_Create_handle();
        GA_Set_data(g_a,ndim,dims,MT_DBL);
        GA_Set_array_name(g_a,"g_a");
        GA_Set_block_cyclic(g_a,block_size);
        if (!ga_allocate(g_a)) {
            GA_Error('failed: create g_a',40);
        }
#endif         

        /* Initialize matrices A and B */
        if (me == 0) { 
            load_ga(g_a, h0, num_m, num_k);
            load_ga(g_b, h0, num_k, num_n);
        }
#if defined(USE_ELEMENTAL)
        double zero = 0.0;
        ElGlobalArraysFill_d( eldga, g_c, &zero );
	ElGlobalArraysSync_d( eldga );
#else
        GA_Zero(g_c);
        GA_Sync();
#endif
#if defined(USE_ELEMENTAL)
        if (me == 0) {
#else
        if (GA_Nodeid() == 0) {
#endif
            printf("\nMatrix Multiplication on C = A[%ld,%ld]xB[%ld,%ld]\n",
                    (long)num_m, (long)num_k, (long)num_k, (long)num_n);
            fflush(stdout);
        }

        for (i=0; i<ntrans; i++) {
            avg_t[i]  = 0.0;
            avg_mf[i] = 0.0;
        }

        for (itime=0; itime<ntimes; itime++) {
            for (i=0; i<ntrans; i++) {
#if defined(USE_ELEMENTAL)
	        ElGlobalArraysSync_d( eldga );
#else
                GA_Sync();
#endif
                ta = transa[i];
                tb = transb[i];
                t1 = MP_TIMER();
#if defined(USE_ELEMENTAL)
		ElGlobalArraysDgemm_d( eldga, ta, tb, num_m, num_n, num_k, 1.0, g_a, g_b, 0.0, g_c );
#else
                GA_Dgemm(ta,tb,num_m,num_n,num_k,1.0, g_a, g_b, 0.0, g_c);
#endif
                t1 = MP_TIMER() - t1;
#if defined(USE_ELEMENTAL)
                if (me == 0) {
#else
                if (GA_Nodeid() == 0) {
#endif
#if defined(USE_ELEMENTAL)
                    mf = 2e0*num_m*num_n*num_k/t1*1e-6/nproc;
#else
                    mf = 2e0*num_m*num_n*num_k/t1*1e-6/GA_Nnodes();
#endif
                    avg_t[i]  = avg_t[i]+t1;
                    avg_mf[i] = avg_mf[i] + mf;
                    printf("%15s%2d: %12.4f seconds %12.1f mflops/proc  %c %c\n",
                            "Run#", itime, t1, mf, ta, tb);
                    fflush(stdout);
                    if (dgemm_verify && itime == 0) {
                        /* recall the C API swaps the matrix order */
                        /* we swap it here for the Fortran-based verify */
                        verify_ga_dgemm(tb, ta, num_n, num_m, num_k, 1.0,
                                g_b, g_a, 0.0, g_c, tmpb, tmpa, tmpc);
                    }
                }
            }
        }
#if defined(USE_ELEMENTAL)
        if (me == 0) {
#else
        if (GA_Nodeid() == 0) {
#endif
            printf("\n");
            for (i=0; i<ntrans; i++) {
                printf("%17s: %12.4f seconds %12.1f mflops/proc  %c %c\n",
                        "Average", avg_t[i]/ntimes, avg_mf[i]/ntimes,
                        transa[i], transb[i]);
            }
            if(dgemm_verify) {
                printf("All GA_Dgemms are verified...O.K.\n");
            }
            fflush(stdout);
        }

        /*
           GA_Print(g_a);
           GA_Print(g_b);
           GA_Print(g_c);
           */
#if defined(USE_ELEMENTAL)
        ElGlobalArraysDestroy_d( eldga, g_a );
        ElGlobalArraysDestroy_d( eldga, g_b );
        ElGlobalArraysDestroy_d( eldga, g_c );
#else
        GA_Destroy(g_c);
        GA_Destroy(g_b);
        GA_Destroy(g_a);
#endif
    }

    /* ???
       format(a15, i2, ': ', e12.4, ' seconds ',f12.1, 
       .     ' mflops/proc ', 3a2)
       */
#if defined(USE_ELEMENTAL)
    if (me == 0) {
#else
    if (GA_Nodeid() == 0) {
#endif
        printf("All tests successful\n");
    }

    free(h0);
    free(tmpa);
    free(tmpb);
    free(tmpc);
#if defined(USE_ELEMENTAL)
    // call el::global arrays destructor
    ElGlobalArraysTerminate_d( eldga );
    ElGlobalArraysDestruct_d( eldga );
    ElFinalize();
#else
    GA_Terminate();
    MP_FINALIZE();
#endif
    return 0;
}


/*
 * Verify for correctness. Process 0 computes BLAS dgemm 
 * locally. For larger arrays, disbale this test as memory
 * might not be sufficient
 */
void verify_ga_dgemm(char xt1, char xt2, int num_m, int num_n, int num_k,
        double alpha, int g_a, int g_b, double beta, int g_c,
        double *tmpa, double *tmpb, double *tmpc)
{
    int i,j,type,ndim,dims[2],lo[2],hi[2];
    double abs_value;

    for (i=0; i<num_n; i++) {
        for (j=0; j<num_m; j++) {
            tmpc[j+i*num_m] = -1.0;
            tmpa[j+i*num_m] = -2.0;
        }
    }

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_a, lo, hi, tmpa, &dims[1] );
#else
    NGA_Get(g_a, lo, hi, tmpa, &dims[1]);
#endif

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_b, lo, hi, tmpb, &dims[1] );
#else
    NGA_Get(g_b, lo, hi, tmpb, &dims[1]);
#endif

    /* compute dgemm sequentially */
#if defined(USE_ELEMENTAL)
    cblas_dgemm ( CblasRowMajor, ( xt1 == 'n'? CblasNoTrans: CblasTrans ), 
	    ( xt2 == 'n'? CblasNoTrans: CblasTrans ), 
	    num_m /* M */, num_n /* N */, num_k /* K */, 
	    alpha, tmpa, num_m, /* lda */ 
	    tmpb, num_k, /* ldb */ beta, 
	    tmpc, num_m /* ldc */);
#else
    xb_dgemm(&xt1, &xt2, &num_m, &num_n, &num_k,
            &alpha, tmpa, &num_m,
            tmpb, &num_k, &beta,
            tmpc, &num_m);
#endif

    /* after computing c locally, verify it with the values in g_c */

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_c, lo, hi, tmpa, &dims[1] );
#else
    NGA_Get(g_c, lo, hi, tmpa, &dims[1]);
#endif

    for (i=0; i<num_n; i++) {
        for (j=0; j<num_m; j++) {
            abs_value = fabs(tmpc[j+i*num_m]-tmpa[j+i*num_m]);
            if(abs_value > 1.0 || abs_value < -1.0) {
                printf("Values are = %f %f\n",
                        tmpc[j+i*num_m], tmpa[j+i*num_m]);
                printf("Values are = %f %f\n", 
                        fabs(tmpc[j+i*num_m]-tmpa[j*i*num_m]), abs_value);
                fflush(stdout);
                GA_Error("verify ga_dgemm failed", 1);
            }
        }
    }
}

/**
 * called by process '0' (or your master process )
 */
void load_ga(int handle, double *f, int dim1, int dim2)
{
      int lo[2], hi[2];
      
      if (dim1 < 0 || dim2 < 0) {
          return;
      }

      lo[0] = 0;
      lo[1] = 0;
      hi[0] = dim1-1;
      hi[1] = dim2-1;
#if defined(USE_ELEMENTAL)
      ElGlobalArraysPut_d( eldga, handle, lo, hi, f, &dim1 );
#else
      NGA_Put(handle, lo, hi, f, &dim1);
#endif
}

Пример #19

0

Показать файл

Файл: matrix1.c Проект: MihaiLupoiu/TPP

void matrix_multiply() {
    
    int dims[NDIM], chunk[NDIM], ld[NDIM];
    int lo[NDIM], hi[NDIM], lo1[NDIM], hi1[NDIM];
    int lo2[NDIM], hi2[NDIM], lo3[NDIM], hi3[NDIM];
    int g_a, g_b, g_c, i, j, k, l;
    int me, nprocs;

    /* Find local processor ID and the number of processors */
    /* ### assign processor ID to the int variable "me" and the total number
     * ### of processors to the int variable "nprocs" */
    me     = GA_Nodeid();
    nprocs = GA_Nnodes();
    
    /* Configure array dimensions. Force an unequal data distribution */
    for(i=0; i<NDIM; i++) {
       dims[i]  = TOTALELEMS;
       ld[i]    = dims[i];
       chunk[i] = TOTALELEMS/nprocs-1; /*minimum block size on each process*/
    }
 
    /* create a global array g_a and duplicate it to get g_b and g_c*/
    /* ### create GA of doubles with dimension "NDIM" and size "dims" with
     * ### minimum block size "chunk" and assign the handle to the
     * ### integer variable "g_a". Then create remaining global arrays,
     * ### assigned to the integer handle "g_b" and "g_c" by duplicating
     * ### g_a. Assign the names "Array A", "Array B" and "Array C" to
     * ### "g_a", "g_b", and "g_c". */

    g_a=NGA_Create(C_DBL, NDIM, dims, "array A", chunk);
    

    if (!g_a) GA_Error("create failed: A", NDIM);
    if (me==0) printf("  Created Array A\n");

    g_b=GA_Duplicate(g_a,"array B");
    g_c=GA_Duplicate(g_a,"array C");
    if (!g_b || !g_c) GA_Error("duplicate failed",NDIM);
    if (me==0) printf("  Created Arrays B and C\n");
 
    /* initialize data in matrices a and b */
    if(me==0)printf("  Initializing matrix A and B\n");
    k = 0; l = 7;
    for(i=0; i<dims[0]; i++) {
       for(j=0; j<dims[1]; j++) {
          a[i][j] = (double)(++k%29);
          b[i][j] = (double)(++l%37);
       }
    }

    /*  Copy data to global arrays g_a and g_b */
    lo1[0] = 0;
    lo1[1] = 0;
    hi1[0] = dims[0]-1;
    hi1[1] = dims[1]-1;
    if (me==0) {
      /* ### copy the contents of array "a" into the portion of global array
       * ### "g_a" described by "lo1" and "hi1". Similarly, copy the contents
       * ### of the array "b" into corresponding portion of global array "g_b".
       * ### Use the array of strides "ld" to describe the physical layout of
       * ### arrays "a" and "b". */
      NGA_Put(g_a,lo1,hi1,a,ld);
      NGA_Put(g_b,lo1,hi1,b,ld);

    }
    
    /*  Synchronize all processors to make sure everyone has data */
    /* ### synchronize all processors */

    GA_Sync();


    /* Determine which block of data is locally owned. Note that
       the same block is locally owned for all GAs. */
    /* ### find out which block of data my node ("me") owns for the global
     * ### array "g_c" and store the contents in the integer arrays "lo" and
     * ### "hi". */
    NGA_Distribution(g_c,me,lo,hi);

    
    /* Get the blocks from g_a and g_b needed to compute this block in
       g_c and copy them into the local buffers a and b. */
    lo2[0] = lo[0];
    lo2[1] = 0;
    hi2[0] = hi[0];
    hi2[1] = dims[0]-1;
    /* ### copy the block of data described by the arrays "lo2" and "hi2" from
     * ### the global array "g_a" into the local array "a". Use the array of
     * ### strides "ld" to describe the physical layout of "a". */
    NGA_Get(g_a,lo2,hi2,a,ld);

    lo3[0] = 0;
    lo3[1] = lo[1];
    hi3[0] = dims[1]-1;
    hi3[1] = hi[1];
    /* ### copy the block of data described by the arrays "lo3" and "hi3" from
     * ### the global array "g_b" into the local array "b". Use the array of
     * ### strides "ld" to describe the physical layout of "b". */
     NGA_Get(g_b,lo3,hi3,b,ld);

    /* Do local matrix multiplication and store the result in local
       buffer c. Start by evaluating the transpose of b. */
    for(i=0; i < hi3[0]-lo3[0]+1; i++)
       for(j=0; j < hi3[1]-lo3[1]+1; j++) 
          btrns[j][i] = b[i][j];

    /* Multiply a and b to get c */
    for(i=0; i < hi[0] - lo[0] + 1; i++) {
       for(j=0; j < hi[1] - lo[1] + 1; j++) {
          c[i][j] = 0.0;
          for(k=0; k<dims[0]; k++)
             c[i][j] = c[i][j] + a[i][k]*btrns[j][k];
       }
    }
    
    /* Copy c back to g_c */
    /* ### copy data from the local array "c" into the block of the global
     * ### array "g_c" described by the integer arrays "lo" and "hi". Use
     * ### the array of strides "ld" to describe the physical layout of "c". */
    NGA_Put(g_c,lo,hi,c,ld);

    verify(g_a, g_b, g_c, lo1, hi1, ld);
    
    /* Deallocate arrays */
    /* ### destroy the global arrays "g_a", "g_b", "g_c" */
    GA_Destroy(g_a);
    GA_Destroy(g_b);
    GA_Destroy(g_c);
}

Пример #20

0

Показать файл

Файл: transp1D.c Проект: MihaiLupoiu/TPP

void TRANSPOSE1D() {
    
    int dims[MAXDIM], chunk[MAXDIM], ld[MAXDIM], lo[MAXDIM], hi[MAXDIM];
    int lo1[MAXDIM], hi1[MAXDIM], lo2[MAXDIM], hi2[MAXDIM];
    int g_a, g_b, a[MAXPROC*TOTALELEMS],b[MAXPROC*TOTALELEMS];
    int nelem, i;    
    int me, nprocs;

    /* Find local processor ID and number of processors */
    /* ### assign the local processor ID to the int variable "me"
     * ### and the total number of processors to the int variable
     * ### "nprocs" */
    
    me     = GA_Nodeid();
    nprocs = GA_Nnodes();

    /* Configure array dimensions. Force an unequal data distribution */
    dims[0]  = nprocs*TOTALELEMS + nprocs/2;
    ld[0]    = dims[0];
    chunk[0] = TOTALELEMS; /* minimum data on each process */
 
    /* create a global array g_a and duplicate it to get g_b */
    /* ### create GA of integers with dimension "NDIM" and size "dims" with
     * ### minimum block size "chunk" and assign the handle to the
     * ### integer variable "g_a". Then create a second global array
     * ### assigned to the integer handle "g_b" by duplicating "g_a".
     * ### Assign the names "Array A" and "Array B" to "g_a" and "g_b". */

    g_a=NGA_Create(C_INT, 1, dims, "array A", chunk);
    g_b=GA_Duplicate(g_a,"array B");

    if (!g_a) GA_Error("create failed: A", NDIM);
    if (me==0) printf("  Created Array A\n");
    
    if (! g_b) GA_Error("duplicate failed",NDIM);
    if (me==0) printf("  Created Array B\n");
 
    /* initialize data in g_a */
    if (me==0) {
       printf("  Initializing matrix A\n");
       for(i=0; i<dims[0]; i++) a[i] = i;
       lo[0]  = 0;
       hi[0] = dims[0]-1;
     /* ### copy the contents of array "a" into the portion of global array
      * ### "g_a" described by "lo" and "hi". Use the array of strides
      * ### "ld" to describe the physical layout of array "a". */

      NGA_Put(g_a,lo,hi,a,ld);

    }

    /* Synchronize all processors to guarantee that everyone has data
       before proceeding to the next step. */

    /* ### synchronize all processors */
    GA_Sync();

    /* Start initial phase of inversion by inverting the data held locally on
       each processor. Start by finding out which data each processor owns. */

    /* ### find out which block of data my node ("me") owns for the global
     * ### array "g_a" and store the contents in the integer arrays "lo1" and
     * ### "hi1". */

    NGA_Distribution(g_a,me,lo1,hi1);

    /* Get locally held data and copy it into local buffer a  */

    /* ### use the arrays "lo1" and "hi1" to copy the locally held block of data
     * ### from the global array "g_a" into the local array "a". Use the array
     * ### of strides "ld" to describe the physical layout of "a". */

    NGA_Get(g_a,lo1,hi1,a,ld);

    /* Invert data locally */
    nelem = hi1[0] - lo1[0] + 1;
    for (i=0; i<nelem; i++) b[i] = a[nelem-1-i];
    
    /* Invert data globally by copying locally inverted blocks into
     * their inverted positions in the GA */
    lo2[0] = dims[0] - hi1[0] -1;
    hi2[0] = dims[0] - lo1[0] -1;

    /* ### copy data from the local array "b" into the block of the global
     * ### array "g_b" described by the integer arrays "lo2" and "hi2". Use
     * ### the array of strides "ld" to describe the physical layout of "b". */

    NGA_Put(g_b,lo2,hi2,b,ld);

    /* Synchronize all processors to make sure inversion is complete */
    /* ### synchronize all processors */
    
    GA_Sync();

    /* Check to see if inversion is correct */
    if(me == 0) verify(g_a, g_b);
    
    /* Deallocate arrays */
    /* ### destroy global arrays "g_a" and "g_b" */
    GA_Destroy(g_a);
    GA_Destroy(g_b);
}

Пример #21

0

Показать файл

Файл: testc.c Проект: jeffhammond/ga

void do_work()
{
int ONE=1 ;   /* useful constants */
int g_a, g_b;
int n=N, type=MT_F_DBL;
int me=GA_Nodeid(), nproc=GA_Nnodes();
int i, row;
int dims[2]={N,N};
int lo[2], hi[2], ld;

/* Note: on all current platforms DoublePrecision == double */
double buf[N], err, alpha, beta;

     if(me==0)printf("Creating matrix A\n");
     g_a = NGA_Create(type, 2, dims, "A", NULL);
     if(!g_a) GA_Error("create failed: A",n); 
     if(me==0)printf("OK\n");

     if(me==0)printf("Creating matrix B\n");
     /* create matrix B  so that it has dims and distribution of A*/
     g_b = GA_Duplicate(g_a, "B");
     if(! g_b) GA_Error("duplicate failed",n); 
     if(me==0)printf("OK\n");

     GA_Zero(g_a);   /* zero the matrix */

     if(me==0)printf("Initializing matrix A\n");
     /* fill in matrix A with random values in range 0.. 1 */ 
     lo[1]=0; hi[1]=n-1;
     for(row=me; row<n; row+= nproc){
         /* each process works on a different row in MIMD style */
         lo[0]=hi[0]=row;   
         for(i=0; i<n; i++) buf[i]=sin((double)i + 0.1*(row+1));
         NGA_Put(g_a, lo, hi, buf, &n);
     }


     if(me==0)printf("Symmetrizing matrix A\n");
     GA_Symmetrize(g_a);   /* symmetrize the matrix A = 0.5*(A+A') */
   

     /* check if A is symmetric */ 
     if(me==0)printf("Checking if matrix A is symmetric\n");
     GA_Transpose(g_a, g_b); /* B=A' */
     alpha=1.; beta=-1.;
     GA_Add(&alpha, g_a, &beta, g_b, g_b);  /* B= A - B */
     err= GA_Ddot(g_b, g_b);
     
     if(me==0)printf("Error=%f\n",(double)err);
     
     if(me==0)printf("\nChecking atomic accumulate \n");

     GA_Zero(g_a);   /* zero the matrix */
     for(i=0; i<n; i++) buf[i]=(double)i;

     /* everybody accumulates to the same location/row */
     alpha = 1.0;
     row = n/2;
     lo[0]=hi[0]=row;
     lo[1]=0; hi[1]=n-1;
     ld = hi[1]-lo[1]+1;
     NGA_Acc(g_a, lo, hi, buf, &ld, &alpha );
     GA_Sync();

     if(me==0){ /* node 0 is checking the result */

        NGA_Get(g_a, lo, hi, buf,&ld);
        for(i=0; i<n; i++) if(buf[i] != (double)nproc*i)
           GA_Error("failed: column=",i);
        printf("OK\n\n");

     }
     
     GA_Destroy(g_a);
     GA_Destroy(g_b);
}

Пример #22

0

Показать файл

Файл: ntestc.c Проект: dmlb2000/nwchem-cml

void do_work()
{
int g_a, g_b;
int me=GA_Nodeid(), nproc=GA_Nnodes(), proc, loop;
int dims[NDIM], lo[NDIM], hi[NDIM], block[NDIM], ld[NDIM-1];
int i,d,*proclist, offset;
int adims[NDIM], ndim,type;
typedef struct {
       int lo[NDIM];
       int hi[NDIM];
} patch_t;
patch_t *regions;
int *map;
double *buf;

     /***** create array A with default distribution  *****/
     if(me==0){printf("Creating array A\n"); fflush(stdout);}
     for(i = 0; i<NDIM; i++)dims[i] = N*(i+1);
#ifdef NEW_API
     g_a = GA_Create_handle();
     GA_Set_data(g_a,NDIM,dims,MT_F_DBL);
     GA_Set_array_name(g_a,"array A");
     (void)GA_Allocate(g_a);
#else
     g_a = NGA_Create(MT_F_DBL, NDIM, dims, "array A", NULL);
#endif
     if(!g_a) GA_Error("create failed: A",0); 
     if(me==0)printf("OK\n\n");

     /* print info about array we got */
     NGA_Inquire(g_a, &type, &ndim, adims);
     GA_Print_distribution(g_a);

     GA_Sync();
     /* duplicate array A with ga_create irreg rather than ga_duplicate
      * -- want to show distribution control 
      * -- with ga_duplicate it would be g_b=GA_Duplicate(g_a,name) 
      */
     if(me==0)printf("\nReconstructing distribution description for A\n");

     /* get memory for arrays describing distribution */
     proclist = (int*)malloc(nproc*sizeof(int));
     if(!proclist)GA_Error("malloc failed for proclist",0);
     regions = (patch_t*)malloc(nproc*sizeof(patch_t));
     if(!regions)GA_Error("malloc failed for regions",0);
     map = (int*)malloc((nproc+ndim)*sizeof(int)); /* ubound= nproc+mdim */
     if(!map)GA_Error("malloc failed for map",0);

     /* first find out how array g_a is distributed */
     for(i=0;i<ndim;i++)lo[i]=BASE;
     for(i=0;i<ndim;i++)hi[i]=adims[i] -1 + BASE;
     proc = NGA_Locate_region(g_a, lo, hi, (int*)regions, proclist);
     if(proc<1) GA_Error("error in NGA_Locate_region",proc);

     /* determine blocking for each dimension */
     for(i=0;i<ndim;i++)block[i]=0;
     for(i=0;i<ndim;i++)adims[i]=0;
     
     offset =0;
     for(d=0; d<ndim; d++)
         for(i=0;i<proc;i++)
             if( regions[i].hi[d]>adims[d] ){
                map[offset] = regions[i].lo[d];
                offset++;
                block[d]++;
                adims[d]= regions[i].hi[d];
             }
            
     if(me==0){
        printf("Distribution map contains %d elements\n",offset); 
        print_subscript("number of blocks for each dimension",ndim,block,"\n");
        print_subscript("distribution map",offset,map,"\n\n");
        fflush(stdout);
     }
     
     if(me==0)printf("Creating array B applying distribution of A\n");

#    ifdef USE_DUPLICATE
       g_b = GA_Duplicate(g_a,"array B");
#    else
       g_b = NGA_Create_irreg(MT_F_DBL, NDIM, dims, "array B", block,map);
#    endif
     if(!g_b) GA_Error("create failed: B",0); 
     if(me==0)printf("OK\n\n");
     free(proclist); free(regions); free(map);
     
     GA_Print_distribution(g_b);

     GA_Sync();

     if(me==0){
       printf("\nCompare distributions of A and B\n");
       if(GA_Compare_distr(g_a,g_b))
          printf("Failure: distributions NOT identical\n");
       else 
          printf("Success: distributions identical\n");
       fflush(stdout);
     }
       

     if(me==0){
        printf("\nAccessing local elements of A: set them to the owner process id\n");
        fflush(stdout);
     }
     GA_Sync();

     NGA_Distribution(g_a,me,lo,hi);

     if(hi[0]>=0){/* -1 means no elements stored on this processor */
         double *ptr;
         int locdim[NDIM];
         NGA_Access(g_a, lo,hi, &ptr, ld);
         for(i=0;i<ndim;i++)locdim[i]=hi[i]-lo[i]+1;
         fill_patch(ptr, locdim, ld, ndim,(double)me);
     }

     for(i=0;i<nproc; i++){
       if(me==i && hi[0]>=0){
         char msg[100];
         sprintf(msg,"%d: leading dimensions",me);
         print_subscript(msg,ndim-1,ld,"\n");
         fflush(stdout);
       }
       GA_Sync();
     }
     
     GA_Sync();
     if(me==0)printf("\nRandomly checking the update using ga_get on array sections\n");
     GA_Sync();

     /* show ga_get working and verify array updates 
      * every process does N random gets
      * for simplicity get only a single row at a time
      */
     srand(me); /* different seed for every process */
     hi[ndim-1]=adims[ndim-1] -1 + BASE;
     for(i=1;i<ndim-1; i++)ld[i]=1; ld[ndim-2]=adims[ndim-1] -1 + BASE;

     /* get buffer memory */
     buf = (double*)malloc(adims[ndim-1]*sizeof(double));
     if(!buf)GA_Error("malloc failed for buf",0);

     /* half of the processes check the result */
     if(me<=nproc/2) 
     for(loop = 0; loop< N; loop++){ /* task parallel loop */
         lo[ndim-1]=BASE;
         for (i= 0; i < ndim -1; i ++){
              lo[i] = hi[i] = rand()%adims[i]+BASE; 
         }

         /* print_subscript("getting",ndim,lo,"\n");*/
         NGA_Get(g_a,lo,hi,buf,ld); 
         
         /* check values */
         for(i=0;i<adims[ndim-1]; i++){
             int p = NGA_Locate(g_a, lo);
             if((double)p != buf[i]) {
                char msg[100];
                sprintf(msg,"%d: wrong value: %d != %lf a",me, p, buf[i]);
                print_subscript(msg,ndim,lo,"\n");
                GA_Error("Error - bye",i);  
             }
             lo[ndim-1]++;
          }
     }
             
     free(buf);
     GA_Sync();
           
     if(me==0)printf("OK\n");
     
     GA_Destroy(g_a);
     GA_Destroy(g_b);
}

Пример #23

0

Показать файл

Файл: lock.c Проект: wadejong/NWChem-Json

int main(int argc, char **argv)
{
    int me;
    int nproc;
    int status;
    int g_a;
    int dims[NDIM];
    int chunk[NDIM];
    int pg_world;
    size_t num = 10;
    double *p1 = NULL;
    double *p2 = NULL;
    size_t i;
    int num_mutex;
    int lo[1];
    int hi[1];
    int ld[1]={1};
    MPI_Comm comm;

    MP_INIT(argc,argv);
    GA_INIT(argc,argv);

    me = GA_Nodeid();
    nproc = GA_Nnodes();
    comm = GA_MPI_Comm_pgroup_default();

    printf("%d: Hello world!\n",me);

    if (me==0) printf("%d: GA_Initialize\n",me);
    /*if (me==0) printf("%d: ARMCI_Init\n",me);*/
    /*ARMCI_Init();*/
    /*if (me==0) printf("%d: MA_Init\n",me);*/
    /*MA_init(MT_DBL, 8*1024*1024, 2*1024*1024);*/

    if (me==0) printf("%d: GA_Create_handle\n",me);
    g_a = GA_Create_handle();

    if (me==0) printf("%d: GA_Set_array_name\n",me);
    GA_Set_array_name(g_a,"test array A");

    dims[0] = 30;
    if (me==0) printf("%d: GA_Set_data\n",me);
    GA_Set_data(g_a,NDIM,dims,MT_DBL);

    chunk[0] = -1;
    if (me==0) printf("%d: GA_Set_chunk\n",me);
    GA_Set_chunk(g_a,chunk);

    if (me==0) printf("%d: GA_Pgroup_get_world\n",me);
    pg_world = GA_Pgroup_get_world();
    if (me==0) printf("%d: GA_Set_pgroup\n",me);
    GA_Set_pgroup(g_a,pg_world);

    if (me==0) printf("%d: GA_Allocate\n",me);
    status = GA_Allocate(g_a);
    if(0 == status) MPI_Abort(comm,100);

    if (me==0) printf("%d: GA_Zero\n",me);
    GA_Zero(g_a);

    if (me==0) printf("%d: GA_Sync\n",me);
    GA_Sync();

    num = 10;
    p1 = malloc(num*sizeof(double));
    /*double* p1 = ARMCI_Malloc_local(num*sizeof(double));*/
    if (p1==NULL) MPI_Abort(comm,1000);
    p2 = malloc(num*sizeof(double));
    /*double* p2 = ARMCI_Malloc_local(num*sizeof(double));*/
    if (p2==NULL) MPI_Abort(comm,2000);

    for ( i=0 ; i<num ; i++ ) p1[i] = 7.0;
    for ( i=0 ; i<num ; i++ ) p2[i] = 3.0;

    num_mutex = 17;
    status = GA_Create_mutexes(num_mutex);
    if (me==0) printf("%d: GA_Create_mutexes = %d\n",me,status);

/***************************************************************/
    if (me==0) {
        printf("%d: before GA_Lock\n",me);
        GA_Lock(0);
        lo[0] = 0;
        hi[0] = num-1;
        GA_Init_fence();
        NGA_Put(g_a,lo,hi,p1,ld);
        GA_Fence();
        GA_Unlock(0);
        printf("%d: after GA_Unlock\n",me);
    } 
    GA_Print(g_a);
    if (me==1) {
        printf("%d: before GA_Lock\n",me);
        GA_Lock(0);
        lo[0] = 0;
        hi[0] = num-1;
        GA_Init_fence();
        NGA_Get(g_a,lo,hi,p2,ld);
        GA_Fence();
        GA_Unlock(0);
        printf("%d: after GA_Unlock\n",me);
        for ( i=0 ; i<num ; i++ ) printf("p2[%2lu] = %20.10f\n",
                (long unsigned)i,p2[i]);
    }
/***************************************************************/



    status = GA_Destroy_mutexes();
    if (me==0) printf("%d: GA_Destroy_mutexes = %d\n",me,status);

    /*ARMCI_Free(p2);*/
    /*ARMCI_Free(p1);*/
    free(p2);
    free(p1);

    if (me==0) printf("%d: GA_Destroy\n",me);
    GA_Destroy(g_a);

    /*if (me==0) printf("%d: ARMCI_Finalize\n",me);*/
    /*ARMCI_Finalize();*/
    if (me==0) printf("%d: GA_Terminate\n",me);
    GA_Terminate();
    if (me==0) printf("%d: MPI_Finalize\n",me);
    MPI_Finalize();

    return(0);
}

Пример #24

0

Показать файл

Файл: big.c Проект: sg0/Elemental

// note: Sayan: brings down memory requirement to about 268 MB
int main(int argc, char **argv)
{
    int me, nproc, g_a = -1, i, j;

#if defined(USE_ELEMENTAL)
    int ndim=2, dims[2]= {N1,N2};
#else
    int ndim=2, type=MT_F_DBL, dims[2]= {N1,N2};
#endif

    double *buf;

    int lo[2], hi[2], ld[1];
    double alpha = 1.0;

#if defined(USE_ELEMENTAL)
    // initialize Elemental (which will initialize MPI)
    ElInitialize( &argc, &argv );
    ElMPICommRank( MPI_COMM_WORLD, &me );
    ElMPICommSize( MPI_COMM_WORLD, &nproc );

    ElGlobalArrays_d eldga;

    // instantiate el::global array
    ElGlobalArraysConstruct_d( &eldga );
    // initialize global arrays
    ElGlobalArraysInitialize_d( eldga );
    printf ("INITIALIZED elemental global array...\n");
#else
    MP_INIT(argc,argv);
    GA_Initialize_ltd(-1);

    me=GA_Nodeid();
    nproc=GA_Nnodes();
#endif

    if(me==0) printf("Using %ld processes\n",(long)nproc);
    if(me==0) printf("memory = %ld bytes\n",((long)N1)*((long)N2)*8);

#if defined(USE_ELEMENTAL)
    // create and allocate a global array
    printf ("ndim = %d\n", ndim);
    printf ("dim[0] = %d and dim[1] = %d\n", dims[0], dims[1]);
    ElGlobalArraysCreate_d( eldga, ndim, dims, "A", &g_a);
    printf ("CREATED elemental global array...\n");
    // print distribution
    ElGlobalArraysPrint_d( eldga, g_a );
#else
    g_a = NGA_Create(type, ndim, dims, "A", NULL);

    GA_Zero(g_a);   /* zero the matrix */

    GA_Print_distribution(g_a);
#endif

    if(me == 0) {
//        buf = (double*)(malloc(N1*1024*sizeof(double)));
        buf = (double*)(malloc(N1*128*sizeof(double)));
//        for(j = 0; j < N1*1024; ++j) buf[j] = 1.0;
//        for(i = 0; i < N2/1024; ++i) {
        for(j = 0; j < N1*128; ++j) buf[j] = 1.0;
        for(i = 0; i < N2/128; ++i) {

            lo[0] = 0;
            hi[0] = lo[0] + N1   -1;
            /*
                lo[1] = i*1024;
                hi[1] = lo[1] + 1024 -1;
                ld[0] = 1024;
            */
            lo[1] = i*128;
            hi[1] = lo[1] + 128 -1;
            ld[0] = 128;
            printf("NGA_Acc.%d:  %d:%d %d:%d\n",i,lo[0],hi[0],lo[1],hi[1]);

#if defined(USE_ELEMENTAL)
            ElGlobalArraysAccumulate_d( eldga, g_a, lo, hi, buf, ld, &alpha );
            // there is an explicit flush in NGA_Acc/Put, so when it returns, the buffer
            // can be reused and data has reached the destination
#else
            NGA_Init_fence();
            NGA_Acc(g_a, lo, hi, buf, ld, &alpha);
            NGA_Fence();
#endif
        }
    }

#if defined(USE_ELEMENTAL)
    ElGlobalArraysSync_d( eldga );
    ElGlobalArraysDestroy_d( eldga, g_a );
    ElGlobalArraysTerminate_d( eldga );
    // call el::global arrays destructor
    ElGlobalArraysDestruct_d( eldga );
    ElFinalize();
#else
    GA_Sync();

    GA_Destroy(g_a);

    GA_Terminate();
    MP_FINALIZE();
#endif

    return 0;
}

Пример #25

0

Показать файл

Файл: ga_pgroup_create2.c Проект: jeffhammond/ga

main(int argc, char **argv)
{
  int rank, nprocs, i, j;
  int p_Geven, p_Godd, p_size, mod, p_size_mod, *list_even=NULL, *list_odd=NULL;
  
  MPI_Init(&argc, &argv);
  
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  MA_init(C_INT, 1000, 1000);

  GA_Initialize();
  
  p_size=nprocs/2;
  mod=nprocs%2;
  p_size_mod=p_size+mod;
  list_even = (int*)malloc(p_size*sizeof(int));
  list_odd = (int*)malloc(p_size*sizeof(int));
  
  j=0;
  for(i=0; i<nprocs; i++)
    {
      if(i%2==0)
	{
	  list_even[j]=i;
	  j++;
	}
    }
  j=0;
  for(i=0; i<nprocs; i++)
    {
      if(i%2==1)
	{
	  list_odd[j]=i;
	  j++;
	}
    }
  /*
  if(rank==0)
    {    
      for(i=0; i<p_size; i++)
	printf(" %d--> %d --:: -- %d\n", i, list_even[i], list_odd[i]);
    }
  */
  GA_Sync();
  printf("%d: My ID is %d :: %d --- nodeid & nnodes for GA \n", rank, GA_Nodeid(), GA_Nnodes());
  
  p_Geven=GA_Pgroup_create(list_even, p_size_mod);
  p_Godd=GA_Pgroup_create(list_odd, p_size);
  GA_Sync();
  if(rank%2==0)
    printf("%d: My ID is %d :: %d -- even \n", rank, GA_Pgroup_nodeid(p_Geven), GA_Pgroup_nnodes(p_Geven));
  else
    printf("%d: My ID is %d :: %d --- odd\n", rank, GA_Pgroup_nodeid(p_Godd), GA_Pgroup_nnodes(p_Godd));
  
  GA_Sync();
  if(rank==0)
    GA_PRINT_MSG();
  
   GA_Terminate();
  MPI_Finalize();
}

Пример #26

0

Показать файл

Файл: task_python.c Проект: wadejong/NWChem-Json

int FATR task_python_(Integer *rtdb_ptr)
#endif
{
   FILE *F;
   char buf[20], *pbuf;
   char filename[256];
   int ret;
   
   Py_SetProgramName("NWChem");
   Py_Initialize();		/* set the PYTHONPATH env   */
   initnwchem();
   if (PyRun_SimpleString("from nwchem import *")) {
     fprintf(stderr,"import of NWCHEM failed\n");
     return 0;
   }
   pbuf = buf;			/* pass the rtdb_handle to  */
   sprintf(pbuf, "pass_handle(%d)\n", *rtdb_ptr); /* the python warping mod */
   if (PyRun_SimpleString(pbuf)) {
       fprintf(stderr,"task_python: failed to pass rtdb handle\n");
       return 0;
   }

   ret = 0;
   sprintf(pbuf, "INT     = %d", MT_F_INT);      
   ret += PyRun_SimpleString(pbuf);
   sprintf(pbuf, "DBL     = %d", MT_F_DBL);       
   ret += PyRun_SimpleString(pbuf);
   sprintf(pbuf, "CHAR    = %d", MT_CHAR);      
   ret += PyRun_SimpleString(pbuf);
   sprintf(pbuf, "LOGICAL = %d", MT_BASE + 11); 
   ret += PyRun_SimpleString(pbuf);
   sprintf(pbuf, "taskid = %d", GA_Nodeid());
   ret += PyRun_SimpleString(pbuf);
   sprintf(pbuf, "np = %d", GA_Nnodes());
   ret += PyRun_SimpleString(pbuf);

   if (ret) {
     fprintf(stderr,"setting RTDB types failed\n");
     return 0;
   }

   if (GA_Nodeid())
       sprintf(filename,"nwchem.py-%d",GA_Nodeid());
   else
       strcpy(filename,"nwchem.py");

   util_file_parallel_copy(filename, filename);

   /* PyRun_SimpleFile is unreliable on windows since you're passing
      a file pointer to Python which requires that it is compiled with
      a compatible compiler ... which it most likely is not */
 
#if defined(WIN32)
   ret = PyRun_SimpleString("execfile('nwchem.py')"); 
#else
   if (!(F = fopen(filename, "r"))) {
       fprintf(stderr,"task_python: cannot open file %s\n",filename); 
       return 0;
   }
   ret = PyRun_SimpleFile(F, filename); 
#endif


   /*fclose(F);*/
   /*   unlink(filename); */

   Py_Finalize();

   return !ret;
}

Пример #27

0

Показать файл

Файл: spiral-mm.c Проект: pghosh2/GA-Matrix-Multiplication-Benchmark

/* Square matrix-matrix multiplication */
void matrix_multiply(int M, int N, int K, 
		int blockX_len, int blockY_len) 
{
	/* Local buffers and Global arrays declaration */
	double *a=NULL, *b=NULL, *c=NULL;

	int dims[NDIMS], ld[NDIMS], chunks[NDIMS];
	int lo[NDIMS], hi[NDIMS], cdims[NDIMS]; /* dim of blocks */

	int g_a, g_b, g_c, g_cnt, g_cnt2;
	int offset;
	double alpha = 1.0, beta=0.0;
	int count_p = 0, next_p = 0;
	int count_gac = 0, next_gac = 0;
	double t1,t2,seconds;
        ga_nbhdl_t nbh;
        int count_acc = 0;

	/* Find local processor ID and the number of processes */
	int proc=GA_Nodeid(), nprocs=GA_Nnodes();

	if ((M % blockX_len) != 0 || (M % blockY_len) != 0 || (N % blockX_len) != 0 || (N % blockY_len) != 0 
			|| (K % blockX_len) != 0 || (K % blockY_len) != 0)
		GA_Error("Dimension size M/N/K is not divisible by X/Y block sizes", 101);

	/* Allocate/Set process local buffers */
	a = malloc (blockX_len * blockY_len * sizeof(double)); 
	b = malloc (blockX_len * blockY_len * sizeof(double)); 
	c = malloc (blockX_len * blockY_len * sizeof(double));

	cdims[0] = blockX_len;
	cdims[1] = blockY_len;	

	/* Configure array dimensions */
	for(int i = 0; i < NDIMS; i++) {
		dims[i]  = N;
		chunks[i] = -1;
		ld[i]    = cdims[i]; /* leading dimension/stride of the local buffer */
	}

	/* create a global array g_a and duplicate it to get g_b and g_c*/
	g_a = NGA_Create(C_DBL, NDIMS, dims, "array A", chunks);

	if (!g_a) 
		GA_Error("NGA_Create failed: A", NDIMS);

#if DEBUG>1
	if (proc == 0) 
		printf("  Created Array A\n");
#endif
	/* Ditto for C and B */
	g_b = GA_Duplicate(g_a, "array B");
	g_c = GA_Duplicate(g_a, "array C");

	if (!g_b || !g_c) 
		GA_Error("GA_Duplicate failed",NDIMS);
	if (proc == 0) 
		printf("Created Arrays B and C\n");

	/* Subscript array for read-incr, which is nothing but proc */
	int * rdcnt = malloc (nprocs * sizeof(int));
	memset (rdcnt, 0, nprocs * sizeof(int));
	int * rdcnt2 = malloc (nprocs * sizeof(int));
	memset (rdcnt2, 0, nprocs * sizeof(int));

	/* Create global array of nprocs elements for nxtval */	
	int counter_dim[1];
	counter_dim[0] = nprocs;

	g_cnt = NGA_Create(C_INT, 1, counter_dim, "Shared counter", NULL);

	if (!g_cnt) 
		GA_Error("Shared counter failed",1);

	g_cnt2 = GA_Duplicate(g_cnt, "another shared counter");

	if (!g_cnt2) 
		GA_Error("Another shared counter failed",1);

	GA_Zero(g_cnt);
	GA_Zero(g_cnt2);

#if DEBUG>1	
	/* initialize data in matrices a and b */
	if(proc == 0)
		printf("Initializing local buffers - a and b\n");
#endif
	int w = 0; 
	int l = 7;
	for(int i = 0; i < cdims[0]; i++) {
		for(int j = 0; j < cdims[1]; j++) {
			a[i*cdims[1] + j] = (double)(++w%29);
			b[i*cdims[1] + j] = (double)(++l%37);
		}
	}

	/* Copy data to global arrays g_a and g_b from local buffers */
	next_p = NGA_Read_inc(g_cnt2,&rdcnt[proc],(long)1);
	for (int i = 0; i < N; i+=cdims[0]) 
	{
		if (next_p == count_p) {
			for (int j = 0; j < N; j+=cdims[1])
			{
				/* Indices of patch */
				lo[0] = i;
				lo[1] = j;
				hi[0] = lo[0] + cdims[0];
				hi[1] = lo[1] + cdims[1];

				hi[0] = hi[0]-1;
				hi[1] = hi[1]-1;
#if DEBUG>1
				printf ("%d: PUT_GA_A_B: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
				NGA_Put(g_a, lo, hi, a, ld);
				NGA_Put(g_b, lo, hi, b, ld);

			}
			next_p = NGA_Read_inc(g_cnt2,&rdcnt[proc],(long)1);
		}		
		count_p++;
	}


#if DEBUG>1
	printf ("After NGA_PUT to global - A and B arrays\n");
#endif
	/* Synchronize all processors to make sure puts from 
	   nprocs has finished before proceeding with dgemm */
	GA_Sync();

	t1 = GA_Wtime();

	next_gac = NGA_Read_inc(g_cnt,&rdcnt2[proc],(long)1);
	for (int m = 0; m < N; m+=cdims[0])
	{
		for (int k = 0; k < N; k+=cdims[0])
		{
			if (next_gac == count_gac)	
			{
				/* A = m x k */
				lo[0] = m; lo[1] = k;
				hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1];

				hi[0] = hi[0]-1; hi[1] = hi[1]-1;
#if DEBUG>3
				printf ("%d: GET GA_A: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
				NGA_Get(g_a, lo, hi, a, ld);

				for (int n = 0; n < N; n+=cdims[1])
				{
					memset (c, 0, sizeof(double) * cdims[0] * cdims[1]);
					/* B = k x n */
					lo[0] = k; lo[1] = n;
					hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1];				

					hi[0] = hi[0]-1; hi[1] = hi[1]-1;
#if DEBUG>3
					printf ("%d: GET_GA_B: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
					NGA_Get(g_b, lo, hi, b, ld);


					//_my_dgemm_ (a, local_N, b, local_N, c, local_N, local_N, local_N, local_N, alpha, beta=1.0);

					/* TODO I am assuming square matrix blocks, further testing/work 
					   required for rectangular matrices */
					cblas_dgemm ( CblasRowMajor, CblasNoTrans, /* TransA */CblasNoTrans, /* TransB */
							cdims[0] /* M */, cdims[1] /* N */, cdims[0] /* K */, alpha,
							a, cdims[0], /* lda */ b, cdims[1], /* ldb */
							beta=1.0, c, cdims[0] /* ldc */);

					NGA_NbWait(&nbh);

					/* C = m x n */
					lo[0] = m; lo[1] = n;
					hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1];				

					hi[0] = hi[0]-1; hi[1] = hi[1]-1;
#if DEBUG>3
					printf ("%d: ACC_GA_C: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
					NGA_NbAcc(g_c, lo, hi, c, ld, &alpha, &nbh);
					count_acc += 1;
				} /* END LOOP N */
				next_gac = NGA_Read_inc(g_cnt,&rdcnt2[proc],(long)1);
			} /* ENDIF if count == next */
			count_gac++;
		} /* END LOOP K */
	} /* END LOOP M */

	GA_Sync();
	t2 = GA_Wtime();
	seconds = t2 - t1;
	if (proc == 0)
		printf("Time taken for MM (secs):%lf \n", seconds);

        printf("Number of ACC: %d\n", count_acc);

	/* Correctness test - modify data again before this function */
	for (int i = 0; i < NDIMS; i++) {
		lo[i] = 0;
		hi[i] = dims[i]-1;
		ld[i] = dims[i];
	}

	verify(g_a, g_b, g_c, lo, hi, ld, N);

	/* Clear local buffers */
	free(a);
	free(b);
	free(c);
	free(rdcnt);
	free(rdcnt2);

	GA_Sync();

	/* Deallocate arrays */
	GA_Destroy(g_a);
	GA_Destroy(g_b);
	GA_Destroy(g_c);
	GA_Destroy(g_cnt);
	GA_Destroy(g_cnt2);
}

Пример #28

0

Показать файл

Файл: sched_groups.c Проект: 000Justin000/nwchem

void plist_destroy(proc_list_t *plist) {
  const int nproc = GA_Nnodes();
  assert(plist->nidle == nproc-1);
  free(plist->buf);
}

Пример #29

0

Показать файл

Файл: gemmtest.c Проект: jeffhammond/ga

int 
main(int argc, char **argv) {

Integer heap=9000000, stack=9000000;
int me, nproc;
DoublePrecision time;

    MP_INIT(argc,argv);

    GA_INIT(argc,argv);                           /* initialize GA */

    nproc = GA_Nnodes();
    me = GA_Nodeid();

    if(me==0) printf("Using %d processes\n\n",nproc);

    if (me==0) printf ("Matrix size is %d X %d\n",N,N);

#ifdef USE_REGULAR
    if (me == 0) printf("\nUsing regular data distribution\n\n");
#endif
#ifdef USE_SIMPLE_CYCLIC
    if (me == 0) printf("\nUsing simple block-cyclic data distribution\n\n");
#endif
#ifdef USE_SCALAPACK
    if (me == 0) printf("\nUsing ScaLAPACK data distribution\n\n");
#endif
#ifdef USE_TILED
    if (me == 0) printf("\nUsing tiled data distribution\n\n");
#endif

    if(!MA_init((Integer)MT_F_DBL, stack/nproc, heap/nproc))
       GA_Error("MA_init failed bytes= %d",stack+heap);   

#ifdef PERMUTE
      {
        int i, *list = (int*)malloc(nproc*sizeof(int));
        if(!list)GA_Error("malloc failed",nproc);

        for(i=0; i<nproc;i++)list[i]=nproc-1-i;

        GA_Register_proclist(list, nproc);
        free(list);
      }
#endif

    if(GA_Uses_fapi())GA_Error("Program runs with C API only",1);

    time = MP_TIMER();
    do_work();
    /*    printf("%d: Total Time = %lf\n", me, MP_TIMER()-time);
      printf("%d: GEMM Total Time = %lf\n", me, gTime);
    */

    if(me==0)printf("\nSuccess\n\n");
    GA_Terminate();

    MP_FINALIZE();

    return 0;
}

Пример #30

0

Показать файл

Файл: perf2.c Проект: sg0/Elemental

int main(int argc, char **argv)
{
  /* initialize GA */
#if defined(USE_ELEMENTAL)
  // initialize Elemental (which will initialize MPI)
  ElInitialize( &argc, &argv );
  ElMPICommRank( MPI_COMM_WORLD, &me );
  ElMPICommSize( MPI_COMM_WORLD, &nproc );
  // instantiate el::global array
  ElGlobalArraysConstruct_d( &eldga );
  // initialize global arrays
  ElGlobalArraysInitialize_d( eldga );
#else
  MP_INIT(argc,argv);
  GA_Initialize_args(&argc, &argv);

  me = GA_Nodeid();
  nproc = GA_Nnodes();
#endif

  if (nproc < 2) {
    if (me == 0) {
      fprintf(stderr, "USAGE: 2 <= processes - got %d\n", nproc);
    }
#if defined(USE_ELEMENTAL)
    ElGlobalArraysTerminate_d( eldga );
    // call el::global arrays destructor
    ElGlobalArraysDestruct_d( eldga );
    ElFinalize();
#else
    GA_Terminate();
    MP_FINALIZE();
#endif
    exit(0);
  }

  if (!me) {
    printf("\n             Performance of Basic Blocking Communication Operations\n");
  }

#if defined(USE_ELEMENTAL)
  ElGlobalArraysSync_d( eldga );
#else
  GA_Sync();
#endif

  /* test 1 dimension array */
  /*
  if (!me) {
    printf("\n\t\t\tContiguous Data Transfer\n");
  }
  test_1D();
  */

  /* test 2 dimension array */
  if (!me) {
    printf("\n\t\t\tStrided Data Transfer\n");
  }
  test_2D();
#if 0
  if (me == 0) {
    if (warn_accuracy) {
      printf("\nWARNING: Your timer does not have sufficient accuracy for this test (%d)\n", warn_accuracy);
    }
    printf("\n\n------------ Now we test the same data transfer for correctness ----------\n");
    fflush(stdout);
  }
#endif
#if defined(USE_ELEMENTAL)
  ElGlobalArraysTerminate_d( eldga );
  // call el::global arrays destructor
  ElGlobalArraysDestruct_d( eldga );
  ElFinalize();
#else
  GA_Terminate();
  MP_FINALIZE();
#endif
  return(0);
}