int main(int argc, char *argv[]) {
  // Some constants
  int minusone = -1;
  int zero = 0;
  int one = 1;
  double dzero = 0.0;
  // ConText
  int ConTxt = minusone;
  // order
  char order = 'R';
  char scope = 'A';
  // root process
  int root = zero;

  // BLACS/SCALAPACK parameters
  // the size of the blocks the distributed matrix is split into
  // (applies to both rows and columns)
  int mb = 32;
  int nb = mb; // PDSYEVxxx constraint

  // the number of rows and columns in the processor grid
  // only square processor grids due to C vs. Fortran ordering
  int nprow = 2;
  int npcol = nprow; // only square processor grids

  // starting row and column in grid, do not change
  int rsrc = zero;
  int csrc = zero;

  // dimensions of the matrix to diagonalize
  int m = 1000;
  int n = m; // only square matrices

  int info = zero;

  // Rest of code will only work for:
  // nprow = npcol;
  // mb = nb;
  // m = n;
  // rsrc = csrc;

  // Parameters for the trivial matrix
  double alpha = 0.1; // off-diagonal
  double beta = 75.0; // diagonal

  // For timing:
  double tdiag0, tdiag, ttotal0, ttotal;

  // BLACS Communicator
  MPI_Comm blacs_comm;
  int nprocs;
  int iam;
  int myrow, mycol;

  MPI_Init(&argc, &argv);
  MPI_Barrier(MPI_COMM_WORLD);
  ttotal0 = MPI_Wtime();
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &iam);

  // optional command-line overrides: argv[1] = nprow, argv[2] = m
  if (argc > 2) {
    nprow = atoi(argv[1]);
    m = atoi(argv[2]);
    npcol = nprow;
    n = m;
  }

  if (iam == root) {
    printf("world size %d \n", nprocs);
    printf("n %d \n", n);
    printf("nprow %d \n", nprow);
    printf("npcol %d \n", npcol);
  }

  // We can do this on any subcommunicator.
#ifdef CartComm
  int dim[2];
  int pbc[2];
  dim[0] = nprow;
  dim[1] = npcol;
  pbc[0] = 0;
  pbc[1] = 0;
  MPI_Cart_create(MPI_COMM_WORLD, 2, dim, pbc, 1, &blacs_comm);
#else
  blacs_comm = MPI_COMM_WORLD;
#endif

  // initialize the grid
  // The lines below are equivalent to the one call to:
  if (blacs_comm != MPI_COMM_NULL) {
    ConTxt = Csys2blacs_handle_(blacs_comm);
    Cblacs_gridinit_(&ConTxt, &order, nprow, npcol);
    // get information back about the grid
    Cblacs_gridinfo_(ConTxt, &nprow, &npcol, &myrow, &mycol);
  }

  if (ConTxt != minusone) {
    int desc[9];

    // get the size of the distributed matrix
    int locM = numroc_(&m, &mb, &myrow, &rsrc, &nprow);
    int locN = numroc_(&n, &nb, &mycol, &csrc, &npcol);

    // printf("locM = %d \n", locM);
    // printf("locN = %d \n", locN);

    int lld = MAX(one, locM);

    // build the descriptor
    descinit_(desc, &m, &n, &mb, &nb, &rsrc, &csrc, &ConTxt, &lld, &info);

    // Allocate arrays
    // eigenvalues
    double* eigvals = malloc(n * sizeof(double));
    // allocate the distributed matrices
    double* mata = malloc(locM * locN * sizeof(double));
    // allocate the distributed matrix of eigenvectors
    double* z = malloc(locM * locN * sizeof(double));

    // Eigensolver parameters
    int ibtype = one;
    char jobz = 'V';  // eigenvectors also
    char range = 'A'; // all eigenvalues
    char uplo = 'L';  // work with the lower triangle
    double vl, vu;
    int il, iu;
    char cmach = 'U';
    double abstol = 2.0 * pdlamch_(&ConTxt, &cmach);
    int eigvalm, nz;
    double orfac = -1.0;
    //double orfac = 0.001;
    int* ifail;
    ifail = malloc(m * sizeof(int));
    int* iclustr;
    iclustr = malloc(2 * nprow * npcol * sizeof(int));
    double* gap;
    gap = malloc(nprow * npcol * sizeof(double));
    double* work;
    work = malloc(3 * sizeof(double));
    int querylwork = minusone;
    int* iwork;
    iwork = malloc(1 * sizeof(int));
    int queryliwork = minusone;

    // Build a trivial distributed matrix: beta on the diagonal,
    // alpha on the off-diagonal elements
    pdlaset_(&uplo, &m, &n, &alpha, &beta, mata, &one, &one, desc);

    // First there is a workspace query
    // pdsyevx_(&jobz, &range, &uplo, &n, mata, &one, &one, desc, &vl,
    //          &vu, &il, &iu, &abstol, &eigvalm, &nz, eigvals, &orfac, z, &one,
    //          &one, desc, work, &querylwork, iwork, &queryliwork, ifail, iclustr, gap, &info);
    pdsyevd_(&jobz, &uplo, &n, mata, &one, &one, desc, eigvals,
             z, &one, &one, desc, work, &querylwork, iwork, &queryliwork, &info);
    //pdsyev_(&jobz, &uplo, &m, mata, &one, &one, desc, eigvals,
    //        z, &one, &one, desc, work, &querylwork, &info);

    int lwork = (int)work[0];
    //printf("lwork %d\n", lwork);
    free(work);
    int liwork = (int)iwork[0];
    //printf("liwork %d\n", liwork);
    free(iwork);

    work = (double*)malloc(lwork * sizeof(double));
    iwork = (int*)malloc(liwork * sizeof(int));

    // This actually diagonalizes the matrix
    // pdsyevx_(&jobz, &range, &uplo, &n, mata, &one, &one, desc, &vl,
    //          &vu, &il, &iu, &abstol, &eigvalm, &nz, eigvals, &orfac, z, &one,
    //          &one, desc, work, &lwork, iwork, &liwork, ifail, iclustr, gap, &info);

    Cblacs_barrier(ConTxt, &scope);
    tdiag0 = MPI_Wtime();
    pdsyevd_(&jobz, &uplo, &n, mata, &one, &one, desc, eigvals,
             z, &one, &one, desc, work, &lwork, iwork, &liwork, &info);
    //pdsyev_(&jobz, &uplo, &m, mata, &one, &one, desc, eigvals,
    //        z, &one, &one, desc, work, &lwork, &info);
    Cblacs_barrier(ConTxt, &scope);
    tdiag = MPI_Wtime() - tdiag0;

    free(work);
    free(iwork);
    free(gap);
    free(iclustr);
    free(ifail);
    free(z);
    free(mata);

    // Destroy BLACS grid
    Cblacs_gridexit_(ConTxt);

    // Check eigenvalues
    if (myrow == zero && mycol == zero) {
      for (int i = 0; i < n; i++) {
        if (fabs(eigvals[i] - beta) > 0.0001)
          printf("Problem: eigval %d != %5.2f but %f\n", i, beta, eigvals[i]);
      }
      if (info != zero) {
        printf("info = %d \n", info);
      }
      printf("Time (s) diag: %f\n", tdiag);
    }
    free(eigvals);
  }

  MPI_Barrier(MPI_COMM_WORLD);
  ttotal = MPI_Wtime() - ttotal0;
  if (iam == 0)
    printf("Time (s) total: %f\n", ttotal);
  MPI_Finalize();
}
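// Example build and run for the PDSYEVD program above (a sketch, not a recipe):
// the compiler wrapper, the ScaLAPACK link flags, the source file name, and the
// assumption that the BLACS/ScaLAPACK prototypes are declared before main() all
// depend on the local installation and are only illustrative here.
//
//   mpicc -O2 -o pdsyevd_example pdsyevd_example.c -lscalapack -lm
//
//   # 2 x 2 process grid, 1000 x 1000 matrix (argv[1] = nprow, argv[2] = m)
//   mpirun -np 4 ./pdsyevd_example 2 1000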
int main(int argc, char **argv) {
  int ictxt, nside, ngrid, nblock, nthread;
  int rank, size;
  int ic, ir, nc, nr;
  int i, j;
  char *fname;
  int info, ZERO = 0, ONE = 1;
  struct timeval st, et;
  double dtnn, dtnt, dttn, dttt;
  double gfpc_nn, gfpc_nt, gfpc_tn, gfpc_tt;

  /* Initialising MPI stuff */
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  printf("Process %i of %i.\n", rank, size);

  /* Parsing arguments */
  if(argc < 6) {
    if(rank == 0)
      fprintf(stderr, "Usage: %s nside ngrid nblock nthread outfile\n", argv[0]);
    MPI_Finalize();
    exit(-3);
  }

  nside = atoi(argv[1]);
  ngrid = atoi(argv[2]);
  nblock = atoi(argv[3]);
  nthread = atoi(argv[4]);
  fname = argv[5];

  if(rank == 0) {
    printf("Multiplying matrices of size %i x %i\n", nside, nside);
    printf("Process grid size %i x %i\n", ngrid, ngrid);
    printf("Block size %i x %i\n", nblock, nblock);
    printf("Using %i OpenMP threads\n", nthread);
  }

#ifdef _OPENMP
  if(rank == 0) printf("Setting OMP_NUM_THREADS=%i\n", nthread);
  omp_set_num_threads(nthread);
#endif

  /* Setting up BLACS */
  Cblacs_pinfo(&rank, &size);
  Cblacs_get(-1, 0, &ictxt);
  Cblacs_gridinit(&ictxt, "Row", ngrid, ngrid);
  Cblacs_gridinfo(ictxt, &nr, &nc, &ir, &ic);

  int descA[9], descB[9], descC[9];

  /* Fetch local array sizes */
  int Ar, Ac, Br, Bc, Cr, Cc;

  Ar = numroc_(&nside, &nblock, &ir, &ZERO, &nr);
  Ac = numroc_(&nside, &nblock, &ic, &ZERO, &nc);

  Br = numroc_(&nside, &nblock, &ir, &ZERO, &nr);
  Bc = numroc_(&nside, &nblock, &ic, &ZERO, &nc);

  Cr = numroc_(&nside, &nblock, &ir, &ZERO, &nr);
  Cc = numroc_(&nside, &nblock, &ic, &ZERO, &nc);

  printf("Local array section %i x %i\n", Ar, Ac);

  /* Set descriptors */
  descinit_(descA, &nside, &nside, &nblock, &nblock, &ZERO, &ZERO, &ictxt, &Ar, &info);
  descinit_(descB, &nside, &nside, &nblock, &nblock, &ZERO, &ZERO, &ictxt, &Br, &info);
  descinit_(descC, &nside, &nside, &nblock, &nblock, &ZERO, &ZERO, &ictxt, &Cr, &info);

  /* Initialise and fill arrays (column-major local storage) */
  double *A = (double *)malloc(Ar * Ac * sizeof(double));
  double *B = (double *)malloc(Br * Bc * sizeof(double));
  double *C = (double *)malloc(Cr * Cc * sizeof(double));

  for(i = 0; i < Ar; i++) {
    for(j = 0; j < Ac; j++) {
      A[j*Ar + i] = drand48();
      B[j*Br + i] = drand48();
      C[j*Cr + i] = 0.0;
    }
  }

  double alpha = 1.0, beta = 0.0;

  //========================
  if(rank == 0) printf("Starting multiplication (NN).\n");

  Cblacs_barrier(ictxt, "A");
  gettimeofday(&st, NULL);
  pdgemm_("N", "N", &nside, &nside, &nside, &alpha, A, &ONE, &ONE, descA,
          B, &ONE, &ONE, descB, &beta, C, &ONE, &ONE, descC);
  Cblacs_barrier(ictxt, "A");
  gettimeofday(&et, NULL);
  dtnn = (double)((et.tv_sec - st.tv_sec) + (et.tv_usec - st.tv_usec) * 1e-6);
  /* 2*N^3 flops for a dense N x N multiply, divided over all cores */
  gfpc_nn = 2.0*pow(nside, 3) / (dtnn * 1e9 * ngrid * ngrid * nthread);

  if(rank == 0) printf("Done.\n=========\nTime taken: %g s\nGFlops per core: %g\n=========\n", dtnn, gfpc_nn);
  //========================

  //========================
  if(rank == 0) printf("Starting multiplication (NT).\n");

  Cblacs_barrier(ictxt, "A");
  gettimeofday(&st, NULL);
  pdgemm_("N", "T", &nside, &nside, &nside, &alpha, A, &ONE, &ONE, descA,
          B, &ONE, &ONE, descB, &beta, C, &ONE, &ONE, descC);
  Cblacs_barrier(ictxt, "A");
  gettimeofday(&et, NULL);
  dtnt = (double)((et.tv_sec - st.tv_sec) + (et.tv_usec - st.tv_usec) * 1e-6);
  gfpc_nt = 2.0*pow(nside, 3) / (dtnt * 1e9 * ngrid * ngrid * nthread);

  if(rank == 0) printf("Done.\n=========\nTime taken: %g s\nGFlops per core: %g\n=========\n", dtnt, gfpc_nt);
  //========================

  //========================
  if(rank == 0) printf("Starting multiplication (TN).\n");

  Cblacs_barrier(ictxt, "A");
  gettimeofday(&st, NULL);
  pdgemm_("T", "N", &nside, &nside, &nside, &alpha, A, &ONE, &ONE, descA,
          B, &ONE, &ONE, descB, &beta, C, &ONE, &ONE, descC);
  Cblacs_barrier(ictxt, "A");
  gettimeofday(&et, NULL);
  dttn = (double)((et.tv_sec - st.tv_sec) + (et.tv_usec - st.tv_usec) * 1e-6);
  gfpc_tn = 2.0*pow(nside, 3) / (dttn * 1e9 * ngrid * ngrid * nthread);

  if(rank == 0) printf("Done.\n=========\nTime taken: %g s\nGFlops per core: %g\n=========\n", dttn, gfpc_tn);
  //========================

  //========================
  if(rank == 0) printf("Starting multiplication (TT).\n");

  Cblacs_barrier(ictxt, "A");
  gettimeofday(&st, NULL);
  pdgemm_("T", "T", &nside, &nside, &nside, &alpha, A, &ONE, &ONE, descA,
          B, &ONE, &ONE, descB, &beta, C, &ONE, &ONE, descC);
  Cblacs_barrier(ictxt, "A");
  gettimeofday(&et, NULL);
  dttt = (double)((et.tv_sec - st.tv_sec) + (et.tv_usec - st.tv_usec) * 1e-6);
  gfpc_tt = 2.0*pow(nside, 3) / (dttt * 1e9 * ngrid * ngrid * nthread);

  if(rank == 0) printf("Done.\n=========\nTime taken: %g s\nGFlops per core: %g\n=========\n", dttt, gfpc_tt);
  //========================

  if(rank == 0) {
    FILE *fd;
    fd = fopen(fname, "w");
    fprintf(fd, "%g %g %g %g %i %i %i %i %g %g %g %g\n",
            gfpc_nn, gfpc_nt, gfpc_tn, gfpc_tt, nside, ngrid, nblock, nthread,
            dtnn, dtnt, dttn, dttt);
    fclose(fd);
  }

  Cblacs_gridexit(ictxt);

  MPI_Finalize();
}
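/*
 * Example build and run for the PDGEMM benchmark above (a sketch, not a recipe):
 * the compiler wrapper, the ScaLAPACK/OpenMP link flags, the source file name,
 * and the output file name below are assumptions; adjust them for the local MPI
 * and ScaLAPACK installation.
 *
 *   mpicc -O2 -fopenmp -o pdgemm_bench pdgemm_bench.c -lscalapack -lm
 *
 *   # arguments: nside ngrid nblock nthread outfile
 *   # 2048 x 2048 matrices on a 2 x 2 grid, 32 x 32 blocks, 1 thread per rank
 *   mpirun -np 4 ./pdgemm_bench 2048 2 32 1 pdgemm_times.txt
 */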