/*
 * Loads a basis set via CInt_loadBasisSet() and, if that succeeds, hands the
 * result to CInt_offload_pushBasisSet().
 *
 * basis   - basis set handle passed through to both calls
 * bsfile  - forwarded to CInt_loadBasisSet() as its second argument
 * molfile - forwarded to CInt_loadBasisSet() as its third argument
 *
 * Returns the status of the first step that did not report
 * CINT_STATUS_SUCCESS, or CINT_STATUS_SUCCESS when both steps succeed.
 */
CIntStatus_t CInt_offload_loadBasisSet (BasisSet_t basis, char *bsfile, char *molfile)
{
    const CIntStatus_t load_rc = CInt_loadBasisSet(basis, bsfile, molfile);
    if (load_rc != CINT_STATUS_SUCCESS) {
        return load_rc;
    }

    // push basis set to mic; its status is the overall result
    return CInt_offload_pushBasisSet(basis);
}
/// main for SCF int main (int argc, char **argv) { // init MPI int myrank; int nprocs; int provided; #if defined (USE_ELEMENTAL) ElInitialize( &argc, &argv ); ElMPICommRank( MPI_COMM_WORLD, &myrank ); ElMPICommSize( MPI_COMM_WORLD, &nprocs ); MPI_Query_thread(&provided); #else MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided); MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); #endif if (myrank == 0) { printf("MPI thread support: %s\n", MPI_THREAD_STRING(provided)); } #if 0 char hostname[1024]; gethostname (hostname, 1024); printf ("Rank %d of %d running on node %s\n", myrank, nprocs, hostname); #endif // create basis set BasisSet_t basis; CInt_createBasisSet(&basis); // input parameters and load basis set int nprow_fock; int npcol_fock; int nblks_fock; int nprow_purif; int nshells; int natoms; int nfunctions; int niters; if (myrank == 0) { if (argc != 8) { usage(argv[0]); MPI_Finalize(); exit(0); } // init parameters nprow_fock = atoi(argv[3]); npcol_fock = atoi(argv[4]); nprow_purif = atoi(argv[5]); nblks_fock = atoi(argv[6]); niters = atoi(argv[7]); assert(nprow_fock * npcol_fock == nprocs); assert(nprow_purif * nprow_purif * nprow_purif <= nprocs); assert(niters > 0); CInt_loadBasisSet(basis, argv[1], argv[2]); nshells = CInt_getNumShells(basis); natoms = CInt_getNumAtoms(basis); nfunctions = CInt_getNumFuncs(basis); assert(nprow_fock <= nshells && npcol_fock <= nshells); assert(nprow_purif <= nfunctions && nprow_purif <= nfunctions); printf("Job information:\n"); char *fname; fname = basename(argv[2]); printf(" molecule: %s\n", fname); fname = basename(argv[1]); printf(" basisset: %s\n", fname); printf(" charge = %d\n", CInt_getTotalCharge(basis)); printf(" #atoms = %d\n", natoms); printf(" #shells = %d\n", nshells); printf(" #functions = %d\n", nfunctions); printf(" fock build uses %d (%dx%d) nodes\n", nprow_fock * npcol_fock, nprow_fock, npcol_fock); printf(" purification uses %d (%dx%dx%d) nodes\n", nprow_purif * 
nprow_purif * nprow_purif, nprow_purif, nprow_purif, nprow_purif); printf(" #tasks = %d (%dx%d)\n", nblks_fock * nblks_fock * nprow_fock * nprow_fock, nblks_fock * nprow_fock, nblks_fock * nprow_fock); int nthreads = omp_get_max_threads(); printf(" #nthreads_cpu = %d\n", nthreads); } int btmp[8]; btmp[0] = nprow_fock; btmp[1] = npcol_fock; btmp[2] = nprow_purif; btmp[3] = nblks_fock; btmp[4] = niters; btmp[5] = natoms; btmp[6] = nshells; btmp[7] = nfunctions; MPI_Bcast(btmp, 8, MPI_INT, 0, MPI_COMM_WORLD); nprow_fock = btmp[0]; npcol_fock = btmp[1]; nprow_purif = btmp[2]; nblks_fock = btmp[3]; niters = btmp[4]; natoms = btmp[5]; nshells = btmp[6]; nfunctions = btmp[7]; // broadcast basis set void *bsbuf; int bsbufsize; if (myrank == 0) { CInt_packBasisSet(basis, &bsbuf, &bsbufsize); MPI_Bcast(&bsbufsize, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(bsbuf, bsbufsize, MPI_CHAR, 0, MPI_COMM_WORLD); } else { MPI_Bcast(&bsbufsize, 1, MPI_INT, 0, MPI_COMM_WORLD); bsbuf = (void *)malloc(bsbufsize); assert(bsbuf != NULL); MPI_Bcast(bsbuf, bsbufsize, MPI_CHAR, 0, MPI_COMM_WORLD); CInt_unpackBasisSet(basis, bsbuf); free(bsbuf); } // init PFock if (myrank == 0) { printf("Initializing pfock ...\n"); } PFock_t pfock; PFock_create(basis, nprow_fock, npcol_fock, nblks_fock, 1e-11, MAX_NUM_D, IS_SYMM, &pfock); if (myrank == 0) { double mem_cpu; PFock_getMemorySize(pfock, &mem_cpu); printf(" CPU uses %.3f MB\n", mem_cpu / 1024.0 / 1024.0); printf(" Done\n"); } // init purif purif_t *purif = create_purif(basis, nprow_purif, nprow_purif, nprow_purif); init_oedmat(basis, pfock, purif, nprow_fock, npcol_fock); // compute SCF if (myrank == 0) { printf("Computing SCF ...\n"); } int rowstart = purif->srow_purif; int rowend = purif->nrows_purif + rowstart - 1; int colstart = purif->scol_purif; int colend = purif->ncols_purif + colstart - 1; double energy0 = -1.0; double totaltime = 0.0; double purif_flops = 2.0 * nfunctions * nfunctions * nfunctions; double diis_flops; // set initial guess if 
(myrank == 0) { printf(" initialing D ...\n"); } PFock_setNumDenMat(NUM_D, pfock); initial_guess(pfock, basis, purif->runpurif, rowstart, rowend, colstart, colend, purif->D_block, purif->ldx); MPI_Barrier(MPI_COMM_WORLD); // compute nuc energy double ene_nuc = CInt_getNucEnergy(basis); if (myrank == 0) { printf(" nuc energy = %.10f\n", ene_nuc); } MPI_Barrier(MPI_COMM_WORLD); // main loop double t1, t2, t3, t4; for (int iter = 0; iter < niters; iter++) { if (myrank == 0) { printf(" iter %d\n", iter); } t3 = MPI_Wtime(); // fock matrix construction t1 = MPI_Wtime(); fock_build(pfock, basis, purif->runpurif, rowstart, rowend, colstart, colend, purif->ldx, purif->D_block, purif->F_block); if (myrank == 0) { printf("After fock build \n"); } // compute energy double energy = compute_energy(purif, purif->F_block, purif->D_block); t2 = MPI_Wtime(); if (myrank == 0) { printf(" fock build takes %.3f secs\n", t2 - t1); if (iter > 0) { printf(" energy %.10f (%.10f), %le\n", energy + ene_nuc, energy, fabs (energy - energy0)); } else { printf(" energy %.10f (%.10f)\n", energy + ene_nuc, energy); } } if (iter > 0 && fabs (energy - energy0) < 1e-11) { niters = iter + 1; break; } energy0 = energy; // compute DIIS t1 = MPI_Wtime(); compute_diis(pfock, purif, purif->D_block, purif->F_block, iter); t2 = MPI_Wtime(); if (myrank == 0) { if (iter > 1) { diis_flops = purif_flops * 6.0; } else { diis_flops = purif_flops * 2.0; } printf(" diis takes %.3f secs, %.3lf Gflops\n", t2 - t1, diis_flops / (t2 - t1) / 1e9); } #ifdef __SCF_OUT__ if (myrank == 0) { double outbuf[nfunctions]; char fname[1024]; sprintf(fname, "XFX_%d_%d.dat", nfunctions, iter); FILE *fp = fopen(fname, "w+"); assert(fp != NULL); for (int i = 0; i < nfunctions; i++) { PFock_getMat(pfock, PFOCK_MAT_TYPE_F, USE_D_ID, i, i, USE_D_ID, nfunctions - 1, outbuf, nfunctions); for (int j = 0; j < nfunctions; j++) { fprintf(fp, "%.10e\n", outbuf[j]); } } fclose(fp); } #endif // purification MPI_Barrier(MPI_COMM_WORLD); t1 = 
MPI_Wtime(); int it = compute_purification(purif, purif->F_block, purif->D_block); t2 = MPI_Wtime(); MPI_Barrier(MPI_COMM_WORLD); if (myrank == 0) { printf(" purification takes %.3f secs," " %d iterations, %.3f Gflops\n", t2 - t1, it, (it * 2.0 + 4.0) * purif_flops / (t2 - t1) / 1e9); } /* #if defined(USE_ELEMENTAL) ElGlobalArraysPrint_d( eldga, pfock->ga_D[USE_D_ID] ); #else GA_Print (pfock->ga_D[USE_D_ID]); #endif */ t4 = MPI_Wtime (); totaltime += t4 - t3; #ifdef __SCF_TIMING__ PFock_getStatistics(pfock); double purif_timedgemm; double purif_timepdgemm; double purif_timepass; double purif_timetr; MPI_Reduce(&purif->timedgemm, &purif_timedgemm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(&purif->timepdgemm, &purif_timepdgemm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(&purif->timepass, &purif_timepass, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(&purif->timetr, &purif_timetr, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); if (myrank == 0) { printf(" Purification Statistics:\n"); printf(" average totaltime = %.3f\n" " average timetr = %.3f\n" " average timedgemm = %.3f, %.3f Gflops\n" " average timepdgemm = %.3f, %.3f Gflops\n", purif_timepass / purif->np_purif, purif_timetr / purif->np_purif, purif_timedgemm / purif->np_purif, (it * 2.0 + 4.0) * purif_flops / (purif_timedgemm / purif->np_purif) / 1e9, purif_timepdgemm / purif->np_purif, (it * 2.0 + 4.0) * purif_flops / (purif_timepdgemm / purif->np_purif) / 1e9); } #endif } /* for (iter = 0; iter < NITERATIONS; iter++) */ if (myrank == 0) { printf(" totally takes %.3f secs: %.3f secs/iters\n", totaltime, totaltime / niters); printf(" Done\n"); } destroy_purif(purif); PFock_destroy(pfock); CInt_destroyBasisSet(basis); MPI_Finalize(); return 0; }
int main (int argc, char **argv) { if (argc != 4) { printf ("Usage: %s <basisset> <xyz>\n", argv[0]); return -1; } const uint64_t freq = get_cpu_frequency(); const int nthreads = atoi(argv[3]); /* #ifdef _OPENMP omp_set_num_threads(nthreads); #else assert(nthreads == 1); #endif */ // load basis set BasisSet_t basis; CInt_createBasisSet(&basis); CInt_loadBasisSet(basis, argv[1], argv[2]); printf("Molecule info:\n"); printf(" #Atoms\t= %d\n", CInt_getNumAtoms(basis)); printf(" #Shells\t= %d\n", CInt_getNumShells(basis)); printf(" #Funcs\t= %d\n", CInt_getNumFuncs(basis)); printf(" #OccOrb\t= %d\n", CInt_getNumOccOrb(basis)); ERD_t erd; CInt_createERD(basis, &erd, nthreads); printf("Computing Lazy Evaluation Cholesky of ERIs\n"); // reset profiler // erd_reset_profile(); int n = CInt_getNumFuncs(basis); int n2 = n * n; int n3 = n2 * n; int n4 = n3 * n; double* G_ERI; double tol = 1e-6; int max_rank = n2; //int max_rank = (1-floor(log10(tol)))*n; int rank; const uint64_t start_clock = __rdtsc(); cholERI(basis, erd, &G_ERI, tol, max_rank, &rank); const uint64_t end_clock = __rdtsc(); const uint64_t total_ticks = end_clock - start_clock; const double timepass = ((double) total_ticks) / freq; printf("Done\n"); printf("Total GigaTicks: %.3lf, freq = %.3lf GHz\n", (double) (total_ticks) * 1.0e-9, (double)freq/1.0e9); printf("Total time: %.4lf secs\n", timepass); printf("n: %d, rank: %d, 7n: %d, n2: %d\n",n,rank,7*n,n2); double* diag = (double*) malloc(n2*sizeof(double)); computeDiag(basis, erd, diag); for (int i = 0; i < n2; i++) { double aii = 0; for (int j = 0; j < rank; j++) { aii += G_ERI[i+j*n2] * G_ERI[i+j*n2]; } double abserror2 = (diag[i] - aii)*(diag[i] - aii); if (abserror2 > tol) printf("i=%d, truth=%1.2e, approx=%1.2e, error: %1.2e\n", i, diag[i], aii, abserror2); } free(diag); printf("Testing accuracy for each shell quartet\n"); double chol_time_total = 0; double CInt_time_total = 0; int nshell = CInt_getNumShells(basis); int shellIndexM, shellIndexN, 
shellIndexP, shellIndexQ; int correct = 1; for (shellIndexM = 0; shellIndexM < nshell; shellIndexM++) { for (shellIndexN = 0; shellIndexN < nshell; shellIndexN++) { for (shellIndexP = 0; shellIndexP < nshell; shellIndexP++) { for (shellIndexQ = 0; shellIndexQ < nshell; shellIndexQ++) { int dimM = CInt_getShellDim (basis, shellIndexM); int dimN = CInt_getShellDim (basis, shellIndexN); int dimP = CInt_getShellDim (basis, shellIndexP); int dimQ = CInt_getShellDim (basis, shellIndexQ); // Compute shell with Cholesky double *cholintegrals; int cholnints; const uint64_t chol_start = __rdtsc(); cholComputeShellQuartet(basis, G_ERI, rank, shellIndexM, shellIndexN, shellIndexP, shellIndexQ, &cholintegrals, &cholnints); const uint64_t chol_end = __rdtsc(); chol_time_total += ((double) chol_end - chol_start) / freq; // Compute the same shell quartet with CInt double *integrals; int nints; const uint64_t CInt_start = __rdtsc(); CInt_computeShellQuartet(basis, erd, 0, shellIndexM, shellIndexN, shellIndexP, shellIndexQ, &integrals, &nints); const uint64_t CInt_end = __rdtsc(); CInt_time_total += ((double) CInt_end - CInt_start) / freq; // Compare each integral individually for (int iM = 0; iM < dimM; iM++) { for (int iN = 0; iN < dimN; iN++) { for (int iP = 0; iP < dimP; iP++) { for (int iQ = 0; iQ < dimQ; iQ++) { int idx = iM + dimM * (iN + dimN * (iP + dimP *(iQ))); double abserror2 = (integrals[idx] - cholintegrals[idx]); abserror2 = abserror2*abserror2; if (abserror2 > tol) { correct = 0; printf("Integral does not satisfy error tolerance: error = %1.2e\n",abserror2); } } } } } free(cholintegrals); } } } } if (correct) { printf("All integrals in all shell quartets satisfy error tolerance\n"); } else { printf("Some integrals did not satisfy error tolerance\n"); } printf("Total time to eval shells from Cholesky factor: %.4lf secs\n", chol_time_total); printf("Total time to eval shells with CInt: %.4lf secs\n", CInt_time_total); printf("Total time to compute Cholesky factor and 
eval shells from Cholesky factor: %.4lf secs\n", timepass+chol_time_total); printf("Computing Structured Lazy Evaluation Cholesky of ERIs\n"); double* G_structERI; max_rank = (n*(n+1))/2; const uint64_t struct_start_clock = __rdtsc(); structcholERI(basis, erd, &G_structERI, tol, max_rank, &rank); const uint64_t struct_end_clock = __rdtsc(); const uint64_t struct_total_ticks = struct_end_clock - struct_start_clock; const double struct_timepass = ((double) struct_total_ticks) / freq; printf("Done\n"); printf("Total GigaTicks: %.3lf, freq = %.3lf GHz\n", (double) (struct_total_ticks) * 1.0e-9, (double)freq/1.0e9); printf("Total time: %.4lf secs\n", struct_timepass); double* structdiag = (double*) malloc(max_rank*sizeof(double)); computeStructDiag(basis, erd, structdiag); for (int i = 0; i < max_rank; i++) { double aii = 0; for (int j = 0; j < rank; j++) { aii += G_structERI[i+j*max_rank] * G_structERI[i+j*max_rank]; } double abserror2 = (structdiag[i] - aii)*(structdiag[i] - aii); if (abserror2 > tol) printf("i=%d, truth=%1.2e, approx=%1.2e, error: %1.2e\n", i, structdiag[i], aii, abserror2); } free(structdiag); printf("Testing accuracy for each shell quartet\n"); double struct_chol_time_total = 0; CInt_time_total = 0; correct = 1; for (shellIndexM = 0; shellIndexM < nshell; shellIndexM++) { for (shellIndexN = 0; shellIndexN < nshell; shellIndexN++) { for (shellIndexP = 0; shellIndexP < nshell; shellIndexP++) { for (shellIndexQ = 0; shellIndexQ < nshell; shellIndexQ++) { int dimM = CInt_getShellDim (basis, shellIndexM); int dimN = CInt_getShellDim (basis, shellIndexN); int dimP = CInt_getShellDim (basis, shellIndexP); int dimQ = CInt_getShellDim (basis, shellIndexQ); // Compute shell with structured Cholesky double *cholintegrals; int cholnints; const uint64_t struct_chol_start = __rdtsc(); structcholComputeShellQuartet(basis, G_structERI, rank, shellIndexM, shellIndexN, shellIndexP, shellIndexQ, &cholintegrals, &cholnints); const uint64_t struct_chol_end = __rdtsc(); 
struct_chol_time_total += ((double) struct_chol_end - struct_chol_start) / freq; // Compute the same shell quartet with CInt double *integrals; int nints; const uint64_t CInt_start = __rdtsc(); CInt_computeShellQuartet(basis, erd, 0, shellIndexM, shellIndexN, shellIndexP, shellIndexQ, &integrals, &nints); const uint64_t CInt_end = __rdtsc(); CInt_time_total += ((double) CInt_end - CInt_start) / freq; // Compare each integral individually for (int iM = 0; iM < dimM; iM++) { for (int iN = 0; iN < dimN; iN++) { for (int iP = 0; iP < dimP; iP++) { for (int iQ = 0; iQ < dimQ; iQ++) { int idx = iM + dimM * (iN + dimN * (iP + dimP *(iQ))); double abserror2 = (integrals[idx] - cholintegrals[idx]); abserror2 = abserror2*abserror2; if (abserror2 > tol) { correct = 0; printf("Integral does not satisfy error tolerance: error = %1.2e\n",abserror2); } } } } } free(cholintegrals); } } } } if (correct) { printf("All integrals in all shell quartets satisfy error tolerance\n"); } else { printf("Some integrals did not satisfy error tolerance\n"); } printf("Total time to eval shells from structured Cholesky factor: %.4lf secs\n", struct_chol_time_total); printf("Total time to eval shells with CInt: %.4lf secs\n", CInt_time_total); printf("Total time to compute Cholesky factor and eval shells from Cholesky factor: %.4lf secs\n", struct_timepass+struct_chol_time_total); free(G_structERI); free(G_ERI); return 0; }