/*
 * Build the shell-pair screening tables stored in `pfock`.
 *
 * Phase 1: for every shell pair (M, N) in the local block
 * [sshell_row, eshell_row] x [sshell_col, eshell_col], take the largest
 * absolute value among the (iM,iN|iM,iN) entries of the (M N|M N) quartet
 * and write it into the temporary global array pfock->ga_screening.
 * pfock->maxvalue receives the MPI_MAX reduction of the per-process maxima
 * over MPI_COMM_WORLD.
 *
 * Phase 2: read the global array one row at a time and keep the pairs whose
 * value exceeds eta = tolscr2 / maxvalue, skipping (A, B) when
 * (A > B and A + B is odd) or (A < B and A + B is even).  The survivors are
 * stored in pfock->shellptr (row offsets, nshells + 1 entries),
 * pfock->shellid (column B), pfock->shellrid (row A) and pfock->shellvalue
 * (the value itself for A == B, its negation otherwise); pfock->nnz is the
 * number of survivors.
 *
 * Returns 0 on success, -1 if the global array cannot be created or an
 * allocation fails.  The scratch buffer is released on every exit path.
 *
 * NOTE(review): pfock->ga_screening is still not destroyed on the failure
 * paths; GA_Destroy is presumably a collective call, so adding it to a
 * single-rank error return needs confirming against the GA documentation.
 */
int schwartz_screening(PFock_t pfock, BasisSet_t basis)
{
    int nshells = pfock->nshells;

    // create global arrays for screening
    int nprow = pfock->nprow;
    int npcol = pfock->npcol;
    int dims[2];
    int block[2];
    int map[nprow + npcol];
    for (int i = 0; i < nprow; i++) {
        map[i] = pfock->rowptr_sh[i];
    }
    for (int i = 0; i < npcol; i++) {
        map[i + nprow] = pfock->colptr_sh[i];
    }
    dims[0] = nshells;
    dims[1] = nshells;
    block[0] = nprow;
    block[1] = npcol;
    pfock->ga_screening =
        NGA_Create_irreg(C_DBL, 2, dims, "array Screening", block, map);
    if (0 == pfock->ga_screening) {
        return -1;
    }

    // compute the max shell value
    double *sq_values = (double *)PFOCK_MALLOC(sizeof(double) *
        pfock->nshells_row * pfock->nshells_col);
    if (NULL == sq_values) {
        return -1;
    }
    int startM = pfock->sshell_row;
    int startN = pfock->sshell_col;
    int endM = pfock->eshell_row;
    int endN = pfock->eshell_col;
    double maxtmp = 0.0;
    #pragma omp parallel
    {
        int tid = omp_get_thread_num();
        #pragma omp for reduction(max:maxtmp)
        for (int M = startM; M <= endM; M++) {
            int dimM = CInt_getShellDim(basis, M);
            for (int N = startN; N <= endN; N++) {
                int dimN = CInt_getShellDim(basis, N);
                double *integrals;
                int nints = ComputeShellQuartet(basis, tid, M, N, M, N,
                                                &integrals);
                double maxvalue = 0.0;
                if (nints != 0) {
                    for (int iM = 0; iM < dimM; iM++) {
                        for (int iN = 0; iN < dimN; iN++) {
                            // offset of element (iM, iN, iM, iN) in the
                            // dimM x dimN x dimM x dimN quartet block
                            int index = iM * (dimN * dimM * dimN + dimN) +
                                        iN * (dimM * dimN + 1);
                            if (maxvalue < fabs(integrals[index])) {
                                maxvalue = fabs(integrals[index]);
                            }
                        }
                    }
                }
                sq_values[(M - startM) * (endN - startN + 1) + (N - startN)] =
                    maxvalue;
                if (maxvalue > maxtmp) {
                    maxtmp = maxvalue;
                }
            }
        }
    }
    int lo[2];
    int hi[2];
    lo[0] = startM;
    hi[0] = endM;
    lo[1] = startN;
    hi[1] = endN;
    int ld = endN - startN + 1;
    NGA_Put(pfock->ga_screening, lo, hi, sq_values, &ld);
    // max value
    MPI_Allreduce(&maxtmp, &(pfock->maxvalue), 1,
                  MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
    PFOCK_FREE(sq_values);

    // init shellptr
    sq_values = (double *)PFOCK_MALLOC(sizeof(double) * nshells);
    if (NULL == sq_values) {
        return -1;
    }
    int nnz = 0;
    double eta = pfock->tolscr2 / pfock->maxvalue;
    pfock->shellptr = (int *)PFOCK_MALLOC(sizeof(int) * (nshells + 1));
    if (NULL == pfock->shellptr) {
        // the row buffer is local to this function: do not leak it
        PFOCK_FREE(sq_values);
        return -1;
    }
    // account for the memory only once the allocation is known to exist
    pfock->mem_cpu += 1.0 * sizeof(int) * (nshells + 1);
    memset(pfock->shellptr, 0, sizeof(int) * (nshells + 1));

    // first pass: count the surviving pairs of every row
    for (int M = 0; M < nshells; M++) {
        pfock->shellptr[M] = nnz;
        lo[0] = M;
        hi[0] = M;
        lo[1] = 0;
        hi[1] = nshells - 1;
        ld = nshells;
        NGA_Get(pfock->ga_screening, lo, hi, sq_values, &ld);
        for (int N = 0; N < nshells; N++) {
            double maxvalue = sq_values[N];
            if (maxvalue > eta) {
                if ((M > N && (M + N) % 2 == 1) ||
                    (M < N && (M + N) % 2 == 0)) {
                    continue;
                } else {
                    nnz++;
                }
            }
        }
        pfock->shellptr[M + 1] = nnz;
    }
    pfock->nnz = nnz;

    pfock->shellvalue = (double *)PFOCK_MALLOC(sizeof(double) * nnz);
    pfock->shellid = (int *)PFOCK_MALLOC(sizeof(int) * nnz);
    pfock->shellrid = (int *)PFOCK_MALLOC(sizeof(int) * nnz);
    pfock->mem_cpu += 1.0 * sizeof(double) * nnz + 2.0 * sizeof(int) * nnz;
    if (pfock->shellvalue == NULL ||
        pfock->shellid == NULL ||
        pfock->shellrid == NULL) {
        // the three tables stay reachable through pfock; only the local
        // row buffer has to be released here
        PFOCK_FREE(sq_values);
        return -1;
    }

    // second pass: same filter as above, now recording the survivors
    nnz = 0;
    for (int A = 0; A < nshells; A++) {
        pfock->shellptr[A] = nnz;
        lo[0] = A;
        hi[0] = A;
        lo[1] = 0;
        hi[1] = nshells - 1;
        ld = nshells;
        NGA_Get(pfock->ga_screening, lo, hi, sq_values, &ld);
        for (int B = 0; B < nshells; B++) {
            double maxvalue = sq_values[B];
            if (maxvalue > eta) {
                if ((A > B && (A + B) % 2 == 1) ||
                    (A < B && (A + B) % 2 == 0)) {
                    continue;
                }
                if (A == B) {
                    pfock->shellvalue[nnz] = maxvalue;
                } else {
                    pfock->shellvalue[nnz] = -maxvalue;
                }
                pfock->shellid[nnz] = B;
                pfock->shellrid[nnz] = A;
                nnz++;
            }
        }
    }
    PFOCK_FREE(sq_values);
    GA_Destroy(pfock->ga_screening);

    return 0;
}
int main (int argc, char **argv) { if (argc != 4) { printf ("Usage: %s <basisset> <xyz>\n", argv[0]); return -1; } const uint64_t freq = get_cpu_frequency(); const int nthreads = atoi(argv[3]); /* #ifdef _OPENMP omp_set_num_threads(nthreads); #else assert(nthreads == 1); #endif */ // load basis set BasisSet_t basis; CInt_createBasisSet(&basis); CInt_loadBasisSet(basis, argv[1], argv[2]); printf("Molecule info:\n"); printf(" #Atoms\t= %d\n", CInt_getNumAtoms(basis)); printf(" #Shells\t= %d\n", CInt_getNumShells(basis)); printf(" #Funcs\t= %d\n", CInt_getNumFuncs(basis)); printf(" #OccOrb\t= %d\n", CInt_getNumOccOrb(basis)); ERD_t erd; CInt_createERD(basis, &erd, nthreads); printf("Computing Lazy Evaluation Cholesky of ERIs\n"); // reset profiler // erd_reset_profile(); int n = CInt_getNumFuncs(basis); int n2 = n * n; int n3 = n2 * n; int n4 = n3 * n; double* G_ERI; double tol = 1e-6; int max_rank = n2; //int max_rank = (1-floor(log10(tol)))*n; int rank; const uint64_t start_clock = __rdtsc(); cholERI(basis, erd, &G_ERI, tol, max_rank, &rank); const uint64_t end_clock = __rdtsc(); const uint64_t total_ticks = end_clock - start_clock; const double timepass = ((double) total_ticks) / freq; printf("Done\n"); printf("Total GigaTicks: %.3lf, freq = %.3lf GHz\n", (double) (total_ticks) * 1.0e-9, (double)freq/1.0e9); printf("Total time: %.4lf secs\n", timepass); printf("n: %d, rank: %d, 7n: %d, n2: %d\n",n,rank,7*n,n2); double* diag = (double*) malloc(n2*sizeof(double)); computeDiag(basis, erd, diag); for (int i = 0; i < n2; i++) { double aii = 0; for (int j = 0; j < rank; j++) { aii += G_ERI[i+j*n2] * G_ERI[i+j*n2]; } double abserror2 = (diag[i] - aii)*(diag[i] - aii); if (abserror2 > tol) printf("i=%d, truth=%1.2e, approx=%1.2e, error: %1.2e\n", i, diag[i], aii, abserror2); } free(diag); printf("Testing accuracy for each shell quartet\n"); double chol_time_total = 0; double CInt_time_total = 0; int nshell = CInt_getNumShells(basis); int shellIndexM, shellIndexN, 
shellIndexP, shellIndexQ; int correct = 1; for (shellIndexM = 0; shellIndexM < nshell; shellIndexM++) { for (shellIndexN = 0; shellIndexN < nshell; shellIndexN++) { for (shellIndexP = 0; shellIndexP < nshell; shellIndexP++) { for (shellIndexQ = 0; shellIndexQ < nshell; shellIndexQ++) { int dimM = CInt_getShellDim (basis, shellIndexM); int dimN = CInt_getShellDim (basis, shellIndexN); int dimP = CInt_getShellDim (basis, shellIndexP); int dimQ = CInt_getShellDim (basis, shellIndexQ); // Compute shell with Cholesky double *cholintegrals; int cholnints; const uint64_t chol_start = __rdtsc(); cholComputeShellQuartet(basis, G_ERI, rank, shellIndexM, shellIndexN, shellIndexP, shellIndexQ, &cholintegrals, &cholnints); const uint64_t chol_end = __rdtsc(); chol_time_total += ((double) chol_end - chol_start) / freq; // Compute the same shell quartet with CInt double *integrals; int nints; const uint64_t CInt_start = __rdtsc(); CInt_computeShellQuartet(basis, erd, 0, shellIndexM, shellIndexN, shellIndexP, shellIndexQ, &integrals, &nints); const uint64_t CInt_end = __rdtsc(); CInt_time_total += ((double) CInt_end - CInt_start) / freq; // Compare each integral individually for (int iM = 0; iM < dimM; iM++) { for (int iN = 0; iN < dimN; iN++) { for (int iP = 0; iP < dimP; iP++) { for (int iQ = 0; iQ < dimQ; iQ++) { int idx = iM + dimM * (iN + dimN * (iP + dimP *(iQ))); double abserror2 = (integrals[idx] - cholintegrals[idx]); abserror2 = abserror2*abserror2; if (abserror2 > tol) { correct = 0; printf("Integral does not satisfy error tolerance: error = %1.2e\n",abserror2); } } } } } free(cholintegrals); } } } } if (correct) { printf("All integrals in all shell quartets satisfy error tolerance\n"); } else { printf("Some integrals did not satisfy error tolerance\n"); } printf("Total time to eval shells from Cholesky factor: %.4lf secs\n", chol_time_total); printf("Total time to eval shells with CInt: %.4lf secs\n", CInt_time_total); printf("Total time to compute Cholesky factor and 
eval shells from Cholesky factor: %.4lf secs\n", timepass+chol_time_total); printf("Computing Structured Lazy Evaluation Cholesky of ERIs\n"); double* G_structERI; max_rank = (n*(n+1))/2; const uint64_t struct_start_clock = __rdtsc(); structcholERI(basis, erd, &G_structERI, tol, max_rank, &rank); const uint64_t struct_end_clock = __rdtsc(); const uint64_t struct_total_ticks = struct_end_clock - struct_start_clock; const double struct_timepass = ((double) struct_total_ticks) / freq; printf("Done\n"); printf("Total GigaTicks: %.3lf, freq = %.3lf GHz\n", (double) (struct_total_ticks) * 1.0e-9, (double)freq/1.0e9); printf("Total time: %.4lf secs\n", struct_timepass); double* structdiag = (double*) malloc(max_rank*sizeof(double)); computeStructDiag(basis, erd, structdiag); for (int i = 0; i < max_rank; i++) { double aii = 0; for (int j = 0; j < rank; j++) { aii += G_structERI[i+j*max_rank] * G_structERI[i+j*max_rank]; } double abserror2 = (structdiag[i] - aii)*(structdiag[i] - aii); if (abserror2 > tol) printf("i=%d, truth=%1.2e, approx=%1.2e, error: %1.2e\n", i, structdiag[i], aii, abserror2); } free(structdiag); printf("Testing accuracy for each shell quartet\n"); double struct_chol_time_total = 0; CInt_time_total = 0; correct = 1; for (shellIndexM = 0; shellIndexM < nshell; shellIndexM++) { for (shellIndexN = 0; shellIndexN < nshell; shellIndexN++) { for (shellIndexP = 0; shellIndexP < nshell; shellIndexP++) { for (shellIndexQ = 0; shellIndexQ < nshell; shellIndexQ++) { int dimM = CInt_getShellDim (basis, shellIndexM); int dimN = CInt_getShellDim (basis, shellIndexN); int dimP = CInt_getShellDim (basis, shellIndexP); int dimQ = CInt_getShellDim (basis, shellIndexQ); // Compute shell with structured Cholesky double *cholintegrals; int cholnints; const uint64_t struct_chol_start = __rdtsc(); structcholComputeShellQuartet(basis, G_structERI, rank, shellIndexM, shellIndexN, shellIndexP, shellIndexQ, &cholintegrals, &cholnints); const uint64_t struct_chol_end = __rdtsc(); 
struct_chol_time_total += ((double) struct_chol_end - struct_chol_start) / freq; // Compute the same shell quartet with CInt double *integrals; int nints; const uint64_t CInt_start = __rdtsc(); CInt_computeShellQuartet(basis, erd, 0, shellIndexM, shellIndexN, shellIndexP, shellIndexQ, &integrals, &nints); const uint64_t CInt_end = __rdtsc(); CInt_time_total += ((double) CInt_end - CInt_start) / freq; // Compare each integral individually for (int iM = 0; iM < dimM; iM++) { for (int iN = 0; iN < dimN; iN++) { for (int iP = 0; iP < dimP; iP++) { for (int iQ = 0; iQ < dimQ; iQ++) { int idx = iM + dimM * (iN + dimN * (iP + dimP *(iQ))); double abserror2 = (integrals[idx] - cholintegrals[idx]); abserror2 = abserror2*abserror2; if (abserror2 > tol) { correct = 0; printf("Integral does not satisfy error tolerance: error = %1.2e\n",abserror2); } } } } } free(cholintegrals); } } } } if (correct) { printf("All integrals in all shell quartets satisfy error tolerance\n"); } else { printf("Some integrals did not satisfy error tolerance\n"); } printf("Total time to eval shells from structured Cholesky factor: %.4lf secs\n", struct_chol_time_total); printf("Total time to eval shells with CInt: %.4lf secs\n", CInt_time_total); printf("Total time to compute Cholesky factor and eval shells from Cholesky factor: %.4lf secs\n", struct_timepass+struct_chol_time_total); free(G_structERI); free(G_ERI); return 0; }