Exemple #1
0
Fichier : scf.c Projet : sg0/gtfock
/// main for SCF
int main (int argc, char **argv)
{
    // init MPI
    int myrank;
    int nprocs;
    int provided;
#if defined (USE_ELEMENTAL)
    ElInitialize( &argc, &argv );
    ElMPICommRank( MPI_COMM_WORLD, &myrank );
    ElMPICommSize( MPI_COMM_WORLD, &nprocs );
    MPI_Query_thread(&provided);
#else
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
#endif
    if (myrank == 0)  {
        printf("MPI thread support: %s\n", MPI_THREAD_STRING(provided));
    }
#if 0
    char hostname[1024];
    gethostname (hostname, 1024);
    printf ("Rank %d of %d running on node %s\n", myrank, nprocs, hostname);
#endif

    // create basis set    
    BasisSet_t basis;
    CInt_createBasisSet(&basis);

    // input parameters and load basis set
    int nprow_fock;
    int npcol_fock;
    int nblks_fock;
    int nprow_purif;
    int nshells;
    int natoms;
    int nfunctions;
    int niters;
    if (myrank == 0) {
        if (argc != 8) {
            usage(argv[0]);
            MPI_Finalize();
            exit(0);
        }
        // init parameters
        nprow_fock = atoi(argv[3]);
        npcol_fock = atoi(argv[4]);
        nprow_purif = atoi(argv[5]);
        nblks_fock = atoi(argv[6]);
        niters = atoi(argv[7]);
        assert(nprow_fock * npcol_fock == nprocs);
        assert(nprow_purif * nprow_purif * nprow_purif  <= nprocs);
        assert(niters > 0);       
        CInt_loadBasisSet(basis, argv[1], argv[2]);
        nshells = CInt_getNumShells(basis);
        natoms = CInt_getNumAtoms(basis);
        nfunctions = CInt_getNumFuncs(basis);
        assert(nprow_fock <= nshells && npcol_fock <= nshells);
        assert(nprow_purif <= nfunctions && nprow_purif <= nfunctions);
        printf("Job information:\n");
        char *fname;
        fname = basename(argv[2]);
        printf("  molecule:  %s\n", fname);
        fname = basename(argv[1]);
        printf("  basisset:  %s\n", fname);
        printf("  charge     = %d\n", CInt_getTotalCharge(basis));
        printf("  #atoms     = %d\n", natoms);
        printf("  #shells    = %d\n", nshells);
        printf("  #functions = %d\n", nfunctions);
        printf("  fock build uses   %d (%dx%d) nodes\n",
               nprow_fock * npcol_fock, nprow_fock, npcol_fock);
        printf("  purification uses %d (%dx%dx%d) nodes\n",
               nprow_purif * nprow_purif * nprow_purif,
               nprow_purif, nprow_purif, nprow_purif);
        printf("  #tasks = %d (%dx%d)\n",
               nblks_fock * nblks_fock * nprow_fock * nprow_fock,
               nblks_fock * nprow_fock, nblks_fock * nprow_fock);
        int nthreads = omp_get_max_threads();
        printf("  #nthreads_cpu = %d\n", nthreads);   
    }
    int btmp[8];
    btmp[0] = nprow_fock;
    btmp[1] = npcol_fock;
    btmp[2] = nprow_purif;
    btmp[3] = nblks_fock;
    btmp[4] = niters;
    btmp[5] = natoms;
    btmp[6] = nshells;
    btmp[7] = nfunctions;
    MPI_Bcast(btmp, 8, MPI_INT, 0, MPI_COMM_WORLD);
    nprow_fock = btmp[0];
    npcol_fock = btmp[1];
    nprow_purif = btmp[2];
    nblks_fock = btmp[3];
    niters = btmp[4];
    natoms = btmp[5];
    nshells = btmp[6];
    nfunctions = btmp[7];

    // broadcast basis set
    void *bsbuf;
    int bsbufsize;
    if (myrank == 0) {
        CInt_packBasisSet(basis, &bsbuf, &bsbufsize);
        MPI_Bcast(&bsbufsize, 1, MPI_INT, 0, MPI_COMM_WORLD);
        MPI_Bcast(bsbuf, bsbufsize, MPI_CHAR, 0, MPI_COMM_WORLD);
    }
    else {
        MPI_Bcast(&bsbufsize, 1, MPI_INT, 0, MPI_COMM_WORLD);
        bsbuf = (void *)malloc(bsbufsize);
        assert(bsbuf != NULL);
        MPI_Bcast(bsbuf, bsbufsize, MPI_CHAR, 0, MPI_COMM_WORLD);
        CInt_unpackBasisSet(basis, bsbuf);  
        free(bsbuf);
    }

    // init PFock
    if (myrank == 0) {
        printf("Initializing pfock ...\n");
    }
    PFock_t pfock;
    PFock_create(basis, nprow_fock, npcol_fock, nblks_fock, 1e-11,
                 MAX_NUM_D, IS_SYMM, &pfock);
    if (myrank == 0) {
        double mem_cpu;
        PFock_getMemorySize(pfock, &mem_cpu);
        printf("  CPU uses %.3f MB\n", mem_cpu / 1024.0 / 1024.0);
        printf("  Done\n");
    }

    // init purif
    purif_t *purif = create_purif(basis, nprow_purif, nprow_purif, nprow_purif);
    init_oedmat(basis, pfock, purif, nprow_fock, npcol_fock);

    // compute SCF
    if (myrank == 0) {
        printf("Computing SCF ...\n");
    }
    int rowstart = purif->srow_purif;
    int rowend = purif->nrows_purif + rowstart - 1;
    int colstart = purif->scol_purif;
    int colend = purif->ncols_purif + colstart - 1;
    double energy0 = -1.0;
    double totaltime = 0.0;
    double purif_flops = 2.0 * nfunctions * nfunctions * nfunctions;
    double diis_flops;

    // set initial guess
    if (myrank == 0) {
        printf("  initialing D ...\n");
    }
    PFock_setNumDenMat(NUM_D, pfock);
    initial_guess(pfock, basis, purif->runpurif,
                  rowstart, rowend, colstart, colend,
                  purif->D_block, purif->ldx);

    MPI_Barrier(MPI_COMM_WORLD);

    // compute nuc energy
    double ene_nuc = CInt_getNucEnergy(basis);
    if (myrank == 0) {
        printf("  nuc energy = %.10f\n", ene_nuc);
    }

    MPI_Barrier(MPI_COMM_WORLD);
    
    // main loop
    double t1, t2, t3, t4;
    for (int iter = 0; iter < niters; iter++) {
        if (myrank == 0) {
            printf("  iter %d\n", iter);
        }
        t3 = MPI_Wtime();

        // fock matrix construction
        t1 = MPI_Wtime();
        fock_build(pfock, basis, purif->runpurif,
                   rowstart, rowend, colstart, colend,
                   purif->ldx, purif->D_block, purif->F_block);
        if (myrank == 0) {
            printf("After fock build \n");
        }

        // compute energy
        double energy = compute_energy(purif, purif->F_block, purif->D_block);

        t2 = MPI_Wtime();
        if (myrank == 0) {
            printf("    fock build takes %.3f secs\n", t2 - t1);
            if (iter > 0) {
                printf("    energy %.10f (%.10f), %le\n",
                       energy + ene_nuc, energy, fabs (energy - energy0));
            }
            else {
                printf("    energy %.10f (%.10f)\n", energy + ene_nuc,
                       energy);
            }
        }
        if (iter > 0 && fabs (energy - energy0) < 1e-11) {
            niters = iter + 1;
            break;
        }
        energy0 = energy;

        // compute DIIS
        t1 = MPI_Wtime();
        compute_diis(pfock, purif, purif->D_block, purif->F_block, iter);
        t2 = MPI_Wtime();

        if (myrank == 0) {
            if (iter > 1) {
                diis_flops = purif_flops * 6.0;
            } else {
                diis_flops = purif_flops * 2.0;
            }
            printf("    diis takes %.3f secs, %.3lf Gflops\n",
                   t2 - t1, diis_flops / (t2 - t1) / 1e9);
        }
        
    #ifdef __SCF_OUT__
        if (myrank == 0) {
            double outbuf[nfunctions];
            char fname[1024];
            sprintf(fname, "XFX_%d_%d.dat", nfunctions, iter);
            FILE *fp = fopen(fname, "w+");
            assert(fp != NULL);
            for (int i = 0; i < nfunctions; i++) {
                PFock_getMat(pfock, PFOCK_MAT_TYPE_F, USE_D_ID,
                             i, i, USE_D_ID, nfunctions - 1,
                             outbuf, nfunctions);
                for (int j = 0; j < nfunctions; j++) {
                    fprintf(fp, "%.10e\n", outbuf[j]);
                }
            }
            fclose(fp);
        }
    #endif
    
        // purification
        MPI_Barrier(MPI_COMM_WORLD);
        t1 = MPI_Wtime();
        int it = compute_purification(purif, purif->F_block, purif->D_block);
        t2 = MPI_Wtime();
        MPI_Barrier(MPI_COMM_WORLD);
        if (myrank == 0) {
            printf("    purification takes %.3f secs,"
                   " %d iterations, %.3f Gflops\n",
                   t2 - t1, it,
                   (it * 2.0 + 4.0) * purif_flops / (t2 - t1) / 1e9);
        }
	/*
#if defined(USE_ELEMENTAL)
    ElGlobalArraysPrint_d( eldga, pfock->ga_D[USE_D_ID] );
#else
    GA_Print (pfock->ga_D[USE_D_ID]);
#endif
*/
        t4 = MPI_Wtime ();
        totaltime += t4 - t3;

#ifdef __SCF_TIMING__
        PFock_getStatistics(pfock);
        double purif_timedgemm;
        double purif_timepdgemm;
        double purif_timepass;
        double purif_timetr;
        MPI_Reduce(&purif->timedgemm, &purif_timedgemm,
                   1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        MPI_Reduce(&purif->timepdgemm, &purif_timepdgemm,
                   1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        MPI_Reduce(&purif->timepass, &purif_timepass,
                   1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        MPI_Reduce(&purif->timetr, &purif_timetr,
                   1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        if (myrank == 0) {
            printf("    Purification Statistics:\n");
            printf("      average totaltime  = %.3f\n"
                   "      average timetr     = %.3f\n"
                   "      average timedgemm  = %.3f, %.3f Gflops\n"
                   "      average timepdgemm = %.3f, %.3f Gflops\n",
                   purif_timepass / purif->np_purif,
                   purif_timetr / purif->np_purif,
                   purif_timedgemm / purif->np_purif,
                   (it * 2.0 + 4.0) *
                   purif_flops / (purif_timedgemm / purif->np_purif) / 1e9,
                   purif_timepdgemm / purif->np_purif,
                   (it * 2.0 + 4.0) *
                   purif_flops / (purif_timepdgemm / purif->np_purif) / 1e9);
        }
#endif
    } /* for (iter = 0; iter < NITERATIONS; iter++) */

    if (myrank == 0) {
        printf("  totally takes %.3f secs: %.3f secs/iters\n",
               totaltime, totaltime / niters);
        printf("  Done\n");
    }

    destroy_purif(purif);
    PFock_destroy(pfock);
    CInt_destroyBasisSet(basis);

    MPI_Finalize();

    return 0;
}
Exemple #2
0
/*
 * Compare the diagonal reconstructed from a factor G against reference values.
 *
 * G is read as G[i + j * nrows] for i in [0, nrows) and j in [0, rank); the
 * reconstructed diagonal entry i is the sum over j of G[i + j * nrows]^2.
 * Prints one line for every entry whose squared difference from diag[i]
 * exceeds tol.  Indexing is done in size_t so that nrows * rank cannot
 * overflow int.
 */
static void report_diag_errors(const double *G, const double *diag,
                               int nrows, int rank, double tol)
{
  for (int i = 0; i < nrows; i++) {
    double aii = 0;
    for (int j = 0; j < rank; j++) {
      const double g = G[(size_t) i + (size_t) j * (size_t) nrows];
      aii += g * g;
    }
    const double abserror2 = (diag[i] - aii) * (diag[i] - aii);
    if (abserror2 > tol) {
      printf("i=%d, truth=%1.2e, approx=%1.2e, error: %1.2e\n", i, diag[i], aii, abserror2);
    }
  }
}

/*
 * Compare one shell quartet's reference integrals against an approximation.
 *
 * Both arrays are indexed as iM + dimM * (iN + dimN * (iP + dimP * iQ)).
 * Prints one line for every entry whose squared difference exceeds tol.
 * Returns 1 when every entry is within tolerance, 0 otherwise.
 */
static int quartet_within_tol(const double *ref, const double *approx,
                              int dimM, int dimN, int dimP, int dimQ,
                              double tol)
{
  int ok = 1;
  for (int iM = 0; iM < dimM; iM++) {
    for (int iN = 0; iN < dimN; iN++) {
      for (int iP = 0; iP < dimP; iP++) {
        for (int iQ = 0; iQ < dimQ; iQ++) {
          const int idx = iM + dimM * (iN + dimN * (iP + dimP * (iQ)));
          const double diff = ref[idx] - approx[idx];
          const double abserror2 = diff * diff;
          if (abserror2 > tol) {
            ok = 0;
            printf("Integral does not satisfy error tolerance: error = %1.2e\n", abserror2);
          }
        }
      }
    }
  }
  return ok;
}

/*
 * Accuracy/timing test for the Cholesky factorizations of the ERIs.
 *
 * Command line: <basisset> <xyz> <nthreads>.
 * Runs cholERI and then structcholERI; for each factor it checks the
 * reconstructed diagonal and compares every shell quartet evaluated from
 * the factor against CInt_computeShellQuartet, using squared error > tol
 * (tol = 1e-6) as the failure criterion.  Timings are taken with __rdtsc
 * and converted to seconds with get_cpu_frequency().
 *
 * Returns 0 on completion, -1 on a usage error or allocation failure.
 */
int main (int argc, char **argv)
{
  if (argc != 4) {
    printf ("Usage: %s <basisset> <xyz> <nthreads>\n", argv[0]);
    return -1;
  }

  const uint64_t freq = get_cpu_frequency();
  // nthreads is only forwarded to CInt_createERD below.
  const int nthreads = atoi(argv[3]);

  // load basis set
  BasisSet_t basis;
  CInt_createBasisSet(&basis);
  CInt_loadBasisSet(basis, argv[1], argv[2]);

  printf("Molecule info:\n");
  printf("  #Atoms\t= %d\n", CInt_getNumAtoms(basis));
  printf("  #Shells\t= %d\n", CInt_getNumShells(basis));
  printf("  #Funcs\t= %d\n", CInt_getNumFuncs(basis));
  printf("  #OccOrb\t= %d\n", CInt_getNumOccOrb(basis));

  ERD_t erd;
  CInt_createERD(basis, &erd, nthreads);

  printf("Computing Lazy Evaluation Cholesky of ERIs\n");

  int n = CInt_getNumFuncs(basis);
  int n2 = n * n;

  double *G_ERI;
  double tol = 1e-6;
  int max_rank = n2;
  int rank;
  const uint64_t start_clock = __rdtsc();
  cholERI(basis, erd, &G_ERI, tol, max_rank, &rank);
  const uint64_t end_clock = __rdtsc();
  const uint64_t total_ticks = end_clock - start_clock;
  const double timepass = ((double) total_ticks) / freq;
  printf("Done\n");
  printf("Total GigaTicks: %.3lf, freq = %.3lf GHz\n", (double) (total_ticks) * 1.0e-9, (double)freq/1.0e9);
  printf("Total time: %.4lf secs\n", timepass);

  printf("n: %d, rank: %d, 7n: %d, n2: %d\n",n,rank,7*n,n2);
  double *diag = malloc((size_t) n2 * sizeof *diag);
  if (diag == NULL) {
    fprintf(stderr, "Failed to allocate %d doubles for the diagonal\n", n2);
    free(G_ERI);
    return -1;
  }
  computeDiag(basis, erd, diag);
  report_diag_errors(G_ERI, diag, n2, rank, tol);
  free(diag);

  printf("Testing accuracy for each shell quartet\n");
  double chol_time_total = 0;
  double CInt_time_total = 0;
  int nshell = CInt_getNumShells(basis);
  int shellIndexM, shellIndexN, shellIndexP, shellIndexQ;
  int correct = 1;
  for (shellIndexM = 0; shellIndexM < nshell; shellIndexM++) {
    for (shellIndexN = 0; shellIndexN < nshell; shellIndexN++) {
      for (shellIndexP = 0; shellIndexP < nshell; shellIndexP++) {
        for (shellIndexQ = 0; shellIndexQ < nshell; shellIndexQ++) {
          int dimM = CInt_getShellDim (basis, shellIndexM);
          int dimN = CInt_getShellDim (basis, shellIndexN);
          int dimP = CInt_getShellDim (basis, shellIndexP);
          int dimQ = CInt_getShellDim (basis, shellIndexQ);

          // Compute shell with Cholesky
          double *cholintegrals;
          int cholnints;
          const uint64_t chol_start = __rdtsc();
          cholComputeShellQuartet(basis, G_ERI, rank, shellIndexM, shellIndexN, shellIndexP, shellIndexQ, &cholintegrals, &cholnints);
          const uint64_t chol_end = __rdtsc();
          // subtract in uint64_t first so large tick counts keep full precision
          chol_time_total += (double) (chol_end - chol_start) / freq;

          // Compute the same shell quartet with CInt
          double *integrals;
          int nints;
          const uint64_t CInt_start = __rdtsc();
          CInt_computeShellQuartet(basis, erd, 0, shellIndexM, shellIndexN, shellIndexP, shellIndexQ, &integrals, &nints);
          const uint64_t CInt_end = __rdtsc();
          CInt_time_total += (double) (CInt_end - CInt_start) / freq;

          // Compare each integral individually
          if (!quartet_within_tol(integrals, cholintegrals, dimM, dimN, dimP, dimQ, tol)) {
            correct = 0;
          }
          free(cholintegrals);
        }
      }
    }
  }
  if (correct) {
    printf("All integrals in all shell quartets satisfy error tolerance\n");
  } else {
    printf("Some integrals did not satisfy error tolerance\n");
  }
  printf("Total time to eval shells from Cholesky factor: %.4lf secs\n", chol_time_total);
  printf("Total time to eval shells with CInt: %.4lf secs\n", CInt_time_total);
  printf("Total time to compute Cholesky factor and eval shells from Cholesky factor: %.4lf secs\n", timepass+chol_time_total);

  printf("Computing Structured Lazy Evaluation Cholesky of ERIs\n");
  double *G_structERI;
  max_rank = (n*(n+1))/2;
  const uint64_t struct_start_clock = __rdtsc();
  structcholERI(basis, erd, &G_structERI, tol, max_rank, &rank);
  const uint64_t struct_end_clock = __rdtsc();
  const uint64_t struct_total_ticks = struct_end_clock - struct_start_clock;
  const double struct_timepass = ((double) struct_total_ticks) / freq;
  printf("Done\n");
  printf("Total GigaTicks: %.3lf, freq = %.3lf GHz\n", (double) (struct_total_ticks) * 1.0e-9, (double)freq/1.0e9);
  printf("Total time: %.4lf secs\n", struct_timepass);

  double *structdiag = malloc((size_t) max_rank * sizeof *structdiag);
  if (structdiag == NULL) {
    fprintf(stderr, "Failed to allocate %d doubles for the structured diagonal\n", max_rank);
    free(G_structERI);
    free(G_ERI);
    return -1;
  }
  computeStructDiag(basis, erd, structdiag);
  report_diag_errors(G_structERI, structdiag, max_rank, rank, tol);
  free(structdiag);

  printf("Testing accuracy for each shell quartet\n");
  double struct_chol_time_total = 0;
  CInt_time_total = 0;
  correct = 1;
  for (shellIndexM = 0; shellIndexM < nshell; shellIndexM++) {
    for (shellIndexN = 0; shellIndexN < nshell; shellIndexN++) {
      for (shellIndexP = 0; shellIndexP < nshell; shellIndexP++) {
        for (shellIndexQ = 0; shellIndexQ < nshell; shellIndexQ++) {
          int dimM = CInt_getShellDim (basis, shellIndexM);
          int dimN = CInt_getShellDim (basis, shellIndexN);
          int dimP = CInt_getShellDim (basis, shellIndexP);
          int dimQ = CInt_getShellDim (basis, shellIndexQ);

          // Compute shell with structured Cholesky
          double *cholintegrals;
          int cholnints;
          const uint64_t struct_chol_start = __rdtsc();
          structcholComputeShellQuartet(basis, G_structERI, rank, shellIndexM, shellIndexN, shellIndexP, shellIndexQ, &cholintegrals, &cholnints);
          const uint64_t struct_chol_end = __rdtsc();
          struct_chol_time_total += (double) (struct_chol_end - struct_chol_start) / freq;

          // Compute the same shell quartet with CInt
          double *integrals;
          int nints;
          const uint64_t CInt_start = __rdtsc();
          CInt_computeShellQuartet(basis, erd, 0, shellIndexM, shellIndexN, shellIndexP, shellIndexQ, &integrals, &nints);
          const uint64_t CInt_end = __rdtsc();
          CInt_time_total += (double) (CInt_end - CInt_start) / freq;

          // Compare each integral individually
          if (!quartet_within_tol(integrals, cholintegrals, dimM, dimN, dimP, dimQ, tol)) {
            correct = 0;
          }
          free(cholintegrals);
        }
      }
    }
  }
  if (correct) {
    printf("All integrals in all shell quartets satisfy error tolerance\n");
  } else {
    printf("Some integrals did not satisfy error tolerance\n");
  }
  printf("Total time to eval shells from structured Cholesky factor: %.4lf secs\n", struct_chol_time_total);
  printf("Total time to eval shells with CInt: %.4lf secs\n", CInt_time_total);
  printf("Total time to compute Cholesky factor and eval shells from Cholesky factor: %.4lf secs\n", struct_timepass+struct_chol_time_total);

  free(G_structERI);
  free(G_ERI);
  return 0;
}