int main()
{
    double *A, *B, *C;
    int i,j,r,max_threads,size;
    double alpha, beta;
    double s_initial, s_elapsed;
    
    printf("Intializing data for matrix multiplication C=A*B for matrix\n\n"
            " A(%i*%i) and matrix B(%i*%i)\n",M,P,P,N);
    alpha = 1.0;
    beta = 0.0;

    printf("Allocating memory for matrices aligned on 64-byte boundary for better performance \n\n");
    A = ( double *)mkl_malloc(M*P*sizeof( double ),64);
    B = ( double *)mkl_malloc(N*P*sizeof( double ),64);
    C = ( double *)mkl_malloc(M*N*sizeof( double ),64);
    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: can`t allocate memory for matrices.\n\n");
        mkl_free(A);
        mkl_free(B);
        mkl_free(C);
        return 1;
    }

    printf("Intializing matrix data\n\n");
    size = M*P;
    for (i = 0; i < size; ++i)
    {
        A[i] = ( double )(i+1);
    }
    size = N*P;
    for (i = 0; i < size; ++i)
    {
        B[i] = ( double )(i-1);
    }

    printf("Finding max number of threads can use for parallel runs \n\n");
    max_threads = mkl_get_max_threads();

    printf("Running from 1 to %i threads \n\n",max_threads);
    for (i = 1; i <= max_threads; ++i)
    {
        size = M*N;
        for (j = 0; j < size; ++j)
        {
            C[j] = 0.0;
        }

	    printf("Requesting to use %i threads \n\n",i); 
	    mkl_set_num_threads(i);

	    printf("Measuring performance of matrix product using dgemm function\n"
		    " via CBLAS interface on %i threads \n\n",i);
	    s_initial = dsecnd();
	    for (r = 0; r < LOOP_COUNT; ++r)
	    {
    		cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, P, alpha, A, P, B, N, beta, C, N);
            // multiply matrices with cblas_dgemm;
	    }
	    s_elapsed = (dsecnd() - s_initial) / LOOP_COUNT;

	    printf("Matrix multiplication using dgemm completed \n"
		    " at %.5f milliseconds using %d threads \n\n",
		    (s_elapsed * 1000),i);
        printf("Output the result: \n");
        size = M*N;
        for (i = 0; i < size; ++i)
        {
            printf("%i\t",(int)C[i]);
            if (i % N == N - 1)
                printf("\n");
        }
    }

    printf("Dellocating memory\n");
    mkl_free(A);
    mkl_free(B);
    mkl_free(C);

    return 0;
}
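The benchmark above relies on macros and headers defined elsewhere in the original listing. A minimal preamble that would make it self-contained might look like the following; the matrix sizes and LOOP_COUNT are illustrative assumptions, not values taken from the original.

/* Hedged preamble sketch: sizes and repetition count are illustrative only. */
#include <stdio.h>
#include "mkl.h"        /* mkl_malloc, mkl_free, cblas_dgemm, dsecnd, mkl_*_threads */

#define M 2000          /* rows of A and C */
#define P 200           /* columns of A, rows of B */
#define N 1000          /* columns of B and C */
#define LOOP_COUNT 10   /* repetitions averaged per timing measurement */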
DISSECTION_API void DISS_N_FACT(uint64_t &dslv_,
				const double *coefs,
				const int &scaling,
				const double &eps_pivot,
				const int &indefinite_flag)
{
  // scaling = 0 : without scaling
  //           1 : 1/sqrt(a_ii) or 1/sqrt(max|a_ij|)
  //           2 : 1/sqrt(a_ii) or Schur complement corresponding to diagonal
  //               kernel_detection_all (for KKT type)
  // eps_pivot = 1.0e-2 : threshold of pivot, ratio of contiguous diagonal
  //                      entries with absolute value
  // indefinite_flag = 1 : indefinite -> kernel_detection_all = false
  // indefinite_flag = 0 : semi-definite -> kernel_detection_all = true

  bool kernel_detection_all = (indefinite_flag == 0);
    
  //  FILE *fout;
  dissection_solver_ptr *dslv = (dissection_solver_ptr *)dslv_;

#ifdef BLAS_MKL
  mkl_set_num_threads(1);
#endif
#ifdef VECLIB
  setenv("VECLIB_MAXIMUM_THREADS", "1", true);
#endif
  //  dslv->NumericFree(); // for debugging : 20 Nov.2013
  if (dslv->quad_fact) {
  }
  else {
    switch (dslv->real_or_complex) {
    case DISSECTION_REAL_MATRIX:
      dslv->rptr->NumericFact(dslv->numeric,
                              (double *)coefs, scaling,
                              eps_pivot,
                              kernel_detection_all);
      if (dslv->rptr->getFactorized() == false) {
        dslv->rptr->SaveMMMatrix(dslv->called, coefs);
      }
      break;
    case DISSECTION_COMPLEX_MATRIX:
      dslv->cptr->NumericFact(dslv->numeric,
                              (complex<double> *)coefs, scaling,
                              eps_pivot,
                              kernel_detection_all);
      break;
    }
  }
#ifdef BLAS_MKL
  mkl_set_num_threads(dslv->mkl_num_threads);
#endif
  if (dslv->verbose > 0) {
    fprintf(dslv->fp,
	    "%s %d : Dissection::NumericFact done : %d\n",
	    __FILE__, __LINE__, dslv->numeric);
  }
  dslv->numeric++;
}
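The comment block at the top documents what scaling, eps_pivot and indefinite_flag mean. A hedged call sketch, assuming the solver handle and the coefficient array have already been produced by the library's initialization and assembly routines (not shown in this excerpt):

// Hypothetical caller fragment -- dslv_handle and coefs come from earlier
// Dissection set-up calls that are outside this excerpt.
int scaling = 1;             // 1 : diagonal scaling, 1/sqrt(a_ii) or 1/sqrt(max|a_ij|)
double eps_pivot = 1.0e-2;   // pivot threshold (ratio of contiguous diagonal entries)
int indefinite_flag = 0;     // 0 : semi-definite -> kernel_detection_all = true
DISS_N_FACT(dslv_handle, coefs, scaling, eps_pivot, indefinite_flag);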
Example #3
MKL_INT PardisoSolver::FactorMatrix(const SparseMatrix * A)
{
  if (directIterative)
    return 0;

  mkl_set_num_threads(numThreads);

  // compute the factorization
  if (verbose >= 1)
    printf("Factoring the %d x %d matrix (%d threads)...\n", n, n, numThreads);

  int upperTriangleOnly = 1;
  int oneIndexed = 1;

  if ((mtype == REAL_SPD) || (mtype == REAL_SYM_INDEFINITE))  // matrix is symmetric
    upperTriangleOnly = 1;    // symmetric matrix can only be represented by its upper triangle elements
  else  // structural symmetric or unsymmetric
    upperTriangleOnly = 0;    // unsymmetric matrix must store all its elements
  A->GenerateCompressedRowMajorFormat(a, NULL, NULL, upperTriangleOnly, oneIndexed);

  // factor 
  phase = 22;
  PARDISO(pt, &maxfct, &mnum, (MKL_INT*)&mtype, &phase, &n, a, ia, ja, NULL, &nrhs, iparm, &msglvl, NULL, NULL,  &error);

  if (error != 0)
    printf("Error: Pardiso Cholesky decomposition returned non-zero exit code %d.\n", error);

  if (verbose >= 1)
    printf("Factorization completed.\n");

  return error;
}
Example #4
int setup_threads(int n_threads)
{
    #ifdef _OPENMP
    if (n_threads)
        omp_set_num_threads(n_threads);
    n_threads = omp_get_max_threads();
    if (n_threads > 1)
        cerr << "Using " << n_threads << " threads" << endl;

    Eigen::initParallel();
    Eigen::setNbThreads(n_threads);

    #ifdef MKL_SINGLE
    // Set the threading layer to match the compiler.
    // This lets MKL automatically go single-threaded in parallel regions.
    #ifdef __INTEL_COMPILER
    mkl_set_threading_layer(MKL_THREADING_INTEL);
    #elif defined __GNUC__
    mkl_set_threading_layer(MKL_THREADING_GNU);
    #endif
    mkl_set_num_threads(n_threads);
    #endif
    #endif

    return n_threads;
}
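A hedged usage sketch for setup_threads(): passing 0 keeps the OpenMP default, while a positive value requests that many threads; the function returns the count actually in effect.

#include <cstdlib>

int main(int argc, char **argv)
{
    // 0 means "keep the OpenMP default"; any positive value overrides it.
    int requested = (argc > 1) ? std::atoi(argv[1]) : 0;
    int n_threads = setup_threads(requested);
    // ... run the Eigen / MKL workload knowing n_threads threads are available ...
    (void)n_threads;
    return 0;
}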
Example #5
MKL_INT PardisoSolver::SolveLinearSystemDirectIterative(const SparseMatrix * A, double * x, const double * rhs)
{
  if (directIterative != 1)
  {
    printf("Error: direct-iterative flag was not specified in the constructor.\n");
    return 102;
  }

  mkl_set_num_threads(numThreads); 

  if (verbose >= 2)
    printf("Solving linear system...(%d threads, direct-iterative)\n", numThreads);

  int upperTriangleOnly = 1;
  int oneIndexed = 1;
  A->GenerateCompressedRowMajorFormat(a, NULL, NULL, upperTriangleOnly, oneIndexed);

  phase = 23;
  PARDISO(pt, &maxfct, &mnum, (MKL_INT*)&mtype, &phase, &n, a, ia, ja, NULL, &nrhs, iparm, &msglvl, (double*)rhs, x, &error);

  if (error != 0)
    printf("Error: Pardiso solve returned non-zero exit code %d.\n", error);

  if (verbose >= 2)
    printf("Solve completed.\n"); 

  return error;
}
Example #6
PetscErrorCode MatMkl_PardisoSetCntl_MKL_PARDISO(Mat F,PetscInt icntl,PetscInt ival) {
    Mat_MKL_PARDISO *mat_mkl_pardiso =(Mat_MKL_PARDISO*)F->spptr;
    PetscFunctionBegin;
    if(icntl <= 64) {
        mat_mkl_pardiso->iparm[icntl - 1] = ival;
    } else {
        if(icntl == 65)
            mkl_set_num_threads((int)ival);
        else if(icntl == 66)
            mat_mkl_pardiso->maxfct = ival;
        else if(icntl == 67)
            mat_mkl_pardiso->mnum = ival;
        else if(icntl == 68)
            mat_mkl_pardiso->msglvl = ival;
        else if(icntl == 69) {
            int pt[IPARM_SIZE];
            mat_mkl_pardiso->mtype = ival;
            MKL_PARDISO_INIT(&pt, &mat_mkl_pardiso->mtype, mat_mkl_pardiso->iparm);
#if defined(PETSC_USE_REAL_SINGLE)
            mat_mkl_pardiso->iparm[27] = 1;
#else
            mat_mkl_pardiso->iparm[27] = 0;
#endif
            mat_mkl_pardiso->iparm[34] = 1;
        }
    }
    PetscFunctionReturn(0);
}
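In PETSc this helper is normally reached through the public MatMkl_PardisoSetCntl() wrapper. A hedged sketch of forcing the MKL thread count via icntl 65 on a factor matrix F obtained from MatGetFactor() with MATSOLVERMKL_PARDISO:

/* Hedged sketch: F is assumed to be a Mat created with
   MatGetFactor(A, MATSOLVERMKL_PARDISO, MAT_FACTOR_LU, &F). */
PetscErrorCode ierr;
ierr = MatMkl_PardisoSetCntl(F, 65, 4);  /* icntl 65 -> mkl_set_num_threads(4) */
CHKERRQ(ierr);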
Example #7
MKL_INT PardisoSolver::BackwardSubstitution(double * x, const double * y)
{
  if (directIterative != 0)
  {
    printf("Error: direct-iterative flag was specified in the constructor (must use SolveLinearSystemDirectIterative routine).\n");
    return 101;
  }

  mkl_set_num_threads(numThreads); 

  if (verbose >= 2)
    printf("Performing forward substitution...(%d threads)\n", numThreads);

  int maxIterRefinementSteps = iparm[7];
  iparm[7] = 0;

  phase = 333;
  PARDISO(pt, &maxfct, &mnum, (MKL_INT*)&mtype, &phase, &n, a, ia, ja, NULL, &nrhs, iparm, &msglvl, (double*)y, x, &error);

  iparm[7] = maxIterRefinementSteps;

  if (error != 0)
    printf("Error: Pardiso solve returned non-zero exit code %d.\n", error);

  if (verbose >= 2)
    printf("Solve completed.\n"); 

  return error;
}
Example #8
	void computeDenseMatrixMatrixMultiplication(int _m, int _n, int _k, const double *_A, const double *_B, double *_C, const bool _transposeA, const bool _multByScalar, const double _alpha, const bool _addPrevious, const bool _useIntelSmall){

#ifdef USE_INTEL_MKL_BLAS
			CBLAS_TRANSPOSE transposeA;
			int ka, nb;
			if (!_transposeA) {
				transposeA = CblasNoTrans;
				ka = _k;
				//	nb = _n;
			}
			else {
				transposeA = CblasTrans;
				ka = _m;
				//	nb = _n;
			}
			double alpha;
			if (!_multByScalar) {
				alpha = 1.0;
			}
			else {
				alpha = _alpha;
			}
			double beta;
			if (!_addPrevious) {
				beta = 0.0;
			}
			else {
				beta = 1.0;
			}
			mkl_set_num_threads(STACCATO::AuxiliaryParameters::denseVectorMatrixThreads);
			cblas_dgemm(CblasRowMajor, transposeA, CblasNoTrans, _m, _n, _k, alpha, _A, ka, _B, _n, beta, _C, _n);
#endif
#ifndef USE_INTEL_MKL_BLAS
			assert(_A != NULL);
			assert(_B != NULL);
			for (int i = 0; i < _m; i++) {
				for (int j = 0; j < _n; j++) {
					double sum = 0.0;
					for (int l = 0; l < _k; l++) {
						if (!_transposeA) {
							sum += _A[i * _k + l] * _B[l * _n + j];
						}
						if (_transposeA) {
							sum += _A[l * _m + i] * _B[l * _n + j];
						}
					}
					if (_multByScalar) {
						sum = _alpha*sum;
					}
					if (_addPrevious) {
						_C[i*_n + j] += sum;
					}
					if (!_addPrevious) {
						_C[i*_n + j] = sum;
					}
				}
			}
#endif
	}
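A hedged call sketch for the routine above, multiplying a 2x3 by a 3x2 row-major matrix with no transpose, no scaling, and no accumulation (values are illustrative):

const int m = 2, k = 3, n = 2;
double A[m * k] = { 1, 2, 3,
                    4, 5, 6 };
double B[k * n] = { 1, 0,
                    0, 1,
                    1, 1 };
double C[m * n] = { 0 };
computeDenseMatrixMatrixMultiplication(m, n, k, A, B, C,
                                       /*_transposeA=*/false,
                                       /*_multByScalar=*/false, /*_alpha=*/1.0,
                                       /*_addPrevious=*/false, /*_useIntelSmall=*/false);
// C is now { 4, 5, 10, 11 } in row-major order.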
DisableThreadingInBlock::~DisableThreadingInBlock() {
#if defined(HAVE_MKL_H)
    mkl_set_num_threads(mklNumThreads);
#endif
#ifdef _OPENMP
    omp_set_num_threads(ompNumThreads);
#endif
#ifdef OPENBLAS_DISABLE_THREADS
    openblas_set_num_threads(openblasNumThreads);
#endif
}
Example #10
void magma_set_lapack_numthreads(magma_int_t threads)
{
    if ( threads < 1 ) {
        return;
    }

#if defined(MAGMA_WITH_MKL)
    mkl_set_num_threads( threads );
#elif defined(_OPENMP)
    omp_set_num_threads( threads );
#endif
}
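A hedged sketch of the save/restore pattern the other examples follow around a sequential section, assuming magma_get_lapack_numthreads() is the matching query routine:

magma_int_t saved = magma_get_lapack_numthreads();
magma_set_lapack_numthreads( 1 );      // run the next section with a single-threaded BLAS/LAPACK
// ... CPU section that must not oversubscribe the cores driving the GPU ...
magma_set_lapack_numthreads( saved );  // restore the previous thread count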
Example #11
int main (int argc, char *argv[])
{
#ifdef _DIST_
    CnC::dist_cnc_init< cholesky_context > dc_init;
#endif
    int n;
    int b;
    dist_type dt = BLOCKED_ROWS;
    const char *fname = NULL;
    const char *oname = NULL;
    const char *mname = NULL;
    int argi;

    // Command line: cholesky n b filename [out-file]
    if (argc < 3 || argc > 7) {
        fprintf(stderr, "Incorrect number of arguments, epxected N BS [-i infile] [-o outfile] [-w mfile] [-dt disttype]\n");
        return -1;
    }
    argi = 1;
    n = atol(argv[argi++]);
    b = atol(argv[argi++]);
    while( argi < argc ) {
        if( ! strcmp( argv[argi], "-o" ) ) oname = argv[++argi];
        else if( ! strcmp( argv[argi], "-i" ) ) fname = argv[++argi];
        else if( ! strcmp( argv[argi], "-w" ) ) mname = argv[++argi];
        else if( ! strcmp( argv[argi], "-dt" ) ) dt = static_cast< dist_type >( atoi( argv[++argi] ) );
        ++argi;
    }

#ifdef USE_MKL
    if( mname == NULL ) {
        mkl_set_num_threads( 1 );
        omp_set_num_threads( 1 );
    }
#endif

    if(n % b != 0) {
        fprintf(stderr, "The tile size is not compatible with the given matrix\n");
        exit(0);
    }

    double * A = new double[n*n];

    matrix_init( A, n, fname );
    if( mname ) matrix_write( A, n, mname );
    else cholesky(A, n, b, oname, dt);     

    delete [] A;

    return 0;
}
void ReducedMassSpringSystemForceModel::GetTangentStiffnessMatrixHelper(
  double * tangentStiffnessMatrix)
{
  // evaluate stiffness matrix
  //PerformanceCounter counter;
  massSpringSystem->ComputeStiffnessMatrix(u, sparseMatrix);
  //counter.StopCounter();
  //printf("counter: %G\n", counter.GetElapsedTime());

  // project matrix
  #if USE_MKL_SPARSE_BLAS
    mkl_set_num_threads(8);
    //PerformanceCounter counter;

    int upperTriangleOnly=1;
    int oneIndexed=1;
    sparseMatrix->GenerateCompressedRowMajorFormat_four_array(csr_values, csr_columns, csr_pointerB, csr_pointerE, upperTriangleOnly, oneIndexed); 

    char transa = 'N';
    int m = sparseMatrix->GetNumRows();
    int n = r;
    int k = m;
    double alpha = 1.0;
    char matdescra[7] = "SUNFXX";
    double * val = csr_values;
    int * indx = csr_columns;
    int * pntrb = csr_pointerB;
    int * pntre = csr_pointerE;
    double * b = U;
    int ldb = m;
    double beta = 0.0;
    double * c = bufferMatrix;
    int ldc = m;
    mkl_dcsrmm(&transa, &m, &n, &k, &alpha, matdescra, val, indx, pntrb, pntre,
      b, &ldb, &beta, c, &ldc);

    //counter.StopCounter();
    //printf("counter: %G\n", counter.GetElapsedTime());
  #else
    for(int i=0; i<r; i++)
      sparseMatrix->MultiplyVector(&U[ELT(3*n,0,i)], &bufferMatrix[ELT(3*n,0,i)]);
  #endif

  modalMatrix->ProjectMatrix(r, bufferMatrix, tangentStiffnessMatrix);

  int r2 = r*r;
  for(int i=0; i<r2; i++)
    tangentStiffnessMatrix[i] *= -1;
}
DisableThreadingInBlock::DisableThreadingInBlock()
  : mklNumThreads(1)
  , ompNumThreads(1)
  , openblasNumThreads(1)
{
#if defined(HAVE_MKL_H)
    mklNumThreads = mkl_get_max_threads();
    mkl_set_num_threads(1);
#endif
#ifdef _OPENMP
    ompNumThreads = omp_get_max_threads();
    omp_set_num_threads(1);
#endif
#ifdef OPENBLAS_DISABLE_THREADS
    openblasNumThreads = goto_get_num_procs();
    openblas_set_num_threads(1);
#endif
    // Silence compiler warnings about unused private members
    (void) mklNumThreads;
    (void) ompNumThreads;
    (void) openblasNumThreads;
}
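Together with the destructor shown earlier, this constructor makes DisableThreadingInBlock a scope guard: thread counts are saved and dropped to one on entry, and restored automatically on exit. A hedged usage sketch (someSingleThreadedSection() is a placeholder, not part of the original code):

void callIntoThreadSensitiveLibrary()
{
    DisableThreadingInBlock guard;  // MKL / OpenMP / OpenBLAS forced to 1 thread
    someSingleThreadedSection();    // placeholder for work that must not spawn BLAS threads
}                                   // guard goes out of scope: previous thread counts restored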
Example #14
MKL_INT PardisoSolver::SolveLinearSystemMultipleRHS(double * x, const double * rhs, int numRHS)
{
  if (directIterative != 0)
  {
    printf("Error: direct-iterative flag was specified in the constructor (must use SolveLinearSystemDirectIterative routine).\n");
    return 101;
  }

  if (verbose >= 2)
    printf("Solving linear system...(%d threads)\n", numThreads);

  mkl_set_num_threads(numThreads); 

  phase = 33;
  PARDISO(pt, &maxfct, &mnum, (MKL_INT*)&mtype, &phase, &n, a, ia, ja, NULL, &numRHS, iparm, &msglvl, (double*)rhs, x, &error);

  if (error != 0)
    printf("Error: Pardiso solve returned non-zero exit code %d.\n", error);

  if (verbose >= 2)
    printf("Solve completed.\n");

  return error;
}
Example #15
extern "C" magma_int_t magma_sbulge_back(magma_int_t threads, char uplo, magma_int_t n, magma_int_t nb, magma_int_t ne, magma_int_t Vblksiz,
                                         float *Z, magma_int_t ldz, float *dZ, magma_int_t lddz,
                                         float *V, magma_int_t ldv, float *TAU, float *T, magma_int_t ldt, magma_int_t* info)
{
    magma_int_t mklth = threads;
    
    float timeaplQ2=0.0;
    
#if defined(USEMKL)
        mkl_set_num_threads(1);
#endif
#if defined(USEACML)
        omp_set_num_threads(1);
#endif
    
            float f= 1.;
            magma_int_t n_gpu = ne;
            
            if(threads>40){
                f = 0.5;
                n_gpu = (magma_int_t)(f*ne)/64*64;
            }
            else if(threads>10){
#if (defined(PRECISION_s) || defined(PRECISION_d))
                f = 0.68;
#else
                f = 0.72;
#endif
                n_gpu = (magma_int_t)(f*ne)/64*64;
            }
            else if(threads>5){
#if (defined(PRECISION_s) || defined(PRECISION_d))
                f = 0.82;
#else
                f = 0.86;
#endif
                n_gpu = (magma_int_t)(f*ne)/64*64;
            }            
            else if(threads>1){
#if (defined(PRECISION_s) || defined(PRECISION_d))
                f = 0.96;
#else
                f = 0.96;
#endif
                n_gpu = (magma_int_t)(f*ne)/64*64;
            }
            
            /****************************************************
             *  apply V2 from left to the eigenvectors Z. dZ = (I-V2*T2*V2')*Z
             * **************************************************/

            timeaplQ2 = magma_wtime();
            
            /*============================
             *  use GPU+CPU's
             *==========================*/  
            
            if(n_gpu < ne)
            {
                
                // define the size of Q to be done on CPU's and the size on GPU's
                // note that GPU use Q(1:N_GPU) and CPU use Q(N_GPU+1:N)

                printf("---> calling GPU + CPU(if N_CPU>0) to apply V2 to Z with NE %d     N_GPU %d   N_CPU %d\n",ne, n_gpu, ne-n_gpu); 
                
                magma_sapplyQ_data data_applyQ(threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt, dZ, lddz);
                
                magma_sapplyQ_id_data* arg = new magma_sapplyQ_id_data[threads];
                pthread_t* thread_id = new pthread_t[threads];
                
                pthread_attr_t thread_attr;
                
                // ===============================
                // relaunch thread to apply Q
                // ===============================
                // Set one thread per core
                pthread_attr_init(&thread_attr);
                pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM);
                pthread_setconcurrency(threads);
                
                // Launch threads
                for (magma_int_t thread = 1; thread < threads; thread++)
                {
                    arg[thread] = magma_sapplyQ_id_data(thread, &data_applyQ);
                    pthread_create(&thread_id[thread], &thread_attr, magma_sapplyQ_parallel_section, &arg[thread]);
                }
                arg[0] = magma_sapplyQ_id_data(0, &data_applyQ);
                magma_sapplyQ_parallel_section(&arg[0]);
                
                // Wait for completion
                for (magma_int_t thread = 1; thread < threads; thread++)
                {
                    void *exitcodep;
                    pthread_join(thread_id[thread], &exitcodep);
                }
                
                delete[] thread_id;
                delete[] arg;
                
                magma_ssetmatrix(n, ne-n_gpu, Z + n_gpu*ldz, ldz, dZ + n_gpu*ldz, lddz);
                
                /*============================
                 *  use only GPU
                 *==========================*/  
            }else{
                magma_ssetmatrix(n, ne, Z, ldz, dZ, lddz);
                magma_sbulge_applyQ_v2('L', ne, n, nb, Vblksiz, dZ, lddz, V, ldv, T, ldt, info);
                magma_device_sync();
            }

            timeaplQ2 = magma_wtime()-timeaplQ2;
            
#if defined(USEMKL)
        mkl_set_num_threads(mklth);
#endif
#if defined(USEACML)
        omp_set_num_threads(mklth);
#endif
    
    return MAGMA_SUCCESS;
}
Example #16
PetscErrorCode PetscSetMKL_PARDISOFromOptions(Mat F, Mat A) {
    Mat_MKL_PARDISO     *mat_mkl_pardiso = (Mat_MKL_PARDISO*)F->spptr;
    PetscErrorCode      ierr;
    PetscInt            icntl;
    PetscBool           flg;
    int                 pt[IPARM_SIZE], threads;

    PetscFunctionBegin;
    ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MKL_PARDISO Options","Mat");
    CHKERRQ(ierr);
    ierr = PetscOptionsInt("-mat_mkl_pardiso_65",
                           "Number of thrads to use",
                           "None",
                           threads,
                           &threads,
                           &flg);
    CHKERRQ(ierr);
    if (flg) mkl_set_num_threads(threads);

    ierr = PetscOptionsInt("-mat_mkl_pardiso_66",
                           "Maximum number of factors with identical sparsity structure that must be kept in memory at the same time",
                           "None",
                           mat_mkl_pardiso->maxfct,
                           &icntl,
                           &flg);
    CHKERRQ(ierr);
    if (flg) mat_mkl_pardiso->maxfct = icntl;

    ierr = PetscOptionsInt("-mat_mkl_pardiso_67",
                           "Indicates the actual matrix for the solution phase",
                           "None",
                           mat_mkl_pardiso->mnum,
                           &icntl,
                           &flg);
    CHKERRQ(ierr);
    if (flg) mat_mkl_pardiso->mnum = icntl;

    ierr = PetscOptionsInt("-mat_mkl_pardiso_68",
                           "Message level information",
                           "None",
                           mat_mkl_pardiso->msglvl,
                           &icntl,
                           &flg);
    CHKERRQ(ierr);
    if (flg) mat_mkl_pardiso->msglvl = icntl;

    ierr = PetscOptionsInt("-mat_mkl_pardiso_69",
                           "Defines the matrix type",
                           "None",
                           mat_mkl_pardiso->mtype,
                           &icntl,
                           &flg);
    CHKERRQ(ierr);
    if(flg) {
        mat_mkl_pardiso->mtype = icntl;
        MKL_PARDISO_INIT(&pt, &mat_mkl_pardiso->mtype, mat_mkl_pardiso->iparm);
#if defined(PETSC_USE_REAL_SINGLE)
        mat_mkl_pardiso->iparm[27] = 1;
#else
        mat_mkl_pardiso->iparm[27] = 0;
#endif
        mat_mkl_pardiso->iparm[34] = 1;
    }
    ierr = PetscOptionsInt("-mat_mkl_pardiso_1",
                           "Use default values",
                           "None",
                           mat_mkl_pardiso->iparm[0],
                           &icntl,
                           &flg);
    CHKERRQ(ierr);

    if(flg && icntl != 0) {
        ierr = PetscOptionsInt("-mat_mkl_pardiso_2",
                               "Fill-in reducing ordering for the input matrix",
                               "None",
                               mat_mkl_pardiso->iparm[1],
                               &icntl,
                               &flg);
        CHKERRQ(ierr);
        if (flg) mat_mkl_pardiso->iparm[1] = icntl;

        ierr = PetscOptionsInt("-mat_mkl_pardiso_4",
                               "Preconditioned CGS/CG",
                               "None",
                               mat_mkl_pardiso->iparm[3],
                               &icntl,
                               &flg);
        CHKERRQ(ierr);
        if (flg) mat_mkl_pardiso->iparm[3] = icntl;

        ierr = PetscOptionsInt("-mat_mkl_pardiso_5",
                               "User permutation",
                               "None",
                               mat_mkl_pardiso->iparm[4],
                               &icntl,
                               &flg);
        CHKERRQ(ierr);
        if (flg) mat_mkl_pardiso->iparm[4] = icntl;

        ierr = PetscOptionsInt("-mat_mkl_pardiso_6",
                               "Write solution on x",
                               "None",
                               mat_mkl_pardiso->iparm[5],
                               &icntl,
                               &flg);
        CHKERRQ(ierr);
        if (flg) mat_mkl_pardiso->iparm[5] = icntl;

        ierr = PetscOptionsInt("-mat_mkl_pardiso_8",
                               "Iterative refinement step",
                               "None",
                               mat_mkl_pardiso->iparm[7],
                               &icntl,
                               &flg);
        CHKERRQ(ierr);
        if (flg) mat_mkl_pardiso->iparm[7] = icntl;

        ierr = PetscOptionsInt("-mat_mkl_pardiso_10",
                               "Pivoting perturbation",
                               "None",
                               mat_mkl_pardiso->iparm[9],
                               &icntl,
                               &flg);
        CHKERRQ(ierr);
        if (flg) mat_mkl_pardiso->iparm[9] = icntl;

        ierr = PetscOptionsInt("-mat_mkl_pardiso_11",
                               "Scaling vectors",
                               "None",
                               mat_mkl_pardiso->iparm[10],
                               &icntl,
                               &flg);
        CHKERRQ(ierr);
        if (flg) mat_mkl_pardiso->iparm[10] = icntl;

        ierr = PetscOptionsInt("-mat_mkl_pardiso_12",
                               "Solve with transposed or conjugate transposed matrix A",
                               "None",
                               mat_mkl_pardiso->iparm[11],
                               &icntl,
                               &flg);
        CHKERRQ(ierr);
        if (flg) mat_mkl_pardiso->iparm[11] = icntl;

        ierr = PetscOptionsInt("-mat_mkl_pardiso_13",
                               "Improved accuracy using (non-) symmetric weighted matching",
                               "None",
                               mat_mkl_pardiso->iparm[12],
                               &icntl,
                               &flg);
        CHKERRQ(ierr);
        if (flg) mat_mkl_pardiso->iparm[12] = icntl;

        ierr = PetscOptionsInt("-mat_mkl_pardiso_18",
                               "Numbers of non-zero elements",
                               "None",
                               mat_mkl_pardiso->iparm[17],
                               &icntl,
                               &flg);
        CHKERRQ(ierr);
        if (flg) mat_mkl_pardiso->iparm[17] = icntl;

        ierr = PetscOptionsInt("-mat_mkl_pardiso_19",
                               "Report number of floating point operations",
                               "None",
                               mat_mkl_pardiso->iparm[18],
                               &icntl,
                               &flg);
        CHKERRQ(ierr);
        if (flg) mat_mkl_pardiso->iparm[18] = icntl;

        ierr = PetscOptionsInt("-mat_mkl_pardiso_21",
                               "Pivoting for symmetric indefinite matrices",
                               "None",
                               mat_mkl_pardiso->iparm[20],
                               &icntl,
                               &flg);
        CHKERRQ(ierr);
        if (flg) mat_mkl_pardiso->iparm[20] = icntl;

        ierr = PetscOptionsInt("-mat_mkl_pardiso_24",
                               "Parallel factorization control",
                               "None",
                               mat_mkl_pardiso->iparm[23],
                               &icntl,
                               &flg);
        CHKERRQ(ierr);
        if (flg) mat_mkl_pardiso->iparm[23] = icntl;

        ierr = PetscOptionsInt("-mat_mkl_pardiso_25",
                               "Parallel forward/backward solve control",
                               "None",
                               mat_mkl_pardiso->iparm[24],
                               &icntl,
                               &flg);
        CHKERRQ(ierr);
        if (flg) mat_mkl_pardiso->iparm[24] = icntl;

        ierr = PetscOptionsInt("-mat_mkl_pardiso_27",
                               "Matrix checker",
                               "None",
                               mat_mkl_pardiso->iparm[26],
                               &icntl,
                               &flg);
        CHKERRQ(ierr);
        if (flg) mat_mkl_pardiso->iparm[26] = icntl;

        ierr = PetscOptionsInt("-mat_mkl_pardiso_31",
                               "Partial solve and computing selected components of the solution vectors",
                               "None",
                               mat_mkl_pardiso->iparm[30],
                               &icntl,
                               &flg);
        CHKERRQ(ierr);
        if (flg) mat_mkl_pardiso->iparm[30] = icntl;

        ierr = PetscOptionsInt("-mat_mkl_pardiso_34",
                               "Optimal number of threads for conditional numerical reproducibility (CNR) mode",
                               "None",
                               mat_mkl_pardiso->iparm[33],
                               &icntl,
                               &flg);
        CHKERRQ(ierr);
        if (flg) mat_mkl_pardiso->iparm[33] = icntl;

        ierr = PetscOptionsInt("-mat_mkl_pardiso_60",
                               "Intel MKL_PARDISO mode",
                               "None",
                               mat_mkl_pardiso->iparm[59],
                               &icntl,
                               &flg);
        CHKERRQ(ierr);
        if (flg) mat_mkl_pardiso->iparm[59] = icntl;
    }

    PetscOptionsEnd();
    PetscFunctionReturn(0);
}
Example #17
PardisoSolver::PardisoSolver(const SparseMatrix * A, int numThreads_, matrixType mtype_, reorderingType rtype_, int directIterative_, int verbose_): numThreads(numThreads_), mtype(mtype_), rtype(rtype_), directIterative(directIterative_), verbose(verbose_)
{
  mkl_set_num_threads(numThreads);

  n = A->Getn();

  if (verbose >= 1)
    printf("Converting matrix to Pardiso format...\n");

  int numEntries;
  int upperTriangleOnly;  
  if ((mtype == REAL_SPD) || (mtype == REAL_SYM_INDEFINITE))  // matrix is symmetric
  {
    numEntries = A->GetNumUpperTriangleEntries();
    upperTriangleOnly = 1;
  }
  else  
  {
    // structural symmetric or unsymmetric
    numEntries = A->GetNumEntries();
    upperTriangleOnly = 0;
  }
  a = (double*) malloc (sizeof(double) * numEntries);  
  ia = (int*) malloc (sizeof(int) * (A->GetNumRows() + 1));  
  ja = (int*) malloc (sizeof(int) * numEntries);  
  int oneIndexed = 1;
  A->GenerateCompressedRowMajorFormat(a, ia, ja, upperTriangleOnly, oneIndexed);

  if (verbose >= 2)
    printf("numEntries: %d\n", numEntries);

  // permute & do symbolic factorization

  nrhs = 1; // Number of right hand sides.
  maxfct = 1; // Maximum number of numerical factorizations.
  mnum = 1; // Which factorization to use.
  msglvl = verbose >= 1 ? verbose - 1 : 0; // Print statistical information to the output file
  error = 0; // Initialize error flag

  for (int i = 0; i < 64; i++) 
    iparm[i] = 0;

  iparm[0] = 1; // Do not use the solver default values (use custom values, provided below)
  iparm[1] = rtype; // matrix re-ordering algorithm
  iparm[2] = 0; // unused 

  // use iterative-direct algorithm if requested
  if (directIterative)
  {
    if (mtype == REAL_SPD)  
      iparm[3] = 62; // matrix is symmetric positive-definite; use CGS iteration for symmetric positive-definite matrices
    else
      iparm[3] = 61; // use CGS iteration
  }
  else
    iparm[3] = 0; 

  iparm[4] = 0; // No user fill-in reducing permutation
  iparm[5] = 0; // Write solution into x
  iparm[6] = 0; // Output: number of iterative refinement steps performed
  iparm[7] = 0; // Max numbers of iterative refinement steps (used during the solving stage). Value of 0 (default) means: The solver automatically performs two steps of iterative refinement when perturbed pivots are obtained during the numerical factorization.
  iparm[8] = 0; // Reserved. Must set to 0.

  // Pivoting perturbation; the values below are Pardiso's default values
  // Pivoting only applies to REAL_UNSYM and REAL_SYM_INDEFINITE
  if (mtype == REAL_UNSYM)
    iparm[9] = 13; // For non-symmetric matrices, perturb the pivot elements with 1E-13
  else 
    iparm[9] = 8; // Use 1.0E-8 for symmetric indefinite matrices
  
  // Scaling and matching. The following below are the Pardiso defaults.
  if (mtype == REAL_UNSYM)  // unsymmetric matrices
  {
    iparm[10] = 1; // enable scaling
    iparm[12] = 1; // enable matching
  }
  else
  {
    iparm[10] = 0; // disable scaling
    iparm[12] = 0; // disable matching
  }

  iparm[11] = 0; // Solve with transposed or conjugate transposed matrix A. Not in use here.

  iparm[13] = 0; // Output: Number of perturbed pivots
  iparm[14] = 0; // Output: Peak memory on symbolic factorization (in KB)
  iparm[15] = 0; // Output: Permanent memory on symbolic factorization (in KB)
  iparm[16] = 0; // Output: Size of factors/Peak memory on numerical factorization and solution (in KB)
  
  iparm[17] = -1; // Output: Report the number of non-zero elements in the factors.
  iparm[18] = 0; // Report number of floating point operations (in 10^6 floating point operations) that are necessary to factor the matrix A. Disabled.
  iparm[19] = 0; // Output: Report CG/CGS diagnostics.
  iparm[20] = 1; // Pivoting for symmetric indefinite matrices: Apply 1x1 and 2x2 Bunch-Kaufman pivoting during the factorization process.
  iparm[21] = 0; // Output: Inertia: number of positive eigenvalues.
  iparm[22] = 0; // Output: Inertia: number of negative eigenvalues.

  iparm[23] = 0; // Parallel factorization control. Use default.
  iparm[24] = 0; // Parallel forward/backward solve control. Intel MKL PARDISO uses a parallel algorithm for the solve step.

  // the other iparms (above 24) are left at 0

  /* -------------------------------------------------------------------- *\
     .. Initialize the internal solver memory pointer. This is only 
     necessary for the FIRST call of the PARDISO solver. 
  \* -------------------------------------------------------------------- */
  for (int i=0; i<64; i++) 
    pt[i] = 0;

  if (verbose >= 1)
    printf("Reordering and symbolically factorizing the matrix...\n");

  /* -------------------------------------------------------------------- *\
     .. Reordering and Symbolic Factorization. This step also allocates 
     all memory that is necessary for the factorization.
  \* -------------------------------------------------------------------- */
  phase = 11;
  PARDISO (pt, &maxfct, &mnum, (MKL_INT*)&mtype, &phase,
    &n, a, ia, ja, NULL, &nrhs,
    iparm, &msglvl, NULL, NULL, &error);

  if (error != 0)
  {
    printf("Error: Pardiso matrix re-ordering/symbolic factorization returned non-zero exit code %d.\n", error);
    throw error;
  }

  if (verbose >= 2)
  {
    printf("\nReordering and symbolic factorization completed...\n");
    printf("Number of nonzeros in factors = %d\n", iparm[17]);
    printf("Number of factorization MFLOPS = %d\n", iparm[18]);
  }
}
Example #18
// X: a MxD matrix, Y: a M vector, W: a M vector
// W0: a M vector
int main(int argc, char ** argv){
    if (argc>1 && argv[1][0]=='h') {
        printf ("Usage: parSymSGD M D T C lamda r\n");
        printf ("  M: number of data points, D: dimensions, T: time iterations, C: cores;\n");
        printf ("  lamda: learning rate, r: panel size in unit of C.\n");
        return 1;
    }u
    // read in the arguments: M, D, I (time iterations), C (cores), r (each panel contains r*C points)
    int M = argc>1?atoi(argv[1]):32;
    int D = argc>2?atoi(argv[2]):4;
    T = argc>3?atoi(argv[3]):10;
    int C = argc>4?atoi(argv[4]):4;
    float lamda = argc>5?atof(argv[5]):0.01;
    int r = argc>6?atoi(argv[6]):1;
    ///printf("M=%d, D=%d, T=%d, C=%d, lamda=%8.6f, r=%d\n",M,D,T,C,lamda,r);

    int max_threads = mkl_get_max_threads(); // get the max number of threads
	
    int rep;
    mkl_set_num_threads(1); // set the number of threads to use by mkl
    panelSz = C*r;
    panels = M/panelSz;

    int i,j,k,p,t;
    float *Y, *Wreal, *W, *X;
    Y = (float *) mkl_malloc(M*sizeof(float),PAGESIZE);
    Wreal = (float *) mkl_malloc(D*sizeof(float),PAGESIZE);
    W = (float *) mkl_malloc(D*sizeof(float),PAGESIZE);
    X = (float *) mkl_malloc(M*D*sizeof(float),PAGESIZE);
    float *Ypred = (float*)mkl_malloc(M*sizeof(float),PAGESIZE);
    float *Ytmp = (float*)mkl_malloc(M*sizeof(float),PAGESIZE);
	float *I = (float*)mkl_malloc(D*D*sizeof(float),PAGESIZE);
    float *Z = (float*)mkl_malloc(M*D*sizeof(float),PAGESIZE);
    float *B = (float*)mkl_malloc(panels*D*sizeof(float),PAGESIZE);

    if (Y==NULL || Wreal==NULL || W==NULL || X==NULL || Ypred==NULL || Ytmp==NULL || Z==NULL || B==NULL || I==NULL){
        printf("Memory allocation error.\n");
        return 2;
    }

    initData(Wreal,W,X,Y, M, D,I);

    ///printf("panelSz=%d, panels=%d\n", panelSz, panels);

    for (nt=1; nt<=max_threads && nt<=panelSz; nt*=2){
        omp_set_num_threads(nt);// set the number of openMP threads

        for (rep=0; rep<REPEATS; rep++){//repeat measurements
            double prepTime, gdTime, sInit;
            // preprocessing
            sInit=dsecnd();
            //preprocessSeq(X, Y, Z, B, panelSz, panels, M, D, lamda);
            preprocessPar(X, Y, Z, B, panelSz, panels, M, D, lamda);
            prepTime = (dsecnd() - sInit);
            ///dump2("Z",Z,M,D);
            ///dump2("B",B,panels,D);

            // GD
            initW(W,D);
            ///dump1("W (initial)", W, D);
            sInit=dsecnd();
            float err;
            float fixpoint = 0.0;
            for (t=0;t<T;t++){
                for (p=0;p<panels;p++){
                    gd(&(X[p*panelSz*D]),&(Z[p*panelSz*D]), &(B[p*D]), panelSz, D, lamda, W, I);
                    ///printf("(t=%d, p=%d) ",t,p);
                    ///dump1("W", W, D);
                    ///err=calErr(X, Ypred, Ytmp, Y, W, M, D);
                  printf("finish  one  panels     ............................  \n");
                }
            }
            gdTime = (dsecnd() - sInit);

            err=calErr(X, Ypred, Ytmp, Y, W, M, D);
            fixpoint = err - prev_err;
            

            // print final err. time is in milliseconds
            printf("nt=%d\t ttlTime=%.5f\t prepTime=%.5f\t gdTime=%.5f\t error=%.5f\n", nt, (gdTime+prepTime)*1000, prepTime*1000, gdTime*1000, err);
        }
    }
    if (B) mkl_free(B);
    if (Z) mkl_free(Z);
    if (Ytmp) mkl_free(Ytmp);
    if (Ypred) mkl_free(Ypred);
    if (Y) mkl_free(Y);
    if (Wreal) mkl_free(Wreal);
    if (W) mkl_free(W);
    if (X) mkl_free(X);
	if (I) mkl_free(I);
    return 0;
}
Example #19
void matmul_set_num_threads(size_t count) {
#ifdef WITH_MKL
	mkl_set_num_threads(count);
#endif
//	omp_set_num_threads(count);
}
int main(int argc, char const *argv[]) {
  Eigen::setNbThreads(NumCores);
#ifdef MKL
  mkl_set_num_threads(NumCores);
#endif
  INFO("Eigen3 uses " << Eigen::nbThreads() << " threads.");
  int L;
  RealType J12ratio;
  int OBC;
  int N;
  RealType Uin, phi;
  std::vector<RealType> Vin;
  LoadParameters( "conf.h5", L, J12ratio, OBC, N, Uin, Vin, phi);
  HDF5IO file("BSSH.h5");
  // const int L = 5;
  // const bool OBC = true;
  // const RealType J12ratio = 0.010e0;
  INFO("Build Lattice - ");
  std::vector<ComplexType> J;
  if ( OBC ){
    J = std::vector<ComplexType>(L - 1, ComplexType(1.0, 0.0));
    for (size_t cnt = 0; cnt < L-1; cnt+=2) {
      J.at(cnt) *= J12ratio;
    }
  } else{
    J = std::vector<ComplexType>(L, ComplexType(1.0, 0.0));
    for (size_t cnt = 0; cnt < L; cnt+=2) {
      J.at(cnt) *= J12ratio;
    }
    if ( std::abs(phi) > 1.0e-10 ){
      J.at(L-1) *= exp( ComplexType(0.0e0, 1.0e0) * phi );
      // INFO(exp( ComplexType(0.0e0, 1.0e0) * phi ));
    }
  }
  for ( auto &val : J ){
    INFO_NONEWLINE(val << " ");
  }
  INFO("");
  const std::vector< Node<ComplexType>* > lattice = NN_1D_Chain(L, J, OBC);
  file.saveNumber("1DChain", "L", L);
  file.saveNumber("1DChain", "U", Uin);
  file.saveStdVector("1DChain", "J", J);
  for ( auto &lt : lattice ){
    if ( !(lt->VerifySite()) ) RUNTIME_ERROR("Wrong lattice setup!");
  }
  INFO("DONE!");
  INFO("Build Basis - ");
  // int N1 = (L+1)/2;
  Basis B1(L, N);
  B1.Boson();
  // std::vector< std::vector<int> > st = B1.getBStates();
  // std::vector< RealType > tg = B1.getBTags();
  // for (size_t cnt = 0; cnt < tg.size(); cnt++) {
  //   INFO_NONEWLINE( std::setw(3) << cnt << " - ");
  //   for (auto &j : st.at(cnt)){
  //     INFO_NONEWLINE(j << " ");
  //   }
  //   INFO("- " << tg.at(cnt));
  // }
  file.saveNumber("1DChain", "N", N);
  // file.saveStdVector("Basis", "States", st);
  // file.saveStdVector("Basis", "Tags", tg);
  INFO("DONE!");
  INFO_NONEWLINE("Build Hamiltonian - ");
  std::vector<Basis> Bases;
  Bases.push_back(B1);
  Hamiltonian<ComplexType> ham( Bases );
  std::vector< std::vector<ComplexType> > Vloc;
  std::vector<ComplexType> Vtmp;//(L, 1.0);
  for ( RealType &val : Vin ){
    Vtmp.push_back((ComplexType)val);
  }
  Vloc.push_back(Vtmp);
  std::vector< std::vector<ComplexType> > Uloc;
  // std::vector<ComplexType> Utmp(L, ComplexType(10.0e0, 0.0e0) );
  std::vector<ComplexType> Utmp(L, (ComplexType)Uin);
  Uloc.push_back(Utmp);
  ham.BuildLocalHamiltonian(Vloc, Uloc, Bases);
  ham.BuildHoppingHamiltonian(Bases, lattice);
  ham.BuildTotalHamiltonian();
  INFO("DONE!");
  INFO_NONEWLINE("Diagonalize Hamiltonian - ");
  std::vector<RealType> Val;
  Hamiltonian<ComplexType>::VectorType Vec;
  ham.eigh(Val, Vec);
  INFO("GS energy = " << Val.at(0));
  file.saveVector("GS", "EVec", Vec);
  file.saveStdVector("GS", "EVal", Val);
  INFO("DONE!");
  std::vector<ComplexType> Nbi = Ni( Bases, Vec );
  for (auto &n : Nbi ){
    INFO( n << " " );
  }
  ComplexMatrixType Nij = NiNj( Bases, Vec );
  INFO(Nij);
  INFO(Nij.diagonal());
  file.saveStdVector("Obs", "Nb", Nbi);
  file.saveMatrix("Obs", "Nij", Nij);
  return 0;
}
  int exampleDenseGemmByBlocks(const ordinal_type mmin,
                               const ordinal_type mmax,
                               const ordinal_type minc,
                               const ordinal_type k,
                               const ordinal_type mb,
                               const int max_concurrency,
                               const int max_task_dependence,
                               const int team_size,
                               const int mkl_nthreads,
                               const bool check,
                               const bool verbose) {
    typedef typename
      Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

    const bool detail = false;
    std::cout << "DeviceSpace::  "; DeviceSpaceType::print_configuration(std::cout, detail);
    std::cout << "HostSpace::    ";   HostSpaceType::print_configuration(std::cout, detail);

    typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType;
    typedef DenseMatrixView<DenseMatrixBaseHostType> DenseMatrixViewHostType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType;
    typedef DenseMatrixView<DenseMatrixBaseDeviceType> DenseMatrixViewDeviceType;
    typedef TaskView<DenseMatrixViewDeviceType> DenseTaskViewDeviceType;

    typedef DenseMatrixBase<DenseTaskViewDeviceType,ordinal_type,size_type,DeviceSpaceType> DenseHierMatrixBaseDeviceType;

    typedef DenseMatrixView<DenseHierMatrixBaseDeviceType> DenseHierMatrixViewDeviceType;
    typedef TaskView<DenseHierMatrixViewDeviceType> DenseHierTaskViewDeviceType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    std::cout << "DenseGemmByBlocks:: test matrices "
              <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc 
              << " , k = "<< k << " , mb = " << mb << std::endl;

    const size_t max_task_size = (3*sizeof(DenseTaskViewDeviceType)+sizeof(PolicyType)+128); 
    PolicyType policy(max_concurrency,
                      max_task_size,
                      max_task_dependence,
                      team_size);

    std::ostringstream os;
    os.precision(3);
    os << std::scientific;

    for (ordinal_type m=mmin;m<=mmax;m+=minc) {
      os.str("");
      
      // host matrices
      DenseMatrixBaseHostType AA_host, BB_host, CC_host("CC_host", m, m), CB_host("CB_host", m, m);
      {
        if (ArgTransA == Trans::NoTranspose) 
          AA_host = DenseMatrixBaseHostType("AA_host", m, k); 
        else 
          AA_host = DenseMatrixBaseHostType("AA_host", k, m);
        
        if (ArgTransB == Trans::NoTranspose) 
          BB_host = DenseMatrixBaseHostType("BB_host", k, m);
        else 
          BB_host = DenseMatrixBaseHostType("BB_host", m, k);
        
        for (ordinal_type j=0;j<AA_host.NumCols();++j)
          for (ordinal_type i=0;i<AA_host.NumRows();++i)
            AA_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
        
        for (ordinal_type j=0;j<BB_host.NumCols();++j)
          for (ordinal_type i=0;i<BB_host.NumRows();++i)
            BB_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
        
        for (ordinal_type j=0;j<CC_host.NumCols();++j)
          for (ordinal_type i=0;i<CC_host.NumRows();++i)
            CC_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
        
        DenseMatrixTools::copy(CB_host, CC_host);
      }

      const double flop = DenseFlopCount<value_type>::Gemm(m, m, k);

#ifdef HAVE_SHYLUTACHO_MKL
      mkl_set_num_threads(mkl_nthreads);
#endif

      os << "DenseGemmByBlocks:: m = " << m << " n = " << m << " k = " << k << "  ";
      if (check) {
        timer.reset();
        DenseMatrixViewHostType A_host(AA_host), B_host(BB_host), C_host(CB_host);
        Gemm<ArgTransA,ArgTransB,AlgoGemm::ExternalBlas,Variant::One>::invoke
          (policy, policy.member_single(),
           1.0, A_host, B_host, 1.0, C_host);
        t = timer.seconds();
        os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }

      DenseMatrixBaseDeviceType AA_device("AA_device"), BB_device("BB_device"), CC_device("CC_device");
      {
        timer.reset();
        AA_device.mirror(AA_host);
        BB_device.mirror(BB_host);
        CC_device.mirror(CC_host);
        t = timer.seconds();
        os << ":: Mirror = " << t << " [sec]  ";
      }

      {
        DenseHierMatrixBaseDeviceType HA_device("HA_device"), HB_device("HB_device"), HC_device("HC_device");

        DenseMatrixTools::createHierMatrix(HA_device, AA_device, mb, mb);        
        DenseMatrixTools::createHierMatrix(HB_device, BB_device, mb, mb);        
        DenseMatrixTools::createHierMatrix(HC_device, CC_device, mb, mb);        

        DenseHierTaskViewDeviceType TA_device(HA_device), TB_device(HB_device), TC_device(HC_device);
        timer.reset();
        auto future = policy.proc_create_team
          (Gemm<ArgTransA,ArgTransB,AlgoGemm::DenseByBlocks,ArgVariant>
           ::createTaskFunctor(policy, 1.0, TA_device, TB_device, 1.0, TC_device),
           0);
        policy.spawn(future);
        Kokkos::Experimental::wait(policy);
        t = timer.seconds();       
        os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      } 

      CC_host.mirror(CC_device);
      if (check) {
        double err = 0.0, norm = 0.0;
        for (ordinal_type j=0;j<CC_host.NumCols();++j)
          for (ordinal_type i=0;i<CC_host.NumRows();++i) {
            const double diff = abs(CC_host.Value(i,j) - CB_host.Value(i,j));
            const double val  = CB_host.Value(i,j);
            err  += diff*diff;
            norm += val*val;
          }
        os << ":: Check result ::norm = " << sqrt(norm) << ", error = " << sqrt(err);
      }
      std::cout << os.str() << std::endl;
    }

    return r_val;
  }
int main( int argc, char** argv ) {
	struct timespec start, stop; 
	double time;

#ifndef NDEBUG
	std::cout << "-->WARNING: COMPILED *WITH* ASSERTIONS!<--" << std::endl;
#endif
	
	if( argc<=3 ) {
		std::cout << "Usage: " << argv[0] << " <mtx> <scheme> <x> <REP1> <REP2>" << std::endl << std::endl;
		std::cout << "calculates Ax=y and reports average time taken as well as the mean of y." << std::endl;
		std::cout << "with\t\t <mtx> filename of the matrix A in matrix-market or binary triplet format." << std::endl;
		std::cout << "    \t\t <scheme> number of a sparse scheme to use, see below." << std::endl;
		std::cout << "    \t\t <x> 0 for taking x to be the 1-vector, 1 for taking x to be random (fixed seed)." << std::endl;
		std::cout << "    \t\t <REP1> (optional, default is 1) number of repititions of the entire experiment." << std::endl;
		std::cout << "    \t\t <REP2> (optional, default is 1) number of repititions of the in-place SpMV multiplication, per experiment." << std::endl;
		std::cout << std::endl << "Possible schemes:" << std::endl;
		std::cout << " 0: TS (triplet scheme)" << std::endl;
		std::cout << " 1: CRS (also known as CSR)" << std::endl;
		std::cout << " 2: ICRS (Incremental CRS)" << std::endl;
		std::cout << " 3: ZZ-CRS (Zig-zag CRS)" << std::endl;
		std::cout << " 4: ZZ-ICRS (Zig-zag ICRS)" << std::endl;
		std::cout << " 5: SVM (Sparse vector matrix)" << std::endl;
		std::cout << " 6: HTS (Hilbert-ordered triplet scheme)" << std::endl;
		std::cout << " 7: BICRS (Bi-directional Incremental CRS)" << std::endl;
		std::cout << " 8: Hilbert (Hilbert-ordered triplets backed by BICRS)" << std::endl;
		std::cout << " 9: Block Hilbert (Sparse matrix blocking, backed by Hilbert and HBICRS)" << std::endl;
		std::cout << "10: Bisection Hilbert (Sparse matrix blocking by bisection, backed by Hilbert and HBICRS)" << std::endl;
		std::cout << "11: CBICRS (Compressed Bi-directional Incremental CRS)" << std::endl;
		std::cout << "12: Beta Hilbert (known as Block CO-H+ in the paper by Yzelman & Roose, 2012: parallel compressed blocked Hilbert with BICRS)" << std::endl;
		std::cout << "13: Row-distributed Beta Hilbert (known as Row-distributed block CO-H in the paper by Yzelman & Roose, 2012: same as 12, but simpler distribution)" << std::endl;
#ifdef WITH_CSB
		std::cout << "14: Row-distributed CSB (Uses CSB sequentially within the row-distributed scheme of 13)" << std::endl;
#endif
		std::cout << "15: Row-distributed Hilbert (Parallel row-distributed Hilbert scheme, see also 8)" << std::endl;
		std::cout << "16: Row-distributed parallel CRS (using OpenMP, known as OpenMP CRS in the paper by Yzelman & Roose, 2012)" << std::endl;
		std::cout << "17: Row-distributed SpMV using compressed Hilbert indices." << std::endl;
#ifdef WITH_MKL
		std::cout << "18: Intel MKL SpMV based on the CRS data structure." << std::endl;
#endif
		std::cout << "19: Optimised ICRS." << std::endl;
#ifdef WITH_CUDA
		std::cout << "20: CUDA CuSparse HYB format." << std::endl;
#endif
		std::cout << std::endl << "The in-place Ax=y calculation is preceded by a quasi pre-fetch." << std::endl;
		std::cout << "Add a minus sign before the scheme number to enable use of the CCS wrapper (making each CRS-based structure CCS-based instead)" << std::endl;
		std::cout << "Note: binary triplet format is machine-dependent. ";
		std::cout << "Take care when using the same binary files on different machine architectures." << std::endl;
		return EXIT_FAILURE;
	}

	std::string file = std::string( argv[1] );
	int scheme = atoi( argv[2] );
	int ccs    = scheme < 0 ? 1 : 0;
	if( ccs ) scheme = -scheme;
	int x_mode = atoi( argv[3] );
	unsigned long int rep1 = 1;
	unsigned long int rep2 = 1;
	if( argc >= 5 )
		rep1 = static_cast< unsigned long int >( atoi( argv[4] ) );
	if( argc >= 6 )
		rep2 = static_cast< unsigned long int >( atoi( argv[5] ) );

	if( scheme != 16 && scheme != -16 && //pin master thread to a single core
		scheme != 18 && scheme != -18 ) { //but not when OpenMP is used (otherwise serialised computations)
		cpu_set_t mask;
		CPU_ZERO( &mask );
		CPU_SET ( 0, &mask );
		if( pthread_setaffinity_np( pthread_self(), sizeof( mask ), &mask ) != 0 ) {
			std::cerr << "Error setting main thread affinity!" << std::endl;
			exit( 1 );
		}
	} else {
		omp_set_num_threads( MachineInfo::getInstance().cores() );
	}

#ifdef WITH_MKL
	if( scheme == 18 ) {
		mkl_set_num_threads( MachineInfo::getInstance().cores() );
	}
#endif
	std::cout << argv[0] << " called with matrix input file " << file << ", scheme number ";
	std::cout << scheme << " and x being " << (x_mode?"random":"the 1-vector") << "." << std::endl;
	std::cout << "Number of repititions of in-place zax is " << rep2 << std::endl;
	std::cout << "Number of repititions of the " << rep2 << " in-place zax(es) is " << rep1 << std::endl;

	Matrix< double >* checkm = new TS< double >( file );
	clock_gettime( CLOCK_ID, &start);
	Matrix< double >* matrix = selectMatrix( scheme, ccs, file );
	clock_gettime( CLOCK_ID, &stop);
	time  = (stop.tv_sec-start.tv_sec)*1000;
	time += (stop.tv_nsec-start.tv_nsec)/1000000.0;
	if( matrix == NULL ) {
		std::cerr << "Error during sparse scheme loading, exiting." << std::endl;
		return EXIT_FAILURE;
	}

	std::cout << "Matrix dimensions: " << matrix->m() << " times " << matrix->n() << "." << std::endl;
	std::cout << "Datastructure loading time: " << time << " ms." << std::endl << std::endl;

	srand( FIXED_SEED );
	double* x = NULL;
#ifdef INTERLEAVE_X
	if( scheme == 13 || scheme == 14 || scheme == 15 || scheme == 16 || scheme == 17 || scheme == 18 )
		x = (double*) numa_alloc_interleaved( matrix->n() * sizeof( double ) );
	else
#endif
		x = (double*) _mm_malloc( matrix->n() * sizeof( double ), 64 );

	//initialise input vector
	for( unsigned long int j=0; j<matrix->n(); j++ ) {
		x[ j ] = x_mode?(rand()/(double)RAND_MAX):1.0;
	}

	//do one trial run, also for verification
	double* c = checkm->mv( x );
	clock_gettime( CLOCK_ID, &start );
	double* z = matrix->mv( x );
	clock_gettime( CLOCK_ID, &stop);
	time = (stop.tv_sec-start.tv_sec)*1000;
	time += (stop.tv_nsec-start.tv_nsec)/1000000.0;
	double checkMSE = 0;
	unsigned long int max_e_index = 0;
	double max_e = fabs( z[0] - c[0] );
	for( unsigned long int j=0; j<matrix->m(); j++ ) {
		double curdiff = fabs( z[j] - c[j] );
		if( curdiff > max_e ) {
			max_e = curdiff;
			max_e_index = j;
		}
		curdiff  *= curdiff;
		curdiff  /= (double)(matrix->m());
		checkMSE += curdiff;
	}
#ifdef OUTPUT_Z
	for( unsigned long int j=0; j<matrix->m(); j++ ) {
		std::cout << z[ j ] << std::endl;
	}
#endif
	std::cout << "out-of-place z=Ax: mean= " << checksum( z, matrix->m() ) << ", ";
	std::cout << "MSE = " << checkMSE << ", ";
	std::cout << "max abs error = " << max_e << " while comparing y[ " << max_e_index << " ] = " <<  z[max_e_index] << " and c[ " << max_e_index << " ] = " <<  c[max_e_index] << ", ";
	std::cout << "time= " << time << " ms." << std::endl;
#ifdef RDBH_NO_COLLECT
	if( scheme == 13 ) {
		std::cout << "WARNING: MSE and max abs error are not correct for the Row-distributed Beta Hilbert scheme; please see the RDBHilbert.hpp file, and look for the RDBH_NO_COLLECT flag." << std::endl;
	}
#else
	if( scheme == 13 ) {
		std::cout << "WARNING: timings are pessimistic for the Row-distributed Beta Hilbert scheme; each spmv a (syncing) collect is executed to write local data to the global output vector as required by this library. To get the correct timings, turn this collect off via the RDBH_NO_COLLECT flag in the RDBHilbert.hpp file. Note that this causes the verification process to fail, since all data is kept in private local output subvectors." << std::endl;
	}
#endif
	double *times = new double[ rep1 ];

	//Run rep*rep instances
	for( unsigned long int run = 0; run < rep1; run++ ) {
		sleep( 1 );
		time = 0.0;
		//"prefetch"
		matrix->zax( x, z );
		matrix->zax( x, z, rep2, CLOCK_ID, &time );
		time /= static_cast<double>( rep2 );
		times[ run ] = time;
	}

	//calculate statistics
	double meantime, mintime, vartime;
	meantime = vartime = 0.0;
	mintime = times[ 0 ];
	for( unsigned long int run = 0; run < rep1; run++ ) {
		if( times[ run ] < mintime ) mintime = times[ run ];
		meantime += times[ run ] / static_cast< double >( rep1 );
	}
	for( unsigned long int run = 0; run < rep1; run++ ) {
		vartime += ( times[ run ] - meantime ) * ( times[ run ] - meantime ) / static_cast< double >( rep1 - 1 );
	}
	vartime = sqrt( vartime );

	std::cout << "In-place:" << std::endl;
	std::cout << "Mean  = " << checksum( z, matrix->m() ) << std::endl;
	std::cout << "Time  = " << meantime << " (average), \t" <<  mintime << " (fastest), \t" << vartime << " (stddev) ms. " << std::endl;
	const double avgspeed = static_cast< double >( 2*matrix->nzs() ) / meantime / 1000000.0;
	const double minspeed = static_cast< double >( 2*matrix->nzs() ) / mintime / 1000000.0;
	const double varspeed = fabs( avgspeed - static_cast< double >( 2*matrix->nzs() ) / (meantime - vartime) / 1000000.0 );
	std::cout << "Speed = " << avgspeed << " (average), \t"
					<< minspeed << " (fastest), \t"
					<< varspeed << " (variance) Gflop/s." << std::endl;
	const size_t memuse1 = matrix->bytesUsed() + sizeof( double ) * 2 * matrix->nzs();
	const double avgmem1 = static_cast< double >( 1000*memuse1 ) / meantime / 1073741824.0;
	const double minmem1 = static_cast< double >( 1000*memuse1 ) / mintime / 1073741824.0;
	const double varmem1 = fabs( avgmem1 - static_cast< double >( 1000*memuse1 ) / (meantime-vartime) / 1073741824.0 );
	std::cout << "        " << avgmem1 << " (average), \t"
					<< minmem1 << " (fastest), \t"
					<< varmem1 << " (variance) Gbyte/s (upper bound)." << std::endl;
	const size_t memuse2 = matrix->bytesUsed() + sizeof( double ) * ( matrix->m() + matrix->n() );
	const double avgmem2 = static_cast< double >( 1000*memuse2 ) / meantime / 1073741824.0;
	const double minmem2 = static_cast< double >( 1000*memuse2 ) / mintime / 1073741824.0;
	const double varmem2 = fabs( avgmem2 - static_cast< double >( 1000*memuse2 ) / (meantime-vartime) / 1073741824.0 );
	std::cout << "        " << avgmem2 << " (average), \t"
					<< minmem2 << " (fastest), \t"
					<< varmem2 << " (variance) Gbyte/s (lower bound)." << std::endl;

	delete [] times;
#ifdef INTERLEAVE_X
	if( scheme == 13 || scheme == 14 || scheme == 15 || scheme == 16 || scheme == 17 || scheme == 18 ) {
		numa_free( x, matrix->n() * sizeof( double ) );
	} else
#endif
		_mm_free( x );

	if( scheme == 12 || scheme == 13 || scheme == 14 || scheme == 15 || scheme == 16 || scheme == 17 || scheme == 18 ) {
#ifdef _NO_LIBNUMA
		_mm_free( z );
#else
		numa_free( z, matrix->m() * sizeof( double ) );
#endif
	} else {
		_mm_free( z );
	}
	_mm_free( c );
	delete matrix;
	delete checkm;

	return EXIT_SUCCESS;
}
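The two Gbyte/s figures printed by this benchmark are model-based bounds rather than measured traffic: the upper bound charges one read of x and one update of z per nonzero, the lower bound assumes x and z are each streamed only once. A minimal sketch of the same bookkeeping, with hypothetical names standing in for the benchmark's matrix queries:

#include <cstddef>

// Sketch (assumptions: 8-byte values; GiB = 1024^3, as in the benchmark output).
struct BandwidthBounds { double upper_gbs; double lower_gbs; };

BandwidthBounds spmvBandwidthBounds(std::size_t matrix_bytes, std::size_t nnz,
                                    std::size_t rows, std::size_t cols,
                                    double time_ms) {
	const double GiB = 1073741824.0;
	const double seconds = time_ms / 1000.0;
	// Upper bound: matrix storage plus one x read and one z update per nonzero.
	const double upper_bytes = static_cast<double>(matrix_bytes) + sizeof(double) * 2.0 * nnz;
	// Lower bound: matrix storage plus x and z each moved exactly once.
	const double lower_bytes = static_cast<double>(matrix_bytes) + sizeof(double) * (rows + cols);
	return { upper_bytes / seconds / GiB, lower_bytes / seconds / GiB };
}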
Example #23
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing chetrd_he2hb
*/
int main( int argc, char** argv)
{
    TESTING_INIT_MGPU();

    real_Double_t    gpu_time, gpu_perf, gflops;
    magmaFloatComplex *h_A, *h_R, *h_work, *dT1;
    magmaFloatComplex *tau;
    float *D, *E;

    /* Matrix size */
    magma_int_t N, n2, lda, lwork, ldt, lwork0;

    magma_int_t info;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};

#if defined(CHECKEIG)
#if defined(PRECISION_z)  || defined(PRECISION_d)
    magma_int_t WANTZ=0;
    magma_int_t THREADS=1;
#endif
#endif

    magma_int_t NE = 0;
    magma_int_t NB = 0;
    magma_int_t ngpu = 1;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    NB = opts.nb;
    if (NB < 1)
        NB  = 64; //64; //magma_get_chetrd_he2hb_nb(N);

    // what is NE ?
    if (NE < 1)
        NE  = 64; //N;  //magma_get_chetrd_he2hb_nb(N);  // N not yet initialized

    printf("  N    GPU GFlop/s   \n");
    printf("=====================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N = opts.nsize[itest];
            lda = N;
            ldt = N;
            n2  = N*lda;
            gflops = FLOPS_CHETRD( N ) / 1e9;
            
            /* We suppose the magma NB is bigger than lapack NB */
            lwork0 = N*NB;
        
            /* Allocate host memory for the matrix */
            TESTING_MALLOC_CPU( h_A,    magmaFloatComplex, lda*N  );
            TESTING_MALLOC_CPU( tau,    magmaFloatComplex, N-1    );
            
            TESTING_MALLOC_PIN( h_R,    magmaFloatComplex, lda*N  );
            TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork0 );
            TESTING_MALLOC_PIN( D, float, N );
            TESTING_MALLOC_PIN( E, float, N );
            
            //TESTING_MALLOC_DEV( dT1, magmaFloatComplex, (2*min(N,N)+(N+31)/32*32)*NB );
            TESTING_MALLOC_DEV( dT1, magmaFloatComplex, (N*NB) );
        
            // if (WANTZ) gflops = 2.0*gflops;
    
            /* ====================================================================
               Initialize the matrix
               =================================================================== */
            lapackf77_clarnv( &ione, ISEED, &n2, h_A );
            magma_cmake_hermitian( N, h_A, lda );
            
            lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda );
    
    
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            magma_device_t cdev;
            magma_getdevice( &cdev );
    
            gpu_time = magma_wtime();
            /*
            magma_chetrd_he2hb( opts.uplo, N, NB, h_R, lda, tau, h_work, lwork0, dT1, THREADS, &info);
            tband = magma_wtime() - gpu_time;
            printf("  Finish BAND  N %d  NB %d  ngpu %d timing= %f\n", N, NB, ngpu, tband);
            magma_chetrd_bhe2trc_v5(THREADS, WANTZ, opts.uplo, NE, N, NB, h_R, lda, D, E, dT1, ldt);
            */
    
            /*
            magma_chetrd_he2hb( opts.uplo, N, NB, h_R, lda, tau, h_work, lwork, dT1, THREADS, &info);
            tband = magma_wtime() - gpu_time;
            printf("  Finish BAND  N %d  NB %d  ngpu %d timing= %f\n", N, NB, ngpu, tband);
            magma_chetrd_bhe2trc(THREADS, WANTZ, opts.uplo, NE, N, NB, h_R, lda, D, E, dT1, ldt);
            */

            magma_range_t range = MagmaRangeAll;
            magma_int_t fraction_ev = 100;
            magma_int_t il, iu, m1;
            float vl=0., vu=0.;
    
            if (fraction_ev == 0) {
                il = N / 10;
                iu = N / 5+il;
            }
            else {
                il = 1;
                iu = (int)(fraction_ev*N);
                if (iu < 1) iu = 1;
            }
            magmaFloatComplex *hh_work;
            magma_int_t *iwork;
            magma_int_t nb, /*lwork,*/ liwork;
            magma_int_t threads = magma_get_parallel_numthreads();
            #if defined(PRECISION_z) || defined(PRECISION_c)
                float *rwork;
                magma_int_t lrwork;
                lwork  = magma_cbulge_get_lq2(N, threads) + 2*N + N*N;
                lrwork = 1 + 5*N +2*N*N;
                TESTING_MALLOC_PIN( rwork, float, lrwork );
            #else
                lwork  = magma_cbulge_get_lq2(N, threads) + 1 + 6*N + 2*N*N;
            #endif
            liwork = 3 + 5*N;
            nb = magma_get_chetrd_nb(N);
            TESTING_MALLOC_PIN( hh_work, magmaFloatComplex, lwork  );
            TESTING_MALLOC_CPU( iwork,   magma_int_t,        liwork );
    
            if (ngpu == 1) {
                printf("calling cheevdx_2stage 1 GPU\n");
                magma_cheevdx_2stage( opts.jobz, range, opts.uplo, N,
                                h_R, lda,
                                vl, vu, il, iu,
                                &m1, D,
                                hh_work, lwork,
                                #if defined(PRECISION_z) || defined(PRECISION_c)
                                rwork, lrwork,
                                #endif
                                iwork, liwork,
                                &info);
    
            } else {
                printf("calling cheevdx_2stage_m %d GPU\n", (int) ngpu);
                magma_cheevdx_2stage_m(ngpu, opts.jobz, range, opts.uplo, N,
                                h_R, lda,
                                vl, vu, il, iu,
                                &m1, D,
                                hh_work, lwork,
                                #if defined(PRECISION_z) || defined(PRECISION_c)
                                rwork, lrwork,
                                #endif
                                iwork, liwork,
                                &info);
            }
    
    
            magma_setdevice( cdev );
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
    
            /* =====================================================================
               Check the factorization
               =================================================================== */
            /*
            if ( opts.check ) {
                FILE        *fp ;
    
                printf("Writing input matrix in matlab_i_mat.txt ...\n");
                fp = fopen ("matlab_i_mat.txt", "w") ;
                if ( fp == NULL ) { printf("Couldn't open output file\n"); exit(1); }
    
                for (j=0; j < N; j++) {
                    for (k=0; k < N; k++) {
                        #if defined(PRECISION_z) || defined(PRECISION_c)
                        fprintf(fp, "%5d %5d %11.8f %11.8f\n", k+1, j+1,
                                h_A[k+j*lda].x, h_A[k+j*lda].y);
                        #else
                        fprintf(fp, "%5d %5d %11.8f\n", k+1, j+1, h_A[k+j*lda]);
                        #endif
                    }
                }
                fclose( fp ) ;
    
                printf("Writing output matrix in matlab_o_mat.txt ...\n");
                fp = fopen ("matlab_o_mat.txt", "w") ;
                if ( fp == NULL ) { printf("Couldn't open output file\n"); exit(1); }
    
                for (j=0; j < N; j++) {
                    for (k=0; k < N; k++) {
                        #if defined(PRECISION_z) || defined(PRECISION_c)
                        fprintf(fp, "%5d %5d %11.8f %11.8f\n", k+1, j+1,
                                h_R[k+j*lda].x, h_R[k+j*lda].y);
                        #else
                        fprintf(fp, "%5d %5d %11.8f\n", k+1, j+1, h_R[k+j*lda]);
                        #endif
                    }
                }
                fclose( fp ) ;
            }
            */
    
    
    
            /* =====================================================================
               Print performance and error.
               =================================================================== */
#if defined(CHECKEIG)
#if defined(PRECISION_z)  || defined(PRECISION_d)
            if ( opts.check ) {
                printf("  Total N %5d  gflops %6.2f  timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time );
                char JOBZ;
                if (WANTZ == 0)
                    JOBZ = 'N';
                else
                    JOBZ = 'V';
                float nrmI=0.0, nrm1=0.0, nrm2=0.0;
                int    lwork2 = 256*N;
                magmaFloatComplex *work2, *AINIT;
                float *rwork2, *D2;
                // TODO free this memory !
                magma_cmalloc_cpu( &work2, lwork2 );
                magma_smalloc_cpu( &rwork2, N );
                magma_smalloc_cpu( &D2, N );
                magma_cmalloc_cpu( &AINIT, N*lda );
                memcpy(AINIT, h_A, N*lda*sizeof(magmaFloatComplex));
                /* compute the eigenvalues using lapack routine to be able to compare to it and used as ref */
                cpu_time = magma_wtime();
                i= min(12, THREADS);
    
                #if defined(USEMKL)
                mkl_set_num_threads( i );
                #endif
                #if defined(USEACML)
                omp_set_num_threads(i);
                #endif
    
                lapackf77_cheev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2,
                    #if defined(PRECISION_z) || defined (PRECISION_c)
                    rwork2,
                    #endif
                    &info );
                
                ///* call eigensolver for our resulting tridiag [D E] and for Q */
                //dstedc_withZ('V', N, D, E, h_R, lda);
                ////ssterf_( &N, D, E, &info);
                ////
                cpu_time = magma_wtime() - cpu_time;
                printf("  Finish CHECK - EIGEN   timing= %f  threads %d\n", cpu_time, i);
    
                /*
                for (i=0; i < 10; i++)
                    printf(" voici lpk D[%d] %8.2e\n", i, D2[i]);
                */
    
                //magmaFloatComplex mydz=0.0, mydo=1.0;
                //magmaFloatComplex *Z;
                // magma_cmalloc_cpu( &Z, N*lda );
                // dgemm_("N", "N", &N, &N, &N, &mydo, h_R, &lda, h_A, &lda, &mydz, Z, &lda);
    
    
                /* compare result */
                cmp_vals(N, D2, D, &nrmI, &nrm1, &nrm2);
    
    
                magmaFloatComplex *WORKAJETER;
                float *RWORKAJETER, *RESU;
                // TODO free this memory !
                magma_cmalloc_cpu( &WORKAJETER, (2* N * N + N)  );
                magma_smalloc_cpu( &RWORKAJETER, N  );
                magma_smalloc_cpu( &RESU, 10 );
                int MATYPE;
                memset(RESU, 0, 10*sizeof(float));
    
     
                MATYPE=3;
                float NOTHING=0.0;
                cpu_time = magma_wtime();
                // check results
                ccheck_eig_(&JOBZ, &MATYPE, &N, &NB, AINIT, &lda, &NOTHING, &NOTHING, D2, D, h_R, &lda, WORKAJETER, RWORKAJETER, RESU );
                cpu_time = magma_wtime() - cpu_time;
                printf("  Finish CHECK - results timing= %f\n", cpu_time);
                #if defined(USEMKL)
                mkl_set_num_threads( 1 );
                #endif
                #if defined(USEACML)
                omp_set_num_threads(1);
                #endif
    
                printf("\n");
                printf(" ================================================================================================================\n");
                printf("   ==> INFO voici  threads=%d    N=%d    NB=%d   WANTZ=%d\n", (int) THREADS, (int) N, (int) NB, (int) WANTZ);
                printf(" ================================================================================================================\n");
                printf("            DSBTRD                : %15s \n", "STATblgv9withQ    ");
                printf(" ================================================================================================================\n");
                if (WANTZ > 0)
                    printf(" | A - U S U' | / ( |A| n ulp )   : %15.3E   \n", RESU[0]);
                if (WANTZ > 0)
                    printf(" | I - U U' | / ( n ulp )         : %15.3E   \n", RESU[1]);
                printf(" | D1 - EVEIGS | / (|D| ulp)      : %15.3E   \n",  RESU[2]);
                printf(" max | D1 - EVEIGS |              : %15.3E   \n",  RESU[6]);
                printf(" ================================================================================================================\n\n\n");
                
                printf(" ****************************************************************************************************************\n");
                printf(" * Hello here are the norm  Infinite (max)=%8.2e  norm one (sum)=%8.2e   norm2(sqrt)=%8.2e *\n", nrmI, nrm1, nrm2);
                printf(" ****************************************************************************************************************\n\n");
            }
#endif
#endif
            
            printf("  Total N %5d  gflops %6.2f        timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time );
            printf("============================================================================\n\n\n");
            
            /* Memory clean up */
            TESTING_FREE_CPU( h_A );
            TESTING_FREE_CPU( tau );
            
            TESTING_FREE_PIN( h_R    );
            TESTING_FREE_PIN( h_work );
            TESTING_FREE_PIN( D      );
            TESTING_FREE_PIN( E      );
            
            TESTING_FREE_DEV( dT1 );
            
            /* TODO - not all memory has been freed inside loop */
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE_MGPU();
    return EXIT_SUCCESS;
}
void StVKReducedStiffnessMatrix::Evaluate(double * q, double * Rq)
{
  // this is the same as EvaluateSubset with start=0, end=quadraticSize

  /*
  int i,j,k;
  int output;

  // reset to free terms
  int index = 0;
  int indexEntry = 0;
  for(output=0; output<r; output++)
  {
    for(i=output; i<r; i++)
    {
      Rq[indexEntry] = freeCoef_[index];
      index++;
      indexEntry++;
    }
    indexEntry += output + 1;
  }

  // add linear terms
  index = 0;
  indexEntry = 0;
  for(output=0; output<r; output++)
  {
    for(i=output; i<r; i++)
    {
      for(j=0; j<r; j++)
      {
        Rq[indexEntry] += linearCoef_[index] * q[j];
        index++;
      }
      indexEntry++;
    }
    indexEntry += output + 1;
  }

  // add quadratic terms
  index = 0;
  indexEntry = 0;
  for(output=0; output<r; output++)
  {
    for(i=output; i<r; i++)
    {
      for(j=0; j<r; j++)
        for(k=j; k<r; k++)
        {
          Rq[indexEntry] += quadraticCoef_[index] * q[j] * q[k];
          index++;
        }
        indexEntry++;
    }
    indexEntry += output + 1;
  }

  // make symmetric
  for(output=0; output<r; output++)
    for(i=0; i<output; i++)
      Rq[ELT(r,i,output)] = Rq[ELT(r,output,i)];
  */

  if (useSingleThread)
  {
    #if defined(WIN32) || defined(linux)
      mkl_max_threads = mkl_get_max_threads();
      mkl_dynamic = mkl_get_dynamic();
      mkl_set_num_threads(1);
      mkl_set_dynamic(0);
    #elif defined(__APPLE__)
      //setenv("VECLIB_MAXIMUM_THREADS", "1", true);
    #endif
  }

  // reset to free terms
  memcpy(buffer1,freeCoef_,sizeof(double)*quadraticSize);

  // add linear terms
  // multiply linearCoef_ and q
  // linearCoef_ is r x quadraticSize array
  cblas_dgemv(CblasColMajor, CblasTrans, 
        r, quadraticSize,
        1.0,
        linearCoef_, r,
        q, 1,
        1.0,
        buffer1, 1);

  // compute qiqj
  int index = 0;
  for(int output=0; output<r; output++)
    for(int i=output; i<r; i++)
    {
      qiqj[index] = q[output] * q[i];
      index++;
    }
 
  // update Rq
  // quadraticCoef_ is quadraticSize x quadraticSize matrix
  // each column gives quadratic coef for one matrix entry
  cblas_dgemv(CblasColMajor, CblasTrans, 
        quadraticSize, quadraticSize,
        1.0,
        quadraticCoef_, quadraticSize,
        qiqj, 1,
        1.0,
        buffer1, 1);

  // unpack into a symmetric matrix
  int i1=0,j1=0;
  for(int i=0; i< quadraticSize; i++)
  {
    Rq[ELT(r,i1,j1)] = buffer1[i];
    Rq[ELT(r,j1,i1)] = buffer1[i];
    j1++;
    if(j1 == r)
    {
      i1++;
      j1 = i1;
    }
  }

  if (useSingleThread)
  {
    #if defined(WIN32) || defined(linux)
      mkl_set_num_threads(mkl_max_threads);
      mkl_set_dynamic(mkl_dynamic);
    #elif defined(__APPLE__)
      //unsetenv("VECLIB_MAXIMUM_THREADS");
    #endif
  }
}
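Evaluate() brackets its two cblas_dgemv calls with a save/set/restore of the MKL threading state so that small reduced-coordinate products do not pay threading overhead. A minimal RAII sketch of that guard, assuming an MKL build (the class name is illustrative):

#include <mkl.h>

// Sketch: force single-threaded MKL for the lifetime of one scope,
// then restore the caller's thread count and dynamic-adjustment setting.
class MklSingleThreadGuard
{
public:
  MklSingleThreadGuard()
    : saved_threads_(mkl_get_max_threads()),
      saved_dynamic_(mkl_get_dynamic())
  {
    mkl_set_num_threads(1);
    mkl_set_dynamic(0);
  }
  ~MklSingleThreadGuard()
  {
    mkl_set_num_threads(saved_threads_);
    mkl_set_dynamic(saved_dynamic_);
  }
private:
  int saved_threads_;
  int saved_dynamic_;
};

// usage: { MklSingleThreadGuard guard; cblas_dgemv(...); }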
Example #25
int main(int argc, char** argv) {


    int maxnumit = 0;
    int maxrec = -1;

    const char *budget_type_str = NULL;
    const char *stage = NULL;
    const char *training = NULL;
    const char *dev = NULL;
    const char *path = NULL;
    const char * etransform_str = NULL;
    const char *kernel_str = NULL;
    const char *rbf_lambda_str = NULL;

#ifdef NDEBUG
    log_info("ai-parse %s (Release)", VERSION);
#else
    log_info("ai-parse %s (Debug)", VERSION);
#endif

    struct argparse_option options[] = {
        OPT_HELP(),
        //OPT_BOOLEAN('f', "force", &force, "force to do", NULL),
        OPT_INTEGER('v', "verbosity", &verbosity, "Verbosity level. Minimum (Default) 0. Increasing values increase parser verbosity.", NULL),
        OPT_STRING('o', "modelname", &modelname, "Model name", NULL),
        OPT_STRING('p', "path", &path, "CoNLL base directory including sections", NULL),
        OPT_STRING('s', "stage", &stage, "[ optimize | train | parse ]", NULL),
        OPT_INTEGER('n', "maxnumit", &maxnumit, "Maximum number of iterations by perceptron. Default is 50", NULL),
        OPT_STRING('t', "training", &training, "Training sections for optimize and train. Apply sections for parse", NULL),
        OPT_STRING('d', "development", &dev, "Development sections for optimize", NULL),
        OPT_STRING('e', "epattern", &epattern, "Embedding Patterns", NULL),
        OPT_INTEGER('l', "edimension", &edimension, "Embedding dimension", NULL),
        OPT_INTEGER('m', "maxrec", &maxrec, "Maximum number of training instance", NULL),
        OPT_STRING('x', "etransform", &etransform_str, "Embedding Transformation", NULL),
        OPT_STRING('k', "kernel", &kernel_str, "Kernel Type", NULL),
        OPT_INTEGER('a', "bias", &bias, "Polynomial kernel additive term. Default is 1", NULL),
        OPT_INTEGER('c', "concurrency", &num_parallel_mkl_slaves, "Parallel MKL Slaves. Default is 90% of all machine cores", NULL),
        OPT_INTEGER('b', "degree", &polynomial_degree, "Degree of polynomial kernel. Default is 4", NULL),
        OPT_STRING('z', "lambda", &rbf_lambda_str, "Lambda multiplier for RBF Kernel.Default value is 0.025"),
        OPT_STRING('u', "budget_type", &budget_type_str, "Budget control methods. NONE|RANDOM", NULL),
        OPT_INTEGER('g', "budget_size", &budget_target, "Budget Target for budget based perceptron algorithms. Default 50K", NULL),
        OPT_END(),
    };
    struct argparse argparse;
    argparse_init(&argparse, options, usage, 0);
    argc = argparse_parse(&argparse, argc, argv);

    int max_threads = mkl_get_max_threads();
    log_info("There are max %d MKL threads", max_threads);

    if (num_parallel_mkl_slaves == -1) {

        num_parallel_mkl_slaves = (int) (max_threads * 0.9);

        if (num_parallel_mkl_slaves == 0)
            num_parallel_mkl_slaves = 1;

    }

    log_info("Number of MKL Slaves is set to be %d", num_parallel_mkl_slaves);
    mkl_set_num_threads(num_parallel_mkl_slaves);

    if (1 == mkl_get_dynamic())
        log_info("Intel MKL may use less than %i threads for a large problem", num_parallel_mkl_slaves);
    else
        log_info("Intel MKL should use %i threads for a large problem", num_parallel_mkl_slaves);

    check(stage != NULL && (strcmp(stage, "optimize") == 0 || strcmp(stage, "train") == 0 || strcmp(stage, "parse") == 0),
            "Choose one of -s optimize, train, parse");

    check(path != NULL, "Specify a CoNLL base directory using -p");

    check(edimension != 0, "Set embedding dimension using -l");

    check(modelname != NULL, "Provide model name using -o");

    if (budget_type_str != NULL) {
        if (strcmp(budget_type_str, "RANDOM") == 0 || strcmp(budget_type_str, "RANDOMIZED") == 0) {
            budget_method = RANDOMIZED;
        } else if (strcmp(budget_type_str, "NONE") == 0) {
            budget_method = NONE;

        } else {
            log_err("Unknown budget control type %s", budget_type_str);
            goto error;
        }

    } else {
        budget_method = NONE;
    }



    if (training == NULL) {
        log_warn("training section string is set to %s", DEFAULT_TRAINING_SECTION_STR);

        training = strdup(DEFAULT_TRAINING_SECTION_STR);
    }

    if (dev == NULL && (strcmp(stage, "optimize") == 0 || strcmp(stage, "train") == 0)) {
        log_info("development section string is set to %s", DEFAULT_DEV_SECTION_STR);

        dev = strdup(DEFAULT_DEV_SECTION_STR);
    }

    check(epattern != NULL, "Embedding pattern is required for -s optimize,train,parse");

    if (etransform_str == NULL) {
        log_info("Embedding transformation is set to be QUADRATIC");

        etransform = DEFAULT_EMBEDDING_TRANFORMATION;
    } else if (strcmp(etransform_str, "LINEAR") == 0) {
        etransform = LINEAR;
    } else if (strcmp(etransform_str, "QUADRATIC") == 0) {
        etransform = QUADRATIC;
    } else if (strcmp(etransform_str, "CUBIC") == 0) {
        etransform = CUBIC;
    } else {
        log_err("Unsupported transformation type for embedding %s", etransform_str);
    }

    if (strcmp(stage, "optimize") == 0 || strcmp(stage, "train") == 0) {

        if (maxnumit <= 0) {
            log_info("maxnumit is set to %d", DEFAULT_MAX_NUMIT);

            maxnumit = DEFAULT_MAX_NUMIT;
        }
    }

    if (kernel_str != NULL) {
        if (strcmp(kernel_str, "POLYNOMIAL") == 0) {

            log_info("Polynomial kernel will be used with bias %f and degree %d", bias, polynomial_degree);

            kernel = KPOLYNOMIAL;
        } else if (strcmp(kernel_str, "GAUSSIAN") == 0 || strcmp(kernel_str, "RBF") == 0) {

            if (rbf_lambda_str != NULL) {
                rbf_lambda = (float) atof(rbf_lambda_str);
            }

            log_info("RBF/GAUSSIAN kernel will be used with lambda %f ", rbf_lambda);

            kernel = KRBF;


        } else {
            log_err("Unsupported kernel type %s. Valid options are LINEAR, POLYNOMIAL, and RBF/GAUSSIAN", kernel_str);
            goto error;
        }
    }

    if (strcmp(stage, "optimize") == 0) {
        void *model = optimize(maxnumit, maxrec, path, training, dev, edimension);

        char* model_filename = (char*) malloc(sizeof (char) * (strlen(modelname) + 7));
        check_mem(model_filename);

        sprintf(model_filename, "%s.model", modelname);

        FILE *fp = fopen(model_filename, "w");

        if (kernel == KLINEAR) {

            PerceptronModel pmodel = (PerceptronModel) model;

            dump_PerceptronModel(fp, edimension, pmodel->embedding_w_best, pmodel->best_numit);

            PerceptronModel_free(pmodel);
        } else if (kernel == KPOLYNOMIAL || kernel == KRBF) {
            KernelPerceptron kpmodel = (KernelPerceptron) model;

            dump_KernelPerceptronModel(fp, kpmodel);
        }

        log_info("Model is dumped into %s file", model_filename);


        fclose(fp);



    } else if (strcmp(stage, "parse") == 0) {
        char* model_filename = (char*) malloc(sizeof (char) * (strlen(modelname) + 7));
        check_mem(model_filename);

        sprintf(model_filename, "%s.model", modelname);
        FILE *fp = fopen(model_filename, "r");

        check(fp != NULL, "%s could not be opened", model_filename);

        void *model;
        if (kernel == KLINEAR)
            model = load_PerceptronModel(fp);
        else
            model = load_KernelPerceptronModel(fp);

        fclose(fp);

        check(model != NULL, "Error in loading model file");

        log_info("Model loaded from %s successfully", model_filename);

        parseall(model, path, training, edimension);
    } else {
        log_info("Waiting for implementation");
    }



    return (EXIT_SUCCESS);
error:

    return (EXIT_FAILURE);

}
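Unless overridden with -c, ai-parse sizes its MKL pool to roughly 90% of the cores reported by mkl_get_max_threads(), clamped to at least one thread. The same heuristic in isolation (the function name is illustrative):

#include <mkl.h>

// Sketch: choose an MKL thread count -- use the caller's request if positive,
// otherwise take ~90% of the detected maximum, but never fewer than one.
static int pick_mkl_threads(int requested) {
    if (requested > 0)
        return requested;
    int n = static_cast<int>(mkl_get_max_threads() * 0.9);
    return (n > 0) ? n : 1;
}

// usage: mkl_set_num_threads(pick_mkl_threads(num_parallel_mkl_slaves));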
  int exampleDenseCholByBlocks(const ordinal_type mmin,
                               const ordinal_type mmax,
                               const ordinal_type minc,
                               const ordinal_type mb,
                               const int max_concurrency,
                               const int max_task_dependence,
                               const int team_size,
                               const int mkl_nthreads,
                               const bool check,
                               const bool verbose) {
    typedef typename
      Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ;

    const bool detail = false;
    std::cout << "DeviceSpace::  "; DeviceSpaceType::print_configuration(std::cout, detail);
    std::cout << "HostSpace::    ";   HostSpaceType::print_configuration(std::cout, detail);

    typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType;
    typedef DenseMatrixView<DenseMatrixBaseHostType> DenseMatrixViewHostType;

    typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType;
    typedef DenseMatrixView<DenseMatrixBaseDeviceType> DenseMatrixViewDeviceType;
    typedef TaskView<DenseMatrixViewDeviceType> DenseTaskViewDeviceType;

    typedef DenseMatrixBase<DenseTaskViewDeviceType,ordinal_type,size_type,DeviceSpaceType> DenseHierMatrixBaseDeviceType;

    typedef DenseMatrixView<DenseHierMatrixBaseDeviceType> DenseHierMatrixViewDeviceType;
    typedef TaskView<DenseHierMatrixViewDeviceType> DenseHierTaskViewDeviceType;

    int r_val = 0;

    Kokkos::Impl::Timer timer;
    double t = 0.0;

    std::cout << "DenseCholByBlocks:: test matrices "
              <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc 
              << " , mb = " << mb << std::endl;

    const size_t max_task_size = (3*sizeof(DenseTaskViewDeviceType)+sizeof(PolicyType)+128); 
    PolicyType policy(max_concurrency,
                      max_task_size,
                      max_task_dependence,
                      team_size);

    std::ostringstream os;
    os.precision(3);
    os << std::scientific;

    for (ordinal_type m=mmin;m<=mmax;m+=minc) {
      os.str("");
      
      // host matrices
      DenseMatrixBaseHostType AA_host("AA_host", m, m), AB_host("AB_host"), TT_host("TT_host");

      // random T matrix
      {
        TT_host.createConfTo(AA_host);
        for (ordinal_type j=0;j<TT_host.NumCols();++j) {
          for (ordinal_type i=0;i<TT_host.NumRows();++i)
            TT_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0;
          TT_host.Value(j,j) = std::fabs(TT_host.Value(j,j));
        }
      }
      // create SPD matrix
      {
        Teuchos::BLAS<ordinal_type,value_type> blas;

        blas.HERK(ArgUplo == Uplo::Upper ? Teuchos::UPPER_TRI : Teuchos::LOWER_TRI, 
                  Teuchos::CONJ_TRANS,
                  m, m,
                  1.0,
                  TT_host.ValuePtr(), TT_host.ColStride(),
                  0.0,
                  AA_host.ValuePtr(), AA_host.ColStride());

        // preserve a copy of A
        AB_host.createConfTo(AA_host);        
        DenseMatrixTools::copy(AB_host, AA_host);
      }

      const double flop = DenseFlopCount<value_type>::Chol(m);

#ifdef HAVE_SHYLUTACHO_MKL
      mkl_set_num_threads(mkl_nthreads);
#endif
      os << "DenseCholByBlocks:: m = " << m << "  ";
      int ierr = 0;
      if (check) {
        timer.reset();
        DenseMatrixViewHostType A_host(AB_host); 
        ierr = Chol<ArgUplo,AlgoChol::ExternalLapack,Variant::One>::invoke
          (policy, policy.member_single(),
           A_host);
        t = timer.seconds();
        TACHO_TEST_FOR_ABORT( ierr, "Fail to perform Cholesky (serial)");
        os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }

      DenseMatrixBaseDeviceType AA_device("AA_device");
      {
        timer.reset();
        AA_device.mirror(AA_host);
        t = timer.seconds();
        os << ":: Mirror = " << t << " [sec]  ";
      }

      {
        DenseHierMatrixBaseDeviceType HA_device("HA_device");
        
        DenseMatrixTools::createHierMatrix(HA_device, AA_device, mb, mb);

        DenseHierTaskViewDeviceType TA_device(HA_device);

        timer.reset();
        auto future = policy.proc_create_team
          (Chol<ArgUplo,AlgoChol::DenseByBlocks,ArgVariant>
           ::createTaskFunctor(policy, TA_device), 
           0);
        policy.spawn(future);
        Kokkos::Experimental::wait(policy);
        t = timer.seconds();
        os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs]  ";
      }

      AA_host.mirror(AA_device);
      if (!ierr && check) {
        double err = 0.0, norm = 0.0;
        for (ordinal_type j=0;j<AA_host.NumCols();++j)
          for (ordinal_type i=0;i<=j;++i) {
            const double diff = std::abs(AA_host.Value(i,j) - AB_host.Value(i,j));
            const double val  = AB_host.Value(i,j);
            err  += diff*diff;
            norm += val*val;
          }
        os << ":: Check result ::norm = " << sqrt(norm) << ", error = " << sqrt(err);
      }
      std::cout << os.str() << std::endl;
    }
    
    return r_val;
  }
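The test matrix above is generated as A = Tᴴ·T from a random T whose diagonal is made positive, which yields a symmetric positive (semi-)definite input for the Cholesky routines. A plain-CBLAS sketch of the same construction for a real, column-major, double-precision matrix (the names and the CBLAS header are assumptions about the build):

#include <cstddef>
#include <cstdlib>
#include <vector>
#include <mkl_cblas.h>   // or cblas.h, depending on the BLAS in use

// Sketch: fill the upper triangle of C (m x m, column-major) with T^T * T,
// where T holds uniform random values in [-1, 1] -- the same trick as the
// Teuchos HERK call above, restricted to real arithmetic.
void makeSpdMatrix(int m, std::vector<double> &C) {
  std::vector<double> T(static_cast<std::size_t>(m) * m);
  for (double &v : T)
    v = 2.0 * static_cast<double>(std::rand()) / RAND_MAX - 1.0;
  C.assign(static_cast<std::size_t>(m) * m, 0.0);
  cblas_dsyrk(CblasColMajor, CblasUpper, CblasTrans,
              m, m, 1.0, T.data(), m, 0.0, C.data(), m);
}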
Example #27
int dynamixMain (int argc, char * argv[]) {



  //// DECLARING VARIABLES


  // Struct of parameters
  PARAMETERS p;
  // CVode variables
  void * cvode_mem = NULL;			// pointer to block of CVode memory
  N_Vector y, yout;			// arrays of populations

  // arrays for energetic parameters
  realtype ** V = NULL;				// pointer to k-c coupling constants
  realtype * Vbridge = NULL;			// pointer to array of bridge coupling constants.
  // first element [0] is Vkb1, last [Nb] is VcbN
  realtype * Vnobridge = NULL;			// coupling constant when there is no bridge

  //// Setting defaults for parameters to be read from input
  //// done setting defaults

  int flag;
  realtype * k_pops = NULL;				// pointers to arrays of populations
  realtype * l_pops = NULL;
  realtype * c_pops = NULL;
  realtype * b_pops = NULL;
  realtype * ydata = NULL;				// pointer to ydata (contains all populations)
  realtype * wavefunction = NULL;			// (initial) wavefunction
  realtype * dm = NULL;					// density matrix
  realtype * dmt = NULL;				// density matrix in time
  realtype * wfnt = NULL;				// wavefunction in time
  realtype * k_energies = NULL;				// pointers to arrays of energies
  realtype * c_energies = NULL;
  realtype * b_energies = NULL;
  realtype * l_energies = NULL;
  realtype t0 = 0.0;				// initial time
  realtype t = 0;
  realtype tret = 0;					// time returned by the solver
  time_t startRun;				// time at start of log
  time_t endRun;					// time at end of log
  struct tm * currentTime = NULL;			// time structure for localtime
#ifdef DEBUG
  FILE * realImaginary;				// file containing real and imaginary parts of the wavefunction
#endif
  FILE * log;					// log file with run times
  realtype * tkprob = NULL; 				// total probability in k, l, c, b states at each timestep
  realtype * tlprob = NULL;
  realtype * tcprob = NULL;
  realtype * tbprob = NULL;
  double ** allprob = NULL;				// populations in all states at all times
  realtype * times = NULL;
  realtype * qd_est = NULL;
  realtype * qd_est_diag = NULL;
  std::string inputFile = "ins/parameters.in";			// name of input file
  std::string cEnergiesInput = "ins/c_energies.in";
  std::string cPopsInput = "ins/c_pops.in";
  std::string bEnergiesInput = "ins/b_energies.in";
  std::string VNoBridgeInput = "ins/Vnobridge.in";
  std::string VBridgeInput = "ins/Vbridge.in";
  std::map<const std::string, bool> outs;	// map of output file names to bool

  // default output directory
  p.outputDir = "outs/";

  double summ = 0;			// sum variable

  // ---- process command line flags ---- //
  opterr = 0;
  int c;
  std::string insDir;
  /* process command line options */
  while ((c = getopt(argc, argv, "i:o:")) != -1) {
    switch (c) {
      case 'i':
	// check that it ends in a slash
	std::cerr << "[dynamix]: assigning input directory" << std::endl;
	insDir = optarg;
	if (strcmp(&(insDir.at(insDir.length() - 1)), "/")) {
	  std::cerr << "ERROR: option -i requires argument ("
	            << insDir << ") to have a trailing slash (/)." << std::endl;
	  return 1;
	}
	else {
	  // ---- assign input files ---- //
	  inputFile = insDir + "parameters.in";
	  cEnergiesInput = insDir + "c_energies.in";
	  cPopsInput = insDir + "c_pops.in";
	  bEnergiesInput = insDir + "b_energies.in";
	  VNoBridgeInput = insDir + "Vnobridge.in";
	  VBridgeInput = insDir + "Vbridge.in";
	}
	break;
      case 'o':
	std::cerr << "[dynamix]: assigning output directory" << std::endl;
	p.outputDir = optarg;
	break;
      case '?':
	if (optopt == 'i') {
	  fprintf(stderr, "Option -%c requires a directory argument.\n", optopt);
	}
	else if (isprint(optopt)) {
	  fprintf(stderr, "Unknown option -%c.\n", optopt);
	}
	else {
	  fprintf(stderr, "Unknown option character `\\x%x'.\n", optopt);
	}
	return 1;
      default:
	continue;
    }
  }
  optind = 1;	// reset global variable counter for the next time this is run

  std::cerr << "[dynamix]: ARGUMENTS" << std::endl;
  for (int ii = 0; ii < argc; ii++) {
    std::cerr << "[dynamix]: " << argv[ii] << std::endl;
  }

  //// ASSIGN PARAMETERS FROM INPUT FILE


  // ---- TODO create output directory if it does not exist ---- //
  flag = mkdir(p.outputDir.c_str(), 0755);

  std::cerr << "Looking for inputs in all the " << inputFile << " places" << std::endl;
  assignParams(inputFile.c_str(), &p);

  // Decide which output files to make
#ifdef DEBUG
  std::cout << "Assigning outputs as specified in " << inputFile << "\n";
#endif
  assignOutputs(inputFile.c_str(), outs, &p);

#ifdef DEBUG
  // print out which outputs will be made
  for (std::map<const std::string, bool>::iterator it = outs.begin(); it != outs.end(); it++) {
    std::cout << "Output file: " << it->first << " will be created.\n";
  }
#endif

  // OPEN LOG FILE; PUT IN START TIME //
  if (isOutput(outs, "log.out")) {
    log = fopen("log.out", "w");			// note that this file is closed at the end of the program
  }
  time(&startRun);
  currentTime = localtime(&startRun);
  if (isOutput(outs, "log.out")) {
    fprintf(log, "Run started at %s\n", asctime(currentTime));
  }

  if (isOutput(outs, "log.out")) {
    // make a note about the laser intensity.
    fprintf(log,"The laser intensity is %.5e W/cm^2.\n\n",pow(p.pumpAmpl,2)*3.5094452e16);
  }


  //// READ DATA FROM INPUTS


  p.Nc = numberOfValuesInFile(cEnergiesInput.c_str());
  p.Nb = numberOfValuesInFile(bEnergiesInput.c_str());
  k_pops = new realtype [p.Nk];
  c_pops = new realtype [p.Nc];
  b_pops = new realtype [p.Nb];
  l_pops = new realtype [p.Nl];
  k_energies = new realtype [p.Nk];
  c_energies = new realtype [p.Nc];
  b_energies = new realtype [p.Nb];
  l_energies = new realtype [p.Nl];
  if (numberOfValuesInFile(cPopsInput.c_str()) != p.Nc) {
    fprintf(stderr, "ERROR [Inputs]: c_pops and c_energies not the same length.\n");
    return -1;
  }
  readArrayFromFile(c_energies, cEnergiesInput.c_str(), p.Nc);
  if (p.bridge_on) {
    if (p.bridge_on && (p.Nb < 1)) {
      std::cerr << "\nERROR: bridge_on but no bridge states.  The file b_energies.in is probably empty.\n";
      return -1;
    }
    p.Vbridge.resize(p.Nb+1);
    readArrayFromFile(b_energies, bEnergiesInput.c_str(), p.Nb);
    readVectorFromFile(p.Vbridge, VBridgeInput.c_str(), p.Nb + 1);
#ifdef DEBUG
    std::cout << "COUPLINGS:";
    for (int ii = 0; ii < p.Nb+1; ii++) {
      std::cout << " " << p.Vbridge[ii];
    }
    std::cout << std::endl;
#endif
  }
  else {
    p.Nb = 0;
    p.Vnobridge.resize(1);
    readVectorFromFile(p.Vnobridge, VNoBridgeInput.c_str(), 1);
  }

#ifdef DEBUG
  std::cout << "\nDone reading things from inputs.\n";
#endif


  //// PREPROCESS DATA FROM INPUTS


  // check torsion parameters, set up torsion spline
  if (p.torsion) {
#ifdef DEBUG
    std::cout << "Torsion is on." << std::endl;
#endif

    // error checking
    if (p.torsionSite > p.Nb) {
      std::cerr << "ERROR: torsion site (" << p.torsionSite
	<< ") is larger than number of bridge sites (" << p.Nb << ")." << std::endl;
      exit(-1);
    }
    else if (p.torsionSite < 0) {
      std::cerr << "ERROR: torsion site is less than zero." << std::endl;
      exit(-1);
    }

    if (!fileExists(p.torsionFile)) {
      std::cerr << "ERROR: torsion file " << p.torsionFile << " does not exist." << std::endl;
    }

    // create spline
    p.torsionV = new Spline(p.torsionFile.c_str());
    if (p.torsionV->getFirstX() != 0.0) {
      std::cerr << "ERROR: time in " << p.torsionFile << " should start at 0.0." << std::endl;
      exit(-1);
    }
    if (p.torsionV->getLastX() < p.tout) {
      std::cerr << "ERROR: time in " << p.torsionFile << " should be >= tout." << std::endl;
      exit(-1);
    }
  }


  // set number of processors for OpenMP
  //omp_set_num_threads(p.nproc);
  mkl_set_num_threads(p.nproc);

  p.NEQ = p.Nk+p.Nc+p.Nb+p.Nl;				// total number of equations set
  p.NEQ2 = p.NEQ*p.NEQ;				// number of elements in DM
#ifdef DEBUG
  std::cout << "\nTotal number of states: " << p.NEQ << std::endl;
  std::cout << p.Nk << " bulk, " << p.Nc << " QD, " << p.Nb << " bridge, " << p.Nl << " bulk VB.\n";
#endif
  tkprob = new realtype [p.numOutputSteps+1];	// total population on k, b, c at each timestep
  tcprob = new realtype [p.numOutputSteps+1];
  tbprob = new realtype [p.numOutputSteps+1];
  tlprob = new realtype [p.numOutputSteps+1];
  allprob = new double * [p.numOutputSteps+1];
  for (int ii = 0; ii <= p.numOutputSteps; ii++) {
    allprob[ii] = new double [p.NEQ];
  }
  // assign times.
  p.times.resize(p.numOutputSteps+1);
  for (int ii = 0; ii <= p.numOutputSteps; ii++) {
    p.times[ii] = float(ii)/p.numOutputSteps*p.tout;
  }
  qd_est = new realtype [p.numOutputSteps+1];
  qd_est_diag = new realtype [p.numOutputSteps+1];
  p.Ik = 0;					// set index start positions for each type of state
  p.Ic = p.Nk;
  p.Ib = p.Ic+p.Nc;
  p.Il = p.Ib+p.Nb;

  // assign bulk conduction and valence band energies
  // for RTA, bulk and valence bands have parabolic energies
  if (p.rta) {
    buildParabolicBand(k_energies, p.Nk, p.kBandEdge, CONDUCTION, &p);
    buildParabolicBand(l_energies, p.Nl, p.lBandTop, VALENCE, &p);
  }
  else {
    buildContinuum(k_energies, p.Nk, p.kBandEdge, p.kBandTop);
    buildContinuum(l_energies, p.Nl, p.kBandEdge - p.valenceBand - p.bulk_gap, p.kBandEdge - p.bulk_gap);
  }
  // calculate band width
  p.kBandWidth = k_energies[p.Nk - 1] - k_energies[0];


  //// BUILD INITIAL WAVEFUNCTION


  // bridge states (empty to start)
  initializeArray(b_pops, p.Nb, 0.0);

  // coefficients in bulk and other states depend on input conditions in bulk
  if (!p.rta) {
#ifdef DEBUG
    std::cout << "\ninitializing k_pops\n";
#endif
    if (p.bulk_constant) {
      initializeArray(k_pops, p.Nk, 0.0);
#ifdef DEBUG
      std::cout << "\ninitializing k_pops with constant probability in range of states\n";
#endif
      initializeArray(k_pops+p.Nk_first-1, p.Nk_final-p.Nk_first+1, 1.0);
      initializeArray(l_pops, p.Nl, 0.0);		// populate l states (all 0 to start off)
      initializeArray(c_pops, p.Nc, 0.0);		// QD states empty to start
    }
    else if (p.bulk_Gauss) {
      buildKPopsGaussian(k_pops, k_energies, p.kBandEdge,
	  p.bulkGaussSigma, p.bulkGaussMu, p.Nk);   // populate k states with a Gaussian distribution
      initializeArray(l_pops, p.Nl, 0.0);		// populate l states (all 0 to start off)
      initializeArray(c_pops, p.Nc, 0.0);		// QD states empty to start
    }
    else if (p.qd_pops) {
      readArrayFromFile(c_pops, cPopsInput.c_str(), p.Nc);	// QD populations from file
      initializeArray(l_pops, p.Nl, 0.0);		// populate l states (all 0 to start off)
      initializeArray(k_pops, p.Nk, 0.0);             // populate k states (all zero to start off)
    }
    else {
      initializeArray(k_pops, p.Nk, 0.0);             // populate k states (all zero to start off)
      initializeArray(l_pops, p.Nl, 1.0);		// populate l states (all populated to start off)
      initializeArray(c_pops, p.Nc, 0.0);		// QD states empty to start
    }
#ifdef DEBUG
    std::cout << "\nThis is k_pops:\n";
    for (int ii = 0; ii < p.Nk; ii++) {
      std::cout << k_pops[ii] << std::endl;
    }
    std::cout << "\n";
#endif
  }
  // with RTA, use different set of switches
  else {
    // bulk valence band
    if (p.VBPopFlag == POP_EMPTY) {
#ifdef DEBUG
      std::cout << "Initializing empty valence band" << std::endl;
#endif
      initializeArray(l_pops, p.Nl, 0.0);
    }
    else if (p.VBPopFlag == POP_FULL) {
#ifdef DEBUG
      std::cout << "Initializing full valence band" << std::endl;
#endif
      initializeArray(l_pops, p.Nl, 1.0);
    }
    else {
      std::cerr << "ERROR: unrecognized VBPopFlag " << p.VBPopFlag << std::endl;
    }

    // bulk conduction band
    if (p.CBPopFlag == POP_EMPTY) {
#ifdef DEBUG
      std::cout << "Initializing empty conduction band" << std::endl;
#endif
      initializeArray(k_pops, p.Nk, 0.0);
    }
    else if (p.CBPopFlag == POP_FULL) {
#ifdef DEBUG
      std::cout << "Initializing full conduction band" << std::endl;
#endif
      initializeArray(k_pops, p.Nk, 1.0);
    }
    else if (p.CBPopFlag == POP_CONSTANT) {
#ifdef DEBUG
      std::cout << "Initializing constant distribution in conduction band" << std::endl;
#endif
      initializeArray(k_pops, p.Nk, 0.0);
      initializeArray(k_pops, p.Nk, 1e-1); // FIXME
      initializeArray(k_pops+p.Nk_first-1, p.Nk_final-p.Nk_first+1, 1.0);
    }
    else if (p.CBPopFlag == POP_GAUSSIAN) {
#ifdef DEBUG
      std::cout << "Initializing Gaussian in conduction band" << std::endl;
#endif
      buildKPopsGaussian(k_pops, k_energies, p.kBandEdge,
	  p.bulkGaussSigma, p.bulkGaussMu, p.Nk);
    }
    else {
      std::cerr << "ERROR: unrecognized CBPopFlag " << p.CBPopFlag << std::endl;
    }

    //// QD
    if (p.QDPopFlag == POP_EMPTY) {
      initializeArray(c_pops, p.Nc, 0.0);
    }
    else if (p.QDPopFlag == POP_FULL) {
      initializeArray(c_pops, p.Nc, 1.0);
    }
    else {
      std::cerr << "ERROR: unrecognized QDPopFlag " << p.QDPopFlag << std::endl;
    }
  }

  // create empty wavefunction
  wavefunction = new realtype [2*p.NEQ];
  initializeArray(wavefunction, 2*p.NEQ, 0.0);

  // assign real parts of wavefunction coefficients (imaginary are zero)
  for (int ii = 0; ii < p.Nk; ii++) {
    wavefunction[p.Ik + ii] = k_pops[ii];
  }
  for (int ii = 0; ii < p.Nc; ii++) {
    wavefunction[p.Ic + ii] = c_pops[ii];
  }
  for (int ii = 0; ii < p.Nb; ii++) {
    wavefunction[p.Ib + ii] = b_pops[ii];
  }
  for (int ii = 0; ii < p.Nl; ii++) {
    wavefunction[p.Il + ii] = l_pops[ii];
  }

  if (isOutput(outs, "psi_start.out")) {
    outputWavefunction(wavefunction, p.NEQ);
  }

  // Give all coefficients a random phase
  if (p.random_phase) {
    float phi;
    // set the seed
    if (p.random_seed == -1) { srand(time(NULL)); }
    else { srand(p.random_seed); }
    for (int ii = 0; ii < p.NEQ; ii++) {
      phi = 2*3.1415926535*(float)rand()/(float)RAND_MAX;
      wavefunction[ii] = wavefunction[ii]*cos(phi);
      wavefunction[ii + p.NEQ] = wavefunction[ii + p.NEQ]*sin(phi);
    }
  }

#ifdef DEBUG
  // print out details of wavefunction coefficients
  std::cout << std::endl;
  for (int ii = 0; ii < p.Nk; ii++) {
    std::cout << "starting wavefunction: Re[k(" << ii << ")] = " << wavefunction[p.Ik + ii] << std::endl;
  }
  for (int ii = 0; ii < p.Nc; ii++) {
    std::cout << "starting wavefunction: Re[c(" << ii << ")] = " << wavefunction[p.Ic + ii] << std::endl;
  }
  for (int ii = 0; ii < p.Nb; ii++) {
    std::cout << "starting wavefunction: Re[b(" << ii << ")] = " << wavefunction[p.Ib + ii] << std::endl;
  }
  for (int ii = 0; ii < p.Nl; ii++) {
    std::cout << "starting wavefunction: Re[l(" << ii << ")] = " << wavefunction[p.Il + ii] << std::endl;
  }
  for (int ii = 0; ii < p.Nk; ii++) {
    std::cout << "starting wavefunction: Im[k(" << ii << ")] = " << wavefunction[p.Ik + ii + p.NEQ] << std::endl;
  }
  for (int ii = 0; ii < p.Nc; ii++) {
    std::cout << "starting wavefunction: Im[c(" << ii << ")] = " << wavefunction[p.Ic + ii + p.NEQ] << std::endl;
  }
  for (int ii = 0; ii < p.Nb; ii++) {
    std::cout << "starting wavefunction: Im[b(" << ii << ")] = " << wavefunction[p.Ib + ii + p.NEQ] << std::endl;
  }
  for (int ii = 0; ii < p.Nl; ii++) {
    std::cout << "starting wavefunction: Im[l(" << ii << ")] = " << wavefunction[p.Il + ii + p.NEQ] << std::endl;
  }
  std::cout << std::endl;
  summ = 0;
  for (int ii = 0; ii < 2*p.NEQ; ii++) {
    summ += pow(wavefunction[ii],2);
  }
  std::cout << "\nTotal population is " << summ << "\n\n";
#endif


  //// ASSEMBLE ARRAY OF ENERGIES


  // TODO TODO
  p.energies.resize(p.NEQ);
  for (int ii = 0; ii < p.Nk; ii++) {
    p.energies[p.Ik + ii] = k_energies[ii];
  }
  for (int ii = 0; ii < p.Nc; ii++) {
    p.energies[p.Ic + ii] = c_energies[ii];
  }
  for (int ii = 0; ii < p.Nb; ii++) {
    p.energies[p.Ib + ii] = b_energies[ii];
  }
  for (int ii = 0; ii < p.Nl; ii++) {
    p.energies[p.Il + ii] = l_energies[ii];
  }

#ifdef DEBUG
  for (int ii = 0; ii < p.NEQ; ii++) {
    std::cout << "p.energies[" << ii << "] is " << p.energies[ii] << "\n";
  }
#endif


  //// ASSIGN COUPLING CONSTANTS


  V = new realtype * [p.NEQ];
  for (int ii = 0; ii < p.NEQ; ii++) {
    V[ii] = new realtype [p.NEQ];
  }
  buildCoupling(V, &p, outs);

  if (isOutput(outs, "log.out")) {
    // make a note in the log about system timescales
    double tau = 0;		// fundamental system timescale
    if (p.Nk == 1) {
      fprintf(log, "\nThe timescale (tau) is undefined (Nk == 1).\n");
    }
    else {
      if (p.bridge_on) {
	if (p.scale_bubr) {
	  tau = 1.0/(2*p.Vbridge[0]*M_PI);
	}
	else {
	  tau = ((p.kBandTop - p.kBandEdge)/(p.Nk - 1))/(2*pow(p.Vbridge[0],2)*M_PI);
	}
      }
      else {
	if (p.scale_buqd) {
	  tau = 1.0/(2*p.Vnobridge[0]*M_PI);
	}
	else {
	  tau = ((p.kBandTop - p.kBandEdge)/(p.Nk - 1))/(2*pow(p.Vnobridge[0],2)*M_PI);
	}
      }
      fprintf(log, "\nThe timescale (tau) is %.9e a.u.\n", tau);
    }
  }

  //// CREATE DENSITY MATRIX
  if (! p.wavefunction) {
    // Create the initial density matrix
    dm = new realtype [2*p.NEQ2];
    initializeArray(dm, 2*p.NEQ2, 0.0);
#pragma omp parallel for
    for (int ii = 0; ii < p.NEQ; ii++) {
      // diagonal part
      dm[p.NEQ*ii + ii] = pow(wavefunction[ii],2) + pow(wavefunction[ii + p.NEQ],2);
      if (p.coherent) {
	// off-diagonal part
	for (int jj = 0; jj < ii; jj++) {
	  // real part of \rho_{ii,jj}
	  dm[p.NEQ*ii + jj] = wavefunction[ii]*wavefunction[jj] + wavefunction[ii+p.NEQ]*wavefunction[jj+p.NEQ];
	  // imaginary part of \rho_{ii,jj}
	  dm[p.NEQ*ii + jj + p.NEQ2] = wavefunction[ii]*wavefunction[jj+p.NEQ] - wavefunction[jj]*wavefunction[ii+p.NEQ];
	  // real part of \rho_{jj,ii}
	  dm[p.NEQ*jj + ii] = dm[p.NEQ*ii + jj];
	  // imaginary part of \rho_{jj,ii}
	  dm[p.NEQ*jj + ii + p.NEQ2] = -1*dm[p.NEQ*ii + jj + p.NEQ*p.NEQ];
	}
      }
    }

    // Create the array to store the density matrix in time
    dmt = new realtype [2*p.NEQ2*(p.numOutputSteps+1)];
    initializeArray(dmt, 2*p.NEQ2*(p.numOutputSteps+1), 0.0);

#ifdef DEBUG2
    // print out density matrix
    std::cout << "\nDensity matrix without normalization:\n\n";
    for (int ii = 0; ii < p.NEQ; ii++) {
      for (int jj = 0; jj < p.NEQ; jj++) {
	fprintf(stdout, "(%+.1e,%+.1e) ", dm[p.NEQ*ii + jj], dm[p.NEQ*ii + jj + p.NEQ2]);
      }
      fprintf(stdout, "\n");
    }
#endif

    // Normalize the DM so that populations add up to 1.
    // No normalization if RTA is on.
    if (!p.rta) {
      summ = 0.0;
      for (int ii = 0; ii < p.NEQ; ii++) {
	// assume here that diagonal elements are all real
	summ += dm[p.NEQ*ii + ii];
      }
      if ( summ == 0.0 ) {
	std::cerr << "\nFATAL ERROR [populations]: total population is 0!\n";
	return -1;
      }
      if (summ != 1.0) {
	// the variable 'summ' is now a multiplicative normalization factor
	summ = 1.0/summ;
	for (int ii = 0; ii < 2*p.NEQ2; ii++) {
	  dm[ii] *= summ;
	}
      }
#ifdef DEBUG
      std::cout << "\nThe normalization factor for the density matrix is " << summ << "\n\n";
#endif
    }

    // Error checking for total population; recount population first
    summ = 0.0;
    for (int ii = 0; ii < p.NEQ; ii++) {
      summ += dm[p.NEQ*ii + ii];
    }
    if ( fabs(summ-1.0) > 1e-12  && (!p.rta)) {
      std::cerr << "\nWARNING [populations]: After normalization, total population is not 1, it is " << summ << "!\n";
    }
#ifdef DEBUG
    std::cout << "\nAfter normalization, the sum of the populations in the density matrix is " << summ << "\n\n";
#endif
    // Add initial DM to parameters.
    p.startDM.resize(2*p.NEQ2);
    memcpy(&(p.startDM[0]), &(dm[0]), 2*p.NEQ2*sizeof(double));
  }
  // wavefunction
  else {

    // Create the array to store the wavefunction in time
    wfnt = new realtype [2*p.NEQ*(p.numOutputSteps+1)];
    initializeArray(wfnt, 2*p.NEQ*(p.numOutputSteps+1), 0.0);

    // normalize
    summ = 0.0;
    for (int ii = 0; ii < p.NEQ; ii++) {
      summ += pow(wavefunction[ii],2) + pow(wavefunction[ii+p.NEQ],2);
    }
#ifdef DEBUG
    std::cout << "Before normalization, the total population is " << summ << std::endl;
#endif
    summ = 1.0/sqrt(summ);
    for (int ii = 0; ii < 2*p.NEQ; ii++) {
      wavefunction[ii] *= summ;
    }

    // check total population
    summ = 0.0;
    for (int ii = 0; ii < p.NEQ; ii++) {
      summ += pow(wavefunction[ii],2) + pow(wavefunction[ii+p.NEQ],2);
    }
#ifdef DEBUG
    std::cout << "After normalization, the total population is " << summ << std::endl;
#endif
    if (fabs(summ - 1.0) > 1e-12) {
      std::cerr << "WARNING: wavefunction not normalized!  Total density is " << summ << std::endl;
    }

    // Add initial wavefunction to parameters.
    p.startWfn.resize(2*p.NEQ);
    memcpy(&(p.startWfn[0]), &(wavefunction[0]), 2*p.NEQ*sizeof(double));
  }


  //// BUILD HAMILTONIAN


  // //TODO TODO
#ifdef DEBUG
  fprintf(stderr, "Building Hamiltonian.\n");
#endif
  realtype * H = NULL;
  H = new realtype [p.NEQ2];
  for (int ii = 0; ii < p.NEQ2; ii++) {
    H[ii] = 0.0;
  }
  buildHamiltonian(H, p.energies, V, &p);
  // add Hamiltonian to p
  p.H.resize(p.NEQ2);
  for (int ii = 0; ii < p.NEQ2; ii++) {
    p.H[ii] = H[ii];
  }
  // create sparse version of H
  p.H_sp.resize(p.NEQ2);
  p.H_cols.resize(p.NEQ2);
  p.H_rowind.resize(p.NEQ2 + 1);
  // mkl_ddnscsr job descriptor:
  //   job[0] = 0 : convert the dense matrix to CSR (not CSR to dense)
  //   job[1] = 0 : zero-based indexing for the dense matrix
  //   job[2] = 0 : zero-based indexing for the CSR arrays
  //   job[3] = 2 : the whole matrix is supplied (not just a triangle)
  //   job[4]     : maximum number of nonzeros allowed in the output
  //   job[5] = 1 : generate all three CSR arrays (values, columns, row index)
  int job [6] = {0, 0, 0, 2, p.NEQ2, 1};
  int info = 0;

  mkl_ddnscsr(&job[0], &(p.NEQ), &(p.NEQ), &(p.H)[0], &(p.NEQ), &(p.H_sp)[0],
      &(p.H_cols)[0], &(p.H_rowind)[0], &info);


  //// SET UP CVODE VARIABLES


#ifdef DEBUG
  std::cout << "\nCreating N_Vectors.\n";
  if (p.wavefunction) {
    std::cout << "\nProblem size is " << 2*p.NEQ << " elements.\n";
  }
  else {
    std::cout << "\nProblem size is " << 2*p.NEQ2 << " elements.\n";
  }
#endif
  // Creates N_Vector y with initial populations which will be used by CVode//
  if (p.wavefunction) {
    y = N_VMake_Serial(2*p.NEQ, wavefunction);
  }
  else {
    y = N_VMake_Serial(2*p.NEQ2, dm);
  }
  // put in t = 0 information
  if (! p.wavefunction) {
    updateDM(y, dmt, 0, &p);
  }
  else {
    updateWfn(y, wfnt, 0, &p);
  }
  // the vector yout has the same dimensions as y
  yout = N_VClone(y);

#ifdef DEBUG
  realImaginary = fopen("real_imaginary.out", "w");
#endif

  // Make plot files
  makePlots(outs, &p);

  // only do propagation if not just making plots
  if (! p.justPlots) {
    // Make outputs independent of time propagation
    computeGeneralOutputs(outs, &p);

    // create CVode object
    // this is a stiff problem, I guess?
#ifdef DEBUG
    std::cout << "\nCreating cvode_mem object.\n";
#endif
    cvode_mem = CVodeCreate(CV_BDF, CV_NEWTON);
    flag = CVodeSetUserData(cvode_mem, (void *) &p);

#ifdef DEBUG
    std::cout << "\nInitializing CVode solver.\n";
#endif
    // initialize CVode solver //

    if (p.wavefunction) {
      //flag = CVodeInit(cvode_mem, &RHS_WFN, t0, y);
      flag = CVodeInit(cvode_mem, &RHS_WFN_SPARSE, t0, y);
    }
    else {
      if (p.kinetic) {
	flag = CVodeInit(cvode_mem, &RHS_DM_RELAX, t0, y);
      }
      else if (p.rta) {
	flag = CVodeInit(cvode_mem, &RHS_DM_RTA, t0, y);
	//flag = CVodeInit(cvode_mem, &RHS_DM_RTA_BLAS, t0, y);
      }
      else if (p.dephasing) {
	flag = CVodeInit(cvode_mem, &RHS_DM_dephasing, t0, y);
      }
      else {
	//flag = CVodeInit(cvode_mem, &RHS_DM, t0, y);
	flag = CVodeInit(cvode_mem, &RHS_DM_BLAS, t0, y);
      }
    }

#ifdef DEBUG
    std::cout << "\nSpecifying integration tolerances.\n";
#endif
    // specify integration tolerances //
    flag = CVodeSStolerances(cvode_mem, p.reltol, p.abstol);

#ifdef DEBUG
    std::cout << "\nAttaching linear solver module.\n";
#endif
    // attach linear solver module //
    if (p.wavefunction) {
      flag = CVDense(cvode_mem, 2*p.NEQ);
    }
    else {
      // Diagonal approximation to the Jacobian saves memory for large systems
      flag = CVDiag(cvode_mem);
    }

    //// CVODE TIME PROPAGATION


#ifdef DEBUG
    std::cout << "\nAdvancing the solution in time.\n";
#endif
    for (int ii = 1; ii <= p.numsteps; ii++) {
      t = (p.tout*((double) ii)/((double) p.numsteps));
      flag = CVode(cvode_mem, t, yout, &tret, 1);
#ifdef DEBUGf
      std::cout << std::endl << "CVode flag at step " << ii << ": " << flag << std::endl;
#endif
      if ((ii % (p.numsteps/p.numOutputSteps) == 0) || (ii == p.numsteps)) {
	// show progress in stdout
	if (p.progressStdout) {
	  fprintf(stdout, "\r%-.2lf percent done", ((double)ii/((double)p.numsteps))*100);
	  fflush(stdout);
	}
	// show progress in a file
	if (p.progressFile) {
	  std::ofstream progressFile("progress.tmp");
	  progressFile << ((double)ii/((double)p.numsteps))*100 << " percent done." << std::endl;
	  progressFile.close();
	}
	if (p.wavefunction) {
	  updateWfn(yout, wfnt, ii*p.numOutputSteps/p.numsteps, &p);
	}
	else {
	  updateDM(yout, dmt, ii*p.numOutputSteps/p.numsteps, &p);
	}
      }
    }

#ifdef DEBUG
    fclose(realImaginary);
#endif


    //// MAKE FINAL OUTPUTS


    // finalize log file //
    time(&endRun);
    currentTime = localtime(&endRun);
    if (isOutput(outs, "log.out")) {
      fprintf(log, "Final status of 'flag' variable: %d\n\n", flag);
      fprintf(log, "Run ended at %s\n", asctime(currentTime));
      fprintf(log, "Run took %.3g seconds.\n", difftime(endRun, startRun));
      fclose(log);					// note that the log file is opened after variable declaration
    }
    if (p.progressStdout) {
      printf("\nRun took %.3g seconds.\n", difftime(endRun, startRun));
    }

    // Compute density outputs.
#ifdef DEBUG
    std::cout << "Computing outputs..." << std::endl;
#endif
    if (p.wavefunction) {
      computeWfnOutput(wfnt, outs, &p);
    }
    else {
      computeDMOutput(dmt, outs, &p);
    }
#ifdef DEBUG
    std::cout << "done computing outputs" << std::endl;
#endif

    // do analytical propagation
    if (p.analytical && (! p.bridge_on)) {
      computeAnalyticOutputs(outs, &p);
    }
  }


  //// CLEAN UP


#ifdef DEBUG
  fprintf(stdout, "Deallocating N_Vectors.\n");
#endif
  // deallocate memory for N_Vectors //
  N_VDestroy_Serial(y);
  N_VDestroy_Serial(yout);

#ifdef DEBUG
  fprintf(stdout, "Freeing CVode memory.\n");
#endif
  // free solver memory //
  CVodeFree(&cvode_mem);

#ifdef DEBUG
  fprintf(stdout, "Freeing memory in main.\n");
#endif
  // free the dynamically allocated arrays
  delete [] tkprob;
  delete [] tlprob;
  delete [] tcprob;
  delete [] tbprob;
  for (int ii = 0; ii <= p.numOutputSteps; ii++) {
    delete [] allprob[ii];
  }
  delete [] allprob;
  delete [] k_pops;
  delete [] c_pops;
  delete [] b_pops;
  delete [] l_pops;
  if (p.bridge_on) {
    delete [] Vbridge;
  }
  else {
    delete [] Vnobridge;
  }
  delete [] k_energies;
  delete [] c_energies;
  delete [] b_energies;
  delete [] l_energies;
  delete [] wavefunction;
  delete [] H;
  for (int ii = 0; ii < p.NEQ; ii++) {
    delete [] V[ii];
  }
  delete [] V;
  if (p.wavefunction) {
    delete [] wfnt;
  }
  else {
    delete [] dm;
    delete [] dmt;
  }
  delete [] times;
  delete [] qd_est;
  delete [] qd_est_diag;

  std::cout << "whoo" << std::endl;

  return 0;
}
int main(int argc, char const *argv[]) {
  Eigen::setNbThreads(NumCores);
#ifdef MKL
  mkl_set_num_threads(NumCores);
#endif
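  // Eigen and MKL are both capped at NumCores threads, presumably so the two
  // threading layers do not oversubscribe the available cores.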
  INFO("Eigen3 uses " << Eigen::nbThreads() << " threads.");
  int L;
  RealType J12ratio;
  int OBC;
  int N1, N2;
  int dynamics, Tsteps;
  RealType Uin, phi, dt;
  std::vector<RealType> Vin;
  LoadParameters( "conf.h5", L, J12ratio, OBC, N1, N2, Uin, Vin, phi, dynamics, Tsteps, dt);
  HDF5IO *file = new HDF5IO("FSSH.h5");
  // const int L = 5;
  // const bool OBC = true;
  // const RealType J12ratio = 0.010e0;
  INFO("Build Lattice - ");
  std::vector<DT> J;
  if ( OBC ){
    // J = std::vector<DT>(L - 1, 0.0);// NOTE: Atomic limit testing
    J = std::vector<DT>(L - 1, 1.0);
    if ( J12ratio > 1.0e0 ) {
      for (size_t cnt = 1; cnt < L-1; cnt+=2) {
        J.at(cnt) /= J12ratio;
      }
    } else{
      for (size_t cnt = 0; cnt < L-1; cnt+=2) {
        J.at(cnt) *= J12ratio;
      }
    }
  } else{
    J = std::vector<DT>(L, 1.0);
    if ( J12ratio > 1.0e0 ) {
      for (size_t cnt = 1; cnt < L; cnt+=2) {
        J.at(cnt) /= J12ratio;
      }
    } else{
      for (size_t cnt = 0; cnt < L; cnt+=2) {
        J.at(cnt) *= J12ratio;
      }
    }
#ifndef DTYPE
    if ( std::abs(phi) > 1.0e-10 ){
      J.at(L-1) *= exp( DT(0.0, 1.0) * phi );
    }
#endif
  }
  for ( auto &val : J ){
    INFO_NONEWLINE(val << " ");
  }
  INFO("");
  const std::vector< Node<DT>* > lattice = NN_1D_Chain(L, J, OBC);
  file->saveNumber("1DChain", "L", L);
  file->saveStdVector("1DChain", "J", J);
  for ( auto &lt : lattice ){
    if ( !(lt->VerifySite()) ) RUNTIME_ERROR("Wrong lattice setup!");
  }
  INFO("DONE!");
  INFO("Build Basis - ");
  // int N1 = (L+1)/2;
  Basis F1(L, N1, true);
  F1.Fermion();
  std::vector<int> st1 = F1.getFStates();
  std::vector<size_t> tg1 = F1.getFTags();
  // for (size_t cnt = 0; cnt < st1.size(); cnt++) {
  //   INFO_NONEWLINE( std::setw(3) << st1.at(cnt) << " - ");
  //   F1.printFermionBasis(st1.at(cnt));
  //   INFO("- " << tg1.at(st1.at(cnt)));
  // }
  // int N2 = (L-1)/2;
  Basis F2(L, N2, true);
  F2.Fermion();
  std::vector<int> st2 = F2.getFStates();
  std::vector<size_t> tg2 = F2.getFTags();
  // for (size_t cnt = 0; cnt < st2.size(); cnt++) {
  //   INFO_NONEWLINE( std::setw(3) << st2.at(cnt) << " - ");
  //   F2.printFermionBasis(st2.at(cnt));
  //   INFO("- " << tg2.at(st2.at(cnt)));
  // }
  file->saveNumber("Basis", "N1", N1);
  file->saveStdVector("Basis", "F1States", st1);
  file->saveStdVector("Basis", "F1Tags", tg1);
  file->saveNumber("Basis", "N2", N2);
  file->saveStdVector("Basis", "F2States", st2);
  file->saveStdVector("Basis", "F2Tags", tg2);
  INFO("DONE!");
  INFO_NONEWLINE("Build Hamiltonian - ");
  std::vector<Basis> Bases;
  Bases.push_back(F1);
  Bases.push_back(F2);
  Hamiltonian<DT> ham( Bases );
  std::vector< std::vector<DT> > Vloc;
  std::vector<DT> Vtmp;//(L, 1.0);
  for ( RealType &val : Vin ){
    Vtmp.push_back((DT)val);
  }
  Vloc.push_back(Vtmp);
  Vloc.push_back(Vtmp);
  std::vector< std::vector<DT> > Uloc;
  // std::vector<DT> Utmp(L, DT(10.0e0, 0.0e0) );
  std::vector<DT> Utmp(L, (DT)Uin);
  Uloc.push_back(Utmp);
  Uloc.push_back(Utmp);
  ham.BuildLocalHamiltonian(Vloc, Uloc, Bases);
  INFO(" - BuildLocalHamiltonian DONE!");
  ham.BuildHoppingHamiltonian(Bases, lattice);
  INFO(" - BuildHoppingHamiltonian DONE!");
  ham.BuildTotalHamiltonian();
  INFO("DONE!");
  INFO_NONEWLINE("Diagonalize Hamiltonian - ");
  std::vector<RealType> Val;
  Hamiltonian<DT>::VectorType Vec;
  ham.eigh(Val, Vec);
  INFO("GS energy = " << Val.at(0));
  file->saveVector("GS", "EVec", Vec);
  file->saveStdVector("GS", "EVal", Val);
  INFO("DONE!");
  std::vector< DTV > Nfi = Ni( Bases, Vec, ham );
  INFO(" Up Spin - ");
  INFO(Nfi.at(0));
  INFO(" Down Spin - ");
  INFO(Nfi.at(1));
  INFO(" N_i - ");
  DTV Niall = Nfi.at(0) + Nfi.at(1);
  INFO(Niall);
  DTM Nud = NupNdn( Bases, Vec, ham );
  INFO(" Correlation NupNdn");
  INFO(Nud);
  DTM Nuu = NupNup( Bases, Vec, ham );
  INFO(" Correlation NupNup");
  INFO(Nuu);
  DTM Ndd = NdnNdn( Bases, Vec, ham );
  INFO(" Correlation NdnNdn");
  INFO(Ndd);
  INFO(" N_i^2 - ");
  DTM Ni2 = Nuu.diagonal() + Ndd.diagonal() + 2.0e0 * Nud.diagonal();
  INFO(Ni2);
  file->saveVector("Obs", "Nup", Nfi.at(0));
  file->saveVector("Obs", "Ndn", Nfi.at(1));
  file->saveMatrix("Obs", "NupNdn", Nud);
  file->saveMatrix("Obs", "NupNup", Nuu);
  file->saveMatrix("Obs", "NdnNdn", Ndd);
  delete file;
  if ( dynamics ){
    ComplexType Prefactor = ComplexType(0.0, -1.0e0*dt);/* NOTE: hbar = 1 */
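    // With Prefactor = -i*dt, each call to ham.expH(Prefactor, Vec) presumably
    // applies the short-time propagator exp(-i*H*dt) to the state vector.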
    std::cout << "Begin dynamics......" << std::endl;
    std::cout << "Cut the boundary." << std::endl;
    J.pop_back();
    std::vector< Node<DT>* > lattice2 = NN_1D_Chain(L, J, true);// cut to open
    ham.BuildHoppingHamiltonian(Bases, lattice2);
    INFO(" - Update Hopping Hamiltonian DONE!");
    ham.BuildTotalHamiltonian();
    INFO(" - Update Total Hamiltonian DONE!");
    for (size_t cntT = 1; cntT <= Tsteps; cntT++) {
      ham.expH(Prefactor, Vec);
      if ( cntT % 2 == 0 ){
        HDF5IO file2("DYN.h5");
        std::string gname = "Obs-";
        gname.append(std::to_string((unsigned long long)cntT));
        gname.append("/");
        Nfi = Ni( Bases, Vec, ham );
        Nud = NupNdn( Bases, Vec, ham );
        Nuu = NupNup( Bases, Vec, ham );
        Ndd = NdnNdn( Bases, Vec, ham );
        file2.saveVector(gname, "Nup", Nfi.at(0));
        file2.saveVector(gname, "Ndn", Nfi.at(1));
        file2.saveMatrix(gname, "NupNdn", Nud);
        file2.saveMatrix(gname, "NupNup", Nuu);
        file2.saveMatrix(gname, "NdnNdn", Ndd);
      }
    }
  }
  return 0;
}
extern "C" void plasma_set_num_threads(int num_threads)
{
    //plasma_setlapack_numthreads(num_threads);
    mkl_set_num_threads(num_threads);
}
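// Illustrative usage sketch (not from the original sources): how the thin
// wrapper above might be exercised and verified. Assumes MKL headers and
// linkage are available; the helper name check_plasma_threads is hypothetical.
#include <cstdio>
#include <mkl.h>

void check_plasma_threads()
{
    // Request four MKL threads through the wrapper, then read the limit back.
    plasma_set_num_threads(4);
    std::printf("MKL max threads: %d\n", mkl_get_max_threads());
}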
extern "C" __declspec(dllexport) void set_max_threads(const MKL_INT num_threads)
{
	mkl_set_num_threads(num_threads);
}