int main() { double *A, *B, *C; int i,j,r,max_threads,size; double alpha, beta; double s_initial, s_elapsed; printf("Intializing data for matrix multiplication C=A*B for matrix\n\n" " A(%i*%i) and matrix B(%i*%i)\n",M,P,P,N); alpha = 1.0; beta = 0.0; printf("Allocating memory for matrices aligned on 64-byte boundary for better performance \n\n"); A = ( double *)mkl_malloc(M*P*sizeof( double ),64); B = ( double *)mkl_malloc(N*P*sizeof( double ),64); C = ( double *)mkl_malloc(M*N*sizeof( double ),64); if (A == NULL || B == NULL || C == NULL) { printf("Error: can`t allocate memory for matrices.\n\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 1; } printf("Intializing matrix data\n\n"); size = M*P; for (i = 0; i < size; ++i) { A[i] = ( double )(i+1); } size = N*P; for (i = 0; i < size; ++i) { B[i] = ( double )(i-1); } printf("Finding max number of threads can use for parallel runs \n\n"); max_threads = mkl_get_max_threads(); printf("Running from 1 to %i threads \n\n",max_threads); for (i = 1; i <= max_threads; ++i) { size = M*N; for (j = 0; j < size; ++j) { C[j] = 0.0; } printf("Requesting to use %i threads \n\n",i); mkl_set_num_threads(i); printf("Measuring performance of matrix product using dgemm function\n" " via CBLAS interface on %i threads \n\n",i); s_initial = dsecnd(); for (r = 0; r < LOOP_COUNT; ++r) { cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, P, alpha, A, P, B, N, beta, C, N); // multiply matrices with cblas_dgemm; } s_elapsed = (dsecnd() - s_initial) / LOOP_COUNT; printf("Matrix multiplication using dgemm completed \n" " at %.5f milliseconds using %d threads \n\n", (s_elapsed * 1000),i); printf("Output the result: \n"); size = M*N; for (i = 0; i < size; ++i) { printf("%i\t",(int)C[i]); if (i % N == N - 1) printf("\n"); } } printf("Dellocating memory\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 0; }
// Runs the numeric factorization for the solver instance behind the handle
// `dslv_` and then increments its `numeric` counter.
//
//   coefs           : matrix coefficients, reinterpreted as double or
//                     complex<double> depending on dslv->real_or_complex
//   scaling = 0     : without scaling
//             1     : 1/sqrt(a_ii) or 1/sqrt(max|a_ij|)
//             2     : 1/sqrt(a_ii) or Schur complement corresponding to diagonal
//                     kernel_detection_all (for KKT type)
//   eps_pivot       : threshold of pivot (e.g. 1.0e-2), ratio of contiguous
//                     diagonal entries with absolute value
//   indefinite_flag = 1 : indefinite    -> kernel_detection_all = false
//   indefinite_flag = 0 : semi-definite -> kernel_detection_all = true
DISSECTION_API void DISS_N_FACT(uint64_t &dslv_,
                                const double *coefs,
                                const int &scaling,
                                const double &eps_pivot,
                                const int &indefinite_flag)
{
  const bool kernel_detection_all = (indefinite_flag == 0);
  dissection_solver_ptr *dslv = (dissection_solver_ptr *)dslv_;

  // The BLAS library is forced to one thread for the duration of the
  // factorization.
#ifdef BLAS_MKL
  mkl_set_num_threads(1);
#endif
#ifdef VECLIB
  setenv("VECLIB_MAXIMUM_THREADS", "1", true);
#endif

  // Nothing is done here when dslv->quad_fact is set.
  if (!dslv->quad_fact) {
    switch (dslv->real_or_complex) {
    case DISSECTION_REAL_MATRIX:
      dslv->rptr->NumericFact(dslv->numeric,
                              (double *)coefs,
                              scaling,
                              eps_pivot,
                              kernel_detection_all);
      // When the factorization is not flagged as done, save the matrix.
      if (!dslv->rptr->getFactorized()) {
        dslv->rptr->SaveMMMatrix(dslv->called, coefs);
      }
      break;
    case DISSECTION_COMPLEX_MATRIX:
      dslv->cptr->NumericFact(dslv->numeric,
                              (complex<double> *)coefs,
                              scaling,
                              eps_pivot,
                              kernel_detection_all);
      break;
    }
  }

  // Give MKL back the thread count stored in the solver object.
#ifdef BLAS_MKL
  mkl_set_num_threads(dslv->mkl_num_threads);
#endif

  if (dslv->verbose > 0) {
    fprintf(dslv->fp,
            "%s %d : Dissection::NumericFact done : %d\n",
            __FILE__, __LINE__, dslv->numeric);
  }
  dslv->numeric++;
}
// Numerically factors A (PARDISO phase 22), reusing the sparsity pattern that
// is already stored in ia/ja.  Only the values array `a` is refreshed from A.
// Returns 0 immediately in direct-iterative mode, otherwise the PARDISO error
// code (0 on success).
MKL_INT PardisoSolver::FactorMatrix(const SparseMatrix * A)
{
  if (directIterative)
    return 0;

  mkl_set_num_threads(numThreads);

  if (verbose >= 1)
    printf("Factoring the %d x %d matrix (%d threads)...\n", n, n, numThreads);

  // Symmetric matrices are represented by their upper triangle only;
  // structurally symmetric / unsymmetric matrices store every element.
  const bool isSymmetric = (mtype == REAL_SPD) || (mtype == REAL_SYM_INDEFINITE);
  int upperTriangleOnly = isSymmetric ? 1 : 0;
  int oneIndexed = 1;
  A->GenerateCompressedRowMajorFormat(a, NULL, NULL, upperTriangleOnly, oneIndexed);

  // factor
  phase = 22;
  PARDISO(pt, &maxfct, &mnum, (MKL_INT*)&mtype, &phase, &n,
          a, ia, ja, NULL, &nrhs, iparm, &msglvl, NULL, NULL, &error);

  if (error != 0)
    printf("Error: Pardiso Cholesky decomposition returned non-zero exit code %d.\n", error);

  if (verbose >= 1)
    printf("Factorization completed.\n");

  return error;
}
// Configures the thread count for OpenMP, Eigen and (when MKL_SINGLE is
// defined) MKL.  A zero `n_threads` keeps the OpenMP default.  Returns the
// thread count in effect; without OpenMP the argument is returned unchanged.
int setup_threads(int n_threads)
{
#if defined(_OPENMP)
    if (n_threads != 0) {
        omp_set_num_threads(n_threads);
    }
    n_threads = omp_get_max_threads();
    if (n_threads > 1) {
        cerr << "Using " << n_threads << " threads" << endl;
    }

    Eigen::initParallel();
    Eigen::setNbThreads(n_threads);

#  if defined(MKL_SINGLE)
    // Set the threading layer to match the compiler.
    // This lets MKL automatically go single-threaded in parallel regions.
#    if defined(__INTEL_COMPILER)
    mkl_set_threading_layer(MKL_THREADING_INTEL);
#    elif defined(__GNUC__)
    mkl_set_threading_layer(MKL_THREADING_GNU);
#    endif
    mkl_set_num_threads(n_threads);
#  endif
#endif
    return n_threads;
}
// Solves A x = rhs in one PARDISO call (phase 23: numerical factorization,
// solve and iterative refinement), for solvers constructed with the
// direct-iterative flag.
//
//   A   : matrix whose values refresh `a`; its sparsity pattern is the one
//         already stored in ia/ja
//   x   : output solution vector
//   rhs : right-hand side
//
// Returns 102 if the solver was not constructed in direct-iterative mode,
// otherwise the PARDISO error code (0 on success).
MKL_INT PardisoSolver::SolveLinearSystemDirectIterative(const SparseMatrix * A, double * x, const double * rhs)
{
  if (directIterative != 1)
  {
    printf("Error: direct-iterative flag was not specified in the constructor.\n");
    return 102;
  }

  mkl_set_num_threads(numThreads);

  if (verbose >= 2)
    printf("Solving linear system...(%d threads, direct-iterative)\n", numThreads);

  // The values written into `a` must follow the same layout as ia/ja, which
  // the constructor (and FactorMatrix) pick from the matrix type: upper
  // triangle for symmetric matrices, every entry otherwise.  Hard-coding the
  // upper triangle here would mismatch ia/ja for non-symmetric types.
  int upperTriangleOnly;
  if ((mtype == REAL_SPD) || (mtype == REAL_SYM_INDEFINITE))
    upperTriangleOnly = 1;
  else
    upperTriangleOnly = 0;
  int oneIndexed = 1;
  A->GenerateCompressedRowMajorFormat(a, NULL, NULL, upperTriangleOnly, oneIndexed);

  phase = 23;
  PARDISO(pt, &maxfct, &mnum, (MKL_INT*)&mtype, &phase, &n,
          a, ia, ja, NULL, &nrhs, iparm, &msglvl, (double*)rhs, x, &error);

  if (error != 0)
    printf("Error: Pardiso solve returned non-zero exit code %d.\n", error);

  if (verbose >= 2)
    printf("Solve completed.\n");

  return error;
}
/*
 * Sets one MKL PARDISO control value on the factor matrix F.
 *
 *   icntl <= 64 : stored into iparm[icntl - 1]
 *   icntl == 65 : number of MKL threads
 *   icntl == 66 : maxfct
 *   icntl == 67 : mnum
 *   icntl == 68 : msglvl
 *   icntl == 69 : mtype; the iparm array is re-initialised for the new type
 *   otherwise   : ignored
 *
 * NOTE(review): icntl < 1 is not rejected, so iparm[icntl - 1] would index
 * before the array - confirm callers never pass such a value.
 */
PetscErrorCode MatMkl_PardisoSetCntl_MKL_PARDISO(Mat F,PetscInt icntl,PetscInt ival)
{
  Mat_MKL_PARDISO *mat_mkl_pardiso = (Mat_MKL_PARDISO*)F->spptr;

  PetscFunctionBegin;
  if (icntl <= 64) {
    mat_mkl_pardiso->iparm[icntl - 1] = ival;
    PetscFunctionReturn(0);
  }

  switch (icntl) {
  case 65:
    mkl_set_num_threads((int)ival);
    break;
  case 66:
    mat_mkl_pardiso->maxfct = ival;
    break;
  case 67:
    mat_mkl_pardiso->mnum = ival;
    break;
  case 68:
    mat_mkl_pardiso->msglvl = ival;
    break;
  case 69: {
    int pt[IPARM_SIZE];

    mat_mkl_pardiso->mtype = ival;
    MKL_PARDISO_INIT(&pt, &mat_mkl_pardiso->mtype, mat_mkl_pardiso->iparm);
    /* iparm[27] is set to 1 for single-precision builds, 0 otherwise */
#if defined(PETSC_USE_REAL_SINGLE)
    mat_mkl_pardiso->iparm[27] = 1;
#else
    mat_mkl_pardiso->iparm[27] = 0;
#endif
    mat_mkl_pardiso->iparm[34] = 1;
    break;
  }
  default:
    break;
  }
  PetscFunctionReturn(0);
}
// Performs only the backward-substitution part of the solve (PARDISO phase
// 333) using the existing factorization: reads y, writes x.
//
// Iterative refinement (iparm[7]) is disabled for the duration of the call
// and restored afterwards.
//
// Returns 101 if the solver was constructed in direct-iterative mode,
// otherwise the PARDISO error code (0 on success).
MKL_INT PardisoSolver::BackwardSubstitution(double * x, const double * y)
{
  if (directIterative != 0)
  {
    printf("Error: direct-iterative flag was specified in the constructor (must use SolveLinearSystemDirectIterative routine).\n");
    return 101;
  }

  mkl_set_num_threads(numThreads);

  // The message previously said "forward substitution", which was misleading
  // in logs since this routine runs the backward phase.
  if (verbose >= 2)
    printf("Performing backward substitution...(%d threads)\n", numThreads);

  int maxIterRefinementSteps = iparm[7];
  iparm[7] = 0;

  phase = 333;
  PARDISO(pt, &maxfct, &mnum, (MKL_INT*)&mtype, &phase, &n,
          a, ia, ja, NULL, &nrhs, iparm, &msglvl, (double*)y, x, &error);

  iparm[7] = maxIterRefinementSteps;

  if (error != 0)
    printf("Error: Pardiso solve returned non-zero exit code %d.\n", error);

  if (verbose >= 2)
    printf("Solve completed.\n");

  return error;
}
/**
 * Dense row-major product  C = [alpha *] op(A) * B  [+ C].
 *
 * @param _m, _n, _k     op(A) is _m x _k, B is _k x _n, C is _m x _n
 * @param _A             A, stored _m x _k, or _k x _m when _transposeA is set
 * @param _B             B, stored _k x _n
 * @param _C             result, stored _m x _n
 * @param _transposeA    use A^T instead of A
 * @param _multByScalar  scale the product by _alpha
 * @param _alpha         scaling factor (ignored unless _multByScalar)
 * @param _addPrevious   accumulate into _C instead of overwriting it
 * @param _useIntelSmall not used by this implementation
 *
 * With USE_INTEL_MKL_BLAS the work is delegated to cblas_dgemm; otherwise a
 * plain triple loop is used.
 */
void computeDenseMatrixMatrixMultiplication(int _m, int _n, int _k, const double *_A, const double *_B, double *_C, const bool _transposeA, const bool _multByScalar, const double _alpha, const bool _addPrevious, const bool _useIntelSmall){
#ifdef USE_INTEL_MKL_BLAS
	// Leading dimension of A follows its storage layout.
	const CBLAS_TRANSPOSE opA = _transposeA ? CblasTrans : CblasNoTrans;
	const int lda = _transposeA ? _m : _k;
	const double scale = _multByScalar ? _alpha : 1.0;
	const double keepPrevious = _addPrevious ? 1.0 : 0.0;

	mkl_set_num_threads(STACCATO::AuxiliaryParameters::denseVectorMatrixThreads);
	cblas_dgemm(CblasRowMajor, opA, CblasNoTrans, _m, _n, _k, scale, _A, lda, _B, _n, keepPrevious, _C, _n);
#else
	assert(_A != NULL);
	assert(_B != NULL);
	for (int row = 0; row < _m; ++row) {
		for (int col = 0; col < _n; ++col) {
			double acc = 0.0;
			for (int inner = 0; inner < _k; ++inner) {
				const double aEntry = _transposeA ? _A[inner * _m + row]
				                                  : _A[row * _k + inner];
				acc += aEntry * _B[inner * _n + col];
			}
			if (_multByScalar) {
				acc = _alpha * acc;
			}
			double &target = _C[row * _n + col];
			if (_addPrevious) {
				target += acc;
			} else {
				target = acc;
			}
		}
	}
#endif
}
// Puts back the thread counts held in the members mklNumThreads,
// ompNumThreads and openblasNumThreads, for whichever of the three libraries
// are compiled in.
DisableThreadingInBlock::~DisableThreadingInBlock()
{
#ifdef HAVE_MKL_H
  mkl_set_num_threads(mklNumThreads);
#endif

#if defined(_OPENMP)
  omp_set_num_threads(ompNumThreads);
#endif

#if defined(OPENBLAS_DISABLE_THREADS)
  openblas_set_num_threads(openblasNumThreads);
#endif
}
// Sets the thread count of the CPU BLAS/LAPACK backend: MKL when
// MAGMA_WITH_MKL is defined, otherwise OpenMP when available.
// Values below 1 are ignored.
void magma_set_lapack_numthreads(magma_int_t threads)
{
    if ( threads >= 1 ) {
#if defined(MAGMA_WITH_MKL)
        mkl_set_num_threads( threads );
#elif defined(_OPENMP)
        omp_set_num_threads( threads );
#endif
    }
}
int main (int argc, char *argv[]) { #ifdef _DIST_ CnC::dist_cnc_init< cholesky_context > dc_init; #endif int n; int b; dist_type dt = BLOCKED_ROWS; const char *fname = NULL; const char *oname = NULL; const char *mname = NULL; int argi; // Command line: cholesky n b filename [out-file] if (argc < 3 || argc > 7) { fprintf(stderr, "Incorrect number of arguments, epxected N BS [-i infile] [-o outfile] [-w mfile] [-dt disttype]\n"); return -1; } argi = 1; n = atol(argv[argi++]); b = atol(argv[argi++]); while( argi < argc ) { if( ! strcmp( argv[argi], "-o" ) ) oname = argv[++argi]; else if( ! strcmp( argv[argi], "-i" ) ) fname = argv[++argi]; else if( ! strcmp( argv[argi], "-w" ) ) mname = argv[++argi]; else if( ! strcmp( argv[argi], "-dt" ) ) dt = static_cast< dist_type >( atoi( argv[++argi] ) ); ++argi; } #ifdef USE_MKL if( mname == NULL ) { mkl_set_num_threads( 1 ); omp_set_num_threads( 1 ); } #endif if(n % b != 0) { fprintf(stderr, "The tile size is not compatible with the given matrix\n"); exit(0); } double * A = new double[n*n]; matrix_init( A, n, fname ); if( mname ) matrix_write( A, n, mname ); else cholesky(A, n, b, oname, dt); delete [] A; return 0; }
void ReducedMassSpringSystemForceModel::GetTangentStiffnessMatrixHelper( double * tangentStiffnessMatrix) { // evaluate stiffness matrix //PerformanceCounter counter; massSpringSystem->ComputeStiffnessMatrix(u, sparseMatrix); //counter.StopCounter(); //printf("counter: %G\n", counter.GetElapsedTime()); // project matrix #if USE_MKL_SPARSE_BLAS mkl_set_num_threads(8); //PerformanceCounter counter; int upperTriangleOnly=1; int oneIndexed=1; sparseMatrix->GenerateCompressedRowMajorFormat_four_array(csr_values, csr_columns, csr_pointerB, csr_pointerE, upperTriangleOnly, oneIndexed); char transa = 'N'; int m = sparseMatrix->GetNumRows(); int n = r; int k = m; double alpha = 1.0; char matdescra[7] = "SUNFXX"; double * val = csr_values; int * indx = csr_columns; int * pntrb = csr_pointerB; int * pntre = csr_pointerE; double * b = U; int ldb = m; double beta = 0.0; double * c = bufferMatrix; int ldc = m; mkl_dcsrmm(&transa, &m, &n, &k, &alpha, matdescra, val, indx, pntrb, pntre, b, &ldb, &beta, c, &ldc); //counter.StopCounter(); //printf("counter: %G\n", counter.GetElapsedTime()); #else for(int i=0; i<r; i++) sparseMatrix->MultiplyVector(&U[ELT(3*n,0,i)], &bufferMatrix[ELT(3*n,0,i)]); #endif modalMatrix->ProjectMatrix(r, bufferMatrix, tangentStiffnessMatrix); int r2 = r*r; for(int i=0; i<r2; i++) tangentStiffnessMatrix[i] *= -1; }
// For each of MKL, OpenMP and OpenBLAS that is compiled in, records a thread
// count in the matching member and then switches that library to one thread.
// The members default to 1 when a library is not compiled in.
DisableThreadingInBlock::DisableThreadingInBlock()
  : mklNumThreads(1)
  , ompNumThreads(1)
  , openblasNumThreads(1)
{
#ifdef HAVE_MKL_H
  mklNumThreads = mkl_get_max_threads();
  mkl_set_num_threads(1);
#endif

#if defined(_OPENMP)
  ompNumThreads = omp_get_max_threads();
  omp_set_num_threads(1);
#endif

#if defined(OPENBLAS_DISABLE_THREADS)
  openblasNumThreads = goto_get_num_procs();
  openblas_set_num_threads(1);
#endif

  // Silence compiler warnings about unused private members
  (void) mklNumThreads;
  (void) ompNumThreads;
  (void) openblasNumThreads;
}
// Solves for `numRHS` right-hand sides in one call (PARDISO phase 33) using
// the existing factorization.  `rhs` holds the right-hand sides and `x`
// receives the solutions.
//
// Returns 101 if the solver was constructed in direct-iterative mode,
// otherwise the PARDISO error code (0 on success).
MKL_INT PardisoSolver::SolveLinearSystemMultipleRHS(double * x, const double * rhs, int numRHS)
{
  const bool isDirectIterative = (directIterative != 0);
  if (isDirectIterative)
  {
    printf("Error: direct-iterative flag was specified in the constructor (must use SolveLinearSystemDirectIterative routine).\n");
    return 101;
  }

  const bool chatty = (verbose >= 2);
  if (chatty)
    printf("Solving linear system...(%d threads)\n", numThreads);

  mkl_set_num_threads(numThreads);

  phase = 33;
  PARDISO(pt, &maxfct, &mnum, (MKL_INT*)&mtype, &phase, &n,
          a, ia, ja, NULL, &numRHS, iparm, &msglvl, (double*)rhs, x, &error);

  if (error != 0)
    printf("Error: Pardiso solve returned non-zero exit code %d.\n", error);

  if (chatty)
    printf("Solve completed.\n");

  return error;
}
extern "C" magma_int_t magma_sbulge_back(magma_int_t threads, char uplo, magma_int_t n, magma_int_t nb, magma_int_t ne, magma_int_t Vblksiz, float *Z, magma_int_t ldz, float *dZ, magma_int_t lddz, float *V, magma_int_t ldv, float *TAU, float *T, magma_int_t ldt, magma_int_t* info) { magma_int_t mklth = threads; float timeaplQ2=0.0; #if defined(USEMKL) mkl_set_num_threads(1); #endif #if defined(USEACML) omp_set_num_threads(1); #endif float f= 1.; magma_int_t n_gpu = ne; if(threads>40){ f = 0.5; n_gpu = (magma_int_t)(f*ne)/64*64; } else if(threads>10){ #if (defined(PRECISION_s) || defined(PRECISION_d)) f = 0.68; #else f = 0.72; #endif n_gpu = (magma_int_t)(f*ne)/64*64; } else if(threads>5){ #if (defined(PRECISION_s) || defined(PRECISION_d)) f = 0.82; #else f = 0.86; #endif n_gpu = (magma_int_t)(f*ne)/64*64; } else if(threads>1){ #if (defined(PRECISION_s) || defined(PRECISION_d)) f = 0.96; #else f = 0.96; #endif n_gpu = (magma_int_t)(f*ne)/64*64; } /**************************************************** * apply V2 from left to the eigenvectors Z. 
dZ = (I-V2*T2*V2')*Z * **************************************************/ timeaplQ2 = magma_wtime(); /*============================ * use GPU+CPU's *==========================*/ if(n_gpu < ne) { // define the size of Q to be done on CPU's and the size on GPU's // note that GPU use Q(1:N_GPU) and CPU use Q(N_GPU+1:N) printf("---> calling GPU + CPU(if N_CPU>0) to apply V2 to Z with NE %d N_GPU %d N_CPU %d\n",ne, n_gpu, ne-n_gpu); magma_sapplyQ_data data_applyQ(threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt, dZ, lddz); magma_sapplyQ_id_data* arg = new magma_sapplyQ_id_data[threads]; pthread_t* thread_id = new pthread_t[threads]; pthread_attr_t thread_attr; // =============================== // relaunch thread to apply Q // =============================== // Set one thread per core pthread_attr_init(&thread_attr); pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM); pthread_setconcurrency(threads); // Launch threads for (magma_int_t thread = 1; thread < threads; thread++) { arg[thread] = magma_sapplyQ_id_data(thread, &data_applyQ); pthread_create(&thread_id[thread], &thread_attr, magma_sapplyQ_parallel_section, &arg[thread]); } arg[0] = magma_sapplyQ_id_data(0, &data_applyQ); magma_sapplyQ_parallel_section(&arg[0]); // Wait for completion for (magma_int_t thread = 1; thread < threads; thread++) { void *exitcodep; pthread_join(thread_id[thread], &exitcodep); } delete[] thread_id; delete[] arg; magma_ssetmatrix(n, ne-n_gpu, Z + n_gpu*ldz, ldz, dZ + n_gpu*ldz, lddz); /*============================ * use only GPU *==========================*/ }else{ magma_ssetmatrix(n, ne, Z, ldz, dZ, lddz); magma_sbulge_applyQ_v2('L', ne, n, nb, Vblksiz, dZ, lddz, V, ldv, T, ldt, info); magma_device_sync(); } timeaplQ2 = magma_wtime()-timeaplQ2; #if defined(USEMKL) mkl_set_num_threads(mklth); #endif #if defined(USEACML) omp_set_num_threads(mklth); #endif return MAGMA_SUCCESS; }
/*
 * Reads the -mat_mkl_pardiso_<k> options for the factor matrix F (using the
 * communicator and options prefix of A) and applies them:
 *
 *   65      number of MKL threads
 *   66..68  maxfct, mnum, msglvl
 *   69      matrix type; iparm is re-initialised for the new type
 *   1       when given with a non-zero value, the individual iparm options
 *           (2, 4, 5, 6, 8, 10, 11, 12, 13, 18, 19, 21, 24, 25, 27, 31, 34,
 *           60) are read as well
 *
 * A value is only stored when its option was actually set.
 */
PetscErrorCode PetscSetMKL_PARDISOFromOptions(Mat F, Mat A)
{
  Mat_MKL_PARDISO *mat_mkl_pardiso = (Mat_MKL_PARDISO*)F->spptr;
  PetscErrorCode  ierr;
  /* `threads` doubles as the default shown for option 65, so it must be
     initialised; it is a PetscInt because PetscOptionsInt writes a PetscInt. */
  PetscInt        icntl, threads = 1;
  PetscBool       flg;
  int             pt[IPARM_SIZE];

  PetscFunctionBegin;
  ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MKL_PARDISO Options","Mat");CHKERRQ(ierr);

  ierr = PetscOptionsInt("-mat_mkl_pardiso_65", "Number of thrads to use", "None", threads, &threads, &flg);CHKERRQ(ierr);
  if (flg) mkl_set_num_threads((int)threads);

  ierr = PetscOptionsInt("-mat_mkl_pardiso_66", "Maximum number of factors with identical sparsity structure that must be kept in memory at the same time", "None", mat_mkl_pardiso->maxfct, &icntl, &flg);CHKERRQ(ierr);
  if (flg) mat_mkl_pardiso->maxfct = icntl;

  ierr = PetscOptionsInt("-mat_mkl_pardiso_67", "Indicates the actual matrix for the solution phase", "None", mat_mkl_pardiso->mnum, &icntl, &flg);CHKERRQ(ierr);
  if (flg) mat_mkl_pardiso->mnum = icntl;

  ierr = PetscOptionsInt("-mat_mkl_pardiso_68", "Message level information", "None", mat_mkl_pardiso->msglvl, &icntl, &flg);CHKERRQ(ierr);
  if (flg) mat_mkl_pardiso->msglvl = icntl;

  ierr = PetscOptionsInt("-mat_mkl_pardiso_69", "Defines the matrix type", "None", mat_mkl_pardiso->mtype, &icntl, &flg);CHKERRQ(ierr);
  if (flg) {
    mat_mkl_pardiso->mtype = icntl;
    MKL_PARDISO_INIT(&pt, &mat_mkl_pardiso->mtype, mat_mkl_pardiso->iparm);
    /* iparm[27] is set to 1 for single-precision builds, 0 otherwise */
#if defined(PETSC_USE_REAL_SINGLE)
    mat_mkl_pardiso->iparm[27] = 1;
#else
    mat_mkl_pardiso->iparm[27] = 0;
#endif
    mat_mkl_pardiso->iparm[34] = 1;
  }

  ierr = PetscOptionsInt("-mat_mkl_pardiso_1", "Use default values", "None", mat_mkl_pardiso->iparm[0], &icntl, &flg);CHKERRQ(ierr);

  /* The individual iparm entries are only read when option 1 was given with
     a non-zero value. */
  if (flg && icntl != 0) {
    ierr = PetscOptionsInt("-mat_mkl_pardiso_2", "Fill-in reducing ordering for the input matrix", "None", mat_mkl_pardiso->iparm[1], &icntl, &flg);CHKERRQ(ierr);
    if (flg) mat_mkl_pardiso->iparm[1] = icntl;

    ierr = PetscOptionsInt("-mat_mkl_pardiso_4", "Preconditioned CGS/CG", "None", mat_mkl_pardiso->iparm[3], &icntl, &flg);CHKERRQ(ierr);
    if (flg) mat_mkl_pardiso->iparm[3] = icntl;

    ierr = PetscOptionsInt("-mat_mkl_pardiso_5", "User permutation", "None", mat_mkl_pardiso->iparm[4], &icntl, &flg);CHKERRQ(ierr);
    if (flg) mat_mkl_pardiso->iparm[4] = icntl;

    ierr = PetscOptionsInt("-mat_mkl_pardiso_6", "Write solution on x", "None", mat_mkl_pardiso->iparm[5], &icntl, &flg);CHKERRQ(ierr);
    if (flg) mat_mkl_pardiso->iparm[5] = icntl;

    ierr = PetscOptionsInt("-mat_mkl_pardiso_8", "Iterative refinement step", "None", mat_mkl_pardiso->iparm[7], &icntl, &flg);CHKERRQ(ierr);
    if (flg) mat_mkl_pardiso->iparm[7] = icntl;

    ierr = PetscOptionsInt("-mat_mkl_pardiso_10", "Pivoting perturbation", "None", mat_mkl_pardiso->iparm[9], &icntl, &flg);CHKERRQ(ierr);
    if (flg) mat_mkl_pardiso->iparm[9] = icntl;

    ierr = PetscOptionsInt("-mat_mkl_pardiso_11", "Scaling vectors", "None", mat_mkl_pardiso->iparm[10], &icntl, &flg);CHKERRQ(ierr);
    if (flg) mat_mkl_pardiso->iparm[10] = icntl;

    ierr = PetscOptionsInt("-mat_mkl_pardiso_12", "Solve with transposed or conjugate transposed matrix A", "None", mat_mkl_pardiso->iparm[11], &icntl, &flg);CHKERRQ(ierr);
    if (flg) mat_mkl_pardiso->iparm[11] = icntl;

    ierr = PetscOptionsInt("-mat_mkl_pardiso_13", "Improved accuracy using (non-) symmetric weighted matching", "None", mat_mkl_pardiso->iparm[12], &icntl, &flg);CHKERRQ(ierr);
    if (flg) mat_mkl_pardiso->iparm[12] = icntl;

    ierr = PetscOptionsInt("-mat_mkl_pardiso_18", "Numbers of non-zero elements", "None", mat_mkl_pardiso->iparm[17], &icntl, &flg);CHKERRQ(ierr);
    if (flg) mat_mkl_pardiso->iparm[17] = icntl;

    ierr = PetscOptionsInt("-mat_mkl_pardiso_19", "Report number of floating point operations", "None", mat_mkl_pardiso->iparm[18], &icntl, &flg);CHKERRQ(ierr);
    if (flg) mat_mkl_pardiso->iparm[18] = icntl;

    ierr = PetscOptionsInt("-mat_mkl_pardiso_21", "Pivoting for symmetric indefinite matrices", "None", mat_mkl_pardiso->iparm[20], &icntl, &flg);CHKERRQ(ierr);
    if (flg) mat_mkl_pardiso->iparm[20] = icntl;

    ierr = PetscOptionsInt("-mat_mkl_pardiso_24", "Parallel factorization control", "None", mat_mkl_pardiso->iparm[23], &icntl, &flg);CHKERRQ(ierr);
    if (flg) mat_mkl_pardiso->iparm[23] = icntl;

    ierr = PetscOptionsInt("-mat_mkl_pardiso_25", "Parallel forward/backward solve control", "None", mat_mkl_pardiso->iparm[24], &icntl, &flg);CHKERRQ(ierr);
    if (flg) mat_mkl_pardiso->iparm[24] = icntl;

    ierr = PetscOptionsInt("-mat_mkl_pardiso_27", "Matrix checker", "None", mat_mkl_pardiso->iparm[26], &icntl, &flg);CHKERRQ(ierr);
    if (flg) mat_mkl_pardiso->iparm[26] = icntl;

    ierr = PetscOptionsInt("-mat_mkl_pardiso_31", "Partial solve and computing selected components of the solution vectors", "None", mat_mkl_pardiso->iparm[30], &icntl, &flg);CHKERRQ(ierr);
    if (flg) mat_mkl_pardiso->iparm[30] = icntl;

    ierr = PetscOptionsInt("-mat_mkl_pardiso_34", "Optimal number of threads for conditional numerical reproducibility (CNR) mode", "None", mat_mkl_pardiso->iparm[33], &icntl, &flg);CHKERRQ(ierr);
    if (flg) mat_mkl_pardiso->iparm[33] = icntl;

    ierr = PetscOptionsInt("-mat_mkl_pardiso_60", "Intel MKL_PARDISO mode", "None", mat_mkl_pardiso->iparm[59], &icntl, &flg);CHKERRQ(ierr);
    if (flg) mat_mkl_pardiso->iparm[59] = icntl;
  }
  PetscOptionsEnd();
  PetscFunctionReturn(0);
}
// Converts A into one-indexed compressed-row storage (a, ia, ja), fills in
// the PARDISO control parameters and runs the reordering / symbolic
// factorization (phase 11).
//
//   numThreads_      : thread count handed to mkl_set_num_threads
//   mtype_           : matrix type; symmetric types store the upper triangle only
//   rtype_           : fill-in reducing reordering, copied into iparm[1]
//   directIterative_ : non-zero selects the CG/CGS direct-iterative mode
//   verbose_         : 0 = silent, >= 1 progress messages, >= 2 extra statistics
//
// Throws the PARDISO error code if the symbolic factorization fails; the
// buffers allocated here are released first, since the destructor does not
// run for a constructor that throws.
PardisoSolver::PardisoSolver(const SparseMatrix * A, int numThreads_, matrixType mtype_, reorderingType rtype_, int directIterative_, int verbose_): numThreads(numThreads_), mtype(mtype_), rtype(rtype_), directIterative(directIterative_), verbose(verbose_)
{
  mkl_set_num_threads(numThreads);

  n = A->Getn();

  if (verbose >= 1)
    printf("Converting matrix to Pardiso format...\n");

  int numEntries;
  int upperTriangleOnly;
  if ((mtype == REAL_SPD) || (mtype == REAL_SYM_INDEFINITE))
  {
    // matrix is symmetric
    numEntries = A->GetNumUpperTriangleEntries();
    upperTriangleOnly = 1;
  }
  else
  {
    // structural symmetric or unsymmetric
    numEntries = A->GetNumEntries();
    upperTriangleOnly = 0;
  }

  a = (double*) malloc (sizeof(double) * numEntries);
  ia = (int*) malloc (sizeof(int) * (A->GetNumRows() + 1));
  ja = (int*) malloc (sizeof(int) * numEntries);

  int oneIndexed = 1;
  A->GenerateCompressedRowMajorFormat(a, ia, ja, upperTriangleOnly, oneIndexed);

  if (verbose >= 2)
    printf("numEntries: %d\n", numEntries);

  // permute & do symbolic factorization

  nrhs = 1; // Number of right hand sides.
  maxfct = 1; // Maximum number of numerical factorizations.
  mnum = 1; // Which factorization to use.
  msglvl = verbose >= 1 ? verbose - 1 : 0; // Print statistical information to the output file
  error = 0; // Initialize error flag

  for (int i = 0; i < 64; i++)
    iparm[i] = 0;

  iparm[0] = 1; // Do not use the solver default values (use custom values, provided below)
  iparm[1] = rtype; // matrix re-ordering algorithm
  iparm[2] = 0; // unused

  // use iterative-direct algorithm if requested
  if (directIterative)
  {
    if (mtype == REAL_SPD)
      iparm[3] = 62; // matrix is symmetric positive-definite; use CGS iteration for symmetric positive-definite matrices
    else
      iparm[3] = 61; // use CGS iteration
  }
  else
    iparm[3] = 0;

  iparm[4] = 0; // No user fill-in reducing permutation
  iparm[5] = 0; // Write solution into x
  iparm[6] = 0; // Output: number of iterative refinement steps performed
  iparm[7] = 0; // Max numbers of iterative refinement steps (used during the solving stage). Value of 0 (default) means: The solver automatically performs two steps of iterative refinement when perturbed pivots are obtained during the numerical factorization.
  iparm[8] = 0; // Reserved. Must set to 0.

  // Pivoting perturbation; the values below are Pardiso's default values
  // Pivoting only applies to REAL_UNSYM and REAL_SYM_INDEFINITE
  if (mtype == REAL_UNSYM)
    iparm[9] = 13; // For non-symmetric matrices, perturb the pivot elements with 1E-13
  else
    iparm[9] = 8; // Use 1.0E-8 for symmetric indefinite matrices

  // Scaling and matching. The following below are the Pardiso defaults.
  if (mtype == REAL_UNSYM) // unsymmetric matrices
  {
    iparm[10] = 1; // enable scaling
    iparm[12] = 1; // enable matching
  }
  else
  {
    iparm[10] = 0; // disable scaling
    iparm[12] = 0; // disable matching
  }

  iparm[11] = 0; // Solve with transposed or conjugate transposed matrix A. Not in use here.

  iparm[13] = 0; // Output: Number of perturbed pivots
  iparm[14] = 0; // Output: Peak memory on symbolic factorization (in KB)
  iparm[15] = 0; // Output: Permanent memory on symbolic factorization (in KB)
  iparm[16] = 0; // Output: Size of factors/Peak memory on numerical factorization and solution (in KB)

  iparm[17] = -1; // Output: Report the number of non-zero elements in the factors.
  iparm[18] = 0; // Report number of floating point operations (in 10^6 floating point operations) that are necessary to factor the matrix A. Disabled.
  iparm[19] = 0; // Output: Report CG/CGS diagnostics.
  iparm[20] = 1; // Pivoting for symmetric indefinite matrices: Apply 1x1 and 2x2 Bunch-Kaufman pivoting during the factorization process.
  iparm[21] = 0; // Output: Inertia: number of positive eigenvalues.
  iparm[22] = 0; // Output: Inertia: number of negative eigenvalues.

  iparm[23] = 0; // Parallel factorization control. Use default.
  iparm[24] = 0; // Parallel forward/backward solve control. Intel MKL PARDISO uses a parallel algorithm for the solve step.

  // the other iparms (above 24) are left at 0

  /* -------------------------------------------------------------------- *\
     .. Initialize the internal solver memory pointer. This is only
     necessary for the FIRST call of the PARDISO solver.
  \* -------------------------------------------------------------------- */
  for (int i=0; i<64; i++)
    pt[i] = 0;

  if (verbose >= 1)
    printf("Reordering and symbolically factorizing the matrix...\n");

  /* -------------------------------------------------------------------- *\
     .. Reordering and Symbolic Factorization. This step also allocates
     all memory that is necessary for the factorization.
  \* -------------------------------------------------------------------- */
  phase = 11;
  PARDISO (pt, &maxfct, &mnum, (MKL_INT*)&mtype, &phase,
           &n, a, ia, ja, NULL, &nrhs, iparm, &msglvl, NULL, NULL, &error);

  if (error != 0)
  {
    printf("Error: Pardiso matrix re-ordering/symbolic factorization returned non-zero exit code %d.\n", error);
    // Release the CSR buffers before throwing so they do not leak.
    free(a);
    free(ia);
    free(ja);
    a = NULL;
    ia = NULL;
    ja = NULL;
    throw error;
  }

  if (verbose >= 2)
  {
    printf("\nReordering and symbolic factorization completed...\n");
    printf("Number of nonzeros in factors = %d\n", iparm[17]);
    printf("Number of factorization MFLOPS = %d\n", iparm[18]);
  }
}
// X: a MxD matrix, Y: a M vector, W: a M vector // W0: a M vector int main(int argc, char ** argv){ if (argc>1 && argv[1][0]=='h') { printf ("Usage: parSymSGD M D T C lamda r\n"); printf (" M: number of data points, D: dimensions, T: time iterations, C: cores;\n"); printf (" lamda: learning rate, r: panel size in unit of C.\n"); return 1; }u // read in the arguments: M, D, I (time iterations), C (cores), r (each panel contains r*C points) int M = argc>1?atoi(argv[1]):32; int D = argc>2?atoi(argv[2]):4; T = argc>3?atoi(argv[3]):10; int C = argc>4?atoi(argv[4]):4; float lamda = argc>5?atof(argv[5]):0.01; int r = argc>6?atoi(argv[6]):1; ///printf("M=%d, D=%d, T=%d, C=%d, lamda=%8.6f, r=%d\n",M,D,T,C,lamda,r); int max_threads = mkl_get_max_threads(); // get the max number of threads int rep; mkl_set_num_threads(1); // set the number of threads to use by mkl panelSz = C*r; panels = M/panelSz; int i,j,k,p,t; float *Y, *Wreal, *W, *X; Y = (float *) mkl_malloc(M*sizeof(float),PAGESIZE); Wreal = (float *) mkl_malloc(D*sizeof(float),PAGESIZE); W = (float *) mkl_malloc(D*sizeof(float),PAGESIZE); X = (float *) mkl_malloc(M*D*sizeof(float),PAGESIZE); float *Ypred = (float*)mkl_malloc(M*sizeof(float),PAGESIZE); float *Ytmp = (float*)mkl_malloc(M*sizeof(float),PAGESIZE); float *I = (float*)mkl_malloc(D*D*sizeof(float),PAGESIZE); float *Z = (float*)mkl_malloc(M*D*sizeof(float),PAGESIZE); float *B = (float*)mkl_malloc(panels*D*sizeof(float),PAGESIZE); if (Y==NULL | Wreal==NULL | W==NULL | X==NULL | Ypred==NULL || Ytmp==NULL || Z==NULL || B==NULL || I== NULL){ printf("Memory allocation error.\n"); return 2; } initData(Wreal,W,X,Y, M, D,I); ///printf("panelSz=%d, panels=%d\n", panelSz, panels); for (nt=1; nt<=max_threads && nt<=panelSz; nt*=2){ omp_set_num_threads(nt);// set the number of openMP threads for (rep=0; rep<REPEATS; rep++){//repeat measurements double prepTime, gdTime, sInit; // preprocessing sInit=dsecnd(); //preprocessSeq(X, Y, Z, B, panelSz, panels, M, D, lamda); 
preprocessPar(X, Y, Z, B, panelSz, panels, M, D, lamda); prepTime = (dsecnd() - sInit); ///dump2("Z",Z,M,D); ///dump2("B",B,panels,D); // GD initW(W,D); ///dump1("W (initial)", W, D); sInit=dsecnd(); float err; float fixpoint = 0.0; for (t=0;t<T;t++){ for (p=0;p<panels;p++){ gd(&(X[p*panelSz*D]),&(Z[p*panelSz*D]), &(B[p*D]), panelSz, D, lamda, W, I); ///printf("(t=%d, p=%d) ",t,p); ///dump1("W", W, D); ///err=calErr(X, Ypred, Ytmp, Y, W, M, D); printf("finish one panels ............................ \n"); } } gdTime = (dsecnd() - sInit); err=calErr(X, Ypred, Ytmp, Y, W, M, D); fixpoint = err - prev_err; // print final err. time is in milliseconds printf("nt=%d\t ttlTime=%.5f\t prepTime=%.5f\t gdTime=%.5f\t error=%.5f\n", nt, (gdTime+prepTime)*1000, prepTime*1000, gdTime*1000, err); } } if (B) mkl_free(B); if (Z) mkl_free(Z); if (Ytmp) mkl_free(Ytmp); if (Ypred) mkl_free(Ypred); if (Y) mkl_free(Y); if (Wreal) mkl_free(Wreal); if (W) mkl_free(W); if (X) mkl_free(X); if (I) mkl_free(I); return 0; }
// Sets the number of threads MKL may use for matrix products.
// Has no effect when the build does not define WITH_MKL.
void matmul_set_num_threads(size_t count)
{
#if defined(WITH_MKL)
    mkl_set_num_threads(static_cast<int>(count));
#else
    (void) count;
#endif
    // omp_set_num_threads(count);
}
int main(int argc, char const *argv[]) { Eigen::setNbThreads(NumCores); #ifdef MKL mkl_set_num_threads(NumCores); #endif INFO("Eigen3 uses " << Eigen::nbThreads() << " threads."); int L; RealType J12ratio; int OBC; int N; RealType Uin, phi; std::vector<RealType> Vin; LoadParameters( "conf.h5", L, J12ratio, OBC, N, Uin, Vin, phi); HDF5IO file("BSSH.h5"); // const int L = 5; // const bool OBC = true; // const RealType J12ratio = 0.010e0; INFO("Build Lattice - "); std::vector<ComplexType> J; if ( OBC ){ J = std::vector<ComplexType>(L - 1, ComplexType(1.0, 0.0)); for (size_t cnt = 0; cnt < L-1; cnt+=2) { J.at(cnt) *= J12ratio; } } else{ J = std::vector<ComplexType>(L, ComplexType(1.0, 0.0)); for (size_t cnt = 0; cnt < L; cnt+=2) { J.at(cnt) *= J12ratio; } if ( std::abs(phi) > 1.0e-10 ){ J.at(L-1) *= exp( ComplexType(0.0e0, 1.0e0) * phi ); // INFO(exp( ComplexType(0.0e0, 1.0e0) * phi )); } } for ( auto &val : J ){ INFO_NONEWLINE(val << " "); } INFO(""); const std::vector< Node<ComplexType>* > lattice = NN_1D_Chain(L, J, OBC); file.saveNumber("1DChain", "L", L); file.saveNumber("1DChain", "U", Uin); file.saveStdVector("1DChain", "J", J); for ( auto < : lattice ){ if ( !(lt->VerifySite()) ) RUNTIME_ERROR("Wrong lattice setup!"); } INFO("DONE!"); INFO("Build Basis - "); // int N1 = (L+1)/2; Basis B1(L, N); B1.Boson(); // std::vector< std::vector<int> > st = B1.getBStates(); // std::vector< RealType > tg = B1.getBTags(); // for (size_t cnt = 0; cnt < tg.size(); cnt++) { // INFO_NONEWLINE( std::setw(3) << cnt << " - "); // for (auto &j : st.at(cnt)){ // INFO_NONEWLINE(j << " "); // } // INFO("- " << tg.at(cnt)); // } file.saveNumber("1DChain", "N", N); // file.saveStdVector("Basis", "States", st); // file.saveStdVector("Basis", "Tags", tg); INFO("DONE!"); INFO_NONEWLINE("Build Hamiltonian - "); std::vector<Basis> Bases; Bases.push_back(B1); Hamiltonian<ComplexType> ham( Bases ); std::vector< std::vector<ComplexType> > Vloc; std::vector<ComplexType> Vtmp;//(L, 1.0); for ( 
RealType &val : Vin ){ Vtmp.push_back((ComplexType)val); } Vloc.push_back(Vtmp); std::vector< std::vector<ComplexType> > Uloc; // std::vector<ComplexType> Utmp(L, ComplexType(10.0e0, 0.0e0) ); std::vector<ComplexType> Utmp(L, (ComplexType)Uin); Uloc.push_back(Utmp); ham.BuildLocalHamiltonian(Vloc, Uloc, Bases); ham.BuildHoppingHamiltonian(Bases, lattice); ham.BuildTotalHamiltonian(); INFO("DONE!"); INFO_NONEWLINE("Diagonalize Hamiltonian - "); std::vector<RealType> Val; Hamiltonian<ComplexType>::VectorType Vec; ham.eigh(Val, Vec); INFO("GS energy = " << Val.at(0)); file.saveVector("GS", "EVec", Vec); file.saveStdVector("GS", "EVal", Val); INFO("DONE!"); std::vector<ComplexType> Nbi = Ni( Bases, Vec ); for (auto &n : Nbi ){ INFO( n << " " ); } ComplexMatrixType Nij = NiNj( Bases, Vec ); INFO(Nij); INFO(Nij.diagonal()); file.saveStdVector("Obs", "Nb", Nbi); file.saveMatrix("Obs", "Nij", Nij); return 0; }
int exampleDenseGemmByBlocks(const ordinal_type mmin, const ordinal_type mmax, const ordinal_type minc, const ordinal_type k, const ordinal_type mb, const int max_concurrency, const int max_task_dependence, const int team_size, const int mkl_nthreads, const bool check, const bool verbose) { typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; const bool detail = false; std::cout << "DeviceSpace:: "; DeviceSpaceType::print_configuration(std::cout, detail); std::cout << "HostSpace:: "; HostSpaceType::print_configuration(std::cout, detail); typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType; typedef DenseMatrixView<DenseMatrixBaseHostType> DenseMatrixViewHostType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType; typedef DenseMatrixView<DenseMatrixBaseDeviceType> DenseMatrixViewDeviceType; typedef TaskView<DenseMatrixViewDeviceType> DenseTaskViewDeviceType; typedef DenseMatrixBase<DenseTaskViewDeviceType,ordinal_type,size_type,DeviceSpaceType> DenseHierMatrixBaseDeviceType; typedef DenseMatrixView<DenseHierMatrixBaseDeviceType> DenseHierMatrixViewDeviceType; typedef TaskView<DenseHierMatrixViewDeviceType> DenseHierTaskViewDeviceType; int r_val = 0; Kokkos::Impl::Timer timer; double t = 0.0; std::cout << "DenseGemmByBlocks:: test matrices " <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << " , k = "<< k << " , mb = " << mb << std::endl; const size_t max_task_size = (3*sizeof(DenseTaskViewDeviceType)+sizeof(PolicyType)+128); PolicyType policy(max_concurrency, max_task_size, max_task_dependence, team_size); std::ostringstream os; os.precision(3); os << std::scientific; for (ordinal_type m=mmin;m<=mmax;m+=minc) { os.str(""); // host matrices DenseMatrixBaseHostType AA_host, BB_host, CC_host("CC_host", m, m), CB_host("CB_host", 
m, m); { if (ArgTransA == Trans::NoTranspose) AA_host = DenseMatrixBaseHostType("AA_host", m, k); else AA_host = DenseMatrixBaseHostType("AA_host", k, m); if (ArgTransB == Trans::NoTranspose) BB_host = DenseMatrixBaseHostType("BB_host", k, m); else BB_host = DenseMatrixBaseHostType("BB_host", m, k); for (ordinal_type j=0;j<AA_host.NumCols();++j) for (ordinal_type i=0;i<AA_host.NumRows();++i) AA_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0; for (ordinal_type j=0;j<BB_host.NumCols();++j) for (ordinal_type i=0;i<BB_host.NumRows();++i) BB_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0; for (ordinal_type j=0;j<CC_host.NumCols();++j) for (ordinal_type i=0;i<CC_host.NumRows();++i) CC_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0; DenseMatrixTools::copy(CB_host, CC_host); } const double flop = DenseFlopCount<value_type>::Gemm(m, m, k); #ifdef HAVE_SHYLUTACHO_MKL mkl_set_num_threads(mkl_nthreads); #endif os << "DenseGemmByBlocks:: m = " << m << " n = " << m << " k = " << k << " "; if (check) { timer.reset(); DenseMatrixViewHostType A_host(AA_host), B_host(BB_host), C_host(CB_host); Gemm<ArgTransA,ArgTransB,AlgoGemm::ExternalBlas,Variant::One>::invoke (policy, policy.member_single(), 1.0, A_host, B_host, 1.0, C_host); t = timer.seconds(); os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs] "; } DenseMatrixBaseDeviceType AA_device("AA_device"), BB_device("BB_device"), CC_device("CC_device"); { timer.reset(); AA_device.mirror(AA_host); BB_device.mirror(BB_host); CC_device.mirror(CC_host); t = timer.seconds(); os << ":: Mirror = " << t << " [sec] "; } { DenseHierMatrixBaseDeviceType HA_device("HA_device"), HB_device("HB_device"), HC_device("HC_device"); DenseMatrixTools::createHierMatrix(HA_device, AA_device, mb, mb); DenseMatrixTools::createHierMatrix(HB_device, BB_device, mb, mb); DenseMatrixTools::createHierMatrix(HC_device, CC_device, mb, mb); DenseHierTaskViewDeviceType TA_device(HA_device), TB_device(HB_device), 
TC_device(HC_device); timer.reset(); auto future = policy.proc_create_team (Gemm<ArgTransA,ArgTransB,AlgoGemm::DenseByBlocks,ArgVariant> ::createTaskFunctor(policy, 1.0, TA_device, TB_device, 1.0, TC_device), 0); policy.spawn(future); Kokkos::Experimental::wait(policy); t = timer.seconds(); os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs] "; } CC_host.mirror(CC_device); if (check) { double err = 0.0, norm = 0.0; for (ordinal_type j=0;j<CC_host.NumCols();++j) for (ordinal_type i=0;i<CC_host.NumRows();++i) { const double diff = abs(CC_host.Value(i,j) - CB_host.Value(i,j)); const double val = CB_host.Value(i,j); err += diff*diff; norm += val*val; } os << ":: Check result ::norm = " << sqrt(norm) << ", error = " << sqrt(err); } std::cout << os.str() << std::endl; } return r_val; }
int main( int argc, char** argv ) { struct timespec start, stop; double time; #ifndef NDEBUG std::cout << "-->WARNING: COMPILED *WITH* ASSERTIONS!<--" << std::endl; #endif if( argc<=3 ) { std::cout << "Usage: " << argv[0] << " <mtx> <scheme> <x> <REP1> <REP2>" << std::endl << std::endl; std::cout << "calculates Ax=y and reports average time taken as well as the mean of y." << std::endl; std::cout << "with\t\t <mtx> filename of the matrix A in matrix-market or binary triplet format." << std::endl; std::cout << " \t\t <scheme> number of a sparse scheme to use, see below." << std::endl; std::cout << " \t\t <x> 0 for taking x to be the 1-vector, 1 for taking x to be random (fixed seed)." << std::endl; std::cout << " \t\t <REP1> (optional, default is 1) number of repititions of the entire experiment." << std::endl; std::cout << " \t\t <REP2> (optional, default is 1) number of repititions of the in-place SpMV multiplication, per experiment." << std::endl; std::cout << std::endl << "Possible schemes:" << std::endl; std::cout << " 0: TS (triplet scheme)" << std::endl; std::cout << " 1: CRS (also known as CSR)" << std::endl; std::cout << " 2: ICRS (Incremental CRS)" << std::endl; std::cout << " 3: ZZ-CRS (Zig-zag CRS)" << std::endl; std::cout << " 4: ZZ-ICRS (Zig-zag ICRS)" << std::endl; std::cout << " 5: SVM (Sparse vector matrix)" << std::endl; std::cout << " 6: HTS (Hilbert-ordered triplet scheme)" << std::endl; std::cout << " 7: BICRS (Bi-directional Incremental CRS)" << std::endl; std::cout << " 8: Hilbert (Hilbert-ordered triplets backed by BICRS)" << std::endl; std::cout << " 9: Block Hilbert (Sparse matrix blocking, backed by Hilbert and HBICRS)" << std::endl; std::cout << "10: Bisection Hilbert (Sparse matrix blocking by bisection, backed by Hilbert and HBICRS)" << std::endl; std::cout << "11: CBICRS (Compressed Bi-directional Incremental CRS)" << std::endl; std::cout << "12: Beta Hilbert (known as Block CO-H+ in the paper by Yzelman & Roose, 2012: parallel 
compressed blocked Hilbert with BICRS)" << std::endl; std::cout << "13: Row-distributed Beta Hilbert (known as Row-distributed block CO-H in the paper by Yzelman & Roose, 2012: same as 12, but simpler distribution)" << std::endl; #ifdef WITH_CSB std::cout << "14: Row-distributed CSB (Uses CSB sequentially within the row-distributed scheme of 13)" << std::endl; #endif std::cout << "15: Row-distributed Hilbert (Parallel row-distributed Hilbert scheme, see also 8)" << std::endl; std::cout << "16: Row-distributed parallel CRS (using OpenMP, known as OpenMP CRS in the paper by Yzelman & Roose, 2012)" << std::endl; std::cout << "17: Row-distributed SpMV using compressed Hilbert indices." << std::endl; #ifdef WITH_MKL std::cout << "18: Intel MKL SpMV based on the CRS data structure." << std::endl; #endif std::cout << "19: Optimised ICRS." << std::endl; #ifdef WITH_CUDA std::cout << "20: CUDA CuSparse HYB format." << std::endl; #endif std::cout << std::endl << "The in-place Ax=y calculation is preceded by a quasi pre-fetch." << std::endl; std::cout << "Add a minus sign before the scheme number to enable use of the CCS wrapper (making each CRS-based structure CCS-based instead)" << std::endl; std::cout << "Note: binary triplet format is machine-dependent. "; std::cout << "Take care when using the same binary files on different machine architectures." << std::endl; return EXIT_FAILURE; } std::string file = std::string( argv[1] ); int scheme = atoi( argv[2] ); int ccs = scheme < 0 ? 
1 : 0; if( ccs ) scheme = -scheme; int x_mode = atoi( argv[3] ); unsigned long int rep1 = 1; unsigned long int rep2 = 1; if( argc >= 5 ) rep1 = static_cast< unsigned long int >( atoi( argv[4] ) ); if( argc >= 6 ) rep2 = static_cast< unsigned long int >( atoi( argv[5] ) ); if( scheme != 16 && scheme != -16 && //pin master thread to a single core scheme != 18 && scheme != -18 ) { //but not when OpenMP is used (otherwise serialised computations) cpu_set_t mask; CPU_ZERO( &mask ); CPU_SET ( 0, &mask ); if( pthread_setaffinity_np( pthread_self(), sizeof( mask ), &mask ) != 0 ) { std::cerr << "Error setting main thread affinity!" << std::endl; exit( 1 ); } } else { omp_set_num_threads( MachineInfo::getInstance().cores() ); } #ifdef WITH_MKL if( scheme == 18 ) { mkl_set_num_threads( MachineInfo::getInstance().cores() ); } #endif std::cout << argv[0] << " called with matrix input file " << file << ", scheme number "; std::cout << scheme << " and x being " << (x_mode?"random":"the 1-vector") << "." << std::endl; std::cout << "Number of repititions of in-place zax is " << rep2 << std::endl; std::cout << "Number of repititions of the " << rep2 << " in-place zax(es) is " << rep1 << std::endl; Matrix< double >* checkm = new TS< double >( file ); clock_gettime( CLOCK_ID, &start); Matrix< double >* matrix = selectMatrix( scheme, ccs, file ); clock_gettime( CLOCK_ID, &stop); time = (stop.tv_sec-start.tv_sec)*1000; time += (stop.tv_nsec-start.tv_nsec)/1000000.0; if( matrix == NULL ) { std::cerr << "Error during sparse scheme loading, exiting." << std::endl; return EXIT_FAILURE; } std::cout << "Matrix dimensions: " << matrix->m() << " times " << matrix->n() << "." << std::endl; std::cout << "Datastructure loading time: " << time << " ms." 
<< std::endl << std::endl; srand( FIXED_SEED ); double* x = NULL; #ifdef INTERLEAVE_X if( scheme == 13 || scheme == 14 || scheme == 15 || scheme == 16 || scheme == 17 || scheme == 18 ) x = (double*) numa_alloc_interleaved( matrix->n() * sizeof( double ) ); else #endif x = (double*) _mm_malloc( matrix->n() * sizeof( double ), 64 ); //initialise input vector for( unsigned long int j=0; j<matrix->n(); j++ ) { x[ j ] = x_mode?(rand()/(double)RAND_MAX):1.0; } //do one trial run, also for verification double* c = checkm->mv( x ); clock_gettime( CLOCK_ID, &start ); double* z = matrix->mv( x ); clock_gettime( CLOCK_ID, &stop); time = (stop.tv_sec-start.tv_sec)*1000; time += (stop.tv_nsec-start.tv_nsec)/1000000.0; double checkMSE = 0; unsigned long int max_e_index = 0; double max_e = fabs( z[0] - c[0] ); for( unsigned long int j=0; j<matrix->m(); j++ ) { double curdiff = fabs( z[j] - c[j] ); if( curdiff > max_e ) { max_e = curdiff; max_e_index = j; } curdiff *= curdiff; curdiff /= (double)(matrix->m()); checkMSE += curdiff; } #ifdef OUTPUT_Z for( unsigned long int j=0; j<matrix->m(); j++ ) { std::cout << z[ j ] << std::endl; } #endif std::cout << "out-of-place z=Ax: mean= " << checksum( z, matrix->m() ) << ", "; std::cout << "MSE = " << checkMSE << ", "; std::cout << "max abs error = " << max_e << " while comparing y[ " << max_e_index << " ] = " << z[max_e_index] << " and c[ " << max_e_index << " ] = " << c[max_e_index] << ", "; std::cout << "time= " << time << " ms." << std::endl; #ifdef RDBH_NO_COLLECT if( scheme == 13 ) { std::cout << "WARNING: MSE and max abs error are not correct for the Row-distributed Beta Hilbert scheme; please see the RDBHilbert.hpp file, and look for the RDBH_NO_COLLECT flag." << std::endl; } #else if( scheme == 13 ) { std::cout << "WARNING: timings are pessimistic for the Row-distributed Beta Hilbert scheme; each spmv a (syncing) collect is executed to write local data to the global output vector as required by this library. 
To get the correct timings, turn this collect off via the RDBH_NO_COLLECT flag in the RDBHilbert.hpp file. Note that this causes the verification process to fail, since all data is kept in private local output subvectors." << std::endl; } #endif double *times = new double[ rep1 ]; //Run rep*rep instances for( unsigned long int run = 0; run < rep1; run++ ) { sleep( 1 ); time = 0.0; //"prefetch" matrix->zax( x, z ); matrix->zax( x, z, rep2, CLOCK_ID, &time ); time /= static_cast<double>( rep2 ); times[ run ] = time; } //calculate statistics double meantime, mintime, vartime; meantime = vartime = 0.0; mintime = times[ 0 ]; for( unsigned long int run = 0; run < rep1; run++ ) { if( times[ run ] < mintime ) mintime = times[ run ]; meantime += times[ run ] / static_cast< double >( rep1 ); } for( unsigned long int run = 0; run < rep1; run++ ) { vartime += ( times[ run ] - meantime ) * ( times[ run ] - meantime ) / static_cast< double >( rep1 - 1 ); } vartime = sqrt( vartime ); std::cout << "In-place:" << std::endl; std::cout << "Mean = " << checksum( z, matrix->m() ) << std::endl; std::cout << "Time = " << meantime << " (average), \t" << mintime << " (fastest), \t" << vartime << " (stddev) ms. " << std::endl; const double avgspeed = static_cast< double >( 2*matrix->nzs() ) / meantime / 1000000.0; const double minspeed = static_cast< double >( 2*matrix->nzs() ) / mintime / 1000000.0; const double varspeed = fabs( avgspeed - static_cast< double >( 2*matrix->nzs() ) / (meantime - vartime) / 1000000.0 ); std::cout << "Speed = " << avgspeed << " (average), \t" << minspeed << " (fastest), \t" << varspeed << " (variance) Gflop/s." 
<< std::endl; const size_t memuse1 = matrix->bytesUsed() + sizeof( double ) * 2 * matrix->nzs(); const double avgmem1 = static_cast< double >( 1000*memuse1 ) / meantime / 1073741824.0; const double minmem1 = static_cast< double >( 1000*memuse1 ) / mintime / 1073741824.0; const double varmem1 = fabs( avgmem1 - static_cast< double >( 1000*memuse1 ) / (meantime-vartime) / 1073741824.0 ); std::cout << " " << avgmem1 << " (average), \t" << minmem1 << " (fastest), \t" << varmem1 << " (variance) Gbyte/s (upper bound)." << std::endl; const size_t memuse2 = matrix->bytesUsed() + sizeof( double ) * ( matrix->m() + matrix->n() ); const double avgmem2 = static_cast< double >( 1000*memuse2 ) / meantime / 1073741824.0; const double minmem2 = static_cast< double >( 1000*memuse2 ) / mintime / 1073741824.0; const double varmem2 = fabs( avgmem2 - static_cast< double >( 1000*memuse2 ) / (meantime-vartime) / 1073741824.0 ); std::cout << " " << avgmem2 << " (average), \t" << minmem2 << " (fastest), \t" << varmem2 << " (variance) Gbyte/s (lower bound)." << std::endl; delete [] times; #ifdef INTERLEAVE_X if( scheme == 13 || scheme == 14 || scheme == 15 || scheme == 16 || scheme == 17 || scheme == 18 ) { numa_free( x, matrix->n() * sizeof( double ) ); } else #endif _mm_free( x ); if( scheme == 12 || scheme == 13 || scheme == 14 || scheme == 15 || scheme == 16 || scheme == 17 || scheme == 18 ) { #ifdef _NO_LIBNUMA _mm_free( z ); #else numa_free( z, matrix->m() * sizeof( double ) ); #endif } else { _mm_free( z ); } _mm_free( c ); delete matrix; delete checkm; return EXIT_SUCCESS; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing chetrd_he2hb */ int main( int argc, char** argv) { TESTING_INIT_MGPU(); real_Double_t gpu_time, gpu_perf, gflops; magmaFloatComplex *h_A, *h_R, *h_work, *dT1; magmaFloatComplex *tau; float *D, *E; /* Matrix size */ magma_int_t N, n2, lda, lwork, ldt, lwork0; magma_int_t info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; #if defined(CHECKEIG) #if defined(PRECISION_z) || defined(PRECISION_d) magma_int_t WANTZ=0; magma_int_t THREADS=1; #endif #endif magma_int_t NE = 0; magma_int_t NB = 0; magma_int_t ngpu = 1; magma_opts opts; parse_opts( argc, argv, &opts ); NB = opts.nb; if (NB < 1) NB = 64; //64; //magma_get_chetrd_he2hb_nb(N); // what is NE ? if (NE < 1) NE = 64; //N; //magma_get_chetrd_he2hb_nb(N); // N not yet initialized printf(" N GPU GFlop/s \n"); printf("=====================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldt = N; n2 = N*lda; gflops = FLOPS_CHETRD( N ) / 1e9; /* We suppose the magma NB is bigger than lapack NB */ lwork0 = N*NB; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, N-1 ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, lda*N ); TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork0 ); TESTING_MALLOC_PIN( D, float, N ); TESTING_MALLOC_PIN( E, float, N ); //TESTING_MALLOC_DEV( dT1, magmaFloatComplex, (2*min(N,N)+(N+31)/32*32)*NB ); TESTING_MALLOC_DEV( dT1, magmaFloatComplex, (N*NB) ); // if (WANTZ) gflops = 2.0*gflops; /* ==================================================================== Initialize the matrix =================================================================== */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); magma_cmake_hermitian( N, h_A, lda ); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* 
==================================================================== Performs operation using MAGMA =================================================================== */ magma_device_t cdev; magma_getdevice( &cdev ); gpu_time = magma_wtime(); /* magma_chetrd_he2hb( opts.uplo, N, NB, h_R, lda, tau, h_work, lwork0, dT1, THREADS, &info); tband = magma_wtime - gpu_time(); printf(" Finish BAND N %d NB %d ngpu %d timing= %f\n", N, NB, ngpu, tband); magma_chetrd_bhe2trc_v5(THREADS, WANTZ, opts.uplo, NE, N, NB, h_R, lda, D, E, dT1, ldt); */ /* magma_chetrd_he2hb( opts.uplo, N, NB, h_R, lda, tau, h_work, lwork, dT1, THREADS, &info); tband = magma_wtime - gpu_time(); printf(" Finish BAND N %d NB %d ngpu %d timing= %f\n", N, NB, ngpu, tband); magma_chetrd_bhe2trc(THREADS, WANTZ, opts.uplo, NE, N, NB, h_R, lda, D, E, dT1, ldt); */ magma_range_t range = MagmaRangeAll; magma_int_t fraction_ev = 100; magma_int_t il, iu, m1; float vl=0., vu=0.; if (fraction_ev == 0) { il = N / 10; iu = N / 5+il; } else { il = 1; iu = (int)(fraction_ev*N); if (iu < 1) iu = 1; } magmaFloatComplex *hh_work; magma_int_t *iwork; magma_int_t nb, /*lwork,*/ liwork; magma_int_t threads = magma_get_parallel_numthreads(); #if defined(PRECISION_z) || defined(PRECISION_c) float *rwork; magma_int_t lrwork; lwork = magma_cbulge_get_lq2(N, threads) + 2*N + N*N; lrwork = 1 + 5*N +2*N*N; TESTING_MALLOC_PIN( rwork, float, lrwork ); #else lwork = magma_cbulge_get_lq2(N, threads) + 1 + 6*N + 2*N*N; #endif liwork = 3 + 5*N; nb = magma_get_chetrd_nb(N); TESTING_MALLOC_PIN( hh_work, magmaFloatComplex, lwork ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); if (ngpu == 1) { printf("calling cheevdx_2stage 1 GPU\n"); magma_cheevdx_2stage( opts.jobz, range, opts.uplo, N, h_R, lda, vl, vu, il, iu, &m1, D, hh_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); } else { printf("calling cheevdx_2stage_m %d GPU\n", (int) ngpu); magma_cheevdx_2stage_m(ngpu, opts.jobz, 
range, opts.uplo, N, h_R, lda, vl, vu, il, iu, &m1, D, hh_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info); } magma_setdevice( cdev ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; /* ===================================================================== Check the factorization =================================================================== */ /* if ( opts.check ) { FILE *fp ; printf("Writing input matrix in matlab_i_mat.txt ...\n"); fp = fopen ("matlab_i_mat.txt", "w") ; if ( fp == NULL ) { printf("Couldn't open output file\n"); exit(1); } for (j=0; j < N; j++) { for (k=0; k < N; k++) { #if defined(PRECISION_z) || defined(PRECISION_c) fprintf(fp, "%5d %5d %11.8f %11.8f\n", k+1, j+1, h_A[k+j*lda].x, h_A[k+j*lda].y); #else fprintf(fp, "%5d %5d %11.8f\n", k+1, j+1, h_A[k+j*lda]); #endif } } fclose( fp ) ; printf("Writing output matrix in matlab_o_mat.txt ...\n"); fp = fopen ("matlab_o_mat.txt", "w") ; if ( fp == NULL ) { printf("Couldn't open output file\n"); exit(1); } for (j=0; j < N; j++) { for (k=0; k < N; k++) { #if defined(PRECISION_z) || defined(PRECISION_c) fprintf(fp, "%5d %5d %11.8f %11.8f\n", k+1, j+1, h_R[k+j*lda].x, h_R[k+j*lda].y); #else fprintf(fp, "%5d %5d %11.8f\n", k+1, j+1, h_R[k+j*lda]); #endif } } fclose( fp ) ; } */ /* ===================================================================== Print performance and error. =================================================================== */ #if defined(CHECKEIG) #if defined(PRECISION_z) || defined(PRECISION_d) if ( opts.check ) { printf(" Total N %5d gflops %6.2f timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time ); char JOBZ; if (WANTZ == 0) JOBZ = 'N'; else JOBZ = 'V'; float nrmI=0.0, nrm1=0.0, nrm2=0.0; int lwork2 = 256*N; magmaFloatComplex *work2, *AINIT; float *rwork2, *D2; // TODO free this memory ! 
magma_cmalloc_cpu( &work2, lwork2 ); magma_smalloc_cpu( &rwork2, N ); magma_smalloc_cpu( &D2, N ); magma_cmalloc_cpu( &AINIT, N*lda ); memcpy(AINIT, h_A, N*lda*sizeof(magmaFloatComplex)); /* compute the eigenvalues using lapack routine to be able to compare to it and used as ref */ cpu_time = magma_wtime(); i= min(12, THREADS); #if defined(USEMKL) mkl_set_num_threads( i ); #endif #if defined(USEACML) omp_set_num_threads(i); #endif lapackf77_cheev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, #if defined(PRECISION_z) || defined (PRECISION_c) rwork2, #endif &info ); ///* call eigensolver for our resulting tridiag [D E] and for Q */ //dstedc_withZ('V', N, D, E, h_R, lda); ////ssterf_( &N, D, E, &info); //// cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - EIGEN timing= %f threads %d\n", cpu_time, i); /* for (i=0; i < 10; i++) printf(" voici lpk D[%d] %8.2e\n", i, D2[i]); */ //magmaFloatComplex mydz=0.0, mydo=1.0; //magmaFloatComplex *Z; // magma_cmalloc_cpu( &Z, N*lda ); // dgemm_("N", "N", &N, &N, &N, &mydo, h_R, &lda, h_A, &lda, &mydz, Z, &lda); /* compare result */ cmp_vals(N, D2, D, &nrmI, &nrm1, &nrm2); magmaFloatComplex *WORKAJETER; float *RWORKAJETER, *RESU; // TODO free this memory ! 
magma_cmalloc_cpu( &WORKAJETER, (2* N * N + N) ); magma_smalloc_cpu( &RWORKAJETER, N ); magma_smalloc_cpu( &RESU, 10 ); int MATYPE; memset(RESU, 0, 10*sizeof(float)); MATYPE=3; float NOTHING=0.0; cpu_time = magma_wtime(); // check results ccheck_eig_(&JOBZ, &MATYPE, &N, &NB, AINIT, &lda, &NOTHING, &NOTHING, D2, D, h_R, &lda, WORKAJETER, RWORKAJETER, RESU ); cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - results timing= %f\n", cpu_time); #if defined(USEMKL) mkl_set_num_threads( 1 ); #endif #if defined(USEACML) omp_set_num_threads(1); #endif printf("\n"); printf(" ================================================================================================================\n"); printf(" ==> INFO voici threads=%d N=%d NB=%d WANTZ=%d\n", (int) THREADS, (int) N, (int) NB, (int) WANTZ); printf(" ================================================================================================================\n"); printf(" DSBTRD : %15s \n", "STATblgv9withQ "); printf(" ================================================================================================================\n"); if (WANTZ > 0) printf(" | A - U S U' | / ( |A| n ulp ) : %15.3E \n", RESU[0]); if (WANTZ > 0) printf(" | I - U U' | / ( n ulp ) : %15.3E \n", RESU[1]); printf(" | D1 - EVEIGS | / (|D| ulp) : %15.3E \n", RESU[2]); printf(" max | D1 - EVEIGS | : %15.3E \n", RESU[6]); printf(" ================================================================================================================\n\n\n"); printf(" ****************************************************************************************************************\n"); printf(" * Hello here are the norm Infinite (max)=%8.2e norm one (sum)=%8.2e norm2(sqrt)=%8.2e *\n", nrmI, nrm1, nrm2); printf(" ****************************************************************************************************************\n\n"); } #endif #endif printf(" Total N %5d gflops %6.2f timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time ); 
printf("============================================================================\n\n\n"); /* Memory clean up */ TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( tau ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( D ); TESTING_FREE_PIN( E ); TESTING_FREE_DEV( dT1 ); /* TODO - not all memory has been freed inside loop */ fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE_MGPU(); return EXIT_SUCCESS; }
void StVKReducedStiffnessMatrix::Evaluate(double * q, double * Rq) { // this is same as EvaluateSubset with start=0, end=quadraticSize /* int i,j,k; int output; // reset to free terms int index = 0; int indexEntry = 0; for(output=0; output<r; output++) { for(i=output; i<r; i++) { Rq[indexEntry] = freeCoef_[index]; index++; indexEntry++; } indexEntry += output + 1; } // add linear terms index = 0; indexEntry = 0; for(output=0; output<r; output++) { for(i=output; i<r; i++) { for(j=0; j<r; j++) { Rq[indexEntry] += linearCoef_[index] * q[j]; index++; } indexEntry++; } indexEntry += output + 1; } // add quadratic terms index = 0; indexEntry = 0; for(output=0; output<r; output++) { for(i=output; i<r; i++) { for(j=0; j<r; j++) for(k=j; k<r; k++) { Rq[indexEntry] += quadraticCoef_[index] * q[j] * q[k]; index++; } indexEntry++; } indexEntry += output + 1; } // make symetric for(output=0; output<r; output++) for(i=0; i<output; i++) Rq[ELT(r,i,output)] = Rq[ELT(r,output,i)]; */ if (useSingleThread) { #if defined(WIN32) || defined(linux) mkl_max_threads = mkl_get_max_threads(); mkl_dynamic = mkl_get_dynamic(); mkl_set_num_threads(1); mkl_set_dynamic(0); #elif defined(__APPLE__) //setenv("VECLIB_MAXIMUM_THREADS", "1", true); #endif } // reset to free terms memcpy(buffer1,freeCoef_,sizeof(double)*quadraticSize); // add linear terms // multiply linearCoef_ and q // linearCoef_ is r x quadraticSize array cblas_dgemv(CblasColMajor, CblasTrans, r, quadraticSize, 1.0, linearCoef_, r, q, 1, 1.0, buffer1, 1); // compute qiqj int index = 0; for(int output=0; output<r; output++) for(int i=output; i<r; i++) { qiqj[index] = q[output] * q[i]; index++; } // update Rq // quadraticCoef_ is quadraticSize x quadraticSize matrix // each column gives quadratic coef for one matrix entry cblas_dgemv(CblasColMajor, CblasTrans, quadraticSize, quadraticSize, 1.0, quadraticCoef_, quadraticSize, qiqj, 1, 1.0, buffer1, 1); // unpack into a symmetric matrix int i1=0,j1=0; for(int i=0; i< quadraticSize; 
i++) { Rq[ELT(r,i1,j1)] = buffer1[i]; Rq[ELT(r,j1,i1)] = buffer1[i]; j1++; if(j1 == r) { i1++; j1 = i1; } } if (useSingleThread) { #if defined(WIN32) || defined(linux) mkl_set_num_threads(mkl_max_threads); mkl_set_dynamic(mkl_dynamic); #elif defined(__APPLE__) //unsetenv("VECLIB_MAXIMUM_THREADS"); #endif } }
int main(int argc, char** argv) { int maxnumit = 0; int maxrec = -1; const char *budget_type_str = NULL; const char *stage = NULL; const char *training = NULL; const char *dev = NULL; const char *path = NULL; const char * etransform_str = NULL; const char *kernel_str = NULL; const char *rbf_lambda_str = NULL; #ifdef NDEBUG log_info("ai-parse %s (Release)", VERSION); #else log_info("ai-parse %s (Debug)", VERSION); #endif struct argparse_option options[] = { OPT_HELP(), //OPT_BOOLEAN('f', "force", &force, "force to do", NULL), OPT_INTEGER('v', "verbosity", &verbosity, "Verbosity level. Minimum (Default) 0. Increasing values increase parser verbosity.", NULL), OPT_STRING('o', "modelname", &modelname, "Model name", NULL), OPT_STRING('p', "path", &path, "CoNLL base directory including sections", NULL), OPT_STRING('s', "stage", &stage, "[ optimize | train | parse ]", NULL), OPT_INTEGER('n', "maxnumit", &maxnumit, "Maximum number of iterations by perceptron. Default is 50", NULL), OPT_STRING('t', "training", &training, "Training sections for optimize and train. Apply sections for parse", NULL), OPT_STRING('d', "development", &dev, "Development sections for optimize", NULL), OPT_STRING('e', "epattern", &epattern, "Embedding Patterns", NULL), OPT_INTEGER('l', "edimension", &edimension, "Embedding dimension", NULL), OPT_INTEGER('m', "maxrec", &maxrec, "Maximum number of training instance", NULL), OPT_STRING('x', "etransform", &etransform_str, "Embedding Transformation", NULL), OPT_STRING('k', "kernel", &kernel_str, "Kernel Type", NULL), OPT_INTEGER('a', "bias", &bias, "Polynomial kernel additive term. Default is 1", NULL), OPT_INTEGER('c', "concurrency", &num_parallel_mkl_slaves, "Parallel MKL Slaves. Default is 90% of all machine cores", NULL), OPT_INTEGER('b', "degree", &polynomial_degree, "Degree of polynomial kernel. 
Default is 4", NULL), OPT_STRING('z', "lambda", &rbf_lambda_str, "Lambda multiplier for RBF Kernel.Default value is 0.025"), OPT_STRING('u', "budget_type", &budget_type_str, "Budget control methods. NONE|RANDOM", NULL), OPT_INTEGER('g', "budget_size", &budget_target, "Budget Target for budget based perceptron algorithms. Default 50K", NULL), OPT_END(), }; struct argparse argparse; argparse_init(&argparse, options, usage, 0); argc = argparse_parse(&argparse, argc, argv); int max_threads = mkl_get_max_threads(); log_info("There are max %d MKL threads", max_threads); if (num_parallel_mkl_slaves == -1) { num_parallel_mkl_slaves = (int) (max_threads * 0.9); if (num_parallel_mkl_slaves == 0) num_parallel_mkl_slaves = 1; } log_info("Number of MKL Slaves is set to be %d", num_parallel_mkl_slaves); mkl_set_num_threads(num_parallel_mkl_slaves); if (1 == mkl_get_dynamic()) log_info("Intel MKL may use less than %i threads for a large problem", num_parallel_mkl_slaves); else log_info("Intel MKL should use %i threads for a large problem", num_parallel_mkl_slaves); check(stage != NULL && (strcmp(stage, "optimize") == 0 || strcmp(stage, "train") == 0 || strcmp(stage, "parse") == 0), "Choose one of -s optimize, train, parse"); check(path != NULL, "Specify a ConLL base directory using -p"); check(edimension != 0, "Set embedding dimension using -l"); check(modelname != NULL, "Provide model name using -o"); if (budget_type_str != NULL) { if (strcmp(budget_type_str, "RANDOM") == 0 || strcmp(budget_type_str, "RANDOMIZED") == 0) { budget_method = RANDOMIZED; } else if (strcmp(budget_type_str, "NONE") == 0) { budget_method = NONE; } else { log_err("Unknown budget control type %s", budget_type_str); goto error; } } else { budget_method = NONE; } if (training == NULL) { log_warn("training section string is set to %s", DEFAULT_TRAINING_SECTION_STR); training = strdup(DEFAULT_TRAINING_SECTION_STR); } if (dev == NULL && (strcmp(stage, "optimize") == 0 || strcmp(stage, "train") == 0)) { 
log_info("development section string is set to %s", DEFAULT_DEV_SECTION_STR); dev = strdup(DEFAULT_DEV_SECTION_STR); } check(epattern != NULL, "Embedding pattern is required for -s optimize,train,parse"); if (etransform_str == NULL) { log_info("Embedding transformation is set to be QUADRATIC"); etransform = DEFAULT_EMBEDDING_TRANFORMATION; } else if (strcmp(etransform_str, "LINEAR") == 0) { etransform = LINEAR; } else if (strcmp(etransform_str, "QUADRATIC") == 0) { etransform = QUADRATIC; } else if (strcmp(etransform_str, "CUBIC") == 0) { etransform = CUBIC; } else { log_err("Unsupported transformation type for embedding %s", etransform_str); } if (strcmp(stage, "optimize") == 0 || strcmp(stage, "train") == 0) { if (maxnumit <= 0) { log_info("maxnumit is set to %d", DEFAULT_MAX_NUMIT); maxnumit = DEFAULT_MAX_NUMIT; } } if (kernel_str != NULL) { if (strcmp(kernel_str, "POLYNOMIAL") == 0) { log_info("Polynomial kernel will be used with bias %f and degree %d", bias, polynomial_degree); kernel = KPOLYNOMIAL; } else if (strcmp(kernel_str, "GAUSSIAN") == 0 || strcmp(kernel_str, "RBF") == 0) { if (rbf_lambda_str != NULL) { rbf_lambda = (float) atof(rbf_lambda_str); } log_info("RBF/GAUSSIAN kernel will be used with lambda %f ", rbf_lambda); kernel = KRBF; } else { log_err("Unsupported kernel type %s. 
Valid options are LINEAR, POLYNOMIAL, and RBF/GAUSSIAN", kernel_str); goto error; } } if (strcmp(stage, "optimize") == 0) { void *model = optimize(maxnumit, maxrec, path, training, dev, edimension); char* model_filename = (char*) malloc(sizeof (char) * (strlen(modelname) + 7)); check_mem(model_filename); sprintf(model_filename, "%s.model", modelname); FILE *fp = fopen(model_filename, "w"); if (kernel == KLINEAR) { PerceptronModel pmodel = (PerceptronModel) model; dump_PerceptronModel(fp, edimension, pmodel->embedding_w_best, pmodel->best_numit); PerceptronModel_free(pmodel); } else if (kernel == KPOLYNOMIAL || kernel == KRBF) { KernelPerceptron kpmodel = (KernelPerceptron) model; dump_KernelPerceptronModel(fp, kpmodel); } log_info("Model is dumped into %s file", model_filename); fclose(fp); } else if (strcmp(stage, "parse") == 0) { char* model_filename = (char*) malloc(sizeof (char) * (strlen(modelname) + 7)); check_mem(model_filename); sprintf(model_filename, "%s.model", modelname); FILE *fp = fopen(model_filename, "r"); check(fp != NULL, "%s could not be opened", model_filename); void *model; if (kernel == KLINEAR) model = load_PerceptronModel(fp); else model = load_KernelPerceptronModel(fp); fclose(fp); check(model != NULL, "Error in loading model file"); log_info("Model loaded from %s successfully", model_filename); parseall(model, path, training, edimension); } else { log_info("Waiting for implementation"); } return (EXIT_SUCCESS); error: return (EXIT_FAILURE); }
int exampleDenseCholByBlocks(const ordinal_type mmin, const ordinal_type mmax, const ordinal_type minc, const ordinal_type mb, const int max_concurrency, const int max_task_dependence, const int team_size, const int mkl_nthreads, const bool check, const bool verbose) { typedef typename Kokkos::Impl::is_space<DeviceSpaceType>::host_mirror_space::execution_space HostSpaceType ; const bool detail = false; std::cout << "DeviceSpace:: "; DeviceSpaceType::print_configuration(std::cout, detail); std::cout << "HostSpace:: "; HostSpaceType::print_configuration(std::cout, detail); typedef Kokkos::Experimental::TaskPolicy<DeviceSpaceType> PolicyType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,HostSpaceType> DenseMatrixBaseHostType; typedef DenseMatrixView<DenseMatrixBaseHostType> DenseMatrixViewHostType; typedef DenseMatrixBase<value_type,ordinal_type,size_type,DeviceSpaceType> DenseMatrixBaseDeviceType; typedef DenseMatrixView<DenseMatrixBaseDeviceType> DenseMatrixViewDeviceType; typedef TaskView<DenseMatrixViewDeviceType> DenseTaskViewDeviceType; typedef DenseMatrixBase<DenseTaskViewDeviceType,ordinal_type,size_type,DeviceSpaceType> DenseHierMatrixBaseDeviceType; typedef DenseMatrixView<DenseHierMatrixBaseDeviceType> DenseHierMatrixViewDeviceType; typedef TaskView<DenseHierMatrixViewDeviceType> DenseHierTaskViewDeviceType; int r_val = 0; Kokkos::Impl::Timer timer; double t = 0.0; std::cout << "DenseCholByBlocks:: test matrices " <<":: mmin = " << mmin << " , mmax = " << mmax << " , minc = " << minc << " , mb = " << mb << std::endl; const size_t max_task_size = (3*sizeof(DenseTaskViewDeviceType)+sizeof(PolicyType)+128); PolicyType policy(max_concurrency, max_task_size, max_task_dependence, team_size); std::ostringstream os; os.precision(3); os << std::scientific; for (ordinal_type m=mmin;m<=mmax;m+=minc) { os.str(""); // host matrices DenseMatrixBaseHostType AA_host("AA_host", m, m), AB_host("AB_host"), TT_host("TT_host"); // random T matrix { 
TT_host.createConfTo(AA_host); for (ordinal_type j=0;j<TT_host.NumCols();++j) { for (ordinal_type i=0;i<TT_host.NumRows();++i) TT_host.Value(i,j) = 2.0*((value_type)rand()/(RAND_MAX)) - 1.0; TT_host.Value(j,j) = std::fabs(TT_host.Value(j,j)); } } // create SPD matrix { Teuchos::BLAS<ordinal_type,value_type> blas; blas.HERK(ArgUplo == Uplo::Upper ? Teuchos::UPPER_TRI : Teuchos::LOWER_TRI, Teuchos::CONJ_TRANS, m, m, 1.0, TT_host.ValuePtr(), TT_host.ColStride(), 0.0, AA_host.ValuePtr(), AA_host.ColStride()); // preserve a copy of A AB_host.createConfTo(AA_host); DenseMatrixTools::copy(AB_host, AA_host); } const double flop = DenseFlopCount<value_type>::Chol(m); #ifdef HAVE_SHYLUTACHO_MKL mkl_set_num_threads(mkl_nthreads); #endif os << "DenseCholByBlocks:: m = " << m << " "; int ierr = 0; if (check) { timer.reset(); DenseMatrixViewHostType A_host(AB_host); ierr = Chol<ArgUplo,AlgoChol::ExternalLapack,Variant::One>::invoke (policy, policy.member_single(), A_host); t = timer.seconds(); TACHO_TEST_FOR_ABORT( ierr, "Fail to perform Cholesky (serial)"); os << ":: Serial Performance = " << (flop/t/1.0e9) << " [GFLOPs] "; } DenseMatrixBaseDeviceType AA_device("AA_device"); { timer.reset(); AA_device.mirror(AA_host); t = timer.seconds(); os << ":: Mirror = " << t << " [sec] "; } { DenseHierMatrixBaseDeviceType HA_device("HA_device"); DenseMatrixTools::createHierMatrix(HA_device, AA_device, mb, mb); DenseHierTaskViewDeviceType TA_device(HA_device); timer.reset(); auto future = policy.proc_create_team (Chol<ArgUplo,AlgoChol::DenseByBlocks,ArgVariant> ::createTaskFunctor(policy, TA_device), 0); policy.spawn(future); Kokkos::Experimental::wait(policy); t = timer.seconds(); os << ":: Parallel Performance = " << (flop/t/1.0e9) << " [GFLOPs] "; } AA_host.mirror(AA_device); if (!ierr && check) { double err = 0.0, norm = 0.0; for (ordinal_type j=0;j<AA_host.NumCols();++j) for (ordinal_type i=0;i<=j;++i) { const double diff = abs(AA_host.Value(i,j) - AB_host.Value(i,j)); const double 
val = AB_host.Value(i,j); err += diff*diff; norm += val*val; } os << ":: Check result ::norm = " << sqrt(norm) << ", error = " << sqrt(err); } std::cout << os.str() << std::endl; } return r_val; }
int dynamixMain (int argc, char * argv[]) { //// DECLARING VARIABLES // Struct of parameters PARAMETERS p; // CVode variables void * cvode_mem = NULL; // pointer to block of CVode memory N_Vector y, yout; // arrays of populations // arrays for energetic parameters realtype ** V = NULL; // pointer to k-c coupling constants realtype * Vbridge = NULL; // pointer to array of bridge coupling constants. // first element [0] is Vkb1, last [Nb] is VcbN realtype * Vnobridge = NULL; // coupling constant when there is no bridge //// Setting defaults for parameters to be read from input //// done setting defaults int flag; realtype * k_pops = NULL; // pointers to arrays of populations realtype * l_pops = NULL; realtype * c_pops = NULL; realtype * b_pops = NULL; realtype * ydata = NULL; // pointer to ydata (contains all populations) realtype * wavefunction = NULL; // (initial) wavefunction realtype * dm = NULL; // density matrix realtype * dmt = NULL; // density matrix in time realtype * wfnt = NULL; // density matrix in time realtype * k_energies = NULL; // pointers to arrays of energies realtype * c_energies = NULL; realtype * b_energies = NULL; realtype * l_energies = NULL; realtype t0 = 0.0; // initial time realtype t = 0; realtype tret = 0; // time returned by the solver time_t startRun; // time at start of log time_t endRun; // time at end of log struct tm * currentTime = NULL; // time structure for localtime #ifdef DEBUG FILE * realImaginary; // file containing real and imaginary parts of the wavefunction #endif FILE * log; // log file with run times realtype * tkprob = NULL; // total probability in k, l, c, b states at each timestep realtype * tlprob = NULL; realtype * tcprob = NULL; realtype * tbprob = NULL; double ** allprob = NULL; // populations in all states at all times realtype * times = NULL; realtype * qd_est = NULL; realtype * qd_est_diag = NULL; std::string inputFile = "ins/parameters.in"; // name of input file std::string cEnergiesInput = 
"ins/c_energies.in"; std::string cPopsInput = "ins/c_pops.in"; std::string bEnergiesInput = "ins/b_energies.in"; std::string VNoBridgeInput = "ins/Vnobridge.in"; std::string VBridgeInput = "ins/Vbridge.in"; std::map<const std::string, bool> outs; // map of output file names to bool // default output directory p.outputDir = "outs/"; double summ = 0; // sum variable // ---- process command line flags ---- // opterr = 0; int c; std::string insDir; /* process command line options */ while ((c = getopt(argc, argv, "i:o:")) != -1) { switch (c) { case 'i': // check that it ends in a slash std::cerr << "[dynamix]: assigning input directory" << std::endl; insDir = optarg; if (strcmp(&(insDir.at(insDir.length() - 1)), "/")) { std::cerr << "ERROR: option -i requires argument (" << insDir << ") to have a trailing slash (/)." << std::endl; return 1; } else { // ---- assign input files ---- // inputFile = insDir + "parameters.in"; cEnergiesInput = insDir + "c_energies.in"; cPopsInput = insDir + "c_pops.in"; bEnergiesInput = insDir + "b_energies.in"; VNoBridgeInput = insDir + "Vnobridge.in"; VBridgeInput = insDir + "Vbridge.in"; } break; case 'o': std::cerr << "[dynamix]: assigning output directory" << std::endl; p.outputDir = optarg; break; case '?': if (optopt == 'i') { fprintf(stderr, "Option -%c requires a directory argument.\n", optopt); } else if (isprint(optopt)) { fprintf(stderr, "Unknown option -%c.\n", optopt); } else { fprintf(stderr, "Unknown option character `\\x%x'.\n", optopt); } return 1; default: continue; } } optind = 1; // reset global variable counter for the next time this is run std::cerr << "[dynamix]: ARGUMENTS" << std::endl; for (int ii = 0; ii < argc; ii++) { std::cerr << "[dynamix]: " << argv[ii] << std::endl; } //// ASSIGN PARAMETERS FROM INPUT FILE // ---- TODO create output directory if it does not exist ---- // flag = mkdir(p.outputDir.c_str(), 0755); std::cerr << "Looking for inputs in all the " << inputFile << " places" << std::endl; 
assignParams(inputFile.c_str(), &p); // Decide which output files to make #ifdef DEBUG std::cout << "Assigning outputs as specified in " << inputFile << "\n"; #endif assignOutputs(inputFile.c_str(), outs, &p); #ifdef DEBUG // print out which outputs will be made for (std::map<const std::string, bool>::iterator it = outs.begin(); it != outs.end(); it++) { std::cout << "Output file: " << it->first << " will be created.\n"; } #endif // OPEN LOG FILE; PUT IN START TIME // if (isOutput(outs, "log.out")) { log = fopen("log.out", "w"); // note that this file is closed at the end of the program } time(&startRun); currentTime = localtime(&startRun); if (isOutput(outs, "log.out")) { fprintf(log, "Run started at %s\n", asctime(currentTime)); } if (isOutput(outs, "log.out")) { // make a note about the laser intensity. fprintf(log,"The laser intensity is %.5e W/cm^2.\n\n",pow(p.pumpAmpl,2)*3.5094452e16); } //// READ DATA FROM INPUTS p.Nc = numberOfValuesInFile(cEnergiesInput.c_str()); p.Nb = numberOfValuesInFile(bEnergiesInput.c_str()); k_pops = new realtype [p.Nk]; c_pops = new realtype [p.Nc]; b_pops = new realtype [p.Nb]; l_pops = new realtype [p.Nl]; k_energies = new realtype [p.Nk]; c_energies = new realtype [p.Nc]; b_energies = new realtype [p.Nb]; l_energies = new realtype [p.Nl]; if (numberOfValuesInFile(cPopsInput.c_str()) != p.Nc) { fprintf(stderr, "ERROR [Inputs]: c_pops and c_energies not the same length.\n"); return -1; } readArrayFromFile(c_energies, cEnergiesInput.c_str(), p.Nc); if (p.bridge_on) { if (p.bridge_on && (p.Nb < 1)) { std::cerr << "\nERROR: bridge_on but no bridge states. 
The file b_energies.in is probably empty.\n"; return -1; } p.Vbridge.resize(p.Nb+1); readArrayFromFile(b_energies, bEnergiesInput.c_str(), p.Nb); readVectorFromFile(p.Vbridge, VBridgeInput.c_str(), p.Nb + 1); #ifdef DEBUG std::cout << "COUPLINGS:"; for (int ii = 0; ii < p.Nb+1; ii++) { std::cout << " " << p.Vbridge[ii]; } std::cout << std::endl; #endif } else { p.Nb = 0; p.Vnobridge.resize(1); readVectorFromFile(p.Vnobridge, VNoBridgeInput.c_str(), 1); } #ifdef DEBUG std::cout << "\nDone reading things from inputs.\n"; #endif //// PREPROCESS DATA FROM INPUTS // check torsion parameters, set up torsion spline if (p.torsion) { #ifdef DEBUG std::cout << "Torsion is on." << std::endl; #endif // error checking if (p.torsionSite > p.Nb) { std::cerr << "ERROR: torsion site (" << p.torsionSite << ") is larger than number of bridge sites (" << p.Nb << ")." << std::endl; exit(-1); } else if (p.torsionSite < 0) { std::cerr << "ERROR: torsion site is less than zero." << std::endl; exit(-1); } if (!fileExists(p.torsionFile)) { std::cerr << "ERROR: torsion file " << p.torsionFile << " does not exist." << std::endl; } // create spline p.torsionV = new Spline(p.torsionFile.c_str()); if (p.torsionV->getFirstX() != 0.0) { std::cerr << "ERROR: time in " << p.torsionFile << " should start at 0.0." << std::endl; exit(-1); } if (p.torsionV->getLastX() < p.tout) { std::cerr << "ERROR: time in " << p.torsionFile << " should be >= tout." 
<< std::endl; exit(-1); } } // set number of processors for OpenMP //omp_set_num_threads(p.nproc); mkl_set_num_threads(p.nproc); p.NEQ = p.Nk+p.Nc+p.Nb+p.Nl; // total number of equations set p.NEQ2 = p.NEQ*p.NEQ; // number of elements in DM #ifdef DEBUG std::cout << "\nTotal number of states: " << p.NEQ << std::endl; std::cout << p.Nk << " bulk, " << p.Nc << " QD, " << p.Nb << " bridge, " << p.Nl << " bulk VB.\n"; #endif tkprob = new realtype [p.numOutputSteps+1]; // total population on k, b, c at each timestep tcprob = new realtype [p.numOutputSteps+1]; tbprob = new realtype [p.numOutputSteps+1]; tlprob = new realtype [p.numOutputSteps+1]; allprob = new double * [p.numOutputSteps+1]; for (int ii = 0; ii <= p.numOutputSteps; ii++) { allprob[ii] = new double [p.NEQ]; } // assign times. p.times.resize(p.numOutputSteps+1); for (int ii = 0; ii <= p.numOutputSteps; ii++) { p.times[ii] = float(ii)/p.numOutputSteps*p.tout; } qd_est = new realtype [p.numOutputSteps+1]; qd_est_diag = new realtype [p.numOutputSteps+1]; p.Ik = 0; // set index start positions for each type of state p.Ic = p.Nk; p.Ib = p.Ic+p.Nc; p.Il = p.Ib+p.Nb; // assign bulk conduction and valence band energies // for RTA, bulk and valence bands have parabolic energies if (p.rta) { buildParabolicBand(k_energies, p.Nk, p.kBandEdge, CONDUCTION, &p); buildParabolicBand(l_energies, p.Nl, p.lBandTop, VALENCE, &p); } else { buildContinuum(k_energies, p.Nk, p.kBandEdge, p.kBandTop); buildContinuum(l_energies, p.Nl, p.kBandEdge - p.valenceBand - p.bulk_gap, p.kBandEdge - p.bulk_gap); } // calculate band width p.kBandWidth = k_energies[p.Nk - 1] - k_energies[0]; //// BUILD INITIAL WAVEFUNCTION // bridge states (empty to start) initializeArray(b_pops, p.Nb, 0.0); // coefficients in bulk and other states depend on input conditions in bulk if (!p.rta) { #ifdef DEBUG std::cout << "\ninitializing k_pops\n"; #endif if (p.bulk_constant) { initializeArray(k_pops, p.Nk, 0.0); #ifdef DEBUG std::cout << "\ninitializing k_pops 
with constant probability in range of states\n"; #endif initializeArray(k_pops+p.Nk_first-1, p.Nk_final-p.Nk_first+1, 1.0); initializeArray(l_pops, p.Nl, 0.0); // populate l states (all 0 to start off) initializeArray(c_pops, p.Nc, 0.0); // QD states empty to start } else if (p.bulk_Gauss) { buildKPopsGaussian(k_pops, k_energies, p.kBandEdge, p.bulkGaussSigma, p.bulkGaussMu, p.Nk); // populate k states with FDD initializeArray(l_pops, p.Nl, 0.0); // populate l states (all 0 to start off) initializeArray(c_pops, p.Nc, 0.0); // QD states empty to start } else if (p.qd_pops) { readArrayFromFile(c_pops, cPopsInput.c_str(), p.Nc); // QD populations from file initializeArray(l_pops, p.Nl, 0.0); // populate l states (all 0 to start off) initializeArray(k_pops, p.Nk, 0.0); // populate k states (all zero to start off) } else { initializeArray(k_pops, p.Nk, 0.0); // populate k states (all zero to start off) initializeArray(l_pops, p.Nl, 1.0); // populate l states (all populated to start off) initializeArray(c_pops, p.Nc, 0.0); // QD states empty to start } #ifdef DEBUG std::cout << "\nThis is k_pops:\n"; for (int ii = 0; ii < p.Nk; ii++) { std::cout << k_pops[ii] << std::endl; } std::cout << "\n"; #endif } // with RTA, use different set of switches else { // bulk valence band if (p.VBPopFlag == POP_EMPTY) { #ifdef DEBUG std::cout << "Initializing empty valence band" << std::endl; #endif initializeArray(l_pops, p.Nl, 0.0); } else if (p.VBPopFlag == POP_FULL) { #ifdef DEBUG std::cout << "Initializing full valence band" << std::endl; #endif initializeArray(l_pops, p.Nl, 1.0); } else { std::cerr << "ERROR: unrecognized VBPopFlag " << p.VBPopFlag << std::endl; } // bulk conduction band if (p.CBPopFlag == POP_EMPTY) { #ifdef DEBUG std::cout << "Initializing empty conduction band" << std::endl; #endif initializeArray(k_pops, p.Nk, 0.0); } else if (p.CBPopFlag == POP_FULL) { #ifdef DEBUG std::cout << "Initializing full conduction band" << std::endl; #endif initializeArray(k_pops, 
p.Nk, 1.0); } else if (p.CBPopFlag == POP_CONSTANT) { #ifdef DEBUG std::cout << "Initializing constant distribution in conduction band" << std::endl; #endif initializeArray(k_pops, p.Nk, 0.0); initializeArray(k_pops, p.Nk, 1e-1); // FIXME initializeArray(k_pops+p.Nk_first-1, p.Nk_final-p.Nk_first+1, 1.0); } else if (p.CBPopFlag == POP_GAUSSIAN) { #ifdef DEBUG std::cout << "Initializing Gaussian in conduction band" << std::endl; #endif buildKPopsGaussian(k_pops, k_energies, p.kBandEdge, p.bulkGaussSigma, p.bulkGaussMu, p.Nk); } else { std::cerr << "ERROR: unrecognized CBPopFlag " << p.CBPopFlag << std::endl; } //// QD if (p.QDPopFlag == POP_EMPTY) { initializeArray(c_pops, p.Nc, 0.0); } else if (p.QDPopFlag == POP_FULL) { initializeArray(c_pops, p.Nc, 1.0); } else { std::cerr << "ERROR: unrecognized QDPopFlag " << p.QDPopFlag << std::endl; } } // create empty wavefunction wavefunction = new realtype [2*p.NEQ]; initializeArray(wavefunction, 2*p.NEQ, 0.0); // assign real parts of wavefunction coefficients (imaginary are zero) for (int ii = 0; ii < p.Nk; ii++) { wavefunction[p.Ik + ii] = k_pops[ii]; } for (int ii = 0; ii < p.Nc; ii++) { wavefunction[p.Ic + ii] = c_pops[ii]; } for (int ii = 0; ii < p.Nb; ii++) { wavefunction[p.Ib + ii] = b_pops[ii]; } for (int ii = 0; ii < p.Nl; ii++) { wavefunction[p.Il + ii] = l_pops[ii]; } if (isOutput(outs, "psi_start.out")) { outputWavefunction(wavefunction, p.NEQ); } // Give all coefficients a random phase if (p.random_phase) { float phi; // set the seed if (p.random_seed == -1) { srand(time(NULL)); } else { srand(p.random_seed); } for (int ii = 0; ii < p.NEQ; ii++) { phi = 2*3.1415926535*(float)rand()/(float)RAND_MAX; wavefunction[ii] = wavefunction[ii]*cos(phi); wavefunction[ii + p.NEQ] = wavefunction[ii + p.NEQ]*sin(phi); } } #ifdef DEBUG // print out details of wavefunction coefficients std::cout << std::endl; for (int ii = 0; ii < p.Nk; ii++) { std::cout << "starting wavefunction: Re[k(" << ii << ")] = " << wavefunction[p.Ik 
+ ii] << std::endl; } for (int ii = 0; ii < p.Nc; ii++) { std::cout << "starting wavefunction: Re[c(" << ii << ")] = " << wavefunction[p.Ic + ii] << std::endl; } for (int ii = 0; ii < p.Nb; ii++) { std::cout << "starting wavefunction: Re[b(" << ii << ")] = " << wavefunction[p.Ib + ii] << std::endl; } for (int ii = 0; ii < p.Nl; ii++) { std::cout << "starting wavefunction: Re[l(" << ii << ")] = " << wavefunction[p.Il + ii] << std::endl; } for (int ii = 0; ii < p.Nk; ii++) { std::cout << "starting wavefunction: Im[k(" << ii << ")] = " << wavefunction[p.Ik + ii + p.NEQ] << std::endl; } for (int ii = 0; ii < p.Nc; ii++) { std::cout << "starting wavefunction: Im[c(" << ii << ")] = " << wavefunction[p.Ic + ii + p.NEQ] << std::endl; } for (int ii = 0; ii < p.Nb; ii++) { std::cout << "starting wavefunction: Im[b(" << ii << ")] = " << wavefunction[p.Ib + ii + p.NEQ] << std::endl; } for (int ii = 0; ii < p.Nl; ii++) { std::cout << "starting wavefunction: Im[l(" << ii << ")] = " << wavefunction[p.Il + ii + p.NEQ] << std::endl; } std::cout << std::endl; summ = 0; for (int ii = 0; ii < 2*p.NEQ; ii++) { summ += pow(wavefunction[ii],2); } std::cout << "\nTotal population is " << summ << "\n\n"; #endif //// ASSEMBLE ARRAY OF ENERGIES // TODO TODO p.energies.resize(p.NEQ); for (int ii = 0; ii < p.Nk; ii++) { p.energies[p.Ik + ii] = k_energies[ii]; } for (int ii = 0; ii < p.Nc; ii++) { p.energies[p.Ic + ii] = c_energies[ii]; } for (int ii = 0; ii < p.Nb; ii++) { p.energies[p.Ib + ii] = b_energies[ii]; } for (int ii = 0; ii < p.Nl; ii++) { p.energies[p.Il + ii] = l_energies[ii]; } #ifdef DEBUG for (int ii = 0; ii < p.NEQ; ii++) { std::cout << "p.energies[" << ii << "] is " << p.energies[ii] << "\n"; } #endif //// ASSIGN COUPLING CONSTANTS V = new realtype * [p.NEQ]; for (int ii = 0; ii < p.NEQ; ii++) { V[ii] = new realtype [p.NEQ]; } buildCoupling(V, &p, outs); if (isOutput(outs, "log.out")) { // make a note in the log about system timescales double tau = 0; // fundamental system 
timescale if (p.Nk == 1) { fprintf(log, "\nThe timescale (tau) is undefined (Nk == 1).\n"); } else { if (p.bridge_on) { if (p.scale_bubr) { tau = 1.0/(2*p.Vbridge[0]*M_PI); } else { tau = ((p.kBandTop - p.kBandEdge)/(p.Nk - 1))/(2*pow(p.Vbridge[0],2)*M_PI); } } else { if (p.scale_buqd) { tau = 1.0/(2*p.Vnobridge[0]*M_PI); } else { tau = ((p.kBandTop - p.kBandEdge)/(p.Nk - 1))/(2*pow(p.Vnobridge[0],2)*M_PI); } } fprintf(log, "\nThe timescale (tau) is %.9e a.u.\n", tau); } } //// CREATE DENSITY MATRIX if (! p.wavefunction) { // Create the initial density matrix dm = new realtype [2*p.NEQ2]; initializeArray(dm, 2*p.NEQ2, 0.0); #pragma omp parallel for for (int ii = 0; ii < p.NEQ; ii++) { // diagonal part dm[p.NEQ*ii + ii] = pow(wavefunction[ii],2) + pow(wavefunction[ii + p.NEQ],2); if (p.coherent) { // off-diagonal part for (int jj = 0; jj < ii; jj++) { // real part of \rho_{ii,jj} dm[p.NEQ*ii + jj] = wavefunction[ii]*wavefunction[jj] + wavefunction[ii+p.NEQ]*wavefunction[jj+p.NEQ]; // imaginary part of \rho_{ii,jj} dm[p.NEQ*ii + jj + p.NEQ2] = wavefunction[ii]*wavefunction[jj+p.NEQ] - wavefunction[jj]*wavefunction[ii+p.NEQ]; // real part of \rho_{jj,ii} dm[p.NEQ*jj + ii] = dm[p.NEQ*ii + jj]; // imaginary part of \rho_{jj,ii} dm[p.NEQ*jj + ii + p.NEQ2] = -1*dm[p.NEQ*ii + jj + p.NEQ*p.NEQ]; } } } // Create the array to store the density matrix in time dmt = new realtype [2*p.NEQ2*(p.numOutputSteps+1)]; initializeArray(dmt, 2*p.NEQ2*(p.numOutputSteps+1), 0.0); #ifdef DEBUG2 // print out density matrix std::cout << "\nDensity matrix without normalization:\n\n"; for (int ii = 0; ii < p.NEQ; ii++) { for (int jj = 0; jj < p.NEQ; jj++) { fprintf(stdout, "(%+.1e,%+.1e) ", dm[p.NEQ*ii + jj], dm[p.NEQ*ii + jj + p.NEQ2]); } fprintf(stdout, "\n"); } #endif // Normalize the DM so that populations add up to 1. // No normalization if RTA is on. 
if (!p.rta) { summ = 0.0; for (int ii = 0; ii < p.NEQ; ii++) { // assume here that diagonal elements are all real summ += dm[p.NEQ*ii + ii]; } if ( summ == 0.0 ) { std::cerr << "\nFATAL ERROR [populations]: total population is 0!\n"; return -1; } if (summ != 1.0) { // the variable 'summ' is now a multiplicative normalization factor summ = 1.0/summ; for (int ii = 0; ii < 2*p.NEQ2; ii++) { dm[ii] *= summ; } } #ifdef DEBUG std::cout << "\nThe normalization factor for the density matrix is " << summ << "\n\n"; #endif } // Error checking for total population; recount population first summ = 0.0; for (int ii = 0; ii < p.NEQ; ii++) { summ += dm[p.NEQ*ii + ii]; } if ( fabs(summ-1.0) > 1e-12 && (!p.rta)) { std::cerr << "\nWARNING [populations]: After normalization, total population is not 1, it is " << summ << "!\n"; } #ifdef DEBUG std::cout << "\nAfter normalization, the sum of the populations in the density matrix is " << summ << "\n\n"; #endif // Add initial DM to parameters. p.startDM.resize(2*p.NEQ2); memcpy(&(p.startDM[0]), &(dm[0]), 2*p.NEQ2*sizeof(double)); } // wavefunction else { // Create the array to store the wavefunction in time wfnt = new realtype [2*p.NEQ*(p.numOutputSteps+1)]; initializeArray(wfnt, 2*p.NEQ*(p.numOutputSteps+1), 0.0); // normalize summ = 0.0; for (int ii = 0; ii < p.NEQ; ii++) { summ += pow(wavefunction[ii],2) + pow(wavefunction[ii+p.NEQ],2); } #ifdef DEBUG std::cout << "Before normalization, the total population is " << summ << std::endl; #endif summ = 1.0/sqrt(summ); for (int ii = 0; ii < 2*p.NEQ; ii++) { wavefunction[ii] *= summ; } // check total population summ = 0.0; for (int ii = 0; ii < p.NEQ; ii++) { summ += pow(wavefunction[ii],2) + pow(wavefunction[ii+p.NEQ],2); } #ifdef DEBUG std::cout << "After normalization, the total population is " << summ << std::endl; #endif if (fabs(summ - 1.0) > 1e-12) { std::cerr << "WARNING: wavefunction not normalized! Total density is " << summ << std::endl; } // Add initial wavefunction to parameters. 
p.startWfn.resize(2*p.NEQ); memcpy(&(p.startWfn[0]), &(wavefunction[0]), 2*p.NEQ*sizeof(double)); } //// BUILD HAMILTONIAN // //TODO TODO #ifdef DEBUG fprintf(stderr, "Building Hamiltonian.\n"); #endif realtype * H = NULL; H = new realtype [p.NEQ2]; for (int ii = 0; ii < p.NEQ2; ii++) { H[ii] = 0.0; } buildHamiltonian(H, p.energies, V, &p); // add Hamiltonian to p p.H.resize(p.NEQ2); for (int ii = 0; ii < p.NEQ2; ii++) { p.H[ii] = H[ii]; } // create sparse version of H p.H_sp.resize(p.NEQ2); p.H_cols.resize(p.NEQ2); p.H_rowind.resize(p.NEQ2 + 1); int job [6] = {0, 0, 0, 2, p.NEQ2, 1}; int info = 0; mkl_ddnscsr(&job[0], &(p.NEQ), &(p.NEQ), &(p.H)[0], &(p.NEQ), &(p.H_sp)[0], &(p.H_cols)[0], &(p.H_rowind)[0], &info); //// SET UP CVODE VARIABLES #ifdef DEBUG std::cout << "\nCreating N_Vectors.\n"; if (p.wavefunction) { std::cout << "\nProblem size is " << 2*p.NEQ << " elements.\n"; } else { std::cout << "\nProblem size is " << 2*p.NEQ2 << " elements.\n"; } #endif // Creates N_Vector y with initial populations which will be used by CVode// if (p.wavefunction) { y = N_VMake_Serial(2*p.NEQ, wavefunction); } else { y = N_VMake_Serial(2*p.NEQ2, dm); } // put in t = 0 information if (! p.wavefunction) { updateDM(y, dmt, 0, &p); } else { updateWfn(y, wfnt, 0, &p); } // the vector yout has the same dimensions as y yout = N_VClone(y); #ifdef DEBUG realImaginary = fopen("real_imaginary.out", "w"); #endif // Make plot files makePlots(outs, &p); // only do propagation if not just making plots if (! p.justPlots) { // Make outputs independent of time propagation computeGeneralOutputs(outs, &p); // create CVode object // this is a stiff problem, I guess? 
#ifdef DEBUG std::cout << "\nCreating cvode_mem object.\n"; #endif cvode_mem = CVodeCreate(CV_BDF, CV_NEWTON); flag = CVodeSetUserData(cvode_mem, (void *) &p); #ifdef DEBUG std::cout << "\nInitializing CVode solver.\n"; #endif // initialize CVode solver // if (p.wavefunction) { //flag = CVodeInit(cvode_mem, &RHS_WFN, t0, y); flag = CVodeInit(cvode_mem, &RHS_WFN_SPARSE, t0, y); } else { if (p.kinetic) { flag = CVodeInit(cvode_mem, &RHS_DM_RELAX, t0, y); } else if (p.rta) { flag = CVodeInit(cvode_mem, &RHS_DM_RTA, t0, y); //flag = CVodeInit(cvode_mem, &RHS_DM_RTA_BLAS, t0, y); } else if (p.dephasing) { flag = CVodeInit(cvode_mem, &RHS_DM_dephasing, t0, y); } else { //flag = CVodeInit(cvode_mem, &RHS_DM, t0, y); flag = CVodeInit(cvode_mem, &RHS_DM_BLAS, t0, y); } } #ifdef DEBUG std::cout << "\nSpecifying integration tolerances.\n"; #endif // specify integration tolerances // flag = CVodeSStolerances(cvode_mem, p.reltol, p.abstol); #ifdef DEBUG std::cout << "\nAttaching linear solver module.\n"; #endif // attach linear solver module // if (p.wavefunction) { flag = CVDense(cvode_mem, 2*p.NEQ); } else { // Diagonal approximation to the Jacobian saves memory for large systems flag = CVDiag(cvode_mem); } //// CVODE TIME PROPAGATION #ifdef DEBUG std::cout << "\nAdvancing the solution in time.\n"; #endif for (int ii = 1; ii <= p.numsteps; ii++) { t = (p.tout*((double) ii)/((double) p.numsteps)); flag = CVode(cvode_mem, t, yout, &tret, 1); #ifdef DEBUGf std::cout << std::endl << "CVode flag at step " << ii << ": " << flag << std::endl; #endif if ((ii % (p.numsteps/p.numOutputSteps) == 0) || (ii == p.numsteps)) { // show progress in stdout if (p.progressStdout) { fprintf(stdout, "\r%-.2lf percent done", ((double)ii/((double)p.numsteps))*100); fflush(stdout); } // show progress in a file if (p.progressFile) { std::ofstream progressFile("progress.tmp"); progressFile << ((double)ii/((double)p.numsteps))*100 << " percent done." 
<< std::endl; progressFile.close(); } if (p.wavefunction) { updateWfn(yout, wfnt, ii*p.numOutputSteps/p.numsteps, &p); } else { updateDM(yout, dmt, ii*p.numOutputSteps/p.numsteps, &p); } } } #ifdef DEBUG fclose(realImaginary); #endif //// MAKE FINAL OUTPUTS // finalize log file // time(&endRun); currentTime = localtime(&endRun); if (isOutput(outs, "log.out")) { fprintf(log, "Final status of 'flag' variable: %d\n\n", flag); fprintf(log, "Run ended at %s\n", asctime(currentTime)); fprintf(log, "Run took %.3g seconds.\n", difftime(endRun, startRun)); fclose(log); // note that the log file is opened after variable declaration } if (p.progressStdout) { printf("\nRun took %.3g seconds.\n", difftime(endRun, startRun)); } // Compute density outputs. #ifdef DEBUG std::cout << "Computing outputs..." << std::endl; #endif if (p.wavefunction) { computeWfnOutput(wfnt, outs, &p); } else { computeDMOutput(dmt, outs, &p); } #ifdef DEBUG std::cout << "done computing outputs" << std::endl; #endif // do analytical propagation if (p.analytical && (! 
p.bridge_on)) { computeAnalyticOutputs(outs, &p); } } //// CLEAN UP #ifdef DEBUG fprintf(stdout, "Deallocating N_Vectors.\n"); #endif // deallocate memory for N_Vectors // N_VDestroy_Serial(y); N_VDestroy_Serial(yout); #ifdef DEBUG fprintf(stdout, "Freeing CVode memory.\n"); #endif // free solver memory // CVodeFree(&cvode_mem); #ifdef DEBUG fprintf(stdout, "Freeing memory in main.\n"); #endif // delete all these guys delete [] tkprob; delete [] tlprob; delete [] tcprob; delete [] tbprob; for (int ii = 0; ii <= p.numOutputSteps; ii++) { delete [] allprob[ii]; } delete [] allprob; delete [] k_pops; delete [] c_pops; delete [] b_pops; delete [] l_pops; if (p.bridge_on) { delete [] Vbridge; } else { delete [] Vnobridge; } delete [] k_energies; delete [] c_energies; delete [] b_energies; delete [] l_energies; delete [] wavefunction; delete [] H; for (int ii = 0; ii < p.NEQ; ii++) { delete [] V[ii]; } delete [] V; if (p.wavefunction) { delete [] wfnt; } else { delete [] dm; delete [] dmt; } delete [] times; delete [] qd_est; delete [] qd_est_diag; std::cout << "whoo" << std::endl; return 0; }
int main(int argc, char const *argv[]) { Eigen::setNbThreads(NumCores); #ifdef MKL mkl_set_num_threads(NumCores); #endif INFO("Eigen3 uses " << Eigen::nbThreads() << " threads."); int L; RealType J12ratio; int OBC; int N1, N2; int dynamics, Tsteps; RealType Uin, phi, dt; std::vector<RealType> Vin; LoadParameters( "conf.h5", L, J12ratio, OBC, N1, N2, Uin, Vin, phi, dynamics, Tsteps, dt); HDF5IO *file = new HDF5IO("FSSH.h5"); // const int L = 5; // const bool OBC = true; // const RealType J12ratio = 0.010e0; INFO("Build Lattice - "); std::vector<DT> J; if ( OBC ){ // J = std::vector<DT>(L - 1, 0.0);// NOTE: Atomic limit testing J = std::vector<DT>(L - 1, 1.0); if ( J12ratio > 1.0e0 ) { for (size_t cnt = 1; cnt < L-1; cnt+=2) { J.at(cnt) /= J12ratio; } } else{ for (size_t cnt = 0; cnt < L-1; cnt+=2) { J.at(cnt) *= J12ratio; } } } else{ J = std::vector<DT>(L, 1.0); if ( J12ratio > 1.0e0 ) { for (size_t cnt = 1; cnt < L; cnt+=2) { J.at(cnt) /= J12ratio; } } else{ for (size_t cnt = 0; cnt < L; cnt+=2) { J.at(cnt) *= J12ratio; } } #ifndef DTYPE if ( std::abs(phi) > 1.0e-10 ){ J.at(L-1) *= exp( DT(0.0, 1.0) * phi ); } #endif } for ( auto &val : J ){ INFO_NONEWLINE(val << " "); } INFO(""); const std::vector< Node<DT>* > lattice = NN_1D_Chain(L, J, OBC); file->saveNumber("1DChain", "L", L); file->saveStdVector("1DChain", "J", J); for ( auto < : lattice ){ if ( !(lt->VerifySite()) ) RUNTIME_ERROR("Wrong lattice setup!"); } INFO("DONE!"); INFO("Build Basis - "); // int N1 = (L+1)/2; Basis F1(L, N1, true); F1.Fermion(); std::vector<int> st1 = F1.getFStates(); std::vector<size_t> tg1 = F1.getFTags(); // for (size_t cnt = 0; cnt < st1.size(); cnt++) { // INFO_NONEWLINE( std::setw(3) << st1.at(cnt) << " - "); // F1.printFermionBasis(st1.at(cnt)); // INFO("- " << tg1.at(st1.at(cnt))); // } // int N2 = (L-1)/2; Basis F2(L, N2, true); F2.Fermion(); std::vector<int> st2 = F2.getFStates(); std::vector<size_t> tg2 = F2.getFTags(); // for (size_t cnt = 0; cnt < st2.size(); cnt++) { // 
INFO_NONEWLINE( std::setw(3) << st2.at(cnt) << " - "); // F2.printFermionBasis(st2.at(cnt)); // INFO("- " << tg2.at(st2.at(cnt))); // } file->saveNumber("Basis", "N1", N1); file->saveStdVector("Basis", "F1States", st1); file->saveStdVector("Basis", "F1Tags", tg1); file->saveNumber("Basis", "N2", N2); file->saveStdVector("Basis", "F2States", st2); file->saveStdVector("Basis", "F2Tags", tg2); INFO("DONE!"); INFO_NONEWLINE("Build Hamiltonian - "); std::vector<Basis> Bases; Bases.push_back(F1); Bases.push_back(F2); Hamiltonian<DT> ham( Bases ); std::vector< std::vector<DT> > Vloc; std::vector<DT> Vtmp;//(L, 1.0); for ( RealType &val : Vin ){ Vtmp.push_back((DT)val); } Vloc.push_back(Vtmp); Vloc.push_back(Vtmp); std::vector< std::vector<DT> > Uloc; // std::vector<DT> Utmp(L, DT(10.0e0, 0.0e0) ); std::vector<DT> Utmp(L, (DT)Uin); Uloc.push_back(Utmp); Uloc.push_back(Utmp); ham.BuildLocalHamiltonian(Vloc, Uloc, Bases); INFO(" - BuildLocalHamiltonian DONE!"); ham.BuildHoppingHamiltonian(Bases, lattice); INFO(" - BuildHoppingHamiltonian DONE!"); ham.BuildTotalHamiltonian(); INFO("DONE!"); INFO_NONEWLINE("Diagonalize Hamiltonian - "); std::vector<RealType> Val; Hamiltonian<DT>::VectorType Vec; ham.eigh(Val, Vec); INFO("GS energy = " << Val.at(0)); file->saveVector("GS", "EVec", Vec); file->saveStdVector("GS", "EVal", Val); INFO("DONE!"); std::vector< DTV > Nfi = Ni( Bases, Vec, ham ); INFO(" Up Spin - "); INFO(Nfi.at(0)); INFO(" Down Spin - "); INFO(Nfi.at(1)); INFO(" N_i - "); DTV Niall = Nfi.at(0) + Nfi.at(1); INFO(Niall); DTM Nud = NupNdn( Bases, Vec, ham ); INFO(" Correlation NupNdn"); INFO(Nud); DTM Nuu = NupNup( Bases, Vec, ham ); INFO(" Correlation NupNup"); INFO(Nuu); DTM Ndd = NdnNdn( Bases, Vec, ham ); INFO(" Correlation NdnNdn"); INFO(Ndd); INFO(" N_i^2 - "); DTM Ni2 = Nuu.diagonal() + Ndd.diagonal() + 2.0e0 * Nud.diagonal(); INFO(Ni2); file->saveVector("Obs", "Nup", Nfi.at(0)); file->saveVector("Obs", "Ndn", Nfi.at(1)); file->saveMatrix("Obs", "NupNdn", Nud); 
file->saveMatrix("Obs", "NupNup", Nuu); file->saveMatrix("Obs", "NdnNdn", Ndd); delete file; if ( dynamics ){ ComplexType Prefactor = ComplexType(0.0, -1.0e0*dt);/* NOTE: hbar = 1 */ std::cout << "Begin dynamics......" << std::endl; std::cout << "Cut the boundary." << std::endl; J.pop_back(); std::vector< Node<DT>* > lattice2 = NN_1D_Chain(L, J, true);// cut to open ham.BuildHoppingHamiltonian(Bases, lattice2); INFO(" - Update Hopping Hamiltonian DONE!"); ham.BuildTotalHamiltonian(); INFO(" - Update Total Hamiltonian DONE!"); for (size_t cntT = 1; cntT <= Tsteps; cntT++) { ham.expH(Prefactor, Vec); if ( cntT % 2 == 0 ){ HDF5IO file2("DYN.h5"); std::string gname = "Obs-"; gname.append(std::to_string((unsigned long long)cntT)); gname.append("/"); Nfi = Ni( Bases, Vec, ham ); Nud = NupNdn( Bases, Vec, ham ); Nuu = NupNup( Bases, Vec, ham ); Ndd = NdnNdn( Bases, Vec, ham ); file2.saveVector(gname, "Nup", Nfi.at(0)); file2.saveVector(gname, "Ndn", Nfi.at(1)); file2.saveMatrix(gname, "NupNdn", Nud); file2.saveMatrix(gname, "NupNup", Nuu); file2.saveMatrix(gname, "NdnNdn", Ndd); } } } return 0; }
extern "C" void plasma_set_num_threads(int num_threads) { //plasma_setlapack_numthreads(num_threads); mkl_set_num_threads(num_threads); }
/**
 * Exported C-linkage wrapper that hands the given thread count to
 * mkl_set_num_threads().
 *
 * @param num_threads value passed on unchanged.
 */
extern "C" __declspec(dllexport) void set_max_threads(const MKL_INT num_threads)
{
    const MKL_INT requested_threads = num_threads;
    mkl_set_num_threads(requested_threads);
}