int main() { double *A, *B, *C; int i,j,r,max_threads,size; double alpha, beta; double s_initial, s_elapsed; printf("Intializing data for matrix multiplication C=A*B for matrix\n\n" " A(%i*%i) and matrix B(%i*%i)\n",M,P,P,N); alpha = 1.0; beta = 0.0; printf("Allocating memory for matrices aligned on 64-byte boundary for better performance \n\n"); A = ( double *)mkl_malloc(M*P*sizeof( double ),64); B = ( double *)mkl_malloc(N*P*sizeof( double ),64); C = ( double *)mkl_malloc(M*N*sizeof( double ),64); if (A == NULL || B == NULL || C == NULL) { printf("Error: can`t allocate memory for matrices.\n\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 1; } printf("Intializing matrix data\n\n"); size = M*P; for (i = 0; i < size; ++i) { A[i] = ( double )(i+1); } size = N*P; for (i = 0; i < size; ++i) { B[i] = ( double )(i-1); } printf("Finding max number of threads can use for parallel runs \n\n"); max_threads = mkl_get_max_threads(); printf("Running from 1 to %i threads \n\n",max_threads); for (i = 1; i <= max_threads; ++i) { size = M*N; for (j = 0; j < size; ++j) { C[j] = 0.0; } printf("Requesting to use %i threads \n\n",i); mkl_set_num_threads(i); printf("Measuring performance of matrix product using dgemm function\n" " via CBLAS interface on %i threads \n\n",i); s_initial = dsecnd(); for (r = 0; r < LOOP_COUNT; ++r) { cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, P, alpha, A, P, B, N, beta, C, N); // multiply matrices with cblas_dgemm; } s_elapsed = (dsecnd() - s_initial) / LOOP_COUNT; printf("Matrix multiplication using dgemm completed \n" " at %.5f milliseconds using %d threads \n\n", (s_elapsed * 1000),i); printf("Output the result: \n"); size = M*N; for (i = 0; i < size; ++i) { printf("%i\t",(int)C[i]); if (i % N == N - 1) printf("\n"); } } printf("Dellocating memory\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 0; }
/*
 * Aligns the OpenMP thread count with MKL's.
 *
 * Only has an effect when built with both OpenMP and the MKL BLAS backend;
 * otherwise the function is a no-op.
 */
TH_API void THInferNumThreads(void)
{
#if defined(_OPENMP) && defined(TH_BLAS_MKL)
  /*
   * When both MKL and OpenMP are in use their thread counts must match.
   * Otherwise MKL and our OpenMP-enabled functions keep resizing the OpenMP
   * thread pool, which hurts performance (and leaks memory in GCC 5.4).
   */
  const int mkl_thread_count = mkl_get_max_threads();
  omp_set_num_threads(mkl_thread_count);
#endif
}
// -------------------- // Print the available GPU devices. Used in testing. extern "C" void magma_print_environment() { magma_int_t major, minor, micro; magma_version( &major, &minor, µ ); printf( "%% clMAGMA %d.%d.%d %s\n", (int) major, (int) minor, (int) micro, MAGMA_VERSION_STAGE ); // CUDA, OpenCL, OpenMP, MKL, ACML versions all printed on same line char device_name[1024], driver[1024]; clGetPlatformInfo( g_runtime.get_platform(), CL_PLATFORM_VERSION, sizeof(device_name), device_name, NULL ); printf( "%% OpenCL platform %s.", device_name ); #if defined(_OPENMP) int omp_threads = 0; #pragma omp parallel { omp_threads = omp_get_num_threads(); } printf( " OpenMP threads %d.", omp_threads ); #else printf( " MAGMA not compiled with OpenMP." ); #endif #if defined(MAGMA_WITH_MKL) MKLVersion mkl_version; mkl_get_version( &mkl_version ); printf( " MKL %d.%d.%d, MKL threads %d.", mkl_version.MajorVersion, mkl_version.MinorVersion, mkl_version.UpdateVersion, mkl_get_max_threads() ); #endif #if defined(MAGMA_WITH_ACML) int acml_major, acml_minor, acml_patch; acmlversion( &acml_major, &acml_minor, &acml_patch ); printf( " ACML %d.%d.%d.", acml_major, acml_minor, acml_patch ); #endif printf( "\n" ); // print devices int ndevices = g_runtime.get_num_devices(); cl_device_id* devices = g_runtime.get_devices(); cl_ulong mem_size, alloc_size; for( int dev=0; dev < ndevices; ++dev ) { clGetDeviceInfo( devices[dev], CL_DEVICE_NAME, sizeof(device_name), device_name, NULL ); clGetDeviceInfo( devices[dev], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL ); clGetDeviceInfo( devices[dev], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(alloc_size), &alloc_size, NULL ); clGetDeviceInfo( devices[dev], CL_DRIVER_VERSION, sizeof(driver), driver, NULL ); printf( "%% Device: %s, %.1f MiB memory, max allocation %.1f MiB, driver %s\n", device_name, mem_size/(1024.*1024.), alloc_size/(1024.*1024.), driver ); } }
// Reports how many threads the host LAPACK/BLAS is expected to use.
//
// With MKL the value comes from mkl_get_max_threads(); otherwise, when built
// with OpenMP, it is the team size observed inside a parallel region; with
// neither, the result is 1.
magma_int_t magma_get_lapack_numthreads()
{
    magma_int_t lapack_threads = 1;

#if defined(MAGMA_WITH_MKL)
    lapack_threads = mkl_get_max_threads();
#elif defined(_OPENMP)
    // omp_get_num_threads() must be queried from inside a parallel region.
    #pragma omp parallel
    {
        lapack_threads = omp_get_num_threads();
    }
#endif

    return lapack_threads;
}
// Records the current thread count of each threading backend that is compiled
// in, then forces that backend down to a single thread. The recorded counts
// default to 1 for backends that are not compiled in.
DisableThreadingInBlock::DisableThreadingInBlock()
    : mklNumThreads(1),
      ompNumThreads(1),
      openblasNumThreads(1)
{
#ifdef HAVE_MKL_H
    // MKL: remember the maximum, then serialise.
    mklNumThreads = mkl_get_max_threads();
    mkl_set_num_threads(1);
#endif

#if defined(_OPENMP)
    // OpenMP: remember the maximum, then serialise.
    ompNumThreads = omp_get_max_threads();
    omp_set_num_threads(1);
#endif

#if defined(OPENBLAS_DISABLE_THREADS)
    // OpenBLAS: remember the processor count it reports, then serialise.
    openblasNumThreads = goto_get_num_procs();
    openblas_set_num_threads(1);
#endif

    // Silence compiler warnings about unused private members
    (void) mklNumThreads;
    (void) ompNumThreads;
    (void) openblasNumThreads;
}
/*
 * Configures the thread-count globals and runs the divergence-based update
 * (update_div) on X with factors W and H.
 *
 * threads:  requested thread count. Any non-positive value, or a value above
 *           omp_get_max_threads(), selects the OpenMP and MKL maxima instead.
 * max_iter: iteration limit forwarded to update_div.
 * verbose:  verbosity flag forwarded to update_div.
 *
 * Always returns 0.
 */
int run_nmf(matrix X, matrix W, matrix H, int threads, int max_iter, int verbose)
{
    const int omp_max = omp_get_max_threads();

    /*
     * A negative request is never meaningful; treat it like 0 ("use the
     * maximum") instead of copying it into the thread-count globals.
     */
    if (threads <= 0 || threads > omp_max) {
        omp_threads = omp_max;
        mkl_threads = mkl_get_max_threads();
    } else {
        omp_threads = threads;
        mkl_threads = threads;
    }

    /* Every per-kernel thread setting follows the OpenMP thread count. */
    eps_threads = omp_threads;
    vecdiv_threads = omp_threads;
    vecmult_threads = omp_threads;
    sumrows_threads = omp_threads;
    sumcols_threads = omp_threads;
    coldiv_threads = omp_threads;
    rowdiv_threads = omp_threads;
    check_threads = omp_threads;

    double timers[TIMERS];
    int i;
    for (i = 0; i < TIMERS; i++)
        timers[i] = 0;

    update_div(W, H, X, CONVERGE_THRESH, max_iter, timers, verbose);

    return 0;
}
// Creates a dissection solver handle and stores it in `dslv_`.
//
//   dslv_           : receives the address of the new dissection_solver_ptr,
//                     cast to uint64_t.
//   called          : caller-supplied counter; stored in the handle and used
//                     in the log file name.
//   real_or_complex : DISSECTION_REAL_MATRIX or DISSECTION_COMPLEX_MATRIX;
//                     selects which solver object is constructed. Any other
//                     value constructs no solver (reported when verbose).
//   nthreads        : > 0 uses that many threads; -1 reads the NTHREADS
//                     environment variable; every other case uses 1 thread.
//   verbose         : > 0 enables logging to "dissection.<pid>.<called>.log";
//                     otherwise the handle logs to stderr.
DISSECTION_API void DISS_INIT(uint64_t &dslv_,
                              const int &called,
                              const int &real_or_complex,
                              const int &nthreads,
                              const int &verbose)
{
  // Start from a defined value: previously num_threads stayed uninitialised
  // when nthreads was 0 or below -1.
  int num_threads = 1;
  dissection_solver_ptr *dslv;

  dslv_ = (uint64_t)new dissection_solver_ptr;
  dslv = (dissection_solver_ptr *)dslv_;
  dslv->real_or_complex = real_or_complex;
  dslv->quad_fact = false;
  dslv->called = called;
  dslv->symbolic = 0;
  dslv->numeric = 0;
  dslv->verbose = (verbose > 0);

  // Log destination: a per-process, per-call file when verbose, else stderr.
  dslv->fp = stderr;
  if (dslv->verbose) {
    int pid = (int)getpid();
    char fname[256];  // far larger than the longest name the format can yield
    fprintf(stderr, "pid = %d\n", pid);
    sprintf(fname, "dissection.%04d.%04d.log", pid, called);
    FILE *logfp = fopen(fname, "a");
    if (logfp != NULL) {
      dslv->fp = logfp;
    }
    else {
      // Keep logging usable instead of handing a NULL stream to fprintf.
      fprintf(stderr, "%s %d : can not open %s, logging to stderr\n",
              __FILE__, __LINE__, fname);
    }
  }
  if (dslv->verbose) {
    fprintf(dslv->fp,
            "%s %d : diss_init : called = %d\n",
            __FILE__, __LINE__, dslv->called);
  }

#ifdef BLAS_MKL
  {
    // Prefer MKL_NUM_THREADS when it parses as an integer; otherwise ask MKL.
    const char *mkl_env = getenv("MKL_NUM_THREADS");
    if (mkl_env != NULL &&
        sscanf(mkl_env, "%d", &dslv->mkl_num_threads) == 1) {
      if (dslv->verbose) {
        fprintf(dslv->fp,
                "environmental variable MKL_NUM_THREADS = %d\n",
                dslv->mkl_num_threads);
      }
    }
    else {
      dslv->mkl_num_threads = mkl_get_max_threads();
    }
    if (dslv->verbose) {
      fprintf(dslv->fp,
              "MKL_NUM_THREADS = %d\n", dslv->mkl_num_threads);
    }
  }
#endif

  if (nthreads > 0) {
    num_threads = nthreads;
  }
  else if (nthreads == (-1)) {
    // Fall back to a single thread when NTHREADS is missing, unparsable or
    // not positive.
    const char *nthreads_env = getenv("NTHREADS");
    int parsed = 0;
    if (nthreads_env != NULL &&
        sscanf(nthreads_env, "%d", &parsed) == 1 && parsed > 0) {
      num_threads = parsed;
    }
  }

  switch(real_or_complex) {
  case DISSECTION_REAL_MATRIX:
    dslv->rptr = new DissectionSolver<double>(num_threads,
                                              verbose != 0,
                                              dslv->called, dslv->fp);
    break;
  case DISSECTION_COMPLEX_MATRIX:
    dslv->cptr = new DissectionSolver<complex<double>, double>(num_threads,
                                                               verbose != 0,
                                                               dslv->called,
                                                               dslv->fp);
    break;
  default:
    if (dslv->verbose) {
      fprintf(dslv->fp,
              "%s %d : unknown matrix data type : %d\n",
              __FILE__, __LINE__, dslv->real_or_complex);
    }
  }
}
// X: a MxD matrix, Y: a M vector, W: a M vector // W0: a M vector int main(int argc, char ** argv){ if (argc>1 && argv[1][0]=='h') { printf ("Usage: parSymSGD M D T C lamda r\n"); printf (" M: number of data points, D: dimensions, T: time iterations, C: cores;\n"); printf (" lamda: learning rate, r: panel size in unit of C.\n"); return 1; }u // read in the arguments: M, D, I (time iterations), C (cores), r (each panel contains r*C points) int M = argc>1?atoi(argv[1]):32; int D = argc>2?atoi(argv[2]):4; T = argc>3?atoi(argv[3]):10; int C = argc>4?atoi(argv[4]):4; float lamda = argc>5?atof(argv[5]):0.01; int r = argc>6?atoi(argv[6]):1; ///printf("M=%d, D=%d, T=%d, C=%d, lamda=%8.6f, r=%d\n",M,D,T,C,lamda,r); int max_threads = mkl_get_max_threads(); // get the max number of threads int rep; mkl_set_num_threads(1); // set the number of threads to use by mkl panelSz = C*r; panels = M/panelSz; int i,j,k,p,t; float *Y, *Wreal, *W, *X; Y = (float *) mkl_malloc(M*sizeof(float),PAGESIZE); Wreal = (float *) mkl_malloc(D*sizeof(float),PAGESIZE); W = (float *) mkl_malloc(D*sizeof(float),PAGESIZE); X = (float *) mkl_malloc(M*D*sizeof(float),PAGESIZE); float *Ypred = (float*)mkl_malloc(M*sizeof(float),PAGESIZE); float *Ytmp = (float*)mkl_malloc(M*sizeof(float),PAGESIZE); float *I = (float*)mkl_malloc(D*D*sizeof(float),PAGESIZE); float *Z = (float*)mkl_malloc(M*D*sizeof(float),PAGESIZE); float *B = (float*)mkl_malloc(panels*D*sizeof(float),PAGESIZE); if (Y==NULL | Wreal==NULL | W==NULL | X==NULL | Ypred==NULL || Ytmp==NULL || Z==NULL || B==NULL || I== NULL){ printf("Memory allocation error.\n"); return 2; } initData(Wreal,W,X,Y, M, D,I); ///printf("panelSz=%d, panels=%d\n", panelSz, panels); for (nt=1; nt<=max_threads && nt<=panelSz; nt*=2){ omp_set_num_threads(nt);// set the number of openMP threads for (rep=0; rep<REPEATS; rep++){//repeat measurements double prepTime, gdTime, sInit; // preprocessing sInit=dsecnd(); //preprocessSeq(X, Y, Z, B, panelSz, panels, M, D, lamda); 
preprocessPar(X, Y, Z, B, panelSz, panels, M, D, lamda); prepTime = (dsecnd() - sInit); ///dump2("Z",Z,M,D); ///dump2("B",B,panels,D); // GD initW(W,D); ///dump1("W (initial)", W, D); sInit=dsecnd(); float err; float fixpoint = 0.0; for (t=0;t<T;t++){ for (p=0;p<panels;p++){ gd(&(X[p*panelSz*D]),&(Z[p*panelSz*D]), &(B[p*D]), panelSz, D, lamda, W, I); ///printf("(t=%d, p=%d) ",t,p); ///dump1("W", W, D); ///err=calErr(X, Ypred, Ytmp, Y, W, M, D); printf("finish one panels ............................ \n"); } } gdTime = (dsecnd() - sInit); err=calErr(X, Ypred, Ytmp, Y, W, M, D); fixpoint = err - prev_err; // print final err. time is in milliseconds printf("nt=%d\t ttlTime=%.5f\t prepTime=%.5f\t gdTime=%.5f\t error=%.5f\n", nt, (gdTime+prepTime)*1000, prepTime*1000, gdTime*1000, err); } } if (B) mkl_free(B); if (Z) mkl_free(Z); if (Ytmp) mkl_free(Ytmp); if (Ypred) mkl_free(Ypred); if (Y) mkl_free(Y); if (Wreal) mkl_free(Wreal); if (W) mkl_free(W); if (X) mkl_free(X); if (I) mkl_free(I); return 0; }
void StVKReducedStiffnessMatrix::Evaluate(double * q, double * Rq) { // this is same as EvaluateSubset with start=0, end=quadraticSize /* int i,j,k; int output; // reset to free terms int index = 0; int indexEntry = 0; for(output=0; output<r; output++) { for(i=output; i<r; i++) { Rq[indexEntry] = freeCoef_[index]; index++; indexEntry++; } indexEntry += output + 1; } // add linear terms index = 0; indexEntry = 0; for(output=0; output<r; output++) { for(i=output; i<r; i++) { for(j=0; j<r; j++) { Rq[indexEntry] += linearCoef_[index] * q[j]; index++; } indexEntry++; } indexEntry += output + 1; } // add quadratic terms index = 0; indexEntry = 0; for(output=0; output<r; output++) { for(i=output; i<r; i++) { for(j=0; j<r; j++) for(k=j; k<r; k++) { Rq[indexEntry] += quadraticCoef_[index] * q[j] * q[k]; index++; } indexEntry++; } indexEntry += output + 1; } // make symetric for(output=0; output<r; output++) for(i=0; i<output; i++) Rq[ELT(r,i,output)] = Rq[ELT(r,output,i)]; */ if (useSingleThread) { #if defined(WIN32) || defined(linux) mkl_max_threads = mkl_get_max_threads(); mkl_dynamic = mkl_get_dynamic(); mkl_set_num_threads(1); mkl_set_dynamic(0); #elif defined(__APPLE__) //setenv("VECLIB_MAXIMUM_THREADS", "1", true); #endif } // reset to free terms memcpy(buffer1,freeCoef_,sizeof(double)*quadraticSize); // add linear terms // multiply linearCoef_ and q // linearCoef_ is r x quadraticSize array cblas_dgemv(CblasColMajor, CblasTrans, r, quadraticSize, 1.0, linearCoef_, r, q, 1, 1.0, buffer1, 1); // compute qiqj int index = 0; for(int output=0; output<r; output++) for(int i=output; i<r; i++) { qiqj[index] = q[output] * q[i]; index++; } // update Rq // quadraticCoef_ is quadraticSize x quadraticSize matrix // each column gives quadratic coef for one matrix entry cblas_dgemv(CblasColMajor, CblasTrans, quadraticSize, quadraticSize, 1.0, quadraticCoef_, quadraticSize, qiqj, 1, 1.0, buffer1, 1); // unpack into a symmetric matrix int i1=0,j1=0; for(int i=0; i< quadraticSize; 
i++) { Rq[ELT(r,i1,j1)] = buffer1[i]; Rq[ELT(r,j1,i1)] = buffer1[i]; j1++; if(j1 == r) { i1++; j1 = i1; } } if (useSingleThread) { #if defined(WIN32) || defined(linux) mkl_set_num_threads(mkl_max_threads); mkl_set_dynamic(mkl_dynamic); #elif defined(__APPLE__) //unsetenv("VECLIB_MAXIMUM_THREADS"); #endif } }
int main(int argc, char** argv) { int maxnumit = 0; int maxrec = -1; const char *budget_type_str = NULL; const char *stage = NULL; const char *training = NULL; const char *dev = NULL; const char *path = NULL; const char * etransform_str = NULL; const char *kernel_str = NULL; const char *rbf_lambda_str = NULL; #ifdef NDEBUG log_info("ai-parse %s (Release)", VERSION); #else log_info("ai-parse %s (Debug)", VERSION); #endif struct argparse_option options[] = { OPT_HELP(), //OPT_BOOLEAN('f', "force", &force, "force to do", NULL), OPT_INTEGER('v', "verbosity", &verbosity, "Verbosity level. Minimum (Default) 0. Increasing values increase parser verbosity.", NULL), OPT_STRING('o', "modelname", &modelname, "Model name", NULL), OPT_STRING('p', "path", &path, "CoNLL base directory including sections", NULL), OPT_STRING('s', "stage", &stage, "[ optimize | train | parse ]", NULL), OPT_INTEGER('n', "maxnumit", &maxnumit, "Maximum number of iterations by perceptron. Default is 50", NULL), OPT_STRING('t', "training", &training, "Training sections for optimize and train. Apply sections for parse", NULL), OPT_STRING('d', "development", &dev, "Development sections for optimize", NULL), OPT_STRING('e', "epattern", &epattern, "Embedding Patterns", NULL), OPT_INTEGER('l', "edimension", &edimension, "Embedding dimension", NULL), OPT_INTEGER('m', "maxrec", &maxrec, "Maximum number of training instance", NULL), OPT_STRING('x', "etransform", &etransform_str, "Embedding Transformation", NULL), OPT_STRING('k', "kernel", &kernel_str, "Kernel Type", NULL), OPT_INTEGER('a', "bias", &bias, "Polynomial kernel additive term. Default is 1", NULL), OPT_INTEGER('c', "concurrency", &num_parallel_mkl_slaves, "Parallel MKL Slaves. Default is 90% of all machine cores", NULL), OPT_INTEGER('b', "degree", &polynomial_degree, "Degree of polynomial kernel. 
Default is 4", NULL), OPT_STRING('z', "lambda", &rbf_lambda_str, "Lambda multiplier for RBF Kernel.Default value is 0.025"), OPT_STRING('u', "budget_type", &budget_type_str, "Budget control methods. NONE|RANDOM", NULL), OPT_INTEGER('g', "budget_size", &budget_target, "Budget Target for budget based perceptron algorithms. Default 50K", NULL), OPT_END(), }; struct argparse argparse; argparse_init(&argparse, options, usage, 0); argc = argparse_parse(&argparse, argc, argv); int max_threads = mkl_get_max_threads(); log_info("There are max %d MKL threads", max_threads); if (num_parallel_mkl_slaves == -1) { num_parallel_mkl_slaves = (int) (max_threads * 0.9); if (num_parallel_mkl_slaves == 0) num_parallel_mkl_slaves = 1; } log_info("Number of MKL Slaves is set to be %d", num_parallel_mkl_slaves); mkl_set_num_threads(num_parallel_mkl_slaves); if (1 == mkl_get_dynamic()) log_info("Intel MKL may use less than %i threads for a large problem", num_parallel_mkl_slaves); else log_info("Intel MKL should use %i threads for a large problem", num_parallel_mkl_slaves); check(stage != NULL && (strcmp(stage, "optimize") == 0 || strcmp(stage, "train") == 0 || strcmp(stage, "parse") == 0), "Choose one of -s optimize, train, parse"); check(path != NULL, "Specify a ConLL base directory using -p"); check(edimension != 0, "Set embedding dimension using -l"); check(modelname != NULL, "Provide model name using -o"); if (budget_type_str != NULL) { if (strcmp(budget_type_str, "RANDOM") == 0 || strcmp(budget_type_str, "RANDOMIZED") == 0) { budget_method = RANDOMIZED; } else if (strcmp(budget_type_str, "NONE") == 0) { budget_method = NONE; } else { log_err("Unknown budget control type %s", budget_type_str); goto error; } } else { budget_method = NONE; } if (training == NULL) { log_warn("training section string is set to %s", DEFAULT_TRAINING_SECTION_STR); training = strdup(DEFAULT_TRAINING_SECTION_STR); } if (dev == NULL && (strcmp(stage, "optimize") == 0 || strcmp(stage, "train") == 0)) { 
log_info("development section string is set to %s", DEFAULT_DEV_SECTION_STR); dev = strdup(DEFAULT_DEV_SECTION_STR); } check(epattern != NULL, "Embedding pattern is required for -s optimize,train,parse"); if (etransform_str == NULL) { log_info("Embedding transformation is set to be QUADRATIC"); etransform = DEFAULT_EMBEDDING_TRANFORMATION; } else if (strcmp(etransform_str, "LINEAR") == 0) { etransform = LINEAR; } else if (strcmp(etransform_str, "QUADRATIC") == 0) { etransform = QUADRATIC; } else if (strcmp(etransform_str, "CUBIC") == 0) { etransform = CUBIC; } else { log_err("Unsupported transformation type for embedding %s", etransform_str); } if (strcmp(stage, "optimize") == 0 || strcmp(stage, "train") == 0) { if (maxnumit <= 0) { log_info("maxnumit is set to %d", DEFAULT_MAX_NUMIT); maxnumit = DEFAULT_MAX_NUMIT; } } if (kernel_str != NULL) { if (strcmp(kernel_str, "POLYNOMIAL") == 0) { log_info("Polynomial kernel will be used with bias %f and degree %d", bias, polynomial_degree); kernel = KPOLYNOMIAL; } else if (strcmp(kernel_str, "GAUSSIAN") == 0 || strcmp(kernel_str, "RBF") == 0) { if (rbf_lambda_str != NULL) { rbf_lambda = (float) atof(rbf_lambda_str); } log_info("RBF/GAUSSIAN kernel will be used with lambda %f ", rbf_lambda); kernel = KRBF; } else { log_err("Unsupported kernel type %s. 
Valid options are LINEAR, POLYNOMIAL, and RBF/GAUSSIAN", kernel_str); goto error; } } if (strcmp(stage, "optimize") == 0) { void *model = optimize(maxnumit, maxrec, path, training, dev, edimension); char* model_filename = (char*) malloc(sizeof (char) * (strlen(modelname) + 7)); check_mem(model_filename); sprintf(model_filename, "%s.model", modelname); FILE *fp = fopen(model_filename, "w"); if (kernel == KLINEAR) { PerceptronModel pmodel = (PerceptronModel) model; dump_PerceptronModel(fp, edimension, pmodel->embedding_w_best, pmodel->best_numit); PerceptronModel_free(pmodel); } else if (kernel == KPOLYNOMIAL || kernel == KRBF) { KernelPerceptron kpmodel = (KernelPerceptron) model; dump_KernelPerceptronModel(fp, kpmodel); } log_info("Model is dumped into %s file", model_filename); fclose(fp); } else if (strcmp(stage, "parse") == 0) { char* model_filename = (char*) malloc(sizeof (char) * (strlen(modelname) + 7)); check_mem(model_filename); sprintf(model_filename, "%s.model", modelname); FILE *fp = fopen(model_filename, "r"); check(fp != NULL, "%s could not be opened", model_filename); void *model; if (kernel == KLINEAR) model = load_PerceptronModel(fp); else model = load_KernelPerceptronModel(fp); fclose(fp); check(model != NULL, "Error in loading model file"); log_info("Model loaded from %s successfully", model_filename); parseall(model, path, training, edimension); } else { log_info("Waiting for implementation"); } return (EXIT_SUCCESS); error: return (EXIT_FAILURE); }
// Worker loop executed by thread number `thd`.
//
// Until `done` becomes true the thread repeatedly:
//   1. waits (sleeping or yielding) while `jobnr` equals the last job number
//      it has seen completed,
//   2. registers itself on its node's `participate` counter,
//   3. pulls task indices from the node's shared `start_cnt` counter and runs
//      `*func` on each of them,
//   4. deregisters and, if the compare-exchange below succeeds, publishes the
//      finished job number in `complete[mynode]`.
//
// An Exception thrown by a task is copied into `ex` under `copyex_mutex`.
// NOTE(review): this is lock-free coordination code; the statement order is
// presumably significant -- confirm before restructuring.
void TaskManager :: Loop(int thd)
{
  /*
    Disabled timers, kept for profiling:
    static Timer tADD("add entry counter");
    static Timer tCASready1("spin-CAS ready tick1");
    static Timer tCASready2("spin-CAS ready tick2");
    static Timer tCASyield("spin-CAS yield");
    static Timer tCAS1("spin-CAS wait");
    static Timer texit("exit zone");
    static Timer tdec("decrement");
  */

  thread_id = thd;

  int thds = GetNumThreads();

  // Map the thread number proportionally onto the node indices.
  int mynode = num_nodes * thd/thds;

  NodeData & mynode_data = *(nodedata[mynode]);

  // Task descriptor handed to every task this worker runs.
  TaskInfo ti;
  ti.nthreads = thds;
  ti.thread_nr = thd;
  // ti.nnodes = num_nodes;
  // ti.node_nr = mynode;

#ifdef USE_NUMA
  numa_run_on_node (mynode);
#endif
  active_workers++;
  workers_on_node[mynode]++;
  int jobdone = 0;

#ifdef USE_MKL
  // Remember MKL's maximum thread count and limit this thread's MKL calls to
  // one thread; the saved value is passed back at the end of the function.
  auto mkl_max = mkl_get_max_threads();
  mkl_set_num_threads_local(1);
#endif

  while (!done)
    {
      // Catch up with a job number already published for this node.
      if (complete[mynode] > jobdone)
        jobdone = complete[mynode];

      // Nothing new to do: back off and poll again.
      if (jobnr == jobdone)
        {
          // RegionTracer t(ti.thread_nr, tCASyield, ti.task_nr);
          if(sleep)
            this_thread::sleep_for(chrono::microseconds(sleep_usecs));
          else
            {
#ifdef WIN32
              this_thread::yield();
#else // WIN32
              sched_yield();
#endif // WIN32
            }
          continue;
        }

      {
        // RegionTracer t(ti.thread_nr, tADD, ti.task_nr);

        // non-atomic fast check ...
        // The low bit of `participate` is tested as the "job active" gate.
        if ( (mynode_data.participate & 1) == 0) continue;

        // Register as a participant in steps of 2 so the low bit is left
        // untouched. Despite its name, `oldval` receives the result of the
        // `+=` expression.
        int oldval = mynode_data.participate += 2;
        if ( (oldval & 1) == 0)
          { // job not active, going out again
            mynode_data.participate -= 2;
            continue;
          }
      }

      if (startup_function) (*startup_function)();

      // The slice of the task range assigned to this node.
      IntRange mytasks = Range(int(ntasks)).Split (mynode, num_nodes);

      try
        {
          while (1)
            {
              // Cheap check first, then claim the next index from the shared
              // counter; re-check because other workers claim concurrently.
              if (mynode_data.start_cnt >= mytasks.Size()) break;
              int mytask = mynode_data.start_cnt.fetch_add(1, memory_order_relaxed);
              if (mytask >= mytasks.Size()) break;

              ti.task_nr = mytasks.First()+mytask;
              ti.ntasks = ntasks;

              {
                RegionTracer t(ti.thread_nr, jobnr, RegionTracer::ID_JOB, ti.task_nr);
                (*func)(ti);
              }
            }
        }
      catch (Exception e)
        {
          {
            // cout << "got exception in TM" << endl;
            // Store a copy of the exception in `ex` and move the shared
            // counter to the end of the range so the claim loop above breaks.
            lock_guard<mutex> guard(copyex_mutex);
            delete ex;
            ex = new Exception (e);
            mynode_data.start_cnt = mytasks.Size();
          }
        }

#ifndef __MIC__
      atomic_thread_fence (memory_order_release);
#endif // __MIC__

      if (cleanup_function) (*cleanup_function)();

      jobdone = jobnr;

      // Deregister this worker.
      mynode_data.participate-=2;

      {
        // If `participate` is exactly 1 (gate bit set, no participants
        // registered), swap it to 0.
        int oldpart = 1;
        if (mynode_data.participate.compare_exchange_strong (oldpart, 0))
          {
            if (jobdone < jobnr.load())
              { // reopen gate
                mynode_data.participate |= 1;
              }
            else
              {
                // Reset the claim counter (nodes other than 0 only) and
                // publish the finished job number for this node.
                if (mynode != 0)
                  mynode_data.start_cnt = 0;
                complete[mynode] = jobnr.load();
              }
          }
      }
    }

#ifdef USE_MKL
  // Hand the thread count saved on entry back to MKL.
  mkl_set_num_threads_local(mkl_max);
#endif

  workers_on_node[mynode]--;
  active_workers--;
}
void StVKReducedInternalForces::Evaluate(double * q, double * fq) { /* // unoptimized version // reset to zero int i,j,k,l; for(l=0; l<r; l++) fq[l] = 0; // add linear terms int index = 0; for(l=0; l<r; l++) for(i=0; i<r; i++) { fq[l] += linearCoef_[index] * q[i]; index++; } // add quadratic terms index = 0; for(l=0; l<r; l++) for(i=0; i<r; i++) for(j=i; j<r; j++) { fq[l] += quadraticCoef_[index] * q[i] * q[j]; index++; } // add cubic terms index = 0; for(l=0; l<r; l++) for(i=0; i<r; i++) for(j=i; j<r; j++) for(k=j; k<r; k++) { fq[l] += cubicCoef_[index] * q[i] * q[j] * q[k]; index++; } */ if (useSingleThread) { #if defined(_WIN32) || defined(WIN32) || defined(linux) mkl_max_threads = mkl_get_max_threads(); mkl_dynamic = mkl_get_dynamic(); mkl_set_num_threads(1); mkl_set_dynamic(0); #elif defined(__APPLE__) //setenv("VECLIB_MAXIMUM_THREADS", "1", true); #endif } // add linear terms // multiply linearCoef_ and q // linearCoef_ is r x r array cblas_dgemv(CblasColMajor, CblasTrans, r, r, 1.0, linearCoef_, r, q, 1, 0.0, fq, 1); // compute qiqj int index = 0; for(int output=0; output<r; output++) for(int i=output; i<r; i++) { qiqj[index] = q[output] * q[i]; index++; } // add quadratic terms // quadraticCoef_ is quadraticSize x r matrix // each column gives quadratic coef for one force vector component cblas_dgemv(CblasColMajor, CblasTrans, quadraticSize, r, 1.0, quadraticCoef_, quadraticSize, qiqj, 1, 1.0, fq, 1); // add cubic terms // cubicCoef_ is cubicSize x r matrix // each column gives cubicSize coef for one force vector component int size = quadraticSize; double * qiqjPos = qiqj; double * cubicCoefPos = cubicCoef_; for(int i=0; i<r; i++) { cblas_dgemv(CblasColMajor, CblasTrans, size, r, q[i], cubicCoefPos, cubicSize, qiqjPos, 1, 1.0, fq, 1); int param = r-i; size -= param; qiqjPos += param; cubicCoefPos += param * (param+1) / 2; } if (addGravity) { for(int i=0; i<r; i++) fq[i] -= reducedGravityForce[i]; } if (useSingleThread) { #if defined(_WIN32) || 
defined(WIN32) || defined(linux) mkl_set_num_threads(mkl_max_threads); mkl_set_dynamic(mkl_dynamic); #elif defined(__APPLE__) //unsetenv("VECLIB_MAXIMUM_THREADS"); #endif } }
int main(int argc, char* argv[]){ //factor X into W*H matrix A,B,C; int max_iter; if(argc > 1){ if(!strcmp(argv[1],"-h")){ printf("usage: bench matrix_dim1[100] matrix_dim2[100] matrix_dim3[100] iterations[100] trials[10] mkl_threads[#procs]\n"); exit(0); } } if (argc > 3){ matrix_dim[0] = atoi(argv[1]); matrix_dim[1] = atoi(argv[2]); matrix_dim[2] = atoi(argv[3]); } else { matrix_dim[0] = MAT_DIM; matrix_dim[1] = MAT_DIM; matrix_dim[2] = MAT_DIM; } int num_trials; if(argc>5) num_trials = atoi(argv[5]); else num_trials = TRIALS; int verbose = 0; if (num_trials<10) verbose = 1; mkl_threads = mkl_get_max_threads(); if (argc>6){ if (atoi(argv[6]) < mkl_threads) mkl_threads = atoi(argv[6]); } if (argc>4) max_iter = atoi(argv[4]); else max_iter = MAX_ITER; printf("mkl_threads: \t\t%i\n",mkl_threads); printf("matrix_dims: \t\t%i,%i,%i\n",matrix_dim[0],matrix_dim[1],matrix_dim[2]); create_matrix(&A, matrix_dim[0], matrix_dim[1], 1); create_matrix(&B, matrix_dim[1], matrix_dim[2], 1); create_matrix(&C, matrix_dim[0], matrix_dim[2], 1); double t_min = 1E9; double t = 0; int trial; int iter; for(trial=0;trial<num_trials;trial++){ t = 0; t -= get_time(); for(iter=0;iter<max_iter;iter++) matrix_multiply(A,B,C,mkl_threads); t += get_time(); printf("%6i: %9.6f\n",trial,t); if(t < t_min) t_min = t; } printf("t_min: %9.6f\n",t_min); destroy_matrix(&A); destroy_matrix(&B); destroy_matrix(&C); return 0; }
MKLDisableThreading(bool condition) : num_threads{mkl_get_max_threads()} { if (condition) mkl_set_num_threads(1); }