/*
 * Configure OpenMP thread placement on the MIC device used by this rank.
 *
 * Returns immediately when util_getenv_nwc_ranks_per_device_() yields 0;
 * a negative value selects the RANKS_PER_DEVICE default.  Otherwise the
 * routine:
 *   1. runs one offload that derives a usable core count from the number
 *      of online logical processors on the device,
 *   2. builds a KMP_PLACE_THREADS string ("<cores>c,<threads>t,<offset>o")
 *      and an OMP_NUM_THREADS string for this rank's share of the device,
 *   3. prints a diagnostic line, and
 *   4. runs a second offload that installs the settings on the device via
 *      kmp_set_defaults().
 */
void FATR util_mic_set_affinity_()
{
    char affinity[BUFSZ];
    char num_threads[BUFSZ];
    int nprocs;
    int micdev;
    int ppn;
    int pos;
    int cores_per_rank;
    int rank_on_dev;
    int nthreads;
    int ranks_per_device = util_getenv_nwc_ranks_per_device_();

    /* 0 means "leave the device alone"; negative means "use the default" */
    if (ranks_per_device == 0) {
        return;
    }
    if (ranks_per_device < 0) {
        ranks_per_device = RANKS_PER_DEVICE;
    }

    /* start the placement string; the core/thread/offset part is appended below */
    pos = snprintf(affinity, BUFSZ, "KMP_PLACE_THREADS=");

    micdev = util_mic_get_device_();
    ppn = util_cgetppn();   /* value is not used further in this routine */

#pragma offload target(mic:micdev) out(nprocs)
    {
        /* do one offload to query the coprocessor for the number of cores */
        /* online logical processors / 4, minus one (presumably 4 hardware
           threads per core with one core held back - TODO confirm) */
        nprocs = ((int) sysconf(_SC_NPROCESSORS_ONLN) / 4) - 1;
    }

    rank_on_dev = util_my_smp_index() % util_nwc_ranks_per_device_();

    /* every rank sharing the device receives an equal, contiguous slice of cores */
    cores_per_rank = nprocs / ranks_per_device;
    nthreads = cores_per_rank * DEFAULT_OFFLOAD_THREAD_MULTIPLIER;

    pos += snprintf(affinity + pos, BUFSZ - pos, "%dc,%dt,%do",
                    cores_per_rank,
                    DEFAULT_OFFLOAD_THREAD_MULTIPLIER,
                    rank_on_dev * cores_per_rank);
    snprintf(num_threads, BUFSZ, "OMP_NUM_THREADS=%d", nthreads);

    printf("%02d: micdev=%d nprocs=%d rank_on_dev=%d ranks_per_device=%d affinity='%s' pos=%d\n",
           GA_Nodeid(), micdev, nprocs, rank_on_dev, ranks_per_device, affinity, pos);
    fflush(stdout);

#pragma offload target(mic:micdev) in(affinity) in(num_threads)
    {
        /* set the affinity masks and the number of offloaded OpenMP threads */
        kmp_set_defaults("KMP_AFFINITY=compact");
        kmp_set_defaults(affinity);
        kmp_set_defaults(num_threads);
    }
}
void calculate_production_rate(REACTION_DATA_ver2 &reaction_data, vector<double> &Rf, vector<double> &Rb, vector<double> &S_chem) { kmp_set_defaults("KMP_AFFINITY = scatter"); if (S_chem.size() != reaction_data.NS) S_chem.resize(reaction_data.NS); #pragma omp parallel for for (int s = 0; s <= reaction_data.NS-1; s++) S_chem[s] = 0.0; #pragma omp parallel for for (int s = 0; s <= reaction_data.NS-1; s++) { for (int k = 0; k <= reaction_data.NR-1; k++) { double Rf_m_Rb = Rf[k] - Rb[k]; S_chem[s] += (reaction_data.reaction_k[k].Product_coeff[s] - reaction_data.reaction_k[k].Reactant_coeff[s]) * Rf_m_Rb; } } #pragma omp parallel for for (int s = 0; s <= reaction_data.NS-1; s++) S_chem[s] = S_chem[s] * reaction_data.species_data[s].basic_data.M; }
int main(int argc, char* argv[]) { try { cmdline::parser cl; cl.add<int>("threads", 't', "number of threads", false); cl.add<int>("iter", 'i', "millions of iterations", false, 10); cl.parse_check(argc, argv); std::cout << "Initializing" << std::endl; if (cl.exist("threads")) { omp_set_num_threads(cl.get<int>("threads")); } #if 0 kmp_set_defaults("KMP_AFFINITY=compact"); #endif #pragma omp parallel for for (auto i = 0; i < FLOPS_ARRAY_SIZE; i++) { fa[i] = (float)i + 0.1f; fb[i] = (float)i + 0.2f; } int max_threads = omp_get_max_threads(); std::cout << "Starting compute on " << max_threads << " threads" << std::endl; Timer timer; float a = 1.1f; int iters = 1000000 * cl.get<int>("iter"); #pragma omp parallel for for (int i = 0; i < max_threads; ++i) { // each thread will work its own array section int offset = i * LOOP_COUNT; // loop many times to get lots of calculations for (int j = 0; j < iters; ++j) { // scale 1st array and add in the 2nd array for (int k = 0; k < LOOP_COUNT; ++k) { fa[k + offset] = a * fa[k + offset] + fb[k + offset]; } } } double gflops = (double)(1e-9 * max_threads * LOOP_COUNT * iters * FLOPSPERCALC); // elasped time double sec = timer(); std::cout << "Gflops = " << gflops << std::endl; std::cout << "secs = " << sec << std::endl; std::cout << "Gflops/s = " << (gflops / sec) << std::endl; } catch (std::string& ex) { std::cerr << "error: " << ex << std::endl; } catch (const char* ex) { std::cerr << "error: " << ex << std::endl; } }
int test_kmp_set_defaults_lock_bug() { /* checks that omp_get_num_threads is equal to the number of threads */ int nthreads_lib; int nthreads = 0; nthreads_lib = -1; #pragma omp parallel { omp_set_lock(&lock); nthreads++; omp_unset_lock(&lock); #pragma omp single { nthreads_lib = omp_get_num_threads (); } /* end of single */ } /* end of parallel */ kmp_set_defaults("OMP_NUM_THREADS"); #pragma omp parallel { omp_set_lock(&lock); nthreads++; omp_unset_lock(&lock); } /* end of parallel */ return (nthreads == 2*nthreads_lib); }
/*
 * Set up the OpenMP runtime: fine-granularity compact affinity and a fixed
 * thread count.
 *
 * nthreads  number of threads passed to omp_set_num_threads()
 * verbose   when exactly 1, the ",verbose" modifier is appended to the
 *           KMP_AFFINITY setting; any other value leaves it off
 */
void initomp (int nthreads, int verbose)
{
    const char *affinity_setting =
        (verbose == 1) ? "KMP_AFFINITY=granularity=fine,compact,verbose"
                       : "KMP_AFFINITY=granularity=fine,compact";

    kmp_set_defaults (affinity_setting);
    omp_set_num_threads (nthreads);
}
/* * FN-01: Prepararization OP2A * @author Minkwan Kim * @version 1.0 25/5/2015 */ void ApplicationOP2A::preparation(int argc, char *argv[], string modulename) { /* * ============================================== * STEP 1: Initialize Parallel Communication: * - Development Status : Done * - Last modified on: July 23, 2014 * by: Minkwan Kim * ============================================= * */ time_running.initStartTime(); #ifdef MPI MPI_Init(&argc, &argv); // INITIALIZE MPI MPI_Comm_size(MPI_COMM_WORLD, &NP); // FINDOUT HOW MANY PROCESSORS IN THERE MPI_Comm_rank(MPI_COMM_WORLD, &P); // FINDOUT WHICH PROCESSOR I AM t0 = MPI_Wtime(); #endif if (NP > OP2A_MAX_N_TASK) throw Common::ExceptionNPExceed (FromHere(), "Number of processors exceeds MAX_PROCESSOR. Need to adjust value of MAX_N_TASKS"); kmp_set_defaults("KMP_AFFINITY = scatter"); /* * ================================================= * STEP 2: Show Version information: * - Development Status : Done * - Last modified on: July 23, 2014 * - by: Minkwan Kim * ================================================= */ Version versionOP2A(OP2A_VERSION_MAIN, OP2A_VERSION_SUB, m_now->tm_year + 1900, m_now->tm_mon + 1, m_now->tm_mday, modulename.c_str()); if (P == 0) { versionOP2A.info(); // Show the version information cout << " --> t = " << time_running.getDeltaT() << "[sec]" << endl << endl; } }
void calculate_reaction_rate(REACTION_DATA_ver2 &reaction_data, vector<double> &rhos, vector<double> &T, vector<double> &kf, vector<double> &kb, vector<double> &Rf, vector<double> &Rb) { kmp_set_defaults("KMP_AFFINITY = scatter"); // 1. Initialize Rf and Rb Rf.reserve(reaction_data.NR); Rb.reserve(reaction_data.NR); // 2. Calculate 10^-3 * rho_s / Ms vector<double> rhos_Ms(reaction_data.NS, 0.0); #pragma omp parallel for for (int s = 0; s <= reaction_data.NS-1; s++) rhos_Ms[s] = 0.001 * rhos[s]/reaction_data.species_data[s].basic_data.M; double n_mix = 0.0; for (int s = 0; s <= reaction_data.NS-1; s++) n_mix = n_mix + rhos[s]/reaction_data.species_data[s].basic_data.m; n_mix = n_mix * 1.0e-6; //3. Calculate Forward/backward reaction rate for (int k = 0; k <= reaction_data.NR-1; k++) { double Tf, Tb; calculate_reaction_temperature(reaction_data.reaction_k[k], T, Tf, Tb); kf[k] = cal_kf(reaction_data.reaction_k[k], Tf); kb[k] = cal_kb(reaction_data.reaction_k[k], n_mix, Tb); double Rf_temp = 1.0; for (int j = 0; j <= reaction_data.NS-1; j++) Rf_temp *= pow(rhos_Ms[j], reaction_data.reaction_k[k].Reactant_coeff[j]); Rf[k] = 1000.0*kf[k] * Rf_temp; double Rb_temp = 1.0; for (int j = 0; j <= reaction_data.NS-1; j++) Rf_temp *= pow(rhos_Ms[j], reaction_data.reaction_k[k].Product_coeff[j]); Rb[k] = 1000.0*kb[k] * Rb_temp; } }
/***************************************************************************//**
 *
 * @ingroup Auxiliary
 *
 *  PLASMA_Init_Affinity - Initialize PLASMA: create and register the
 *  context, determine the number of worker threads and their binding,
 *  launch the worker threads and set up the dynamic scheduler.
 *
 *******************************************************************************
 *
 * @param[in] cores
 *          Number of cores to use (threads to launch).
 *          If cores < 1, the value returned by plasma_get_numthreads() is
 *          used (per the original documentation: PLASMA_NUM_THREADS if it
 *          is set, the system number of cores otherwise); if that call
 *          returns -1, a warning is issued and one thread is used.
 *
 * @param[in] coresbind
 *          Array to specify where to bind each thread; the first
 *          world_size entries are copied into the context.
 *          Each thread i is bound to coresbind[hwloc(i)] if hwloc is
 *          provided, or to coresbind[i] otherwise.
 *          If coresbind = NULL, the binding is obtained from
 *          plasma_get_affthreads() (per the original documentation:
 *          PLASMA_AFF_THREADS if it is set, the identity function
 *          otherwise).
 *
 *******************************************************************************
 *
 * @return
 *          \retval PLASMA_SUCCESS successful exit
 *          \retval PLASMA_ERR_OUT_OF_RESOURCES context creation or insertion
 *                  failed
 *          \retval PLASMA_ERR_NOT_FOUND the resulting thread count is <= 0
 *          \retval PLASMA_ERR_INTERNAL_LIMIT more threads requested than
 *                  CONTEXT_THREADS_MAX
 *          \retval other the non-zero status of a failing
 *                  pthread_attr_init(), pthread_attr_setscope() or
 *                  pthread_setconcurrency() call is returned unchanged
 *
 ******************************************************************************/
int PLASMA_Init_Affinity(int cores, int *coresbind)
{
    plasma_context_t *plasma;
    int status;
    int core;

    /* Create context and insert in the context map */
    /* NOTE(review): the error returns below do not release the context
       created here - confirm whether that is acceptable after a fatal error */
    plasma = plasma_context_create();
    if (plasma == NULL) {
        plasma_fatal_error("PLASMA_Init", "plasma_context_create() failed");
        return PLASMA_ERR_OUT_OF_RESOURCES;
    }
    status = plasma_context_insert(plasma, pthread_self());
    if (status != PLASMA_SUCCESS) {
        plasma_fatal_error("PLASMA_Init", "plasma_context_insert() failed");
        return PLASMA_ERR_OUT_OF_RESOURCES;
    }

    /* Init number of cores and topology */
    plasma_topology_init();

    /* Set number of cores: explicit request, or ask the helper */
    if ( cores < 1 ) {
        plasma->world_size = plasma_get_numthreads();
        if ( plasma->world_size == -1 ) {
            plasma->world_size = 1;
            plasma_warning("PLASMA_Init", "Could not find the number of cores: the thread number is set to 1");
        }
    }
    else
        plasma->world_size = cores;

    if (plasma->world_size <= 0) {
        plasma_fatal_error("PLASMA_Init", "failed to get system size");
        return PLASMA_ERR_NOT_FOUND;
    }

    /* Check if not more cores than the hard limit */
    if (plasma->world_size > CONTEXT_THREADS_MAX) {
        plasma_fatal_error("PLASMA_Init", "not supporting so many cores");
        return PLASMA_ERR_INTERNAL_LIMIT;
    }

    /* Get the size of each NUMA node, then shrink it until it divides the
       thread count evenly */
    /* NOTE(review): assumes plasma_get_numthreads_numa() returns >= 1; a
       value of 0 would divide by zero in the loop below - confirm */
    plasma->group_size = plasma_get_numthreads_numa();
    while ( ((plasma->world_size)%(plasma->group_size)) != 0 )
        (plasma->group_size)--;

    /* Initialize barriers */
    plasma_barrier_init(plasma);
    plasma_barrier_bw_init(plasma);

    /* Initialize default thread attributes */
    status = pthread_attr_init(&plasma->thread_attr);
    if (status != 0) {
        plasma_fatal_error("PLASMA_Init", "pthread_attr_init() failed");
        return status;
    }

    /* Set scope to system */
    status = pthread_attr_setscope(&plasma->thread_attr, PTHREAD_SCOPE_SYSTEM);
    if (status != 0) {
        plasma_fatal_error("PLASMA_Init", "pthread_attr_setscope() failed");
        return status;
    }

    /* Set concurrency */
    status = pthread_setconcurrency(plasma->world_size);
    if (status != 0) {
        plasma_fatal_error("PLASMA_Init", "pthread_setconcurrency() failed");
        return status;
    }

    /* Launch threads: clear the id table and fill in the binding table
       first, because the workers receive the context pointer */
    memset(plasma->thread_id, 0, CONTEXT_THREADS_MAX*sizeof(pthread_t));
    if (coresbind != NULL) {
        memcpy(plasma->thread_bind, coresbind, plasma->world_size*sizeof(int));
    }
    else {
        plasma_get_affthreads(plasma->thread_bind);
    }

    /* Assign rank and thread ID for the master */
    plasma->thread_rank[0] = 0;
    plasma->thread_id[0] = pthread_self();

    /* Ranks 1 .. world_size-1 are worker threads running
       plasma_parallel_section(); the pthread_create() result is not checked */
    for (core = 1; core < plasma->world_size; core++) {
        plasma->thread_rank[core] = core;
        pthread_create(
            &plasma->thread_id[core],
            &plasma->thread_attr,
             plasma_parallel_section,
             (void*)plasma);
    }

    /* Ensure BLAS are sequential and set thread affinity for the master */
    /* With MKL under the Intel compiler, KMP_AFFINITY is disabled in the
       OpenMP runtime */
#if defined(PLASMA_WITH_MKL)
#if defined(__ICC) || defined(__INTEL_COMPILER)
    kmp_set_defaults("KMP_AFFINITY=disabled");
#endif
#endif

    /* Initialize the dynamic scheduler */
    plasma->quark = QUARK_Setup(plasma->world_size);

    /* Barrier on the context, then switch LAPACK to sequential mode */
    plasma_barrier(plasma);
    plasma_setlapack_sequential(plasma);

    return PLASMA_SUCCESS;
}
// // Main program - pedal to the metal...calculate using tons o'flops! // int main(int argc, char *argv[] ) { int i,j,k; int numthreads; double tstart, tstop, ttime; double gflops = 0.0; float a=1.1; // // initialize the compute arrays // // omp_set_num_threads(2); kmp_set_defaults("KMP_AFFINITY=compact"); #pragma omp parallel #pragma omp master numthreads = omp_get_num_threads(); printf("Initializing\r\n"); #pragma omp parallel for for(i=0; i<FLOPS_ARRAY_SIZE; i++) { fa[i] = (float)i + 0.1; fb[i] = (float)i + 0.2; } printf("Starting Compute on %d threads\r\n",numthreads); tstart = dtime(); // scale the calculation across threads requested // need to set environment variables OMP_NUM_THREADS and KMP_AFFINITY #pragma omp parallel for private(j,k) for (i=0; i<numthreads; i++) { // each thread will work it's own array section // calc offset into the right section int offset = i*LOOP_COUNT; // loop many times to get lots of calculations for(j=0; j<MAXFLOPS_ITERS; j++) { // scale 1st array and add in the 2nd array for(k=0; k<LOOP_COUNT; k++) { fa[k+offset] = a * fa[k+offset] + fb[k+offset]; } } } tstop = dtime(); // # of gigaflops we just calculated gflops = (double)( 1.0e-9*numthreads*LOOP_COUNT* MAXFLOPS_ITERS*FLOPSPERCALC); //elasped time ttime = tstop - tstart; // // Print the results // if ((ttime) > 0.0) { printf("GFlops = %10.3lf, Secs = %10.3lf, GFlops per sec = %10.3lf\r\n", gflops, ttime, gflops/ttime); } return( 0 ); }
int main(int argc, char *argv[]) { #ifdef _OPENMP #ifndef KMP_AFFINITY kmp_set_defaults("KMP_AFFINITY=compact, granularity=fine"); // kmp_set_defaults("KMP_AFFINITY=scatter, granularity=fine");//this gives much slower performance... #endif #pragma omp parallel #pragma omp master //#ifndef OMP_NUM_THREADS omp_set_num_threads(240); //#endif printf("\nComputing 7-point stencil on Intel Xeon Phi in %d threads.\n", omp_get_num_threads()); #endif const int nx = problem_dim; const int ny = problem_dim; const int nz = problem_dim; const int problem_size = sizeof(float)*nx*ny*nz; float *f1 = (float *)_mm_malloc(problem_size, 64); float *f2 = (float *)_mm_malloc(problem_size, 64); assert(f1 != MAP_FAILED); assert(f2 != MAP_FAILED); float *answer = (float *)_mm_malloc(problem_size, 64); float *f_final = NULL; int count = 0; float c0, c1; float l = 1.0; float kappa = 0.1; float dx = l / nx; float dy = l / ny; float dz = l / nz; float dt = 0.1*dx*dx / kappa; float scale = 0.1; count = scale / dt; f_final = (count % 2)? 
f2 : f1; create_field<float>(f1, nx, ny, nz, dx, dy, dz, kappa, 0.0); c1 = kappa*dt/(dx*dx); c0 = 1.0 - 6*c1; printf("Running heat kernel %d times\n", count); fflush(stdout); float *f1_t = f1; float *f2_t = f2; struct timeval time_begin, time_end; gettimeofday(&time_begin, NULL); for (int i = 0; i < count; ++i) { compute_7pt_stencile_mm512_ps(f2_t, f1_t, nx, ny, nz, c0, c1); float *t = f1_t; f1_t = f2_t; f2_t = t; } gettimeofday(&time_end, NULL); float time = count * dt; create_field<float>(answer, nx, ny, nz, dx, dy, dz, kappa, time); float err = accuracy<float>(answer,f_final, nx*ny*nz); double elapsed_time = (time_end.tv_sec - time_begin.tv_sec) + (time_end.tv_usec - time_begin.tv_usec)*1.0e-6; float Gflops = (nx*ny*nz)*8.0*count/elapsed_time * 1.0e-09; float Gstens = (nx*ny*nz)*1.0*count/elapsed_time * 1.0e-06; double thput = (nx * ny * nz) * sizeof(float) * 3.0 * count / elapsed_time * 1.0e-09; fprintf(stderr, "Elapsed time : %.3f (s)\n", elapsed_time); fprintf(stderr, "FLOPS : %.3f (GFlops)\n", Gflops); fprintf(stderr, "Updates : %.3f (Mupdates/sec)\n", Gstens); fprintf(stderr, "Throughput : %.3f (GB/s)\n", thput); fprintf(stderr, "Accuracy : %e\n", err); _mm_free(f1); _mm_free(f2); _mm_free(answer); return 0; }
int main(int argc, char* argv[]) { double sum_delta = 0.0, sum_ref = 0.0, L1norm = 0.0; unsigned int seed = 123; int verbose = 0; if (argc > 2) { printf("usage: Black-Scholes <verbose> verbose = 1 for validtating result, the default is 0. \n"); exit(1); } if (argc == 1) verbose = 0; else if (argc == 2) verbose = atoi(argv[1]); kmp_set_defaults("KMP_AFFINITY=compact,granularity=fine"); #ifdef _OPENMP int ThreadNum = omp_get_max_threads(); omp_set_num_threads(ThreadNum); #else int ThreadNum = 1; #endif int OptPerThread = OPT_N / ThreadNum; int mem_size = sizeof(double) * OptPerThread; setlocale(LC_ALL,""); printf("Black-Scholes Formula Double Precision.\n"); printf("Compiler Version = %d\n", __INTEL_COMPILER/100); printf("Release Update = %d\n", __INTEL_COMPILER_UPDATE); printf("Build Time = %s %s\n", __DATE__, __TIME__); printf("Input Dataset = %d\n", OPT_N); printf("Repetitions = %d\n", NUM_ITERATIONS); printf("Chunk Size = %d\n", CHUNKSIZE); printf("Worker Threads = %d\n\n", ThreadNum); if (verbose) printf("Allocate and initialize memory on %d boundary,\n", SIMDALIGN); #pragma omp parallel reduction(+ : sum_delta) reduction(+ : sum_ref) { #ifdef _OPENMP int threadID = omp_get_thread_num(); #else int threadID = 0; #endif double *CallResult = (double *)_mm_malloc(mem_size, SIMDALIGN); double *PutResult = (double *)_mm_malloc(mem_size, SIMDALIGN); double *StockPrice = (double *)_mm_malloc(mem_size, SIMDALIGN); double *OptionStrike = (double *)_mm_malloc(mem_size, SIMDALIGN); double *OptionYears = (double *)_mm_malloc(mem_size, SIMDALIGN); seed += threadID; for(int i = OptPerThread-1; i > -1 ; i--) { CallResult[i] = 0.0; PutResult[i] = -1.0; StockPrice[i] = RandDouble(5.0, 30.0, &seed); OptionStrike[i] = RandDouble(1.0, 100.0, &seed); OptionYears[i] = RandDouble(0.25, 10.0, &seed); } #pragma omp barrier if (threadID == 0) { start_cyc = _rdtsc(); } for(int i = 0; i < NUM_ITERATIONS; i++) for (int chunkBase = 0; chunkBase < OptPerThread; chunkBase += CHUNKSIZE) { 
#pragma simd vectorlength(CHUNKSIZE) #pragma simd #pragma vector aligned for(int opt = chunkBase; opt < (chunkBase+CHUNKSIZE); opt++) { double CNDD1, CNDD2; double T = OptionYears[opt]; double X = OptionStrike[opt]; double XexpRT = X*exp2(RLOG2E * T); double S = StockPrice[opt]; double sqrtT = sqrt(T); double d1 = log2(S / X) / (VLOG2E * sqrtT) + RVV * sqrtT; CNDF_C (&CNDD1, &d1 ); double d2 = d1 - VOLATILITY * sqrtT; CNDF_C (&CNDD2, &d2 ); double CallVal = S * CNDD1 - XexpRT * CNDD2; double PutVal = CallVal + XexpRT - S; CallResult[opt] = CallVal ; PutResult[opt] = PutVal ; } } #pragma omp barrier if (threadID == 0) { end_cyc = _rdtsc(); } if (verbose) { double delta = 0.0, ref = 0.0, L1norm = 0.0; int max_index = 0; for(int i = 0; i < OptPerThread; i++) { double callReference, putReference; BlackScholesReference( callReference, putReference, StockPrice[i], //Stock price OptionStrike[i], //Option strike OptionYears[i], //Option years RISKFREE, //Riskless rate VOLATILITY //Volatility rate ); ref = callReference; delta = fabs(callReference - CallResult[i]); sum_delta += delta; sum_ref += fabs(ref); } } _mm_free(CallResult); _mm_free(PutResult); _mm_free(StockPrice); _mm_free(OptionStrike); _mm_free(OptionYears); } //parallel section const unsigned long long cyc = end_cyc - start_cyc; double sec = cyc/(FREQ*1e9); printf("=============================================\n"); printf("Total Cycles = %lld\n", cyc); printf("Cycles/OptionPair at thread 0 = %5.2f\n", cyc/(1.0f*NUM_ITERATIONS*OptPerThread)); printf("Time Elapsed = %5.2f\n", sec); printf("Options/sec = %5.2f\n", (2.0f*NUM_ITERATIONS*OPT_N)/(1e9*sec)); printf("=============================================\n"); if (verbose) { L1norm = sum_delta / sum_ref; printf("L1 norm: %E\n", L1norm); printf((L1norm < 9e-5) ? "TEST PASSED\n" : "TEST FAILED\n"); } }