/*
 * Demonstrates nested parallel regions: an outer team whose members each
 * open an inner team of two threads. Every thread prints its thread number
 * in the region it executes.
 */
int main()
{
   const char *nesting_state;

#ifdef _OPENMP
   /* Request a fixed team size and warn if the runtime kept dynamic mode. */
   (void) omp_set_dynamic(FALSE);
   if (omp_get_dynamic())
      printf("Warning: dynamic adjustment of threads has been set\n");

   (void) omp_set_num_threads(3);

   /* Ask for nested parallelism and warn if the request was not honoured. */
   (void) omp_set_nested(TRUE);
   if (omp_get_nested() == 0)
      printf("Warning: nested parallelism not set\n");
#endif

   if (omp_get_nested())
      nesting_state = "supported";
   else
      nesting_state = "not supported";
   printf("Nested parallelism is %s\n", nesting_state);

   /*
    * Inside the inner region the thread numbers restart for every inner
    * team, so the printed numbers alone no longer identify a thread.
    */
   #pragma omp parallel
   {
      printf("Thread %d executes the outer parallel region\n",
             omp_get_thread_num());

      #pragma omp parallel num_threads(2)
      {
         printf(" Thread %d executes the inner parallel region\n",
                omp_get_thread_num());
      } /* inner parallel region ends here */
   } /* outer parallel region ends here */

   return 0;
}
int main(int argc, char* argv[]) { signal(SIGINT, sigint_handler); #if !defined(NDEBUG) std::cout << "\t> Running in DEBUG mode" << std::endl; #endif #if defined(OPENMP_FOUND) omp_set_nested(true); std::cout << "\t> Running using OPENMP " << std::endl; std::cout << "\t\t> " << omp_get_max_threads() << " threads max" << std::endl; std::cout << "\t\t> " << omp_get_wtick()*1e9 << "ns tick" << std::endl; assert( omp_get_nested() ); #endif // test_random(); Rng rng; rng.seed(rand()); Options options = parse_options(argc, argv); typedef std::map<std::string, int> Wins; Wins wins; for (int kk=0; kk<options.number_of_games; kk++) { std::cout << std::endl << std::endl; std::cout << "****************************************" << std::endl; std::cout << "game " << kk << "/" << options.number_of_games << std::endl; const Game& game = play_game(options, rng); const int winner = game.state.get_winner(); if (winner < 0) wins["draw"]++; else { std::string winner_name = "bot"; if (game.hero_infos[winner].is_real_bot()) winner_name = game.hero_infos[winner].name; wins[winner_name]++; } std::cout << std::endl; std::cout << "after " << options.number_of_games << " games" << std::endl; for (Wins::const_iterator wi=wins.begin(), wie=wins.end(); wi!=wie; wi++) { if (wi->first == "draw") { std::cout << " " << wi->second << " draw" << std::endl; continue; } std::cout << " " << wi->second << " victory for " << wi->first << std::endl; } if (sigint_already_caught) break; } return 0; }
main () { thds = omp_get_max_threads (); if (thds == 1) { printf ("should be run this program on multi threads.\n"); exit (0); } omp_set_dynamic (0); omp_set_num_threads (2); omp_set_nested (1); if (omp_get_nested () == 0) { printf ("test skipped.\n"); exit(0); } sum = 0; #pragma omp parallel { #pragma omp parallel { int add; if (omp_get_num_threads () == 1) { add = 2; printf ("nested parallel is serialized.\n"); } else { add = 1; } #pragma omp critical { sum += add; } } } if (sum != 2*2) { errors += 1; } sum = 0; #pragma omp parallel func_nesting (); if (sum != 2*2) { errors += 1; } if (errors == 0) { printf ("nesting 002 : SUCCESS\n"); return 0; } else { printf ("nesting 002 : FAILED\n"); return 1; } }
bool prepOpenMP() { try { GDEBUG_STREAM("--> OpenMP info <--"); GDEBUG_STREAM("--------------------------------------------------------"); int numOpenMPProcs = omp_get_num_procs(); GDEBUG_STREAM("GtPlusRecon, numOpenMPProcs : " << numOpenMPProcs); #ifndef WIN32 int maxOpenMPLevels = omp_get_max_active_levels(); GDEBUG_STREAM("GtPlusRecon, maxOpenMPLevels : " << maxOpenMPLevels); #endif // WIN32 int maxOpenMPThreads = omp_get_max_threads(); GDEBUG_STREAM("GtPlusRecon, maxOpenMPThreads : " << maxOpenMPThreads); if ( numOpenMPProcs != maxOpenMPThreads ) { GDEBUG_STREAM("GtPlusRecon, numOpenMPProcs != maxOpenMPThreads , hyperthreading must be disabled ... "); omp_set_num_threads(numOpenMPProcs); } // omp_set_nested(1); int allowOpenMPNested = omp_get_nested(); GDEBUG_STREAM("GtPlusRecon, allowOpenMPNested : " << allowOpenMPNested); #ifdef WIN32 GDEBUG_STREAM("----------------------------------"); GDEBUG_STREAM("GtPlus, set thread affinity ... "); /// lock the threads #pragma omp parallel default(shared) { int tid = omp_get_thread_num(); DWORD_PTR mask = (1 << tid); GDEBUG_STREAM("thread id : " << tid << " - mask : " << mask); SetThreadAffinityMask( GetCurrentThread(), mask ); } #endif // WIN32 GDEBUG_STREAM("--------------------------------------------------------"); } catch(...) { GERROR_STREAM("Errors in GtPlus prepOpenMP() ... "); return false; } return true; }
void print_settings() { #ifdef HAVE_MPI if (mpiArgs.rank == 0) { #endif /* HAVE_MPI */ fprintf(stdout, "(1) Application settings\n"); #ifdef HAVE_LIBGSL fprintf(stdout, "GSL configured : true\n"); #else fprintf(stdout, "GSL configured : false\n"); #endif /* HAVE_LIBGSL */ #ifdef HAVE_OPENMP fprintf(stdout, "OpenMP : true\n"); fprintf(stdout, "Max number of Threads : %d\n", omp_get_max_threads()); fprintf(stdout, "Support Nesting (0/1) : %d\n", omp_get_nested()); #else fprintf(stdout, "OpenMP : false\n"); #endif /* HAVE_OPENMP */ #ifdef NDEBUG fprintf(stdout, "Debug : true\n\n"); #else fprintf(stdout, "Debug : false\n\n"); #endif /* NDEBUG */ fprintf(stdout, "(2) Mesh settings\n"); fprintf(stdout, "Space Dimension : %d\n", globalArgs.s); fprintf(stdout, "Time Dimension : %d\n", globalArgs.t); fprintf(stdout, "Delta : %1.8f\n", globalArgs.d); fprintf(stdout, "Input Range : %2.2f <= x <= %2.2f; %2.2f <= y <= %2.2f\n\n", globalArgs.x0, globalArgs.x1, globalArgs.y0, globalArgs.y1); fprintf(stdout, "(3) Conjugate Gradient settings\n"); fprintf(stdout, "Error Threshold : %e\n\n", globalArgs.e); #ifdef HAVE_MPI fprintf(stdout, "(4) MPI settings\n"); fprintf(stdout, "Number Processors : %d\n", mpiArgs.num_tasks); } #endif /* HAVE_MPI */ fprintf(stdout, "\n\n"); fflush(stdout); }
// Runs `f` once per partition of the OpenMP thread pool.
//
// When nested parallelism is enabled, a parallel region with
// `num_partitions` threads is opened. Each thread of that region builds its
// own Exec instance sized for `partition_size`, publishes it through
// Impl::t_openmp_instance, calls f(thread_num, num_threads), and then
// destroys and releases the instance again. The instance that was active on
// entry is restored on the calling thread afterwards.
//
// Without nested parallelism, `f(0,1)` is called directly.
//
// NOTE(review): the per-thread assignments below only make sense if
// Impl::t_openmp_instance is thread-local; its declaration is not visible
// here, so confirm.
void OpenMP::partition_master( F const& f
                             , int num_partitions
                             , int partition_size
                             )
{
  if ( !omp_get_nested() ) {
    // nested openmp not enabled
    f(0,1);
    return;
  }

  using Exec = Impl::OpenMPExec;

  Exec * const saved_instance = Impl::t_openmp_instance;

  Exec::validate_partition( saved_instance->m_pool_size, num_partitions, partition_size );

  OpenMP::memory_space space;

  #pragma omp parallel num_threads(num_partitions)
  {
    // Construct this partition's Exec in memory obtained from `space`.
    void * const storage = space.allocate( sizeof(Exec) );
    Impl::t_openmp_instance = new (storage) Exec( partition_size );

    // Scratch sizes requested for the partition's thread data.
    const size_t pool_reduce_bytes  =   32 * partition_size ;
    const size_t team_reduce_bytes  =   32 * partition_size ;
    const size_t team_shared_bytes  = 1024 * partition_size ;
    const size_t thread_local_bytes = 1024 ;

    Impl::t_openmp_instance->resize_thread_data( pool_reduce_bytes
                                               , team_reduce_bytes
                                               , team_shared_bytes
                                               , thread_local_bytes
                                               );

    omp_set_num_threads(partition_size);

    f( omp_get_thread_num(), omp_get_num_threads() );

    // Tear the partition's instance down in reverse order of construction.
    Impl::t_openmp_instance->~Exec();
    space.deallocate( Impl::t_openmp_instance, sizeof(Exec) );
    Impl::t_openmp_instance = nullptr;
  }

  Impl::t_openmp_instance = saved_instance;
}
/*
 * Prints information about the OpenMP execution environment: host name,
 * processor count, team size, maximum thread count, and the in-parallel,
 * dynamic and nested settings. The queries run inside a parallel region
 * and only thread 0 performs them.
 *
 * argc and argv are not used. The program always exits with status 0.
 */
int main (int argc, char *argv[])
{
  int nthreads, tid, procs, maxt, inpar, dynamic, nested;
  char name[50];

  /* Start parallel region */
  #pragma omp parallel private(nthreads, tid)
  {
    /* Obtain thread number */
    tid = omp_get_thread_num();

    /* Only master thread does this. We could also use #pragma omp master */
    if (tid == 0)
    {
      printf("Thread %d getting environment info...\n", tid);

      /* Get host name. On failure the buffer contents are unspecified, and
       * on truncation the name may be left unterminated, so make sure that
       * what gets printed below is always a valid string. */
      if (gethostname(name, sizeof(name)) != 0)
        name[0] = '\0';
      name[sizeof(name) - 1] = '\0';

      /* Get environment information */
      procs = omp_get_num_procs();
      nthreads = omp_get_num_threads();
      maxt = omp_get_max_threads();
      inpar = omp_in_parallel();
      dynamic = omp_get_dynamic();
      nested = omp_get_nested();

      /* Print environment information */
      printf("Hostname = %s\n", name);
      printf("Number of processors = %d\n", procs);
      printf("Number of threads = %d\n", nthreads);
      printf("Max threads = %d\n", maxt);
      printf("In parallel? = %d\n", inpar);
      printf("Dynamic threads enabled? = %d\n", dynamic);
      printf("Nested parallelism supported? = %d\n", nested);
    }
  } /* Done */

  exit(0);
}
void GOMP_parallel_start(void (*fn)(void *), void *data, unsigned nthreads) { debug_printf("GOMP_parallel_start(%p, %p, %u)\n", fn, data, nthreads); /* Identify the number of threads that can be spawned and start the processing */ if (!omp_in_parallel()) { debug_printf("not in parallel\n"); struct omp_icv_task *icv_task = bomp_icv_task_new(); if (!icv_task) { debug_printf("no icv task\n"); return; } icv_task->active_levels = 1; icv_task->nthreads = omp_get_max_threads(); debug_printf("omp_get_max_threads = %u\n", icv_task->nthreads); if (nthreads == 0 || (icv_task->dynamic && icv_task->nthreads < nthreads)) { icv_task->nthreads = OMP_GET_ICV_GLOBAL(thread_limit); debug_printf("resetting to = %u\n", icv_task->nthreads); } bomp_icv_set_task(icv_task); debug_printf("icv task set %u\n", icv_task->nthreads); /* start processing */ bomp_start_processing(fn, data, 0, icv_task->nthreads); } else { if (omp_get_nested()) { // handle nested paralellism assert(!"Handling nested paralellism\n"); } /* we have already started enough threads */ uint32_t active_levels = OMP_GET_ICV_TASK(active_levels); //debug_printf("setting active_levels to %u\n", active_levels+1); OMP_SET_ICV_TASK(active_levels, active_levels+1); } }
/*
 * Self-checking test of the OpenMP runtime library routines. Every check
 * calls abort () on an unexpected result; the function returns 0 when all
 * checks pass. The checks depend on the state left by the previous ones,
 * so their order matters.
 */
int main (void)
{
  double d, e;
  int l;
  omp_lock_t lck;
  omp_nest_lock_t nlck;

  /* Start time, compared against a second reading at the end. */
  d = omp_get_wtime ();

  /* Simple lock: testing a held lock must fail, testing a free one must
     succeed and acquire it. */
  omp_init_lock (&lck);
  omp_set_lock (&lck);
  if (omp_test_lock (&lck))
    abort ();
  omp_unset_lock (&lck);
  if (! omp_test_lock (&lck))
    abort ();
  if (omp_test_lock (&lck))
    abort ();
  omp_unset_lock (&lck);
  omp_destroy_lock (&lck);

  /* Nestable lock: the values returned by omp_test_nest_lock are checked
     after each change in the number of set/unset calls. */
  omp_init_nest_lock (&nlck);
  if (omp_test_nest_lock (&nlck) != 1)
    abort ();
  omp_set_nest_lock (&nlck);
  if (omp_test_nest_lock (&nlck) != 3)
    abort ();
  omp_unset_nest_lock (&nlck);
  omp_unset_nest_lock (&nlck);
  if (omp_test_nest_lock (&nlck) != 2)
    abort ();
  omp_unset_nest_lock (&nlck);
  omp_unset_nest_lock (&nlck);
  omp_destroy_nest_lock (&nlck);

  /* The dynamic and nested settings must read back as written. */
  omp_set_dynamic (1);
  if (! omp_get_dynamic ())
    abort ();
  omp_set_dynamic (0);
  if (omp_get_dynamic ())
    abort ();
  omp_set_nested (1);
  if (! omp_get_nested ())
    abort ();
  omp_set_nested (0);
  if (omp_get_nested ())
    abort ();

  /* Outside a parallel region: one thread with number 0, while the maximum
     follows omp_set_num_threads. */
  omp_set_num_threads (5);
  if (omp_get_num_threads () != 1)
    abort ();
  if (omp_get_max_threads () != 5)
    abort ();
  if (omp_get_thread_num () != 0)
    abort ();
  omp_set_num_threads (3);
  if (omp_get_num_threads () != 1)
    abort ();
  if (omp_get_max_threads () != 3)
    abort ();
  if (omp_get_thread_num () != 0)
    abort ();

  /* Inside a parallel region: l collects (by OR-reduction) any thread that
     sees a team size other than 3 or a thread number outside [0, 3); the
     master thread additionally checks that its number is 0. */
  l = 0;
  #pragma omp parallel reduction (|:l)
  {
    l = omp_get_num_threads () != 3;
    l |= omp_get_thread_num () < 0;
    l |= omp_get_thread_num () >= 3;
    #pragma omp master
    l |= omp_get_thread_num () != 0;
  }
  if (l)
    abort ();

  if (omp_get_num_procs () <= 0)
    abort ();

  /* omp_in_parallel must be false here and true inside both parallel
     regions below. */
  if (omp_in_parallel ())
    abort ();
  #pragma omp parallel reduction (|:l)
  l = ! omp_in_parallel ();
  #pragma omp parallel reduction (|:l) if (1)
  l = ! omp_in_parallel ();
  if (l)
    abort ();

  /* The wall clock must not have gone backwards. */
  e = omp_get_wtime ();
  if (d > e)
    abort ();

  d = omp_get_wtick ();
  /* Negative precision is definitely wrong,
     bigger than 1s clock resolution is also strange.  */
  if (d <= 0 || d > 1)
    abort ();

  return 0;
}
//------------------------------------------------------------------------------------------------------------------------------ int main(int argc, char **argv){ int my_rank=0; int num_tasks=1; int OMP_Threads = 1; int OMP_Nested = 0; #ifdef _OPENMP #pragma omp parallel { #pragma omp master { OMP_Threads = omp_get_num_threads(); OMP_Nested = omp_get_nested(); } } #endif #ifdef USE_MPI int actual_threading_model = -1; int requested_threading_model = -1; requested_threading_model = MPI_THREAD_SINGLE; //requested_threading_model = MPI_THREAD_FUNNELED; //requested_threading_model = MPI_THREAD_SERIALIZED; //requested_threading_model = MPI_THREAD_MULTIPLE; //MPI_Init(&argc, &argv); #ifdef _OPENMP requested_threading_model = MPI_THREAD_FUNNELED; //requested_threading_model = MPI_THREAD_SERIALIZED; //requested_threading_model = MPI_THREAD_MULTIPLE; //MPI_Init_thread(&argc, &argv, requested_threading_model, &actual_threading_model); #endif MPI_Init_thread(&argc, &argv, requested_threading_model, &actual_threading_model); MPI_Comm_size(MPI_COMM_WORLD, &num_tasks); MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); //if(actual_threading_model>requested_threading_model)actual_threading_model=requested_threading_model; if(my_rank==0){ if(requested_threading_model == MPI_THREAD_MULTIPLE )printf("Requested MPI_THREAD_MULTIPLE, "); else if(requested_threading_model == MPI_THREAD_SINGLE )printf("Requested MPI_THREAD_SINGLE, "); else if(requested_threading_model == MPI_THREAD_FUNNELED )printf("Requested MPI_THREAD_FUNNELED, "); else if(requested_threading_model == MPI_THREAD_SERIALIZED)printf("Requested MPI_THREAD_SERIALIZED, "); else if(requested_threading_model == MPI_THREAD_MULTIPLE )printf("Requested MPI_THREAD_MULTIPLE, "); else printf("Requested Unknown MPI Threading Model (%d), ",requested_threading_model); if(actual_threading_model == MPI_THREAD_MULTIPLE )printf("got MPI_THREAD_MULTIPLE\n"); else if(actual_threading_model == MPI_THREAD_SINGLE )printf("got MPI_THREAD_SINGLE\n"); else 
if(actual_threading_model == MPI_THREAD_FUNNELED )printf("got MPI_THREAD_FUNNELED\n"); else if(actual_threading_model == MPI_THREAD_SERIALIZED)printf("got MPI_THREAD_SERIALIZED\n"); else if(actual_threading_model == MPI_THREAD_MULTIPLE )printf("got MPI_THREAD_MULTIPLE\n"); else printf("got Unknown MPI Threading Model (%d)\n",actual_threading_model); } #ifdef USE_HPM // IBM HPM counters for BGQ... HPM_Init(); #endif #endif // USE_MPI int log2_box_dim = 6; int target_boxes_per_rank = 1; if(argc==3){ log2_box_dim=atoi(argv[1]); target_boxes_per_rank=atoi(argv[2]); }else{ if(my_rank==0){printf("usage: ./a.out [log2_box_dim] [target_boxes_per_rank]\n");} #ifdef USE_MPI MPI_Finalize(); #endif exit(0); } if(log2_box_dim<4){ if(my_rank==0){printf("log2_box_dim must be at least 4\n");} #ifdef USE_MPI MPI_Finalize(); #endif exit(0); } if(target_boxes_per_rank<1){ if(my_rank==0){printf("target_boxes_per_rank must be at least 1\n");} #ifdef USE_MPI MPI_Finalize(); #endif exit(0); } if(my_rank==0){ if(OMP_Nested)fprintf(stdout,"%d MPI Tasks of %d threads (OMP_NESTED=TRUE)\n\n" ,num_tasks,OMP_Threads); else fprintf(stdout,"%d MPI Tasks of %d threads (OMP_NESTED=FALSE)\n\n",num_tasks,OMP_Threads); } //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // calculate the problem size... 
#ifndef MAX_COARSE_DIM #define MAX_COARSE_DIM 11 #endif int64_t box_dim=1<<log2_box_dim; int64_t target_boxes = (int64_t)target_boxes_per_rank*(int64_t)num_tasks; int64_t boxes_in_i = -1; int64_t bi; for(bi=1;bi<1000;bi++){ // all possible problem sizes int64_t total_boxes = bi*bi*bi; if(total_boxes<=target_boxes){ int64_t coarse_grid_dim = box_dim*bi; while( (coarse_grid_dim%2) == 0){coarse_grid_dim=coarse_grid_dim/2;} if(coarse_grid_dim<=MAX_COARSE_DIM){ boxes_in_i = bi; } } } if(boxes_in_i<1){ if(my_rank==0){printf("failed to find an acceptable problem size\n");} #ifdef USE_MPI MPI_Finalize(); #endif exit(0); } //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // create the fine level... #ifdef USE_PERIODIC_BC int bc = BC_PERIODIC; #else int bc = BC_DIRICHLET; #endif level_type fine_grid; int ghosts=stencil_get_radius(); create_level(&fine_grid,boxes_in_i,box_dim,ghosts,VECTORS_RESERVED,bc,my_rank,num_tasks); //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #ifdef USE_HELMHOLTZ double a=1.0;double b=1.0; // Helmholtz if(my_rank==0)fprintf(stdout," Creating Helmholtz (a=%f, b=%f) test problem\n",a,b); #else double a=0.0;double b=1.0; // Poisson if(my_rank==0)fprintf(stdout," Creating Poisson (a=%f, b=%f) test problem\n",a,b); #endif double h0=1.0/( (double)boxes_in_i*(double)box_dim ); initialize_problem(&fine_grid,h0,a,b); // calculate VECTOR_ALPHA, VECTOR_BETA, and VECTOR_UTRUE //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if( ((a==0.0)||(fine_grid.alpha_is_zero==1) ) && (fine_grid.boundary_condition.type == BC_PERIODIC)){ // Poisson w/ periodic BC's... // nominally, u shifted by any constant is still a valid solution. // However, by convention, we assume u sums to zero. 
double average_value_of_u = mean(&fine_grid,VECTOR_UTRUE); if(my_rank==0){fprintf(stdout," average value of u_true = %20.12e... shifting u_true to ensure it sums to zero...\n",average_value_of_u);} shift_vector(&fine_grid,VECTOR_UTRUE,VECTOR_UTRUE,-average_value_of_u); } //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //apply_op(&fine_grid,VECTOR_F,VECTOR_UTRUE,a,b); // by construction, f = A(u_true) //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if(fine_grid.boundary_condition.type == BC_PERIODIC){ double average_value_of_f = mean(&fine_grid,VECTOR_F); if(average_value_of_f!=0.0){ if(my_rank==0){fprintf(stderr," WARNING... Periodic boundary conditions, but f does not sum to zero... mean(f)=%e\n",average_value_of_f);} //shift_vector(&fine_grid,VECTOR_F,VECTOR_F,-average_value_of_f); } } //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - mg_type all_grids; int minCoarseDim = 1; rebuild_operator(&fine_grid,NULL,a,b); // i.e. calculate Dinv and lambda_max MGBuild(&all_grids,&fine_grid,a,b,minCoarseDim); // build the Multigrid Hierarchy double dtol= 0.0;double rtol=1e-10; // converged if ||b-Ax|| / ||b|| < rtol //double dtol=1e-15;double rtol= 0.0; // converged if ||D^{-1}(b-Ax)|| < dtol //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - int doTiming; int minSolves = 10; // do at least minSolves MGSolves double timePerSolve = 0; for(doTiming=0;doTiming<=1;doTiming++){ // first pass warms up, second pass times #ifdef USE_HPM // IBM performance counters for BGQ... 
if(doTiming)HPM_Start("FMGSolve()"); #endif #ifdef USE_MPI double minTime = 30.0; // minimum time in seconds that the benchmark should run double startTime = MPI_Wtime(); if(doTiming==1){ if((minTime/timePerSolve)>minSolves)minSolves=(minTime/timePerSolve); // if one needs to do more than minSolves to run for minTime, change minSolves } #endif if(my_rank==0){ if(doTiming==0){fprintf(stdout,"\n\n===== warming up by running %d solves ===============================\n",minSolves);} else{fprintf(stdout,"\n\n===== running %d solves =============================================\n",minSolves);} fflush(stdout); } int numSolves = 0; // solves completed MGResetTimers(&all_grids); while( (numSolves<minSolves) ){ zero_vector(all_grids.levels[0],VECTOR_U); #ifdef USE_FCYCLES FMGSolve(&all_grids,VECTOR_U,VECTOR_F,a,b,dtol,rtol); #else MGSolve(&all_grids,VECTOR_U,VECTOR_F,a,b,dtol,rtol); #endif numSolves++; } #ifdef USE_MPI if(doTiming==0){ double endTime = MPI_Wtime(); timePerSolve = (endTime-startTime)/numSolves; MPI_Bcast(&timePerSolve,1,MPI_DOUBLE,0,MPI_COMM_WORLD); // after warmup, process 0 broadcasts the average time per solve (consensus) } #endif #ifdef USE_HPM // IBM performance counters for BGQ... if(doTiming)HPM_Stop("FMGSolve()"); #endif } MGPrintTiming(&all_grids); // don't include the error check in the timing results //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if(my_rank==0){fprintf(stdout,"calculating error... 
");} double fine_error = error(&fine_grid,VECTOR_U,VECTOR_UTRUE); if(my_rank==0){fprintf(stdout,"h = %22.15e ||error|| = %22.15e\n\n",h0,fine_error);fflush(stdout);} //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // MGDestroy() //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #ifdef USE_MPI #ifdef USE_HPM // IBM performance counters for BGQ... HPM_Print(); #endif MPI_Finalize(); #endif //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - return(0); }
/*
 * Prints the team size, thread number and nested setting at three points:
 * in the initial thread, inside an outer parallel region, and inside a
 * parallel region nested within it. Nesting is enabled and two threads are
 * requested per team beforehand. argc and argv are not used.
 */
int main(int argc, char *argv[])
{
    int team_size;
    int thread_id;
    int nested_flag;

    omp_set_nested(1);
    omp_set_num_threads(2);

    team_size = omp_get_num_threads();
    thread_id = omp_get_thread_num();
    nested_flag = omp_get_nested();
    printf("Master: Nthr %d Thrid %d Nested %d\n", team_size, thread_id, nested_flag);

    #pragma omp parallel
    {
        const int outer_size = omp_get_num_threads();
        const int outer_id = omp_get_thread_num();
        const int outer_nested = omp_get_nested();

        printf("Parallel 1: Nthr %d Thrid %d Nested %d\n", outer_size, outer_id, outer_nested);

        /* Size of the team each outer thread opens next. */
        omp_set_num_threads(2);

        #pragma omp parallel
        {
            const int inner_size = omp_get_num_threads();
            const int inner_id = omp_get_thread_num();
            const int inner_nested = omp_get_nested();

            printf("Parallel 2: Nthr %d Thrid %d Nested %d\n", inner_size, inner_id, inner_nested);
        }
    }
}
/*
 * Wrapper with a trailing underscore in its name that forwards to
 * omp_get_nested () and returns the result as a 32-bit integer.
 */
int32_t
omp_get_nested_ (void)
{
  const int32_t nested = omp_get_nested ();

  return nested;
}
void masterFunc (int argc, char ** argv) { /**************************************************************** * Step 1: Setup and Initialization * Load conf, init model, allocate mem, init params, init solver * Load cross-validation data ****************************************************************/ // Step 1.1: Load configuration if (argc < 2) { printf("argc %d\n", argc); exit(1); } string dirPath = argv[1]; boost::property_tree::ptree *confReader = new boost::property_tree::ptree(); boost::property_tree::ini_parser::read_ini(dirPath+"mpi.conf", *confReader); string section = "Master."; // int validBatchSize = confReader->get<int>(section + "validation_batch_size"); int nSendMax = confReader->get<int>(section + "max_iteration_number"); // Step 1.2 Initialize model section = "LSTM."; openblas_set_num_threads(1); int max_openmp_threads = confReader->get<int>(section + "max_threads"); omp_set_num_threads(max_openmp_threads); omp_set_nested(0); printf("MASTER openmp threads: max threads %d, nested %d\n", omp_get_max_threads(), omp_get_nested()); RecurrentNN *rnn = new RNNLSTM(confReader, section); int paramSize = rnn->m_paramSize; printf("paramSize: %d\n", paramSize); // Step 1.3: Allocate master memory float *params = new float[paramSize]; float *grad = new float[paramSize]; // Step 1.4: Initialize params rnn->initParams(params); // Step 1.5: Initialize SGD Solver section = "SGD."; sgdBase *sgdSolver = initSgdSolver(confReader, section, paramSize); printf("MASTER: finish step 1\n"); // Step 1.6: Load cross-validation data // section = "ValidationData."; // DataFactory *dataset = initDataFactory(confReader, section); // int numSample = dataset->getNumberOfData(); // int dataSize = dataset->getDataSize(); // int labelSize = dataset->getLabelSize(); // float *data = new float[validBatchSize * dataSize]; // float *label = new float[validBatchSize * labelSize]; /**************************************************************** * Step 2: Seed the slaves * (1) Broadcast 
paramSize to all slaves * (2) Send the same initial params with WORKTAG to all slaves ****************************************************************/ int nProc; MPI_Comm_size(MPI_COMM_WORLD, &nProc); int nSlave = nProc - 1; MPI_Bcast(¶mSize, 1, MPI_INT, ROOT, MPI_COMM_WORLD); int nSend = 0; int nRecv = 0; for (int rank = 1; rank < nProc; ++rank) { MPI_Send(params, paramSize, MPI_FLOAT, rank, WORKTAG, MPI_COMM_WORLD); nSend++; } printf("MASTER: finish step 2\n"); /**************************************************************** * Step 3: Paralleled training * Receive mini-batch grad from *ANY* slave * Update params based received grad * Re-send params to slave to process next mini-batch ****************************************************************/ MPI_Status status; // TEMP while loop condition while (nSend < nSendMax) { MPI_Recv(grad, paramSize, MPI_FLOAT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); nRecv++; sgdSolver->updateParams(params, grad, status.MPI_SOURCE); // Send updated params to corresponding slave MPI_Send(params, paramSize, MPI_FLOAT, status.MPI_SOURCE, WORKTAG, MPI_COMM_WORLD); nSend++; } printf("MASTER: finish step 3\n"); /**************************************************************** * Step 4: Stop the slaves ****************************************************************/ // Step 4.1: Receive all dispatched but irreceived grad result while (nRecv < nSend) { MPI_Recv(grad, paramSize, MPI_FLOAT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); sgdSolver->updateParams(params, grad, status.MPI_SOURCE); nRecv++; } // Step 4.2: Send STOPTAG to all slaves for (int rank = 1; rank < nProc; ++rank) { MPI_Send(&rank, 1, MPI_INT, rank, STOPTAG, MPI_COMM_WORLD); } printf("MASTER: finish step 4\n"); /**************************************************************** * Step 5: Save trained parameters and clear things ****************************************************************/ section = "Master."; string saveFilename = 
confReader->get<string>(section + "save_filename"); ofstream savefile (saveFilename.c_str(), ios::out|ios::binary); if (savefile.is_open()) { savefile.write ((char *)params, sizeof(float) * paramSize); savefile.close(); } else { printf("Failed to open savefile\n"); exit(1); } delete [] params; delete [] grad; delete confReader; delete sgdSolver; delete rnn; }
int test_openmp1(int argc, char *argv[]) #endif { short OK; if (argc>1) Verbose = 1; #ifdef _OPENMP omp_set_nested(-1); printf("%s%s%s\n", "Nested parallel blocks are ", omp_get_nested()?" ":"NOT ", "supported."); #endif MainThread(); #ifdef SPAWN_THREADS { pthread_t a_thr; pthread_t b_thr; int status; memset(&a_thr, 0, sizeof(a_thr)); /* [i_a] fix valid MSVC complaint about unitialized a_thr / b_thr */ memset(&b_thr, 0, sizeof(b_thr)); /* [i_a] fix valid MSVC complaint about unitialized a_thr / b_thr */ printf("%s:%d - %s - a_thr:%p - b_thr:%p\n", __FILE__,__LINE__,__FUNCTION__,a_thr.p,b_thr.p); status = pthread_create(&a_thr, NULL, _thread, (void*) 1 ); if ( status != 0 ) { printf("Failed to create thread 1\n"); return (-1); } status = pthread_create(&b_thr, NULL, _thread, (void*) 2 ); if ( status != 0 ) { printf("Failed to create thread 2\n"); return (-1); } status = pthread_join(a_thr, NULL); if ( status != 0 ) { printf("Failed to join thread 1\n"); return (-1); } printf("Joined thread1\n"); status = pthread_join(b_thr, NULL); if ( status != 0 ) { printf("Failed to join thread 2\n"); return (-1); } printf("Joined thread2\n"); } #endif // SPAWN_THREADS OK = 0; // Check that we have OpenMP before declaring things OK formally. #ifdef _OPENMP OK = 1; { short i; for (i=0;i<3;i++) OK &= ThreadOK[i]; } if (OK) printf("OMP : All looks good\n"); else printf("OMP : Error\n"); #else OK = 1; printf("OpenMP seems not enabled ...\n"); #endif return OK?0:1; }
const double*, const double*, const int*, const double*, const int*, const double*, double*, const int*); LIBXSTREAM_TARGET(mic) void process(LIBXSTREAM_INVAL(size_t) size, LIBXSTREAM_INVAL(size_t) nn, const size_t* idata, const double* adata, const double* bdata, double* cdata) { if (0 < LIBXSTREAM_GETVAL(size)) { static const double alpha = 1, beta = 1; static const char trans = 'N'; const int isize = static_cast<int>(size); const size_t base = idata[0]; #if defined(_OPENMP) && defined(MULTI_DGEMM_USE_NESTED) const int nthreads = omp_get_max_threads() / LIBXSTREAM_GETVAL(size); const int dynamic = omp_get_dynamic(), nested = omp_get_nested(); omp_set_dynamic(0); omp_set_nested(1); # pragma omp parallel for schedule(dynamic,1) num_threads(LIBXSTREAM_GETVAL(size)) #endif for (int i = 0; i < isize; ++i) { #if defined(_OPENMP) && defined(MULTI_DGEMM_USE_NESTED) omp_set_num_threads(nthreads); #endif LIBXSTREAM_ASSERT(base <= idata[i]); const size_t i0 = idata[i], i1 = (i + 1) < isize ? idata[i+1] : (i0 + LIBXSTREAM_GETVAL(nn)), n2 = i1 - i0, offset = i0 - base; const int n = static_cast<int>(std::sqrt(static_cast<double>(n2)) + 0.5); DGEMM(&trans, &trans, &n, &n, &n, &alpha, adata + offset, &n, bdata + offset, &n, &beta, cdata + offset, &n); } #if defined(_OPENMP) && defined(MULTI_DGEMM_USE_NESTED)
int main(int argc, char *argv[]) { Display *display; Window window; //initialization for a window int screen; //which screen /* set window size */ int width = atoi(argv[6]); int height = atoi(argv[7]); int xleft = atoi(argv[2]); int yleft = atoi(argv[4]); int xright = atoi(argv[3]); int yright = atoi(argv[5]); int NUM_THREADS = atoi(argv[1]); double xrange = xright - xleft; double yrange = yright - yleft; /* set window position */ int x = 0; int y = 0; int NUM_PROCS = omp_get_num_procs(); struct timeval tv1, tv2; double timeStart, timeEnd; gettimeofday(&tv1, NULL); timeStart = tv1.tv_sec * 1000000 + tv1.tv_usec; GC gc; printf("X Window is %sd\n", argv[8]); xflag = strcmp(argv[8], "enable"); omp_set_num_threads(NUM_THREADS); omp_set_nested(1); printf("Total %d threads functioning among %d processors\n", NUM_THREADS, NUM_PROCS); int nest = omp_get_nested(); printf("omp_nested is set to %d\n", nest); if (xflag == 0){ /* open connection with the server */ display = XOpenDisplay(NULL); if(display == NULL) { fprintf(stderr, "cannot open display\n"); return -1; } screen = DefaultScreen(display); /* border width in pixels */ int border_width = 0; /* create window */ window = XCreateSimpleWindow(display, RootWindow(display, screen), x, y, width, height, border_width, BlackPixel(display, screen), WhitePixel(display, screen)); /* create graph */ XGCValues values; long valuemask = 0; gc = XCreateGC(display, window, valuemask, &values); //XSetBackground (display, gc, WhitePixel (display, screen)); XSetForeground (display, gc, BlackPixel (display, screen)); XSetBackground(display, gc, 0X0000FF00); XSetLineAttributes (display, gc, 1, LineSolid, CapRound, JoinRound); /* map(show) the window */ XMapWindow(display, window); XSync(display, 0); } // Parameters Compl z, c; int repeats; double temp, lengthsq; int i, j; int fakewidth; int task; int localw = 0; int nlocal = 100; int tid; int width1; int judge=0; int cnt; for(cnt=0; cnt<NUM_THREADS; cnt++){ rowCnt[cnt] = 0; thgap[cnt] = 0; 
} #pragma omp parallel num_threads(NUM_THREADS) private(tid, temp, lengthsq, z, c, repeats, i, j) { tid = omp_get_thread_num(); printf("Thread %d!!\n", tid); #pragma omp for schedule(static, 1) for(i=0; i<width; i++) { for(j=0; j<height; j++) { gettimeofday(&thtv1[tid], NULL); thtimeStart[tid] = thtv1[tid].tv_sec * 1000000 + thtv1[tid].tv_usec; z.real = 0.0; z.imag = 0.0; c.real = xleft + (double)i * (xrange/(double)width); c.imag = yleft + (double)j * (yrange/(double)height); repeats = 0; lengthsq = 0.0; while(repeats < 100000 && lengthsq < 4.0) { temp = z.real*z.real - z.imag*z.imag + c.real; z.imag = 2*z.real*z.imag + c.imag; z.real = temp; lengthsq = z.real*z.real + z.imag*z.imag; repeats++; } #pragma omp critical { rowData[i][j] = repeats; rowCnt[tid]++; gettimeofday(&thtv2[tid], NULL); thtimeEnd[tid] = thtv2[tid].tv_sec * 1000000 + thtv2[tid].tv_usec; thgap[tid] += (thtimeEnd[tid]-thtimeStart[tid]) / CLOCKS_PER_SEC; } } } #pragma omp barrier } // Draw the graph if(xflag == 0){ for(i=0; i<width; i++) { for(j=0; j<height; j++) { XSetForeground (display, gc, 1024 * 1024 * (rowData[i][j] % 256)); XDrawPoint (display, window, gc, i, j); } } XFlush(display); } gettimeofday(&tv2, NULL); timeEnd = tv2.tv_sec * 1000000 + tv2.tv_usec; double gap = (timeEnd-timeStart) / CLOCKS_PER_SEC; printf("OOOOOOO Graph Drawing Done OOOOOO\n"); printf("Threads : %d\n", NUM_THREADS); printf("Running time : %lf\n", gap); printf("\n"); for(cnt=0; cnt<NUM_THREADS; cnt++){ printf("Thread %d computed %d points consuming %1f seconds\n", cnt, rowCnt[cnt], thgap[cnt]); } printf("\n"); FILE *outFile; outFile = fopen(argv[9], "a"); fprintf(outFile, "Threads : %d \n", NUM_THREADS); fprintf(outFile, "Running time : %lf\n\n", gap); fclose(outFile); sleep(5); return 0; }
/*
 * omp_set_nested 002: with nesting enabled, an outer team of one thread
 * opens inner teams of every size from 1 up to the initial maximum thread
 * count. Each inner thread increments its own slot of buf; afterwards
 * exactly the first `size` slots must hold 1 and the remaining ones 0.
 *
 * Returns 0 on success (or when nesting cannot be enabled) and 1 on
 * failure. Exits with 0 when only one thread is available and with 1 when
 * the buffer cannot be allocated.
 */
int main ()
{
  int thds, *buf;
  int errors = 0;

  thds = omp_get_max_threads ();
  if (thds == 1)
    {
      printf ("should be run this program on multi thread.\n");
      exit (0);
    }

  buf = (int *) malloc (sizeof(int) * (thds + 1));
  if (buf == NULL)
    {
      printf ("can not allocate memory.\n");
      exit (1);
    }

  omp_set_dynamic (0);
  omp_set_nested (1);

  if (omp_get_nested () == 0)
    {
      printf ("nested parallelism is not implement.\n");
    }
  else
    {
      omp_set_num_threads (1);

      #pragma omp parallel
      {
        int size, slot;

        /* The outer team must consist of thread 0 only. */
        if (omp_get_num_threads () != 1)
          {
            #pragma omp critical
            errors += 1;
          }
        if (omp_get_thread_num () != 0)
          {
            errors += 1;
          }

        for (size = 1; size <= thds; size++)
          {
            memset (buf, 0, sizeof(int) * (thds+1));
            omp_set_num_threads (size);

            #pragma omp parallel
            {
              int id = omp_get_thread_num ();

              if (omp_get_num_threads () != size)
                {
                  #pragma omp critical
                  errors += 1;
                }
              buf[id] += 1;
            }

            /* Slots below `size` were hit once, the others never. */
            for (slot = 0; slot <= thds; slot++)
              {
                const int expected = (slot < size) ? 1 : 0;

                if (buf[slot] != expected)
                  {
                    #pragma omp critical
                    errors += 1;
                  }
              }
          }
      }
    }

  if (errors != 0)
    {
      printf ("omp_set_nested 002 : FAILED\n");
      return 1;
    }

  printf ("omp_set_nested 002 : SUCCESS\n");
  return 0;
}
/*
 * Renders a black/white image of the Mandelbrot set (800x800 pixels, binary
 * PPM "P6", 3 bytes per pixel) into new1.ppm, and a blurred copy into
 * blur.ppm in which every pixel is the average of its existing neighbours
 * (up to 8; the pixel itself is not included).
 *
 * Rows of the Mandelbrot image are computed in an OpenMP parallel loop; the
 * blur and the file output are serial.
 *
 * Returns 0 on success, 1 if an output file cannot be opened or fully
 * written.
 */
int main()
{
    /* screen (integer) coordinates; an enum so the sizes are usable as
       array bounds in both C and C++ */
    enum { iXmax = 800, iYmax = 800 };

    /* world (double) coordinates = parameter plane */
    const double CxMin = -2.5;
    const double CxMax = 1.5;
    const double CyMin = -2.0;
    const double CyMax = 2.0;
    const double PixelWidth = (CxMax - CxMin) / iXmax;
    const double PixelHeight = (CyMax - CyMin) / iYmax;

    /* colour component (R or G or B) is coded from 0 to 255: 24 bit RGB */
    const int MaxColorComponentValue = 255;

    const int IterationMax = 200;
    /* bail-out value, radius of circle */
    const double EscapeRadius = 2;
    const double ER2 = EscapeRadius * EscapeRadius;

    const char *filename = "new1.ppm";
    const char *blurname = "blur.ppm";
    const char *comment = "# "; /* comment should start with # */

    /* About 1.9 MB each: kept in static storage instead of on the stack so
       the program does not depend on a large stack limit. */
    static unsigned char tablica[iYmax][iXmax][3];
    static unsigned char t2[iYmax][iXmax][3];

    FILE *fp;
    FILE *blurp;
    int iY;
    int i, j;
    int status = 0;

    omp_set_nested(1);
    printf("is nested :%d \n", omp_get_nested());

    /* create the output files in binary mode; stop if either is missing
       rather than passing a null stream to fprintf/fwrite */
    fp = fopen(filename, "wb");
    if (fp == NULL) {
        fprintf(stderr, "can not open %s\n", filename);
        return 1;
    }
    blurp = fopen(blurname, "wb");
    if (blurp == NULL) {
        fprintf(stderr, "can not open %s\n", blurname);
        fclose(fp);
        return 1;
    }

    /* write ASCII header to the files */
    fprintf(fp, "P6\n %s\n %d\n %d\n %d\n", comment, iXmax, iYmax, MaxColorComponentValue);
    fprintf(blurp, "P6\n %s\n %d\n %d\n %d\n", comment, iXmax, iYmax, MaxColorComponentValue);

    /* Compute the image.  All per-pixel state is declared inside the loop
       body, so it is private to each thread without copying uninitialised
       values in through firstprivate. */
    #pragma omp parallel for schedule(dynamic,10)
    for (iY = 0; iY < iYmax; iY++) {
        int iX;
        double Cy = CyMin + iY * PixelHeight;
        if (fabs(Cy) < PixelHeight / 2)
            Cy = 0.0; /* Main antenna */

        for (iX = 0; iX < iXmax; iX++) {
            const double Cx = CxMin + iX * PixelWidth;
            /* initial value of orbit = critical point Z = 0 */
            double Zx = 0.0;
            double Zy = 0.0;
            double Zx2 = 0.0; /* Zx*Zx */
            double Zy2 = 0.0; /* Zy*Zy */
            int Iteration;
            unsigned char shade;

            for (Iteration = 0; Iteration < IterationMax && (Zx2 + Zy2) < ER2; Iteration++) {
                Zy = 2 * Zx * Zy + Cy;
                Zx = Zx2 - Zy2 + Cx;
                Zx2 = Zx * Zx;
                Zy2 = Zy * Zy;
            }

            /* interior of the Mandelbrot set = black, exterior = white */
            shade = (Iteration == IterationMax) ? 0 : (unsigned char) MaxColorComponentValue;
            tablica[iY][iX][0] = shade; /* Red */
            tablica[iY][iX][1] = shade; /* Green */
            tablica[iY][iX][2] = shade; /* Blue */
        }
    }

    /* Blur: average the neighbours that actually exist.  A neighbour is
       skipped as soon as EITHER of its coordinates leaves the image, so no
       element outside tablica is read, and the sum is divided by the number
       of neighbours used so the result fits in one byte instead of wrapping
       modulo 256. */
    for (i = 0; i < iYmax; i++) {
        for (j = 0; j < iXmax; j++) {
            int sum[3] = { 0, 0, 0 };
            int neighbours = 0;
            int di, dj, ch;

            for (di = -1; di <= 1; di++) {
                const int ni = i + di;
                if (ni < 0 || ni >= iYmax)
                    continue;
                for (dj = -1; dj <= 1; dj++) {
                    const int nj = j + dj;
                    if (nj < 0 || nj >= iXmax)
                        continue;
                    if (di == 0 && dj == 0)
                        continue; /* the centre pixel is not part of the average */
                    for (ch = 0; ch < 3; ch++)
                        sum[ch] += tablica[ni][nj][ch];
                    neighbours++;
                }
            }

            /* neighbours is at least 3 (corner pixel) for this image size */
            for (ch = 0; ch < 3; ch++)
                t2[i][j][ch] = (unsigned char) (sum[ch] / neighbours);
        }
    }

    /* Write the pixel data.  Both arrays are contiguous in row, column,
       channel order, so one fwrite per image emits the same bytes as a
       pixel-by-pixel loop. */
    if (fwrite(tablica, 1, sizeof tablica, fp) != sizeof tablica) {
        fprintf(stderr, "can not write %s\n", filename);
        status = 1;
    }
    if (fwrite(t2, 1, sizeof t2, blurp) != sizeof t2) {
        fprintf(stderr, "can not write %s\n", blurname);
        status = 1;
    }

    if (fclose(fp) != 0)
        status = 1;
    if (fclose(blurp) != 0)
        status = 1;
    return status;
}
int main () { int d_o = omp_get_dynamic (); int n_o = omp_get_nested (); omp_sched_t s_o; int c_o; omp_get_schedule (&s_o, &c_o); int m_o = omp_get_max_threads (); omp_set_dynamic (1); omp_set_nested (1); omp_set_schedule (omp_sched_static, 2); omp_set_num_threads (4); int d = omp_get_dynamic (); int n = omp_get_nested (); omp_sched_t s; int c; omp_get_schedule (&s, &c); int m = omp_get_max_threads (); if (!omp_is_initial_device ()) abort (); #pragma omp target if (0) { omp_sched_t s_c; int c_c; omp_get_schedule (&s_c, &c_c); if (d_o != omp_get_dynamic () || n_o != omp_get_nested () || s_o != s_c || c_o != c_c || m_o != omp_get_max_threads ()) abort (); omp_set_dynamic (0); omp_set_nested (0); omp_set_schedule (omp_sched_dynamic, 4); omp_set_num_threads (2); if (!omp_is_initial_device ()) abort (); } if (!omp_is_initial_device ()) abort (); omp_sched_t s_c; int c_c; omp_get_schedule (&s_c, &c_c); if (d != omp_get_dynamic () || n != omp_get_nested () || s != s_c || c != c_c || m != omp_get_max_threads ()) abort (); #pragma omp target if (0) #pragma omp teams { omp_sched_t s_c; int c_c; omp_get_schedule (&s_c, &c_c); if (d_o != omp_get_dynamic () || n_o != omp_get_nested () || s_o != s_c || c_o != c_c || m_o != omp_get_max_threads ()) abort (); omp_set_dynamic (0); omp_set_nested (0); omp_set_schedule (omp_sched_dynamic, 4); omp_set_num_threads (2); if (!omp_is_initial_device ()) abort (); } if (!omp_is_initial_device ()) abort (); omp_get_schedule (&s_c, &c_c); if (d != omp_get_dynamic () || n != omp_get_nested () || s != s_c || c != c_c || m != omp_get_max_threads ()) abort (); return 0; }