/** * give control back to the scheduler after main() exits. This allows * remaining threads to continue running. * FIXME: we don't know whether user explicit calls exit() or main() normally returns * in the previous case, we should exit immediately, while in the later, we should * join other threads. * Overriding exit() does not work because normal returning from * main() also calls exit(). **/ static void exit_func(void) { // don't do anything if we're in a forked child process if( getpid() != capriccio_main_pid ) return; exit_func_done = 1; main_exited = 1; if( !exit_whole_program ) // this will block until all other threads finish thread_exit(NULL); // dump the blocking graph before we exit if( conf_dump_blocking_graph ) { tdebug("dumping blocking graph from exit_func()\n"); dump_blocking_graph(); } // FIXME: make sure to kill cloned children if( conf_dump_timing_info ) { if( main_timer.running ) stop_timer(&main_timer); if( scheduler_timer.running ) stop_timer(&scheduler_timer); if( app_timer.running ) stop_timer(&app_timer); print_timers(); } }
void* thread_disconnector() { if (DEBUG == 1) printf("DEBUG thread_disconnector: begin\n"); while (1) { int i=0; for (;i<connected_count;++i) { if (DEBUG==1) printf("DEBUG thread_disconnector: time: %f\n",get_timer_value(i)); if (get_timer_value(i) > 10000.0f) { if (DEBUG == 1) { printf("DEBUG thread_disconnector: client %d doesnt response\n",i); print_timers(); } sem_wait(&semaphore); // przesunac cl_names memcpy(cl_names+i, cl_names+i+1,sizeof(cl_names[i])*(connected_count-i-1)); // przesunac cl_addr memcpy(cl_addr+i, cl_addr+i+1,sizeof(cl_addr[i])*(connected_count-i-1)); // przesunac timery memcpy(timers_start+i, timers_start+i+1,sizeof(timers_start[i])*(connected_count-i-1)); --connected_count; sem_post(&semaphore); if (DEBUG == 1) { printf("DEBUG thread_disconnector: client %d shifting finished\n",i); print_timers(); } } } sleep(1); } }
/**
 * Check whether a client with the given name is currently connected.
 *
 * Scans cl_names[0 .. connected_count-1] for an entry equal to `name`.
 * On a match the client's timer is reset via reset_timer() and 1 is
 * returned; if no entry matches, 0 is returned.
 *
 * @param name  NUL-terminated client name to look for.
 * @return 1 if the name was found (and its timer reset), 0 otherwise.
 */
char is_connected (char* name)
{
    int slot = 0;

    if (DEBUG == 1) {
        printf("DEBUG is_connected: begin\n");
    }
    /* NOTE(review): print_timers() is called regardless of DEBUG, exactly
     * as in the original; confirm whether that is intended. */
    print_timers();

    while (slot < connected_count) {
        if (DEBUG == 1) {
            printf("DEBUG is_connected: 1%s %s1\n", name, cl_names[slot]);
        }

        if (strcmp(name, cl_names[slot]) != 0) {
            ++slot;
            continue;
        }

        /* known client: restart its timer */
        reset_timer(slot);
        return 1;
    }

    if (DEBUG == 1) {
        printf("DEBUG is_connected: end\n");
    }
    return 0;
}
int main(int argc, char *argv[]) { int i; int iter; double total_time, mflops; logical verified; char Class; if (argc == 1) { fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]); exit(-1); } //--------------------------------------------------------------------- // Run the entire problem once to make sure all data is touched. // This reduces variable startup costs, which is important for such a // short benchmark. The other NPB 2 implementations are similar. //--------------------------------------------------------------------- for (i = 1; i <= T_max; i++) { timer_clear(i); } setup(); setup_opencl(argc, argv); init_ui(&m_u0, &m_u1, &m_twiddle, dims[0], dims[1], dims[2]); compute_indexmap(&m_twiddle, dims[0], dims[1], dims[2]); compute_initial_conditions(&m_u1, dims[0], dims[1], dims[2]); fft_init(dims[0]); fft(1, &m_u1, &m_u0); //--------------------------------------------------------------------- // Start over from the beginning. Note that all operations must // be timed, in contrast to other benchmarks. 
//--------------------------------------------------------------------- for (i = 1; i <= T_max; i++) { timer_clear(i); } timer_start(T_total); if (timers_enabled) timer_start(T_setup); DTIMER_START(T_compute_im); compute_indexmap(&m_twiddle, dims[0], dims[1], dims[2]); DTIMER_STOP(T_compute_im); DTIMER_START(T_compute_ics); compute_initial_conditions(&m_u1, dims[0], dims[1], dims[2]); DTIMER_STOP(T_compute_ics); DTIMER_START(T_fft_init); fft_init(dims[0]); DTIMER_STOP(T_fft_init); if (timers_enabled) timer_stop(T_setup); if (timers_enabled) timer_start(T_fft); fft(1, &m_u1, &m_u0); if (timers_enabled) timer_stop(T_fft); for (iter = 1; iter <= niter; iter++) { if (timers_enabled) timer_start(T_evolve); evolve(&m_u0, &m_u1, &m_twiddle, dims[0], dims[1], dims[2]); if (timers_enabled) timer_stop(T_evolve); if (timers_enabled) timer_start(T_fft); fft(-1, &m_u1, &m_u1); if (timers_enabled) timer_stop(T_fft); if (timers_enabled) timer_start(T_checksum); checksum(iter, &m_u1, dims[0], dims[1], dims[2]); if (timers_enabled) timer_stop(T_checksum); } verify(NX, NY, NZ, niter, &verified, &Class); timer_stop(T_total); total_time = timer_read(T_total); if (total_time != 0.0) { mflops = 1.0e-6 * (double)NTOTAL * (14.8157 + 7.19641 * log((double)NTOTAL) + (5.23518 + 7.21113 * log((double)NTOTAL)) * niter) / total_time; } else { mflops = 0.0; } c_print_results("FT", Class, NX, NY, NZ, niter, total_time, mflops, " floating point", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7, clu_GetDeviceTypeName(device_type), device_name); if (timers_enabled) print_timers(); release_opencl(); fflush(stdout); return 0; }
static int realmain(void *carg) { unsigned arg = (uintptr_t)carg; /*c------------------------------------------------------------------- c-------------------------------------------------------------------*/ int i, ierr; /*------------------------------------------------------------------ c u0, u1, u2 are the main arrays in the problem. c Depending on the decomposition, these arrays will have different c dimensions. To accomodate all possibilities, we allocate them as c one-dimensional arrays and pass them to subroutines for different c views c - u0 contains the initial (transformed) initial condition c - u1 and u2 are working arrays c - indexmap maps i,j,k of u0 to the correct i^2+j^2+k^2 for the c time evolution operator. c-----------------------------------------------------------------*/ /*-------------------------------------------------------------------- c Large arrays are in common so that they are allocated on the c heap rather than the stack. This common block is not c referenced directly anywhere else. Padding is to avoid accidental c cache problems, since all array sizes are powers of two. c-------------------------------------------------------------------*/ static dcomplex u0[NZ][NY][NX]; static dcomplex pad1[3]; static dcomplex u1[NZ][NY][NX]; static dcomplex pad2[3]; static dcomplex u2[NZ][NY][NX]; static dcomplex pad3[3]; static int indexmap[NZ][NY][NX]; int iter; int nthreads = 1; double total_time, mflops; boolean verified; char class; omp_set_num_threads(arg); /*-------------------------------------------------------------------- c Run the entire problem once to make sure all data is touched. c This reduces variable startup costs, which is important for such a c short benchmark. The other NPB 2 implementations are similar. 
c-------------------------------------------------------------------*/ for (i = 0; i < T_MAX; i++) { timer_clear(i); } setup(); #pragma omp parallel { compute_indexmap(indexmap, dims[2]); #pragma omp single { compute_initial_conditions(u1, dims[0]); fft_init (dims[0][0]); } fft(1, u1, u0); } /* end parallel */ /*-------------------------------------------------------------------- c Start over from the beginning. Note that all operations must c be timed, in contrast to other benchmarks. c-------------------------------------------------------------------*/ for (i = 0; i < T_MAX; i++) { timer_clear(i); } timer_start(T_TOTAL); if (TIMERS_ENABLED == TRUE) timer_start(T_SETUP); #pragma omp parallel private(iter) firstprivate(niter) { compute_indexmap(indexmap, dims[2]); #pragma omp single { compute_initial_conditions(u1, dims[0]); fft_init (dims[0][0]); } if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_stop(T_SETUP); } if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_start(T_FFT); } fft(1, u1, u0); if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_stop(T_FFT); } for (iter = 1; iter <= niter; iter++) { if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_start(T_EVOLVE); } evolve(u0, u1, iter, indexmap, dims[0]); if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_stop(T_EVOLVE); } if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_start(T_FFT); } fft(-1, u1, u2); if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_stop(T_FFT); } if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_start(T_CHECKSUM); } checksum(iter, u2, dims[0]); if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_stop(T_CHECKSUM); } } #pragma omp single verify(NX, NY, NZ, niter, &verified, &class); #if defined(_OPENMP) #pragma omp master nthreads = omp_get_num_threads(); #endif /* _OPENMP */ } /* end parallel */ timer_stop(T_TOTAL); total_time = timer_read(T_TOTAL); if( total_time != 0.0) { mflops = 1.0e-6*(double)(NTOTAL) * (14.8157+7.19641*log((double)(NTOTAL)) + 
(5.23518+7.21113*log((double)(NTOTAL)))*niter) /total_time; } else { mflops = 0.0; } #ifdef BOMP backend_create_time(arg); #endif printf("Computetime %d %f\n", arg, total_time); printf("client done\n"); /* c_print_results("FT", class, NX, NY, NZ, niter, nthreads, */ /* total_time, mflops, " floating point", verified, */ /* NPBVERSION, COMPILETIME, */ /* CS1, CS2, CS3, CS4, CS5, CS6, CS7); */ if (TIMERS_ENABLED == TRUE) print_timers(); }