/*
 * Compute the colour value for every linear pixel index in [start, end)
 * and store it in lines[0 .. end - start - 1].
 *
 * A linear index is split into a column (index / MAX_Y) and a row
 * (index % MAX_Y); these are mapped onto the complex-plane window that
 * starts at -2.0 (real) / -1.0 (imaginary) and spans 3.0 x 2.0.  The
 * iteration count returned by mandelbrot_point() is scaled by
 * 2^24 / max_iter to obtain the stored value.
 *
 * start, end     half-open range of linear pixel indices to compute
 * lines          output buffer with room for (end - start) values
 * max_values_sq  squared escape threshold forwarded to mandelbrot_point()
 * max_iter       iteration cap forwarded to mandelbrot_point()
 */
static void calc_lines(Uint32 start, Uint32 end, Uint32* lines,
                       double max_values_sq, Uint32 max_iter)
{
    const double step_x = 3.0 / (double)MAX_X;
    const double step_y = 2.0 / (double)MAX_Y;
    Uint32 idx;

#ifdef MANUAL
    VT_USER_START("calc_lines");
#endif

    for (idx = start; idx < end; idx++)
    {
        /* integer division / modulo: column and row of this pixel */
        double re = -2.0 + (idx / MAX_Y) * step_x;
        double im = -1.0 + (idx % MAX_Y) * step_y;
        Uint32 iterations = mandelbrot_point(re, im, max_values_sq, max_iter);
        Uint32 colour = (double)iterations / (double)max_iter * (1u << 24);

        lines[idx - start] = colour;
    }

#ifdef MANUAL
    VT_USER_END("calc_lines");
#endif
}
/*
 * Iterate z := z^2 + c, starting from z = 0, for the point
 * c = cx + i*cy.
 *
 * The iteration stops as soon as |z|^2 exceeds max_value_sq or after
 * max_iter steps, whichever comes first.
 *
 * Returns the number of steps that were performed (0 .. max_iter).
 */
static Uint32 mandelbrot_point(double cx, double cy, double max_value_sq, Uint32 max_iter)
{
    double re = 0;
    double im = 0;
    double norm_sq = 0;
    Uint32 count;

#ifdef MANUAL
    VT_USER_START("mandelbrot_point");
#endif

    for (count = 0; (count < max_iter) && (norm_sq <= max_value_sq); count++)
    {
        /* both components are computed from the old z before it is replaced */
        const double next_re = (re * re) - (im * im) + cx;
        const double next_im = 2 * re * im + cy;

        re = next_re;
        im = next_im;
        norm_sq = re * re + im * im;
    }

#ifdef MANUAL
    VT_USER_END("mandelbrot_point");
#endif

    return count;
}
static void StopTimer (int nr) { timeval time; gettimeofday (&time, 0); // tottimes[nr] += time.tv_sec + 1e-6 * time.tv_usec - starttimes[nr]; #pragma omp atomic tottimes[nr] += time.tv_sec + 1e-6 * time.tv_usec; VT_USER_END (const_cast<char*> (names[nr].c_str())); }
/**
 * Stop tracing at the point of the call.
 *
 * With GOOGLE_PROFILER defined this calls ProfilerStop() followed by
 * impl::profile_handler(NULL).  If VTRACE_SAMPLED is defined as well, the
 * "sampling" user region is closed and sample() is called afterwards.
 * Without GOOGLE_PROFILER the function does nothing.
 */
void stop_tracing_here()
{
#if defined(GOOGLE_PROFILER)
    ProfilerStop();
    impl::profile_handler(NULL);

#  if defined(VTRACE_SAMPLED)
    VT_USER_END("sampling");
    sample();
#  endif
#endif
}
void Stop () { if (priority == 1) { // VT_USER_END_ID(timer_id); // VT_USER_END2(timer_id); VT_USER_END(name.c_str()); if (prev != NULL) // VT_USER_START_ID(prev -> timer_id); // VT_USER_START2(prev -> timer_id); VT_USER_START(prev -> name.c_str()); stack_top = prev; } }
void Start () { if (priority == 1) { prev = stack_top; stack_top = this; if (prev) // VT_USER_END_ID (prev -> timer_id); // VT_USER_END2 (prev -> timer_id); VT_USER_END (prev -> name.c_str()); // VT_USER_START_ID(timer_id); // VT_USER_START2 (timer_id); VT_USER_START (name.c_str()); } }
/*
 * Store one 32-bit colour value at pixel (x, y) of the surface pic.
 *
 * The row start is located through pic->pitch (the length of one surface
 * row in bytes) rather than by assuming that a row holds exactly MAX_X
 * pixels, because a surface row may be longer than width * 4 bytes.
 *
 * pic    target surface; its pixels are written as Uint32 values
 * x, y   pixel coordinates (column, row); not range-checked here
 * color  value written to the pixel
 */
static void draw_pixel(SDL_Surface* pic, Uint32 x, Uint32 y, Uint32 color)
{
    Uint8* row;

#ifdef MANUAL
    VT_USER_START("draw_pixel");
#endif

    /* byte address of the first pixel of row y */
    row = (Uint8*)pic->pixels + y * pic->pitch;
    ((Uint32*)row)[x] = color;

#ifdef MANUAL
    VT_USER_END("draw_pixel");
#endif
}
/*
 * Copy a MAX_X x MAX_Y field of colour values onto the surface pic.
 *
 * field is read column by column: the value for pixel (x, y) is
 * field[x * MAX_Y + y].  Every value is written with draw_pixel().
 */
static void draw(SDL_Surface* pic, Uint32* field)
{
    Uint32 col, row;
    const Uint32* src = field;

#ifdef MANUAL
    VT_USER_START("draw");
#endif

    /* src walks field sequentially, which matches index col * MAX_Y + row */
    for (col = 0; col < MAX_X; col++)
    {
        for (row = 0; row < MAX_Y; row++)
        {
            draw_pixel(pic, col, row, *src);
            src++;
        }
    }

#ifdef MANUAL
    VT_USER_END("draw");
#endif
}
// Stop timer number nr: add the clock() ticks elapsed since starttimes[nr]
// to tottimes[nr] and close the trace region named names[nr].
// NOTE(review): starttimes[nr] is presumably recorded by the matching
// start routine -- confirm.
static void StopTimer (int nr)
{
  tottimes[nr] += clock()-starttimes[nr];
  // VT_USER_END is handed a non-const char*, hence the const_cast
  VT_USER_END (const_cast<char*> (names[nr].c_str()));
}
/* * Out-of-core gemms: * - Z' XR * - Z' Y * Z is m x m * The other matrix is m x n */ void ooc_gemm( int m, int n, int ooc_b, double *Z, char *in, char *out, int threshold, const char *obj_type, char *obj_name, int namelength, int nthreads_avg ) { /* Files */ FILE *fp_in = fgls_fopen( in, "rb" ); FILE *fp_out = fgls_fopen( out, "wb" ); /* OOC Problem dimensions */ /*size_t max_elems_per_buffer = 1L << 26; // 64MElems, 512 MBs*/ /*max_elems_per_buffer = max_elems_per_buffer - max_elems_per_buffer % n;*/ /*size_t num_cols_per_buff = max_elems_per_buffer / n;*/ /* Asynchronous IO data structures */ double *in_comp, *out_comp; double_buffering db_in, db_out; // B, C double_buffering_init( &db_in, ooc_b * m * sizeof(double), fp_in, NULL ); // _fp, cf not needed in this case double_buffering_init( &db_out, ooc_b * m * sizeof(double), fp_out, NULL ); // _fp, cf not needed in this case /* BLAS constants */ double ONE = 1.0; double ZERO = 0.0; /* Read first piece of "in" */ double_buffering_read( &db_in, IO_BUFF, MIN( (size_t)ooc_b * m, (size_t)m * n ) * sizeof(double), 0); double_buffering_swap( &db_in ); int cur_n; int i; for ( i = 0; i < n; i += ooc_b ) { /* Read next piece of "in" */ size_t nbytes = i + ooc_b > n ? 1 : MIN( ooc_b * m, ( n - (size_t)( i + ooc_b ) ) * m ) * sizeof(double); off_t offset = i + ooc_b > n ? 
0 : (off_t)(i + ooc_b) * m * sizeof(double); double_buffering_read( &db_in, IO_BUFF, nbytes, offset ); /* Wait for current piece of "in" */ #if VAMPIR VT_USER_START("OOC_GEMM_WAIT"); #endif double_buffering_wait( &db_in, COMP_BUFF ); #if VAMPIR VT_USER_END("OOC_GEMM_WAIT"); #endif /* Compute */ in_comp = double_buffering_get_comp_buffer( &db_in ); out_comp = double_buffering_get_comp_buffer( &db_out ); cur_n = MIN( ooc_b, (n - i) ); /*printf("Compute\n");*/ // Sanity check average( in_comp, m, cur_n, threshold, obj_type, &obj_name[i*namelength], namelength, 1, nthreads_avg ); #if VAMPIR VT_USER_START("OOC_GEMM"); #endif /*printf("\nPRE: "); print_timestamp(); fflush( stdout );*/ dgemm_("T", "N", &m, &cur_n, &m, &ONE, Z, &m, in_comp, &m, &ZERO, out_comp, &m); /*printf("\nPOST: "); print_timestamp(); fflush( stdout );*/ #if VAMPIR VT_USER_END("OOC_GEMM"); #endif /* Wait until previous piece of "out" is written */ if ( i > 0) double_buffering_wait( &db_out, IO_BUFF ); /* Write current piece of "out" */ double_buffering_write( &db_out, COMP_BUFF, MIN( ooc_b * m, (size_t)(n - i) * m ) * sizeof(double), (off_t)i * m * sizeof(double) ); /* Swap buffers */ double_buffering_swap( &db_in ); double_buffering_swap( &db_out ); } /* Wait for the remaining io calls issued */ double_buffering_wait( &db_in, COMP_BUFF ); double_buffering_wait( &db_out, IO_BUFF ); /* Clean-up */ double_buffering_destroy( &db_in ); double_buffering_destroy( &db_out ); fclose( fp_in ); fclose( fp_out ); }
int main(int argc, char *argv[]) { int rank, size, next, prev, message; #ifdef MANUAL VT_USER_START("main"); #endif /* Start up MPI */ MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); /* Calculate the rank of the next process in the ring. Use the modulus operator so that the last process "wraps around" to rank zero. */ next = (rank + 1) % size; prev = (rank + size - 1) % size; /* If we are the "master" process (i.e., MPI_COMM_WORLD rank 0), put the number of times to go around the ring in the message. */ if (0 == rank) { message = NRING; printf("Process 0 sending %d to %d, tag %d (%d processes in ring)\n", message, next, TAG, size); MPI_Send(&message, 1, MPI_INT, next, TAG, MPI_COMM_WORLD); printf("Process 0 sent to %d\n", next); } /* Pass the message around the ring. The exit mechanism works as follows: the message (a positive integer) is passed around the ring. Each time it passes rank 0, it is decremented. When each processes receives a message containing a 0 value, it passes the message on to the next process and then quits. By passing the 0 message first, every process gets the 0 message and can quit normally. */ while (1) { #ifdef MANUAL VT_USER_START("ring_loop"); #endif MPI_Recv(&message, 1, MPI_INT, prev, TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE); if (0 == rank) { --message; printf("Process 0 decremented value: %d\n", message); } MPI_Send(&message, 1, MPI_INT, next, TAG, MPI_COMM_WORLD); if (0 == message) { printf("Process %d exiting\n", rank); break; } #ifdef MANUAL VT_USER_END("ring_loop"); #endif } /* The last process does one extra send to process 0, which needs to be received before the program can exit */ if (0 == rank) { MPI_Recv(&message, 1, MPI_INT, prev, TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } /* All done */ MPI_Finalize(); #ifdef MANUAL VT_USER_END("main"); #endif return 0; }
int main(int argc, char* argv[]) { int numprocs, rank, edge, pixel_count, start, end; double max_values_sq; Uint32 max_iter; #ifdef MANUAL VT_USER_START("main"); #endif MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &numprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); if(numprocs <= 1) { fprintf(stderr, "%s: error: requires at least two MPI processes", argv[0]); #ifdef MANUAL VT_USER_END("main"); #endif return 1; } max_values_sq = 4.0; max_iter = 5000; edge = (MAX_X * MAX_Y) / (numprocs - 1); if(rank > 0) { int i = rank - 1; Uint32* pixels; start = i * edge; end = (i == numprocs - 2) ? MAX_X * MAX_Y : (i + 1) * edge; pixel_count = end - start; pixels = malloc(pixel_count * sizeof(Uint32)); calc_lines(start, end, pixels, max_values_sq, max_iter); MPI_Send((void*)pixels, pixel_count, MPI_INT, 0, 0, MPI_COMM_WORLD); free(pixels); } else /* rank == 0 */ { int i, recv_count = (edge + 1); Uint32* field = malloc(MAX_X * MAX_Y * sizeof(Uint32)); Uint32* fieldpos; SDL_Surface* pic; SDL_Event event; MPI_Status status; for(i = 1; i < numprocs; i++) { start = (i - 1) * edge; end = (i == numprocs - 1) ? MAX_X * MAX_Y : i * edge; pixel_count = end - start; recv_count = pixel_count; fieldpos = field+start; MPI_Recv(fieldpos, recv_count, MPI_INT, i, MPI_ANY_TAG, MPI_COMM_WORLD, &status); } SDL_Init(SDL_INIT_EVERYTHING); pic = SDL_SetVideoMode(MAX_X, MAX_Y, 32, SDL_HWSURFACE | SDL_DOUBLEBUF); SDL_WM_SetCaption("Mandelbrot", "Mandelbrot"); draw(pic, field); SDL_Flip(pic); do { SDL_Delay(50); SDL_PollEvent(&event); } while( event.type != SDL_QUIT && event.type != SDL_KEYDOWN ); SDL_FreeSurface(pic); SDL_Quit(); free(field); } MPI_Finalize(); #ifdef MANUAL VT_USER_END("main"); #endif return 0; }
/*
 * Cholesky-based solution of the
 * sequence of Feasible Generalized Least-Squares problem
 * in the context of GWAS:
 *
 * For every j in [0, t) the matrix
 *     M = res_sigma[j] * ( h2[j] * Phi + (1 - h2[j]) * I )
 * is Cholesky-factorised, and XL and the current Y are updated with
 * inv(L).  The XR's are then processed in blocks of at most x_b; for every
 * XR in a block a p x p system is assembled and solved, and one output
 * record of size_one_b_record doubles is produced:
 *     [0, p)    solution vector
 *     [p, 2p)   square roots of the diagonal of the inverted p x p matrix
 *     [2p, ..)  its strictly lower triangle, column by column
 * A record is filled with NaN when the p x p factorisation fails.
 *
 * XR, Y and the output records are moved through double_buffering objects
 * so that IO is issued around the computation.
 *
 * cf  problem configuration (passed by value; cf.y_b is forced to 1)
 *
 * Returns 0.  A failed dpotrf on M or a failed dpotri is reported through
 * error_msg( err, 1 ) (presumably terminating the program -- confirm).
 */
int fgls_chol( FGLS_config_t cf )
{
    int n = cf.n,
        m = cf.m,
        p = cf.p,
        t = cf.t,
        x_b = cf.x_b,
        /*y_b = cf.y_b,*/
        wXL = cf.wXL,
        wXR = cf.wXR;
    /* In-core operands */
    double *Phi;
    double *M;
    double *ests;
    double *h2;
    double *res_sigma;
    double alpha;
    double beta;

    /* Out-of-core operands */
    double *Bij; // Auxiliary variables

    /* Reusable data thanks to constant XL */
    double *XL;
    double *XL_orig; // XL and a copy (XL is overwritten at every iteration of j)
    double *B_t;  // Top part of b ( in inv(S) b )
    double *V_tl; // Top-Left part of V

    /* BLAS / LAPACK constants */
    double ZERO = 0.0;
    double ONE = 1.0;
    int iONE = 1;
    /* LAPACK error value */
    int info;

    /* iterators and auxiliar vars */
    int ib, i, j, k, l; // size_t
    int nn = cf.n * cf.n; // size_t
    size_t size_one_b_record = p + (p*(p+1))/2;

    // Threading
    int id;
    double *tmpBs, *tmpVs; // Buffer with one B and one V per thread
    double *oneB, *oneV;   // Each thread pointer to its B and V

    if ( cf.y_b != 1 )
    {
        fprintf(stderr, "\n[Warning] y_b not used (set to 1)\n");
        cf.y_b = 1;
    }

    /* Memory allocation */
    // In-core
    build_SPD_Phi( cf.n, cf.Z, cf.W, cf.Phi );
    Phi = cf.Phi;
    M = ( double * ) fgls_malloc ( (size_t)cf.n * cf.n * sizeof(double) );
    ests = cf.ests;

    h2 = ests;
    res_sigma = &ests[2*cf.t];

    XL_orig = cf.XL;
    /* size computed in size_t, like the allocation of M above */
    XL   = ( double * ) fgls_malloc ( (size_t)cf.wXL * cf.n * sizeof(double) );
    B_t  = ( double * ) fgls_malloc ( cf.wXL * sizeof(double) );
    V_tl = ( double * ) fgls_malloc ( cf.wXL * cf.wXL * sizeof(double) );

    // Temporary storage prior to copying in db_B
    tmpBs = ( double * ) fgls_malloc ( cf.p * cf.num_threads * sizeof(double) );
    tmpVs = ( double * ) fgls_malloc ( cf.p * cf.p * cf.num_threads * sizeof(double) );

    /* Files and pointers for out-of-core */
    double *XR_comp, *Y_comp, *B_comp;

    /* Asynchronous IO data structures */
    double_buffering db_XR, db_Y, db_B;
    double_buffering_init( &db_XR, (size_t)cf.n * cf.wXR * cf.x_b * sizeof(double),
                           cf.XR, &cf ); // _fp
    double_buffering_init( &db_Y,  (size_t)cf.n * cf.y_b * sizeof(double),
                           cf.Y,  &cf );
    double_buffering_init( &db_B,  (size_t)size_one_b_record * cf.x_b * cf.y_b * sizeof(double),
                           cf.B,  &cf );

#if VAMPIR
    VT_USER_START("READ_X");
#endif
    /* Read first block of XR's */
    double_buffering_read_XR( &db_XR, IO_BUFF, 0, (size_t)MIN( cf.x_b, cf.m ) - 1 );
    double_buffering_swap( &db_XR );
#if VAMPIR
    VT_USER_END("READ_X");
#endif
#if VAMPIR
    VT_USER_START("READ_Y");
#endif
    /* Read first Y */
    double_buffering_read_Y( &db_Y, IO_BUFF, 0, 0 );
    double_buffering_swap( &db_Y );
#if VAMPIR
    VT_USER_END("READ_Y");
#endif

    int iter = 0;
    for ( j = 0; j < t; j++ )
    {
        /* Set the number of threads for the multi-threaded BLAS */
        set_multi_threaded_BLAS( cf.num_threads );

#if VAMPIR
        VT_USER_START("READ_Y");
#endif
        /* Read next Y (wraps around to the first one after the last) */
        size_t next_j = (j+1) >= t ? 0 : j+1;
        double_buffering_read_Y( &db_Y, IO_BUFF, next_j, next_j );
#if VAMPIR
        VT_USER_END("READ_Y");
#endif

#if VAMPIR
        VT_USER_START("COMP_J");
#endif
        /* M := sigma * ( h^2 Phi + (1 - h^2) I ) */
        memcpy( M, Phi, (size_t)n * n * sizeof(double) );
        alpha = res_sigma[j] * h2[j];
        beta  = res_sigma[j] * (1 - h2[j]);
        dscal_(&nn, &alpha, M, &iONE);
        for ( i = 0; i < n; i++ )
            M[i*n + i] = M[i*n + i] + beta;

        /* L * L' = M */
        dpotrf_(LOWER, &n, M, &n, &info);
        if (info != 0)
        {
            char err[STR_BUFFER_SIZE];
            snprintf(err, STR_BUFFER_SIZE, "dpotrf(M) failed (info: %d)", info);
            error_msg(err, 1);
        }

        /* XL := inv(L) * XL */
        memcpy( XL, XL_orig, (size_t)wXL * n * sizeof(double) );
        dtrsm_(LEFT, LOWER, NO_TRANS, NON_UNIT, &n, &wXL, &ONE, M, &n, XL, &n);

#if VAMPIR
        VT_USER_START("WAIT_Y");
#endif
        // Wait until current Y is available for computation
        double_buffering_wait( &db_Y, COMP_BUFF );
#if VAMPIR
        VT_USER_END("WAIT_Y");
#endif

        /* y := inv(L) * y */
        Y_comp = double_buffering_get_comp_buffer( &db_Y );
        // Sanity check
        // NOTE(review): the name offset does not depend on j, while the
        // SNP check below uses (n+ib) -- confirm this is intended.
        average( Y_comp, n, 1, cf.threshold, "TRAIT", &cf.Y_fvi->fvi_data[n*NAMELENGTH], NAMELENGTH, 0 );
        dtrsv_(LOWER, NO_TRANS, NON_UNIT, &n, M, &n, Y_comp, &iONE);

        /* B_t := XL' * y */
        dgemv_(TRANS, &n, &wXL, &ONE, XL, &n, Y_comp, &iONE, &ZERO, B_t, &iONE);

        /* V_tl := XL' * XL */
        dsyrk_(LOWER, TRANS, &wXL, &n, &ONE, XL, &n, &ZERO, V_tl, &wXL);
#if VAMPIR
        VT_USER_END("COMP_J");
#endif

        /* Solve for x_b X's at once */
        for (ib = 0; ib < m; ib += x_b)
        {
#if VAMPIR
            VT_USER_START("READ_X");
#endif
            /* Read next block of XR's (wraps around to the first block) */
            size_t next_x_from = ((size_t)ib + x_b) >= m ? 0 : (size_t)ib + x_b;
            size_t next_x_to   = ((size_t)ib + x_b) >= m ? MIN( (size_t)x_b, (size_t)m ) - 1 :
                                                           next_x_from + MIN( (size_t)x_b, (size_t)m - next_x_from ) - 1;
            double_buffering_read_XR( &db_XR, IO_BUFF, next_x_from, next_x_to );
#if VAMPIR
            VT_USER_END("READ_X");
#endif

#if VAMPIR
            VT_USER_START("WAIT_X");
#endif
            /* Wait until current block of XR's is available for computation */
            double_buffering_wait( &db_XR, COMP_BUFF );
#if VAMPIR
            VT_USER_END("WAIT_X");
#endif

            /* Set the number of threads for the multi-threaded BLAS */
            set_multi_threaded_BLAS( cf.num_threads );

#if VAMPIR
            VT_USER_START("COMP_IB");
#endif
            /* XR := inv(L) XR */
            XR_comp = double_buffering_get_comp_buffer( &db_XR );
            // Auxiliar variables
            int x_inc = MIN(x_b, m - ib);
            int rhss  = wXR * x_inc;
            // Sanity check
            average( XR_comp, n, x_inc, cf.threshold, "SNP", &cf.XR_fvi->fvi_data[(n+ib)*NAMELENGTH], NAMELENGTH, 1 );
            // Computation
            dtrsm_(LEFT, LOWER, NO_TRANS, NON_UNIT, &n, &rhss, &ONE, M, &n, XR_comp, &n);
#if VAMPIR
            VT_USER_END("COMP_IB");
#endif

#if CHOL_MIX_PARALLELISM
            /* Set the number of threads for the multi-threaded BLAS to 1.
             * The innermost loop is parallelized using OPENMP */
            set_single_threaded_BLAS();
#endif
#if VAMPIR
            VT_USER_START("COMP_I");
#endif
            B_comp = double_buffering_get_comp_buffer( &db_B );
            /* k and l are loop counters declared at function scope; both
             * are used inside the loop body and therefore must be private
             * to each thread. */
#if CHOL_MIX_PARALLELISM
            #pragma omp parallel for private(Bij, oneB, oneV, i, k, l, info, id) schedule(static) num_threads(cf.num_threads)
#endif
            for (i = 0; i < x_inc; i++)
            {
                /* per-thread scratch space for the p-vector and p x p matrix */
                id = omp_get_thread_num();
                oneB = &tmpBs[ id * p ];
                oneV = &tmpVs[ id * p * p ];
                Bij = &B_comp[i * size_one_b_record];

                // Building B
                // Copy B_T
                memcpy(oneB, B_t, wXL * sizeof(double));
                // B_B := XR' * y
                dgemv_("T", &n, &wXR, &ONE, &XR_comp[i * wXR * n], &n, Y_comp, &iONE, &ZERO, &oneB[wXL], &iONE);

                // Building V
                // Copy V_TL
                for( k = 0; k < wXL; k++ )
                    dcopy_(&wXL, &V_tl[k*wXL], &iONE, &oneV[k*p], &iONE); // V_TL
                // V_BL := XR' * XL
                dgemm_("T", "N", &wXR, &wXL, &n, &ONE, &XR_comp[i * wXR * n], &n, XL, &n, &ZERO, &oneV[wXL], &p); // V_BL
                // V_BR := XR' * XR
                dsyrk_("L", "T", &wXR, &n, &ONE, &XR_comp[i * wXR * n], &n, &ZERO, &oneV[wXL * p + wXL], &p); // V_BR

                // B := inv(V) * B
                dpotrf_(LOWER, &p, oneV, &p, &info);
                if (info != 0)
                {
                    /* factorisation failed: mark the whole record as NaN */
                    for ( k = 0; k < size_one_b_record; k++ )
                        Bij[k] = 0.0/0.0; //nan("char-sequence");
                    continue;
                }
                dtrsv_(LOWER, NO_TRANS, NON_UNIT, &p, oneV, &p, oneB, &iONE);
                dtrsv_(LOWER,    TRANS, NON_UNIT, &p, oneV, &p, oneB, &iONE);

                /* V := res_sigma * inv( X' inv(M) X) */
                dpotri_(LOWER, &p, oneV, &p, &info);
                if (info != 0)
                {
                    char err[STR_BUFFER_SIZE];
                    snprintf(err, STR_BUFFER_SIZE, "dpotri failed (info: %d)", info);
                    error_msg(err, 1);
                }

                // Copy output
                for ( k = 0; k < p; k++ )
                    Bij[k] = (float) oneB[k];
                for ( k = 0; k < p; k++ )
                    Bij[p+k] = (float)sqrt(oneV[k*p+k]);
                int idx = 0;
                for ( k = 0; k < p-1; k++ ) // Cols of V
                    for ( l = k+1; l < p; l++ ) // Rows of V
                    {
                        Bij[p+p+idx] = (float)oneV[k*p+l];
                        idx++;
                    }
#if 0
                printf("Chi square: %.6f\n", ( (oneB[p-1] / Bij[p+p-1]) * (oneB[p-1] / Bij[p+p-1]) ) );
#endif
            }
#if VAMPIR
            VT_USER_END("COMP_I");
#endif

#if VAMPIR
            VT_USER_START("WAIT_BV");
#endif
            /* Wait until the previous blocks of B's and V's are written */
            if ( iter > 0)
                double_buffering_wait( &db_B, IO_BUFF );
#if VAMPIR
            VT_USER_END("WAIT_BV");
#endif
            /* Write current blocks of B's and V's */
#if VAMPIR
            VT_USER_START("WRITE_BV");
#endif
            double_buffering_write_B( &db_B, COMP_BUFF, ib, ib+x_inc - 1, j, j );
#if VAMPIR
            VT_USER_END("WRITE_BV");
#endif

            /* Swap buffers */
            double_buffering_swap( &db_XR );
            double_buffering_swap( &db_B  );
            iter++;
        }
        /* Swap buffers */
        double_buffering_swap( &db_Y );
    }

#if VAMPIR
    VT_USER_START("WAIT_ALL");
#endif
    /* Wait for the remaining IO operations issued */
    double_buffering_wait( &db_XR, COMP_BUFF );
    double_buffering_wait( &db_Y,  COMP_BUFF );
    double_buffering_wait( &db_B,  IO_BUFF );
#if VAMPIR
    VT_USER_END("WAIT_ALL");
#endif

    /* Clean-up */
    free( M );

    free( XL );
    free( B_t  );
    free( V_tl );

    free( tmpBs );
    free( tmpVs );

    double_buffering_destroy( &db_XR );
    double_buffering_destroy( &db_Y  );
    double_buffering_destroy( &db_B  );

    return 0;
}
// Close the user trace region identified by `name`.
// The unnamed boost::mpl::true_ parameter is not used in the body;
// NOTE(review): presumably a tag selecting this overload when tracing
// is enabled -- confirm against the caller.
void end(boost::mpl::true_)
{
  VT_USER_END(name.c_str());
  // std::cout << "vpt_end(" << N << ")\n";
}