/*
 * Program entry point for the parallel prime sieve.
 *
 * Usage: <program> N
 *
 * Parses the upper bound N from the command line into the global N, stores
 * the number of available processors in the global P, prints a warning when
 * the first processor's block would be smaller than sqrt(N), and then runs
 * the SPMD function bspsieve().
 *
 * Returns: does not return normally; the process ends through exit(0).
 */
int main(int argc, char **argv){
    /* Hand the SPMD function and the command line to the BSP library. */
    bsp_init(bspsieve, argc, argv);

    /* sequential part */
    /* A missing, surplus or non-numeric argument is an invocation error.
     * The sscanf result must be checked: on a failed conversion N would
     * otherwise silently keep whatever value it had before. */
    if (argc != 2 || sscanf(argv[1], "%lld", &N) != 1) {
        printf("Usage: %s N\n", argv[0]);
        bsp_abort("Incorrect invocation.\n");
    }
    printf("max prime requested = %lld\n", N);

    P = bsp_nprocs(); /* maximum amount of procs */
    if (blockSize(P, 0, N) < sqrt(N))
        printf("WARNING: such a large P (%d) with relatively small N (%lld) is inefficient. \n Choosing a lower P is recommended.\n\n", P, N);
    printf("Using %d processors. \n", P);

    /* SPMD part */
    bspsieve();

    /* sequential part */
    exit(0);
} /* end main */
void bsp_get( const MCBSP_PROCESSOR_INDEX_DATATYPE pid, const void * const source, const MCBSP_BYTESIZE_TYPE offset_in, void * const destination, const MCBSP_BYTESIZE_TYPE size_in ) { //library internals work with size_t only; convert if necessary const size_t offset = (size_t) offset_in; const size_t size = (size_t) size_in; //get init data struct mcbsp_thread_data * const data = mcbsp_internal_prefunction(); //build request struct mcbsp_communication_request request; //record source address const unsigned long int globalIndex = mcbsp_util_address_map_get( &(data->local2global), source ); const struct mcbsp_util_address_table_entry * entry = mcbsp_util_address_table_get( &(data->init->global2local), globalIndex, pid ); if( offset + size > entry->size ) { fprintf( stderr, "Error: bsp_get would go out of bounds at source processor (offset=%ld, size=%ld, while registered memory area is %ld bytes)!\n", offset, size, entry->size ); bsp_abort( "Aborting due to BSP primitive call with invalid arguments." ); } request.source = ((char*)(entry->address)) + offset; //record destination request.destination = destination; //record length request.length = size; //record payload request.payload = NULL; //record request mcbsp_util_stack_push( &(data->queues[ data->bsp_id ]), &request ); }
/*
 * Allocates an uninitialised vector of n ints.
 *
 * n == 0 yields NULL without allocating. When malloc fails, bsp_abort is
 * called with an out-of-memory message.
 *
 * Returns the new vector (or NULL for n == 0).
 */
int *vecalloci(int n)
{
    int *vec;

    /* The empty vector is represented by NULL. */
    if (n == 0)
        return NULL;

    vec = (int *)malloc(n*SZINT);
    if (vec == NULL)
        bsp_abort("vecalloci: not enough memory");

    return vec;
}
/*
 * Allocates an uninitialised vector of n doubles.
 *
 * A request for zero elements returns NULL and allocates nothing; a failed
 * malloc is reported through bsp_abort.
 */
double *vecallocd(int n)
{
    /* Only touch the heap when there is something to allocate. */
    double *vec = (n == 0) ? NULL : (double *)malloc(n*SZDBL);

    if (n != 0 && vec == NULL)
        bsp_abort("vecallocd: not enough memory");

    return vec;
}
/*
 * vecallocd: allocate a vector of doubles of length n.
 *
 * Returns NULL when n is 0. Otherwise returns n*SZDBL bytes of
 * uninitialised storage, calling bsp_abort if the allocation fails.
 */
double *vecallocd(int n){
    double *result = NULL;

    if (n != 0){
        result = (double *)malloc(n*SZDBL);
        if (result == NULL){
            bsp_abort("vecallocd: not enough memory");
        }
    }
    return result;
} /* end vecallocd */
/* NOTE(review): the statements up to "end vecalloculi" are the tail of a
 * function whose header is not visible here; judging from the closing
 * comment and the abort message it is the body of vecalloculi. The code is
 * left untouched.
 *
 * It yields NULL when n is 0; otherwise it allocates n*SZULL bytes cast to
 * ulong* and calls bsp_abort when malloc fails. */
    ulong *pi;
    if (n==0){
        pi= NULL;
    } else {
        pi= (ulong *)malloc(n*SZULL);
        if (pi==NULL) bsp_abort("vecalloculi: not enough memory");
    }
    return pi;
} /* end vecalloculi */

/* Allocates an uninitialised vector of n ints.
 * Returns NULL when n is 0; otherwise returns n*SZINT bytes obtained from
 * malloc, calling bsp_abort if the allocation fails. */
int *vecalloci(int n){
    int *pi;
    if (n==0){
        pi= NULL;
    } else {
        pi= (int *)malloc(n*SZINT);
        if (pi==NULL) bsp_abort("vecalloci: not enough memory");
    }
    return pi;
} /* end vecalloci */
/*
 * Allocates an uninitialised vector of n ulong elements.
 *
 * Returns NULL when n is 0. Calls bsp_abort when the request cannot be
 * satisfied, either because malloc fails or because n*SZULL does not fit
 * in a size_t.
 */
ulong *vecalloculi(ulong n)
{
    ulong *pi;

    if (n == 0)
        return NULL;

    /* Without this guard n*SZULL could wrap around and malloc would hand
     * back a buffer far smaller than the caller expects. */
    if (n > ((size_t)-1) / SZULL) {
        bsp_abort("vecalloculi: not enough memory");
        return NULL;
    }

    pi = (ulong *)malloc(n*SZULL);
    if (pi == NULL)
        bsp_abort("vecalloculi: not enough memory");

    return pi;
} /* end vecalloculi */
void bspinprod(){ double bspip(int p, int s, int n, double *x, double *y); int nloc(int p, int s, int n); double *x, alpha, time0, time1; int p, s, n, nl, i, iglob; bsp_begin(P); p= bsp_nprocs(); /* p = number of processors obtained */ s= bsp_pid(); /* s = processor number */ if (s==0){ printf("Please enter n:\n"); fflush(stdout); scanf("%d",&n); if(n<0) bsp_abort("Error in input: n is negative"); } bsp_push_reg(&n,SZINT); bsp_sync(); bsp_get(0,&n,0,&n,SZINT); bsp_sync(); bsp_pop_reg(&n); nl= nloc(p,s,n); x= vecallocd(nl); for (i=0; i<nl; i++){ iglob= i*p+s; x[i]= iglob+1; } bsp_sync(); time0=bsp_time(); alpha= bspip(p,s,n,x,x); bsp_sync(); time1=bsp_time(); printf("Processor %d: sum of squares up to %d*%d is %.lf\n", s,n,n,alpha); fflush(stdout); if (s==0){ printf("This took only %.6lf seconds.\n", time1-time0); fflush(stdout); } vecfreed(x); bsp_end(); } /* end bspinprod */
void bsp_direct_get( const MCBSP_PROCESSOR_INDEX_DATATYPE pid, const void * const source, const MCBSP_BYTESIZE_TYPE offset_in, void * const destination, const MCBSP_BYTESIZE_TYPE size_in ) { //library internals work with size_t only; convert if necessary const size_t offset = (size_t) offset_in; const size_t size = (size_t) size_in; //get init data struct mcbsp_thread_data * const data = mcbsp_internal_prefunction(); //get source address const unsigned long int globalIndex = mcbsp_util_address_map_get( &(data->local2global), source ); const struct mcbsp_util_address_table_entry * entry = mcbsp_util_address_table_get( &(data->init->global2local), globalIndex, pid ); if( offset + size > entry->size ) { fprintf( stderr, "Error: bsp_direct_get would go out of bounds at source processor (offset=%ld, size=%ld, while registered memory area is %ld bytes)!\n", offset, size, entry->size ); bsp_abort( "Aborting due to BSP primitive call with invalid arguments." ); } //perform direct get memcpy( destination, ((char*)(entry->address)) + offset, size ); }
void parallel_part() { int i, j; srand(1452764); //Matrix initilization float **matrix = (float**)calloc(N+2, sizeof(float*)); for (i=0; i<N+2; i++) { matrix[i] = (float*)calloc(N+2, sizeof(float)); } for (i=0; i<N+2; i++) { for (j=0; j<N+2; j++) { matrix[i][j] = (float)rand()/(float)RAND_MAX; //printf("row %d, coloum %d, element: %f\n", i, j, matrix[i][j]); } } //Parallel part bsp_begin(bsp_nprocs()); int pid, x, y, done; pid=x=y=done=0; int sqroot = (int)(sqrt(bsp_nprocs())); int size = (int)(N/sqroot); //side float Ai_jm1, Aim1_j, Ai_jp1, Aip1_j; Ai_jm1 = Aim1_j = Ai_jp1 = Aip1_j = 0.0; float temp, diff, convergence, total_diff; temp = convergence = 0.0; float *diffs = (float*)calloc(bsp_nprocs(), sizeof(float)); int counter= 0; //(N/sqrt(p)) is an integer assurance if ( N%sqroot!=0) { bsp_abort("N/sqrt(p) is not an integer.\nProgram Aborted.\n"); } //Initiliaze a piece of martix in decomposition float **sub_martix = (float**)calloc(size, sizeof(float*)); for (i=0; i<size; i++) { sub_martix[i] = (float*) calloc(size, sizeof(float)); } //Initiliaze borders float *upper = (float*)calloc(size, sizeof(float)); float *lower = (float*)calloc(size, sizeof(float)); float *left = (float*)calloc(size, sizeof(float)); float *right = (float*)calloc(size, sizeof(float)); float *overlap = (float*)calloc(size, sizeof(float)); bsp_push_reg(&diff, sizeof(float)); bsp_push_reg(upper, size*sizeof(float)); bsp_push_reg(lower, size*sizeof(float)); bsp_push_reg(left, size*sizeof(float)); bsp_push_reg(right, size*sizeof(float)); //Make each matrix and border available globally for (i=0; i<size; i++) { bsp_push_reg(sub_martix[i], size*sizeof(float)); } bsp_sync(); /*Processor 0 distributes the data*/ if (bsp_pid()==0) { for (pid = 0; pid<bsp_nprocs(); pid++) { //Determine which part of the original matrix x = pid/sqroot; y = pid%sqroot; //Then the processor 0 copy the data to each processor for (i=0; i<size; i++) { for (j=0; j<size; j++) { sub_martix[i][j] = 
matrix[x*size+i+1][y*size+j+1]; } } if (pid!=0) { for (i=0; i<size; i++) { bsp_put(pid, sub_martix[i], sub_martix[i], 0, size*sizeof(float)); } } } } bsp_sync(); if (bsp_pid()==0) { for (pid=0; pid<bsp_nprocs(); pid++) { x=pid/sqroot; x=pid%sqroot; //if the part is in 1st row if (x==0) { for (i=0; i<size; i++) { upper[i] = matrix[0][y*size+1+i]; } } //if the part is in leftmost column if (y==0) { for (i=0; i<size; i++) { left[i] = matrix[x*size+1+i][0]; } } //if the part is in last row if (x==sqroot-1) { for (i=0; i<size; i++) { lower[i] = matrix[N+1][y*size+1+i]; } } //if the part is in rightmost column if (y==1) { for (i=0; i<size; i++) { right[i] = matrix[x*size+1+i][N+1]; } } if (pid!=0) { bsp_put(pid, upper, upper, 0, size*sizeof(float)); bsp_put(pid, lower, lower, 0, size*sizeof(float)); bsp_put(pid, left, left, 0, size*sizeof(float)); bsp_put(pid, right, right, 0, size*sizeof(float)); } } } bsp_sync(); /* Computation */ while (!done) { pid = bsp_pid(); diff=0.0; total_diff=0.0; x = pid/sqroot; y = pid%sqroot; //printf("Now %d th round:", ++counter); if (x<sqroot-1) { for (i=0; i<size; i++) { overlap[i] = sub_martix[size-1][i]; } bsp_put(bsp_pid()+sqroot, overlap, upper, 0, size*sizeof(float)); } if (y<sqroot-1) { for (i=0; i<size; i++) { overlap[i]=sub_martix[i][size-1]; } bsp_put(bsp_pid()+1, overlap, left, 0, size*sizeof(float)); } if (x>0) { for (i=0; i<size; i++) { overlap[i]=sub_martix[0][i]; } bsp_put(bsp_pid()-sqroot, overlap, lower, 0, size*sizeof(float)); } if (y>0) { for (i=0; i<size; i++) { overlap[i]=sub_martix[i][0]; } bsp_put(bsp_pid()-1, overlap, right, 0, size*sizeof(float)); } bsp_sync(); for (i=0; i<size; i++) { for (j=0; j<size; j++) { temp = sub_martix[i][j]; if (i-1<0) { Aim1_j=upper[j]; } else { Aim1_j=sub_martix[i-1][j]; } if (i+1>size-1) { Aip1_j=lower[j]; } else { Aip1_j=sub_martix[i+1][j]; } if (j-1<0) { if (y!=0) { Ai_jm1 = left[size-1]; } else { Ai_jm1 = left[i]; } } else { Ai_jm1 = sub_martix[i][j-1]; } if (j+1>size-1) { if 
(y!=sqroot-1) { Ai_jp1 = right[0]; } else { Ai_jp1 = right[i]; } } else { Ai_jp1 = sub_martix[i][j+1]; } sub_martix[i][j] = 0.2*(sub_martix[i][j] + Ai_jm1 + Aim1_j + Ai_jp1 + Aip1_j); //printf("data is %f\n", sub_martix[i][j]); diff += fabs(sub_martix[i][j]-temp); } } //printf("Result from pid: %d: difference= %f \n", bsp_pid(), diff); bsp_sync(); for (i=0; i<bsp_nprocs(); i++) { bsp_get(i, &diff, 0, &diffs[i], sizeof(float)); } bsp_sync(); for (i=0; i<bsp_nprocs(); i++) { total_diff += diffs[i]; } bsp_sync(); convergence = (total_diff)/(float)(N*N); //printf("Current Convergence is %f\n", convergence); if (convergence<TOL) { done = 1; } bsp_sync(); } for (i=0; i<size; i++) { bsp_pop_reg(sub_martix[i]); } bsp_pop_reg(&diff); bsp_pop_reg(lower); bsp_pop_reg(upper); bsp_pop_reg(left); bsp_pop_reg(right); bsp_sync(); for (i=0; i<size; i++) { free(sub_martix[i]); } free(sub_martix); free(diffs); free(lower); free(upper); free(left); free(right); free(overlap); bsp_sync(); bsp_end(); for (i=0; i<N+2; i++) { free(matrix[i]); } free(matrix); }
void bspfft_test() { void bspfft( double * x, int n, int p, int s, int sign, double * w0, double * w, double * tw, int *rho_np, int *rho_p ); void bspfft_init( int n, int p, int s, double * w0, double * w, double * tw, int *rho_np, int *rho_p ); int k1_init( int n, int p ); int p, s, n, q, np, k1, j, jglob, it, *rho_np, *rho_p; double time0, time1, time2, ffttime, nflops, max_error, error_re, error_im, error, *Error, *x, *w0, *w, *tw; bsp_begin( P ); p = bsp_nprocs(); s = bsp_pid(); bsp_push_reg( &n, SZINT ); Error = vecallocd( p ); bsp_push_reg( Error, p * SZDBL ); bsp_sync(); if ( s == 0 ) { printf( "Please enter length n: \n" ); #ifdef _WIN32 scanf_s( "%d", &n ); #else scanf( "%d", &n ); #endif if ( n < 2 * p ) { bsp_abort( "Error in input: n < 2p" ); } for ( q = 1; q < p; q++ ) { bsp_put( q, &n, &n, 0, SZINT ); } } bsp_sync(); if ( s == 0 ) { printf( "FFT of vector of length %d using %d processors\n", n, p ); printf( "performing %d forward and %d backward transforms\n", NITERS, NITERS ); } /* Allocate, register, and initialize vectors */ np = n / p; x = vecallocd( 2 * np ); bsp_push_reg( x, 2 * np * SZDBL ); k1 = k1_init( n, p ); w0 = vecallocd( k1 ); w = vecallocd( np ); tw = vecallocd( 2 * np + p ); rho_np = vecalloci( np ); rho_p = vecalloci( p ); for ( j = 0; j < np; j++ ) { jglob = j * p + s; x[2 * j] = ( double )jglob; x[2 * j + 1] = 1.0; } bsp_sync(); time0 = bsp_time(); /* Initialize the weight and bit reversal tables */ for ( it = 0; it < NITERS; it++ ) { bspfft_init( n, p, s, w0, w, tw, rho_np, rho_p ); } bsp_sync(); time1 = bsp_time(); /* Perform the FFTs */ for ( it = 0; it < NITERS; it++ ) { bspfft( x, n, p, s, 1, w0, w, tw, rho_np, rho_p ); bspfft( x, n, p, s, -1, w0, w, tw, rho_np, rho_p ); } bsp_sync(); time2 = bsp_time(); /* Compute the accuracy */ max_error = 0.0; for ( j = 0; j < np; j++ ) { jglob = j * p + s; error_re = fabs( x[2 * j] - ( double )jglob ); error_im = fabs( x[2 * j + 1] - 1.0 ); error = sqrt( error_re * error_re + error_im * 
error_im ); if ( error > max_error ) { max_error = error; } } bsp_put( 0, &max_error, Error, s * SZDBL, SZDBL ); bsp_sync(); if ( s == 0 ) { max_error = 0.0; for ( q = 0; q < p; q++ ) { if ( Error[q] > max_error ) { max_error = Error[q]; } } } for ( j = 0; j < NPRINT && j < np; j++ ) { jglob = j * p + s; printf( "proc=%d j=%d Re= %f Im= %f \n", s, jglob, x[2 * j], x[2 * j + 1] ); } fflush( stdout ); bsp_sync(); if ( s == 0 ) { printf( "Time per initialization = %lf sec \n", ( time1 - time0 ) / NITERS ); ffttime = ( time2 - time1 ) / ( 2.0 * NITERS ); printf( "Time per FFT = %lf sec \n", ffttime ); nflops = 5 * n * log( ( double )n ) / log( 2.0 ) + 2 * n; printf( "Computing rate in FFT = %lf Mflop/s \n", nflops / ( MEGA * ffttime ) ); printf( "Absolute error= %e \n", max_error ); printf( "Relative error= %e \n\n", max_error / n ); } bsp_pop_reg( x ); bsp_pop_reg( Error ); bsp_pop_reg( &n ); bsp_sync(); vecfreei( rho_p ); vecfreei( rho_np ); vecfreed( tw ); vecfreed( w ); vecfreed( w0 ); vecfreed( x ); vecfreed( Error ); bsp_end(); } /* end bspfft_test */
void spmd( void ) { //parallel over three processes bsp_begin( 3 ); //test bsp_push_reg (results in next superstep) size_t localInt; bsp_push_reg( &localInt, sizeof( size_t ) ); checkLocalIntAddress[ bsp_pid() ] = &localInt; //check pid/nprocs, both using primitives as well as manually checkPcount[ bsp_pid() ] = (size_t)(bsp_nprocs()); pthread_mutex_lock( &test_mutex ); check++; checkP[ bsp_pid() ] = true; pthread_mutex_unlock( &test_mutex ); //nobody should be at superstep 0 if( superstep == 1 ) superstepOK = false; //test barrier synchronisation bsp_sync(); //note someone is at superstep 1 superstep = 1; //check bsp_time if( bsp_time() <= 0 ) bsp_abort( "FAILURE \t bsp_time returned 0 or less!\n" ); //set up a pop_reg, but should only take effect after the next sync //(testing the push_reg after this statement thus provides a free test) bsp_pop_reg( &localInt ); struct mcbsp_thread_data * const data = pthread_getspecific( mcbsp_internal_thread_data ); if( data->localsToRemove.top != 1 || data->localsToRemove.cap != 16 || *((void**)(data->localsToRemove.array)) != (void*)&localInt ) { fprintf( stderr, "FAILURE \t bsp_pop_reg did not push entry on the to-remove stack (%p != %p)!\n", *((void**)(data->localsToRemove.array)), (void*)&localInt ); mcbsp_util_fatal(); } //check push_reg for( unsigned char i=0; i<3; ++i ) { if( checkLocalIntAddress[ i ] != mcbsp_util_address_table_get( &(data->init->global2local), 0, i )->address ) { fprintf( stderr, "FAILURE \t bsp_push_reg did not register correct address!\n" ); mcbsp_util_fatal(); } } bsp_sync(); //check pop_reg for( unsigned char i=0; i<3; ++i ) { if( mcbsp_util_address_table_get( &(data->init->global2local), 0, i ) != NULL || data->localC != 0 ) { fprintf( stderr, "FAILURE \t bsp_pop_reg did not de-register correctly (entry=%p)!\n", mcbsp_util_address_table_get( &(data->init->global2local), 0, i )->address ); mcbsp_util_fatal(); } //localInt = *(size_t*)mcbsp_util_stack_pop( &(data->removedGlobals) ); } bsp_sync(); 
//going to test communication primitives on the following area size_t commTest[ 3 ]; commTest[ 0 ] = commTest[ 1 ] = ((size_t)bsp_pid()); commTest[ 2 ] = (size_t)(bsp_nprocs()); bsp_push_reg( &commTest, 3 * sizeof( size_t ) ); //make push valid bsp_sync(); //after this put, commTest[ 0 ] should equal bsp_pid, commTest[ 1, 2 ] should equal bsp_pid-1 mod bsp_nprocs bsp_put( (bsp_pid() + 1) % bsp_nprocs(), &commTest, &commTest, sizeof( size_t ), 2*sizeof( size_t) ); commTest[ 2 ] = ULONG_MAX; //this should not influence the result after sync. //test behind-the-scenes const struct mcbsp_util_stack queue = data->queues[ (bsp_pid() + 1) % bsp_nprocs() ]; size_t predicted_cap = predictCap( sizeof( struct mcbsp_message ) + 2 * sizeof( size_t) ); if( queue.cap != predicted_cap || queue.top != sizeof( struct mcbsp_message ) + 2 * sizeof( size_t) || queue.size != sizeof( struct mcbsp_message ) ) { fprintf( stderr, "FAILURE \t bsp_put did not adapt the communication queue as expected!\n(cap = %ld, top = %ld, size = %ld)\n", (size_t)queue.cap, (size_t)queue.top, (size_t)queue.size ); mcbsp_util_fatal(); } const struct mcbsp_message request = *((struct mcbsp_message*) ((char*)queue.array + queue.top - sizeof( struct mcbsp_message )) ); if( request.length != 2 * sizeof( size_t) ) { fprintf( stderr, "FAILURE \t bsp_put did not push a request of the expected length!\n(length = %ld)\n", (size_t)request.length ); mcbsp_util_fatal(); } const size_t * const chk_array = (size_t*) ((char*)queue.array + queue.top - sizeof( struct mcbsp_message ) - 2 * sizeof( size_t )); if( chk_array[ 0 ] != ((size_t)bsp_pid()) || chk_array[ 1 ] != ((size_t)bsp_pid()) ) { fprintf( stderr, "FAILURE \t bsp_put did not push an expected communication request!\n" ); mcbsp_util_fatal(); } //note there is no easy way to check request.destination; the top-level BSP test will handle that one bsp_sync(); //test for the above expectation after bsp_put, namely //commTest[ 0 ] should equal bsp_pid, commTest[ 1, 2 ] 
should equal bsp_pid-1 mod bsp_nprocs if( commTest[ 0 ] != ((size_t)bsp_pid()) || commTest[ 1 ] != (size_t)((bsp_pid()+bsp_nprocs()-1)%bsp_nprocs()) || commTest[ 2 ] != (size_t)((bsp_pid()+bsp_nprocs()-1)%bsp_nprocs()) ) { fprintf( stderr, "FAILURE \t array after bsp_put is not as expected! (%d: %ld %ld %ld))\n", bsp_pid(), commTest[ 0 ], commTest[ 1 ], commTest[ 2 ] ); mcbsp_util_fatal(); } //do a get on the next processor on the last element of commTest bsp_get( (bsp_pid() + 1) % bsp_nprocs(), &commTest, 2 * sizeof( size_t ), &(commTest[ 2 ]), sizeof( size_t ) ); //fill the expected value after the get to test non-buffering commTest[ 2 ] = ((size_t)bsp_pid()); //communicate bsp_sync(); //commTest[ 0 ] should equal bsp_pid, commTest[ 1 ] should equal bsp_pid-1, commTest[ 2 ] should be bsp_pid+1 if( commTest[ 0 ] != ((size_t)bsp_pid()) || commTest[ 1 ] != (size_t)((bsp_pid()+bsp_nprocs() - 1)%bsp_nprocs()) ) { fprintf( stderr, "FAILURE \t start of array after bsp_get changed! (%d: %ld %ld %ld))\n", bsp_pid(), commTest[ 0 ], commTest[ 1 ], commTest[ 2 ] ); mcbsp_util_fatal(); } if( commTest[ 2 ] != (size_t)((bsp_pid()+bsp_nprocs() + 1)%bsp_nprocs()) ) { fprintf( stderr, "FAILURE \t last element of array after bsp_get erroneous! 
(%d: %ld %ld %ld))\n", bsp_pid(), commTest[ 0 ], commTest[ 1 ], commTest[ 2 ] ); mcbsp_util_fatal(); } bsp_sync(); //test direct_get functionality size_t commTest2[ 3 ]; commTest2[ 0 ] = commTest[ 0 ]; //get commTest[1] from right neighbour bsp_direct_get( (bsp_pid() + 1) % bsp_nprocs(), &commTest, sizeof( size_t ), &(commTest2[ 1 ]), sizeof( size_t ) ); //get commTest[2] from left neighbour bsp_direct_get( (bsp_pid() + bsp_nprocs() - 1) % bsp_nprocs(), &commTest, 2 * sizeof( size_t ), &(commTest2[ 2 ]), sizeof( size_t ) ); //now everything should equal bsp_pid if( commTest2[ 0 ] != ((size_t)bsp_pid()) || commTest2[ 1 ] != ((size_t)bsp_pid()) || commTest2[ 2 ] != ((size_t)bsp_pid()) ) { fprintf( stderr, "FAILURE \t direct_get does not function properly! (%d: [%ld %ld %ld])\n", bsp_pid(), commTest2[ 0 ], commTest2[ 1 ], commTest2[ 2 ] ); mcbsp_util_fatal(); } //now test single BSMP message bsp_send( (bsp_pid() + 1) % bsp_nprocs(), NULL, &commTest, sizeof( size_t ) ); //check messages const struct mcbsp_util_stack queue1 = data->queues[ (bsp_pid() + 1) % bsp_nprocs() ]; const size_t new_predicted_cap = predictCap( sizeof( struct mcbsp_message ) + sizeof( size_t ) ); predicted_cap = predicted_cap > new_predicted_cap ? 
predicted_cap : new_predicted_cap; if( queue1.cap != predicted_cap || queue1.size != sizeof( struct mcbsp_message ) || queue1.top != sizeof( struct mcbsp_message ) + sizeof( size_t ) ) { fprintf( stderr, "FAILURE \t bsp_send did not adapt the communication queue as expected!\n(cap = %ld, size = %ld, top = %ld; prediction was %ld, %ld, %ld)\n", (size_t)queue1.cap, (size_t)queue1.size, (size_t)queue1.top, (size_t)predicted_cap, (size_t)(sizeof( struct mcbsp_message )), (size_t)(sizeof( struct mcbsp_message ) + sizeof( size_t )) ); mcbsp_util_fatal(); } const struct mcbsp_message request2 = *(struct mcbsp_message*) ((char*)queue1.array + queue1.top - sizeof( struct mcbsp_message )); if( request2.destination != NULL || request2.length != sizeof( size_t ) || // assumes tagSize = 0 *(size_t *)queue1.array != ((size_t)bsp_pid()) ) { fprintf( stderr, "FAILURE \t bsp_send did not push the expected communication request!\n(top = %ld, destination = %p, length = %ld, payload = %ld\n", (size_t)queue1.top, request2.destination, (size_t)request2.length, *(size_t *)queue1.array ); mcbsp_util_fatal(); } bsp_sync(); //inspect incoming BSMP queue (assuming tagSize = 0) predicted_cap = predictCap( sizeof( size_t ) + sizeof( size_t ) ); if( data->bsmp.cap != predicted_cap || data->bsmp.top != sizeof( size_t ) + sizeof( size_t ) || data->bsmp.size != sizeof( size_t ) ) { fprintf( stderr, "FAILURE \t BSMP queue after superstep with sends is not as expected!\n(cap = %ld, top = %ld, size = %ld; prediction was %ld, %ld, %ld)\n", (size_t)data->bsmp.cap, (size_t)data->bsmp.top, (size_t)data->bsmp.size, (size_t)predicted_cap, (size_t)(8 + sizeof( size_t )), (size_t)(data->bsmp.size) ); mcbsp_util_fatal(); } if( *(size_t*)(data->bsmp.array) != (size_t)((bsp_pid() + bsp_nprocs() - 1) % bsp_nprocs()) ) { fprintf( stderr, "FAILURE \t Value in BSMP queue is not correct!\n" ); mcbsp_util_fatal(); } //inspect using primitives MCBSP_NUMMSG_TYPE packets; MCBSP_BYTESIZE_TYPE packetSize; bsp_qsize( 
&packets, &packetSize ); if( packets != 1 || packetSize != sizeof( size_t ) ) { fprintf( stderr, "FAILURE \t bsp_qsize does not function correctly!\n" ); mcbsp_util_fatal(); } bsp_move( &commTest, sizeof( size_t ) ); if( commTest[ 0 ] != (size_t)(( bsp_pid() + bsp_nprocs() - 1 ) % bsp_nprocs()) ) { fprintf( stderr, "FAILURE \t bsp_move does not function correctly!\n" ); mcbsp_util_fatal(); } //check set_tagsize MCBSP_BYTESIZE_TYPE tsz = sizeof( size_t ); bsp_set_tagsize( &tsz ); if( tsz != 0 ) { fprintf( stderr, "FAILURE \t return value of bsp_set_tagsize is incorrect!\n" ); mcbsp_util_fatal(); } bsp_sync(); //check set_tagsize if( data->init->tagSize != sizeof( size_t ) ) { fprintf( stderr, "FAILURE \t bsp_set_tagsize failed!\n" ); mcbsp_util_fatal(); } commTest[ 0 ] = ((size_t)bsp_pid()); commTest[ 1 ] = 3; commTest[ 2 ] = 8 + ((size_t)bsp_pid()); for( unsigned char i = 0; i < bsp_nprocs(); ++i ) { bsp_send( i, commTest, &(commTest[1]), 2 * sizeof( size_t ) ); char * const test = (char*)(data->queues[ (size_t)i ].array) + data->queues[ (size_t)i ].top - sizeof( struct mcbsp_message ) - sizeof( size_t ); if( *(size_t*)test != *commTest ) { fprintf( stderr, "FAILURE \t BSMP tag did not get pushed correctly (reads %ld instead of %ld)!\n", *(size_t*)test, *commTest ); mcbsp_util_fatal(); } } bsp_sync(); MCBSP_BYTESIZE_TYPE status; size_t tag; for( unsigned char i = 0; i < bsp_nprocs(); ++i ) { bsp_get_tag( &status, &tag ); if( tag >= ((size_t)bsp_nprocs()) || status != 2 * sizeof( size_t ) ) { fprintf( stderr, "FAILURE \t error in BSMP tag handling! (tag=%ld, status=%ld)\n", tag, (size_t)status ); mcbsp_util_fatal(); } size_t *p_tag, *msg; if( bsp_hpmove( (void**)&p_tag, (void**)&msg ) != 2 * sizeof( size_t ) ) { fprintf( stderr, "FAILURE \t bsp_hpmove does not return correct payload length." 
); } if( msg[ 0 ] != 3 || *p_tag != tag ) { fprintf( stderr, "FAILURE \t bsp_hpmove does not contain correct message (tag=%ld, payload = %ld) which should be (%ld, 3).\n", *p_tag, msg[ 0 ], tag ); mcbsp_util_fatal(); } commTest[ tag ] = msg[ 1 ]; } for( unsigned short int i = 0; i < bsp_nprocs(); ++i ) { if( commTest[ i ] != (unsigned int)(8 + i) ) { fprintf( stderr, "FAILURE \t error in bsp_tag / bsp_(hp)move combination!\n" ); mcbsp_util_fatal(); } } bsp_sync(); #ifdef MCBSP_ALLOW_MULTIPLE_REGS //test multiple regs double mreg[17]; bsp_push_reg( &(mreg[0]), 7*sizeof( double ) ); bsp_sync(); double mregs = 1.3; bsp_put( (bsp_pid() + 1) % bsp_nprocs(), &mregs, &mreg, 6 * sizeof( double ), sizeof( double ) ); bsp_push_reg( &(mreg[0]), 17*sizeof( double ) ); bsp_sync(); bsp_push_reg( &(mreg[0]), 13*sizeof( double ) ); bsp_put( (bsp_pid() + 1) % bsp_nprocs(), &mregs, &mreg, 16 * sizeof( double ), sizeof( double ) ); bsp_sync(); if( mreg[ 6 ] != mreg[ 16 ] || mreg[ 6 ] != mregs ) { fprintf( stderr, "FAILURE \t error in bsp_put + multiple bsp_push_reg calls (%f,%f,%f,...,%f,%f)\n", mreg[ 5 ], mreg[ 6 ], mreg[ 7 ], mreg[ 15 ], mreg[ 16 ] ); mcbsp_util_fatal(); } bsp_pop_reg( &(mreg[0]) ); bsp_pop_reg( &(mreg[0]) ); bsp_sync(); bsp_put( (bsp_pid() + 1) % bsp_nprocs(), &mregs, &mreg, 2 * sizeof( double ), sizeof( double ) ); bsp_sync(); if( mreg[ 2 ] != mregs ) { fprintf( stderr, "FAILURE \t error in bsp_put + multiple bsp_push_reg + multiple bsp_pop_reg calls\n" ); mcbsp_util_fatal(); } #endif bsp_end(); }
void bspsieve(){ double time0, time1; ulong *x; // local list of candidates ulong *ks; //place for proc0 to store intermediate ks ulong n, nl, i, iglob; int s, p; ulong k; // the current largest sure-prime n = N+1; // copy global N and increase by 1. (only proc 1 knows this) // this is so the maximum array idx == N bsp_begin(P); p= bsp_nprocs(); /* p = number of processors obtained */ printf("Now we have %d processors.\n", p); s= bsp_pid(); /* s = processor number */ if (s==0){ if(n<0) bsp_abort("Error in input: n is negative"); ks = vecalloculi(p); } bsp_push_reg(&n,SZULL); bsp_sync(); bsp_get(0,&n,0,&n,SZULL); //everyone reads N from proc 0 bsp_sync(); bsp_pop_reg(&n); nl= blockSize(p,s,n); // how big must s block be? printf("P(%d) tries to alloc vec of %lld ulongs", s, nl); printf(", size would be = %lld Mb\n", nl*SZULL/1024/1024); x= vecalloculi(nl); for (i=0; i<nl; i++){ // start by assuming everything is prime, except 1 iglob= globalIdx(p,s,n,i); x[i]= iglob; } if(s==0) x[1]=0; bsp_sync(); time0=bsp_time(); k = 2; // begin work while( k*k <= n ) { bspmarkmultiples(p,s,n,k,x); k = nextPrime(p,s,n,k,x); bsp_push_reg(&k, SZULL); bsp_sync(); if(s==0) { ks[0] = k; // my k for(i=1;i<p; i++) { bsp_get(i, &k, 0, &ks[i], SZULL); } } bsp_sync(); if(s==0) { k = findMinimum(p,ks); } bsp_sync(); //broadcast minimum bsp_get(0,&k,0,&k,SZULL); bsp_sync(); bsp_pop_reg(&k); } // end work bsp_sync(); time1=bsp_time(); ulong primes= 0; //printf("Processor %lld primes: \n", s); for(i = 0; i < blockSize(p,s,n); i++) if( x[i] != 0) primes++; //do not print primes, just count them. printf("proc %d finds %lld primes.\n", s, primes); fflush(stdout); if (s==0){ printf("This took only %.6lf seconds.\n", time1-time0); fflush(stdout); vecfreeuli(ks); } vecfreeuli(x); bsp_end(); } /* end bspsieve */