Esempio n. 1
0
/*
 * Program entry point of the parallel sieve.
 *
 * Registers bspsieve as the SPMD function, reads the upper bound N from the
 * single command-line argument, stores the number of available processors in
 * P, prints a warning when the block of processor 0 is smaller than sqrt(N),
 * and then runs the SPMD part.
 *
 * argc, argv: exactly one argument is expected, the bound N.
 *
 * The function ends with exit(0) once bspsieve() has returned. An invalid
 * invocation is reported through bsp_abort().
 */
int main(int argc, char **argv){

    bsp_init(bspsieve, argc, argv);

    /* sequential part */
    if (argc != 2)
    {
        printf("Usage: %s N\n", argv[0]);
        bsp_abort("Incorrect invocation.\n");
    }
    /* A non-numeric argument makes sscanf convert nothing; without this
       check N would silently keep whatever value it had before. */
    if (sscanf(argv[1], "%lld", &N) != 1)
    {
        printf("Usage: %s N\n", argv[0]);
        bsp_abort("Incorrect invocation.\n");
    }

    printf("max prime requested = %lld\n", N);
    P = bsp_nprocs(); // maximum amount of procs

    if ( blockSize(P, 0, N) < sqrt(N))
        printf("WARNING: such a large P (%d) with relatively small N (%lld) is inefficient. \n Choosing a lower P is recommended.\n\n", P, N);

    printf("Using %d processors. \n", P);

    /* SPMD part */
    bspsieve();

    /* sequential part */
    exit(0);

} /* end main */
Esempio n. 2
0
void bsp_get( const MCBSP_PROCESSOR_INDEX_DATATYPE pid, const void * const source,
	const MCBSP_BYTESIZE_TYPE offset_in, void * const destination,
	const MCBSP_BYTESIZE_TYPE size_in ) {
	//library internals work with size_t only; convert if necessary
	const size_t offset = (size_t) offset_in;
	const size_t size   = (size_t) size_in;

	//get init data
	struct mcbsp_thread_data * const data = mcbsp_internal_prefunction();

	//build request
	struct mcbsp_communication_request request;

	//record source address
	const unsigned long int globalIndex = mcbsp_util_address_map_get( &(data->local2global), source );
	const struct mcbsp_util_address_table_entry * entry = mcbsp_util_address_table_get( &(data->init->global2local), globalIndex, pid );
	if( offset + size > entry->size ) {
		fprintf( stderr, "Error: bsp_get would go out of bounds at source processor (offset=%ld, size=%ld, while registered memory area is %ld bytes)!\n", offset, size, entry->size );
		bsp_abort( "Aborting due to BSP primitive call with invalid arguments." );
	}
	request.source = ((char*)(entry->address)) + offset;

	//record destination
	request.destination = destination;

	//record length
	request.length = size;

	//record payload
	request.payload = NULL;

	//record request
	mcbsp_util_stack_push( &(data->queues[ data->bsp_id ]), &request );
}
Esempio n. 3
0
/* Allocates room for n ints and returns it; a request for zero elements
   yields NULL. bsp_abort is called when malloc cannot satisfy the request.
   The memory is not initialised. */
int *vecalloci(int n)
{
    int *vec = NULL;

    if (n != 0) {
        vec = (int *)malloc(n*SZINT);
        if (!vec) {
            bsp_abort("vecalloci: not enough memory");
        }
    }
    return vec;
}
Esempio n. 4
0
/* Returns an uninitialised vector of n doubles, or NULL when n is zero.
   A failed malloc is reported through bsp_abort. */
double *vecallocd(int n)
{
    double *block;

    if (n == 0)
        return NULL;

    block = (double *)malloc(n*SZDBL);
    if (block == NULL)
        bsp_abort("vecallocd: not enough memory");

    return block;
}
Esempio n. 5
0
double *vecallocd(int n){
    /* Hands out storage for n doubles (contents unspecified).
       n == 0 gives NULL; an allocation failure goes to bsp_abort. */
    double *vec = (n == 0) ? NULL : (double *)malloc(n*SZDBL);

    if (n != 0 && vec == NULL) {
        bsp_abort("vecallocd: not enough memory");
    }
    return vec;

} /* end vecallocd */
Esempio n. 6
0
    /* This function allocates a vector of integers of length n */
    ulong *pi;

    if (n==0){
        pi= NULL; 
    } else { 
        pi= (ulong *)malloc(n*SZULL);
        if (pi==NULL)
            bsp_abort("vecalloculi: not enough memory");
    }
    return pi;

} /* end vecalloculi */
int *vecalloci(int n){
    /* Integer vector allocator: n elements, not initialised.
       Zero length maps to NULL; running out of memory ends in bsp_abort. */
    int *result;

    if (n != 0){
        result = (int *)malloc(n*SZINT);
        if (result == NULL){
            bsp_abort("vecalloci: not enough memory");
        }
        return result;
    }
    return NULL;

} /* end vecalloci */
Esempio n. 7
0
ulong *vecalloculi(ulong n)
{
    /* This function allocates a vector of ulongs of length n.
       Returns NULL for n == 0. The memory is not initialised.
       bsp_abort is called when the request cannot be satisfied. */
    ulong *pi;

    if (n==0){
        pi= NULL; 
    } else { 
        /* n*SZULL can wrap around for a huge n, which would make malloc
           succeed with a block that is too small; refuse such requests. */
        if (n > ((size_t)-1)/SZULL)
            bsp_abort("vecalloculi: not enough memory");
        pi= (ulong *)malloc(n*SZULL);
        if (pi==NULL)
            bsp_abort("vecalloculi: not enough memory");
    }
    return pi;

} /* end vecalloculi */
Esempio n. 8
0
void bspinprod(){
    
    double bspip(int p, int s, int n, double *x, double *y);
    int nloc(int p, int s, int n);
    double *x, alpha, time0, time1;
    int p, s, n, nl, i, iglob;
    
    bsp_begin(P);
    p= bsp_nprocs(); /* p = number of processors obtained */ 
    s= bsp_pid();    /* s = processor number */ 
    if (s==0){
        printf("Please enter n:\n"); fflush(stdout);
        scanf("%d",&n);
        if(n<0)
            bsp_abort("Error in input: n is negative");
    }
    bsp_push_reg(&n,SZINT);
    bsp_sync();

    bsp_get(0,&n,0,&n,SZINT);
    bsp_sync();
    bsp_pop_reg(&n);

    nl= nloc(p,s,n);
    x= vecallocd(nl);
    for (i=0; i<nl; i++){
        iglob= i*p+s;
        x[i]= iglob+1;
    }
    bsp_sync(); 
    time0=bsp_time();

    alpha= bspip(p,s,n,x,x);
    bsp_sync();  
    time1=bsp_time();

    printf("Processor %d: sum of squares up to %d*%d is %.lf\n",
            s,n,n,alpha); fflush(stdout);
    if (s==0){
        printf("This took only %.6lf seconds.\n", time1-time0);
        fflush(stdout);
    }

    vecfreed(x);
    bsp_end();

} /* end bspinprod */
Esempio n. 9
0
void bsp_direct_get( const MCBSP_PROCESSOR_INDEX_DATATYPE pid, const void * const source,
        const MCBSP_BYTESIZE_TYPE offset_in, void * const destination,
	const MCBSP_BYTESIZE_TYPE size_in ) {
	//library internals work with size_t only; convert if necessary
	const size_t offset = (size_t) offset_in;
	const size_t size   = (size_t) size_in;
	
	//get init data
	struct mcbsp_thread_data * const data = mcbsp_internal_prefunction();

	//get source address
	const unsigned long int globalIndex = mcbsp_util_address_map_get( &(data->local2global), source );
	const struct mcbsp_util_address_table_entry * entry = mcbsp_util_address_table_get( &(data->init->global2local), globalIndex, pid );
	if( offset + size > entry->size ) {
		fprintf( stderr, "Error: bsp_direct_get would go out of bounds at source processor (offset=%ld, size=%ld, while registered memory area is %ld bytes)!\n", offset, size, entry->size );
		bsp_abort( "Aborting due to BSP primitive call with invalid arguments." );
	}

	//perform direct get
	memcpy( destination, ((char*)(entry->address)) + offset, size );
}
/*
 * SPMD routine: iterative five-point averaging of an N x N grid on a
 * sqrt(p) x sqrt(p) processor grid.
 *
 * An (N+2) x (N+2) matrix of pseudo-random values is built; its outer frame
 * acts as the fixed boundary. Processor 0 hands every processor a
 * size x size block (size = N/sqrt(p)) plus the pieces of the frame that
 * touch that block. Each round the processors exchange their edge rows and
 * columns with their four neighbours, replace every element by
 * 0.2*(element + four neighbours), and sum the absolute changes. The loop
 * stops once the summed change divided by N*N drops below TOL.
 *
 * Requirements checked at run time: the number of processors must be a
 * perfect square and N must be divisible by its square root.
 */
void parallel_part()
{
    int i, j;
    srand(1452764);

    //Matrix initialization
    float **matrix = (float**)calloc(N+2, sizeof(float*));
    for (i=0; i<N+2; i++) {
        matrix[i] = (float*)calloc(N+2, sizeof(float));
    }
    for (i=0; i<N+2; i++) {
        for (j=0; j<N+2; j++) {
            matrix[i][j] = (float)rand()/(float)RAND_MAX;
        }
    }

    //Parallel part
    bsp_begin(bsp_nprocs());
    int pid, x, y, done;
    pid=x=y=done=0;
    int sqroot = (int)(sqrt(bsp_nprocs()));
    int size = (int)(N/sqroot);    //side
    float Ai_jm1, Aim1_j, Ai_jp1, Aip1_j;
    Ai_jm1 = Aim1_j = Ai_jp1 = Aip1_j = 0.0;
    float temp, diff, convergence, total_diff;
    temp = convergence = 0.0;
    float *diffs = (float*)calloc(bsp_nprocs(), sizeof(float));

    //The block coordinates x = pid/sqroot, y = pid%sqroot only stay inside
    //the matrix when the processors form a full sqroot x sqroot grid
    if (sqroot*sqroot != bsp_nprocs()) {
        bsp_abort("The number of processors is not a perfect square.\nProgram Aborted.\n");
    }

    //(N/sqrt(p)) is an integer assurance
    if ( N%sqroot!=0) {
        bsp_abort("N/sqrt(p) is not an integer.\nProgram Aborted.\n");
    }

    //Initialize this processor's piece of the matrix in the decomposition
    float **sub_martix = (float**)calloc(size, sizeof(float*));
    for (i=0; i<size; i++) {
        sub_martix[i] = (float*) calloc(size, sizeof(float));
    }
    //Initialize borders (values of the four neighbouring edges) and the
    //scratch buffer used to send our own edges
    float *upper = (float*)calloc(size, sizeof(float));
    float *lower = (float*)calloc(size, sizeof(float));
    float *left = (float*)calloc(size, sizeof(float));
    float *right = (float*)calloc(size, sizeof(float));
    float *overlap = (float*)calloc(size, sizeof(float));

    bsp_push_reg(&diff, sizeof(float));
    bsp_push_reg(upper, size*sizeof(float));
    bsp_push_reg(lower, size*sizeof(float));
    bsp_push_reg(left, size*sizeof(float));
    bsp_push_reg(right, size*sizeof(float));

    //Make each matrix row available globally
    for (i=0; i<size; i++) {
        bsp_push_reg(sub_martix[i], size*sizeof(float));
    }
    bsp_sync();
    /*Processor 0 distributes the data*/
    if (bsp_pid()==0) {
        //Processor 0 stages every block in its own sub_martix before
        //sending it. Walking the processors from last to first makes the
        //final staged block the one that belongs to processor 0 itself;
        //bsp_put copies its source when called, so earlier blocks are safe.
        for (pid = bsp_nprocs()-1; pid>=0; pid--) {
            //Determine which part of the original matrix
            x = pid/sqroot;
            y = pid%sqroot;
            //Then the processor 0 copy the data to each processor
            for (i=0; i<size; i++) {
                for (j=0; j<size; j++) {
                    sub_martix[i][j] = matrix[x*size+i+1][y*size+j+1];
                }
            }
            if (pid!=0) {
                for (i=0; i<size; i++) {
                    bsp_put(pid, sub_martix[i], sub_martix[i], 0, size*sizeof(float));
                }
            }
        }
    }
    bsp_sync();

    if (bsp_pid()==0) {
        //Same staging trick as above: processor 0 is handled last so that
        //its own border arrays end up holding its own frame values.
        //Borders that face another block may carry stale values here; they
        //are overwritten by the neighbour exchange before their first use.
        for (pid = bsp_nprocs()-1; pid>=0; pid--) {
            x=pid/sqroot;
            y=pid%sqroot;

            //if the part is in 1st row
            if (x==0) {
                for (i=0; i<size; i++) {
                    upper[i] = matrix[0][y*size+1+i];
                }
            }
            //if the part is in leftmost column
            if (y==0) {
                for (i=0; i<size; i++) {
                    left[i] = matrix[x*size+1+i][0];
                }
            }
            //if the part is in last row
            if (x==sqroot-1) {
                for (i=0; i<size; i++) {
                    lower[i] = matrix[N+1][y*size+1+i];
                }
            }
            //if the part is in rightmost column
            if (y==sqroot-1) {
                for (i=0; i<size; i++) {
                    right[i] = matrix[x*size+1+i][N+1];
                }
            }

            if (pid!=0) {
                bsp_put(pid, upper, upper, 0, size*sizeof(float));
                bsp_put(pid, lower, lower, 0, size*sizeof(float));
                bsp_put(pid, left, left, 0, size*sizeof(float));
                bsp_put(pid, right, right, 0, size*sizeof(float));
            }
        }
    }
    bsp_sync();

    /* Computation */
    while (!done) {
        pid = bsp_pid();
        diff=0.0;
        total_diff=0.0;
        x = pid/sqroot;
        y = pid%sqroot;

        //Send our last row to the block below (it becomes its upper border)
        if (x<sqroot-1) {
            for (i=0; i<size; i++) {
                overlap[i] = sub_martix[size-1][i];
            }
            bsp_put(bsp_pid()+sqroot, overlap, upper, 0, size*sizeof(float));
        }
        //Send our last column to the block on the right (its left border)
        if (y<sqroot-1) {
            for (i=0; i<size; i++) {
                overlap[i]=sub_martix[i][size-1];
            }
            bsp_put(bsp_pid()+1, overlap, left, 0, size*sizeof(float));
        }
        //Send our first row to the block above (its lower border)
        if (x>0) {
            for (i=0; i<size; i++) {
                overlap[i]=sub_martix[0][i];
            }
            bsp_put(bsp_pid()-sqroot, overlap, lower, 0, size*sizeof(float));
        }
        //Send our first column to the block on the left (its right border)
        if (y>0) {
            for (i=0; i<size; i++) {
                overlap[i]=sub_martix[i][0];
            }
            bsp_put(bsp_pid()-1, overlap, right, 0, size*sizeof(float));
        }
        bsp_sync();

        for (i=0; i<size; i++) {
            for (j=0; j<size; j++) {
                temp = sub_martix[i][j];
                //neighbour above
                if (i-1<0) {
                    Aim1_j=upper[j];
                }
                else {
                    Aim1_j=sub_martix[i-1][j];
                }
                //neighbour below
                if (i+1>size-1) {
                    Aip1_j=lower[j];
                }
                else {
                    Aip1_j=sub_martix[i+1][j];
                }
                //neighbour to the left: left[] is indexed by row, both for
                //the fixed frame and for the column received from a neighbour
                if (j-1<0) {
                    Ai_jm1 = left[i];
                }
                else {
                    Ai_jm1 = sub_martix[i][j-1];
                }
                //neighbour to the right: right[] is indexed by row as well
                if (j+1>size-1) {
                    Ai_jp1 = right[i];
                }
                else {
                    Ai_jp1 = sub_martix[i][j+1];
                }
                sub_martix[i][j] = 0.2*(sub_martix[i][j]
                                        + Ai_jm1
                                        + Aim1_j
                                        + Ai_jp1
                                        + Aip1_j);
                diff += fabs(sub_martix[i][j]-temp);
            }
        }

        bsp_sync();

        //Every processor collects the local change of all processors
        for (i=0; i<bsp_nprocs(); i++) {
            bsp_get(i, &diff, 0, &diffs[i], sizeof(float));
        }
        bsp_sync();

        for (i=0; i<bsp_nprocs(); i++) {
            total_diff += diffs[i];
        }
        bsp_sync();
        convergence = (total_diff)/(float)(N*N);
        if (convergence<TOL) {
            done = 1;
        }
        bsp_sync();
    }

    for (i=0; i<size; i++) {
        bsp_pop_reg(sub_martix[i]);
    }
    bsp_pop_reg(&diff);
    bsp_pop_reg(lower);
    bsp_pop_reg(upper);
    bsp_pop_reg(left);
    bsp_pop_reg(right);
    bsp_sync();

    for (i=0; i<size; i++) {
        free(sub_martix[i]);
    }
    free(sub_martix);
    free(diffs);
    free(lower);
    free(upper);
    free(left);
    free(right);
    free(overlap);
    bsp_sync();
    bsp_end();
    for (i=0; i<N+2; i++) {
        free(matrix[i]);
    }
    free(matrix);

}
Esempio n. 11
0
void bspfft_test()
{
    void bspfft( double * x, int n, int p, int s, int sign, double * w0,
                 double * w, double * tw, int *rho_np, int *rho_p );
    void bspfft_init( int n, int p, int s, double * w0,
                      double * w, double * tw, int *rho_np, int *rho_p );
    int k1_init( int n, int p );

    int p, s, n, q, np, k1, j, jglob, it, *rho_np, *rho_p;
    double time0, time1, time2, ffttime, nflops,
           max_error, error_re, error_im, error,
           *Error, *x, *w0, *w, *tw;

    bsp_begin( P );
    p = bsp_nprocs();
    s = bsp_pid();

    bsp_push_reg( &n, SZINT );
    Error = vecallocd( p );
    bsp_push_reg( Error, p * SZDBL );
    bsp_sync();

    if ( s == 0 )
    {
        printf( "Please enter length n: \n" );

#ifdef _WIN32
        scanf_s( "%d", &n );
#else
        scanf( "%d", &n );
#endif

        if ( n < 2 * p )
        {
            bsp_abort( "Error in input: n < 2p" );
        }

        for ( q = 1; q < p; q++ )
        {
            bsp_put( q, &n, &n, 0, SZINT );
        }
    }

    bsp_sync();

    if ( s == 0 )
    {
        printf( "FFT of vector of length %d using %d processors\n", n, p );
        printf( "performing %d forward and %d backward transforms\n",
                NITERS, NITERS );
    }

    /* Allocate, register,  and initialize vectors */
    np = n / p;
    x = vecallocd( 2 * np );
    bsp_push_reg( x, 2 * np * SZDBL );
    k1 = k1_init( n, p );
    w0 = vecallocd( k1 );
    w =  vecallocd( np );
    tw = vecallocd( 2 * np + p );
    rho_np = vecalloci( np );
    rho_p =  vecalloci( p );

    for ( j = 0; j < np; j++ )
    {
        jglob = j * p + s;
        x[2 * j] = ( double )jglob;
        x[2 * j + 1] = 1.0;
    }

    bsp_sync();
    time0 = bsp_time();

    /* Initialize the weight and bit reversal tables */
    for ( it = 0; it < NITERS; it++ )
    {
        bspfft_init( n, p, s, w0, w, tw, rho_np, rho_p );
    }

    bsp_sync();
    time1 = bsp_time();

    /* Perform the FFTs */
    for ( it = 0; it < NITERS; it++ )
    {
        bspfft( x, n, p, s, 1, w0, w, tw, rho_np, rho_p );
        bspfft( x, n, p, s, -1, w0, w, tw, rho_np, rho_p );
    }

    bsp_sync();
    time2 = bsp_time();

    /* Compute the accuracy */
    max_error = 0.0;

    for ( j = 0; j < np; j++ )
    {
        jglob = j * p + s;
        error_re = fabs( x[2 * j] - ( double )jglob );
        error_im = fabs( x[2 * j + 1] - 1.0 );
        error = sqrt( error_re * error_re + error_im * error_im );

        if ( error > max_error )
        {
            max_error = error;
        }
    }

    bsp_put( 0, &max_error, Error, s * SZDBL, SZDBL );
    bsp_sync();

    if ( s == 0 )
    {
        max_error = 0.0;

        for ( q = 0; q < p; q++ )
        {
            if ( Error[q] > max_error )
            {
                max_error = Error[q];
            }
        }
    }

    for ( j = 0; j < NPRINT && j < np; j++ )
    {
        jglob = j * p + s;
        printf( "proc=%d j=%d Re= %f Im= %f \n", s, jglob, x[2 * j], x[2 * j + 1] );
    }

    fflush( stdout );
    bsp_sync();

    if ( s == 0 )
    {
        printf( "Time per initialization = %lf sec \n",
                ( time1 - time0 ) / NITERS );
        ffttime = ( time2 - time1 ) / ( 2.0 * NITERS );
        printf( "Time per FFT = %lf sec \n", ffttime );
        nflops = 5 * n * log( ( double )n ) / log( 2.0 ) + 2 * n;
        printf( "Computing rate in FFT = %lf Mflop/s \n",
                nflops / ( MEGA * ffttime ) );
        printf( "Absolute error= %e \n", max_error );
        printf( "Relative error= %e \n\n", max_error / n );
    }


    bsp_pop_reg( x );
    bsp_pop_reg( Error );
    bsp_pop_reg( &n );
    bsp_sync();

    vecfreei( rho_p );
    vecfreei( rho_np );
    vecfreed( tw );
    vecfreed( w );
    vecfreed( w0 );
    vecfreed( x );
    vecfreed( Error );
    bsp_end();

} /* end bspfft_test */
Esempio n. 12
0
File: bsp.c Progetto: jong42/git
/*
 * SPMD body of the library self-test, run on three BSP processes.
 *
 * Walks through the primitives one superstep at a time and, after each call,
 * inspects the library's internal bookkeeping (registration tables,
 * communication queues, BSMP queue) as well as the user-visible results:
 * bsp_push_reg / bsp_pop_reg, bsp_sync, bsp_time, bsp_put, bsp_get,
 * bsp_direct_get, bsp_send, bsp_qsize, bsp_move, bsp_set_tagsize,
 * bsp_get_tag, bsp_hpmove and, when MCBSP_ALLOW_MULTIPLE_REGS is defined,
 * repeated registration of the same address.
 *
 * Every failed expectation prints a "FAILURE" line to stderr and calls
 * mcbsp_util_fatal() (the bsp_time check uses bsp_abort instead).
 */
void spmd( void ) {
	//parallel over three processes
	bsp_begin( 3 );

	//test bsp_push_reg (results in next superstep)
	size_t localInt;
	bsp_push_reg( &localInt, sizeof( size_t ) );
	checkLocalIntAddress[ bsp_pid() ] = &localInt;

	//check pid/nprocs, both using primitives as well as manually
	checkPcount[ bsp_pid() ] = (size_t)(bsp_nprocs());
	pthread_mutex_lock( &test_mutex );
	check++;
	checkP[ bsp_pid() ] = true;
	pthread_mutex_unlock( &test_mutex );

	//nobody should be at superstep 0
	if( superstep == 1 )
		superstepOK = false;

	//test barrier synchronisation
	bsp_sync();

	//note someone is at superstep 1
	superstep = 1;

	//check bsp_time
	if( bsp_time() <= 0 )
		bsp_abort( "FAILURE \t bsp_time returned 0 or less!\n" );

	//set up a pop_reg, but should only take effect after the next sync
	//(testing the push_reg after this statement thus provides a free test)
	bsp_pop_reg( &localInt );
	struct mcbsp_thread_data * const data = pthread_getspecific( mcbsp_internal_thread_data );
	if( data->localsToRemove.top != 1 || data->localsToRemove.cap != 16 ||
		*((void**)(data->localsToRemove.array)) != (void*)&localInt ) {
		fprintf( stderr, "FAILURE \t bsp_pop_reg did not push entry on the to-remove stack (%p != %p)!\n",
			*((void**)(data->localsToRemove.array)), (void*)&localInt );
		mcbsp_util_fatal();
	}

	//check push_reg
	for( unsigned char i=0; i<3; ++i ) {
		if( checkLocalIntAddress[ i ] != mcbsp_util_address_table_get( &(data->init->global2local), 0, i )->address ) {
			fprintf( stderr, "FAILURE \t bsp_push_reg did not register correct address!\n" );
			mcbsp_util_fatal();
		}
	}

	bsp_sync();

	//check pop_reg
	for( unsigned char i=0; i<3; ++i ) {
		//the two conditions are reported separately: the entry may only be
		//dereferenced for the message when the lookup actually returned one
		if( mcbsp_util_address_table_get( &(data->init->global2local), 0, i ) != NULL ) {
			fprintf( stderr, "FAILURE \t bsp_pop_reg did not de-register correctly (entry=%p)!\n",
				mcbsp_util_address_table_get( &(data->init->global2local), 0, i )->address );
			mcbsp_util_fatal();
		}
		if( data->localC != 0 ) {
			fprintf( stderr, "FAILURE \t bsp_pop_reg did not de-register correctly (local registration count is nonzero)!\n" );
			mcbsp_util_fatal();
		}
		//localInt = *(size_t*)mcbsp_util_stack_pop( &(data->removedGlobals) );
	}

	bsp_sync();

	//going to test communication primitives on the following area
	size_t commTest[ 3 ];
	commTest[ 0 ] = commTest[ 1 ] = ((size_t)bsp_pid());
	commTest[ 2 ] = (size_t)(bsp_nprocs());
	bsp_push_reg( &commTest, 3 * sizeof( size_t ) );

	//make push valid
	bsp_sync();

	//after this put, commTest[ 0 ] should equal bsp_pid, commTest[ 1, 2 ] should equal bsp_pid-1 mod bsp_nprocs
	bsp_put( (bsp_pid() + 1) % bsp_nprocs(), &commTest, &commTest, sizeof( size_t ), 2*sizeof( size_t) );
	commTest[ 2 ] = ULONG_MAX; //this should not influence the result after sync.

	//test behind-the-scenes
	const struct mcbsp_util_stack queue = data->queues[ (bsp_pid() + 1) % bsp_nprocs() ];
	size_t predicted_cap = predictCap( sizeof( struct mcbsp_message ) + 2 * sizeof( size_t) );
	if( queue.cap != predicted_cap || queue.top != sizeof( struct mcbsp_message ) + 2 * sizeof( size_t) || queue.size != sizeof( struct mcbsp_message ) ) {
		fprintf( stderr, "FAILURE \t bsp_put did not adapt the communication queue as expected!\n(cap = %ld, top = %ld, size = %ld)\n",
			(size_t)queue.cap, (size_t)queue.top, (size_t)queue.size );
		mcbsp_util_fatal();
	}
	//the message header is read from the top of the queue, the payload sits directly before it
	const struct mcbsp_message request = *((struct mcbsp_message*) ((char*)queue.array + queue.top - sizeof( struct mcbsp_message )) );
	if( request.length != 2 * sizeof( size_t) ) {
		fprintf( stderr, "FAILURE \t bsp_put did not push a request of the expected length!\n(length = %ld)\n", (size_t)request.length );
		mcbsp_util_fatal();
	}
	const size_t * const chk_array = (size_t*) ((char*)queue.array + queue.top - sizeof( struct mcbsp_message ) - 2 * sizeof( size_t ));
	if( chk_array[ 0 ] != ((size_t)bsp_pid()) || chk_array[ 1 ] != ((size_t)bsp_pid()) ) {
		fprintf( stderr, "FAILURE \t bsp_put did not push an expected communication request!\n" );
		mcbsp_util_fatal();
	}
	//note there is no easy way to check request.destination; the top-level BSP test will handle that one

	bsp_sync();

	//test for the above expectation after bsp_put, namely
	//commTest[ 0 ] should equal bsp_pid, commTest[ 1, 2 ] should equal bsp_pid-1 mod bsp_nprocs
	if( commTest[ 0 ] != ((size_t)bsp_pid()) || 
		commTest[ 1 ] != (size_t)((bsp_pid()+bsp_nprocs()-1)%bsp_nprocs()) ||
		commTest[ 2 ] != (size_t)((bsp_pid()+bsp_nprocs()-1)%bsp_nprocs())
	) {
		fprintf( stderr, "FAILURE \t array after bsp_put is not as expected! (%d: %ld %ld %ld))\n", bsp_pid(), commTest[ 0 ], commTest[ 1 ], commTest[ 2 ] );
		mcbsp_util_fatal();
	}
	
	//do a get on the next processor on the last element of commTest
	bsp_get( (bsp_pid() + 1) % bsp_nprocs(), &commTest, 2 * sizeof( size_t ), &(commTest[ 2 ]), sizeof( size_t ) );

	//fill the expected value after the get to test non-buffering
	commTest[ 2 ] = ((size_t)bsp_pid());

	//communicate
	bsp_sync();

	//commTest[ 0 ] should equal bsp_pid, commTest[ 1 ] should equal bsp_pid-1, commTest[ 2 ] should be bsp_pid+1
	if( commTest[ 0 ] != ((size_t)bsp_pid()) || 
		commTest[ 1 ] != (size_t)((bsp_pid()+bsp_nprocs() - 1)%bsp_nprocs())
	) {
		fprintf( stderr, "FAILURE \t start of array after bsp_get changed! (%d: %ld %ld %ld))\n", bsp_pid(), commTest[ 0 ], commTest[ 1 ], commTest[ 2 ] );
		mcbsp_util_fatal();
	}
	if( commTest[ 2 ] != (size_t)((bsp_pid()+bsp_nprocs() + 1)%bsp_nprocs()) ) {
		fprintf( stderr, "FAILURE \t last element of array after bsp_get erroneous! (%d: %ld %ld %ld))\n", bsp_pid(), commTest[ 0 ], commTest[ 1 ], commTest[ 2 ] );
		mcbsp_util_fatal();
	}

	bsp_sync();

	//test direct_get functionality
	size_t commTest2[ 3 ];
	commTest2[ 0 ] = commTest[ 0 ];

	//get commTest[1] from right neighbour
	bsp_direct_get( (bsp_pid() + 1) % bsp_nprocs(), &commTest, sizeof( size_t ), &(commTest2[ 1 ]), sizeof( size_t ) );

	//get commTest[2] from left neighbour
	bsp_direct_get( (bsp_pid() + bsp_nprocs() - 1) % bsp_nprocs(), &commTest, 2 * sizeof( size_t ), &(commTest2[ 2 ]), sizeof( size_t ) );

	//now everything should equal bsp_pid
	if( commTest2[ 0 ] != ((size_t)bsp_pid()) || 
		commTest2[ 1 ] != ((size_t)bsp_pid()) || 
		commTest2[ 2 ] != ((size_t)bsp_pid())
	) {
		fprintf( stderr, "FAILURE \t direct_get does not function properly! (%d: [%ld %ld %ld])\n", bsp_pid(), commTest2[ 0 ], commTest2[ 1 ], commTest2[ 2 ] );
		mcbsp_util_fatal();
	}

	//now test single BSMP message
	bsp_send( (bsp_pid() + 1) % bsp_nprocs(), NULL, &commTest, sizeof( size_t ) );
	
	//check messages
	const struct mcbsp_util_stack queue1 = data->queues[ (bsp_pid() + 1) % bsp_nprocs() ];
	const size_t new_predicted_cap = predictCap( sizeof( struct mcbsp_message ) + sizeof( size_t ) );
	//the expected capacity is the larger of the one predicted for the earlier put and for this send
	predicted_cap = predicted_cap > new_predicted_cap ? predicted_cap : new_predicted_cap;
	if( queue1.cap != predicted_cap || queue1.size != sizeof( struct mcbsp_message ) || queue1.top != sizeof( struct mcbsp_message ) + sizeof( size_t ) ) {
		fprintf( stderr, "FAILURE \t bsp_send did not adapt the communication queue as expected!\n(cap = %ld, size = %ld, top = %ld; prediction was %ld, %ld, %ld)\n",
			(size_t)queue1.cap, (size_t)queue1.size, (size_t)queue1.top,
			(size_t)predicted_cap, (size_t)(sizeof( struct mcbsp_message )), (size_t)(sizeof( struct mcbsp_message ) + sizeof( size_t )) );
		mcbsp_util_fatal();
	}
	const struct mcbsp_message request2 = *(struct mcbsp_message*) ((char*)queue1.array + queue1.top - sizeof( struct mcbsp_message ));
	if( request2.destination != NULL ||
		request2.length != sizeof( size_t ) || // assumes tagSize = 0
		*(size_t *)queue1.array != ((size_t)bsp_pid()) ) {
		fprintf( stderr, "FAILURE \t bsp_send did not push the expected communication request!\n(top = %ld, destination = %p, length = %ld, payload = %ld\n",
			(size_t)queue1.top, request2.destination, (size_t)request2.length, *(size_t *)queue1.array );
		mcbsp_util_fatal();
	}

	bsp_sync();

	//inspect incoming BSMP queue (assuming tagSize = 0)
	predicted_cap = predictCap( sizeof( size_t ) + sizeof( size_t ) );
	if( data->bsmp.cap != predicted_cap || data->bsmp.top != sizeof( size_t ) + sizeof( size_t ) || data->bsmp.size != sizeof( size_t ) ) {
		//the reported prediction mirrors the three values tested above (cap, top, size)
		fprintf( stderr, "FAILURE \t BSMP queue after superstep with sends is not as expected!\n(cap = %ld, top = %ld, size = %ld; prediction was %ld, %ld, %ld)\n",
			(size_t)data->bsmp.cap, (size_t)data->bsmp.top, (size_t)data->bsmp.size,
			(size_t)predicted_cap, (size_t)(sizeof( size_t ) + sizeof( size_t )), (size_t)(sizeof( size_t )) );
		mcbsp_util_fatal();
	}
	if( *(size_t*)(data->bsmp.array) != (size_t)((bsp_pid() + bsp_nprocs() - 1) % bsp_nprocs()) ) {
		fprintf( stderr, "FAILURE \t Value in BSMP queue is not correct!\n" );
		mcbsp_util_fatal();
	}
	
	//inspect using primitives
	MCBSP_NUMMSG_TYPE   packets;
	MCBSP_BYTESIZE_TYPE packetSize;
	bsp_qsize( &packets, &packetSize );
	if( packets != 1 || packetSize != sizeof( size_t ) ) {
		fprintf( stderr, "FAILURE \t bsp_qsize does not function correctly!\n" );
		mcbsp_util_fatal();
	}
	bsp_move( &commTest, sizeof( size_t ) );
	if( commTest[ 0 ] != (size_t)(( bsp_pid() + bsp_nprocs() - 1 ) % bsp_nprocs()) ) {
		fprintf( stderr, "FAILURE \t bsp_move does not function correctly!\n" );
		mcbsp_util_fatal();
	}
	
	//check set_tagsize: the call is expected to hand back the previous tag size (0) through tsz
	MCBSP_BYTESIZE_TYPE tsz = sizeof( size_t );
	bsp_set_tagsize( &tsz );
	if( tsz != 0 ) {
		fprintf( stderr, "FAILURE \t return value of bsp_set_tagsize is incorrect!\n" );
		mcbsp_util_fatal();
	}

	bsp_sync();

	//check set_tagsize
	if( data->init->tagSize != sizeof( size_t ) ) {
		fprintf( stderr, "FAILURE \t bsp_set_tagsize failed!\n" );
		mcbsp_util_fatal();
	}
	
	//send a tagged message to everyone: tag = own pid, payload = { 3, 8 + own pid }
	commTest[ 0 ] = ((size_t)bsp_pid());
	commTest[ 1 ] = 3;
	commTest[ 2 ] = 8 + ((size_t)bsp_pid());
	for( unsigned char i = 0; i < bsp_nprocs(); ++i ) {
		bsp_send( i, commTest, &(commTest[1]), 2 * sizeof( size_t ) );
		char * const test = (char*)(data->queues[ (size_t)i ].array) + data->queues[ (size_t)i ].top - sizeof( struct mcbsp_message ) - sizeof( size_t );
		if( *(size_t*)test != *commTest ) {
			fprintf( stderr, "FAILURE \t BSMP tag did not get pushed correctly (reads %ld instead of %ld)!\n", *(size_t*)test, *commTest );
			mcbsp_util_fatal();
		}
	}

	bsp_sync();

	MCBSP_BYTESIZE_TYPE status;
	size_t tag;
	for( unsigned char i = 0; i < bsp_nprocs(); ++i ) {
		bsp_get_tag( &status, &tag );
		if( tag >= ((size_t)bsp_nprocs()) || status != 2 * sizeof( size_t ) ) {
			fprintf( stderr, "FAILURE \t error in BSMP tag handling! (tag=%ld, status=%ld)\n", tag, (size_t)status );
			mcbsp_util_fatal();
		}
		size_t *p_tag, *msg;
		if( bsp_hpmove( (void**)&p_tag, (void**)&msg ) != 2 * sizeof( size_t ) ) {
			fprintf( stderr, "FAILURE \t bsp_hpmove does not return correct payload length." );
			//stop here like every other failed check does
			mcbsp_util_fatal();
		}
		if( msg[ 0 ] != 3 || *p_tag != tag ) {
			fprintf( stderr, "FAILURE \t bsp_hpmove does not contain correct message (tag=%ld, payload = %ld) which should be (%ld, 3).\n", *p_tag, msg[ 0 ], tag );
			mcbsp_util_fatal();
		}
		//the tag is the sender's pid; store its second payload word in that slot
		commTest[ tag ] = msg[ 1 ];
	}
	for( unsigned short int i = 0; i < bsp_nprocs(); ++i ) {
		if( commTest[ i ] != (unsigned int)(8 + i) ) {
			fprintf( stderr, "FAILURE \t error in bsp_tag / bsp_(hp)move combination!\n" );
			mcbsp_util_fatal();
		}
	}

	bsp_sync();

#ifdef MCBSP_ALLOW_MULTIPLE_REGS
	//test multiple regs
	double mreg[17];
	bsp_push_reg( &(mreg[0]), 7*sizeof( double ) );

	bsp_sync();

	double mregs = 1.3;
	bsp_put( (bsp_pid() + 1) % bsp_nprocs(), &mregs, &mreg, 6 * sizeof( double ), sizeof( double ) );
	bsp_push_reg( &(mreg[0]), 17*sizeof( double ) );

	bsp_sync();

	bsp_push_reg( &(mreg[0]), 13*sizeof( double ) );
	bsp_put( (bsp_pid() + 1) % bsp_nprocs(), &mregs, &mreg, 16 * sizeof( double ), sizeof( double ) );

	bsp_sync();

	if( mreg[ 6 ] != mreg[ 16 ] ||  mreg[ 6 ] != mregs ) {
		fprintf( stderr, "FAILURE \t error in bsp_put + multiple bsp_push_reg calls (%f,%f,%f,...,%f,%f)\n", mreg[ 5 ], mreg[ 6 ], mreg[ 7 ], mreg[ 15 ], mreg[ 16 ] );
		mcbsp_util_fatal();
	}
	bsp_pop_reg( &(mreg[0]) );
	bsp_pop_reg( &(mreg[0]) );

	bsp_sync();

	bsp_put( (bsp_pid() + 1) % bsp_nprocs(), &mregs, &mreg, 2 * sizeof( double ), sizeof( double ) );

	bsp_sync();

	if( mreg[ 2 ] != mregs ) {
		fprintf( stderr, "FAILURE \t error in bsp_put + multiple bsp_push_reg + multiple bsp_pop_reg calls\n" );
		mcbsp_util_fatal();
	}
#endif

	bsp_end();
}
Esempio n. 13
0
/*
 * SPMD routine of the parallel sieve.
 *
 * Every processor owns a block of the candidates 0..N (block length from
 * blockSize, global value from globalIdx); an entry equal to 0 means "not
 * prime". Starting from k = 2, each round marks the multiples of k with
 * bspmarkmultiples, lets every processor propose its next candidate through
 * nextPrime, has processor 0 pick the minimum of the proposals and
 * distributes that minimum as the new k. The loop runs while k*k <= n.
 * Afterwards each processor prints how many non-zero entries its block
 * still holds, and processor 0 prints the elapsed time.
 */
void bspsieve(){
    
    double time0, time1;
    ulong *x;         // local list of candidates
    ulong *ks = NULL; // place for proc 0 to store intermediate ks (unused elsewhere)
    ulong n, 
          nl, 
          i, 
          iglob;
    int   s,
          p;
    ulong k;   // the current largest sure-prime

    n = N+1; // copy global N and increase by 1.
             // this is so the maximum array idx == N
    
    bsp_begin(P);
    p= bsp_nprocs(); /* p = number of processors obtained */ 
    printf("Now we have %d processors.\n", p);
    s= bsp_pid();    /* s = processor number */ 
    if (s==0){
        /* NOTE(review): if ulong is an unsigned type this test can never
           be true - confirm against the typedef */
        if(n<0)
            bsp_abort("Error in input: n is negative");
        ks = vecalloculi(p);
    }

    bsp_push_reg(&n,SZULL);
    bsp_sync();

    bsp_get(0,&n,0,&n,SZULL); //everyone reads N from proc 0
    bsp_sync();
    bsp_pop_reg(&n);

    nl= blockSize(p,s,n); // how big must s block be?
    printf("P(%d) tries to alloc vec of %lld ulongs", s, nl);
    printf(", size would be = %lld Mb\n", nl*SZULL/1024/1024);
    x= vecalloculi(nl);

    for (i=0; i<nl; i++){
        // start by assuming everything is prime, except 1
        iglob= globalIdx(p,s,n,i);
        x[i]= iglob;
    }
    // strike out 1; guard against a block too short to contain index 1
    if(s==0 && nl>1)
        x[1]=0;
    bsp_sync(); 
    time0=bsp_time();
    k = 2;
    // begin work

    while( k*k <= n )
    {
        bspmarkmultiples(p,s,n,k,x);
        k = nextPrime(p,s,n,k,x);

        bsp_push_reg(&k, SZULL);
        bsp_sync();

        // proc 0 gathers every processor's proposal for the next k
        if(s==0)
        {
            ks[0] = k; // my k
            for(i=1;i<p; i++)
            {
                bsp_get(i, &k, 0, &ks[i], SZULL);
            }
        }

        bsp_sync();

        if(s==0)
        {
            k = findMinimum(p,ks);
        }
        bsp_sync();

        //broadcast minimum 
        bsp_get(0,&k,0,&k,SZULL); 
        bsp_sync();

        bsp_pop_reg(&k);
    }

    // end work
    bsp_sync();  
    time1=bsp_time();

    ulong primes= 0;
    // nl already holds blockSize(p,s,n); no need to recompute it per iteration
    for(i = 0; i < nl; i++)
        if( x[i] != 0)
            primes++;
    //do not print primes, just count them. 
    printf("proc %d finds %lld primes.\n", s, primes);

    fflush(stdout);
    if (s==0){
        printf("This took only %.6lf seconds.\n", time1-time0);
        fflush(stdout);
        vecfreeuli(ks);
    }

    vecfreeuli(x);
    bsp_end();

} /* end bspsieve */