double bsp_time() { //get init data const struct mcbsp_thread_data * const data = mcbsp_internal_const_prefunction(); //get stop time #ifdef __MACH__ //get rights for accessing Mach's timers const kern_return_t rc1 = host_get_clock_service( mach_host_self(), SYSTEM_CLOCK, &(data->clock) ); if( rc1 != KERN_SUCCESS ) { fprintf( stderr, "Could not access the Mach system timer (%s)\n", mach_error_string( rc1 ) ); mcbsp_util_fatal(); } mach_timespec_t stop; const kern_return_t rc2 = clock_get_time( data->clock, &stop ); if( rc2 != KERN_SUCCESS ) { fprintf( stderr, "Could not get time at call to bsp_time (%s)\n", mach_error_string( rc2 ) ); mcbsp_util_fatal(); } #else struct timespec stop; clock_gettime( CLOCK_MONOTONIC, &stop); #endif //return time double time = (stop.tv_sec-data->start.tv_sec); time += (stop.tv_nsec-data->start.tv_nsec)/1000000000.0; return time; }
/**
 * Lazily creates the pthread keys used to store the per-program
 * initialisation data and the per-thread SPMD data.
 *
 * Uses double-checked locking on mcbsp_internal_keys_allocated: the first,
 * unlocked read makes the common already-initialised path cheap; the second
 * check under mcbsp_internal_keys_mutex guarantees the keys are created at
 * most once. NOTE(review): the unlocked read of the flag is formally a data
 * race under the C11 memory model (the flag is not atomic) — it appears to
 * be relied upon as a benign race; confirm against the project's supported
 * platforms.
 *
 * Calls mcbsp_util_fatal() on any pthreads failure.
 */
void mcbsp_internal_check_keys_allocated() { //if already allocated, we are done if( mcbsp_internal_keys_allocated ) return; //lock mutex against data race pthread_mutex_lock( &mcbsp_internal_keys_mutex ); //if still not allocated, allocate if( !mcbsp_internal_keys_allocated ) { if( pthread_key_create( &mcbsp_internal_init_data, free ) != 0 ) { fprintf( stderr, "Could not allocate mcbsp_internal_init_data key!\n" ); mcbsp_util_fatal(); } if( pthread_key_create( &mcbsp_internal_thread_data, free ) != 0 ) { fprintf( stderr, "Could not allocate mcbsp_internal_thread_data key!\n" ); mcbsp_util_fatal(); } if( pthread_setspecific( mcbsp_internal_init_data, NULL ) != 0 ) { fprintf( stderr, "Could not initialise mcbsp_internal_init_data to NULL!\n" ); mcbsp_util_fatal(); } if( pthread_setspecific( mcbsp_internal_thread_data, NULL ) != 0 ) { fprintf( stderr, "Could not initialise mcbsp_internal_thread_data to NULL!\n" ); mcbsp_util_fatal(); } mcbsp_internal_keys_allocated = true; } //unlock mutex and exit pthread_mutex_unlock( &mcbsp_internal_keys_mutex ); }
void bsp_vabort( char *error_message, va_list args ) { //print error message vfprintf( stderr, error_message, args ); //get thread-local data and check for errors const struct mcbsp_thread_data * const data = mcbsp_internal_const_prefunction(); //always check for failure of getting thread data, even in high-performance mode #ifdef NDEBUG if( data == NULL ) { fprintf( stderr, "Error: could not get thread-local data in call to bsp_abort( error_message )!\n" ); mcbsp_util_fatal(); } #endif //send signal to all sibling threads data->init->abort = true; //if there are threads in sync, wake them up //first get lock, otherwise threads may sync //while checking for synched threads pthread_mutex_lock( &(data->init->mutex) ); if( data->init->sync_entry_counter > 0 ) pthread_cond_broadcast( &(data->init->condition) ); pthread_mutex_unlock( &( data->init->mutex) ); //quit execution pthread_exit( NULL ); }
void mcbsp_internal_check_aborted() { const struct mcbsp_thread_data * const data = pthread_getspecific( mcbsp_internal_thread_data ); #ifndef NDEBUG if( data == NULL ) { assert( false ); fprintf( stderr, "Error: could not get thread-local data in call to mcbsp_check_aborted()!\n" ); mcbsp_util_fatal(); } #endif if( data->init->abort ) pthread_exit( NULL ); }
struct mcbsp_init_data * bsp_begin_check() { //check if keys are allocated mcbsp_internal_check_keys_allocated(); //get necessary data struct mcbsp_init_data *init = pthread_getspecific( mcbsp_internal_init_data ); if( init == NULL ) { //maybe we are SPMD threads revisiting the bsp_begin? const struct mcbsp_thread_data * const thread_data = pthread_getspecific( mcbsp_internal_thread_data ); if( thread_data != NULL ) { //yes, so continue execution return NULL; } else { //no. We are not the ones spawning an SPMD program, //neither are we spawned from a corresponding SPMD... // //two possibilities: either fail hard /*fprintf( stderr, "Could not get initialisation data! Was the call to bsp_begin preceded by a call to bsp_init?\n" ); mcbsp_util_fatal();*/ //or assume we were called from main() and we construct an implied init init = malloc( sizeof( struct mcbsp_init_data ) ); if( init == NULL ) { fprintf( stderr, "Could not perform an implicit initialisation!\n" ); mcbsp_util_fatal(); } init->spmd = NULL; //we want to call main, but (*void)(void) does not match its profile init->argc = 0; init->argv = NULL; mcbsp_internal_check_keys_allocated(); if( pthread_setspecific( mcbsp_internal_init_data, init ) != 0 ) { fprintf( stderr, "Error: could not set BSP program key in implicit initialisation!\n" ); mcbsp_util_fatal(); } } } return init; }
/**
 * Entry point for every spawned SPMD thread.
 *
 * Stores the supplied per-thread data in thread-local storage, records the
 * superstep start time (acquiring the Mach clock service on __MACH__, or via
 * CLOCK_MONOTONIC otherwise), then runs the user SPMD function — or main()
 * when the initialisation was implicit (init->spmd == NULL).
 *
 * @param p Pointer to this thread's struct mcbsp_thread_data (ownership
 *          stays with the caller/bsp_end — TODO confirm).
 * @return Always NULL.
 */
void* mcbsp_internal_spmd( void *p ) { //get thread-local data struct mcbsp_thread_data *data = (struct mcbsp_thread_data *) p; //store thread-local data const int rc = pthread_setspecific( mcbsp_internal_thread_data, data ); if( rc != 0 ) { fprintf( stderr, "Could not store thread local data!\n" ); fprintf( stderr, "(%s)\n", strerror( rc ) ); mcbsp_util_fatal(); } #ifdef __MACH__ //get rights for accessing Mach's timers const kern_return_t rc1 = host_get_clock_service( mach_host_self(), SYSTEM_CLOCK, &(data->clock) ); if( rc1 != KERN_SUCCESS ) { fprintf( stderr, "Could not access the Mach system timer (%s)\n", mach_error_string( rc1 ) ); mcbsp_util_fatal(); } //record start time const kern_return_t rc2 = clock_get_time( data->clock, &(data->start) ); if( rc2 != KERN_SUCCESS ) { fprintf( stderr, "Could not get start time (%s)\n", mach_error_string( rc2 ) ); mcbsp_util_fatal(); } #else //record start time clock_gettime( CLOCK_MONOTONIC, &(data->start) ); #endif //continue with SPMD part if( data->init->spmd == NULL ) main( 0, NULL ); //we had an implicit bsp_init else data->init->spmd(); //call user-defined SPMD program //exit cleanly return NULL; }
void bsp_init( void (*spmd)(void), int argc, char **argv ) { //create a BSP-program specific initial data struct struct mcbsp_init_data *initialisationData = malloc( sizeof( struct mcbsp_init_data ) ); if( initialisationData == NULL ) { fprintf( stderr, "Error: could not allocate MulticoreBSP initialisation struct!\n" ); mcbsp_util_fatal(); } //set values initialisationData->spmd = spmd; initialisationData->bsp_program = NULL; initialisationData->argc = argc; initialisationData->argv = argv; //continue initialisation bsp_init_internal( initialisationData ); }
void bsp_init_internal( struct mcbsp_init_data * const initialisationData ) { //store using pthreads setspecific. Note this is per BSP program, not per thread //active within this BSP program! mcbsp_internal_check_keys_allocated(); if( pthread_getspecific( mcbsp_internal_init_data ) != NULL ) { const struct mcbsp_init_data * const oldData = pthread_getspecific( mcbsp_internal_init_data ); if( !oldData->ended ) { fprintf( stderr, "Warning: initialisation data corresponding to another BSP run found;\n" ); fprintf( stderr, " and this other run did not terminate (gracefully).\n" ); } } if( pthread_setspecific( mcbsp_internal_init_data, initialisationData ) != 0 ) { fprintf( stderr, "Error: could not set BSP program key!\n" ); mcbsp_util_fatal(); } }
struct mcbsp_thread_data * mcbsp_internal_prefunction() { //check if the BSP execution was aborted mcbsp_internal_check_aborted(); //get thread-local data struct mcbsp_thread_data * const data = pthread_getspecific( mcbsp_internal_thread_data ); //check for errors if not in high-performance mode #ifndef NDEBUG if( data == NULL ) { fprintf( stderr, "Error: could not get thread-local data in call to bsp_abort( error_message )!\n" ); mcbsp_util_fatal(); } #endif //return data return data; }
/**
 * SPMD test kernel: runs with 3 processes and exercises, in order,
 * bsp_push_reg/bsp_pop_reg (including internal stack/table state),
 * bsp_sync, bsp_time, bsp_put, bsp_get, bsp_direct_get, BSMP
 * (bsp_send/bsp_qsize/bsp_move/bsp_hpmove/bsp_get_tag), bsp_set_tagsize,
 * and (optionally) multiple registrations of the same address.
 *
 * NOTE(review): several checks assert on library-internal queue layouts
 * (cap/top/size of struct mcbsp_util_stack) and therefore are tightly
 * coupled to the implementation of bsp_put/bsp_send.
 */
void spmd( void ) {
	//parallel over three processes
	bsp_begin( 3 );
	//test bsp_push_reg (results in next superstep)
	size_t localInt;
	bsp_push_reg( &localInt, sizeof( size_t ) );
	checkLocalIntAddress[ bsp_pid() ] = &localInt;
	//check pid/nprocs, both using primitives as well as manually
	checkPcount[ bsp_pid() ] = (size_t)(bsp_nprocs());
	pthread_mutex_lock( &test_mutex );
	check++;
	checkP[ bsp_pid() ] = true;
	pthread_mutex_unlock( &test_mutex );
	//nobody should be at superstep 0
	if( superstep == 1 )
		superstepOK = false;
	//test barrier synchronisation
	bsp_sync();
	//note someone is at superstep 1
	superstep = 1;
	//check bsp_time
	if( bsp_time() <= 0 )
		bsp_abort( "FAILURE \t bsp_time returned 0 or less!\n" );
	//set up a pop_reg, but should only take effect after the next sync
	//(testing the push_reg after this statement thus provides a free test)
	bsp_pop_reg( &localInt );
	struct mcbsp_thread_data * const data = pthread_getspecific( mcbsp_internal_thread_data );
	//the pop request must sit on the thread-local to-remove stack until the next sync
	if( data->localsToRemove.top != 1 || data->localsToRemove.cap != 16 || *((void**)(data->localsToRemove.array)) != (void*)&localInt ) {
		fprintf( stderr, "FAILURE \t bsp_pop_reg did not push entry on the to-remove stack (%p != %p)!\n", *((void**)(data->localsToRemove.array)), (void*)&localInt );
		mcbsp_util_fatal();
	}
	//check push_reg: every process' registered address must be in the global table
	for( unsigned char i=0; i<3; ++i ) {
		if( checkLocalIntAddress[ i ] != mcbsp_util_address_table_get( &(data->init->global2local), 0, i )->address ) {
			fprintf( stderr, "FAILURE \t bsp_push_reg did not register correct address!\n" );
			mcbsp_util_fatal();
		}
	}
	bsp_sync();
	//check pop_reg: the table entries must be gone after the sync
	for( unsigned char i=0; i<3; ++i ) {
		if( mcbsp_util_address_table_get( &(data->init->global2local), 0, i ) != NULL || data->localC != 0 ) {
			fprintf( stderr, "FAILURE \t bsp_pop_reg did not de-register correctly (entry=%p)!\n", mcbsp_util_address_table_get( &(data->init->global2local), 0, i )->address );
			mcbsp_util_fatal();
		}
		//localInt = *(size_t*)mcbsp_util_stack_pop( &(data->removedGlobals) );
	}
	bsp_sync();
	//going to test communication primitives on the following area
	size_t commTest[ 3 ];
	commTest[ 0 ] = commTest[ 1 ] = ((size_t)bsp_pid());
	commTest[ 2 ] = (size_t)(bsp_nprocs());
	bsp_push_reg( &commTest, 3 * sizeof( size_t ) );
	//make push valid
	bsp_sync();
	//after this put, commTest[ 0 ] should equal bsp_pid, commTest[ 1, 2 ] should equal bsp_pid-1 mod bsp_nprocs
	bsp_put( (bsp_pid() + 1) % bsp_nprocs(), &commTest, &commTest, sizeof( size_t ), 2*sizeof( size_t) );
	commTest[ 2 ] = ULONG_MAX; //this should not influence the result after sync.
	//test behind-the-scenes: inspect the outgoing queue to the right neighbour
	const struct mcbsp_util_stack queue = data->queues[ (bsp_pid() + 1) % bsp_nprocs() ];
	size_t predicted_cap = predictCap( sizeof( struct mcbsp_message ) + 2 * sizeof( size_t) );
	if( queue.cap != predicted_cap || queue.top != sizeof( struct mcbsp_message ) + 2 * sizeof( size_t) || queue.size != sizeof( struct mcbsp_message ) ) {
		fprintf( stderr, "FAILURE \t bsp_put did not adapt the communication queue as expected!\n(cap = %ld, top = %ld, size = %ld)\n", (size_t)queue.cap, (size_t)queue.top, (size_t)queue.size );
		mcbsp_util_fatal();
	}
	//the message header lives at the top of the queue, payload just below it
	const struct mcbsp_message request = *((struct mcbsp_message*) ((char*)queue.array + queue.top - sizeof( struct mcbsp_message )) );
	if( request.length != 2 * sizeof( size_t) ) {
		fprintf( stderr, "FAILURE \t bsp_put did not push a request of the expected length!\n(length = %ld)\n", (size_t)request.length );
		mcbsp_util_fatal();
	}
	const size_t * const chk_array = (size_t*) ((char*)queue.array + queue.top - sizeof( struct mcbsp_message ) - 2 * sizeof( size_t ));
	if( chk_array[ 0 ] != ((size_t)bsp_pid()) || chk_array[ 1 ] != ((size_t)bsp_pid()) ) {
		fprintf( stderr, "FAILURE \t bsp_put did not push an expected communication request!\n" );
		mcbsp_util_fatal();
	}
	//note there is no easy way to check request.destination; the top-level BSP test will handle that one
	bsp_sync();
	//test for the above expectation after bsp_put, namely
	//commTest[ 0 ] should equal bsp_pid, commTest[ 1, 2 ] should equal bsp_pid-1 mod bsp_nprocs
	if( commTest[ 0 ] != ((size_t)bsp_pid()) || commTest[ 1 ] != (size_t)((bsp_pid()+bsp_nprocs()-1)%bsp_nprocs()) || commTest[ 2 ] != (size_t)((bsp_pid()+bsp_nprocs()-1)%bsp_nprocs()) ) {
		fprintf( stderr, "FAILURE \t array after bsp_put is not as expected! (%d: %ld %ld %ld))\n", bsp_pid(), commTest[ 0 ], commTest[ 1 ], commTest[ 2 ] );
		mcbsp_util_fatal();
	}
	//do a get on the next processor on the last element of commTest
	bsp_get( (bsp_pid() + 1) % bsp_nprocs(), &commTest, 2 * sizeof( size_t ), &(commTest[ 2 ]), sizeof( size_t ) );
	//fill the expected value after the get to test non-buffering
	commTest[ 2 ] = ((size_t)bsp_pid());
	//communicate
	bsp_sync();
	//commTest[ 0 ] should equal bsp_pid, commTest[ 1 ] should equal bsp_pid-1, commTest[ 2 ] should be bsp_pid+1
	if( commTest[ 0 ] != ((size_t)bsp_pid()) || commTest[ 1 ] != (size_t)((bsp_pid()+bsp_nprocs() - 1)%bsp_nprocs()) ) {
		fprintf( stderr, "FAILURE \t start of array after bsp_get changed! (%d: %ld %ld %ld))\n", bsp_pid(), commTest[ 0 ], commTest[ 1 ], commTest[ 2 ] );
		mcbsp_util_fatal();
	}
	if( commTest[ 2 ] != (size_t)((bsp_pid()+bsp_nprocs() + 1)%bsp_nprocs()) ) {
		fprintf( stderr, "FAILURE \t last element of array after bsp_get erroneous! (%d: %ld %ld %ld))\n", bsp_pid(), commTest[ 0 ], commTest[ 1 ], commTest[ 2 ] );
		mcbsp_util_fatal();
	}
	bsp_sync();
	//test direct_get functionality
	size_t commTest2[ 3 ];
	commTest2[ 0 ] = commTest[ 0 ];
	//get commTest[1] from right neighbour
	bsp_direct_get( (bsp_pid() + 1) % bsp_nprocs(), &commTest, sizeof( size_t ), &(commTest2[ 1 ]), sizeof( size_t ) );
	//get commTest[2] from left neighbour
	bsp_direct_get( (bsp_pid() + bsp_nprocs() - 1) % bsp_nprocs(), &commTest, 2 * sizeof( size_t ), &(commTest2[ 2 ]), sizeof( size_t ) );
	//now everything should equal bsp_pid
	if( commTest2[ 0 ] != ((size_t)bsp_pid()) || commTest2[ 1 ] != ((size_t)bsp_pid()) || commTest2[ 2 ] != ((size_t)bsp_pid()) ) {
		fprintf( stderr, "FAILURE \t direct_get does not function properly! (%d: [%ld %ld %ld])\n", bsp_pid(), commTest2[ 0 ], commTest2[ 1 ], commTest2[ 2 ] );
		mcbsp_util_fatal();
	}
	//now test single BSMP message
	bsp_send( (bsp_pid() + 1) % bsp_nprocs(), NULL, &commTest, sizeof( size_t ) );
	//check messages: the outgoing queue may have been grown by earlier traffic,
	//so the expected capacity is the maximum of both predictions
	const struct mcbsp_util_stack queue1 = data->queues[ (bsp_pid() + 1) % bsp_nprocs() ];
	const size_t new_predicted_cap = predictCap( sizeof( struct mcbsp_message ) + sizeof( size_t ) );
	predicted_cap = predicted_cap > new_predicted_cap ? predicted_cap : new_predicted_cap;
	if( queue1.cap != predicted_cap || queue1.size != sizeof( struct mcbsp_message ) || queue1.top != sizeof( struct mcbsp_message ) + sizeof( size_t ) ) {
		fprintf( stderr, "FAILURE \t bsp_send did not adapt the communication queue as expected!\n(cap = %ld, size = %ld, top = %ld; prediction was %ld, %ld, %ld)\n", (size_t)queue1.cap, (size_t)queue1.size, (size_t)queue1.top, (size_t)predicted_cap, (size_t)(sizeof( struct mcbsp_message )), (size_t)(sizeof( struct mcbsp_message ) + sizeof( size_t )) );
		mcbsp_util_fatal();
	}
	const struct mcbsp_message request2 = *(struct mcbsp_message*) ((char*)queue1.array + queue1.top - sizeof( struct mcbsp_message ));
	if( request2.destination != NULL || request2.length != sizeof( size_t ) ||
		// assumes tagSize = 0
		*(size_t *)queue1.array != ((size_t)bsp_pid()) ) {
		fprintf( stderr, "FAILURE \t bsp_send did not push the expected communication request!\n(top = %ld, destination = %p, length = %ld, payload = %ld\n", (size_t)queue1.top, request2.destination, (size_t)request2.length, *(size_t *)queue1.array );
		mcbsp_util_fatal();
	}
	bsp_sync();
	//inspect incoming BSMP queue (assuming tagSize = 0)
	predicted_cap = predictCap( sizeof( size_t ) + sizeof( size_t ) );
	if( data->bsmp.cap != predicted_cap || data->bsmp.top != sizeof( size_t ) + sizeof( size_t ) || data->bsmp.size != sizeof( size_t ) ) {
		fprintf( stderr, "FAILURE \t BSMP queue after superstep with sends is not as expected!\n(cap = %ld, top = %ld, size = %ld; prediction was %ld, %ld, %ld)\n", (size_t)data->bsmp.cap, (size_t)data->bsmp.top, (size_t)data->bsmp.size, (size_t)predicted_cap, (size_t)(8 + sizeof( size_t )), (size_t)(data->bsmp.size) );
		mcbsp_util_fatal();
	}
	//the message came from our left neighbour and carried its pid
	if( *(size_t*)(data->bsmp.array) != (size_t)((bsp_pid() + bsp_nprocs() - 1) % bsp_nprocs()) ) {
		fprintf( stderr, "FAILURE \t Value in BSMP queue is not correct!\n" );
		mcbsp_util_fatal();
	}
	//inspect using primitives
	MCBSP_NUMMSG_TYPE packets;
	MCBSP_BYTESIZE_TYPE packetSize;
	bsp_qsize( &packets, &packetSize );
	if( packets != 1 || packetSize != sizeof( size_t ) ) {
		fprintf( stderr, "FAILURE \t bsp_qsize does not function correctly!\n" );
		mcbsp_util_fatal();
	}
	bsp_move( &commTest, sizeof( size_t ) );
	if( commTest[ 0 ] != (size_t)(( bsp_pid() + bsp_nprocs() - 1 ) % bsp_nprocs()) ) {
		fprintf( stderr, "FAILURE \t bsp_move does not function correctly!\n" );
		mcbsp_util_fatal();
	}
	//check set_tagsize; the returned value is the previous tag size (0)
	MCBSP_BYTESIZE_TYPE tsz = sizeof( size_t );
	bsp_set_tagsize( &tsz );
	if( tsz != 0 ) {
		fprintf( stderr, "FAILURE \t return value of bsp_set_tagsize is incorrect!\n" );
		mcbsp_util_fatal();
	}
	bsp_sync();
	//check set_tagsize took effect after the sync
	if( data->init->tagSize != sizeof( size_t ) ) {
		fprintf( stderr, "FAILURE \t bsp_set_tagsize failed!\n" );
		mcbsp_util_fatal();
	}
	//tagged BSMP: tag = own pid, payload = { 3, 8 + pid }
	commTest[ 0 ] = ((size_t)bsp_pid());
	commTest[ 1 ] = 3;
	commTest[ 2 ] = 8 + ((size_t)bsp_pid());
	for( unsigned char i = 0; i < bsp_nprocs(); ++i ) {
		bsp_send( i, commTest, &(commTest[1]), 2 * sizeof( size_t ) );
		//verify the tag was written just below the message header
		char * const test = (char*)(data->queues[ (size_t)i ].array) + data->queues[ (size_t)i ].top - sizeof( struct mcbsp_message ) - sizeof( size_t );
		if( *(size_t*)test != *commTest ) {
			fprintf( stderr, "FAILURE \t BSMP tag did not get pushed correctly (reads %ld instead of %ld)!\n", *(size_t*)test, *commTest );
			mcbsp_util_fatal();
		}
	}
	bsp_sync();
	MCBSP_BYTESIZE_TYPE status;
	size_t tag;
	//drain the incoming queue; one message per process is expected
	for( unsigned char i = 0; i < bsp_nprocs(); ++i ) {
		bsp_get_tag( &status, &tag );
		if( tag >= ((size_t)bsp_nprocs()) || status != 2 * sizeof( size_t ) ) {
			fprintf( stderr, "FAILURE \t error in BSMP tag handling! (tag=%ld, status=%ld)\n", tag, (size_t)status );
			mcbsp_util_fatal();
		}
		size_t *p_tag, *msg;
		if( bsp_hpmove( (void**)&p_tag, (void**)&msg ) != 2 * sizeof( size_t ) ) {
			fprintf( stderr, "FAILURE \t bsp_hpmove does not return correct payload length." );
		}
		if( msg[ 0 ] != 3 || *p_tag != tag ) {
			fprintf( stderr, "FAILURE \t bsp_hpmove does not contain correct message (tag=%ld, payload = %ld) which should be (%ld, 3).\n", *p_tag, msg[ 0 ], tag );
			mcbsp_util_fatal();
		}
		commTest[ tag ] = msg[ 1 ];
	}
	for( unsigned short int i = 0; i < bsp_nprocs(); ++i ) {
		if( commTest[ i ] != (unsigned int)(8 + i) ) {
			fprintf( stderr, "FAILURE \t error in bsp_tag / bsp_(hp)move combination!\n" );
			mcbsp_util_fatal();
		}
	}
	bsp_sync();
#ifdef MCBSP_ALLOW_MULTIPLE_REGS
	//test multiple registrations of the same base address
	double mreg[17];
	bsp_push_reg( &(mreg[0]), 7*sizeof( double ) );
	bsp_sync();
	double mregs = 1.3;
	bsp_put( (bsp_pid() + 1) % bsp_nprocs(), &mregs, &mreg, 6 * sizeof( double ), sizeof( double ) );
	bsp_push_reg( &(mreg[0]), 17*sizeof( double ) );
	bsp_sync();
	bsp_push_reg( &(mreg[0]), 13*sizeof( double ) );
	bsp_put( (bsp_pid() + 1) % bsp_nprocs(), &mregs, &mreg, 16 * sizeof( double ), sizeof( double ) );
	bsp_sync();
	if( mreg[ 6 ] != mreg[ 16 ] || mreg[ 6 ] != mregs ) {
		fprintf( stderr, "FAILURE \t error in bsp_put + multiple bsp_push_reg calls (%f,%f,%f,...,%f,%f)\n", mreg[ 5 ], mreg[ 6 ], mreg[ 7 ], mreg[ 15 ], mreg[ 16 ] );
		mcbsp_util_fatal();
	}
	//popping twice should leave the oldest (7-element) registration active
	bsp_pop_reg( &(mreg[0]) );
	bsp_pop_reg( &(mreg[0]) );
	bsp_sync();
	bsp_put( (bsp_pid() + 1) % bsp_nprocs(), &mregs, &mreg, 2 * sizeof( double ), sizeof( double ) );
	bsp_sync();
	if( mreg[ 2 ] != mregs ) {
		fprintf( stderr, "FAILURE \t error in bsp_put + multiple bsp_push_reg + multiple bsp_pop_reg calls\n" );
		mcbsp_util_fatal();
	}
#endif
	bsp_end();
}
/**
 * Test driver: verifies bsp_init bookkeeping, then configures a manual
 * affinity (3 threads pinned to core 0, machine reporting 7 threads) and
 * runs the spmd() kernel, checking the shared counters it fills in.
 *
 * Exits with EXIT_SUCCESS after printing "SUCCESS"; any failure path goes
 * through mcbsp_util_fatal().
 */
int main(int argc, char **argv) {
	//test bsp_init
	bsp_init( spmd, argc, argv );
	if( !mcbsp_internal_keys_allocated ) {
		fprintf( stderr, "FAILURE \t bsp_init did not initialise internal keys!\n" );
		mcbsp_util_fatal();
	}
	//the registered init data must round-trip unchanged
	struct mcbsp_init_data *initialisationData = pthread_getspecific( mcbsp_internal_init_data );
	if( initialisationData == NULL ) {
		fprintf( stderr, "FAILURE \t did not retrieve correct program initialisation data!\n" );
		mcbsp_util_fatal();
	}
	if( initialisationData->spmd != spmd ) {
		fprintf( stderr, "FAILURE \t did not retrieve correct user-defined SPMD entry point!\n" );
		mcbsp_util_fatal();
	}
	if( initialisationData->argc != argc ) {
		fprintf( stderr, "FAILURE \t did not retrieve correct argument count!\n" );
		mcbsp_util_fatal();
	}
	if( initialisationData->argv != argv ) {
		fprintf( stderr, "FAILURE \t did not retrieve correct arguments!\n" );
		mcbsp_util_fatal();
	}
	//bsp_init OK
	//test bsp_begin and bsp_end, init test: force a manual pinning of
	//three threads, all on hardware thread 0
	struct mcbsp_util_machine_info * MCBSP_MACHINE_INFO = mcbsp_internal_getMachineInfo();
	MCBSP_MACHINE_INFO->threads = 7;
	if( MCBSP_MACHINE_INFO->manual_affinity != NULL )
		free( MCBSP_MACHINE_INFO->manual_affinity );
	MCBSP_MACHINE_INFO->manual_affinity = malloc( 3 * sizeof( size_t ) );
	for( unsigned char i=0; i<3; ++i )
		MCBSP_MACHINE_INFO->manual_affinity[ i ] = 0;
	MCBSP_MACHINE_INFO->affinity = MANUAL;
	checkP[ 0 ] = checkP[ 1 ] = checkP[ 2 ] = false;
	//actual test
	spmd();
	if( check != 3 ) {
		fprintf( stderr, "FAILURE \t bsp_begin(3) did not correctly start three processes!\n" );
		mcbsp_util_fatal();
	}
	if( !( checkP[ 0 ] && checkP[ 1 ] && checkP[ 2 ] ) ) {
		fprintf( stderr, "FAILURE \t bsp_pid does not function correctly!\n" );
		mcbsp_util_fatal();
	}
	for( unsigned char i=0; i<3; ++i ) {
		if( checkPcount[ i ] != 3 ) {
			fprintf( stderr, "FAILURE \t bsp_nprocs does not function correctly!\n" );
			mcbsp_util_fatal();
		}
	}
	if( !superstepOK ) {
		fprintf( stderr, "FAILURE \t bsp_sync allowed one or more threads past a synchronisation point before at least one other thread reached it!\n" );
		mcbsp_util_fatal();
	}
	//cleanup: restore default affinity settings and release machine info
	free( MCBSP_MACHINE_INFO->manual_affinity );
	MCBSP_MACHINE_INFO->manual_affinity = NULL;
	MCBSP_MACHINE_INFO->affinity = MCBSP_DEFAULT_AFFINITY;
	mcbsp_util_destroyMachineInfo( MCBSP_MACHINE_INFO );
	//bsp_begin & bsp_end OK
	fprintf( stdout, "SUCCESS\n" );
	exit( EXIT_SUCCESS );
}
/**
 * Barrier synchronisation plus superstep transition.
 *
 * Structure (statement order is load-bearing):
 *   1. entry barrier on (condition, sync_entry_counter);
 *   2. abort / mismatched-end checks;
 *   3. process registration/communication metadata: cache get-request
 *      sources into payloads, handle queued bsp_pop_reg and bsp_push_reg
 *      requests, propagate a new tag size from process 0;
 *   4. middle barrier on (mid_condition, sync_exit_counter) so that all
 *      payloads are cached before anyone writes destinations;
 *   5. execute put/BSMP requests destined to this process;
 *   6. exit barrier on (condition, sync_entry_counter).
 *
 * NOTE(review): each pthread_cond_wait is called without a surrounding
 * predicate loop; a spurious wakeup would release a thread early — confirm
 * whether the supported platforms make this safe.
 */
void bsp_sync() {
	//get local data
	struct mcbsp_thread_data * const data = pthread_getspecific( mcbsp_internal_thread_data );
	//get lock
	pthread_mutex_lock( &(data->init->mutex) );
	//see if synchronisation is complete; the last arriver resets the
	//counter and wakes everyone, all others wait
	if( data->init->sync_entry_counter++ == data->init->P - 1 ) {
		data->init->sync_entry_counter = 0;
		pthread_cond_broadcast( &(data->init->condition) );
	} else
		pthread_cond_wait( &(data->init->condition), &(data->init->mutex) );
	//unlock mutex
	pthread_mutex_unlock( &(data->init->mutex) );
	//before continuing execution, check if we woke up due to an abort
	//and now exit if so (we could not exit earlier as not unlocking the
	//sync mutex will cause a deadlock).
	mcbsp_internal_check_aborted();
	//check for mismatched sync/end
	if( data->init->ended ) {
		fprintf( stderr, "Mismatched bsp_sync and bsp_end detected!\n" );
		mcbsp_util_fatal();
	}
	//handle the various BSP requests
	//update tagSize, phase 1: process 0's requested size becomes authoritative
	if( data->bsp_id == 0 && data->newTagSize != data->init->tagSize )
		data->init->tagSize = data->newTagSize;
	//look for requests with destination us, first cache get-requests
	for( MCBSP_PROCESSOR_INDEX_DATATYPE s = 0; s < data->init->P; ++s ) {
		struct mcbsp_util_stack * const queue = &(data->init->threadData[ s ]->queues[ data->bsp_id ]);
		//each request in queue is directed to us. Handle all of them.
		for( size_t r = 0; r < queue->top; ++r ) {
			struct mcbsp_communication_request * const request = (struct mcbsp_communication_request *) (((char*)(queue->array)) + r * queue->size);
			if( request->payload == NULL ) {
				//allocate payload
				request->payload = malloc( request->length );
				//no data race here since we are the only ones allowed to write here
				memcpy( request->payload, request->source, request->length );
				//nullify payload (effectively turning the request into a put-request)
				request->source = NULL;
			}
		}
	}
	//handle bsp_pop_reg
	while( !mcbsp_util_stack_empty( &(data->localsToRemove ) ) ) {
		//get local memory address to remove registration of
		void * const toRemove = *((void**)(mcbsp_util_stack_pop( &(data->localsToRemove) )));
		//get corresponding global key
		const unsigned long int globalIndex = mcbsp_util_address_map_get( &(data->local2global), toRemove );
		if( globalIndex == ULONG_MAX ) {
			fprintf( stderr, "Error: bsp_pop_reg requested on non-registered pointer!\n" );
			mcbsp_util_fatal();
		}
		//delete from table
		if( mcbsp_util_address_table_delete( &(data->init->global2local), globalIndex, data->bsp_id ) ) {
			//NOTE: this is safe, since it is guaranteed that this address table entry
			//      will not change during synchronisation.
			//delete from map
			mcbsp_util_address_map_remove( &(data->local2global), toRemove );
		}
		//register globalIndex now is free: shrink localC when it was the
		//highest index, otherwise remember the hole for reuse
		if( data->localC == globalIndex + 1 )
			--(data->localC);
		else
			mcbsp_util_stack_push( &(data->removedGlobals), (void*)(&globalIndex) );
	}
	//handle push_reg
	while( !mcbsp_util_stack_empty( &(data->localsToPush) ) ) {
		//get address
		const struct mcbsp_push_request request = *((struct mcbsp_push_request*)mcbsp_util_stack_pop( &(data->localsToPush) ));
		void * const address = request.address;
		const MCBSP_BYTESIZE_TYPE size = request.size;
		//get global index of this registration. First check map if the key already existed
		const unsigned long int mapSearch = mcbsp_util_address_map_get( &(data->local2global), address);
		//if the key was not found, create a new global entry:
		//reuse a freed index when available, otherwise take a fresh one
		const unsigned long int global_number = mapSearch != ULONG_MAX ? mapSearch :
			mcbsp_util_stack_empty( &(data->removedGlobals) ) ?
				data->localC++ :
				*(unsigned long int*)mcbsp_util_stack_pop( &(data->removedGlobals) );
		//insert value, local2global map (if this is a new global entry)
		if( mapSearch == ULONG_MAX )
			mcbsp_util_address_map_insert( &(data->local2global), address, global_number );
		//insert value, global2local map (false sharing is possible here, but effects should be negligable)
		mcbsp_util_address_table_set( &(data->init->global2local), global_number, data->bsp_id, address, size );
	}
	//coordinate exit using the same mutex (but not same condition!)
	pthread_mutex_lock( &(data->init->mutex) );
	if( data->init->sync_exit_counter++ == data->init->P - 1 ) {
		data->init->sync_exit_counter = 0;
		pthread_cond_broadcast( &(data->init->mid_condition) );
	} else
		pthread_cond_wait( &(data->init->mid_condition), &(data->init->mutex) );
	pthread_mutex_unlock( &(data->init->mutex) );
	//update tagsize, phase 2 (check): all processes must agree with process 0
	if( data->newTagSize != data->init->tagSize ) {
		fprintf( stderr, "Different tag sizes requested from different processes (%ld requested while process 0 requested %ld)!\n", data->newTagSize, data->init->tagSize );
		mcbsp_util_fatal();
	}
	//now process put requests to local destination
	for( MCBSP_PROCESSOR_INDEX_DATATYPE s = 0; s < data->init->P; ++s ) {
		struct mcbsp_util_stack * const queue = &(data->init->threadData[ s ]->queues[ data->bsp_id ]);
		//each request in queue is directed to us. Handle all of them.
		while( !mcbsp_util_stack_empty( queue ) ) {
			struct mcbsp_communication_request * const request = (struct mcbsp_communication_request *) mcbsp_util_stack_pop( queue );
			if( request->source == NULL && request->destination == NULL && request->payload != NULL ) {
				//this is a BSMP message
				//construct message
				void * message = malloc( request->length );
				memcpy( message, request->payload, request->length );
				//record message
				mcbsp_util_stack_push( &(data->bsmp), &message );
				//free payload
				free( request->payload );
			} else if( request->source == NULL && request->payload != NULL ) {
				//a put-style request: copy cached payload to its destination;
				//no data race here since we are the only ones allowed to write here
				memcpy( request->destination, request->payload, request->length );
				//free payload
				free( request->payload );
			} else {
				fprintf( stderr, "Unknown BSP communication request encountered!\n" );
				mcbsp_util_fatal();
			}
		}
	}
	//final sync
	pthread_mutex_lock( &(data->init->mutex) );
	if( data->init->sync_entry_counter++ == data->init->P - 1 ) {
		data->init->sync_entry_counter = 0;
		pthread_cond_broadcast( &(data->init->condition) );
	} else
		pthread_cond_wait( &(data->init->condition), &(data->init->mutex) );
	pthread_mutex_unlock( &(data->init->mutex) );
}
/**
 * Starts an SPMD section with P processes.
 *
 * When called from an already-spawned SPMD thread (bsp_begin_check returns
 * NULL) this is a no-op. Otherwise it initialises the shared init struct,
 * allocates and initialises per-thread data for all P processes, spawns
 * P-1 pthreads (pinned according to MCBSP_AFFINITY; via pthread attrs on
 * Linux, via Mach thread_policy_set on __MACH__), and converts the calling
 * thread into BSP process 0.
 *
 * @param P Number of SPMD processes to run.
 */
void bsp_begin( const MCBSP_PROCESSOR_INDEX_DATATYPE P ) {
	struct mcbsp_init_data * const init = bsp_begin_check();
	//if the check did not return an init struct, we are a
	//spawned thread and should just continue the SPMD
	//code.
	if( init == NULL )
		return;
	//otherwise we need to start the SPMD code
	int *pinning = mcbsp_util_pinning( MCBSP_AFFINITY, P );
	if( pinning == NULL ) {
		fprintf( stderr, "Could not get a valid pinning!\n" );
		mcbsp_util_fatal();
	}
	init->threads = malloc( P * sizeof( pthread_t ) );
	if( init->threads == NULL ) {
		fprintf( stderr, "Could not allocate new threads!\n" );
		mcbsp_util_fatal();
	}
	pthread_attr_t attr;
#ifndef __MACH__
	cpu_set_t mask;
#endif
	//further initialise init object
	init->P = P;
	init->abort = false;
	init->ended = false;
	init->sync_entry_counter = 0;
	init->sync_exit_counter = 0;
	pthread_mutex_init( &(init->mutex), NULL );
	pthread_cond_init ( &(init->condition), NULL );
	pthread_cond_init ( &(init->mid_condition), NULL );
	mcbsp_util_address_table_initialise( &(init->global2local), P );
	init->threadData = malloc( P * sizeof( struct mcbsp_thread_data * ) );
	init->tagSize = 0;
	//spawn P-1 threads. The condition checks for both signed and unsigned types
	//since user may set MCBSP_PROCESSOR_INDEX_DATATYPE to a signed type.
	//(counting down so that s == 0, the continuation of this thread, is last)
	for( MCBSP_PROCESSOR_INDEX_DATATYPE s = P - 1; s < P && s >= 0; --s ) {
		//allocate new thread-local data
		struct mcbsp_thread_data *thread_data = malloc( sizeof( struct mcbsp_thread_data ) );
		if( thread_data == NULL ) {
			fprintf( stderr, "Could not allocate local thread data!\n" );
			mcbsp_util_fatal();
		}
		//provide a link to the SPMD program init struct
		thread_data->init = init;
		//set local ID
		thread_data->bsp_id = s;
		//set the maximum number of registered globals at any time (0, since SPMD not started yet)
		thread_data->localC = 0;
		//initialise local to global map
		mcbsp_util_address_map_initialise( &(thread_data->local2global ) );
		//initialise stack used for efficient registration of globals after de-registrations
		mcbsp_util_stack_initialise( &(thread_data->removedGlobals), sizeof( unsigned long int ) );
		//initialise stack used for de-registration of globals
		mcbsp_util_stack_initialise( &(thread_data->localsToRemove), sizeof( void * ) );
		//initialise stacks used for communication (one outgoing queue per peer)
		thread_data->queues = malloc( P * sizeof( struct mcbsp_util_stack ) );
		for( MCBSP_PROCESSOR_INDEX_DATATYPE i = 0; i < P; ++i )
			mcbsp_util_stack_initialise( &(thread_data->queues[ i ]), sizeof( struct mcbsp_communication_request) );
		//initialise default tag size
		thread_data->newTagSize = 0;
		//initialise BSMP queue
		mcbsp_util_stack_initialise( &(thread_data->bsmp), sizeof( void * ) );
		//initialise push request queue
		mcbsp_util_stack_initialise( &(thread_data->localsToPush), sizeof( struct mcbsp_push_request ) );
		//provide a link back to this thread-local data
		init->threadData[ s ] = thread_data;
		//spawn new threads if s>0
		if( s > 0 ) {
			//create POSIX threads attributes (for pinning)
			pthread_attr_init( &attr );
#ifndef __MACH__
			CPU_ZERO( &mask );
			CPU_SET ( pinning[ s ], &mask );
			pthread_attr_setaffinity_np( &attr, sizeof( cpu_set_t ), &mask );
#endif
			//spawn the actual thread
			if( pthread_create( &(init->threads[ s ]), &attr, mcbsp_internal_spmd, thread_data ) != 0 ) {
				fprintf( stderr, "Could not spawn new thread!\n" );
				mcbsp_util_fatal();
			}
#ifdef __MACH__
			//on Mac OS X, affinity must be set on the Mach port after creation
			thread_port_t osx_thread = pthread_mach_thread_np( init->threads[ s ] );
			struct thread_affinity_policy ap;
			if( MCBSP_AFFINITY == SCATTER ) {
				//Affinity API release notes do not specify whether 0 is a valid tag, or in fact equal to NULL; so 1-based to be sure
				ap.affinity_tag = s + 1;
			} else if( MCBSP_AFFINITY == COMPACT ) {
				ap.affinity_tag = 1;
			} else if( MCBSP_AFFINITY == MANUAL ) {
				ap.affinity_tag = MCBSP_MANUAL_AFFINITY[ s ];
			} else {
				fprintf( stderr, "Unhandled affinity type for Mac OS X!\n" );
				mcbsp_util_fatal();
			}
			thread_policy_set( osx_thread, THREAD_AFFINITY_POLICY, (thread_policy_t)&ap, THREAD_AFFINITY_POLICY_COUNT );
#endif
			//destroy attributes object
			pthread_attr_destroy( &attr );
		} else {
			//continue ourselves as bsp_id 0. Do pinning
#ifdef __MACH__
			thread_port_t osx_thread = pthread_mach_thread_np( pthread_self() );
			struct thread_affinity_policy ap;
			if( MCBSP_AFFINITY == SCATTER || MCBSP_AFFINITY == COMPACT )
				ap.affinity_tag = 1;
			else if( MCBSP_AFFINITY == MANUAL )
				ap.affinity_tag = MCBSP_MANUAL_AFFINITY[ s ];
			else {
				fprintf( stderr, "Unhandled affinity type for Mac OS X!\n" );
				mcbsp_util_fatal();
			}
			thread_policy_set( osx_thread, THREAD_AFFINITY_POLICY, (thread_policy_t)&ap, THREAD_AFFINITY_POLICY_COUNT );
#else
			CPU_ZERO( &mask );
			CPU_SET ( pinning[ s ], &mask );
			if( pthread_setaffinity_np( pthread_self(), sizeof( cpu_set_t ), &mask ) != 0 ) {
				fprintf( stderr, "Could not pin master thread to requested hardware thread (%d)!\n", pinning[ s ] );
				mcbsp_util_fatal();
			}
#endif
			//record our own descriptor
			init->threads[ 0 ] = pthread_self();
			//copy part of mcbsp_internal_spmd: register thread-local data
			//and record the start time for this (master) thread
			const int rc = pthread_setspecific( mcbsp_internal_thread_data, thread_data );
			if( rc != 0 ) {
				fprintf( stderr, "Could not store thread-local data in continuator thread!\n" );
				fprintf( stderr, "(%s)\n", strerror( rc ) );
				mcbsp_util_fatal();
			}
#ifdef __MACH__
			//get rights for accessing Mach's timers
			const kern_return_t rc1 = host_get_clock_service( mach_host_self(), SYSTEM_CLOCK, &(thread_data->clock) );
			if( rc1 != KERN_SUCCESS ) {
				fprintf( stderr, "Could not access the Mach system timer (%s)\n", mach_error_string( rc1 ) );
				mcbsp_util_fatal();
			}
			const kern_return_t rc2 = clock_get_time( thread_data->clock, &(thread_data->start) );
			if( rc2 != KERN_SUCCESS ) {
				fprintf( stderr, "Could not get starting time (%s)\n", mach_error_string( rc2 ) );
				mcbsp_util_fatal();
			}
#else
			clock_gettime( CLOCK_MONOTONIC, &(thread_data->start) );
#endif
			//this one is extra, enables possible BSP-within-BSP execution.
			if( pthread_setspecific( mcbsp_internal_init_data, NULL ) != 0 ) {
				fprintf( stderr, "Could not reset initialisation data to NULL on SPMD start!\n" );
				mcbsp_util_fatal();
			}
		}
	}
	//free pinning only if it was not manually defined
	if( MCBSP_AFFINITY != MANUAL )
		free( pinning );
}
void bsp_end() { //get thread-local data struct mcbsp_thread_data * const data = pthread_getspecific( mcbsp_internal_thread_data ); if( data == NULL ) { fprintf( stderr, "Error: could not get thread-local data in call to bsp_abort( error_message )!\n" ); mcbsp_util_fatal(); } //record end data->init->ended = true; //get lock pthread_mutex_lock( &(data->init->mutex) ); //see if synchronisation is complete if( data->init->sync_entry_counter++ == data->init->P - 1 ) { data->init->sync_entry_counter = 0; pthread_cond_broadcast( &(data->init->condition) ); } else pthread_cond_wait( &(data->init->condition), &(data->init->mutex) ); //unlock mutex pthread_mutex_unlock( &(data->init->mutex) ); //set thread-local data to NULL if( pthread_setspecific( mcbsp_internal_thread_data, NULL ) != 0 ) { fprintf( stderr, "Could not set thread-local data to NULL on thread exit.\n" ); mcbsp_util_fatal(); } //free data and exit gracefully, #ifdef __MACH__ mach_port_deallocate( mach_task_self(), data->clock ); #endif mcbsp_util_address_map_destroy( &(data->local2global) ); mcbsp_util_stack_destroy( &(data->removedGlobals) ); mcbsp_util_stack_destroy( &(data->localsToRemove) ); for( MCBSP_PROCESSOR_INDEX_DATATYPE s = 0; s < data->init->P; ++s ) { mcbsp_util_stack_destroy( &(data->queues[ s ]) ); } free( data->queues ); mcbsp_util_stack_destroy( &(data->bsmp) ); mcbsp_util_stack_destroy( &(data->localsToPush) ); //exit if not master thread if( data->bsp_id != 0 ) { //free thread-local data free( data ); pthread_exit( NULL ); } //master thread cleans up init struct struct mcbsp_init_data *init = data->init; //that's everything we needed from the thread-local data struct free( data ); //wait for other threads for( MCBSP_PROCESSOR_INDEX_DATATYPE s = 1; s < init->P; ++s ) pthread_join( init->threads[ s ], NULL ); //destroy mutex and condition pthread_mutex_destroy( &(init->mutex) ); pthread_cond_destroy( &(init->condition) ); pthread_cond_destroy( &(init->mid_condition) ); //destroy global 
address table mcbsp_util_address_table_destroy( &(init->global2local) ); //destroy pointers to thread-local data structs free( init->threadData ); //exit gracefully, free threads array free( init->threads ); //exit gracefully, free BSP program init data free( init ); }