int main ( int argc, char **argv ) { int i; bool check = true; main__loop_1_data_t _loop_data; A = 0; WD *wg = getMyThreadSafe()->getCurrentWD(); for ( i = 0; i < NUM_ITERS; i++ ) { // If we're done processing half of the dataset if ( i == NUM_ITERS/2 ) { // Stop scheduler sys.stopScheduler(); } // Work descriptor creation WD * wd = new WD( new SMPDD( main__loop_1 ), sizeof( _loop_data ), __alignof__(nanos_loop_info_t), ( void * ) &_loop_data ); wd->setPriority( 100 ); // Work Group affiliation wg->addWork( *wd ); // Work submission sys.submit( *wd ); if ( i == ( NUM_ITERS/2 + 5 ) ){ // Keep going sys.startScheduler(); } } // barrier (kind of) wg->waitCompletion(); /* * How can we be sure the test passed? Each task increments A. If we run N * tasks, A should be equal to N. * If it's less than N, that'd mean the scheduler lost something. */ if ( A.value() != NUM_ITERS ) check = false; if ( check ) { fprintf(stderr, "%s : %s\n", argv[0], "successful"); return 0; } else { fprintf(stderr, "%s: %s\n", argv[0], "unsuccessful"); return -1; } }
int main ( int argc, char **argv ) { cout << "PEs = " << sys.getSMPPlugin()->getNumPEs() << endl; cout << "Mode = " << sys.getExecutionMode() << endl; cout << "Verbose = " << sys.getVerbose() << endl; cout << "Args" << endl; for ( int i = 0; i < argc; i++ ) cout << argv[i] << endl; cout << "start" << endl; hello_world_args *data; const char *str; // Work arguments str = "std::string(1)"; data = new hello_world_args(); data->a = 1; strncpy(data->b, "char *string(1)", strlen("char *string(1)")); data->c = str; // Work descriptor creation WD * wd1 = new WD( new SMPDD( hello_world ), sizeof(hello_world_args), __alignof__(hello_world_args), data ); // Work arguments str = "std::string(2)"; data = new hello_world_args(); data->repeat_n_info.n = 10; data->a = 2; strncpy(data->b, "char *string(2)", strlen("char *string(2)")); data->c = str; // loading RepeatN Slicer Plugin sys.loadPlugin( "slicer-repeat_n" ); Slicer *slicer = sys.getSlicer ( "repeat_n" ); // Work descriptor creation WD * wd2 = new WorkDescriptor( new SMPDD( hello_world ), sizeof(hello_world_args), __alignof__(hello_world_args),data,0,NULL,NULL ); wd2->setSlicer(slicer); // Work Group affiliation and work submision WD *wg = getMyThreadSafe()->getCurrentWD(); wg->addWork( *wd1 ); wg->addWork( *wd2 ); if ( sys.getPMInterface().getInternalDataSize() > 0 ) { char *idata = NEW char[sys.getPMInterface().getInternalDataSize()]; sys.getPMInterface().initInternalData( idata ); wd1->setInternalData( idata ); }
int main ( int argc, char **argv ) { int i; bool check = true; main__loop_1_data_t _loop_data; // Repeat the test NUM_RUNS times for ( int testNumber = 0; testNumber < NUM_RUNS; ++testNumber ) { A = 0; WG *wg = getMyThreadSafe()->getCurrentWD(); // Stop scheduler sys.stopScheduler(); // increment variable for ( i = 0; i < NUM_ITERS; i++ ) { // Work descriptor creation WD * wd = new WD( new SMPDD( main__loop_1 ), sizeof( _loop_data ), __alignof__(nanos_loop_info_t), ( void * ) &_loop_data ); wd->setPriority( 100 ); // Work Group affiliation wg->addWork( *wd ); // Work submission sys.submit( *wd ); } // Re-enable the scheduler sys.startScheduler(); // barrier (kind of) wg->waitCompletion(); /* * The verification criteria is that A is equal to the number of tasks * run. Should A be lower, that would indicate that not all tasks * successfuly finished. */ if ( A.value() != NUM_ITERS ) check = false; } if ( check ) { fprintf(stderr, "%s : %s\n", argv[0], "successful"); return 0; } else { fprintf(stderr, "%s: %s\n", argv[0], "unsuccessful"); return -1; } }
int main (int argc, char **argv) { cout << "start" << endl; //all threads perform a barrier: ThreadTeam &team = *getMyThreadSafe()->getTeam(); size = team.size(); counts = new int[team.size()]; counts[0] = 0; for ( unsigned i = 1; i < team.size(); i++ ) { counts[i] = 0; WD * wd = new WD(new SMPDD(barrier_code)); wd->tieTo(team[i]); sys.submit(*wd); } usleep(100); WD *wd = getMyThreadSafe()->getCurrentWD(); wd->tieTo(*getMyThreadSafe()); barrier_code(NULL); cout << "end" << endl; }
// Drop one reference held by 'wd' on this chunk: decrements the chunk-wide
// counter and the per-WD counter, and forgets the WD's recorded locations
// once its last reference is gone.
inline void AllocatedChunk::removeReference( WD const &wd )
{
   //ensure(_refs > 0, "invalid removeReference, chunk has 0 references!");
   // Underflow diagnostic: releasing a reference on an unreferenced chunk
   if ( _refs == 0 ) {
      *myThread->_file << " removeReference ON A CHUNK WITH 0 REFS!!!" << std::endl;
   }
   _refs--;

   // Single lookup: decrement this WD's count and test it in one step
   if ( --_refWdId[&wd] == 0 ) {
      _refLoc[wd.getId()].clear();
   }
   //std::cerr << "del ref to chunk "<< (void*)this << " " << _refs.value() << std::endl;
   //if ( _refs == (unsigned int) -1 ) {
   //   std::cerr << "overflow at references chunk "<< (void*)this << std::endl; sys.printBt();
   //} else if ( _refs == 0 ) {
   //   std::cerr << "zeroed at references chunk "<< (void*)this << std::endl;
   //}
}
/*!
 * \brief Enqueue a work descriptor in the readyQueue of the passed thread
 * \param thread pointer to the thread to which readyQueue the task must be appended
 * \param wd a reference to the work descriptor to be enqueued
 * \sa ThreadData, WD and BaseThread
 */
virtual void queue ( BaseThread *thread, WD &wd )
{
   // Lazily bind this thread's schedule data to the memory space it runs on
   ThreadData &data = ( ThreadData & ) *thread->getTeamData()->getScheduleData();
   if ( !data._init ) {
      data._cacheId = thread->runningOn()->getMemorySpaceId();
      data._init = true;
   }
   TeamData &tdata = (TeamData &) *thread->getTeam()->getScheduleData();

   // A tied WD must go to the queue of the memory space of the thread it is
   // tied to — no affinity computation needed
   if ( wd.isTied() ) {
      unsigned int index = wd.isTiedTo()->runningOn()->getMemorySpaceId();
      tdata._readyQueues[index].push_front ( &wd );
      return;
   }
   if ( wd.getNumCopies() > 0 ){
      // Rank each cache by how many bytes of this WD's copies the parent's
      // directory already reports as present there.
      // NOTE(review): 'ranks' is a variable-length array — a GCC extension,
      // not standard C++; consider a fixed cap or a vector.
      unsigned int numCaches = sys.getCacheMap().getSize();
      unsigned int ranks[numCaches];
      for (unsigned int i = 0; i < numCaches; i++ ) {
         ranks[i] = 0;
      }
      CopyData * copies = wd.getCopies();
      for ( unsigned int i = 0; i < wd.getNumCopies(); i++ ) {
         // Private copies carry no locality information
         if ( !copies[i].isPrivate() ) {
            WorkDescriptor* parent = wd.getParent();
            if ( parent != NULL ) {
               Directory *dir = parent->getDirectory();
               if ( dir != NULL ) {
                  DirectoryEntry *de = dir->findEntry(copies[i].getAddress());
                  if ( de != NULL ) {
                     // Cache ids are 1-based (index j maps to cache j+1): any
                     // recorded access adds the copy's full size to that rank
                     for ( unsigned int j = 0; j < numCaches; j++ ) {
                        ranks[j]+=((unsigned int)(de->getAccess( j+1 ) > 0))*copies[i].getSize();
                     }
                  }
               }
            }
         }
      }
      // Pick the best-ranked cache; with no ranked data the default winner
      // is queue 0 (the host queue)
      unsigned int winner = 0;
      unsigned int maxRank = 0;
      for ( unsigned int i = 0; i < numCaches; i++ ) {
         if ( ranks[i] > maxRank ) {
            winner = i+1;
            maxRank = ranks[i];
         }
      }
      tdata._readyQueues[winner].push_front( &wd );
   } else {
      // No copies: no affinity information, use the host queue
      tdata._readyQueues[0].push_front ( &wd );
   }
}
/*!
 * \brief Generate the memory operations needed to make this copy available
 *        for the requested access mode, either in a device cache or on the host.
 * \param ops      operation collector; its PE decides cache vs host path
 * \param input    true if the access reads the data
 * \param output   true if the access writes the data
 * \param wd       work descriptor the copy belongs to (used for diagnostics)
 * \param copyIdx  index of this copy within the WD
 */
void MemCacheCopy::generateInOps( BaseAddressSpaceInOps &ops, bool input, bool output, WD const &wd, unsigned int copyIdx ) {
   //NANOS_INSTRUMENT( InstrumentState inst4(NANOS_CC_CDIN_OP_GEN); );
   // Serialize against other threads operating on the same region object
   _reg.key->lockObject();
   if ( input && output ) {
      //re read version, in case of this being a commutative or concurrent access
      if ( _reg.getVersion() > _version ) {
         *myThread->_file << "[!!!] WARNING: concurrent or commutative detected, wd " << wd.getId() << " " << (wd.getDescription()!=NULL?wd.getDescription():"[no desc]") << " index " << copyIdx << " _reg.getVersion() " << _reg.getVersion() << " _version " << _version << std::endl;
         // Adopt the newer region version so the operations below use it
         _version = _reg.getVersion();
      }
   }
   // Memory space 0 is the host; anything else goes through a device cache
   if ( ops.getPE()->getMemorySpaceId() != 0 ) {
      /* CACHE ACCESS */
      if ( input ) {
         // FPGA policy copies straight from host; otherwise use the regular
         // read-region path that consults the known locations
         if ( _policy == RegionCache::FPGA ) {
            _chunk->copyRegionFromHost( ops, _reg.id, _version, wd, copyIdx );
         } else {
            _chunk->NEWaddReadRegion2( ops, _reg.id, _version, _locations, wd, copyIdx );
         }
      } else if ( output ) {
         // Output-only: register the write region, no input transfer needed
         _chunk->NEWaddWriteRegion( _reg.id, _version, &wd, copyIdx );
      } else {
         // A copy must be input, output, or both
         fatal("invalid path");
      }
   } else {
      /* HOST ACCESS */
      // Host only needs work when the data must be read in
      if ( input ) {
         ops.copyInputData( *this, wd, copyIdx );
      }
   }
   //NANOS_INSTRUMENT( inst4.close(); );
   _reg.key->unlockObject();
}
/*!
 * \brief Run a WD's GPU kernel inline on this GPU thread, handling input/output
 *        transfer overlap, data prefetch for upcoming tasks, and the stream
 *        synchronizations required by each overlap configuration.
 * \param wd the work descriptor whose active GPU device function is executed
 * \return true (the WD was executed)
 */
bool GPUThread::inlineWorkDependent ( WD &wd )
{
   GPUProcessor &myGPU = * ( GPUProcessor * ) myThread->runningOn();

   if ( GPUConfig::isOverlappingInputsDefined() ) {
      // Wait for the input transfer stream to finish
      NANOS_GPU_CREATE_IN_CUDA_RUNTIME_EVENT( NANOS_GPU_CUDA_INPUT_STREAM_SYNC_EVENT );
      cudaStreamSynchronize( myGPU.getGPUProcessorInfo()->getInTransferStream() );
      NANOS_GPU_CLOSE_IN_CUDA_RUNTIME_EVENT;
      // Erase the wait input list and synchronize it with cache
      myGPU.getInTransferList()->clearMemoryTransfers();
      myGPU.freeInputPinnedMemory();
   }

   // Check if someone is waiting for our data
   myGPU.getOutTransferList()->clearRequestedMemoryTransfers();

   // We wait for wd inputs, but as we have just waited for them, we could skip this step
   wd.start( WD::IsNotAUserLevelThread );

   // Launch the WD's device function (asynchronous on the GPU)
   GPUDD &dd = ( GPUDD & ) wd.getActiveDevice();
   NANOS_INSTRUMENT ( InstrumentStateAndBurst inst1( "user-code", wd.getId(), NANOS_RUNNING ) );
   ( dd.getWorkFct() )( wd.getData() );

   if ( !GPUConfig::isOverlappingOutputsDefined() && !GPUConfig::isOverlappingInputsDefined() ) {
      // Wait for the GPU kernel to finish
      NANOS_GPU_CREATE_IN_CUDA_RUNTIME_EVENT( NANOS_GPU_CUDA_DEVICE_SYNC_EVENT );
#ifdef NANOS_GPU_USE_CUDA32
      cudaThreadSynchronize();
#else
      cudaDeviceSynchronize();
#endif
      NANOS_GPU_CLOSE_IN_CUDA_RUNTIME_EVENT;

      // Normally this instrumentation code is inserted by the compiler in the task outline.
      // But because the kernel call is asynchronous for GPUs we need to raise them manually here
      // when we know the kernel has really finished
      NANOS_INSTRUMENT ( raiseWDClosingEvents() );

      // Copy out results from tasks executed previously
      // Do it always, as another GPU may be waiting for results
      myGPU.getOutTransferList()->executeMemoryTransfers();
   } else {
      // Overlapping enabled: fire pending output transfers without waiting
      // for the kernel here
      myGPU.getOutTransferList()->executeMemoryTransfers();
   }

   if ( GPUConfig::isPrefetchingDefined() ) {
      // While there is room, pull upcoming tasks and stage their data on the
      // device so they can start without waiting for transfers
      WD * last = &wd;
      while ( canPrefetch() ) {
         // Get next task in order to prefetch data to device memory
         WD *next = Scheduler::prefetch( ( nanos::BaseThread * ) this, *last );
         if ( next != NULL ) {
            next->init();
            addNextWD( next );
            last = next;
         } else {
            break;
         }
      }
   }

   if ( GPUConfig::isOverlappingOutputsDefined() ) {
      // Drain the output transfer stream before releasing its pinned buffers
      NANOS_GPU_CREATE_IN_CUDA_RUNTIME_EVENT( NANOS_GPU_CUDA_OUTPUT_STREAM_SYNC_EVENT );
      cudaStreamSynchronize( myGPU.getGPUProcessorInfo()->getOutTransferStream() );
      NANOS_GPU_CLOSE_IN_CUDA_RUNTIME_EVENT;
      myGPU.freeOutputPinnedMemory();
   }

   if ( GPUConfig::isOverlappingOutputsDefined() || GPUConfig::isOverlappingInputsDefined() ) {
      // Wait for the GPU kernel to finish, if we have not waited before
      //cudaThreadSynchronize();
      NANOS_GPU_CREATE_IN_CUDA_RUNTIME_EVENT( NANOS_GPU_CUDA_KERNEL_STREAM_SYNC_EVENT );
      cudaStreamSynchronize( myGPU.getGPUProcessorInfo()->getKernelExecStream() );
      NANOS_GPU_CLOSE_IN_CUDA_RUNTIME_EVENT;

      // Normally this instrumentation code is inserted by the compiler in the task outline.
      // But because the kernel call is asynchronous for GPUs we need to raise them manually here
      // when we know the kernel has really finished
      NANOS_INSTRUMENT ( raiseWDClosingEvents() );
   }

   return true;
}
// Register a new reference from 'wd' on this chunk, remembering the memory
// location ('loc') it was referenced from. Paired with removeReference().
inline void AllocatedChunk::addReference( WD const &wd, unsigned int loc ) {
   _refs++;                           // chunk-wide reference count
   _refWdId[&wd]++;                   // per-WD reference count
   _refLoc[wd.getId()].insert(loc);   // locations this WD referenced the chunk from
   //std::cerr << "add ref to chunk "<< (void*)this << " " << _refs.value() << std::endl;
}
int main ( int argc, char **argv ) { int i; bool check = true; main__loop_1_data_t _loop_data; // initialize vector for ( i = 0; i < VECTOR_SIZE; i++ ) A[i] = 0; // Stop scheduler sys.stopScheduler(); sys.waitUntilThreadsPaused(); WG *wg = getMyThreadSafe()->getCurrentWD(); // increment vector for ( i = 0; i < NUM_ITERS; i++ ) { #if USE_NANOS // loop info initialization _loop_data.loop_info.lower = 0; _loop_data.loop_info.upper = VECTOR_SIZE; _loop_data.loop_info.step = + 1; // Work descriptor creation WD * wd = new WD( new SMPDD( main__loop_1 ), sizeof( _loop_data ), __alignof__(nanos_loop_info_t), ( void * ) &_loop_data ); wd->setPriority( 100 ); // Work Group affiliation wg->addWork( *wd ); // Work submission sys.submit( *wd ); #else for ( int j = 0; j < VECTOR_SIZE; j++ ) A[j] += 100; #endif } for ( i = 0; i < sys.getNumWorkers(); ++i ) { #if USE_NANOS // Second task: set to 0 WD* wd = new WD( new SMPDD( main__loop_2 ), sizeof( _loop_data ), __alignof__(nanos_loop_info_t), ( void * ) &_loop_data ); // Use a higher priority wd->setPriority( 150 ); wg->addWork( *wd ); // Work submission sys.submit( *wd ); #else for ( int j = 0; j < VECTOR_SIZE; j++ ) A[j] = 0; #endif } // Re-enable the scheduler sys.startScheduler(); sys.waitUntilThreadsUnpaused(); // barrier (kind of) wg->waitCompletion(); /* * Verification criteria: The priority scheduler must ensure that the * highest priority task that was submitted the latest is executed before * at least one lower priority task. * In this case, as the highest priority task sets the elements in the A * array to 0, it is as simple as checking if that's the value at the end of * the execution. If it is, the test failed, otherwise, succeeded. */ for ( i = 0; i < VECTOR_SIZE; i++ ) if ( A[i] == 0 ) check = false; if ( check ) { fprintf(stderr, "%s : %s\n", argv[0], "successful"); return 0; } else { fprintf(stderr, "%s: %s\n", argv[0], "unsuccessful"); return -1; } }