inline void AllocatedChunk::removeReference( WD const &wd ) {
   //ensure(_refs > 0, "invalid removeReference, chunk has 0 references!");
   if ( _refs == 0 ) {
      *myThread->_file << " removeReference ON A CHUNK WITH 0 REFS!!!" << std::endl;
   }
   _refs--;
   _refWdId[&wd]--;
   if ( _refWdId[&wd] == 0 ) {
      // This WD no longer holds any reference to the chunk: forget the locations it recorded.
      _refLoc[wd.getId()].clear();
   }

   //std::cerr << "del ref to chunk "<< (void*)this << " " << _refs.value() << std::endl;
   //if ( _refs == (unsigned int) -1 ) {
   //   std::cerr << "overflow at references chunk "<< (void*)this << std::endl; sys.printBt();
   //} else if ( _refs == 0 ) {
   //   std::cerr << "zeroed at references chunk "<< (void*)this << std::endl;
   //}
}
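// The explicit zero check above stands in for the disabled ensure(): _refs appears to be an
// unsigned counter (the dead debug code compares it against (unsigned int) -1), so
// decrementing it past zero does not go negative but wraps around to the largest
// representable value. A minimal standalone illustration of that wrap-around, using a plain
// unsigned int rather than the runtime's atomic counter:
#include <cassert>
#include <climits>

int main() {
   unsigned int refs = 0;
   refs--;                               // 0 - 1 wraps to UINT_MAX; it never becomes -1
   assert( refs == UINT_MAX );
   assert( refs == (unsigned int) -1 );  // the comparison form used by the disabled debug check
   return 0;
}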
void MemCacheCopy::generateInOps( BaseAddressSpaceInOps &ops, bool input, bool output, WD const &wd, unsigned int copyIdx ) {
   //NANOS_INSTRUMENT( InstrumentState inst4(NANOS_CC_CDIN_OP_GEN); );
   _reg.key->lockObject();

   if ( input && output ) {
      // Re-read the version, in case this is a commutative or concurrent access.
      if ( _reg.getVersion() > _version ) {
         *myThread->_file << "[!!!] WARNING: concurrent or commutative detected, wd " << wd.getId()
            << " " << ( wd.getDescription() != NULL ? wd.getDescription() : "[no desc]" )
            << " index " << copyIdx << " _reg.getVersion() " << _reg.getVersion()
            << " _version " << _version << std::endl;
         _version = _reg.getVersion();
      }
   }

   if ( ops.getPE()->getMemorySpaceId() != 0 ) {
      /* CACHE ACCESS: the target is a device memory space, so operations go through the chunk. */
      if ( input ) {
         if ( _policy == RegionCache::FPGA ) {
            _chunk->copyRegionFromHost( ops, _reg.id, _version, wd, copyIdx );
         } else {
            _chunk->NEWaddReadRegion2( ops, _reg.id, _version, _locations, wd, copyIdx );
         }
      } else if ( output ) {
         _chunk->NEWaddWriteRegion( _reg.id, _version, &wd, copyIdx );
      } else {
         fatal( "invalid path" );
      }
   } else {
      /* HOST ACCESS: only input data needs to be copied in. */
      if ( input ) {
         ops.copyInputData( *this, wd, copyIdx );
      }
   }

   //NANOS_INSTRUMENT( inst4.close(); );
   _reg.key->unlockObject();
}
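// A minimal, self-contained sketch of the routing decision generateInOps makes once the
// region is locked. The enum and function below (routeCopy, Action) are hypothetical
// stand-ins used only to show the control flow: device memory spaces (memory space id != 0)
// register read/write regions on the chunk, with the FPGA policy copying straight from the
// host, while host accesses only need the input data copied in.
#include <cstdio>
#include <cstdlib>

enum Action { DeviceCopyFromHost, DeviceAddReadRegion, DeviceAddWriteRegion, HostCopyInput, HostNoOp };

Action routeCopy( bool onDevice, bool input, bool output, bool fpgaPolicy ) {
   if ( onDevice ) {                                 // cache access
      if ( input )  return fpgaPolicy ? DeviceCopyFromHost : DeviceAddReadRegion;
      if ( output ) return DeviceAddWriteRegion;
      std::fprintf( stderr, "invalid path\n" );      // mirrors fatal("invalid path")
      std::exit( 1 );
   }
   return input ? HostCopyInput : HostNoOp;          // host access
}

int main() {
   // An inout copy targeting a device without the FPGA policy takes the read-region path,
   // because the input branch is checked first.
   return routeCopy( true, true, true, false ) == DeviceAddReadRegion ? 0 : 1;
}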
bool GPUThread::inlineWorkDependent ( WD &wd )
{
   GPUProcessor &myGPU = * ( GPUProcessor * ) myThread->runningOn();

   if ( GPUConfig::isOverlappingInputsDefined() ) {
      // Wait for the input transfer stream to finish
      NANOS_GPU_CREATE_IN_CUDA_RUNTIME_EVENT( NANOS_GPU_CUDA_INPUT_STREAM_SYNC_EVENT );
      cudaStreamSynchronize( myGPU.getGPUProcessorInfo()->getInTransferStream() );
      NANOS_GPU_CLOSE_IN_CUDA_RUNTIME_EVENT;
      // Erase the wait input list and synchronize it with cache
      myGPU.getInTransferList()->clearMemoryTransfers();
      myGPU.freeInputPinnedMemory();
   }

   // Check if someone is waiting for our data
   myGPU.getOutTransferList()->clearRequestedMemoryTransfers();

   // We wait for wd inputs, but as we have just waited for them, we could skip this step
   wd.start( WD::IsNotAUserLevelThread );

   GPUDD &dd = ( GPUDD & ) wd.getActiveDevice();

   NANOS_INSTRUMENT ( InstrumentStateAndBurst inst1( "user-code", wd.getId(), NANOS_RUNNING ) );

   ( dd.getWorkFct() )( wd.getData() );

   if ( !GPUConfig::isOverlappingOutputsDefined() && !GPUConfig::isOverlappingInputsDefined() ) {
      // Wait for the GPU kernel to finish
      NANOS_GPU_CREATE_IN_CUDA_RUNTIME_EVENT( NANOS_GPU_CUDA_DEVICE_SYNC_EVENT );
#ifdef NANOS_GPU_USE_CUDA32
      cudaThreadSynchronize();
#else
      cudaDeviceSynchronize();
#endif
      NANOS_GPU_CLOSE_IN_CUDA_RUNTIME_EVENT;

      // Normally this instrumentation code is inserted by the compiler in the task outline.
      // But because the kernel call is asynchronous for GPUs we need to raise them manually here
      // when we know the kernel has really finished
      NANOS_INSTRUMENT ( raiseWDClosingEvents() );

      // Copy out results from tasks executed previously
      // Do it always, as another GPU may be waiting for results
      myGPU.getOutTransferList()->executeMemoryTransfers();
   } else {
      myGPU.getOutTransferList()->executeMemoryTransfers();
   }

   if ( GPUConfig::isPrefetchingDefined() ) {
      WD * last = &wd;
      while ( canPrefetch() ) {
         // Get next task in order to prefetch data to device memory
         WD *next = Scheduler::prefetch( ( nanos::BaseThread * ) this, *last );
         if ( next != NULL ) {
            next->init();
            addNextWD( next );
            last = next;
         } else {
            break;
         }
      }
   }

   if ( GPUConfig::isOverlappingOutputsDefined() ) {
      NANOS_GPU_CREATE_IN_CUDA_RUNTIME_EVENT( NANOS_GPU_CUDA_OUTPUT_STREAM_SYNC_EVENT );
      cudaStreamSynchronize( myGPU.getGPUProcessorInfo()->getOutTransferStream() );
      NANOS_GPU_CLOSE_IN_CUDA_RUNTIME_EVENT;
      myGPU.freeOutputPinnedMemory();
   }

   if ( GPUConfig::isOverlappingOutputsDefined() || GPUConfig::isOverlappingInputsDefined() ) {
      // Wait for the GPU kernel to finish, if we have not waited before
      //cudaThreadSynchronize();
      NANOS_GPU_CREATE_IN_CUDA_RUNTIME_EVENT( NANOS_GPU_CUDA_KERNEL_STREAM_SYNC_EVENT );
      cudaStreamSynchronize( myGPU.getGPUProcessorInfo()->getKernelExecStream() );
      NANOS_GPU_CLOSE_IN_CUDA_RUNTIME_EVENT;

      // Normally this instrumentation code is inserted by the compiler in the task outline.
      // But because the kernel call is asynchronous for GPUs we need to raise them manually here
      // when we know the kernel has really finished
      NANOS_INSTRUMENT ( raiseWDClosingEvents() );
   }

   return true;
}
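// A minimal, self-contained sketch of the prefetch chain in inlineWorkDependent: starting
// from the WD that just ran, keep asking the scheduler for a successor, initialise it and
// queue it, and make the newest WD the reference point for the next request, until either
// capacity runs out or no candidate is returned. FakeScheduler, Task and PREFETCH_SLOTS are
// hypothetical stand-ins for Scheduler::prefetch, the WD type and canPrefetch().
#include <cstddef>
#include <deque>
#include <vector>

struct Task { int id; bool initialised; };

struct FakeScheduler {
   std::deque<Task *> ready;                    // tasks the scheduler could hand out
   Task *prefetch( Task * /*last*/ ) {          // the real prefetch weighs data affinity with 'last'
      if ( ready.empty() ) return NULL;
      Task *t = ready.front();
      ready.pop_front();
      return t;
   }
};

static const std::size_t PREFETCH_SLOTS = 2;    // plays the role of canPrefetch()

void prefetchChain( FakeScheduler &sched, Task *current, std::vector<Task *> &nextQueue ) {
   Task *last = current;
   while ( nextQueue.size() < PREFETCH_SLOTS ) {
      Task *next = sched.prefetch( last );      // get the next task to prefetch data for
      if ( next == NULL ) break;
      next->initialised = true;                 // plays the role of next->init()
      nextQueue.push_back( next );              // plays the role of addNextWD( next )
      last = next;                              // chain: prefetch relative to the newest WD
   }
}

int main() {
   FakeScheduler sched;
   Task a = { 1, false }, b = { 2, false }, c = { 3, false };
   sched.ready.push_back( &a );
   sched.ready.push_back( &b );
   sched.ready.push_back( &c );

   Task current = { 0, true };
   std::vector<Task *> nextQueue;
   prefetchChain( sched, &current, nextQueue );
   return nextQueue.size() == PREFETCH_SLOTS ? 0 : 1;
}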
inline void AllocatedChunk::addReference( WD const &wd, unsigned int loc ) {
   _refs++;
   _refWdId[&wd]++;
   _refLoc[wd.getId()].insert( loc );
   //std::cerr << "add ref to chunk "<< (void*)this << " " << _refs.value() << std::endl;
}
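// A minimal, self-contained model of the bookkeeping that addReference and removeReference
// maintain together: a global reference count, a per-WD count, and the set of locations
// recorded per WD id, where a WD's locations are dropped once its last reference goes away.
// ChunkRefModel and its members are hypothetical illustrations, not the Nanos++ API; the
// real AllocatedChunk uses the runtime's atomic counter and WD type.
#include <cassert>
#include <map>
#include <set>

struct ChunkRefModel {
   unsigned int refs;
   std::map<const void *, unsigned int> refsPerWd;    // mirrors _refWdId (keyed by WD address)
   std::map<int, std::set<unsigned int> > locsPerWd;  // mirrors _refLoc (keyed by WD id)

   ChunkRefModel() : refs( 0 ) {}

   void addReference( const void *wd, int wdId, unsigned int loc ) {
      refs++;
      refsPerWd[wd]++;
      locsPerWd[wdId].insert( loc );
   }

   void removeReference( const void *wd, int wdId ) {
      assert( refs > 0 && "removing a reference from a chunk with 0 refs" );
      refs--;
      if ( --refsPerWd[wd] == 0 ) {
         locsPerWd[wdId].clear();   // last reference held by this WD: forget its locations
      }
   }
};

int main() {
   ChunkRefModel chunk;
   int wdA = 1;
   chunk.addReference( &wdA, wdA, /* loc */ 0 );
   chunk.removeReference( &wdA, wdA );
   assert( chunk.refs == 0 );
   return 0;
}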