Example no. 1
// Drop the reference that work descriptor 'wd' holds on this chunk; once the WD
// holds no more references, forget the locations it was referencing.
inline void AllocatedChunk::removeReference( WD const &wd ) {
   //ensure(_refs > 0, "invalid removeReference, chunk has 0 references!");
   if ( _refs == 0 ) {
      *myThread->_file << " removeReference ON A CHUNK WITH 0 REFS!!!" << std::endl;
   }
   _refs--;
   _refWdId[&wd]--;
   if ( _refWdId[&wd] == 0 ) {
      _refLoc[wd.getId()].clear();
   }
   
}
Example no. 2
void MemCacheCopy::generateInOps( BaseAddressSpaceInOps &ops, bool input, bool output, WD const &wd, unsigned int copyIdx ) {
   //NANOS_INSTRUMENT( InstrumentState inst4(NANOS_CC_CDIN_OP_GEN); );
   _reg.key->lockObject();
   if ( input && output ) {
      // Re-read the version, in case this is a commutative or concurrent access
      if ( _reg.getVersion() > _version ) {
         *myThread->_file << "[!!!] WARNING: concurrent or commutative detected, wd " << wd.getId() << " " << (wd.getDescription()!=NULL?wd.getDescription():"[no desc]") << " index " << copyIdx << " _reg.getVersion() " << _reg.getVersion() << " _version " << _version << std::endl;
         _version = _reg.getVersion();
      }
   }

   if ( ops.getPE()->getMemorySpaceId() != 0 ) {
      /* CACHE ACCESS */
      if ( input )  {
         if ( _policy == RegionCache::FPGA ) {
            _chunk->copyRegionFromHost( ops, _reg.id, _version, wd, copyIdx );
         } else {
            _chunk->NEWaddReadRegion2( ops, _reg.id, _version, _locations, wd, copyIdx );
         }
      } else if ( output ) {
         _chunk->NEWaddWriteRegion( _reg.id, _version, &wd, copyIdx );
      } else {
         fatal("invalid path");
      }
   } else {
      /* HOST ACCESS */
      if ( input )  {
         ops.copyInputData( *this, wd, copyIdx );
      }
   }
   //NANOS_INSTRUMENT( inst4.close(); );
   _reg.key->unlockObject();
}
Example no. 3
bool GPUThread::inlineWorkDependent ( WD &wd )
{
   GPUProcessor &myGPU = * ( GPUProcessor * ) myThread->runningOn();

   if ( GPUConfig::isOverlappingInputsDefined() ) {
      // Wait for the input transfer stream to finish
      NANOS_GPU_CREATE_IN_CUDA_RUNTIME_EVENT( NANOS_GPU_CUDA_INPUT_STREAM_SYNC_EVENT );
      cudaStreamSynchronize( myGPU.getGPUProcessorInfo()->getInTransferStream() );
      NANOS_GPU_CLOSE_IN_CUDA_RUNTIME_EVENT;
      // Erase the wait input list and synchronize it with cache
      myGPU.getInTransferList()->clearMemoryTransfers();
      myGPU.freeInputPinnedMemory();
   }

   // Check if someone is waiting for our data
   myGPU.getOutTransferList()->clearRequestedMemoryTransfers();

   // wd.start() waits for the WD's inputs, but since we have just waited for them above, this step could be skipped
   wd.start( WD::IsNotAUserLevelThread );

   GPUDD &dd = ( GPUDD & ) wd.getActiveDevice();

   NANOS_INSTRUMENT ( InstrumentStateAndBurst inst1( "user-code", wd.getId(), NANOS_RUNNING ) );
   ( dd.getWorkFct() )( wd.getData() );

   if ( !GPUConfig::isOverlappingOutputsDefined() && !GPUConfig::isOverlappingInputsDefined() ) {
      // Wait for the GPU kernel to finish
      NANOS_GPU_CREATE_IN_CUDA_RUNTIME_EVENT( NANOS_GPU_CUDA_DEVICE_SYNC_EVENT );
#ifdef NANOS_GPU_USE_CUDA32
      cudaThreadSynchronize();
#else
      cudaDeviceSynchronize();
#endif
      NANOS_GPU_CLOSE_IN_CUDA_RUNTIME_EVENT;

      // Normally this instrumentation code is inserted by the compiler in the task outline,
      // but because the kernel call is asynchronous on GPUs we need to raise these events
      // manually here, once we know the kernel has really finished
      NANOS_INSTRUMENT ( raiseWDClosingEvents() );

      // Copy out results from tasks executed previously
      // Do it always, as another GPU may be waiting for results
      myGPU.getOutTransferList()->executeMemoryTransfers();
   }
   else {
      // With overlapping enabled we skip the device-wide synchronization here;
      // the kernel and output streams are synchronized further below
      myGPU.getOutTransferList()->executeMemoryTransfers();
   }

   if ( GPUConfig::isPrefetchingDefined() ) {
      WD * last = &wd;
      while ( canPrefetch() ) {
         // Get next task in order to prefetch data to device memory
         WD *next = Scheduler::prefetch( ( nanos::BaseThread * ) this, *last );
         if ( next != NULL ) {
            next->init();
            addNextWD( next );
            last = next;
         } else {
            break;
         }
      }
   }

   if ( GPUConfig::isOverlappingOutputsDefined() ) {
      NANOS_GPU_CREATE_IN_CUDA_RUNTIME_EVENT( NANOS_GPU_CUDA_OUTPUT_STREAM_SYNC_EVENT );
      cudaStreamSynchronize( myGPU.getGPUProcessorInfo()->getOutTransferStream() );
      NANOS_GPU_CLOSE_IN_CUDA_RUNTIME_EVENT;
      myGPU.freeOutputPinnedMemory();
   }

   if ( GPUConfig::isOverlappingOutputsDefined() || GPUConfig::isOverlappingInputsDefined() ) {
      // Wait for the GPU kernel to finish, if we have not waited before
      //cudaThreadSynchronize();
      NANOS_GPU_CREATE_IN_CUDA_RUNTIME_EVENT( NANOS_GPU_CUDA_KERNEL_STREAM_SYNC_EVENT );
      cudaStreamSynchronize( myGPU.getGPUProcessorInfo()->getKernelExecStream() );
      NANOS_GPU_CLOSE_IN_CUDA_RUNTIME_EVENT;

      // Normally this instrumentation code is inserted by the compiler in the task outline,
      // but because the kernel call is asynchronous on GPUs we need to raise these events
      // manually here, once we know the kernel has really finished
      NANOS_INSTRUMENT ( raiseWDClosingEvents() );
   }

   return true;
}
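
The function above relies on dedicated CUDA streams for input transfers, kernel execution and output transfers, synchronizing each stream only when its result is actually consumed. Below is a minimal, self-contained sketch of those same CUDA runtime primitives; it is hypothetical illustration code, not part of Nanos++, and in the runtime the overlap comes from deferring each cudaStreamSynchronize until another task really needs the data, so one task's transfers overlap with another task's kernel.

// Minimal sketch (hypothetical, not Nanos++ code): dedicated streams for the input
// copy, the kernel and the output copy, each synchronized right before its result
// is consumed.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void scale( float *data, int n )
{
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   if ( i < n ) data[i] *= 2.0f;
}

int main()
{
   const int n = 1 << 20;
   float *hostBuf = NULL;
   float *devBuf = NULL;
   cudaMallocHost( ( void ** ) &hostBuf, n * sizeof( float ) );   // pinned memory, required for async copies
   cudaMalloc( ( void ** ) &devBuf, n * sizeof( float ) );
   for ( int i = 0; i < n; i++ ) hostBuf[i] = 1.0f;

   cudaStream_t inStream, kernelStream, outStream;
   cudaStreamCreate( &inStream );
   cudaStreamCreate( &kernelStream );
   cudaStreamCreate( &outStream );

   // Input transfer on its own stream; wait for it before launching the kernel
   cudaMemcpyAsync( devBuf, hostBuf, n * sizeof( float ), cudaMemcpyHostToDevice, inStream );
   cudaStreamSynchronize( inStream );

   // Kernel on its own stream; wait for it before starting the output transfer
   scale<<< ( n + 255 ) / 256, 256, 0, kernelStream >>>( devBuf, n );
   cudaStreamSynchronize( kernelStream );

   // Output transfer on its own stream; after this synchronization the host data is valid
   cudaMemcpyAsync( hostBuf, devBuf, n * sizeof( float ), cudaMemcpyDeviceToHost, outStream );
   cudaStreamSynchronize( outStream );

   printf( "hostBuf[0] = %f\n", hostBuf[0] );

   cudaStreamDestroy( inStream );
   cudaStreamDestroy( kernelStream );
   cudaStreamDestroy( outStream );
   cudaFree( devBuf );
   cudaFreeHost( hostBuf );
   return 0;
}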
Example no. 4
// Register a new reference from work descriptor 'wd', recording the location 'loc' it uses.
inline void AllocatedChunk::addReference( WD const &wd, unsigned int loc ) {
   _refs++;
   _refWdId[&wd]++;
   _refLoc[wd.getId()].insert(loc);
   //std::cerr << "add ref to chunk "<< (void*)this << " " << _refs.value() << std::endl;
}
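
Examples no. 1 and no. 4 form a pair: addReference() records a reference per work descriptor together with the location it uses, and removeReference() undoes that bookkeeping, clearing the location set once the WD holds no more references. The following is a minimal, self-contained sketch of that pattern using hypothetical simplified types (FakeWD, ChunkRefs); it is not the Nanos++ API, only an illustration of the bookkeeping.

// Minimal sketch (hypothetical types, not the Nanos++ API) of the bookkeeping done by
// addReference()/removeReference(): a global count, a per-WD count, and a per-WD set
// of locations that is cleared when that WD drops its last reference.
#include <cassert>
#include <iostream>
#include <map>
#include <set>

struct FakeWD { int id; };                                  // stand-in for WD

class ChunkRefs {
   unsigned int                               _refs;
   std::map<const FakeWD *, unsigned int>     _refWdId;     // references held per WD
   std::map<int, std::set<unsigned int> >     _refLoc;      // locations referenced per WD id
public:
   ChunkRefs() : _refs( 0 ) {}

   void addReference( FakeWD const &wd, unsigned int loc ) {
      _refs++;
      _refWdId[&wd]++;
      _refLoc[wd.id].insert( loc );
   }

   void removeReference( FakeWD const &wd ) {
      assert( _refs > 0 && "removeReference on a chunk with 0 refs" );
      _refs--;
      if ( --_refWdId[&wd] == 0 ) {
         _refLoc[wd.id].clear();                            // this WD no longer references any location
      }
   }

   unsigned int refs() const { return _refs; }
};

int main() {
   FakeWD wd = { 42 };
   ChunkRefs chunk;
   chunk.addReference( wd, 0 );
   chunk.addReference( wd, 1 );
   chunk.removeReference( wd );
   chunk.removeReference( wd );
   std::cout << "refs = " << chunk.refs() << std::endl;     // prints 0
   return 0;
}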