int main ( int argc, char **argv )
{
   int i;
   bool check = true;

   main__loop_1_data_t _loop_data;

   A = 0;

   WD *wg = getMyThreadSafe()->getCurrentWD(); 
   for ( i = 0; i < NUM_ITERS; i++ ) {
      
      // Once half of the tasks have been submitted
      if ( i == NUM_ITERS/2 ) {
         // Stop scheduler
         sys.stopScheduler();
      }
      
      // Work descriptor creation
      WD * wd = new WD( new SMPDD( main__loop_1 ), sizeof( _loop_data ), __alignof__(nanos_loop_info_t), ( void * ) &_loop_data );
      wd->setPriority( 100 );

      // Work Group affiliation
      wg->addWork( *wd );

      // Work submission
      sys.submit( *wd );
      
      if ( i == ( NUM_ITERS/2 + 5 ) ) {
         // Resume the scheduler
         sys.startScheduler();
      }
   }
   // barrier (kind of)
   wg->waitCompletion();
   

   /*
    * How can we be sure the test passed? Each task increments A, so after
    * running N tasks A should equal N. If it is less than N, the scheduler
    * lost tasks while it was stopped.
    */
   if ( A.value() != NUM_ITERS ) check = false;

   if ( check ) {
      fprintf(stderr, "%s : %s\n", argv[0], "successful");
      return 0;
   }
   else {
      fprintf(stderr, "%s: %s\n", argv[0], "unsuccessful");
      return -1;
   }
}
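
The task body main__loop_1, its argument type main__loop_1_data_t and the counter A are defined elsewhere in this test. As a rough orientation only, a minimal sketch consistent with the verification comment above (each task increments A exactly once, and A is assumed here to be a nanos Atomic<int>) could look like this; the real definitions may differ:

// Hedged sketch of the pieces this example relies on (not part of the original listing)
Atomic<int> A;                 // shared counter checked at the end of main

void main__loop_1 ( void *args )
{
   A++;                        // each task bumps the counter exactly once
}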
Example #2
int main ( int argc, char **argv )
{
   cout << "PEs = " << sys.getSMPPlugin()->getNumPEs() << endl;
   cout << "Mode = " << sys.getExecutionMode() << endl;
   cout << "Verbose = " << sys.getVerbose() << endl;
   cout << "Args" << endl;
   for ( int i = 0; i < argc; i++ )
      cout << argv[i] << endl;
   cout << "start" << endl;

   hello_world_args *data;
   const char *str;

   // Work arguments
   str = "std::string(1)";
   data = new hello_world_args();
   data->a = 1;
   // Copy strlen()+1 bytes so the terminating '\0' is copied as well
   strncpy(data->b, "char *string(1)", strlen("char *string(1)") + 1);
   data->c = str;

   // Work descriptor creation
   WD * wd1 = new WD( new SMPDD( hello_world ), sizeof(hello_world_args), __alignof__(hello_world_args), data );

   // Work arguments
   str = "std::string(2)";
   data = new hello_world_args();
   data->repeat_n_info.n = 10;
   data->a = 2;
   // Copy strlen()+1 bytes so the terminating '\0' is copied as well
   strncpy(data->b, "char *string(2)", strlen("char *string(2)") + 1);
   data->c = str;

   // loading RepeatN Slicer Plugin
   sys.loadPlugin( "slicer-repeat_n" );
   Slicer *slicer = sys.getSlicer ( "repeat_n" );
 
   // Work descriptor creation
   WD * wd2 = new WorkDescriptor( new SMPDD( hello_world ), sizeof(hello_world_args), __alignof__(hello_world_args),data,0,NULL,NULL );
   wd2->setSlicer(slicer);

   // Work Group affiliation and work submision
   WD *wg = getMyThreadSafe()->getCurrentWD();
   wg->addWork( *wd1 );
   wg->addWork( *wd2 );

   if ( sys.getPMInterface().getInternalDataSize() > 0 ) {
      char *idata = NEW char[sys.getPMInterface().getInternalDataSize()];
      sys.getPMInterface().initInternalData( idata );
      wd1->setInternalData( idata );
   }
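
The listing of this example is cut off here. A minimal, hedged completion, modeled on the submission and synchronization pattern used by the other examples in this collection (the tail of the original test may differ), could be:

   // Assumption: wd2 presumably needs the runtime's internal data as well
   if ( sys.getPMInterface().getInternalDataSize() > 0 ) {
      char *idata = NEW char[sys.getPMInterface().getInternalDataSize()];
      sys.getPMInterface().initInternalData( idata );
      wd2->setInternalData( idata );
   }

   // Work submission
   sys.submit( *wd1 );
   sys.submit( *wd2 );

   // barrier (kind of)
   wg->waitCompletion();

   cout << "end" << endl;

   return 0;
}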
Example #3

int main ( int argc, char **argv )
{
   int i;
   bool check = true;

   main__loop_1_data_t _loop_data;

   // Repeat the test NUM_RUNS times
   for ( int testNumber = 0; testNumber < NUM_RUNS; ++testNumber ) {
      A = 0;
   
      WG *wg = getMyThreadSafe()->getCurrentWD();   
      // Stop scheduler
      sys.stopScheduler();
      // increment variable
      for ( i = 0; i < NUM_ITERS; i++ ) {
         // Work descriptor creation
         WD * wd = new WD( new SMPDD( main__loop_1 ), sizeof( _loop_data ), __alignof__(nanos_loop_info_t), ( void * ) &_loop_data );
         wd->setPriority( 100 );
   
         // Work Group affiliation
         wg->addWork( *wd );
   
         // Work submission
         sys.submit( *wd );
      }
      // Re-enable the scheduler
      sys.startScheduler();
      // barrier (kind of)
      wg->waitCompletion();
      
   
      /*
       * The verification criterion is that A equals the number of tasks run.
       * If A is lower, not all tasks finished successfully.
       */
      if ( A.value() != NUM_ITERS ) check = false;
   }

   if ( check ) {
      fprintf(stderr, "%s : %s\n", argv[0], "successful");
      return 0;
   }
   else {
      fprintf(stderr, "%s: %s\n", argv[0], "unsuccessful");
      return -1;
   }
}
Example #4
int main (int argc, char **argv)
{
       cout << "start" << endl;
       // all threads perform a barrier:
       ThreadTeam &team = *getMyThreadSafe()->getTeam();

       size = team.size();
       counts = new int[team.size()];
       counts[0] = 0;

       for ( unsigned i = 1; i < team.size(); i++ ) {
              counts[i] = 0;
              WD * wd = new WD(new SMPDD(barrier_code));
              wd->tieTo(team[i]);
              sys.submit(*wd);
       }
       usleep(100);   // brief pause to give the submitted WDs a chance to start before joining the barrier ourselves

       WD *wd = getMyThreadSafe()->getCurrentWD();
       wd->tieTo(*getMyThreadSafe());
       barrier_code(NULL);

       cout << "end" << endl;
}
Example #5
inline void AllocatedChunk::removeReference( WD const &wd ) {
   //ensure(_refs > 0, "invalid removeReference, chunk has 0 references!");
   if ( _refs == 0 ) {
      *myThread->_file << " removeReference ON A CHUNK WITH 0 REFS!!!" << std::endl;
   }
   _refs--;
   _refWdId[&wd]--;
   if ( _refWdId[&wd] == 0 ) {
      _refLoc[wd.getId()].clear();
   }
   
   //std::cerr << "del ref to chunk "<< (void*)this << " " << _refs.value() << std::endl;
   //if ( _refs == (unsigned int) -1 ) {
   //   std::cerr << "overflow at references chunk "<< (void*)this << std::endl; sys.printBt();
   //} else if ( _refs == 0 ) {
   //   std::cerr << "zeroed at references chunk "<< (void*)this << std::endl;
   //}
   
}
Example #6

            /*!
            *  \brief Enqueue a work descriptor in the readyQueue of the passed thread
            *  \param thread pointer to the thread to which readyQueue the task must be appended
            *  \param wd a reference to the work descriptor to be enqueued
            *  \sa ThreadData, WD and BaseThread
            */
            virtual void queue ( BaseThread *thread, WD &wd )
            {
                ThreadData &data = ( ThreadData & ) *thread->getTeamData()->getScheduleData();
                if ( !data._init ) {
                   data._cacheId = thread->runningOn()->getMemorySpaceId();
                   data._init = true;
                }
                TeamData &tdata = (TeamData &) *thread->getTeam()->getScheduleData();

                if ( wd.isTied() ) {
                    unsigned int index = wd.isTiedTo()->runningOn()->getMemorySpaceId();
                    tdata._readyQueues[index].push_front ( &wd );
                    return;
                }
                if ( wd.getNumCopies() > 0 ){
                   unsigned int numCaches = sys.getCacheMap().getSize();
                   unsigned int ranks[numCaches];
                   for (unsigned int i = 0; i < numCaches; i++ ) {
                      ranks[i] = 0;
                   }
                   CopyData * copies = wd.getCopies();
                   for ( unsigned int i = 0; i < wd.getNumCopies(); i++ ) {
                      if ( !copies[i].isPrivate() ) {
                         WorkDescriptor* parent = wd.getParent();
                         if ( parent != NULL ) {
                            Directory *dir = parent->getDirectory();
                            if ( dir != NULL ) {
                               DirectoryEntry *de = dir->findEntry(copies[i].getAddress());
                               if ( de != NULL ) {
                                   // Weight each cache by the total size of the copies it already holds
                                   for ( unsigned int j = 0; j < numCaches; j++ ) {
                                      ranks[j] += ( (unsigned int)( de->getAccess( j+1 ) > 0 ) ) * copies[i].getSize();
                                   }
                               }
                            }
                         }
                      }
                   }
                   // Pick the cache that already holds most of the data; queue 0 is the default when none does
                   unsigned int winner = 0;
                   unsigned int maxRank = 0;
                   for ( unsigned int i = 0; i < numCaches; i++ ) {
                      if ( ranks[i] > maxRank ) {
                         winner = i+1;
                         maxRank = ranks[i];
                      }
                   }
                   tdata._readyQueues[winner].push_front( &wd );
                } else {
                   tdata._readyQueues[0].push_front ( &wd );
                }
            }
Example #7
void MemCacheCopy::generateInOps( BaseAddressSpaceInOps &ops, bool input, bool output, WD const &wd, unsigned int copyIdx ) {
   //NANOS_INSTRUMENT( InstrumentState inst4(NANOS_CC_CDIN_OP_GEN); );
   _reg.key->lockObject();
   if ( input && output ) {
      // Re-read the version, in case this is a commutative or concurrent access
      if ( _reg.getVersion() > _version ) {
         *myThread->_file << "[!!!] WARNING: concurrent or commutative detected, wd " << wd.getId()
            << " " << ( wd.getDescription() != NULL ? wd.getDescription() : "[no desc]" )
            << " index " << copyIdx << " _reg.getVersion() " << _reg.getVersion()
            << " _version " << _version << std::endl;
         _version = _reg.getVersion();
      }
   }

   if ( ops.getPE()->getMemorySpaceId() != 0 ) {
      /* CACHE ACCESS */
      if ( input )  {
         if ( _policy == RegionCache::FPGA ) {
            _chunk->copyRegionFromHost( ops, _reg.id, _version, wd, copyIdx );
         } else {
            _chunk->NEWaddReadRegion2( ops, _reg.id, _version, _locations, wd, copyIdx );
         }
      } else if ( output ) {
         _chunk->NEWaddWriteRegion( _reg.id, _version, &wd, copyIdx );
      } else {
         fatal("invalid path");
      }
   } else {
      /* HOST ACCESS */
      if ( input )  {
         ops.copyInputData( *this, wd, copyIdx );
      }
   }
   //NANOS_INSTRUMENT( inst4.close(); );
   _reg.key->unlockObject();
}
Example #8

bool GPUThread::inlineWorkDependent ( WD &wd )
{
   GPUProcessor &myGPU = * ( GPUProcessor * ) myThread->runningOn();

   if ( GPUConfig::isOverlappingInputsDefined() ) {
      // Wait for the input transfer stream to finish
      NANOS_GPU_CREATE_IN_CUDA_RUNTIME_EVENT( NANOS_GPU_CUDA_INPUT_STREAM_SYNC_EVENT );
      cudaStreamSynchronize( myGPU.getGPUProcessorInfo()->getInTransferStream() );
      NANOS_GPU_CLOSE_IN_CUDA_RUNTIME_EVENT;
      // Erase the wait input list and synchronize it with cache
      myGPU.getInTransferList()->clearMemoryTransfers();
      myGPU.freeInputPinnedMemory();
   }

   // Check if someone is waiting for our data
   myGPU.getOutTransferList()->clearRequestedMemoryTransfers();

   // Wait for the WD's inputs; since we have just waited for them above, this step could be skipped
   wd.start( WD::IsNotAUserLevelThread );

   GPUDD &dd = ( GPUDD & ) wd.getActiveDevice();

   NANOS_INSTRUMENT ( InstrumentStateAndBurst inst1( "user-code", wd.getId(), NANOS_RUNNING ) );
   ( dd.getWorkFct() )( wd.getData() );

   if ( !GPUConfig::isOverlappingOutputsDefined() && !GPUConfig::isOverlappingInputsDefined() ) {
      // Wait for the GPU kernel to finish
      NANOS_GPU_CREATE_IN_CUDA_RUNTIME_EVENT( NANOS_GPU_CUDA_DEVICE_SYNC_EVENT );
#ifdef NANOS_GPU_USE_CUDA32
      cudaThreadSynchronize();
#else
      cudaDeviceSynchronize();
#endif
      NANOS_GPU_CLOSE_IN_CUDA_RUNTIME_EVENT;

      // Normally this instrumentation code is inserted by the compiler in the task outline,
      // but since the kernel call is asynchronous for GPUs we have to raise these events
      // manually, once we know the kernel has really finished
      NANOS_INSTRUMENT ( raiseWDClosingEvents() );

      // Copy out results from tasks executed previously
      // Do it always, as another GPU may be waiting for results
      myGPU.getOutTransferList()->executeMemoryTransfers();
   }
   else {
      myGPU.getOutTransferList()->executeMemoryTransfers();
   }

   if ( GPUConfig::isPrefetchingDefined() ) {
      WD * last = &wd;
      while ( canPrefetch() ) {
         // Get next task in order to prefetch data to device memory
         WD *next = Scheduler::prefetch( ( nanos::BaseThread * ) this, *last );
         if ( next != NULL ) {
            next->init();
            addNextWD( next );
            last = next;
         } else {
            break;
         }
      }
   }

   if ( GPUConfig::isOverlappingOutputsDefined() ) {
      NANOS_GPU_CREATE_IN_CUDA_RUNTIME_EVENT( NANOS_GPU_CUDA_OUTPUT_STREAM_SYNC_EVENT );
      cudaStreamSynchronize( myGPU.getGPUProcessorInfo()->getOutTransferStream() );
      NANOS_GPU_CLOSE_IN_CUDA_RUNTIME_EVENT;
      myGPU.freeOutputPinnedMemory();
   }

   if ( GPUConfig::isOverlappingOutputsDefined() || GPUConfig::isOverlappingInputsDefined() ) {
      // Wait for the GPU kernel to finish, if we have not waited before
      //cudaThreadSynchronize();
      NANOS_GPU_CREATE_IN_CUDA_RUNTIME_EVENT( NANOS_GPU_CUDA_KERNEL_STREAM_SYNC_EVENT );
      cudaStreamSynchronize( myGPU.getGPUProcessorInfo()->getKernelExecStream() );
      NANOS_GPU_CLOSE_IN_CUDA_RUNTIME_EVENT;

      // Normally this instrumentation code is inserted by the compiler in the task outline,
      // but since the kernel call is asynchronous for GPUs we have to raise these events
      // manually, once we know the kernel has really finished
      NANOS_INSTRUMENT ( raiseWDClosingEvents() );
   }

   return true;
}
Example #9
inline void AllocatedChunk::addReference( WD const &wd, unsigned int loc ) {
   _refs++;
   _refWdId[&wd]++;
   _refLoc[wd.getId()].insert(loc);
   //std::cerr << "add ref to chunk "<< (void*)this << " " << _refs.value() << std::endl;
}
Example #10

int main ( int argc, char **argv )
{
   int i;
   bool check = true;

   main__loop_1_data_t _loop_data;

   // initialize vector
   for ( i = 0; i < VECTOR_SIZE; i++ ) A[i] = 0;

   // Stop scheduler
   sys.stopScheduler();
   sys.waitUntilThreadsPaused();
   WG *wg = getMyThreadSafe()->getCurrentWD();
   // increment vector
   for ( i = 0; i < NUM_ITERS; i++ ) {
#if USE_NANOS
      // loop info initialization
      _loop_data.loop_info.lower = 0;
      _loop_data.loop_info.upper = VECTOR_SIZE;
      _loop_data.loop_info.step = + 1;

      // Work descriptor creation
      WD * wd = new WD( new SMPDD( main__loop_1 ), sizeof( _loop_data ), __alignof__(nanos_loop_info_t), ( void * ) &_loop_data );
      wd->setPriority( 100 );

      // Work Group affiliation
      wg->addWork( *wd );

      // Work submission
      sys.submit( *wd );

#else
      for ( int j = 0; j < VECTOR_SIZE; j++ ) A[j] += 100;
#endif
   }
   for ( i = 0; i < sys.getNumWorkers(); ++i )
   {
#if USE_NANOS
      // Second task: set to 0
      WD* wd = new WD( new SMPDD( main__loop_2 ), sizeof( _loop_data ), __alignof__(nanos_loop_info_t), ( void * ) &_loop_data );
      // Use a higher priority
      wd->setPriority( 150 );
      wg->addWork( *wd );
      // Work submission
      sys.submit( *wd );

#else
      for ( int j = 0; j < VECTOR_SIZE; j++ ) A[j] = 0;
#endif
   }
   // Re-enable the scheduler
   sys.startScheduler();
   sys.waitUntilThreadsUnpaused();

   
   // barrier (kind of)
   wg->waitCompletion();
   

   /*
    * Verification criteria: the priority scheduler must execute the
    * higher-priority tasks (submitted last) before at least one of the
    * lower-priority tasks. Since the higher-priority tasks set the elements
    * of the A array to 0 while the lower-priority ones increment them, it is
    * enough to check the values of A at the end of the execution: if the
    * elements are still 0, the test failed; otherwise it succeeded.
    */
   for ( i = 0; i < VECTOR_SIZE; i++ ) if ( A[i] == 0 ) check = false;

   if ( check ) {
      fprintf(stderr, "%s : %s\n", argv[0], "successful");
      return 0;
   }
   else {
      fprintf(stderr, "%s: %s\n", argv[0], "unsuccessful");
      return -1;
   }
}
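
As in the earlier tests, main__loop_1 and main__loop_2 are defined elsewhere. Purely as hedged sketches consistent with the sequential fall-back branches above (A[j] += 100 and A[j] = 0 over the whole vector), they could look like this; the real task bodies may differ:

// Hedged sketches of the task bodies this example relies on (not part of the original listing)
void main__loop_1 ( void *args )
{
   nanos_loop_info_t &loop_info = ( ( main__loop_1_data_t * ) args )->loop_info;
   for ( int j = loop_info.lower; j < loop_info.upper; j += loop_info.step )
      A[j] += 100;   // lower-priority task: increment every element
}

void main__loop_2 ( void *args )
{
   for ( int j = 0; j < VECTOR_SIZE; j++ )
      A[j] = 0;      // higher-priority task: reset every element
}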