// Entry point handed to qthreads for running a task.
//
//   arg : the Task to execute, passed as an opaque pointer.
//
// Moves the task's state from WAITING to EXECUTING, calls the task's team
// or single-thread apply function, and then closes the task out.
// Always returns 0.
aligned_t Task::qthread_func( void * arg )
{
  Task * const task = reinterpret_cast< Task * >(arg);

  // First member of the team change state to executing.
  // Use compare-exchange to avoid race condition with a respawn.
  // The exchange only takes effect while the state is still WAITING.
  Kokkos::atomic_compare_exchange_strong( & task->m_state
                                        , int(Kokkos::Experimental::TASK_STATE_WAITING)
                                        , int(Kokkos::Experimental::TASK_STATE_EXECUTING)
                                        );

  // Case 1: a team function is set and no single-thread function is set.
  if ( task->m_apply_team && ! task->m_apply_single ) {
    Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ;

    // Initialize team size and rank with shepherd info
    Kokkos::Impl::QthreadTeamPolicyMember member( task_team_tag );

    (*task->m_apply_team)( task , member );

#if 0
fprintf( stdout
       , "worker(%d.%d) task 0x%.12lx executed by member(%d:%d)\n"
       , qthread_shep()
       , qthread_worker_local(NULL)
       , reinterpret_cast<unsigned long>(task)
       , member.team_rank()
       , member.team_size()
       );
fflush(stdout);
#endif

    // Only team rank 0 closes the task out.  The closeout is bracketed by
    // team barriers: one before it and one after it.
    member.team_barrier();
    if ( member.team_rank() == 0 ) task->closeout();
    member.team_barrier();
  }
  // Case 2: a team function is set and m_apply_single holds the sentinel
  // pointer value 1 instead of a real function pointer.
  else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_single_type>(1) ) {
    // Team hard-wired to one, no cloning
    Kokkos::Impl::QthreadTeamPolicyMember member ;
    (*task->m_apply_team)( task , member );
    task->closeout();
  }
  // Case 3: everything else runs through the single-thread function.
  else {
    (*task->m_apply_single)( task );
    task->closeout();
  }

#if 0
fprintf( stdout
       , "worker(%d.%d) task 0x%.12lx return\n"
       , qthread_shep()
       , qthread_worker_local(NULL)
       , reinterpret_cast<unsigned long>(task)
       );
fflush(stdout);
#endif

  return 0 ;
}
Esempio n. 2
0
/*
 * Fork a qthread that runs chapel_wrapper on the function registered in
 * chpl_ftable under 'fid', handing it 'arg', the source location
 * ('filename', 'lineno') and a copy of the parent's task-private data.
 *
 *  - If the parent is in serial state, the task is forked onto the caller's
 *    shepherd and this function blocks until the task's return syncvar
 *    is full.
 *  - If subLoc is c_sublocid_any, the task is forked without naming a
 *    shepherd.
 *  - Otherwise the task is forked onto shepherd 'subLoc', where
 *    c_sublocid_curr stands for the caller's shepherd.
 *
 * task_list, task_list_locale and is_begin_stmt are not used here.
 */
void chpl_task_addToTaskList(chpl_fn_int_t     fid,
                             void             *arg,
                             c_sublocid_t      subLoc,
                             chpl_task_list_p *task_list,
                             int32_t           task_list_locale,
                             chpl_bool         is_begin_stmt,
                             int               lineno,
                             chpl_string       filename)
{
    const qthread_shepherd_id_t my_shep = qthread_shep();
    chpl_task_private_data_t *parent_data = chpl_task_getPrivateData();
    const chpl_bool run_serially = parent_data->serial_state;
    chapel_wrapper_args_t fork_args =
        {chpl_ftable[fid], arg, filename, lineno, *parent_data};

    PROFILE_INCR(profile_task_addToTaskList,1);

    if (run_serially) {
        /* Serial execution: run on this shepherd and wait for completion. */
        syncvar_t done = SYNCVAR_STATIC_EMPTY_INITIALIZER;
        qthread_fork_syncvar_copyargs_to(chapel_wrapper, &fork_args,
                                         sizeof(fork_args), &done,
                                         my_shep);
        qthread_syncvar_readFF(NULL, &done);
        return;
    }

    if (subLoc == c_sublocid_any) {
        /* No placement requested. */
        qthread_fork_syncvar_copyargs(chapel_wrapper, &fork_args,
                                      sizeof(fork_args), NULL);
        return;
    }

    {
        /* Explicit placement; "current" resolves to the caller's shepherd. */
        const c_sublocid_t target =
            (subLoc == c_sublocid_curr) ? (c_sublocid_t) my_shep : subLoc;
        qthread_fork_syncvar_copyargs_to(chapel_wrapper, &fork_args,
                                         sizeof(fork_args), NULL,
                                         (qthread_shepherd_id_t) target);
    }
}
Esempio n. 3
0
// Run 'func' with argument 'arg' through driver_exec_all.
//
// 'func' and 'arg' are stored in s_active_function / s_active_function_arg
// while the call is in progress and reset to 0 before returning.
// driver_exec_all is forked s_number_workers_per_shepherd times onto every
// shepherd, except that one fewer instance is forked onto the shepherd the
// caller is running on; the caller then invokes driver_exec_all itself.
// The unnamed Qthread & parameter is not used.
void QthreadExec::exec_all( Qthread & , QthreadExecFunctionPointer func , const void * arg )
{
  verify_is_process("QthreadExec::exec_all(...)",true);

  s_active_function     = func ;
  s_active_function_arg = arg ;

  // Need to query which shepherd this main 'process' is running...

  const int main_shep = qthread_shep();

  for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
    // The caller runs one instance itself, so skip one fork on its shepherd.
    const int first_worker = ( jshep != main_shep ) ? 0 : 1 ;

    for ( int i = first_worker ; i < s_number_workers_per_shepherd ; ++i ) {

      // Unit tests hang with this call:
      //
      // qthread_fork_to_local_priority( driver_exec_all , NULL , NULL , jshep );
      //

      qthread_fork_to( driver_exec_all , NULL , NULL , jshep );
    }
  }

  // The calling thread's own share of the work.
  driver_exec_all( NULL );

  s_active_function     = 0 ;
  s_active_function_arg = 0 ;
}
Esempio n. 4
0
/*
 * Fork a qthread that runs chapel_wrapper on the function registered in
 * chpl_ftable under 'fid', handing it 'arg', the source location
 * ('filename', 'lineno'), and the requested sublocale together with the
 * caller's serial state.
 *
 *  - If the caller is in serial state, the task is forked onto the caller's
 *    shepherd and this function blocks until the task's return syncvar
 *    is full.
 *  - If subloc is c_sublocid_any, the task is forked without naming a
 *    shepherd.
 *  - Otherwise the task is forked onto shepherd 'subloc'.
 *
 * subloc must not be c_sublocid_none (asserted).
 * task_list, task_list_locale and is_begin_stmt are not used here.
 */
void chpl_task_addToTaskList(chpl_fn_int_t     fid,
                             void             *arg,
                             c_sublocid_t      subloc,
                             chpl_task_list_p *task_list,
                             int32_t           task_list_locale,
                             chpl_bool         is_begin_stmt,
                             int               lineno,
                             chpl_string       filename)
{
    const qthread_shepherd_id_t my_shep = qthread_shep();
    const chpl_bool run_serially = chpl_task_getSerial();
    chpl_qthread_wrapper_args_t fork_args =
        {chpl_ftable[fid], arg, filename, lineno, false,
         {subloc, run_serially}};

    assert(subloc != c_sublocid_none);

    PROFILE_INCR(profile_task_addToTaskList,1);

    if (run_serially) {
        /* Serial execution: run on this shepherd and wait for completion. */
        syncvar_t done = SYNCVAR_STATIC_EMPTY_INITIALIZER;
        qthread_fork_syncvar_copyargs_to(chapel_wrapper, &fork_args,
                                         sizeof(fork_args), &done,
                                         my_shep);
        qthread_syncvar_readFF(NULL, &done);
        return;
    }

    if (subloc != c_sublocid_any) {
        /* Explicit placement on the requested shepherd. */
        qthread_fork_copyargs_to(chapel_wrapper, &fork_args,
                                 sizeof(fork_args), NULL,
                                 (qthread_shepherd_id_t) subloc);
        return;
    }

    /* No placement requested. */
    qthread_fork_copyargs(chapel_wrapper, &fork_args,
                          sizeof(fork_args), NULL);
}
// Test that writeFF waits for empty var to be filled, writes, and leaves full.
// Requires that only one worker is running. Basically does:
//     1: empty var
//     1: fork(writeFF)
//     1: yields
//     2: starts running
//     2: hits writeFF, and yields since var is empty
//     1: writeEF
//     1: hits readFF on forked task and yields
//     2: running again, finishes writeFF, task returns
//     1: readFF completes, finishes
static void testWriteFFWaits(void)
{
    aligned_t ret;
    // Give the variable a known value, then mark its FEB empty.
    concurrent_t=45;
    qthread_empty(&concurrent_t);
    assert(qthread_num_workers() == 1);

    iprintf("1: Forking writeFF wrapper\n");
    // Fork onto the caller's own shepherd; 'ret' receives the task's return.
    qthread_fork_to(writeFF_wrapper, NULL, &ret, qthread_shep());
    iprintf("1: Forked, now yielding to 2\n");
    qthread_yield();
    iprintf("1: Back from yield\n");

    // verify that writeFF has not completed
    assert(qthread_feb_status(&concurrent_t) == 0);
    assert(concurrent_t != 55);

    iprintf("1: Writing EF\n");
    qthread_writeEF_const(&concurrent_t, 35);

    // wait for writeFF wrapper to complete
    qthread_readFF(NULL, &ret);

    // verify that writeFF completed and that FEB is full
    iprintf("1: concurrent_t=%d\n", concurrent_t);
    assert(qthread_feb_status(&concurrent_t) == 1);
    assert(concurrent_t == 55);
}
Esempio n. 6
0
/*
 * Give up the processor.  A caller running on a shepherd yields through
 * qthreads; a caller with no shepherd yields through sched_yield().
 */
void chpl_task_yield(void)
{
    PROFILE_INCR(profile_task_yield,1);

    if (qthread_shep() != NO_SHEPHERD) {
        qthread_yield();
        return;
    }

    sched_yield();
}
Esempio n. 7
0
static aligned_t checkres(void *arg)
{
    qthread_shepherd_id_t myshep = qthread_shep();

    assert(myshep == 1 || myshep == 0);

    iprintf("myshep = %u\n", (unsigned)myshep);
    iprintf("arg = %u\n", (unsigned)(uintptr_t)arg);
    assert(myshep == (qthread_shepherd_id_t)(intptr_t)arg);

    return 0;
}
Esempio n. 8
0
/*
 * Return the number identifying the execution resource the caller runs on,
 * as reported by the threading backend selected at compile time:
 *   ARGOBOTS       -> ABT_xstream_self_rank()
 *   MASSIVETHREADS -> myth_get_worker_num()
 *   QTHREADS       -> qthread_shep()
 * The first macro defined in that order wins.  When none is defined the
 * function returns 0 instead of running off the end without a return value.
 */
int accalt_get_thread_num() {
#if defined(ARGOBOTS)
    /* Start from 0 so the result is defined even if the query does not
     * write to 'rank'. */
    int rank = 0;
    ABT_xstream_self_rank(&rank);
    return rank;
#elif defined(MASSIVETHREADS)
    return myth_get_worker_num();
#elif defined(QTHREADS)
    return qthread_shep();
#else
    /* No backend selected: report a single thread numbered 0. */
    return 0;
#endif
}
// Finish one execution of this task.
//
// If the task's state equals RESPAWN (WAITING | EXECUTING) the state is set
// back to WAITING and the task is scheduled again.  Otherwise the task is
// marked COMPLETE, its dependences are released, and its qthread FEB is
// filled.  In both cases the active task count is decremented last.
void Task::closeout()
{
  // State value tested below to detect a respawn request: the bitwise OR
  // of the WAITING and EXECUTING state values.
  enum { RESPAWN = int( Kokkos::Experimental::TASK_STATE_WAITING ) |
                   int( Kokkos::Experimental::TASK_STATE_EXECUTING ) };

#if 0
fprintf( stdout
       , "worker(%d.%d) task 0x%.12lx %s\n"
       , qthread_shep()
       , qthread_worker_local(NULL)
       , reinterpret_cast<unsigned long>(this)
       , ( m_state == RESPAWN ? "respawn" : "complete" )
       );
fflush(stdout);
#endif

  // When dependent tasks run there would be a race
  // condition between destroying this task and
  // querying the active count pointer from this task.
  // Copy the pointer now so no member is read after that point.
  int volatile * const active_count = m_active_count ;

  if ( m_state == RESPAWN ) {
    // Task requests respawn, set state to waiting and reschedule the task
    m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
    schedule();
  }
  else {

    // Task did not respawn, is complete
    m_state = Kokkos::Experimental::TASK_STATE_COMPLETE ;

    // Release dependences before allowing dependent tasks to run.
    // Otherwise there is a thread race condition for removing dependences.
    for ( int i = 0 ; i < m_dep_size ; ++i ) {
      assign( & m_dep[i] , 0 );
    }

    // Set qthread FEB to full so that dependent tasks are allowed to execute.
    // This 'task' may be deleted immediately following this function call.
    qthread_fill( & m_qfeb );

    // The dependent task could now complete and destroy 'this' task
    // before the call to 'qthread_fill' returns.  Therefore, for
    // thread safety assume that 'this' task has now been destroyed.
  }

  // Decrement active task count before returning.
  // Uses the pointer copied above, not a member of 'this'.
  Kokkos::atomic_decrement( active_count );
}
// Run 'func' with argument 'arg' through driver_exec_all.
//
// 'func' and 'arg' are stored in s_active_function / s_active_function_arg
// while the call is in progress and reset to 0 before returning.
// driver_exec_all is forked s_number_workers_per_shepherd times onto every
// shepherd, except that one fewer instance is forked onto the shepherd the
// caller is running on; the caller then invokes driver_exec_all itself.
// The unnamed Qthread & parameter is not used.
void QthreadExec::exec_all( Qthread & , QthreadExecFunctionPointer func , const void * arg )
{
  verify_is_process("QthreadExec::exec_all(...)",true);

/*
  fprintf( stdout , "QthreadExec::exec_all\n");
  fflush(stdout);
*/

  s_active_function     = func ;
  s_active_function_arg = arg ;

  // Need to query which shepherd this main 'process' is running...

  const int main_shep = qthread_shep();

#if 1
  for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
    // The caller runs one instance itself, so skip one fork on its shepherd.
    const int first_worker = ( jshep != main_shep ) ? 0 : 1 ;

    for ( int i = first_worker ; i < s_number_workers_per_shepherd ; ++i ) {
      qthread_fork_to( driver_exec_all , NULL , NULL , jshep );
    }
  }
#else
  // If this function is used before the 'qthread.task_policy' unit test
  // the 'qthread.task_policy' unit test fails with a seg-fault within libqthread.so.
  for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
    const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ;

    if ( num_clone ) {
      const int ret = qthread_fork_clones_to_local_priority
        ( driver_exec_all   /* function */
        , NULL              /* function data block */
        , NULL              /* pointer to return value feb */
        , jshep             /* shepherd number */
        , num_clone - 1     /* number of instances - 1 */
        );

      assert(ret == QTHREAD_SUCCESS);
    }
  }
#endif

  // The calling thread's own share of the work.
  driver_exec_all( NULL );

  s_active_function     = 0 ;
  s_active_function_arg = 0 ;
}
// Construct the executor record for the worker this constructor runs on.
// Ranks come from the qthreads runtime (current shepherd and the worker's
// index within that shepherd); sizes and buffer positions come from the
// class-wide static configuration.
QthreadExec::QthreadExec()
{
  const int shep         = qthread_shep();
  const int local_worker = qthread_worker_local(NULL);

  // Shepherd position counted from the end of the shepherd range.
  const int shep_from_end = s_number_shepherds - ( shep + 1 );

  // Identity of this worker
  m_shepherd_rank        = shep ;
  m_shepherd_worker_rank = local_worker ;
  m_worker_rank          = shep * s_number_workers_per_shepherd + local_worker ;

  // Extents
  m_shepherd_size        = s_number_shepherds ;
  m_shepherd_worker_size = s_number_workers_per_shepherd ;
  m_worker_size          = s_number_workers ;

  // Positions within the shared executor table
  m_worker_base          = s_exec ;
  m_shepherd_base        = s_exec + s_number_workers_per_shepherd * shep_from_end ;

  // Scratch memory begins s_base_size bytes past the start of this object
  m_scratch_alloc        = reinterpret_cast< unsigned char * >( this ) + s_base_size ;
  m_reduce_end           = s_worker_reduce_end ;

  m_worker_state         = QthreadExec::Active ;
}
Esempio n. 12
0
/*
 * Shut the tasking layer down.
 *
 * Prints the profile when CHAPEL_PROFILE is defined.  Unless
 * QTHREAD_MULTINODE is defined, it then signals the exit syncvars:
 *  - called from a shepherd: fill exit_ret;
 *  - called from outside any shepherd: if initialization has finished,
 *    fill canexit and spin until done_finalizing becomes non-zero.
 */
void chpl_task_exit(void)
{
#ifdef CHAPEL_PROFILE
    profile_print();
#endif /* CHAPEL_PROFILE */

#ifndef QTHREAD_MULTINODE
    if (qthread_shep() != NO_SHEPHERD) {
        qthread_syncvar_fill(&exit_ret);
        return;
    }

    /* sometimes, tasking is told to shutdown even though it hasn't been
     * told to start yet */
    if (chpl_qthread_done_initializing != 1) {
        return;
    }

    qthread_syncvar_fill(&canexit);
    while (done_finalizing == 0) {
        SPINLOCK_BODY();
    }
#endif /* !QTHREAD_MULTINODE */
}
Esempio n. 13
0
/*
 * Move the calling task to sublocale 'subLoc'.
 *
 * Nothing happens when the caller asked for "any" or "current", when the
 * caller is not running on a shepherd, or when it is already on the
 * requested shepherd.
 *
 * Note: It's likely that this won't work in all cases where we need
 *       it.  In particular, we envision needing to move execution
 *       from sublocale to sublocale while initializing the memory
 *       layer, in order to get the NUMA domain affinity right for
 *       the subparts of the heap.  But this will be happening well
 *       before tasking init and in any case would be done from the
 *       main thread of execution, which doesn't have a shepherd.
 *       The code below wouldn't work in that situation.
 */
void chpl_task_setSubLoc(c_sublocid_t subLoc)
{
    qthread_shepherd_id_t here;

    /* No particular sublocale requested. */
    if (subLoc == c_sublocid_any || subLoc == c_sublocid_curr) {
        return;
    }

    /* Only a (movable) task running on a shepherd can migrate. */
    here = qthread_shep();
    if (here == NO_SHEPHERD) {
        return;
    }

    /* Already where we were asked to be. */
    if ((qthread_shepherd_id_t) subLoc == here) {
        return;
    }

    qthread_migrate_to((qthread_shepherd_id_t) subLoc);
}
Esempio n. 14
0
/*
 * Task body that migrates itself to the other of shepherds 0 and 1 and
 * checks that it arrived.  Starting on shepherd 1 it moves to 0 and asserts
 * the result; otherwise it moves to 1 and aborts with a message on stderr
 * if it is not there afterwards.  Returns 0.
 */
static aligned_t migrant(void *arg)
{
    const int myshep = qthread_shep();

    assert(myshep == 1 || myshep == 0);

    iprintf("migrant running on shep %i\n", myshep);

    if (myshep != 1) {
        qthread_migrate_to(1);
        if (qthread_shep() != 1) {
            fprintf(stderr, "Expected to be on shepherd 1, actually on shepherd %i\n", qthread_shep());
            assert(qthread_shep() == 1);
            abort();
        }
        return 0;
    }

    qthread_migrate_to(0);
    iprintf("migrant now running on shep %i\n", (int)qthread_shep());
    assert(qthread_shep() == 0);

    return 0;
}
void QthreadExec::resize_worker_scratch( const int reduce_size , const int shared_size )
{
  const int exec_all_reduce_alloc = align_alloc( reduce_size );
  const int shepherd_scan_alloc   = align_alloc( 8 );
  const int shepherd_shared_end   = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size );

  if ( s_worker_reduce_end < exec_all_reduce_alloc ||
       s_worker_shared_end < shepherd_shared_end ) {

/*
  fprintf( stdout , "QthreadExec::resize\n");
  fflush(stdout);
*/

    // Clear current worker memory before allocating new worker memory
    clear_workers();

    // Increase the buffers to an aligned allocation
    s_worker_reduce_end   = exec_all_reduce_alloc ;
    s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc ;
    s_worker_shared_end   = shepherd_shared_end ;

    // Need to query which shepherd this main 'process' is running...
 
    const int main_shep = qthread_shep();

    // Have each worker resize its memory for proper first-touch
#if 1
    for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
    for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i ) {
      qthread_fork_to( driver_resize_worker_scratch , NULL , NULL , jshep );
    }}
#else
    // If this function is used before the 'qthread.task_policy' unit test
    // the 'qthread.task_policy' unit test fails with a seg-fault within libqthread.so.
    for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
      const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ;

      if ( num_clone ) {
        const int ret = qthread_fork_clones_to_local_priority
          ( driver_resize_worker_scratch   /* function */
          , NULL                           /* function data block */
          , NULL                           /* pointer to return value feb */
          , jshep                          /* shepherd number */
          , num_clone - 1                  /* number of instances - 1 */
          );

        assert(ret == QTHREAD_SUCCESS);
      }
    }
#endif

    driver_resize_worker_scratch( NULL );

    // Verify all workers allocated

    bool ok = true ;
    for ( int iwork = 0 ; ok && iwork < s_number_workers ; ++iwork ) { ok = 0 != s_exec[iwork] ; }

    if ( ! ok ) {
      std::ostringstream msg ;
      msg << "Kokkos::Impl::QthreadExec::resize : FAILED for workers {" ;
      for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
         if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); }
      }
      msg << " }" ;
      Kokkos::Impl::throw_runtime_exception( msg.str() );
    }
  }
}
Esempio n. 16
0
/* Return the id of the qthreads shepherd the caller is running on, as an int. */
GLT_func_prefix int glt_get_thread_num() {
    return qthread_shep();
}
Esempio n. 17
0
// Hand this task to qthreads for execution.
//
// Increments the active task count, builds the precondition array from the
// m_qfeb flags of the task's m_dep_size dependences, and spawns
// Task::qthread_func with 'this' as its argument.  A team task (team
// function set, no single-thread function) is spawned as a cloneable
// qthread on the shepherd after the current one; any other task is spawned
// with NO_SHEPHERD as the target.
void Task::schedule()
{
  // Is waiting for execution

  // Increment active task count before spawning.
  Kokkos::atomic_increment( m_active_count );

  // spawn in qthread.  must malloc the precondition array and give to qthread.
  // qthread will eventually free this allocation so memory will not be leaked.

  // concern with thread safety of malloc, does this need to be guarded?
  // NOTE(review): the malloc result is dereferenced without a null check.
  aligned_t ** qprecon = (aligned_t **) malloc( ( m_dep_size + 1 ) * sizeof(aligned_t *) );

  // Slot 0 carries the number of dependences, stored as a pointer-sized value;
  // slots 1..m_dep_size carry the addresses of the dependences' flags.
  qprecon[0] = reinterpret_cast<aligned_t *>( uintptr_t(m_dep_size) );

  for ( int i = 0 ; i < m_dep_size ; ++i ) {
    qprecon[i+1] = & m_dep[i]->m_qfeb ; // Qthread precondition flag
  }

  if ( m_apply_team && ! m_apply_single ) {
    // If more than one shepherd spawn on a shepherd other than this shepherd
    const int num_shepherd            = qthread_num_shepherds();
    const int num_worker_per_shepherd = qthread_num_workers_local(NO_SHEPHERD);
    const int this_shepherd           = qthread_shep();

    // Next shepherd in round-robin order; wraps back to this shepherd when
    // there is only one.
    int spawn_shepherd = ( this_shepherd + 1 ) % num_shepherd ;

#if 0
fprintf( stdout
       , "worker(%d.%d) task 0x%.12lx spawning on shepherd(%d) clone(%d)\n"
       , qthread_shep()
       , qthread_worker_local(NULL)
       , reinterpret_cast<unsigned long>(this)
       , spawn_shepherd
       , num_worker_per_shepherd - 1
       );
fflush(stdout);
#endif

    // The final argument is the clone count: workers per shepherd minus one.
    qthread_spawn_cloneable
      ( & Task::qthread_func
      , this
      , 0
      , NULL
      , m_dep_size , qprecon /* dependences */
      , spawn_shepherd
      , unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY )
      , num_worker_per_shepherd - 1
      );
  }
  else {
    qthread_spawn( & Task::qthread_func /* function */
                 , this                 /* function argument */
                 , 0
                 , NULL
                 , m_dep_size , qprecon /* dependences */
                 , NO_SHEPHERD
                 , QTHREAD_SPAWN_SIMPLE /* allows optimization for non-blocking task */
                 );
  }
}
Esempio n. 18
0
c_sublocid_t chpl_task_getSubLoc(void)
{
    return (c_sublocid_t) qthread_shep();
}