void Task::closeout() { enum { RESPAWN = int( Kokkos::Experimental::TASK_STATE_WAITING ) | int( Kokkos::Experimental::TASK_STATE_EXECUTING ) }; #if 0 fprintf( stdout , "worker(%d.%d) task 0x%.12lx %s\n" , qthread_shep() , qthread_worker_local(NULL) , reinterpret_cast<unsigned long>(this) , ( m_state == RESPAWN ? "respawn" : "complete" ) ); fflush(stdout); #endif // When dependent tasks run there would be a race // condition between destroying this task and // querying the active count pointer from this task. int volatile * const active_count = m_active_count ; if ( m_state == RESPAWN ) { // Task requests respawn, set state to waiting and reschedule the task m_state = Kokkos::Experimental::TASK_STATE_WAITING ; schedule(); } else { // Task did not respawn, is complete m_state = Kokkos::Experimental::TASK_STATE_COMPLETE ; // Release dependences before allowing dependent tasks to run. // Otherwise there is a thread race condition for removing dependences. for ( int i = 0 ; i < m_dep_size ; ++i ) { assign( & m_dep[i] , 0 ); } // Set qthread FEB to full so that dependent tasks are allowed to execute. // This 'task' may be deleted immediately following this function call. qthread_fill( & m_qfeb ); // The dependent task could now complete and destroy 'this' task // before the call to 'qthread_fill' returns. Therefore, for // thread safety assume that 'this' task has now been destroyed. } // Decrement active task count before returning. Kokkos::atomic_decrement( active_count ); }
// Notes: // - Each task receives distinct copy of parent // - Copy of child is shallow, be careful with `state` member static aligned_t visit(void *args_) { node_t *parent = (node_t *)args_; int parent_height = parent->height; int num_children = parent->num_children; aligned_t expect = parent->expect; aligned_t num_descendants[num_children]; aligned_t sum_descendants = 1; if (num_children != 0) { node_t child __attribute__((aligned(8))); aligned_t donec = 0; // Spawn children, if any child.height = parent_height + 1; child.dc = &donec; child.expect = num_children; qthread_empty(&donec); for (int i = 0; i < num_children; i++) { child.acc = &num_descendants[i]; for (int j = 0; j < num_samples; j++) { rng_spawn(parent->state.state, child.state.state, i); } child.num_children = calc_num_children(&child); qthread_fork_syncvar_copyargs(visit, &child, sizeof(node_t), NULL); } // Wait for children to finish up, accumulate descendants counts if (donec != expect) qthread_readFF(NULL, &donec); for (int i = 0; i < num_children; i++) { sum_descendants += num_descendants[i]; } } *parent->acc = sum_descendants; if (qthread_incr(parent->dc, 1) + 1 == expect) { qthread_fill(parent->dc); } return 0; }
/* * The main procedure simply creates a producer and a consumer task to run in * parallel */ int main(int argc, char *argv[]) { aligned_t t[2]; assert(qthread_initialize() == 0); CHECK_VERBOSE(); NUMARG(bufferSize, "BUFFERSIZE"); numItems = 8 * bufferSize; NUMARG(numItems, "NUMITEMS"); iprintf("%i threads...\n", qthread_num_shepherds()); buff = malloc(sizeof(aligned_t) * bufferSize); for (unsigned int i = 0; i < bufferSize; ++i) { buff[i] = 0; } qthread_fork(consumer, NULL, &t[0]); qthread_fork(producer, NULL, &t[1]); qthread_readFF(NULL, &t[0]); qthread_readFF(NULL, &t[1]); /* cleanup... unnecessary in general, but for the moment I'm tracking down * errors in the FEB system, so let's clean up */ for (unsigned int i = 0; i < bufferSize; ++i) { qthread_fill(buff + i); } free(buff); iprintf("Success!\n"); return 0; }
inline int qthread_fill(const T *const dest) { QTHREAD_CHECKSIZE(T); return qthread_fill((aligned_t *)dest); }