// jushort overload: writes v through the volatile pointer p, then calls
// fence().
inline void OrderAccess::release_store_fence(volatile jushort* p, jushort v)
{
  *p = v;
  fence();
}
// julong overload: writes v through the volatile pointer p, then calls
// fence().
inline void OrderAccess::release_store_fence(volatile julong* p, julong v)
{
  *p = v;
  fence();
}
// intptr_t overload: stores the pointer-sized integer v into *p, then calls
// fence().
inline void OrderAccess::store_ptr_fence(intptr_t* p, intptr_t v)
{
  *p = v;
  fence();
}
// void* overload: stores the pointer v into the slot *p, then calls fence().
inline void OrderAccess::store_ptr_fence(void** p, void* v)
{
  *p = v;
  fence();
}
// jfloat overload: stores v into *p, then calls fence().
inline void OrderAccess::store_fence(jfloat* p, jfloat v)
{
  *p = v;
  fence();
}
// jdouble overload: stores v into *p, then calls fence().
inline void OrderAccess::store_fence(jdouble* p, jdouble v)
{
  *p = v;
  fence();
}
// juint overload: stores v into *p, then calls fence().
inline void OrderAccess::store_fence(juint* p, juint v)
{
  *p = v;
  fence();
}
void * P1(void * arg) { x = 1; // Instrumentation for CPROVER fence(); __unbuffered_cnt++; }
// jshort overload: stores v into *p, then calls fence().
inline void OrderAccess::store_fence(jshort* p, jshort v)
{
  *p = v;
  fence();
}
// jubyte overload: stores v into *p, then calls fence().
inline void OrderAccess::store_fence(jubyte* p, jubyte v)
{
  *p = v;
  fence();
}
// storeload() is implemented here purely by delegating to fence().
inline void OrderAccess::storeload()
{
  fence();
}
// Treats p as the address of a volatile void* slot, stores v into that slot,
// then calls fence().
inline void OrderAccess::release_store_ptr_fence(volatile void* p, void* v)
{
  void* volatile* slot = static_cast<void* volatile*>(p);
  *slot = v;
  fence();
}
/* The Binomial Spanning Tree algorithm. Outlay: The game scales with log2(NP) and uses 1 byte of memory. */ static int _algorithm_binomial_tree(struct oshmem_group_t *group, int PE_root, void *target, const void *source, size_t nlong, long *pSync) { int rc = OSHMEM_SUCCESS; long value = SHMEM_SYNC_INIT; int root_id = oshmem_proc_group_find_id(group, PE_root); int my_id = oshmem_proc_group_find_id(group, group->my_pe); int peer_id = 0; int peer_pe = 0; int vrank; int dim = opal_cube_dim(group->proc_count); int hibit; int mask; int i = 0; SCOLL_VERBOSE(12, "[#%d] Broadcast algorithm: Tree", group->my_pe); SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld root = #%d", group->my_pe, pSync[0], PE_root); vrank = (my_id + group->proc_count - root_id) % group->proc_count; hibit = opal_hibit(vrank, dim); SCOLL_VERBOSE(15, "[#%d] dim = %d vrank = %d hibit = %d", group->my_pe, dim, vrank, hibit); dim--; pSync[0] = SHMEM_SYNC_READY; /* Receive data from parent in the tree. */ if (vrank > 0) { value = SHMEM_SYNC_READY; SCOLL_VERBOSE(14, "[#%d] wait", group->my_pe); rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_NE, (void*)&value, SHMEM_LONG)); while ((value = pSync[0]) < 0) { SCOLL_VERBOSE(14, "[#%d] Broadcast size is a negative value (%li)\n", group->my_pe, pSync[0]); MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_NE, (void*)&value, SHMEM_LONG)); } if (OSHMEM_SUCCESS != rc) { return rc; } nlong = (size_t) pSync[0]; } /* Send data to the children. 
*/ for (i = hibit + 1, mask = 1 << i; i <= dim; ++i, mask <<= 1) { peer_id = vrank | mask; if (peer_id < group->proc_count) { /* Wait for the child to be ready to receive (pSync must have the initial value) */ peer_id = (peer_id + root_id) % group->proc_count; peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); SCOLL_VERBOSE(14, "[#%d] check remote pe is ready to receive #%d", group->my_pe, peer_pe); do { rc = MCA_SPML_CALL(get((void*)pSync, sizeof(long), (void*)pSync, peer_pe)); } while ((OSHMEM_SUCCESS == rc) && (pSync[0] != SHMEM_SYNC_READY)); SCOLL_VERBOSE(14, "[#%d] send data to #%d", group->my_pe, peer_pe); rc = MCA_SPML_CALL(put(target, nlong, (my_id == root_id ? (void *)source : target), peer_pe)); MCA_SPML_CALL(fence()); SCOLL_VERBOSE(14, "[#%d] signals to #%d", group->my_pe, peer_pe); value = nlong; rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); if (OSHMEM_SUCCESS != rc) { break; } } } return rc; }
// jfloat overload: writes v through the volatile pointer p, then calls
// fence().
inline void OrderAccess::release_store_fence(volatile jfloat* p, jfloat v)
{
  *p = v;
  fence();
}
// julong overload: stores v into *p, then calls fence().
inline void OrderAccess::store_fence(julong* p, julong v)
{
  *p = v;
  fence();
}
// jdouble overload: writes v through the volatile pointer p, then calls
// fence().
inline void OrderAccess::release_store_fence(volatile jdouble* p, jdouble v)
{
  *p = v;
  fence();
}
void ThreadsExec::print_configuration( std::ostream & s , const bool detail ) { verify_is_process("ThreadsExec::print_configuration",false); fence(); const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); // Forestall compiler warnings for unused variables. (void) numa_count; (void) cores_per_numa; (void) threads_per_core; s << "Kokkos::Threads" ; #if defined( KOKKOS_HAVE_PTHREAD ) s << " KOKKOS_HAVE_PTHREAD" ; #endif #if defined( KOKKOS_HAVE_HWLOC ) s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]" ; #endif if ( s_thread_pool_size[0] ) { s << " threads[" << s_thread_pool_size[0] << "]" << " threads_per_numa[" << s_thread_pool_size[1] << "]" << " threads_per_core[" << s_thread_pool_size[2] << "]" ; if ( 0 == s_threads_process.m_pool_base ) { s << " Asynchronous" ; } s << " ReduceScratch[" << s_current_reduce_size << "]" << " SharedScratch[" << s_current_shared_size << "]" ; s << std::endl ; if ( detail ) { for ( int i = 0 ; i < s_thread_pool_size[0] ; ++i ) { ThreadsExec * const th = s_threads_exec[i] ; if ( th ) { const int rank_rev = th->m_pool_size - ( th->m_pool_rank + 1 ); s << " Thread[ " << th->m_pool_rank << " : " << th->m_numa_rank << "." << th->m_numa_core_rank << " ]" ; s << " Fan{" ; for ( int j = 0 ; j < th->m_pool_fan_size ; ++j ) { ThreadsExec * const thfan = th->m_pool_base[rank_rev+(1<<j)] ; s << " [ " << thfan->m_pool_rank << " : " << thfan->m_numa_rank << "." << thfan->m_numa_core_rank << " ]" ; } s << " }" ; if ( th == & s_threads_process ) { s << " is_process" ; } } s << std::endl ; } } } else { s << " not initialized" << std::endl ; } }