KOKKOS_INLINE_FUNCTION
double shfl_up(const double &val, const int& delta, const int& width ) {
    int lo = __double2loint(val);
    int hi = __double2hiint(val);
    lo = __shfl_up(lo,delta,width);
    hi = __shfl_up(hi,delta,width);
    return __hiloint2double(hi,lo);
}
KOKKOS_INLINE_FUNCTION
Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
    int lo = __double2loint(*reinterpret_cast<const double*>(&val));
    int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
    lo = __shfl_up(lo,delta,width);
    hi = __shfl_up(hi,delta,width);
    const double tmp = __hiloint2double(hi,lo);
    return *(reinterpret_cast<const Scalar*>(&tmp));
}
__global__ void sequence_hamming_weight_kernel( device_sequence_space< IntType > * seqs, basic_data_space< unsigned int > * res, clotho::utility::algo_version< 5 > * v ) {
    assert( blockDim.x == 32 );
    assert( blockDim.y <= 32 );

    __shared__ unsigned int buffer[ 32 ];

    if( threadIdx.y == 0 ) {
        buffer[ threadIdx.x ] = 0;
    }
    __syncthreads();

    const unsigned int WIDTH = seqs->seq_width;
    IntType * seq_ptr = seqs->sequences;

    unsigned int N = 0;

    unsigned int seq_idx = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int seq_begin = seq_idx * WIDTH + threadIdx.y;
    unsigned int seq_end = (seq_idx + 1) * WIDTH;

    while( seq_begin < seq_end ) {
        IntType x = seq_ptr[ seq_begin ];
        N += (( x >> threadIdx.x) & 1);
        seq_begin += blockDim.y;
    }
    __syncthreads();

    for( unsigned int i = 1; i < 32; i <<= 1 ) {
        unsigned int t = __shfl_up( N, i );
        N += ((unsigned int) (threadIdx.x >= i) * t);
    }

    if( threadIdx.x == 31 ) {
        buffer[ threadIdx.y ] = N;
    }
    __syncthreads();

    N = buffer[ threadIdx.x ];
    __syncthreads();

    for( unsigned int i = 1; i < 32; i <<= 1 ) {
        unsigned int t = __shfl_up( N, i );
        N += ((unsigned int) (threadIdx.x >= i) * t);
    }

    if( threadIdx.y == 0 && threadIdx.x == 31 ) {
        res->data[ seq_idx ] = N;
    }
    __syncthreads();
}
__device__ __forceinline__ double shfl_up(double val, unsigned int delta, int width = warpSize) {
#if __CUDA_ARCH__ >= 300
    int lo = __double2loint(val);
    int hi = __double2hiint(val);
    lo = __shfl_up(lo, delta, width);
    hi = __shfl_up(hi, delta, width);
    return __hiloint2double(hi, lo);
#else
    return 0.0;
#endif
}
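// Note: the unmasked __shfl_up used above is deprecated as of CUDA 9. A minimal sketch of an
// equivalent wrapper built on __shfl_up_sync (an assumption, not part of the snippet above;
// it presumes all 32 lanes of the warp participate) could look like this:
__device__ __forceinline__ double shfl_up_sync_all(double val, unsigned int delta, int width = warpSize) {
#if CUDA_VERSION >= 9000
    int lo = __double2loint(val);
    int hi = __double2hiint(val);
    lo = __shfl_up_sync(0xffffffff, lo, delta, width);
    hi = __shfl_up_sync(0xffffffff, hi, delta, width);
    return __hiloint2double(hi, lo);
#else
    return shfl_up(val, delta, width);  // fall back to the pre-CUDA-9 helper above
#endif
}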
KOKKOS_INLINE_FUNCTION
Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
    Scalar tmp1 = val;
    float tmp = *reinterpret_cast<float*>(&tmp1);
    tmp = __shfl_up(tmp,delta,width);
    return *reinterpret_cast<Scalar*>(&tmp);
}
__global__ void sequence_hamming_weight_kernel( device_sequence_space< IntType > * seqs, basic_data_space< unsigned int > * res, clotho::utility::algo_version< 3 > * v ) {
    typedef device_sequence_space< IntType > space_type;
    typedef typename space_type::int_type int_type;

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int _count = seqs->seq_count;
    if( bid >= _count ) return;

    assert( _count <= res->size );  // sanity check: enough allocated space for results

    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int tpb = blockDim.x * blockDim.y; // threads per block
    unsigned int bpg = gridDim.x * gridDim.y;   // blocks per grid
    unsigned int wpb = (tpb >> 5);
    unsigned int spg = wpb * bpg;   // sequences/grid = sequences(warps)/block * blocks/grid

    assert( (tpb & 31) == 0);   // sanity check: all warps are full

    unsigned int lane_id = (tid & 31);
    unsigned int warp_id = (tid >> 5);

    unsigned int _width = seqs->seq_width;

    int_type * sptr = seqs->sequences;
    unsigned int * countptr = res->data;

    unsigned int max_seq_id = _count / wpb;     // max_rounds = sequences * block/sequences
    max_seq_id += ((_count % wpb) ? 1 : 0);     // would !!(_count % spg) be more efficient?
    max_seq_id *= wpb;

    unsigned int seq_id = bid * wpb + warp_id;

    while( seq_id < max_seq_id ) {  // blocks of grid may terminate early; only block for tail may diverge
        unsigned int degree = 0;

        unsigned int end = (seq_id + 1) * _width;
        unsigned int idx = end - ((seq_id < _count) ? _width : 0); // true for all threads in warp

        while( idx < end ) {    // all threads in a warp read same bit block; no divergence
            int_type b = sptr[ idx++ ];     // all threads in a warp read/load same bit block
            degree += ((b >> lane_id) & 1); // would !!( b & lane_mask), where (lane_mask = (1 << lane_id)), be more efficient?
        }
        __syncthreads();    // sync all warps

        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int d = __shfl_up(degree, i );
            degree += (( lane_id >= i ) * d);
        }

        if( lane_id == 31 && (seq_id < _count) ) {
            countptr[ seq_id ] = degree;
        }
        __syncthreads();

        seq_id += spg;
    }
}
KOKKOS_INLINE_FUNCTION
unsigned int shfl_up( const unsigned int &val, const int& delta, const int& width) {
    unsigned int tmp1 = val;
    int tmp = *reinterpret_cast<int*>(&tmp1);
    tmp = __shfl_up(tmp,delta,width);
    return *reinterpret_cast<unsigned int*>(&tmp);
}
__device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width = warpSize) {
#if __CUDA_ARCH__ >= 300
    return __shfl_up(val, delta, width);
#else
    return T();
#endif
}
__device__ __forceinline__ unsigned int shfl_up(unsigned int val, unsigned int delta, int width = warpSize) {
#if __CUDA_ARCH__ >= 300
    return (unsigned int) __shfl_up((int) val, delta, width);
#else
    return 0;
#endif
}
KOKKOS_INLINE_FUNCTION
Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
    Impl::shfl_union<Scalar> s_val;
    Impl::shfl_union<Scalar> r_val;
    s_val = val;
    for(int i = 0; i<s_val.n; i++)
        r_val.fval[i] = __shfl_up(s_val.fval[i],delta,width);
    return r_val.value();
}
__device__ void update( uint32_t *local_costs, uint32_t p1, uint32_t p2, uint32_t mask) {
    const unsigned int lane_id = threadIdx.x % SUBGROUP_SIZE;
    const auto dp0 = dp[0];
    uint32_t lazy_out = 0, local_min = 0;
    {
        const unsigned int k = 0;
#if CUDA_VERSION >= 9000
        const uint32_t prev = __shfl_up_sync(mask, dp[DP_BLOCK_SIZE - 1], 1);
#else
        const uint32_t prev = __shfl_up(dp[DP_BLOCK_SIZE - 1], 1);
#endif
        uint32_t out = min(dp[k] - last_min, p2);
        if(lane_id != 0){
            out = min(out, prev - last_min + p1);
        }
        out = min(out, dp[k + 1] - last_min + p1);
        lazy_out = local_min = out + local_costs[k];
    }
    for(unsigned int k = 1; k + 1 < DP_BLOCK_SIZE; ++k){
        uint32_t out = min(dp[k] - last_min, p2);
        out = min(out, dp[k - 1] - last_min + p1);
        out = min(out, dp[k + 1] - last_min + p1);
        dp[k - 1] = lazy_out;
        lazy_out = out + local_costs[k];
        local_min = min(local_min, lazy_out);
    }
    {
        const unsigned int k = DP_BLOCK_SIZE - 1;
#if CUDA_VERSION >= 9000
        const uint32_t next = __shfl_down_sync(mask, dp0, 1);
#else
        const uint32_t next = __shfl_down(dp0, 1);
#endif
        uint32_t out = min(dp[k] - last_min, p2);
        out = min(out, dp[k - 1] - last_min + p1);
        if(lane_id + 1 != SUBGROUP_SIZE){
            out = min(out, next - last_min + p1);
        }
        dp[k - 1] = lazy_out;
        dp[k] = out + local_costs[k];
        local_min = min(local_min, dp[k]);
    }
    last_min = subgroup_min<SUBGROUP_SIZE>(local_min, mask);
}
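// subgroup_min<SUBGROUP_SIZE>() is referenced above but not shown. A minimal sketch of what
// such a helper might look like (an assumption about the real implementation, not the actual
// library code), using a butterfly reduction over the participating lanes:
template <unsigned int SUBGROUP_SIZE>
__device__ inline uint32_t subgroup_min_sketch(uint32_t v, uint32_t mask) {
#if CUDA_VERSION >= 9000
    for (unsigned int d = SUBGROUP_SIZE / 2; d > 0; d >>= 1) {
        v = min(v, __shfl_xor_sync(mask, v, d));    // exchange with lane (lane_id ^ d), keep the smaller value
    }
#else
    for (unsigned int d = SUBGROUP_SIZE / 2; d > 0; d >>= 1) {
        v = min(v, __shfl_xor(v, d));
    }
#endif
    return v;   // every participating lane ends up holding the subgroup minimum
}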
__global__ void finalize_pairwise_diff_stats( pairwise_diff_stats * stats ) {
    assert( blockDim.x * blockDim.y == 32 );

    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;

    unsigned long long tot = stats->block_bin[tid];

    for( unsigned int i = 1; i < 32; i <<= 1 ) {
        unsigned long long t = __shfl_up( tot, i );
        tot += ((tid >= i ) ? t : 0);
    }

    if( tid == 31 ) {
        stats->total = tot;

        double d = (double)tot;
        double c = (double) stats->count;
        d /= c;
        stats->mean = d;
    }
}
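// The same warp-level inclusive prefix sum appears throughout the kernels in this section:
// each lane adds the value shuffled up from the lane `i` positions below it, but only when
// that source lane actually exists. A minimal standalone sketch of the pattern (illustrative
// name, pre-CUDA-9 intrinsics to match the surrounding code):
__device__ inline unsigned int warp_inclusive_sum_sketch(unsigned int v) {
    const unsigned int lane_id = (threadIdx.x & 31);
    for( unsigned int i = 1; i < 32; i <<= 1 ) {
        unsigned int t = __shfl_up( v, i ); // value held by lane (lane_id - i)
        v += ((lane_id >= i) * t);          // add it only if that source lane exists
    }
    return v;   // lane k now holds the sum of lanes 0..k; lane 31 holds the warp total
}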
__global__ void sequence_hamming_weight_kernel( device_sequence_space< IntType > * seqs, basic_data_space< unsigned int > * res, clotho::utility::algo_version< 1 > * v ) {
    typedef device_sequence_space< IntType > space_type;
    typedef typename space_type::int_type int_type;

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int _count = seqs->seq_count;
    if( bid >= _count ) return;

    assert( _count <= res->size );  // sanity check: enough allocated space for results

    popcountGPU< int_type > pc;

    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int tpb = blockDim.x * blockDim.y; // threads per block
    unsigned int bpg = gridDim.x * gridDim.y;   // blocks per grid

    assert( (tpb & 31) == 0);   // sanity check: all warps are full

    unsigned int lane_id = (tid & 31);
    unsigned int warp_id = (tid >> 5);

    unsigned int _width = seqs->seq_width;

    int_type * sptr = seqs->sequences;
    unsigned int * countptr = res->data;

    __shared__ unsigned int buffer[32];

    while( bid < _count ) {
        unsigned int degree = 0;

        unsigned int end = (bid + 1) * _width;
        unsigned int idx = (end - _width) + tid;

        while( idx < end ) {    // at most 1 warp diverges
            int_type b = sptr[ idx ];
            degree += pc.evalGPU( b );
            idx += tpb;
        }
        __syncthreads();

        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int d = __shfl_up(degree, i );
            degree += (( lane_id >= i ) * d);
        }

        if( tpb > 32 ) {    // true for all threads in block
            if( lane_id == 31 ) {
                buffer[ warp_id ] = degree;
            }
            __syncthreads();

            degree = buffer[ lane_id ];
            __syncthreads();

            for( unsigned int i = 1; i < 32; i <<= 1 ) {
                unsigned int d = __shfl_up( degree, i );
                degree += ((lane_id >= i) * d);
            }
        }

        if( tid == 31 ) {
            countptr[ bid ] = degree;
        }
        __syncthreads();

        bid += bpg;
    }
}
__global__ void crossover_kernel( StateType * states , AlleleSpaceType * alleles , device_free_space< IntType, unordered_tag > * free_space , poisson_cdf< RealType, 32 > * pois , device_sequence_space< IntType > * sequences , clotho::utility::algo_version< 2 > * v ) {
    typedef StateType state_type;
    typedef AlleleSpaceType allele_space_type;
    typedef typename allele_space_type::real_type real_type;
    typedef device_sequence_space< IntType > sequence_space_type;
    typedef typename sequence_space_type::int_type int_type;
    typedef poisson_cdf< RealType, 32 > poisson_type;

    unsigned int nAlleles = alleles->capacity;
    if( nAlleles == 0 ) { return; }

    assert( (nAlleles & 31) == 0 ); // multiple of 32 alleles

    const unsigned int MAX_EVENTS_PER_WARP = 64;    // maximum number of recombination events per sequence
    const unsigned int MAX_WARP_PER_BLOCK = 32;
    const unsigned int HASH_WIDTH = 32;

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int lane_id = (tid & 31);
    unsigned int warp_id = (tid >> 5);

    unsigned int tpb = (blockDim.x * blockDim.y);
    assert( (tpb & 31) == 0 );  // all warps are full

    unsigned int bpg = (gridDim.x * gridDim.y);
    unsigned int wpb = (tpb >> 5);  // == sequences/block (spb)
    assert( wpb <= MAX_WARP_PER_BLOCK );

    unsigned int spg = bpg * wpb;

    int_type sequence_width = sequences->seq_width;
    int_type nSequences = sequences->seq_count;
    int_type * seqs = sequences->sequences;

    real_type * allele_list = alleles->locations;

    __shared__ real_type s_pois_cdf[ poisson_type::MAX_K ];                     // 4 * 32 == 128
    __shared__ real_type rand_pool[ MAX_WARP_PER_BLOCK * MAX_EVENTS_PER_WARP ]; // 4 * 32 * 64 == 8192
    __shared__ unsigned int event_hash[ MAX_WARP_PER_BLOCK * HASH_WIDTH ];      // 4 * 32 * 128 == 16384
    //----------------------
    // 24704 (bytes of shared memory)

    unsigned int max_k = pois->max_k;
    if( tid < poisson_type::MAX_K ) {
        s_pois_cdf[ tid ] = pois->_cdf[tid];
    }
    __syncthreads();

    state_type local_state = states[ bid * tpb + tid ]; // assumes every thread in the GRID has a state defined

    unsigned int max_seq_id = nSequences / wpb;
    max_seq_id += (( nSequences % wpb ) ? 1 : 0);
    max_seq_id *= wpb;

    int_type seq_idx = bid * wpb + warp_id;
    while( seq_idx < max_seq_id ) { // should allow blocks to terminate early
        // generate a recombination hash for each sequence (warp)
        // unsigned int psum = 0;
        // for( unsigned int w = lane_id; w < HASH_WIDTH; w += 32 ) {
        real_type x = curand_uniform( &local_state );

        unsigned int rand = _find_poisson_maxk32( s_pois_cdf, x, max_k );
        __syncthreads();

        // compute prefix sum within each warp
        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int r = __shfl_up( rand, i );
            rand += ( (lane_id >= i ) * r );
        }

        // rand += psum;
        // event_hash[ warp_id * HASH_WIDTH + w ] = rand;   // event_hash contains prefix sum (scan)
        event_hash[ tid ] = rand;

        unsigned int s = __shfl_up( rand, 1);
        // s *= ( lane_id != 0 );

        s += (warp_id * MAX_EVENTS_PER_WARP);   // shift s and rand to be relative to warp
        rand += (warp_id * MAX_EVENTS_PER_WARP);

        // BEGIN divergent code
        real_type accum = 0.;
        while (s < rand) {
            x = curand_uniform( &local_state );

            accum += (log( x ) / (real_type)(rand - s));

            rand_pool[s++] = ((((real_type)tid) + (1.0 - exp(accum))) / ((real_type)32.));
        }
        __syncthreads();
        // END divergent code

        // // psum = __shfl( rand, 31 );
        // }

        unsigned int seq_offset = seq_idx * sequence_width;
        unsigned int a_id = lane_id;
        while( a_id < nAlleles ) {
            real_type x = allele_list[ a_id ];

            unsigned int h_idx = (unsigned int) ( x * ((real_type) HASH_WIDTH ));   // map allele location to bin index
            unsigned int s = event_hash[ h_idx++ ];
            unsigned int e = event_hash[ h_idx ];
            __syncthreads();

            int_type cmask = s;

            s += (warp_id * MAX_EVENTS_PER_WARP);
            e += (warp_id * MAX_EVENTS_PER_WARP );

            // BEGIN divergent code
            while( s < e ) {
                real_type y = rand_pool[ s++ ];
                cmask += (x > y);
            }
            __syncthreads();
            // END divergent code

            cmask = ((cmask & 1) << lane_id);

            // reduce cmask within each warp
            for( unsigned int i = 1; i < 32; i <<= 1 ) {
                int_type _c = __shfl_up( cmask, i );
                cmask |= ((lane_id >= i) * _c);
            }

            if( lane_id == 31 && seq_idx < nSequences ) {
                seqs[seq_offset] = cmask;
            }
            __syncthreads();

            a_id += 32;
            ++seq_offset;
        }
        __syncthreads();

        seq_idx += spg;
    }

    states[ bid * tpb + tid ] = local_state;
}
__global__ void select_and_crossover_kernel( StateType * states , device_sequence_space< SequenceIntType > * parent_seqs , device_event_space< IntType, no_order_tag > * parent_ids , AlleleSpaceType * alleles , device_free_space< SequenceIntType, unit_ordered_tag > * free_space , poisson_cdf< RealType, 32 > * pois , device_sequence_space< SequenceIntType > * offspring_seqs ) {
    typedef StateType state_type;
    typedef poisson_cdf< RealType, 32 > poisson_type;
    typedef device_sequence_space< SequenceIntType > sequence_space_type;
    typedef typename sequence_space_type::int_type sequence_int_type;
    typedef device_event_space< IntType, no_order_tag > selection_type;
    typedef typename selection_type::int_type selection_int_type;
    typedef AlleleSpaceType allele_space_type;
    typedef typename allele_space_type::real_type real_type;

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int offspring_count = offspring_seqs->seq_count;
    if( bid >= offspring_count ) return;

    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int warp_id = (tid >> 5 );
    unsigned int lane_id = (tid & 31 );

    unsigned int tpb = (blockDim.x * blockDim.y);
    unsigned int wpb = (tpb >> 5);
    unsigned int bpg = (gridDim.x * gridDim.y);

    unsigned int par_count = parent_seqs->seq_count;
    if( par_count == 0 ) {
        if( bid == 0 && tid == 0 ) {
            unsigned int off_cap = offspring_seqs->capacity;
            memset( offspring_seqs->sequences, 0, off_cap * sizeof( sequence_int_type) );
        }
        return;
    }

    state_type local_state = states[ bid * tpb + tid ];

    __shared__ unsigned int s_event_hash[ 32 ];
    __shared__ real_type s_rand_pool[ 1024 ];
    __shared__ real_type s_pois_cdf[ poisson_type::MAX_K ];

    unsigned int max_k = pois->max_k;
    if( tid < poisson_type::MAX_K ) {
        s_pois_cdf[ tid ] = pois->_cdf[ tid ];
    }
    __syncthreads();

    while( bid < offspring_count ) {
        real_type r = curand_uniform( &local_state );
        unsigned int e = _find_poisson_maxk32( s_pois_cdf, r, max_k );
        __syncthreads();

        if( tid < 32 ) {
            s_event_hash[ tid ] = e;
        }
        __syncthreads();

        e = s_event_hash[ lane_id ];
        __syncthreads();

        unsigned int psum = e;
        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int _e = __shfl_up( e, i );
            e = ((_e > e) ? _e : e );
        }
        unsigned int max = __shfl( e, 31 );

        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int p = __shfl_up( psum, i );
            psum += (( lane_id >= i ) * p );
        }

        unsigned int base_count = __shfl_up( psum, 1 );
        if( lane_id == 0 ) base_count = 0;
        __syncthreads();

        for( unsigned int i = warp_id; i < max; i += wpb ) {    // warp divergence
            r = curand_uniform( &local_state );
            s_rand_pool[ i * 32 + lane_id ] = (((real_type) lane_id + r) / (real_type) 32);
        }
        __syncthreads();

        // set up parent and offspring indexes
        selection_int_type id = par_ids[ bid ]; // all threads in block share same parent
        id <<= 1;

        unsigned int p0 = id * par_width + warp_id;
        unsigned int o_idx = bid * off_width + warp_id;

        unsigned int a_id = tid;
        while( a_id < par_allele_cap ) {    // true for all threads
            sequence_int_type p = par_seqs[ p0 ];
            sequence_int_type q = par_seqs[ p0 + par_width ];

            real_type a = all_loc[ a_id ];  // every thread reads a location just in case

            e = (psum - base_count);
            unsigned int x = base_count;
            for( unsigned int i = 0; i < max; ++i ) {   // should not diverge
                real_type y = s_rand_pool[ i * 32 + lane_id ];
                x += ((i < e) * ( y < a));
            }
            __syncthreads();

            x = ((x & 1) << lane_id);
            for( unsigned int i = 1; i < 32; i <<= 1 ) {
                unsigned int _x = __shfl_up( x, i );
                x |= ((lane_id >= i ) * _x );
            }

            if( lane_id == 31 ) {
                off_seqs[ o_idx ] = ((p & ~x) | (q & x));
            }
            __syncthreads();

            p0 += wpb;
            o_idx += wpb;
            a_id += tpb;
        }

        bid += bpg;
    }

    bid = blockIdx.y * gridDim.x + blockIdx.x;
    states[ bid * tpb + tid ] = local_state;    // write the RNG state back
}
__global__ void pairwise_difference_kernel( device_sequence_space< IntType > * sequences, basic_data_space< unsigned int > * sub_pop, pairwise_diff_stats * stats ) {
    unsigned int tpb = (blockDim.x * blockDim.y);
    assert( tpb % 32 == 0 );    // 32 == threads per warp; all warps are full

    unsigned int wpb = (tpb >> 5);  // warps per block

    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int lane_id = (tid & 31);
    unsigned int warp_id = (tid >> 5);

    unsigned int bpg = (gridDim.x * gridDim.y); // blocks per grid
    unsigned int wpg = (wpb * bpg);             // warps per grid

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;

    unsigned int R = sequences->seq_count;
    unsigned int C = sequences->seq_width;
    IntType * data = sequences->sequences;

    popcountGPU< IntType > pc;

    unsigned int N = sub_pop->size;
    unsigned int * sp = sub_pop->data;

    unsigned int column_tail = (C & 31);    // == % 32
    // unsigned int column_full_warps = (C & (~31));    // == ( / 32) * 32
    unsigned int column_full_warps = (C - column_tail);

    __shared__ unsigned long long block_buffer[ 32 ];   // 32 == max warps per block
    __shared__ unsigned int warp_index[32];

    unsigned int M = N - 1; // max index

    if( warp_id == 0 ) {
        block_buffer[ lane_id ] = 0;
        warp_index[ lane_id ] = M;
    }
    __syncthreads();

    // within each block verify that the sub-population is valid
    // NOTE consider moving this to a pre-processing step
    unsigned int s0 = tid;
    while( s0 < N ) {
        unsigned int idx = sp[s0];
        assert( idx < R );
        s0 += tpb;
    }
    __syncthreads();

    s0 = 0;
    unsigned int s1 = bid * wpb + warp_id + 1;  // s1 = s0 + grid_warp_id + 1 = warp_id + 1

    bool is_done = false;
    while( !is_done ) {
        while( s1 >= N && s0 < M ) {
            ++s0;
            s1 -= (M - s0);
        }

        unsigned int s0_p = ((s0 >= M) ? M : s0);   // M is valid index in sub population list
        unsigned int s1_p = ((s1 >= M) ? M : s1);

        unsigned int s0_idx = sp[s0_p];
        unsigned int s1_idx = sp[s1_p];
        __syncthreads();

        unsigned int s0_end = s0_idx * C + column_full_warps;
        unsigned int s0_start = s0_idx * C + lane_id;
        unsigned int s1_start = s1_idx * C + lane_id;

        unsigned int tot = 0;
        while( s0_start < s0_end ) {    // all sequences same length so only need to iterate along first
            IntType s0_data = data[ s0_start ];
            IntType s1_data = data[ s1_start ];

            tot += pc.evalGPU( s0_data ^ s1_data );

            s0_start += 32;
            s1_start += 32;
        }
        __syncthreads();

        // handle tail
        if( column_tail ) { // true for all threads
            s0_end += column_tail;

            IntType s0_data = 0;
            IntType s1_data = 0;
            if( s0_start < s0_end ) {
                s0_data = data[s0_start];
                s1_data = data[s1_start];
            }
            __syncthreads();

            tot += pc.evalGPU( s0_data ^ s1_data );
        }
        __syncthreads();

        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int t = __shfl_up( tot, i );
            tot += ((lane_id >= i) * t);
        }

        if( lane_id == 31 ) {
            block_buffer[warp_id] += tot;
            warp_index[warp_id] = s0;
        }
        __syncthreads();

        // find the minimum s0 within the block
        unsigned int tmp = warp_index[lane_id];
        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int t = __shfl_up(tmp, i);
            tmp = ((tmp < t) ? tmp : t);
        }
        unsigned int t = __shfl(tmp, 31);

        is_done = (t >= M); // done when all warps have reached s0 >= M

        s1 += wpg;
    }

    unsigned long long tot = block_buffer[ lane_id ];
    for( unsigned int i = 1; i < 32; i <<= 1 ) {
        unsigned long long t = __shfl_up( tot, i );
        tot += (( lane_id >= i ) ? t : 0);
    }

    if( warp_id == 0 && lane_id == 31 ) {
        stats->block_bin[ bid ] = tot;
        if(bid == 0) {
            unsigned long long count = N;
            count *= (count - 1);
            count >>= 1;
            stats->count = count;
        }
    }
}
__global__ void build_crossover_mask_kernel( AlleleSpaceType * alleles, device_sequence_space< IntType > * buffer, device_event_pool_space< RealType, IntType > * events ) {
    typedef device_event_pool_space< RealType, IntType > pool_type;

    int_type bid = blockIdx.y * gridDim.x + blockIdx.x;
    int_type tid = threadIdx.y * blockDim.x + threadIdx.x;

    unsigned int tpb = (blockDim.x * blockDim.y);
    unsigned int bpg = (gridDim.x * gridDim.y);

    real_type * allele_list = alleles->locations;

    int_type * buf = buffer->buffer;
    int_type seq_width = buffer->seq_width;
    int_type nSequences = buffer->seq_count;

    unsigned int nAlleles = alleles->capacity;
    if( nAlleles == 0 ) return;

    __shared__ real_type rand_pool[ pool_type::MAX_EVENTS_PER_OFFSET ];

    // 1 sequence per block
    while( bid < nSequences ) {
        int_type lo = events->offsets[ bid ], hi = events->offsets[ bid + 1 ];  // true for all threads

        if( lo == hi ) {
            unsigned int id = bid * seq_width + tid;
            while( id < seq_width ) {
                buf[ id ] = 0;
                id += tpb;
            }
            __syncthreads();
        } else {
            // move events from global memory into shared memory
            if( lo + tid < hi ) {
                rand_pool[tid] = events->event_pool[ lo + tid ];
            }
            __syncthreads();

            hi -= lo;

            unsigned int id = tid;
            unsigned int offset = bid * seq_width + threadIdx.y;
            while( id < nAlleles ) {
                real_type pos = allele_list[ id ];

                unsigned int n = 0;
                for( unsigned int i = 0; i < hi; ++i ) {
                    n += ((rand_pool[ i ] < pos) ? 1 : 0);
                }
                __syncthreads();

                int_type mask = (1 << threadIdx.x);
                mask *= (n & 1);

                // scan right mask
                for( unsigned int i = 1; i < 32; i <<= 1 ) {
                    int_type _c = __shfl_up( mask, i );
                    mask |= ((threadIdx.x >= i ) *_c );
                }

                // last thread of each warp writes mask block to global memory
                if( threadIdx.x == 31 ) {
                    buf[ offset ] = mask;
                }
                __syncthreads();

                id += tpb;
                offset += blockDim.y;
            }
            __syncthreads();
        }
        bid += bpg;
    }
}
KOKKOS_INLINE_FUNCTION
float shfl_up(const float &val, const int& delta, const int& width ) {
    return __shfl_up(val,delta,width);
}
__global__ void crossover_kernel( StateType * states , AlleleSpaceType * alleles , device_free_space< IntType, unordered_tag > * free_space , poisson_cdf< RealType, 32 > * pois , device_sequence_space< IntType > * sequences , clotho::utility::algo_version< 5 > * v ) {
    typedef StateType state_type;
    typedef AlleleSpaceType allele_space_type;
    typedef typename allele_space_type::real_type real_type;
    typedef device_sequence_space< IntType > sequence_space_type;
    typedef typename sequence_space_type::int_type int_type;
    typedef poisson_cdf< RealType, 32 > poisson_type;
    typedef xover_config< unordered_tag, 5 > xover_type;

    assert( blockDim.x == 32 && blockDim.y <= xover_type::MAX_WARPS );

    // encourage quick return;
    unsigned int nAlleles = alleles->capacity;
    if( nAlleles == 0 ) return;

    // only considering 2d grid/block definitions
    int_type bid = blockIdx.y * gridDim.x + blockIdx.x;
    int_type tid = threadIdx.y * blockDim.x + threadIdx.x;

    unsigned int tpb = (blockDim.x * blockDim.y);
    unsigned int bpg = (gridDim.x * gridDim.y);

    real_type * allele_list = alleles->locations;

    int_type * seqs = sequences->sequences;
    int_type seq_width = sequences->seq_width;
    int_type nSequences = sequences->seq_count;

    const unsigned int MAX_EVENTS = 32;

    __shared__ int_type event_count;
    __shared__ real_type rand_pool[ MAX_EVENTS ];

    unsigned int seqIdx = bid;  // 1 sequence per block
    while( seqIdx < nSequences ) {
        // use the first thread of each block to populate a pool
        // of random numbers
        if( tid == 0 ) {
            state_type local_state = states[ bid ];

            // update the event_count
            event_count = curand_poisson( &local_state, pois->lambda );

            for( unsigned int i = 0; i < event_count; ++i ) {
                rand_pool[ i ] = curand_uniform( &local_state );
            }

            states[ bid ] = local_state;
        }
        __syncthreads();

        // copy event_count from shared memory to thread local memory
        //
        unsigned int nEvents = event_count; // true for all threads

        if( nEvents == 0 ) {
            unsigned int id = tid;
            while( id < seq_width ) {
                seqs[ seqIdx * seq_width + tid ] = 0;
                id += tpb;
            }
        } else {
            unsigned int id = tid;
            unsigned int offset = seqIdx * seq_width + threadIdx.y;
            while( id < nAlleles ) {
                real_type pos = allele_list[ id ];

                unsigned int n = 0;
                for( unsigned int i = 0; i < nEvents; ++i ) {
                    n += ((rand_pool[ i ] < pos) ? 1 : 0);
                }
                __syncthreads();

                int_type mask = (1 << threadIdx.x);
                mask *= (n & 1);

                // scan left mask
                for( unsigned int i = 1; i < 32; i <<= 1 ) {
                    int_type _c = __shfl_up( mask, i );
                    mask |= ((threadIdx.x >= i ) *_c );
                }

                if( threadIdx.x == 31 ) {
                    seqs[ offset ] = mask;
                }
                __syncthreads();

                id += tpb;
                offset += blockDim.y;
            }
        }

        seqIdx += bpg;
        __syncthreads();
    }
}
__global__ void crossover_kernel( StateType * states , AlleleSpaceType * alleles , device_free_space< IntType, unordered_tag > * free_space , poisson_cdf< RealType, 32 > * pois , device_sequence_space< IntType > * sequences , clotho::utility::algo_version< 4 > * v ) {
    typedef StateType state_type;
    typedef AlleleSpaceType allele_space_type;
    typedef typename allele_space_type::real_type real_type;
    typedef device_sequence_space< IntType > sequence_space_type;
    typedef typename sequence_space_type::int_type int_type;
    typedef poisson_cdf< RealType, 32 > poisson_type;
    typedef xover_config< unordered_tag, 4 > xover_type;

    assert( blockDim.x == 32 && blockDim.y <= xover_type::MAX_WARPS );

    int_type bid = blockIdx.y * gridDim.x + blockIdx.x;
    int_type tid = threadIdx.y * blockDim.x + threadIdx.x;

    unsigned int tpb = (blockDim.x * blockDim.y);
    assert( (tpb % allele_space_type::ALIGNMENT_SIZE) == 0 );

    unsigned int bpg = (gridDim.x * gridDim.y);

    unsigned int i;

    int_type sequence_width = sequences->seq_width;
    int_type nSequences = sequences->seq_count;
    int_type * seqs = sequences->sequences;
    //int_type cap = sequences->capacity;

    real_type * allele_list = alleles->locations;
    unsigned int nAlleles = alleles->capacity;
    assert( (nAlleles % allele_space_type::ALIGNMENT_SIZE) == 0 );
    if( nAlleles == 0 ) { return; }

    __shared__ real_type s_pois_cdf[ poisson_type::MAX_K ];
    __shared__ real_type rand_pool[ allele_space_type::ALIGNMENT_SIZE ];
    __shared__ unsigned int event_hash[ allele_space_type::ALIGNMENT_SIZE];

    unsigned int max_k = pois->max_k;
    if( tid < poisson_type::MAX_K ) {
        s_pois_cdf[ tid ] = pois->_cdf[tid];
    }
    __syncthreads();

    state_type local_state = states[ bid * tpb + tid ];

    unsigned int nonzero_warp = (threadIdx.y != 0);
    unsigned int nonzero_thread = (tid != 0);

    int_type seq_idx = bid;
    while( seq_idx < nSequences ) {
        real_type x = curand_uniform( &local_state );   // x in (0, 1]

        rand_pool[ tid ] = ((x >= 1.0) ? 0.0 : x);  // wrap around x to be in [0, 1); this way all event hash bins follow [,) pattern

        x = curand_uniform( &local_state );
        int_type rand = _find_poisson_maxk32( s_pois_cdf, x, max_k );
        __syncthreads();

        for( i = 1; i < 32; i <<= 1 ) {
            unsigned int r = __shfl_up( rand, i );
            rand += ( (threadIdx.x >= i ) * r );
        }

        if( threadIdx.x == 31 ) {
            event_hash[ threadIdx.y ] = rand;
        }
        __syncthreads();

        unsigned int _sum = event_hash[ threadIdx.x ];
        _sum *= (threadIdx.x < blockDim.y);
        __syncthreads();

        for( i = 1; i < 32; i <<= 1 ) {
            unsigned int s = __shfl_up( _sum, i );
            _sum += (( threadIdx.x >= i ) * s);
        }

        unsigned int s = __shfl( _sum, 31 );
        // assert( max_events < allele_space_type::ALIGNMENT_SIZE );
        //
        if( s == 0 ) {  // true for all threads in block assuming 1 block per sequence
            // if there are no events for this sequence, then simply clear the memory
            unsigned int seq_start = seq_idx * sequence_width + tid;
            unsigned int seq_end = (seq_idx + 1) * sequence_width;
            while( seq_start < seq_end ) {
                seqs[ seq_start ] = 0;
                seq_start += tpb;
            }
            __syncthreads();
        } else {
            s = __shfl( _sum, threadIdx.y - nonzero_warp);
            s *= nonzero_warp;
            __syncthreads();

            rand += s;

            event_hash[tid] = rand;
            __syncthreads();

            i = event_hash[ tid - nonzero_thread];  // minimum event index
            i *= nonzero_thread;
            __syncthreads();

            // BEGIN divergent code
            while (i < rand) {
                x = rand_pool[ i ];
                rand_pool[ i++ ] = (((real_type) tid) + x) / ((real_type) allele_space_type::ALIGNMENT_SIZE);
            }
            __syncthreads();
            // END divergent code

            unsigned int seq_offset = seq_idx * sequence_width + threadIdx.y;   // every thread in a warp has the same block offset

            i = tid;
            while( i < nAlleles ) {
                x = allele_list[ i ];

                rand = (unsigned int) ( x * ((real_type)allele_space_type::ALIGNMENT_SIZE));

                unsigned int nonzero_bin = (rand != 0); // _keep == 0 -> rand == 0 -> e_min == 0; _keep == 1 -> rand != 0 -> e_min == event_hash[ rand - 1]

                // each thread reads hash (and random pool) relative to their
                // local allele (x)
                // this code will result in bank conflicts
                //
                // initial performance results suggest that this is
                // an acceptable overhead as overall runtime of simulation
                // loop is minimized when this algorithm is used
                unsigned int e_max = event_hash[ rand ];
                unsigned int e_min = event_hash[ rand - nonzero_bin ];
                e_min *= nonzero_bin;

                int_type cmask = e_min;

                // BEGIN divergent code
                while( e_min < e_max ) {
                    real_type y = rand_pool[ e_min++ ];
                    cmask += (y < x);
                }
                __syncthreads();
                // END divergent code

                cmask = ((cmask & 1) << threadIdx.x);

                for( unsigned int j = 1; j < 32; j <<= 1 ) {
                    int_type _c = __shfl_up( cmask, j);
                    cmask |= ((threadIdx.x >= j) * _c);
                }

                if( threadIdx.x == 31 ) {
                    // assert( seq_offset < cap );
                    seqs[ seq_offset ] = cmask;
                }
                __syncthreads();

                i += tpb;
                seq_offset += blockDim.y;   // wpb
            }
            __syncthreads();
        }
        seq_idx += bpg;
    }

    states[ bid * tpb + tid ] = local_state;
}
__global__ void crossover_kernel( StateType * states , AlleleSpaceType * alleles , device_free_space< IntType, unordered_tag > * free_space , poisson_cdf< RealType, 32 > * pois , device_sequence_space< IntType > * sequences , clotho::utility::algo_version< 3 > * v ) {
    typedef StateType state_type;
    typedef AlleleSpaceType allele_space_type;
    typedef typename allele_space_type::real_type real_type;
    typedef device_sequence_space< IntType > sequence_space_type;
    typedef typename sequence_space_type::int_type int_type;
    typedef xover_config< unordered_tag, 3 > xover_type;
    typedef poisson_cdf< RealType, 32 > poisson_type;

    assert( blockDim.y <= xover_type::MAX_WARPS && blockDim.x == 32);   // 8 or fewer full warps

    unsigned int nAlleles = alleles->capacity;
    if( nAlleles == 0 ) { return; }

    assert( (nAlleles & 31) == 0 ); // nAlleles == 32 * m

    const unsigned int MAX_EVENTS_PER_WARP = 128;   // maximum number of recombination events per sequence

    __shared__ real_type rand_pool[ MAX_EVENTS_PER_WARP * xover_type::MAX_WARPS ];  // 16 warps/block (arbitrary number); 512 random numbers
    __shared__ real_type s_pois_cdf[ poisson_type::MAX_K ];     // 4 * 32 == 128

    unsigned int max_k = pois->max_k;
    if( threadIdx.y == 0 ) {    // use first warp to read poisson CDF
        s_pois_cdf[ threadIdx.x ] = pois->_cdf[threadIdx.x];
    }
    __syncthreads();

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;

    unsigned int tpb = (blockDim.x * blockDim.y);
    unsigned int bpg = (gridDim.x * gridDim.y);
    unsigned int spg = bpg * blockDim.y;

    int_type seq_width = sequences->seq_width;
    int_type nSequences = sequences->seq_count;
    int_type * seqs = sequences->sequences;

    real_type * allele_list = alleles->locations;

    unsigned int seq_idx = bid * blockDim.y + threadIdx.y;

    state_type local_state = states[ seq_idx * 32 + threadIdx.x ];  // every thread/warp uses the SAME random state

    unsigned int max_seqs = nSequences / blockDim.y;
    max_seqs += (( nSequences % blockDim.y) ? 1 : 0);
    max_seqs *= blockDim.y;

    while( seq_idx < max_seqs ) {
        real_type x = curand_uniform( &local_state );
        unsigned int rand = _find_poisson_maxk32( s_pois_cdf, x, max_k );
        __syncthreads();

        for(unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int t = __shfl_up( rand, i );
            rand += ((threadIdx.x >= i) * t);
        }

        unsigned int max_events = __shfl( rand, 31 );

        // fill random pool
        //
        if( max_events >= MAX_EVENTS_PER_WARP ) {
            if( threadIdx.x == 0 ) {
                printf( "Too many events to generate: %d\n", max_events );
            }
            assert( max_events < MAX_EVENTS_PER_WARP );
        }
        __syncthreads();

        rand_pool[ tid ] = curand_uniform( &local_state );
        rand_pool[ tid + tpb ] = curand_uniform( &local_state );
        rand_pool[ tid + 2 * tpb ] = curand_uniform( &local_state );
        rand_pool[ tid + 3 * tpb ] = curand_uniform( &local_state );
        __syncthreads();

        unsigned int seq_offset = seq_idx * seq_width;
        unsigned int a_idx = threadIdx.x;
        while( a_idx < nAlleles ) {
            real_type loc = allele_list[ a_idx ];

            unsigned int s = 0, mask = 0;
            while( s < max_events ) {   // warps within a block diverge; threads within a warp do not
                real_type y = rand_pool[ s++ ]; // every thread within a warp reads/loads the same value (sequence offset)
                mask += ( y < loc );
            }
            __syncthreads();

            mask = ((mask & 1) << threadIdx.x);

            for( unsigned int i = 1; i < 32; i <<= 1 ) {
                unsigned int e = __shfl_up( mask, i );
                mask |= ((threadIdx.x >= i) * e);
            }

            if( threadIdx.x == 31 && seq_idx < nSequences ) {
                seqs[ seq_offset ] = mask;
            }
            __syncthreads();

            a_idx += 32;
            ++seq_offset;
        }

        seq_idx += spg;
    }

    seq_idx = bid * blockDim.y + threadIdx.y;   // reset seq_idx
    states[ seq_idx * 32 + threadIdx.x ] = local_state;
}
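// The crossover kernels above assemble the per-warp mask by shifting (n & 1) into the lane's
// bit position and OR-scanning across the warp so that lane 31 holds the full 32-bit mask.
// On hardware with warp vote instructions the same mask can be produced in a single step; a
// minimal sketch (an illustrative alternative, assuming a fully active warp):
__device__ inline unsigned int parity_mask_sketch(unsigned int n) {
#if CUDA_VERSION >= 9000
    return __ballot_sync(0xffffffff, n & 1);    // bit k is set iff lane k's n is odd
#else
    return __ballot(n & 1);
#endif
}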