// Warp shuffle for double: fetches `val` from lane `srcLane` within a
// sub-warp of `width` lanes. Legacy __shfl only handles 32-bit payloads,
// so the double is split into its two 32-bit halves, each half is shuffled
// independently, and the result is reassembled.
// Requires SM30+; on older architectures this degenerates to returning 0.0.
__device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
    const int hiHalf = __shfl(__double2hiint(val), srcLane, width);
    const int loHalf = __shfl(__double2loint(val), srcLane, width);
    return __hiloint2double(hiHalf, loHalf);
#else
    return 0.0;
#endif
}
// Warp shuffle for unsigned int: routes through the int overload of __shfl
// and casts back. Requires SM30+; returns 0 on older architectures.
//
// Guard made consistent with the double overload above: check that
// __CUDA_ARCH__ is defined before comparing it, so host-compilation passes
// (where the macro is undefined) do not rely on the implicit
// undefined-macro-is-zero rule or trip -Wundef.
__device__ __forceinline__ unsigned int shfl(unsigned int val, int srcLane, int width = warpSize)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
    return (unsigned int) __shfl((int) val, srcLane, width);
#else
    return 0;
#endif
}
// Generic warp shuffle: forwards `val` from lane `srcLane` via __shfl.
// NOTE(review): `T` presumably comes from an enclosing `template<typename T>`
// declaration that is not visible in this chunk — confirm against the full
// file. Legacy __shfl natively supports only int/float; other types depend
// on overloads such as the ones defined above.
// Below SM30 this returns a value-initialized T instead of shuffling.
__device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)
{
#if __CUDA_ARCH__ >= 300
    return __shfl(val, srcLane, width);
#else
    return T();
#endif
}
// Warp shuffle for 4-byte Scalar types (Kokkos style): bit-copies the value
// into a float, shuffles the float across the warp, and bit-copies back.
// The enable_if on `width` restricts overload resolution to
// sizeof(Scalar) == 4.
// NOTE(review): `Scalar` presumably comes from an enclosing
// `template<typename Scalar>` declaration not shown in this chunk — confirm
// against the full file.
// NOTE(review): the reinterpret_cast punning is the file's existing idiom;
// it assumes any 4-byte Scalar can be viewed as a float for transport.
KOKKOS_INLINE_FUNCTION
Scalar shfl(const Scalar &val, const int& srcLane,
            const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type& width )
{
  Scalar tmp1 = val;                             // local copy so a non-const address can be taken
  float tmp = *reinterpret_cast<float*>(&tmp1);  // view the 4 bytes as a float
  tmp = __shfl(tmp,srcLane,width);               // shuffle the float payload
  return *reinterpret_cast<Scalar*>(&tmp);       // view the shuffled bytes as Scalar again
}
// Warp shuffle for large Scalar types (> 8 bytes, Kokkos style): transports
// the value piecewise through a union that exposes the Scalar as an array of
// floats. The enable_if on `width` restricts overload resolution to
// sizeof(Scalar) > 8.
// NOTE(review): `Scalar` presumably comes from an enclosing
// `template<typename Scalar>` declaration not shown in this chunk — confirm
// against the full file.
KOKKOS_INLINE_FUNCTION
Scalar shfl(const Scalar &val, const int& srcLane,
            const typename Impl::enable_if< (sizeof(Scalar) > 8) ,int>::type& width)
{
  Impl::shfl_union<Scalar> s_val;  // source value viewed as s_val.n floats
  Impl::shfl_union<Scalar> r_val;  // receives the shuffled floats
  s_val = val;
  // Shuffle each 4-byte chunk; every lane runs the same n iterations, so the
  // lanes stay in lockstep through the loop.
  for(int i = 0; i<s_val.n; i++)
    r_val.fval[i] = __shfl(s_val.fval[i],srcLane,width);
  return r_val.value();
}
// Fused selection + crossover kernel: each block produces offspring
// sequences by choosing a parent (via parent_ids), generating
// Poisson-distributed recombination event positions, and emitting a
// recombination bit mask per 32-allele chunk.
//
// Launch expectations (from the indexing below): blockDim.x * blockDim.y
// threads organized as full 32-lane warps; one offspring sequence per block
// iteration; grid-stride over offspring via `bid += bpg`.
//
// BUG FIX: the kernel previously ended with the no-op expression statement
// `local_state[ bid * tpb + tid ];`, so the advanced curand state was never
// persisted and every launch would replay the same random stream. It now
// writes the state back (`states[ bid * tpb + tid ] = local_state;`),
// matching the pattern used by the crossover_kernel variants in this file.
//
// NOTE(review): `par_ids`, `par_seqs`, `off_seqs`, `par_width`, `off_width`,
// `par_allele_cap`, and `all_loc` are used but not defined in this chunk —
// their definitions (derived from the parent_seqs / offspring_seqs / alleles
// parameters) appear to have been lost in extraction; confirm against the
// full file.
__global__ void select_and_crossover_kernel( StateType * states
                                           , device_sequence_space< SequenceIntType > * parent_seqs
                                           , device_event_space< IntType, no_order_tag > * parent_ids
                                           , AlleleSpaceType * alleles
                                           , device_free_space< SequenceIntType, unit_ordered_tag > * free_space
                                           , poisson_cdf< RealType, 32 > * pois
                                           , device_sequence_space< SequenceIntType > * offspring_seqs ) {
    typedef StateType state_type;
    typedef poisson_cdf< RealType, 32 > poisson_type;
    typedef device_sequence_space< SequenceIntType > sequence_space_type;
    typedef typename sequence_space_type::int_type sequence_int_type;
    typedef device_event_space< IntType, no_order_tag > selection_type;
    typedef typename selection_type::int_type selection_int_type;
    typedef AlleleSpaceType allele_space_type;
    typedef typename allele_space_type::real_type real_type;

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int offspring_count = offspring_seqs->seq_count;
    if( bid >= offspring_count ) return;

    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int warp_id = (tid >> 5);   // warp index within the block
    unsigned int lane_id = (tid & 31);   // lane index within the warp
    unsigned int tpb = (blockDim.x * blockDim.y);  // threads per block
    unsigned int wpb = (tpb >> 5);                  // warps per block
    unsigned int bpg = (gridDim.x * gridDim.y);     // blocks per grid

    unsigned int par_count = parent_seqs->seq_count;
    if( par_count == 0 ) {
        // No parents: a single thread zero-fills the offspring buffer.
        // (Device-side memset executed by one thread of block 0 only.)
        if( bid == 0 && tid == 0 ) {
            unsigned int off_cap = offspring_seqs->capacity;
            memset( offspring_seqs->sequences, 0, off_cap * sizeof( sequence_int_type) );
        }
        return;
    }

    state_type local_state = states[ bid * tpb + tid ];

    __shared__ unsigned int s_event_hash[ 32 ];
    __shared__ real_type s_rand_pool[ 1024 ];
    __shared__ real_type s_pois_cdf[ poisson_type::MAX_K ];

    unsigned int max_k = pois->max_k;
    if( tid < poisson_type::MAX_K ) {
        s_pois_cdf[ tid ] = pois->_cdf[ tid ];
    }
    __syncthreads();

    while( bid < offspring_count ) {
        // Per-thread Poisson draw for the recombination event count.
        real_type r = curand_uniform( &local_state );
        unsigned int e = _find_poisson_maxk32( s_pois_cdf, r, max_k );
        __syncthreads();

        // First 32 threads publish their event counts for the whole block.
        if( tid < 32 ) {
            s_event_hash[ tid ] = e;
        }
        __syncthreads();

        e = s_event_hash[ lane_id ];
        __syncthreads();

        unsigned int psum = e;
        // Warp-wide inclusive max-scan: every lane ends with the running max,
        // so lane 31 holds the overall maximum event count.
        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int _e = __shfl_up( e, i );
            e = ((_e > e) ? _e : e );
        }
        unsigned int max = __shfl( e, 31 );

        // Warp-wide inclusive sum-scan of the event counts.
        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int p = __shfl_up( psum, i );
            psum += (( lane_id >= i ) * p );
        }
        // Exclusive prefix for this lane (0 for lane 0).
        unsigned int base_count = __shfl_up( psum, 1 );
        if( lane_id == 0 ) base_count = 0;
        __syncthreads();

        // Fill the shared random pool: one stratified sample per lane per row.
        for( unsigned int i = warp_id; i < max; i += wpb ) {  // warp divergence
            r = curand_uniform( &local_state );
            s_rand_pool[ i * 32 + lane_id ] = (((real_type) lane_id + r) / (real_type) 32);
        }
        __syncthreads();

        // Set up parent and offspring indexes.
        selection_int_type id = par_ids[ bid ];  // all threads in block share same parent
        id <<= 1;                                // parent index -> sequence-pair index
        unsigned int p0 = id * par_width + warp_id;
        unsigned int o_idx = bid * off_width + warp_id;

        unsigned int a_id = tid;
        while( a_id < par_allele_cap ) {  // true for all threads
            sequence_int_type p = par_seqs[ p0 ];
            sequence_int_type q = par_seqs[ p0 + par_width ];
            real_type a = all_loc[ a_id ];  // every thread reads a location just in case

            // Count recombination events that fall below this allele's location.
            e = (psum - base_count);
            unsigned int x = base_count;
            for( unsigned int i = 0; i < max; ++i ) {  // should not diverge
                real_type y = s_rand_pool[ i * 32 + lane_id ];
                x += ((i < e) * ( y < a));
            }
            __syncthreads();

            // Odd event count => take the other parent strand; pack one bit
            // per lane, then OR-scan so lane 31 holds the full 32-bit mask.
            x = ((x & 1) << lane_id);
            for( unsigned int i = 1; i < 32; i <<= 1 ) {
                unsigned int _x = __shfl_up( x, i );
                x |= ((lane_id >= i ) * _x );
            }

            if( lane_id == 31 ) {
                off_seqs[ o_idx ] = ((p & ~x) | (q & x));
            }
            __syncthreads();

            p0 += wpb;
            o_idx += wpb;
            a_id += tpb;
        }
        bid += bpg;
    }

    // Persist the advanced RNG state so subsequent launches continue the
    // stream (bid was modified by the grid-stride loop, so recompute it).
    bid = blockIdx.y * gridDim.x + blockIdx.x;
    states[ bid * tpb + tid ] = local_state;
}
// Computes the summed pairwise Hamming difference (popcount of XOR) over all
// unordered pairs of sequences in a sub-population. Each warp walks pairs
// (s0, s1) in a linearized upper-triangle order; per-warp partial sums are
// accumulated in shared memory and block totals are written to
// stats->block_bin[bid].
//
// FIX: the kernel text was truncated — the closing braces for the final
// `if( warp_id == 0 && lane_id == 31 )` block and for the kernel body were
// missing, leaving the next top-level definition inside this one. They are
// restored here; all statements are otherwise unchanged.
//
// NOTE(review): `__shfl` / `__shfl_up` are applied to unsigned long long in
// the final reduction — this relies on toolkit-provided 64-bit shuffle
// overloads; confirm the build's CUDA version supplies them.
__global__ void pairwise_difference_kernel( device_sequence_space< IntType > * sequences,
                                            basic_data_space< unsigned int > * sub_pop,
                                            pairwise_diff_stats * stats ) {
    unsigned int tpb = (blockDim.x * blockDim.y);
    assert( tpb % 32 == 0 );  // 32 == threads per warp; all warps are full

    unsigned int wpb = (tpb >> 5);  // warps per block
    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int lane_id = (tid & 31);
    unsigned int warp_id = (tid >> 5);
    unsigned int bpg = (gridDim.x * gridDim.y);  // blocks per grid
    unsigned int wpg = (wpb * bpg);              // warps per grid
    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;

    unsigned int R = sequences->seq_count;
    unsigned int C = sequences->seq_width;
    IntType * data = sequences->sequences;
    popcountGPU< IntType > pc;

    unsigned int N = sub_pop->size;
    unsigned int * sp = sub_pop->data;

    unsigned int column_tail = (C & 31);  // == C % 32
    // unsigned int column_full_warps = (C & (~31)); // == (C / 32) * 32
    unsigned int column_full_warps = (C - column_tail);

    __shared__ unsigned long long block_buffer[ 32 ];  // 32 == max warps per block
    __shared__ unsigned int warp_index[32];

    unsigned int M = N - 1;  // max index into the sub-population list
    if( warp_id == 0 ) {
        block_buffer[ lane_id ] = 0;
        warp_index[ lane_id ] = M;
    }
    __syncthreads();

    // Within each block verify that the sub-population is valid.
    // NOTE consider moving this to a pre-processing step.
    unsigned int s0 = tid;
    while( s0 < N ) {
        unsigned int idx = sp[s0];
        assert( idx < R );
        s0 += tpb;
    }
    __syncthreads();

    s0 = 0;
    unsigned int s1 = bid * wpb + warp_id + 1;  // s1 = s0 + grid_warp_id + 1 = warp_id + 1
    bool is_done = false;
    while( !is_done ) {
        // Advance (s0, s1) along the linearized upper triangle until s1 is a
        // valid column for the current row s0.
        while( s1 >= N && s0 < M ) {
            ++s0;
            s1 -= (M - s0);
        }
        // Clamp exhausted warps to the last valid index so they keep lockstep.
        unsigned int s0_p = ((s0 >= M) ? M : s0);  // M is a valid index in the sub-population list
        unsigned int s1_p = ((s1 >= M) ? M : s1);

        unsigned int s0_idx = sp[s0_p];
        unsigned int s1_idx = sp[s1_p];
        __syncthreads();

        // Warp-cooperative XOR/popcount over the full 32-column strides.
        unsigned int s0_end = s0_idx * C + column_full_warps;
        unsigned int s0_start = s0_idx * C + lane_id;
        unsigned int s1_start = s1_idx * C + lane_id;
        unsigned int tot = 0;
        while( s0_start < s0_end ) {  // all sequences same length so only need to iterate along first
            IntType s0_data = data[ s0_start ];
            IntType s1_data = data[ s1_start ];
            tot += pc.evalGPU( s0_data ^ s1_data );
            s0_start += 32;
            s1_start += 32;
        }
        __syncthreads();

        // Handle the tail columns (C not a multiple of 32); lanes past the
        // tail contribute popcount(0 ^ 0) == 0.
        if( column_tail ) {  // true for all threads
            s0_end += column_tail;
            IntType s0_data = 0;
            IntType s1_data = 0;
            if( s0_start < s0_end ) {
                s0_data = data[s0_start];
                s1_data = data[s1_start];
            }
            __syncthreads();
            tot += pc.evalGPU( s0_data ^ s1_data );
        }
        __syncthreads();

        // Warp inclusive sum-scan; lane 31 holds the warp total.
        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int t = __shfl_up( tot, i );
            tot += ((lane_id >= i) * t);
        }
        if( lane_id == 31 ) {
            block_buffer[warp_id] += tot;
            warp_index[warp_id] = s0;
        }
        __syncthreads();

        // Find the minimum s0 within the block to decide termination.
        unsigned int tmp = warp_index[lane_id];
        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int t = __shfl_up(tmp, i);
            tmp = ((tmp < t) ? tmp : t);
        }
        unsigned int t = __shfl(tmp, 31);
        is_done = (t >= M);  // done when all warps have reached s0 >= M
        s1 += wpg;
    }

    // Reduce the per-warp partials; only warp 0 lane 31 publishes the result.
    unsigned long long tot = block_buffer[ lane_id ];
    for( unsigned int i = 1; i < 32; i <<= 1 ) {
        unsigned long long t = __shfl_up( tot, i );
        tot += (( lane_id >= i ) ? t : 0);
    }
    if( warp_id == 0 && lane_id == 31 ) {
        stats->block_bin[ bid ] = tot;
        if(bid == 0) {
            // Total number of unordered pairs: N * (N - 1) / 2.
            unsigned long long count = N;
            count *= (count - 1);
            count >>= 1;
            stats->count = count;
        }
    }
}
// Crossover kernel, algorithm version 4 (unordered tag): one sequence per
// block iteration. Generates Poisson-distributed recombination events,
// buckets them into a shared hash keyed by allele location, and emits one
// 32-bit crossover mask per warp per allele chunk.
// Expects blockDim.x == 32 (full warps) and blockDim.y <= MAX_WARPS.
__global__ void crossover_kernel( StateType * states
                                , AlleleSpaceType * alleles
                                , device_free_space< IntType, unordered_tag > * free_space
                                , poisson_cdf< RealType, 32 > * pois
                                , device_sequence_space< IntType > * sequences
                                , clotho::utility::algo_version< 4 > * v ) {
    typedef StateType state_type;
    typedef AlleleSpaceType allele_space_type;
    typedef typename allele_space_type::real_type real_type;
    typedef device_sequence_space< IntType > sequence_space_type;
    typedef typename sequence_space_type::int_type int_type;
    typedef poisson_cdf< RealType, 32 > poisson_type;
    typedef xover_config< unordered_tag, 4 > xover_type;

    assert( blockDim.x == 32 && blockDim.y <= xover_type::MAX_WARPS );

    int_type bid = blockIdx.y * gridDim.x + blockIdx.x;
    int_type tid = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int tpb = (blockDim.x * blockDim.y);  // threads per block
    assert( (tpb % allele_space_type::ALIGNMENT_SIZE) == 0 );
    unsigned int bpg = (gridDim.x * gridDim.y);  // blocks per grid
    unsigned int i;

    int_type sequence_width = sequences->seq_width;
    int_type nSequences = sequences->seq_count;
    int_type * seqs = sequences->sequences;
    //int_type cap = sequences->capacity;

    real_type * allele_list = alleles->locations;
    unsigned int nAlleles = alleles->capacity;
    assert( (nAlleles % allele_space_type::ALIGNMENT_SIZE) == 0 );
    if( nAlleles == 0 ) { return; }

    __shared__ real_type s_pois_cdf[ poisson_type::MAX_K ];
    __shared__ real_type rand_pool[ allele_space_type::ALIGNMENT_SIZE ];
    __shared__ unsigned int event_hash[ allele_space_type::ALIGNMENT_SIZE];

    unsigned int max_k = pois->max_k;
    if( tid < poisson_type::MAX_K ) {
        s_pois_cdf[ tid ] = pois->_cdf[tid];
    }
    __syncthreads();

    state_type local_state = states[ bid * tpb + tid ];

    // Flags used to branchlessly select "previous" entries in the scans below.
    unsigned int nonzero_warp = (threadIdx.y != 0);
    unsigned int nonzero_thread = (tid != 0);

    int_type seq_idx = bid;
    while( seq_idx < nSequences ) {
        real_type x = curand_uniform( &local_state );  // x in (0, 1]
        // wrap around x to be in [0, 1); this way all event hash bins follow [,) pattern
        rand_pool[ tid ] = ((x >= 1.0) ? 0.0 : x);

        // Per-thread Poisson draw, then warp inclusive sum-scan of the counts.
        x = curand_uniform( &local_state );
        int_type rand = _find_poisson_maxk32( s_pois_cdf, x, max_k );
        __syncthreads();
        for( i = 1; i < 32; i <<= 1 ) {
            unsigned int r = __shfl_up( rand, i );
            rand += ( (threadIdx.x >= i ) * r );
        }
        // Lane 31 publishes each warp's total event count.
        if( threadIdx.x == 31 ) {
            event_hash[ threadIdx.y ] = rand;
        }
        __syncthreads();

        // Scan the per-warp totals (zero-padded past blockDim.y) so lane 31
        // of the scan holds the block-wide event total.
        unsigned int _sum = event_hash[ threadIdx.x ];
        _sum *= (threadIdx.x < blockDim.y);
        __syncthreads();
        for( i = 1; i < 32; i <<= 1 ) {
            unsigned int s = __shfl_up( _sum, i );
            _sum += (( threadIdx.x >= i ) * s);
        }
        unsigned int s = __shfl( _sum, 31 );
        // assert( max_events < allele_space_type::ALIGNMENT_SIZE );

        if( s == 0 ) {  // true for all threads in block assuming 1 block per sequence
            // if there are no events for this sequence, then simply clear the memory
            unsigned int seq_start = seq_idx * sequence_width + tid;
            unsigned int seq_end = (seq_idx + 1) * sequence_width;
            while( seq_start < seq_end ) {
                seqs[ seq_start ] = 0;
                seq_start += tpb;
            }
            __syncthreads();
        } else {
            // Offset each warp's scan by the preceding warps' totals to get a
            // block-wide inclusive scan, then publish it as the event hash.
            s = __shfl( _sum, threadIdx.y - nonzero_warp);
            s *= nonzero_warp;
            __syncthreads();
            rand += s;
            event_hash[tid] = rand;
            __syncthreads();

            i = event_hash[ tid - nonzero_thread];  // minimum event index
            i *= nonzero_thread;
            __syncthreads();

            // BEGIN divergent code
            // Stratify this thread's slice of the random pool into its bin:
            // bin t maps to [t/ALIGNMENT_SIZE, (t+1)/ALIGNMENT_SIZE).
            while (i < rand) {
                x = rand_pool[ i ];
                rand_pool[ i++ ] = (((real_type) tid) + x) / ((real_type) allele_space_type::ALIGNMENT_SIZE);
            }
            __syncthreads();
            // END divergent code

            unsigned int seq_offset = seq_idx * sequence_width + threadIdx.y;  // every thread in a warp has the same block offset
            i = tid;
            while( i < nAlleles ) {
                x = allele_list[ i ];
                // Map the allele location to its event-hash bin.
                rand = (unsigned int) ( x * ((real_type)allele_space_type::ALIGNMENT_SIZE));
                unsigned int nonzero_bin = (rand != 0);
                // _keep == 0 -> rand == 0 -> e_min == 0; _keep == 1 -> rand != 0 -> e_min == event_hash[ rand - 1]
                // each thread reads hash (and random pool) relative to their
                // local allele (x)
                // this code will result in bank conflicts
                //
                // initial performance results suggest that this is
                // an acceptable overhead as overall runtime of simulation
                // loop is minimized when this algorithm is used
                unsigned int e_max = event_hash[ rand ];
                unsigned int e_min = event_hash[ rand - nonzero_bin ];
                e_min *= nonzero_bin;
                int_type cmask = e_min;
                // BEGIN divergent code
                // Count events in this bin that precede the allele location;
                // parity of the count decides the crossover bit.
                while( e_min < e_max ) {
                    real_type y = rand_pool[ e_min++ ];
                    cmask += (y < x);
                }
                __syncthreads();
                // END divergent code

                // Pack one parity bit per lane, then OR-scan so lane 31 holds
                // the full 32-bit mask for this allele chunk.
                cmask = ((cmask & 1) << threadIdx.x);
                for( unsigned int j = 1; j < 32; j <<= 1 ) {
                    int_type _c = __shfl_up( cmask, j);
                    cmask |= ((threadIdx.x >= j) * _c);
                }
                if( threadIdx.x == 31 ) {
                    // assert( seq_offset < cap );
                    seqs[ seq_offset ] = cmask;
                }
                __syncthreads();
                i += tpb;
                seq_offset += blockDim.y;  // wpb
            }
            __syncthreads();
        }
        seq_idx += bpg;
    }

    // Persist the advanced RNG state for subsequent launches.
    states[ bid * tpb + tid ] = local_state;
}
// Crossover kernel, algorithm version 3 (unordered tag): one sequence per
// warp. Each warp draws its recombination event count from the Poisson CDF,
// fills a shared random pool, and builds one 32-bit crossover mask per
// 32-allele chunk.
// Expects blockDim.x == 32 (full warps) and blockDim.y <= MAX_WARPS.
__global__ void crossover_kernel( StateType * states
                                , AlleleSpaceType * alleles
                                , device_free_space< IntType, unordered_tag > * free_space
                                , poisson_cdf< RealType, 32 > * pois
                                , device_sequence_space< IntType > * sequences
                                , clotho::utility::algo_version< 3 > * v ) {
    typedef StateType state_type;
    typedef AlleleSpaceType allele_space_type;
    typedef typename allele_space_type::real_type real_type;
    typedef device_sequence_space< IntType > sequence_space_type;
    typedef typename sequence_space_type::int_type int_type;
    typedef xover_config< unordered_tag, 3 > xover_type;
    typedef poisson_cdf< RealType, 32 > poisson_type;

    assert( blockDim.y <= xover_type::MAX_WARPS && blockDim.x == 32);  // 8 or fewer full warps

    unsigned int nAlleles = alleles->capacity;
    if( nAlleles == 0 ) { return; }
    assert( (nAlleles & 31) == 0 );  // nAlleles == 32 * m

    const unsigned int MAX_EVENTS_PER_WARP = 128;  // maximum number of recombination events per sequence
    __shared__ real_type rand_pool[ MAX_EVENTS_PER_WARP * xover_type::MAX_WARPS ];  // 16 warps/block (arbitrary number); 512 random numbers
    __shared__ real_type s_pois_cdf[ poisson_type::MAX_K ];  // 4 * 32 == 128

    unsigned int max_k = pois->max_k;
    if( threadIdx.y == 0 ) {  // use first warp to read poisson CDF
        s_pois_cdf[ threadIdx.x ] = pois->_cdf[threadIdx.x];
    }
    __syncthreads();

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int tpb = (blockDim.x * blockDim.y);  // threads per block
    unsigned int bpg = (gridDim.x * gridDim.y);    // blocks per grid
    unsigned int spg = bpg * blockDim.y;           // sequences (warps) per grid

    int_type seq_width = sequences->seq_width;
    int_type nSequences = sequences->seq_count;
    int_type * seqs = sequences->sequences;
    real_type * allele_list = alleles->locations;

    unsigned int seq_idx = bid * blockDim.y + threadIdx.y;
    state_type local_state = states[ seq_idx * 32 + threadIdx.x ];  // every thread/warp uses the SAME random state

    // Round sequence count up to a multiple of blockDim.y so all warps run
    // the same number of loop iterations (out-of-range warps skip the write).
    unsigned int max_seqs = nSequences / blockDim.y;
    max_seqs += (( nSequences % blockDim.y) ? 1 : 0);
    max_seqs *= blockDim.y;

    while( seq_idx < max_seqs ) {
        // Per-lane Poisson draw; warp inclusive sum-scan yields the warp's
        // total event count in lane 31.
        real_type x = curand_uniform( &local_state );
        unsigned int rand = _find_poisson_maxk32( s_pois_cdf, x, max_k );
        __syncthreads();
        for(unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int t = __shfl_up( rand, i );
            rand += ((threadIdx.x >= i) * t);
        }
        unsigned int max_events = __shfl( rand, 31 );

        // fill random pool
        //
        if( max_events >= MAX_EVENTS_PER_WARP ) {
            if( threadIdx.x == 0 ) {
                printf( "Too many events to generate: %d\n", max_events );
            }
            assert( max_events < MAX_EVENTS_PER_WARP );
        }
        __syncthreads();

        // Each thread contributes 4 uniforms to the shared pool.
        rand_pool[ tid ] = curand_uniform( &local_state );
        rand_pool[ tid + tpb ] = curand_uniform( &local_state );
        rand_pool[ tid + 2 * tpb ] = curand_uniform( &local_state );
        rand_pool[ tid + 3 * tpb ] = curand_uniform( &local_state );
        __syncthreads();

        unsigned int seq_offset = seq_idx * seq_width;
        unsigned int a_idx = threadIdx.x;
        while( a_idx < nAlleles ) {
            real_type loc = allele_list[ a_idx ];
            // Count events below this allele's location; parity becomes the
            // crossover bit.
            unsigned int s = 0, mask = 0;
            while( s < max_events ) {  // warps within a block diverge. threads within a warp do not
                real_type y = rand_pool[ s++ ];  // every thread within a warp read/loads the same value (sequence offset)
                mask += ( y < loc );
            }
            __syncthreads();

            // Pack one parity bit per lane; OR-scan so lane 31 holds the mask.
            mask = ((mask & 1) << threadIdx.x);
            for( unsigned int i = 1; i < 32; i <<= 1 ) {
                unsigned int e = __shfl_up( mask, i );
                mask |= ((threadIdx.x >= i) * e);
            }
            if( threadIdx.x == 31 && seq_idx < nSequences ) {
                seqs[ seq_offset ] = mask;
            }
            __syncthreads();
            a_idx += 32;
            ++seq_offset;
        }
        seq_idx += spg;
    }

    seq_idx = bid * blockDim.y + threadIdx.y;  // reset seq_idx
    states[ seq_idx * 32 + threadIdx.x ] = local_state;
}
// Warp shuffle for float (Kokkos style): reads `val` from lane `srcLane`
// within a sub-warp of `width` lanes via the native float __shfl overload.
KOKKOS_INLINE_FUNCTION
float shfl(const float &val, const int& srcLane, const int& width )
{
  const float result = __shfl(val, srcLane, width);
  return result;
}