KOKKOS_INLINE_FUNCTION
 double shfl_up(const double &val, const int& delta, const int& width ) {
   int lo = __double2loint(val);
   int hi = __double2hiint(val);
   lo = __shfl_up(lo,delta,width);
   hi = __shfl_up(hi,delta,width);
   return __hiloint2double(hi,lo);
 }
 template<typename Scalar>
 KOKKOS_INLINE_FUNCTION
 Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
   int lo = __double2loint(*reinterpret_cast<const double*>(&val));
   int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
   lo = __shfl_up(lo,delta,width);
   hi = __shfl_up(hi,delta,width);
   const double tmp = __hiloint2double(hi,lo);
   return *(reinterpret_cast<const Scalar*>(&tmp));
 }
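Both overloads above split the 64-bit value into two 32-bit halves because the legacy `__shfl_up` moves one 32-bit register per call. On CUDA 9 and later the `*_sync` variants accept `double` directly, so the same wrapper collapses to a one-liner; a sketch, where the full-warp mask assumes all 32 lanes participate:

```cuda
// Sketch for CUDA 9+: __shfl_up_sync has a native double overload, so no
// hi/lo splitting is needed. The 0xffffffff mask assumes every lane of
// the warp participates in the call.
__device__ __forceinline__
double shfl_up_cuda9(double val, unsigned int delta, int width = 32) {
    return __shfl_up_sync(0xffffffffu, val, delta, width);
}
```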
template < class IntType >
__global__ void sequence_hamming_weight_kernel( device_sequence_space< IntType > * seqs, basic_data_space< unsigned int > * res, clotho::utility::algo_version< 5 > * v ) {
    assert( blockDim.x == 32 );
    assert( blockDim.y <= 32 );

    __shared__ unsigned int buffer[ 32 ];

    if( threadIdx.y == 0 ) {
        buffer[ threadIdx.x ] = 0;
    }
    __syncthreads();

    const unsigned int WIDTH = seqs->seq_width;

    IntType * seq_ptr = seqs->sequences;

    unsigned int N = 0;
    unsigned int seq_idx = blockIdx.y * gridDim.x  + blockIdx.x;
    unsigned int seq_begin = seq_idx * WIDTH + threadIdx.y;
    unsigned int seq_end = (seq_idx + 1) * WIDTH;
    while( seq_begin < seq_end ) {
        IntType x = seq_ptr[ seq_begin ];
        N += (( x >> threadIdx.x) & 1);
        seq_begin += blockDim.y;    
    }
    __syncthreads();

    for( unsigned int i = 1; i < 32; i <<= 1 ) {
        unsigned int t = __shfl_up( N, i );
        N += ((unsigned int) (threadIdx.x >= i) * t);
    }

    
    if( threadIdx.x == 31 ) {
        buffer[ threadIdx.y ] = N;
    }
    __syncthreads();

    N = buffer[ threadIdx.x ];
    __syncthreads();

    for( unsigned int i = 1; i < 32; i <<= 1 ) {
        unsigned int t = __shfl_up( N, i );
        N += ((unsigned int) (threadIdx.x >= i) * t);
    }

    if( threadIdx.y == 0 && threadIdx.x == 31 ) {
        res->data[ seq_idx ] = N;           
    }
    __syncthreads();
}
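The loop at the center of this kernel is the standard Kogge-Stone warp-inclusive prefix sum: after five rounds, lane `i` holds the sum of lanes `0..i`, and lane 31 holds the warp total. A minimal standalone sketch of the same idiom (CUDA 9+ `__shfl_up_sync` syntax; the kernel name and inputs are illustrative):

```cuda
// Minimal warp-inclusive prefix sum (Kogge-Stone). Lane i ends up with
// in[0] + ... + in[i]; lane 31 holds the warp total. Illustrative only.
__global__ void warp_scan_demo(const unsigned int *in, unsigned int *out) {
    unsigned int lane = threadIdx.x & 31;
    unsigned int v = in[threadIdx.x];

    for (unsigned int i = 1; i < 32; i <<= 1) {
        unsigned int t = __shfl_up_sync(0xffffffffu, v, i);
        if (lane >= i) v += t;   // same guard as the multiply trick above
    }

    out[threadIdx.x] = v;
}
```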
Example #4
    __device__ __forceinline__ double shfl_up(double val, unsigned int delta, int width = warpSize)
    {
    #if __CUDA_ARCH__ >= 300
        int lo = __double2loint(val);
        int hi = __double2hiint(val);

        lo = __shfl_up(lo, delta, width);
        hi = __shfl_up(hi, delta, width);

        return __hiloint2double(hi, lo);
    #else
        return 0.0;
    #endif
    }
 template<typename Scalar>
 KOKKOS_INLINE_FUNCTION
 Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
   Scalar tmp1 = val;
   float tmp = *reinterpret_cast<float*>(&tmp1);
   tmp = __shfl_up(tmp,delta,width);
   return *reinterpret_cast<Scalar*>(&tmp);
 }
template < class IntType >
__global__ void sequence_hamming_weight_kernel( device_sequence_space< IntType > * seqs, basic_data_space< unsigned int > * res, clotho::utility::algo_version< 3 > * v ) {

    typedef device_sequence_space< IntType >    space_type;
    typedef typename space_type::int_type       int_type;

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int _count = seqs->seq_count;

    if( bid >= _count ) return;

    assert( _count <= res->size );  // sanity check: enough allocated space for results

    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int tpb = blockDim.x * blockDim.y; // threads per block
    unsigned int bpg = gridDim.x * gridDim.y;   // blocks per grid
    unsigned int wpb = (tpb >> 5);
    unsigned int spg = wpb * bpg;   // sequences/grid = sequences(warps)/block * blocks/grid

    assert( (tpb & 31) == 0); // sanity check: all warps are full

    unsigned int lane_id = (tid & 31);
    unsigned int warp_id = (tid >> 5);

    unsigned int _width = seqs->seq_width;

    int_type * sptr = seqs->sequences;

    unsigned int * countptr = res->data;

    unsigned int max_seq_id = _count / wpb; // max_rounds = sequences * block/sequences 
    max_seq_id += ((_count % wpb) ? 1 : 0); // would !!(_count % spg) be more efficient?
    max_seq_id *= wpb;

    unsigned int seq_id = bid * wpb + warp_id;

    while( seq_id < max_seq_id ) {  // blocks of grid may terminate early; only block for tail may diverge
        unsigned int degree = 0;

        unsigned int end = (seq_id + 1) * _width;
        unsigned int idx = end - ((seq_id < _count) ? _width : 0);  // true for all threads in warp

        while( idx < end ) {    // all threads in a warp read same bit block; no divergence
            int_type b = sptr[ idx++ ]; // all threads in a warp read/load same bit block
            degree += ((b >> lane_id) & 1); // would !!( b & lane_mask), where (lane_mask = (1 << lane_id)), be more efficient?
        }
        __syncthreads();    // sync all warps

        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int d = __shfl_up(degree, i );
            degree += (( lane_id >= i ) * d);
        }

        if( lane_id == 31 && (seq_id < _count) ) {
            countptr[ seq_id ] = degree;
        }
        __syncthreads();

        seq_id += spg;
    }
}
KOKKOS_INLINE_FUNCTION
unsigned int shfl_up(
    const unsigned int &val, const int& delta, const int& width) {
    unsigned int tmp1 = val;
    int tmp = *reinterpret_cast<int*>(&tmp1);
    tmp = __shfl_up(tmp,delta,width);
    return *reinterpret_cast<unsigned int*>(&tmp);
}
Example #8
 template <typename T>
 __device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width = warpSize)
 {
 #if __CUDA_ARCH__ >= 300
     return __shfl_up(val, delta, width);
 #else
     return T();
 #endif
 }
Example #9
 __device__ __forceinline__ unsigned int shfl_up(unsigned int val, unsigned int delta, int width = warpSize)
 {
 #if __CUDA_ARCH__ >= 300
     return (unsigned int) __shfl_up((int) val, delta, width);
 #else
     return 0;
 #endif
 }
    template<typename Scalar>
    KOKKOS_INLINE_FUNCTION
    Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
      Impl::shfl_union<Scalar> s_val;
      Impl::shfl_union<Scalar> r_val;
      s_val = val;

      for(int i = 0; i<s_val.n; i++)
        r_val.fval[i] = __shfl_up(s_val.fval[i],delta,width);
      return r_val.value();
    }
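`Impl::shfl_union` is not shown in this collection. A plausible minimal sketch, assuming it simply exposes the scalar as `n` 32-bit float words so each word can be shuffled independently (the real Kokkos definition may differ):

```cuda
// Hedged sketch of a shfl_union-style helper: reinterpret a large Scalar
// as sizeof(Scalar)/4 float words, shuffle word by word, reassemble.
template <typename Scalar>
union shfl_union_sketch {
    enum { n = sizeof(Scalar) / 4 };   // number of 32-bit words
    float fval[n];
    Scalar sval;
    __device__ shfl_union_sketch() {}
    __device__ void operator=(const Scalar &s) { sval = s; }
    __device__ Scalar value() const { return sval; }
};
```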
Example #11
	__device__ void update(
		uint32_t *local_costs, uint32_t p1, uint32_t p2, uint32_t mask)
	{
		const unsigned int lane_id = threadIdx.x % SUBGROUP_SIZE;

		const auto dp0 = dp[0];
		uint32_t lazy_out = 0, local_min = 0;
		{
			const unsigned int k = 0;
#if CUDA_VERSION >= 9000
			const uint32_t prev =
				__shfl_up_sync(mask, dp[DP_BLOCK_SIZE - 1], 1);
#else
			const uint32_t prev = __shfl_up(dp[DP_BLOCK_SIZE - 1], 1);
#endif
			uint32_t out = min(dp[k] - last_min, p2);
			if(lane_id != 0){ out = min(out, prev - last_min + p1); }
			out = min(out, dp[k + 1] - last_min + p1);
			lazy_out = local_min = out + local_costs[k];
		}
		for(unsigned int k = 1; k + 1 < DP_BLOCK_SIZE; ++k){
			uint32_t out = min(dp[k] - last_min, p2);
			out = min(out, dp[k - 1] - last_min + p1);
			out = min(out, dp[k + 1] - last_min + p1);
			dp[k - 1] = lazy_out;
			lazy_out = out + local_costs[k];
			local_min = min(local_min, lazy_out);
		}
		{
			const unsigned int k = DP_BLOCK_SIZE - 1;
#if CUDA_VERSION >= 9000
			const uint32_t next = __shfl_down_sync(mask, dp0, 1);
#else
			const uint32_t next = __shfl_down(dp0, 1);
#endif
			uint32_t out = min(dp[k] - last_min, p2);
			out = min(out, dp[k - 1] - last_min + p1);
			if(lane_id + 1 != SUBGROUP_SIZE){
				out = min(out, next - last_min + p1);
			}
			dp[k - 1] = lazy_out;
			dp[k] = out + local_costs[k];
			local_min = min(local_min, dp[k]);
		}
		last_min = subgroup_min<SUBGROUP_SIZE>(local_min, mask);
	}
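`subgroup_min` is a libSGM helper defined elsewhere; the usual implementation is a butterfly reduction over the subgroup. A sketch under the assumption that `SUBGROUP_SIZE` is a power of two (CUDA 9+ syntax; the real helper may differ):

```cuda
// Hedged sketch of a subgroup-wide minimum. XOR-butterfly: after
// log2(SIZE) rounds, every participating lane holds the minimum over
// the whole subgroup.
template <unsigned int SIZE>
__device__ __forceinline__ uint32_t subgroup_min_sketch(uint32_t v, uint32_t mask) {
    for (unsigned int delta = SIZE / 2; delta > 0; delta >>= 1) {
        v = min(v, __shfl_xor_sync(mask, v, delta));
    }
    return v;
}
```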
Example #12
__global__ void finalize_pairwise_diff_stats( pairwise_diff_stats * stats ) {
    assert( blockDim.x * blockDim.y == 32 );

    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;

    unsigned long long tot = stats->block_bin[tid];
    for( unsigned int i = 1; i < 32; i <<= 1 ) {
        unsigned long long t = __shfl_up( tot, i );
        tot += ((tid >= i ) ? t : 0);
    }

    if( tid == 31 ) {
        stats->total = tot;

        double d = (double)tot;
        double c = (double) stats->count;

        d /= c;

        stats->mean = d;
    }
}
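This kernel shuffles an `unsigned long long`. Toolkits that lack a native 64-bit `__shfl_up` overload can fall back on the same hi/lo split used for `double` at the top of this page; a sketch:

```cuda
// Hedged fallback: a 64-bit shuffle assembled from two 32-bit shuffles,
// mirroring the __double2loint/__double2hiint idiom above.
__device__ __forceinline__
unsigned long long shfl_up_u64(unsigned long long val, unsigned int delta, int width = warpSize) {
    unsigned int lo = (unsigned int)(val & 0xffffffffull);
    unsigned int hi = (unsigned int)(val >> 32);
    lo = __shfl_up(lo, delta, width);
    hi = __shfl_up(hi, delta, width);
    return (((unsigned long long)hi) << 32) | lo;
}
```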
template < class IntType >
__global__ void sequence_hamming_weight_kernel( device_sequence_space< IntType > * seqs, basic_data_space< unsigned int > * res, clotho::utility::algo_version< 1 > * v ) {

    typedef device_sequence_space< IntType >    space_type;
    typedef typename space_type::int_type       int_type;

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int _count = seqs->seq_count;

    if( bid >= _count ) return;

    assert( _count <= res->size );  // sanity check: enough allocated space for results

    popcountGPU< int_type > pc;

    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int tpb = blockDim.x * blockDim.y; // threads per block
    unsigned int bpg = gridDim.x * gridDim.y;   // blocks per grid

    assert( (tpb & 31) == 0); // sanity check: all warps are full

    unsigned int lane_id = (tid & 31);
    unsigned int warp_id = (tid >> 5);

    unsigned int _width = seqs->seq_width;

    int_type * sptr = seqs->sequences;

    unsigned int * countptr = res->data;

    __shared__ unsigned int buffer[32];

    while( bid < _count ) {
        unsigned int degree = 0;

        unsigned int end = (bid + 1) * _width;
        unsigned int idx = (end - _width) + tid;

        while( idx < end ) {    // at most 1 warp diverges
            int_type b = sptr[ idx ];
            degree += pc.evalGPU( b );

            idx += tpb;
        }
        __syncthreads();

        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int d = __shfl_up(degree, i );
            degree += (( lane_id >= i ) * d);
        }

        if( tpb > 32 ) { // true for all threads in block
            if( lane_id == 31 ) {
                buffer[ warp_id ] = degree;
            }
            __syncthreads();

            degree = buffer[ lane_id ];
            __syncthreads();

            for( unsigned int i = 1; i < 32; i <<= 1 ) {
                unsigned int d = __shfl_up( degree, i );
                degree +=  ((lane_id >= i) * d);
            }
        }

        if( tid == 31 ) {
            countptr[ bid ] = degree;
        }
        __syncthreads();
        bid += bpg;
    }
}
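`popcountGPU` is a clotho functor defined elsewhere. On CUDA hardware its `evalGPU` can simply dispatch to the population-count intrinsics; a sketch, assuming only 32- and 64-bit specializations are needed (the real clotho type may differ):

```cuda
// Hedged sketch of a popcount functor: count set bits with the hardware
// __popc / __popcll intrinsics.
template <typename T> struct popcount_gpu_sketch;

template <> struct popcount_gpu_sketch<unsigned int> {
    __device__ unsigned int evalGPU(unsigned int x) const { return __popc(x); }
};

template <> struct popcount_gpu_sketch<unsigned long long> {
    __device__ unsigned int evalGPU(unsigned long long x) const { return __popcll(x); }
};
```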
template < class StateType, class AlleleSpaceType, class IntType, class RealType >   // parameter order assumed
__global__ void crossover_kernel( StateType * states
                                , AlleleSpaceType * alleles
                                , device_free_space< IntType, unordered_tag > * free_space
                                , poisson_cdf< RealType, 32 > * pois
                                , device_sequence_space< IntType > * sequences
                                , clotho::utility::algo_version< 2 > * v ) {

    typedef StateType  state_type;
    typedef AlleleSpaceType allele_space_type;
    typedef typename allele_space_type::real_type   real_type;

    typedef device_sequence_space< IntType >            sequence_space_type;
    typedef typename sequence_space_type::int_type      int_type;

    typedef poisson_cdf< RealType, 32 > poisson_type;

    unsigned int  nAlleles = alleles->capacity;
    if( nAlleles == 0 ) { return; }

    assert( (nAlleles & 31) == 0 ); // multiple of 32 alleles

    const unsigned int MAX_EVENTS_PER_WARP = 64;    // maximum number of recombination events per sequence
    const unsigned int MAX_WARP_PER_BLOCK = 32;
    const unsigned int HASH_WIDTH = 32;

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int lane_id = (tid & 31);
    unsigned int warp_id = (tid >> 5);

    unsigned int tpb = (blockDim.x * blockDim.y);
    assert( (tpb & 31) == 0 );  // all warps are full

    unsigned int bpg = (gridDim.x * gridDim.y);
    unsigned int wpb = (tpb >> 5);  // == sequences/block (spb)

    assert( wpb <= MAX_WARP_PER_BLOCK );

    unsigned int spg = bpg * wpb;
    
    int_type sequence_width = sequences->seq_width;
    int_type nSequences = sequences->seq_count;    

    int_type    * seqs = sequences->sequences;
    real_type   * allele_list = alleles->locations;

    __shared__ real_type        s_pois_cdf[ poisson_type::MAX_K ];                      // 4 * 32 == 128
    __shared__ real_type        rand_pool[ MAX_WARP_PER_BLOCK * MAX_EVENTS_PER_WARP ];  // 4 * 32 * 64 == 8192
    __shared__ unsigned int     event_hash[ MAX_WARP_PER_BLOCK * HASH_WIDTH ];          // 4 * 32 * 32 == 4096
                                                                                        //----------------------
                                                                                        // 12416 (bytes of shared memory)

    unsigned int max_k = pois->max_k;
    if( tid < poisson_type::MAX_K ) {
        s_pois_cdf[ tid ] = pois->_cdf[tid];
    }
    __syncthreads();

    state_type local_state = states[ bid * tpb + tid ]; // assumes every thread in the GRID has a state defined

    unsigned int max_seq_id = nSequences / wpb;
    max_seq_id += (( nSequences % wpb ) ? 1 : 0);
    max_seq_id *= wpb;

    int_type seq_idx = bid * wpb + warp_id;

    while( seq_idx < max_seq_id ) { // should allow blocks to terminate early

        // generate a recombination hash for each sequence (warp)
//        unsigned int psum = 0;
//        for( unsigned int w = lane_id; w < HASH_WIDTH; w += 32 ) {
            real_type x = curand_uniform( &local_state );

            unsigned int rand = _find_poisson_maxk32( s_pois_cdf, x, max_k );
            __syncthreads();

            // compute prefix sum with in each warp
            for( unsigned int i = 1; i < 32; i <<= 1 ) {
                unsigned int r = __shfl_up( rand, i );
                rand += ( (lane_id >= i ) * r );
            }

//            rand += psum;
//            event_hash[ warp_id * HASH_WIDTH + w ] = rand;   //  event_hash contain prefix sum (scan)
            event_hash[ tid ] = rand;

            unsigned int s = __shfl_up( rand, 1);   //
            s *= ( lane_id != 0 );

            s += (warp_id * MAX_EVENTS_PER_WARP);       // shift s and rand to be relative to warp
            rand += (warp_id * MAX_EVENTS_PER_WARP);

            // BEGIN divergent code
            real_type accum = 0.;
            while (s < rand) {
                x = curand_uniform( &local_state );

                accum += (log( x ) / (real_type)(rand - s));

                rand_pool[s++] = ((((real_type)tid) + (1.0 - exp(accum))) / ((real_type)32.));
            }
            __syncthreads();
            // END divergent code
            //
//            psum = __shfl( rand, 31 );
//        }

        unsigned int seq_offset = seq_idx * sequence_width;
        unsigned int a_id = lane_id;
        while( a_id < nAlleles ) {
            real_type x = allele_list[ a_id ];

            unsigned int h_idx = (unsigned int) ( x * ((real_type) HASH_WIDTH ));  // map allele location to bin index

            unsigned int s = event_hash[ h_idx++ ];
            unsigned int e = event_hash[ h_idx ];
            __syncthreads();

            int_type cmask = s;

            s += (warp_id * MAX_EVENTS_PER_WARP);
            e += (warp_id * MAX_EVENTS_PER_WARP );

            // BEGIN divergent code
            while( s < e  ) {
                real_type y = rand_pool[ s++ ];
                cmask += (x > y);
            }
            __syncthreads();
            // END divergent code

            cmask = ((cmask & 1) << lane_id);

            // reduce cmask within each warp
            for( unsigned int i = 1; i < 32; i <<= 1 ) {
                int_type _c = __shfl_up( cmask, i );
                cmask |= ((lane_id >= i) * _c);
            }

            if( lane_id == 31 && seq_idx < nSequences ) {
                seqs[seq_offset] = cmask;
            }
            __syncthreads();

            a_id += 32;
            ++seq_offset;
        }
        __syncthreads();

        seq_idx += spg;
    }

    states[ bid * tpb + tid ] = local_state;
}
template < class StateType, class SequenceIntType, class IntType, class AlleleSpaceType, class RealType >   // parameter order assumed
__global__ void select_and_crossover_kernel( StateType * states
                                            , device_sequence_space< SequenceIntType > * parent_seqs
                                            , device_event_space< IntType, no_order_tag >  * parent_ids
                                            , AlleleSpaceType * alleles
                                            , device_free_space< SequenceIntType, unit_ordered_tag > * free_space
                                            , poisson_cdf< RealType, 32 > * pois
                                            , device_sequence_space< SequenceIntType > * offspring_seqs ) {
    typedef StateType                                       state_type;
    typedef poisson_cdf< RealType, 32 >                     poisson_type;
    typedef device_sequence_space< SequenceIntType >        sequence_space_type;
    typedef typename sequence_space_type::int_type          sequence_int_type;

    typedef device_event_space< IntType, no_order_tag >    selection_type;
    typedef typename selection_type::int_type               selection_int_type;

    typedef AlleleSpaceType                                 allele_space_type;
    typedef typename allele_space_type::real_type           real_type;

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int offspring_count = offspring_seqs->seq_count;

    if( bid >= offspring_count ) return;

    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int warp_id = (tid >> 5 );
    unsigned int lane_id = (tid & 31 );

    unsigned int tpb = (blockDim.x * blockDim.y);
    unsigned int wpb = (tpb >> 5);
    unsigned int bpg = (gridDim.x * gridDim.y);
    unsigned int par_count = parent_seqs->seq_count;

    // local aliases used below (the parent-id member name is assumed)
    unsigned int par_width = parent_seqs->seq_width;
    unsigned int off_width = offspring_seqs->seq_width;
    unsigned int par_allele_cap = alleles->capacity;
    sequence_int_type * par_seqs = parent_seqs->sequences;
    sequence_int_type * off_seqs = offspring_seqs->sequences;
    real_type * all_loc = alleles->locations;
    selection_int_type * par_ids = parent_ids->ids;

    if( par_count == 0 ) {
        if( bid == 0 && tid == 0 ) {
            unsigned int off_cap = offspring_seqs->capacity;

            memset( offspring_seqs->sequences, 0, off_cap * sizeof( sequence_int_type) );
        }

        return;
    }

    state_type local_state = states[ bid * tpb + tid ];

    __shared__ unsigned int s_event_hash[ 32 ];
    __shared__ real_type    s_rand_pool[ 1024 ];
    __shared__ real_type    s_pois_cdf[ poisson_type::MAX_K ];

    unsigned int max_k = pois->max_k;
    if( tid < poisson_type::MAX_K ) {
        s_pois_cdf[ tid ] = pois->_cdf[ tid ];
    }
    __syncthreads();

    while( bid < offspring_count ) {
        real_type r = curand_uniform( &local_state );

        unsigned int e = _find_poisson_maxk32( s_pois_cdf, r, max_k );
        __syncthreads();

        if( tid < 32 ) {
            s_event_hash[ tid ] = e;
        }
        __syncthreads();

        e = s_event_hash[ lane_id ];
        __syncthreads();
        
        unsigned int psum = e;
        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int _e = __shfl_up( e, i );
            e = ((_e > e) ? _e : e );
        }

        unsigned int max = __shfl( e, 31 );

        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int p = __shfl_up( psum, i );
            psum += (( lane_id >= i ) * p );
        }

        unsigned int base_count = __shfl_up( psum, 1 );
        if( lane_id == 0 ) base_count = 0;
        __syncthreads();

        for( unsigned int i = warp_id; i < max; i += wpb ) {    // warp divergence
            r = curand_uniform( &local_state );

            s_rand_pool[ i * 32 + lane_id ] = (((real_type) lane_id + r) / (real_type) 32);
        }
        __syncthreads();

        // set up parent and offspring indexes
        selection_int_type id = par_ids[ bid ]; // all threads in block share same parent
        id <<= 1;

        unsigned int p0 = id * par_width + warp_id;
        unsigned int o_idx = bid * off_width + warp_id;
        unsigned int a_id = tid;

        while( a_id < par_allele_cap ) {    // true for all threads
            sequence_int_type p = par_seqs[ p0 ];
            sequence_int_type q = par_seqs[ p0 + par_width ];
            real_type a = all_loc[ a_id ];  // every thread reads a location just in case

            e = (psum - base_count);
            unsigned int x = base_count;
            for( unsigned int i = 0; i < max; ++i ) {   // should not diverge
                real_type y = s_rand_pool[ i * 32 + lane_id ];
                x += ((i < e) * ( y < a));
            }
            __syncthreads();

            x = ((x & 1) << lane_id);

            for( unsigned int i = 1; i < 32; i <<= 1 ) {
                unsigned int _x = __shfl_up( x, i );
                x |= ((lane_id >= i ) * _x );
            }

            if( lane_id == 31 ) {
                off_seqs[ o_idx ] = ((p & ~x) | (q & x));
            }
            __syncthreads();

            p0 += wpb;
            o_idx += wpb;
            a_id += tpb;
        }
        bid += bpg;
    }

    bid = blockIdx.y * gridDim.x + blockIdx.x;
    states[ bid * tpb + tid ] = local_state;
}
Example #16
template < class IntType >
__global__ void pairwise_difference_kernel( device_sequence_space< IntType > * sequences, basic_data_space< unsigned int > * sub_pop, pairwise_diff_stats * stats ) {

    unsigned int tpb = (blockDim.x * blockDim.y);

    assert( tpb % 32 == 0 );    // 32 == threads per warp; all warps are full

    unsigned int wpb = (tpb >> 5);  // warp per block

    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int lane_id = (tid & 31);
    unsigned int warp_id = (tid >> 5);

    unsigned int bpg = (gridDim.x * gridDim.y); // blocks per grid
    unsigned int wpg = (wpb * bpg); // warps per grid

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;

    unsigned int R = sequences->seq_count;
    unsigned int C = sequences->seq_width;

    IntType * data = sequences->sequences;

    popcountGPU< IntType > pc;

    unsigned int N = sub_pop->size;
    unsigned int * sp = sub_pop->data;

    unsigned int column_tail = (C & 31);    // == % 32
//    unsigned int column_full_warps = (C & (~31));  // ==( / 32) * 32
    unsigned int column_full_warps = (C - column_tail);

    __shared__ unsigned long long block_buffer[ 32 ]; // 32 == max warps per block
    __shared__ unsigned int warp_index[32];

    unsigned int M = N - 1; // max index

    if( warp_id == 0 ) {
        block_buffer[ lane_id ] = 0;
        warp_index[ lane_id ] = M;
    }
    __syncthreads();

    // within each block verify that the sub-population is valid
    // NOTE consider moving this to a pre-processing step
    unsigned int s0 = tid;
    while( s0 < N ) {
        unsigned int idx = sp[s0];
        assert( idx < R );   
        s0 += tpb;
    }
    __syncthreads();


    s0 = 0;
    unsigned int s1 = bid * wpb + warp_id + 1;    // s1 = s0 + grid_warp_id + 1 =  warp_id + 1

    bool is_done = false;
    while( !is_done ) {

        while( s1 >= N  && s0 < M ) {
            ++s0;
            s1 -= (M - s0);
        }

        unsigned int s0_p = ((s0 >= M) ? M : s0);   // M is valid index in sub population list
        unsigned int s1_p = ((s1 >= M) ? M : s1);

        unsigned int s0_idx = sp[s0_p];
        unsigned int s1_idx = sp[s1_p];
        __syncthreads();

        unsigned int s0_end = s0_idx * C + column_full_warps;
        unsigned int s0_start = s0_idx * C + lane_id;

        unsigned int s1_start = s1_idx * C + lane_id;

        unsigned int tot = 0;
        while( s0_start < s0_end ) {    // all sequences same length so only need to iterate along first
            IntType s0_data = data[ s0_start ];
            IntType s1_data = data[ s1_start ];

            tot += pc.evalGPU( s0_data ^ s1_data );

            s0_start += 32;
            s1_start += 32;
        }
        __syncthreads();

        // handle tail
        if( column_tail ) { // true for all threads
            s0_end += column_tail;

            IntType s0_data = 0;
            IntType s1_data = 0;

            if( s0_start < s0_end ) {
                s0_data = data[s0_start];
                s1_data = data[s1_start];
            }
            __syncthreads();

            tot += pc.evalGPU( s0_data ^ s1_data );
        }
        __syncthreads();

        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int t = __shfl_up( tot, i );
            tot += ((lane_id >= i) * t);
        }

        if( lane_id == 31 ) {
            block_buffer[warp_id] += tot;
            warp_index[warp_id] =  s0;
        }
        __syncthreads();

        // find the minimum s0 within the block
        unsigned int tmp = warp_index[lane_id];
        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int t = __shfl_up(tmp, i);
            tmp = ((tmp < t) ? tmp : t);
        }

        unsigned int t = __shfl(tmp, 31);

        is_done = (t >= M);   // done when all warps have reach s0 >= M
        s1 += wpg;
    }

    unsigned long long tot = block_buffer[ lane_id ];
    for( unsigned int i = 1; i < 32; i <<= 1 ) {
        unsigned long long t = __shfl_up( tot, i );
        tot += (( lane_id >= i ) ? t : 0);
    }

    if( warp_id == 0 && lane_id == 31 ) {
        stats->block_bin[ bid ] = tot;

        if(bid == 0) {
            unsigned long long count = N;
            count *= (count - 1);
            count >>= 1;
            stats->count = count;
        }
    }
}
template < class AlleleSpaceType, class IntType, class RealType >   // parameter order assumed
__global__ void build_crossover_mask_kernel(  AlleleSpaceType * alleles,
    device_sequence_space< IntType > * buffer,
    device_event_pool_space< RealType, IntType > * events
    ) {

    typedef device_event_pool_space< RealType, IntType > pool_type;
    typedef IntType     int_type;   // assumed aliases; match the types used below
    typedef RealType    real_type;

    int_type bid = blockIdx.y * gridDim.x + blockIdx.x;
    int_type tid = threadIdx.y * blockDim.x + threadIdx.x;

    unsigned int tpb = (blockDim.x * blockDim.y);
    unsigned int bpg = (gridDim.x * gridDim.y);

    real_type * allele_list = alleles->locations;

    int_type * buf = buffer->buffer;
    int_type seq_width = buffer->seq_width;
    int_type nSequences = buffer->seq_count;

    unsigned int nAlleles = alleles->capacity;
    if( nAlleles == 0 ) return;

    __shared__ real_type    rand_pool[ pool_type::MAX_EVENTS_PER_OFFSET ];

    // 1 sequence per block
    while( bid < nSequences ) {
        int_type lo = events->offsets[ bid ], hi = events->offsets[ bid + 1 ];

        // true for all threads
        if( lo == hi ) {
            unsigned int id = tid;
            while( id < seq_width ) {   // clear this sequence's row
                buf[ bid * seq_width + id ] = 0;
                id += tpb;
            }
            __syncthreads();
        } else {
            // move events from global memory into shared memory
            if( lo + tid < hi ) {
                rand_pool[tid] = events->event_pool[ lo + tid ];
            }
            __syncthreads();

            hi -= lo;

            unsigned int id = tid;
            unsigned int offset = bid * seq_width + threadIdx.y;

            while( id < nAlleles ) {
                real_type pos = allele_list[ id ];

                unsigned int n = 0;
                for( unsigned int i = 0; i < hi; ++i ) {
                    n += ((rand_pool[ i ] < pos) ? 1 : 0);
                }
                __syncthreads();

                int_type mask = (1 << threadIdx.x);
                mask *= (n & 1);

                // scan right mask
                for( unsigned int i = 1; i < 32; i <<= 1 ) {
                    int_type _c = __shfl_up( mask, i );
                    mask |= ((threadIdx.x >= i ) *_c );
                }

                // last thread of each warp writes mask block to global memory
                if( threadIdx.x == 31 ) {
                    buf[ offset ] = mask;
                }
                __syncthreads();

                id += tpb;
                offset += blockDim.y;
            }
            __syncthreads();
        }

        bid += bpg;
    }
}
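Because the mask is assembled from `threadIdx.x` bit positions, this kernel implicitly requires `blockDim.x == 32`; the grid-stride loop over `bid` tolerates any grid size. A hedged host-side launch sketch (the wrapper name and device-pointer names are illustrative):

```cuda
// Illustrative launch: one 32-lane warp per mask word in x, a few warps
// in y, grid-stride over sequences, so any grid size is valid.
template < class AlleleSpaceType, class IntType, class RealType >
void launch_build_crossover_mask( AlleleSpaceType * d_alleles,
                                  device_sequence_space< IntType > * d_buffer,
                                  device_event_pool_space< RealType, IntType > * d_events ) {
    dim3 block(32, 4);   // blockDim.x must be 32 for the bit-mask math
    dim3 grid(256);      // stride loop covers any sequence count
    build_crossover_mask_kernel<<< grid, block >>>( d_alleles, d_buffer, d_events );
}
```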
 KOKKOS_INLINE_FUNCTION
 float shfl_up(const float &val, const int& delta, const int& width ) {
   return __shfl_up(val,delta,width);
 }
template < class StateType, class AlleleSpaceType, class IntType, class RealType >   // parameter order assumed
__global__ void crossover_kernel( StateType * states
                                , AlleleSpaceType * alleles
                                , device_free_space< IntType, unordered_tag > * free_space
                                , poisson_cdf< RealType, 32 > * pois
                                , device_sequence_space< IntType > * sequences
                                , clotho::utility::algo_version< 5 > * v ) {

    typedef StateType                                   state_type;
    typedef AlleleSpaceType                             allele_space_type;
    typedef typename allele_space_type::real_type       real_type;

    typedef device_sequence_space< IntType >            sequence_space_type;
    typedef typename sequence_space_type::int_type      int_type;

    typedef poisson_cdf< RealType, 32 >                 poisson_type;
    typedef xover_config< unordered_tag, 5 >            xover_type;

    assert( blockDim.x == 32 && blockDim.y <= xover_type::MAX_WARPS );

    // encourage quick return;
    unsigned int nAlleles = alleles->capacity;
    if( nAlleles == 0 ) return;

    // only considering 2d grid/block definitions
    int_type bid = blockIdx.y * gridDim.x + blockIdx.x;
    int_type tid = threadIdx.y * blockDim.x + threadIdx.x;

    unsigned int tpb = (blockDim.x * blockDim.y);
    unsigned int bpg = (gridDim.x * gridDim.y);

    real_type * allele_list = alleles->locations;

    int_type * seqs = sequences->sequences;
    int_type seq_width = sequences->seq_width;
    int_type nSequences = sequences->seq_count;

    const unsigned int MAX_EVENTS = 32;

    __shared__ int_type     event_count;
    __shared__ real_type    rand_pool[ MAX_EVENTS ];

    unsigned int seqIdx = bid;

    // 1 sequence per block
    while( seqIdx < nSequences ) {
        // use the first thread of each block to populate a pool
        // of random numbers
        if( tid == 0 ) {
            state_type local_state = states[ bid ];
            // update the event_count
            event_count = curand_poisson( &local_state, pois->lambda );

            for( unsigned int i = 0; i < event_count; ++i ) {
                rand_pool[ i ] = curand_uniform( &local_state );
            }

            states[ bid ] = local_state;
        }
        __syncthreads();

        // copy event_count from shared memory to thread local memory
        // 
        unsigned int nEvents = event_count;

        // true for all threads
        if( nEvents == 0 ) {
            unsigned int id = tid;
            while( id < seq_width ) {
                seqs[ seqIdx * seq_width + id ] = 0;
                id += tpb;
            }
        } else {
            unsigned int id = tid;
            unsigned int offset = seqIdx * seq_width + threadIdx.y;

            while( id < nAlleles ) {
                real_type pos = allele_list[ id ];

                unsigned int n = 0;
                for( unsigned int i = 0; i < nEvents; ++i ) {
                    n += ((rand_pool[ i ] < pos) ? 1 : 0);
                }
                __syncthreads();

                int_type mask = (1 << threadIdx.x);
                mask *= (n & 1);

                // scan left mask
                for( unsigned int i = 1; i < 32; i <<= 1 ) {
                    int_type _c = __shfl_up( mask, i );
                    mask |= ((threadIdx.x >= i ) *_c );
                }

                if( threadIdx.x == 31 ) {
                    seqs[ offset ] = mask;
                }
                __syncthreads();

                id += tpb;
                offset += blockDim.y;
            }
        }

        seqIdx += bpg;
        __syncthreads();
    }
}
template < class StateType, class AlleleSpaceType, class IntType, class RealType >   // parameter order assumed
__global__ void crossover_kernel( StateType * states
                                , AlleleSpaceType * alleles
                                , device_free_space< IntType, unordered_tag > * free_space
                                , poisson_cdf< RealType, 32 > * pois
                                , device_sequence_space< IntType > * sequences
                                , clotho::utility::algo_version< 4 > * v ) {

    typedef StateType                                   state_type;
    typedef AlleleSpaceType                             allele_space_type;
    typedef typename allele_space_type::real_type       real_type;

    typedef device_sequence_space< IntType >            sequence_space_type;
    typedef typename sequence_space_type::int_type      int_type;

    typedef poisson_cdf< RealType, 32 >                 poisson_type;
    typedef xover_config< unordered_tag, 4 >            xover_type;

    assert( blockDim.x == 32 && blockDim.y <= xover_type::MAX_WARPS );

    int_type bid = blockIdx.y * gridDim.x + blockIdx.x;
    int_type tid = threadIdx.y * blockDim.x + threadIdx.x;

    unsigned int tpb = (blockDim.x * blockDim.y);
    assert( (tpb % allele_space_type::ALIGNMENT_SIZE) == 0 );

    unsigned int bpg = (gridDim.x * gridDim.y);

    unsigned int i;
    int_type sequence_width = sequences->seq_width;
    int_type nSequences = sequences->seq_count;    

    int_type    * seqs = sequences->sequences;

    //int_type  cap = sequences->capacity;

    real_type       * allele_list = alleles->locations;
    unsigned int    nAlleles = alleles->capacity;

    assert( (nAlleles % allele_space_type::ALIGNMENT_SIZE) == 0 );

    if( nAlleles == 0 ) { return; }

    __shared__ real_type        s_pois_cdf[ poisson_type::MAX_K ];
    __shared__ real_type        rand_pool[ allele_space_type::ALIGNMENT_SIZE ];
    __shared__ unsigned int     event_hash[ allele_space_type::ALIGNMENT_SIZE];

    unsigned int max_k = pois->max_k;
    if( tid < poisson_type::MAX_K ) {
        s_pois_cdf[ tid ] = pois->_cdf[tid];
    }
    __syncthreads();

    state_type local_state = states[ bid * tpb + tid ];

    unsigned int nonzero_warp = (threadIdx.y != 0);
    unsigned int nonzero_thread = (tid != 0);
    int_type seq_idx = bid;
    while( seq_idx < nSequences ) {
        real_type x = curand_uniform( &local_state );   // x in (0, 1]
        rand_pool[ tid ] = ((x >= 1.0) ? 0.0 : x);  // wrap around x to be in [0, 1); this way all event hash bins follow [,) pattern

        x = curand_uniform( &local_state );

        int_type rand = _find_poisson_maxk32( s_pois_cdf, x, max_k );
        __syncthreads();

        for( i = 1; i < 32; i <<= 1 ) {
            unsigned int r = __shfl_up( rand, i );
            rand += ( (threadIdx.x >= i ) * r );
        }

        if( threadIdx.x == 31 ) {
            event_hash[ threadIdx.y ] = rand;
        }
        __syncthreads();

        unsigned int _sum = event_hash[ threadIdx.x ];
        _sum *= (threadIdx.x < blockDim.y);
        __syncthreads();

        for( i = 1; i < 32; i <<= 1 ) {
            unsigned int s = __shfl_up( _sum, i );
            _sum += (( threadIdx.x >= i ) * s);
        }

        unsigned int s = __shfl( _sum, 31 );
//        assert( max_events < allele_space_type::ALIGNMENT_SIZE );
//
        if( s == 0 ) { // true for all threads in block assuming 1 block per sequence
            // if there are no events for this sequence, then simply clear the memory
            unsigned int seq_start = seq_idx * sequence_width + tid;
            unsigned int seq_end = (seq_idx + 1) * sequence_width;
            while( seq_start < seq_end ) {
                seqs[ seq_start ] = 0;
                seq_start += tpb;
            }
            __syncthreads();
        } else {

            s = __shfl( _sum, threadIdx.y - nonzero_warp);
            s *= nonzero_warp;
            __syncthreads();

            rand += s;
            event_hash[tid] = rand;
            __syncthreads();

            i = event_hash[ tid - nonzero_thread];    // minimum event index
            i *= nonzero_thread;
            __syncthreads();

            // BEGIN divergent code
            while (i < rand) {
                x = rand_pool[ i ];
                rand_pool[ i++ ] = (((real_type) tid) + x) / ((real_type) allele_space_type::ALIGNMENT_SIZE);
            }
            __syncthreads();
            // END divergent code

            unsigned int seq_offset = seq_idx * sequence_width + threadIdx.y;   // every thread in a warp has the same block offset
            i = tid;
            while( i < nAlleles ) {
                x = allele_list[ i ];

                rand = (unsigned int) ( x * ((real_type)allele_space_type::ALIGNMENT_SIZE));

                unsigned int nonzero_bin = (rand != 0);  // _keep == 0 -> rand == 0 -> e_min == 0; _keep == 1 -> rand != 0 -> e_min == event_hash[ rand - 1]

                // each thread reads hash (and random pool) relative to their 
                // local allele (x)
                // this code will result in bank conflicts
                //
                // initial performance results suggest that this is
                // an acceptable overhead as overall runtime of simulation
                // loop is minimized when this algorithm is used
                unsigned int e_max = event_hash[ rand ];
                unsigned int e_min = event_hash[ rand - nonzero_bin ];
                e_min *= nonzero_bin;

                int_type cmask = e_min;

                // BEGIN divergent code
                while( e_min < e_max ) {
                    real_type y = rand_pool[ e_min++ ];
                    cmask += (y < x);
                }
                __syncthreads();
                // END divergent code

                cmask = ((cmask & 1) << threadIdx.x);
                
                for( unsigned int j = 1; j < 32; j <<= 1 ) {
                    int_type _c = __shfl_up( cmask, j);
                    cmask |= ((threadIdx.x >= j) * _c);
                }
                if( threadIdx.x == 31 ) {
//                    assert( seq_offset < cap );
                    seqs[ seq_offset ] = cmask;
                }
                __syncthreads();

                i += tpb;
                seq_offset += blockDim.y;   // wpb
            }
            __syncthreads();

        }
        seq_idx += bpg;
    }

    states[ bid * tpb + tid ] = local_state;
}
template < class StateType, class AlleleSpaceType, class IntType, class RealType >   // parameter order assumed
__global__ void crossover_kernel( StateType * states
                                , AlleleSpaceType * alleles
                                , device_free_space< IntType, unordered_tag > * free_space
                                , poisson_cdf< RealType, 32 > * pois
                                , device_sequence_space< IntType > * sequences
                                , clotho::utility::algo_version< 3 > * v ) {

    typedef StateType                                   state_type;
    typedef AlleleSpaceType                             allele_space_type;
    typedef typename allele_space_type::real_type       real_type;

    typedef device_sequence_space< IntType >            sequence_space_type;
    typedef typename sequence_space_type::int_type      int_type;
    typedef xover_config< unordered_tag, 3 >            xover_type;

    typedef poisson_cdf< RealType, 32 > poisson_type;

    assert( blockDim.y <= xover_type::MAX_WARPS && blockDim.x == 32);   // 8 or fewer full warps

    unsigned int  nAlleles = alleles->capacity;
    if( nAlleles == 0 ) { return; }

    assert( (nAlleles & 31) == 0 ); // nAlleles == 32 * m

    const unsigned int MAX_EVENTS_PER_WARP = 128;    // maximum number of recombination events per sequence
    __shared__ real_type rand_pool[ MAX_EVENTS_PER_WARP * xover_type::MAX_WARPS ];  // 16 warps/block (arbitrary number); 512 random numbers
    __shared__ real_type        s_pois_cdf[ poisson_type::MAX_K ];                      // 4 * 32 == 128

    unsigned int max_k = pois->max_k;

    if( threadIdx.y == 0 ) {    // use first warp to read poisson CDF
        s_pois_cdf[ threadIdx.x ] = pois->_cdf[threadIdx.x];
    }
    __syncthreads();

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;

    unsigned int tpb = (blockDim.x * blockDim.y);
    unsigned int bpg = (gridDim.x * gridDim.y);

    unsigned int spg = bpg * blockDim.y;
    
    int_type seq_width = sequences->seq_width;
    int_type nSequences = sequences->seq_count;    

    int_type    * seqs = sequences->sequences;
    real_type   * allele_list = alleles->locations;

    unsigned int seq_idx = bid * blockDim.y + threadIdx.y;
    state_type local_state = states[ seq_idx * 32 + threadIdx.x ]; // every thread/warp uses the SAME random state

    unsigned int max_seqs = nSequences / blockDim.y;
    max_seqs += (( nSequences % blockDim.y) ? 1 : 0);
    max_seqs *= blockDim.y;

    while( seq_idx < max_seqs ) {
        real_type x = curand_uniform( &local_state );
        unsigned int rand = _find_poisson_maxk32( s_pois_cdf, x, max_k );
        __syncthreads();

        for(unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int t = __shfl_up( rand, i );
            rand += ((threadIdx.x >= i) * t);
        }

        unsigned int max_events = __shfl( rand, 31 );
        // fill random pool
        //
        if( max_events >= MAX_EVENTS_PER_WARP ) {
            if( threadIdx.x == 0 ) {
                printf( "Too many events to generate: %d\n", max_events );
            }
            assert( max_events < MAX_EVENTS_PER_WARP );
        }
        __syncthreads();

        rand_pool[ tid ] = curand_uniform( &local_state );
        rand_pool[ tid + tpb ] = curand_uniform( &local_state );
        rand_pool[ tid + 2 * tpb ] = curand_uniform( &local_state );
        rand_pool[ tid + 3 * tpb ] = curand_uniform( &local_state );
        __syncthreads();

        unsigned int seq_offset = seq_idx * seq_width;
        unsigned int a_idx = threadIdx.x;
        while( a_idx < nAlleles ) {
            real_type loc = allele_list[ a_idx ];

            unsigned int s = 0, mask = 0;
            while( s < max_events ) {   // warps within a block diverge. threads within a warp do not
                real_type y = rand_pool[ s++ ]; // every thread within a warp read/loads the same value (sequence offset)
                mask += ( y < loc );
            }
            __syncthreads();

            mask = ((mask & 1) << threadIdx.x);

            for( unsigned int i = 1; i < 32; i <<= 1 ) {
                unsigned int e = __shfl_up( mask, i );
                mask |= ((threadIdx.x >= i) * e);
            }

            if( threadIdx.x == 31 && seq_idx < nSequences ) {
                seqs[ seq_offset ] = mask;
            }
            __syncthreads();
            a_idx += 32;
            ++seq_offset;
        }
        seq_idx += spg;
    }

    seq_idx = bid * blockDim.y + threadIdx.y;  // reset seq_idx
    states[ seq_idx * 32 + threadIdx.x ] = local_state;
}