Example no. 1
0
    // Warp shuffle for a 64-bit double: the (legacy, pre-Volta) __shfl
    // intrinsic only moves 32-bit values, so the double is split into its
    // two 32-bit halves, each half is shuffled independently from srcLane
    // within a sub-warp of `width` lanes, and the halves are reassembled.
    // Below SM30 the shuffle instruction does not exist; returns 0.0 there.
    __device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize)
    {
    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
        int highBits = __double2hiint(val);
        int lowBits  = __double2loint(val);

        highBits = __shfl(highBits, srcLane, width);
        lowBits  = __shfl(lowBits,  srcLane, width);

        return __hiloint2double(highBits, lowBits);
    #else
        return 0.0;
    #endif
    }
Example no. 2
0
 // Warp shuffle for an unsigned 32-bit integer.  The legacy __shfl intrinsic
 // is only overloaded for signed int, so the value is reinterpreted as signed
 // for the shuffle and cast back afterwards (bit pattern is preserved).
 // Below SM30 there is no shuffle instruction; returns 0 there.
 __device__ __forceinline__ unsigned int shfl(unsigned int val, int srcLane, int width = warpSize)
 {
 #if __CUDA_ARCH__ >= 300
     int as_signed = static_cast<int>(val);
     return static_cast<unsigned int>(__shfl(as_signed, srcLane, width));
 #else
     return 0;
 #endif
 }
Example no. 3
0
 // Generic warp-shuffle wrapper: reads `val` from lane `srcLane` within a
 // sub-warp of `width` lanes.  `T` is presumably declared by an enclosing
 // `template< typename T >` line not visible in this snippet -- confirm.
 // NOTE(review): the legacy __shfl overloads only support int/float-sized
 // operands; this forwarding call compiles only for such T.
 __device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)
 {
 #if __CUDA_ARCH__ >= 300
     return __shfl(val, srcLane, width);
 #else
     // No shuffle instruction below SM30: return a default-constructed T.
     return T();
 #endif
 }
 // Warp shuffle for any 4-byte Scalar (selected via enable_if on sizeof):
 // the Scalar's bits are type-punned through a float so the legacy __shfl
 // float overload can move them, then punned back.
 // NOTE(review): the reinterpret_cast round-trip relies on the compiler
 // tolerating the aliasing (standard Kokkos practice for this idiom);
 // `tmp1` is a mutable copy because `val` is const.
 KOKKOS_INLINE_FUNCTION
 Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type& width
     ) {
   Scalar tmp1 = val;
   float tmp = *reinterpret_cast<float*>(&tmp1);
   tmp = __shfl(tmp,srcLane,width);
   return *reinterpret_cast<Scalar*>(&tmp);
 }
    // Warp shuffle for Scalars larger than 8 bytes (selected via enable_if):
    // the value is staged through Impl::shfl_union, which exposes it as
    // `n` float words (fval[]); each word is shuffled independently and the
    // Scalar is reassembled on the receiving lane.
    // NOTE(review): shfl_union's layout/`n`/`value()` semantics are defined
    // elsewhere in Kokkos -- not visible here.
    KOKKOS_INLINE_FUNCTION
    Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) > 8) ,int>::type& width) {
      Impl::shfl_union<Scalar> s_val;
      Impl::shfl_union<Scalar> r_val;
      s_val = val;

      // Shuffle the payload one 32-bit word at a time.
      for(int i = 0; i<s_val.n; i++)
        r_val.fval[i] = __shfl(s_val.fval[i],srcLane,width);
      return r_val.value();
    }
// Selection + crossover kernel: each block produces one offspring sequence
// per iteration (blocks stride over offspring by blocks-per-grid).  Per
// iteration it draws Poisson-distributed recombination event counts per
// warp lane, prefix-sums them into per-lane event ranges, fills a shared
// pool of sorted-by-construction random crossover points, then walks the
// parent's allele columns producing a crossover bit mask per 32-allele word.
//
// Expected launch shape (inferred from the indexing): blockDim.x * blockDim.y
// is a multiple of 32; shared pools assume at most 1024 random draws and 32
// event-hash slots.
//
// NOTE(review): `par_ids`, `par_seqs`, `par_width`, `off_width`,
// `par_allele_cap`, `all_loc` and `off_seqs` are not defined in this
// snippet -- they appear to come from setup lines elided from the example
// (presumably derived from parent_ids/parent_seqs/alleles/offspring_seqs).
__global__ void select_and_crossover_kernel( StateType * states
                                            , device_sequence_space< SequenceIntType > * parent_seqs
                                            , device_event_space< IntType, no_order_tag >  * parent_ids
                                            , AlleleSpaceType * alleles
                                            , device_free_space< SequenceIntType, unit_ordered_tag > * free_space
                                            , poisson_cdf< RealType, 32 > * pois
                                            , device_sequence_space< SequenceIntType > * offspring_seqs ) {
    typedef StateType                                       state_type;
    typedef poisson_cdf< RealType, 32 >                     poisson_type;
    typedef device_sequence_space< SequenceIntType >        sequence_space_type;
    typedef typename sequence_space_type::int_type          sequence_int_type;

    typedef device_event_space< IntType, no_order_tag >    selection_type;
    typedef typename selection_type::int_type               selection_int_type;

    typedef AlleleSpaceType                                 allele_space_type;
    typedef typename allele_space_type::real_type           real_type;

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int offspring_count = offspring_seqs->seq_count;

    // Whole-block early exit (bid is uniform across the block, so this does
    // not split any __syncthreads below).
    if( bid >= offspring_count ) return;

    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int warp_id = (tid >> 5 );
    unsigned int lane_id = (tid & 31 );

    unsigned int tpb = (blockDim.x * blockDim.y);   // threads per block
    unsigned int wpb = (tpb >> 5);                  // warps per block
    unsigned int bpg = (gridDim.x * gridDim.y);     // blocks per grid
    unsigned int par_count = parent_seqs->seq_count;

    if( par_count == 0 ) {
        // No parents: zero the offspring buffer (single thread; device-side
        // memset) and bail out for the whole grid.
        if( bid == 0 && tid == 0 ) {
            unsigned int off_cap = offspring_seqs->capacity;

            memset( offspring_seqs->sequences, 0, off_cap * sizeof( sequence_int_type) );
        }

        return;
    }

    state_type local_state = states[ bid * tpb + tid ];

    __shared__ unsigned int s_event_hash[ 32 ];
    __shared__ real_type    s_rand_pool[ 1024 ];
    __shared__ real_type    s_pois_cdf[ poisson_type::MAX_K ];

    // Stage the Poisson CDF into shared memory (one element per thread).
    unsigned int max_k = pois->max_k;
    if( tid < poisson_type::MAX_K ) {
        s_pois_cdf[ tid ] = pois->_cdf[ tid ];
    }
    __syncthreads();

    while( bid < offspring_count ) {
        // Draw a per-thread Poisson event count via inverse-CDF lookup.
        real_type r = curand_uniform( &local_state );

        unsigned int e = _find_poisson_maxk32( s_pois_cdf, r, max_k );
        __syncthreads();

        // Broadcast the first warp's 32 event counts to every warp.
        if( tid < 32 ) {
            s_event_hash[ tid ] = e;
        }
        __syncthreads();

        e = s_event_hash[ lane_id ];
        __syncthreads();

        // Warp max-scan of e (branchless shuffle ladder); lane 31 ends up
        // holding the maximum event count across the 32 lanes.
        unsigned int psum = e;
        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int _e = __shfl_up( e, i );
            e = ((_e > e) ? _e : e );
        }

        unsigned int max = __shfl( e, 31 );

        // Inclusive prefix sum of the per-lane event counts; the
        // (lane_id >= i) multiplier masks out lanes that received their own
        // value back from __shfl_up.
        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int p = __shfl_up( psum, i );
            psum += (( lane_id >= i ) * p );
        }

        // Exclusive prefix sum: each lane's starting event index.
        unsigned int base_count = __shfl_up( psum, 1 );
        if( lane_id == 0 ) base_count = 0;
        __syncthreads();

        // Fill the crossover-point pool; (lane_id + r)/32 stratifies the
        // draws so pool rows are sorted by lane.
        // NOTE(review): rows written are i in [0, max); s_rand_pool holds
        // 1024 reals = 32 rows -- assumes max <= 32; confirm upstream bound.
        for( unsigned int i = warp_id; i < max; i += wpb ) {    // warp divergence
            r = curand_uniform( &local_state );

            s_rand_pool[ i * 32 + lane_id ] = (((real_type) lane_id + r) / (real_type) 32);
        }
        __syncthreads();

        // set up parent and offspring indexes
        selection_int_type id = par_ids[ bid ]; // all threads in block share same parent
        id <<= 1;   // two sequences (haplotypes) per parent

        unsigned int p0 = id * par_width + warp_id;
        unsigned int o_idx = bid * off_width + warp_id;
        unsigned int a_id = tid;

        while( a_id < par_allele_cap ) {    // true for all threads
            sequence_int_type p = par_seqs[ p0 ];
            sequence_int_type q = par_seqs[ p0 + par_width ];
            real_type a = all_loc[ a_id ];  // every thread reads a location just in case

            // Count crossover points below this allele's location within
            // this lane's event range [base_count, psum).
            e = (psum - base_count);
            unsigned int x = base_count;
            for( unsigned int i = 0; i < max; ++i ) {   // should not diverge
                real_type y = s_rand_pool[ i * 32 + lane_id ];
                x += ((i < e) * ( y < a));
            }
            __syncthreads();

            // Odd count => take the other haplotype at this allele.
            x = ((x & 1) << lane_id);

            // OR-scan gathers the per-lane bits into a running 32-bit mask.
            for( unsigned int i = 1; i < 32; i <<= 1 ) {
                unsigned int _x = __shfl_up( x, i );
                x |= ((lane_id >= i ) * _x );
            }

            if( lane_id == 31 ) {
                off_seqs[ o_idx ] = ((p & ~x) | (q & x));
            }
            __syncthreads();

            p0 += wpb;
            o_idx += wpb;
            a_id += tpb;
        }
        bid += bpg;
    }

    // Persist the per-thread RNG state so a later launch continues the
    // random stream (matches the sibling crossover kernels).
    // FIX: the original line read `local_state[ bid * tpb + tid ];` -- a
    // no-op expression statement that never wrote the state back.
    bid = blockIdx.y * gridDim.x + blockIdx.x;
    states[ bid * tpb + tid ] = local_state;
}
// Computes the sum of pairwise Hamming differences (popcount of XOR) over
// all unordered pairs of sequences in a sub-population.  Warps stride over
// the linearized upper-triangular pair index space (s0, s1); per-warp
// partials are accumulated in shared memory and reduced to one value per
// block, written to stats->block_bin[bid].
// NOTE(review): this snippet appears truncated -- the closing braces of the
// final `if` and of the kernel itself are missing from the source view.
__global__ void pairwise_difference_kernel( device_sequence_space< IntType > * sequences, basic_data_space< unsigned int > * sub_pop, pairwise_diff_stats * stats ) {

    unsigned int tpb = (blockDim.x * blockDim.y);

    assert( tpb % 32 == 0 );    // 32 == threads per warp; all warps are full

    unsigned int wpb = (tpb >> 5);  // warp per block

    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int lane_id = (tid & 31);
    unsigned int warp_id = (tid >> 5);

    unsigned int bpg = (gridDim.x * gridDim.y); // blocks per grid
    unsigned int wpg = (wpb * bpg); // warps per grid

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;

    unsigned int R = sequences->seq_count;
    unsigned int C = sequences->seq_width;

    IntType * data = sequences->sequences;

    popcountGPU< IntType > pc;

    unsigned int N = sub_pop->size;
    unsigned int * sp = sub_pop->data;

    // Split each row of C words into full 32-word chunks plus a tail.
    unsigned int column_tail = (C & 31);    // == % 32
//    unsigned int column_full_warps = (C & (~31));  // ==( / 32) * 32
    unsigned int column_full_warps = (C - column_tail);

    __shared__ unsigned long long block_buffer[ 32 ]; // 32 == max warps per block
    __shared__ unsigned int warp_index[32];

    unsigned int M = N - 1; // max index

    // First warp zeroes the per-warp accumulators and seeds warp_index with
    // the terminal value M (so idle warps read as "done").
    if( warp_id == 0 ) {
        block_buffer[ lane_id ] = 0;
        warp_index[ lane_id ] = M;
    }
    __syncthreads();

    // within each block verify that the sub-population is valid
    // NOTE consider moving this to a pre-processing step
    unsigned int s0 = tid;
    while( s0 < N ) {
        unsigned int idx = sp[s0];
        assert( idx < R );   
        s0 += tpb;
    }
    __syncthreads();


    // Each warp starts at a distinct pair (s0=0, s1 = grid-warp-id + 1) and
    // advances s1 by warps-per-grid each round.
    s0 = 0;
    unsigned int s1 = bid * wpb + warp_id + 1;    // s1 = s0 + grid_warp_id + 1 =  warp_id + 1

    bool is_done = false;
    while( !is_done ) {

        // Wrap the linear pair index into the triangular (s0 < s1) space.
        while( s1 >= N  && s0 < M ) {
            ++s0;
            s1 -= (M - s0);
        }

        // Clamp exhausted warps to pair (M, M) so they keep executing the
        // same (block-uniform) barriers with a harmless self-comparison.
        unsigned int s0_p = ((s0 >= M) ? M : s0);   // M is valid index in sub population list
        unsigned int s1_p = ((s1 >= M) ? M : s1);

        unsigned int s0_idx = sp[s0_p];
        unsigned int s1_idx = sp[s1_p];
        __syncthreads();

        unsigned int s0_end = s0_idx * C + column_full_warps;
        unsigned int s0_start = s0_idx * C + lane_id;

        unsigned int s1_start = s1_idx * C + lane_id;

        // Lane-parallel XOR/popcount over full 32-word chunks.
        unsigned int tot = 0;
        while( s0_start < s0_end ) {    // all sequences same length so only need to iterate along first
            IntType s0_data = data[ s0_start ];
            IntType s1_data = data[ s1_start ];

            tot += pc.evalGPU( s0_data ^ s1_data );

            s0_start += 32;
            s1_start += 32;
        }
        __syncthreads();

        // handle tail
        if( column_tail ) { // true for all threads
            s0_end += column_tail;

            // Lanes past the tail contribute popcount(0 ^ 0) == 0.
            IntType s0_data = 0;
            IntType s1_data = 0;

            if( s0_start < s0_end ) {
                s0_data = data[s0_start];
                s1_data = data[s1_start];
            }
            __syncthreads();

            tot += pc.evalGPU( s0_data ^ s1_data );
        }
        __syncthreads();

        // Warp inclusive sum of per-lane totals (shuffle ladder; mask term
        // zeroes the self-value __shfl_up returns to low lanes).
        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int t = __shfl_up( tot, i );
            tot += ((lane_id >= i) * t);
        }

        if( lane_id == 31 ) {
            block_buffer[warp_id] += tot;
            warp_index[warp_id] =  s0;
        }
        __syncthreads();

        // find the minimum s0 within the block
        unsigned int tmp = warp_index[lane_id];
        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int t = __shfl_up(tmp, i);
            tmp = ((tmp < t) ? tmp : t);
        }

        unsigned int t = __shfl(tmp, 31);

        is_done = (t >= M);   // done when all warps have reach s0 >= M
        s1 += wpg;
    }

    // Final block reduction: sum the per-warp accumulators across lanes.
    unsigned long long tot = block_buffer[ lane_id ];
    for( unsigned int i = 1; i < 32; i <<= 1 ) {
        unsigned long long t = __shfl_up( tot, i );
        tot += (( lane_id >= i ) ? t : 0);
    }

    if( warp_id == 0 && lane_id == 31 ) {
        stats->block_bin[ bid ] = tot;

        // Block 0 also records the total pair count N*(N-1)/2.
        if(bid == 0) {
            unsigned long long count = N;
            count *= (count - 1);
            count >>= 1;
            stats->count = count;
        }
// NOTE(review): truncated here in the source -- closing braces missing.
// Crossover kernel, algorithm version 4 (unordered allele layout).
// Each block processes one sequence per iteration (striding by blocks-per-
// grid): it draws per-thread Poisson event counts, builds a block-wide
// prefix sum ("event hash") bucketed by allele bin, stratifies random
// crossover points into those buckets, then computes a crossover bit mask
// per 32-allele word of the output sequence.
// Launch contract (asserted): blockDim.x == 32, blockDim.y <= MAX_WARPS,
// threads-per-block a multiple of ALIGNMENT_SIZE.
__global__ void crossover_kernel( StateType * states
                                , AlleleSpaceType * alleles
                                , device_free_space< IntType, unordered_tag > * free_space
                                , poisson_cdf< RealType, 32 > * pois
                                , device_sequence_space< IntType > * sequences
                                , clotho::utility::algo_version< 4 > * v ) {

    typedef StateType                                   state_type;
    typedef AlleleSpaceType                             allele_space_type;
    typedef typename allele_space_type::real_type       real_type;

    typedef device_sequence_space< IntType >            sequence_space_type;
    typedef typename sequence_space_type::int_type      int_type;

    typedef poisson_cdf< RealType, 32 >                 poisson_type;
    typedef xover_config< unordered_tag, 4 >            xover_type;

    assert( blockDim.x == 32 && blockDim.y <= xover_type::MAX_WARPS );

    int_type bid = blockIdx.y * gridDim.x + blockIdx.x;
    int_type tid = threadIdx.y * blockDim.x + threadIdx.x;

    unsigned int tpb = (blockDim.x * blockDim.y);
    assert( (tpb % allele_space_type::ALIGNMENT_SIZE) == 0 );

    unsigned int bpg = (gridDim.x * gridDim.y);

    unsigned int i;
    int_type sequence_width = sequences->seq_width;
    int_type nSequences = sequences->seq_count;    

    int_type    * seqs = sequences->sequences;

    //int_type  cap = sequences->capacity;

    real_type       * allele_list = alleles->locations;
    unsigned int    nAlleles = alleles->capacity;

    assert( (nAlleles % allele_space_type::ALIGNMENT_SIZE) == 0 );

    if( nAlleles == 0 ) { return; }

    __shared__ real_type        s_pois_cdf[ poisson_type::MAX_K ];
    __shared__ real_type        rand_pool[ allele_space_type::ALIGNMENT_SIZE ];
    __shared__ unsigned int     event_hash[ allele_space_type::ALIGNMENT_SIZE];

    // Stage the Poisson CDF into shared memory.
    unsigned int max_k = pois->max_k;
    if( tid < poisson_type::MAX_K ) {
        s_pois_cdf[ tid ] = pois->_cdf[tid];
    }
    __syncthreads();

    state_type local_state = states[ bid * tpb + tid ];

    // Flags used to shift hash reads by one slot everywhere except the
    // first warp / first thread (branchless exclusive-scan indexing).
    unsigned int nonzero_warp = (threadIdx.y != 0);
    unsigned int nonzero_thread = (tid != 0);
    int_type seq_idx = bid;
    while( seq_idx < nSequences ) {
        real_type x = curand_uniform( &local_state );   // x in (0, 1]
        rand_pool[ tid ] = ((x >= 1.0) ? 0.0 : x);  // wrap around x to be in [0, 1); this way all event hash bins follow [,) pattern

        x = curand_uniform( &local_state );

        // Per-thread Poisson event count via inverse-CDF lookup.
        int_type rand = _find_poisson_maxk32( s_pois_cdf, x, max_k );
        __syncthreads();

        // Warp-level inclusive prefix sum of event counts.
        for( i = 1; i < 32; i <<= 1 ) {
            unsigned int r = __shfl_up( rand, i );
            rand += ( (threadIdx.x >= i ) * r );
        }

        // Lane 31 holds each warp's total; publish per-warp totals.
        if( threadIdx.x == 31 ) {
            event_hash[ threadIdx.y ] = rand;
        }
        __syncthreads();

        // Scan the per-warp totals (slots >= blockDim.y masked to zero).
        unsigned int _sum = event_hash[ threadIdx.x ];
        _sum *= (threadIdx.x < blockDim.y);
        __syncthreads();

        for( i = 1; i < 32; i <<= 1 ) {
            unsigned int s = __shfl_up( _sum, i );
            _sum += (( threadIdx.x >= i ) * s);
        }

        // Block-wide total event count, broadcast from lane 31.
        unsigned int s = __shfl( _sum, 31 );
//        assert( max_events < allele_space_type::ALIGNMENT_SIZE );
//
        if( s == 0 ) { // true for all threads in block assuming 1 block per sequence
            // if there are no events for this sequence, then simply clear the memory
            unsigned int seq_start = seq_idx * sequence_width + tid;
            unsigned int seq_end = (seq_idx + 1) * sequence_width;
            while( seq_start < seq_end ) {
                seqs[ seq_start ] = 0;
                seq_start += tpb;
            }
            __syncthreads();
        } else {

            // Add each warp's exclusive offset to its lanes' prefix sums,
            // yielding a block-wide inclusive scan in `rand`.
            s = __shfl( _sum, threadIdx.y - nonzero_warp);
            s *= nonzero_warp;
            __syncthreads();

            rand += s;
            event_hash[tid] = rand;
            __syncthreads();

            i = event_hash[ tid - nonzero_thread];    // minimum event index
            i *= nonzero_thread;
            __syncthreads();

            // BEGIN divergent code
            // Each thread stratifies its events' random points into its own
            // allele bin: result lies in [tid, tid+1)/ALIGNMENT_SIZE.
            while (i < rand) {
                x = rand_pool[ i ];
                rand_pool[ i++ ] = (((real_type) tid) + x) / ((real_type) allele_space_type::ALIGNMENT_SIZE);
            }
            __syncthreads();
            // END divergent code

            unsigned int seq_offset = seq_idx * sequence_width + threadIdx.y;   // every thread in a warp has the same block offset
            i = tid;
            while( i < nAlleles ) {
                x = allele_list[ i ];

                // Map the allele location to its event-hash bin.
                rand = (unsigned int) ( x * ((real_type)allele_space_type::ALIGNMENT_SIZE));

                unsigned int nonzero_bin = (rand != 0);  // _keep == 0 -> rand == 0 -> e_min == 0; _keep == 1 -> rand != 0 -> e_min == event_hash[ rand - 1]

                // each thread reads hash (and random pool) relative to their 
                // local allele (x)
                // this code will result in bank conflicts
                //
                // initial performance results suggest that this is
                // an acceptable overhead as overall runtime of simulation
                // loop is minimized when this algorithm is used
                unsigned int e_max = event_hash[ rand ];
                unsigned int e_min = event_hash[ rand - nonzero_bin ];
                e_min *= nonzero_bin;

                // Parity of (events before bin + events below x within bin)
                // decides whether this allele is crossed over.
                int_type cmask = e_min;

                // BEGIN divergent code
                while( e_min < e_max ) {
                    real_type y = rand_pool[ e_min++ ];
                    cmask += (y < x);
                }
                __syncthreads();
                // END divergent code

                cmask = ((cmask & 1) << threadIdx.x);
                
                // OR-scan collects the per-lane bits into one 32-bit word.
                for( unsigned int j = 1; j < 32; j <<= 1 ) {
                    int_type _c = __shfl_up( cmask, j);
                    cmask |= ((threadIdx.x >= j) * _c);
                }
                if( threadIdx.x == 31 ) {
//                    assert( seq_offset < cap );
                    seqs[ seq_offset ] = cmask;
                }
                __syncthreads();

                i += tpb;
                seq_offset += blockDim.y;   // wpb
            }
            __syncthreads();

        }
        seq_idx += bpg;
    }

    // Persist the RNG state for the next launch.
    states[ bid * tpb + tid ] = local_state;
}
// Crossover kernel, algorithm version 3 (unordered allele layout).
// One WARP per sequence (blocks hold blockDim.y warps; warps stride over
// sequences by sequences-per-grid).  Each warp draws Poisson event counts,
// scans them, fills a warp-private random pool of crossover points, then
// produces one 32-allele mask word per iteration of the allele loop.
// Launch contract (asserted): blockDim.x == 32, blockDim.y <= MAX_WARPS,
// nAlleles a multiple of 32.
__global__ void crossover_kernel( StateType * states
                                , AlleleSpaceType * alleles
                                , device_free_space< IntType, unordered_tag > * free_space
                                , poisson_cdf< RealType, 32 > * pois
                                , device_sequence_space< IntType > * sequences
                                , clotho::utility::algo_version< 3 > * v ) {

    typedef StateType                                   state_type;
    typedef AlleleSpaceType                             allele_space_type;
    typedef typename allele_space_type::real_type       real_type;

    typedef device_sequence_space< IntType >            sequence_space_type;
    typedef typename sequence_space_type::int_type      int_type;
    typedef xover_config< unordered_tag, 3 >            xover_type;

    typedef poisson_cdf< RealType, 32 > poisson_type;

    assert( blockDim.y <= xover_type::MAX_WARPS && blockDim.x == 32);   // 8 or fewer full warps

    unsigned int  nAlleles = alleles->capacity;
    if( nAlleles == 0 ) { return; }

    assert( (nAlleles & 31) == 0 ); // nAlleles == 32 * m

    const unsigned int MAX_EVENTS_PER_WARP = 128;    // maximum number of recombination events per sequence
    __shared__ real_type rand_pool[ MAX_EVENTS_PER_WARP * xover_type::MAX_WARPS ];  // 16 warps/block (arbitrary number); 512 random numbers
    __shared__ real_type        s_pois_cdf[ poisson_type::MAX_K ];                      // 4 * 32 == 128

    unsigned int max_k = pois->max_k;

    if( threadIdx.y == 0 ) {    // use first warp to read poisson CDF
        s_pois_cdf[ threadIdx.x ] = pois->_cdf[threadIdx.x];
    }
    __syncthreads();

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;

    unsigned int tpb = (blockDim.x * blockDim.y);
    unsigned int bpg = (gridDim.x * gridDim.y);

    // sequences (warps) per grid: the stride of the outer loop.
    unsigned int spg = bpg * blockDim.y;
    
    int_type seq_width = sequences->seq_width;
    int_type nSequences = sequences->seq_count;    

    int_type    * seqs = sequences->sequences;
    real_type   * allele_list = alleles->locations;

    unsigned int seq_idx = bid * blockDim.y + threadIdx.y;
    state_type local_state = states[ seq_idx * 32 + threadIdx.x ]; // every thread/warp uses the SAME random state

    // Round nSequences up to a multiple of blockDim.y so every warp of a
    // partially-used block still runs the loop (keeps barriers uniform);
    // the seq_idx < nSequences guard below suppresses the extra writes.
    unsigned int max_seqs = nSequences / blockDim.y;
    max_seqs += (( nSequences % blockDim.y) ? 1 : 0);
    max_seqs *= blockDim.y;

    while( seq_idx < max_seqs ) {
        real_type x = curand_uniform( &local_state );
        unsigned int rand = _find_poisson_maxk32( s_pois_cdf, x, max_k );
        __syncthreads();

        // Warp inclusive prefix sum of per-lane event counts.
        for(unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int t = __shfl_up( rand, i );
            rand += ((threadIdx.x >= i) * t);
        }

        unsigned int max_events = __shfl( rand, 31 );
        // fill random pool
        //
        if( max_events >= MAX_EVENTS_PER_WARP ) {
            if( threadIdx.x == 0 ) {
                printf( "Too many events to generate: %d\n", max_events );
            }
            assert( max_events < MAX_EVENTS_PER_WARP );
        }
        __syncthreads();

        // Refill the whole shared pool: 4 draws per thread = 4*tpb reals.
        // NOTE(review): indexing spans [0, 4*tpb) regardless of warp --
        // warps share the pool region; confirm this matches the per-warp
        // read pattern below (reads use offsets [0, max_events) only).
        rand_pool[ tid ] = curand_uniform( &local_state );
        rand_pool[ tid + tpb ] = curand_uniform( &local_state );
        rand_pool[ tid + 2 * tpb ] = curand_uniform( &local_state );
        rand_pool[ tid + 3 * tpb ] = curand_uniform( &local_state );
        __syncthreads();

        unsigned int seq_offset = seq_idx * seq_width;
        unsigned int a_idx = threadIdx.x;
        while( a_idx < nAlleles ) {
            real_type loc = allele_list[ a_idx ];

            // Parity of crossover points below this allele's location.
            unsigned int s = 0, mask = 0;
            while( s < max_events ) {   // warps within a block diverge. threads within a warp do not
                real_type y = rand_pool[ s++ ]; // every thread within a warp read/loads the same value (sequence offset)
                mask += ( y < loc );
            }
            __syncthreads();

            mask = ((mask & 1) << threadIdx.x);

            // OR-scan collects the 32 lane bits into one mask word.
            for( unsigned int i = 1; i < 32; i <<= 1 ) {
                unsigned int e = __shfl_up( mask, i );
                mask |= ((threadIdx.x >= i) * e);
            }

            if( threadIdx.x == 31 && seq_idx < nSequences ) {
                seqs[ seq_offset ] = mask;
            }
            __syncthreads();
            a_idx += 32;
            ++seq_offset;
        }
        seq_idx += spg;
    }

    // Persist the RNG state for the next launch.
    seq_idx = bid * blockDim.y + threadIdx.y;  // reset seq_idx
    states[ seq_idx * 32 + threadIdx.x ] = local_state;
}
 // Warp shuffle for float: float is natively supported by the legacy __shfl
 // intrinsic, so this is a direct forwarding wrapper (no type punning).
 KOKKOS_INLINE_FUNCTION
 float shfl(const float &val, const int& srcLane, const int& width ) {
   float source_value = val;
   return __shfl(source_value, srcLane, width);
 }