Пример #1
0
HEMI_DEV_CALLABLE_INLINE
void jump_decider_device(RNGState* rng, double* nll_current,
                         const double* nll_proposed, double* v_current,
                         const double* v_proposed, unsigned nparameters,
                         int* accepted, int* counter, float* jump_buffer,
                         const bool debug_mode=false) {
#ifdef HEMI_DEV_CODE
  double u = curand_uniform(&rng[0]);
#else
  double u = gRandom->Uniform();
#endif

  // metropolis algorithm
  double np = nll_proposed[0];
  double nc = nll_current[0];
  if (debug_mode || (np < nc || u <= exp(nc - np))) {
    nll_current[0] = np;
    for (unsigned i=0; i<nparameters; i++) {
      v_current[i] = v_proposed[i];
    }
    accepted[0] += 1;
  }

  // append all steps to jump buffer
  int count = counter[0];
  for (unsigned i=0; i<nparameters; i++) {
    jump_buffer[count * (nparameters + 1) + i] = v_current[i];
  }
  jump_buffer[count * (nparameters + 1) + nparameters] = nll_current[0];
  counter[0] = count + 1;    
}
Пример #2
0
                    DINLINE Type operator()(RNGState* state) const
                    {
                        // (0.f, 1.0f]
                        const Type raw = curand_uniform(state);

                        /// \warn hack, are is that really ok? I say, yes, since
                        /// it shifts just exactly one number. Axel
                        ///
                        ///   Note: (1.0f - raw) does not work, since
                        ///         nvidia seems to return denormalized
                        ///         floats around 0.f (thats not as they
                        ///         state it out in their documentation)
                        // [0.f, 1.0f)
                        const Type r = raw * static_cast<float>( raw != Type(1.0) );
                        return r;
                    }
__global__ void select_and_crossover_kernel( StateType * states
                                            , device_sequence_space< SequenceIntType > * parent_seqs
                                            , device_event_space< IntType, no_order_tag >  * parent_ids
                                            , AlleleSpaceType * alleles
                                            , device_free_space< SequenceIntType, unit_ordered_tag > * free_space
                                            , poisson_cdf< RealType, 32 > * pois
                                            , device_sequence_space< SequenceIntType > * offspring_seqs ) {
    typedef StateType                                       state_type;
    typedef poisson_cdf< RealType, 32 >                     poisson_type;
    typedef device_sequence_space< SequenceIntType >        sequence_space_type;
    typedef typename sequence_space_type::int_type          sequence_int_type;

    typedef device_event_space< IntType, no_order_tag >    selection_type;
    typedef typename selection_type::int_type               selection_int_type;

    typedef AlleleSpaceType                                 allele_space_type;
    typedef typename allele_space_type::real_type           real_type;

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int offspring_count = offspring_seqs->seq_count;

    if( bid >= offspring_count ) return;

    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int warp_id = (tid >> 5 );
    unsigned int lane_id = (tid & 31 );

    unsigned int tpb = (blockDim.x * blockDim.y);
    unsigned int wpb = (tpb >> 5);
    unsigned int bpg = (gridDim.x * gridDim.y);
    unsigned int par_count = parent_seqs->seq_count;

    if( par_count == 0 ) {
        if( bid == 0 && tid == 0 ) {
            unsigned int off_cap = offspring_seqs->capacity;

            memset( offspring_seqs->sequences, 0, off_cap * sizeof( sequence_int_type) );
        }

        return;
    }

    state_type local_state = states[ bid * tpb + tid ];

    __shared__ unsigned int s_event_hash[ 32 ];
    __shared__ real_type    s_rand_pool[ 1024 ];
    __shared__ real_type    s_pois_cdf[ poisson_type::MAX_K ];

    unsigned int max_k = pois->max_k;
    if( tid < poisson_type::MAX_K ) {
        s_pois_cdf[ tid ] = pois->_cdf[ tid ];
    }
    __syncthreads();

    while( bid < offspring_count ) {
        real_type r = curand_uniform( &local_state );

        unsigned int e = _find_poisson_maxk32( s_pois_cdf, r, max_k );
        __syncthreads();

        if( tid < 32 ) {
            s_event_hash[ tid ] = e;
        }
        __syncthreads();

        e = s_event_hash[ lane_id ];
        __syncthreads();
        
        unsigned int psum = e;
        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int _e = __shfl_up( e, i );
            e = ((_e > e) ? _e : e );
        }

        unsigned int max = __shfl( e, 31 );

        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int p = __shfl_up( psum, i );
            psum += (( lane_id >= i ) * p );
        }

        unsigned int base_count = __shfl_up( psum, 1 );
        if( lane_id == 0 ) base_count = 0;
        __syncthreads();

        for( unsigned int i = warp_id; i < max; i += wpb ) {    // warp divergence
            r = curand_uniform( &local_state );

            s_rand_pool[ i * 32 + lane_id ] = (((real_type) lane_id + r) / (real_type) 32);
        }
        __syncthreads();

        // set up parent and offspring indexes
        selection_int_type id = par_ids[ bid ]; // all threads in block share same parent
        id <<= 1;

        unsigned int p0 = id * par_width + warp_id;
        unsigned int o_idx = bid * off_width + warp_id;
        unsigned int a_id = tid;

        while( a_id < par_allele_cap ) {    // true for all threads
            sequence_int_type p = par_seqs[ p0 ];
            sequence_int_type q = par_seqs[ p0 + par_width ];
            real_type a = all_loc[ a_id ];  // every thread reads a location just in case

            e = (psum - base_count);
            unsigned int x = base_count;
            for( unsigned int i = 0; i < max; ++i ) {   // should not diverge
                real_type y = s_rand_pool[ i * 32 + lane_id ];
                x += ((i < e) * ( y < a));
            }
            __syncthreads();

            x = ((x & 1) << lane_id);

            for( unsigned int i = 1; i < 32; i <<= 1 ) {
                unsigned int _x = __shfl_up( x, i );
                x |= ((lane_id >= i ) * _x );
            }

            if( lane_id == 31 ) {
                off_seqs[ o_idx ] = ((p & ~x) | (q & x));
            }
            __syncthreads();

            p0 += wpb;
            o_idx += wpb;
            a_id += tpb;
        }
        bid += bpg;
    }

    bid = blockIdx.y * gridDim.x + blockIdx.x;
    local_state[ bid * tpb + tid ];
}
Пример #4
0
__global__ void
 truncnorm_kernel(
      float *x,      // Vector to contain returned samples
      int n,         // Number of samples to return
      float mu,     // Vector of mu's
      float sigma,  // Vector of sigma's
      float a,      // Vector of lower-truncation values
      float b)      // Vector of upper-truncation values
{
    // Usual block/thread indexing...
    int myblock = blockIdx.x + blockIdx.y * gridDim.x;
    int blocksize = blockDim.x * blockDim.y * blockDim.z;
    int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x;
    int idx = myblock * blocksize + subthread;

    float tmp;      // Trying to sample from normal distribution
    float z;        // Exponential random variable
    float phi;      // Calculated probability
    int accept = 0;   // Initialize accept or not
    int maxitr = 1000;  // Maximum try for just sampling in basic normal distribution

  	// Setting RNG
  	curandState state;
  	curand_init(9131 + idx*17, idx, 0, &state);

    // Draw sample
    if (idx < n){
        int i = 0;

        if (isfinite(a) && !isfinite(b)){       //Left truncate
            float mu_ = (a - mu) / sigma;    //standardizing, gives left cut point
            float alpha = ( mu_ + sqrt(mu_ * mu_ + 4)) / 2;  //optimal alpha

            while(!accept && i < maxitr){
                tmp = mu + sigma * curand_normal(&state);  //standard sampling
                if(tmp > a) {
                    x[idx] = tmp;
                    accept = 1;
                }
                else i++;
            }
            while(!accept){
                z = mu_ - log(curand_uniform(&state))/alpha;

                if (mu_ < alpha){
                    phi = exp( -(alpha - z) * (alpha - z) /2);
                }
                else {
                    phi = exp( -(mu_ - alpha) * (mu_ - alpha) / 2 ) * exp(-(alpha - z)*(alpha - z)/2);

                }

                if( curand_uniform(&state) < phi ){
                    x[idx] = z * sigma + mu;
                    accept = 1;
                }
            }
        }
        else if (isfinite(b) && !isfinite(a)){   //Right truncate
            float mu_new = -mu;
            float mu_ = - (b - mu_new) / sigma;
            float alpha = ( mu_ + sqrt(mu_ * mu_ + 4)) / 2;  //optimal alpha

            while(!accept && i < maxitr){
                tmp = mu + sigma * curand_normal(&state);  //standard sampling
                if(tmp < b) {
                    x[idx] = tmp;
                    accept = 1;
                }
                else i++;
            }
            while(!accept){
                z = mu_ - log(curand_uniform(&state))/alpha;

                if (mu_ < alpha){
                    phi = exp( -(alpha - z) * (alpha - z) /2);
                }
                else {
                    phi = exp( -(mu_ - alpha) * (mu_ - alpha) / 2 ) * exp(-(alpha - z)*(alpha - z)/2);

                }

                if( curand_uniform(&state) < phi ){
                    x[idx] = -(z * sigma + mu_new);
                    accept = 1;
                }
            }

        }
        else if (!isfinite(a) && !isfinite(b)){ //No truncation
            x[idx] = mu + sigma * curand_normal(&state);
        }
        else if (isfinite(a) && isfinite(b)){  //Finite truncation
            float mu_ = (a - mu) / sigma;
            float mu_plus = (b - mu) / sigma;

            while(!accept && i < maxitr){
                    tmp = mu + sigma * curand_normal(&state);  //standard sampling
                    if(tmp <= b && tmp >= a) {
                        x[idx] = tmp;
                        accept = 1;
                    }
                    else i++;
                }
            while(!accept){
                float g;
                z = mu_ + (mu_plus-mu_)*curand_uniform(&state);

                if ( 0 >= mu_ && 0 <= mu_plus)
                g = exp ( -z * z / 2);
                else if ( mu_plus < 0)
                g = exp( ( (mu_plus * mu_plus) - (z * z) )/2 );
                else if ( 0 < mu_ )
                g = exp( ( (mu_ * mu_) - (z * z) )/2 );

                if (curand_uniform(&state) < g){
                    x[idx] = z * sigma + mu;
                    accept = 1;
                }

            }

        }
    }
    return;
}
__global__ void crossover_kernel( StateType * states
                                , AlleleSpaceType * alleles
                                , device_free_space< IntType, unordered_tag > * free_space
                                , poisson_cdf< RealType, 32 > * pois
                                , device_sequence_space< IntType > * sequences
                                , clotho::utility::algo_version< 5 > * v ) {

    typedef StateType                                   state_type;
    typedef AlleleSpaceType                             allele_space_type;
    typedef typename allele_space_type::real_type       real_type;

    typedef device_sequence_space< IntType >            sequence_space_type;
    typedef typename sequence_space_type::int_type      int_type;

    typedef poisson_cdf< RealType, 32 >                 poisson_type;
    typedef xover_config< unordered_tag, 5 >            xover_type;

    assert( blockDim.x == 32 && blockDim.y <= xover_type::MAX_WARPS );

    // encourage quick return;
    unsigned int nAlleles = alleles->capacity;
    if( nAlleles == 0 ) return;

    // only considering 2d grid/block definitions
    int_type bid = blockIdx.y * gridDim.x + blockIdx.x;
    int_type tid = threadIdx.y * blockDim.x + threadIdx.x;

    unsigned int tpb = (blockDim.x * blockDim.y);
    unsigned int bpg = (gridDim.x * gridDim.y);

    real_type * allele_list = alleles->locations;

    int_type * seqs = sequences->sequences;
    int_type seq_width = sequences->seq_width;
    int_type nSequences = sequences->seq_count;

    const unsigned int MAX_EVENTS = 32;

    __shared__ int_type     event_count;
    __shared__ real_type    rand_pool[ MAX_EVENTS ];

    unsigned int seqIdx = bid;

    // 1 sequence per block
    while( seqIdx < nSequences ) {
        // use the first thread of each block to populate a pool
        // of random numbers
        if( tid == 0 ) {
            state_type local_state = states[ bid ];
            // update the event_count
            event_count = curand_poisson( &local_state, pois->lambda );

            for( unsigned int i = 0; i < event_count; ++i ) {
                rand_pool[ i ] = curand_uniform( &local_state );
            }

            states[ bid ] = local_state;
        }
        __syncthreads();

        // copy event_count from shared memory to thread local memory
        // 
        unsigned int nEvents = event_count;

        // true for all threads
        if( nEvents == 0 ) {
            unsigned int id = tid;
            while( id < seq_width ) {
                seqs[ seqIdx * seq_width + tid ] = 0;
                id += tpb;
            }
        } else {
            unsigned int id = tid;
            unsigned int offset = seqIdx * seq_width + threadIdx.y;

            while( id < nAlleles ) {
                real_type pos = allele_list[ id ];

                unsigned int n = 0;
                for( unsigned int i = 0; i < nEvents; ++i ) {
                    n += ((rand_pool[ i ] < pos) ? 1 : 0);
                }
                __syncthreads();

                int_type mask = (1 << threadIdx.x);
                mask *= (n & 1);

                // scan left mask
                for( unsigned int i = 1; i < 32; i <<= 1 ) {
                    int_type _c = __shfl_up( mask, i );
                    mask |= ((threadIdx.x >= i ) *_c );
                }

                if( threadIdx.x == 31 ) {
                    seqs[ offset ] = mask;
                }
                __syncthreads();

                id += tpb;
                offset += blockDim.y;
            }
        }

        seqIdx += bpg;
        __syncthreads();
    }
}
__global__ void crossover_kernel( StateType * states
                                , AlleleSpaceType * alleles
                                , device_free_space< IntType, unordered_tag > * free_space
                                , poisson_cdf< RealType, 32 > * pois
                                , device_sequence_space< IntType > * sequences
                                , clotho::utility::algo_version< 4 > * v ) {

    typedef StateType                                   state_type;
    typedef AlleleSpaceType                             allele_space_type;
    typedef typename allele_space_type::real_type       real_type;

    typedef device_sequence_space< IntType >            sequence_space_type;
    typedef typename sequence_space_type::int_type      int_type;

    typedef poisson_cdf< RealType, 32 >                 poisson_type;
    typedef xover_config< unordered_tag, 4 >            xover_type;

    assert( blockDim.x == 32 && blockDim.y <= xover_type::MAX_WARPS );

    int_type bid = blockIdx.y * gridDim.x + blockIdx.x;
    int_type tid = threadIdx.y * blockDim.x + threadIdx.x;

    unsigned int tpb = (blockDim.x * blockDim.y);
    assert( (tpb % allele_space_type::ALIGNMENT_SIZE) == 0 );

    unsigned int bpg = (gridDim.x * gridDim.y);

    unsigned int i;
    int_type sequence_width = sequences->seq_width;
    int_type nSequences = sequences->seq_count;    

    int_type    * seqs = sequences->sequences;

    //int_type  cap = sequences->capacity;

    real_type       * allele_list = alleles->locations;
    unsigned int    nAlleles = alleles->capacity;

    assert( (nAlleles % allele_space_type::ALIGNMENT_SIZE) == 0 );

    if( nAlleles == 0 ) { return; }

    __shared__ real_type        s_pois_cdf[ poisson_type::MAX_K ];
    __shared__ real_type        rand_pool[ allele_space_type::ALIGNMENT_SIZE ];
    __shared__ unsigned int     event_hash[ allele_space_type::ALIGNMENT_SIZE];

    unsigned int max_k = pois->max_k;
    if( tid < poisson_type::MAX_K ) {
        s_pois_cdf[ tid ] = pois->_cdf[tid];
    }
    __syncthreads();

    state_type local_state = states[ bid * tpb + tid ];

    unsigned int nonzero_warp = (threadIdx.y != 0);
    unsigned int nonzero_thread = (tid != 0);
    int_type seq_idx = bid;
    while( seq_idx < nSequences ) {
        real_type x = curand_uniform( &local_state );   // x in (0, 1]
        rand_pool[ tid ] = ((x >= 1.0) ? 0.0 : x);  // wrap around x to be in [0, 1); this way all event hash bins follow [,) pattern

        x = curand_uniform( &local_state );

        int_type rand = _find_poisson_maxk32( s_pois_cdf, x, max_k );
        __syncthreads();

        for( i = 1; i < 32; i <<= 1 ) {
            unsigned int r = __shfl_up( rand, i );
            rand += ( (threadIdx.x >= i ) * r );
        }

        if( threadIdx.x == 31 ) {
            event_hash[ threadIdx.y ] = rand;
        }
        __syncthreads();

        unsigned int _sum = event_hash[ threadIdx.x ];
        _sum *= (threadIdx.x < blockDim.y);
        __syncthreads();

        for( i = 1; i < 32; i <<= 1 ) {
            unsigned int s = __shfl_up( _sum, i );
            _sum += (( threadIdx.x >= i ) * s);
        }

        unsigned int s = __shfl( _sum, 31 );
//        assert( max_events < allele_space_type::ALIGNMENT_SIZE );
//
        if( s == 0 ) { // true for all threads in block assuming 1 block per sequence
            // if there are no events for this sequence, then simply clear the memory
            unsigned int seq_start = seq_idx * sequence_width + tid;
            unsigned int seq_end = (seq_idx + 1) * sequence_width;
            while( seq_start < seq_end ) {
                seqs[ seq_start ] = 0;
                seq_start += tpb;
            }
            __syncthreads();
        } else {

            s = __shfl( _sum, threadIdx.y - nonzero_warp);
            s *= nonzero_warp;
            __syncthreads();

            rand += s;
            event_hash[tid] = rand;
            __syncthreads();

            i = event_hash[ tid - nonzero_thread];    // minimum event index
            i *= nonzero_thread;
            __syncthreads();

            // BEGIN divergent code
            while (i < rand) {
                x = rand_pool[ i ];
                rand_pool[ i++ ] = (((real_type) tid) + x) / ((real_type) allele_space_type::ALIGNMENT_SIZE);
            }
            __syncthreads();
            // END divergent code

            unsigned int seq_offset = seq_idx * sequence_width + threadIdx.y;   // every thread in a warp has the same block offset
            i = tid;
            while( i < nAlleles ) {
                x = allele_list[ i ];

                rand = (unsigned int) ( x * ((real_type)allele_space_type::ALIGNMENT_SIZE));

                unsigned int nonzero_bin = (rand != 0);  // _keep == 0 -> rand == 0 -> e_min == 0; _keep == 1 -> rand != 0 -> e_min == event_hash[ rand - 1]

                // each thread reads hash (and random pool) relative to their 
                // local allele (x)
                // this code will result in bank conflicts
                //
                // initial performance results suggest that this is
                // an acceptable overhead as overall runtime of simulation
                // loop is minimized when this algorithm is used
                unsigned int e_max = event_hash[ rand ];
                unsigned int e_min = event_hash[ rand - nonzero_bin ];
                e_min *= nonzero_bin;

                int_type cmask = e_min;

                // BEGIN divergent code
                while( e_min < e_max ) {
                    real_type y = rand_pool[ e_min++ ];
                    cmask += (y < x);
                }
                __syncthreads();
                // END divergent code

                cmask = ((cmask & 1) << threadIdx.x);
                
                for( unsigned int j = 1; j < 32; j <<= 1 ) {
                    int_type _c = __shfl_up( cmask, j);
                    cmask |= ((threadIdx.x >= j) * _c);
                }
                if( threadIdx.x == 31 ) {
//                    assert( seq_offset < cap );
                    seqs[ seq_offset ] = cmask;
                }
                __syncthreads();

                i += tpb;
                seq_offset += blockDim.y;   // wpb
            }
            __syncthreads();

        }
        seq_idx += bpg;
    }

    states[ bid * tpb + tid ] = local_state;
}
__global__ void crossover_kernel( StateType * states
                                , AlleleSpaceType * alleles
                                , device_free_space< IntType, unordered_tag > * free_space
                                , poisson_cdf< RealType, 32 > * pois
                                , device_sequence_space< IntType > * sequences
                                , clotho::utility::algo_version< 3 > * v ) {

    typedef StateType                                   state_type;
    typedef AlleleSpaceType                             allele_space_type;
    typedef typename allele_space_type::real_type       real_type;

    typedef device_sequence_space< IntType >            sequence_space_type;
    typedef typename sequence_space_type::int_type      int_type;
    typedef xover_config< unordered_tag, 3 >            xover_type;

    typedef poisson_cdf< RealType, 32 > poisson_type;

    assert( blockDim.y <= xover_type::MAX_WARPS && blockDim.x == 32);   // 8 or fewer full warps

    unsigned int  nAlleles = alleles->capacity;
    if( nAlleles == 0 ) { return; }

    assert( (nAlleles & 31) == 0 ); // nAlleles == 32 * m

    const unsigned int MAX_EVENTS_PER_WARP = 128;    // maximum number of recombination events per sequence
    __shared__ real_type rand_pool[ MAX_EVENTS_PER_WARP * xover_type::MAX_WARPS ];  // 16 warps/block (arbitrary number); 512 random numbers
    __shared__ real_type        s_pois_cdf[ poisson_type::MAX_K ];                      // 4 * 32 == 128

    unsigned int max_k = pois->max_k;

    if( threadIdx.y == 0 ) {    // use first warp to read poisson CDF
        s_pois_cdf[ threadIdx.x ] = pois->_cdf[threadIdx.x];
    }
    __syncthreads();

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;

    unsigned int tpb = (blockDim.x * blockDim.y);
    unsigned int bpg = (gridDim.x * gridDim.y);

    unsigned int spg = bpg * blockDim.y;
    
    int_type seq_width = sequences->seq_width;
    int_type nSequences = sequences->seq_count;    

    int_type    * seqs = sequences->sequences;
    real_type   * allele_list = alleles->locations;

    unsigned int seq_idx = bid * blockDim.y + threadIdx.y;
    state_type local_state = states[ seq_idx * 32 + threadIdx.x ]; // every thread/warp uses the SAME random state

    unsigned int max_seqs = nSequences / blockDim.y;
    max_seqs += (( nSequences % blockDim.y) ? 1 : 0);
    max_seqs *= blockDim.y;

    while( seq_idx < max_seqs ) {
        real_type x = curand_uniform( &local_state );
        unsigned int rand = _find_poisson_maxk32( s_pois_cdf, x, max_k );
        __syncthreads();

        for(unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int t = __shfl_up( rand, i );
            rand += ((threadIdx.x >= i) * t);
        }

        unsigned int max_events = __shfl( rand, 31 );
        // fill random pool
        //
        if( max_events >= MAX_EVENTS_PER_WARP ) {
            if( threadIdx.x == 0 ) {
                printf( "Too many events to generate: %d\n", max_events );
            }
            assert( max_events < MAX_EVENTS_PER_WARP );
        }
        __syncthreads();

        rand_pool[ tid ] = curand_uniform( &local_state );
        rand_pool[ tid + tpb ] = curand_uniform( &local_state );
        rand_pool[ tid + 2 * tpb ] = curand_uniform( &local_state );
        rand_pool[ tid + 3 * tpb ] = curand_uniform( &local_state );
        __syncthreads();

        unsigned int seq_offset = seq_idx * seq_width;
        unsigned int a_idx = threadIdx.x;
        while( a_idx < nAlleles ) {
            real_type loc = allele_list[ a_idx ];

            unsigned int s = 0, mask = 0;
            while( s < max_events ) {   // warps within a block diverge. threads within a warp do not
                real_type y = rand_pool[ s++ ]; // every thread within a warp read/loads the same value (sequence offset)
                mask += ( y < loc );
            }
            __syncthreads();

            mask = ((mask & 1) << threadIdx.x);

            for( unsigned int i = 1; i < 32; i <<= 1 ) {
                unsigned int e = __shfl_up( mask, i );
                mask |= ((threadIdx.x >= i) * e);
            }

            if( threadIdx.x == 31 && seq_idx < nSequences ) {
                seqs[ seq_offset ] = mask;
            }
            __syncthreads();
            a_idx += 32;
            ++seq_offset;
        }
        seq_idx += spg;
    }

    seq_idx = bid * blockDim.y + threadIdx.y;  // reset seq_idx
    states[ seq_idx * 32 + threadIdx.x ] = local_state;
}
__global__ void crossover_kernel( StateType * states
                                , AlleleSpaceType * alleles
                                , device_free_space< IntType, unordered_tag > * free_space
                                , poisson_cdf< RealType, 32 > * pois
                                , device_sequence_space< IntType > * sequences
                                , clotho::utility::algo_version< 2 > * v ) {

    typedef StateType  state_type;
    typedef AlleleSpaceType allele_space_type;
    typedef typename allele_space_type::real_type   real_type;

    typedef device_sequence_space< IntType >            sequence_space_type;
    typedef typename sequence_space_type::int_type      int_type;

    typedef poisson_cdf< RealType, 32 > poisson_type;

    unsigned int  nAlleles = alleles->capacity;
    if( nAlleles == 0 ) { return; }

    assert( (nAlleles & 31) == 0 ); // multiple of 32 alleles

    const unsigned int MAX_EVENTS_PER_WARP = 64;    // maximum number of recombination events per sequence
    const unsigned int MAX_WARP_PER_BLOCK = 32;
    const unsigned int HASH_WIDTH = 32;

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int lane_id = (tid & 31);
    unsigned int warp_id = (tid >> 5);

    unsigned int tpb = (blockDim.x * blockDim.y);
    assert( (tpb & 31) == 0 );  // all warps are full

    unsigned int bpg = (gridDim.x * gridDim.y);
    unsigned int wpb = (tpb >> 5);  // == sequences/block (spb)

    assert( wpb <= MAX_WARP_PER_BLOCK );

    unsigned int spg = bpg * wpb;
    
    int_type sequence_width = sequences->seq_width;
    int_type nSequences = sequences->seq_count;    

    int_type    * seqs = sequences->sequences;
    real_type   * allele_list = alleles->locations;

    __shared__ real_type        s_pois_cdf[ poisson_type::MAX_K ];                      // 4 * 32 == 128
    __shared__ real_type        rand_pool[ MAX_WARP_PER_BLOCK * MAX_EVENTS_PER_WARP ];  // 4 * 32 * 64 == 8192
    __shared__ unsigned int     event_hash[ MAX_WARP_PER_BLOCK * HASH_WIDTH ];          // 4 * 32 * 128 == 16384
                                                                                        //----------------------
                                                                                        // 24704 (bytes of shared memory)

    unsigned int max_k = pois->max_k;
    if( tid < poisson_type::MAX_K ) {
        s_pois_cdf[ tid ] = pois->_cdf[tid];
    }
    __syncthreads();

    state_type local_state = states[ bid * tpb + tid ]; // assumes every thread in the GRID has a state defined

    unsigned int max_seq_id = nSequences / wpb;
    max_seq_id += (( nSequences % wpb ) ? 1 : 0);
    max_seq_id *= wpb;

    int_type seq_idx = bid * wpb + warp_id;

    while( seq_idx < max_seq_id ) { // should allow blocks to terminate early

        // generate a recombination hash for each sequence (warp)
//        unsigned int psum = 0;
//        for( unsigned int w = lane_id; w < HASH_WIDTH; w += 32 ) {
            real_type x = curand_uniform( &local_state );

            unsigned int rand = _find_poisson_maxk32( s_pois_cdf, x, max_k );
            __syncthreads();

            // compute prefix sum with in each warp
            for( unsigned int i = 1; i < 32; i <<= 1 ) {
                unsigned int r = __shfl_up( rand, i );
                rand += ( (lane_id >= i ) * r );
            }

//            rand += psum;
//            event_hash[ warp_id * HASH_WIDTH + w ] = rand;   //  event_hash contain prefix sum (scan)
            event_hash[ tid ] = rand;

            unsigned int s = __shfl_up( rand, 1);   //
            s *= ( lane_id != 0 );

            s += (warp_id * MAX_EVENTS_PER_WARP);       // shift s and rand to be relative to warp
            rand += (warp_id * MAX_EVENTS_PER_WARP);

            // BEGIN divergent code
            real_type accum = 0.;
            while (s < rand) {
                x = curand_uniform( &local_state );

                accum += (log( x ) / (real_type)(rand - s));

                rand_pool[s++] = ((((real_type)tid) + (1.0 - exp(accum))) / ((real_type)32.));
            }
            __syncthreads();
            // END divergent code
            //
//            psum = __shfl( rand, 31 );
//        }

        unsigned int seq_offset = seq_idx * sequence_width;
        unsigned int a_id = lane_id;
        while( a_id < nAlleles ) {
            real_type x = allele_list[ a_id ];

            unsigned int h_idx = (unsigned int) ( x * ((real_type) HASH_WIDTH ));  // map allele location to bin index

            unsigned int s = event_hash[ h_idx++ ];
            unsigned int e = event_hash[ h_idx ];
            __syncthreads();

            int_type cmask = s;

            s += (warp_id * MAX_EVENTS_PER_WARP);
            e += (warp_id * MAX_EVENTS_PER_WARP );

            // BEGIN divergent code
            while( s < e  ) {
                real_type y = rand_pool[ s++ ];
                cmask += (x > y);
            }
            __syncthreads();
            // END divergent code

            cmask = ((cmask & 1) << lane_id);

            // reduce cmask within each warp
            for( unsigned int i = 1; i < 32; i <<= 1 ) {
                int_type _c = __shfl_up( cmask, i );
                cmask |= ((lane_id >= i) * _c);
            }

            if( lane_id == 31 && seq_idx < nSequences ) {
                seqs[seq_offset] = cmask;
            }
            __syncthreads();

            a_id += 32;
            ++seq_offset;
        }
        __syncthreads();

        seq_idx += spg;
    }

    states[ bid * tpb + tid ] = local_state;
}
Пример #9
0
 template<> __device__
 void generate_uniform<cfloat>(cfloat *cval, curandState_t *state)
 {
     cval->x = curand_uniform(state);
     cval->y = curand_uniform(state);
 }
Пример #10
0
 template<> __device__
 void generate_uniform<float>(float *val, curandState_t *state)
 {
     *val = curand_uniform(state);
 }
Пример #11
0
 template<> __device__
 void generate_uniform<char>(char *val, curandState_t *state)
 {
     *val = curand_uniform(state) > 0.5;
 }
Пример #12
0
__global__ void _scatter_mutations( StateType * states
                                    , device_free_space< IntType, unit_ordered_tag< IntType > >     * fspace
                                    , device_event_space< IntType, unit_ordered_tag< IntType > >    * events
                                    , device_sequence_space< IntType >                              * sequences
                                    , AlleleSpaceType                                               * alleles ) {
    typedef StateType   state_type;

    typedef device_event_space< IntType, unit_ordered_tag< IntType > >    event_space_type;
    typedef typename event_space_type::order_tag_type       order_tag_type;
    typedef typename event_space_type::size_type            size_type;
    typedef typename event_space_type::int_type             int_type;

    typedef float                                           real_type;

    assert( (blockDim.x * blockDim.y) == 1 );

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int bpg = gridDim.x * gridDim.y;
    unsigned int wpb = (bpg >> 5);

    unsigned int lane_id = (bid & 31);
    unsigned int warp_id = (bid >> 5);

    unsigned int lane_max = events->bin_summary[ lane_id ];

    if( warp_id >= lane_max ) return;

    unsigned int R = sequences->seq_count;
    unsigned int _width = sequences->seq_width;

    unsigned int * flist = fspace->free_list;

    state_type local_state = states[ bid ];

    int_type *  seqs = sequences->sequences;

    order_tag_type * otag = NULL;

    unsigned int lidx = 0, k = 0;
    while( warp_id < lane_max ) {   // should only be one thread per block so no divergence

        // scan free_list for k-th free index in lane
        while( warp_id + 1 != k && lidx < _width ) {
            int_type f = flist[ lidx++ ];

            if( ((f >> lane_id) & 1) ) {
                ++k;
            }
        }

        if( warp_id + 1 == k ) {
            real_type r = curand_uniform( &local_state );

            unsigned int idx = r * R;

            idx *= (idx != R);  // idx == R -> idx = idx * 0 == 0

            idx *= _width;
            idx += (lidx - 1);

            int_type b = seqs[ idx ];
            b |= ( 1 << lane_id );
            seqs[ idx ] = b;

            idx = (lidx - 1) * 32 + lane_id;
            _generate_random_allele( &local_state, alleles, idx, otag );
        }

        warp_id += wpb;
    }
}
Пример #13
0
__global__ void _scatter_mutations( StateType * states
                                    , device_free_space< IntType, unordered_tag >   * fspace
                                    , device_event_space< IntType, unordered_tag >  * events
                                    , device_sequence_space< IntType >              * sequences
                                    , AlleleSpaceType                               * alleles) {
    typedef StateType   state_type;

    typedef device_sequence_space< IntType >                sequence_type;
    typedef device_event_space< IntType, unordered_tag >    event_space_type;
    typedef typename event_space_type::size_type            size_type;
    typedef typename event_space_type::int_type             int_type;

    typedef float                                           real_type;  // does not need same precision as AlleleSpaceType

    assert( (blockDim.x*blockDim.y) == 1 );

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    size_type   tot = events->total;
    if( bid >= tot ) return;

    unsigned int _width = sequences->seq_width;
    if( _width == 0 ) return;

//    if( bid == 0 )
//        printf( "%d Free Space; %d New Mutation\n", fspace->total, tot );
    assert( tot <= fspace->total ); // true for all blocks (enough free bits for new mutations)

    fixed_width_converter< sequence_type::OBJECTS_PER_INT > converter;
    unsigned int bpg = gridDim.x * gridDim.y;

    unsigned int R = sequences->seq_count;

    unsigned int * fmap = fspace->free_map;
    state_type local_state = states[ bid ];

    unordered_tag * otag = NULL;

    int_type *  seqs = sequences->sequences;

    while( bid < tot ) {
        unsigned int fidx = fmap[ bid ];

        unsigned int block_idx = converter.major_offset( fidx );
        unsigned int bit_idx = converter.minor_offset( fidx );

        real_type r =  curand_uniform( &local_state );

        unsigned int idx = r * R;

        idx *= (idx != R );

        idx *= _width;
        idx += block_idx;

        // need to perform a bitonic sort to identify
        // whether multiple bit indices within the same block
        // are being updated

        int_type b = seqs[ idx ];
        b |= ( 1 << bit_idx );
        seqs[ idx ] = b;

        _generate_random_allele( &local_state, alleles, fidx, otag );

        bid += bpg;
    }

    states[ bid ] = local_state;
}
Пример #14
0
 __device__ inline float operator ()(float data, curandState* state) {
     return curand_uniform(state);
 }