HEMI_DEV_CALLABLE_INLINE void jump_decider_device(RNGState* rng,
                                                  double* nll_current,
                                                  const double* nll_proposed,
                                                  double* v_current,
                                                  const double* v_proposed,
                                                  unsigned nparameters,
                                                  int* accepted,
                                                  int* counter,
                                                  float* jump_buffer,
                                                  const bool debug_mode=false) {
#ifdef HEMI_DEV_CODE
  double u = curand_uniform(&rng[0]);
#else
  double u = gRandom->Uniform();
#endif

  // Metropolis algorithm: always accept a downhill step (np < nc); accept an
  // uphill step with probability exp(nc - np). In debug mode every proposal
  // is accepted.
  double np = nll_proposed[0];
  double nc = nll_current[0];
  if (debug_mode || (np < nc || u <= exp(nc - np))) {
    nll_current[0] = np;
    for (unsigned i = 0; i < nparameters; i++) {
      v_current[i] = v_proposed[i];
    }
    accepted[0] += 1;
  }

  // Append the (possibly unchanged) current step to the jump buffer:
  // nparameters values followed by the current NLL.
  int count = counter[0];
  for (unsigned i = 0; i < nparameters; i++) {
    jump_buffer[count * (nparameters + 1) + i] = v_current[i];
  }
  jump_buffer[count * (nparameters + 1) + nparameters] = nll_current[0];
  counter[0] = count + 1;
}
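// --- A minimal host-side sketch of the same Metropolis accept rule, handy for
// --- unit-testing the device decider against a CPU reference. The function
// --- name and the use of std::mt19937 are illustrative assumptions, not part
// --- of the original code.
#include <cmath>
#include <random>

bool metropolis_accept_host(double nll_current, double nll_proposed,
                            std::mt19937& gen) {
  std::uniform_real_distribution<double> unif(0.0, 1.0);
  double u = unif(gen);
  // downhill moves always pass; uphill moves pass with prob exp(nc - np)
  return (nll_proposed < nll_current) ||
         (u <= std::exp(nll_current - nll_proposed));
}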
DINLINE Type operator()(RNGState* state) const {
  // curand_uniform returns a value in (0.f, 1.0f]
  const Type raw = curand_uniform(state);
  /// \warning hack: is this really OK? I say yes, since
  /// it shifts exactly one number. Axel
  ///
  /// Note: (1.0f - raw) does not work, since
  /// nvidia seems to return denormalized
  /// floats around 0.f (that is not what they
  /// state in their documentation)
  // map onto [0.f, 1.0f): 1.0f is remapped to 0.f, every other value passes through
  const Type r = raw * static_cast<float>( raw != Type(1.0) );
  return r;
}
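// --- A small host-side check of the (0,1] -> [0,1) remapping trick above:
// --- multiplying by (raw != 1.0f) zeroes out exactly the value 1.0f and leaves
// --- every other sample untouched. Illustrative sketch only; the helper name
// --- is not from the original code.
#include <cassert>

static float remap_unit_interval(float raw) {       // raw assumed in (0, 1]
  return raw * static_cast<float>(raw != 1.0f);     // 1.0f -> 0.0f, else identity
}

int main() {
  assert(remap_unit_interval(1.0f) == 0.0f);   // the single shifted value
  assert(remap_unit_interval(0.25f) == 0.25f); // everything else is unchanged
  return 0;
}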
__global__ void select_and_crossover_kernel( StateType * states
                                           , device_sequence_space< SequenceIntType > * parent_seqs
                                           , device_event_space< IntType, no_order_tag > * parent_ids
                                           , AlleleSpaceType * alleles
                                           , device_free_space< SequenceIntType, unit_ordered_tag > * free_space
                                           , poisson_cdf< RealType, 32 > * pois
                                           , device_sequence_space< SequenceIntType > * offspring_seqs ) {
    typedef StateType state_type;

    typedef poisson_cdf< RealType, 32 > poisson_type;

    typedef device_sequence_space< SequenceIntType > sequence_space_type;
    typedef typename sequence_space_type::int_type sequence_int_type;

    typedef device_event_space< IntType, no_order_tag > selection_type;
    typedef typename selection_type::int_type selection_int_type;

    typedef AlleleSpaceType allele_space_type;
    typedef typename allele_space_type::real_type real_type;

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int offspring_count = offspring_seqs->seq_count;

    if( bid >= offspring_count ) return;

    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int warp_id = (tid >> 5);
    unsigned int lane_id = (tid & 31);

    unsigned int tpb = (blockDim.x * blockDim.y);
    unsigned int wpb = (tpb >> 5);
    unsigned int bpg = (gridDim.x * gridDim.y);

    unsigned int par_count = parent_seqs->seq_count;
    if( par_count == 0 ) {
        // no parents: zero the offspring space and quit
        if( bid == 0 && tid == 0 ) {
            unsigned int off_cap = offspring_seqs->capacity;
            memset( offspring_seqs->sequences, 0, off_cap * sizeof( sequence_int_type ) );
        }
        return;
    }

    // Unpack the spaces referenced below. NOTE: the original snippet used
    // par_seqs/par_width/off_seqs/off_width/all_loc/par_allele_cap/par_ids
    // without declaring them; the aliases here follow the member names used by
    // the other kernels in this section, and the `events` member assumed for
    // par_ids is a guess.
    sequence_int_type * par_seqs = parent_seqs->sequences;
    unsigned int par_width = parent_seqs->seq_width;
    sequence_int_type * off_seqs = offspring_seqs->sequences;
    unsigned int off_width = offspring_seqs->seq_width;
    real_type * all_loc = alleles->locations;
    unsigned int par_allele_cap = alleles->capacity;
    selection_int_type * par_ids = parent_ids->events;  // ASSUMED member name

    state_type local_state = states[ bid * tpb + tid ];

    __shared__ unsigned int s_event_hash[ 32 ];
    __shared__ real_type s_rand_pool[ 1024 ];
    __shared__ real_type s_pois_cdf[ poisson_type::MAX_K ];

    unsigned int max_k = pois->max_k;
    if( tid < poisson_type::MAX_K ) {
        s_pois_cdf[ tid ] = pois->_cdf[ tid ];
    }
    __syncthreads();

    while( bid < offspring_count ) {
        // draw a per-bin Poisson event count
        real_type r = curand_uniform( &local_state );
        unsigned int e = _find_poisson_maxk32( s_pois_cdf, r, max_k );
        __syncthreads();

        if( tid < 32 ) {
            s_event_hash[ tid ] = e;
        }
        __syncthreads();

        e = s_event_hash[ lane_id ];
        __syncthreads();

        unsigned int psum = e;

        // warp-wide maximum of the per-bin event counts
        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int _e = __shfl_up( e, i );
            e = ((_e > e) ? _e : e);
        }
        unsigned int max = __shfl( e, 31 );

        // warp-wide inclusive prefix sum of the event counts
        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int p = __shfl_up( psum, i );
            psum += (( lane_id >= i ) * p );
        }

        unsigned int base_count = __shfl_up( psum, 1 );
        if( lane_id == 0 ) base_count = 0;
        __syncthreads();

        // fill the random pool: one stratified position per (event, bin) pair
        for( unsigned int i = warp_id; i < max; i += wpb ) {    // warp divergence
            r = curand_uniform( &local_state );
            s_rand_pool[ i * 32 + lane_id ] = (((real_type) lane_id + r) / (real_type) 32);
        }
        __syncthreads();

        // set up parent and offspring indexes
        selection_int_type id = par_ids[ bid ]; // all threads in block share the same parent
        id <<= 1;

        unsigned int p0 = id * par_width + warp_id;
        unsigned int o_idx = bid * off_width + warp_id;

        unsigned int a_id = tid;
        while( a_id < par_allele_cap ) {    // true for all threads
            sequence_int_type p = par_seqs[ p0 ];
            sequence_int_type q = par_seqs[ p0 + par_width ];

            real_type a = all_loc[ a_id ];  // every thread reads a location just in case

            e = (psum - base_count);    // events in this lane's bin
            unsigned int x = base_count;    // events in the bins below
            for( unsigned int i = 0; i < max; ++i ) {   // should not diverge
                real_type y = s_rand_pool[ i * 32 + lane_id ];
                x += ((i < e) * (y < a));
            }
            __syncthreads();

            // pack each lane's crossover parity bit into a 32-bit mask
            x = ((x & 1) << lane_id);
            for( unsigned int i = 1; i < 32; i <<= 1 ) {
                unsigned int _x = __shfl_up( x, i );
                x |= ((lane_id >= i ) * _x );
            }

            if( lane_id == 31 ) {
                off_seqs[ o_idx ] = ((p & ~x) | (q & x));
            }
            __syncthreads();

            p0 += wpb;
            o_idx += wpb;
            a_id += tpb;
        }

        bid += bpg;
    }

    bid = blockIdx.y * gridDim.x + blockIdx.x;
    states[ bid * tpb + tid ] = local_state;    // write the advanced RNG state back
}
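// --- The kernels in this section repeatedly build warp-wide prefix sums and
// --- maxima with __shfl_up plus a (lane_id >= i) multiplier instead of a
// --- branch. A minimal sketch of that idiom in isolation, using the same
// --- pre-Volta (non-_sync) shuffle intrinsics as the surrounding code; the
// --- kernel and buffer names are illustrative. Launch with a single 32-thread
// --- warp.
__global__ void warp_scan_demo(unsigned int * data) {
  unsigned int lane_id = (threadIdx.x & 31);
  unsigned int v = data[threadIdx.x];
  // inclusive prefix sum across the warp
  for (unsigned int i = 1; i < 32; i <<= 1) {
    unsigned int p = __shfl_up(v, i);
    v += ((lane_id >= i) * p);   // lanes below i keep their value; no divergence
  }
  data[threadIdx.x] = v;         // lane k now holds the sum of lanes 0..k
}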
__global__ void truncnorm_kernel( float *x,    // vector to hold the returned samples
                                  int n,       // number of samples to return
                                  float mu,    // mean (scalar, shared by all samples)
                                  float sigma, // standard deviation
                                  float a,     // lower truncation bound (may be -inf)
                                  float b)     // upper truncation bound (may be +inf)
{
    // usual block/thread indexing...
    int myblock = blockIdx.x + blockIdx.y * gridDim.x;
    int blocksize = blockDim.x * blockDim.y * blockDim.z;
    int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x;
    int idx = myblock * blocksize + subthread;

    float tmp;          // candidate sample from the untruncated normal
    float z;            // exponential (Robert) proposal variable
    float phi;          // acceptance probability
    int accept = 0;     // accepted yet?
    int maxitr = 1000;  // maximum attempts at plain rejection from the untruncated normal

    // set up the RNG
    curandState state;
    curand_init(9131 + idx*17, idx, 0, &state);

    // draw the sample
    if (idx < n){
        int i = 0;
        if (isfinite(a) && !isfinite(b)){   // left truncation only
            float mu_ = (a - mu) / sigma;   // standardized left cut point
            float alpha = ( mu_ + sqrt(mu_ * mu_ + 4)) / 2; // optimal exponential rate

            // first try plain rejection from the untruncated normal...
            while(!accept && i < maxitr){
                tmp = mu + sigma * curand_normal(&state);
                if(tmp > a) { x[idx] = tmp; accept = 1; }
                else i++;
            }
            // ...then fall back to Robert's translated-exponential rejection
            while(!accept){
                z = mu_ - log(curand_uniform(&state))/alpha;
                if (mu_ < alpha){
                    phi = exp( -(alpha - z) * (alpha - z) / 2 );
                } else {
                    // unreachable with the optimal alpha above; kept for completeness
                    phi = exp( (mu_ - alpha) * (mu_ - alpha) / 2 ) * exp( -(alpha - z) * (alpha - z) / 2 );
                }
                if( curand_uniform(&state) < phi ){
                    x[idx] = z * sigma + mu;
                    accept = 1;
                }
            }
        } else if (isfinite(b) && !isfinite(a)){ // right truncation only: reflect and reuse the left-truncated sampler
            float mu_new = -mu;
            float mu_ = (-b - mu_new) / sigma;  // standardized cut point of the reflected problem
            float alpha = ( mu_ + sqrt(mu_ * mu_ + 4)) / 2; // optimal exponential rate

            while(!accept && i < maxitr){
                tmp = mu + sigma * curand_normal(&state);
                if(tmp < b) { x[idx] = tmp; accept = 1; }
                else i++;
            }
            while(!accept){
                z = mu_ - log(curand_uniform(&state))/alpha;
                if (mu_ < alpha){
                    phi = exp( -(alpha - z) * (alpha - z) / 2 );
                } else {
                    phi = exp( (mu_ - alpha) * (mu_ - alpha) / 2 ) * exp( -(alpha - z) * (alpha - z) / 2 );
                }
                if( curand_uniform(&state) < phi ){
                    x[idx] = -(z * sigma + mu_new);  // reflect back
                    accept = 1;
                }
            }
        } else if (!isfinite(a) && !isfinite(b)){ // no truncation
            x[idx] = mu + sigma * curand_normal(&state);
        } else if (isfinite(a) && isfinite(b)){ // two-sided truncation
            float mu_ = (a - mu) / sigma;
            float mu_plus = (b - mu) / sigma;

            while(!accept && i < maxitr){
                tmp = mu + sigma * curand_normal(&state);
                if(tmp <= b && tmp >= a) { x[idx] = tmp; accept = 1; }
                else i++;
            }
            // uniform proposal on [mu_, mu_plus] with the appropriate envelope
            while(!accept){
                float g;
                z = mu_ + (mu_plus - mu_) * curand_uniform(&state);
                if ( 0 >= mu_ && 0 <= mu_plus)  g = exp( -z * z / 2 );
                else if ( mu_plus < 0 )         g = exp( ( (mu_plus * mu_plus) - (z * z) ) / 2 );
                else /* 0 < mu_ */              g = exp( ( (mu_ * mu_) - (z * z) ) / 2 );
                if (curand_uniform(&state) < g){
                    x[idx] = z * sigma + mu;
                    accept = 1;
                }
            }
        }
    }
    return;
}
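// --- A minimal host-side launcher sketch for truncnorm_kernel, assuming a
// --- 1-D launch (the kernel's 3-D indexing reduces to this case); the block
// --- size and helper name are illustrative, not from the original code.
#include <cuda_runtime.h>

void truncnorm_sample(float * h_x, int n, float mu, float sigma, float a, float b) {
  float * d_x = 0;
  cudaMalloc(&d_x, n * sizeof(float));
  int threads = 256;
  int blocks = (n + threads - 1) / threads;     // one thread per sample
  truncnorm_kernel<<<blocks, threads>>>(d_x, n, mu, sigma, a, b);
  cudaMemcpy(h_x, d_x, n * sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(d_x);
}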
__global__ void crossover_kernel( StateType * states
                                , AlleleSpaceType * alleles
                                , device_free_space< IntType, unordered_tag > * free_space
                                , poisson_cdf< RealType, 32 > * pois
                                , device_sequence_space< IntType > * sequences
                                , clotho::utility::algo_version< 5 > * v ) {
    typedef StateType state_type;

    typedef AlleleSpaceType allele_space_type;
    typedef typename allele_space_type::real_type real_type;

    typedef device_sequence_space< IntType > sequence_space_type;
    typedef typename sequence_space_type::int_type int_type;

    typedef poisson_cdf< RealType, 32 > poisson_type;

    typedef xover_config< unordered_tag, 5 > xover_type;

    assert( blockDim.x == 32 && blockDim.y <= xover_type::MAX_WARPS );

    // encourage quick return
    unsigned int nAlleles = alleles->capacity;
    if( nAlleles == 0 ) return;

    // only considering 2d grid/block definitions
    int_type bid = blockIdx.y * gridDim.x + blockIdx.x;
    int_type tid = threadIdx.y * blockDim.x + threadIdx.x;

    unsigned int tpb = (blockDim.x * blockDim.y);
    unsigned int bpg = (gridDim.x * gridDim.y);

    real_type * allele_list = alleles->locations;
    int_type * seqs = sequences->sequences;
    int_type seq_width = sequences->seq_width;
    int_type nSequences = sequences->seq_count;

    const unsigned int MAX_EVENTS = 32;
    __shared__ int_type event_count;
    __shared__ real_type rand_pool[ MAX_EVENTS ];

    unsigned int seqIdx = bid; // 1 sequence per block
    while( seqIdx < nSequences ) {
        // use the first thread of each block to populate a pool of random numbers
        if( tid == 0 ) {
            state_type local_state = states[ bid ];

            // update the event_count; NOTE: there is no guard against a Poisson
            // draw above MAX_EVENTS, which would overrun rand_pool
            event_count = curand_poisson( &local_state, pois->lambda );

            for( unsigned int i = 0; i < event_count; ++i ) {
                rand_pool[ i ] = curand_uniform( &local_state );
            }
            states[ bid ] = local_state;
        }
        __syncthreads();

        // copy event_count from shared memory to thread-local memory
        unsigned int nEvents = event_count;  // same value for all threads

        if( nEvents == 0 ) {
            // no events: clear this sequence
            unsigned int id = tid;
            while( id < seq_width ) {
                seqs[ seqIdx * seq_width + id ] = 0;
                id += tpb;
            }
        } else {
            unsigned int id = tid;
            unsigned int offset = seqIdx * seq_width + threadIdx.y;
            while( id < nAlleles ) {
                real_type pos = allele_list[ id ];

                // count the recombination events below this allele's location
                unsigned int n = 0;
                for( unsigned int i = 0; i < nEvents; ++i ) {
                    n += ((rand_pool[ i ] < pos) ? 1 : 0);
                }
                __syncthreads();

                // pack each lane's parity bit into a 32-bit mask
                int_type mask = (1 << threadIdx.x);
                mask *= (n & 1);

                // scan left mask
                for( unsigned int i = 1; i < 32; i <<= 1 ) {
                    int_type _c = __shfl_up( mask, i );
                    mask |= ((threadIdx.x >= i ) * _c );
                }

                if( threadIdx.x == 31 ) {
                    seqs[ offset ] = mask;
                }
                __syncthreads();

                id += tpb;
                offset += blockDim.y;
            }
        }

        seqIdx += bpg;
        __syncthreads();
    }
}
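// --- The shuffle-based OR-scan above rebuilds, by hand, the word whose bit k
// --- is lane k's predicate. On the same pre-Volta intrinsic set, __ballot
// --- produces that word in a single instruction (in every lane, rather than
// --- only lane 31). A sketch of the equivalence; the function name is
// --- illustrative.
__device__ unsigned int pack_predicate_bits(unsigned int pred /* 0 or 1 */) {
  // bit k of the result is the predicate of lane k, which is exactly what the
  // shuffle-based OR-scan accumulates into lane 31
  return __ballot(pred != 0);
}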
__global__ void crossover_kernel( StateType * states
                                , AlleleSpaceType * alleles
                                , device_free_space< IntType, unordered_tag > * free_space
                                , poisson_cdf< RealType, 32 > * pois
                                , device_sequence_space< IntType > * sequences
                                , clotho::utility::algo_version< 4 > * v ) {
    typedef StateType state_type;

    typedef AlleleSpaceType allele_space_type;
    typedef typename allele_space_type::real_type real_type;

    typedef device_sequence_space< IntType > sequence_space_type;
    typedef typename sequence_space_type::int_type int_type;

    typedef poisson_cdf< RealType, 32 > poisson_type;

    typedef xover_config< unordered_tag, 4 > xover_type;

    assert( blockDim.x == 32 && blockDim.y <= xover_type::MAX_WARPS );

    int_type bid = blockIdx.y * gridDim.x + blockIdx.x;
    int_type tid = threadIdx.y * blockDim.x + threadIdx.x;

    unsigned int tpb = (blockDim.x * blockDim.y);
    assert( (tpb % allele_space_type::ALIGNMENT_SIZE) == 0 );

    unsigned int bpg = (gridDim.x * gridDim.y);

    unsigned int i;

    int_type sequence_width = sequences->seq_width;
    int_type nSequences = sequences->seq_count;
    int_type * seqs = sequences->sequences;
    //int_type cap = sequences->capacity;

    real_type * allele_list = alleles->locations;
    unsigned int nAlleles = alleles->capacity;
    assert( (nAlleles % allele_space_type::ALIGNMENT_SIZE) == 0 );

    if( nAlleles == 0 ) { return; }

    __shared__ real_type s_pois_cdf[ poisson_type::MAX_K ];
    __shared__ real_type rand_pool[ allele_space_type::ALIGNMENT_SIZE ];
    __shared__ unsigned int event_hash[ allele_space_type::ALIGNMENT_SIZE ];

    unsigned int max_k = pois->max_k;
    if( tid < poisson_type::MAX_K ) {
        s_pois_cdf[ tid ] = pois->_cdf[tid];
    }
    __syncthreads();

    state_type local_state = states[ bid * tpb + tid ];

    unsigned int nonzero_warp = (threadIdx.y != 0);
    unsigned int nonzero_thread = (tid != 0);

    int_type seq_idx = bid;
    while( seq_idx < nSequences ) {
        real_type x = curand_uniform( &local_state );  // x in (0, 1]

        // wrap x around to [0, 1) so that all event hash bins follow the [,) pattern
        rand_pool[ tid ] = ((x >= 1.0) ? 0.0 : x);

        x = curand_uniform( &local_state );
        int_type rand = _find_poisson_maxk32( s_pois_cdf, x, max_k );
        __syncthreads();

        // inclusive prefix sum of the event counts within each warp
        for( i = 1; i < 32; i <<= 1 ) {
            unsigned int r = __shfl_up( rand, i );
            rand += ( (threadIdx.x >= i ) * r );
        }

        if( threadIdx.x == 31 ) {
            event_hash[ threadIdx.y ] = rand;   // per-warp totals
        }
        __syncthreads();

        // prefix sum of the per-warp totals (the first warp's lanes hold them)
        unsigned int _sum = event_hash[ threadIdx.x ];
        _sum *= (threadIdx.x < blockDim.y);
        __syncthreads();

        for( i = 1; i < 32; i <<= 1 ) {
            unsigned int s = __shfl_up( _sum, i );
            _sum += (( threadIdx.x >= i ) * s);
        }

        unsigned int s = __shfl( _sum, 31 );    // total events for this sequence
        // assert( max_events < allele_space_type::ALIGNMENT_SIZE );

        if( s == 0 ) {  // true for all threads in the block, assuming 1 block per sequence
            // there are no events for this sequence, so simply clear its memory
            unsigned int seq_start = seq_idx * sequence_width + tid;
            unsigned int seq_end = (seq_idx + 1) * sequence_width;

            while( seq_start < seq_end ) {
                seqs[ seq_start ] = 0;
                seq_start += tpb;
            }
            __syncthreads();
        } else {
            // shift each warp's counts by the totals of the preceding warps
            s = __shfl( _sum, threadIdx.y - nonzero_warp );
            s *= nonzero_warp;
            __syncthreads();

            rand += s;
            event_hash[tid] = rand; // event_hash now contains a block-wide prefix sum (scan)
            __syncthreads();

            i = event_hash[ tid - nonzero_thread ]; // minimum event index for this thread's bin
            i *= nonzero_thread;
            __syncthreads();

            // BEGIN divergent code: stratify this thread's events into its bin
            while (i < rand) {
                x = rand_pool[ i ];
                rand_pool[ i++ ] = (((real_type) tid) + x) / ((real_type) allele_space_type::ALIGNMENT_SIZE);
            }
            __syncthreads();
            // END divergent code

            unsigned int seq_offset = seq_idx * sequence_width + threadIdx.y; // every thread in a warp has the same block offset

            i = tid;
            while( i < nAlleles ) {
                x = allele_list[ i ];

                rand = (unsigned int) ( x * ((real_type) allele_space_type::ALIGNMENT_SIZE));
                unsigned int nonzero_bin = (rand != 0);
                // nonzero_bin == 0 -> rand == 0 -> e_min == 0;
                // nonzero_bin == 1 -> rand != 0 -> e_min == event_hash[ rand - 1 ]
                //
                // each thread reads the hash (and random pool) relative to its
                // local allele location (x)
                //
                // this code will result in bank conflicts
                //
                // initial performance results suggest that this is an acceptable
                // overhead, as the overall runtime of the simulation loop is
                // minimized when this algorithm is used
                unsigned int e_max = event_hash[ rand ];
                unsigned int e_min = event_hash[ rand - nonzero_bin ];
                e_min *= nonzero_bin;

                int_type cmask = e_min;

                // BEGIN divergent code: count this bin's events below x
                while( e_min < e_max ) {
                    real_type y = rand_pool[ e_min++ ];
                    cmask += (y < x);
                }
                __syncthreads();
                // END divergent code

                // pack each lane's parity bit into a 32-bit mask
                cmask = ((cmask & 1) << threadIdx.x);
                for( unsigned int j = 1; j < 32; j <<= 1 ) {
                    int_type _c = __shfl_up( cmask, j );
                    cmask |= ((threadIdx.x >= j) * _c);
                }

                if( threadIdx.x == 31 ) {
                    // assert( seq_offset < cap );
                    seqs[ seq_offset ] = cmask;
                }
                __syncthreads();

                i += tpb;
                seq_offset += blockDim.y;   // wpb
            }
            __syncthreads();
        }
        seq_idx += bpg;
    }

    states[ bid * tpb + tid ] = local_state;
}
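// --- _find_poisson_maxk32 is called throughout this section but never shown.
// --- A plausible device-side sketch, ASSUMED from its call sites: given a
// --- Poisson CDF table in shared memory and a uniform draw r, return the
// --- smallest k with cdf[k] >= r, capped at max_k. Treat this as a guess at
// --- the helper's contract, not its actual implementation; the _sketch suffix
// --- marks it as such.
template< typename RealType >
__device__ unsigned int _find_poisson_maxk32_sketch( RealType * cdf
                                                   , RealType r
                                                   , unsigned int max_k ) {
  unsigned int k = 0;
  while( k < max_k && cdf[ k ] < r ) { ++k; }   // linear scan over at most 32 bins
  return k;
}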
__global__ void crossover_kernel( StateType * states
                                , AlleleSpaceType * alleles
                                , device_free_space< IntType, unordered_tag > * free_space
                                , poisson_cdf< RealType, 32 > * pois
                                , device_sequence_space< IntType > * sequences
                                , clotho::utility::algo_version< 3 > * v ) {
    typedef StateType state_type;

    typedef AlleleSpaceType allele_space_type;
    typedef typename allele_space_type::real_type real_type;

    typedef device_sequence_space< IntType > sequence_space_type;
    typedef typename sequence_space_type::int_type int_type;

    typedef xover_config< unordered_tag, 3 > xover_type;

    typedef poisson_cdf< RealType, 32 > poisson_type;

    assert( blockDim.y <= xover_type::MAX_WARPS && blockDim.x == 32 ); // 8 or fewer full warps

    unsigned int nAlleles = alleles->capacity;
    if( nAlleles == 0 ) { return; }
    assert( (nAlleles & 31) == 0 ); // nAlleles == 32 * m

    const unsigned int MAX_EVENTS_PER_WARP = 128;   // maximum number of recombination events per sequence

    __shared__ real_type rand_pool[ MAX_EVENTS_PER_WARP * xover_type::MAX_WARPS ];  // one 128-event pool segment per warp
    __shared__ real_type s_pois_cdf[ poisson_type::MAX_K ]; // 4 * 32 == 128 bytes

    unsigned int max_k = pois->max_k;
    if( threadIdx.y == 0 ) {    // use the first warp to read the poisson CDF
        s_pois_cdf[ threadIdx.x ] = pois->_cdf[threadIdx.x];
    }
    __syncthreads();

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;

    unsigned int tpb = (blockDim.x * blockDim.y);
    unsigned int bpg = (gridDim.x * gridDim.y);
    unsigned int spg = bpg * blockDim.y;    // sequences per grid pass

    int_type seq_width = sequences->seq_width;
    int_type nSequences = sequences->seq_count;
    int_type * seqs = sequences->sequences;
    real_type * allele_list = alleles->locations;

    unsigned int seq_idx = bid * blockDim.y + threadIdx.y;  // one sequence per warp
    state_type local_state = states[ seq_idx * 32 + threadIdx.x ];  // each warp (sequence) owns a block of 32 states, one per lane

    // round the sequence count up to a multiple of the warps per block
    unsigned int max_seqs = nSequences / blockDim.y;
    max_seqs += (( nSequences % blockDim.y) ? 1 : 0);
    max_seqs *= blockDim.y;

    while( seq_idx < max_seqs ) {
        real_type x = curand_uniform( &local_state );
        unsigned int rand = _find_poisson_maxk32( s_pois_cdf, x, max_k );
        __syncthreads();

        // inclusive prefix sum of the per-lane event counts
        for(unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int t = __shfl_up( rand, i );
            rand += ((threadIdx.x >= i) * t);
        }

        unsigned int max_events = __shfl( rand, 31 );

        // fill random pool
        if( max_events >= MAX_EVENTS_PER_WARP ) {
            if( threadIdx.x == 0 ) {
                printf( "Too many events to generate: %d\n", max_events );
            }
            assert( max_events < MAX_EVENTS_PER_WARP );
        }
        __syncthreads();

        rand_pool[ tid ] = curand_uniform( &local_state );
        rand_pool[ tid + tpb ] = curand_uniform( &local_state );
        rand_pool[ tid + 2 * tpb ] = curand_uniform( &local_state );
        rand_pool[ tid + 3 * tpb ] = curand_uniform( &local_state );
        __syncthreads();

        unsigned int seq_offset = seq_idx * seq_width;
        unsigned int a_idx = threadIdx.x;
        while( a_idx < nAlleles ) {
            real_type loc = allele_list[ a_idx ];

            unsigned int s = 0, mask = 0;
            while( s < max_events ) {   // warps within a block diverge; threads within a warp do not
                real_type y = rand_pool[ s++ ]; // every thread within a warp reads/loads the same value
                mask += ( y < loc );
            }
            __syncthreads();

            // pack each lane's parity bit into a 32-bit mask
            mask = ((mask & 1) << threadIdx.x);
            for( unsigned int i = 1; i < 32; i <<= 1 ) {
                unsigned int e = __shfl_up( mask, i );
                mask |= ((threadIdx.x >= i) * e);
            }

            if( threadIdx.x == 31 && seq_idx < nSequences ) {
                seqs[ seq_offset ] = mask;
            }
            __syncthreads();

            a_idx += 32;
            ++seq_offset;
        }
        seq_idx += spg;
    }

    seq_idx = bid * blockDim.y + threadIdx.y;   // reset seq_idx
    states[ seq_idx * 32 + threadIdx.x ] = local_state;
}
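// --- All of these kernels consume a pre-initialized array of cuRAND states,
// --- one per thread, and write the advanced states back on exit. A minimal
// --- setup sketch of the kind they assume; the kernel name and seed handling
// --- are illustrative, not from the original code.
#include <curand_kernel.h>

__global__ void setup_states_sketch( curandState_t * states, unsigned long long seed ) {
  unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
  unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
  unsigned int idx = bid * (blockDim.x * blockDim.y) + tid;
  // distinct subsequence per thread, same seed for the whole grid
  curand_init( seed, idx, 0, &states[ idx ] );
}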
__global__ void crossover_kernel( StateType * states
                                , AlleleSpaceType * alleles
                                , device_free_space< IntType, unordered_tag > * free_space
                                , poisson_cdf< RealType, 32 > * pois
                                , device_sequence_space< IntType > * sequences
                                , clotho::utility::algo_version< 2 > * v ) {
    typedef StateType state_type;

    typedef AlleleSpaceType allele_space_type;
    typedef typename allele_space_type::real_type real_type;

    typedef device_sequence_space< IntType > sequence_space_type;
    typedef typename sequence_space_type::int_type int_type;

    typedef poisson_cdf< RealType, 32 > poisson_type;

    unsigned int nAlleles = alleles->capacity;
    if( nAlleles == 0 ) { return; }
    assert( (nAlleles & 31) == 0 ); // multiple of 32 alleles

    const unsigned int MAX_EVENTS_PER_WARP = 64;    // maximum number of recombination events per sequence
    const unsigned int MAX_WARP_PER_BLOCK = 32;
    const unsigned int HASH_WIDTH = 32;

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int lane_id = (tid & 31);
    unsigned int warp_id = (tid >> 5);

    unsigned int tpb = (blockDim.x * blockDim.y);
    assert( (tpb & 31) == 0 );  // all warps are full

    unsigned int bpg = (gridDim.x * gridDim.y);

    unsigned int wpb = (tpb >> 5);  // == sequences/block (spb)
    assert( wpb <= MAX_WARP_PER_BLOCK );

    unsigned int spg = bpg * wpb;

    int_type sequence_width = sequences->seq_width;
    int_type nSequences = sequences->seq_count;

    int_type * seqs = sequences->sequences;
    real_type * allele_list = alleles->locations;

    __shared__ real_type s_pois_cdf[ poisson_type::MAX_K ];     // 4 * 32 == 128 bytes
    __shared__ real_type rand_pool[ MAX_WARP_PER_BLOCK * MAX_EVENTS_PER_WARP ]; // 4 * 32 * 64 == 8192 bytes
    __shared__ unsigned int event_hash[ MAX_WARP_PER_BLOCK * HASH_WIDTH ];  // 4 * 32 * 32 == 4096 bytes
    //----------------------
    // 12416 bytes of shared memory

    unsigned int max_k = pois->max_k;
    if( tid < poisson_type::MAX_K ) {
        s_pois_cdf[ tid ] = pois->_cdf[tid];
    }
    __syncthreads();

    state_type local_state = states[ bid * tpb + tid ]; // assumes every thread in the GRID has a state defined

    // round the sequence count up to a multiple of the warps per block
    unsigned int max_seq_id = nSequences / wpb;
    max_seq_id += (( nSequences % wpb ) ? 1 : 0);
    max_seq_id *= wpb;

    int_type seq_idx = bid * wpb + warp_id; // one sequence per warp
    while( seq_idx < max_seq_id ) { // should allow blocks to terminate early
        // generate a recombination hash for each sequence (warp)
        real_type x = curand_uniform( &local_state );
        unsigned int rand = _find_poisson_maxk32( s_pois_cdf, x, max_k );
        __syncthreads();

        // compute an inclusive prefix sum within each warp
        for( unsigned int i = 1; i < 32; i <<= 1 ) {
            unsigned int r = __shfl_up( rand, i );
            rand += ( (lane_id >= i ) * r );
        }

        event_hash[ tid ] = rand;   // event_hash contains the per-warp prefix sums (scan)

        unsigned int s = __shfl_up( rand, 1 );
        s *= ( lane_id != 0 );  // lane 0's bin starts at event 0 (__shfl_up returns the lane's own value for lane 0)

        // shift s and rand to be relative to this warp's pool segment
        s += (warp_id * MAX_EVENTS_PER_WARP);
        rand += (warp_id * MAX_EVENTS_PER_WARP);

        // BEGIN divergent code: fill this lane's bin with already-sorted uniform
        // positions via the order-statistics (scaled log-uniform) recurrence
        real_type accum = 0.;
        while (s < rand) {
            x = curand_uniform( &local_state );
            accum += (log( x ) / (real_type)(rand - s));
            rand_pool[s++] = ((((real_type)lane_id) + (1.0 - exp(accum))) / ((real_type)32.));
        }
        __syncthreads();
        // END divergent code

        unsigned int seq_offset = seq_idx * sequence_width;
        unsigned int a_id = lane_id;
        while( a_id < nAlleles ) {
            real_type x = allele_list[ a_id ];

            unsigned int h_idx = (unsigned int) ( x * ((real_type) HASH_WIDTH ));   // map allele location to bin index
            unsigned int nonzero_bin = ( h_idx != 0 );

            // read this warp's hash segment: events before the bin end at
            // event_hash[ h_idx - 1 ] (0 for bin 0); the bin's events end at event_hash[ h_idx ]
            unsigned int e = event_hash[ warp_id * HASH_WIDTH + h_idx ];
            unsigned int s = event_hash[ warp_id * HASH_WIDTH + h_idx - nonzero_bin ];
            s *= nonzero_bin;
            __syncthreads();

            int_type cmask = s;

            s += (warp_id * MAX_EVENTS_PER_WARP);
            e += (warp_id * MAX_EVENTS_PER_WARP);

            // BEGIN divergent code: count this bin's events below x
            while( s < e ) {
                real_type y = rand_pool[ s++ ];
                cmask += (x > y);
            }
            __syncthreads();
            // END divergent code

            cmask = ((cmask & 1) << lane_id);

            // reduce cmask within each warp
            for( unsigned int i = 1; i < 32; i <<= 1 ) {
                int_type _c = __shfl_up( cmask, i );
                cmask |= ((lane_id >= i) * _c);
            }

            if( lane_id == 31 && seq_idx < nSequences ) {
                seqs[seq_offset] = cmask;
            }
            __syncthreads();

            a_id += 32;
            ++seq_offset;
        }
        __syncthreads();

        seq_idx += spg;
    }

    states[ bid * tpb + tid ] = local_state;
}
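// --- Version 2 fills each bin of its random pool with *already sorted* uniform
// --- positions by accumulating scaled log-uniforms instead of sorting. A
// --- host-side sketch of that order-statistics trick in isolation (the
// --- function name and use of rand() are illustrative): it generates n
// --- uniforms in ascending order in a single pass.
#include <cmath>
#include <cstdlib>

void sorted_uniforms_sketch(double * out, unsigned int n) {
  double accum = 0.0;
  for (unsigned int k = 0; k < n; ++k) {
    double u = ((double) rand() + 1.0) / ((double) RAND_MAX + 2.0);  // u in (0, 1)
    accum += std::log(u) / (double)(n - k);  // running sum of log(U)/remaining-count
    out[k] = 1.0 - std::exp(accum);          // ascending uniform order statistics in [0, 1)
  }
}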
template<>
__device__ void generate_uniform<cfloat>(cfloat *cval, curandState_t *state) {
  // complex float: independent uniforms for the real and imaginary parts
  cval->x = curand_uniform(state);
  cval->y = curand_uniform(state);
}

template<>
__device__ void generate_uniform<float>(float *val, curandState_t *state) {
  *val = curand_uniform(state);
}

template<>
__device__ void generate_uniform<char>(char *val, curandState_t *state) {
  // char: a fair random bit (0 or 1)
  *val = curand_uniform(state) > 0.5;
}
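// --- The primary template for generate_uniform is not shown in this section.
// --- A sketch of how these specializations are presumably declared and used;
// --- the declaration and the fill kernel below are assumptions made for
// --- illustration, not the library's actual API.
#include <curand_kernel.h>

template< typename T >
__device__ void generate_uniform(T *val, curandState_t *state);  // assumed primary declaration

template< typename T >
__global__ void fill_uniform_sketch(T * out, curandState_t * states, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    generate_uniform<T>(&out[idx], &states[idx]);  // dispatches to the specializations above
  }
}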
__global__ void _scatter_mutations( StateType * states
                                  , device_free_space< IntType, unit_ordered_tag< IntType > > * fspace
                                  , device_event_space< IntType, unit_ordered_tag< IntType > > * events
                                  , device_sequence_space< IntType > * sequences
                                  , AlleleSpaceType * alleles ) {
    typedef StateType state_type;

    typedef device_event_space< IntType, unit_ordered_tag< IntType > > event_space_type;
    typedef typename event_space_type::order_tag_type order_tag_type;
    typedef typename event_space_type::size_type size_type;
    typedef typename event_space_type::int_type int_type;

    typedef float real_type;

    assert( (blockDim.x * blockDim.y) == 1 );   // exactly one thread per block

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned int bpg = gridDim.x * gridDim.y;

    // treat groups of 32 single-thread blocks like a "warp":
    // lane_id selects a bit lane, warp_id selects an event within that lane
    unsigned int wpb = (bpg >> 5);
    unsigned int lane_id = (bid & 31);
    unsigned int warp_id = (bid >> 5);

    unsigned int lane_max = events->bin_summary[ lane_id ];

    if( warp_id >= lane_max ) return;

    unsigned int R = sequences->seq_count;
    unsigned int _width = sequences->seq_width;

    unsigned int * flist = fspace->free_list;

    state_type local_state = states[ bid ];

    int_type * seqs = sequences->sequences;
    order_tag_type * otag = NULL;

    unsigned int lidx = 0, k = 0;
    while( warp_id < lane_max ) {   // only one thread per block, so no divergence
        // scan the free list for the k-th free index in this lane
        while( warp_id + 1 != k && lidx < _width ) {
            int_type f = flist[ lidx++ ];
            if( ((f >> lane_id) & 1) ) {
                ++k;
            }
        }

        if( warp_id + 1 == k ) {
            // pick a random sequence to receive the mutation
            real_type r = curand_uniform( &local_state );
            unsigned int idx = r * R;
            idx *= (idx != R);  // idx == R -> idx = idx * 0 == 0
            idx *= _width;
            idx += (lidx - 1);

            // set the free bit in the chosen sequence block
            int_type b = seqs[ idx ];
            b |= ( 1 << lane_id );
            seqs[ idx ] = b;

            idx = (lidx - 1) * 32 + lane_id;
            _generate_random_allele( &local_state, alleles, idx, otag );
        }

        warp_id += wpb;
    }

    states[ bid ] = local_state;    // write the advanced RNG state back
}
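// --- This kernel asserts exactly one thread per block and distributes work
// --- across blocks rather than threads. A sketch of the launch shape it
// --- expects; the helper name and grid dimensions are illustrative.
template< typename StateType, typename IntType, typename AlleleSpaceType >
void launch_scatter_mutations_sketch( StateType * d_states
                                    , device_free_space< IntType, unit_ordered_tag< IntType > > * d_fspace
                                    , device_event_space< IntType, unit_ordered_tag< IntType > > * d_events
                                    , device_sequence_space< IntType > * d_sequences
                                    , AlleleSpaceType * d_alleles ) {
    dim3 grid( 64, 64 );    // bid == blockIdx.y * gridDim.x + blockIdx.x
    dim3 block( 1, 1 );     // the kernel asserts blockDim.x * blockDim.y == 1
    _scatter_mutations<<< grid, block >>>( d_states, d_fspace, d_events, d_sequences, d_alleles );
}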
__global__ void _scatter_mutations( StateType * states
                                  , device_free_space< IntType, unordered_tag > * fspace
                                  , device_event_space< IntType, unordered_tag > * events
                                  , device_sequence_space< IntType > * sequences
                                  , AlleleSpaceType * alleles ) {
    typedef StateType state_type;

    typedef device_sequence_space< IntType > sequence_type;

    typedef device_event_space< IntType, unordered_tag > event_space_type;
    typedef typename event_space_type::size_type size_type;
    typedef typename event_space_type::int_type int_type;

    typedef float real_type;    // does not need the same precision as AlleleSpaceType

    assert( (blockDim.x*blockDim.y) == 1 );     // exactly one thread per block

    unsigned int bid = blockIdx.y * gridDim.x + blockIdx.x;

    size_type tot = events->total;
    if( bid >= tot ) return;

    unsigned int _width = sequences->seq_width;
    if( _width == 0 ) return;

    // if( bid == 0 )
    //     printf( "%d Free Space; %d New Mutation\n", fspace->total, tot );

    assert( tot <= fspace->total ); // true for all blocks (enough free bits for the new mutations)

    fixed_width_converter< sequence_type::OBJECTS_PER_INT > converter;

    unsigned int bpg = gridDim.x * gridDim.y;

    unsigned int R = sequences->seq_count;

    unsigned int * fmap = fspace->free_map;

    state_type local_state = states[ bid ];

    unordered_tag * otag = NULL;

    int_type * seqs = sequences->sequences;

    while( bid < tot ) {
        unsigned int fidx = fmap[ bid ];

        unsigned int block_idx = converter.major_offset( fidx );
        unsigned int bit_idx = converter.minor_offset( fidx );

        // pick a random sequence to receive the mutation
        real_type r = curand_uniform( &local_state );
        unsigned int idx = r * R;
        idx *= (idx != R);  // idx == R -> idx = idx * 0 == 0
        idx *= _width;
        idx += block_idx;

        // TODO: need to perform a bitonic sort to identify whether multiple
        // bit indices within the same block are being updated
        int_type b = seqs[ idx ];
        b |= ( 1 << bit_idx );
        seqs[ idx ] = b;

        _generate_random_allele( &local_state, alleles, fidx, otag );

        bid += bpg;
    }

    bid = blockIdx.y * gridDim.x + blockIdx.x;  // restore this block's index before saving the RNG state
    states[ bid ] = local_state;
}
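// --- _generate_random_allele is called by both _scatter_mutations kernels but
// --- is not shown here. A hypothetical sketch consistent with its call sites
// --- (state, allele space, target index, ordering tag). The `locations`
// --- member follows the other snippets in this section; everything else,
// --- including the _sketch name, is an assumption.
template< typename StateType, typename AlleleSpaceType, typename TagType >
__device__ void _generate_random_allele_sketch( StateType * state
                                              , AlleleSpaceType * alleles
                                              , unsigned int idx
                                              , TagType * /* ordering tag, unused here */ ) {
  // draw a fresh location in (0, 1] for the new mutation
  alleles->locations[ idx ] = curand_uniform( state );
}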
__device__ inline float operator ()(float data, curandState* state) {
  // the incoming value is ignored; return a fresh uniform draw in (0, 1]
  return curand_uniform(state);
}
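// --- A sketch of how a functor with this operator() might be applied
// --- elementwise, pairing each element with its own RNG state. The functor
// --- type parameter and kernel name are illustrative assumptions.
#include <curand_kernel.h>

template< typename Functor >
__global__ void transform_with_rng_sketch( float * data, curandState * states, int n, Functor f ) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    data[idx] = f( data[idx], &states[idx] );  // here: overwrite with uniform noise
  }
}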