__global__ void benchmark_func(hipLaunchParm lp, T seed, T *g_data){
    functor_mad<T> mad_op;
    const unsigned int blockSize = blockdim;
    const int stride = blockSize;
    int idx = hipBlockIdx_x*blockSize*granularity + hipThreadIdx_x;
    T tmps[granularity];
    #pragma unroll
    for(int j=0; j<granularity; j++){
        // Load elements (memory intensive part)
        tmps[j] = g_data[idx+j*stride];
        // Perform computations (compute intensive part)
        for(int i=0; i<compute_iterations; i++){
            tmps[j] = mad_op(tmps[j], tmps[j], seed);
        }
    }
    // Multiply add reduction
    T sum = (T)0;
    #pragma unroll
    for(int j=0; j<granularity; j+=2)
        sum += tmps[j]*tmps[j+1];
    // Dummy code
    if( sum==(T)-1 )  // Designed so it never executes
        g_data[idx] = sum;
}
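// --- Host-side launch sketch (illustrative, not from the original source) ---
// Each thread loads 'granularity' elements with a stride of one block, so a
// buffer of N elements needs N/(blockdim*granularity) blocks. The element type
// (float), the buffer size and the legacy hipLaunchKernel macro that accompanies
// the hipLaunchParm signature are assumptions here; current HIP would drop the
// lp argument and launch with hipLaunchKernelGGL instead.
void run_benchmark_sketch(size_t N, float seed){
    float *d_data = nullptr;
    hipMalloc(&d_data, N*sizeof(float));
    const dim3 block(blockdim);                      // blockdim as used by the kernel
    const dim3 grid(N/(blockdim*granularity));       // one element per thread per unrolled step
    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func<float>),
                    grid, block, 0, 0, seed, d_data);
    hipDeviceSynchronize();
    hipFree(d_data);
}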
__global__ void benchmark_func(hipLaunchParm lp, T seed, volatile T *g_data){
    functor_mad<T> mad_op;
    const int index_stride = blockdim;
    const int index_base = hipBlockIdx_x*blockdim*UNROLLED_MEMORY_ACCESSES + hipThreadIdx_x;
    const int halfarraysize = hipGridDim_x*blockdim*UNROLLED_MEMORY_ACCESSES;
    const int offset_slips = 1+UNROLLED_MEMORY_ACCESSES-((memory_ratio+1)/2);
    const int array_index_bound = index_base+offset_slips*index_stride;
    const int initial_index_range = memory_ratio>0 ? UNROLLED_MEMORY_ACCESSES % ((memory_ratio+1)/2) : 1;
    int initial_index_factor = 0;
    volatile T *data = g_data;
    int array_index = index_base;
    T r0 = seed + hipBlockIdx_x * blockdim + hipThreadIdx_x,
      r1 = r0+(T)(2),  r2 = r0+(T)(3),  r3 = r0+(T)(5),  r4 = r0+(T)(7),
      r5 = r0+(T)(11), r6 = r0+(T)(13), r7 = r0+(T)(17);
    for(int j=0; j<COMP_ITERATIONS; j+=UNROLL_ITERATIONS){
        // Compute phase: UNROLL_ITERATIONS-memory_ratio iterations of pure MADs on 8 registers
        #pragma unroll
        for(int i=0; i<UNROLL_ITERATIONS-memory_ratio; i++){
            r0 = mad_op(r0, r0, r4);
            r1 = mad_op(r1, r1, r5);
            r2 = mad_op(r2, r2, r6);
            r3 = mad_op(r3, r3, r7);
            r4 = mad_op(r4, r4, r0);
            r5 = mad_op(r5, r5, r1);
            r6 = mad_op(r6, r6, r2);
            r7 = mad_op(r7, r7, r3);
        }
        // Memory phase: memory_ratio iterations, alternating writes and reads
        bool do_write = true;
        int reg_idx = 0;
        #pragma unroll
        for(int i=UNROLL_ITERATIONS-memory_ratio; i<UNROLL_ITERATIONS; i++){
            // Each iteration maps to one memory operation
            T& r = reg_idx==0 ? r0 : (reg_idx==1 ? r1 : (reg_idx==2 ? r2 : (reg_idx==3 ? r3 :
                   (reg_idx==4 ? r4 : (reg_idx==5 ? r5 : (reg_idx==6 ? r6 : r7))))));
            if( do_write )
                data[ array_index+halfarraysize ] = r;
            else {
                r = data[ array_index ];
                if( ++reg_idx>=REGBLOCK_SIZE )
                    reg_idx = 0;
                array_index += index_stride;
            }
            do_write = !do_write;
        }
        if( array_index >= array_index_bound ){
            if( ++initial_index_factor > initial_index_range )
                initial_index_factor = 0;
            array_index = index_base + initial_index_factor*index_stride;
        }
    }
    // Dummy write so the compiler cannot eliminate the computation
    if( (r0==GPU_INF(T)) && (r1==GPU_INF(T)) && (r2==GPU_INF(T)) && (r3==GPU_INF(T)) &&
        (r4==GPU_INF(T)) && (r5==GPU_INF(T)) && (r6==GPU_INF(T)) && (r7==GPU_INF(T)) ){
        // extremely unlikely to happen
        g_data[0] = r0+r1+r2+r3+r4+r5+r6+r7;
    }
}
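// --- Illustrative helper (not part of the benchmark) ---
// The kernel above interleaves UNROLL_ITERATIONS-memory_ratio all-MAD iterations
// (8 MADs each) with memory_ratio iterations that each issue one access of
// sizeof(T) bytes, so memory_ratio directly sets the operational intensity of one
// unrolled block. Assuming a MAD counts as two floating-point operations, a rough
// estimate is:
template <class T>
constexpr double operational_intensity(int unroll_iterations, int memory_ratio){
    const double flops = (unroll_iterations - memory_ratio) * 8.0 * 2.0;  // 8 registers, mul+add each
    const double bytes = memory_ratio * static_cast<double>(sizeof(T));   // one access per memory iteration
    return memory_ratio > 0 ? flops/bytes : 0.0;  // 0.0 flags the pure-compute configuration
}
// e.g. operational_intensity<float>(32, 8) == 12 FLOPs per byte for 4-byte elements.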