__global__ void benchmark_func(hipLaunchParm lp, T seed, T *g_data){
	functor_mad<T> mad_op;
	const unsigned int blockSize = blockdim;
	const int stride = blockSize;
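	// Each block covers blockSize*granularity elements; each thread touches 'granularity' of them, 'stride' apart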
	int idx = hipBlockIdx_x*blockSize*granularity + hipThreadIdx_x;

	T tmps[granularity];
	#pragma unroll
	for(int j=0; j<granularity; j++){
		// Load elements (memory intensive part)
		tmps[j] = g_data[idx+j*stride];
		// Perform computations (compute intensive part)
		for(int i=0; i<compute_iterations; i++){
			tmps[j] = mad_op(tmps[j], tmps[j], seed);
		}
	}
	// Multiply add reduction
	T sum = (T)0;
	#pragma unroll
	for(int j=0; j<granularity; j+=2)
		sum += tmps[j]*tmps[j+1];
	// Dummy code
	if( sum==(T)-1 ) // Designed so it never executes
		g_data[idx] = sum;
}
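Both examples call a functor_mad<T> functor whose definition is not part of the listing. A minimal sketch consistent with how it is used above (a three-operand multiply-add) could look like the following; the original definition may differ in detail:

// Hypothetical sketch of functor_mad<T>; the original definition is not shown in this listing.
template <class T>
struct functor_mad {
	__device__ T operator()(T a, T b, T c) const {
		return a * b + c;   // multiply-add: a*b + c
	}
};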
Example #2
__global__ void
benchmark_func(hipLaunchParm lp, T seed, volatile T *g_data){
	functor_mad<T> mad_op;
	const int index_stride = blockdim;
	const int index_base = hipBlockIdx_x*blockdim*UNROLLED_MEMORY_ACCESSES + hipThreadIdx_x;
	const int halfarraysize = hipGridDim_x*blockdim*UNROLLED_MEMORY_ACCESSES;
	const int offset_slips = 1+UNROLLED_MEMORY_ACCESSES-((memory_ratio+1)/2);
	const int array_index_bound = index_base+offset_slips*index_stride;
	const int initial_index_range = memory_ratio>0 ? UNROLLED_MEMORY_ACCESSES % ((memory_ratio+1)/2) : 1;
	int initial_index_factor = 0;
	volatile T *data = g_data;

	int array_index = index_base;
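	// Seed eight registers with distinct per-thread starting values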
	T r0 = seed + hipBlockIdx_x * blockdim + hipThreadIdx_x,
	  r1 = r0+(T)(2),
	  r2 = r0+(T)(3),
	  r3 = r0+(T)(5),
	  r4 = r0+(T)(7),
	  r5 = r0+(T)(11),
	  r6 = r0+(T)(13),
	  r7 = r0+(T)(17);

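	// Main loop: each unrolled block performs (UNROLL_ITERATIONS - memory_ratio) compute iterations followed by memory_ratio memory accesses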
	for(int j=0; j<COMP_ITERATIONS; j+=UNROLL_ITERATIONS){
		#pragma unroll
		for(int i=0; i<UNROLL_ITERATIONS-memory_ratio; i++){
			r0 = mad_op(r0, r0, r4);
			r1 = mad_op(r1, r1, r5);
			r2 = mad_op(r2, r2, r6);
			r3 = mad_op(r3, r3, r7);
			r4 = mad_op(r4, r4, r0);
			r5 = mad_op(r5, r5, r1);
			r6 = mad_op(r6, r6, r2);
			r7 = mad_op(r7, r7, r3);
		}
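		// Memory-intensive phase: alternate a write to the upper half of the array with a read from the lower half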
		bool do_write = true;
		int reg_idx = 0;
		#pragma unroll
		for(int i=UNROLL_ITERATIONS-memory_ratio; i<UNROLL_ITERATIONS; i++){
			// Each iteration maps to one memory operation
			T& r = reg_idx==0 ? r0 : (reg_idx==1 ? r1 : (reg_idx==2 ? r2 : (reg_idx==3 ? r3 :
			       (reg_idx==4 ? r4 : (reg_idx==5 ? r5 : (reg_idx==6 ? r6 : r7))))));
			if( do_write )
				data[ array_index+halfarraysize ] = r;
			else {
				r = data[ array_index ];
				if( ++reg_idx>=REGBLOCK_SIZE )
					reg_idx = 0;
				array_index += index_stride;
			}
			do_write = !do_write;
		}
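		// Wrap the traversal back toward the thread's base index once it passes its bound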
		if( array_index >= array_index_bound ){
			if( ++initial_index_factor > initial_index_range)
				initial_index_factor = 0;
			array_index = index_base + initial_index_factor*index_stride;
		}
	}
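	// Conditional dummy write keeps the results live so the compiler cannot eliminate the computation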
	if( (r0==GPU_INF(T)) && (r1==GPU_INF(T)) && (r2==GPU_INF(T)) && (r3==GPU_INF(T)) &&
	    (r4==GPU_INF(T)) && (r5==GPU_INF(T)) && (r6==GPU_INF(T)) && (r7==GPU_INF(T)) ){ // extremely unlikely to happen
		g_data[0] = r0+r1+r2+r3+r4+r5+r6+r7;
	}
}
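For context, a possible host-side launch of the second kernel is sketched below. It is an assumption, not part of the original source: it presumes benchmark_func is a template over the element type and the compile-time constants used above (blockdim, memory_ratio), it uses the current hipLaunchKernelGGL API (under which the deprecated hipLaunchParm parameter would be dropped from the kernel signature), and it picks arbitrary values for the grid size and for UNROLLED_MEMORY_ACCESSES when sizing the buffer.

// Hypothetical launch sketch; template arguments, buffer size, and grid sizing are assumptions.
#include <hip/hip_runtime.h>

int main(){
	constexpr unsigned int BLOCKDIM = 256;              // assumed block size
	constexpr unsigned int GRID = 1024;                 // assumed grid size
	constexpr unsigned int ACCESSES = 32;               // assumed value of UNROLLED_MEMORY_ACCESSES
	const size_t elements = 2ull * GRID * BLOCKDIM * ACCESSES;  // two halves of the traversed array

	double *d_data = nullptr;
	hipMalloc((void**)&d_data, elements * sizeof(double));
	hipMemset(d_data, 0, elements * sizeof(double));

	// Template parameters are illustrative: element type, blockdim, memory_ratio.
	hipLaunchKernelGGL(HIP_KERNEL_NAME(benchmark_func<double, BLOCKDIM, 8>),
	                   dim3(GRID), dim3(BLOCKDIM), 0, 0,
	                   1.0, d_data);

	hipDeviceSynchronize();
	hipFree(d_data);
	return 0;
}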