Exemplo n.º 1
0
void scan(icl_buffer *data, icl_buffer *flag, UINT n) {
	// use size for actual n, not overapproximation as in allocation
	UINT numWorkGroups = ((n + wx - 1) / wx);
	UINT sizeScanBuff = ((numWorkGroups + wx -1) / wx) * wx;

	size_t gx = numWorkGroups * wx;
	INT init = -1;

#if TIMING
	clFinish(dev->queue);
	icl_start_timer(timer);
#endif

	icl_run_kernel(perBlockScanByKey, 1, &gx, &wx, NULL, perBlockScanEvent, 8,
			(size_t)0, (void *)flag,
			(size_t)0, (void *)data,
			sizeof(INT), &init,
			sizeof(UINT), &n,
			sizeof(UINT) * wx, NULL,
			sizeof(INT) * wx, NULL,
			(size_t)0, (void *)keySumArray,
			(size_t)0, (void *)preSumArray);
/*
    V_OPENCL( kernels[0].setArg( 0, firstKey.getBuffer()), "Error setArg kernels[ 0 ]" ); // Input keys
    V_OPENCL( kernels[0].setArg( 1, firstValue.getBuffer()),"Error setArg kernels[ 0 ]" ); // Input buffer
    V_OPENCL( kernels[0].setArg( 2, result.getBuffer( ) ), "Error setArg kernels[ 0 ]" ); // Output buffer
    V_OPENCL( kernels[0].setArg( 3, init ),                 "Error setArg kernels[ 0 ]" ); // Initial value exclusive
    V_OPENCL( kernels[0].setArg( 4, numElements ),          "Error setArg kernels[ 0 ]" ); // Size of scratch buffer
    V_OPENCL( kernels[0].setArg( 5, ldsKeySize, NULL ),     "Error setArg kernels[ 0 ]" ); // Scratch buffer
    V_OPENCL( kernels[0].setArg( 6, ldsValueSize, NULL ),   "Error setArg kernels[ 0 ]" ); // Scratch buffer
    V_OPENCL( kernels[0].setArg( 7, *binaryPredicateBuffer),"Error setArg kernels[ 0 ]" ); // User provided functor
    V_OPENCL( kernels[0].setArg( 8, *binaryFunctionBuffer ),"Error setArg kernels[ 0 ]" ); // User provided functor
    V_OPENCL( kernels[0].setArg( 9, *keySumArray ),         "Error setArg kernels[ 0 ]" ); // Output per block sum
    V_OPENCL( kernels[0].setArg(10, *preSumArray ),         "Error setArg kernels[ 0 ]" ); // Output per block sum
    V_OPENCL( kernels[0].setArg(11, doExclusiveScan ),      "Error setArg kernels[ 0 ]" ); // Exclusive scan?
*/

	UINT workPerThread = sizeScanBuff / wx;

	icl_run_kernel(intraBlockInclusiveScanByKey, 1, &wx, &wx, NULL, intraBlockEvent, 7,
			(size_t)0, (void *)keySumArray,
			(size_t)0, (void *)preSumArray,
			(size_t)0, (void *)postSumArray,
			sizeof(UINT), &numWorkGroups,
			sizeof(UINT) * wx, NULL,
			sizeof(INT) * wx, NULL,
			sizeof(UINT), &workPerThread);

/*
INT* output = (UINT*)malloc(n * sizeof(INT));
icl_read_buffer(keySumArray, CL_TRUE, n * sizeof(INT), output, NULL, NULL);
for(int i = 0; i < n; ++i)
	printf("%d ", output[i]);
printf("\n"); */
/*
    V_OPENCL( kernels[1].setArg( 0, *keySumArray ),         "Error setArg kernels[ 1 ]" ); // Input keys
    V_OPENCL( kernels[1].setArg( 1, *preSumArray ),         "Error setArg kernels[ 1 ]" ); // Input buffer
    V_OPENCL( kernels[1].setArg( 2, *postSumArray ),        "Error setArg kernels[ 1 ]" ); // Output buffer
    V_OPENCL( kernels[1].setArg( 3, numWorkGroupsK0 ),      "Error setArg kernels[ 1 ]" ); // Size of scratch buffer
    V_OPENCL( kernels[1].setArg( 4, ldsKeySize, NULL ),     "Error setArg kernels[ 1 ]" ); // Scratch buffer
    V_OPENCL( kernels[1].setArg( 5, ldsValueSize, NULL ),   "Error setArg kernels[ 1 ]" ); // Scratch buffer
    V_OPENCL( kernels[1].setArg( 6, workPerThread ),        "Error setArg kernels[ 1 ]" ); // User provided functor
    V_OPENCL( kernels[1].setArg( 7, *binaryPredicateBuffer ),"Error setArg kernels[ 1 ]" ); // User provided functor
    V_OPENCL( kernels[1].setArg( 8, *binaryFunctionBuffer ),"Error setArg kernels[ 1 ]" ); // User provided functor
*/

	icl_run_kernel(perBlockAdditionByKey, 1, &gx, &wx, NULL, perBlockAdditionEvent, 5,
			(size_t)0, (void *)keySumArray,
			(size_t)0, (void *)postSumArray,
			(size_t)0, (void *)flag,
			(size_t)0, (void *)data,
			sizeof(UINT), &n);
/*
    V_OPENCL( kernels[2].setArg( 0, *keySumArray ),         "Error setArg kernels[ 2 ]" ); // Input buffer
    V_OPENCL( kernels[2].setArg( 1, *postSumArray ),        "Error setArg kernels[ 2 ]" ); // Input buffer
    V_OPENCL( kernels[2].setArg( 2, firstKey.getBuffer()), "Error setArg kernels[ 2 ]" ); // Output buffer
    V_OPENCL( kernels[2].setArg( 3, result.getBuffer()),   "Error setArg kernels[ 2 ]" ); // Output buffer
    V_OPENCL( kernels[2].setArg( 4, numElements ),          "Error setArg kernels[ 2 ]" ); // Size of scratch buffer
    V_OPENCL( kernels[2].setArg( 5, *binaryPredicateBuffer ),"Error setArg kernels[ 2 ]" ); // User provided functor
    V_OPENCL( kernels[2].setArg( 6, *binaryFunctionBuffer ),"Error setArg kernels[ 2 ]" ); // User provided functor
*/


#if TIMING
	clFinish(dev->queue);
	icl_stop_timer(timer);
	perBlockScanTime += icl_profile_event(perBlockScanEvent, MEASURE_START, ICL_FINISHED, ICL_MILLI);
	intraBlockTime += icl_profile_event(intraBlockEvent, MEASURE_START, ICL_FINISHED, ICL_MILLI);
	perBlockAdditionTime += icl_profile_event(perBlockAdditionEvent, MEASURE_START, ICL_FINISHED, ICL_MILLI);
#endif
}
Exemplo n.º 2
0
Arquivo: vec_mul.c Projeto: 8l/insieme
int main(int argc, char* argv[]) {
	int size = 1000;

	int* input1 = (int*)malloc(sizeof(int) * size);
	int* input2 = (int*) malloc(sizeof(int) * size);
	int* output = (int *)malloc(sizeof(int) * size);
	
	for(int i=0; i < size; ++i) {
		input1[i] = i;
		input2[i] = 1;
	}

#ifndef INSIEME
	icl_timer* time1 = icl_init_timer(ICL_SEC);
	icl_start_timer(time1);
#endif
	icl_init_devices(ICL_CPU);
#ifndef INSIEME
	printf("TIME for initialization: %f\n", icl_stop_timer(time1));
#endif
	
	if (icl_get_num_devices() != 0) {
		icl_device* dev = icl_get_device(0);

		icl_print_device_short_info(dev);
		icl_kernel* kernel = icl_create_kernel(dev, "vec_mul.cl", "vec_mul", "", ICL_SOURCE);
		
		icl_buffer* buf_input1 = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(int) * size);
		icl_buffer* buf_input2 = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(int) * size);
		icl_buffer* buf_output = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(int) * size);

		icl_event* wb1 = icl_create_event();
		icl_event* wb2 = icl_create_event();
		icl_event* rb = icl_create_event();

		icl_write_buffer(buf_input1, CL_FALSE, sizeof(int) * size, &input1[0], NULL, wb1);
		icl_write_buffer(buf_input2, CL_FALSE, sizeof(int) * size, &input2[0], NULL, wb2);
		
		size_t szLocalWorkSize = 256;
		float multiplier = size/(float)szLocalWorkSize;
		if(multiplier > (int)multiplier)
			multiplier += 1;
		size_t szGlobalWorkSize = (int)multiplier * szLocalWorkSize;

		icl_event* rk = icl_create_event();
		icl_event* wb_all = icl_create_event_list(2, wb1, wb2);	
		icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, wb_all, rk, 4,
											(size_t)0, (void *)buf_input1,
											(size_t)0, (void *)buf_input2,
											(size_t)0, (void *)buf_output,
											sizeof(cl_int), (void *)&size);
		
		icl_read_buffer(buf_output, CL_TRUE, sizeof(int) * size, &output[0], rk, rb);
		
		printf("Time wb1 %f\n", icl_profile_event(wb1, ICL_STARTED, ICL_FINISHED, ICL_SEC));		
		printf("Time wb2 %f\n", icl_profile_event(wb2, ICL_STARTED, ICL_FINISHED, ICL_SEC));
		printf("Time rk %f\n", icl_profile_event(rk, ICL_STARTED, ICL_FINISHED, ICL_SEC));
		printf("Time rb %f\n", icl_profile_event(rb, ICL_STARTED, ICL_FINISHED, ICL_SEC));
	
		icl_release_events(5, wb1, wb2, wb_all, rk, rb);
		icl_release_buffers(3, buf_input1, buf_input2, buf_output);
		icl_release_kernel(kernel);
	}
#ifndef INSIEME
	icl_restart_timer(time1);
#endif
	icl_release_devices();
#ifndef INSIEME
	printf("TIME for releasing the devices: %f\n", icl_stop_timer(time1));
	icl_release_timer(time1);
#endif
	
	// CHECK for output
	printf("======================\n= Vector Mul Done\n");
	unsigned int check = 1;
	for(unsigned int i = 0; i < size; ++i) {
		if(output[i] != i*size) {
			check = 0;
			printf("= fail at %d, expected %d / actual %d", i, i*3/2, output[i]);
			break;
		}
	}
	printf("= result check: %s\n======================\n", check ? "OK" : "FAIL");
	free(input1);
	free(input2);
	free(output);
}