Exemplo n.º 1
0
/**
 * Runs the "updateTree" kernel once per tree level, from level treeHeight
 * down to level 0.
 *
 * @param nodelist    device buffer passed as kernel argument 0
 * @param particlesD  device buffer passed as kernel argument 2
 * @param treeD       device buffer passed as kernel argument 1
 * @param nParticles  number of particles; the kernel is launched over
 *                    nNodes = 2 * nParticles - 1 work items (rounded up to a
 *                    multiple of the work-group size)
 * @param treeHeight  highest level index to process
 * @param dev         device on which the kernel is built and executed
 *
 * None of the buffers passed in are released here. The kernel (and, when
 * timing is enabled, the timer) created by this function are released before
 * returning.
 */
void updateTree(icl_buffer *nodelist, icl_buffer *particlesD, icl_buffer *treeD, UINT nParticles, UINT treeHeight, icl_device* dev) {
	// compile OpenCL kernel
	icl_kernel* update = icl_create_kernel(dev, "kernel/updateTree.cl", "updateTree", KERNEL_BUILD_MACRO, ICL_SOURCE);
	UINT nNodes = nParticles * 2 - 1;

#if timing == 1
	icl_timer* timer_update = icl_init_timer(ICL_MILLI);
#endif

	// one work item per node, rounded up to a whole number of work groups
	size_t localSizeB = 256;
	size_t globalSizeB = ((nNodes + 255) / 256) * 256;

#if timing == 1
	icl_start_timer(timer_update);
#endif

	// signed counter so that the loop can terminate after level 0
	for(int l = (int)treeHeight; l >= 0; --l)
	{
		icl_run_kernel(update, 1, &globalSizeB, &localSizeB, NULL, NULL, 5,
						(size_t)0, (void *)nodelist,
						(size_t)0, (void *)treeD,
						(size_t)0, (void *)particlesD,
						sizeof(int), &l,
						sizeof(UINT), &nNodes);
	}

	// drain the queue before the kernel object is released
	icl_finish(dev);

#if timing == 1
	icl_stop_timer(timer_update);

	printf("tree update %f\n\n", timer_update->current_time);
	icl_release_timer(timer_update);
#endif

	// the kernel is created on every call, so it has to be released on every call
	icl_release_kernel(update);
}
Exemplo n.º 2
0
/**
 * Builds the tree on the device and returns its height.
 *
 * The construction runs in three host-driven stages:
 *   1. large-node stage: loops while the active list is non-empty, chunking
 *      nodes, computing bounding boxes, splitting nodes, sorting particles to
 *      the children and filtering small nodes into a separate list;
 *   2. small-node stage: loops while the small list is non-empty, running the
 *      splitSmallNodes kernel;
 *   3. an upPass kernel per level (bottom-up) followed by a downPass kernel
 *      per level (top-down).
 *
 * @param nodelist    device buffer of nodes, passed to nearly every kernel
 * @param particlesD  device buffer of particles
 * @param treeD       device buffer passed to the downPass kernel only
 * @param nParticles  number of particles
 * @param dev         device on which kernels and temporary buffers are created
 * @return            the value found in slot 4 of the device-side "sizes"
 *                    array ("new max level") after both node stages
 *
 * The three buffers passed in are not released. All kernels, timers and
 * temporary buffers created here are released before returning.
 */
UINT buildTree(icl_buffer *nodelist, icl_buffer *particlesD, icl_buffer *treeD, UINT nParticles, icl_device* dev)
{
	UINT level = 1;
	UINT nNodes = nParticles * 2 - 1;

	icl_timer* timer = icl_init_timer(ICL_MILLI);
	// receives the value returned by the most recent icl_stop_timer(timer)
	double time = 0;

	// overapproximate size of temporary lists
	icl_buffer* activelist = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(NodeId) * nParticles);
	icl_buffer* smalllist = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(NodeId) * nParticles);
	icl_buffer* nextlist = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(NodeId) * nParticles);
	icl_buffer* sizes = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(UINT) * 5); // five counters maintained by the kernels:
		// 0 activelist
		// 1 nodelist
		// 2 smalllist
		// 3 old max level
		// 4 new max level

	UINT maxNchunks = ((nParticles / fmin(T, chunk_size)) * 2) -1;
	// TODO allow more than 256 chunks per node
	icl_buffer* chunks = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(struct Chunk) * maxNchunks);
	icl_buffer* bboxes = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(struct BBox) * maxNchunks);

	size_t localSize = 1;
	size_t globalSize = 1;

	// compile OpenCL kernels
	icl_kernel* init = icl_create_kernel(dev, "kernel/init.cl", "init", KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* resetChunks = icl_create_kernel(dev, "kernel/init.cl", "memset_chunks", KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* gp2c = icl_create_kernel(dev, "kernel/groupToChunks.cl", "groupParticlesIntoChunks", KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* cBBox = icl_create_kernel(dev, "kernel/chunkedBBox.cl", "chunkedBBox", KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* bBox = icl_create_kernel(dev, "kernel/bBox.cl", "bBox", KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* sln = icl_create_kernel(dev, "kernel/splitLargeNodes.cl", "splitLargeNodes", KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* sortP = icl_create_kernel(dev, "kernel/sortParticlesToChilds.cl", "sortParticlesToChilds", KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* snf = icl_create_kernel(dev, "kernel/smallNodeFiltering.cl", "smallNodeFiltering", KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* pnl = icl_create_kernel(dev, "kernel/packNextlist.cl", "packNextlist", KERNEL_BUILD_MACRO, ICL_SOURCE);

	//////////////////////////////////////////////////////////////////////////
	// kernels and state for the segmented-scan based particle sort
	icl_kernel* preScan  = icl_create_kernel(dev, "kernel/sortP_prescan.cl", "sortP_prescan_chunked", KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* postScan = icl_create_kernel(dev, "kernel/sortP_postscan.cl", "sortP_postscan_chunked", KERNEL_BUILD_MACRO, ICL_SOURCE);
	segmented_scan_init(nParticles, dev, KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* memset_int_s  = icl_create_kernel(dev, "kernel/init.cl", "memset_int_s", KERNEL_BUILD_MACRO, ICL_SOURCE);

	// approach with segmented scan
	icl_buffer *scan_data = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(int) * nParticles);
	icl_buffer *scan_flag = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(int) * nParticles);
	icl_buffer* buffered_particles = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(struct Particle) * nParticles);


#if timing == 1
	icl_timer* timer_gp2c =  icl_init_timer(ICL_MILLI);
	icl_timer* timer_cBBox =  icl_init_timer(ICL_MILLI);
	icl_timer* timer_bBox =  icl_init_timer(ICL_MILLI);
	icl_timer* timer_sln =  icl_init_timer(ICL_MILLI);
	icl_timer* timer_sortP =  icl_init_timer(ICL_MILLI);
	icl_timer* timer_snf =  icl_init_timer(ICL_MILLI);
	icl_timer* timer_pnl =  icl_init_timer(ICL_MILLI);
	icl_timer* timer_ran =  icl_init_timer(ICL_MILLI);


	icl_timer* timer_prescan =  icl_init_timer(ICL_MILLI);
	icl_timer* timer_scan =  icl_init_timer(ICL_MILLI);
	icl_timer* timer_postscan =  icl_init_timer(ICL_MILLI);

#endif
	// add root node to the activelist and initialize size lists
	icl_run_kernel(init, 1, &globalSize, &localSize, NULL, NULL, 3,
			(size_t)0, (void *)activelist,
			(size_t)0, (void *)sizes,
			(size_t)0, (void *)particlesD);

	UINT activeN = 1;

	icl_finish(dev);

	// smallest power of 2 bigger than or equal to maxNchunks
	UINT pow2maxNchunks = pow2roundup(maxNchunks);
	// large-node stage: process until no active node is left
	while(activeN != 0)
	{

		icl_start_timer(timer);
		// group particles into chunks
		size_t localSize1 = min(pow2maxNchunks, 256);

#if timing == 1
		icl_start_timer(timer_gp2c);
#endif
		size_t globalSize1 = ((maxNchunks + localSize1 -1) / localSize1) * localSize1;
		// reset chunks
		icl_run_kernel(resetChunks, 1, &globalSize1, &localSize1, NULL, NULL, 2,
				(size_t)0, (void *)chunks,
				sizeof(UINT), &maxNchunks);

		// one work group per active node
		globalSize1 = localSize1 * activeN;
		// split every node in chunks of chunk_size
		icl_run_kernel(gp2c, 1, &globalSize1, &localSize1, NULL, NULL, 4,
				(size_t)0, (void *)nodelist,
				(size_t)0, (void *)activelist,
				(size_t)0, (void *)chunks,
				sizeof(UINT), &activeN);
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_gp2c);
#endif

		// compute per chunk bounding box: one work group of chunk_size items per chunk
		size_t localSize2 = chunk_size;
		size_t globalSize2 = maxNchunks * chunk_size;
#if timing == 1
		icl_start_timer(timer_cBBox);
#endif

		icl_run_kernel(cBBox, 1, &globalSize2, &localSize2, NULL, NULL, 5,
				(size_t)0, (void *)nodelist,
				(size_t)0, (void *)activelist,
				(size_t)0, (void *)particlesD,
				(size_t)0, (void *)chunks,
				(size_t)0, (void *)bboxes);
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_cBBox);
#endif

		// compute per node bounding box
		size_t localSize3 = min(pow2maxNchunks, 256);
		size_t globalSize3 = localSize3 * activeN;
#if timing == 1
		icl_start_timer(timer_bBox);
#endif
		icl_run_kernel(bBox, 1, &globalSize3, &localSize3, NULL, NULL, 4,
						(size_t)0, (void *)nodelist,
						(size_t)0, (void *)activelist,
						(size_t)0, (void *)bboxes,
						sizeof(UINT), &activeN);
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_bBox);
#endif

		// split large nodes: one work item per active node
		size_t localSize4 = 256;
		size_t globalSize4 = ((activeN + 255) / 256) * 256;
#if timing == 1
		icl_start_timer(timer_sln);
#endif
		icl_run_kernel(sln, 1, &globalSize4, &localSize4, NULL, NULL, 5,
						(size_t)0, (void *)nodelist,
						(size_t)0, (void *)activelist,
						(size_t)0, (void *)nextlist,
						(size_t)0, (void *)sizes,
						sizeof(UINT), &activeN);
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_sln);
#endif

		///////////////////////////////////////////////////////////////////////////////
		// particle sort: direct kernel on CPU devices, segmented scan otherwise

#if timing == 1
		icl_start_timer(timer_sortP);
#endif


#if DEVICE == ICL_CPU 
		// sort particles to child nodes
		size_t localSize5 = 256;
		size_t globalSize5 = ((activeN + 255) / 256) * 256;
		icl_run_kernel(sortP, 1, &globalSize5, &localSize5, NULL, NULL, 5,
			(size_t)0, (void *)nodelist,
			(size_t)0, (void *)particlesD,
			(size_t)0, (void *)activelist,
			(size_t)0, (void *)nextlist,
			sizeof(UINT), &activeN
		);
#else		
		// init scan_flag to 1
		cl_int initFlag = 1;
		size_t np = (size_t)((nParticles + localSize4 -1 ) / localSize4) * localSize4;
		icl_run_kernel(memset_int_s, 1, &np, &localSize4, NULL, NULL, 3,
			(size_t)0, (void *)scan_flag,
			sizeof(cl_int), &initFlag,
			sizeof(UINT), &nParticles
		);

#if timing == 1
		icl_start_timer(timer_prescan);
#endif
		// pre-scan fills data0 and data1 with 1 and 0 whenever value < pivot
		localSize = chunk_size;
		globalSize = maxNchunks * chunk_size;
		icl_run_kernel(preScan, 1, &globalSize, &localSize, NULL, NULL, 7,
			(size_t)0, (void *)nodelist,
			(size_t)0, (void *)chunks,
			(size_t)0, (void *)particlesD,
			(size_t)0, (void *)activelist,
			sizeof(UINT), &activeN,
			(size_t)0, (void *)scan_data,
			(size_t)0, (void *)scan_flag						
		);
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_prescan);
#endif

#if timing == 1
		icl_start_timer(timer_scan);
#endif
		// scan over scan_data, with scan_flag as the second input
		scan(scan_data, scan_flag, nParticles);
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_scan);
#endif
		// copy the partially sorted particle data between particlesD and
		// buffered_particles before the post-scan kernel runs
		// NOTE(review): copy direction is defined by icl_copy_buffer - confirm
		icl_copy_buffer(particlesD, buffered_particles, sizeof(struct Particle) * nParticles, NULL, NULL);
#if timing == 1
		icl_start_timer(timer_postscan);
#endif

		localSize = chunk_size;
		globalSize = maxNchunks * chunk_size;
		icl_run_kernel(postScan, 1, &globalSize, &localSize, NULL, NULL, 7,
			(size_t)0, (void *)nodelist,
			(size_t)0, (void *)chunks,
			(size_t)0, (void *)particlesD,
			(size_t)0, (void *)activelist,
			sizeof(UINT), &activeN,

			(size_t)0, (void *)buffered_particles,
			(size_t)0, (void *)scan_data						
		);
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_postscan);
#endif

		icl_finish(dev);
#endif
		///////////////////////////////////////////////////////////////////////////////

#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_sortP);
#endif
		
		// small node filtering: sized for two children per active node
		size_t localSize6 = 256;
		size_t globalSize6 = ((activeN*2 + 255) / 256) * 256;
#if timing == 1
		icl_start_timer(timer_snf);
#endif
		icl_run_kernel(snf, 1, &globalSize6, &localSize6, NULL, NULL, 4,
			(size_t)0, (void *)nodelist,
			(size_t)0, (void *)nextlist,
			(size_t)0, (void *)smalllist,
			(size_t)0, (void *)sizes
		);
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_snf);
#endif

		// packing of nextlist (single work item)
		size_t localSize7 = 1;
		size_t globalSize7 = 1;
#if timing == 1
		icl_start_timer(timer_pnl);
#endif
		icl_run_kernel(pnl, 1, &globalSize7, &localSize7, NULL, NULL, 2,
			(size_t)0, (void *)nextlist,
			(size_t)0, (void *)sizes
		);
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_pnl);
#endif

		// swap nextlist and activelist
		swap(&nextlist, &activelist);

#if timing == 1
		icl_start_timer(timer_ran);
#endif
		// read size of next activelist set in kernel (first UINT of sizes)
		icl_read_buffer(sizes, CL_TRUE, sizeof(UINT), &activeN, NULL, NULL);
		icl_finish(dev);
#if timing == 1
		icl_stop_timer(timer_ran);
#endif

		++level;
		time = icl_stop_timer(timer);

	}

	// every kernel created for the large-node stage is released here,
	// including resetChunks and memset_int_s
	icl_release_kernel(init);
	icl_release_kernel(resetChunks);
	icl_release_kernel(gp2c);
	icl_release_kernel(cBBox);
	icl_release_kernel(bBox);
	icl_release_kernel(sln);
	icl_release_kernel(sortP);
	icl_release_kernel(snf);
	icl_release_kernel(pnl);
//////////////////////////////////////////////////////////////////////////
	icl_release_kernel(preScan);
	icl_release_kernel(postScan);
	icl_release_kernel(memset_int_s);
	segmented_scan_release();

	icl_release_buffers(3, scan_data, scan_flag, buffered_particles);

#if timing == 1
	printf("gp2c %f\ncBBox %f\nbBox %f\nsln  %f\nsortP %f\nsnf %f\npnl %f\nran %f\n\n",
			timer_gp2c->current_time,
			timer_cBBox->current_time,
			timer_bBox->current_time,
			timer_sln->current_time,
			timer_sortP->current_time,
			timer_snf->current_time,
			timer_pnl->current_time,
			timer_ran->current_time);
	icl_release_timer(timer_gp2c);
	icl_release_timer(timer_cBBox);
	icl_release_timer(timer_bBox);
	icl_release_timer(timer_sln);
	icl_release_timer(timer_sortP);
	icl_release_timer(timer_snf);
	icl_release_timer(timer_pnl);
	icl_release_timer(timer_ran);

	printf("prescan %f\nscan %f\npostscan %f\n\n", timer_prescan->current_time, timer_scan->current_time, timer_postscan->current_time);

	icl_release_timer(timer_prescan);
	icl_release_timer(timer_scan);
	icl_release_timer(timer_postscan);
#endif

	// small nodes stage
	icl_release_buffers(3, activelist, chunks, bboxes);
	icl_kernel* sasl = icl_create_kernel(dev, "kernel/swapActiveAndSmalllist.cl", "swapActiveAndSmalllist", KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* ssn = icl_create_kernel(dev, "kernel/splitSmallNodes.cl", "splitSmallNodes", KERNEL_BUILD_MACRO, ICL_SOURCE);

#if timing == 1
	icl_timer* timer_ssn = icl_init_timer(ICL_MILLI);
	icl_timer* timer_sasl = icl_init_timer(ICL_MILLI);
	icl_timer* timer_rsn = icl_init_timer(ICL_MILLI);
#endif

	size_t localSize8 = 1;
	size_t globalSize8 = 1;
	UINT setMaxLevel = 0;
	// first invocation passes the level counter reached by the large-node
	// stage; invocations inside the loop below pass 0 instead
	icl_run_kernel(sasl, 1, &globalSize8, &localSize8, NULL, NULL, 2,
					(size_t)0, (void *)sizes,
					sizeof(UINT), &level);
	// get number of small nodes
	icl_read_buffer(sizes, CL_TRUE, sizeof(UINT), &activeN, NULL, NULL);

	while(activeN != 0)
	{
		icl_start_timer(timer);
		// compute SVH and determine the split plane
		size_t localSize9 = 256;
		size_t globalSize9 = ((activeN + 255) / 256) * 256;
#if timing == 1
		icl_start_timer(timer_ssn);
#endif
		icl_run_kernel(ssn, 1, &globalSize9, &localSize9, NULL, NULL, 5,
						(size_t)0, (void *)nodelist,
						(size_t)0, (void *)smalllist,
						(size_t)0, (void *)nextlist,
						(size_t)0, (void *)particlesD,
						(size_t)0, (void *)sizes);
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_ssn);
#endif

		size_t localSizeA = 1;
		size_t globalSizeA = 1;
#if timing == 1
		icl_start_timer(timer_sasl);
#endif
		icl_run_kernel(sasl, 1, &globalSizeA, &localSizeA, NULL, NULL, 2,
						(size_t)0, (void *)sizes,
						sizeof(UINT), &setMaxLevel);
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_sasl);
#endif

		swap(&nextlist, &smalllist);

		// read size of next activelist set in kernel
#if timing == 1
		icl_start_timer(timer_rsn);
#endif
		icl_read_buffer(sizes, CL_TRUE, sizeof(UINT), &activeN, NULL, NULL);
		icl_finish(dev);
#if timing == 1
		icl_stop_timer(timer_rsn);
#endif
		time = icl_stop_timer(timer);
	}

	icl_release_buffer(smalllist);
	icl_release_buffer(nextlist);

	icl_release_kernel(sasl);
	icl_release_kernel(ssn);

#if timing == 1
	printf("ssn %f\nsasl %f\nrsn %f\n\n", timer_ssn->current_time, timer_sasl->current_time, timer_rsn->current_time);
	icl_release_timer(timer_ssn);
	icl_release_timer(timer_sasl);
	icl_release_timer(timer_rsn);
#endif

	// fetch all five counters; slot 4 is the tree height
	UINT s[5];
	icl_read_buffer(sizes, CL_TRUE, sizeof(UINT) * 5, &s, NULL, NULL);
	icl_release_buffer(sizes);

	icl_kernel* upPass = icl_create_kernel(dev, "kernel/upPass.cl", "upPass", KERNEL_BUILD_MACRO, ICL_SOURCE);
	icl_kernel* downPass = icl_create_kernel(dev, "kernel/kdDownPass.cl", "downPass", KERNEL_BUILD_MACRO, ICL_SOURCE);

#if timing == 1
	icl_timer* timer_upPass = icl_init_timer(ICL_MILLI);
	icl_timer* timer_downPass = icl_init_timer(ICL_MILLI);
	icl_timer* timer_rt = icl_init_timer(ICL_MILLI);
#endif

	UINT treeHeight = s[4];
	// treeHeight is unsigned, so print it with %u
	printf("Tree height: %u\n", treeHeight);

	// one work item per node, rounded up to a whole number of work groups
	size_t localSizeB = 256;
	size_t globalSizeB = ((nNodes + 255) / 256) * 256;
	icl_start_timer(timer);
#if timing == 1
		icl_start_timer(timer_upPass);
#endif

	// bottom-up: signed counter so that the loop can terminate after level 0
	for(int l = (int)treeHeight; l >= 0; --l)
	{
		icl_run_kernel(upPass, 1, &globalSizeB, &localSizeB, NULL, NULL, 4,
						(size_t)0, (void *)nodelist,
						(size_t)0, (void *)particlesD,
						sizeof(int), &l,
						sizeof(UINT), &nNodes);
	}
#if timing == 1
		icl_finish(dev);
		icl_stop_timer(timer_upPass);
		icl_start_timer(timer_downPass);
#endif
	// top-down
	for(UINT l = 0; l <= treeHeight; ++l)
	{
		icl_run_kernel(downPass, 1, &globalSizeB, &localSizeB, NULL, NULL, 4,
						(size_t)0, (void *)nodelist,
						(size_t)0, (void *)treeD,
						sizeof(UINT), &l,
						sizeof(UINT), &nNodes);
	}
	icl_finish(dev);
#if timing == 1
	icl_stop_timer(timer_downPass);
#endif
	time = icl_stop_timer(timer);

	icl_release_kernel(upPass);
	icl_release_kernel(downPass);

#if timing == 1
	icl_start_timer(timer_rt);
#endif

#if timing == 1
	icl_finish(dev);
	icl_stop_timer(timer_rt);

	printf("upPass %f\ndownPass %f\nread Tree %f\n\n", timer_upPass->current_time, timer_downPass->current_time, timer_rt->current_time);
	icl_release_timer(timer_upPass);
	icl_release_timer(timer_downPass);
	icl_release_timer(timer_rt);
#endif

	printf("\nTime: %f\n", time);
	icl_release_timer(timer);

	return treeHeight;
}
Exemplo n.º 3
0
/**
 * Runs the three scan kernels in sequence over the first n elements of
 * `data`, with `flag` passed alongside it:
 *   1. perBlockScanByKey            - launched over all blocks,
 *   2. intraBlockInclusiveScanByKey - launched as a single work group of wx items,
 *   3. perBlockAdditionByKey        - launched over all blocks.
 *
 * The kernels, the intermediate buffers (keySumArray, preSumArray,
 * postSumArray), the work-group size wx and the profiling events are
 * file-level state; this function creates and releases nothing.
 *
 * @param data  device buffer handed to the first and last kernel
 * @param flag  device buffer handed to the first and last kernel
 * @param n     actual number of elements (not the over-approximated
 *              allocation size)
 */
void scan(icl_buffer *data, icl_buffer *flag, UINT n) {
	// launch geometry is derived from the real element count
	UINT blockCount = ((n + wx - 1) / wx);
	// block count padded up to a multiple of wx, and the resulting share per work item
	UINT paddedBlockCount = ((blockCount + wx -1) / wx) * wx;
	UINT blocksPerItem = paddedBlockCount / wx;

	size_t totalItems = blockCount * wx;
	INT initialValue = -1;

#if TIMING
	clFinish(dev->queue);
	icl_start_timer(timer);
#endif

	// stage 1: per-block scan
	// (a size paired with a NULL pointer is presumably work-group local scratch - confirm against icl_run_kernel)
	icl_run_kernel(perBlockScanByKey, 1, &totalItems, &wx, NULL, perBlockScanEvent, 8,
			(size_t)0, (void *)flag,
			(size_t)0, (void *)data,
			sizeof(INT), &initialValue,
			sizeof(UINT), &n,
			sizeof(UINT) * wx, NULL,
			sizeof(INT) * wx, NULL,
			(size_t)0, (void *)keySumArray,
			(size_t)0, (void *)preSumArray);

	// stage 2: one work group combines the per-block results into postSumArray
	icl_run_kernel(intraBlockInclusiveScanByKey, 1, &wx, &wx, NULL, intraBlockEvent, 7,
			(size_t)0, (void *)keySumArray,
			(size_t)0, (void *)preSumArray,
			(size_t)0, (void *)postSumArray,
			sizeof(UINT), &blockCount,
			sizeof(UINT) * wx, NULL,
			sizeof(INT) * wx, NULL,
			sizeof(UINT), &blocksPerItem);

	// stage 3: per-block addition, again over flag/data
	icl_run_kernel(perBlockAdditionByKey, 1, &totalItems, &wx, NULL, perBlockAdditionEvent, 5,
			(size_t)0, (void *)keySumArray,
			(size_t)0, (void *)postSumArray,
			(size_t)0, (void *)flag,
			(size_t)0, (void *)data,
			sizeof(UINT), &n);

#if TIMING
	clFinish(dev->queue);
	icl_stop_timer(timer);
	// accumulate per-kernel device times from the profiling events
	perBlockScanTime += icl_profile_event(perBlockScanEvent, MEASURE_START, ICL_FINISHED, ICL_MILLI);
	intraBlockTime += icl_profile_event(intraBlockEvent, MEASURE_START, ICL_FINISHED, ICL_MILLI);
	perBlockAdditionTime += icl_profile_event(perBlockAdditionEvent, MEASURE_START, ICL_FINISHED, ICL_MILLI);
#endif
}
Exemplo n.º 4
0
Arquivo: vec_mul.c Projeto: 8l/insieme
/**
 * Vector-multiplication example: fills two host arrays, runs the "vec_mul"
 * kernel from vec_mul.cl on the first available device, reads the result back
 * and checks that output[i] == i * size for every element.
 *
 * The command-line arguments are not used. The process exit status is 0 for
 * both a passing and a failing result check (the outcome is only printed);
 * it is 1 only when host memory cannot be allocated.
 */
int main(int argc, char* argv[]) {
	(void)argc;
	(void)argv;

	int size = 1000;

	int* input1 = malloc(sizeof(int) * size);
	int* input2 = malloc(sizeof(int) * size);
	// zero-initialized so that the result check below never reads
	// indeterminate values when no device is available
	int* output = calloc(size, sizeof(int));
	if(input1 == NULL || input2 == NULL || output == NULL) {
		fprintf(stderr, "= host allocation failed\n");
		free(input1);
		free(input2);
		free(output);
		return 1;
	}

	for(int i=0; i < size; ++i) {
		input1[i] = i;
		input2[i] = 1;
	}

#ifndef INSIEME
	icl_timer* time1 = icl_init_timer(ICL_SEC);
	icl_start_timer(time1);
#endif
	icl_init_devices(ICL_CPU);
#ifndef INSIEME
	printf("TIME for initialization: %f\n", icl_stop_timer(time1));
#endif
	
	if (icl_get_num_devices() != 0) {
		icl_device* dev = icl_get_device(0);

		icl_print_device_short_info(dev);
		icl_kernel* kernel = icl_create_kernel(dev, "vec_mul.cl", "vec_mul", "", ICL_SOURCE);
		
		icl_buffer* buf_input1 = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(int) * size);
		icl_buffer* buf_input2 = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(int) * size);
		icl_buffer* buf_output = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(int) * size);

		icl_event* wb1 = icl_create_event();
		icl_event* wb2 = icl_create_event();
		icl_event* rb = icl_create_event();

		// non-blocking uploads; the kernel launch below waits on both events
		icl_write_buffer(buf_input1, CL_FALSE, sizeof(int) * size, &input1[0], NULL, wb1);
		icl_write_buffer(buf_input2, CL_FALSE, sizeof(int) * size, &input2[0], NULL, wb2);
		
		// global size = size rounded up to a whole number of work groups
		size_t szLocalWorkSize = 256;
		size_t szGlobalWorkSize = ((size + szLocalWorkSize - 1) / szLocalWorkSize) * szLocalWorkSize;

		icl_event* rk = icl_create_event();
		icl_event* wb_all = icl_create_event_list(2, wb1, wb2);	
		icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, wb_all, rk, 4,
											(size_t)0, (void *)buf_input1,
											(size_t)0, (void *)buf_input2,
											(size_t)0, (void *)buf_output,
											sizeof(cl_int), (void *)&size);
		
		// blocking read that waits on the kernel event
		icl_read_buffer(buf_output, CL_TRUE, sizeof(int) * size, &output[0], rk, rb);
		
		printf("Time wb1 %f\n", icl_profile_event(wb1, ICL_STARTED, ICL_FINISHED, ICL_SEC));		
		printf("Time wb2 %f\n", icl_profile_event(wb2, ICL_STARTED, ICL_FINISHED, ICL_SEC));
		printf("Time rk %f\n", icl_profile_event(rk, ICL_STARTED, ICL_FINISHED, ICL_SEC));
		printf("Time rb %f\n", icl_profile_event(rb, ICL_STARTED, ICL_FINISHED, ICL_SEC));
	
		icl_release_events(5, wb1, wb2, wb_all, rk, rb);
		icl_release_buffers(3, buf_input1, buf_input2, buf_output);
		icl_release_kernel(kernel);
	}
#ifndef INSIEME
	icl_restart_timer(time1);
#endif
	icl_release_devices();
#ifndef INSIEME
	printf("TIME for releasing the devices: %f\n", icl_stop_timer(time1));
	icl_release_timer(time1);
#endif
	
	// CHECK for output
	printf("======================\n= Vector Mul Done\n");
	int check = 1;
	for(int i = 0; i < size; ++i) {
		int expected = i * size;
		if(output[i] != expected) {
			check = 0;
			// report the value the comparison actually uses
			printf("= fail at %d, expected %d / actual %d\n", i, expected, output[i]);
			break;
		}
	}
	printf("= result check: %s\n======================\n", check ? "OK" : "FAIL");
	free(input1);
	free(input2);
	free(output);
	return 0;
}