Ejemplo n.º 1
0
static void scanImpl(cl_mem d_input, int rLen, cl_mem d_output, struct clContext * context, struct statistic * pp)
{
	int len = 2;
	if(rLen < len){
		cl_mem input, output;
		size_t globalSize = 1;
		size_t localSize = 1;
		input = clCreateBuffer(context->context,CL_MEM_READ_WRITE, sizeof(int) * len, NULL, 0);
		output = clCreateBuffer(context->context,CL_MEM_READ_WRITE, sizeof(int) * len, NULL, 0);
		context->kernel = clCreateKernel(context->program,"cl_memset_int",0);

        	clSetKernelArg(context->kernel,0,sizeof(cl_mem), (void*)&input);
        	clSetKernelArg(context->kernel,1,sizeof(int), (void*)&len);
       		clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,0);

		clEnqueueWriteBuffer(context->queue, input, CL_TRUE, 0, rLen * sizeof(int), d_input,0,0,0);
                preallocBlockSums(len, context);
                prescanArray(output, input, len, context,pp);
                deallocBlockSums();
		clEnqueueWriteBuffer(context->queue, d_output, CL_TRUE, 0, rLen * sizeof(int), output,0,0,0);
		clReleaseMemObject(input);
		clReleaseMemObject(output);
                return;
	}else{
		preallocBlockSums(rLen, context);
		prescanArray(d_output, d_input, rLen, context,pp);
		deallocBlockSums();
	}
}
int reduceImpl( cl_mem d_Rin, int rLen, cl_mem d_Rout, int OPERATOR, int numThread, int numMaxBlock,int *index,cl_event *eventList,cl_kernel *kernel,int *Flag_CPU_GPU,double * burden,tempResult *tR, int _CPU_GPU)
{
	preallocBlockSums( rLen, numThread,tR );
	reduceFirstPass( d_Rin, rLen, numThread, numMaxBlock, OPERATOR,index,eventList,kernel,Flag_CPU_GPU,burden,tR,_CPU_GPU );
	int result = reduceBlockSums( d_Rout, numThread, OPERATOR, rLen,index,eventList,kernel,Flag_CPU_GPU,burden,tR,_CPU_GPU);
	clWaitForEvents(1,&eventList[(*index-1)%2]);
	return result;
}