Example #1
hardware::code::Buffer::Buffer(const hardware::code::OpenClKernelParametersInterface& kernelParameters,
                               const hardware::Device* device)
    : Opencl_Module(kernelParameters, device)
{
    _copy_16_bytes = createKernel("copy_16_bytes") << "buffer.cl";

    auto base_code = get_device()->getGaugefieldCode()->get_sources();
    _clear_bytes   = createKernel("clear_bytes") << base_code << "buffer.cl";
    _clear_float4  = createKernel("clear_float4") << base_code << "buffer.cl";
}
Example #2
int oclBilinearPyramid::compile()
{
	clUpsample = 0;
	clDownsample = 0;

	if (!oclProgram::compile())
	{
		return 0;
	}

	clUpsample = createKernel("clUpsample");
	KERNEL_VALIDATE(clUpsample)
	clDownsample = createKernel("clDownsample");
	KERNEL_VALIDATE(clDownsample)
	return 1;
}
Example #3
BOOL CRunDeconvFFT::BeforeCall()
{
    pKernel = createKernel(m_pDocSrc2->GetImage());
    IppStatus status = CallInit();
    IppErrorMessage(m_initName, status);
    if (status < 0) return FALSE;
    return TRUE;
}
Example #4
int oclBvhTrimesh::compile()
{
	clAABB = 0;
	clMorton = 0;
	clCreateNodes = 0;
	clLinkNodes = 0;
	clCreateLeaves = 0;
	clComputeAABBs = 0;

	if (!mRadixSort.compile())
	{
		return 0;
	}

	if (!oclProgram::compile())
	{
		return 0;
	}

	clAABB = createKernel("clAABB");
	KERNEL_VALIDATE(clAABB)
	clMorton = createKernel("clMorton");
	KERNEL_VALIDATE(clMorton)
	clCreateNodes = createKernel("clCreateNodes");
	KERNEL_VALIDATE(clCreateNodes)
	clLinkNodes = createKernel("clLinkNodes");
	KERNEL_VALIDATE(clLinkNodes)
	clCreateLeaves = createKernel("clCreateLeaves");
	KERNEL_VALIDATE(clCreateLeaves)
	clComputeAABBs = createKernel("clComputeAABBs");
	KERNEL_VALIDATE(clComputeAABBs)
	return 1;
}
Example #5
void AudioDSPKernelProcessor::initialize()
{
    if (isInitialized())
        return;

    ASSERT(!m_kernels.size());

    // Create processing kernels, one per channel.
    for (unsigned i = 0; i < numberOfChannels(); ++i)
        m_kernels.append(createKernel());
        
    m_initialized = true;
    m_hasJustReset = true;
}
Example #6
int oclConvolute::compile()
{
	clIso2D = 0;
	clIso2Dsep = 0;
	clAniso2Dtang = 0;
	clAniso2Dorth = 0;

	if (!oclProgram::compile())
	{
		return 0;
	}

	clIso2D = createKernel("clIso2D");
	KERNEL_VALIDATE(clIso2D)
	clIso2Dsep = createKernel("clIso2Dsep");
	KERNEL_VALIDATE(clIso2Dsep)

	clAniso2Dtang = createKernel("clAniso2Dtang");
	KERNEL_VALIDATE(clAniso2Dtang)
	clAniso2Dorth = createKernel("clAniso2Dorth");
	KERNEL_VALIDATE(clAniso2Dorth)
	return 1;
}
Example #7
void hardware::code::Real::fill_kernels()
{
    basic_real_code = get_fundamental_sources() << "types.hpp"
                                                << "operations_real.cl";

    logger.debug() << "Creating Real kernels...";

    // Setting operations kernel
    get_elem_vec = createKernel("get_elem_vector") << basic_real_code << "real_access_vector_element.cl";
    set_elem_vec = createKernel("set_elem_vector") << basic_real_code << "real_access_vector_element.cl";
    // Single operations kernels
    ratio      = createKernel("real_ratio") << basic_real_code << "real_ratio.cl";
    product    = createKernel("real_product") << basic_real_code << "real_product.cl";
    sum        = createKernel("real_sum") << basic_real_code << "real_sum.cl";
    difference = createKernel("real_subtraction") << basic_real_code << "real_subtraction.cl";
    // Update cgm kernels
    update_alpha_cgm = createKernel("update_alpha_cgm") << basic_real_code << "update_alpha_cgm.cl";
    update_beta_cgm  = createKernel("update_beta_cgm") << basic_real_code << "update_beta_cgm.cl";
    update_zeta_cgm  = createKernel("update_zeta_cgm") << basic_real_code << "update_zeta_cgm.cl";
}
Example #8
int main(int argc, char **argv)
{
    setbuf(stdout, NULL);

	glutInit(&argc, argv);
	glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH);
	glutInitWindowSize(DIM, DIM);
    glutCreateWindow("Simple OpenGL OpenCL");
	glutIdleFunc(display);
	glutDisplayFunc(display);
    glutKeyboardFunc(keyboard);

    initCL();
    createKernel("swirl.cl","swirlKernelSCB");

    // CL_DEVICE_MAX_COMPUTE_UNITS and CL_DEVICE_ADDRESS_BITS return cl_uint; CL_DEVICE_MAX_WORK_GROUP_SIZE returns size_t
    cl_uint computeUnits, addressbits;
    size_t localSize;
    openCLErrorID = clGetDeviceInfo(deviceHandle, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &computeUnits, NULL);
    openCLErrorID = clGetDeviceInfo(deviceHandle, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &localSize, NULL);
    openCLErrorID = clGetDeviceInfo(deviceHandle, CL_DEVICE_ADDRESS_BITS, sizeof(cl_uint), &addressbits, NULL);
    printf("CL_DEVICE_MAX_COMPUTE_UNITS: %u\nCL_DEVICE_MAX_WORK_GROUP_SIZE: %zu\nCL_DEVICE_ADDRESS_BITS: %u\n", computeUnits, localSize, addressbits);

	// load bitmap	
	Bitmap bmp = Bitmap("who-is-that.bmp");
	if (bmp.isValid())
	{		
		for (int i = 0 ; i < DIM*DIM ; i++) {
			sourceColors[i] = bmp.getR(i/DIM, i%DIM) / 255.0f;
		}
    } else {
        printf("couldn't load who-is-that.bmp\n");
        exit(1);
    }

    // DONE: allocate memory at sourceDevPtr on the GPU and copy sourceColors into it.
    sourceDevPtr = clCreateBuffer(  contextHandle,  CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,  DIM*DIM*sizeof(float),    sourceColors,   &openCLErrorID);
    // DONE: allocate memory at swirlDevPtr for the unswirled image.
    swirlDevPtr = clCreateBuffer(   contextHandle,  CL_MEM_READ_WRITE,                      DIM*DIM*sizeof(float),    NULL,           &openCLErrorID);

    //DONE: Set Kernel Arguments
    openCLErrorID = clSetKernelArg(kernel,0,sizeof(cl_mem),&sourceDevPtr);
    openCLErrorID = clSetKernelArg(kernel,1,sizeof(cl_mem),&swirlDevPtr);
    openCLErrorID = clSetKernelArg(kernel,2,sizeof(cl_float),&a);
    openCLErrorID = clSetKernelArg(kernel,3,sizeof(cl_float),&b);

	glutMainLoop();

	cleanup();
}
Example #9
// -------------------------------------------------------------------------
void MorphOpenCL::recompile(Morphology::EOperationType opType, int coordsSize)
{
	static int prevCoordsSize[Morphology::OT_Gradient+1] = {0};
	SKernelParameters* kparams;
	cl::Kernel* kernel;

	if(opType == Morphology::OT_Erode)
	{
		kparams = &erodeParams;
		kernel = &kernelErode;
	}
	else if(opType == Morphology::OT_Dilate)
	{
		kparams = &dilateParams;
		kernel = &kernelDilate;
	}
	else if(opType == Morphology::OT_Gradient)
	{
		kparams = &gradientParams;
		kernel = &kernelGradient;
	}
	else
	{
		if(opType == Morphology::OT_TopHat ||
		   opType == Morphology::OT_BlackHat ||
		   opType == Morphology::OT_Open ||
		   opType == Morphology::OT_Close)
		{
			recompile(Morphology::OT_Erode, coordsSize);
			recompile(Morphology::OT_Dilate, coordsSize);
		}
		return;
	}

	if(!kparams->needRecompile || coordsSize == prevCoordsSize[opType])
		return;

	QString opts = kparams->options + " -DCOORDS_SIZE=" + QString::number(coordsSize);
	prevCoordsSize[opType] = coordsSize;

	cl::Program prog = createProgram(kparams->programName,opts);
	*kernel = createKernel(prog, kparams->kernelName);
}
Example #10
/** Updates the internal state of the filter. */
int AnalogFilter::updateInternal() {
  /*
  int oldKernelLength = nKernelLength;
  nKernelLength = (int) dKernelLength;
  if (nKernelLength < 1) {
    nKernelLength = 1;
    dKernelLength = 1;
  }
  */

  // Reallocate memory for kernel
  if (oldKernelLength != nKernelLength) {
    delete[] filter_kernel;
    filter_kernel = new double[nKernelLength];
    createKernel(nKernelLength);
    oldKernelLength = nKernelLength;
  }

  return 0;
}
Example #11
 Kernel Program::createKernel(const string& name) const
 {
     return createKernel(name.c_str());
 }
Example #12
int main(int argc, char **argv)
{
	//single precision real number
	//row major m rows by n columns
	int performance_level = atoi(argv[1]);
	int m = atoi(argv[2]);//m and n should be multiples of 32
	int n = atoi(argv[3]);
	int batchSize = atoi(argv[4]);
	//n should be three times m for now.
	if (n != 3 * m)
	{
		std::cout << "n should be three times as m for now." << std::endl;
		return 1;
	}
	//malloc input data
	std::complex<float> *CPU_A = (std::complex<float>*)malloc(m*n*batchSize*sizeof(std::complex<float>));
	//temporary buffer to hold the intermediate result after the first kernel
	std::complex<float> *CPU_A_TEMP = (std::complex<float>*)malloc(m*n*batchSize*sizeof(std::complex<float>));

	std::complex<float> *CPU_A_OUT = (std::complex<float>*)malloc(m*n*batchSize*sizeof(std::complex<float>));
	int miniBatchSize = n / m;//which is 3 for now
	for (int k = 0; k < batchSize; k++)
	{
		for (int q = 0; q < miniBatchSize; q++)
		{
			for (int i = 0; i < m; i++)
			{
				for (int j = 0; j < n/3; j++)
				{
					CPU_A[k*m*n + q*n/3 + i*n + j] = { (float)(i*n + j + k + q), (float)(i*n + j + k + 2*q) };
				}
			}
		}
	}

	//init OpenCL 
	cl_int err;
	cl_platform_id platform;
	cl_device_id device;
	cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
	cl_context context;
	cl_command_queue queue;
	cl_kernel kernel1, kernel2;
	cl_event event1, event2;
	char *source1, *source2;

	platform = getPlatform(PLATFORM_NAME);
	assert(platform != NULL);
	device = getDevice(platform, DEVICE_NAME);
	assert(device != NULL);
	props[1] = (cl_context_properties)platform;
	context = clCreateContext(props, 1, &device, NULL, NULL, &err);
	assert(context != NULL);
	queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
	assert(queue != NULL);

	cl_mem bufA = clCreateBuffer(context, CL_MEM_READ_WRITE,
		(m * n * batchSize) * sizeof(*CPU_A), NULL, &err);
	assert(bufA != NULL);

	//move memory from host to device
	err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
		(m*n*batchSize) * sizeof(*CPU_A), CPU_A,
		0, NULL, NULL);

	//compile kernel
	source1 = loadFile(KERNEL_SOURCE1);
	assert(source1 != NULL);
	kernel1 = createKernel(source1, context, BUILD_OPTIONS, &err);
	assert(kernel1 != NULL);

	source2 = loadFile(KERNEL_SOURCE2);
	assert(source2 != NULL);
	kernel2 = createKernel(source2, context, BUILD_OPTIONS, &err);
	assert(kernel2 != NULL);


	//launch kernel
	size_t localWorkSize2[1] = { 256 };
	//calculate number of work groups
	//each work group works on a 32 by 32 block
	//the whole matrix has ((m-1)/32+1) * ((n-1)/32+1) = 23 x 23 blocks
	//the upper triangle of which (including the diagonal) is
	//23*(23+1)/2 = 276
	//so the formula is ((m-1)/32+1) * ((n/3-1)/32+1+1) / 2
	int num_wg = ((m - 1) / 32 + 1) * ((m - 1) / 32 + 1 + 1) / 2;
	size_t globalWorkSize2[1] = { batchSize * num_wg * miniBatchSize * 256 };

	err = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &bufA);
	assert(err == CL_SUCCESS);
	/*
	err = clSetKernelArg(kernel, 1, sizeof(cl_uint), &m);
	assert(err == CL_SUCCESS);
	err = clSetKernelArg(kernel, 2, sizeof(cl_uint), &num_wg);
	assert(err == CL_SUCCESS);
	*/

	//second pass kernel sizes
	size_t localWorkSize1[1] = { 256 };
	size_t globalWorkSize1[1] = { batchSize*(313)*256 }; // 313 is calculated by the permutation algorithm given input 3 and 729
	err = clSetKernelArg(kernel1, 0, sizeof(cl_mem), &bufA);
	assert(err == CL_SUCCESS);

	if (performance_level == 0)
	{
		//check result
		//first launch kernel 1 that swaps lines
		err = clEnqueueNDRangeKernel(queue, kernel1, 1, NULL,
			globalWorkSize1, localWorkSize1, 0, NULL, &event1);
		assert(err == CL_SUCCESS);
		err = clFinish(queue);
		assert(err == CL_SUCCESS);
		err = clEnqueueReadBuffer(queue, bufA, CL_TRUE, 0,
			(batchSize*m*n) * sizeof(*CPU_A_TEMP), CPU_A_TEMP,
			0, NULL, NULL);
		assert(err == CL_SUCCESS);

		//second pass that transposes each minibatch
		
		err = clEnqueueNDRangeKernel(queue, kernel2, 1, NULL,
			globalWorkSize2, localWorkSize2, 0, NULL, &event2);
		assert(err == CL_SUCCESS);
		err = clFinish(queue);
		

	}
	else if (performance_level == 1)
	{
		//check kernel performance
		//second pass
		err = clEnqueueNDRangeKernel(queue, kernel1, 1, NULL,
			globalWorkSize1, localWorkSize1, 0, NULL, &event1);
		assert(err == CL_SUCCESS);
		clWaitForEvents(1, &event1);
		assert(err == CL_SUCCESS);

		err = clEnqueueNDRangeKernel(queue, kernel2, 1, NULL,
			globalWorkSize2, localWorkSize2, 0, NULL, &event2);
		assert(err == CL_SUCCESS);
		clWaitForEvents(1, &event2);
		assert(err == CL_SUCCESS);

		cl_ulong start1, end1, start2, end2;
		cl_ulong KernelTime1 = 0;
		cl_ulong KernelTime2 = 0;
		int iteration = 20;
		for (int i = 0; i < iteration; i++)
		{
			event1 = NULL;
			event2 = NULL;

			err = clEnqueueNDRangeKernel(queue, kernel1, 1, NULL,
				globalWorkSize1, localWorkSize1, 0, NULL, &event1);

			assert(err == CL_SUCCESS);
			clWaitForEvents(1, &event1);
			assert(err == CL_SUCCESS);


			err = clEnqueueNDRangeKernel(queue, kernel2, 1, NULL,
				globalWorkSize2, localWorkSize2, 0, NULL, &event2);

			assert(err == CL_SUCCESS);
			clWaitForEvents(1, &event2);
			assert(err == CL_SUCCESS);

			err = clGetEventProfilingInfo(event1, CL_PROFILING_COMMAND_START,
				sizeof(start1), &start1, NULL);
			err = clGetEventProfilingInfo(event1, CL_PROFILING_COMMAND_END,
				sizeof(end1), &end1, NULL);

			err = clGetEventProfilingInfo(event2, CL_PROFILING_COMMAND_START,
				sizeof(start2), &start2, NULL);
			err = clGetEventProfilingInfo(event2, CL_PROFILING_COMMAND_END,
				sizeof(end2), &end2, NULL);

			KernelTime1 += (end1 - start1);
			KernelTime2 += (end2 - start2);

		}

		//KernelTime is in ns
		size_t peakGBs = 512;
		std::cout << "the first kernel takes " << KernelTime1/iteration << " ns in average." << std::endl;
		std::cout << "the second kernel takes " << KernelTime2/iteration << " ns in average." << std::endl;

		size_t KernelGBs = 2 * sizeof(std::complex<float>) * m * n * batchSize / ((KernelTime1 + KernelTime2) / iteration);
		std::cout << " GBs: " << KernelGBs << " GBs" << std::endl;
		float efficiency = ((float)KernelGBs) / (float)peakGBs;
		std::cout << " efficiency: " << efficiency * 100 << "%" << std::endl;
	}

	//move memory from device to host
	err = clEnqueueReadBuffer(queue, bufA, CL_TRUE, 0,
		(batchSize*m*n) * sizeof(*CPU_A_OUT), CPU_A_OUT,
		0, NULL, NULL);
	assert(err == CL_SUCCESS);

	if (performance_level == 0)
	{
		//check result
		int error = 0;
		
		for (int k = 0; k < batchSize; k++)
		{
			for (int q = 0; q < miniBatchSize; q++)
			{
				for (int i = 0; i < m; i++)
				{
					for (int j = 0; j < n / 3; j++)
					{
						//std::complex<float> out = CPU_A_TEMP[k*m*n + q*n/3 + i*n + j];
						std::complex<float> out = CPU_A_TEMP[k*m*n + q*m*n/3 + i*n/3 +j];
						std::complex<float> in = CPU_A[k*m*n + q*n / 3 + i*n + j];

						if (in != out)
						{
							error = 1;
							break;
						}
					}
				}
			}
		}

		if (error == 0)
		{
			std::cout << "first kernel correstness passed." << std::endl;
		}
		else
		{
			std::cout << "first kernel correctness failed." << std::endl;
		}
		
		for (int k = 0; k < batchSize; k++)
		{
			for (int q = 0; q < miniBatchSize; q++)
			{
				for (int i = 0; i < m; i++)
				{
					for (int j = 0; j < n / 3; j++)
					{
						std::complex<float> out = CPU_A_OUT[k*m*n + q*m*n/3 + j*m + i];
						std::complex<float> in = CPU_A[k*m*n + q*n/3 + i*n + j];

						if (in != out)
						{
							error = 1;
							break;
						}
					}
				}
			}
		}
		
		if (error == 0)
		{
			std::cout << "correstness passed." << std::endl;
		}
		else
		{
			std::cout << "correctness failed." << std::endl;
		}
		
	}

	//releasing the objects
	err = clReleaseMemObject(bufA);
	err = clReleaseEvent(event1);
	err = clReleaseEvent(event2);
	err = clReleaseKernel(kernel1);
	err = clReleaseKernel(kernel2);
	err = clReleaseCommandQueue(queue);
	err = clReleaseContext(context);
	free(CPU_A_TEMP);
	free(CPU_A);
	free(CPU_A_OUT);
}
Example #13
/** Filter a response signal of the neural microcircuit.
    \param R Response of the neural microcircuit.
    \param X Target vector where to save the results.
    \param indices Indices where to store the results in X.
    \return -1 if an error occurred, 1 for success. */
int AnalogFilter::filter(const double* R, double* X, int* indices) {

  if (R == 0) {
    TheCsimError.add("AnalogFilter::filter: Input is a NULL pointer!\n");
    return -1;
  }
  if (X == 0) {
    TheCsimError.add("AnalogFilter::filter: Target vector is a NULL pointer!\n");
    return -1;
  }

  deque<double>::iterator p;
  double f_value;
  int i, j;

  nInputAvailable++;

  if ((nInputAvailable) <= nKernelLength) {
    // Length of collected input data is shorter than
    // desired size of filter kernel
    // Calculate a new shorter kernel

    createKernel(nInputAvailable);
  }

  // Put the new data into the queues
  for (i=0; i<nChannels; i++) {
    // Delete the oldest element in the queue
    if (nInputAvailable > nKernelLength)
      dataQueues[i]->pop_front();
    // Add the new value at the end of the queue
    dataQueues[i]->push_back(R[i]);
  }

  int nToFilter = min(nInputAvailable, nKernelLength);


  // Filter all analog channels
  for (i=0; i<nChannels; i++) {
    p = dataQueues[i]->begin();
    f_value = 0.0;

    for (j=0; j<nToFilter; j++) {
      // Calculate filtered value
      f_value += filter_kernel[j] * *p;

      if (p != dataQueues[i]->end()) {
        if (j < (nToFilter - 1))
          // Do not advance the iterator for the last object, since we want
          // to change its content
          p++;
      }
      else {
	TheCsimError.add("AnalogFilter::filter: Data was lost before filtering!\n");
	return -1;
      }
    }

    // Store the filtered value: Replace the value of the last input
    *p = f_value;
    if (indices)
      X[indices[i]] = f_value;
    else
      X[i] = f_value;
  }

  return 1;
}
Example #14
void
bluesteinsFFTGpu(const char* const argv[],const unsigned n, 
		 const unsigned orign,const unsigned size)
{
    const unsigned powM = (unsigned) log2(n);
    printf("Compiling Bluesteins Program..\n");

    compileProgram(argv, "fft.h", "kernels/bluesteins.cl");

    printf("Creating Kernel\n");
    for (unsigned i = 0; i < deviceCount; ++i) {
        createKernel(i, "bluesteins");
    }

    const unsigned sizePerGPU = size / deviceCount;
    for (unsigned i = 0; i < deviceCount; ++i) {
        workSize[i] = (i != (deviceCount - 1)) ? sizePerGPU 
                                               : (size - workOffset[i]);       
        
        allocateDeviceMemoryBS(i , workSize[i], workOffset[i]);
        
        clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void*) &d_Hreal[i]);
        clSetKernelArg(kernel[i], 1, sizeof(cl_mem), (void*) &d_Himag[i]);
        clSetKernelArg(kernel[i], 2, sizeof(cl_mem), (void*) &d_Yreal[i]);
        clSetKernelArg(kernel[i], 3, sizeof(cl_mem), (void*) &d_Yimag[i]);
        clSetKernelArg(kernel[i], 4, sizeof(cl_mem), (void*) &d_Zreal[i]);
        clSetKernelArg(kernel[i], 5, sizeof(cl_mem), (void*) &d_Zimag[i]);
        clSetKernelArg(kernel[i], 6, sizeof(unsigned), &n);
        clSetKernelArg(kernel[i], 7, sizeof(unsigned), &orign);
        clSetKernelArg(kernel[i], 8, sizeof(unsigned), &powM);
        clSetKernelArg(kernel[i], 9, sizeof(unsigned), &blockSize);
       

        if ((i + 1) < deviceCount) {
            workOffset[i + 1] = workOffset[i] + workSize[i];
        } 

    }

    size_t localWorkSize[] = {blockSize};
    for (unsigned i = 0; i < deviceCount; ++i) {
        size_t globalWorkSize[] = {shrRoundUp(blockSize, workSize[i])}; 
        // kernel non blocking execution 
        runKernel(i, localWorkSize, globalWorkSize);
    }

    h_Rreal = h_Hreal;
    h_Rimag = h_Himag;
    
    for (unsigned i = 0; i < deviceCount; ++i) {
        copyFromDevice(i, d_Hreal[i], h_Rreal + workOffset[i],
                                                workSize[i]); 
        copyFromDevice(i, d_Himag[i], h_Rimag + workOffset[i],
                                                 workSize[i]);
    }

    // wait for copy event
    const cl_int ciErrNum = clWaitForEvents(deviceCount, gpuDone);
    checkError(ciErrNum, CL_SUCCESS, "clWaitForEvents");
    printGpuTime();
}
Example #15
// -------------------------------------------------------------------------
cl::Kernel MorphOpenCL::createKernel(const cl::Program& prog, 
	const QString& kernelName)
{
	std::string b = kernelName.toStdString();
	return createKernel(prog, b.c_str());
}
Example #16
 theKernels(cl_context GPUContext, cl_device_id cdDevice) {
     GPUContext_K = GPUContext;
     cdDevice_K   = cdDevice;
     if(device_use)
     {
         createKernel("pairwiseDistanceKernel","../../../src/E_PairwiseDistance.cl",0);
     }
     else
         createKernel("pairwiseDistanceKernel","../../../src/CPU_PairwiseDistance.cl",0);
     createKernel("argminKernel","../../../src/argminKernel.cl",1);
     createKernel("argmaxKernel","../../../src/argmaxKernel.cl",2);
     createKernel("minKernel","../../../src/minKernel.cl",3);
     createKernel("maxKernel","../../../src/maxKernel.cl",4);
     if(device_use)
         createKernel("blockwise_distance_kernel","../../../src/E_blockwise_distance_kernel.cl",5);
     else
         createKernel("blockwise_distance_kernel","../../../src/CPU_blockwise_distance_kernel.cl",5);
     createKernel("blockwise_filter_kernel","../../../src/blockwise_filter_kernel.cl",6);
     createKernel("cell_histogram_kernel","../../../src/cell_histogram_kernel.cl",7);
     createKernel("cellHistogramKernel1","../../../src/cellHistogramKernel1.cl",8);
     createKernel("cellHistogramKernel2","../../../src/cellHistogramKernel2.cl",9);
     createKernel("cellHistogramKernel3","../../../src/cellHistogramKernel3.cl",10);
 }
Example #17
int oclFluid3D::compile()
{
	clInitFluid = 0;
	clIntegrateForce = 0;
	clIntegrateVelocity = 0;
	clHash = 0;
	clReorder = 0;
	clInitBounds = 0;

	if (!mRadixSort.compile())
	{
		return 0;
	}

	if (!oclProgram::compile())
	{
		return 0;
	}

	clInitFluid = createKernel("clInitFluid");
	KERNEL_VALIDATE(clInitFluid)
	clIntegrateForce = createKernel("clIntegrateForce");
	KERNEL_VALIDATE(clIntegrateForce)
	clIntegrateVelocity = createKernel("clIntegrateVelocity");
	KERNEL_VALIDATE(clIntegrateVelocity)
	clHash = createKernel("clHash");
	KERNEL_VALIDATE(clHash)
	clReorder = createKernel("clReorder");
	KERNEL_VALIDATE(clReorder)
	clInitBounds = createKernel("clInitBounds");
	KERNEL_VALIDATE(clInitBounds)
	clFindBounds = createKernel("clFindBounds");
	KERNEL_VALIDATE(clFindBounds)
	clCalculateDensity = createKernel("clCalculateDensity");
	KERNEL_VALIDATE(clCalculateDensity)
	clCalculateForces = createKernel("clCalculateForces");
	KERNEL_VALIDATE(clCalculateForces)
	clGravity = createKernel("clGravity");
	KERNEL_VALIDATE(clGravity)
	clClipBox = createKernel("clClipBox");
	KERNEL_VALIDATE(clClipBox)

	// init fluid parameters
	clSetKernelArg(clInitFluid, 0, sizeof(cl_mem), bfParams);
	clEnqueueTask(mContext.getDevice(0), clInitFluid, 0, NULL, clInitFluid.getEvent());
	bfParams.map(CL_MAP_READ);

	return bindBuffers();
}
Example #18
void cluster_t::init_opencl(){
  if(run_gpu){
  // initialize the GPU if necessary
#ifdef USE_GPU
    debug_opencl = false;
    proxmap_t::init_opencl();
    cerr<<"Initializing OpenCL for cluster sub class\n";
    cerr<<"P is "<<p<<", Workgroup width is "<<variable_blocks<<endl;
    // CREATE KERNELS
    createKernel("init_U",kernel_init_U);
    createKernel("update_U",kernel_update_U);
    createKernel("update_map_distance",kernel_update_map_distance);
    createKernel("init_v_project_coeff",kernel_init_v_project_coeff);
    createKernel("store_U_project",kernel_store_U_project);
    createKernel("store_U_project_prev",kernel_store_U_project_prev);
    createKernel("iterate_projection",kernel_iterate_projection);
    createKernel("evaluate_obj",kernel_evaluate_obj);
    createKernel("get_U_norm_diff",kernel_get_U_norm_diff);
    cerr<<"Kernels created\n";
    // CREATE BUFFERS
    createBuffer<float>(CL_MEM_READ_WRITE,n*p,"buffer_U",buffer_U);
    createBuffer<float>(CL_MEM_READ_WRITE,n*p,"buffer_U_prev",buffer_U_prev);
    createBuffer<float>(CL_MEM_READ_WRITE,n*p,"buffer_U_project",buffer_U_project);
    createBuffer<float>(CL_MEM_READ_WRITE,n*p,"buffer_U_project_orig",buffer_U_project_orig);
    createBuffer<float>(CL_MEM_READ_WRITE,n*p,"buffer_U_project_prev",buffer_U_project_prev);
    createBuffer<float>(CL_MEM_READ_WRITE,triangle_dim,"buffer_V_project_coeff",buffer_V_project_coeff);
    createBuffer<float>(CL_MEM_READ_ONLY,n*p,"buffer_rawdata",buffer_rawdata);
    createBuffer<float>(CL_MEM_READ_ONLY,triangle_dim,"buffer_weights",buffer_weights);
    createBuffer<int>(CL_MEM_READ_ONLY,n,"buffer_offsets",buffer_offsets);
    createBuffer<float>(CL_MEM_READ_WRITE,variable_blocks,"buffer_variable_block_norms1",buffer_variable_block_norms1);
    createBuffer<float>(CL_MEM_READ_WRITE,variable_blocks,"buffer_variable_block_norms2",buffer_variable_block_norms2);
    createBuffer<float>(CL_MEM_READ_WRITE,n*variable_blocks,"buffer_subject_variable_block_norms",buffer_subject_variable_block_norms);
    createBuffer<float>(CL_MEM_READ_ONLY,1,"buffer_unweighted_lambda",buffer_unweighted_lambda);
    createBuffer<float>(CL_MEM_READ_ONLY,1,"buffer_dist_func",buffer_dist_func);
    createBuffer<float>(CL_MEM_READ_ONLY,1,"buffer_rho",buffer_rho);
    createBuffer<float>(CL_MEM_READ_WRITE,n,"buffer_n_norms",buffer_n_norms);
    createBuffer<float>(CL_MEM_READ_WRITE,triangle_dim,"buffer_n2_norms",buffer_n2_norms);
    ////createBuffer<>(CL_MEM_READ_ONLY,,"buffer_",buffer_);
    cerr<<"GPU Buffers created\n";
    // initialize anything here
    writeToBuffer(buffer_U,n*p,U,"buffer_U");
    writeToBuffer(buffer_U_prev,n*p,U_prev,"buffer_U_prev");
    writeToBuffer(buffer_U_project,n*p,U_project,"buffer_U_project");
    writeToBuffer(buffer_U_project_orig,n*p,U_project_orig,"buffer_U_project_orig");
    writeToBuffer(buffer_rawdata,n*p,rawdata,"buffer_rawdata");
    writeToBuffer(buffer_offsets,n,offsets,"buffer_offsets");
    cerr<<"GPU Buffers initialized\n";
    // SET KERNEL ARGUMENTS HERE
    int arg;
    //int kernelWorkGroupSize;
    arg = 0;
    setArg(kernel_update_U,arg,p,"kernel_update_U");
    setArg(kernel_update_U,arg,*buffer_dist_func,"kernel_update_U");
    setArg(kernel_update_U,arg,*buffer_rho,"kernel_update_U");
    setArg(kernel_update_U,arg,*buffer_U,"kernel_update_U");
    setArg(kernel_update_U,arg,*buffer_U_prev,"kernel_update_U");
    setArg(kernel_update_U,arg,*buffer_rawdata,"kernel_update_U");
    setArg(kernel_update_U,arg,*buffer_U_project,"kernel_update_U");
    arg = 0;
    setArg(kernel_init_U,arg,p,"kernel_init_U");
    setArg(kernel_init_U,arg,*buffer_rawdata,"kernel_init_U");
    setArg(kernel_init_U,arg,*buffer_U,"kernel_init_U");
    setArg(kernel_init_U,arg,*buffer_U_project,"kernel_init_U");
    setArg(kernel_init_U,arg,*buffer_U_project_orig,"kernel_init_U");
    arg = 0;
    setArg(kernel_update_map_distance,arg,n,"kernel_update_map_distance");
    setArg(kernel_update_map_distance,arg,p,"kernel_update_map_distance");
    setArg(kernel_update_map_distance,arg,*buffer_U,"kernel_update_map_distance");
    setArg(kernel_update_map_distance,arg,*buffer_U_project,"kernel_update_map_distance");
    setArg(kernel_update_map_distance,arg,*buffer_variable_block_norms1,"kernel_update_map_distance");
    setArg(kernel_update_map_distance,arg,*buffer_variable_block_norms2,"kernel_update_map_distance");
    setArg(kernel_update_map_distance,arg,cl::__local(sizeof(float)*BLOCK_WIDTH),"kernel_update_map_distance");
    setArg(kernel_update_map_distance,arg,cl::__local(sizeof(float)*BLOCK_WIDTH),"kernel_update_map_distance");
    arg = 0;
    setArg(kernel_init_v_project_coeff,arg,n,"kernel_init_v_project_coeff");
    setArg(kernel_init_v_project_coeff,arg,p,"kernel_init_v_project_coeff");
    setArg(kernel_init_v_project_coeff,arg,variable_blocks,"kernel_init_v_project_coeff");
    setArg(kernel_init_v_project_coeff,arg,*buffer_unweighted_lambda,"kernel_init_v_project_coeff");
    setArg(kernel_init_v_project_coeff,arg,*buffer_weights,"kernel_init_v_project_coeff");
    setArg(kernel_init_v_project_coeff,arg,*buffer_U_project_orig,"kernel_init_v_project_coeff");
    setArg(kernel_init_v_project_coeff,arg,*buffer_V_project_coeff,"kernel_init_v_project_coeff");
    setArg(kernel_init_v_project_coeff,arg,*buffer_offsets,"kernel_init_v_project_coeff");
    setArg(kernel_init_v_project_coeff,arg,cl::__local(sizeof(float)*BLOCK_WIDTH),"kernel_init_v_project_coeff");
    arg = 0; 
    setArg(kernel_store_U_project,arg,p,"kernel_store_U_project");
    setArg(kernel_store_U_project,arg,*buffer_U,"kernel_store_U_project");
    setArg(kernel_store_U_project,arg,*buffer_U_project,"kernel_store_U_project");
    setArg(kernel_store_U_project,arg,*buffer_U_project_orig,"kernel_store_U_project");
    arg = 0; 
    setArg(kernel_store_U_project_prev,arg,p,"kernel_store_U_project_prev");
    setArg(kernel_store_U_project_prev,arg,*buffer_U_project,"kernel_store_U_project_prev");
    setArg(kernel_store_U_project_prev,arg,*buffer_U_project_prev,"kernel_store_U_project_prev");
    arg = 0; 
    setArg(kernel_iterate_projection,arg,n,"kernel_iterate_projection");
    setArg(kernel_iterate_projection,arg,p,"kernel_iterate_projection");
    setArg(kernel_iterate_projection,arg,variable_blocks,"kernel_iterate_projection");
    setArg(kernel_iterate_projection,arg,*buffer_U,"kernel_iterate_projection");
    setArg(kernel_iterate_projection,arg,*buffer_U_project,"kernel_iterate_projection");
    setArg(kernel_iterate_projection,arg,*buffer_U_project_orig,"kernel_iterate_projection");
    setArg(kernel_iterate_projection,arg,*buffer_U_project_prev,"kernel_iterate_projection");
    setArg(kernel_iterate_projection,arg,*buffer_offsets,"kernel_iterate_projection");
    setArg(kernel_iterate_projection,arg,*buffer_weights,"kernel_iterate_projection");
    setArg(kernel_iterate_projection,arg,*buffer_V_project_coeff,"kernel_iterate_projection");
    setArg(kernel_iterate_projection,arg,*buffer_subject_variable_block_norms,"kernel_iterate_projection");
    setArg(kernel_iterate_projection,arg,cl::__local(sizeof(float)*BLOCK_WIDTH),"kernel_iterate_projection");
    setArg(kernel_iterate_projection,arg,cl::__local(sizeof(float)*BLOCK_WIDTH),"kernel_iterate_projection");
    setArg(kernel_iterate_projection,arg,cl::__local(sizeof(float)*BLOCK_WIDTH),"kernel_iterate_projection");
    setArg(kernel_iterate_projection,arg,cl::__local(sizeof(float)*BLOCK_WIDTH),"kernel_iterate_projection");
    arg = 0; 
    setArg(kernel_evaluate_obj,arg,n,"kernel_evaluate_obj");
    setArg(kernel_evaluate_obj,arg,p,"kernel_evaluate_obj");
    setArg(kernel_evaluate_obj,arg,variable_blocks,"kernel_evaluate_obj");
    setArg(kernel_evaluate_obj,arg,*buffer_offsets,"kernel_evaluate_obj");
    setArg(kernel_evaluate_obj,arg,*buffer_rawdata,"kernel_evaluate_obj");
    setArg(kernel_evaluate_obj,arg,*buffer_U,"kernel_evaluate_obj");
    setArg(kernel_evaluate_obj,arg,*buffer_U_prev,"kernel_evaluate_obj");
    setArg(kernel_evaluate_obj,arg,*buffer_U_project,"kernel_evaluate_obj");
    setArg(kernel_evaluate_obj,arg,*buffer_weights,"kernel_evaluate_obj");
    setArg(kernel_evaluate_obj,arg,*buffer_V_project_coeff,"kernel_evaluate_obj");
    setArg(kernel_evaluate_obj,arg,*buffer_n_norms,"kernel_evaluate_obj");
    setArg(kernel_evaluate_obj,arg,*buffer_n2_norms,"kernel_evaluate_obj");
    setArg(kernel_evaluate_obj,arg,cl::__local(sizeof(float)*BLOCK_WIDTH),"kernel_evaluate_obj");
    setArg(kernel_evaluate_obj,arg,cl::__local(sizeof(float)*BLOCK_WIDTH),"kernel_evaluate_obj");
    arg = 0; 
    setArg(kernel_get_U_norm_diff,arg,n,"kernel_get_U_norm_diff");
    setArg(kernel_get_U_norm_diff,arg,p,"kernel_get_U_norm_diff");
    setArg(kernel_get_U_norm_diff,arg,variable_blocks,"kernel_get_U_norm_diff");
    setArg(kernel_get_U_norm_diff,arg,*buffer_U,"kernel_get_U_norm_diff");
    setArg(kernel_get_U_norm_diff,arg,*buffer_U_prev,"kernel_get_U_norm_diff");
    setArg(kernel_get_U_norm_diff,arg,*buffer_n_norms,"kernel_get_U_norm_diff");
    setArg(kernel_get_U_norm_diff,arg,cl::__local(sizeof(float)*BLOCK_WIDTH),"kernel_get_U_norm_diff");
    //setArg(kernel_reduce_weights2,arg,g_people,"kernel_reduce_weights2");
    //kernelWorkGroupSize = kernel_reduce_weights2->getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(devices[0], &err);
    //clSafe(err,"get workgroup size kernel reduce_weights2");
    //cerr<<"reduce_weights2 kernel work group size is "<<kernelWorkGroupSize<<endl;
    cerr<<"GPU kernel arguments assigned.\n";
#endif
  }
}