Пример #1
0
float *OpenClFindNearestNeighbors(
	cl_context context,
	int numRecords,
	std::vector<LatLong> &locations,float lat,float lng,
	int timing) {

    // 1. set up kernel
    cl_kernel NN_kernel;
        cl_int status;
        cl_program cl_NN_program;
        cl_NN_program = cl_compileProgram(
            (char *)"nearestNeighbor_kernel.cl",NULL);
       
        NN_kernel = clCreateKernel(
            cl_NN_program, "NearestNeighbor", &status);
        status = cl_errChk(status, (char *)"Error Creating Nearest Neighbor kernel",true);
        if(status)exit(1);
    // 2. set up memory on device and send ipts data to device
    // copy ipts(1,2) to device
    // also need to alloate memory for the distancePoints
    cl_mem d_locations;
    cl_mem d_distances;

    cl_int error=0;

    d_locations = clCreateBuffer(context, CL_MEM_READ_ONLY,
        sizeof(LatLong) * numRecords, NULL, &error);

    d_distances = clCreateBuffer(context, CL_MEM_READ_WRITE,
        sizeof(float) * numRecords, NULL, &error);

    cl_command_queue command_queue = cl_getCommandQueue();
    cl_event writeEvent,kernelEvent,readEvent;
    error = clEnqueueWriteBuffer(command_queue,
               d_locations,
               1, // change to 0 for nonblocking write
               0, // offset
               sizeof(LatLong) * numRecords,
               &locations[0],
               0,
               NULL,
               &writeEvent);

    // 3. send arguments to device
    cl_int argchk;
    argchk  = clSetKernelArg(NN_kernel, 0, sizeof(cl_mem), (void *)&d_locations);
    argchk |= clSetKernelArg(NN_kernel, 1, sizeof(cl_mem), (void *)&d_distances);
    argchk |= clSetKernelArg(NN_kernel, 2, sizeof(int), (void *)&numRecords);
    argchk |= clSetKernelArg(NN_kernel, 3, sizeof(float), (void *)&lat);
    argchk |= clSetKernelArg(NN_kernel, 4, sizeof(float), (void *)&lng);

    cl_errChk(argchk,"ERROR in Setting Nearest Neighbor kernel args",true);

    // 4. enqueue kernel
    size_t globalWorkSize[1];
    globalWorkSize[0] = numRecords;
    if (numRecords % 64) globalWorkSize[0] += 64 - (numRecords % 64);
    //printf("Global Work Size: %zu\n",globalWorkSize[0]);      

    error = clEnqueueNDRangeKernel(
        command_queue,  NN_kernel, 1, 0,
        globalWorkSize,NULL,
        0, NULL, &kernelEvent);

    cl_errChk(error,"ERROR in Executing Kernel NearestNeighbor",true);

    // 5. transfer data off of device
    
    // create distances std::vector
    float *distances = (float *)malloc(sizeof(float) * numRecords);

    error = clEnqueueReadBuffer(command_queue,
        d_distances,
        1, // change to 0 for nonblocking write
        0, // offset
        sizeof(float) * numRecords,
        distances,
        0,
        NULL,
        &readEvent);

    cl_errChk(error,"ERROR with clEnqueueReadBuffer",true);
    if (timing) {
        clFinish(command_queue);
        cl_ulong eventStart,eventEnd,totalTime=0;
        printf("# Records\tWrite(s) [size]\t\tKernel(s)\tRead(s)  [size]\t\tTotal(s)\n");
        printf("%d        \t",numRecords);
        // Write Buffer
        error = clGetEventProfilingInfo(writeEvent,CL_PROFILING_COMMAND_START,
                                        sizeof(cl_ulong),&eventStart,NULL);
        cl_errChk(error,"ERROR in Event Profiling (Write Start)",true); 
        error = clGetEventProfilingInfo(writeEvent,CL_PROFILING_COMMAND_END,
                                        sizeof(cl_ulong),&eventEnd,NULL);
        cl_errChk(error,"ERROR in Event Profiling (Write End)",true);

        printf("%f [%.2fMB]\t",(float)((eventEnd-eventStart)/1e9),(float)((sizeof(LatLong) * numRecords)/1e6));
        totalTime += eventEnd-eventStart;
        // Kernel
        error = clGetEventProfilingInfo(kernelEvent,CL_PROFILING_COMMAND_START,
                                        sizeof(cl_ulong),&eventStart,NULL);
        cl_errChk(error,"ERROR in Event Profiling (Kernel Start)",true); 
        error = clGetEventProfilingInfo(kernelEvent,CL_PROFILING_COMMAND_END,
                                        sizeof(cl_ulong),&eventEnd,NULL);
        cl_errChk(error,"ERROR in Event Profiling (Kernel End)",true);

        printf("%f\t",(float)((eventEnd-eventStart)/1e9));
        totalTime += eventEnd-eventStart;
        // Read Buffer
        error = clGetEventProfilingInfo(readEvent,CL_PROFILING_COMMAND_START,
                                        sizeof(cl_ulong),&eventStart,NULL);
        cl_errChk(error,"ERROR in Event Profiling (Read Start)",true); 
        error = clGetEventProfilingInfo(readEvent,CL_PROFILING_COMMAND_END,
                                        sizeof(cl_ulong),&eventEnd,NULL);
        cl_errChk(error,"ERROR in Event Profiling (Read End)",true);

        printf("%f [%.2fMB]\t",(float)((eventEnd-eventStart)/1e9),(float)((sizeof(float) * numRecords)/1e6));
        totalTime += eventEnd-eventStart;
        
        printf("%f\n\n",(float)(totalTime/1e9));
    }
    // 6. return finalized data and release buffers
    clReleaseMemObject(d_locations);
    clReleaseMemObject(d_distances);
	return distances;
}
Пример #2
0
/*!
*/
cl_kernel* cl_precompileKernels(char* buildOptions)
{
    // Compile each program and create the kernel objects

    printf("Precompiling kernels...\n");

    cl_time totalstart, totalend;
    cl_time start, end;

    cl_getTime(&totalstart);

    // Creating descriptors kernel
    cl_getTime(&start);
    program_list[1]  = cl_compileProgram("CLSource/createDescriptors_kernel.cl",
        buildOptions, false);
    cl_getTime(&end);
    events->newCompileEvent(cl_computeTime(start, end), "createDescriptors");
    kernel_list[KERNEL_SURF_DESC] = cl_createKernel(program_list[1],
        "createDescriptors_kernel");

        // Get orientation kernels
    cl_getTime(&start);
    program_list[4]  = cl_compileProgram("CLSource/getOrientation_kernels.cl",
        buildOptions, false);
    cl_getTime(&end);
    events->newCompileEvent(cl_computeTime(start, end), "Orientation");
    kernel_list[KERNEL_GET_ORIENT1] = cl_createKernel(program_list[4],
        "getOrientationStep1");
    kernel_list[KERNEL_GET_ORIENT2] = cl_createKernel(program_list[4],
        "getOrientationStep2");

    // Hessian determinant kernel
    cl_getTime(&start);
    program_list[0]  = cl_compileProgram("CLSource/hessianDet_kernel.cl",
        buildOptions, false);
    cl_getTime(&end);
    events->newCompileEvent(cl_computeTime(start, end), "hessian_det");
    kernel_list[KERNEL_BUILD_DET] = cl_createKernel(program_list[0],
        "hessian_det");

    // Integral image kernels
    cl_getTime(&start);
    program_list[6] = cl_compileProgram("CLSource/integralImage_kernels.cl",
        buildOptions, false);
    cl_getTime(&end);
    events->newCompileEvent(cl_computeTime(start, end), "IntegralImage");
    kernel_list[KERNEL_SCAN] = cl_createKernel(program_list[6], "scan");
    kernel_list[KERNEL_SCAN4] = cl_createKernel(program_list[6], "scan4");
    kernel_list[KERNEL_SCANIMAGE] = cl_createKernel(program_list[6],
        "scanImage");
    kernel_list[KERNEL_TRANSPOSE] = cl_createKernel(program_list[6],
        "transpose");
    kernel_list[KERNEL_TRANSPOSEIMAGE] = cl_createKernel(program_list[6],
        "transposeImage");

    // Nearest neighbor kernels
    cl_getTime(&start);
    program_list[5]  = cl_compileProgram("CLSource/nearestNeighbor_kernel.cl",
        buildOptions, false);
    cl_getTime(&end);
    events->newCompileEvent(cl_computeTime(start, end), "NearestNeighbor");
    kernel_list[KERNEL_NN] = cl_createKernel(program_list[5],
        "NearestNeighbor");

    // Non-maximum suppression kernel
    cl_getTime(&start);
    program_list[3]  = cl_compileProgram("CLSource/nonMaxSuppression_kernel.cl",
        buildOptions, false);
    cl_getTime(&end);
    events->newCompileEvent(cl_computeTime(start, end), "NonMaxSuppression");
    kernel_list[KERNEL_NON_MAX_SUP] = cl_createKernel(program_list[3],
        "non_max_supression_kernel");

    // Normalization of descriptors kernel
    cl_getTime(&start);
    program_list[2]  = cl_compileProgram("CLSource/normalizeDescriptors_kernel.cl",
        buildOptions, false);
    cl_getTime(&end);
    events->newCompileEvent(cl_computeTime(start, end), "normalize");
    kernel_list[KERNEL_NORM_DESC] = cl_createKernel(program_list[2],
        "normalizeDescriptors");

    cl_getTime(&totalend);

    printf("\tTime for Off-Critical Path Compilation: %.3f milliseconds\n\n",
        cl_computeTime(totalstart, totalend));

    return kernel_list;
}
Пример #3
0
/*------------------------------------------------------
 ** ForwardSub() -- Forward substitution of Gaussian
 ** elimination.
 **------------------------------------------------------
 */
void ForwardSub(cl_context context, float *a, float *b, float *m, int size,int timing){    
	// 1. set up kernels
	cl_kernel fan1_kernel,fan2_kernel;
	cl_int status=0;
	cl_program gaussianElim_program;
	cl_event writeEvent,kernelEvent,readEvent;
	float writeTime=0,readTime=0,kernelTime=0;
	float writeMB=0,readMB=0;

	gaussianElim_program = cl_compileProgram(
			(char *)"gaussianElim_kernels.cl",NULL);

	fan1_kernel = clCreateKernel(
			gaussianElim_program, "Fan1", &status);
	status = cl_errChk(status, (char *)"Error Creating Fan1 kernel",true);
	if(status)exit(1);

	fan2_kernel = clCreateKernel(
			gaussianElim_program, "Fan2", &status);
	status = cl_errChk(status, (char *)"Error Creating Fan2 kernel",true);
	if(status)exit(1);

	// 2. set up memory on device and send ipts data to device

	cl_mem a_dev, b_dev, m_dev;

	cl_int error=0;

	a_dev = clCreateBuffer(context, CL_MEM_READ_WRITE,
			sizeof(float)*size*size, NULL, &error);

	b_dev = clCreateBuffer(context, CL_MEM_READ_WRITE,
			sizeof(float)*size, NULL, &error);

	m_dev = clCreateBuffer(context, CL_MEM_READ_WRITE,
			sizeof(float) * size * size, NULL, &error);

	command_queue = cl_getCommandQueue();

	error = clEnqueueWriteBuffer(command_queue,
			a_dev,
			1, // change to 0 for nonblocking write
			0, // offset
			sizeof(float)*size*size,
			a,
			0,
			NULL,
			&writeEvent);

	if (timing) writeTime+=eventTime(writeEvent,command_queue);
	clReleaseEvent(writeEvent);

	error = clEnqueueWriteBuffer(command_queue,
			b_dev,
			1, // change to 0 for nonblocking write
			0, // offset
			sizeof(float)*size,
			b,
			0,
			NULL,
			&writeEvent);
	if (timing) writeTime+=eventTime(writeEvent,command_queue);
	clReleaseEvent(writeEvent);

	error = clEnqueueWriteBuffer(command_queue,
			m_dev,
			1, // change to 0 for nonblocking write
			0, // offset
			sizeof(float)*size*size,
			m,
			0,
			NULL,
			&writeEvent);
	if (timing) writeTime+=eventTime(writeEvent,command_queue);
	clReleaseEvent(writeEvent);
	writeMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6);

	// 3. Determine block sizes
	size_t globalWorksizeFan1[1];
	size_t globalWorksizeFan2[2];
	size_t localWorksizeFan1Buf[1]={BLOCK_SIZE_0};
	size_t localWorksizeFan2Buf[2]={BLOCK_SIZE_1_X, BLOCK_SIZE_1_Y};
	size_t *localWorksizeFan1=NULL;
	size_t *localWorksizeFan2=NULL;

	globalWorksizeFan1[0] = size;
	globalWorksizeFan2[0] = size;
	globalWorksizeFan2[1] = size;

	if(localWorksizeFan1Buf[0]){
		localWorksizeFan1=localWorksizeFan1Buf;
		globalWorksizeFan1[0]=(int)ceil(globalWorksizeFan1[0]/(double)localWorksizeFan1Buf[0])*localWorksizeFan1Buf[0];
	}
	if(localWorksizeFan2Buf[0]){
		localWorksizeFan2=localWorksizeFan2Buf;
		globalWorksizeFan2[0]=(int)ceil(globalWorksizeFan2[0]/(double)localWorksizeFan2Buf[0])*localWorksizeFan2Buf[0];
		globalWorksizeFan2[1]=(int)ceil(globalWorksizeFan2[1]/(double)localWorksizeFan2Buf[1])*localWorksizeFan2Buf[1];
	}

	int t;
	// 4. Setup and Run kernels
	for (t=0; t<(size-1); t++) {
		// kernel args
		cl_int argchk;
		argchk  = clSetKernelArg(fan1_kernel, 0, sizeof(cl_mem), (void *)&m_dev);
		argchk |= clSetKernelArg(fan1_kernel, 1, sizeof(cl_mem), (void *)&a_dev);
		argchk |= clSetKernelArg(fan1_kernel, 2, sizeof(cl_mem), (void *)&b_dev);
		argchk |= clSetKernelArg(fan1_kernel, 3, sizeof(int), (void *)&size);
		argchk |= clSetKernelArg(fan1_kernel, 4, sizeof(int), (void *)&t);

		cl_errChk(argchk,"ERROR in Setting Fan1 kernel args",true);

		//printf("localWorksizeFan1:%u, globalWorksizeFan1:%u\n", localWorksizeFan1Buf[0], globalWorksizeFan1[0]);	
#pragma dividend local_work_group_size localWorksizeFan1 dim 1 dim1(2:64:2:64)
	//This lws will be used to profile the OpenCL kernel with id 1
			size_t _dividend_lws_localWorksizeFan1_k1[2];
		{
		_dividend_lws_localWorksizeFan1_k1[0] = getLWSValue("DIVIDEND_LWS1_D0",DIVIDEND_LWS1_D0_DEFAULT_VAL);
		//Dividend extension: store the kernel id as the last element
		_dividend_lws_localWorksizeFan1_k1[1] = 1;
		}
				// launch kernel
		error = DIVIDEND_CL_WRAP(clEnqueueNDRangeKernel)(
				command_queue,  fan1_kernel, 1, 0,
				globalWorksizeFan1, _dividend_lws_localWorksizeFan1_k1,
				0, NULL, NULL);

		cl_errChk(error,"ERROR in Executing Fan1 Kernel",true);

		//fprintf(stderr, "AFTER THIS\n");

		argchk  = clSetKernelArg(fan2_kernel, 0, sizeof(cl_mem), (void *)&m_dev);
		argchk |= clSetKernelArg(fan2_kernel, 1, sizeof(cl_mem), (void *)&a_dev);
		argchk |= clSetKernelArg(fan2_kernel, 2, sizeof(cl_mem), (void *)&b_dev);
		argchk |= clSetKernelArg(fan2_kernel, 3, sizeof(int), (void *)&size);
		argchk |= clSetKernelArg(fan2_kernel, 4, sizeof(int), (void *)&t);

		cl_errChk(argchk,"ERROR in Setting Fan2 kernel args",true);

		size_t local_work_size[] = {128, 128};

		//printf("localWorksizeFan2:%u, globalWorksizeFan2[0]:%u, globalWorksizeFan2[1]:%u\n", localWorksizeFan2Buf[0], globalWorksizeFan2[0], globalWorksizeFan2[1]);	
#pragma dividend local_work_group_size local_work_size dim 2 dim1(8:64:2:64) dim2(8:64:2:64)
	//This lws will be used to profile the OpenCL kernel with id 2
			size_t _dividend_lws_local_work_size_k2[3];
		{
		_dividend_lws_local_work_size_k2[0] = getLWSValue("DIVIDEND_LWS2_D0",DIVIDEND_LWS2_D0_DEFAULT_VAL);
		_dividend_lws_local_work_size_k2[1] = getLWSValue("DIVIDEND_LWS2_D1",DIVIDEND_LWS2_D1_DEFAULT_VAL);
		//Dividend extension: store the kernel id as the last element
		_dividend_lws_local_work_size_k2[2] = 2;
		}
				// launch kernel
		error = DIVIDEND_CL_WRAP(clEnqueueNDRangeKernel)(
				command_queue,  fan2_kernel, 2, 0,
				globalWorksizeFan2, _dividend_lws_local_work_size_k2,
				0, NULL, NULL);

		cl_errChk(error,"ERROR in Executing Fan2 Kernel",true);
	
		if (timing) {
			//             printf("here2a\n");
			//             kernelTime+=eventTime(kernelEvent,command_queue);
			//             printf("here2b\n");
		}
		clReleaseEvent(kernelEvent);
		//Fan2<<<dimGridXY,dimBlockXY>>>(m_cuda,a_cuda,b_cuda,Size,Size-t,t);
		//cudaThreadSynchronize();
	}
	// 5. transfer data off of device
	error = clEnqueueReadBuffer(command_queue,
			a_dev,
			1, // change to 0 for nonblocking write
			0, // offset
			sizeof(float) * size * size,
			a,
			0,
			NULL,
			&readEvent);

	cl_errChk(error,"ERROR with clEnqueueReadBuffer",true);
	if (timing) readTime+=eventTime(readEvent,command_queue);
	clReleaseEvent(readEvent);

	error = clEnqueueReadBuffer(command_queue,
			b_dev,
			1, // change to 0 for nonblocking write
			0, // offset
			sizeof(float) * size,
			b,
			0,
			NULL,
			&readEvent);
	cl_errChk(error,"ERROR with clEnqueueReadBuffer",true);
	if (timing) readTime+=eventTime(readEvent,command_queue);
	clReleaseEvent(readEvent);

	error = clEnqueueReadBuffer(command_queue,
			m_dev,
			1, // change to 0 for nonblocking write
			0, // offset
			sizeof(float) * size * size,
			m,
			0,
			NULL,
			&readEvent);

	cl_errChk(error,"ERROR with clEnqueueReadBuffer",true);
	if (timing) readTime+=eventTime(readEvent,command_queue);
	clReleaseEvent(readEvent);
	readMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6);

	if (timing) {
		printf("Matrix Size\tWrite(s) [size]\t\tKernel(s)\tRead(s)  [size]\t\tTotal(s)\n");
		printf("%dx%d      \t",size,size);

		printf("%f [%.2fMB]\t",writeTime,writeMB);


		printf("%f\t",kernelTime);


		printf("%f [%.2fMB]\t",readTime,readMB);

		printf("%f\n\n",writeTime+kernelTime+readTime);
	}

}