/*!
	Prints out the time taken between the start and end of an event.\n
	Adds synchronization in order to be sure that events have
	occured otherwise profiling calls will fail \n

	Shouldnt be used on critical path due to the necessary flushing of the queue
	\param event_time
*/
void cl_KernelTimeSync(cl_event event_time)
{
	cl_int kerneltimer;
	clFlush(cl_getCommandQueue());
	clFinish(cl_getCommandQueue());

	cl_ulong starttime;
	cl_ulong endtime;

	kerneltimer = clGetEventProfilingInfo(event_time,
		CL_PROFILING_COMMAND_START,
		sizeof(cl_ulong), &starttime, NULL);

	if(cl_errChk(kerneltimer, "Error in Start Time \n"))exit(1);

	kerneltimer = clGetEventProfilingInfo(event_time,
		CL_PROFILING_COMMAND_END  ,
		sizeof(cl_ulong), &endtime, NULL);

	if(cl_errChk(kerneltimer, "Error in Start Time \n"))exit(1);
	unsigned long elapsed =  (unsigned long)(endtime - starttime);
	printf("\tTime Elapsed in Kernel is %ld ns\n",elapsed);
}
Example #2
0
float *OpenClFindNearestNeighbors(
	cl_context context,
	int numRecords,
	std::vector<LatLong> &locations,float lat,float lng,
	int timing) {

    // 1. set up kernel
    cl_kernel NN_kernel;
        cl_int status;
        cl_program cl_NN_program;
        cl_NN_program = cl_compileProgram(
            (char *)"nearestNeighbor_kernel.cl",NULL);
       
        NN_kernel = clCreateKernel(
            cl_NN_program, "NearestNeighbor", &status);
        status = cl_errChk(status, (char *)"Error Creating Nearest Neighbor kernel",true);
        if(status)exit(1);
    // 2. set up memory on device and send ipts data to device
    // copy ipts(1,2) to device
    // also need to alloate memory for the distancePoints
    cl_mem d_locations;
    cl_mem d_distances;

    cl_int error=0;

    d_locations = clCreateBuffer(context, CL_MEM_READ_ONLY,
        sizeof(LatLong) * numRecords, NULL, &error);

    d_distances = clCreateBuffer(context, CL_MEM_READ_WRITE,
        sizeof(float) * numRecords, NULL, &error);

    cl_command_queue command_queue = cl_getCommandQueue();
    cl_event writeEvent,kernelEvent,readEvent;
    error = clEnqueueWriteBuffer(command_queue,
               d_locations,
               1, // change to 0 for nonblocking write
               0, // offset
               sizeof(LatLong) * numRecords,
               &locations[0],
               0,
               NULL,
               &writeEvent);

    // 3. send arguments to device
    cl_int argchk;
    argchk  = clSetKernelArg(NN_kernel, 0, sizeof(cl_mem), (void *)&d_locations);
    argchk |= clSetKernelArg(NN_kernel, 1, sizeof(cl_mem), (void *)&d_distances);
    argchk |= clSetKernelArg(NN_kernel, 2, sizeof(int), (void *)&numRecords);
    argchk |= clSetKernelArg(NN_kernel, 3, sizeof(float), (void *)&lat);
    argchk |= clSetKernelArg(NN_kernel, 4, sizeof(float), (void *)&lng);

    cl_errChk(argchk,"ERROR in Setting Nearest Neighbor kernel args",true);

    // 4. enqueue kernel
    size_t globalWorkSize[1];
    globalWorkSize[0] = numRecords;
    if (numRecords % 64) globalWorkSize[0] += 64 - (numRecords % 64);
    //printf("Global Work Size: %zu\n",globalWorkSize[0]);      

    error = clEnqueueNDRangeKernel(
        command_queue,  NN_kernel, 1, 0,
        globalWorkSize,NULL,
        0, NULL, &kernelEvent);

    cl_errChk(error,"ERROR in Executing Kernel NearestNeighbor",true);

    // 5. transfer data off of device
    
    // create distances std::vector
    float *distances = (float *)malloc(sizeof(float) * numRecords);

    error = clEnqueueReadBuffer(command_queue,
        d_distances,
        1, // change to 0 for nonblocking write
        0, // offset
        sizeof(float) * numRecords,
        distances,
        0,
        NULL,
        &readEvent);

    cl_errChk(error,"ERROR with clEnqueueReadBuffer",true);
    if (timing) {
        clFinish(command_queue);
        cl_ulong eventStart,eventEnd,totalTime=0;
        printf("# Records\tWrite(s) [size]\t\tKernel(s)\tRead(s)  [size]\t\tTotal(s)\n");
        printf("%d        \t",numRecords);
        // Write Buffer
        error = clGetEventProfilingInfo(writeEvent,CL_PROFILING_COMMAND_START,
                                        sizeof(cl_ulong),&eventStart,NULL);
        cl_errChk(error,"ERROR in Event Profiling (Write Start)",true); 
        error = clGetEventProfilingInfo(writeEvent,CL_PROFILING_COMMAND_END,
                                        sizeof(cl_ulong),&eventEnd,NULL);
        cl_errChk(error,"ERROR in Event Profiling (Write End)",true);

        printf("%f [%.2fMB]\t",(float)((eventEnd-eventStart)/1e9),(float)((sizeof(LatLong) * numRecords)/1e6));
        totalTime += eventEnd-eventStart;
        // Kernel
        error = clGetEventProfilingInfo(kernelEvent,CL_PROFILING_COMMAND_START,
                                        sizeof(cl_ulong),&eventStart,NULL);
        cl_errChk(error,"ERROR in Event Profiling (Kernel Start)",true); 
        error = clGetEventProfilingInfo(kernelEvent,CL_PROFILING_COMMAND_END,
                                        sizeof(cl_ulong),&eventEnd,NULL);
        cl_errChk(error,"ERROR in Event Profiling (Kernel End)",true);

        printf("%f\t",(float)((eventEnd-eventStart)/1e9));
        totalTime += eventEnd-eventStart;
        // Read Buffer
        error = clGetEventProfilingInfo(readEvent,CL_PROFILING_COMMAND_START,
                                        sizeof(cl_ulong),&eventStart,NULL);
        cl_errChk(error,"ERROR in Event Profiling (Read Start)",true); 
        error = clGetEventProfilingInfo(readEvent,CL_PROFILING_COMMAND_END,
                                        sizeof(cl_ulong),&eventEnd,NULL);
        cl_errChk(error,"ERROR in Event Profiling (Read End)",true);

        printf("%f [%.2fMB]\t",(float)((eventEnd-eventStart)/1e9),(float)((sizeof(float) * numRecords)/1e6));
        totalTime += eventEnd-eventStart;
        
        printf("%f\n\n",(float)(totalTime/1e9));
    }
    // 6. return finalized data and release buffers
    clReleaseMemObject(d_locations);
    clReleaseMemObject(d_distances);
	return distances;
}
Example #3
0
/*------------------------------------------------------
 ** ForwardSub() -- Forward substitution of Gaussian
 ** elimination.
 **------------------------------------------------------
 */
void ForwardSub(cl_context context, float *a, float *b, float *m, int size,int timing){    
	// 1. set up kernels
	cl_kernel fan1_kernel,fan2_kernel;
	cl_int status=0;
	cl_program gaussianElim_program;
	cl_event writeEvent,kernelEvent,readEvent;
	float writeTime=0,readTime=0,kernelTime=0;
	float writeMB=0,readMB=0;

	gaussianElim_program = cl_compileProgram(
			(char *)"gaussianElim_kernels.cl",NULL);

	fan1_kernel = clCreateKernel(
			gaussianElim_program, "Fan1", &status);
	status = cl_errChk(status, (char *)"Error Creating Fan1 kernel",true);
	if(status)exit(1);

	fan2_kernel = clCreateKernel(
			gaussianElim_program, "Fan2", &status);
	status = cl_errChk(status, (char *)"Error Creating Fan2 kernel",true);
	if(status)exit(1);

	// 2. set up memory on device and send ipts data to device

	cl_mem a_dev, b_dev, m_dev;

	cl_int error=0;

	a_dev = clCreateBuffer(context, CL_MEM_READ_WRITE,
			sizeof(float)*size*size, NULL, &error);

	b_dev = clCreateBuffer(context, CL_MEM_READ_WRITE,
			sizeof(float)*size, NULL, &error);

	m_dev = clCreateBuffer(context, CL_MEM_READ_WRITE,
			sizeof(float) * size * size, NULL, &error);

	command_queue = cl_getCommandQueue();

	error = clEnqueueWriteBuffer(command_queue,
			a_dev,
			1, // change to 0 for nonblocking write
			0, // offset
			sizeof(float)*size*size,
			a,
			0,
			NULL,
			&writeEvent);

	if (timing) writeTime+=eventTime(writeEvent,command_queue);
	clReleaseEvent(writeEvent);

	error = clEnqueueWriteBuffer(command_queue,
			b_dev,
			1, // change to 0 for nonblocking write
			0, // offset
			sizeof(float)*size,
			b,
			0,
			NULL,
			&writeEvent);
	if (timing) writeTime+=eventTime(writeEvent,command_queue);
	clReleaseEvent(writeEvent);

	error = clEnqueueWriteBuffer(command_queue,
			m_dev,
			1, // change to 0 for nonblocking write
			0, // offset
			sizeof(float)*size*size,
			m,
			0,
			NULL,
			&writeEvent);
	if (timing) writeTime+=eventTime(writeEvent,command_queue);
	clReleaseEvent(writeEvent);
	writeMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6);

	// 3. Determine block sizes
	size_t globalWorksizeFan1[1];
	size_t globalWorksizeFan2[2];
	size_t localWorksizeFan1Buf[1]={BLOCK_SIZE_0};
	size_t localWorksizeFan2Buf[2]={BLOCK_SIZE_1_X, BLOCK_SIZE_1_Y};
	size_t *localWorksizeFan1=NULL;
	size_t *localWorksizeFan2=NULL;

	globalWorksizeFan1[0] = size;
	globalWorksizeFan2[0] = size;
	globalWorksizeFan2[1] = size;

	if(localWorksizeFan1Buf[0]){
		localWorksizeFan1=localWorksizeFan1Buf;
		globalWorksizeFan1[0]=(int)ceil(globalWorksizeFan1[0]/(double)localWorksizeFan1Buf[0])*localWorksizeFan1Buf[0];
	}
	if(localWorksizeFan2Buf[0]){
		localWorksizeFan2=localWorksizeFan2Buf;
		globalWorksizeFan2[0]=(int)ceil(globalWorksizeFan2[0]/(double)localWorksizeFan2Buf[0])*localWorksizeFan2Buf[0];
		globalWorksizeFan2[1]=(int)ceil(globalWorksizeFan2[1]/(double)localWorksizeFan2Buf[1])*localWorksizeFan2Buf[1];
	}

	int t;
	// 4. Setup and Run kernels
	for (t=0; t<(size-1); t++) {
		// kernel args
		cl_int argchk;
		argchk  = clSetKernelArg(fan1_kernel, 0, sizeof(cl_mem), (void *)&m_dev);
		argchk |= clSetKernelArg(fan1_kernel, 1, sizeof(cl_mem), (void *)&a_dev);
		argchk |= clSetKernelArg(fan1_kernel, 2, sizeof(cl_mem), (void *)&b_dev);
		argchk |= clSetKernelArg(fan1_kernel, 3, sizeof(int), (void *)&size);
		argchk |= clSetKernelArg(fan1_kernel, 4, sizeof(int), (void *)&t);

		cl_errChk(argchk,"ERROR in Setting Fan1 kernel args",true);

		//printf("localWorksizeFan1:%u, globalWorksizeFan1:%u\n", localWorksizeFan1Buf[0], globalWorksizeFan1[0]);	
#pragma dividend local_work_group_size localWorksizeFan1 dim 1 dim1(2:64:2:64)
	//This lws will be used to profile the OpenCL kernel with id 1
			size_t _dividend_lws_localWorksizeFan1_k1[2];
		{
		_dividend_lws_localWorksizeFan1_k1[0] = getLWSValue("DIVIDEND_LWS1_D0",DIVIDEND_LWS1_D0_DEFAULT_VAL);
		//Dividend extension: store the kernel id as the last element
		_dividend_lws_localWorksizeFan1_k1[1] = 1;
		}
				// launch kernel
		error = DIVIDEND_CL_WRAP(clEnqueueNDRangeKernel)(
				command_queue,  fan1_kernel, 1, 0,
				globalWorksizeFan1, _dividend_lws_localWorksizeFan1_k1,
				0, NULL, NULL);

		cl_errChk(error,"ERROR in Executing Fan1 Kernel",true);

		//fprintf(stderr, "AFTER THIS\n");

		argchk  = clSetKernelArg(fan2_kernel, 0, sizeof(cl_mem), (void *)&m_dev);
		argchk |= clSetKernelArg(fan2_kernel, 1, sizeof(cl_mem), (void *)&a_dev);
		argchk |= clSetKernelArg(fan2_kernel, 2, sizeof(cl_mem), (void *)&b_dev);
		argchk |= clSetKernelArg(fan2_kernel, 3, sizeof(int), (void *)&size);
		argchk |= clSetKernelArg(fan2_kernel, 4, sizeof(int), (void *)&t);

		cl_errChk(argchk,"ERROR in Setting Fan2 kernel args",true);

		size_t local_work_size[] = {128, 128};

		//printf("localWorksizeFan2:%u, globalWorksizeFan2[0]:%u, globalWorksizeFan2[1]:%u\n", localWorksizeFan2Buf[0], globalWorksizeFan2[0], globalWorksizeFan2[1]);	
#pragma dividend local_work_group_size local_work_size dim 2 dim1(8:64:2:64) dim2(8:64:2:64)
	//This lws will be used to profile the OpenCL kernel with id 2
			size_t _dividend_lws_local_work_size_k2[3];
		{
		_dividend_lws_local_work_size_k2[0] = getLWSValue("DIVIDEND_LWS2_D0",DIVIDEND_LWS2_D0_DEFAULT_VAL);
		_dividend_lws_local_work_size_k2[1] = getLWSValue("DIVIDEND_LWS2_D1",DIVIDEND_LWS2_D1_DEFAULT_VAL);
		//Dividend extension: store the kernel id as the last element
		_dividend_lws_local_work_size_k2[2] = 2;
		}
				// launch kernel
		error = DIVIDEND_CL_WRAP(clEnqueueNDRangeKernel)(
				command_queue,  fan2_kernel, 2, 0,
				globalWorksizeFan2, _dividend_lws_local_work_size_k2,
				0, NULL, NULL);

		cl_errChk(error,"ERROR in Executing Fan2 Kernel",true);
	
		if (timing) {
			//             printf("here2a\n");
			//             kernelTime+=eventTime(kernelEvent,command_queue);
			//             printf("here2b\n");
		}
		clReleaseEvent(kernelEvent);
		//Fan2<<<dimGridXY,dimBlockXY>>>(m_cuda,a_cuda,b_cuda,Size,Size-t,t);
		//cudaThreadSynchronize();
	}
	// 5. transfer data off of device
	error = clEnqueueReadBuffer(command_queue,
			a_dev,
			1, // change to 0 for nonblocking write
			0, // offset
			sizeof(float) * size * size,
			a,
			0,
			NULL,
			&readEvent);

	cl_errChk(error,"ERROR with clEnqueueReadBuffer",true);
	if (timing) readTime+=eventTime(readEvent,command_queue);
	clReleaseEvent(readEvent);

	error = clEnqueueReadBuffer(command_queue,
			b_dev,
			1, // change to 0 for nonblocking write
			0, // offset
			sizeof(float) * size,
			b,
			0,
			NULL,
			&readEvent);
	cl_errChk(error,"ERROR with clEnqueueReadBuffer",true);
	if (timing) readTime+=eventTime(readEvent,command_queue);
	clReleaseEvent(readEvent);

	error = clEnqueueReadBuffer(command_queue,
			m_dev,
			1, // change to 0 for nonblocking write
			0, // offset
			sizeof(float) * size * size,
			m,
			0,
			NULL,
			&readEvent);

	cl_errChk(error,"ERROR with clEnqueueReadBuffer",true);
	if (timing) readTime+=eventTime(readEvent,command_queue);
	clReleaseEvent(readEvent);
	readMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6);

	if (timing) {
		printf("Matrix Size\tWrite(s) [size]\t\tKernel(s)\tRead(s)  [size]\t\tTotal(s)\n");
		printf("%dx%d      \t",size,size);

		printf("%f [%.2fMB]\t",writeTime,writeMB);


		printf("%f\t",kernelTime);


		printf("%f [%.2fMB]\t",readTime,readMB);

		printf("%f\n\n",writeTime+kernelTime+readTime);
	}

}