void run_opencl_backward(HMM *word)
{

	puts("\n=>GPU");

	int i;
	int N = word->nstates;
	int T = word->len;
	float *B = word->b;
	float *A = word->a;

	// gpu timer
	cl_ulong gstart, gend;
	double gpuTime;

	// cpu timer
	//struct timeval cstart;
	//struct timeval cend;
	double cpuTime;

	float *betaB;
	betaB= (float*)malloc(sizeof(float)*N);
	init_1d_f(betaB,N,0.f);

	float *beta; // NxT
	beta = (float*)malloc(sizeof(float)*N*T);
	init_2d_f(beta,N,T,0.f);
	for(i = 0 ; i < N ; ++i){
		beta[i*T + T-1] = 1.f;
	}



	//------------------------------------------------
	//  OpenCL 
	//------------------------------------------------

	int chunks;
	chunks = (N+63)/64;

	cl_int err;

	cl_platform_id platform;          // OpenCL platform
	cl_device_id device_id;           // device ID
	cl_context context;               // context
	cl_command_queue queue;           // command queue
	cl_program program;               // program

	cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*3);
	
	cl_event *event = (cl_event*)malloc(sizeof(cl_event)*2);    

	// read kernel file
	char *fileName = "backward_kernel.cl";
	char *kernelSource;
	size_t size;
	FILE *fh = fopen(fileName, "rb");
	if(!fh) {
		printf("Error: Failed to open kernel file!\n");
		exit(1);
	}
	fseek(fh,0,SEEK_END);
	size=ftell(fh);
	fseek(fh,0,SEEK_SET);
	kernelSource = malloc(size+1);
	size_t result;
	result = fread(kernelSource,1,size,fh);
	if(result != size){ fputs("Reading error", stderr);exit(1);}
	kernelSource[size] = '\0';

	// Bind to platform
	err = clGetPlatformIDs(1, &platform, NULL);
	OCL_CHECK(err);

	// Get ID for the device
	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
	OCL_CHECK(err);

	// Create a context  
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	OCL_CHECK(err);

	// Create a command queue 
	queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
	OCL_CHECK(err);

	// Create the compute program from the source buffer
	program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err);
	OCL_CHECK(err);

	// turn on optimization for kernel
	char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-unsafe-math-optimizations -cl-finite-math-only";

	err = clBuildProgram(program, 1, &device_id, options, NULL, NULL);
	if(err != CL_SUCCESS)
		printCompilerOutput(program, device_id);
	OCL_CHECK(err);

	kernel[0] = clCreateKernel(program, "genbetaB", &err);
	OCL_CHECK(err);
	kernel[1] = clCreateKernel(program, "beta_dev", &err);
	OCL_CHECK(err);
	kernel[2] = clCreateKernel(program, "scale_beta", &err);
	OCL_CHECK(err);

	// allocate memory on device 
	cl_mem A_d      	= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N*N,  NULL, NULL);
	cl_mem B_d      	= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N*T,  NULL, NULL);
	cl_mem beta_d		= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N*T,  NULL, NULL);
	cl_mem betaB_d		= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N,    NULL, NULL);
	cl_mem betasum_int_d= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*chunks,  NULL, NULL);
	cl_mem betasum_d    = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float),  	  NULL, NULL);

	// warm up() device
	float *dummy = (float*)malloc(sizeof(float));
	cl_mem dummy_d= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float),  NULL, NULL);
	for(i=0;i<50;++i){
		err = clEnqueueWriteBuffer(queue, dummy_d, 		CL_TRUE, 	0, sizeof(float), 	dummy, 		0, NULL, NULL);
	}


	// Initialize device memory
	err = clEnqueueWriteBuffer(queue, A_d, 		CL_TRUE, 	0, sizeof(float)*N*N, 	A, 		0, NULL, &event[0]);
	OCL_CHECK(err);
	err = clEnqueueWriteBuffer(queue, B_d, 		CL_TRUE, 	0, sizeof(float)*N*T, 	B, 		0, NULL, NULL); 
	OCL_CHECK(err);
	err = clEnqueueWriteBuffer(queue, beta_d, 	CL_TRUE, 	0, sizeof(float)*N*T, 	beta, 	0, NULL, NULL); 
	OCL_CHECK(err);

	//  1st kernel: beta * B 
	size_t local_1 = 64;
	size_t global_1 = chunks*64;

	err  = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &beta_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err |= clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &B_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err |= clSetKernelArg(kernel[0], 2, sizeof(cl_mem), &betaB_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err |= clSetKernelArg(kernel[0], 3, sizeof(int), &N);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err |= clSetKernelArg(kernel[0], 4, sizeof(int), &T);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}


	// 2nd kernel: A * betaB
	size_t local_2 = 64;
	size_t global_2 = chunks*64;

	err  = clSetKernelArg(kernel[1], 0, sizeof(cl_mem), &A_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err  = clSetKernelArg(kernel[1], 1, sizeof(cl_mem), &betaB_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err  = clSetKernelArg(kernel[1], 2, sizeof(cl_mem), &beta_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err  = clSetKernelArg(kernel[1], 3, sizeof(cl_mem), &betasum_int_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err |= clSetKernelArg(kernel[1], 4, sizeof(int), &N);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err |= clSetKernelArg(kernel[1], 5, sizeof(int), &T);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	//err |= clSetKernelArg(kernel[1], 6, sizeof(int), &frame);
	//if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err |= clSetKernelArg(kernel[1], 7, sizeof(float)*64, NULL);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}



	// 3nd kernel: beta/sum 
	size_t local_3 = 64;
	size_t global_3 = chunks*64;

	err  = clSetKernelArg(kernel[2], 0, sizeof(cl_mem), &beta_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err  = clSetKernelArg(kernel[2], 1, sizeof(cl_mem), &betasum_int_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err |= clSetKernelArg(kernel[2], 2, sizeof(int), &N);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err |= clSetKernelArg(kernel[2], 3, sizeof(int), &T);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err |= clSetKernelArg(kernel[2], 5, sizeof(float), &chunks);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}


	// time capsule
	int frame;

	for(frame = (T-2) ; frame >= 0; frame--)
	{
		// 1st kernel : beta * B
		err |= clSetKernelArg(kernel[0], 5, sizeof(int), &frame);
		if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

		err = clEnqueueNDRangeKernel(queue, kernel[0], 1, NULL, &global_1, &local_1, 0, NULL, NULL);
		OCL_CHECK(err);

		if(frame ==  (T-2) && 0)
		{
			clFinish(queue);
			clEnqueueReadBuffer(queue, betaB_d, CL_TRUE, 0, sizeof(float)*N, betaB, 0, NULL , NULL);
			check_1d_f(betaB, N);	
			exit(1);	
		}


		// 2nd kernel; betaB * A
		err |= clSetKernelArg(kernel[1], 6, sizeof(int), &frame);
		if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

		err = clEnqueueNDRangeKernel(queue, kernel[1], 1, NULL, &global_2, &local_2, 0, NULL, NULL);
		OCL_CHECK(err);

		if(frame ==  (T-2) && 0)
		{
			clFinish(queue);
			clEnqueueReadBuffer(queue, beta_d, CL_TRUE, 0, sizeof(float)*N*T, beta, 0, NULL , NULL);
			check_2d_f(beta, N, T);	
			exit(1);	
		}



		// 3rd kernle; scale beta
		err |= clSetKernelArg(kernel[2], 4, sizeof(int), &frame);
		if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

		err = clEnqueueNDRangeKernel(queue, kernel[2], 1, NULL, &global_3, &local_3, 0, NULL, NULL);
		OCL_CHECK(err);

		if(frame ==  (T-2) && 0)
		{
			clFinish(queue);
			clEnqueueReadBuffer(queue, beta_d, CL_TRUE, 0, sizeof(float)*N*T, beta, 0, NULL , NULL);
			check_2d_f(beta, N, T);	
			exit(1);	
		}



	}

	clFinish(queue);

	clEnqueueReadBuffer(queue, beta_d, CL_TRUE, 0, sizeof(float)*N*T, beta, 0, NULL , &event[1]);

	err = clWaitForEvents(1,&event[1]);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (event[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL);
	OCL_CHECK(err);

	gpuTime = (double)(gend -gstart)/1000000000.0;

	cpuTime = 0.0; 

	printf("oclTime = %lf (s)\n", gpuTime + cpuTime);

	// check
	//check_2d_f(beta,N,T);



	clReleaseMemObject(A_d);
	clReleaseMemObject(B_d);
	clReleaseMemObject(beta_d);
	clReleaseMemObject(betaB_d);
	clReleaseMemObject(dummy_d);

	clReleaseMemObject(betasum_d);
	clReleaseMemObject(betasum_int_d);



	clReleaseProgram(program);
	clReleaseContext(context);
	clReleaseCommandQueue(queue);
	for(i=0;i<3;++i){
		clReleaseKernel(kernel[i]);
	}
	for(i=0;i<2;++i){
		clReleaseEvent(event[i]);
	}


	free(beta);
	free(betaB);
	free(kernelSource);

	free(dummy);

	return;
}
Example #2
0
void run1(int N, char *fileName)
{
	puts("Matrix Vector Multiplication Naive\n");

	int i,j;

	float *A;
	A = (float*)malloc(sizeof(float)*N*N);

	for( i = 0; i < N ; ++i )
	{
		for( j = 0; j < N ; ++j )
		{
			A[i*N + j] = 1.f;	
		}
	}

	float *B;
	B = (float*)malloc(sizeof(float)*N);
	for( i = 0; i < N ; ++i )
	{
		B[i] = 1.f;	
	}
	
	float *C;
	C = (float*)malloc(sizeof(float)*N);


#ifdef DEBUG
	puts("A");
	check_2d_f(A,N,N);

	puts("B");
	check_1d_f(B,N);
#endif

	int NumK = 1;
	int NumE = 1;

	double gpuTime;
	cl_ulong gstart, gend;

	//------------------------------------------------
	//  OpenCL 
	//------------------------------------------------
	cl_int err;

	cl_platform_id platform;          // OpenCL platform
	cl_device_id device_id;           // device ID
	cl_context context;               // context
	cl_command_queue queue;           // command queue
	cl_program program;               // program

	cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*NumK);

	cl_event *event = (cl_event*)malloc(sizeof(cl_event)*NumE);    

	// read kernel file
	//char *fileName = "transpose_kernel.cl";
	char *kernelSource;
	size_t size;
	FILE *fh = fopen(fileName, "rb");
	if(!fh) {
		printf("Error: Failed to open kernel file!\n");
		exit(1);
	}
	fseek(fh,0,SEEK_END);
	size=ftell(fh);
	fseek(fh,0,SEEK_SET);
	kernelSource = malloc(size+1);
	size_t result;
	result = fread(kernelSource,1,size,fh);
	if(result != size){ fputs("Reading error", stderr);exit(1);}
	kernelSource[size] = '\0';
	
	// Bind to platform
	err = clGetPlatformIDs(1, &platform, NULL);
	OCL_CHECK(err);

	// Get ID for the device
	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
	OCL_CHECK(err);

	// Create a context  
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	OCL_CHECK(err);

	// Create a command queue 
	queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
	OCL_CHECK(err);

	// Create the compute program from the source buffer
	program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err);
	OCL_CHECK(err);

	// turn on optimization for kernel
	char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-unsafe-math-optimizations -cl-finite-math-only";

	err = clBuildProgram(program, 1, &device_id, options, NULL, NULL);
	if(err != CL_SUCCESS)
		printCompilerOutput(program, device_id);
	OCL_CHECK(err);

#ifdef SAVEBIN
	// Calculate size of binaries 
	size_t binary_size;
	err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL);
	OCL_CHECK(err);

	unsigned char* bin;
	bin = (unsigned char*)malloc(sizeof(unsigned char)*binary_size);

	err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &bin, NULL);
	OCL_CHECK(err);

	// Print the binary out to the output file
	fh = fopen("kernel_mv_1.bin", "wb");
	fwrite(bin, 1, binary_size, fh);
	fclose(fh);

#endif

	kernel[0] = clCreateKernel(program, "mv_1", &err);
	OCL_CHECK(err);


	// memory on device
	cl_mem A_d    = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N*N,  NULL, NULL);
	cl_mem B_d    = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N,  NULL, NULL);
	cl_mem C_d    = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N,  NULL, NULL);

	// Initialize device memory
	err = clEnqueueWriteBuffer(queue, A_d, 	CL_TRUE, 0, sizeof(float)*N*N, 	A, 0, NULL , NULL); 
	OCL_CHECK(err);
	err = clEnqueueWriteBuffer(queue, B_d, 	CL_TRUE, 0, sizeof(float)*N, 	B, 0, NULL , NULL); 
	OCL_CHECK(err);

	size_t localsize =  64;
	size_t globalsize = N;


	err  = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &A_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err  = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &B_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err  = clSetKernelArg(kernel[0], 2, sizeof(cl_mem), &C_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err  = clSetKernelArg(kernel[0], 3, sizeof(int), &N);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}


	err = clEnqueueNDRangeKernel(queue, kernel[0], 1, NULL, &globalsize, &localsize, 0, NULL, &event[0]);
	OCL_CHECK(err);

	clFinish(queue);

	clEnqueueReadBuffer(queue, C_d, CL_TRUE, 0, sizeof(float)*N, C , 0, NULL , NULL );

	err = clWaitForEvents(1,&event[0]);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL);
	OCL_CHECK(err);

	gpuTime = (double)(gend -gstart)/1000000000.0;



	//check_1d_f(sum, blks+1);

#ifdef DEBUG
	puts("C = A * B");
	check_1d_f(C,N);

#endif

	printf("oclTime = %lf (s)\n", gpuTime );

	// free
	clReleaseMemObject(A_d);	
	clReleaseMemObject(B_d);	
	clReleaseMemObject(C_d);	



	clReleaseProgram(program);
	clReleaseContext(context);
	clReleaseCommandQueue(queue);
	for(i=0;i<NumK;++i){
		clReleaseKernel(kernel[i]);
	}
	for(i=0;i<NumE;++i){
		clReleaseEvent(event[i]);
	}
	free(kernelSource);


#ifdef SAVEBIN
	free(bin);
#endif

	free(A);
	free(B);
	free(C);

	return;
}