Exemple #1
0
TEST_P(ocl_engine_test, BasicInteropC) {
    auto p = GetParam();
    cl_device_id ocl_dev = (p.adev_kind == dev_kind::gpu) ?
            gpu_ocl_dev :
            (p.adev_kind == dev_kind::cpu) ? cpu_ocl_dev : nullptr;

    cl_context ocl_ctx = (p.actx_kind == ctx_kind::gpu) ?
            gpu_ocl_ctx :
            (p.actx_kind == ctx_kind::cpu) ? cpu_ocl_ctx : nullptr;

    SKIP_IF(p.adev_kind != dev_kind::null && !ocl_dev,
            "Required OpenCL device not found.");
    SKIP_IF(p.actx_kind != ctx_kind::null && !ocl_ctx,
            "Required OpenCL context not found.");
    SKIP_IF(cpu_ocl_dev == gpu_ocl_dev
                    && (p.adev_kind == dev_kind::cpu
                               || p.actx_kind == ctx_kind::cpu),
            "OpenCL CPU-only device not found.");

    mkldnn_engine_t eng;
    mkldnn_status_t s
            = mkldnn_engine_create_ocl(&eng, mkldnn_gpu, ocl_dev, ocl_ctx);

    EXPECT_EQ(s, p.expected_status);

    if (s == mkldnn_success) {

        cl_device_id dev;
        cl_context ctx;

        MKLDNN_CHECK(mkldnn_engine_get_ocl_device(eng, &dev));
        MKLDNN_CHECK(mkldnn_engine_get_ocl_context(eng, &ctx));

        EXPECT_EQ(dev, ocl_dev);
        EXPECT_EQ(ctx, ocl_ctx);

        cl_uint ref_count;
        OCL_CHECK(clGetContextInfo(ocl_ctx, CL_CONTEXT_REFERENCE_COUNT,
                sizeof(ref_count), &ref_count, nullptr));
        int i_ref_count = int(ref_count);
        EXPECT_EQ(i_ref_count, 2);

        MKLDNN_CHECK(mkldnn_engine_destroy(eng));

        OCL_CHECK(clGetContextInfo(ocl_ctx, CL_CONTEXT_REFERENCE_COUNT,
                sizeof(ref_count), &ref_count, nullptr));
        i_ref_count = int(ref_count);
        EXPECT_EQ(i_ref_count, 1);
    }
}
Exemple #2
0
TEST_P(ocl_engine_test, BasicInteropCpp) {
    auto p = GetParam();
    cl_device_id ocl_dev = (p.adev_kind == dev_kind::gpu) ?
            gpu_ocl_dev :
            (p.adev_kind == dev_kind::cpu) ? cpu_ocl_dev : nullptr;

    cl_context ocl_ctx = (p.actx_kind == ctx_kind::gpu) ?
            gpu_ocl_ctx :
            (p.actx_kind == ctx_kind::cpu) ? cpu_ocl_ctx : nullptr;

    SKIP_IF(p.adev_kind != dev_kind::null && !ocl_dev,
            "Required OpenCL device not found.");
    SKIP_IF(p.actx_kind != ctx_kind::null && !ocl_ctx,
            "Required OpenCL context not found.");
    SKIP_IF(cpu_ocl_dev == gpu_ocl_dev
                    && (p.adev_kind == dev_kind::cpu
                               || p.actx_kind == ctx_kind::cpu),
            "OpenCL CPU-only device not found.");

    catch_expected_failures(
            [&]() {
                {
                    engine eng(engine::kind::gpu, ocl_dev, ocl_ctx);
                    if (p.expected_status != mkldnn_success) {
                        FAIL() << "Success not expected";
                    }

                    cl_device_id dev = eng.get_ocl_device();
                    cl_context ctx = eng.get_ocl_context();
                    EXPECT_EQ(dev, ocl_dev);
                    EXPECT_EQ(ctx, ocl_ctx);

                    cl_uint ref_count;
                    OCL_CHECK(clGetContextInfo(ocl_ctx,
                            CL_CONTEXT_REFERENCE_COUNT, sizeof(ref_count),
                            &ref_count, nullptr));
                    int i_ref_count = int(ref_count);
                    EXPECT_EQ(i_ref_count, 2);
                }

                cl_uint ref_count;
                OCL_CHECK(clGetContextInfo(ocl_ctx, CL_CONTEXT_REFERENCE_COUNT,
                        sizeof(ref_count), &ref_count, nullptr));
                int i_ref_count = int(ref_count);
                EXPECT_EQ(i_ref_count, 1);
            },
            p.expected_status != mkldnn_success, p.expected_status);
}
Exemple #3
0
status_t ocl_engine_t::init() {
    CHECK(cl_engine_t::init());

    cl_int err = CL_SUCCESS;
    if (is_user_context_) {
        err = clRetainContext(context_);
        if (err != CL_SUCCESS)
            context_ = nullptr;
    } else {
        context_
                = clCreateContext(nullptr, 1, &device_, nullptr, nullptr, &err);
    }

    OCL_CHECK(err);

    status_t status
            = ocl_utils::check_device(engine_kind::gpu, device_, context_);
    if (status != status::success)
        return status;

    stream_t *service_stream_ptr;
    status = create_stream(&service_stream_ptr, stream_flags::default_flags);
    if (status != status::success)
        return status;
    service_stream_.reset(service_stream_ptr);
    return status::success;
}
Exemple #4
0
void ConvolutionLayerSpatial<Dtype>::swizzleWeights(
    const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top,
    int_tp swizzled_factor) {

  viennacl::ocl::context &ctx = viennacl::ocl::get_context(
      this->device_->id());
  viennacl::ocl::program &program = this->device_->program();
  viennacl::ocl::kernel &oclk_copy_weight = program.get_kernel(
      CL_KERNEL_SELECT("copyWeightsSwizzled"));
  cl_uint argIdx = 0;

  int_tp channels = this->channels_ / this->group_;
  oclk_copy_weight.arg(argIdx++, WrapHandle((cl_mem) weight, &ctx));
  oclk_copy_weight.arg(argIdx++, WrapHandle((cl_mem) swizzled_weights, &ctx));
  oclk_copy_weight.arg(argIdx++, kernel_w_);
  oclk_copy_weight.arg(argIdx++, kernel_h_);
  oclk_copy_weight.arg(argIdx++, channels);
  oclk_copy_weight.arg(argIdx++, this->num_output_);
  oclk_copy_weight.arg(argIdx++, swizzled_factor);
  const size_t global_work_size_Copy[3] = { (size_t) (this->num_output_
      * channels * kernel_w_ * kernel_h_), 1, 1 };

  OCL_CHECK(clEnqueueNDRangeKernel(ctx.get_queue().handle().get(),
                                       oclk_copy_weight.handle().get(), 3, NULL,
                                       global_work_size_Copy, NULL, 0, NULL,
                                       NULL));
}
Exemple #5
0
    virtual void SetUp() {
        gpu_ocl_dev = find_ocl_device(CL_DEVICE_TYPE_GPU);
        cpu_ocl_dev = find_ocl_device(CL_DEVICE_TYPE_CPU);

        cl_int err;
        if (gpu_ocl_dev) {
            gpu_ocl_ctx = clCreateContext(
                    nullptr, 1, &gpu_ocl_dev, nullptr, nullptr, &err);
            OCL_CHECK(err);
        }

        if (cpu_ocl_dev) {
            cpu_ocl_ctx = clCreateContext(
                    nullptr, 1, &cpu_ocl_dev, nullptr, nullptr, &err);
            OCL_CHECK(err);
        }
    }
	void PatternMatcher::findMatchesInText(cl_mem textBuffer, cl_mem patternBuffer, cl_mem matchBuffer, cl_uint textSize, cl_uint patternSize, cl_uint maxMismatch) {
		OCL_STATUS_INITIALIZE;
		cl_kernel kernel = OCL_CHECK(clCreateKernel(program, "find_matches", &OCL_STATUS));
		OCL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &textBuffer));
		OCL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &patternBuffer));
		OCL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &matchBuffer));
		OCL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_uint), &patternSize));
		OCL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uint), &maxMismatch));
		size_t globalWorkSize[1] = { textSize - patternSize + 1 };
		OCL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, nullptr));
		OCL_CHECK(clReleaseKernel(kernel));
	}
	void PatternMatcher::computePrefixSum(cl_mem & inputBuffer, cl_uint bufferElementCount) {

		OCL_STATUS_INITIALIZE;

		cl_mem localBuffer = OCL_CHECK(clCreateBuffer(context, CL_MEM_READ_WRITE, bufferElementCount * sizeof(cl_uint), nullptr, &OCL_STATUS));

		cl_kernel kernel = OCL_CHECK(clCreateKernel(program, "prefix_sum_step", &OCL_STATUS));
		size_t globalWorkSize[1] = { bufferElementCount };
		OCL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_uint), &bufferElementCount));

		for (cl_uint offset = 1; offset < bufferElementCount; offset *= 2) {
			OCL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_uint), &offset));
			OCL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputBuffer));
			OCL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &localBuffer));
			OCL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, nullptr));
			std::swap(inputBuffer, localBuffer);
		}

		OCL_CHECK(clReleaseMemObject(localBuffer));
		OCL_CHECK(clReleaseKernel(kernel));
	}
	void PatternMatcher::packIndicesOfValueSteps(cl_mem packedIndicesBuffer, cl_mem valuesBuffer, cl_uint valuesCount) {
		OCL_STATUS_INITIALIZE;
		cl_kernel kernel = OCL_CHECK(clCreateKernel(program, "pack_indices_of_value_steps", &OCL_STATUS));
		OCL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &packedIndicesBuffer));
		OCL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &valuesBuffer));
		OCL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_uint), &valuesCount));
		size_t globalWorkSize[1] = { valuesCount };
		OCL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, nullptr));
		OCL_CHECK(clReleaseKernel(kernel));
	}
	PatternMatcher::~PatternMatcher() {
		OCL_STATUS_INITIALIZE;
		OCL_CHECK(clReleaseProgram(program));
		OCL_CHECK(clReleaseCommandQueue(queue));
		OCL_CHECK(clReleaseContext(context));
	}
	std::vector<cl_uint> PatternMatcher::findPattern(char const * pattern, char const * text, cl_uint maxMismatch) {

		OCL_STATUS_INITIALIZE;

		size_t textSize = strlen(text);
		size_t patternSize = strlen(pattern);
		size_t possibleMatchSites = textSize - patternSize + 1;

		// create a buffer for storing text on device and copy text to device
		cl_mem textBuffer = OCL_CHECK(clCreateBuffer(context, CL_MEM_READ_ONLY, textSize, nullptr, &OCL_STATUS));
		OCL_CHECK(clEnqueueWriteBuffer(queue, textBuffer, CL_FALSE, 0, textSize, text, 0, nullptr, nullptr));

		// create a buffer for storing pattern on device and copy pattern to device
		cl_mem patternBuffer = OCL_CHECK(clCreateBuffer(context, CL_MEM_READ_ONLY, patternSize, nullptr, &OCL_STATUS));
		OCL_CHECK(clEnqueueWriteBuffer(queue, patternBuffer, CL_FALSE, 0, patternSize, pattern, 0, nullptr, nullptr));

		// create a buffer for storing flags indicating starting positions of found matches
		size_t matchBufferSize = possibleMatchSites * sizeof(cl_uint);
		cl_mem matchBuffer = OCL_CHECK(clCreateBuffer(context, CL_MEM_READ_WRITE, matchBufferSize, nullptr, &OCL_STATUS));

		// launch kernel to find matches in text buffer 
		if (maxMismatch == 0) {
			this->findExactMatchesInText(textBuffer, patternBuffer, matchBuffer, textSize, patternSize);
		} else {
			this->findMatchesInText(textBuffer, patternBuffer, matchBuffer, textSize, patternSize, maxMismatch);
		}

		// release the text and pattern buffers
		OCL_CHECK(clReleaseMemObject(textBuffer));
		OCL_CHECK(clReleaseMemObject(patternBuffer));

		// compute prefix sum of match-starts buffer 
		this->computePrefixSum(matchBuffer, textSize);

		// read total of matches found from last element of match-starts buffer
		cl_uint matchCount = 0;
		cl_uint offsetOfFinalMatchBufferElement = (possibleMatchSites - 1) * sizeof(cl_uint);
		OCL_CHECK(clEnqueueReadBuffer(queue, matchBuffer, CL_TRUE, offsetOfFinalMatchBufferElement, sizeof(cl_uint), &matchCount, 0, nullptr, nullptr));

		// create a buffer to store packed locations of match starts
		size_t locationsBufferSize = matchCount * sizeof(cl_uint);
		cl_mem locationsBuffer = OCL_CHECK(clCreateBuffer(context, CL_MEM_WRITE_ONLY, locationsBufferSize, nullptr, &OCL_STATUS));

		// pack the found location coordinates into the locations buffer
		this->packIndicesOfValueSteps(locationsBuffer, matchBuffer, possibleMatchSites);

		// release the match buffer
		OCL_CHECK(clReleaseMemObject(matchBuffer));

		// read the match-start positions from the device into a vector to be returned to caller
		vector<cl_uint> matchLocations(matchCount);
		OCL_CHECK(clEnqueueReadBuffer(queue, locationsBuffer, CL_TRUE, 0, locationsBufferSize, &matchLocations[0], 0, nullptr, nullptr));

		// release OpenCL resources
		OCL_CHECK(clReleaseMemObject(locationsBuffer));

		// return vector match starts
		return matchLocations;
	}
void run_opencl_fo(HMM *word)
{
	puts("\n=>GPU");

	int N = word->nstates;
	int T = word->len;
	float *B = word->b; // T x N
	float *A = word->a; // N x N
	float *prior = word->pri;

	cl_ulong gstart, gend;
	double gpuTime;

	int i;

	float *At; // NxN
	At = (float*)malloc(sizeof(float)*N*N);

	// initialize for checking
	float *alpha;
	alpha = (float*)malloc(sizeof(float)*T*N); // T x B
	//init_2d_f(alpha,T,N,0.f);


	int blks = (N+255)/256;

	//float *alphasum; // T x 1
	//alphasum = (float*)malloc(sizeof(float)*T);
	//init_1d_f(alphasum,T,0.f);


	float *lld; 
	lld = (float*)malloc(sizeof(float));
	lld[0] = 0.f;

	float *at_alpha; 
	at_alpha = (float*)malloc(sizeof(float)*N);

	uint startPos_pre;
	uint startPos;

	int numK = 4;
	int numE = 2;

	cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*numK);
	cl_event *events = (cl_event*)malloc(sizeof(cl_event)*numE);    


	//------------------------------------------------
	//  OpenCL 
	//------------------------------------------------
	cl_platform_id platform;          // OpenCL platform
	cl_device_id device_id;           // device ID
	cl_context context;               // context
	cl_command_queue queue;           // command queue
	cl_program program;               // program


/*
	cl_event **eventwait = (cl_event**)malloc(sizeof(cl_event*)*2);
	for(i=0;i<2;++i){
		eventwait[i] = (cl_event*)malloc(sizeof(cl_event)*2);
	}
	*/



	cl_int err;

	// read kernel file
	char *fileName = "ocl_fo_kernel.cl";
	char *kernelSource;
	size_t size;
	FILE *fh = fopen(fileName, "rb");
	if(!fh) {
		printf("Error: Failed to open kernel file!\n");
		exit(1);
	}
	fseek(fh,0,SEEK_END);
	size=ftell(fh);
	fseek(fh,0,SEEK_SET);
	kernelSource = malloc(size+1);
	size_t result;
	result = fread(kernelSource,1,size,fh);
	if(result != size){ fputs("Reading error", stderr);exit(1);}
	kernelSource[size] = '\0';

	// Bind to platform
	err = clGetPlatformIDs(1, &platform, NULL);
	OCL_CHECK(err);

	// Get ID for the device
	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
	OCL_CHECK(err);

	// Create a context  
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	OCL_CHECK(err);

	// Create a command queue 
	queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
	OCL_CHECK(err);

	// Create the compute program from the source buffer
	program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err);
	OCL_CHECK(err);

	// turn on optimization for kernel
	char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-unsafe-math-optimizations -cl-finite-math-only";

	err = clBuildProgram(program, 1, &device_id, options, NULL, NULL);
	if(err != CL_SUCCESS)
		printCompilerOutput(program, device_id);
	OCL_CHECK(err);


	// 
	kernel[0] = clCreateKernel(program, "transpose", &err);
	OCL_CHECK(err);

	kernel[1] = clCreateKernel(program, "init_alpha", &err);
	OCL_CHECK(err);

	kernel[2] = clCreateKernel(program, "mat_vec", &err);
	OCL_CHECK(err);

	kernel[3] = clCreateKernel(program, "alpha_dev", &err);
	OCL_CHECK(err);

	// allocate memory on device 
	cl_mem A_d      	= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N*N,  NULL, NULL);
	cl_mem At_d     	= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N*N,  NULL, NULL);
	cl_mem lld_d		= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float),  NULL, NULL);
	cl_mem B_d      	= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*T*N,  NULL, NULL);
	cl_mem prior_d  	= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N,    NULL, NULL);
	cl_mem alpha_d  	= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*T*N,  NULL, NULL);
	cl_mem at_alpha_d  	= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N,    NULL, NULL);

//	cl_mem alphasum_d   = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*T,  NULL, NULL);
//
//	cl_mem alphasum_tmp_d   = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*blks,  NULL, NULL);
//

	// warm up() device
	float *dummy = (float*)malloc(sizeof(float));
	cl_mem dummy_d= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float),  NULL, NULL);
	for(i=0;i<50;++i){
		err = clEnqueueWriteBuffer(queue, dummy_d, 		CL_TRUE, 	0, sizeof(float), 	dummy, 		0, NULL, NULL);
	}





	// copy from host to device
	err = clEnqueueWriteBuffer(queue, A_d, 		CL_TRUE, 	0, sizeof(float)*N*N, 	A, 		0, NULL, &events[0]);
	OCL_CHECK(err);

	err = clEnqueueWriteBuffer(queue, prior_d, 	CL_TRUE, 	0, sizeof(float)*N, 	prior, 	0, NULL, NULL);      
	OCL_CHECK(err);

	err = clEnqueueWriteBuffer(queue, B_d, 		CL_TRUE, 	0, sizeof(float)*T*N, 	B, 		0, NULL, NULL); 
	OCL_CHECK(err);
//
//	err = clEnqueueWriteBuffer(queue, alpha_d, 	CL_TRUE, 	0, sizeof(float)*T*N, 	alpha, 	0, NULL, NULL); 
//	OCL_CHECK(err);
//
//	err = clEnqueueWriteBuffer(queue, alphasum_d, CL_TRUE, 	0, sizeof(float)*T, 	alphasum, 0, NULL, NULL);  
//	OCL_CHECK(err);

	err = clEnqueueWriteBuffer(queue, lld_d, CL_TRUE, 	0, sizeof(float), 	lld, 0, NULL, NULL);  
	OCL_CHECK(err);


	//---------------------------------------- transpose kernel -------------------------------------------//
	//  1st kernel: 
	size_t local_0[2];
	size_t global_0[2];
	local_0[0]= 16;
	local_0[1]= 16;
	global_0[0] =  N;
	global_0[1] =  N;

	err  = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &A_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &At_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err = clSetKernelArg(kernel[0], 2, sizeof(float)*256, NULL);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err = clSetKernelArg(kernel[0], 3, sizeof(int), &N);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}


	err = clEnqueueNDRangeKernel(queue, kernel[0], 2, NULL, global_0, local_0, 0, NULL, NULL );
	OCL_CHECK(err);


	//---------------------------------------- init_alpha kernel -------------------------------------------//
	// 2nd kernel: initialize alpha
	size_t local_1;
	size_t global_1;
	local_1  =  256;
	global_1=  256;

	err  = clSetKernelArg(kernel[1], 0, sizeof(cl_mem), &B_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err = clSetKernelArg(kernel[1], 1, sizeof(cl_mem), &prior_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err = clSetKernelArg(kernel[1], 2, sizeof(cl_mem), &alpha_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err = clSetKernelArg(kernel[1], 3, sizeof(cl_mem), &lld_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err = clSetKernelArg(kernel[1], 4, sizeof(float)*256, NULL);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err = clSetKernelArg(kernel[1], 5, sizeof(int), &blks);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}


	err = clEnqueueNDRangeKernel(queue, kernel[1], 1, NULL, &global_1, &local_1, 0, NULL, NULL );
	OCL_CHECK(err);


//	clFinish(queue);
//	clEnqueueReadBuffer(queue, alpha_d, CL_TRUE, 0, sizeof(float)*T*N, alpha, 0, NULL , NULL);
//	for(i=0;i<N;++i){
//		printf("%.4e\n", alpha[i]);
//	}
//	printf("done!\n");




	//---------------------------------------- matrix vector multiplication kernel -------------------------------------------//
	// 3rd  kernel
	size_t local_2[2];
	size_t global_2[2];
	local_2[0]= 16;
	local_2[1]= 16;
	global_2[0] =  16;
	global_2[1] =  N;

	err = clSetKernelArg(kernel[2], 0, sizeof(cl_mem), &At_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err |= clSetKernelArg(kernel[2], 1, sizeof(cl_mem), &alpha_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err |= clSetKernelArg(kernel[2], 2, sizeof(cl_mem), &at_alpha_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err |= clSetKernelArg(kernel[2], 3, sizeof(int), &N);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	//---------------------------------------- alpha dev kernel -------------------------------------------//
	// 4th kernel
	size_t local_3;
	size_t global_3;
	local_3	 = 256;
	global_3 = 256;

	err = clSetKernelArg(kernel[3], 0, sizeof(cl_mem), &B_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err |= clSetKernelArg(kernel[3], 1, sizeof(cl_mem), &at_alpha_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err |= clSetKernelArg(kernel[3], 2, sizeof(cl_mem), &alpha_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err |= clSetKernelArg(kernel[3], 3, sizeof(cl_mem), &lld_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err |= clSetKernelArg(kernel[3], 4, sizeof(float)*256, NULL);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err |= clSetKernelArg(kernel[3], 5, sizeof(int), &N);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err |= clSetKernelArg(kernel[3], 6, sizeof(int), &blks);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}




	int frame;
	//for(frame = 1 ; frame < 2; ++frame)
	for(frame = 1 ; frame < T; ++frame)
	{
		startPos     = frame * N;
		startPos_pre = startPos - N;

		err = clSetKernelArg(kernel[2], 4, sizeof(uint), &startPos_pre);
		if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

		// At x alpha = at_alpha
		err = clEnqueueNDRangeKernel(queue, kernel[2], 2, NULL, global_2, local_2, 0, NULL, NULL );
		OCL_CHECK(err);
		
		// alpha dev
		err = clSetKernelArg(kernel[3], 7, sizeof(uint), &startPos);
		if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

		err = clEnqueueNDRangeKernel(queue, kernel[3], 1, NULL, &global_3, &local_3, 0, NULL, NULL );
		OCL_CHECK(err);

		//clFinish(queue);
		//clEnqueueReadBuffer(queue, lld_d, CL_TRUE, 0, sizeof(float), lld, 0, NULL , NULL);
		//printf("\n\n  (%d)  lld = %.4e\n", frame, lld[0]);
		
//		clFinish(queue);
//		clEnqueueReadBuffer(queue, alpha_d, CL_TRUE, 0, sizeof(float)*T*N, alpha, 0, NULL , NULL);
//		for(i=0; i<N; ++i){
//			printf("%.4e\n", alpha[startPos + i]);
//		}	

	}

	clFinish(queue);
	clEnqueueReadBuffer(queue, lld_d, CL_TRUE, 0, sizeof(float), lld , 0, NULL , &events[1]);
	printf("lld = %.4f\n", lld[0]);


	err = clWaitForEvents(1,&events[1]);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (events[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (events[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL);
	OCL_CHECK(err);

	gpuTime = (double)(gend -gstart)/1000000000.0;

	printf("oclTime = %lf (s)\n", gpuTime);

	clReleaseMemObject(A_d);
	clReleaseMemObject(At_d);
	clReleaseMemObject(B_d);
	clReleaseMemObject(prior_d);
	clReleaseMemObject(alpha_d);
	clReleaseMemObject(lld_d);
	clReleaseMemObject(at_alpha_d);
	clReleaseMemObject(dummy_d);

	//clReleaseMemObject(alphasum_d);
	//clReleaseMemObject(alphasum_tmp_d);
	//clReleaseMemObject(alphamid_d);


	clReleaseProgram(program);
	clReleaseContext(context);
	clReleaseCommandQueue(queue);
	for(i=0;i<numK;++i){
		clReleaseKernel(kernel[i]);
	}
	for(i=0;i<numE;++i){
		clReleaseEvent(events[i]);
	}

	free(kernelSource);

	free(At);
	free(alpha);
	free(lld);
	free(at_alpha);
	free(dummy);
	//free(alphasum);

	return;
}
void run_opencl_backward(HMM *word)
{

	puts("\n=>GPU");

	int i;
	int N = word->nstates;
	int T = word->len;
	float *B = word->b;
	float *A = word->a;

	// gpu timer
	cl_ulong gstart, gend;
	double gpuTime;

	// cpu timer
	//struct timeval cstart;
	//struct timeval cend;
	double cpuTime;

	float *betaB;
	betaB= (float*)malloc(sizeof(float)*N);
	init_1d_f(betaB,N,0.f);

	float *beta; // NxT
	beta = (float*)malloc(sizeof(float)*N*T);
	init_2d_f(beta,N,T,0.f);
	for(i = 0 ; i < N ; ++i){
		beta[i*T + T-1] = 1.f;
	}



	//------------------------------------------------
	//  OpenCL 
	//------------------------------------------------

	int chunks;
	chunks = (N+63)/64;

	cl_int err;

	cl_platform_id platform;          // OpenCL platform
	cl_device_id device_id;           // device ID
	cl_context context;               // context
	cl_command_queue queue;           // command queue
	cl_program program;               // program

	cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*3);
	
	cl_event *event = (cl_event*)malloc(sizeof(cl_event)*2);    

	// read kernel file
	char *fileName = "backward_kernel.cl";
	char *kernelSource;
	size_t size;
	FILE *fh = fopen(fileName, "rb");
	if(!fh) {
		printf("Error: Failed to open kernel file!\n");
		exit(1);
	}
	fseek(fh,0,SEEK_END);
	size=ftell(fh);
	fseek(fh,0,SEEK_SET);
	kernelSource = malloc(size+1);
	size_t result;
	result = fread(kernelSource,1,size,fh);
	if(result != size){ fputs("Reading error", stderr);exit(1);}
	kernelSource[size] = '\0';

	// Bind to platform
	err = clGetPlatformIDs(1, &platform, NULL);
	OCL_CHECK(err);

	// Get ID for the device
	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
	OCL_CHECK(err);

	// Create a context  
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	OCL_CHECK(err);

	// Create a command queue 
	queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
	OCL_CHECK(err);

	// Create the compute program from the source buffer
	program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err);
	OCL_CHECK(err);

	// turn on optimization for kernel
	char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-unsafe-math-optimizations -cl-finite-math-only";

	err = clBuildProgram(program, 1, &device_id, options, NULL, NULL);
	if(err != CL_SUCCESS)
		printCompilerOutput(program, device_id);
	OCL_CHECK(err);

	kernel[0] = clCreateKernel(program, "genbetaB", &err);
	OCL_CHECK(err);
	kernel[1] = clCreateKernel(program, "beta_dev", &err);
	OCL_CHECK(err);
	kernel[2] = clCreateKernel(program, "scale_beta", &err);
	OCL_CHECK(err);

	// allocate memory on device 
	cl_mem A_d      	= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N*N,  NULL, NULL);
	cl_mem B_d      	= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N*T,  NULL, NULL);
	cl_mem beta_d		= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N*T,  NULL, NULL);
	cl_mem betaB_d		= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N,    NULL, NULL);
	cl_mem betasum_int_d= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*chunks,  NULL, NULL);
	cl_mem betasum_d    = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float),  	  NULL, NULL);

	// warm up() device
	float *dummy = (float*)malloc(sizeof(float));
	cl_mem dummy_d= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float),  NULL, NULL);
	for(i=0;i<50;++i){
		err = clEnqueueWriteBuffer(queue, dummy_d, 		CL_TRUE, 	0, sizeof(float), 	dummy, 		0, NULL, NULL);
	}


	// Initialize device memory
	err = clEnqueueWriteBuffer(queue, A_d, 		CL_TRUE, 	0, sizeof(float)*N*N, 	A, 		0, NULL, &event[0]);
	OCL_CHECK(err);
	err = clEnqueueWriteBuffer(queue, B_d, 		CL_TRUE, 	0, sizeof(float)*N*T, 	B, 		0, NULL, NULL); 
	OCL_CHECK(err);
	err = clEnqueueWriteBuffer(queue, beta_d, 	CL_TRUE, 	0, sizeof(float)*N*T, 	beta, 	0, NULL, NULL); 
	OCL_CHECK(err);

	//  1st kernel: beta * B 
	size_t local_1 = 64;
	size_t global_1 = chunks*64;

	err  = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &beta_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err |= clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &B_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err |= clSetKernelArg(kernel[0], 2, sizeof(cl_mem), &betaB_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err |= clSetKernelArg(kernel[0], 3, sizeof(int), &N);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err |= clSetKernelArg(kernel[0], 4, sizeof(int), &T);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}


	// 2nd kernel: A * betaB
	size_t local_2 = 64;
	size_t global_2 = chunks*64;

	err  = clSetKernelArg(kernel[1], 0, sizeof(cl_mem), &A_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err  = clSetKernelArg(kernel[1], 1, sizeof(cl_mem), &betaB_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err  = clSetKernelArg(kernel[1], 2, sizeof(cl_mem), &beta_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err  = clSetKernelArg(kernel[1], 3, sizeof(cl_mem), &betasum_int_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err |= clSetKernelArg(kernel[1], 4, sizeof(int), &N);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err |= clSetKernelArg(kernel[1], 5, sizeof(int), &T);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	//err |= clSetKernelArg(kernel[1], 6, sizeof(int), &frame);
	//if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err |= clSetKernelArg(kernel[1], 7, sizeof(float)*64, NULL);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}



	// 3nd kernel: beta/sum 
	size_t local_3 = 64;
	size_t global_3 = chunks*64;

	err  = clSetKernelArg(kernel[2], 0, sizeof(cl_mem), &beta_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err  = clSetKernelArg(kernel[2], 1, sizeof(cl_mem), &betasum_int_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err |= clSetKernelArg(kernel[2], 2, sizeof(int), &N);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err |= clSetKernelArg(kernel[2], 3, sizeof(int), &T);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
	err |= clSetKernelArg(kernel[2], 5, sizeof(float), &chunks);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}


	// time capsule
	int frame;

	for(frame = (T-2) ; frame >= 0; frame--)
	{
		// 1st kernel : beta * B
		err |= clSetKernelArg(kernel[0], 5, sizeof(int), &frame);
		if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

		err = clEnqueueNDRangeKernel(queue, kernel[0], 1, NULL, &global_1, &local_1, 0, NULL, NULL);
		OCL_CHECK(err);

		if(frame ==  (T-2) && 0)
		{
			clFinish(queue);
			clEnqueueReadBuffer(queue, betaB_d, CL_TRUE, 0, sizeof(float)*N, betaB, 0, NULL , NULL);
			check_1d_f(betaB, N);	
			exit(1);	
		}


		// 2nd kernel; betaB * A
		err |= clSetKernelArg(kernel[1], 6, sizeof(int), &frame);
		if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

		err = clEnqueueNDRangeKernel(queue, kernel[1], 1, NULL, &global_2, &local_2, 0, NULL, NULL);
		OCL_CHECK(err);

		if(frame ==  (T-2) && 0)
		{
			clFinish(queue);
			clEnqueueReadBuffer(queue, beta_d, CL_TRUE, 0, sizeof(float)*N*T, beta, 0, NULL , NULL);
			check_2d_f(beta, N, T);	
			exit(1);	
		}



		// 3rd kernle; scale beta
		err |= clSetKernelArg(kernel[2], 4, sizeof(int), &frame);
		if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

		err = clEnqueueNDRangeKernel(queue, kernel[2], 1, NULL, &global_3, &local_3, 0, NULL, NULL);
		OCL_CHECK(err);

		if(frame ==  (T-2) && 0)
		{
			clFinish(queue);
			clEnqueueReadBuffer(queue, beta_d, CL_TRUE, 0, sizeof(float)*N*T, beta, 0, NULL , NULL);
			check_2d_f(beta, N, T);	
			exit(1);	
		}



	}

	clFinish(queue);

	clEnqueueReadBuffer(queue, beta_d, CL_TRUE, 0, sizeof(float)*N*T, beta, 0, NULL , &event[1]);

	err = clWaitForEvents(1,&event[1]);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (event[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL);
	OCL_CHECK(err);

	gpuTime = (double)(gend -gstart)/1000000000.0;

	cpuTime = 0.0; 

	printf("oclTime = %lf (s)\n", gpuTime + cpuTime);

	// check
	//check_2d_f(beta,N,T);



	clReleaseMemObject(A_d);
	clReleaseMemObject(B_d);
	clReleaseMemObject(beta_d);
	clReleaseMemObject(betaB_d);
	clReleaseMemObject(dummy_d);

	clReleaseMemObject(betasum_d);
	clReleaseMemObject(betasum_int_d);



	clReleaseProgram(program);
	clReleaseContext(context);
	clReleaseCommandQueue(queue);
	for(i=0;i<3;++i){
		clReleaseKernel(kernel[i]);
	}
	for(i=0;i<2;++i){
		clReleaseEvent(event[i]);
	}


	free(beta);
	free(betaB);
	free(kernelSource);

	free(dummy);

	return;
}
Exemple #13
0
 status_t init() {
     OCL_CHECK(device_info_.init());
     return status::success;
 }
Exemple #14
0
void run1(int N, char *fileName)
{
	puts("Matrix Vector Multiplication Naive\n");

	int i,j;

	float *A;
	A = (float*)malloc(sizeof(float)*N*N);

	for( i = 0; i < N ; ++i )
	{
		for( j = 0; j < N ; ++j )
		{
			A[i*N + j] = 1.f;	
		}
	}

	float *B;
	B = (float*)malloc(sizeof(float)*N);
	for( i = 0; i < N ; ++i )
	{
		B[i] = 1.f;	
	}
	
	float *C;
	C = (float*)malloc(sizeof(float)*N);


#ifdef DEBUG
	puts("A");
	check_2d_f(A,N,N);

	puts("B");
	check_1d_f(B,N);
#endif

	int NumK = 1;
	int NumE = 1;

	double gpuTime;
	cl_ulong gstart, gend;

	//------------------------------------------------
	//  OpenCL 
	//------------------------------------------------
	cl_int err;

	cl_platform_id platform;          // OpenCL platform
	cl_device_id device_id;           // device ID
	cl_context context;               // context
	cl_command_queue queue;           // command queue
	cl_program program;               // program

	cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*NumK);

	cl_event *event = (cl_event*)malloc(sizeof(cl_event)*NumE);    

	// read kernel file
	//char *fileName = "transpose_kernel.cl";
	char *kernelSource;
	size_t size;
	FILE *fh = fopen(fileName, "rb");
	if(!fh) {
		printf("Error: Failed to open kernel file!\n");
		exit(1);
	}
	fseek(fh,0,SEEK_END);
	size=ftell(fh);
	fseek(fh,0,SEEK_SET);
	kernelSource = malloc(size+1);
	size_t result;
	result = fread(kernelSource,1,size,fh);
	if(result != size){ fputs("Reading error", stderr);exit(1);}
	kernelSource[size] = '\0';
	
	// Bind to platform
	err = clGetPlatformIDs(1, &platform, NULL);
	OCL_CHECK(err);

	// Get ID for the device
	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
	OCL_CHECK(err);

	// Create a context  
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	OCL_CHECK(err);

	// Create a command queue 
	queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
	OCL_CHECK(err);

	// Create the compute program from the source buffer
	program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err);
	OCL_CHECK(err);

	// turn on optimization for kernel
	char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-unsafe-math-optimizations -cl-finite-math-only";

	err = clBuildProgram(program, 1, &device_id, options, NULL, NULL);
	if(err != CL_SUCCESS)
		printCompilerOutput(program, device_id);
	OCL_CHECK(err);

#ifdef SAVEBIN
	// Calculate size of binaries 
	size_t binary_size;
	err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL);
	OCL_CHECK(err);

	unsigned char* bin;
	bin = (unsigned char*)malloc(sizeof(unsigned char)*binary_size);

	err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &bin, NULL);
	OCL_CHECK(err);

	// Print the binary out to the output file
	fh = fopen("kernel_mv_1.bin", "wb");
	fwrite(bin, 1, binary_size, fh);
	fclose(fh);

#endif

	kernel[0] = clCreateKernel(program, "mv_1", &err);
	OCL_CHECK(err);


	// memory on device
	cl_mem A_d    = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N*N,  NULL, NULL);
	cl_mem B_d    = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N,  NULL, NULL);
	cl_mem C_d    = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N,  NULL, NULL);

	// Initialize device memory
	err = clEnqueueWriteBuffer(queue, A_d, 	CL_TRUE, 0, sizeof(float)*N*N, 	A, 0, NULL , NULL); 
	OCL_CHECK(err);
	err = clEnqueueWriteBuffer(queue, B_d, 	CL_TRUE, 0, sizeof(float)*N, 	B, 0, NULL , NULL); 
	OCL_CHECK(err);

	size_t localsize =  64;
	size_t globalsize = N;


	err  = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &A_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err  = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &B_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err  = clSetKernelArg(kernel[0], 2, sizeof(cl_mem), &C_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err  = clSetKernelArg(kernel[0], 3, sizeof(int), &N);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}


	err = clEnqueueNDRangeKernel(queue, kernel[0], 1, NULL, &globalsize, &localsize, 0, NULL, &event[0]);
	OCL_CHECK(err);

	clFinish(queue);

	clEnqueueReadBuffer(queue, C_d, CL_TRUE, 0, sizeof(float)*N, C , 0, NULL , NULL );

	err = clWaitForEvents(1,&event[0]);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL);
	OCL_CHECK(err);

	gpuTime = (double)(gend -gstart)/1000000000.0;



	//check_1d_f(sum, blks+1);

#ifdef DEBUG
	puts("C = A * B");
	check_1d_f(C,N);

#endif

	printf("oclTime = %lf (s)\n", gpuTime );

	// free
	clReleaseMemObject(A_d);	
	clReleaseMemObject(B_d);	
	clReleaseMemObject(C_d);	



	clReleaseProgram(program);
	clReleaseContext(context);
	clReleaseCommandQueue(queue);
	for(i=0;i<NumK;++i){
		clReleaseKernel(kernel[i]);
	}
	for(i=0;i<NumE;++i){
		clReleaseEvent(event[i]);
	}
	free(kernelSource);


#ifdef SAVEBIN
	free(bin);
#endif

	free(A);
	free(B);
	free(C);

	return;
}
Exemple #15
0
void runProgram(int N, char *fileName)
{
	printf("GPU Symmetrize()..."
		"\nSquareMatrix[%d][%d]\n", N, N);

	int i,j;

	// initialize input array
	float *A;
	A = (float*)malloc(sizeof(float)*N*N);

	for( i = 0; i < N ; ++i )
	{
		for( j = 0; j < N ; ++j )
		{
			A[i*N + j] = j;	
		}
	}

	//  result
	float *Aout;
	Aout = (float*)malloc(sizeof(float)*N*N);


#ifdef DEBUG
	puts("A");
	check_2d_f(A,N,N);
#endif

	int NumK = 1;
	int NumE = 2;

	double gpuTime;
	cl_ulong gstart, gend;

	//------------------------------------------------
	//  OpenCL 
	//------------------------------------------------
	cl_int err;

	cl_platform_id platform;          // OpenCL platform
	cl_device_id device_id;           // device ID
	cl_context context;               // context
	cl_command_queue queue;           // command queue
	cl_program program;               // program

	cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*NumK);

	cl_event *event = (cl_event*)malloc(sizeof(cl_event)*NumE);    

	// read kernel file
	//char *fileName = "transpose_kernel.cl";
	char *kernelSource;
	size_t size;
	FILE *fh = fopen(fileName, "rb");
	if(!fh) {
		printf("Error: Failed to open kernel file!\n");
		exit(1);
	}
	fseek(fh,0,SEEK_END);
	size=ftell(fh);
	fseek(fh,0,SEEK_SET);
	kernelSource = malloc(size+1);
	size_t result;
	result = fread(kernelSource,1,size,fh);
	if(result != size){ fputs("Reading error", stderr);exit(1);}
	kernelSource[size] = '\0';
	
	// Bind to platform
	err = clGetPlatformIDs(1, &platform, NULL);
	OCL_CHECK(err);

	// Get ID for the device
	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
	OCL_CHECK(err);

	// Create a context  
	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	OCL_CHECK(err);

	// Create a command queue 
	queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
	OCL_CHECK(err);

	// Create the compute program from the source buffer
	program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err);
	OCL_CHECK(err);

	// turn on optimization for kernel
	char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-unsafe-math-optimizations -cl-finite-math-only";

	err = clBuildProgram(program, 1, &device_id, options, NULL, NULL);
	if(err != CL_SUCCESS)
		printCompilerOutput(program, device_id);
	OCL_CHECK(err);



#ifdef SAVEBIN
	// Calculate size of binaries 
	size_t binary_size;
	err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL);
	OCL_CHECK(err);

	//printf("binary size = %ld\n", binary_size);

	unsigned char* bin;
	bin = (unsigned char*)malloc(sizeof(unsigned char)*binary_size);

	err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*) , &bin, NULL);
	OCL_CHECK(err);

	//puts("save binaries");

	// Print the binary out to the output file
	fh = fopen("kernel.bin", "wb");
	fwrite(bin, 1, binary_size, fh);
	fclose(fh);

	puts("done save binaries");

#endif


	kernel[0] = clCreateKernel(program, "kernel_a", &err);
	OCL_CHECK(err);

	// memory on device
	cl_mem A_d    	= clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N*N,  NULL, NULL);
	cl_mem Aout_d   = clCreateBuffer(context, CL_MEM_READ_WRITE,  sizeof(float)*N*N,  NULL, NULL);


	// copy data to device
	err = clEnqueueWriteBuffer(queue, A_d, 	CL_TRUE, 0, sizeof(float)*N*N, 	A, 0, NULL , &event[0]); 
	OCL_CHECK(err);

	size_t localsize[2];
	size_t globalsize[2];

	localsize[0] = 16; 
	localsize[1] = 16;

	globalsize[0] = N;
	globalsize[1] = N;

	err  = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &A_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}

	err  = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &Aout_d);
	if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}


	err = clEnqueueNDRangeKernel(queue, kernel[0], 2, NULL, globalsize, localsize, 0, NULL, NULL);
	OCL_CHECK(err);

	clFinish(queue);

	// read device data back to host
	clEnqueueReadBuffer(queue, Aout_d, CL_TRUE, 0, sizeof(float)*N*N, Aout, 0, NULL , &event[1]);

	err = clWaitForEvents(1,&event[1]);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL);
	OCL_CHECK(err);

	err = clGetEventProfilingInfo (event[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL);
	OCL_CHECK(err);

	gpuTime = (double)(gend -gstart)/1000000000.0;



	//check_1d_f(sum, blks+1);

#ifdef DEBUG
	puts("Output");
	check_2d_f(Aout,N,N);
#endif

	printf("oclTime = %lf (s)\n", gpuTime );

	// free
	clReleaseMemObject(A_d);	
	clReleaseMemObject(Aout_d);	


	// // check
	// int flag = 1;
	// for(i=0;i<N;++i){
	// 	for(j=0;j<N;++j){
	// 		if(A[i*N+j] != At[j*N+i])		
	// 		{
	// 			flag  = 0;
	// 			break;
	// 		}
	// 	}
	// }
	// if( flag == 0 )
	// {
	// 	puts("Bugs! Check program.");
	// }else{
	// 	puts("Succeed!");	
	// }



	clReleaseProgram(program);
	clReleaseContext(context);
	clReleaseCommandQueue(queue);
	for(i=0;i<NumK;++i){
		clReleaseKernel(kernel[i]);
	}
	for(i=0;i<NumE;++i){
		clReleaseEvent(event[i]);
	}
	free(kernelSource);


#ifdef SAVEBIN
	free(bin);
#endif



	free(A);
	free(Aout);

	return;
}