Example #1
0
int main(int argc, char **argv) {
    if(argc < 2) {
        usage();
        return -1;
    }
    //init the filter array
    float filter[49] =
      {-1,      -1,      -1,      -1,      -1,      -1,      -1,
       -1,      -1,      -1,      -1,      -1,      -1,      -1,
       -1,      -1,      -1,      -1,      -1,      -1,      -1,
       -1,      -1,      -1,      49,      -1,      -1,      -1,
       -1,      -1,      -1,      -1,      -1,      -1,      -1,
       -1,      -1,      -1,      -1,      -1,      -1,      -1,
       -1,      -1,      -1,      -1,      -1,      -1,      -1};
    //operate the params of cmd
    const char* inputFileName; 
    const char* outputFileName; 
    inputFileName = (argv[1]);
    outputFileName = (argv[2]);

    //the image height and width
    int imageHeight, imageWidth;

    int filterWidth = 7;
    
    //read the bmp image to the memory
    float* inputImage = readBmpImage(inputFileName, &imageWidth, &imageHeight);

    //to check the read is succ
    printf("the width of the image is %d, the height of the image is %d\n", imageWidth, imageHeight);

    //calculate the datasize
    int dataSize = imageHeight * imageWidth * sizeof(float);
    int filterSize = sizeof(float) * filterWidth * filterWidth;

    //output image
    float *outputImage = NULL;
    outputImage = (float*)malloc(dataSize);

    //set up the OpenCL environment
    cl_int status;

    //Discovery platform
    cl_platform_id platforms[2];
    cl_platform_id platform;
    status = clGetPlatformIDs(2, platforms, NULL);
    check(status, "clGetPlatformIDs");
    platform = platforms[PLATFORM_TO_USE];

    //Discovery device
    cl_device_id device;
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
    check(status, "clGetDeviceIDs");

    //create context
    cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platform), 0};
    cl_context context;
    context = clCreateContext(props, 1, &device, NULL, NULL, &status);
    check(status, "clCreateContext");

    //create command queue
    cl_command_queue queue;
    queue = clCreateCommandQueue(context, device, 0, &status);
    check(status, "clCreateCommandQueue");

    //create the input and output buffers
    cl_mem d_input, d_output, d_filter;
    d_input = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL,
       &status);
    check(status, "clCreateBuffer");

    d_filter = clCreateBuffer(context, CL_MEM_READ_ONLY, filterSize, NULL,
       &status);
    check(status, "clCreateBuffer");

    // Copy the input image to the device
    d_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSize, NULL,
       &status);
    check(status, "clCreateBuffer");

    status = clEnqueueWriteBuffer(queue, d_input, CL_TRUE, 0, dataSize, 
         inputImage, 0, NULL, NULL);
    check(status, "clEnqueueWriteBuffer");

    status = clEnqueueWriteBuffer(queue, d_filter, CL_TRUE, 0, filterSize,
        filter, 0, NULL, NULL);
    check(status, "clEnqueueWriteBuffer");

    const char* source = readSource(kernelPath);
    //create a program object with source and build it
    cl_program program;
    program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
    check(status, "clCreateProgramWithSource");
    status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    size_t log_size;
    char *program_log;
    if(status < 0) {
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        program_log = (char*)malloc(log_size + 1);
        program_log[log_size] = '\0';
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
        printf("%s\n", program_log);
        free(program_log);
        exit(1);
    }
    check(status, "clBuildProgram");



    //create the kernel object
    cl_kernel kernel;
    kernel = clCreateKernel(program, "sharpen", &status);
    check(status, "clCreateKernel");

    //set the kernel arguments
    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_output);
    status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_input);
    status |= clSetKernelArg(kernel, 2, sizeof(int), &imageWidth);
    status |= clSetKernelArg(kernel, 3, sizeof(int), &imageHeight);
    status |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter);
    status |= clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
    check(status, "clSetKernelArg");

    // Set the work item dimensions
   size_t globalSize[2] = {imageWidth, imageHeight};
   status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0,
      NULL, NULL);
   check(status, "clEnqueueNDRange");
   // Read the image back to the host
   status = clEnqueueReadBuffer(queue, d_output, CL_TRUE, 0, dataSize, 
         outputImage, 0, NULL, NULL); 

   check(status, "clEnqueueReadBuffer");

   // Write the output image to file
   storeBmpImage(outputImage, outputFileName, imageHeight, imageWidth, inputFileName);

       //free opencl resources
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseMemObject(d_input);
    clReleaseMemObject(d_output);
    clReleaseMemObject(d_filter);
    clReleaseContext(context);

    //free host resources
    free(inputImage);
    free(outputImage);
    

}
Example #2
0
File: nw.c Project: zwang4/dividend
int main(int argc, char **argv){

  printf("WG size of kernel = %d \n", BLOCK_SIZE);

    int max_rows, max_cols, penalty;
	char * tempchar;
	// the lengths of the two sequences should be able to divided by 16.
	// And at current stage  max_rows needs to equal max_cols
	if (argc == 4)
	{
		max_rows = atoi(argv[1]);
		max_cols = atoi(argv[1]);
		penalty = atoi(argv[2]);
		tempchar = argv[3];
	}
    else{
	     usage(argc, argv);
    }
	
	if(atoi(argv[1])%16!=0){
	fprintf(stderr,"The dimension values must be a multiple of 16\n");
	exit(1);
	}

	max_rows = max_rows + 1;
	max_cols = max_cols + 1;

	int *reference;
	int *input_itemsets;
	int *output_itemsets;
	
	reference = (int *)malloc( max_rows * max_cols * sizeof(int) );
    input_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) );
	output_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) );
	
	srand(7);
	
	//initialization 
	for (int i = 0 ; i < max_cols; i++){
		for (int j = 0 ; j < max_rows; j++){
			input_itemsets[i*max_cols+j] = 0;
		}
	}

	for( int i=1; i< max_rows ; i++){    //initialize the cols
			input_itemsets[i*max_cols] = rand() % 10 + 1;
	}
	
    for( int j=1; j< max_cols ; j++){    //initialize the rows
			input_itemsets[j] = rand() % 10 + 1;
	}
	
	for (int i = 1 ; i < max_cols; i++){
		for (int j = 1 ; j < max_rows; j++){
		reference[i*max_cols+j] = blosum62[input_itemsets[i*max_cols]][input_itemsets[j]];
		}
	}

    for( int i = 1; i< max_rows ; i++)
       input_itemsets[i*max_cols] = -i * penalty;
	for( int j = 1; j< max_cols ; j++)
       input_itemsets[j] = -j * penalty;
	
	int sourcesize = 1024*1024;
		
	char * source = (char *)calloc(sourcesize, sizeof(char)); 
	if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }

	// read the kernel core source
	char * kernel_nw1  = "nw_kernel1";
	char * kernel_nw2  = "nw_kernel2";
	FILE * fp = fopen(tempchar, "rb"); 
	if(!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; }
	fread(source + strlen(source), sourcesize, 1, fp);
	fclose(fp);

	int nworkitems, workgroupsize = 0;
	nworkitems = BLOCK_SIZE;

	if(nworkitems < 1 || workgroupsize < 0){
		printf("ERROR: invalid or missing <num_work_items>[/<work_group_size>]\n"); 
		return -1;
	}
		// set global and local workitems
	size_t local_work[3] = { (workgroupsize>0)?workgroupsize:1, 1, 1 };
	size_t global_work[3] = { nworkitems, 1, 1 }; //nworkitems = no. of GPU threads
	
	int use_gpu = 1;
	// OpenCL initialization
	if(initialize(use_gpu)) return -1;

	// compile kernel
	cl_int err = 0;
	const char * slist[2] = { source, 0 };
	cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
	if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; }

	char clOptions[110];
	//  sprintf(clOptions,"-I../../src");                                                                                 
	sprintf(clOptions," ");

#ifdef BLOCK_SIZE
	sprintf(clOptions + strlen(clOptions), " -DBLOCK_SIZE=%d", BLOCK_SIZE);
#endif

	err = DIVIDEND_CL_WRAP(clBuildProgram)(prog, 0, NULL, clOptions, NULL, NULL);
	/*{ // show warnings/errors
		static char log[65536]; memset(log, 0, sizeof(log));
		cl_device_id device_id = 0;
		err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL);
		clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
		if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
	}*/
	if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
    	
	cl_kernel kernel1;
	cl_kernel kernel2;
	kernel1 = clCreateKernel(prog, kernel_nw1, &err);  
	kernel2 = clCreateKernel(prog, kernel_nw2, &err);  
	if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel() 0 => %d\n", err); return -1; }
	clReleaseProgram(prog);
	
		
	
	// creat buffers
	cl_mem input_itemsets_d;
	cl_mem output_itemsets_d;
	cl_mem reference_d;
	
	input_itemsets_d = clCreateBuffer(context, CL_MEM_READ_WRITE, max_cols * max_rows * sizeof(int), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_item_set (size:%d) => %d\n", max_cols * max_rows, err); return -1;}
	reference_d		 = clCreateBuffer(context, CL_MEM_READ_WRITE, max_cols * max_rows * sizeof(int), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer reference (size:%d) => %d\n", max_cols * max_rows, err); return -1;}
	output_itemsets_d = clCreateBuffer(context, CL_MEM_READ_WRITE, max_cols * max_rows * sizeof(int), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer output_item_set (size:%d) => %d\n", max_cols * max_rows, err); return -1;}
	
	//write buffers
	err = clEnqueueWriteBuffer(cmd_queue, input_itemsets_d, 1, 0, max_cols * max_rows * sizeof(int), input_itemsets, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer bufIn1 (size:%d) => %d\n", max_cols * max_rows, err); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, reference_d, 1, 0, max_cols * max_rows * sizeof(int), reference, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer bufIn2 (size:%d) => %d\n", max_cols * max_rows, err); return -1; }
		
	int worksize = max_cols - 1;
	printf("worksize = %d\n", worksize);
	//these two parameters are for extension use, don't worry about it.
	int offset_r = 0, offset_c = 0;
	int block_width = worksize/BLOCK_SIZE ;
	
	clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &reference_d);
	clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &input_itemsets_d);
	clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &output_itemsets_d);
	clSetKernelArg(kernel1, 3, sizeof(cl_int) * (BLOCK_SIZE + 1) *(BLOCK_SIZE+1), (void*)NULL );
	clSetKernelArg(kernel1, 4, sizeof(cl_int) *  BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
	clSetKernelArg(kernel1, 5, sizeof(cl_int), (void*) &max_cols);
	clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &penalty);
	clSetKernelArg(kernel1, 8, sizeof(cl_int), (void*) &block_width);
	clSetKernelArg(kernel1, 9, sizeof(cl_int), (void*) &worksize);
	clSetKernelArg(kernel1, 10, sizeof(cl_int), (void*) &offset_r);
	clSetKernelArg(kernel1, 11, sizeof(cl_int), (void*) &offset_c);

	clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &reference_d);
	clSetKernelArg(kernel2, 1, sizeof(void *), (void*) &input_itemsets_d);
	clSetKernelArg(kernel2, 2, sizeof(void *), (void*) &output_itemsets_d);
	clSetKernelArg(kernel2, 3, sizeof(cl_int) * (BLOCK_SIZE + 1) *(BLOCK_SIZE+1), (void*)NULL );
	clSetKernelArg(kernel2, 4, sizeof(cl_int) * BLOCK_SIZE *BLOCK_SIZE, (void*)NULL );
	clSetKernelArg(kernel2, 5, sizeof(cl_int), (void*) &max_cols);
	clSetKernelArg(kernel2, 6, sizeof(cl_int), (void*) &penalty);
	clSetKernelArg(kernel2, 8, sizeof(cl_int), (void*) &block_width);
	clSetKernelArg(kernel2, 9, sizeof(cl_int), (void*) &worksize);
	clSetKernelArg(kernel2, 10, sizeof(cl_int), (void*) &offset_r);
	clSetKernelArg(kernel2, 11, sizeof(cl_int), (void*) &offset_c);
	
	printf("Processing upper-left matrix\n");
	for( int blk = 1 ; blk <= worksize/BLOCK_SIZE ; blk++){
	
		global_work[0] = BLOCK_SIZE * blk;
		local_work[0]  = BLOCK_SIZE;
		clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &blk);
#pragma dividend local_work_group_size local_work dim 2 dim1(2:1024:2:32) dim2(1:1:2:1)
	//This lws will be used to profile the OpenCL kernel with id 1
			size_t _dividend_lws_local_work_k1[3];
		{
		_dividend_lws_local_work_k1[0] = getLWSValue("DIVIDEND_LWS1_D0",DIVIDEND_LWS1_D0_DEFAULT_VAL);
		_dividend_lws_local_work_k1[1] = getLWSValue("DIVIDEND_LWS1_D1",DIVIDEND_LWS1_D1_DEFAULT_VAL);
		//Dividend extension: store the kernel id as the last element
		_dividend_lws_local_work_k1[2] = 1;
		}
				err = DIVIDEND_CL_WRAP(clEnqueueNDRangeKernel)(cmd_queue, kernel1, 2, NULL, global_work, _dividend_lws_local_work_k1, 0, 0, 0);
		if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }			
	}
	
	printf("BLOCK_SIZE:%d\n", BLOCK_SIZE);
	printf("Processing lower-right matrix\n");
	for( int blk =  worksize/BLOCK_SIZE - 1  ; blk >= 1 ; blk--){	   
		global_work[0] = BLOCK_SIZE * blk;
		local_work[0] =  BLOCK_SIZE;
		clSetKernelArg(kernel2, 7, sizeof(cl_int), (void*) &blk);
#pragma dividend local_work_group_size local_work dim 2 dim1(2:1024:2:32) dim2(1:1:2:1)
	//This lws will be used to profile the OpenCL kernel with id 2
        	size_t _dividend_lws_local_work_k2[3];
        {
        _dividend_lws_local_work_k2[0] = getLWSValue("DIVIDEND_LWS2_D0",DIVIDEND_LWS2_D0_DEFAULT_VAL);
        _dividend_lws_local_work_k2[1] = getLWSValue("DIVIDEND_LWS2_D1",DIVIDEND_LWS2_D1_DEFAULT_VAL);
        //Dividend extension: store the kernel id as the last element
        _dividend_lws_local_work_k2[2] = 2;
        }
                err = DIVIDEND_CL_WRAP(clEnqueueNDRangeKernel)(cmd_queue, kernel2, 2, NULL, global_work, _dividend_lws_local_work_k2, 0, 0, 0);
		if(err != CL_SUCCESS) { printf("ERROR: 2 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
	}
    
    // Lingjie Zhang modified at Nov 1, 2015
    //	clFinish(cmd_queue);
    //	fflush(stdout);
	//end Lingjie Zhang modification

    //DIVIDEND_CL_WRAP(clFinish)(cmd_queue);
    err = clEnqueueReadBuffer(cmd_queue, input_itemsets_d, 1, 0, max_cols * max_rows * sizeof(int), output_itemsets, 0, 0, 0);
    DIVIDEND_CL_WRAP(clFinish)(cmd_queue);

//#define TRACEBACK	
#ifdef TRACEBACK
	
	FILE *fpo = fopen("result.txt","w");
	fprintf(fpo, "print traceback value GPU:\n");
    
	for (int i = max_rows - 2,  j = max_rows - 2; i>=0, j>=0;){
		int nw, n, w, traceback;
		if ( i == max_rows - 2 && j == max_rows - 2 )
			fprintf(fpo, "%d ", output_itemsets[ i * max_cols + j]); //print the first element
		if ( i == 0 && j == 0 )
           break;
		if ( i > 0 && j > 0 ){
			nw = output_itemsets[(i - 1) * max_cols + j - 1];
		    w  = output_itemsets[ i * max_cols + j - 1 ];
            n  = output_itemsets[(i - 1) * max_cols + j];
		}
		else if ( i == 0 ){
		    nw = n = LIMIT;
		    w  = output_itemsets[ i * max_cols + j - 1 ];
		}
		else if ( j == 0 ){
		    nw = w = LIMIT;
            n  = output_itemsets[(i - 1) * max_cols + j];
		}
		else{
		}

		//traceback = maximum(nw, w, n);
		int new_nw, new_w, new_n;
		new_nw = nw + reference[i * max_cols + j];
		new_w = w - penalty;
		new_n = n - penalty;
		
		traceback = maximum(new_nw, new_w, new_n);
		if(traceback == new_nw)
			traceback = nw;
		if(traceback == new_w)
			traceback = w;
		if(traceback == new_n)
            traceback = n;
			
		fprintf(fpo, "%d ", traceback);

		if(traceback == nw )
		{i--; j--; continue;}

        else if(traceback == w )
		{j--; continue;}

        else if(traceback == n )
		{i--; continue;}

		else
		;
	}
	
	fclose(fpo);

#endif

	printf("Computation Done\n");
    // OpenCL shutdown
	if(shutdown()) return -1;

	clReleaseMemObject(input_itemsets_d);
	clReleaseMemObject(output_itemsets_d);
	clReleaseMemObject(reference_d);

	free(reference);
	free(input_itemsets);
	free(output_itemsets);
	
}
int main(int argc, char const *argv[])
{
        /* Get platform */
        cl_platform_id platform;
        cl_uint num_platforms;
        cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformIDs' failed\n");
                exit(1);
        }
        
        printf("Number of platforms: %d\n", num_platforms);
        printf("platform=%p\n", platform);
        
        /* Get platform name */
        char platform_name[100];
        ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformInfo' failed\n");
                exit(1);
        }
        
        printf("platform.name='%s'\n\n", platform_name);
        
        /* Get device */
        cl_device_id device;
        cl_uint num_devices;
        ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceIDs' failed\n");
                exit(1);
        }
        
        printf("Number of devices: %d\n", num_devices);
        printf("device=%p\n", device);
        
        /* Get device name */
        char device_name[100];
        ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
        device_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceInfo' failed\n");
                exit(1);
        }
        
        printf("device.name='%s'\n", device_name);
        printf("\n");
        
        /* Create a Context Object */
        cl_context context;
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateContext' failed\n");
                exit(1);
        }
        
        printf("context=%p\n", context);
        
        /* Create a Command Queue Object*/
        cl_command_queue command_queue;
        command_queue = clCreateCommandQueue(context, device, 0, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateCommandQueue' failed\n");
                exit(1);
        }
        
        printf("command_queue=%p\n", command_queue);
        printf("\n");

        /* Program source */
        unsigned char *source_code;
        size_t source_length;

        /* Read program from 'abs_ulong4.cl' */
        source_code = read_buffer("abs_ulong4.cl", &source_length);

        /* Create a program */
        cl_program program;
        program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret);

        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateProgramWithSource' failed\n");
                exit(1);
        }
        printf("program=%p\n", program);

        /* Build program */
        ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
        if (ret != CL_SUCCESS )
        {
                size_t size;
                char *log;

                /* Get log size */
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size);

                /* Allocate log and print */
                log = malloc(size);
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL);
                printf("error: call to 'clBuildProgram' failed:\n%s\n", log);
                
                /* Free log and exit */
                free(log);
                exit(1);
        }

        printf("program built\n");
        printf("\n");
        
        /* Create a Kernel Object */
        cl_kernel kernel;
        kernel = clCreateKernel(program, "abs_ulong4", &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateKernel' failed\n");
                exit(1);
        }
        
        /* Create and allocate host buffers */
        size_t num_elem = 10;
        
        /* Create and init host side src buffer 0 */
        cl_ulong4 *src_0_host_buffer;
        src_0_host_buffer = malloc(num_elem * sizeof(cl_ulong4));
        for (int i = 0; i < num_elem; i++)
                src_0_host_buffer[i] = (cl_ulong4){{2, 2, 2, 2}};
        
        /* Create and init device side src buffer 0 */
        cl_mem src_0_device_buffer;
        src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_ulong4), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_ulong4), src_0_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create host dst buffer */
        cl_ulong4 *dst_host_buffer;
        dst_host_buffer = malloc(num_elem * sizeof(cl_ulong4));
        memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_ulong4));

        /* Create device dst buffer */
        cl_mem dst_device_buffer;
        dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_ulong4), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create dst buffer\n");
                exit(1);
        }
        
        /* Set kernel arguments */
        ret = CL_SUCCESS;
        ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer);
        ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clSetKernelArg' failed\n");
                exit(1);
        }

        /* Launch the kernel */
        size_t global_work_size = num_elem;
        size_t local_work_size = num_elem;
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueNDRangeKernel' failed\n");
                exit(1);
        }

        /* Wait for it to finish */
        clFinish(command_queue);

        /* Read results from GPU */
        ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_ulong4), dst_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueReadBuffer' failed\n");
                exit(1);
        }

        /* Dump dst buffer to file */
        char dump_file[100];
        sprintf((char *)&dump_file, "%s.result", argv[0]);
        write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_ulong4));
        printf("Result dumped to %s\n", dump_file);
        /* Free host dst buffer */
        free(dst_host_buffer);

        /* Free device dst buffer */
        ret = clReleaseMemObject(dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }
        
        /* Free host side src buffer 0 */
        free(src_0_host_buffer);

        /* Free device side src buffer 0 */
        ret = clReleaseMemObject(src_0_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }

        /* Release kernel */
        ret = clReleaseKernel(kernel);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseKernel' failed\n");
                exit(1);
        }

        /* Release program */
        ret = clReleaseProgram(program);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseProgram' failed\n");
                exit(1);
        }
        
        /* Release command queue */
        ret = clReleaseCommandQueue(command_queue);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseCommandQueue' failed\n");
                exit(1);
        }
        
        /* Release context */
        ret = clReleaseContext(context);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseContext' failed\n");
                exit(1);
        }
                
        return 0;
}
Example #4
0
static bool opencl_thread_init(struct thr_info *thr)
{
	const int thr_id = thr->id;
	struct cgpu_info *gpu = thr->cgpu;
	struct opencl_thread_data *thrdata;
	_clState *clState = clStates[thr_id];
	cl_int status = 0;
	thrdata = calloc(1, sizeof(*thrdata));
	thr->cgpu_data = thrdata;
	int buffersize = opt_scrypt ? SCRYPT_BUFFERSIZE : BUFFERSIZE;
	if (opt_neoscrypt) {
		buffersize = opt_neoscrypt ? SCRYPT_BUFFERSIZE : BUFFERSIZE;
	}

	if (!thrdata) {
		applog(LOG_ERR, "Failed to calloc in opencl_thread_init");
		return false;
	}

	switch (clState->chosen_kernel) {
		case KL_POCLBM:
			thrdata->queue_kernel_parameters = &queue_poclbm_kernel;
			break;
		case KL_PHATK:
			thrdata->queue_kernel_parameters = &queue_phatk_kernel;
			break;
		case KL_DIAKGCN:
			thrdata->queue_kernel_parameters = &queue_diakgcn_kernel;
			break;
#ifdef USE_SCRYPT
		case KL_SCRYPT:
			thrdata->queue_kernel_parameters = &queue_scrypt_kernel;
			break;
#endif
#ifdef USE_NEOSCRYPT
		case KL_NEOSCRYPT:
			thrdata->queue_kernel_parameters = &queue_neoscrypt_kernel;
			break;
#endif
#ifdef USE_KECCAK
		case KL_KECCAK:
			thrdata->queue_kernel_parameters = &queue_keccak_kernel;
			break;
#endif
		default:
		case KL_DIABLO:
			thrdata->queue_kernel_parameters = &queue_diablo_kernel;
			break;
	}

	thrdata->res = calloc(buffersize, 1);

	if (!thrdata->res) {
		free(thrdata);
		applog(LOG_ERR, "Failed to calloc in opencl_thread_init");
		return false;
	}

	status |= clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_TRUE, 0,
				       buffersize, blank_res, 0, NULL, NULL);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed.");
		return false;
	}

	gpu->status = LIFE_WELL;

	gpu->device_last_well = time(NULL);

	return true;
}
Example #5
0
int main(int argc, char *argv[])
{
  std::string vvadd_kernel_str;

  /* Provide names of the OpenCL kernels
   * and cl file that they're kept in */
  std::string vvadd_name_str = 
    std::string("vvadd");
  std::string vvadd_kernel_file = 
    std::string("vvadd.cl");

  cl_vars_t cv; 
  cl_kernel vvadd;

  /* Read OpenCL file into STL string */
  readFile(vvadd_kernel_file,
	   vvadd_kernel_str);
  
  /* Initialize the OpenCL runtime 
   * Source in clhelp.cpp */
  initialize_ocl(cv);
  
  /* Compile all OpenCL kernels */
  compile_ocl_program(vvadd, cv, vvadd_kernel_str.c_str(),
		      vvadd_name_str.c_str());
  
  /* Arrays on the host (CPU) */
  float *h_A, *h_B, *h_Y;
  /* Arrays on the device (GPU) */
  cl_mem g_A, g_B, g_Y;

  /* Allocate arrays on the host
   * and fill with random data */
  int n = (1<<20);
  h_A = new float[n];
  h_B = new float[n];
  h_Y = new float[n];
  bzero(h_Y, sizeof(float)*n);
  
  for(int i = 0; i < n; i++)
    {
      h_A[i] = (float)drand48();
      h_B[i] = (float)drand48();
    }

  /* CS194: Allocate memory for arrays on 
   * the GPU */
  cl_int err = CL_SUCCESS;
  
  /* CS194: Here's something to get you started  */
  // creates memory on the device to hold the A and B source arrays, plus the results array Y.
  g_Y = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err);
  CHK_ERR(err);
  g_A = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err);
  CHK_ERR(err);
  g_B = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err);
  CHK_ERR(err);
  

  /* CS194: Copy data from host CPU to GPU */
  // copies the host array A and B to the device.
  err = clEnqueueWriteBuffer(cv.commands, g_A, true, 0, sizeof(float)*n,
			     h_A, 0, NULL, NULL);
  CHK_ERR(err);
  err = clEnqueueWriteBuffer(cv.commands, g_B, true, 0, sizeof(float)*n,
			     h_B, 0, NULL, NULL);
  CHK_ERR(err);
  
  /* CS194: Define the global and local workgroup sizes */
  size_t global_work_size[1] = {n};
  size_t local_work_size[1] = {128};
  
  /* CS194: Set Kernel Arguments */
  err = clSetKernelArg(vvadd, 0, sizeof(cl_mem), &g_Y);
  CHK_ERR(err);
  err = clSetKernelArg(vvadd, 1, sizeof(cl_mem), &g_A);
  CHK_ERR(err);
  err = clSetKernelArg(vvadd, 2, sizeof(cl_mem), &g_B);
  CHK_ERR(err);
  err = clSetKernelArg(vvadd, 3, sizeof(int), &n);
  CHK_ERR(err);
  
  /* CS194: Call kernel on the GPU */
  err = clEnqueueNDRangeKernel(cv.commands,
			       vvadd,
			       1,//work_dim,
			       NULL, //global_work_offset
			       global_work_size, //global_work_size
			       local_work_size, //local_work_size
			       0, //num_events_in_wait_list
			       NULL, //event_wait_list
			       NULL //
			       );
  CHK_ERR(err);
  
  /* Read result of GPU on host CPU */
  // copies the result array Y from the device back to the host Y.
  err = clEnqueueReadBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n,
			    h_Y, 0, NULL, NULL);
  CHK_ERR(err);

  /* Check answer */
  for(int i = 0; i < n; i++)
    {
      float d = h_A[i] + h_B[i];
      if(h_Y[i] != d)
	{
	  printf("error at %d :(\n", i);
	  break;
	}
    }

  /* Shut down the OpenCL runtime */
  uninitialize_ocl(cv);
  
  delete [] h_A; 
  delete [] h_B; 
  delete [] h_Y;
  
  // frees memory allocated on device
  clReleaseMemObject(g_A); 
  clReleaseMemObject(g_B); 
  clReleaseMemObject(g_Y);
  
  return 0;
}
Example #6
0
int main(int argc, char* argv[])
{
	int ciErrNum = 0;
	
	printf("press a key to start\n");
	getchar();

	const char* vendorSDK = btOpenCLUtils::getSdkVendorName();
	printf("This program was compiled using the %s OpenCL SDK\n",vendorSDK);

	cl_device_type  deviceType = CL_DEVICE_TYPE_GPU;//CL_DEVICE_TYPE_ALL
	
	void* glCtx=0;
	void* glDC = 0;
	printf("Initialize OpenCL using btOpenCLUtils::createContextFromType for CL_DEVICE_TYPE_GPU\n");
	g_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	int numDev = btOpenCLUtils::getNumDevices(g_cxMainContext);

	if (numDev>0)
	{
		int deviceIndex=0;

		cl_device_id		device;
		device = btOpenCLUtils::getDevice(g_cxMainContext,deviceIndex);
		btOpenCLDeviceInfo clInfo;
		btOpenCLUtils::getDeviceInfo(device,clInfo);
		btOpenCLUtils::printDeviceInfo(device);


		const char* globalAtomicsKernelStringPatched = globalAtomicsKernelString;
		if (!strstr(clInfo.m_deviceExtensions,"cl_ext_atomic_counters_32"))
		{
			globalAtomicsKernelStringPatched = findAndReplace(globalAtomicsKernelString,"counter32_t", "volatile __global int*");
		}

		

		// create a command-queue
		g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, device, 0, &ciErrNum);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
		
		cl_mem counterBuffer = clCreateBuffer(g_cxMainContext, CL_MEM_READ_WRITE, sizeof(int), NULL, &ciErrNum);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);

		char* kernelMethods[] = 
		{
			"globalAtomicKernelOpenCL1_1",
			"counterAtomicKernelExt",
			"globalAtomicKernelExt",
			"globalAtomicKernelCounters32Broken"
		};
		int numKernelMethods = sizeof(kernelMethods)/sizeof(char*);

		for (int i=0;i<numKernelMethods;i++)
		{
			int myCounter = 0;

			//write to counterBuffer
			int deviceOffset=0;
			int hostOffset=0;

			ciErrNum = clEnqueueWriteBuffer(g_cqCommandQue, counterBuffer,CL_FALSE, deviceOffset, sizeof(int), &myCounter, 0, NULL, NULL);
			oclCHECKERROR(ciErrNum, CL_SUCCESS);

			g_atomicsKernel = btOpenCLUtils::compileCLKernelFromString(g_cxMainContext,device,globalAtomicsKernelStringPatched,kernelMethods[i], &ciErrNum);
			oclCHECKERROR(ciErrNum, CL_SUCCESS);

		


			ciErrNum = clSetKernelArg(g_atomicsKernel, 0, sizeof(cl_mem),(void*)&counterBuffer);
			oclCHECKERROR(ciErrNum, CL_SUCCESS);

			size_t	numWorkItems = workGroupSize*((NUM_OBJECTS + (workGroupSize-1)) / workGroupSize);
			ciErrNum = clEnqueueNDRangeKernel(g_cqCommandQue, g_atomicsKernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0);
			oclCHECKERROR(ciErrNum, CL_SUCCESS);
			
			clFinish(g_cqCommandQue);
			oclCHECKERROR(ciErrNum, CL_SUCCESS);

			//read from counterBuffer
			ciErrNum = clEnqueueReadBuffer(g_cqCommandQue, counterBuffer, CL_TRUE, deviceOffset, sizeof(int), &myCounter, 0, NULL, NULL);
			 oclCHECKERROR(ciErrNum, CL_SUCCESS);

			 if (myCounter != NUM_OBJECTS)
			 {
				 printf("%s is broken, expected %d got %d\n",kernelMethods[i],NUM_OBJECTS,myCounter);
			 } else
			 {
				 printf("%s success, got %d\n",kernelMethods[i],myCounter);
			 }
		}

		clReleaseCommandQueue(g_cqCommandQue);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
	}

	clReleaseContext(g_cxMainContext);
	
	printf("press a key to end\n");
	getchar();

	return 0;
}
void cl_launch_kernel()
{
	double t_start, t_end;

	int m = M;
	int n = N;

	DATA_TYPE float_n = FLOAT_N;
	DATA_TYPE eps = EPS;

	DATA_TYPE val = 1.0;

	size_t localWorkSize_Kernel1[2], globalWorkSize_Kernel1[2];
	size_t localWorkSize_Kernel2[2], globalWorkSize_Kernel2[2];
	size_t localWorkSize_Kernel3[2], globalWorkSize_Kernel3[2];
	size_t localWorkSize_Kernel4[2], globalWorkSize_Kernel4[2];

	localWorkSize_Kernel1[0] = LWS_KERNEL_1_X;
	localWorkSize_Kernel1[1] = LWS_KERNEL_1_Y;
	globalWorkSize_Kernel1[0] = (size_t)ceil(((float)M) / ((float)LWS_KERNEL_1_X)) * LWS_KERNEL_1_X;
	globalWorkSize_Kernel1[1] = 1;

	localWorkSize_Kernel2[0] = LWS_KERNEL_2_X;
	localWorkSize_Kernel2[1] = LWS_KERNEL_2_Y;
	globalWorkSize_Kernel2[0] = (size_t)ceil(((float)M) / ((float)LWS_KERNEL_2_X)) * LWS_KERNEL_2_X;
	globalWorkSize_Kernel2[1] = 1;

	localWorkSize_Kernel3[0] = LWS_KERNEL_3_X;
	localWorkSize_Kernel3[1] = LWS_KERNEL_3_Y;
	globalWorkSize_Kernel3[0] = (size_t)ceil(((float)M) / ((float)LWS_KERNEL_3_X)) * LWS_KERNEL_3_X;
	globalWorkSize_Kernel3[1] = (size_t)ceil(((float)N) / ((float)LWS_KERNEL_3_Y)) * LWS_KERNEL_3_Y;

	localWorkSize_Kernel4[0] = LWS_KERNEL_4_X;
	localWorkSize_Kernel4[1] = LWS_KERNEL_4_Y;
	globalWorkSize_Kernel4[0] = (size_t)ceil(((float)M) / ((float)LWS_KERNEL_4_X)) * LWS_KERNEL_4_X;
	globalWorkSize_Kernel4[1] = 1;


//	t_start = rtclock();	
	
	// Set the arguments of the kernel
	err_code =  clSetKernelArg(clKernel_mean, 0, sizeof(cl_mem), (void *)&mean_mem_obj);
	err_code |= clSetKernelArg(clKernel_mean, 1, sizeof(cl_mem), (void *)&data_mem_obj);
	err_code |= clSetKernelArg(clKernel_mean, 2, sizeof(DATA_TYPE), (void *)&float_n);
	err_code |= clSetKernelArg(clKernel_mean, 3, sizeof(int), (void *)&m);
	err_code |= clSetKernelArg(clKernel_mean, 4, sizeof(int), (void *)&n);
	if(err_code != CL_SUCCESS)
        {
          printf("Error in seting arguments1\n");
          exit(1);
        }

	// Execute the OpenCL kernel
	err_code = clEnqueueNDRangeKernel(clCommandQue, clKernel_mean, 1, NULL, globalWorkSize_Kernel1, localWorkSize_Kernel1, 0, NULL, NULL);
	if(err_code != CL_SUCCESS)
        {
          printf("Error in launching kernel1\n");
          exit(1);
        }

	clEnqueueBarrier(clCommandQue);

	// Set the arguments of the kernel
	err_code =  clSetKernelArg(clKernel_std, 0, sizeof(cl_mem), (void *)&mean_mem_obj);
	err_code =  clSetKernelArg(clKernel_std, 1, sizeof(cl_mem), (void *)&stddev_mem_obj);
	err_code |= clSetKernelArg(clKernel_std, 2, sizeof(cl_mem), (void *)&data_mem_obj);
	err_code |= clSetKernelArg(clKernel_std, 3, sizeof(DATA_TYPE), (void *)&float_n);
	err_code |= clSetKernelArg(clKernel_std, 4, sizeof(DATA_TYPE), (void *)&eps);
	err_code |= clSetKernelArg(clKernel_std, 5, sizeof(int), (void *)&m);
	err_code |= clSetKernelArg(clKernel_std, 6, sizeof(int), (void *)&n);
	if(err_code != CL_SUCCESS)
        {
          printf("Error in seting arguments2\n");
          exit(1);
        }
 
	// Execute the OpenCL kernel
	err_code = clEnqueueNDRangeKernel(clCommandQue, clKernel_std, 1, NULL, globalWorkSize_Kernel2, localWorkSize_Kernel2, 0, NULL, NULL);
	if(err_code != CL_SUCCESS)
        {
          printf("Error in launching kernel2\n");
          exit(1);
        }

	clEnqueueBarrier(clCommandQue);

	// Set the arguments of the kernel
	err_code =  clSetKernelArg(clKernel_reduce, 0, sizeof(cl_mem), (void *)&mean_mem_obj);
	err_code =  clSetKernelArg(clKernel_reduce, 1, sizeof(cl_mem), (void *)&stddev_mem_obj);
	err_code |= clSetKernelArg(clKernel_reduce, 2, sizeof(cl_mem), (void *)&data_mem_obj);
	err_code |= clSetKernelArg(clKernel_reduce, 3, sizeof(DATA_TYPE), (void *)&float_n);
	err_code |= clSetKernelArg(clKernel_reduce, 4, sizeof(int), (void *)&m);
	err_code |= clSetKernelArg(clKernel_reduce, 5, sizeof(int), (void *)&n);
	if(err_code != CL_SUCCESS) 
        {
          printf("Error in seting arguments3\n");
          exit(1);
        }
 
	// Execute the OpenCL kernel
	err_code = clEnqueueNDRangeKernel(clCommandQue, clKernel_reduce, 2, NULL, globalWorkSize_Kernel3, localWorkSize_Kernel3, 0, NULL, NULL);
	if(err_code != CL_SUCCESS)
        {
          printf("Error in launching kernel3\n");
          exit(1);
        }

	clEnqueueBarrier(clCommandQue);

	// Set the arguments of the kernel	
	err_code =  clSetKernelArg(clKernel_corr, 0, sizeof(cl_mem), (void *)&symmat_mem_obj);
	err_code |= clSetKernelArg(clKernel_corr, 1, sizeof(cl_mem), (void *)&data_mem_obj);
	err_code |= clSetKernelArg(clKernel_corr, 2, sizeof(int), (void *)&m);
	err_code |= clSetKernelArg(clKernel_corr, 3, sizeof(int), (void *)&n);
	if(err_code != CL_SUCCESS)
        {
          printf("Error in seting arguments4\n");
          exit(1);
        }

	// Execute the OpenCL kernel
	err_code = clEnqueueNDRangeKernel(clCommandQue, clKernel_corr, 1, NULL, globalWorkSize_Kernel4, localWorkSize_Kernel4, 0, NULL, NULL);
	if(err_code != CL_SUCCESS)
        {
          printf("Error in launching kernel4\n");
          exit(1);
        }

	clEnqueueBarrier(clCommandQue);

	clEnqueueWriteBuffer(clCommandQue, symmat_mem_obj, CL_TRUE, ((M)*(M+1) + (M))*sizeof(DATA_TYPE), sizeof(DATA_TYPE), &val, 0, NULL, NULL);

	clFinish(clCommandQue);

//	t_end = rtclock();
//	fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start);
}
Example #8
0
compute::buffer cape::fighter_to_fixed_vec(vec3f p1, vec3f p2, vec3f p3, vec3f rot)
{
    vec3f rotation = rot;

    vec3f diff = p3 - p1;

    float shrink = 0.12f;

    diff = diff * shrink;

    p3 = p3 - diff;
    p1 = p1 + diff;

    vec3f lpos = p1;
    vec3f rpos = p3;

    ///approximation
    ///could also use body scaling
    float ldepth = (p3 - p1).length() / 3.f;
    float rdepth = ldepth;
    ///we should move perpendicularly away, not zdistance away

    vec2f ldir = {p3.v[0], p3.v[2]};

    ldir = ldir - (vec2f){p1.v[0], p1.v[2]};

    vec2f perp = perpendicular(ldir.norm());

    vec3f perp3 = {perp.v[0], 0.f, perp.v[1]};

    lpos = lpos + perp3 * ldepth;
    rpos = rpos + perp3 * ldepth;

    lpos.v[1] += bodypart::scale / 4;
    rpos.v[1] += bodypart::scale / 4;

    ///dir could also just be (p3 - p1).rot ???
    vec3f dir = rpos - lpos;

    int len = width;

    vec3f step = dir / (float)(len - 1);

    vec3f current = lpos;

    compute::buffer buf = compute::buffer(cl::context, sizeof(float)*width*3, CL_MEM_READ_WRITE, nullptr);

    if(!cape_init)
    {
        gpu_cape.resize(width * 3);
        cape_init = true;
    }

    //cl_float* mem_map = (cl_float*) clEnqueueMapBuffer(cl::cqueue.get(), buf.get(), CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, sizeof(cl_float)*width*3, 0, NULL, NULL, NULL);

    float sag = bodypart::scale/32.f;

    //sag = 0;

    for(int i=0; i<len; i++)
    {
        float xf = (float)i / len;

        float yval = 4 * xf * (xf - 1) * sag + sin(xf * 30);

        /*mem_map[i*3 + 0] = current.v[0];
        mem_map[i*3 + 1] = current.v[1] + yval;
        mem_map[i*3 + 2] = current.v[2];*/

        gpu_cape[i*3 + 0] = current.v[0];
        gpu_cape[i*3 + 1] = current.v[1] + yval;
        gpu_cape[i*3 + 2] = current.v[2];

        current = current + step;
    }

    clEnqueueWriteBuffer(cl::cqueue.get(), buf.get(), CL_FALSE, 0, sizeof(cl_float) * width * 3, gpu_cape.data(), 0, NULL, NULL);

    //clEnqueueUnmapMemObject(cl::cqueue.get(), buf.get(), mem_map, 0, NULL, NULL);

    return buf;
}
Example #9
0
void init_cl_radix_sort(
		int nkeys){



	cl_int err;


	cl_int status;

	/**/



	nkeys_rounded=nkeys;
	// check some conditions
	assert(_TOTALBITS % _BITS == 0);
	assert(nkeys % (_GROUPS * _ITEMS) == 0);
	assert( (_GROUPS * _ITEMS * _RADIX) % _HISTOSPLIT == 0);
	assert(pow(2,(int) log2(_GROUPS)) == _GROUPS);
	assert(pow(2,(int) log2(_ITEMS)) == _ITEMS);

	// init the timers
	histo_time=0;
	scan_time=0;
	reorder_time=0;
	transpose_time=0;










	//printf("Construct the random list\n");
	// construction of a random list
	uint maxint=_MAXINT;
	assert(_MAXINT != 0);


	h_checkKeys = (uint*)malloc(sizeof(uint)*nkeys);
	h_Permut = (uint*)malloc(sizeof(uint)*nkeys);
	// construction of the initial permutation
	for(uint i = 0; i < nkeys; i++){

		//printf("%d, ",i);
		h_Permut[i] = i;
		h_checkKeys[i]=h_keys[i];

	}


	printf("Send to the GPU\n");
	// copy on the GPU
	d_inKeys  = clCreateBuffer(context,
			CL_MEM_READ_WRITE,
			sizeof(uint)* nkeys ,
			NULL,
			&err);
	assert(err == CL_SUCCESS);

	d_outKeys  = clCreateBuffer(context,
			CL_MEM_READ_WRITE,
			sizeof(uint)* nkeys ,
			NULL,
			&err);
	assert(err == CL_SUCCESS);

	d_inPermut  = clCreateBuffer(context,
			CL_MEM_READ_WRITE,
			sizeof(uint)* nkeys ,
			NULL,
			&err);
	assert(err == CL_SUCCESS);

	d_outPermut  = clCreateBuffer(context,
			CL_MEM_READ_WRITE,
			sizeof(uint)* nkeys ,
			NULL,
			&err);
	assert(err == CL_SUCCESS);



	////////////////////////////////////////////////////////////////////////////////
	//copy the two previous vectors to the device
	//cl_radix_host2gpu();
	////////////////////////////////////////////////////////////////////////////////
	status = clEnqueueWriteBuffer( command_que,
			d_inKeys,
			CL_TRUE, 0,
			sizeof(uint)  * nkeys,
			h_keys,
			0, NULL, NULL );



	if(status == CL_INVALID_COMMAND_QUEUE){
		printf("if command_queue is not a valid command-queue.1 \n");

	}else if(status == CL_INVALID_CONTEXT){
		printf("if command_queue is not a valid command-queue.2 \n");
	}else if(status == CL_INVALID_MEM_OBJECT){
		printf("if command_queue is not a valid command-queue.3 \n");
	}else if(status == CL_INVALID_VALUE){
		printf("if command_queue is not a valid command-queue.4 \n");
	}else if(status == CL_INVALID_EVENT_WAIT_LIST){
		printf("if command_queue is not a valid command-queue.5 \n");
	}else if(status == CL_MISALIGNED_SUB_BUFFER_OFFSET){
		printf("if command_queue is not a valid command-queue. 6\n");
	}else if(status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST){
		printf("if command_queue is not a valid command-queue.7 \n");
	}else if(status == CL_MEM_OBJECT_ALLOCATION_FAILURE){
		printf("if command_queue is not a valid command-queue.8 \n");
	}else if(status == CL_OUT_OF_RESOURCES){
		printf("if command_queue is not a valid command-queue. 9\n");
	}else if(status == CL_OUT_OF_HOST_MEMORY){
		printf("if command_queue is not a valid command-queue.10 \n");
	}

	assert (status == CL_SUCCESS);
	clFinish(command_que);  // wait end of read

	status = clEnqueueWriteBuffer( command_que,
			d_inPermut,
			CL_TRUE, 0,
			sizeof(uint)  * nkeys,
			h_Permut,
			0, NULL, NULL );

	assert (status == CL_SUCCESS);
	clFinish(command_que);  // wait end of read
	////////////////////////////////////////////////////////////////////////////////
	////////////////////////////////////////////////////////////////////////////////







	// allocate the histogram on the GPU
	d_Histograms  = clCreateBuffer(context,
			CL_MEM_READ_WRITE,
			sizeof(uint)* _RADIX * _GROUPS * _ITEMS,
			NULL,
			&err);
	assert(err == CL_SUCCESS);


	// allocate the auxiliary histogram on GPU
	d_globsum  = clCreateBuffer(context,
			CL_MEM_READ_WRITE,
			sizeof(uint)* _HISTOSPLIT,
			NULL,
			&err);
	assert(err == CL_SUCCESS);

	// temporary vector when the sum is not needed
	d_temp  = clCreateBuffer(context,
			CL_MEM_READ_WRITE,
			sizeof(uint)* _HISTOSPLIT,
			NULL,
			&err);
	assert(err == CL_SUCCESS);

	cl_radix_resize(nkeys);


	// we set here the fixed arguments of the OpenCL kernels
	// the changing arguments are modified elsewhere in the class

	//void histogram(const __global int* d_Keys,__global int* d_Histograms,
	//	const int pass,	__local int* loc_histo,	const int n)
	err = clSetKernelArg(ckHistogram, 1, sizeof(cl_mem), &d_Histograms);
	assert(err == CL_SUCCESS);
	err = clSetKernelArg(ckHistogram, 3, sizeof(uint)*_RADIX*_ITEMS, NULL);
	assert(err == CL_SUCCESS);



	// err = clSetKernelArg(ckHistogram, 3, sizeof(uint)*_ITEMS, NULL);
	// assert(err == CL_SUCCESS);

	err = clSetKernelArg(ckPasteHistogram, 0, sizeof(cl_mem), &d_Histograms);
	assert(err == CL_SUCCESS);

	err = clSetKernelArg(ckPasteHistogram, 1, sizeof(cl_mem), &d_globsum);
	assert(err == CL_SUCCESS);

	err = clSetKernelArg(ckReorder, 2, sizeof(cl_mem), &d_Histograms);
	assert(err == CL_SUCCESS);

	err  = clSetKernelArg(ckReorder, 6,
			sizeof(uint)* _RADIX * _ITEMS ,
			NULL); // mem cache
	assert(err == CL_SUCCESS);


}
Example #10
0
void * materializeCol(struct materializeNode * mn, struct clContext * context, struct statistic * pp){

	struct timespec start,end;
        clock_gettime(CLOCK_REALTIME,&start);

	cl_event ndrEvt;
	cl_ulong startTime, endTime;

	struct tableNode *tn = mn->table;
	char * res;
	cl_mem gpuResult;
	cl_mem gpuAttrSize;

	long totalSize = tn->tupleNum * tn->tupleSize;

	cl_int error = 0;

	cl_mem gpuContent = clCreateBuffer(context->context, CL_MEM_READ_ONLY, totalSize, NULL, &error);
	gpuResult = clCreateBuffer(context->context, CL_MEM_READ_WRITE, totalSize, NULL, &error);
	gpuAttrSize = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(int)*tn->totalAttr,NULL,&error);
	clEnqueueWriteBuffer(context->queue,gpuAttrSize,CL_TRUE,0,sizeof(int)*tn->totalAttr,tn->attrSize,0,0,&ndrEvt);

	clWaitForEvents(1, &ndrEvt);
	clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
	clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
	pp->pcie += 1e-6 * (endTime - startTime);

	res = (char *) malloc(totalSize);

	long offset = 0;
	long *colOffset = (long*)malloc(sizeof(long)*tn->totalAttr);

	for(int i=0;i<tn->totalAttr;i++){
		colOffset[i] = offset;
		int size = tn->tupleNum * tn->attrSize[i]; 

		if(tn->dataPos[i] == MEM){
			clEnqueueWriteBuffer(context->queue,gpuContent,CL_TRUE,offset,size,tn->content[i],0,0,&ndrEvt);

			clWaitForEvents(1, &ndrEvt);
			clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
			clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
			pp->pcie += 1e-6 * (endTime - startTime);
		}else
			clEnqueueCopyBuffer(context->queue,(cl_mem)tn->content[i],gpuContent,0,offset,size,0,0,0);
			
		offset += size;
	}

	cl_mem gpuColOffset = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(long)*tn->totalAttr,NULL,&error);
	clEnqueueWriteBuffer(context->queue,gpuColOffset,CL_TRUE,0,sizeof(long)*tn->totalAttr,colOffset,0,0,&ndrEvt);

	clWaitForEvents(1, &ndrEvt);
	clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
	clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
	pp->pcie += 1e-6 * (endTime - startTime);

	size_t globalSize = 512;
	size_t localSize = 128;

	context->kernel = clCreateKernel(context->program,"materialize",0);
	clSetKernelArg(context->kernel,0,sizeof(cl_mem), (void*)&gpuContent);
	clSetKernelArg(context->kernel,1,sizeof(cl_mem), (void*)&gpuColOffset);
	clSetKernelArg(context->kernel,2,sizeof(int), (void*)&tn->totalAttr);
	clSetKernelArg(context->kernel,3,sizeof(cl_mem), (void*)&gpuAttrSize);
	clSetKernelArg(context->kernel,4,sizeof(long), (void*)&tn->tupleNum);
	clSetKernelArg(context->kernel,5,sizeof(int), (void*)&tn->tupleSize);
	clSetKernelArg(context->kernel,6,sizeof(cl_mem), (void*)&gpuResult);

	clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,0);

	clEnqueueReadBuffer(context->queue,gpuResult,CL_TRUE,0,totalSize,res,0,0,&ndrEvt);
	clWaitForEvents(1, &ndrEvt);
	clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
	clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
	pp->pcie += 1e-6 * (endTime - startTime);

	free(colOffset);

	clFinish(context->queue);

	clReleaseMemObject(gpuColOffset);
	clReleaseMemObject(gpuContent);
	clReleaseMemObject(gpuAttrSize);
	clReleaseMemObject(gpuResult);


	clock_gettime(CLOCK_REALTIME,&end);
        double timeE = (end.tv_sec -  start.tv_sec)* BILLION + end.tv_nsec - start.tv_nsec;
        printf("Materialization Time: %lf\n", timeE/(1000*1000));
	return res;
}
Example #11
0
int main(int argc, char **argv)
{
    cl_int           err     = 0;
    cl_context       context = 0;
    cl_device_id *   devices = NULL;
    cl_command_queue queue   = 0;
    cl_program       program = 0;
    cl_mem           cl_a = 0, cl_b = 0, cl_res = 0;
    cl_kernel        adder = 0;
    cl_event         event;
    // The iteration variable
    int i;
    // Define our data set
    cl_float a[DATA_SIZE], b[DATA_SIZE], res[DATA_SIZE];

    // Initialize array
    srand(time(0));
    for (i = 0; i < DATA_SIZE; i++) {
        a[i]   = (rand() % 100) / 100.0;
        b[i]   = (rand() % 100) / 100.0;
        res[i] = 0;
    }

    check_release(get_cl_context(&context, &devices, 0) == false,
                  "Fail to create context");

    // Specify the queue to be profile-able
    queue = clCreateCommandQueue(context, devices[0], CL_QUEUE_PROFILING_ENABLE, 0);
    check_release(queue == NULL, "Can't create command queue");

    program = load_program(context, devices[0], "shader.cl");
    check_release(program == NULL, "Fail to build program");

    cl_a =
      clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float) * DATA_SIZE, NULL, NULL);
    cl_b =
      clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float) * DATA_SIZE, NULL, NULL);
    cl_res = clCreateBuffer(
      context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * DATA_SIZE, NULL, NULL);
    if (cl_a == 0 || cl_b == 0 || cl_res == 0) {
        printf("Can't create OpenCL buffer\n");
        goto release;
    }

    check_release(clEnqueueWriteBuffer(
                    queue, cl_a, CL_TRUE, 0, sizeof(cl_float) * DATA_SIZE, a, 0, 0, 0),
                  "Write Buffer 1");
    check_release(clEnqueueWriteBuffer(
                    queue, cl_b, CL_TRUE, 0, sizeof(cl_float) * DATA_SIZE, b, 0, 0, 0),
                  "Write Buffer 2");

    adder = clCreateKernel(program, "adder", &err);
    if (err == CL_INVALID_KERNEL_NAME) printf("CL_INVALID_KERNEL_NAME\n");
    check_release(adder == NULL, "Can't load kernel");

    clSetKernelArg(adder, 0, sizeof(cl_mem), &cl_a);
    clSetKernelArg(adder, 1, sizeof(cl_mem), &cl_b);
    clSetKernelArg(adder, 2, sizeof(cl_mem), &cl_res);

    size_t work_size = DATA_SIZE;

    check_release(clEnqueueNDRangeKernel(queue, adder, 1, 0, &work_size, 0, 0, 0, &event),
                  "Can't enqueue kernel");
    check_release(
      clEnqueueReadBuffer(
        queue, cl_res, CL_TRUE, 0, sizeof(cl_float) * DATA_SIZE, res, 0, 0, 0),
      "Can't enqueue read buffer");

    clWaitForEvents(1, &event);
    printf("Execution Time: %.04lf ms\n\n", get_event_exec_time(event));

    // Make sure everything is done before we do anything
    clFinish(queue);
    err = 0;
    for (i = 0; i < DATA_SIZE; i++) {
        if (res[i] != a[i] + b[i]) {
            printf("%f + %f = %f(answer %f)\n", a[i], b[i], res[i], a[i] + b[i]);
            err++;
        }
    }
    if (err == 0)
        printf("Validation passed\n");
    else
        printf("Validation failed\n");
    printf("------\n");

    //--------------------------------
    // Second test
    for (i = 0; i < DATA_SIZE; i++) {
        a[i]   = i;
        b[i]   = i;
        res[i] = 0;
    }

    check_err(clEnqueueWriteBuffer(
                queue, cl_a, CL_TRUE, 0, sizeof(cl_float) * DATA_SIZE, a, 0, 0, 0),
              "Write Buffer 1");
    check_err(clEnqueueWriteBuffer(
                queue, cl_b, CL_TRUE, 0, sizeof(cl_float) * DATA_SIZE, b, 0, 0, 0),
              "Write Buffer 2");

    check_err(clEnqueueNDRangeKernel(queue, adder, 1, 0, &work_size, 0, 0, 0, &event),
              "Can't enqueue kernel");
    check_err(clEnqueueReadBuffer(
                queue, cl_res, CL_TRUE, 0, sizeof(cl_float) * DATA_SIZE, res, 0, 0, 0),
              "Can't enqueue read buffer");

    clWaitForEvents(1, &event);
    printf("Execution Time: %.04lf ms\n\n", get_event_exec_time(event));

    // Make sure everything is done before we do anything
    clFinish(queue);
    err = 0;
    for (i = 0; i < DATA_SIZE; i++) {
        if (res[i] != a[i] + b[i]) {
            printf("%f + %f = %f(answer %f)\n", a[i], b[i], res[i], a[i] + b[i]);
            err++;
        }
    }
    if (err == 0)
        printf("Validation passed\n");
    else
        printf("Validation failed\n");

release:
    clReleaseKernel(adder);
    clReleaseProgram(program);
    clReleaseMemObject(cl_a);
    clReleaseMemObject(cl_b);
    clReleaseMemObject(cl_res);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);

    return 0;
}
// Main function
// *********************************************************************
int main(const int argc, const char** argv) 
{
    // start logs
    shrSetLogFileName ("oclDXTCompression.txt");
    shrLog(LOGBOTH, 0, "%s Starting...\n\n", argv[0]); 

    cl_context cxGPUContext;
    cl_command_queue cqCommandQueue;
    cl_program cpProgram;
    cl_kernel ckKernel;
    cl_mem cmMemObjs[3];
    size_t szGlobalWorkSize[1];
    size_t szLocalWorkSize[1];
    cl_int ciErrNum;

    // Get the path of the filename
    char *filename;
    if (shrGetCmdLineArgumentstr(argc, argv, "image", &filename)) {
        image_filename = filename;
    }
    // load image
    const char* image_path = shrFindFilePath(image_filename, argv[0]);
    shrCheckError(image_path != NULL, shrTRUE);
    shrLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height);
    shrCheckError(h_img != NULL, shrTRUE);
    shrLog(LOGBOTH, 0, "Loaded '%s', %d x %d pixels\n", image_path, width, height);

    // Convert linear image to block linear. 
    uint * block_image = (uint *) malloc(width * height * 4);

    // Convert linear image to block linear. 
    for(uint by = 0; by < height/4; by++) {
        for(uint bx = 0; bx < width/4; bx++) {
            for (int i = 0; i < 16; i++) {
                const int x = i & 3;
                const int y = i / 4;
                block_image[(by * width/4 + bx) * 16 + i] = 
                    ((uint *)h_img)[(by * 4 + y) * 4 * (width/4) + bx * 4 + x];
            }
        }
    }

    // create the OpenCL context on a GPU device
    cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    // get and log device
    cl_device_id device;
    if( shrCheckCmdLineFlag(argc, argv, "device") ) {
      int device_nr = 0;
      shrGetCmdLineArgumenti(argc, argv, "device", &device_nr);
      device = oclGetDev(cxGPUContext, device_nr);
    } else {
      device = oclGetMaxFlopsDev(cxGPUContext);
    }
    oclPrintDevInfo(LOGBOTH, device);

    // create a command-queue
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    // Memory Setup

    // Compute permutations.
    cl_uint permutations[1024];
    computePermutations(permutations);

    // Upload permutations.
    cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                  sizeof(cl_uint) * 1024, permutations, &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    // Image
    cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,
                                  sizeof(cl_uint) * width * height, NULL, &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);
    
    // Result
    const uint compressedSize = (width / 4) * (height / 4) * 8;

    cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY,
                                  compressedSize, NULL , &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);
    
    unsigned int * h_result = (uint *)malloc(compressedSize);

    // Program Setup
    size_t program_length;
    const char* source_path = shrFindFilePath("DXTCompression.cl", argv[0]);
    shrCheckError(source_path != NULL, shrTRUE);
    char *source = oclLoadProgSource(source_path, "", &program_length);
    shrCheckError(source != NULL, shrTRUE);

    // create the program
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
        (const char **) &source, &program_length, &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    // build the program
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-mad-enable", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLog(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDXTCompression.ptx");
        shrCheckError(ciErrNum, CL_SUCCESS); 
    }

    // create the kernel
    ckKernel = clCreateKernel(cpProgram, "compress", &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    // set the args values
    ciErrNum  = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void *) &cmMemObjs[0]);
    ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void *) &cmMemObjs[1]);
    ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void *) &cmMemObjs[2]);
    ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(float) * 4 * 16, NULL);
    ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(float) * 4 * 16, NULL);
    ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(int) * 64, NULL);
    ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(float) * 16 * 6, NULL);
    ciErrNum |= clSetKernelArg(ckKernel, 7, sizeof(unsigned int) * 160, NULL);
    ciErrNum |= clSetKernelArg(ckKernel, 8, sizeof(int) * 16, NULL);
    shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog(LOGBOTH, 0, "Running DXT Compression on %u x %u image...\n\n", width, height);

    // Upload the image
    clEnqueueWriteBuffer(cqCommandQueue, cmMemObjs[1], CL_FALSE, 0, sizeof(cl_uint) * width * height, block_image, 0,0,0);

    // set work-item dimensions
    szGlobalWorkSize[0] = width * height * (NUM_THREADS/16);
    szLocalWorkSize[0]= NUM_THREADS;
    
#ifdef GPU_PROFILING
    int numIterations = 100;
    for (int i = -1; i < numIterations; ++i) {
        if (i == 0) { // start timing only after the first warmup iteration
            clFinish(cqCommandQueue); // flush command queue
            shrDeltaT(0); // start timer
        }
#endif
        // execute kernel
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL,
                                          szGlobalWorkSize, szLocalWorkSize, 
                                          0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);
#ifdef GPU_PROFILING
    }
    clFinish(cqCommandQueue);
    double dAvgTime = shrDeltaT(0) / (double)numIterations;
    shrLog(LOGBOTH | MASTER, 0, "oclDXTCompression, Throughput = %.4f, Time = %.5f, Size = %u, NumDevsUsed = %i\n", 
        (1.0e-6 * (double)(width * height)/ dAvgTime), dAvgTime, (width * height), 1); 

#endif

    // blocking read output
    ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmMemObjs[2], CL_TRUE, 0,
                                   compressedSize, h_result, 0, NULL, NULL);
    shrCheckError(ciErrNum, CL_SUCCESS);

    // Write DDS file.
    FILE* fp = NULL;
    char output_filename[1024];
    #ifdef WIN32
        strcpy_s(output_filename, 1024, image_path);
        strcpy_s(output_filename + strlen(image_path) - 3, 1024 - strlen(image_path) + 3, "dds");
        fopen_s(&fp, output_filename, "wb");
    #else
        strcpy(output_filename, image_path);
        strcpy(output_filename + strlen(image_path) - 3, "dds");
        fp = fopen(output_filename, "wb");
    #endif
    shrCheckError(fp != NULL, shrTRUE);

    DDSHeader header;
    header.fourcc = FOURCC_DDS;
    header.size = 124;
    header.flags  = (DDSD_WIDTH|DDSD_HEIGHT|DDSD_CAPS|DDSD_PIXELFORMAT|DDSD_LINEARSIZE);
    header.height = height;
    header.width = width;
    header.pitch = compressedSize;
    header.depth = 0;
    header.mipmapcount = 0;
    memset(header.reserved, 0, sizeof(header.reserved));
    header.pf.size = 32;
    header.pf.flags = DDPF_FOURCC;
    header.pf.fourcc = FOURCC_DXT1;
    header.pf.bitcount = 0;
    header.pf.rmask = 0;
    header.pf.gmask = 0;
    header.pf.bmask = 0;
    header.pf.amask = 0;
    header.caps.caps1 = DDSCAPS_TEXTURE;
    header.caps.caps2 = 0;
    header.caps.caps3 = 0;
    header.caps.caps4 = 0;
    header.notused = 0;

    fwrite(&header, sizeof(DDSHeader), 1, fp);
    fwrite(h_result, compressedSize, 1, fp);

    fclose(fp);

    // Make sure the generated image matches the reference image (regression check)
    shrLog(LOGBOTH, 0, "\nComparing against Host/C++ computation...\n");     
    const char* reference_image_path = shrFindFilePath(refimage_filename, argv[0]);
    shrCheckError(reference_image_path != NULL, shrTRUE);

    // read in the reference image from file
    #ifdef WIN32
        fopen_s(&fp, reference_image_path, "rb");
    #else
        fp = fopen(reference_image_path, "rb");
    #endif
    shrCheckError(fp != NULL, shrTRUE);
    fseek(fp, sizeof(DDSHeader), SEEK_SET);
    uint referenceSize = (width / 4) * (height / 4) * 8;
    uint * reference = (uint *)malloc(referenceSize);
    fread(reference, referenceSize, 1, fp);
    fclose(fp);

    // compare the reference image data to the sample/generated image
    float rms = 0;
    for (uint y = 0; y < height; y += 4)
    {
        for (uint x = 0; x < width; x += 4)
        {
            // binary comparison of data
            uint referenceBlockIdx = ((y/4) * (width/4) + (x/4));
            uint resultBlockIdx = ((y/4) * (width/4) + (x/4));
            int cmp = compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx);

            // log deviations, if any
            if (cmp != 0.0f) 
            {
                compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx);
                shrLog(LOGBOTH, 0, "Deviation at (%d, %d):\t%f rms\n", x/4, y/4, float(cmp)/16/3);
            }
            rms += cmp;
        }
    }
    rms /= width * height * 3;
    shrLog(LOGBOTH, 0, "RMS(reference, result) = %f\n\n", rms);
    shrLog(LOGBOTH, 0, "TEST %s\n\n", (rms <= ERROR_THRESHOLD) ? "PASSED" : "FAILED !!!");

    // Free OpenCL resources
    oclDeleteMemObjs(cmMemObjs, 3);
    clReleaseKernel(ckKernel);
    clReleaseProgram(cpProgram);
    clReleaseCommandQueue(cqCommandQueue);
    clReleaseContext(cxGPUContext);

    // Free host memory
    free(source);
    free(h_img);

    // finish
    shrEXIT(argc, argv);
}
Example #13
0
int main(int argc, char** argv)
{
	double serial_time, openCL_time, start_time;
	cl_int err;
	cl_platform_id* platforms = NULL;
	char platform_name[1024];
	cl_device_id device_id = NULL;
	cl_uint	num_of_platforms = 0;
	cl_uint num_of_devices = 0;
	cl_context context;
	cl_kernel kernel;
	cl_command_queue command_queue;
	cl_program program;
	cl_mem input1, input2, input3, output;
	float **A, **B, **C, **serialC;	// matrices
	int d1, d2, d3;         // dimensions of matrices

							/* print user instruction */
	if (argc != 4)
	{
		printf("Matrix multiplication: C = A x B\n");
		printf("Usage: %s <NumRowA> <NumColA> <NumColB>\n", argv[0]);
		return 0;
	}

	/* read user input */
	d1 = 1000;		// rows of A and C
	d2 = 1000;     // cols of A and rows of B
	d3 = 1000;     // cols of B and C
	int d[4] = { 0, d1, d2, d3 };
	size_t global[1] = { (size_t)d1*d3 };

	printf("Matrix sizes C[%d][%d] = A[%d][%d] x B[%d][%d]\n", d1, d3, d1, d2, d2, d3);

	/* prepare matrices */
	A = alloc_mat(d1, d2);
	init_mat(A, d1, d2);
	B = alloc_mat(d2, d3);
	init_mat(B, d2, d3);
	C = alloc_mat(d1, d3);
	serialC = alloc_mat(d1, d3);

	err = clGetPlatformIDs(0, NULL, &num_of_platforms);
	if (err != CL_SUCCESS) {
		printf("No platforms found. Error: %d\n", err);
		return 0;
	}

	platforms = (cl_platform_id *)malloc(num_of_platforms);
	err = clGetPlatformIDs(num_of_platforms, platforms, NULL);
	if (err != CL_SUCCESS) {
		printf("No platforms found. Error: %d\n", err);
		return 0;
	}
	else {
		int nvidia_platform = 0;
		for (unsigned int i = 0; i<num_of_platforms; i++) {
			clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
			if (err != CL_SUCCESS) {
				printf("Could not get information about platform. Error: %d\n", err);
				return 0;
			}
			if (strstr(platform_name, "NVIDIA") != NULL) {
				nvidia_platform = i;
				break;
			}
		}
		err = clGetDeviceIDs(platforms[nvidia_platform], CL_DEVICE_TYPE_GPU, 1, &device_id, &num_of_devices);
		if (err != CL_SUCCESS) {
			printf("Could not get device in platform. Error: %d\n", err);
			return 0;
		}
	}

	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	if (err != CL_SUCCESS) {
		printf("Unable to create context. Error: %d\n", err);
		return 0;
	}

	command_queue = clCreateCommandQueue(context, device_id, 0, &err);
	if (err != CL_SUCCESS) {
		printf("Unable to create command queue. Error: %d\n", err);
		return 0;
	}

	program = clCreateProgramWithSource(context, 1, (const char **)&KernelSource, NULL, &err);
	if (err != CL_SUCCESS) {
		printf("Unable to create program. Error: %d\n", err);
		return 0;
	}

	if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS) {
		char *log;
		size_t size;
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &size); // 1. Länge des Logbuches?
		log = (char *)malloc(size + 1);
		if (log) {
			clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, size, log, NULL); // 2. Hole das Logbuch ab
			log[size] = '\0';
			printf("%s", log);
			free(log);
		}
		return 1;
	}


	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err != CL_SUCCESS) {
		printf("Error building program. Error: %d\n", err);
		return 0;
	}


	kernel = clCreateKernel(program, "matmult_ocl", &err);
	if (err != CL_SUCCESS) {
		printf("Error setting kernel. Error: %d\n", err);
		return 0;
	}

	input1 = clCreateBuffer(context, CL_MEM_READ_ONLY, d1*d2*sizeof(float), NULL, &err);
	input2 = clCreateBuffer(context, CL_MEM_READ_ONLY, d2*d3*sizeof(float), NULL, &err);
	input3 = clCreateBuffer(context, CL_MEM_READ_ONLY, 4 * sizeof(int), NULL, &err);

	output = clCreateBuffer(context, CL_MEM_READ_WRITE, d1*d3*sizeof(float), NULL, &err);

	start_time = omp_get_wtime();

	clEnqueueWriteBuffer(command_queue, input1, CL_TRUE, 0, d1*d2*sizeof(float), *A, 0, NULL, NULL);
	clEnqueueWriteBuffer(command_queue, input2, CL_TRUE, 0, d2*d3*sizeof(float), *B, 0, NULL, NULL);
	clEnqueueWriteBuffer(command_queue, input3, CL_TRUE, 0, 4 * sizeof(int), d, 0, NULL, NULL);

	clSetKernelArg(kernel, 0, sizeof(cl_mem), &input1);
	clSetKernelArg(kernel, 1, sizeof(cl_mem), &input2);
	clSetKernelArg(kernel, 2, sizeof(cl_mem), &input3);
	clSetKernelArg(kernel, 3, sizeof(cl_mem), &output);

	clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global, NULL, 0, NULL, NULL);

	clFinish(command_queue);

	clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, d1*d3*sizeof(float), *C, 0, NULL, NULL);
	// for (unsigned int i = 0; i < (unsigned int) d1*d3; i++)
	//	printf("%f\n", C[0][i]);

	openCL_time = omp_get_wtime() - start_time;

	clReleaseMemObject(input1);
	clReleaseMemObject(input2);
	clReleaseMemObject(input3);
	clReleaseMemObject(output);
	clReleaseProgram(program);
	clReleaseKernel(kernel);
	clReleaseCommandQueue(command_queue);
	clReleaseContext(context);

	printf("Running serial algorithm...\n");
	start_time = omp_get_wtime();
	serialC = mult_mat(A, B, d1, d2, d3);
	serial_time = omp_get_wtime() - start_time;

	printf("Checking results... ");
	is_correct(C, serialC, d1, d3);

	printf("Showing stats...\n");
	printf("   serial runtime = %f\n", serial_time);
	printf("   OpenCL runtime = %f\n", openCL_time);
	printf("   Speedup = %f\n", serial_time / openCL_time);
	return 0;
}
Example #14
0
int main(int argc, char *argv[])
{
  cl_int err;
  cl_platform_id platform;
  cl_device_id device;
  cl_context context;
  cl_command_queue queue;
  cl_program program;
  cl_kernel kernel;
  cl_mem d_a, d_b, d_c;
  float *h_a, *h_b, *h_c;

  size_t N = 1024;
  if (argc > 1)
  {
    N = atoi(argv[1]);
  }

  size_t global = N;
  if (argc > 2)
  {
    global = atoi(argv[2]);
  }

  if (!N || !global)
  {
    printf("Usage: ./vecadd N [GLOBAL_SIZE]\n");
    exit(1);
  }

  // Get list of platforms
  cl_uint numPlatforms = 0;
  cl_platform_id platforms[MAX_PLATFORMS];
  err = clGetPlatformIDs(MAX_PLATFORMS, platforms, &numPlatforms);
  checkError(err, "getting platforms");

  // Find Oclgrind
  platform = NULL;
  for (int i = 0; i < numPlatforms; i++)
  {
    char name[256];
    err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 256, name, NULL);
    checkError(err, "getting platform name");
    if (!strcmp(name, "Oclgrind"))
    {
      platform = platforms[i];
      break;
    }
  }
  if (!platform)
  {
    fprintf(stderr, "Unable to find Oclgrind platform\n");
    exit(1);
  }

  err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
  checkError(err, "getting device");

  context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
  checkError(err, "creating context");

  queue = clCreateCommandQueue(context, device, 0, &err);
  checkError(err, "creating command queue");

  program = clCreateProgramWithSource(context, 1, &KERNEL_SOURCE, NULL, &err);
  checkError(err, "creating program");

  err = clBuildProgram(program, 1, &device, "", NULL, NULL);
  if (err == CL_BUILD_PROGRAM_FAILURE)
  {
    size_t sz;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                          sizeof(size_t), NULL, &sz);
    char *buildLog = malloc(++sz);
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                          sz, buildLog, NULL);
    fprintf(stderr, "%s\n", buildLog);
  }
  checkError(err, "building program");

  kernel = clCreateKernel(program, "vecadd", &err);
  checkError(err, "creating kernel");

  size_t dataSize = N*sizeof(cl_float);

  // Initialise host data
  srand(0);
  h_a = malloc(dataSize);
  h_b = malloc(dataSize);
  h_c = malloc(dataSize);
  for (int i = 0; i < N; i++)
  {
    h_a[i] = rand()/(float)RAND_MAX;
    h_b[i] = rand()/(float)RAND_MAX;
    h_c[i] = 0;
  }

  d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL, &err);
  checkError(err, "creating d_a buffer");
  d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL, &err);
  checkError(err, "creating d_b buffer");
  d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSize, NULL, &err);
  checkError(err, "creating d_c buffer");

  err = clEnqueueWriteBuffer(queue, d_a, CL_FALSE,
                             0, dataSize, h_a, 0, NULL, NULL);
  checkError(err, "writing d_a data");
  err = clEnqueueWriteBuffer(queue, d_b, CL_FALSE,
                             0, dataSize, h_b, 0, NULL, NULL);
  checkError(err, "writing d_b data");
  err = clEnqueueWriteBuffer(queue, d_c, CL_FALSE,
                             0, dataSize, h_c, 0, NULL, NULL);
  checkError(err, "writing d_c data");

  err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
  err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
  err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
  checkError(err, "setting kernel args");

  err = clEnqueueNDRangeKernel(queue, kernel,
                               1, NULL, &global, NULL, 0, NULL, NULL);
  checkError(err, "enqueuing kernel");

  err = clFinish(queue);
  checkError(err, "running kernel");

  err = clEnqueueReadBuffer(queue, d_c, CL_TRUE,
                            0, dataSize, h_c, 0, NULL, NULL);
  checkError(err, "reading d_c data");

  // Check results
  int errors = 0;
  for (int i = 0; i < N; i++)
  {
    float ref = h_a[i] + h_b[i];
    if (fabs(ref - h_c[i]) > TOL)
    {
      if (errors < MAX_ERRORS)
      {
        fprintf(stderr, "%4d: %.4f != %.4f\n", i, h_c[i], ref);
      }
      errors++;
    }
  }
  printf("%d errors detected\n", errors);

  free(h_a);
  free(h_b);
  free(h_c);
  clReleaseMemObject(d_a);
  clReleaseMemObject(d_b);
  clReleaseMemObject(d_c);
  clReleaseKernel(kernel);
  clReleaseProgram(program);
  clReleaseCommandQueue(queue);
  clReleaseContext(context);

  return (errors != 0);
}
Example #15
0
int main()
{
  //This code executes on the OpenCL host

  //Host data
  int * A = NULL; //Input array
  int * B = NULL; //Input array
  int * C = NULL; //Output array

  //Elements in each array
  const int elements = 2048;
  
  //Compute the size of data
  size_t datasize = sizeof(int) * elements;

  //Allocate space for input/output data
  A = (int *)malloc(datasize);
  B = (int *)malloc(datasize);
  C = (int *)malloc(datasize);

  puts ("After allocation");

  //Initialize the input data
  int i;
  for (i = 0; i < elements; i++)
  {
    A[i] = i;
    B[i] = i;
  }

  puts ("After for");
  //Use this check the output of each API call
  cl_int status;

  /******************************************************************/
  /* PLATFORM */
  /******************************************************************/
  //Retrieve the number of platforms
  cl_uint numPlatforms = 0;
  puts ("Before get platform.");
  status = clGetPlatformIDs(0, NULL, &numPlatforms);

  //Allocate enough space for each platform
  cl_platform_id * platforms = NULL;
  printf ("Total platform: %d\n", numPlatforms);
  platforms = (cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id));

  //Fill in the platforms
  puts ("Before fill platform");
  status = clGetPlatformIDs(numPlatforms, platforms, NULL);

  /******************************************************************/
  /* DEVICE ID */
  /******************************************************************/
  cl_uint numDevices = 0;
  puts ("Before get devices");
  status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0,
                          NULL, &numDevices);
  //Alocate enough space for each device
  cl_device_id * devices;
  devices = (cl_device_id *) malloc(numDevices * sizeof(cl_device_id));

  //Fill in the devices
  puts ("Before alloc get devices");
  status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL,
                          numDevices, devices, NULL);
  printf ("total devices: %d\n", numDevices);
  printf ("devices: %p\n", devices);
  /******************************************************************/
  /* CONTEXT */
  /******************************************************************/
  cl_context context;
  puts ("Before context");
  context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);

  /******************************************************************/
  /* COMMAND QUEUE */
  /******************************************************************/
  cl_command_queue cmdQueue;
  puts ("Before clCreateCommandQueue");
  cmdQueue = clCreateCommandQueue (context, devices[0], 0, &status);

  /******************************************************************/
  /* BUFFER OBJECT */
  /******************************************************************/
  cl_mem bufA;
  puts ("Before clCreateBuffer A.");
  bufA = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize, NULL, &status);

  cl_mem bufB;
  puts ("Before clCreateBuffer A.");
  bufB = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize, NULL, &status);

  // Create a buffer object that will hold the output
  cl_mem bufC;
  puts ("Before clCreateBuffer A.");
  bufC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, datasize, NULL, &status);

  //Write input array A to the device bufferA
  puts ("Before clEnqueueWriteBuffer A.");
  status = clEnqueueWriteBuffer(cmdQueue, bufA, CL_FALSE, 0, 
                                datasize, A, 0, NULL, NULL);

  //Write input array B to the device bufferB
  puts ("Before clEnqueueWriteBuffer B.");
  status = clEnqueueWriteBuffer(cmdQueue, bufB, CL_FALSE, 0,
                               datasize, B, 0, NULL, NULL);

  /******************************************************************/
  /*Create a program with source code*/
  /******************************************************************/
  puts ("Before clCreateProgramWithSource.");
  cl_program program = clCreateProgramWithSource(context, 1,
                        (const char **)&programSource, NULL, &status);

  status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);

  //Create the vector addition kernel
  cl_kernel kernel;
  kernel = clCreateKernel(program, "vecadd", &status);

  //Associate the input and output buffer with the kernel
  status = clSetKernelArg(kernel, 0, sizeof (cl_mem), &bufA);
  status = clSetKernelArg(kernel, 1, sizeof (cl_mem), &bufB);
  status = clSetKernelArg(kernel, 2, sizeof (cl_mem), &bufC);

  //Define an index space
  size_t globalWorkSize[1];

  // Execute the kernel for execution
  status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalWorkSize,
                                NULL, 0, NULL, NULL);

  // Read the device output buffer to the host output array
  clEnqueueReadBuffer(cmdQueue, bufC, CL_TRUE, 0, datasize, C, 0, NULL, NULL);

  //Verify the output
  int result = 1;
  for (i = 0; i < elements; i++)
  {
    if (C[i] != i + i)
    {
      result = 0;
      break;
    }
  }

  if (result)
  {
    printf("Output is correct\n");
  }
  else
  {
    printf("Output is wrong\n");
  }

  //Free OpenCL resoureces
  clReleaseKernel(kernel);
  clReleaseProgram(program);
  clReleaseCommandQueue(cmdQueue);
  clReleaseMemObject(bufA);
  clReleaseMemObject(bufB);
  clReleaseMemObject(bufC);
  clReleaseContext(context);
  //Free host resources
  free(A);
  free(B);
  free(C);
  free(platforms);
  free(devices);
  
  return 0;
}
Example #16
0
int main(int argc, char** argv) {

   // Set up the data on the host	
   clock_t start, start0;
   start0 = clock();
   start = clock();
   // Rows and columns in the input image
   int imageHeight;
   int imageWidth;

   const char* inputFile = "input.bmp";
   const char* outputFile = "output.bmp";



   // Homegrown function to read a BMP from file
   float* inputImage = readImage(inputFile, &imageWidth, 
      &imageHeight);

   // Size of the input and output images on the host
   int dataSize = imageHeight*imageWidth*sizeof(float);

   // Pad the number of columns 
#ifdef NON_OPTIMIZED
   int deviceWidth = imageWidth;
#else  // READ_ALIGNED || READ4
   int deviceWidth = roundUp(imageWidth, WGX);
#endif
   int deviceHeight = imageHeight;
   // Size of the input and output images on the device
   int deviceDataSize = imageHeight*deviceWidth*sizeof(float);

   // Output image on the host
   float* outputImage = NULL;
   outputImage = (float*)malloc(dataSize);
   int i, j;
   for(i = 0; i < imageHeight; i++) {
       for(j = 0; j < imageWidth; j++) {
           outputImage[i*imageWidth+j] = 0;
       }
   }

   // 45 degree motion blur
   float filter[49] = 
      {0,      0,      0,      0,      0, 0.0145,      0,
       0,      0,      0,      0, 0.0376, 0.1283, 0.0145,
       0,      0,      0, 0.0376, 0.1283, 0.0376,      0,
       0,      0, 0.0376, 0.1283, 0.0376,      0,      0,
       0, 0.0376, 0.1283, 0.0376,      0,      0,      0,
  0.0145, 0.1283, 0.0376,      0,      0,      0,      0,
       0, 0.0145,      0,      0,      0,      0,      0};
 
   int filterWidth = 7;
   int paddingPixels = (int)(filterWidth/2) * 2; 
   stoptime(start, "set up input, output.");
   start = clock();
   // Set up the OpenCL environment

   // Discovery platform
   cl_platform_id platform;
   clGetPlatformIDs(1, &platform, NULL);

   // Discover device
   cl_device_id device;
   clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device,
      NULL);

    size_t time_res;
    clGetDeviceInfo(device, CL_DEVICE_PROFILING_TIMER_RESOLUTION,
            sizeof(time_res), &time_res, NULL);
    printf("Device profiling timer resolution: %zu ns.\n", time_res);

   // Create context
   cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, 
       (cl_context_properties)(platform), 0};
   cl_context context; 
   context = clCreateContext(props, 1, &device, NULL, NULL, 
      NULL);

   // Create command queue
   cl_ulong time_start, time_end, exec_time;
   cl_event timing_event;
   cl_command_queue queue;
   queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL);

   // Create memory buffers
   cl_mem d_inputImage;
   cl_mem d_outputImage;
   cl_mem d_filter;
   d_inputImage = clCreateBuffer(context, CL_MEM_READ_ONLY, 
       deviceDataSize, NULL, NULL);
   d_outputImage = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
       deviceDataSize, NULL, NULL);
   d_filter = clCreateBuffer(context, CL_MEM_READ_ONLY, 
       49*sizeof(float),NULL, NULL);
   
   // Write input data to the device
#ifdef NON_OPTIMIZED
   clEnqueueWriteBuffer(queue, d_inputImage, CL_TRUE, 0, deviceDataSize,
       inputImage, 0, NULL, NULL);
#else // READ_ALIGNED || READ4
   size_t buffer_origin[3] = {0,0,0};
   size_t host_origin[3] = {0,0,0};
   size_t region[3] = {deviceWidth*sizeof(float), 
      imageHeight, 1};
   clEnqueueWriteBufferRect(queue, d_inputImage, CL_TRUE, 
      buffer_origin, host_origin, region, 
      deviceWidth*sizeof(float), 0, imageWidth*sizeof(float), 0,
      inputImage, 0, NULL, NULL);
#endif
	
   // Write the filter to the device
   clEnqueueWriteBuffer(queue, d_filter, CL_TRUE, 0, 
      49*sizeof(float), filter, 0, NULL, NULL);
	
   // Read in the program from file
   char* source = readSource("convolution.cl");

   // Create the program
   cl_program program;
	
   // Create and compile the program
   program = clCreateProgramWithSource(context, 1, 
       (const char**)&source, NULL, NULL);
   cl_int build_status;
   build_status = clBuildProgram(program, 1, &device, NULL, NULL,
      NULL);
      
   // Create the kernel
   cl_kernel kernel;
#if defined NON_OPTIMIZED || defined READ_ALIGNED
   // Only the host-side code differs for the aligned reads
   kernel = clCreateKernel(program, "convolution", NULL);
#else // READ4
   kernel = clCreateKernel(program, "convolution_read4", NULL);
#endif
	
   // Selected work group size is 16x16
   int wgWidth = WGX;
   int wgHeight = WGY;

   // When computing the total number of work items, the 
   // padding work items do not need to be considered
   int totalWorkItemsX = roundUp(imageWidth-paddingPixels, 
      wgWidth);
   int totalWorkItemsY = roundUp(imageHeight-paddingPixels, 
      wgHeight);

   // Size of a work group
   size_t localSize[2] = {wgWidth, wgHeight};
   // Size of the NDRange
   size_t globalSize[2] = {totalWorkItemsX, totalWorkItemsY};

   // The amount of local data that is cached is the size of the
   // work groups plus the padding pixels
#if defined NON_OPTIMIZED || defined READ_ALIGNED
   int localWidth = localSize[0] + paddingPixels;
#else // READ4
   // Round the local width up to 4 for the read4 kernel
   int localWidth = roundUp(localSize[0]+paddingPixels, 4);
#endif
   int localHeight = localSize[1] + paddingPixels;

   // Compute the size of local memory (needed for dynamic 
   // allocation)
   size_t localMemSize = (localWidth * localHeight * 
      sizeof(float));

   // Set the kernel arguments
   clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage);
   clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage);
   clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_filter);
   clSetKernelArg(kernel, 3, sizeof(int), &deviceHeight);
   clSetKernelArg(kernel, 4, sizeof(int), &deviceWidth); 
   clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
   clSetKernelArg(kernel, 6, localMemSize, NULL);
   clSetKernelArg(kernel, 7, sizeof(int), &localHeight); 
   clSetKernelArg(kernel, 8, sizeof(int), &localWidth);

   stoptime(start, "set up kernel");
   start = clock();
   // Execute the kernel
   clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, 
      localSize, 0, NULL, &timing_event);

   // Wait for kernel to complete
   clFinish(queue);
   stoptime(start, "run kernel");
   clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_START,
           sizeof(time_start), &time_start, NULL);
   clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_END,
           sizeof(time_end), &time_end, NULL);
   exec_time = time_end-time_start;
   printf("Profile execution time = %.3lf sec.\n", (double) exec_time/1000000000);

   // Read back the output image
#ifdef NON_OPTIMIZED
   clEnqueueReadBuffer(queue, d_outputImage, CL_TRUE, 0, 
      deviceDataSize, outputImage, 0, NULL, NULL);
#else // READ_ALIGNED || READ4
   // Begin reading output from (3,3) on the device 
   // (for 7x7 filter with radius 3)
   buffer_origin[0] = 3*sizeof(float);
   buffer_origin[1] = 3;
   buffer_origin[2] = 0;

   // Read data into (3,3) on the host
   host_origin[0] = 3*sizeof(float);
   host_origin[1] = 3;
   host_origin[2] = 0;
	
   // Region is image size minus padding pixels
   region[0] = (imageWidth-paddingPixels)*sizeof(float);
   region[1] = (imageHeight-paddingPixels);
   region[2] = 1;
	
	// Perform the read
   clEnqueueReadBufferRect(queue, d_outputImage, CL_TRUE, 
      buffer_origin, host_origin, region, 
      deviceWidth*sizeof(float), 0, imageWidth*sizeof(float), 0, 
      outputImage, 0, NULL, NULL);
#endif
  
   // Homegrown function to write the image to file
   storeImage(outputImage, outputFile, imageHeight, 
      imageWidth, inputFile);
   
   // Free OpenCL objects
   clReleaseMemObject(d_inputImage);
   clReleaseMemObject(d_outputImage);
   clReleaseMemObject(d_filter);
   clReleaseKernel(kernel);
   clReleaseProgram(program);
   clReleaseCommandQueue(queue);
   clReleaseContext(context);

   return 0;
}
Example #17
0
int
NBody::setupCL()
{
    cl_int status = CL_SUCCESS;

    cl_device_type dType;

    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_GPU;
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */

    cl_uint numPlatforms;
    cl_platform_id platform = NULL;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clGetPlatformIDs failed."))
    {
        return SDK_FAILURE;
    }
    if (0 < numPlatforms) 
    {
        cl_platform_id* platforms = new cl_platform_id[numPlatforms];
        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clGetPlatformIDs failed."))
        {
            return SDK_FAILURE;
        }
        for (unsigned i = 0; i < numPlatforms; ++i) 
        {
            char pbuf[100];
            status = clGetPlatformInfo(platforms[i],
                                       CL_PLATFORM_VENDOR,
                                       sizeof(pbuf),
                                       pbuf,
                                       NULL);

            if(!sampleCommon->checkVal(status,
                                       CL_SUCCESS,
                                       "clGetPlatformInfo failed."))
            {
                return SDK_FAILURE;
            }

            platform = platforms[i];
            if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) 
            {
                break;
            }
        }
        delete[] platforms;
    }

    if(NULL == platform)
    {
        sampleCommon->error("NULL platform found so Exiting Application.");
        return SDK_FAILURE;
    }

    // Display available devices.
    if(!sampleCommon->displayDevices(platform, dType))
    {
        sampleCommon->error("sampleCommon::displayDevices() failed");
        return SDK_FAILURE;
    }

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */

    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    context = clCreateContextFromType(
        cps,
        dType,
        NULL,
        NULL,
        &status);

    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clCreateContextFromType failed."))
    {
        return SDK_FAILURE;
    }

    size_t deviceListSize;

    /* First, get the size of device list data */
    status = clGetContextInfo(
        context, 
        CL_CONTEXT_DEVICES, 
        0, 
        NULL, 
        &deviceListSize);
    if(!sampleCommon->checkVal(
        status, 
        CL_SUCCESS,
        "clGetContextInfo failed."))
        return SDK_FAILURE;

    int deviceCount = (int)(deviceListSize / sizeof(cl_device_id));
    if(!sampleCommon->validateDeviceId(deviceId, deviceCount))
    {
        sampleCommon->error("sampleCommon::validateDeviceId() failed");
        return SDK_FAILURE;
    }

    /* Now allocate memory for device list based on the size we got earlier */
    devices = (cl_device_id*)malloc(deviceListSize);
    if(devices == NULL)
    {
        sampleCommon->error("Failed to allocate memory (devices).");
        return SDK_FAILURE;
    }

    /* Now, get the device list data */
    status = clGetContextInfo(
        context, 
        CL_CONTEXT_DEVICES, 
        deviceListSize, 
        devices, 
        NULL);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clGetContextInfo failed."))
        return SDK_FAILURE;


    /* Create command queue */

    commandQueue = clCreateCommandQueue(
        context,
        devices[deviceId],
        0,
        &status);

    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clCreateCommandQueue failed."))
    {
        return SDK_FAILURE;
    }

    /* Get Device specific Information */
    status = clGetDeviceInfo(
        devices[deviceId],
        CL_DEVICE_MAX_WORK_GROUP_SIZE,
        sizeof(size_t),
        (void*)&maxWorkGroupSize,
        NULL);

    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clGetDeviceInfo CL_DEVICE_MAX_WORK_GROUP_SIZE failed."))
        return SDK_FAILURE;


    status = clGetDeviceInfo(
        devices[deviceId],
        CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
        sizeof(cl_uint),
        (void*)&maxDimensions,
        NULL);

    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed."))
        return SDK_FAILURE;


    maxWorkItemSizes = (size_t*)malloc(maxDimensions * sizeof(size_t));

    status = clGetDeviceInfo(
        devices[deviceId],
        CL_DEVICE_MAX_WORK_ITEM_SIZES,
        sizeof(size_t) * maxDimensions,
        (void*)maxWorkItemSizes,
        NULL);

    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_SIZES failed."))
        return SDK_FAILURE;


    status = clGetDeviceInfo(
        devices[deviceId],
        CL_DEVICE_LOCAL_MEM_SIZE,
        sizeof(cl_ulong),
        (void *)&totalLocalMemory,
        NULL);

    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS, 
        "clGetDeviceInfo CL_DEVICE_LOCAL_MEM_SIZE failed."))
        return SDK_FAILURE;


    /*
    * Create and initialize memory objects
    */

    /* Create memory objects for position */
    currPos = clCreateBuffer(
        context,
        CL_MEM_READ_WRITE,
        numBodies * sizeof(cl_float4),
        0,
        &status);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clCreateBuffer failed. (oldPos)"))
    {
        return SDK_FAILURE;
    }

    /* Initialize position buffer */
    status = clEnqueueWriteBuffer(commandQueue,
                                  currPos,
                                  1,
                                  0,
                                  numBodies * sizeof(cl_float4),
                                  pos,
                                  0,
                                  0,
                                  0);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clEnqueueWriteBuffer failed. (oldPos)"))
    {
        return SDK_FAILURE;
    }


    /* Create memory objects for position */
    newPos = clCreateBuffer(
        context,
        CL_MEM_READ_WRITE,
        numBodies * sizeof(cl_float4),
        0,
        &status);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clCreateBuffer failed. (newPos)"))
    {
        return SDK_FAILURE;
    }

    /* Create memory objects for velocity */
    currVel = clCreateBuffer(
        context,
        CL_MEM_READ_WRITE,
        numBodies * sizeof(cl_float4),
        0,
        &status);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clCreateBuffer failed. (oldVel)"))
    {
        return SDK_FAILURE;
    }

    /* Initialize velocity buffer */
    status = clEnqueueWriteBuffer(commandQueue,
                                  currVel,
                                  1,
                                  0,
                                  numBodies * sizeof(cl_float4),
                                  vel,
                                  0,
                                  0,
                                  0);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clEnqueueWriteBuffer failed. (oldVel)"))
    {
        return SDK_FAILURE;
    }

    /* Create memory objects for velocity */
    newVel = clCreateBuffer(
        context,
        CL_MEM_READ_ONLY,
        numBodies * sizeof(cl_float4),
        0,
        &status);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clCreateBuffer failed. (newVel)"))
    {
        return SDK_FAILURE;
    }

    /* create a CL program using the kernel source */
    streamsdk::SDKFile kernelFile;
    std::string kernelPath = sampleCommon->getPath();

    if(isLoadBinaryEnabled())
    {
        kernelPath.append(loadBinary.c_str());
        if(!kernelFile.readBinaryFromFile(kernelPath.c_str()))
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }

        const char * binary = kernelFile.source().c_str();
        size_t binarySize = kernelFile.source().size();
        program = clCreateProgramWithBinary(context,
                                            1,
                                            &devices[deviceId], 
                                            (const size_t *)&binarySize,
                                            (const unsigned char**)&binary,
                                            NULL,
                                            &status);
        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clCreateProgramWithBinary failed."))
        {
            return SDK_FAILURE;
        }

    }
    else
    {
        kernelPath.append("NBody_Kernels.cl");
        if(!kernelFile.open(kernelPath.c_str()))
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }
        const char * source = kernelFile.source().c_str();
        size_t sourceSize[] = { strlen(source) };
        program = clCreateProgramWithSource(context,
                                            1,
                                            &source,
                                            sourceSize,
                                            &status);
        if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clCreateProgramWithSource failed."))
            return SDK_FAILURE;
        }

    /* create a cl program executable for all the devices specified */
    status = clBuildProgram(
        program,
        1,
        &devices[deviceId],
        NULL,
        NULL,
        NULL);
    if(status != CL_SUCCESS)
    {
        if(status == CL_BUILD_PROGRAM_FAILURE)
        {
            cl_int logStatus;
            char * buildLog = NULL;
            size_t buildLogSize = 0;
            logStatus = clGetProgramBuildInfo (program, 
                devices[deviceId], 
                CL_PROGRAM_BUILD_LOG, 
                buildLogSize, 
                buildLog, 
                &buildLogSize);
            if(!sampleCommon->checkVal(
                logStatus,
                CL_SUCCESS,
                "clGetProgramBuildInfo failed."))
                return SDK_FAILURE;

            buildLog = (char*)malloc(buildLogSize);
            if(buildLog == NULL)
            {
                sampleCommon->error("Failed to allocate host memory. (buildLog)");
                return SDK_FAILURE;
            }
            memset(buildLog, 0, buildLogSize);

            logStatus = clGetProgramBuildInfo (program, 
                devices[deviceId], 
                CL_PROGRAM_BUILD_LOG, 
                buildLogSize, 
                buildLog, 
                NULL);
            if(!sampleCommon->checkVal(
                logStatus,
                CL_SUCCESS,
                "clGetProgramBuildInfo failed."))
            {
                free(buildLog);
                return SDK_FAILURE;
            }

            std::cout << " \n\t\t\tBUILD LOG\n";
            std::cout << " ************************************************\n";
            std::cout << buildLog << std::endl;
            std::cout << " ************************************************\n";
            free(buildLog);
        }

        if(!sampleCommon->checkVal(
            status,
            CL_SUCCESS,
            "clBuildProgram failed."))
            return SDK_FAILURE;
    }

    /* get a kernel object handle for a kernel with the given name */
    kernel = clCreateKernel(
        program,
        "nbody_sim",
        &status);
    if(!sampleCommon->checkVal(
        status,
        CL_SUCCESS,
        "clCreateKernel failed."))
    {
        return SDK_FAILURE;
    }

    return SDK_SUCCESS;
}
Example #18
0
int main( int argc, char **argv )
{

  int             i, iteration;

  double          timecounter;

  FILE            *fp;

  cl_int ecode;

  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  /*  Initialize timers  */
  timer_on = 0;            
  if ((fp = fopen("timer.flag", "r")) != NULL) {
    fclose(fp);
    timer_on = 1;
  }
  timer_clear( 0 );
  if (timer_on) {
    timer_clear( 1 );
    timer_clear( 2 );
    timer_clear( 3 );
  }

  if (timer_on) timer_start( 3 );

  /*  Initialize the verification arrays if a valid class */
  for( i=0; i<TEST_ARRAY_SIZE; i++ )
    switch( CLASS )
    {
      case 'S':
        test_index_array[i] = S_test_index_array[i];
        test_rank_array[i]  = S_test_rank_array[i];
        break;
      case 'A':
        test_index_array[i] = A_test_index_array[i];
        test_rank_array[i]  = A_test_rank_array[i];
        break;
      case 'W':
        test_index_array[i] = W_test_index_array[i];
        test_rank_array[i]  = W_test_rank_array[i];
        break;
      case 'B':
        test_index_array[i] = B_test_index_array[i];
        test_rank_array[i]  = B_test_rank_array[i];
        break;
      case 'C':
        test_index_array[i] = C_test_index_array[i];
        test_rank_array[i]  = C_test_rank_array[i];
        break;
      case 'D':
        test_index_array[i] = D_test_index_array[i];
        test_rank_array[i]  = D_test_rank_array[i];
        break;
    };

  /* set up the OpenCL environment. */
  setup_opencl(argc, argv);

  /*  Printout initial NPB info */
  printf( "\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - IS Benchmark\n\n" );
  printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS, CLASS );
  printf( " Iterations:   %d\n", MAX_ITERATIONS );

  if (timer_on) timer_start( 1 );

  /*  Generate random number sequence and subsequent keys on all procs */
  create_seq( 314159265.00,                    /* Random number gen seed */
              1220703125.00 );                 /* Random number gen mult */
  if (timer_on) timer_stop( 1 );

  /*  Do one interation for free (i.e., untimed) to guarantee initialization of  
      all data and code pages and respective tables */
  rank( 1 );  

  /*  Start verification counter */
  passed_verification = 0;

  DTIMER_START(T_BUFFER_WRITE);
  ecode = clEnqueueWriteBuffer(cmd_queue,
                               m_passed_verification,
                               CL_TRUE,
                               0,
                               sizeof(cl_int),
                               &passed_verification,
                               0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueWriteBuffer() for m_passed_verification");
  DTIMER_STOP(T_BUFFER_WRITE);

  if( CLASS != 'S' ) printf( "\n   iteration\n" );

  /*  Start timer  */             
  timer_start( 0 );


  /*  This is the main iteration */
  for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
  {
    if( CLASS != 'S' ) printf( "        %d\n", iteration );
    rank( iteration );
  }

  DTIMER_START(T_BUFFER_READ);
  ecode = clEnqueueReadBuffer(cmd_queue,
                              m_passed_verification,
                              CL_TRUE,
                              0,
                              sizeof(cl_int),
                              &passed_verification,
                              0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueReadBuffer() for m_passed_verification");
  DTIMER_STOP(T_BUFFER_READ);

  /*  End of timing, obtain maximum time of all processors */
  timer_stop( 0 );
  timecounter = timer_read( 0 );


  /*  This tests that keys are in sequence: sorting of last ranked key seq
      occurs here, but is an untimed operation                             */
  if (timer_on) timer_start( 2 );
  full_verify();
  if (timer_on) timer_stop( 2 );

  if (timer_on) timer_stop( 3 );


  /*  The final printout  */
  if( passed_verification != 5*MAX_ITERATIONS + 1 )
    passed_verification = 0;
  c_print_results( "IS",
                   CLASS,
                   (int)(TOTAL_KEYS/64),
                   64,
                   0,
                   MAX_ITERATIONS,
                   timecounter,
                   ((double) (MAX_ITERATIONS*TOTAL_KEYS))
                              /timecounter/1000000.,
                   "keys ranked", 
                   passed_verification,
                   NPBVERSION,
                   COMPILETIME,
                   CC,
                   CLINK,
                   C_LIB,
                   C_INC,
                   CFLAGS,
                   CLINKFLAGS,
                   "",
                   clu_GetDeviceTypeName(device_type),
                   device_name);

  /*  Print additional timers  */
  if (timer_on) {
    double t_total, t_percent;

    t_total = timer_read( 3 );
    printf("\nAdditional timers -\n");
    printf(" Total execution: %8.3f\n", t_total);
    if (t_total == 0.0) t_total = 1.0;
    timecounter = timer_read(1);
    t_percent = timecounter/t_total * 100.;
    printf(" Initialization : %8.3f (%5.2f%%)\n", timecounter, t_percent);
    timecounter = timer_read(0);
    t_percent = timecounter/t_total * 100.;
    printf(" Benchmarking   : %8.3f (%5.2f%%)\n", timecounter, t_percent);
    timecounter = timer_read(2);
    t_percent = timecounter/t_total * 100.;
    printf(" Sorting        : %8.3f (%5.2f%%)\n", timecounter, t_percent);
  }

  release_opencl();
  
  fflush(stdout);

  return 0;
  /**************************/
} /*  E N D  P R O G R A M  */
Example #19
0
template <typename ElemType> nano_time_t
Syr2PerformanceTest<ElemType>::clblasPerfSingle(void)
{
    nano_time_t time;
    cl_event event;
    cl_int status;
    cl_command_queue queue = base_->commandQueues()[0];

    status = clEnqueueWriteBuffer(queue, mobjA_, CL_TRUE, 0,
                                  ((params_.N * params_.lda) + params_.offa) *
                                  sizeof(ElemType), backA_, 0, NULL, &event);
    if (status != CL_SUCCESS) {
        cerr << "Matrix A buffer object enqueuing error, status = " <<
                 status << endl;

        return NANOTIME_ERR;
    }

    status = clWaitForEvents(1, &event);
    if (status != CL_SUCCESS) {
        cout << "Wait on event failed, status = " <<
                status << endl;

        return NANOTIME_ERR;
    }

    event = NULL;

#define TIMING
#ifdef TIMING
    clFinish( queue);
    time = getCurrentTime();

    int iter = 100;
    for ( int i = 1; i <= iter; i++)
    {
#endif
    status = (cl_int)clMath::clblas::syr2(params_.order, params_.uplo, params_.N, alpha_, mobjX_, params_.offBX, params_.incx,
				mobjY_, params_.offCY, params_.incy, mobjA_, params_.offa, params_.lda, 1, &queue, 0, NULL, &event);

    if (status != CL_SUCCESS) {
        cerr << "The CLBLAS SYR2 function failed, status = " <<
                status << endl;

        return NANOTIME_ERR;
    }

#ifdef TIMING
    } // iter loop
    clFinish( queue);
    time = getCurrentTime() - time;
    time /= iter;
#else
    status = flushAll(1, &queue);
    if (status != CL_SUCCESS) {
        cerr << "clFlush() failed, status = " << status << endl;
        return NANOTIME_ERR;
    }

    time = getCurrentTime();
    status = waitForSuccessfulFinish(1, &queue, &event);
    if (status == CL_SUCCESS) {
        time = getCurrentTime() - time;
    }
    else {
        cerr << "Waiting for completion of commands to the queue failed, "
                "status = " << status << endl;
        time = NANOTIME_ERR;
    }
#endif

    return time;
}
int main()
{
	int i,j,k;
	// nb of operations:
	const int dsize = 512;
	int nthreads = 1;
	int nbOfAverages = 1e2;
	int opsMAC = 2; // operations per MAC
	cl_short4 *in, *out;
	cl_half *ck;
	double tops; //total ops

#define NQUEUES 1
	cl_int err;
	cl_platform_id platform = 0;
	cl_device_id device = 0;
	cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
	cl_context ctx = 0;
	cl_command_queue queues[NQUEUES];
	cl_mem bufin, bufck, bufout;
	cl_event event = NULL;
	cl_program program;
	cl_kernel kernel;
	size_t global[2], local[2];
	size_t param[5];
	char version[300];
  
	// allocate matrices
	
	in = (cl_short4 *) calloc(dsize*dsize, sizeof(*in));
	out = (cl_short4 *) calloc(dsize*dsize, sizeof(*out));
	ck = (cl_half *) calloc(9*9, sizeof(*ck));
	in[0].x = 0x3c00;
	in[1].x = 0x4000;
	in[dsize].x = 0x4100;
	ck[0] = 0x3c00;
	ck[1] = 0x4000;
	ck[9] = 0x3000;

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs( 1, &platform, NULL );
    err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
    for(i = 0; i < NQUEUES; i++)
    	queues[i] = clCreateCommandQueue( ctx, device, 0, &err );

	// Print some info about the system
	clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(version), version, NULL);
	printf("CL_DEVICE_VERSION=%s\n", version);
	clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(version), version, NULL);
	printf("CL_DRIVER_VERSION=%s\n", version);
	program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err);
	clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(param[0]), param, NULL);
	printf("CL_DEVICE_LOCAL_MEM_SIZE=%d\n", (int)param[0]);
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(param[0]), param, NULL);
	printf("CL_DEVICE_MAX_WORK_GROUP_SIZE=%d\n", (int)param[0]);
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(param[0]), param, NULL);
	printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS=%d\n", (int)param[0]);
	j = param[0];
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(param[0])*j, param, NULL);
	printf("CL_DEVICE_MAX_WORK_ITEM_SIZES=");
	for(i = 0; i < j; i++)
		printf("%d ", (int)param[i]);
	printf("\n");
        clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(param[0]), param, NULL);
        printf("CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE=%d\n", (int)param[0]);
		
		
	program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err);
	if(!program)
	{
		printf("Error creating program\n");
		return -1;
	}
	err = clBuildProgram(program, 0, 0, 0, 0, 0);
	if(err != CL_SUCCESS)
	{
		char buffer[20000];
		size_t len;
		
		clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
		puts(buffer);
		return -1;
	}
	kernel = clCreateKernel(program, "conv9x9", &err);
	if(!kernel || err != CL_SUCCESS)
	{
		printf("Error creating kernel\n");
		return -1;
	}

    /* Prepare OpenCL memory objects and place matrices inside them. */
	cl_image_format fmt = {CL_RGBA, CL_HALF_FLOAT};
	cl_int rc;
	bufin = clCreateImage2D(ctx, CL_MEM_READ_ONLY, &fmt, dsize, dsize, 0, 0, &rc);
	bufout = clCreateImage2D(ctx, CL_MEM_WRITE_ONLY, &fmt, dsize, dsize, 0, 0, &rc);
    bufck = clCreateBuffer( ctx, CL_MEM_READ_ONLY, 9 * 9 * sizeof(*ck),
                          NULL, &err );

	size_t origin[3] = {0,0,0};
	size_t region[3] = {dsize, dsize, 1};
    err = clEnqueueWriteImage(queues[0], bufin, CL_TRUE, origin, region, dsize * sizeof(*in), 0, in, 0, NULL, NULL );
    err = clEnqueueWriteBuffer( queues[0], bufck, CL_TRUE, 0, 9 * 9 * sizeof( *ck ), ck, 0, NULL, NULL );
	clSetKernelArg(kernel, 0, sizeof(int), &dsize);
	clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufin);
	clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufck);
	clSetKernelArg(kernel, 3, sizeof(cl_mem), &bufout);
	local[0] = 8;
	local[1] = 8;
	global[0] = global[1] = dsize-32;
    usleep(100000);

	struct timeval start,end;
	gettimeofday(&start, NULL);

	for (k=0; k<nthreads; k++) {
		//printf("Hello from thread %d, nthreads %d\n", omp_get_thread_num(), omp_get_num_threads());
		for(i=0;i<nbOfAverages;i++) {
		// do the 2D convolution
			err = clEnqueueNDRangeKernel(queues[0], kernel, 2, NULL, global, local, 0, NULL, NULL);
			if(err != CL_SUCCESS)
			{
				printf("clEnqueueNDRangeKernel error %d\n", err);
				return -1;
			}
		}
	}

	clFinish(queues[0]);
	gettimeofday(&end, NULL);
	double t = ((double) (end.tv_sec - start.tv_sec))
	+ ((double) (end.tv_usec - start.tv_usec)) / 1e6; //reports time in [s] - verified!

    /* Wait for calculations to be finished. */

    /* Fetch results of calculations from GPU memory. */
    err = clEnqueueReadImage(queues[0], bufout, CL_TRUE, origin, region, dsize * sizeof(*out), 0, out, 0, NULL, NULL );
	clFinish(queues[0]);
	
	printf("%x %x %x %x\n", out[0].x, out[1].x, out[dsize].x, out[dsize+1].x);

    /* Release OpenCL memory objects. */
    clReleaseMemObject( bufin );
    clReleaseMemObject( bufck );
    clReleaseMemObject( bufout );

    /* Release OpenCL working objects. */
    for(i = 0; i < NQUEUES; i++)
    	clReleaseCommandQueue( queues[i] );
    clReleaseContext( ctx );
	
	// report performance:
	tops = 4 * nthreads * opsMAC * (dsize-32)*(dsize-32)*9*9; // total ops
	printf("Total M ops = %.0lf, # of threads = %d", nbOfAverages*tops*1e-6, nthreads);
	printf("\nTime in s: %lf:", t);
	printf("\nTest performance [G OP/s] %lf:", tops*nbOfAverages/t*1e-9);
	printf("\n");
	return(0);
}
Example #21
0
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufX, bufAsum, scratchBuff;
    cl_event event = NULL;
    int ret = 0;
	int lenX = 1 + (N-1)*abs(incx);

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float)), NULL, &err);
    bufAsum = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, (sizeof(cl_float)), NULL, &err);
    // Allocate minimum of N elements
    scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (N*sizeof(cl_float)), NULL, &err);

    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL);

    /* Call clblas function. */
    err = clblasSasum( N, bufAsum, 0, bufX, 0, incx, scratchBuff,
                                    1, &queue, 0, NULL, &event);
    if (err != CL_SUCCESS) {
        printf("clblasSasum() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufAsum, CL_TRUE, 0, sizeof(cl_float),
                                    &asum, 0, NULL, NULL);
        printf("Result : %f\n", asum);
    }

    /* Release OpenCL events. */
    clReleaseEvent(event);

    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufX);
    clReleaseMemObject(bufAsum);
    clReleaseMemObject(scratchBuff);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
Example #22
0
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufAP, bufX, bufY;
    cl_event event = NULL;
    int ret = 0, numElementsAP;

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    numElementsAP = (N * (N+1)) / 2;	// To get number of elements in a packed matrix
    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufAP = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (numElementsAP * sizeof(cl_double2)),
                            NULL, &err);
    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_double2),
                            NULL, &err);
	bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_double2),
						    NULL, &err);

    err = clEnqueueWriteBuffer(queue, bufAP, CL_TRUE, 0,
					                numElementsAP * sizeof(cl_double2), AP, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
					                N * sizeof(cl_double2), X, 0, NULL, NULL);
	err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0,
					                N * sizeof(cl_double2), Y, 0, NULL, NULL);

    err = clblasZhpr2(order, uplo, N, alpha, bufX, 0 /*offx */, incx, bufY, 0 /*offy*/, incy,
						            bufAP, 0 /*offa */, 1, &queue, 0, NULL, &event);

   	if (err != CL_SUCCESS) {
        printf("clblasZhpr2() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufAP, CL_TRUE, 0, (numElementsAP * sizeof(cl_double2)),
                                  AP, 0, NULL, NULL);
        /* At this point you will get the result of ZHPR2 placed in A array. */
        printResult();
    }


    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufX);
    clReleaseMemObject(bufAP);
	clReleaseMemObject(bufY);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
Example #23
0
static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
				int64_t __maybe_unused max_nonce)
{
	const int thr_id = thr->id;
	struct opencl_thread_data *thrdata = thr->cgpu_data;
	struct cgpu_info *gpu = thr->cgpu;
	_clState *clState = clStates[thr_id];
	const cl_kernel *kernel = &clState->kernel;
	const int dynamic_us = opt_dynamic_interval * 1000;

	cl_int status;
	size_t globalThreads[1];
	size_t localThreads[1] = { clState->wsize };
	int64_t hashes;
	int found = opt_scrypt ? SCRYPT_FOUND : FOUND;
	int buffersize = opt_scrypt ? SCRYPT_BUFFERSIZE : BUFFERSIZE;
	if (opt_neoscrypt) {
			found = opt_neoscrypt ? SCRYPT_FOUND : FOUND;
			buffersize = opt_neoscrypt ? SCRYPT_BUFFERSIZE : BUFFERSIZE;		
	}

	/* Windows' timer resolution is only 15ms so oversample 5x */
	if (gpu->dynamic && (++gpu->intervals * dynamic_us) > 70000) {
		struct timeval tv_gpuend;
		double gpu_us;

		cgtime(&tv_gpuend);
		gpu_us = us_tdiff(&tv_gpuend, &gpu->tv_gpustart) / gpu->intervals;
		if (gpu_us > dynamic_us) {
			if (gpu->intensity > MIN_INTENSITY)
				--gpu->intensity;
		} else if (gpu_us < dynamic_us / 2) {
			if (gpu->intensity < MAX_INTENSITY)
				++gpu->intensity;
		}
		memcpy(&(gpu->tv_gpustart), &tv_gpuend, sizeof(struct timeval));
		gpu->intervals = 0;
	}

	set_threads_hashes(clState->vwidth, &hashes, globalThreads, localThreads[0], &gpu->intensity);
	if (hashes > gpu->max_hashes)
		gpu->max_hashes = hashes;

	status = thrdata->queue_kernel_parameters(clState, &work->blk, globalThreads[0]);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error: clSetKernelArg of all params failed.");
		return -1;
	}

	if (clState->goffset) {
		size_t global_work_offset[1];

		global_work_offset[0] = work->blk.nonce;
		status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, global_work_offset,
						globalThreads, localThreads, 0,  NULL, NULL);
	} else
		status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL,
						globalThreads, localThreads, 0,  NULL, NULL);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status);
		return -1;
	}

	status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0,
				     buffersize, thrdata->res, 0, NULL, NULL);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error: clEnqueueReadBuffer failed error %d. (clEnqueueReadBuffer)", status);
		return -1;
	}

	/* The amount of work scanned can fluctuate when intensity changes
	 * and since we do this one cycle behind, we increment the work more
	 * than enough to prevent repeating work */
	work->blk.nonce += gpu->max_hashes;

	/* This finish flushes the readbuffer set with CL_FALSE in clEnqueueReadBuffer */
	clFinish(clState->commandQueue);

	/* FOUND entry is used as a counter to say how many nonces exist */
	if (thrdata->res[found]) {
		/* Clear the buffer again */
		status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0,
					      buffersize, blank_res, 0, NULL, NULL);
		if (unlikely(status != CL_SUCCESS)) {
			applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed.");
			return -1;
		}
		applog(LOG_DEBUG, "GPU %d found something?", gpu->device_id);
		postcalc_hash_async(thr, work, thrdata->res);
		memset(thrdata->res, 0, buffersize);
		/* This finish flushes the writebuffer set with CL_FALSE in clEnqueueWriteBuffer */
		clFinish(clState->commandQueue);
	}

	return hashes;
}
Example #24
0
void run_benchmark( void *vargs, cl_context& context, cl_command_queue& commands, cl_program& program, cl_kernel& kernel ) {
  struct bench_args_t *args = (struct bench_args_t *)vargs;

  int size = 1 << 26;
  uint8_t *data = (uint8_t *)malloc(size);
  for (int i=0; i<size; i+=sizeof(args->buf))
    memcpy(data + i, args->buf, sizeof(args->buf));

  // 0th: initialize the timer at the beginning of the program
  timespec timer = tic();

  // Create device buffers
  //
  cl_mem key_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->k), NULL, NULL);
  cl_mem value_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, NULL);
  //cl_mem value_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->buf), NULL, NULL);
  if (!key_buffer || !value_buffer)
  {
    printf("Error: Failed to allocate device memory!\n");
    printf("Test failed\n");
    exit(1);
  }    

  // 1st: time of buffer allocation
  toc(&timer, "buffer allocation");

  // Write our data set into device buffers  
  //
  int err;
  err = clEnqueueWriteBuffer(commands, key_buffer, CL_TRUE, 0, sizeof(args->k), args->k, 0, NULL, NULL);
  err |= clEnqueueWriteBuffer(commands, value_buffer, CL_TRUE, 0, size, data, 0, NULL, NULL);
  //err |= clEnqueueWriteBuffer(commands, value_buffer, CL_TRUE, 0, sizeof(args->buf), args->buf, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
      printf("Error: Failed to write to device memory!\n");
      printf("Test failed\n");
      exit(1);
  }

  // 2nd: time of pageable-pinned memory copy
  toc(&timer, "memory copy");
    
  // Set the arguments to our compute kernel
  //
  err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &key_buffer);
  err  |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &value_buffer);
  err  |= clSetKernelArg(kernel, 2, sizeof(int), &size);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to set kernel arguments! %d\n", err);
    printf("Test failed\n");
    exit(1);
  }

  // 3rd: time of setting arguments
  toc(&timer, "set arguments");

  // Execute the kernel over the entire range of our 1d input data set
  // using the maximum number of work group items for this device
  //

#ifdef C_KERNEL
  err = clEnqueueTask(commands, kernel, 0, NULL, NULL);
#else
  printf("Error: OpenCL kernel is not currently supported!\n");
  exit(1);
#endif
  if (err)
  {
    printf("Error: Failed to execute kernel! %d\n", err);
    printf("Test failed\n");
    exit(1);
  }

  // 4th: time of kernel execution
  clFinish(commands);
  toc(&timer, "kernel execution");

  // Read back the results from the device to verify the output
  //
  err = clEnqueueReadBuffer( commands, value_buffer, CL_TRUE, 0, size, data, 0, NULL, NULL );  
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to read output array! %d\n", err);
    printf("Test failed\n");
    exit(1);
  }

  // 5th: time of data retrieving (PCIe + memcpy)
  toc(&timer, "data retrieving");

  memcpy(args->buf, data, sizeof(args->buf));
  free(data);
}
Example #25
0
int main(int argc, char** argv)
{
    int err;                            // error code returned from api calls
      
    float data[DATA_SIZE];              // original data set given to device
    float results[DATA_SIZE];           // results returned from device
    unsigned int correct;               // number of correct results returned

    size_t global;                      // global domain size for our calculation
    size_t local;                       // local domain size for our calculation

    cl_device_id device_id;             // compute device id 
    cl_context context;                 // compute context
    cl_command_queue commands;          // compute command queue
    cl_program program;                 // compute program
    cl_kernel kernel;                   // compute kernel
    
    cl_mem input;                       // device memory used for the input array
    cl_mem output;                      // device memory used for the output array
    
    int i;
    int use_gpu = 1;
    for(i = 0; i < argc && argv; i++)
    {
        if(!argv[i])
            continue;
            
        if(strstr(argv[i], "cpu"))
            use_gpu = 0;        

        else if(strstr(argv[i], "gpu"))
            use_gpu = 1;
    }

    printf("Parameter detect %s device\n",use_gpu==1?"GPU":"CPU");

    // Fill our data set with random float values
    //
    unsigned int count = DATA_SIZE;
    for(i = 0; i < count; i++)
        data[i] = rand() / (float)RAND_MAX;
    
    // Connect to a compute device
    //
    err = clGetDeviceIDs(NULL, use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create a device group!\n");
        return EXIT_FAILURE;
    }
  
    // Create a compute context 
    //
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }

    // Create a command commands
    //
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }

    // Create the compute program from the source buffer
    //
    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n");
        return EXIT_FAILURE;
    }

    // Build the program executable
    //
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }

    // Create the compute kernel in the program we wish to run
    //
    kernel = clCreateKernel(program, "square", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }

    // Create the input and output arrays in device memory for our calculation
    //
    input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * count, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
    if (!input || !output)
    {
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }    
    
    // Write our data set into the input array in device memory 
    //
    err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        exit(1);
    }

    // Set the arguments to our compute kernel
    //
    err = 0;
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        exit(1);
    }

    // Get the maximum work group size for executing the kernel on the device
    //
    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        exit(1);
    }

    // Execute the kernel over the entire range of our 1d input data set
    // using the maximum number of work group items for this device
    //
    global = count;
    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
    if (err)
    {
        printf("Error: Failed to execute kernel!\n");
        return EXIT_FAILURE;
    }

    // Wait for the command commands to get serviced before reading back results
    //
    clFinish(commands);

    // Read back the results from the device to verify the output
    //
    err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );  
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read output array! %d\n", err);
        exit(1);
    }
    
    // Validate our results
    //
    correct = 0;
    for(i = 0; i < count; i++)
    {
        #ifdef __EMSCRIPTEN__
            if ((results[i] - (data[i] * data[i])) < MIN_ERROR)
                correct++;
        #else    
            if(results[i] == data[i] * data[i])
                correct++;
        #endif           
    }
    
    // Print a brief summary detailing the results
    //
    printf("Computed '%d/%d' correct values!\n", correct, count);
    
    // Shutdown and cleanup
    //
    clReleaseMemObject(input);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    return 0;
}
void call_kernel(float *data1,float *data2,int count,char * cl_name,float *results) {

    FILE* programHandle;
    size_t programSize, KernelSourceSize;
    char *programBuffer, *KernelSource;

    size_t global;                      // global domain size for our calculation
    size_t local;                       // local domain size for our calculation

    cl_device_id device_id;             // compute device id
    cl_context context;                 // compute context
    cl_command_queue commands;          // compute command queue
    cl_program program;                 // compute program
    cl_kernel kernel;                   // compute kernel

    cl_mem input1;                       // device memory used for the input array
    cl_mem input2;                       // device memory used for the input array
    cl_mem output;                      // device memory used for the output array

	int err;
    int gpu = 1;
    err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    commands = clCreateCommandQueue(context, device_id, 0, &err);

	//----------------------------------------------------------------------------
    // get size of kernel source
    programHandle = fopen(cl_name, "r");
    fseek(programHandle, 0, SEEK_END);
    programSize = ftell(programHandle);
    rewind(programHandle);

    programBuffer = (char*) malloc(programSize + 1);
    programBuffer[programSize] = '\0';
    fread(programBuffer, sizeof(char), programSize, programHandle);
    fclose(programHandle);

    // create program from buffer
    program = clCreateProgramWithSource(context,1,(const char**) &programBuffer,&programSize, NULL);
    free(programBuffer);

    // read kernel source back in from program to check
    clGetProgramInfo(program, CL_PROGRAM_SOURCE, 0, NULL, &KernelSourceSize);
    KernelSource = (char*) malloc(KernelSourceSize);
    clGetProgramInfo(program, CL_PROGRAM_SOURCE, KernelSourceSize, KernelSource, NULL);

    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    kernel = clCreateKernel(program, "square", &err);
	//----------------------------------------------------------------------------

    input1 = clCreateBuffer(context, CL_MEM_READ_ONLY,  sizeof(float) * count, NULL, NULL);
    input2 = clCreateBuffer(context, CL_MEM_READ_ONLY,  sizeof(float) * count, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);

    err = clEnqueueWriteBuffer(commands, input1, CL_TRUE, 0, sizeof(float) * count, data1, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(commands, input2, CL_TRUE, 0, sizeof(float) * count, data2, 0, NULL, NULL);
    err = 0;
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input1);
    err  = clSetKernelArg(kernel, 1, sizeof(cl_mem), &input2);
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);
    err |= clSetKernelArg(kernel, 3, sizeof(int), &count);

	printf("*********************%d\n", local);
    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
	printf("*********************%d\n", local);
    global = count;
    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
    clFinish(commands);
    err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );

    clReleaseMemObject(input1);
    clReleaseMemObject(input2);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    //printf("nKernel source:\n\n %s \n", KernelSource);
    free(KernelSource);
}
Example #27
0
void set_constants()
{
  ce[0][0]  = 2.0;
  ce[0][1]  = 0.0;
  ce[0][2]  = 0.0;
  ce[0][3]  = 4.0;
  ce[0][4]  = 5.0;
  ce[0][5]  = 3.0;
  ce[0][6]  = 0.5;
  ce[0][7]  = 0.02;
  ce[0][8]  = 0.01;
  ce[0][9]  = 0.03;
  ce[0][10] = 0.5;
  ce[0][11] = 0.4;
  ce[0][12] = 0.3;

  ce[1][0]  = 1.0;
  ce[1][1]  = 0.0;
  ce[1][2]  = 0.0;
  ce[1][3]  = 0.0;
  ce[1][4]  = 1.0;
  ce[1][5]  = 2.0;
  ce[1][6]  = 3.0;
  ce[1][7]  = 0.01;
  ce[1][8]  = 0.03;
  ce[1][9]  = 0.02;
  ce[1][10] = 0.4;
  ce[1][11] = 0.3;
  ce[1][12] = 0.5;

  ce[2][0]  = 2.0;
  ce[2][1]  = 2.0;
  ce[2][2]  = 0.0;
  ce[2][3]  = 0.0;
  ce[2][4]  = 0.0;
  ce[2][5]  = 2.0;
  ce[2][6]  = 3.0;
  ce[2][7]  = 0.04;
  ce[2][8]  = 0.03;
  ce[2][9]  = 0.05;
  ce[2][10] = 0.3;
  ce[2][11] = 0.5;
  ce[2][12] = 0.4;

  ce[3][0]  = 2.0;
  ce[3][1]  = 2.0;
  ce[3][2]  = 0.0;
  ce[3][3]  = 0.0;
  ce[3][4]  = 0.0;
  ce[3][5]  = 2.0;
  ce[3][6]  = 3.0;
  ce[3][7]  = 0.03;
  ce[3][8]  = 0.05;
  ce[3][9] = 0.04;
  ce[3][10] = 0.2;
  ce[3][11] = 0.1;
  ce[3][12] = 0.3;

  ce[4][0]  = 5.0;
  ce[4][1]  = 4.0;
  ce[4][2]  = 3.0;
  ce[4][3]  = 2.0;
  ce[4][4]  = 0.1;
  ce[4][5]  = 0.4;
  ce[4][6]  = 0.3;
  ce[4][7]  = 0.05;
  ce[4][8]  = 0.04;
  ce[4][9] = 0.03;
  ce[4][10] = 0.1;
  ce[4][11] = 0.3;
  ce[4][12] = 0.2;

  c1 = 1.4;
  c2 = 0.4;
  c3 = 0.1;
  c4 = 1.0;
  c5 = 1.4;

  dnxm1 = 1.0 / (double)(grid_points[0]-1);
  dnym1 = 1.0 / (double)(grid_points[1]-1);
  dnzm1 = 1.0 / (double)(grid_points[2]-1);

  c1c2 = c1 * c2;
  c1c5 = c1 * c5;
  c3c4 = c3 * c4;
  c1345 = c1c5 * c3c4;

  conz1 = (1.0-c1c5);

  tx1 = 1.0 / (dnxm1 * dnxm1);
  tx2 = 1.0 / (2.0 * dnxm1);
  tx3 = 1.0 / dnxm1;

  ty1 = 1.0 / (dnym1 * dnym1);
  ty2 = 1.0 / (2.0 * dnym1);
  ty3 = 1.0 / dnym1;

  tz1 = 1.0 / (dnzm1 * dnzm1);
  tz2 = 1.0 / (2.0 * dnzm1);
  tz3 = 1.0 / dnzm1;

  dx1 = 0.75;
  dx2 = 0.75;
  dx3 = 0.75;
  dx4 = 0.75;
  dx5 = 0.75;

  dy1 = 0.75;
  dy2 = 0.75;
  dy3 = 0.75;
  dy4 = 0.75;
  dy5 = 0.75;

  dz1 = 1.0;
  dz2 = 1.0;
  dz3 = 1.0;
  dz4 = 1.0;
  dz5 = 1.0;

  dxmax = max(dx3, dx4);
  dymax = max(dy2, dy4);
  dzmax = max(dz2, dz3);

  dssp = 0.25 * max(dx1, max(dy1, dz1) );

  c4dssp = 4.0 * dssp;
  c5dssp = 5.0 * dssp;

  dttx1 = dt*tx1;
  dttx2 = dt*tx2;
  dtty1 = dt*ty1;
  dtty2 = dt*ty2;
  dttz1 = dt*tz1;
  dttz2 = dt*tz2;

  c2dttx1 = 2.0*dttx1;
  c2dtty1 = 2.0*dtty1;
  c2dttz1 = 2.0*dttz1;

  dtdssp = dt*dssp;

  comz1  = dtdssp;
  comz4  = 4.0*dtdssp;
  comz5  = 5.0*dtdssp;
  comz6  = 6.0*dtdssp;

  c3c4tx3 = c3c4*tx3;
  c3c4ty3 = c3c4*ty3;
  c3c4tz3 = c3c4*tz3;

  dx1tx1 = dx1*tx1;
  dx2tx1 = dx2*tx1;
  dx3tx1 = dx3*tx1;
  dx4tx1 = dx4*tx1;
  dx5tx1 = dx5*tx1;

  dy1ty1 = dy1*ty1;
  dy2ty1 = dy2*ty1;
  dy3ty1 = dy3*ty1;
  dy4ty1 = dy4*ty1;
  dy5ty1 = dy5*ty1;

  dz1tz1 = dz1*tz1;
  dz2tz1 = dz2*tz1;
  dz3tz1 = dz3*tz1;
  dz4tz1 = dz4*tz1;
  dz5tz1 = dz5*tz1;

  c2iv  = 2.5;
  con43 = 4.0/3.0;
  con16 = 1.0/6.0;

  xxcon1 = c3c4tx3*con43*tx3;
  xxcon2 = c3c4tx3*tx3;
  xxcon3 = c3c4tx3*conz1*tx3;
  xxcon4 = c3c4tx3*con16*tx3;
  xxcon5 = c3c4tx3*c1c5*tx3;

  yycon1 = c3c4ty3*con43*ty3;
  yycon2 = c3c4ty3*ty3;
  yycon3 = c3c4ty3*conz1*ty3;
  yycon4 = c3c4ty3*con16*ty3;
  yycon5 = c3c4ty3*c1c5*ty3;

  zzcon1 = c3c4tz3*con43*tz3;
  zzcon2 = c3c4tz3*tz3;
  zzcon3 = c3c4tz3*conz1*tz3;
  zzcon4 = c3c4tz3*con16*tz3;
  zzcon5 = c3c4tz3*c1c5*tz3;

  //------------------------------------------------------------------------
  cl_int ecode;
  int i;
  for (i = 0; i < num_devices; i++) {
  ecode = clEnqueueWriteBuffer(cmd_queue[i],
                               m_ce[i],
                               CL_TRUE,
                               0, sizeof(double)*5*13,
                               ce,
                               0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueWriteBuffer() for m_ce");
  }
  //------------------------------------------------------------------------
}
Example #28
0
File: mvect.c Project: b8875/gemtc
int main( int argc, char* argv[] )
{
//unsigned int n;
    // Length of vectors
  int m = atoi(argv[4]);
//struct timespec start, finish;
unsigned int n=(256*m);
    // Host input vectors
    int *h_a;
    int *h_b;
    // Host output vector
    int *h_c;
	double elapsed;
    // Device input buffers
    cl_mem d_a;
    cl_mem d_b;
    // Device output buffer
    cl_mem d_c;
	cl_kernel kernel; 
    cl_platform_id* cpPlatform;        // OpenCL platform
    cl_device_id device_id;           // device ID
    cl_context context;               // context
    //cl_command_queue* queue;           // command queue
    //cl_command_queue queue;           // command queue
    cl_program program;               // program
cl_platform_id* platforms;		// platform id,
// differnt for all the device we have in the system
cl_uint platformCount; //keeps the divice count

    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(int);
 
    // Allocate memory for each vector on host
    h_a = (int*)malloc(bytes);
    h_b = (int*)malloc(bytes);
    h_c = (int*)malloc(bytes);
    // Initialize vectors on host
    int i;
    for( i = 0; i < n; i++ )
    {
        h_a[i] = i;
        h_b[i] = i;
//	printf("%d ",h_a[i]);
    }
 
    size_t globalSize, localSize; //similar to cuda
    cl_int err;//for errors
    int workgrp;
    int wrkitm;
    int num_ker;
    num_ker=atoi(argv[2]);
    wrkitm=atoi(argv[3]);// i have tried automating lots of data,
//u can check my bash script
    // Number of work items in each local work group
    localSize = wrkitm ;
//n=atoi(argv[1]);
    // Number of total work items - localSize must be devisor
    globalSize = n;
//mallocing for array of queues (break through)
cl_command_queue * queue = (cl_command_queue *)malloc(num_ker * sizeof(cl_command_queue));
//defining platform
 clGetPlatformIDs(0, NULL, &platformCount);
    cpPlatform = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
clGetPlatformIDs(platformCount, cpPlatform, NULL);//what ever is returned from last step will be used here

int choice = atoi(argv[1]);
if(choice ==1)
{
// we can have CL_DEVICE_GPU or ACCELERATOR or ALL as an option here
//depending what device are we working on
// we can these multiple times depending on requirements
    err = clGetDeviceIDs(cpPlatform[0],CL_DEVICE_TYPE_CPU , 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    
        printf("Error: Failed to create a device group!\n");
}

else
{
    // Get ID for the device
    err = clGetDeviceIDs(cpPlatform[1], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);

    if (err != CL_SUCCESS)

    {

        printf("Error: Failed to create a device group!\n");
}
}
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);

	for(i=0;i<num_ker;++i)
	{
    queue[i] = clCreateCommandQueue(context, device_id, 0, &err);
	}
    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1,
                            (const char **) & KernelSource, NULL, &err);
    // Build the program executable
    clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    // Create the compute kernel in the program we wish to run
    kernel = clCreateKernel(program, "vecAdd", &err);
 
    // Create the input and output arrays in device memory for our calculation
    d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
    //clock_gettime(CLOCK_MONOTONIC, &start);
//struct timeval tim;
 // double t1,t2;

//    gettimeofday(&tim, NULL);
  //  t1=tim.tv_sec+(tim.tv_usec/1000000.0);
/*    gettimeofday(&tim, NULL);
    t1=tim.tv_sec+(tim.tv_usec/1000000.0);
*/
	// Write our data set into the input array in device memory
	for(i=0;i<num_ker;++i)
{
    err = clEnqueueWriteBuffer(queue[i], d_a, CL_TRUE, 0,bytes, h_a, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue[i], d_b, CL_TRUE, 0,bytes, h_b, 0, NULL, NULL);
}
//clFinish(queue);
	// i know.. way to many APIs to be called in OpenCL
    // Set the arguments to our compute kernel
    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
    err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
    err = clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
  // Get the maximum work group size for executing the kernel on the device
//localSize=256;
//  err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(localSize), &localSize, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        exit(1);
    }
// timer for my evalutation
//clock_t start=clock();

// clock_gettime(CLOCK_MONOTONIC, &start);
// kernel part
  
  // Execute the kernel over the entire range of the data set 
// timing function
struct timeval tim;
  double t1,t2;
  
//    gettimeofday(&tim, NULL);
  //  t1=tim.tv_sec+(tim.tv_usec/1000000.0);
    gettimeofday(&tim, NULL);
    t1=tim.tv_sec+(tim.tv_usec/1000000.0);    
//printf("err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,0, NULL, NULL\n");
for(i=0;i<num_ker;i++)
{
err = clEnqueueNDRangeKernel(queue[i], kernel, 1, NULL, &globalSize, &localSize,
                                                              0, NULL, NULL);


}

// Wait for the command queue to get serviced before reading back results

    //clock_gettime(CLOCK_MONOTONIC, &finish);
    //elapsed = (finish.tv_sec - start.tv_sec);
    //elapsed += (finish.tv_nsec - start.tv_nsec)/ 1000000.0;
 //clock_t finish =clock();

    // Read the results from the device
for(i=0;i<num_ker;++i)
{
clFinish(queue[i]);
}
 gettimeofday(&tim, NULL);
    t2=tim.tv_sec+(tim.tv_usec/1000000.0);
printf("%.6lf\t",(t2-t1));

for(i=0;i<num_ker;++i)
{
clEnqueueReadBuffer(queue[i], d_c, CL_TRUE, 0,
                                bytes, h_c, 0, NULL, NULL );    
}  
//clock_gettime(CLOCK_MONOTONIC, &finish);
  //elapsed = (finish.tv_nsec - start.tv_nsec);
  //  elapsed += (finish.tv_nsec - start.tv_nsec)/ 1000000.0;
for(i=0;i<num_ker;++i)
{
clFinish(queue[i]);
}/*
gettimeofday(&tim, NULL);
    t2=tim.tv_sec+(tim.tv_usec/1000000.0);
printf(" %.4lf\t",(t2-t1)); */   
//Sum up vector c and print result divided by n, this should equal 1 within error
//int threads=globalSize/localSize;    
//double sum = 0;
  //  for(i=0; i<n; i++)
//       printf("%d ", h_c[i]);
//elapsed=(start-finish)/CLOCKS_PER_SEC;
//printf("%d",globalSize);
//printf("/%d ",localSize);
//printf("threads = %d \n",threads);
//    printf("Time taken by GPU in MicroSec = %.6le\n ",elapsed);
 
    // release OpenCL resources
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
for(i=0;i<num_ker;++i)
    clReleaseCommandQueue(queue[i]);
    clReleaseContext(context);
 
    //release host memory
    free(h_a);
    free(h_b);
    free(h_c);
 
    return 0;
}
Example #29
0
int mri(
		float* img, 
		float complex* f, 
		float* mask, 
		float lambda,
		int N1,
		int N2)
{
	int i, j;

    // Use this to check the output of each API call
    cl_int status;

    // Retrieve the number of platforms
    cl_uint numPlatforms = 0;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);

    // Allocate enough space for each platform
    cl_platform_id *platforms = NULL;
    platforms = (cl_platform_id*)malloc(
        numPlatforms*sizeof(cl_platform_id));

    // Fill in the platforms
    status = clGetPlatformIDs(numPlatforms, platforms, NULL);
    // Retrieve the number of devices
    cl_uint numDevices = 0;
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0,
        NULL, &numDevices);

    // Allocate enough space for each device
    cl_device_id *devices;
    devices = (cl_device_id*)malloc(
        numDevices*sizeof(cl_device_id));

    // Fill in the devices
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL,
        numDevices, devices, NULL);

    // Create a context and associate it with the devices
    cl_context context;
    context = clCreateContext(NULL, numDevices, devices, NULL,
        NULL, &status);

    // Create a command queue and associate it with the device
    cl_command_queue cmdQueue;
    cmdQueue = clCreateCommandQueue(context, devices[0], 0,
        &status);

    // Create a buffer object that will contain the data
    // from the host array A
        
	float complex* f0	    = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dx	    = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dy	    = (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* dx_new   = (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dy_new   = (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* dtildex	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* dtildey	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* u_fft2	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* u		= (float complex*) calloc(N1*N2,sizeof(float complex));

	float complex* fftmul	= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* Lap		= (float complex*) calloc(N1*N2,sizeof(float complex));
	float complex* diff		= (float complex*) calloc(N1*N2,sizeof(float complex));
    float complex *w1 = (float complex*)malloc(((N2-1)*(N2-1)+1)*sizeof(float complex));
	float complex *w2 = (float complex*)malloc(((N1-1)*(N1-1)+1)*sizeof(float complex));
	float complex *buff = (float complex*)malloc(N2*N1*sizeof(float complex));
       
    Lap(N1-1, N2-1)	= 0.f;
	Lap(N1-1, 0)	= 1.f; 
	Lap(N1-1, 1)	= 0.f;
	Lap(0, N2-1)	= 1.f;
	Lap(0, 0)		= -4.f; 
	Lap(0, 1)		= 1.f;
	Lap(1, N2-1)	= 0.f;
	Lap(1, 0)		= 1.f; 
	Lap(1, 1)		= 0.f;

    cl_mem cl_img = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(float), NULL, &status);
    cl_mem cl_mask = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(float), NULL, &status);
    cl_mem cl_f = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
    cl_mem cl_f0 = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_dx = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_dy = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);

	cl_mem cl_dx_new = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_dy_new = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);

	cl_mem cl_dtildex = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_dtildey = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_u_fft2 = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_u = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);

	cl_mem cl_fftmul = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_Lap = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
	cl_mem cl_diff = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);
    
    cl_mem cl_w1 = clCreateBuffer(context, CL_MEM_READ_WRITE, (N2*N2)*sizeof(cl_float2), NULL, &status);
    cl_mem cl_w2 = clCreateBuffer(context, CL_MEM_READ_WRITE, (N1*N1)*sizeof(cl_float2), NULL, &status);
    cl_mem cl_buff = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status);

    status = clEnqueueWriteBuffer(cmdQueue, cl_mask, CL_FALSE, 0, N1*N2*sizeof(float), mask, 0, NULL, NULL);
    status = clEnqueueWriteBuffer(cmdQueue, cl_f, CL_FALSE, 0, N1*N2*sizeof(cl_float2), f, 0, NULL, NULL);
    status = clEnqueueWriteBuffer(cmdQueue, cl_Lap, CL_FALSE, 0, N1*N2*sizeof(cl_float2), Lap, 0, NULL, NULL);
        
	cl_program program = clCreateProgramWithSource(context, 1, 
        (const char**)&kernel, NULL, &status);
        
    status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
	cl_kernel ker;
	size_t globalWorkSize[2]={N1,N2};
	
	float sum = 0;

	for(i=0; i<N1; i++)
		for(j=0; j<N2; j++)
			sum += (SQR(crealf(f(i,j))/N1) + SQR(cimagf(f(i,j))/N1));
            
	float normFactor = 1.f/sqrtf(sum);
	float scale		 = sqrtf(N1*N2);

    ker = clCreateKernel(program, "loop1", &status);

    
    status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_f);
    status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f0);
    status = clSetKernelArg(ker, 2, sizeof(cl_float2), &normFactor);
    status = clSetKernelArg(ker, 3, sizeof(int), &N1);
    status = clSetKernelArg(ker, 4, sizeof(int), &N2);
    
    
    
    
    status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
    w1[0] = 1;
	w2[0] = 1;
	dft_init(&w1, &w2, &buff, N1, N2);
    status = clEnqueueWriteBuffer(cmdQueue, cl_w1, CL_FALSE, 0, ((N2-1)*(N2-1)+1)*sizeof(cl_float2), w1, 0, NULL, NULL);
    status = clEnqueueWriteBuffer(cmdQueue, cl_w2, CL_FALSE, 0, ((N1-1)*(N1-1)+1)*sizeof(cl_float2), w2, 0, NULL, NULL);
    status = clEnqueueWriteBuffer(cmdQueue, cl_buff, CL_FALSE, 0, N1*N2*sizeof(cl_float2), buff, 0, NULL, NULL);

    ker = clCreateKernel(program, "dft1", &status);
    status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_Lap);
    status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap);
    status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
    status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
    status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
    status = clSetKernelArg(ker, 5, sizeof(int), &N1);
    status = clSetKernelArg(ker, 6, sizeof(int), &N2);
    status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
                   if (status != CL_SUCCESS)
            	printf("error: %d\n", status); 
    ker = clCreateKernel(program, "dft2", &status);
    status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_Lap);
    status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap);
    status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
    status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
    status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
    status = clSetKernelArg(ker, 5, sizeof(int), &N1);
    status = clSetKernelArg(ker, 6, sizeof(int), &N2);
    status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); 
               if (status != CL_SUCCESS)
            	printf("error: %d\n", status); 
    ker = clCreateKernel(program, "loop2", &status);
    status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_fftmul);
    status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap);
    status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_mask);
    status = clSetKernelArg(ker, 3, sizeof(float), &lambda);
    status = clSetKernelArg(ker, 4, sizeof(int), &N1);
    status = clSetKernelArg(ker, 5, sizeof(int), &N2);
    status = clEnqueueNDRangeKernel(cmdQueue, ker,2, NULL, globalWorkSize, NULL, 0, NULL, NULL);    
    
    float complex *tmp = (float complex*)malloc(N2*N1*sizeof(float complex));
    float complex *tmp2 = (float complex*)malloc(N2*N1*sizeof(float complex));
    
    
    
	int OuterIter,iter;
	for(OuterIter= 0; OuterIter<MaxOutIter; OuterIter++) {
		for(iter = 0; iter<MaxIter; iter++) {
            ker = clCreateKernel(program, "loop3", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_dtildex);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_dtildey);
            status = clSetKernelArg(ker, 3, sizeof(int), &N1);
            status = clSetKernelArg(ker, 4, sizeof(int), &N2);

            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); 

            ker = clCreateKernel(program, "dft1", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
            status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
            status = clSetKernelArg(ker, 5, sizeof(int), &N1);
            status = clSetKernelArg(ker, 6, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
             if (status != CL_SUCCESS)
            	printf("error: %d\n", status);
            	
            ker = clCreateKernel(program, "dft2", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
            status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
            status = clSetKernelArg(ker, 5, sizeof(int), &N1);
            status = clSetKernelArg(ker, 6, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); 
            if (status != CL_SUCCESS)
            	printf("error: %d\n", status);
			//dft(diff, diff, w1, w2, buff, N1, N2);

            
            ker = clCreateKernel(program, "loop4", &status);
            int more = (iter == MaxIter - 1);

            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_fftmul);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_diff);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_u_fft2);
            status = clSetKernelArg(ker, 4, sizeof(int), &N1);
            status = clSetKernelArg(ker, 5, sizeof(int), &N2);
            status = clSetKernelArg(ker, 6, sizeof(float), &scale);
            status = clSetKernelArg(ker, 7, sizeof(float), &lambda);
            status= clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); 
            
            ker = clCreateKernel(program, "idft1", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_u);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u_fft2);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
            status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
            status = clSetKernelArg(ker, 5, sizeof(int), &N1);
            status = clSetKernelArg(ker, 6, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
            
            ker = clCreateKernel(program, "idft2", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_u);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u_fft2);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2);
            status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff);
            status = clSetKernelArg(ker, 5, sizeof(int), &N1);
            status = clSetKernelArg(ker, 6, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
            
         
            ker = clCreateKernel(program, "loop5", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_dx);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_dy);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_u);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_dtildex);
            status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_dtildey);
            status = clSetKernelArg(ker, 5, sizeof(cl_mem), &cl_dx_new);
            status = clSetKernelArg(ker, 6, sizeof(cl_mem), &cl_dy_new);
            status = clSetKernelArg(ker, 7, sizeof(int), &N1);
            status = clSetKernelArg(ker, 8, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);

            
		}
        /*
          ker = clCreateKernel(program, "last_loop", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_f);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f0);
            status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_mask);
            status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_u_fft2);
            status = clSetKernelArg(ker, 4, sizeof(float), &scale);
            status = clSetKernelArg(ker, 5, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
          if (status != CL_SUCCESS)
            	printf("error: %d\n", status);
          
          */
        clEnqueueReadBuffer(cmdQueue, cl_f, CL_TRUE, 0, N1*N2*sizeof(float), f, 0, NULL, NULL);    
        clEnqueueReadBuffer(cmdQueue, cl_f0, CL_TRUE, 0, N1*N2*sizeof(float), f0, 0, NULL, NULL);
        clEnqueueReadBuffer(cmdQueue, cl_u_fft2, CL_TRUE, 0, N1*N2*sizeof(float), u_fft2, 0, NULL, NULL);
        
        for(i=0;i<N1;i++) {
			for(j=0;j<N2;j++) {
				f(i,j) += f0(i,j) - mask(i,j)*u_fft2(i,j)/scale;  
			}
		}
        
        clEnqueueWriteBuffer(cmdQueue, cl_f, CL_TRUE, 0, N1*N2*sizeof(float), f, 0, NULL, NULL);
        
       
	}
    
    
            ker = clCreateKernel(program, "loop7", &status);
            status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_img);
            status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u);
            status = clSetKernelArg(ker, 2, sizeof(int), &N1);
            status = clSetKernelArg(ker, 3, sizeof(int), &N2);
            status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); 


    clEnqueueReadBuffer(cmdQueue, cl_img, CL_TRUE, 0, N1*N2*sizeof(float), img, 0, NULL, NULL);
    clReleaseKernel(ker);
    clReleaseProgram(program);
    clReleaseCommandQueue(cmdQueue);
    clReleaseMemObject(cl_img);
    clReleaseMemObject(cl_mask);
    clReleaseMemObject(cl_f);
    clReleaseMemObject(cl_f0);
    clReleaseMemObject(cl_dx);
    clReleaseMemObject(cl_dy);
    clReleaseMemObject(cl_dx_new);
    clReleaseMemObject(cl_dy_new);
    clReleaseMemObject(cl_dtildex);
    clReleaseMemObject(cl_dtildey);
    clReleaseMemObject(cl_u_fft2);
    clReleaseMemObject(cl_u);
    clReleaseMemObject(cl_fftmul);
    clReleaseMemObject(cl_Lap);
    clReleaseMemObject(cl_diff);
    clReleaseMemObject(cl_w1);
    clReleaseMemObject(cl_w2);
    clReleaseMemObject(cl_buff);
    
    clReleaseContext(context);
    free(platforms);
    free(devices);
	free(w1);
	free(w2);
	free(buff);
	return 0;
}
Example #30
0
int main( void )
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufX;
    float *X;
    cl_event event = NULL;
    int ret = 0;
    size_t N = 16;
    char platform_name[128];
    char device_name[128];

    /* FFT library realted declarations */
    clfftPlanHandle planHandle;
    clfftDim dim = CLFFT_1D;
    size_t clLengths[1] = {N};

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs( 1, &platform, NULL );

    size_t ret_param_size = 0;
    err = clGetPlatformInfo(platform, CL_PLATFORM_NAME,
            sizeof(platform_name), platform_name,
            &ret_param_size);
    printf("Platform found: %s\n", platform_name);

    err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL );

    err = clGetDeviceInfo(device, CL_DEVICE_NAME,
            sizeof(device_name), device_name,
            &ret_param_size);
    printf("Device found on the above platform: %s\n", device_name);

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
    queue = clCreateCommandQueue( ctx, device, 0, &err );

    /* Setup clFFT. */
    clfftSetupData fftSetup;
    err = clfftInitSetupData(&fftSetup);
    err = clfftSetup(&fftSetup);

    /* Allocate host & initialize data. */
    /* Only allocation shown for simplicity. */
    X = (float *)malloc(N * 2 * sizeof(*X));

    /* print input array */
    printf("\nPerforming fft on an one dimensional array of size N = %ld\n", N);
    int print_iter = 0;
    while(print_iter<N) {
        float x = (float)print_iter;
        float y = (float)print_iter*3;
        X[2*print_iter  ] = x;
        X[2*print_iter+1] = y;
        printf("(%f, %f) ", x, y);
        print_iter++;
    }
    printf("\n\nfft result: \n");

    /* Prepare OpenCL memory objects and place data inside them. */
    bufX = clCreateBuffer( ctx, CL_MEM_READ_WRITE, N * 2 * sizeof(*X), NULL, &err );

    err = clEnqueueWriteBuffer( queue, bufX, CL_TRUE, 0,
            N * 2 * sizeof( *X ), X, 0, NULL, NULL );

    /* Create a default plan for a complex FFT. */
    err = clfftCreateDefaultPlan(&planHandle, ctx, dim, clLengths);

    /* Set plan parameters. */
    err = clfftSetPlanPrecision(planHandle, CLFFT_SINGLE);
    err = clfftSetLayout(planHandle, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED);
    err = clfftSetResultLocation(planHandle, CLFFT_INPLACE);

    /* Bake the plan. */
    err = clfftBakePlan(planHandle, 1, &queue, NULL, NULL);

    /* Execute the plan. */
    err = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &queue, 0, NULL, NULL, &bufX, NULL, NULL);

    /* Wait for calculations to be finished. */
    err = clFinish(queue);

    /* Fetch results of calculations. */
    err = clEnqueueReadBuffer( queue, bufX, CL_TRUE, 0, N * 2 * sizeof( *X ), X, 0, NULL, NULL );

    /* print output array */
    print_iter = 0;
    while(print_iter<N) {
        printf("(%f, %f) ", X[2*print_iter], X[2*print_iter+1]);
        print_iter++;
    }
    printf("\n");

    /* Release OpenCL memory objects. */
    clReleaseMemObject( bufX );

    free(X);

    /* Release the plan. */
    err = clfftDestroyPlan( &planHandle );

    /* Release clFFT library. */
    clfftTeardown( );

    /* Release OpenCL working objects. */
    clReleaseCommandQueue( queue );
    clReleaseContext( ctx );

    return ret;
}