int main(int argc, char** argv) {
    /* OpenCL 1.1 data structures */
    cl_platform_id* platforms;
    cl_program program;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;
    cl_uint numOfPlatforms;
    cl_int  error;

    cl_mem matrixAMemObj; // input matrix A mem buffer
    cl_mem matrixBMemObj; // input matrix B mem buffer
    cl_mem matrixCMemObj; // input matrix C mem buffer
    cl_int* matrixA;      // input matrix A
    cl_int* matrixB;      // input matrix B
    cl_int* matrixC;      // input matrix C
    cl_uint widthA = WIDTH_G;
    cl_uint heightA = HEIGHT_G;
    cl_uint widthB = WIDTH_G;
    cl_uint heightB = HEIGHT_G;

	{
	    // allocate memory for input and output matrices 
        // based on whatever matrix theory i know.
	    matrixA = (cl_int*)malloc(widthA * heightA * sizeof(cl_int));
	    matrixB = (cl_int*)malloc(widthB * heightB * sizeof(cl_int));
	    matrixC = (cl_int*)malloc(widthB * heightA * sizeof(cl_int));

	    memset(matrixA, 0, widthA * heightA * sizeof(cl_int));
	    memset(matrixB, 0, widthB * heightB * sizeof(cl_int));
	    memset(matrixC, 0, widthB * heightA * sizeof(cl_int));
        
        fillRandom(matrixA, widthA, heightA, 643);
        fillRandom(matrixB, widthB, heightB, 991);
    }

    /*
     Get the number of platforms
     Remember that for each vendor's SDK installed on the computer,
     the number of available platform also increased.
     */
    error = clGetPlatformIDs(0, NULL, &numOfPlatforms);
    if(error != CL_SUCCESS) {
        perror("Unable to find any OpenCL platforms");
        exit(1);
    }
    
    platforms = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numOfPlatforms);
    printf("Number of OpenCL platforms found: %d\n", numOfPlatforms);
    
    error = clGetPlatformIDs(numOfPlatforms, platforms, NULL);
    if(error != CL_SUCCESS) {
        perror("Unable to find any OpenCL platforms");
        exit(1);
    }
    // Search for a GPU device through the installed platforms
    // Build a OpenCL program and do not run it.
    for(cl_int i = 0; i < numOfPlatforms; i++ ) {
        // Get the GPU device
        error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 1, &device, NULL);

        if(error != CL_SUCCESS) {
            perror("Can't locate a OpenCL compliant device i.e. GPU");
            exit(1);
        }
        /* Create a context */
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &error);
        if(error != CL_SUCCESS) {
            perror("Can't create a valid OpenCL context");
            exit(1);
        }
        
        /* Load the two source files into temporary datastores */
        const char *file_names[] = {"mmult.cl"};
        const int NUMBER_OF_FILES = 1;
        char* buffer[NUMBER_OF_FILES];
        size_t sizes[NUMBER_OF_FILES];
        loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes);
        
        /* Create the OpenCL program object */
        program = clCreateProgramWithSource(context, NUMBER_OF_FILES, (const char**)buffer, sizes, &error);
	    if(error != CL_SUCCESS) {
            perror("Can't create the OpenCL program object");
            exit(1);
	    }
        /* Build OpenCL program object and dump the error message, if any */
        char *program_log;
        const char options[] = "";
        size_t log_size;

        error = clBuildProgram(program, 1, &device, options, NULL, NULL);
	    if(error != CL_SUCCESS) {
            // If there's an error whilst building the program, dump the log
            clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
            program_log = (char*) malloc(log_size+1);
            program_log[log_size] = '\0';
            clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                                  log_size+1, program_log, NULL);
            printf("\n=== ERROR ===\n\n%s\n=============\n", program_log);
            free(program_log);
            exit(1);
	    }
       
        // Queue is created with profiling enabled 
        cl_command_queue_properties props;
        props |= CL_QUEUE_PROFILING_ENABLE;

        queue = clCreateCommandQueue(context, device, props, &error);

        cl_kernel kernel = clCreateKernel(program, "mmmult", &error);

        matrixAMemObj = clCreateBuffer(context,
                                       CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
                                       widthA * heightA * sizeof(cl_int),
                                       matrixA,
                                       &error);

        matrixBMemObj = clCreateBuffer(context,
                                       CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
                                       widthB * heightB * sizeof(cl_int),
                                       matrixB,
                                       &error);

        matrixCMemObj = clCreateBuffer(context,
                                       CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR,
                                       widthB * heightA * sizeof(cl_int),
                                       0,
                                       &error);

        clSetKernelArg(kernel, 0, sizeof(cl_int),(void*)&widthB);
        clSetKernelArg(kernel, 1, sizeof(cl_int),(void*)&heightA);
        clSetKernelArg(kernel, 2, sizeof(cl_mem),(void*)&matrixAMemObj);
        clSetKernelArg(kernel, 3, sizeof(cl_mem),(void*)&matrixBMemObj);
        clSetKernelArg(kernel, 4, sizeof(cl_mem),(void*)&matrixCMemObj);
         
        size_t globalThreads[] = {heightA};
        size_t localThreads[] = {256};
		cl_event exeEvt; 
        cl_ulong executionStart, executionEnd;
		error = clEnqueueNDRangeKernel(queue,
		                               kernel,
		                               1,
                                       NULL,
		                               globalThreads,
                                       localThreads,
                                       0,
                                       NULL,
                                       &exeEvt);
		clWaitForEvents(1, &exeEvt);
		if(error != CL_SUCCESS) {
			printf("Kernel execution failure!\n");
			exit(-22);
		}	

        // let's understand how long it took?
        clGetEventProfilingInfo(exeEvt, CL_PROFILING_COMMAND_START, sizeof(executionStart), &executionStart, NULL);
        clGetEventProfilingInfo(exeEvt, CL_PROFILING_COMMAND_END, sizeof(executionEnd), &executionEnd, NULL);
        clReleaseEvent(exeEvt);

        printf("Execution the matrix-matrix multiplication took %lu.%lu s\n", (executionEnd - executionStart)/1000000000, (executionEnd - executionStart)%1000000000);
        printf("Execution the matrix-matrix multiplication took %lu s\n", (executionEnd - executionStart));
 
        clEnqueueReadBuffer(queue,
                            matrixCMemObj,
                            CL_TRUE,
                            0,
                            widthB * heightA * sizeof(cl_int),
                            matrixC,
                            0,
                            NULL,
                            NULL);
       
        if (compare(matrixC, matrixA, matrixB, heightA, widthA, widthB))
            printf("Passed!\n");
        else 
            printf("Failed!\n");
 
        /* Clean up */
        for(i=0; i< NUMBER_OF_FILES; i++) { free(buffer[i]); }
        clReleaseProgram(program);
        clReleaseContext(context);
        clReleaseMemObject(matrixAMemObj);
        clReleaseMemObject(matrixBMemObj);
        clReleaseMemObject(matrixCMemObj);
    }
    
    free(matrixA);
    free(matrixB);
    free(matrixC);
}
int bpnn_train_kernel(BPNN *net, float *eo, float *eh)
{
	int in, hid, out;
	float out_err, hid_err;
  
	in = net->input_n;
	hid = net->hidden_n;
	out = net->output_n;   

        //int use_device = 0;  // use CPU as device
	int use_device = 2;  // use GPU as device
        //int use_device = 2;  // use FPGA as device
	if(initialize(use_device)) return -1;
         
	int sourcesize = 1024*1024;
	char * source = (char *)calloc(sourcesize, sizeof(char)); 
	if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }

	// read the kernel core source
	char * kernel_bp1  = "bpnn_layerforward_ocl";
	char * kernel_bp2  = "bpnn_adjust_weights_ocl";
	char * tempchar = "./backprop_kernel.cl";
        char * krnl_file = "./binary/backprop_kernel_default.xclbin";
 
        cl_int err = 0;
        cl_program prog;
        // create program from source
        if (use_device < 2 ) {
	    FILE * fp = fopen(tempchar, "rb"); 
	    if(!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; }
	    fread(source + strlen(source), sourcesize, 1, fp);
	    fclose(fp);
		
	    // compile kernel
	    err = 0;
	    const char * slist[2] = { source, 0 };
	    prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
	    if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; }
        } 
        // create program from binary
        else {
            char *krnl_bin;
	    const size_t krnl_size = load_file_to_memory(krnl_file, &krnl_bin);

            err = 0;
            prog = clCreateProgramWithBinary(context, 1,
	                                    &device_list[0], &krnl_size,
	                                    (const unsigned char**) &krnl_bin,
	                                    NULL, &err);
            if ((!prog) || (err!=CL_SUCCESS)) {
		printf("Error: Failed to create compute program from binary %d!\n",
		       err);
		printf("Test failed\n");
		exit(EXIT_FAILURE);
	    }
        }
        
	err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
	{ // show warnings/errors
		//static char log[65536]; memset(log, 0, sizeof(log));
		//cl_device_id device_id = 0;
		//err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL);
		//clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
		//if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
	}
	if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
    	
	cl_kernel kernel1;
	cl_kernel kernel2;
	kernel1 = clCreateKernel(prog, kernel_bp1, &err);  
        if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel(kernel1) 0 => %d\n", err); return -1; }
	kernel2 = clCreateKernel(prog, kernel_bp2, &err);  
	if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel(kernel2) 0 => %d\n", err); return -1; }
	/* clReleaseProgram(prog); */
	
	float *input_weights_one_dim;
    float *input_weights_prev_one_dim;
	float * partial_sum;
	float sum;
	float num_blocks = in / BLOCK_SIZE;
	
	input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
	input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
	partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));
	
	// set global and local workitems
	size_t global_work[3] = { BLOCK_SIZE, BLOCK_SIZE * num_blocks, 1 }; 
	size_t local_work[3] = { BLOCK_SIZE, BLOCK_SIZE, 1 };
	
	// this preprocessing stage is temporarily added to correct the bug of wrong memcopy using two-dimensional net->inputweights
	// todo: fix mem allocation
	int m = 0;
	for (int k = 0; k <= in; k++) {	
		for (int j = 0; j <= hid; j++) {
		input_weights_one_dim[m] = net->input_weights[k][j];
		input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
	    m++;
		}
	}
	
	cl_mem input_hidden_ocl;
	cl_mem input_ocl;
	cl_mem output_hidden_ocl;
	cl_mem hidden_partial_sum;
	cl_mem hidden_delta_ocl;
	cl_mem input_prev_weights_ocl;
  
	input_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_ocl\n"); return -1;}
	input_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_hidden_ocl\n"); return -1;}
	output_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer output_hidden_ocl\n"); return -1;}
	hidden_partial_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, num_blocks * WIDTH * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_partial_sum\n"); return -1;}
	hidden_delta_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_delta_ocl\n"); return -1;}
	input_prev_weights_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_prev_weights_ocl\n"); return -1;}
		
	printf("Performing GPU computation\n");
	
	//write buffers
	err = clEnqueueWriteBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_ocl\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
 
	clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &input_ocl);
	clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &output_hidden_ocl);
	clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &input_hidden_ocl);
	clSetKernelArg(kernel1, 3, sizeof(void *), (void*) &hidden_partial_sum );
	clSetKernelArg(kernel1, 4, sizeof(float) *  HEIGHT, (void*)NULL );
	clSetKernelArg(kernel1, 5, sizeof(float ) *  HEIGHT * WIDTH, (void*)NULL );
	clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &in);
	clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &hid);
  
	err = clEnqueueNDRangeKernel(cmd_queue, kernel1, 3, NULL, global_work, local_work, 0, NULL, 0);
        if(err == CL_INVALID_KERNEL) {printf("Error is invalid kernel\n");}
	if(err != CL_SUCCESS) { printf("ERROR: 1 kernel1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	
  
	err = clEnqueueReadBuffer(cmd_queue, hidden_partial_sum, 1, 0, num_blocks * WIDTH * sizeof(float), partial_sum, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: partial sum\n"); return -1; }	
  
	for (int j = 1; j <= hid; j++) {
		sum = 0.0;
		for (int k = 0; k < num_blocks; k++) {	
		sum += partial_sum[k * hid + j-1] ;
    }
		sum += net->input_weights[0][j];
		net-> hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
	}

	
	bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
	bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
	bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);  
	bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);

	err = clEnqueueWriteBuffer(cmd_queue, hidden_delta_ocl,       1, 0, (hid + 1) * sizeof(float), net->hidden_delta, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer hidden_delta_ocl\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, input_prev_weights_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_prev_one_dim, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_prev_weights_ocl\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl,       1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
  
	clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &hidden_delta_ocl);
	clSetKernelArg(kernel2, 1, sizeof(cl_int), (void*) &hid);
	clSetKernelArg(kernel2, 2, sizeof(void *), (void*) &input_ocl);
	clSetKernelArg(kernel2, 3, sizeof(cl_int), (void*) &in);
	clSetKernelArg(kernel2, 4, sizeof(void *), (void*) &input_hidden_ocl);
	clSetKernelArg(kernel2, 5, sizeof(void *), (void*) &input_prev_weights_ocl );
  
	err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 2, NULL, global_work, local_work, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	
  
	err = clEnqueueReadBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_ocl\n"); return -1; }	
	err = clEnqueueReadBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_hidden_ocl\n"); return -1; }	
  
	clReleaseMemObject(input_ocl);
	clReleaseMemObject(output_hidden_ocl);
	clReleaseMemObject(input_hidden_ocl);
	clReleaseMemObject(hidden_partial_sum);
	clReleaseMemObject(input_prev_weights_ocl);
  
	free(input_weights_prev_one_dim);
	free(partial_sum);
	free(input_weights_one_dim);

}
Exemple #3
0
void execute(float *grid, size_t gridSize, unsigned int width, unsigned int workGroupSize, unsigned int iterations, bool printResult) {
	cl_context context;
	cl_command_queue commandQueue;
	cl_program program;
	cl_kernel kernel;
	
	size_t dataBytes, kernelLength;
	cl_int errorCode;
	
	cl_mem gridBuffer;
	
	cl_device_id* devices;
	cl_device_id gpu;
	
	cl_uint numPlatforms;

	errorCode = clGetPlatformIDs(0, NULL, &numPlatforms);
	cl_platform_id platforms[numPlatforms];
	errorCode = clGetPlatformIDs(numPlatforms, platforms, NULL);
	
	checkError(errorCode);
	
	cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (int) platforms[0], 0};

	context = clCreateContextFromType(properties, CL_DEVICE_TYPE_ALL, 0, NULL, &errorCode);
	checkError(errorCode);
	
	errorCode = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &dataBytes);
	devices = malloc(dataBytes);
	errorCode |= clGetContextInfo(context, CL_CONTEXT_DEVICES, dataBytes, devices, NULL);
	
	gpu = devices[0];
	
	commandQueue = clCreateCommandQueue(context, gpu, 0, &errorCode);
	checkError(errorCode);
	
	gridBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, gridSize, grid, &errorCode);
	checkError(errorCode);
	
	const char* programBuffer = readFile("kernel.cl");
	kernelLength = strlen(programBuffer);
	program = clCreateProgramWithSource(context, 1, (const char **)&programBuffer, &kernelLength, &errorCode);
	checkError(errorCode);
	
	errorCode = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (errorCode == CL_BUILD_PROGRAM_FAILURE) {
		// Determine the size of the log
		size_t log_size;
		clGetProgramBuildInfo(program, gpu, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
		
		// Allocate memory for the log
		char *log = (char *) malloc(log_size);
		
		// Get the log
		clGetProgramBuildInfo(program, gpu, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
		
		// Print the log
		free(log);
		printf("%s\n", log);
	}
	checkError(errorCode);
	
	kernel = clCreateKernel(program, "diffuse", &errorCode);
	checkError(errorCode);

	size_t localWorkSize[2] = {workGroupSize, workGroupSize}, globalWorkSize[2] = {width, width};

	errorCode |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&gridBuffer);
	errorCode |= clSetKernelArg(kernel, 1, sizeof(float) * workGroupSize * workGroupSize, NULL);
	errorCode |= clSetKernelArg(kernel, 2, sizeof(int), (void *)&width);
	errorCode |= clSetKernelArg(kernel, 3, sizeof(int), (void *)&workGroupSize);
	errorCode |= clSetKernelArg(kernel, 4, sizeof(int), (void *)&iterations);
	checkError(errorCode);
	
	errorCode = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
	checkError(errorCode);
	
	errorCode = clEnqueueReadBuffer(commandQueue, gridBuffer, CL_TRUE, 0, gridSize, grid, 0, NULL, NULL);
	checkError(errorCode);



	free(devices);
	free((void *)programBuffer);
	clReleaseContext(context);
	clReleaseKernel(kernel);
	clReleaseProgram(program);
	clReleaseCommandQueue(commandQueue);

	
}
Exemple #4
0
int main(int argc, char* argv[]) {
  struct pb_Parameters *parameters;

  parameters = pb_ReadParameters(&argc, argv);
  if (!parameters)
    return -1;

  if(!parameters->inpFiles[0]){
    fputs("Input file expected\n", stderr);
    return -1;
  }

  
  struct pb_TimerSet timers;
  
  char oclOverhead[] = "OCL Overhead";
  char intermediates[] = "IntermediatesKernel";
  char finals[] = "FinalKernel";

  pb_InitializeTimerSet(&timers);
  
  pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
  pb_AddSubTimer(&timers, intermediates, pb_TimerID_KERNEL);
  pb_AddSubTimer(&timers, finals, pb_TimerID_KERNEL);
    
  pb_SwitchToTimer(&timers, pb_TimerID_IO);
  
  int numIterations;
  if (argc >= 2){
    numIterations = atoi(argv[1]);
  } else {
    fputs("Expected at least one command line argument\n", stderr);
    return -1;
  }

  unsigned int img_width, img_height;
  unsigned int histo_width, histo_height;

  FILE* f = fopen(parameters->inpFiles[0],"rb");
  int result = 0;

  result += fread(&img_width,    sizeof(unsigned int), 1, f);
  result += fread(&img_height,   sizeof(unsigned int), 1, f);
  result += fread(&histo_width,  sizeof(unsigned int), 1, f);
  result += fread(&histo_height, sizeof(unsigned int), 1, f);

  if (result != 4){
    fputs("Error reading input and output dimensions from file\n", stderr);
    return -1;
  }

  unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int));
  unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char));

  result = fread(img, sizeof(unsigned int), img_width*img_height, f);

  fclose(f);

  if (result != img_width*img_height){
    fputs("Error reading input array from file\n", stderr);
    return -1;
  }

  cl_int ciErrNum;
  pb_Context* pb_context;
  pb_context = pb_InitOpenCLContext(parameters);
  if (pb_context == NULL) {
    fprintf (stderr, "Error: No OpenCL platform/device can be found."); 
    return -1;
  }

  cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
  cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
  cl_context clContext = (cl_context) pb_context->clContext;
  cl_command_queue clCommandQueue;
  
  cl_program clProgram[2];
  
  cl_kernel histo_intermediates_kernel;
  cl_kernel histo_final_kernel;
  
  cl_mem input;
  cl_mem ranges;
  cl_mem sm_mappings;
  cl_mem global_subhisto;
  cl_mem global_overflow;
  cl_mem final_histo;
  
  clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  
  pb_SetOpenCL(&clContext, &clCommandQueue);
  pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);

  cl_uint workItemDimensions;
  OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &workItemDimensions, NULL) );
  
  size_t workItemSizes[workItemDimensions];
  OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, workItemDimensions*sizeof(size_t), workItemSizes, NULL) );
  
  size_t program_length[2];
  const char *source_path[2] = { 
    "src/opencl_mxpa/histo_intermediates.cl", 
   "src/opencl_mxpa/histo_final.cl"};
  char *source[4];

  for (int i = 0; i < 2; ++i) {
    // Dynamically allocate buffer for source
    source[i] = oclLoadProgSource(source_path[i], "", &program_length[i]);
    if(!source[i]) {
      fprintf(stderr, "Could not load program source\n"); exit(1);
    }
  	
  	clProgram[i] = clCreateProgramWithSource(clContext, 1, (const char **)&source[i], &program_length[i], &ciErrNum);
  	OCL_ERRCK_VAR(ciErrNum);
  	  	
  	free(source[i]);
  }
  	
  	  	  	  	  	  	  	
  for (int i = 0; i < 2; ++i) {
    //fprintf(stderr, "Building Program #%d...\n", i);
    OCL_ERRCK_RETVAL ( clBuildProgram(clProgram[i], 1, &clDevice, NULL, NULL, NULL) );
       
    /*
       char *build_log;
       size_t ret_val_size;
       ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);	OCL_ERRCK_VAR(ciErrNum);
       build_log = (char *)malloc(ret_val_size+1);
       ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
       	OCL_ERRCK_VAR(ciErrNum);
       	

       // to be carefully, terminate with \0
       // there's no information in the reference whether the string is 0 terminated or not
       build_log[ret_val_size] = '\0';

       fprintf(stderr, "%s\n", build_log );
     */
  }
  	
  histo_intermediates_kernel = clCreateKernel(clProgram[0], "histo_intermediates_kernel", &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  histo_final_kernel = clCreateKernel(clProgram[1], "histo_final_kernel", &ciErrNum);
  OCL_ERRCK_VAR(ciErrNum);
  
  pb_SwitchToTimer(&timers, pb_TimerID_COPY);  

  input =           clCreateBuffer(clContext, CL_MEM_READ_WRITE, 
      img_width*img_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
  ranges =          clCreateBuffer(clContext, CL_MEM_READ_WRITE, 2*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);  
  sm_mappings =     clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*img_height*4*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
  global_subhisto = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
  global_overflow = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);
  final_histo =     clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum);

  // Must dynamically allocate. Too large for stack
  unsigned int *zeroData;
  zeroData = (unsigned int *) calloc(img_width*histo_height, sizeof(unsigned int));
  if (zeroData == NULL) {
    fprintf(stderr, "Failed to allocate %ld bytes of memory on host!\n", sizeof(unsigned int) * img_width * histo_height);
    exit(1);
  }
   
  for (int y=0; y < img_height; y++){
    OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, input, CL_TRUE, 
                          y*img_width*sizeof(unsigned int), // Offset in bytes
                          img_width*sizeof(unsigned int), // Size of data to write
                          &img[y*img_width], // Host Source
                          0, NULL, NULL) );
  }
 
  pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);

  unsigned int img_dim = img_height*img_width;
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 0, sizeof(cl_mem), (void *)&input) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 1, sizeof(unsigned int), &img_width) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) );
  
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 0, sizeof(unsigned int), &histo_height) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 1, sizeof(unsigned int), &histo_width) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) );
  OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 3, sizeof(cl_mem), (void *)&final_histo) );

  size_t inter_localWS[1] = { workItemSizes[0] };
  size_t inter_globalWS[1] = { img_height * inter_localWS[0] };
  
  size_t final_localWS[1] = { workItemSizes[0] };
  size_t final_globalWS[1] = {(((int)(histo_height*histo_width+(final_localWS[0]-1))) /
                                          (int)final_localWS[0])*(int)final_localWS[0] };
  
  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);

  for (int iter = 0; iter < numIterations; iter++) {
    unsigned int ranges_h[2] = {UINT32_MAX, 0};
    
    // how about something like
    // __global__ unsigned int ranges[2];
    // ...kernel
    // __shared__ unsigned int s_ranges[2];
    // if (threadIdx.x == 0) {s_ranges[0] = ranges[0]; s_ranges[1] = ranges[1];}
    // __syncthreads();
    
    // Although then removing the blocking cudaMemcpy's might cause something about
    // concurrent kernel execution.
    // If kernel launches are synchronous, then how can 2 kernels run concurrently? different host threads?


  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, ranges, CL_TRUE, 
                          0, // Offset in bytes
                          2*sizeof(unsigned int), // Size of data to write
                          ranges_h, // Host Source
                          0, NULL, NULL) );
                          
  OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, global_subhisto, CL_TRUE, 
                          0, // Offset in bytes
                          histo_width*histo_height*sizeof(unsigned int), // Size of data to write
                          zeroData, // Host Source
                          0, NULL, NULL) );
                          
  pb_SwitchToSubTimer(&timers, intermediates, pb_TimerID_KERNEL);

  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_intermediates_kernel /*histo_intermediates_kernel*/, 1, 0,
                            inter_globalWS, inter_localWS, 0, 0, 0) );              
  pb_SwitchToSubTimer(&timers, finals, pb_TimerID_KERNEL);                            
  OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_final_kernel, 1, 0,
                            final_globalWS, final_localWS, 0, 0, 0) );                           
  }

  pb_SwitchToTimer(&timers, pb_TimerID_IO);

  OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, final_histo, CL_TRUE, 
                          0, // Offset in bytes
                          histo_height*histo_width*sizeof(unsigned char), // Size of data to read
                          histo, // Host Source
                          0, NULL, NULL) );                         

  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_intermediates_kernel) );
  OCL_ERRCK_RETVAL ( clReleaseKernel(histo_final_kernel) );
  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[0]) );
  OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[1]) );
  
  OCL_ERRCK_RETVAL ( clReleaseMemObject(input) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(ranges) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(sm_mappings) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_subhisto) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(global_overflow) );
  OCL_ERRCK_RETVAL ( clReleaseMemObject(final_histo) );

  if (parameters->outFile) {
    dump_histo_img(histo, histo_height, histo_width, parameters->outFile);
  }

  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);

  free(zeroData);
  free(img);
  free(histo);

  pb_SwitchToTimer(&timers, pb_TimerID_NONE);

  printf("\n");
  pb_PrintTimerSet(&timers);
  pb_FreeParameters(parameters);
  
  pb_DestroyTimerSet(&timers);

  OCL_ERRCK_RETVAL ( clReleaseCommandQueue(clCommandQueue) );
  OCL_ERRCK_RETVAL ( clReleaseContext(clContext) );

  return 0;
}
Exemple #5
0
int main(int argc, char **argv)
{

	cl_int ret;


	/*
	 * Command line
	 */
	char *binary_path;
	if (argc != 2)
	{
		printf("syntax: %s <binary>\n", argv[0]);
		exit(1);
	}
	binary_path = argv[1];


	/*
	 * Platform
	 */

	/* Get platform */
	cl_platform_id platform;
	cl_uint num_platforms;
	ret = clGetPlatformIDs(1, &platform, &num_platforms);
	if (ret != CL_SUCCESS)
	{
		printf("error: second call to 'clGetPlatformIDs' failed\n");
		exit(1);
	}
	printf("Number of platforms: %d\n", num_platforms);

	/* Get platform name */
	char platform_name[100];
	ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clGetPlatformInfo' failed\n");
		exit(1);
	}
	printf("platform.name='%s'\n", platform_name);
	printf("\n");



	/*
	 * Device
	 */

	/* Get device */
	cl_device_id device;
	cl_uint num_devices;
	ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clGetDeviceIDs' failed\n");
		exit(1);
	}
	printf("Number of devices: %d\n", num_devices);

	/* Get device name */
	char device_name[100];
	ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clGetDeviceInfo' failed\n");
		exit(1);
	}
	printf("device.name='%s'\n", device_name);
	printf("\n");



	/*
	 * Context
	 */
	
	/* Create context */
	cl_context context;
	context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clCreateContext' failed\n");
		exit(1);
	}

	

	/*
	 * Command Queue
	 */
	
	/* Create command queue */
	cl_command_queue command_queue;
	command_queue = clCreateCommandQueue(context, device, 0, &ret);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clCreateCommandQueue' failed\n");
		exit(1);
	}
	printf("\n");



	/*
	 * Program
	 */
	
	/* Program binary */
	const unsigned char *binary;
	size_t binary_length;

	/* Read binary */
	binary = read_buffer(binary_path, &binary_length);
	if (!binary)
	{
		printf("error: %s: cannot open binary\n", binary_path);
		exit(1);
	}
	
	/* Create a program */
	cl_program program;
	program = clCreateProgramWithBinary(context, 1, &device, &binary_length,
			&binary, NULL, &ret);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clCreateProgramWithSource' failed\n");
		exit(1);
	}

	/* Build program */
	ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
	if (ret != CL_SUCCESS )
	{
		size_t size;
		char *log;

		/* Get log size */
		clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &size);

		/* Allocate log and print */
		log = malloc(size);
		clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, size, log, NULL);
		printf("error: call to 'clBuildProgram' failed:\n%s\n", log);

		/* Free log and exit */
		free(log);
		exit(1);
	}
	printf("program built\n");
	printf("\n");



	/*
	 * Kernel
	 */
	
	/* Create kernel */
	cl_kernel kernel;
	kernel = clCreateKernel(program, "vector_add", &ret);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clCreateKernel' failed\n");
		exit(1);
	}
	printf("\n");


	/*
	 * Buffers
	 */
	
	/* Create and allocate host buffers */
	size_t num_elem = 10;

	cl_int *src1_host_buffer;
	cl_int *src2_host_buffer;
	cl_int *dst_host_buffer;
	src1_host_buffer = malloc(num_elem * sizeof(cl_int));
	src2_host_buffer = malloc(num_elem * sizeof(cl_int));
	dst_host_buffer = malloc(num_elem * sizeof(cl_int));

	/* Initialize host source buffer */
	int i;
	for (i = 0; i < num_elem; i++)
	{
		src1_host_buffer[i] = i;
		src2_host_buffer[i] = 100;
	}
	
	/* Create device source buffers */
	cl_mem src1_device_buffer;
	cl_mem src2_device_buffer;
	src1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_int), NULL, NULL);
	src2_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_int), NULL, NULL);
	if (!src1_device_buffer || !src2_device_buffer)
	{
		printf("error: could not create destination buffer\n");
		exit(1);
	}

	/* Create device destination buffer */
	cl_mem dst_device_buffer;
	dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem * sizeof(cl_int), NULL, &ret);
	if (ret != CL_SUCCESS)
	{
		printf("error: could not create destination buffer\n");
		exit(1);
	}

	/* Copy buffer */
	ret = clEnqueueWriteBuffer(command_queue, src1_device_buffer, CL_TRUE,
		0, num_elem * sizeof(cl_int), src1_host_buffer, 0, NULL, NULL);
	ret |= clEnqueueWriteBuffer(command_queue, src2_device_buffer, CL_TRUE,
		0, num_elem * sizeof(cl_int), src2_host_buffer, 0, NULL, NULL);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clEnqueueWriteBuffer' failed\n");
		exit(1);
	}


	/*
	 * Kernel arguments
	 */
	
	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &src1_device_buffer);
	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src2_device_buffer);
	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clSetKernelArg' failed\n");
		exit(1);
	}
	
	
	/*
	 * Launch Kernel
	 */
	
	size_t global_work_size = num_elem;
	size_t local_work_size = num_elem;

	/* Launch the kernel */
	ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
		&global_work_size, &local_work_size, 0, NULL, NULL);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clEnqueueNDRangeKernel' failed\n");
		exit(1);
	}

	/* Wait for it to finish */
	clFinish(command_queue);


	/*
	 * Result
	 */
	
	/* Receive buffer */
	ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,
		0, num_elem * sizeof(cl_int), dst_host_buffer, 0, NULL, NULL);
	if (ret != CL_SUCCESS)
	{
		printf("error: call to 'clEnqueueReadBuffer' failed\n");
		exit(1);
	}

	/* Print result */
	for (i = 0; i < num_elem; i++)
		printf("dst_host_buffer[%d] = %d\n", i, dst_host_buffer[i]);
	printf("\n");

	return 0;
}
Exemple #6
0
	int SieveBoth::Sieve(size_t n)
	{
		cl_context context = 0;
		cl_command_queue commandQueue = 0;
		cl_program program = 0;
		cl_device_id device = 0;
		cl_kernel kernel = 0;
		cl_mem memObjects[2] = { 0, 0 };
		cl_int errNum;
		int array_size = 10;

		// Create an OpenCL context on first available platform
		context = OpenCLFuncs::CreateContext();
		if (context == NULL)
		{
			std::cerr << "Failed to create OpenCL context." << std::endl;
			system("pause");
			return 1;
		}

		// Create a command-queue on the first device available
		// on the created context
		commandQueue = OpenCLFuncs::CreateCommandQueue(context, &device);
		if (commandQueue == NULL)
		{
			OpenCLFuncs::CleanupSieve(context, commandQueue, program, kernel, memObjects);
			system("pause");
			return 2;
		}

		// Create OpenCL program from HelloWorld.cl kernel source
		program = OpenCLFuncs::CreateProgram(context, device, "Sieve.cl");
		if (program == NULL)
		{
			OpenCLFuncs::CleanupSieve(context, commandQueue, program, kernel, memObjects);
			system("pause");
			return 3;
		}

		// Create OpenCL kernel
		kernel = clCreateKernel(program, "main_kernel", NULL);
		if (kernel == NULL)
		{
			std::cerr << "Failed to create kernel" << std::endl;
			OpenCLFuncs::Cleanup(context, commandQueue, program, kernel, memObjects);
			system("pause");
			return 4;
		}

		// Create memory objects that will be used as arguments to
		// kernel.  First create host memory arrays that will be
		// used to store the arguments to the kernel
		//int result = 0;
		int limit = n;
		
		if (!OpenCLFuncs::CreateMemObjectsForSieve(context, memObjects, limit))
		{
			OpenCLFuncs::CleanupSieve(context, commandQueue, program, kernel, memObjects);
			system("pause");
			return 5;
		}

		// Set the kernel arguments (result, a, b)
		errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
		errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
		
		if (errNum != CL_SUCCESS)
		{
			std::cerr << "Error setting kernel arguments." << std::endl;
			OpenCLFuncs::CleanupSieve(context, commandQueue, program, kernel, memObjects);
			system("pause");
			return 6;
		}

		size_t globalWorkSize[1] = { 1 };
		size_t localWorkSize[1] = { 1 };

		//timer.Start();

		// Queue the kernel up for execution across the array
		errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
			globalWorkSize, localWorkSize,
			0, NULL, NULL);

		if (errNum != CL_SUCCESS)
		{
			std::cerr << "Error queuing kernel for execution." << std::endl;
			OpenCLFuncs::CleanupSieve(context, commandQueue, program, kernel, memObjects);
			system("pause");
			return 7;
		}

		int result = 0;
		//float *a = new float[array_size];
		//float *b = new float[array_size];
		
		// Read the output buffer back to the Host
		errNum = clEnqueueReadBuffer(commandQueue, memObjects[1], CL_TRUE,
			0, sizeof(int), &result,
			0, NULL, NULL);
		if (errNum != CL_SUCCESS)
		{
			std::cerr << "Error reading result buffer." << std::endl;
			OpenCLFuncs::CleanupSieve(context, commandQueue, program, kernel, memObjects);
			
			system("pause");
			return 1;
		}

		//timer.End();
		//if (timer.Diff(seconds, useconds))
		//	std::cerr << "Warning: timer returned negative difference!" << std::endl;
		//std::cout << "OpenCL ran in " << seconds << "." << useconds << " seconds" << std::endl << std::endl;

		OpenCLFuncs::CleanupSieve(context, commandQueue, program, kernel, memObjects);
		
		return result;
	}
Exemple #7
0
void run_benchmark( void *vargs, cl_context& context, cl_command_queue& commands, cl_program& program, cl_kernel& kernel ) {
  struct bench_args_t *args = (struct bench_args_t *)vargs;
  // Create device buffers
  //
  cl_mem obs_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->obs), NULL, NULL);
  cl_mem init_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->init), NULL, NULL);
  cl_mem transition_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->transition), NULL, NULL);
  cl_mem emission_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->emission), NULL, NULL);
  cl_mem path_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->path), NULL, NULL);
  if (!obs_buffer || !init_buffer || !transition_buffer || !emission_buffer || !path_buffer)
  {
    printf("Error: Failed to allocate device memory!\n");
    printf("Test failed\n");
    exit(1);
  }    

  // Write our data set into device buffers  
  //
  int err;
  err = clEnqueueWriteBuffer(commands, obs_buffer, CL_TRUE, 0, sizeof(args->obs), args->obs, 0, NULL, NULL);
  err |= clEnqueueWriteBuffer(commands, init_buffer, CL_TRUE, 0, sizeof(args->init), args->init, 0, NULL, NULL);
  err |= clEnqueueWriteBuffer(commands, transition_buffer, CL_TRUE, 0, sizeof(args->transition), args->transition, 0, NULL, NULL);
  err |= clEnqueueWriteBuffer(commands, emission_buffer, CL_TRUE, 0, sizeof(args->emission), args->emission, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
      printf("Error: Failed to write to device memory!\n");
      printf("Test failed\n");
      exit(1);
  }
    
  // Set the arguments to our compute kernel
  //
  err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &obs_buffer);
  err  |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &init_buffer);
  err  |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &transition_buffer);
  err  |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &emission_buffer);
  err  |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &path_buffer);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to set kernel arguments! %d\n", err);
    printf("Test failed\n");
    exit(1);
  }

  // Execute the kernel over the entire range of our 1d input data set
  // using the maximum number of work group items for this device
  //

#ifdef C_KERNEL
  err = clEnqueueTask(commands, kernel, 0, NULL, NULL);
#else
  printf("Error: OpenCL kernel is not currently supported!\n");
  exit(1);
#endif
  if (err)
  {
    printf("Error: Failed to execute kernel! %d\n", err);
    printf("Test failed\n");
    exit(1);
  }

  // Read back the results from the device to verify the output
  //
  err = clEnqueueReadBuffer( commands, path_buffer, CL_TRUE, 0, sizeof(args->path), args->path, 0, NULL, NULL );  
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to read output array! %d\n", err);
    printf("Test failed\n");
    exit(1);
  }
}
int 
ScanLargeArrays::runCLKernels(void)
{
    cl_int status;
    cl_int eventStatus = CL_QUEUED;

    cl_event writeEvt;
    // Enqueue write to seedsBuf
    status = clEnqueueWriteBuffer(commandQueue,
                                  inputBuffer,
                                  CL_FALSE,
                                  0,
                                  length * sizeof(cl_float),
                                  input,
                                  0,
                                  NULL,
                                  &writeEvt);
    CHECK_OPENCL_ERROR(status,"clEnqueueWriteBuffer failed.(inputBuffer)");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status,"clFlush failed.");

    status = sampleCommon->waitForEventAndRelease(&writeEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(writeEvt) Failed");

    // Do block-wise sum
    if(bScan(length, &inputBuffer, &outputBuffer[0], &blockSumBuffer[0]))
        return SDK_FAILURE;

    for(int i = 1; i < (int)pass; i++)
    {
        if(bScan((cl_uint)(length / pow((float)blockSize, (float)i)), 
            &blockSumBuffer[i - 1],
            &outputBuffer[i],
            &blockSumBuffer[i]))
        {
            return SDK_FAILURE;
        }
    }

    int tempLength = (int)(length / pow((float)blockSize, (float)pass));

    // Do scan to tempBuffer
    if(pScan(tempLength, &blockSumBuffer[pass - 1], &tempBuffer))
        return SDK_FAILURE;

    // Do block-addition on outputBuffers
    if(bAddition((cl_uint)(length / pow((float)blockSize, (float)(pass - 1))),
        &tempBuffer, &outputBuffer[pass - 1]))
    {
        return SDK_FAILURE;
    }

    for(int i = pass - 1; i > 0; i--)
    {
        if(bAddition((cl_uint)(length / pow((float)blockSize, (float)(i - 1))),
            &outputBuffer[i], &outputBuffer[i - 1]))
        {
            return SDK_FAILURE;
        }
    }

    cl_event readEvt;
    // Enqueue the results to application pointe
    status = clEnqueueReadBuffer(commandQueue,
                                 outputBuffer[0],
                                 CL_FALSE,
                                 0,
                                 length * sizeof(cl_float),
                                 output,
                                 0,
                                 NULL,
                                 &readEvt);
    CHECK_OPENCL_ERROR(status,"clEnqueueReadBuffer failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status,"clFlush failed.(commandQueue)");

    status = sampleCommon->waitForEventAndRelease(&readEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(readEvt) Failed");

    return SDK_SUCCESS;
}
Exemple #9
0
/**
 * @brief Main principal
 * @param argc El número de argumentos del programa
 * @param argv Cadenas de argumentos del programa
 * @return Nada si es correcto o algún número negativo si es incorrecto
 */
int main( int argc, char** argv ) {

	if(argc != 2)
		return -1;

	// Medimos tiempo para el programa
	const double start_time = getCurrentTimestamp();

	FILE *kernels;
	char *source_str;
	size_t source_size, work_items;

	// OpenCL runtime configuration
	unsigned num_devices;
	cl_platform_id platform_ids[3];
	cl_uint ret_num_platforms;
	cl_device_id device_id;
	cl_context context = NULL;
	cl_command_queue command_queue;
	cl_program program = NULL;
	cl_int ret;
	cl_kernel kernelINIT;
	cl_event kernel_event, finish_event;
	cl_mem objPARTICULAS;

	// Abrimos el fichero que contiene el kernel
	fopen_s(&kernels, "initparticulasCPU.cl", "r");
	if (!kernels) {
		fprintf(stderr, "Fallo al cargar el kernel\n");
		exit(-1);
	}	
	source_str = (char *) malloc(0x100000);
	source_size = fread(source_str, 1, 0x100000, kernels);
	fclose(kernels);

	// Obtenemos los IDs de las plataformas disponibles
	if( clGetPlatformIDs(3, platform_ids, &ret_num_platforms) != CL_SUCCESS) {
		printf("No se puede obtener id de la plataforma");
		return -1;
	}

	// Intentamos obtener un dispositivo CPU soportado
	if( clGetDeviceIDs(platform_ids[1], CL_DEVICE_TYPE_CPU, 1, &device_id, &num_devices) != CL_SUCCESS) {
		printf("No se puede obtener id del dispositivo");
		return -1;
	}
	clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &work_items, NULL);
 
	// Creación de un contexto OpenCL
	context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
 
	// Creación de una cola de comandos
	command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret);

	// Creación de un programa kernel desde un fichero de código
	program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
	ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
	if (ret != CL_SUCCESS) {
		size_t len;
		char buffer[2048];
		printf("Error: ¡Fallo al construir el programa ejecutable!\n");
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
		printf("%s", buffer);
		exit(-1);
	}

	// Creación del kernel OpenCL
	kernelINIT = clCreateKernel(program, "calc_particles_init", &ret);

	// Creamos el buffer para las partículas y reservamos espacio ALINEADO para los datos
	size_t N = atoi(argv[1]);
	particle *particulas = (particle*) _aligned_malloc(N * sizeof(particle), 64);
	objPARTICULAS = clCreateBuffer(context, CL_MEM_WRITE_ONLY, N * sizeof(particle), NULL, &ret);
	const size_t global = 4;
	const size_t local_work_size = 1;

	// Transferimos el frame al dispositivo
	cl_event write_event;
	ret = clEnqueueWriteBuffer(command_queue, objPARTICULAS, CL_FALSE, 0, N * sizeof(particle), particulas, 0, NULL, &write_event);

	// Establecemos los argumentos del kernel
	ret = clSetKernelArg(kernelINIT, 0, sizeof(cl_mem), &objPARTICULAS);
	ret = clSetKernelArg(kernelINIT, 1, sizeof(int), &N);

	// Ejecutamos el kernel. Un work-item por cada work-group o unidad de cómputo
	ret = clEnqueueNDRangeKernel(command_queue, kernelINIT, 1, NULL, &global, &local_work_size, 1, &write_event, &kernel_event);

	// Leemos los resultados
	ret = clEnqueueReadBuffer(command_queue, objPARTICULAS, CL_FALSE, 0, N * sizeof(particle), particulas, 1, &kernel_event, &finish_event);
	
	// Esperamos a que termine de leer los resultados
	clWaitForEvents(1, &finish_event);

	// Obtenemos el tiempo del kernel y de las transferencias CPU-RAM
	cl_ulong totalKernel = getStartEndTime(kernel_event);
	cl_ulong totalRam = getStartEndTime(write_event) + getStartEndTime(finish_event);

	const double end_time = getCurrentTimestamp();

	// Obtenemos el tiempo consumido por el programa, el kernel y las transferencias de memoria
	printf("\nTiempo total del programa: %0.3f ms\n", (end_time - start_time) * 1e3);
	printf("Tiempo total consumido por el kernel: %0.3f ms\n", double(totalKernel) * 1e-6);
	printf("Tiempo total consumido en transferencias CPU-RAM: %0.3f ms\n", double(totalRam) * 1e-6);

	// Liberamos todos los recursos usados (kernels y objetos OpenCL)
	clReleaseEvent(kernel_event);
	clReleaseEvent(finish_event);
	clReleaseEvent(write_event);
	clReleaseMemObject(objPARTICULAS);
	clReleaseKernel(kernelINIT);
	clReleaseCommandQueue(command_queue);
	clReleaseProgram(program);
	clReleaseContext(context);
}
int 
SimpleConvolution::runCLKernels(void)
{
    cl_int   status;
    cl_event events[2];

    status = this->setWorkGroupSize();
    CHECK_ERROR(status, SDK_SUCCESS, "setWorkGroupSize() failed");

    // Set appropriate arguments to the kernel
    status = clSetKernelArg(
                    kernel, 
                    0, 
                    sizeof(cl_mem), 
                    (void *)&outputBuffer);
    CHECK_OPENCL_ERROR( status, "clSetKernelArg failed. (outputBuffer)");

    status = clSetKernelArg(
                    kernel, 
                    1, 
                    sizeof(cl_mem), 
                    (void *)&inputBuffer);
    CHECK_OPENCL_ERROR( status, "clSetKernelArg failed. (inputBuffer)");

    status = clSetKernelArg(
                    kernel, 
                    2, 
                    sizeof(cl_mem), 
                    (void *)&maskBuffer);
    CHECK_OPENCL_ERROR( status, "clSetKernelArg failed. (maskBuffer)");

    cl_uint2 inputDimensions = {width, height};
    cl_uint2 maskDimensions  = {maskWidth, maskHeight};

    status = clSetKernelArg(
                    kernel, 
                    3, 
                    sizeof(cl_uint2), 
                    (void *)&inputDimensions);
    CHECK_OPENCL_ERROR( status, "clSetKernelArg failed. (inputDimensions)");

    status = clSetKernelArg(
                    kernel, 
                    4, 
                    sizeof(cl_uint2), 
                    (void *)&maskDimensions);
    CHECK_OPENCL_ERROR( status, "clSetKernelArg failed. (maskDimensions)");

    // Enqueue a kernel run call.
    status = clEnqueueNDRangeKernel(
            commandQueue,
            kernel,
            1,
            NULL,
            globalThreads,
            localThreads,
            0,
            NULL,
            &events[0]);
    CHECK_OPENCL_ERROR( status, "clEnqueueNDRangeKernel failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status,"clFlush() failed"); 

    status = sampleCommon->waitForEventAndRelease(&events[0]);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(events[0]) Failed");
    
    // Enqueue readBuffer
    status = clEnqueueReadBuffer(
                commandQueue,
                outputBuffer,
                CL_TRUE,
                0,
                width * height * sizeof(cl_uint),
                output,
                0,
                NULL,
                &events[1]);
    CHECK_OPENCL_ERROR( status, "clEnqueueReadBuffer failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status,"clFlush() failed"); 

    status = sampleCommon->waitForEventAndRelease(&events[1]);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(events[1]) Failed");
    return SDK_SUCCESS;
}
int main(int argc, char *argv[])
{
    //FILE *fp;

    cl_platform_id      platform_id[2];
    cl_uint             ret_num_devices;
    cl_uint             ret_num_platforms;
    cl_int              ret_code;

    cl_mem              image_in_mem = NULL;
    cl_mem              image_out_mem = NULL;
    cl_mem              twiddle_factors_mem = NULL;
    cl_float2           *image_in_host;
    cl_float2           *twiddle_factors_host;

    cl_kernel           kernel_twiddle_factors;
    cl_kernel           kernel_matriz_transpose;
    cl_kernel           kernel_lowpass_filter;

    pgm_t ipgm;
    pgm_t opgm;

    image_file_t        *image_filename;
    char                *output_filename;
    FILE                *fp;
    const char          *kernel_filename = C_NOME_ARQ_KERNEL;
    size_t              source_size;
    char                *source_str;
    cl_int              i, j,n ,m;
    cl_int              raio = 0;
    size_t              global_wg[2];
    size_t              local_wg[2];
    float               *image_amplitudes;
    size_t              log_size;
    char                *log_file;

    cl_event            kernels_events_out_fft[4];

    cl_ulong            kernel_runtime              = (cl_ulong) 0;
    cl_ulong            kernel_start_time           = (cl_ulong) 0;
    cl_ulong            kernel_end_time             = (cl_ulong) 0;

    cl_event            write_host_dev_event;
    cl_ulong            write_host_dev_start_time   = (cl_ulong) 0;
    cl_ulong            write_host_dev_end_time     = (cl_ulong) 0;
    cl_ulong            write_host_dev_run_time     = (cl_ulong) 0;

    cl_event            read_dev_host_event;
    cl_ulong            read_dev_host_start_time    = (cl_ulong) 0;
    cl_ulong            read_dev_host_end_time      = (cl_ulong) 0;
    cl_ulong            read_dev_host_run_time      = (cl_ulong) 0;

    unsigned __int64    image_tam;
    unsigned __int64    MEGA_BYTES   =  1048576; // 1024*1024
    double              image_tam_MB;
    double              tempo_total;

    struct event_in_fft_t *fft_events;


   //=== Timer count start ==============================================================================
    timer_reset();
    timer_start();
    //===================================================================================================

    if (argc < 2) {
        printf("**Erro: O arquivo de entrada eh necessario.\n");
        exit(EXIT_FAILURE);
    }

    image_filename = (image_file_t *) malloc(sizeof(image_file_t));
    split_image_filename(image_filename, argv[1]);
    output_filename = (char *) malloc(40*sizeof(char));
    sprintf(output_filename, "%d.%d.%s.%s.%s", image_filename->res, image_filename->num, ENV_TYPE, APP_TYPE, EXTENSAO);

    fp = fopen(kernel_filename, "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(EXIT_FAILURE);
    }

    source_str = (char *)malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose( fp );

    //===================================================================================================
     /* Abrindo imagem do arquivo para objeto de memoria local*/
    if( ler_pgm(&ipgm, argv[1]) == -1)
        exit(EXIT_FAILURE);

    n = ipgm.width;
    raio = n/8;
    m = (cl_int)(log((double)n)/log(2.0));

    image_in_host = (cl_float2 *)malloc((n*n)*sizeof(cl_float2));
    twiddle_factors_host = (cl_float2 *)malloc(n / 2 * sizeof(cl_float2));

    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            image_in_host[n*i + j].s[0] = (float)ipgm.buf[n*i + j];
            image_in_host[n*i + j].s[1] = (float)0;
        }
    }

    fft_events = (struct event_in_fft_t *)malloc(MAX_CALL_FFT*sizeof(struct event_in_fft_t));

    kernel_butter_events = (cl_event *)malloc(MAX_CALL_FFT*m*sizeof(cl_event));

    //===================================================================================================
    CL_CHECK(clGetPlatformIDs(MAX_PLATFORM_ID, platform_id, &ret_num_platforms));

    if (ret_num_platforms == 0 ) {
        fprintf(stderr,"[Erro] Não existem plataformas OpenCL\n");
        exit(2);
    }

    //===================================================================================================

    CL_CHECK(clGetDeviceIDs( platform_id[0], CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices));
    //print_platform_info(&platform_id[1]);

    //===================================================================================================
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret_code);
    //===================================================================================================

    cmd_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret_code);
    //===================================================================================================

    image_in_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret_code);
    image_out_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret_code);
    twiddle_factors_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, (n/2)*sizeof(cl_float2), NULL, &ret_code);
    //===================================================================================================

    /* Transfer data to memory buffer */
    CL_CHECK(clEnqueueWriteBuffer(cmd_queue, image_in_mem, CL_TRUE, 0, n*n*sizeof(cl_float2), image_in_host, 0, NULL, &write_host_dev_event));

    image_tam = n*n*sizeof(cl_float2);

    //===================================================================================================
    program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret_code);
    //===================================================================================================
    ret_code = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    //===================================================================================================
    if (ret_code != CL_SUCCESS) {
    // Determine the size of the log
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
    //===================================================================================================

    // Allocate memory for the log
    log_file = (char *) malloc(log_size);

    // Get the log
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log_file, NULL);
    printf("%s\n", log_file);
    system("pause");
    exit(0);
}
    kernel_twiddle_factors = clCreateKernel(program, "twiddle_factors", &ret_code);
    kernel_matriz_transpose = clCreateKernel(program, "matrix_trasponse", &ret_code);
    kernel_lowpass_filter  = clCreateKernel(program, "lowpass_filter", &ret_code);

    /* Processa os fatores Wn*/
    //===================================================================================================
    CL_CHECK(clSetKernelArg(kernel_twiddle_factors, 0, sizeof(cl_mem), (void *)&twiddle_factors_mem));
    CL_CHECK(clSetKernelArg(kernel_twiddle_factors, 1, sizeof(cl_int), (void *)&n));
    config_workgroup_size(global_wg, local_wg, n/2, 1);
    CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_twiddle_factors, 1, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[0]));

    //===================================================================================================
    /* Executa a FFT em N/2 */
    fft_main(image_out_mem, image_in_mem, twiddle_factors_mem, m, direta, &fft_events[0]);

    //===================================================================================================
    /* Realiza a transposta da Matriz (imagem) */
    CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 0, sizeof(cl_mem), (void *)&image_in_mem));
    CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 1, sizeof(cl_mem), (void *)&image_out_mem));
    CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 2, sizeof(cl_int), (void *)&n));
    config_workgroup_size(global_wg, local_wg, n, n);
    CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_matriz_transpose, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[1]));

    //===================================================================================================
    /* Executa a FFT N/2 */
    fft_main(image_out_mem, image_in_mem, twiddle_factors_mem, m, direta, &fft_events[1]);

    //===================================================================================================
    /* Processa o filtro passa baixa */
    CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 0, sizeof(cl_mem), (void *)&image_out_mem));
    CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 1, sizeof(cl_int), (void *)&n));
    CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 2, sizeof(cl_int), (void *)&raio));
    config_workgroup_size(global_wg, local_wg, n, n);
    CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_lowpass_filter, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[2]));

    //===================================================================================================
    /* Obtem a FFT inversa*/
    fft_main(image_in_mem, image_out_mem, twiddle_factors_mem, m, inversa, &fft_events[2]);
    //===================================================================================================

    /* Realiza a transposta da Matriz (imagem) */
    CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 0, sizeof(cl_mem), (void *)&image_out_mem));
    CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 1, sizeof(cl_mem), (void *)&image_in_mem));
    CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 2, sizeof(cl_int), (void *)&n));
    config_workgroup_size(global_wg, local_wg, n, n);
    CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_matriz_transpose, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[3]));

    //===================================================================================================
    fft_main(image_in_mem, image_out_mem, twiddle_factors_mem, m, inversa, &fft_events[3]);
    //===================================================================================================

    CL_CHECK(clEnqueueReadBuffer(cmd_queue, image_in_mem, CL_TRUE, 0, n*n*sizeof(cl_float2), image_in_host, 0, NULL, &read_dev_host_event));
    //===================================================================================================

    //== Total time elapsed ============================================================================
    timer_stop();
    tempo_total = get_elapsed_time();
    //==================================================================================================

    //====== Get time of Profile Info ==================================================================
    // Write data time
    CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &write_host_dev_start_time, NULL));
    CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &write_host_dev_end_time, NULL));
    // Read data time
    CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &read_dev_host_start_time, NULL));
    CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &read_dev_host_end_time, NULL));

    for (i = 0; i < MAX_CALL_FFT; i++) {

        kernel_start_time = (cl_long) 0;
        kernel_end_time = (cl_long) 0;
        CL_CHECK(clGetEventProfilingInfo(kernels_events_out_fft[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL));
        CL_CHECK(clGetEventProfilingInfo(kernels_events_out_fft[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL));
        kernel_runtime += (kernel_end_time - kernel_start_time);

        kernel_start_time = (cl_long) 0;
        kernel_end_time = (cl_long) 0;
        CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_bitsrev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL));
        CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_bitsrev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL));
        kernel_runtime += (kernel_end_time - kernel_start_time);

        kernel_start_time = (cl_long) 0;
        kernel_end_time = (cl_long) 0;

        if (fft_events[i].kernel_normalize != NULL) {
            CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_normalize, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL));
            CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_normalize, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL));
            kernel_runtime += (kernel_end_time - kernel_start_time);
        }
   }

    for (j=0; j < MAX_CALL_FFT*m; j++){
        kernel_start_time = (cl_long) 0;
        kernel_end_time = (cl_long) 0;

        CL_CHECK(clGetEventProfilingInfo(kernel_butter_events[j], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL));
        CL_CHECK(clGetEventProfilingInfo(kernel_butter_events[j], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL));
        kernel_runtime += (kernel_end_time - kernel_start_time);
    }

    write_host_dev_run_time = write_host_dev_end_time - write_host_dev_start_time;
    read_dev_host_run_time =  read_dev_host_end_time -  read_dev_host_start_time;

/* save_log_debug(write_host_dev_run_time,fp);
    save_log_debug(read_dev_host_run_time,fp);
    close_log_debug(fp); */

    image_tam_MB = (double) (((double) image_tam)/(double) MEGA_BYTES);

    //==================================================================================================
    save_log_gpu(image_filename, kernel_runtime, (double) (image_tam_MB/( (double) read_dev_host_run_time/(double) NANOSECONDS)),
    (double) (image_tam_MB/ ((double) write_host_dev_run_time/ (double) NANOSECONDS)), tempo_total, LOG_NAME);

    //===================================================================================================
    image_amplitudes = (float*)malloc(n*n*sizeof(float));
    for (i=0; i < n; i++) {
        for (j=0; j < n; j++) {
            image_amplitudes[n*j + i] = (float) (AMP(((float*)image_in_host)[(2*n*j)+2*i], ((float*)image_in_host)[(2*n*j)+2*i+1]));
        }
    }

    //clFlush(cmd_queue);
    //clFinish(cmd_queue);
    opgm.width = n;
    opgm.height = n;

    normalizar_pgm(&opgm, image_amplitudes);
    escrever_pgm(&opgm, output_filename);

    //===================================================================================================
	clFinish(cmd_queue);
    clReleaseKernel(kernel_twiddle_factors);
    clReleaseKernel(kernel_matriz_transpose);
    clReleaseKernel(kernel_lowpass_filter);
    clReleaseProgram(program);
    clReleaseMemObject(image_in_mem);
    clReleaseMemObject(image_out_mem);
    clReleaseMemObject(twiddle_factors_mem);
    clReleaseCommandQueue(cmd_queue);
    clReleaseContext(context);
	clReleaseEvent(read_dev_host_event);
	clReleaseEvent(write_host_dev_event);
	clReleaseEvent(kernels_events_out_fft[0]);
	clReleaseEvent(kernels_events_out_fft[1]);
	clReleaseEvent(kernels_events_out_fft[2]);
	clReleaseEvent(kernels_events_out_fft[3]);
    destruir_pgm(&ipgm);
    destruir_pgm(&opgm);
    free(image_amplitudes);
    free(source_str);
    free(image_in_host);
    free(image_filename);
    free(twiddle_factors_host);
    free(output_filename);
    free(fft_events);
    free(kernel_butter_events);

    //_CrtDumpMemoryLeaks();

    return 0;
}
///
//	main() for Convoloution example
//
int main(int argc, char** argv)
{
    cl_int errNum;
    cl_uint numPlatforms;
	cl_uint numDevices;
    cl_platform_id * platformIDs;
	cl_device_id * deviceIDs;
    cl_context context = NULL;
	cl_command_queue queue;
	cl_program program;
	cl_kernel kernel;
	cl_mem inputSignalBuffer;
	cl_mem outputSignalBuffer;
	cl_mem maskBuffer;

    // First, select an OpenCL platform to run on.  
	errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
	checkErr( 
		(errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), 
		"clGetPlatformIDs"); 
 
	platformIDs = (cl_platform_id *)alloca(
       		sizeof(cl_platform_id) * numPlatforms);

    errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
    checkErr( 
	   (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), 
	   "clGetPlatformIDs");

	// Iterate through the list of platforms until we find one that supports
	// a CPU device, otherwise fail with an error.
	deviceIDs = NULL;
	cl_uint i;
	for (i = 0; i < numPlatforms; i++)
	{
		errNum = clGetDeviceIDs(
            platformIDs[i], 
            CL_DEVICE_TYPE_CPU, 
            0,
            NULL,
            &numDevices);
		if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND)
	    {
			checkErr(errNum, "clGetDeviceIDs");
        }
	    else if (numDevices > 0) 
		{
		   	deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices);
			errNum = clGetDeviceIDs(
				platformIDs[i],
				CL_DEVICE_TYPE_CPU,
				numDevices, 
				&deviceIDs[0], 
				NULL);
			checkErr(errNum, "clGetDeviceIDs");
			break;
	   }
	}

	// Check to see if we found at least one CPU device, otherwise return
	if (deviceIDs == NULL) {
		std::cout << "No CPU device found" << std::endl;
		exit(-1);
	}

    // Next, create an OpenCL context on the selected platform.  
    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platformIDs[i],
        0
    };
    context = clCreateContext(
		contextProperties, 
		numDevices,
        deviceIDs, 
		&contextCallback,
		NULL, 
		&errNum);
	checkErr(errNum, "clCreateContext");

	std::ifstream srcFile("../convolution/Convolution.cl");
    checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading Convolution.cl");

	std::string srcProg(
        std::istreambuf_iterator<char>(srcFile),
        (std::istreambuf_iterator<char>()));

	const char * src = srcProg.c_str();
	size_t length = srcProg.length();

	// Create program from source
	program = clCreateProgramWithSource(
		context, 
		1, 
		&src, 
		&length, 
		&errNum);
	checkErr(errNum, "clCreateProgramWithSource");

	// Build program
	errNum = clBuildProgram(
		program,
		numDevices,
		deviceIDs,
		NULL,
		NULL,
		NULL);
    if (errNum != CL_SUCCESS)
    {
        // Determine the reason for the error
        char buildLog[16384];
        clGetProgramBuildInfo(
			program, 
			deviceIDs[0], 
			CL_PROGRAM_BUILD_LOG,
            sizeof(buildLog), 
			buildLog, 
			NULL);

        std::cerr << "Error in kernel: " << std::endl;
        std::cerr << buildLog;
		checkErr(errNum, "clBuildProgram");
    }

	// Create kernel object
	kernel = clCreateKernel(
		program,
		"convolve",
		&errNum);
	checkErr(errNum, "clCreateKernel");

	// Now allocate buffers
	inputSignalBuffer = clCreateBuffer(
		context,
		CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
		sizeof(cl_uint) * inputSignalHeight * inputSignalWidth,
		static_cast<void *>(inputSignal),
		&errNum);
	checkErr(errNum, "clCreateBuffer(inputSignal)");

	maskBuffer = clCreateBuffer(
		context,
		CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
		sizeof(cl_uint) * maskHeight * maskWidth,
		static_cast<void *>(mask),
		&errNum);
	checkErr(errNum, "clCreateBuffer(mask)");

	outputSignalBuffer = clCreateBuffer(
		context,
		CL_MEM_WRITE_ONLY,
		sizeof(cl_uint) * outputSignalHeight * outputSignalWidth,
		NULL,
		&errNum);
	checkErr(errNum, "clCreateBuffer(outputSignal)");

	// Pick the first device and create command queue.
	queue = clCreateCommandQueue(
		context,
		deviceIDs[0],
		0,
		&errNum);
	checkErr(errNum, "clCreateCommandQueue");

    errNum  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputSignalBuffer);
	errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &maskBuffer);
    errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &outputSignalBuffer);
	errNum |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &inputSignalWidth);
	errNum |= clSetKernelArg(kernel, 4, sizeof(cl_uint), &maskWidth);
	checkErr(errNum, "clSetKernelArg");

	const size_t globalWorkSize[1] = { outputSignalWidth * outputSignalHeight };
    const size_t localWorkSize[1]  = { 1 };

    // Queue the kernel up for execution across the array
    errNum = clEnqueueNDRangeKernel(
		queue, 
		kernel, 
		1, 
		NULL,
        globalWorkSize, 
		localWorkSize,
        0, 
		NULL, 
		NULL);
	checkErr(errNum, "clEnqueueNDRangeKernel");
    
	errNum = clEnqueueReadBuffer(
		queue, 
		outputSignalBuffer, 
		CL_TRUE,
        0, 
		sizeof(cl_uint) * outputSignalHeight * outputSignalHeight, 
		outputSignal,
        0, 
		NULL, 
		NULL);
	checkErr(errNum, "clEnqueueReadBuffer");

    // Output the result buffer
    for (int y = 0; y < outputSignalHeight; y++)
	{
		for (int x = 0; x < outputSignalWidth; x++)
		{
			std::cout << outputSignal[x][y] << " ";
		}
		std::cout << std::endl;
	}

    std::cout << std::endl << "Executed program succesfully." << std::endl;

	return 0;
}
Exemple #13
0
double gpu_cgm_image(uint32_t* aList, uint32_t* bList, int aLength,
		int bLength, int keyLength, uint32_t** matches, char* clFile, int x,
		int y) {
	int gap = 0, myoffset = 0;
	cl_platform_id *platforms;
	cl_uint num_platforms = 0;
	cl_device_id *devices;
	cl_uint num_devices = 0;
	cl_context context;
	cl_command_queue command_queue;
	cl_image_format imgFormat;
	cl_mem aImg;
	cl_mem bImg;
	cl_mem res_buf;
	cl_program program;
	cl_kernel kernel;
	cl_uint *results;
	FILE *prgm_fptr;
	struct stat prgm_sbuf;
	char *prgm_data;
	size_t prgm_size;
	size_t offset;
	size_t count;
	const size_t global_work_size[] = { x, y };
	const size_t origin[] = { 0, 0, 0 };
	const size_t region[] = { aLength, 1, 1 };

	cl_int ret;
	cl_uint i;

	cl_bool imageSupport;

	struct timeval t1, t2;
	double elapsedTime;

	results = malloc(sizeof(cl_uint) * aLength);

	imgFormat.image_channel_order = CL_RGBA;
	imgFormat.image_channel_data_type = CL_UNSIGNED_INT32;

	/* figure out how many CL platforms are available */
	ret = clGetPlatformIDs(0, NULL, &num_platforms);
	if (CL_SUCCESS != ret) {
		print_error ("Error getting the number of platform IDs: %d", ret);
		exit(EXIT_FAILURE);
	}

	if (0 == num_platforms) {
		print_error ("No CL platforms were found.");
		exit(EXIT_FAILURE);
	}

	/* allocate space for each available platform ID */
	if (NULL == (platforms = malloc((sizeof *platforms) * num_platforms))) {
		print_error ("Out of memory");
		exit(EXIT_FAILURE);
	}

	/* get all of the platform IDs */
	ret = clGetPlatformIDs(num_platforms, platforms, NULL);
	if (CL_SUCCESS != ret) {
		print_error ("Error getting platform IDs: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* find a platform that supports given device type */
	//	print_error ("Number of platforms found: %d", num_platforms);
	for (i = 0; i < num_platforms; i++) {
		ret = clGetDeviceIDs(platforms[i], getDeviceType(), 0, NULL,
				&num_devices);
		if (CL_SUCCESS != ret)
			continue;

		if (0 < num_devices)
			break;
	}

	/* make sure at least one device was found */
	if (num_devices == 0) {
		print_error ("No CL device found that supports device type: %s.", ((getDeviceType() == CL_DEVICE_TYPE_CPU) ? "CPU" : "GPU"));
		exit(EXIT_FAILURE);
	}

	/* only one device is necessary... */
	num_devices = 1;
	if (NULL == (devices = malloc((sizeof *devices) * num_devices))) {
		print_error ("Out of memory");
		exit(EXIT_FAILURE);
	}

	/* get one device id */
	ret = clGetDeviceIDs(platforms[i], getDeviceType(), num_devices,
			devices, NULL);
	if (CL_SUCCESS != ret) {
		print_error ("Error getting device IDs: %d", ret);
		exit(EXIT_FAILURE);
	}

	ret = clGetDeviceInfo(*devices, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &imageSupport, NULL);
	if (CL_SUCCESS != ret) {
			print_error ("Failed to get Device Info: %d", ret);
			exit(EXIT_FAILURE);
		}

	if(imageSupport == CL_FALSE)
	{
		print_error ("Failure: Images are not supported!");
				exit(EXIT_FAILURE);
	}

	/* create a context for the CPU device that was found earlier */
	context = clCreateContext(NULL, num_devices, devices, NULL, NULL, &ret);
	if (NULL == context || CL_SUCCESS != ret) {
		print_error ("Failed to create context: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* create a command queue for the CPU device */
	command_queue = clCreateCommandQueue(context, devices[0], 0, &ret);
	if (NULL == command_queue || CL_SUCCESS != ret) {
		print_error ("Failed to create a command queue: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* create buffers on the CL device */
	aImg = clCreateImage2D(context, CL_MEM_READ_ONLY, &imgFormat, aLength, 1, 0, NULL, &ret);
	if (NULL == aImg || CL_SUCCESS != ret) {
		print_error ("Failed to create a image: %d", ret);
		exit(EXIT_FAILURE);
	}

	bImg = clCreateImage2D(context, CL_MEM_READ_ONLY, &imgFormat, aLength, 1, 0, NULL, &ret);
	if (NULL == bImg || CL_SUCCESS != ret) {
		print_error ("Failed to create b image: %d", ret);
		exit(EXIT_FAILURE);
	}

	int res_bufSize = aLength;

	res_buf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint)
			* res_bufSize, NULL, &ret);
	if (NULL == res_buf || CL_SUCCESS != ret) {
		print_error ("Failed to create b buffer: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* read the opencl program code into a string */
	prgm_fptr = fopen(clFile, "r");
	if (NULL == prgm_fptr) {
		print_error ("%s", strerror (errno));
		exit(EXIT_FAILURE);
	}

	if (0 != stat(clFile, &prgm_sbuf)) {
		print_error ("%s", strerror (errno));
		exit(EXIT_FAILURE);
	}
	prgm_size = prgm_sbuf.st_size;

	prgm_data = malloc(prgm_size);
	if (NULL == prgm_data) {
		print_error ("Out of memory");
		exit(EXIT_FAILURE);
	}

	/* make sure all data is read from the file (just in case fread returns
	 * short) */
	offset = 0;
	while (prgm_size - offset != (count = fread(prgm_data + offset, 1,
			prgm_size - offset, prgm_fptr)))
		offset += count;

	if (0 != fclose(prgm_fptr)) {
		print_error ("%s", strerror (errno));
		exit(EXIT_FAILURE);
	}

	/* create a 'program' from the source */
	program = clCreateProgramWithSource(context, 1, (const char **) &prgm_data,
			&prgm_size, &ret);
	if (NULL == program || CL_SUCCESS != ret) {
		print_error ("Failed to create program with source: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* compile the program.. (it uses llvm or something) */
	ret = clBuildProgram(program, num_devices, devices, NULL, NULL, NULL);
	if (CL_SUCCESS != ret) {
		size_t size;
		char *log = calloc(1, 4000);
		if (NULL == log) {
			print_error ("Out of memory");
			exit(EXIT_FAILURE);
		}

		print_error ("Failed to build program: %d", ret);
		ret = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG,
				4096, log, &size);
		if (CL_SUCCESS != ret) {
			print_error ("Failed to get program build info: %d", ret);
			exit(EXIT_FAILURE);
		}

		fprintf(stderr, "Begin log:\n%s\nEnd log.\n", log);
		exit(EXIT_FAILURE);
	}

	/* pull out a reference to your kernel */
	kernel = clCreateKernel(program, "cgm_kernel", &ret);
	if (NULL == kernel || CL_SUCCESS != ret) {
		print_error ("Failed to create kernel: %d", ret);
		exit(EXIT_FAILURE);
	}

	gettimeofday(&t1, NULL);

	/* write data to these buffers */
	clEnqueueWriteImage(command_queue, aImg, CL_FALSE, origin, region, 0, 0,
			(void*) aImg, 0, NULL, NULL);
	clEnqueueWriteImage(command_queue, bImg, CL_FALSE, origin, region, 0, 0,
			(void*) bImg, 0, NULL, NULL);

	/* set your kernel's arguments */
	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &aImg);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to set kernel argument: %d", ret);
		exit(EXIT_FAILURE);
	}
	ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bImg);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to set kernel argument: %d", ret);
		exit(EXIT_FAILURE);
	}

	ret = clSetKernelArg(kernel, 4, sizeof(int), &gap);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to set kernel argument: %d", ret);
		exit(EXIT_FAILURE);
	}
	ret = clSetKernelArg(kernel, 5, sizeof(int), &myoffset);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to set kernel argument: %d", ret);
		exit(EXIT_FAILURE);
	}

	ret = clSetKernelArg(kernel, 6, sizeof(int), &keyLength);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to set kernel argument: %d", ret);
		exit(EXIT_FAILURE);
	}
	ret = clSetKernelArg(kernel, 7, sizeof(cl_mem), &res_buf);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to set kernel argument: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* make sure buffers have been written before executing */
	ret = clEnqueueBarrier(command_queue);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to enqueue barrier: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* enque this kernel for execution... */
	ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL,
			global_work_size, NULL, 0, NULL, NULL);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to enqueue kernel: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* wait for the kernel to finish executing */
	ret = clEnqueueBarrier(command_queue);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to enqueue barrier: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* copy the contents of dev_buf from the CL device to the host (CPU) */
	ret = clEnqueueReadBuffer(command_queue, res_buf, true, 0, sizeof(cl_uint)
			* aLength, results, 0, NULL, NULL);

	gettimeofday(&t2, NULL);
	elapsedTime = (t2.tv_sec - t1.tv_sec) * 1000.0; // sec to ms
	elapsedTime += (t2.tv_usec - t1.tv_usec) / 1000.0; // us to ms

	if (CL_SUCCESS != ret) {
		print_error ("Failed to copy data from device to host: %d", ret);
		exit(EXIT_FAILURE);
	}

	ret = clEnqueueBarrier(command_queue);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to enqueue barrier: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* make sure the content of the buffer are what we expect */
	//for (i = 0; i < aLength; i++)
	//	printf("%d\n", results[i]);

	/* free up resources */
	ret = clReleaseKernel(kernel);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to release kernel: %d", ret);
		exit(EXIT_FAILURE);
	}

	ret = clReleaseProgram(program);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to release program: %d", ret);
		exit(EXIT_FAILURE);
	}

	ret = clReleaseMemObject(aImg);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to release memory object: %d", ret);
		exit(EXIT_FAILURE);
	}
	ret = clReleaseMemObject(bImg);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to release memory object: %d", ret);
		exit(EXIT_FAILURE);
	}

	ret = clReleaseMemObject(res_buf);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to release memory object: %d", ret);
		exit(EXIT_FAILURE);
	}

	if (CL_SUCCESS != (ret = clReleaseCommandQueue(command_queue))) {
		print_error ("Failed to release command queue: %d", ret);
		exit(EXIT_FAILURE);
	}

	if (CL_SUCCESS != (ret = clReleaseContext(context))) {
		print_error ("Failed to release context: %d", ret);
		exit(EXIT_FAILURE);
	}

	matches = &results;
	return elapsedTime;
}
static void
clrpc_client_test2(void)
{
	int err;

	int size = 1024;

	cl_uint nplatforms = 0;
	cl_platform_id* platforms = 0;
	cl_uint nplatforms_ret;

	clGetPlatformIDs(nplatforms,platforms,&nplatforms_ret);	

	printf(  "after call one i get nplatforms_ret = %d",
		nplatforms_ret);

	if (nplatforms_ret == 0) exit(1);

	nplatforms = nplatforms_ret;
	platforms = (cl_platform_id*)calloc(nplatforms,sizeof(cl_platform_id));

	clGetPlatformIDs(nplatforms,platforms,&nplatforms_ret);

	int i;
	for(i=0;i<nplatforms;i++) {
		clrpc_dptr* tmp = ((_xobj_t*)platforms[i])->obj;
		int is_rpc;
		if ( clGetPlatformInfo(platforms[i],999,sizeof(cl_int),&is_rpc,0)==CL_SUCCESS) {
			printf(  "platforms[%d] local=%p remote=%p\n",
				i,(void*)tmp->local,
				(void*)tmp->remote);
		} else {
			printf( "platforms[%d] not RPC\n",i);
		}
	}

	char buffer[1024];
	size_t sz;
	cl_platform_id rpc_platform = 0;
	for(i=0;i<nplatforms;i++) {
		clGetPlatformInfo(platforms[i],CL_PLATFORM_NAME,1023,buffer,&sz);
		printf(  "\n [%d] CL_PLATFORM_NAME|%ld:%s|\n",i,sz,buffer);
	}

int iplat;
for(iplat=0;iplat<nplatforms;iplat++) {

printf("\n******************\nTEST PLATFORM %d\n*************\n\n",iplat);

	cl_uint ndevices = 0;
	cl_device_id* devices = 0;
	cl_uint ndevices_ret;

	clGetDeviceIDs(platforms[iplat],CL_DEVICE_TYPE_ALL,
		ndevices,devices,&ndevices_ret);

	printf(  "after call one i get ndevices_ret = %d\n", ndevices_ret);

	if (ndevices_ret > 10) exit(-1);

	ndevices = ndevices_ret;
	devices = (cl_device_id*)calloc(ndevices,sizeof(cl_device_id));

	clGetDeviceIDs(platforms[iplat],CL_DEVICE_TYPE_ALL,
		ndevices,devices,&ndevices_ret);

	if (!ndevices_ret) {
		//printf("no devices, stopping.\n");
		//exit(1);
		printf("no devices, skipping.\n");
		continue;
	}

	for(i=0;i<ndevices;i++) {
		clrpc_dptr* tmp = ((_xobj_t*)devices[i])->obj;
		clGetDeviceInfo(devices[i],CL_DEVICE_NAME,1023,buffer,&sz);
		printf(  "CL_DEVICE_NAME |%s|\n",buffer);
		cl_platform_id tmpid;
		clGetDeviceInfo(devices[i],CL_DEVICE_PLATFORM,sizeof(tmpid),&tmpid,&sz);
		printf("%p\n",platforms[iplat]); fflush(stdout);
		printf("%p\n",tmpid); fflush(stdout);
		clGetPlatformInfo(tmpid,CL_PLATFORM_NAME,1023,buffer,&sz);
		printf(  "\n [%d] CL_PLATFORM_NAME|%ld:%s|\n",i,sz,buffer);
	}

	cl_context_properties ctxprop[] = { 
		CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[iplat], 0 };

	printf("i am setting this: prop[%d] %p\n",iplat,platforms[iplat]);

	cl_context ctx = clCreateContext(ctxprop,ndevices,devices, 0,0,&err);

	cl_command_queue* cmdq 
		= (cl_command_queue*) calloc(ndevices,sizeof(cl_command_queue));

	for(i=0;i<ndevices;i++) {
		cmdq[i] = clCreateCommandQueue(ctx,devices[i],0,&err);
		printf( 	 "cmdq %d %p",i,cmdq[i]);
	}

	cl_mem a_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int),
		0,&err);
	cl_mem b_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int),
		0,&err);
	cl_mem c_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int),
		0,&err);
	cl_mem d_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int),
		0,&err);

	int* a = (int*)malloc(1024*sizeof(int));
	int* b = (int*)malloc(1024*sizeof(int));
	int* c = (int*)malloc(1024*sizeof(int));
	int* d = (int*)malloc(1024*sizeof(int));

	char* prgsrc[] = { 
		"__kernel void my_kern( int n, __global int* a, __global int* b )\n"
		" { int i = get_global_id(0); int tmp = 0; int j; for(j=0;j<n;j++) tmp += a[i] * a[j]; b[i] = tmp; }\n" 
	};
	size_t prgsrc_sz = strlen(prgsrc[0]) + 1;

	cl_program prg = clCreateProgramWithSource(ctx,1,
		(const char**)prgsrc,&prgsrc_sz,&err);

	clBuildProgram(prg,ndevices,devices,0,0,0);

	cl_kernel krn = clCreateKernel(prg,"my_kern",&err);

int idev;
for(idev=0;idev<ndevices;idev++) {
printf("\n******************\nTEST DEVICE %d(%d)\n*************\n\n",idev,iplat);

	for(i=0;i<size;i++) a[i] = i*10;
	for(i=0;i<size;i++) b[i] = i*10+1;
	for(i=0;i<size;i++) c[i] = 0;
	for(i=0;i<size;i++) d[i] = 0;

	cl_event ev[8];

	for(i=0;i<32;i++) printf("%d/",a[i]); printf("\n");
	for(i=0;i<32;i++) printf("%d/",b[i]); printf("\n");

	clEnqueueWriteBuffer(cmdq[idev],a_buf,CL_FALSE,0,size*sizeof(int),a,
		0,0,&ev[0]);
	clEnqueueWriteBuffer(cmdq[idev],b_buf,CL_FALSE,0,size*sizeof(int),b,
		1,ev,&ev[1]);
	clEnqueueWriteBuffer(cmdq[idev],c_buf,CL_FALSE,0,size*sizeof(int),c,
		2,ev,&ev[2]);
	clEnqueueWriteBuffer(cmdq[idev],d_buf,CL_FALSE,0,size*sizeof(int),d,
		3,ev,&ev[3]);

	size_t offset = 0; 
	size_t gwsz = 128;
	size_t lwsz = 16;

	clSetKernelArg(krn,0,sizeof(int),&size);
	clSetKernelArg(krn,1,sizeof(cl_mem),&a_buf);
	clSetKernelArg(krn,2,sizeof(cl_mem),&c_buf);
	clEnqueueNDRangeKernel(cmdq[idev],krn,1,&offset,&gwsz,&lwsz,4,ev,&ev[4]);

	clSetKernelArg(krn,1,sizeof(cl_mem),&b_buf);
	clSetKernelArg(krn,2,sizeof(cl_mem),&d_buf);
	clEnqueueNDRangeKernel(cmdq[idev],krn,1,&offset,&gwsz,&lwsz,5,ev,&ev[5]);

	clEnqueueReadBuffer(cmdq[idev],c_buf,CL_FALSE,0,size*sizeof(int),c,
		6,ev,&ev[6]);
	clEnqueueReadBuffer(cmdq[idev],d_buf,CL_FALSE,0,size*sizeof(int),d,
		7,ev,&ev[7]);

	clFlush(cmdq[idev]);

	clWaitForEvents(8,ev);

	for(i=0;i<32;i++) printf("%d/",c[i]); printf("\n");
	for(i=0;i<32;i++) printf("%d/",d[i]); printf("\n");

	for(i=0;i<8;i++) clReleaseEvent(ev[i]);

}

	clReleaseKernel(krn);

	clReleaseProgram(prg);

	clReleaseMemObject(a_buf);
	clReleaseMemObject(b_buf);
	clReleaseMemObject(c_buf);
	clReleaseMemObject(d_buf);

	clReleaseCommandQueue(cmdq[0]);
	clReleaseContext(ctx);

//	printf("sleeping ...\n");
//	sleep(1);

}

//	clrpc_final();

}
Exemple #15
0
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufX, bufY, bufParam;
    cl_event event = NULL;
    int ret = 0;
	int lenX = 1 + (N-1)*abs(incx);
	int lenY = 1 + (N-1)*abs(incy);
	int lenParam = 5;

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err);
    bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err);
    bufParam = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenParam*sizeof(cl_float)), NULL, &err);

    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, bufParam, CL_TRUE, 0, (lenParam*sizeof(cl_float)), SPARAM, 0, NULL, NULL);

    /* Call clblas function. */
    err = clblasSrotm(N, bufX, 0, incx, bufY, 0, incy, bufParam, 0, 1, &queue, 0, NULL, &event);
    if (err != CL_SUCCESS) {
        printf("clblasSrotm() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)),
                                    Y, 0, NULL, NULL);
        err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)),
                                    X, 0, NULL, NULL);

        /* At this point you will get the result of SROTM placed in vector Y. */
        printResult();
    }

    /* Release OpenCL events. */
    clReleaseEvent(event);

    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufY);
    clReleaseMemObject(bufX);
    clReleaseMemObject(bufParam);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
int CommandGenerate::execute(const std::vector<std::string>& p_args) {
	if(p_args.size() < 10) {
		help();
		return -1;
	}

	unsigned int platformId = atol(p_args[1].c_str());
	unsigned int deviceId = atol(p_args[2].c_str());
	unsigned int staggerSize = atol(p_args[3].c_str());
	unsigned int threadsNumber = atol(p_args[4].c_str());
	unsigned int hashesNumber = atol(p_args[5].c_str());
	unsigned int nonceSize = PLOT_SIZE * staggerSize;

	std::cerr << "Threads number: " << threadsNumber << std::endl;
	std::cerr << "Hashes number: " << hashesNumber << std::endl;

	unsigned int numjobs = (p_args.size() - 5)/4;
	std::cerr << numjobs << " plot(s) to do." << std::endl;
	unsigned int staggerMbSize = staggerSize / 4;
	std::cerr << "Non-GPU memory usage: " << staggerMbSize*numjobs << "MB" << std::endl;
	
	std::vector<std::string> paths(numjobs);
	std::vector<std::ofstream *> out_files(numjobs);
	std::vector<unsigned long long> addresses(numjobs);
	std::vector<unsigned long long> startNonces(numjobs);
	std::vector<unsigned long long> endNonces(numjobs);
	std::vector<unsigned int> noncesNumbers(numjobs);
	std::vector<unsigned char*> buffersCpu(numjobs);
	std::vector<bool> saving_thread_flags(numjobs);
	std::vector<std::future<void>> save_threads(numjobs);
	unsigned long long maxNonceNumber = 0;
	unsigned long long totalNonces = 0;

	int returnCode = 0;

	try {
		for (unsigned int i = 0; i < numjobs; i++) {
			std::cerr << "----" << std::endl;
			std::cerr << "Job number " << i << std::endl;
			unsigned int argstart = 6 + i*4;
			paths[i] = std::string(p_args[argstart]);
			addresses[i] = strtoull(p_args[argstart+1].c_str(), NULL, 10);
			startNonces[i] = strtoull(p_args[argstart+2].c_str(), NULL, 10);
			noncesNumbers[i] = atol(p_args[argstart+3].c_str());
			maxNonceNumber = std::max(maxNonceNumber, (long long unsigned int)noncesNumbers[i]);
			totalNonces += noncesNumbers[i];

			std::ostringstream outFile;
			outFile << paths[i] << "/" << addresses[i] << "_" << startNonces[i] << "_" << \
				noncesNumbers[i] << "_" << staggerSize;
			std::ios_base::openmode file_mode = std::ios::out | std::ios::binary | std::ios::trunc;
			out_files[i] = new std::ofstream(outFile.str(), file_mode);
			assert(out_files[i]);

			if(noncesNumbers[i] % staggerSize != 0) {
				noncesNumbers[i] -= noncesNumbers[i] % staggerSize;
				noncesNumbers[i] += staggerSize;
			}

			endNonces[i] = startNonces[i] + noncesNumbers[i];
			unsigned int noncesGbSize = noncesNumbers[i] / 4 / 1024;
			std::cerr << "Path: " << outFile.str() << std::endl;
			std::cerr << "Nonces: " << startNonces[i] << " to " << endNonces[i] << " (" << noncesGbSize << " GB)" << std::endl;
			std::cerr << "Creating CPU buffer" << std::endl;
			buffersCpu[i] = new unsigned char[nonceSize];
			if(!buffersCpu[i]) {
				throw std::runtime_error("Unable to create the CPU buffer (probably out of host memory.)");
			}
			saving_thread_flags[i] = false;
			std::cerr << "----" << std::endl;
		}

		cl_platform_id platforms[4];
		cl_uint platformsNumber;
		cl_device_id devices[32];
		cl_uint devicesNumber;
		cl_context context = 0;
		cl_command_queue commandQueue = 0;
		cl_mem bufferGpuGen = 0;
		cl_mem bufferGpuScoops = 0;
		cl_program program = 0;
		cl_kernel kernelStep1 = 0;
		cl_kernel kernelStep2 = 0;
		cl_kernel kernelStep3 = 0;

		int error;

		std::cerr << "Retrieving OpenCL platforms" << std::endl;
		error = clGetPlatformIDs(4, platforms, &platformsNumber);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to retrieve the OpenCL platforms");
		}

		if(platformId >= platformsNumber) {
			throw std::runtime_error("No platform found with the provided id");
		}

		std::cerr << "Retrieving OpenCL GPU devices" << std::endl;
		error = clGetDeviceIDs(platforms[platformId], CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU, 32, devices, &devicesNumber);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to retrieve the OpenCL devices");
		}

		if(deviceId >= devicesNumber) {
			throw std::runtime_error("No device found with the provided id");
		}

		std::cerr << "Creating OpenCL context" << std::endl;
		context = clCreateContext(0, 1, &devices[deviceId], NULL, NULL, &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL context");
		}

		std::cerr << "Creating OpenCL command queue" << std::endl;
		commandQueue = clCreateCommandQueue(context, devices[deviceId], 0, &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL command queue");
		}

		std::cerr << "Creating OpenCL GPU generation buffer" << std::endl;
		bufferGpuGen = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uchar) * GEN_SIZE * staggerSize, 0, &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL GPU generation buffer");
		}

		std::cerr << "Creating OpenCL GPU scoops buffer" << std::endl;
		bufferGpuScoops = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uchar) * nonceSize, 0, &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL GPU scoops buffer");
		}

		std::cerr << "Creating OpenCL program" << std::endl;
		std::string source = loadSource("kernel/nonce.cl");
		const char* sources[] = {source.c_str()};
		size_t sourcesLength[] = {source.length()};
		program = clCreateProgramWithSource(context, 1, sources, sourcesLength, &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL program");
		}

		std::cerr << "Building OpenCL program" << std::endl;
		error = clBuildProgram(program, 1, &devices[deviceId], "-I kernel", 0, 0);
		if(error != CL_SUCCESS) {
			size_t logSize;
			clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, 0, 0, &logSize);

			char* log = new char[logSize];
			clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, logSize, (void*)log, 0);
			std::cerr << log << std::endl;
			delete[] log;

			throw OpenclError(error, "Unable to build the OpenCL program");
		}

		std::cerr << "Creating OpenCL step1 kernel" << std::endl;
		kernelStep1 = clCreateKernel(program, "nonce_step1", &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL kernel");
		}

		std::cerr << "Setting OpenCL step1 kernel static arguments" << std::endl;
		error = clSetKernelArg(kernelStep1, 2, sizeof(cl_mem), (void*)&bufferGpuGen);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to set the OpenCL kernel arguments");
		}

		std::cerr << "Creating OpenCL step2 kernel" << std::endl;
		kernelStep2 = clCreateKernel(program, "nonce_step2", &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL kernel");
		}

		std::cerr << "Setting OpenCL step2 kernel static arguments" << std::endl;
		error = clSetKernelArg(kernelStep2, 1, sizeof(cl_mem), (void*)&bufferGpuGen);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to set the OpenCL kernel arguments");
		}

		std::cerr << "Creating OpenCL step3 kernel" << std::endl;
		kernelStep3 = clCreateKernel(program, "nonce_step3", &error);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to create the OpenCL kernel");
		}

		std::cerr << "Setting OpenCL step3 kernel static arguments" << std::endl;
		error = clSetKernelArg(kernelStep3, 0, sizeof(cl_uint), (void*)&staggerSize);
		error = clSetKernelArg(kernelStep3, 1, sizeof(cl_mem), (void*)&bufferGpuGen);
		error = clSetKernelArg(kernelStep3, 2, sizeof(cl_mem), (void*)&bufferGpuScoops);
		if(error != CL_SUCCESS) {
			throw OpenclError(error, "Unable to set the OpenCL kernel arguments");
		}

		size_t globalWorkSize = staggerSize;
		size_t localWorkSize = (staggerSize < threadsNumber) ? staggerSize : threadsNumber;
		time_t startTime = time(0);
		unsigned int totalNoncesCompleted = 0;
		for (unsigned long long nonce_ordinal = 0; nonce_ordinal < maxNonceNumber; nonce_ordinal += staggerSize) {
			for (unsigned int jobnum = 0; jobnum < paths.size(); jobnum += 1) {
				unsigned long long nonce = startNonces[jobnum] + nonce_ordinal;
				if (nonce > endNonces[jobnum]) {
				  break;
				}

				std::cout << "Running with start nonce " << nonce << std::endl;
				// Is a cl_ulong always an unsigned long long?
				unsigned int error = 0;
				error = clSetKernelArg(kernelStep1, 0, sizeof(cl_ulong), (void*)&addresses[jobnum]);
				if(error != CL_SUCCESS) {
					throw OpenclError(error, "Unable to set the OpenCL step1 kernel arguments");
				}
				error = clSetKernelArg(kernelStep1, 1, sizeof(cl_ulong), (void*)&nonce);
				if(error != CL_SUCCESS) {
					throw OpenclError(error, "Unable to set the OpenCL step1 kernel arguments");
				}

				error = clEnqueueNDRangeKernel(commandQueue, kernelStep1, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0);
				if(error != CL_SUCCESS) {
					throw OpenclError(error, "Error in step1 kernel launch");
				}

				unsigned int hashesSize = hashesNumber * HASH_SIZE;
				for(int hashesOffset = PLOT_SIZE ; hashesOffset > 0 ; hashesOffset -= hashesSize) {
					error = clSetKernelArg(kernelStep2, 0, sizeof(cl_ulong), (void*)&nonce);
					error = clSetKernelArg(kernelStep2, 2, sizeof(cl_uint), (void*)&hashesOffset);
					error = clSetKernelArg(kernelStep2, 3, sizeof(cl_uint), (void*)&hashesNumber);
					if(error != CL_SUCCESS) {
						throw OpenclError(error, "Unable to set the OpenCL step2 kernel arguments");
					}

					error = clEnqueueNDRangeKernel(commandQueue, kernelStep2, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0);
					if(error != CL_SUCCESS) {
						throw OpenclError(error, "Error in step2 kernel launch");
					}

					error = clFinish(commandQueue);
					if(error != CL_SUCCESS) {
						throw OpenclError(error, "Error in step2 kernel finish");
					}
				}

				totalNoncesCompleted += staggerSize;
				double percent = 100.0 * (double)totalNoncesCompleted / totalNonces;
				time_t currentTime = time(0);
				double speed = (double)totalNoncesCompleted / difftime(currentTime, startTime) * 60.0;
				double estimatedTime = (double)(totalNonces - totalNoncesCompleted) / speed;
				std::cerr << "\r" << percent << "% (" << totalNoncesCompleted << "/" << totalNonces << " nonces)";
				std::cerr << ", " << speed << " nonces/minutes";
				std::cerr << ", ETA: " << ((int)estimatedTime / 60) << "h" << ((int)estimatedTime % 60) << "m" << ((int)(estimatedTime * 60.0) % 60) << "s";
				std::cerr << "...                    ";

				error = clEnqueueNDRangeKernel(commandQueue, kernelStep3, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0);
				if(error != CL_SUCCESS) {
					throw OpenclError(error, "Error in step3 kernel launch");
				}

				if (saving_thread_flags[jobnum]) {
					save_threads[jobnum].wait(); // Wait for last job to finish
					saving_thread_flags[jobnum] = false;
				}

				error = clEnqueueReadBuffer(commandQueue, bufferGpuScoops, CL_TRUE, 0, sizeof(cl_uchar) * nonceSize, buffersCpu[jobnum], 0, 0, 0);
				if(error != CL_SUCCESS) {
					throw OpenclError(error, "Error in synchronous read");
				}
				saving_thread_flags[jobnum] = true;
				save_threads[jobnum] = std::async(std::launch::async, save_nonces, nonceSize, out_files[jobnum], buffersCpu[jobnum]);
			}
		}

		//Clean up
		for (unsigned int i = 0; i < paths.size(); i += 1) {
		  if (saving_thread_flags[i]) {
		    std::cerr << "waiting for final save to " << paths[i] << " to finish" << std::endl;
		    save_threads[i].wait();
		    saving_thread_flags[i] = false;
		    std::cerr << "done waiting for final save" << std::endl;
		    if (buffersCpu[i]) {
		      delete[] buffersCpu[i];
		    }
		  }
		}
		
		if(kernelStep3) { clReleaseKernel(kernelStep3); }
		if(kernelStep2) { clReleaseKernel(kernelStep2); }
		if(kernelStep1) { clReleaseKernel(kernelStep1); }
		if(program) { clReleaseProgram(program); }
		if(bufferGpuGen) { clReleaseMemObject(bufferGpuGen); }
		if(bufferGpuScoops) { clReleaseMemObject(bufferGpuScoops); }
		if(commandQueue) { clReleaseCommandQueue(commandQueue); }
		if(context) { clReleaseContext(context); }


		time_t currentTime = time(0);
		double elapsedTime = difftime(currentTime, startTime) / 60.0;
		double speed = (double)totalNonces / elapsedTime;
		std::cerr << "\r100% (" << totalNonces << "/" << totalNonces << " nonces)";
		std::cerr << ", " << speed << " nonces/minutes";
		std::cerr << ", " << ((int)elapsedTime / 60) << "h" << ((int)elapsedTime % 60) << "m" << ((int)(elapsedTime * 60.0) % 60) << "s";
		std::cerr << "                    " << std::endl;
	} catch(const OpenclError& ex) {
		std::cerr << "[ERROR] [" << ex.getCode() << "] " << ex.what() << std::endl;
		returnCode = -1;
	} catch(const std::exception& ex) {
		std::cerr << "[ERROR] " << ex.what() << std::endl;
		returnCode = -1;
	}
	return returnCode;
}
void test_variable_opencl_func(void *buffers[], void *args)
{
	STARPU_SKIP_IF_VALGRIND;

	int id, devid, ret;
	int factor = *(int *) args;

        cl_int             err;
	cl_kernel          kernel;
	cl_command_queue   queue;
	cl_event           event;

	ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_program, NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");

	cl_mem val = (cl_mem)STARPU_VARIABLE_GET_PTR(buffers[0]);

	cl_context context;
	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);
	starpu_opencl_get_context(devid, &context);

	cl_mem fail = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
		sizeof(int), &variable_config.copy_failed, &err);

	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);


	err = starpu_opencl_load_kernel(&kernel,
					&queue,
					&opencl_program,
					"variable_opencl",
					devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err  = clSetKernelArg(kernel, 0, sizeof(val), &val);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 1, sizeof(fail), &fail);
	if (err)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 2, sizeof(factor), &factor);
	if (err)
		STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global = 1;
		size_t local;
                size_t s;
                cl_device_id device;

                starpu_opencl_get_device(devid, &device);

                err = clGetKernelWorkGroupInfo (kernel,
						device,
						CL_KERNEL_WORK_GROUP_SIZE,
						sizeof(local),
						&local,
						&s);
                if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);

                if (local > global)
			local = global;

		err = clEnqueueNDRangeKernel(queue,
					kernel,
					1,
					NULL,
					&global,
					&local,
					0,
					NULL,
					&event);

		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}

	err = clEnqueueReadBuffer(queue,
				  fail,
				  CL_TRUE,
				  0, 
				  sizeof(int),
				  &variable_config.copy_failed,
				  0,
				  NULL,
				  NULL);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	clFinish(queue);
	starpu_opencl_collect_stats(event);
	clReleaseEvent(event);

	starpu_opencl_release_kernel(kernel);
        ret = starpu_opencl_unload_opencl(&opencl_program);
        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
	return;
}
Exemple #18
0
int main(int argc, char **argv)
{
  printf("enter demo main\n");
  fflush(stdout);
  putenv("POCL_VERBOSE=1");
  putenv("POCL_DEVICES=basic");
  putenv("POCL_LEAVE_TEMP_DIRS=1");
  putenv("POCL_LEAVE_KERNEL_COMPILER_TEMP_FILES=1");
  putenv("POCL_TEMP_DIR=pocl");
  putenv("POCL_CACHE_DIR=pocl");
  putenv("POCL_WORK_GROUP_METHOD=spmd");
  if(argc >= 2){
    printf("argv[1]:%s:\n",argv[1]);
    if(!strcmp(argv[1], "h"))
      putenv("POCL_WORK_GROUP_METHOD=spmd");
    if(!strcmp(argv[1], "c"))
      putenv("POCL_CROSS_COMPILE=1");
  }
  if(argc >= 3){
    printf("argv[2]:%s:\n",argv[2]);
    if(!strcmp(argv[2], "h"))
      putenv("POCL_WORK_GROUP_METHOD=spmd");
    if(!strcmp(argv[2], "c"))
      putenv("POCL_CROSS_COMPILE=1");
  }

  //putenv("LD_LIBRARY_PATH=/scratch/colins/build/linux/fs/lib");
  //putenv("LTDL_LIBRARY_PATH=/scratch/colins/build/linux/fs/lib");
  //lt_dlsetsearchpath("/scratch/colins/build/linux/fs/lib");
  //printf("SEARCH_PATH:%s\n",lt_dlgetsearchpath());
	cl_platform_id platforms[100];
	cl_uint platforms_n = 0;
	CL_CHECK(clGetPlatformIDs(100, platforms, &platforms_n));

	printf("=== %d OpenCL platform(s) found: ===\n", platforms_n);
	for (int i=0; i<platforms_n; i++)
	{
		char buffer[10240];
		printf("  -- %d --\n", i);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL));
		printf("  PROFILE = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL));
		printf("  VERSION = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL));
		printf("  NAME = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL));
		printf("  VENDOR = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL));
		printf("  EXTENSIONS = %s\n", buffer);
	}

	if (platforms_n == 0)
		return 1;

	cl_device_id devices[100];
	cl_uint devices_n = 0;
	// CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 100, devices, &devices_n));
	CL_CHECK(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 100, devices, &devices_n));

	printf("=== %d OpenCL device(s) found on platform:\n", devices_n);
	for (int i=0; i<devices_n; i++)
	{
		char buffer[10240];
		cl_uint buf_uint;
		cl_ulong buf_ulong;
		printf("  -- %d --\n", i);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_NAME = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_VENDOR = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_VERSION = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL));
		printf("  DRIVER_VERSION = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL));
		printf("  DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL));
		printf("  DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL));
		printf("  DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong);
	}

	if (devices_n == 0)
		return 1;

	cl_context context;
	context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices+1, &pfn_notify, NULL, &_err));

	cl_command_queue queue;
  queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[1], CL_QUEUE_PROFILING_ENABLE, &_err));

	cl_kernel kernel = 0;
  cl_mem memObjects[2] = {0,0};


  // Create OpenCL program - first attempt to load cached binary.
  //  If that is not available, then create the program from source
  //  and store the binary for future use.
  std::cout << "Attempting to create program from binary..." << std::endl;
  cl_program program = CreateProgramFromBinary(context, devices[1], "kernel.cl.bin");
  if (program == NULL)
  {
      std::cout << "Binary not loaded, create from source..." << std::endl;
      program = CreateProgram(context, devices[1], "kernel.cl");
      if (program == NULL)
      {
          Cleanup(context, queue, program, kernel, memObjects);
          return 1;
      }

      std::cout << "Save program binary for future run..." << std::endl;
      if (SaveProgramBinary(program, devices[1], "kernel.cl.bin") == false)
      {
          std::cerr << "Failed to write program binary" << std::endl;
          Cleanup(context, queue, program, kernel, memObjects);
          return 1;
      }
  }
  else
  {
      std::cout << "Read program from binary." << std::endl;
  }

  printf("attempting to create input buffer\n");
  fflush(stdout);
	cl_mem input_buffer;
	input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(double)*NUM_DATA, NULL, &_err));

  printf("attempting to create output buffer\n");
  fflush(stdout);
	cl_mem output_buffer;
	output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(double)*NUM_DATA, NULL, &_err));

  memObjects[0] = input_buffer;
  memObjects[1] = output_buffer;

  double factor = ((double)rand()/(double)(RAND_MAX)) * 100.0;;

  printf("attempting to create kernel\n");
  fflush(stdout);
	kernel = CL_CHECK_ERR(clCreateKernel(program, "daxpy", &_err));
  printf("setting up kernel args cl_mem:%lx \n",input_buffer);
  fflush(stdout);
	CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer));
	CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer));
	CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor));

  printf("attempting to enqueue write buffer\n");
  fflush(stdout);
	for (int i=0; i<NUM_DATA; i++) {
    double in = ((double)rand()/(double)(RAND_MAX)) * 100.0;;
		CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, i*sizeof(double), 8, &in, 0, NULL, NULL));
	}

	cl_event kernel_completion;
	size_t global_work_size[1] = { NUM_DATA };
  printf("attempting to enqueue kernel\n");
  fflush(stdout);
	CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &kernel_completion));
  printf("Enqueue'd kerenel\n");
  fflush(stdout);
    cl_ulong time_start, time_end;
  CL_CHECK(clWaitForEvents(1, &kernel_completion));
  CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL));
  CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL));
  double elapsed = time_end - time_start;
  printf("time(ns):%lg\n",elapsed);
	CL_CHECK(clReleaseEvent(kernel_completion));

	printf("Result:");
	for (int i=0; i<NUM_DATA; i++) {
		double data;
		CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(double), 8, &data, 0, NULL, NULL));
		//printf(" %lg", data);
	}
	printf("\n");

	CL_CHECK(clReleaseMemObject(memObjects[0]));
	CL_CHECK(clReleaseMemObject(memObjects[1]));

	CL_CHECK(clReleaseKernel(kernel));
	CL_CHECK(clReleaseProgram(program));
	CL_CHECK(clReleaseContext(context));

	return 0;
}
Exemple #19
0
/* uint32_t run (in uint32_t rank, [array, size_is (rank)] in uint32_t shape, [array, size_is (rank), optional] in uint32_t tile); */
NS_IMETHODIMP dpoCKernel::Run(uint32_t rank, uint32_t *shape, uint32_t *tile, uint32_t *_retval)
{
	cl_int err_code;
	cl_event runEvent, readEvent, writeEvent;
	size_t *global_work_size;
	size_t *local_work_size;
	const int zero = 0;

	DEBUG_LOG_STATUS("Run", "preparing execution of kernel");

    if (sizeof(size_t) == sizeof(uint32_t)) {
		global_work_size = (size_t *) shape;
	} else {
		global_work_size = (size_t *) nsMemory::Alloc(rank * sizeof(size_t));
		if (global_work_size == NULL) {
			DEBUG_LOG_STATUS("Run", "allocation of global_work_size failed");
			return NS_ERROR_OUT_OF_MEMORY;
		}
		for (uint32_t cnt = 0; cnt < rank; cnt++) {
			global_work_size[cnt] = shape[cnt];
		}
	}

#ifdef USE_LOCAL_WORKSIZE
	if (tile == NULL) {
		local_work_size = NULL;
	} else {
		if ((sizeof(size_t) == sizeof(uint32_t))) {
			local_work_size = (size_t *) tile;
		} else {
			local_work_size = (size_t *) nsMemory::Alloc(rank * sizeof(size_t));
			if (local_work_size == NULL) {
				DEBUG_LOG_STATUS("Run", "allocation of local_work_size failed");
				return NS_ERROR_OUT_OF_MEMORY;
			}
			for (int cnt = 0; cnt < rank; cnt++) {
				local_work_size[cnt] = (size_t) tile[cnt];
			}
		}
	}
#else /* USE_LOCAL_WORKSIZE */
	local_work_size = NULL;
#endif /* USE_LOCAL_WORKSIZE */

	DEBUG_LOG_STATUS("Run", "setting failure code to 0");

	err_code = clEnqueueWriteBuffer(cmdQueue, failureMem, CL_FALSE, 0, sizeof(int), &zero, 0, NULL, &writeEvent);
	if (err_code != CL_SUCCESS) {
		DEBUG_LOG_ERROR("Run", err_code);
		return NS_ERROR_ABORT;
	}

	DEBUG_LOG_STATUS("Run", "enqueing execution of kernel");

#ifdef WINDOWS_ROUNDTRIP
	dpoCContext::RecordBeginOfRoundTrip(parent);
#endif /* WINDOWS_ROUNDTRIP */

	err_code = clEnqueueNDRangeKernel(cmdQueue, kernel, rank, NULL, global_work_size, NULL, 1, &writeEvent, &runEvent);
	if (err_code != CL_SUCCESS) {
		DEBUG_LOG_ERROR("Run", err_code);
		return NS_ERROR_ABORT;
	}

	DEBUG_LOG_STATUS("Run", "reading failure code");

	err_code = clEnqueueReadBuffer(cmdQueue, failureMem, CL_FALSE, 0, sizeof(int), _retval, 1, &runEvent, &readEvent);
	if (err_code != CL_SUCCESS) {
		DEBUG_LOG_ERROR("Run", err_code);
		return NS_ERROR_ABORT;
	}

	DEBUG_LOG_STATUS("Run", "waiting for execution to finish");
	
	// For now we always wait for the run to complete.
	// In the long run, we may want to interleave this with JS execution and only sync on result read.
	err_code = clWaitForEvents( 1, &readEvent);
	
	DEBUG_LOG_STATUS("Run", "first event fired");

	if (err_code != CL_SUCCESS) {
		DEBUG_LOG_ERROR("Run", err_code);
		return NS_ERROR_ABORT;
	}
#ifdef WINDOWS_ROUNDTRIP
	dpoCContext::RecordEndOfRoundTrip(parent);
#endif /* WINDOWS_ROUNDTRIP */
	
#ifdef CLPROFILE
#ifdef CLPROFILE_ASYNC
	err_code = clSetEventCallback( readEvent, CL_COMPLETE, &dpoCContext::CollectTimings, parent);
	
	DEBUG_LOG_STATUS("Run", "second event fired");
	if (err_code != CL_SUCCESS) {
		DEBUG_LOG_ERROR("Run", err_code);
		return NS_ERROR_ABORT;
	}
#else /* CLPROFILE_ASYNC */
	dpoCContext::CollectTimings(runEvent,CL_COMPLETE,parent);
#endif /* CLPROFILE_ASYNC */
#endif /* CLPROFILE */
		
	DEBUG_LOG_STATUS("Run", "execution completed successfully, start cleanup");
	
	if (global_work_size != (size_t *) shape) {
		nsMemory::Free(global_work_size);
	}
#ifdef USE_LOCAL_WORKSIZE
	if (local_work_size != (size_t *) tile) {
		nsMemory::Free(local_work_size);
	}
#endif /* USE_LOCAL_WORKSIZE */
	
	err_code = clReleaseEvent(readEvent);
	err_code = clReleaseEvent(runEvent);
	err_code = clReleaseEvent(writeEvent);

	if (err_code != CL_SUCCESS) {
		DEBUG_LOG_ERROR("Run", err_code);
		return NS_ERROR_ABORT;
	}

	DEBUG_LOG_STATUS("Run", "cleanup complete");

    return NS_OK;
}
Exemple #20
0
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    cl_platform_id cpPlatform;       //OpenCL platform
    cl_device_id cdDevice;           //OpenCL device
    cl_context       cxGPUContext;   //OpenCL context
    cl_command_queue cqCommandQueue; //OpenCL command que
    cl_mem      d_Input, d_Output;   //OpenCL memory buffer objects

    cl_int ciErrNum;

    float *h_Input, *h_OutputCPU, *h_OutputGPU;

    const uint
        imageW = 2048,
        imageH = 2048,
        stride = 2048;

    const int dir = DCT_FORWARD;

    shrQAStart(argc, argv);

    // set logfile name and start logs
    shrSetLogFileName ("oclDCT8x8.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    shrLog("Allocating and initializing host memory...\n");
        h_Input     = (float *)malloc(imageH * stride * sizeof(float));
        h_OutputCPU = (float *)malloc(imageH * stride * sizeof(float));
        h_OutputGPU = (float *)malloc(imageH * stride * sizeof(float));
        srand(2009);
        for(uint i = 0; i < imageH; i++)
            for(uint j = 0; j < imageW; j++)
                h_Input[i * stride + j] = (float)rand() / (float)RAND_MAX;

    shrLog("Initializing OpenCL...\n");
        //Get the NVIDIA platform
        ciErrNum = oclGetPlatformID(&cpPlatform);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Get a GPU device
        ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Create the context
        cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Create a command-queue
        cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Initializing OpenCL DCT 8x8...\n");
        initDCT8x8(cxGPUContext, cqCommandQueue, (const char **)argv);

    shrLog("Creating OpenCL memory objects...\n");
        d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, imageH * stride *  sizeof(cl_float), h_Input, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        d_Output = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, imageH * stride * sizeof(cl_float), NULL, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Performing DCT8x8 of %u x %u image...\n\n", imageH, imageW);
        //Just a single iteration or a warmup iteration
        DCT8x8(
            cqCommandQueue,
            d_Output,
            d_Input,
            stride,
            imageH,
            imageW,
            dir
        );

#define GPU_PROFILING 1
#ifdef GPU_PROFILING
    const int numIterations = 16;
    cl_event startMark, endMark;
    ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark);
    ciErrNum |= clFinish(cqCommandQueue);
    shrCheckError(ciErrNum, CL_SUCCESS);
    shrDeltaT(0);

    for(int iter = 0; iter < numIterations; iter++)
        DCT8x8(
            NULL,
            d_Output,
            d_Input,
            stride,
            imageH,
            imageW,
            dir
        );

    ciErrNum  = clEnqueueMarker(cqCommandQueue, &endMark);
    ciErrNum |= clFinish(cqCommandQueue);
    shrCheckError(ciErrNum, CL_SUCCESS);

    //Calculate performance metrics by wallclock time
    double gpuTime = shrDeltaT(0) / (double)numIterations;
    shrLogEx(LOGBOTH | MASTER, 0, "oclDCT8x8, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n", 
            (1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0); 

    //Get profiler time
    cl_ulong startTime = 0, endTime = 0;
    ciErrNum  = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL);
    ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
    shrCheckError(ciErrNum, CL_SUCCESS);
    shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime) / (double)numIterations);
#endif

    shrLog("Reading back OpenCL results...\n");
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, imageH * stride * sizeof(cl_float), h_OutputGPU, 0, NULL, NULL);
        oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Comparing against Host/C++ computation...\n"); 
        DCT8x8CPU(h_OutputCPU, h_Input, stride, imageH, imageW, dir);
        double sum = 0, delta = 0;
        double L2norm;
        for(uint i = 0; i < imageH; i++)
            for(uint j = 0; j < imageW; j++){
                sum += h_OutputCPU[i * stride + j] * h_OutputCPU[i * stride + j];
                delta += (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]) * (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]);
            }
        L2norm = sqrt(delta / sum);
        shrLog("Relative L2 norm: %.3e\n\n", L2norm);

    shrLog("Shutting down...\n");
        //Release kernels and program
        closeDCT8x8();

        //Release other OpenCL objects
        ciErrNum  = clReleaseMemObject(d_Output);
        ciErrNum |= clReleaseMemObject(d_Input);
        ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
        ciErrNum |= clReleaseContext(cxGPUContext);
        oclCheckError(ciErrNum, CL_SUCCESS);

        //Release host buffers
        free(h_OutputGPU);
        free(h_OutputCPU);
        free(h_Input);

        //Finish
        shrQAFinishExit(argc, (const char **)argv, (L2norm < 1E-3) ? QA_PASSED : QA_FAILED);
}
int
MemoryOptimizations::copy(cl_kernel& kernel, int vectorSize)
{
    cl_int status;
    cl_event events[2];

    /* Check group size against kernelWorkGroupSize */
    status = clGetKernelWorkGroupInfo(kernel,
                                      devices[deviceId],
                                      CL_KERNEL_WORK_GROUP_SIZE,
                                      sizeof(size_t),
                                      &kernelWorkGroupSize,
                                      0);
    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS, 
                               "clGetKernelWorkGroupInfo failed."))
    {
        return SDK_FAILURE;
    }

    if(localThreads[0] * localThreads[1] > kernelWorkGroupSize)
    {
        std::cout << "\nDevice doesn't support required work-group size!\n";
        return SDK_SUCCESS;
    }

    /*** Set appropriate arguments to the kernel ***/
    
    status = clSetKernelArg(kernel,
                            0,
                            sizeof(cl_mem), 
                            (void *)&inputBuffer);

    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clSetKernelArg failed.(inputBuffer)"))
        return SDK_FAILURE;

    status = clSetKernelArg(kernel,
                            1,
                            sizeof(cl_mem), 
                            (void *)&outputBuffer);

    if(!sampleCommon->checkVal(status,
                               CL_SUCCESS,
                               "clSetKernelArg failed.(outputBuffer)"))
        return SDK_FAILURE;

    double nsec = 0;

    // Reduce the iterations if verification is enabled.
    if(verify)
        Iterations = 1;

    /* Run the kernel for a number of iterations */
    for(int i = 0; i < Iterations; i++)
    {

        /*Enqueue a kernel run call */
        status = clEnqueueNDRangeKernel(commandQueue,
                                        kernel,
                                        2,
                                        NULL,
                                        globalThreads,
                                        localThreads,
                                        0,
                                        NULL,
                                        &events[0]);
        
        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clEnqueueNDRangeKernel failed."))
            return SDK_FAILURE;


        /* wait for the kernel call to finish execution */
        status = clWaitForEvents(1, &events[0]);
        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clWaitForEvents failed."))
            return SDK_FAILURE;

        /* Calculate performance */
        cl_ulong startTime;
        cl_ulong endTime;
        
        /* Get kernel profiling info */
        status = clGetEventProfilingInfo(events[0],
                                         CL_PROFILING_COMMAND_START,
                                         sizeof(cl_ulong),
                                         &startTime,
                                         0);
        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clGetEventProfilingInfo failed.(startTime)"))
            return SDK_FAILURE;


        status = clGetEventProfilingInfo(events[0],
                                         CL_PROFILING_COMMAND_END,
                                         sizeof(cl_ulong),
                                         &endTime,
                                         0);

        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clGetEventProfilingInfo failed.(endTime)"))
            return SDK_FAILURE;

        /* Cumulate time for each iteration */
        nsec += endTime - startTime;
    }

    /* Copy bytes */
    int numThreads = (int)(globalThreads[0] * globalThreads[1]);
    double bytes = (double)(Iterations * 2 * vectorSize * sizeof(cl_float));
    double perf = (bytes / nsec) * numThreads;
    
    std::cout << ": " << perf << " GB/s" << std::endl;

    if(verify)
    {
        /* Enqueue readBuffer*/
        status = clEnqueueReadBuffer(commandQueue,
                                     outputBuffer,
                                     CL_TRUE,
                                     0,
                                     length * sizeof(cl_float4),
                                     output,
                                     0,
                                     NULL,
                                     0);

        if(!sampleCommon->checkVal(status,
                                   CL_SUCCESS,
                                   "clEnqueueReadBuffer failed."))
            return SDK_FAILURE;
        
        /* Verify data */
        if(!memcmp(input, output, vectorSize * sizeof(cl_float) * length))
        {
            std::cout << "Passed!\n";
            return SDK_SUCCESS;
        }
        else
        {
            std::cout << "Failed!\n";
            return SDK_FAILURE;
        }
    }

    return SDK_SUCCESS;
}
int main() {
// START:context
  cl_platform_id platform;
  clGetPlatformIDs(1, &platform, NULL);

  cl_device_id device;
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
// END:context

// START:queue
  cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
// END:queue

// START:kernel
  char* source = read_source("multiply_arrays.cl");
  cl_program program = clCreateProgramWithSource(context, 1,
    (const char**)&source, NULL, NULL);
  free(source);
  clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  cl_kernel kernel = clCreateKernel(program, "multiply_arrays", NULL);
// END:kernel

// START:buffers
  cl_float a[NUM_ELEMENTS], b[NUM_ELEMENTS];
  random_fill(a, NUM_ELEMENTS);
  random_fill(b, NUM_ELEMENTS);
  cl_mem inputA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    sizeof(cl_float) * NUM_ELEMENTS, a, NULL);
  cl_mem inputB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    sizeof(cl_float) * NUM_ELEMENTS, b, NULL);
  cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
    sizeof(cl_float) * NUM_ELEMENTS, NULL, NULL);
// END:buffers

// START:execute
  clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
  clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputB);
  clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);

  size_t work_units = NUM_ELEMENTS;
  clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units, NULL, 0, NULL, NULL);
// END:execute

// START:results
  cl_float results[NUM_ELEMENTS];
  clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(cl_float) * NUM_ELEMENTS,
    results, 0, NULL, NULL);
// END:results

// START:cleanup
  clReleaseMemObject(inputA);
  clReleaseMemObject(inputB);
  clReleaseMemObject(output);
  clReleaseKernel(kernel);
  clReleaseProgram(program);
  clReleaseCommandQueue(queue);
  clReleaseContext(context);
// END:cleanup

  for (int i = 0; i < NUM_ELEMENTS; ++i) {
    printf("%f * %f = %f\n", a[i], b[i], results[i]);
  }

  return 0;
}
int main(int argc, char *argv[])
{
    int iGlobalSize = 1;
    int iCheck1, iCheck2, iCheck3, iCheck4;
    size_t iGlobalWorkSize = -1;
    size_t iLocalWorkSize = -1;
    if (argc > 1) // Size of input vector
    {
        iCheck1 = atoi(argv[1]);
        if (iCheck1 != 0)
        {
            iGlobalSize = iCheck1;
        }
    }
    int iNoReps = 100; // Number of repetitions.
	if (argc > 2)
	{
		iCheck2 = atoi(argv[2]);
        if (iCheck2 != 0)
        {
            iNoReps = iCheck2;
        }        
	}
    
/*    
    if (argc > 3) // Global work size
    {
        iCheck3 = atoi(argv[3]);
        if (iCheck3 != 0)
        {
            iGlobalWorkSize = iCheck3;
        }
    }
    if (argc > 4) // Local work size
    {
        iCheck4 = atoi(argv[4]);
        if (iCheck4 != 0)
        {
            iLocalWorkSize = iCheck4;
        }
    } */
    int bPrint = 0;
	if (argc > 3) // Originally 5.
	{
		bPrint = 1;
	}

 //   printf("The global size is %d, the global work size is %ld, and the local work size is %ld. \n", iGlobalSize, iGlobalWorkSize, iLocalWorkSize);
/*    size_t * ipGlobalWorkParam = NULL;
    if (iGlobalWorkSize != -1)
    {
        ipGlobalWorkParam = &iGlobalWorkSize;
    }
    
    size_t * ipLocalWorkParam = NULL;
    if (iLocalWorkSize != -1)
    {
        ipLocalWorkParam = &iLocalWorkSize;
    } */
    
    GCAQ * TheGCAQ = GCAQSetup();
    if (TheGCAQ == NULL)
    {
        return 1;
    }

#if BIGFLOAT
	const char *szFloatOpt = "-DBIGFLOAT";
#else
	const char *szFloatOpt = NULL;
#endif
    const int iNoKernels = 1;
	char *ourKernelStrings[6] =
		{ szDotProduct, szReduce, szDotProduct2, szReduce2, szDotProduct4, szReduce4};


  	GPAK *TheGPAK = GPAKSetup(TheGCAQ, iNoKernels, ourKernelStrings, szFloatOpt);
    if (TheGPAK == NULL)
    {
        GCAQShutdown(TheGCAQ);
        return 2;
    }

    INTG iTypicalWorkgroupNo = TheGPAK->TheMaxWorkGroupSizes[0];
    
    INTG iExpOutputSize = ioutsize(iGlobalSize, iTypicalWorkgroupNo);
    FLPT * fExpDotProdResult = (FLPT *) malloc(iExpOutputSize * sizeof(FLPT));
    FLPT * fExpReduceResult = (FLPT *) malloc(iExpOutputSize * sizeof(FLPT));
    fdotprodexpresult(iGlobalSize, iTypicalWorkgroupNo, fExpDotProdResult);
    freduceexpresult(iGlobalSize, iTypicalWorkgroupNo, fExpReduceResult);
 //   printvector("dot prod", iExpOutputSize, fExpDotProdResult);
 //   printvector("reduce", iExpOutputSize, fExpReduceResult);

    FLPT* inputDataF = (FLPT *) malloc(iGlobalSize * sizeof(FLPT));
    SetFIncrease(iGlobalSize, inputDataF);
    
    // For the dot product.
    
    FLPT* outputDataD = (FLPT *) malloc(iGlobalSize * sizeof(FLPT));
    SetFNull(iGlobalSize, outputDataD);

    // For the reduction.
    
    FLPT* outputDataR = (FLPT *) malloc(iGlobalSize * sizeof(FLPT));
    SetFNull(iGlobalSize, outputDataR);
    
	struct timespec start[iNoKernels];
	struct timespec end[iNoKernels];
    
// create buffers for the input and ouput

    int err; 
    cl_mem inputF, outputF, outputAll;
    inputF = clCreateBuffer(TheGCAQ->TheContext, CL_MEM_READ_ONLY, iGlobalSize * sizeof(FLPT), NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error allocating for F");
        return 3;
    }
    
    
    outputF = clCreateBuffer(TheGCAQ->TheContext, CL_MEM_WRITE_ONLY, iGlobalSize * sizeof(float), NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error allocating for output 7");
        return 9;
    }
    outputAll = clCreateBuffer(TheGCAQ->TheContext, CL_MEM_WRITE_ONLY, iGlobalSize * sizeof(float), NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error allocating for output 8");
        return 9;
    }

    clEnqueueWriteBuffer(TheGCAQ->TheQueue, inputF, CL_TRUE, 0, iGlobalSize * sizeof(FLPT), inputDataF, 0, NULL, NULL);

    int iRep;
    int iKernel;
    int i;
    int iLengthTotal = iGlobalSize;
    size_t iGlobalWorkThing = iGlobalSize;
    int iSomething = 1;
    for (iKernel = 0; iKernel < iNoKernels; iKernel++)
    {    
		for (i = 0; i < iLengthTotal; i++)
		{
			outputDataD[i] = 0.0;
            outputDataR[i] = 0.0;
		}
        
        clock_gettime(CLOCK_MONOTONIC, &(start[iKernel]));
		for (iRep = 0; iRep < iNoReps; iRep++)
		{
            clSetKernelArg(TheGPAK->TheKernels[iKernel], 0, sizeof(int), &iLengthTotal);
            clSetKernelArg(TheGPAK->TheKernels[iKernel], 1, sizeof(cl_mem), &inputF);
            clSetKernelArg(TheGPAK->TheKernels[iKernel], 2, iSomething * iLocalWorkSize * sizeof(float), NULL); // Was 3
            clSetKernelArg(TheGPAK->TheKernels[iKernel], 3, sizeof(cl_mem), &outputAll); // Was 4         
            clEnqueueNDRangeKernel(TheGCAQ->TheQueue, TheGPAK->TheKernels[iKernel], 1,
                NULL, &iGlobalWorkThing, &(TheGPAK->TheMaxWorkGroupSizes[iKernel]), 0, NULL, NULL);
            clFinish(TheGCAQ->TheQueue);
 
// copy the results from out of the output buffer

            if (iKernel % 2 ==  0)
            {
                clEnqueueReadBuffer(TheGCAQ->TheQueue, outputAll, CL_TRUE, 0, iExpOutputSize * sizeof(float), outputDataD, 0, NULL, NULL);
            }
            else
            {
                clEnqueueReadBuffer(TheGCAQ->TheQueue, outputAll, CL_TRUE, 0, iExpOutputSize * sizeof(float), outputDataR, 0, NULL, NULL);
            }
        
        }
        clock_gettime(CLOCK_MONOTONIC, &(end[iKernel]));    
        if (bPrint)
        {
            for (i = 0; i < iExpOutputSize; i++)
			{
				if (iKernel % 2 == 0)
                {
                    if (outputDataD[i] != fExpDotProdResult[i])
                    {
                        printf
                            ("A problem at kernel %d and iteration %d for actual value %f but expected value %f!\n",
                            iKernel, i, outputDataD[i], fExpDotProdResult[i]);
                        break;
                    }
                }
                else
                {
                    if (outputDataR[i] != fExpReduceResult[i])
                    {
                        printf
                            ("A problem at kernel %d and iteration %d for actual value %f but expected value %f!\n",
                            iKernel, i, outputDataR[i], fExpReduceResult[i]);
                        break;
                    }
                }
			}    
        }
   //     if ((iKernel % 2) == 1)
   //     {
   //         iLengthTotal = iLengthTotal / 2;
    //        iSomething = iSomething * 2;
    //        iGlobalWorkThing = iGlobalWorkThing / 2;
  //      }
    }


    clReleaseMemObject(inputF);
    clReleaseMemObject(outputF);
    clReleaseMemObject(outputAll);
    
// print the results
//    if (bPrint)
//    {
//        printf("output %d: \n", iGlobalSize);
//        for(i=0;i<iExpOutputSize; i++)
//        {
//            printf("%d - %f - %f\n", i, outputDataD[i], outputDataR[i]);
//        }   
//    }
    
// cleanup - release OpenCL resources
    
    free(inputDataF);
    free(outputDataD);
    free(outputDataR);
   
    GPAKShutdown(TheGPAK);
    GCAQShutdown (TheGCAQ);
    printf("%d - ", iGlobalSize);
	for (iKernel = 0; iKernel < iNoKernels; iKernel++)
	{
		printf("%f - ", (1.0 * TLPERS * iGlobalSize * iNoReps) /
			   (MEGAHERTZ * timespecDiff(&(end[iKernel]), &(start[iKernel]))));
	}
    printf("\n");
    return 0;
}
Exemple #24
0
OPENCL_EXPERIMENTS_EXPORT
cl_int opencl_plugin_voxelize_meshes(opencl_plugin plugin,
                                     float inv_element_size,
                                     float corner_x,
                                     float corner_y,
                                     float corner_z,
                                     cl_int x_cell_length,
                                     cl_int y_cell_length,
                                     cl_int z_cell_length,
                                     cl_int mesh_data_count,
                                     mesh_data *mesh_data_list,
                                     cl_uchar *voxel_grid_out)
{
    cl_int err = CL_SUCCESS;
    cl_int i;
    cl_int next_row_offset, next_slice_offset;
    size_t local_work_size;
    cl_int num_voxels;

    clock_t t1;
    clock_t t2;
    clock_t t3;

    assert(plugin != NULL);
    assert(inv_element_size >= 0);
    assert(x_cell_length >= 0);
    assert(y_cell_length >= 0);
    assert(z_cell_length >= 0);
    assert(mesh_data_count >= 0);
    assert(mesh_data_list != NULL);

    t1 = clock();

    /* (Re-)allocate buffer for voxel grid */
    num_voxels = x_cell_length * y_cell_length * z_cell_length;
    if (opencl_plugin_init_voxel_buffer(plugin, num_voxels))
        goto error;

    /* (Re-)allocate buffers for mesh data */
    if (opencl_plugin_init_mesh_buffers(plugin, mesh_data_count, mesh_data_list))
        goto error;

    err = clGetKernelWorkGroupInfo(
        plugin->voxelize_kernel, plugin->selected_device,
        CL_KERNEL_WORK_GROUP_SIZE, sizeof(local_work_size), &local_work_size,
        NULL);
    CHECK_CL_ERROR(err);

    if (enqueue_zero_buffer(plugin->queue, plugin->voxel_grid_buffer,
                            plugin->voxel_grid_buffer_capacity, 0, NULL, NULL,
                            &err))
        goto error;

    err = clFinish(plugin->queue);
    CHECK_CL_ERROR(err);

    t1 = clock() - t1;
    t2 = clock();

    next_row_offset = x_cell_length;
    next_slice_offset = x_cell_length * y_cell_length;

    err |= clSetKernelArg(plugin->voxelize_kernel, 0, sizeof(cl_mem), &plugin->voxel_grid_buffer);
    err |= clSetKernelArg(plugin->voxelize_kernel, 1, sizeof(float),  &inv_element_size);
    err |= clSetKernelArg(plugin->voxelize_kernel, 2, sizeof(float),  &corner_x);
    err |= clSetKernelArg(plugin->voxelize_kernel, 3, sizeof(float),  &corner_y);
    err |= clSetKernelArg(plugin->voxelize_kernel, 4, sizeof(float),  &corner_z);
    err |= clSetKernelArg(plugin->voxelize_kernel, 5, sizeof(cl_int), &next_row_offset);
    err |= clSetKernelArg(plugin->voxelize_kernel, 6, sizeof(cl_int), &next_slice_offset);
    err |= clSetKernelArg(plugin->voxelize_kernel, 7, sizeof(cl_int), &x_cell_length);
    err |= clSetKernelArg(plugin->voxelize_kernel, 8, sizeof(cl_int), &y_cell_length);
    err |= clSetKernelArg(plugin->voxelize_kernel, 9, sizeof(cl_int), &z_cell_length);
    CHECK_CL_ERROR(err);

    for (i = 0; i < mesh_data_count; i++) {
        size_t global_work_size;
        cl_uint vertex_buffer_base_idx = mesh_data_list[i].vertex_buffer_base_idx;
        cl_uint triangle_buffer_base_idx = mesh_data_list[i].triangle_buffer_base_idx;
        err |= clSetKernelArg(plugin->voxelize_kernel, 10, sizeof(cl_mem), &plugin->vertex_buffer);
        err |= clSetKernelArg(plugin->voxelize_kernel, 11, sizeof(cl_mem), &plugin->triangle_buffer);
        err |= clSetKernelArg(plugin->voxelize_kernel, 12, sizeof(cl_int), &mesh_data_list[i].num_triangles);
        err |= clSetKernelArg(plugin->voxelize_kernel, 13, sizeof(cl_uint), &vertex_buffer_base_idx);
        err |= clSetKernelArg(plugin->voxelize_kernel, 14, sizeof(cl_uint), &triangle_buffer_base_idx);
        CHECK_CL_ERROR(err);

        /* As per the OpenCL spec, global_work_size must divide evenly by
         * local_work_size */
        global_work_size = mesh_data_list[i].num_triangles / local_work_size;
        global_work_size *= local_work_size;
        if (global_work_size < (size_t)mesh_data_list[i].num_triangles)
            global_work_size += local_work_size;

        err = clEnqueueNDRangeKernel(
            plugin->queues[i % plugin->num_queues], plugin->voxelize_kernel, 1, NULL, &global_work_size,
            &local_work_size, 0, NULL, NULL);
        CHECK_CL_ERROR_MSG(err, "clEnqueueNDRangeKernel failed on mesh %d/%d",
                           i + 1, mesh_data_count);

        err = clFinish(plugin->queue);
        CHECK_CL_ERROR_MSG(err, "clFinish failed on mesh %d/%d",
                           i + 1, mesh_data_count);
    }

    err = clFinish(plugin->queue);
    CHECK_CL_ERROR(err);

    for (i = 0; i < plugin->num_queues; i++) {
        err = clFinish(plugin->queues[i]);
        CHECK_CL_ERROR(err);
    }

    t2 = clock() - t2;
    t3 = clock();

    err = clEnqueueReadBuffer(
        plugin->queue, plugin->voxel_grid_buffer, CL_TRUE, 0,
        num_voxels, voxel_grid_out, 0, NULL, NULL);
    CHECK_CL_ERROR(err);

    t3 = clock() - t3;

    TRACE("Clock T1: %f", ((float)t1 * 1000.0f) / CLOCKS_PER_SEC);
    TRACE("Clock T2: %f", ((float)t2 * 1000.0f) / CLOCKS_PER_SEC);
    TRACE("Clock T3: %f", ((float)t3 * 1000.0f) / CLOCKS_PER_SEC);
    return 0;
error:
    return -1;
}
int
FastWalshTransform::runCLKernels(void)
{
    cl_int   status;
    size_t globalThreads[1];
    size_t localThreads[1];

    // Enqueue write input to inputBuffer
    cl_event writeEvt;
    status = clEnqueueWriteBuffer(
                 commandQueue,
                 inputBuffer,
                 CL_FALSE,
                 0,
                 length * sizeof(cl_float),
                 input,
                 0,
                 NULL,
                 &writeEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.(commandQueue)");

    status = waitForEventAndRelease(&writeEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(writeEvt) Failed");

    /*
     * The kernel performs a butterfly operation and it runs for half the
     * total number of input elements in the array.
     * In each pass of the kernel two corresponding elements are found using
     * the butterfly operation on an array of numbers and their sum and difference
     * is stored in the same locations as the numbers
     */
    globalThreads[0] = length / 2;
    localThreads[0]  = 256;

    // Check group size against kernelWorkGroupSize
    status = kernelInfo.setKernelWorkGroupInfo(kernel,
             devices[sampleArgs->deviceId]);
    CHECK_OPENCL_ERROR(status, "kernelInfo.setKernelWorkGroupInfo failed.");

    if((cl_uint)(localThreads[0]) > kernelInfo.kernelWorkGroupSize)
    {
        if(!sampleArgs->quiet)
        {
            std::cout << "Out of Resources!" << std::endl;
            std::cout << "Group Size specified : " << localThreads[0] << std::endl;
            std::cout << "Max Group Size supported on the kernel : "
                      << kernelInfo.kernelWorkGroupSize << std::endl;
            std::cout<<"Changing the group size to " << kernelInfo.kernelWorkGroupSize
                     << std::endl;
        }
        localThreads[0] = kernelInfo.kernelWorkGroupSize;
    }

    // Set appropriate arguments to the kernel

    // the input array - also acts as output
    status = clSetKernelArg(
                 kernel,
                 0,
                 sizeof(cl_mem),
                 (void *)&inputBuffer);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (inputBuffer)");

    for(cl_int step = 1; step < length; step <<= 1)
    {
        // stage of the algorithm
        status = clSetKernelArg(
                     kernel,
                     1,
                     sizeof(cl_int),
                     (void *)&step);
        CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (step)");

        // Enqueue a kernel run call
        cl_event ndrEvt;
        status = clEnqueueNDRangeKernel(
                     commandQueue,
                     kernel,
                     1,
                     NULL,
                     globalThreads,
                     localThreads,
                     0,
                     NULL,
                     &ndrEvt);
        CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.");

        status = clFlush(commandQueue);
        CHECK_OPENCL_ERROR(status, "clFlush failed.(commandQueue)");

        status = waitForEventAndRelease(&ndrEvt);
        CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(ndrEvt) Failed");
    }


    // Enqueue readBuffer
    cl_event readEvt;
    status = clEnqueueReadBuffer(
                 commandQueue,
                 inputBuffer,
                 CL_FALSE,
                 0,
                 length *  sizeof(cl_float),
                 output,
                 0,
                 NULL,
                 &readEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.(commandQueue)");

    status = waitForEventAndRelease(&readEvt);
    CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(readEvt) Failed");

    return SDK_SUCCESS;
}
Exemple #26
0
void
nrm2CorrectnessTest(TestParams *params)
{
    cl_int err;
    T1 *blasX;
    T2 *clblasNRM2, *blasNRM2;
    cl_mem bufX, bufNRM2, scratchBuff;
    clMath::BlasBase *base;
    cl_event *events;
    cl_double deltaForType = 0.0;

    base = clMath::BlasBase::getInstance();

    if ((typeid(T1) == typeid(cl_double) ||
         typeid(T1) == typeid(DoubleComplex)) &&
        !base->isDevSupportDoublePrecision()) {

        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

	printf("number of command queues : %d\n\n", params->numCommandQueues);

    events = new cl_event[params->numCommandQueues];
    memset(events, 0, params->numCommandQueues * sizeof(cl_event));

    size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));

    blasX 	= new T1[lengthX + params->offBX ];
	blasNRM2 = new T2[1];
    clblasNRM2 = new T2[1 + params->offa];

	if((blasX == NULL) || (clblasNRM2 == NULL) || (blasNRM2 == NULL))
	{
		::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        deleteBuffers<T1>(blasX);
        deleteBuffers<T2>(blasNRM2,  clblasNRM2);
		delete[] events;
		SUCCEED();
        return;
	}

    srand(params->seed);

	randomVectors<T1>(params->N, (blasX + params->offBX), params->incx, (T1*)NULL, 0, true);

	// Allocate buffers
    bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE);
    bufNRM2 = base->createEnqueueBuffer(NULL, (1 + params->offa) * sizeof(T2), 0, CL_MEM_READ_WRITE);
	scratchBuff = base->createEnqueueBuffer(NULL, (lengthX * 2 * sizeof(T1)), 0, CL_MEM_READ_WRITE);

	*blasNRM2  = ::clMath::blas::nrm2( params->N, blasX, params->offBX, params->incx);

    if ((bufX == NULL) || (bufNRM2 == NULL) || (scratchBuff == NULL)) {
        releaseMemObjects(bufX, bufNRM2, scratchBuff);
        deleteBuffers<T1>(blasX);
        deleteBuffers<T2>(blasNRM2,  clblasNRM2);
        delete[] events;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
            << ::std::endl
            << ">> Can't execute the test, because data is not transfered to GPU."
            << ::std::endl
            << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    DataType type;
    type = ( typeid(T1) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T1) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T1) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE;

    err = (cl_int)::clMath::clblas::nrm2( type, params->N,  bufNRM2, params->offa, bufX,
    					params->offBX, params->incx, scratchBuff, params->numCommandQueues, base->commandQueues(),
    					0, NULL, events);

    if (err != CL_SUCCESS) {
        releaseMemObjects(bufX, bufNRM2, scratchBuff);
        deleteBuffers<T1>(blasX);
        deleteBuffers<T2>(blasNRM2,  clblasNRM2);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::NRM2() failed";
    }

    err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events);
    if (err != CL_SUCCESS) {
        releaseMemObjects(bufX, bufNRM2, scratchBuff);
        deleteBuffers<T1>(blasX);
        deleteBuffers<T2>(blasNRM2,  clblasNRM2);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
    }

    err = clEnqueueReadBuffer(base->commandQueues()[0], bufNRM2, CL_TRUE, 0,
            (1 + params->offa) * sizeof(*clblasNRM2), clblasNRM2, 0, NULL, NULL);
	if (err != CL_SUCCESS) {
		::std::cerr << "NRM2: Reading results failed...." << std::endl;
	}
    releaseMemObjects(bufX, bufNRM2, scratchBuff);

    deltaForType = DELTA_0<T1>();

    // Since every element of X encounters a division, delta would be sum of deltas for every element in X
    cl_double delta = 0;
    for(unsigned int i=0; i<(params->N); i++) {
        delta += deltaForType * returnMax<T1>(blasX[params->offBX + i]);
    }
    compareValues<T2>( (blasNRM2), (clblasNRM2+params->offa), delta);

    if (::testing::Test::HasFailure())
    {
        printTestParams(params->N, params->offBX, params->incx);
        ::std::cerr << "offNRM2 = " << params->offa << ::std::endl;
        ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl;
    }

    deleteBuffers<T1>(blasX);
    deleteBuffers<T2>(blasNRM2,  clblasNRM2);
    delete[] events;
}
Exemple #27
0
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufA, bufX;
    cl_event event = NULL;
    int ret = 0;

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * lda * sizeof(cl_float),
                          NULL, &err);
    bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float),
                          NULL, &err);

    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
                               N * lda * sizeof(cl_float), A, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
                               N * sizeof(cl_float), X, 0, NULL, NULL);

    /* Call clblas function. */
    err = clblasStbsv(order, uplo, trans, diag, N, K,
                      bufA, 0, lda, bufX, 0, incx, 1, &queue, 0, NULL, &event);

    if (err != CL_SUCCESS) {
        printf("clblasStbsv() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float),
                                  X, 0, NULL, NULL);

        /* At this point you will get the result of STBSV placed in X array. */
        printResult();
    }

    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufX);
    clReleaseMemObject(bufA);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
T profileReduce(ReduceType datatype,
                  cl_int  n, 
                  int  numThreads,
                  int  numBlocks,
                  int  maxThreads,
                  int  maxBlocks,
                  int  whichKernel, 
                  int  testIterations,
                  bool cpuFinalReduction,
                  int  cpuFinalThreshold,
                  double* dTotalTime,
                  T* h_odata,
                  cl_mem d_idata, 
                  cl_mem d_odata)
{


    T gpu_result = 0;
    bool needReadBack = true;
    cl_kernel finalReductionKernel[10];
    int finalReductionIterations=0;

    //shrLog("Profile Kernel %d\n", whichKernel);

    cl_kernel reductionKernel = getReductionKernel(datatype, whichKernel, numThreads, isPow2(n) );
    clSetKernelArg(reductionKernel, 0, sizeof(cl_mem), (void *) &d_idata);
    clSetKernelArg(reductionKernel, 1, sizeof(cl_mem), (void *) &d_odata);
    clSetKernelArg(reductionKernel, 2, sizeof(cl_int), &n);
    clSetKernelArg(reductionKernel, 3, sizeof(T) * numThreads, NULL);

    if( !cpuFinalReduction ) {
        int s=numBlocks;
        int threads = 0, blocks = 0;
        int kernel = (whichKernel == 6) ? 5 : whichKernel;
        
        while(s > cpuFinalThreshold) 
        {
            getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);

            finalReductionKernel[finalReductionIterations] = getReductionKernel(datatype, kernel, threads, isPow2(s) );
            clSetKernelArg(finalReductionKernel[finalReductionIterations], 0, sizeof(cl_mem), (void *) &d_odata);
            clSetKernelArg(finalReductionKernel[finalReductionIterations], 1, sizeof(cl_mem), (void *) &d_odata);
            clSetKernelArg(finalReductionKernel[finalReductionIterations], 2, sizeof(cl_int), &n);
            clSetKernelArg(finalReductionKernel[finalReductionIterations], 3, sizeof(T) * numThreads, NULL);
            
            if (kernel < 3)
                s = (s + threads - 1) / threads;
            else
                s = (s + (threads*2-1)) / (threads*2);

            finalReductionIterations++;
        }
    }
    
    size_t globalWorkSize[1];
    size_t localWorkSize[1];

    for (int i = 0; i < testIterations; ++i)
    {
        gpu_result = 0;

        clFinish(cqCommandQueue);
        if(i>0) shrDeltaT(1);

        // execute the kernel
        globalWorkSize[0] = numBlocks * numThreads;
        localWorkSize[0] = numThreads;
	
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue,reductionKernel, 1, 0, globalWorkSize, localWorkSize,
                                          0, NULL, NULL);               

        // check if kernel execution generated an error        
        oclCheckError(ciErrNum, CL_SUCCESS);

        if (cpuFinalReduction)
        {
            // sum partial sums from each block on CPU        
            // copy result from device to host
            clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, numBlocks * sizeof(T), 
                                h_odata, 0, NULL, NULL);

            for(int i=0; i<numBlocks; i++) 
            {
                gpu_result += h_odata[i];
            }

            needReadBack = false;
        }
        else
        {
            // sum partial block sums on GPU
            int s=numBlocks;
            int kernel = (whichKernel == 6) ? 5 : whichKernel;
            int it = 0;
            

            while(s > cpuFinalThreshold) 
            {
                int threads = 0, blocks = 0;
                getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);

                globalWorkSize[0] = threads * blocks;
                localWorkSize[0] = threads;
                
                ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, finalReductionKernel[it], 1, 0,
                                                  globalWorkSize, localWorkSize, 0, NULL, NULL);               
                oclCheckError(ciErrNum, CL_SUCCESS);
                
                if (kernel < 3)
                    s = (s + threads - 1) / threads;
                else
                    s = (s + (threads*2-1)) / (threads*2);

                it++;
            }

            if (s > 1)
            {
                // copy result from device to host
                clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, s * sizeof(T), 
                                    h_odata, 0, NULL, NULL);

                for(int i=0; i < s; i++) 
                {
                    gpu_result += h_odata[i];
                }

                needReadBack = false;
            }
        }

        clFinish(cqCommandQueue);
        if(i>0) *dTotalTime += shrDeltaT(1); 
    }

    if (needReadBack)
    {
        // copy final sum from device to host
        clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, sizeof(T), 
                            &gpu_result, 0, NULL, NULL);
    }

    // Release the kernels
    clReleaseKernel(reductionKernel);
    if( !cpuFinalReduction ) {
        for(int it=0; it<finalReductionIterations; ++it) {
            clReleaseKernel(finalReductionKernel[it]);
        }
        
    }

    return gpu_result;
}
int main(int argc, char const *argv[])
{
        /* Get platform */
        cl_platform_id platform;
        cl_uint num_platforms;
        cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformIDs' failed\n");
                exit(1);
        }
        
        printf("Number of platforms: %d\n", num_platforms);
        printf("platform=%p\n", platform);
        
        /* Get platform name */
        char platform_name[100];
        ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformInfo' failed\n");
                exit(1);
        }
        
        printf("platform.name='%s'\n\n", platform_name);
        
        /* Get device */
        cl_device_id device;
        cl_uint num_devices;
        ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceIDs' failed\n");
                exit(1);
        }
        
        printf("Number of devices: %d\n", num_devices);
        printf("device=%p\n", device);
        
        /* Get device name */
        char device_name[100];
        ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
        device_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceInfo' failed\n");
                exit(1);
        }
        
        printf("device.name='%s'\n", device_name);
        printf("\n");
        
        /* Create a Context Object */
        cl_context context;
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateContext' failed\n");
                exit(1);
        }
        
        printf("context=%p\n", context);
        
        /* Create a Command Queue Object*/
        cl_command_queue command_queue;
        command_queue = clCreateCommandQueue(context, device, 0, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateCommandQueue' failed\n");
                exit(1);
        }
        
        printf("command_queue=%p\n", command_queue);
        printf("\n");

        /* Program source */
        unsigned char *source_code;
        size_t source_length;

        /* Read program from 'remainder_uchar8uchar8.cl' */
        source_code = read_buffer("remainder_uchar8uchar8.cl", &source_length);

        /* Create a program */
        cl_program program;
        program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret);

        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateProgramWithSource' failed\n");
                exit(1);
        }
        printf("program=%p\n", program);

        /* Build program */
        ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
        if (ret != CL_SUCCESS )
        {
                size_t size;
                char *log;

                /* Get log size */
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size);

                /* Allocate log and print */
                log = malloc(size);
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL);
                printf("error: call to 'clBuildProgram' failed:\n%s\n", log);
                
                /* Free log and exit */
                free(log);
                exit(1);
        }

        printf("program built\n");
        printf("\n");
        
        /* Create a Kernel Object */
        cl_kernel kernel;
        kernel = clCreateKernel(program, "remainder_uchar8uchar8", &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateKernel' failed\n");
                exit(1);
        }
        
        /* Create and allocate host buffers */
        size_t num_elem = 10;
        
        /* Create and init host side src buffer 0 */
        cl_uchar8 *src_0_host_buffer;
        src_0_host_buffer = malloc(num_elem * sizeof(cl_uchar8));
        for (int i = 0; i < num_elem; i++)
                src_0_host_buffer[i] = (cl_uchar8){{2, 2, 2, 2, 2, 2, 2, 2}};
        
        /* Create and init device side src buffer 0 */
        cl_mem src_0_device_buffer;
        src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_uchar8), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_uchar8), src_0_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create and init host side src buffer 1 */
        cl_uchar8 *src_1_host_buffer;
        src_1_host_buffer = malloc(num_elem * sizeof(cl_uchar8));
        for (int i = 0; i < num_elem; i++)
                src_1_host_buffer[i] = (cl_uchar8){{2, 2, 2, 2, 2, 2, 2, 2}};
        
        /* Create and init device side src buffer 1 */
        cl_mem src_1_device_buffer;
        src_1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_uchar8), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_uchar8), src_1_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create host dst buffer */
        cl_uchar8 *dst_host_buffer;
        dst_host_buffer = malloc(num_elem * sizeof(cl_uchar8));
        memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_uchar8));

        /* Create device dst buffer */
        cl_mem dst_device_buffer;
        dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_uchar8), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create dst buffer\n");
                exit(1);
        }
        
        /* Set kernel arguments */
        ret = CL_SUCCESS;
        ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer);
        ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer);
        ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clSetKernelArg' failed\n");
                exit(1);
        }

        /* Launch the kernel */
        size_t global_work_size = num_elem;
        size_t local_work_size = num_elem;
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueNDRangeKernel' failed\n");
                exit(1);
        }

        /* Wait for it to finish */
        clFinish(command_queue);

        /* Read results from GPU */
        ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_uchar8), dst_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueReadBuffer' failed\n");
                exit(1);
        }

        /* Dump dst buffer to file */
        char dump_file[100];
        sprintf((char *)&dump_file, "%s.result", argv[0]);
        write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_uchar8));
        printf("Result dumped to %s\n", dump_file);
        /* Free host dst buffer */
        free(dst_host_buffer);

        /* Free device dst buffer */
        ret = clReleaseMemObject(dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }
        
        /* Free host side src buffer 0 */
        free(src_0_host_buffer);

        /* Free device side src buffer 0 */
        ret = clReleaseMemObject(src_0_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }

        /* Free host side src buffer 1 */
        free(src_1_host_buffer);

        /* Free device side src buffer 1 */
        ret = clReleaseMemObject(src_1_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }

        /* Release kernel */
        ret = clReleaseKernel(kernel);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseKernel' failed\n");
                exit(1);
        }

        /* Release program */
        ret = clReleaseProgram(program);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseProgram' failed\n");
                exit(1);
        }
        
        /* Release command queue */
        ret = clReleaseCommandQueue(command_queue);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseCommandQueue' failed\n");
                exit(1);
        }
        
        /* Release context */
        ret = clReleaseContext(context);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseContext' failed\n");
                exit(1);
        }
                
        return 0;
}
Exemple #30
0
int
main(void)
{
  cl_int err;
  cl_platform_id platforms[MAX_PLATFORMS];
  cl_uint nplatforms;
  cl_device_id devices[MAX_DEVICES];
  cl_uint ndevices;
  cl_uint i, j;

  err = clGetPlatformIDs(MAX_PLATFORMS, platforms, &nplatforms);	
  if (err != CL_SUCCESS)
    return EXIT_FAILURE;

  for (i = 0; i < nplatforms; i++)
    {
      err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, MAX_DEVICES,
			   devices, &ndevices);
      if (err != CL_SUCCESS)
	return EXIT_FAILURE;
      
      for (j = 0; j < ndevices; j++)
	{
	  cl_context context = clCreateContext(NULL, 1, &devices[j], NULL, NULL, &err);
	  if (err != CL_SUCCESS)
	    return EXIT_FAILURE;
	  cl_command_queue queue = clCreateCommandQueue(context, devices[j], 0, &err);
	  if (err != CL_SUCCESS)
	    return EXIT_FAILURE;

	  const int buf_size = 1024;
	  cl_int host_buf[buf_size];

	  cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int) * buf_size, NULL, &err);
	  if (err != CL_SUCCESS)
	    return EXIT_FAILURE;
	  cl_event buf_event;
	  if (clEnqueueReadBuffer(queue, buf, CL_TRUE, 0, sizeof(cl_int) * buf_size, &host_buf, 0, NULL, &buf_event) != CL_SUCCESS)
	    return EXIT_FAILURE;
	  clFinish(queue);
	  cl_command_queue event_command_queue;
	  size_t param_val_size_ret;
	  if (clGetEventInfo(buf_event, CL_EVENT_COMMAND_QUEUE, sizeof(cl_command_queue), &event_command_queue, &param_val_size_ret) != CL_SUCCESS)
	    return EXIT_FAILURE;
	  if (param_val_size_ret != sizeof(cl_command_queue) || event_command_queue != queue)
	    return EXIT_FAILURE;

	  cl_command_type command_type;
	  if (clGetEventInfo(buf_event, CL_EVENT_COMMAND_TYPE, sizeof(cl_command_type), &command_type, &param_val_size_ret) != CL_SUCCESS)
	    return EXIT_FAILURE;
	  if (param_val_size_ret != sizeof(cl_command_type) || command_type != CL_COMMAND_READ_BUFFER)
	    return EXIT_FAILURE;

	  cl_int execution_status;
	  if (clGetEventInfo(buf_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &execution_status, &param_val_size_ret) != CL_SUCCESS)
	    return EXIT_FAILURE;
	  if (param_val_size_ret != sizeof(cl_int) || execution_status != CL_COMPLETE)
	    return EXIT_FAILURE;

	  cl_uint ref_count;
	  if (clGetEventInfo(buf_event, CL_EVENT_REFERENCE_COUNT, sizeof(cl_uint), &ref_count, &param_val_size_ret) != CL_SUCCESS)
	    return EXIT_FAILURE;
	  if (param_val_size_ret != sizeof(cl_uint) || ref_count != 1)
        {
          printf("FAIL: expected refcount 1, got %d\n", ref_count);
          return EXIT_FAILURE;
        }

	  clReleaseEvent(buf_event);
	  clReleaseMemObject(buf);
	  clReleaseCommandQueue(queue);
	}
    }
  return EXIT_SUCCESS;
}