// Function to read in kernel from uncompiled source, create the OCL program and build the OCL program 
    // **************************************************************************************************
    int CreateProgramAndKernel(cl_context cxGPUContext, cl_device_id* cdDevices, const char *kernel_name, cl_kernel *kernel, bool bDouble)
    {
        cl_program cpProgram;
        size_t szSourceLen;
        cl_int ciErrNum = CL_SUCCESS; 

        // Read the kernel in from file
        shrLog("\nLoading Uncompiled kernel from .cl file, using %s\n", clSourcefile);
        char* cPathAndFile = shrFindFilePath(clSourcefile, cExecutablePath);
        oclCheckError(cPathAndFile != NULL, shrTRUE);
        char* pcSource = oclLoadProgSource(cPathAndFile, "", &szSourceLen);
        oclCheckError(pcSource != NULL, shrTRUE);

	// Check OpenCL version -> vec3 types are supported only from version 1.1 and above
	char cOCLVersion[32];
	clGetDeviceInfo(cdDevices[0], CL_DEVICE_VERSION, sizeof(cOCLVersion), &cOCLVersion, 0);

	int iVec3Length = 3;
	if( strncmp("OpenCL 1.0", cOCLVersion, 10) == 0 ) {
		iVec3Length = 4;
	}


		//for double precision
		char *pcSourceForDouble;
		std::stringstream header;
		if (bDouble)
		{
			header << "#define REAL double";
			header << std::endl;
			header << "#define REAL4 double4";
			header << std::endl;
			header << "#define REAL3 double" << iVec3Length;
			header << std::endl;
			header << "#define ZERO3 {0.0, 0.0, 0.0" << ((iVec3Length == 4) ? ", 0.0}" : "}");
			header << std::endl;
		}
		else
		{
			header << "#define REAL float";
			header << std::endl;
			header << "#define REAL4 float4";
			header << std::endl;
			header << "#define REAL3 float" << iVec3Length;
			header << std::endl;
			header << "#define ZERO3 {0.0f, 0.0f, 0.0f" << ((iVec3Length == 4) ? ", 0.0f}" : "}");
			header << std::endl;
		}
		
		header << pcSource;
		pcSourceForDouble = (char *)malloc(header.str().size() + 1);
		szSourceLen = header.str().size();
#ifdef WIN32
        strcpy_s(pcSourceForDouble, szSourceLen + 1, header.str().c_str());
#else
        strcpy(pcSourceForDouble, header.str().c_str());
#endif

        // create the program 
        cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&pcSourceForDouble, &szSourceLen, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS);
        shrLog("clCreateProgramWithSource\n"); 

        // Build the program with 'mad' Optimization option
#ifdef MAC
	char *flags = "-cl-fast-relaxed-math -DMAC";
#else
	char *flags = "-cl-fast-relaxed-math";
#endif
        ciErrNum = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL);
        if (ciErrNum != CL_SUCCESS)
        {
            // write out standard error, Build Log and PTX, then cleanup and exit
            shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
            oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
            oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclNbody.ptx");
            oclCheckError(ciErrNum, CL_SUCCESS); 
        }
        shrLog("clBuildProgram\n"); 

        // create the kernel
        *kernel = clCreateKernel(cpProgram, kernel_name, &ciErrNum);
        oclCheckError(ciErrNum, CL_SUCCESS); 
        shrLog("clCreateKernel\n"); 

		size_t wgSize;
		ciErrNum = clGetKernelWorkGroupInfo(*kernel, cdDevices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
		if (wgSize == 64) {
		  shrLog(
			 "ERROR: Minimum work-group size 256 required by this application is not supported on this device.\n");
		  exit(0);
		}
	
		free(pcSourceForDouble);

        return 0;
    }
Ejemplo n.º 2
0
Archivo: ocl.c Proyecto: furyan/cgminer
_clState *initCl(unsigned int gpu, char *name, size_t nameSize)
{
	int patchbfi = 0;
	cl_int status = 0;
	unsigned int i;

	_clState *clState = calloc(1, sizeof(_clState));

	cl_uint numPlatforms;
	cl_platform_id platform = NULL;
	status = clGetPlatformIDs(0, NULL, &numPlatforms);
	if (status != CL_SUCCESS)
	{
		applog(LOG_ERR, "Error: Getting Platforms. (clGetPlatformsIDs)");
		return NULL;
	}

	if (numPlatforms > 0)
	{
		cl_platform_id* platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id));
		status = clGetPlatformIDs(numPlatforms, platforms, NULL);
		if (status != CL_SUCCESS)
		{
			applog(LOG_ERR, "Error: Getting Platform Ids. (clGetPlatformsIDs)");
			return NULL;
		}

		for(i = 0; i < numPlatforms; ++i)
		{
			char pbuff[100];
			status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL);
			if (status != CL_SUCCESS)
			{
				applog(LOG_ERR, "Error: Getting Platform Info. (clGetPlatformInfo)");
				free(platforms);
				return NULL;
			}
			platform = platforms[i];
			if (!strcmp(pbuff, "Advanced Micro Devices, Inc."))
			{
				break;
			}
		}
		free(platforms);
	}

	if (platform == NULL) {
		perror("NULL platform found!\n");
		return NULL;
	}

	size_t nDevices;
	cl_uint numDevices;
	status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
	if (status != CL_SUCCESS)
	{
		applog(LOG_ERR, "Error: Getting Device IDs (num)");
		return NULL;
	}

	cl_device_id *devices;
	if (numDevices > 0 ) {
		devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id));

		/* Now, get the device list data */

		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
		if (status != CL_SUCCESS)
		{
			applog(LOG_ERR, "Error: Getting Device IDs (list)");
			return NULL;
		}

		applog(LOG_INFO, "List of devices:");

		unsigned int i;
		for(i=0; i<numDevices; i++) {
			char pbuff[100];
			status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);
			if (status != CL_SUCCESS)
			{
				applog(LOG_ERR, "Error: Getting Device Info");
				return NULL;
			}

			applog(LOG_INFO, "\t%i\t%s", i, pbuff);
		}

		if (gpu < numDevices) {
			char pbuff[100];
			status = clGetDeviceInfo(devices[gpu], CL_DEVICE_NAME, sizeof(pbuff), pbuff, &nDevices);
			if (status != CL_SUCCESS)
			{
				applog(LOG_ERR, "Error: Getting Device Info");
				return NULL;
			}

			applog(LOG_INFO, "Selected %i: %s", gpu, pbuff);
			strncpy(name, pbuff, nameSize);
		} else {
			applog(LOG_ERR, "Invalid GPU %i", gpu);
			return NULL;
		}

	} else return NULL;

	cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };

	clState->context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status);
	if (status != CL_SUCCESS)
	{
		applog(LOG_ERR, "Error: Creating Context. (clCreateContextFromType)");
		return NULL;
	}

	/* Check for BFI INT support. Hopefully people don't mix devices with
	 * and without it! */
	char * extensions = malloc(1024);
	const char * camo = "cl_amd_media_ops";
	char *find;

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_EXTENSIONS, 1024, (void *)extensions, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_EXTENSIONS");
		return NULL;
	}
	find = strstr(extensions, camo);
	if (find)
		clState->hasBitAlign = patchbfi = 1;

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), (void *)&clState->preferred_vwidth, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT");
		return NULL;
	}
	if (opt_debug)
		applog(LOG_DEBUG, "Preferred vector width reported %d", clState->preferred_vwidth);

	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&clState->max_work_size, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_WORK_GROUP_SIZE");
		return NULL;
	}
	if (opt_debug)
		applog(LOG_DEBUG, "Max work group size reported %d", clState->max_work_size);

	/* For some reason 2 vectors is still better even if the card says
	 * otherwise, and many cards lie about their max so use 256 as max
	 * unless explicitly set on the command line */
	if (clState->preferred_vwidth > 1)
		clState->preferred_vwidth = 2;
	if (opt_vectors)
		clState->preferred_vwidth = opt_vectors;
	if (opt_worksize && opt_worksize <= clState->max_work_size)
		clState->work_size = opt_worksize;
	else
		clState->work_size = (clState->max_work_size <= 256 ? clState->max_work_size : 256) /
				clState->preferred_vwidth;

	/* Create binary filename based on parameters passed to opencl
	 * compiler to ensure we only load a binary that matches what would
	 * have otherwise created. The filename is:
	 * name + kernelname +/i bitalign + v + vectors + w + work_size + sizeof(long) + .bin
	 */
	char binaryfilename[255];
	char numbuf[10];
	char filename[16];

	if (chosen_kernel == KL_NONE) {
		if (clState->hasBitAlign)
			chosen_kernel = KL_PHATK;
		else
			chosen_kernel = KL_POCLBM;
	}

	switch (chosen_kernel) {
		case KL_POCLBM:
			strcpy(filename, "poclbm110817.cl");
			strcpy(binaryfilename, "poclbm110817");
			break;
		case KL_NONE: /* Shouldn't happen */
		case KL_PHATK:
			strcpy(filename, "phatk110817.cl");
			strcpy(binaryfilename, "phatk110817");
			break;
	}

	FILE *binaryfile;
	size_t *binary_sizes;
	char **binaries;
	int pl;
	char *source, *rawsource = file_contents(filename, &pl);
	size_t sourceSize[] = {(size_t)pl};

	if (!rawsource)
		return NULL;

	source = malloc(pl);
	if (!source) {
		applog(LOG_ERR, "Unable to malloc source");
		return NULL;
	}

	binary_sizes = (size_t *)malloc(sizeof(size_t)*nDevices);
	if (unlikely(!binary_sizes)) {
		applog(LOG_ERR, "Unable to malloc binary_sizes");
		return NULL;
	}
	binaries = (char **)malloc(sizeof(char *)*nDevices);
	if (unlikely(!binaries)) {
		applog(LOG_ERR, "Unable to malloc binaries");
		return NULL;
	}

	strcat(binaryfilename, name);
	if (clState->hasBitAlign)
		strcat(binaryfilename, "bitalign");

	strcat(binaryfilename, "v");
	sprintf(numbuf, "%d", clState->preferred_vwidth);
	strcat(binaryfilename, numbuf);
	strcat(binaryfilename, "w");
	sprintf(numbuf, "%d", (int)clState->work_size);
	strcat(binaryfilename, numbuf);
	strcat(binaryfilename, "long");
	sprintf(numbuf, "%d", (int)sizeof(long));
	strcat(binaryfilename, numbuf);
	strcat(binaryfilename, ".bin");

	binaryfile = fopen(binaryfilename, "rb");
	if (!binaryfile) {
		if (opt_debug)
			applog(LOG_DEBUG, "No binary found, generating from source");
	} else {
		struct stat binary_stat;

		if (unlikely(stat(binaryfilename, &binary_stat))) {
			if (opt_debug)
				applog(LOG_DEBUG, "Unable to stat binary, generating from source");
			fclose(binaryfile);
			goto build;
		}
		binary_sizes[gpu] = binary_stat.st_size;
		binaries[gpu] = (char *)malloc(binary_sizes[gpu]);
		if (unlikely(!binaries[gpu])) {
			applog(LOG_ERR, "Unable to malloc binaries");
			fclose(binaryfile);
			return NULL;
		}

		if (fread(binaries[gpu], 1, binary_sizes[gpu], binaryfile) != binary_sizes[gpu]) {
			applog(LOG_ERR, "Unable to fread binaries[gpu]");
			fclose(binaryfile);
			goto build;
		}
		fclose(binaryfile);

		clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[gpu], (const unsigned char **)&binaries[gpu], &status, NULL);
		if (status != CL_SUCCESS)
		{
			applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithBinary)");
			return NULL;
		}
		if (opt_debug)
			applog(LOG_DEBUG, "Loaded binary image %s", binaryfilename);

		free(binaries[gpu]);
		goto built;
	}

	/////////////////////////////////////////////////////////////////
	// Load CL file, build CL program object, create CL kernel object
	/////////////////////////////////////////////////////////////////

build:
	memcpy(source, rawsource, pl);

	/* Patch the source file with the preferred_vwidth */
	if (clState->preferred_vwidth > 1) {
		char *find = strstr(source, "VECTORSX");

		if (unlikely(!find)) {
			applog(LOG_ERR, "Unable to find VECTORSX in source");
			return NULL;
		}
		find += 7; // "VECTORS"
		if (clState->preferred_vwidth == 2)
			strncpy(find, "2", 1);
		else
			strncpy(find, "4", 1);
		if (opt_debug)
			applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->preferred_vwidth);
	}

	/* Patch the source file defining BITALIGN */
	if (clState->hasBitAlign) {
		char *find = strstr(source, "BITALIGNX");

		if (unlikely(!find)) {
			applog(LOG_ERR, "Unable to find BITALIGNX in source");
			return NULL;
		}
		find += 8; // "BITALIGN"
		strncpy(find, " ", 1);
		if (opt_debug)
			applog(LOG_DEBUG, "cl_amd_media_ops found, patched source with BITALIGN");
	} else if (opt_debug)
		applog(LOG_DEBUG, "cl_amd_media_ops not found, will not BITALIGN patch");

	if (patchbfi) {
		char *find = strstr(source, "BFI_INTX");

		if (unlikely(!find)) {
			applog(LOG_ERR, "Unable to find BFI_INTX in source");
			return NULL;
		}
		find += 7; // "BFI_INT"
		strncpy(find, " ", 1);
		if (opt_debug)
			applog(LOG_DEBUG, "cl_amd_media_ops found, patched source with BFI_INT");
	} else if (opt_debug)
		applog(LOG_DEBUG, "cl_amd_media_ops not found, will not BFI_INT patch");

	clState->program = clCreateProgramWithSource(clState->context, 1, (const char **)&source, sourceSize, &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithSource)");
		return NULL;
	}

	clRetainProgram(clState->program);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Retaining Program (clRetainProgram)");
		return NULL;
	}

	/* create a cl program executable for all the devices specified */
	char CompilerOptions[256];
	sprintf(CompilerOptions, "%s%i", "-DWORKSIZE=", (int)clState->work_size);
	//int n = 1000;
	//while(n--)
	//	printf("%s", CompilerOptions);
	//return 1;
	status = clBuildProgram(clState->program, 1, &devices[gpu], CompilerOptions , NULL, NULL);

	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Building Program (clBuildProgram)");
		size_t logSize;
		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);

		char *log = malloc(logSize);
		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
		applog(LOG_INFO, "%s", log);
		return NULL;
	}

	status = clGetProgramInfo( clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*nDevices, binary_sizes, NULL );
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error: Getting program info CL_PROGRAM_BINARY_SIZES. (clGetPlatformInfo)");
		return NULL;
	}

	/* copy over all of the generated binaries. */
	if (opt_debug)
		applog(LOG_DEBUG, "binary size %d : %d", gpu, binary_sizes[gpu]);
	if (!binary_sizes[gpu]) {
		applog(LOG_ERR, "OpenCL compiler generated a zero sized binary, may need to reboot!");
		return NULL;
	}
	binaries[gpu] = (char *)malloc( sizeof(char)*binary_sizes[gpu]);
	status = clGetProgramInfo( clState->program, CL_PROGRAM_BINARIES, sizeof(char *)*nDevices, binaries, NULL );
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error: Getting program info. (clGetPlatformInfo)");
		return NULL;
	}

	/* Patch the kernel if the hardware supports BFI_INT */
	if (patchbfi) {
		unsigned remaining = binary_sizes[gpu];
		char *w = binaries[gpu];
		unsigned int start, length;

		/* Find 2nd incidence of .text, and copy the program's
		* position and length at a fixed offset from that. Then go
		* back and find the 2nd incidence of \x7ELF (rewind by one
		* from ELF) and then patch the opcocdes */
		if (!advance(&w, &remaining, ".text"))
			{patchbfi = 0; goto build;}
		w++; remaining--;
		if (!advance(&w, &remaining, ".text")) {
			/* 32 bit builds only one ELF */
			w--; remaining++;
		}
		memcpy(&start, w + 285, 4);
		memcpy(&length, w + 289, 4);
		w = binaries[gpu]; remaining = binary_sizes[gpu];
		if (!advance(&w, &remaining, "ELF"))
			{patchbfi = 0; goto build;}
		w++; remaining--;
		if (!advance(&w, &remaining, "ELF")) {
			/* 32 bit builds only one ELF */
			w--; remaining++;
		}
		w--; remaining++;
		w += start; remaining -= start;
		if (opt_debug)
			applog(LOG_DEBUG, "At %p (%u rem. bytes), to begin patching",
				w, remaining);
		patch_opcodes(w, length);

		status = clReleaseProgram(clState->program);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error: Releasing program. (clReleaseProgram)");
			return NULL;
		}

		clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[gpu], (const unsigned char **)&binaries[gpu], &status, NULL);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithBinary)");
			return NULL;
		}

		clRetainProgram(clState->program);
		if (status != CL_SUCCESS) {
			applog(LOG_ERR, "Error: Retaining Program (clRetainProgram)");
			return NULL;
		}
	}

	free(source);
	free(rawsource);

	/* Save the binary to be loaded next time */
	binaryfile = fopen(binaryfilename, "wb");
	if (!binaryfile) {
		/* Not a fatal problem, just means we build it again next time */
		if (opt_debug)
			applog(LOG_DEBUG, "Unable to create file %s", binaryfilename);
	} else {
		if (unlikely(fwrite(binaries[gpu], 1, binary_sizes[gpu], binaryfile) != binary_sizes[gpu])) {
			applog(LOG_ERR, "Unable to fwrite to binaryfile");
			return NULL;
		}
		fclose(binaryfile);
	}
	if (binaries[gpu])
		free(binaries[gpu]);
built:
	free(binaries);
	free(binary_sizes);

	applog(LOG_INFO, "Initialising kernel %s with%s BFI_INT patching, %d vectors and worksize %d",
	       filename, patchbfi ? "" : "out", clState->preferred_vwidth, clState->work_size);

	/* create a cl program executable for all the devices specified */
	status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Building Program (clBuildProgram)");
		size_t logSize;
		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);

		char *log = malloc(logSize);
		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
		applog(LOG_INFO, "%s", log);
		return NULL;
	}

	/* get a kernel object handle for a kernel with the given name */
	clState->kernel = clCreateKernel(clState->program, "search", &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: Creating Kernel from program. (clCreateKernel)");
		return NULL;
	}

	/////////////////////////////////////////////////////////////////
	// Create an OpenCL command queue
	/////////////////////////////////////////////////////////////////
	clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu],
						     CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status);
	if (status != CL_SUCCESS) /* Try again without OOE enable */
		clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Creating Command Queue. (clCreateCommandQueue)");
		return NULL;
	}

	clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, BUFFERSIZE, NULL, &status);
	if (status != CL_SUCCESS) {
		applog(LOG_ERR, "Error: clCreateBuffer (outputBuffer)");
		return NULL;
	}

	return clState;
}
int main(int argc, char const *argv[])
{
        /* Get platform */
        cl_platform_id platform;
        cl_uint num_platforms;
        cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformIDs' failed\n");
                exit(1);
        }
        
        printf("Number of platforms: %d\n", num_platforms);
        printf("platform=%p\n", platform);
        
        /* Get platform name */
        char platform_name[100];
        ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetPlatformInfo' failed\n");
                exit(1);
        }
        
        printf("platform.name='%s'\n\n", platform_name);
        
        /* Get device */
        cl_device_id device;
        cl_uint num_devices;
        ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceIDs' failed\n");
                exit(1);
        }
        
        printf("Number of devices: %d\n", num_devices);
        printf("device=%p\n", device);
        
        /* Get device name */
        char device_name[100];
        ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
        device_name, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clGetDeviceInfo' failed\n");
                exit(1);
        }
        
        printf("device.name='%s'\n", device_name);
        printf("\n");
        
        /* Create a Context Object */
        cl_context context;
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateContext' failed\n");
                exit(1);
        }
        
        printf("context=%p\n", context);
        
        /* Create a Command Queue Object*/
        cl_command_queue command_queue;
        command_queue = clCreateCommandQueue(context, device, 0, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateCommandQueue' failed\n");
                exit(1);
        }
        
        printf("command_queue=%p\n", command_queue);
        printf("\n");

        /* Program source */
        unsigned char *source_code;
        size_t source_length;

        /* Read program from 'logb_float.cl' */
        source_code = read_buffer("logb_float.cl", &source_length);

        /* Create a program */
        cl_program program;
        program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret);

        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateProgramWithSource' failed\n");
                exit(1);
        }
        printf("program=%p\n", program);

        /* Build program */
        ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
        if (ret != CL_SUCCESS )
        {
                size_t size;
                char *log;

                /* Get log size */
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size);

                /* Allocate log and print */
                log = malloc(size);
                clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL);
                printf("error: call to 'clBuildProgram' failed:\n%s\n", log);
                
                /* Free log and exit */
                free(log);
                exit(1);
        }

        printf("program built\n");
        printf("\n");
        
        /* Create a Kernel Object */
        cl_kernel kernel;
        kernel = clCreateKernel(program, "logb_float", &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clCreateKernel' failed\n");
                exit(1);
        }
        
        /* Create and allocate host buffers */
        size_t num_elem = 10;
        
        /* Create and init host side src buffer 0 */
        cl_float *src_0_host_buffer;
        src_0_host_buffer = malloc(num_elem * sizeof(cl_float));
        for (int i = 0; i < num_elem; i++)
                src_0_host_buffer[i] = (cl_float)(2.0);
        
        /* Create and init device side src buffer 0 */
        cl_mem src_0_device_buffer;
        src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_float), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create source buffer\n");
                exit(1);
        }        
        ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_float), src_0_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueWriteBuffer' failed\n");
                exit(1);
        }

        /* Create host dst buffer */
        cl_float *dst_host_buffer;
        dst_host_buffer = malloc(num_elem * sizeof(cl_float));
        memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_float));

        /* Create device dst buffer */
        cl_mem dst_device_buffer;
        dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_float), NULL, &ret);
        if (ret != CL_SUCCESS)
        {
                printf("error: could not create dst buffer\n");
                exit(1);
        }
        
        /* Set kernel arguments */
        ret = CL_SUCCESS;
        ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer);
        ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clSetKernelArg' failed\n");
                exit(1);
        }

        /* Launch the kernel */
        size_t global_work_size = num_elem;
        size_t local_work_size = num_elem;
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueNDRangeKernel' failed\n");
                exit(1);
        }

        /* Wait for it to finish */
        clFinish(command_queue);

        /* Read results from GPU */
        ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_float), dst_host_buffer, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clEnqueueReadBuffer' failed\n");
                exit(1);
        }

        /* Dump dst buffer to file */
        char dump_file[100];
        sprintf((char *)&dump_file, "%s.result", argv[0]);
        write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_float));
        printf("Result dumped to %s\n", dump_file);
        /* Free host dst buffer */
        free(dst_host_buffer);

        /* Free device dst buffer */
        ret = clReleaseMemObject(dst_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }
        
        /* Free host side src buffer 0 */
        free(src_0_host_buffer);

        /* Free device side src buffer 0 */
        ret = clReleaseMemObject(src_0_device_buffer);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseMemObject' failed\n");
                exit(1);
        }

        /* Release kernel */
        ret = clReleaseKernel(kernel);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseKernel' failed\n");
                exit(1);
        }

        /* Release program */
        ret = clReleaseProgram(program);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseProgram' failed\n");
                exit(1);
        }
        
        /* Release command queue */
        ret = clReleaseCommandQueue(command_queue);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseCommandQueue' failed\n");
                exit(1);
        }
        
        /* Release context */
        ret = clReleaseContext(context);
        if (ret != CL_SUCCESS)
        {
                printf("error: call to 'clReleaseContext' failed\n");
                exit(1);
        }
                
        return 0;
}
Ejemplo n.º 4
0
void OpenCLExecuter::ocl_filter_shared(void)
{
	cl_int err;							// debugging variables
	size_t szParmDataBytes;				// Byte size of context information        
	cl_mem src_buffer;					// OpenCL device source buffer
	cl_mem dst_buffer;					// OpenCL device source buffer
	cl_sampler sampler;					// OpenCL sampler
	cl_kernel ckKernel;					// OpenCL kernel

	int iNumElements = volobj->texwidth*volobj->texheight*volobj->texdepth; // Length of float arrays to process

	// set Local work size dimensions
//	size_t local_threads[3] ={256,256,64};
	// set Global work size dimensions
//	size_t global_threads[3] ={roundup((int) volobj->texwidth/local_threads[0], 0)*local_threads[0], roundup((int) volobj->texheight/local_threads[1], 0)*local_threads[1], roundup((int) volobj->texdepth/local_threads[2], 0)*local_threads[2]};

	// set Global work size dimensions
	size_t global_threads[3] ={volobj->texwidth, volobj->texheight, volobj->texdepth};

	// allocate the source buffer memory object
	src_buffer = clCreateFromGLTexture3D (ocl_wrapper->context, CL_MEM_READ_WRITE, GL_TEXTURE_3D, 0, volobj->TEXTURE3D_RED, &err);
	printf("OPENCL: clCreateFromGLTexture3D: %s\n", ocl_wrapper->get_error(err));

	// allocate the destination buffer memory object
	dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE,  sizeof(unsigned char) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));

	// create a sampler object
	sampler = clCreateSampler(ocl_wrapper->context, CL_FALSE, CL_ADDRESS_CLAMP, CL_FILTER_NEAREST, &err);
	printf("OPENCL: clCreateSampler: %s\n", ocl_wrapper->get_error(err));
 
    // Create the kernel
	ckKernel = clCreateKernel (cpProgram, "myFunc", &err);
	printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
  
	// Set the Argument values
	err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 2, sizeof(sampler), (void*)&sampler);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));

	size_t local;
	err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL);
	printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err));
	printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local);

	// grab input data from OpenGL, compute, copy the results back to OpenGL
	// Runs asynchronous to host, up until blocking clFinish at the end

	glFinish();
	glFlush();
	
	// grab the OpenGL texture object for read/writing from OpenCL
	err = clEnqueueAcquireGLObjects(ocl_wrapper->commandQue, 1, &src_buffer, 0,NULL,NULL);
	printf("OPENCL: clEnqueueAcquireGLObjects: %s\n", ocl_wrapper->get_error(err));

	// Execute a kernel
	err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL);
	printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));

	/*
	// Blocking read of results from GPU to Host
	int size = volobj->texwidth*volobj->texheight*volobj->texdepth;
	unsigned char* result = new unsigned char[size];
	err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, result, 0, NULL, NULL);
	printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));
	for(int i=0; i<size; i++) volobj->texture3d[3*i+0] = result[i];
	delete[] result;
	*/

	// copy OpenCL buffer to OpenGl texture
	size_t corigin[3] = {0,0,0};
	size_t cdimensions[3] = {(unsigned int)volobj->texwidth, (unsigned int)volobj->texheight, (unsigned int)volobj->texdepth};
	err = clEnqueueCopyBufferToImage(ocl_wrapper->commandQue , dst_buffer, src_buffer, 0, corigin, cdimensions, 0, NULL, NULL);
	printf("OPENCL: clEnqueueCopyBufferToImage: %s\n", ocl_wrapper->get_error(err));

	//make sure we block until we are done.
	//err = clFinish(ocl_wrapper->commandQue);
	//printf("OPENCL: clFinish: %s\n", ocl_wrapper->get_error(err));
	
	//release opengl objects now
	err = clEnqueueReleaseGLObjects(ocl_wrapper->commandQue, 1, &src_buffer, 0,0,0);
	printf("OPENCL: clEnqueueAcquireGLObjects: %s\n", ocl_wrapper->get_error(err));

	// Cleanup allocated objects
	printf("OPENCL: Releasing kernel memory\n");
    if(ckKernel)clReleaseKernel(ckKernel); 
   
    //need to release any other OpenCL memory objects here
    if(src_buffer)clReleaseMemObject(src_buffer);
    if(dst_buffer)clReleaseMemObject(dst_buffer);
}
Ejemplo n.º 5
0
void OpenCLExecuter::ocl_parrallelReduction(void)
{
	cl_int err;							// debugging variables
	size_t szParmDataBytes;				// Byte size of context information        
	cl_mem src_buffer;					// OpenCL device source buffer
	cl_mem tmp_buffer;					// OpenCL device source buffer
	cl_mem dst_buffer;					// OpenCL device source buffer
	size_t szGlobalWorkSize;			// 1D var for Total # of work items
	size_t szLocalWorkSize;				// 1D var for # of work items in the work group
	size_t numWorkGroups;
	cl_kernel ckKernel;					// OpenCL kernel

	int iNumElements = 65536; //65536 // Length of float arrays to process

	// set Local work size dimensions
	szLocalWorkSize = 512;
	// set Global work size dimensions
	szGlobalWorkSize = roundup((int) iNumElements/szLocalWorkSize, 0)*szLocalWorkSize;  
	//szGlobalWorkSize = iNumElements;
	numWorkGroups = (float)szGlobalWorkSize/(float)szLocalWorkSize;
	printf("OPENCL: number of elements: %d\n", (int)iNumElements);
	printf("OPENCL: local worksize: %d\n", (int)szLocalWorkSize);
	printf("OPENCL: global worksize: %d\n", (int)szGlobalWorkSize);
	printf("OPENCL: work groups: %d\n", (int)(numWorkGroups));
	
	//temp array
	int* data = new int[iNumElements];

	for(int i=0; i<iNumElements; i++)
		data[i] = randomFloat(1.0, (float)iNumElements);

	data[iNumElements/2] = -100.0;

	//for(int i=0; i<iNumElements; i++)
	//	printf("data: %d\n", data[i]);

	size_t global_threads[1] ={iNumElements};

	// allocate the source buffer memory object
	src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY,  sizeof(int) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
		
	// allocate the temp buffer memory object
	tmp_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE,  sizeof(int) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
	
	// allocate the destination buffer memory object
	dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_WRITE_ONLY,  sizeof(int) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));

    // Create the kernel
	ckKernel = clCreateKernel (cpProgram, "min_reduce", &err);
	printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
  
	// Set the Argument values
	err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 1, sizeof(int)*szLocalWorkSize, NULL);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 2, sizeof(int), (void*)&iNumElements);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 3, sizeof(cl_mem), (void*)&dst_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));

	// Copy input data to GPU, compute, copy results back
	// Runs asynchronous to host, up until blocking read at end
	
	int numb_iterations = sqrt((float)numWorkGroups);
	numb_iterations=0;
	bool cont = true;

	Timer timer;
	timer.startTimer();
	//for(int i=0; i<numb_iterations; i++)
	while(cont)
	{
		// Write data from host to GPU
		err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(int) * iNumElements, data, 0, NULL, NULL);
		printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));
	
		// Launch kernel 
		err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
		printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));

		// Blocking read of results from GPU to Host
		err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(int) * iNumElements, data, 0, NULL,  NULL);
		printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));
		
		numb_iterations++;
		if(data[1]==0) cont = false;

		//printf("min: %d\n", data[0]);
		for(int i=0; i<numWorkGroups; i++)
			printf("min: %d\n", data[i]);
	}
	timer.endTimer("GPU find min");

	timer.startTimer();
	int min=iNumElements;
	for(int i=0; i<iNumElements; i++)
		if(data[i]<min) min = data[i];
	timer.endTimer("CPU find min");

	printf("iters: %d\n", numb_iterations);
	printf("gpu-min: %d\n", data[0]);
	printf("cpu-min: %d\n", min);
	
	// Cleanup allocated objects
	printf("OPENCL: Releasing kernel memory\n");
    if(ckKernel)clReleaseKernel(ckKernel); 


    //need to release any other OpenCL memory objects here
    if(dst_buffer)clReleaseMemObject(dst_buffer);
    if(src_buffer)clReleaseMemObject(src_buffer);


//	printf("min: %d\n", data[0]);

	delete[] data;
}
Ejemplo n.º 6
0
void OpenCLExecuter::ocl_filter_multi(void)
{
	cl_int err;										// debugging variables
	size_t szParmDataBytes;							// Byte size of context information        

	cl_mem src_buffer[MAX_DEVICES];					// OpenCL device source buffer
	cl_mem dst_buffer[MAX_DEVICES];					// OpenCL device source buffer
	cl_command_queue queues[MAX_DEVICES];			// OpenCL device queue
	cl_kernel ckKernel[MAX_DEVICES];				// OpenCL kernel

	cl_event gpuDone[MAX_DEVICES];

//	int iNumElements = volobj->texwidth*volobj->texheight*volobj->texdepth*3; // Length of float arrays to process

	int xdim, ydim, zdim;
	xdim = (float)volobj->texwidth; // (float)ocl_wrapper->numDevices;
	ydim = (float)volobj->texheight; // (float)ocl_wrapper->numDevices;
	zdim = (float)volobj->texdepth / (float)ocl_wrapper->numDevices;

	//Length of array to process
	int iNumElements = (xdim*ydim*zdim);
	size_t global_threads[3] = {xdim, ydim, zdim};
	
	//temp array
	unsigned char** data = new unsigned char*[ocl_wrapper->numDevices];

	for(int i=0; i<ocl_wrapper->numDevices; i++)
		data[i] = new unsigned char[iNumElements];

	for(int i=0; i<ocl_wrapper->numDevices; i++)
	{
		printf("OPENCL: Computing Device%d\n", i);

		//create the command queue we will use to execute OpenCL commands
		queues[i] = clCreateCommandQueue(ocl_wrapper->context, ocl_wrapper->devices[i], 0, &err);
		printf("OPENCL: clCreateCommandQueue: %s\n", ocl_wrapper->get_error(err));
		
		// allocate the source buffer memory object
		src_buffer[i] = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY,  sizeof(unsigned char) * iNumElements, NULL, &err);
		printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
		
		// allocate the destination buffer memory object
		dst_buffer[i] = clCreateBuffer (ocl_wrapper->context, CL_MEM_WRITE_ONLY,  sizeof(unsigned char) * iNumElements, NULL, &err);
		printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));

		// Create the kernel
		ckKernel[i] = clCreateKernel (cpProgram, "myFunc", &err);
		printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
  
		// Set the Argument values
		err = clSetKernelArg (ckKernel[i], 0, sizeof(cl_mem), (void*)&src_buffer[i]);
		printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
		err = clSetKernelArg (ckKernel[i], 1, sizeof(cl_mem), (void*)&dst_buffer[i]);
		printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
		err = clSetKernelArg (ckKernel[i], 2, sizeof(int), (void*)&global_threads[0]);
		printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
		err = clSetKernelArg (ckKernel[i], 3, sizeof(int), (void*)&global_threads[1]);
		printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
		err = clSetKernelArg (ckKernel[i], 4, sizeof(int), (void*)&global_threads[2]);
		printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	
		//Prepare data to upload
		int iOffsetElements = (xdim*ydim*zdim*i);
		for(int j=iOffsetElements; j<iNumElements+iOffsetElements; j++)
			data[i][j-iOffsetElements] = volobj->texture3d[3*j+0];

		// Write data from host to GPU
		err = clEnqueueWriteBuffer (queues[i], src_buffer[i], CL_FALSE, 0, sizeof(unsigned char) * iNumElements, data[i], 0, NULL, NULL);
		printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));
	}

	for(int i=0; i<ocl_wrapper->numDevices; i++)
	{
		// Launch kernel 
		err = clEnqueueNDRangeKernel (queues[i], ckKernel[i], 3, NULL, global_threads, NULL, 0, NULL, NULL);
		printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));
	}

	for(int i=0; i<ocl_wrapper->numDevices; i++)
	{
		// Blocking read of results from GPU to Host
		err = clEnqueueReadBuffer (queues[i], dst_buffer[i], CL_TRUE, 0, sizeof(unsigned char) * iNumElements, data[i], 0, NULL,  &gpuDone[i]);
		printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));
	}

	// Synchronize with the GPUs
    printf("OPENCL: Waiting for devices to sync\n");
	clWaitForEvents(ocl_wrapper->numDevices, gpuDone);

	for(int i=0; i<ocl_wrapper->numDevices; i++)
	{
		//read data back
		int iOffsetElements = (xdim*ydim*zdim*i);
		for(int j=iOffsetElements; j<iNumElements+iOffsetElements; j++)
			volobj->texture3d[3*j+0] = data[i][j-iOffsetElements];
	}

	for(int i=0; i<ocl_wrapper->numDevices; i++)
	{
		// Cleanup allocated objects
		printf("OPENCL: Releasing kernel memory\n");
		if(ckKernel[i])clReleaseKernel(ckKernel[i]); 
   
		//need to release any other OpenCL memory objects here
		if(dst_buffer[i])clReleaseMemObject(dst_buffer[i]);
		if(src_buffer[i])clReleaseMemObject(src_buffer[i]);
	}

	for(int i=0; i<ocl_wrapper->numDevices; i++)
		delete[] data[i];

	delete[] data;
}
Ejemplo n.º 7
0
void OpenCLExecuter::ocl_filterBoundingBox(int channel, int window_size)
{
	cl_int err;							// debugging variables
	size_t szParmDataBytes;				// Byte size of context information        
	cl_mem src_buffer;					// OpenCL device source buffer
	cl_mem bbmin_buffer;				// OpenCL device source buffer
	cl_mem bbmax_buffer;				// OpenCL device source buffer
	size_t szGlobalWorkSize;			// 1D var for Total # of work items
	size_t szLocalWorkSize;				// 1D var for # of work items in the work group
	cl_kernel ckKernel;					// OpenCL kernel

	cl_int4 minbb;
	cl_int4 maxbb;

	minbb.s[0] = minbb.s[1] = minbb.s[2] = 8192;
	maxbb.s[0] = maxbb.s[1] = maxbb.s[2] = -8192;

	int iNumElements = 3*volobj->texwidth*volobj->texheight*volobj->texdepth; // Length of float arrays to process

	size_t global_threads[3] ={volobj->texwidth, volobj->texheight, volobj->texdepth};

	// allocate the source buffer memory object
	src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY,  sizeof(unsigned char) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
		
	// allocate the destination buffer memory object
	bbmin_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE,  sizeof(cl_int4), NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));

 	bbmax_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE,  sizeof(cl_int4), NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));

	// Create the kernel
	ckKernel = clCreateKernel (cpProgram, "myFunc", &err);
	printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
  
	// Set the Argument values
	err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&bbmin_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 2, sizeof(cl_mem), (void*)&bbmax_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 2, sizeof(int), (void*)&volobj->texwidth);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 3, sizeof(int), (void*)&volobj->texheight);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&volobj->texdepth);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 5, sizeof(int), (void*)&channel);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));

	// Copy input data to GPU, compute, copy results back
	// Runs asynchronous to host, up until blocking read at end

	// Write data from host to GPU
	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, volobj->texture3d, 0, NULL, NULL);
	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));
	
	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, bbmin_buffer, CL_FALSE, 0, sizeof(cl_int4), (void*)&minbb, 0, NULL, NULL);
	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));

	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, bbmax_buffer, CL_FALSE, 0, sizeof(cl_int4), (void*)&maxbb, 0, NULL, NULL);
	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));

	// Launch kernel
	err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL);
	printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));

	// Blocking read of results from GPU to Host
	err = clEnqueueReadBuffer (ocl_wrapper->commandQue, bbmin_buffer, CL_TRUE, 0, sizeof(cl_int4), (void*)&minbb, 0, NULL,  NULL);
	printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));

	err = clEnqueueReadBuffer (ocl_wrapper->commandQue, bbmax_buffer, CL_TRUE, 0, sizeof(cl_int4), (void*)&maxbb, 0, NULL,  NULL);
	printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));

	// Cleanup allocated objects
	printf("OPENCL: Releasing kernel memory\n");
    if(ckKernel)clReleaseKernel(ckKernel); 
   
    //need to release any other OpenCL memory objects here
    if(src_buffer)clReleaseMemObject(src_buffer);
    if(bbmin_buffer)clReleaseMemObject(bbmin_buffer);
    if(bbmax_buffer)clReleaseMemObject(bbmax_buffer);
	
	maxbb.s[0] += (float)window_size/2.0;
	maxbb.s[1] += (float)window_size/2.0;
	maxbb.s[2] += (float)window_size/2.0;

	minbb.s[0] -= (float)window_size/2.0;
	minbb.s[1] -= (float)window_size/2.0;
	minbb.s[2] -= (float)window_size/2.0;

	maxbb.s[0] += 2;
	maxbb.s[1] += 2;
	maxbb.s[2] += 2;

	minbb.s[0] -= 2;
	minbb.s[1] -= 2;
	minbb.s[2] -= 2;	
	
	if(maxbb.s[0]>volobj->texwidth-1) maxbb.s[0]  =volobj->texwidth-1;
	if(maxbb.s[1]>volobj->texheight-1) maxbb.s[1] =volobj->texheight-1;
	if(maxbb.s[2]>volobj->texdepth-1) maxbb.s[2] =volobj->texdepth-1;

	if(minbb.s[0]<0) minbb.s[0]=0;
	if(minbb.s[1]<0) minbb.s[1]=0;
	if(minbb.s[2]<0) minbb.s[2]=0;

	volobj->boundingboxSize.x = ((maxbb.s[0])-(minbb.s[0]-1));
	volobj->boundingboxSize.y = ((maxbb.s[1])-(minbb.s[1]-1));
	volobj->boundingboxSize.z = ((maxbb.s[2])-(minbb.s[2]-1));
	volobj->boundingboxCentre.x = 0.0; //-(((float)boundingboxSize.x)/2.0);
	volobj->boundingboxCentre.y = 0.0; //-(((float)boundingboxSize.y)/2.0);
	volobj->boundingboxCentre.z = 0.0; //-(((float)boundingboxSize.z)/2.0);
	volobj->boundingboxMin = Vector(minbb.s[0], minbb.s[1], minbb.s[2]);
	volobj->boundingboxMax = Vector(maxbb.s[0], maxbb.s[1], maxbb.s[2]);

	printf("min: %f, %f, %f\n", volobj->boundingboxMin.x, volobj->boundingboxMin.y, volobj->boundingboxMin.z);
	printf("max: %f, %f, %f\n", volobj->boundingboxMax.x, volobj->boundingboxMax.y, volobj->boundingboxMax.z);
}
Ejemplo n.º 8
0
int main(void) {
    // se crea los 2 vectores de entrada
    int i;
    const int LIST_SIZE = 1024;
    int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
    int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
    for(i = 0; i < LIST_SIZE; i++) {
        A[i] = i;
        B[i] = LIST_SIZE - i;
    }
 
    // cargamos el kernel en source_str
    FILE *fp;
    char *source_str;
    size_t source_size;
 
    fp = fopen("vector_add_kernel.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose( fp );
 
    // obtenemos las plataformas y informacion de los devices
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;   
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, 
            &device_id, &ret_num_devices);
 
    // creamos un contexto OpenCL
    cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
 
    // creamos la cola de comandos
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
 
    // creamos el buffer de memoria en el device para cada vector
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, 
            LIST_SIZE * sizeof(int), NULL, &ret);
    cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
            LIST_SIZE * sizeof(int), NULL, &ret);
    cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
            LIST_SIZE * sizeof(int), NULL, &ret);
 
    // copiamos los vectores A y B a sus respectivas memorias buffer
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
            LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, 
            LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
 
    // creamos un programa para el kernel 
    cl_program program = clCreateProgramWithSource(context, 1, 
            (const char **)&source_str, (const size_t *)&source_size, &ret);
 
    // generamos el programa
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
 
    // creamos el kernel 
    cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
 
    // establecemos los argumentos del kernel 
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
 
    // ejecutamos el kernel de la lista
    size_t global_item_size = LIST_SIZE; 
    size_t local_item_size = 64; // dividimos los work items en grupos de 64
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, 
            &global_item_size, &local_item_size, 0, NULL, NULL);
 
    // copiamos la memoria buffer C del device hacia la variable local C
    int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
    ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, 
            LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
 
    // muestra el resultado
    for(i = 0; i < LIST_SIZE; i++)
        printf("%d + %d = %d\n", A[i], B[i], C[i]);
 
    
    free(A);
    free(B);
    free(C);
    return 0;
}
int main(int argc, char **argv)
{  
   printf("start \n");
   int x, y, nsteps, i, j;
   float *u_h;
   double *f_h;  //pointers to host memory	
   int ArraySizeX = 5122;
   int ArraySizeY = 5122;
   double n, ux, uy, uxx, uxy, uyy, usq;
   FILE *fp;	
   size_t size = ArraySizeX*ArraySizeY*sizeof(float);
   size_t size1 = ArraySizeX*ArraySizeY*9*sizeof(double);
   u_h = (float *)calloc(ArraySizeX*ArraySizeY,sizeof(float));
   f_h = (double *)calloc(ArraySizeX*ArraySizeY*9,sizeof(double));
   printf("initialization \n");
    // initialization 
   for( x = 0;x<ArraySizeX;x++){
     for( y =0;y<ArraySizeY;y++){
	// define the macroscopic properties of the initial condition.
     n = 1 + Amp2*exp(-(pow(x-ArraySizeX/2,2)+pow(y-ArraySizeY/2,2))/Width);
     ux = 0;
     uy = 0;		
      // intialize f to be the local equilibrium values	
     uxx = ux*ux;
     uyy = uy*uy;
     uxy = 2*ux*uy;
     usq = uxx+ uyy;
	  
     f_h[x*ArraySizeY*9+y*9] = w1*n*(1-1.5*usq);
     f_h[x*ArraySizeY*9+y*9+1] = w2*n*(1+3*ux+4.5*uxx-1.5*usq);
     f_h[x*ArraySizeY*9+y*9+2] = w2*n*(1-3*ux+4.5*uxx-1.5*usq);
     f_h[x*ArraySizeY*9+y*9+3] = w2*n*(1+3*uy+4.5*uyy-1.5*usq);
     f_h[x*ArraySizeY*9+y*9+4]= w2*n*(1-3*uy+4.5*uyy-1.5*usq); 
     f_h[x*ArraySizeY*9+y*9+5] = w3*n*(1+3*(ux+uy)+4.5*(uxx+uxy+uyy)-1.5*usq);
     f_h[x*ArraySizeY*9+y*9+6] = w3*n*(1+3*(-ux+uy)+4.5*(uxx-uxy+uyy)-1.5*usq);
     f_h[x*ArraySizeY*9+y*9+7] = w3*n*(1+3*(-ux-uy)+4.5*(uxx+uxy+uyy)-1.5*usq);
     f_h[x*ArraySizeY*9+y*9+8] = w3*n*(1+3*(ux-uy)+4.5*(uxx-uxy+uyy)-1.5*usq);
	}
    }
    
     cl_event event;
     cl_ulong time_start, time_end, total_time; 
     // use this to check the output of each API call
     cl_int status;
     // retrieve the number of platforms
     cl_uint numPlatforms = 0;
     status = clGetPlatformIDs(0,NULL,&numPlatforms);
     chk(status, "clGetPlatformIDs0");

     // allocate enough space for each platform
     cl_platform_id *platforms = NULL;
     platforms = (cl_platform_id *) malloc(numPlatforms*sizeof(cl_platform_id));

     // Fill in the platforms
     status = clGetPlatformIDs(numPlatforms, platforms, NULL);    
     chk(status, "clGetPlatformIDs1");

     // Retrieve the number of devices
     cl_uint numDevices = 0;
     status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
     chk(status, "clGetDeviceIDs0");
  
     // Allocate enough space for each device
     cl_device_id *devices = NULL;
     devices = (cl_device_id *) malloc(numDevices*sizeof(cl_device_id));

     // Fill in the devices
     status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
     chk(status, "clGetDeviceIDs1");

     // Create a context and associate it with devices
     cl_context	 context;
     context = clCreateContext(NULL,numDevices, devices, NULL, NULL, &status);
     chk(status,"clCreateContext");

     // Create  a command queue and associate it with device
     cl_command_queue cmdQueue;
     cmdQueue = clCreateCommandQueue(context, devices[0],CL_QUEUE_PROFILING_ENABLE,&status);
     chk(status,"clCreateCommandQueue");
     
     // Create Buffer objects on devices
     cl_mem u_d, f_d;
     u_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
     chk(status,"clCreatebuffer");
     f_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size1, NULL, &status);
     chk(status, "clCreatebuffer");

     // perform computing on GPU
     // copy data from host to device
     status = clEnqueueWriteBuffer(cmdQueue, u_d, CL_FALSE, 0, size, u_h, 0, NULL, NULL);
     chk(status,"ClEnqueueWriteBuffer");
     status = clEnqueueWriteBuffer(cmdQueue, f_d, CL_FALSE, 0, size1, f_h, 0, NULL, NULL);
     chk(status, "clEnqueueWriteBuffer");
     
     // create program with source code
     cl_program program = clCreateProgramWithSource(context,1,(const char**)&programSource, NULL, &status);
     chk(status, "clCreateProgramWithSource");

     // Compile program for the device
     status = clBuildProgram(program, numDevices, devices, NULL, NULL,NULL);
      // chk(status, "ClBuildProgram");
      if(status != CL_SUCCESS){
      printf("clBuildProgram failed (%d) \n", status);
      size_t log_size;
      clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
      
      char *log = (char *) malloc(log_size);
      clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
      printf("%s\n", log);
      exit(-1);
     }
      printf("successfully built program \n");
      
     // Create lattice-boltzman kernel
     cl_kernel kernel, kernel1;
     kernel = clCreateKernel(program, "lbiteration", &status);
     kernel1 = clCreateKernel(program, "Denrho", &status);
     chk(status, "clCreateKernel");
      printf("successfully create kernel \n");
     
     // Associate the input and output buffers with the kernel
     status = clSetKernelArg(kernel,0, sizeof(cl_mem), &f_d);
     status |= clSetKernelArg(kernel1,0, sizeof(cl_mem), &u_d);
     status |= clSetKernelArg(kernel1,1, sizeof(cl_mem), &f_d);
     status |= clSetKernelArg(kernel, 1, sizeof(int), &ArraySizeX);
     status |= clSetKernelArg(kernel1,2, sizeof(int), &ArraySizeX);
     status |= clSetKernelArg(kernel, 2, sizeof(int), &ArraySizeY);
     status |= clSetKernelArg(kernel1,3, sizeof(int),&ArraySizeY);
     chk(status, "clSerKernelArg");
    
     // set the work dimensions
     size_t localworksize[2] = {BLOCK_SIZE_X,BLOCK_SIZE_Y};
     int nBLOCKSX = (ArraySizeX-2)/(BLOCK_SIZE_X -2);
     int nBLOCKSY = (ArraySizeY-2)/(BLOCK_SIZE_Y -2);
     size_t globalworksize[2] = {nBLOCKSX*BLOCK_SIZE_X,nBLOCKSY*BLOCK_SIZE_Y};

     // loop the kernel
     for( nsteps = 0; nsteps < 100; nsteps++){
     status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalworksize,localworksize,0,NULL,&event);
     clWaitForEvents(1 , &event);
     clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
           sizeof(time_start), &time_start, NULL);
     clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
           sizeof(time_end), &time_end, NULL);
     total_time += time_end - time_start;
     }
     printf("Good so far \n");
     status = clEnqueueNDRangeKernel(cmdQueue, kernel1, 2, NULL, globalworksize,localworksize,0,NULL,&event);
     chk(status, "clEnqueueNDR");
     clWaitForEvents(1 , &event);
     clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
           sizeof(time_start), &time_start, NULL);
     clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
           sizeof(time_end), &time_end, NULL);
     total_time += time_end - time_start;
     printf("running time is %0.3f \n",(total_time/1000000000.0));
     // retrieve data from device
     status = clEnqueueReadBuffer(cmdQueue, u_d, CL_TRUE, 0, size, u_h, 0, NULL, NULL);
     chk(status, "clEnqueueReadBuffer");

     // Output results
     fp = fopen("SolutionCL.txt", "wt");
     for(i= 0;i<ArraySizeX;i++){
       for(j=0;j<ArraySizeY;j++)
         fprintf(fp, " %f", u_h[i*ArraySizeY+j]);
        fprintf(fp, "\n");
     } 
     fclose(fp);

     //cleanup
     clReleaseKernel(kernel);
     clReleaseKernel(kernel1);
     clReleaseProgram(program);
     clReleaseCommandQueue(cmdQueue);
     clReleaseMemObject(u_d);
     clReleaseMemObject(f_d);
     clReleaseContext(context);

     free(u_h);
     free(f_h);
     free(platforms);
     free(devices);
     
     return 0;
}
Ejemplo n.º 10
0
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
void setup_opencl(int argc, char *argv[])
{
  cl_int err_code;
  char *source_dir = "EP";
  if (argc > 1) source_dir = argv[1];

#ifdef TIMER_DETAIL
  if (timers_enabled) {
    int i;
    for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i);
  }
#endif

  DTIMER_START(T_OPENCL_API);

  // 1. Find the default device type and get a device for the device type
  device_type = clu_GetDefaultDeviceType();
  device      = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // 2. Create a context for the specified device
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &err_code);
  clu_CheckError(err_code, "clCreateContext()");

  // 3. Create a command queue
  cmd_queue = clCreateCommandQueue(context, device, 0, &err_code);
  clu_CheckError(err_code, "clCreateCommandQueue()");

  DTIMER_STOP(T_OPENCL_API);

  // 4. Build the program
  DTIMER_START(T_BUILD);
  char *source_file;
  char build_option[30];
  sprintf(build_option, "-DM=%d -I.", M);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    source_file = "ep_cpu.cl";
    GROUP_SIZE = 16;
  } else {
    source_file = "ep_gpu.cl";
    GROUP_SIZE = 64;
  }
  program = clu_MakeProgram(context, device, source_dir, source_file,
                            build_option);
  DTIMER_STOP(T_BUILD);

  // 5. Create buffers
  DTIMER_START(T_BUFFER_CREATE);

  gq_size  = np / GROUP_SIZE * NQ * sizeof(double);
  gsx_size = np / GROUP_SIZE * sizeof(double);
  gsy_size = np / GROUP_SIZE * sizeof(double);

  pgq = clCreateBuffer(context, CL_MEM_READ_WRITE, gq_size, NULL, &err_code);
  clu_CheckError(err_code, "clCreateBuffer() for pgq");

  pgsx = clCreateBuffer(context, CL_MEM_READ_WRITE, gsx_size,NULL, &err_code);
  clu_CheckError(err_code, "clCreateBuffer() for pgsx");

  pgsy = clCreateBuffer(context, CL_MEM_READ_WRITE, gsy_size,NULL, &err_code);
  clu_CheckError(err_code, "clCreateBuffer() for pgsy");

  DTIMER_STOP(T_BUFFER_CREATE);

  // 6. Create a kernel
  DTIMER_START(T_OPENCL_API);
  kernel = clCreateKernel(program, "embar", &err_code);
  clu_CheckError(err_code, "clCreateKernel()");
  DTIMER_STOP(T_OPENCL_API);
}
Ejemplo n.º 11
0
Archivo: hw2.c Proyecto: hemantjp/HW2
int
main(int argc, char** argv)
{


   srand(1000);
   int i;

   unsigned int size_A = WA * HA;
   unsigned int mem_size_A = sizeof(float) * size_A;
   float* h_A = (float*) malloc(mem_size_A);

   unsigned int size_B = WB * HB;
   unsigned int mem_size_B = sizeof(float) * size_B;
   float* h_B = (float*) malloc(mem_size_B);


   randomInit(h_A, size_A);
   randomInit(h_B, size_B);


   unsigned int size_C = WC * HC;
   unsigned int mem_size_C = sizeof(float) * size_C;
   float* h_C = (float*) malloc(mem_size_C);

   cl_context clGPUContext;
   cl_command_queue clCommandQue;
   cl_program clProgram;
   cl_kernel clKernel;
   cl_event mm;

   size_t dataBytes;
   size_t kernelLength;
   cl_int errcode;


   cl_mem d_A;
   cl_mem d_B;
   cl_mem d_C;


   clGPUContext = clCreateContextFromType(0,
                   CL_DEVICE_TYPE_GPU,
                   NULL, NULL, &errcode);



   errcode = clGetContextInfo(clGPUContext,
              CL_CONTEXT_DEVICES, 0, NULL,
              &dataBytes);
   cl_device_id *clDevices = (cl_device_id *)
              malloc(dataBytes);
   errcode |= clGetContextInfo(clGPUContext,
              CL_CONTEXT_DEVICES, dataBytes,
              clDevices, NULL);



   clCommandQue = clCreateCommandQueue(clGPUContext,
                  clDevices[0], CL_QUEUE_PROFILING_ENABLE, &errcode);



   d_C = clCreateBuffer(clGPUContext,
          CL_MEM_READ_WRITE,
          mem_size_A, NULL, &errcode);
   d_A = clCreateBuffer(clGPUContext,
          CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
          mem_size_A, h_A, &errcode);
   d_B = clCreateBuffer(clGPUContext,
          CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
          mem_size_B, h_B, &errcode);


   FILE* fp = fopen("hw2.cl", "r");
   fseek (fp , 0 , SEEK_END);
   const size_t lSize = ftell(fp);
   rewind(fp);
   unsigned char* buffer;
   buffer = (unsigned char*) malloc (lSize);
   fread(buffer, 1, lSize, fp);
   fclose(fp);

   cl_int status;
   clProgram = clCreateProgramWithBinary(clGPUContext,
                1, (const cl_device_id *)clDevices,
                &lSize, (const unsigned char**)&buffer,
                &status, &errcode);
   errcode = clBuildProgram(clProgram, 0, NULL, NULL,
                NULL, NULL);


   errcode = clBuildProgram(clProgram, 0,
              NULL, NULL, NULL, NULL);


   clKernel = clCreateKernel(clProgram,
               "MM", &errcode);




   size_t globalWorkSize[2];

   int wA = WA;
   int wC = WC;
   errcode = clSetKernelArg(clKernel, 0,
              sizeof(cl_mem), (void *)&d_C);
   errcode |= clSetKernelArg(clKernel, 1,
              sizeof(cl_mem), (void *)&d_A);
   errcode |= clSetKernelArg(clKernel, 2,
              sizeof(cl_mem), (void *)&d_B);
   errcode |= clSetKernelArg(clKernel, 3,
              sizeof(int), (void *)&wA);
   errcode |= clSetKernelArg(clKernel, 4,
              sizeof(int), (void *)&wC);



   globalWorkSize[0] = 16;
   globalWorkSize[1] = 16;

   cl_ulong time_start, time_end, total_time = 0;

   errcode = clEnqueueNDRangeKernel(clCommandQue,
              clKernel, 2, NULL, globalWorkSize,
              NULL, 0, NULL, &mm);
   printf("Average time = %lu\n");
   clFinish(clCommandQue);

         clGetEventProfilingInfo(mm, CL_PROFILING_COMMAND_START,
              sizeof(time_start), &time_start, NULL);
        clGetEventProfilingInfo(mm, CL_PROFILING_COMMAND_END,
               sizeof(time_end), &time_end, NULL);
         total_time += time_end - time_start;


         printf("Average time = %lu\n", total_time);
   errcode = clEnqueueReadBuffer(clCommandQue,
              d_C, CL_TRUE, 0, mem_size_C,
              h_C, 0, NULL, NULL);



   free(h_A);
   free(h_B);
   free(h_C);

   clReleaseMemObject(d_A);
   clReleaseMemObject(d_C);
   clReleaseMemObject(d_B);

   free(clDevices);

   clReleaseContext(clGPUContext);
   clReleaseKernel(clKernel);
   clReleaseProgram(clProgram);
   clReleaseCommandQueue(clCommandQue);

}
Ejemplo n.º 12
0
  
  // Create the kernel
  cl_kernel bones_kernel_<algorithm_name>_0 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_0", &bones_errors); error_check(bones_errors);
  
  // Set all the arguments to the kernel function
  int bones_num_args = 0;
  <kernel_argument_list>
  // Start the kernel
  size_t bones_global_worksize[] = {<parallelism>};
  bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_0,1,NULL,bones_global_worksize,NULL,0,NULL,&bones_event); error_check(bones_errors);
  
  // Synchronize and clean-up the kernel
  clFinish(bones_queue);
  clReleaseKernel(bones_kernel_<algorithm_name>_0);
Ejemplo n.º 13
0
int crackMD5(char *hash, char *cs, int passlen) {

	clut_device dev;	// device struct
	cl_event  evt;      // performance measurement event
	cl_kernel kernel;	// execution kernel
	cl_int ret;			// error code

	double td;
	int cs_len, sync_flag;
	long chunk, disp;
	unsigned char bin_hash[HASH_SIZE];

	cs_len = strlen(cs);
	sync_flag = 0;
	strToBin(hash, bin_hash, 2*HASH_SIZE);

	disp = DISPOSITIONS(cs_len, passlen);
	chunk = DISP_PER_CORE(disp, AVAILABLE_THREADS);

	debug("HOST", "Numero di disposizione da calcolare per stream processing unit = %lu\n", chunk);

	clut_open_device(&dev, PATH_TO_KERNEL);
	clut_print_device_info(&dev);


	/* ----------------------------------------- Create execution kernel ----------------------------------------- */
	kernel = clCreateKernel(dev.program, KERNEL_NAME, &ret);
	clut_check_err(ret, "Fallita la creazione del kernel");


	/* ----------------------------------- Create memory buffers on the device ----------------------------------- */
	cl_mem dchunk = clCreateBuffer(dev.context, CL_MEM_READ_WRITE, sizeof(long), NULL, &ret);
	if (ret)
		clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione del chunk");

	cl_mem dhash = clCreateBuffer(dev.context, CL_MEM_READ_ONLY, HASH_SIZE * sizeof(unsigned char), NULL, &ret);
	if (ret)
		clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione dell'hash");

	cl_mem charset = clCreateBuffer(dev.context, CL_MEM_READ_ONLY, cs_len * sizeof(char), NULL, &ret);
	if (ret)
		clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione del charset");

	cl_mem charset_size = clCreateBuffer(dev.context, CL_MEM_READ_ONLY, sizeof(int), NULL, &ret);
	if (ret)
		clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione della taglia del charset");

	cl_mem dpasslen = clCreateBuffer(dev.context, CL_MEM_READ_ONLY, sizeof(int), NULL, &ret);
	if (ret)
		clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione della taglia del charset");

	//cl_mem sync = clCreateBuffer(dev.context, CL_MEM_READ_WRITE, AVAILABLE_CORES * sizeof(int), NULL, &ret);
	cl_mem sync = clCreateBuffer(dev.context, CL_MEM_READ_WRITE, sizeof(int), NULL, &ret);
	if (ret)
		clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione del flag di sync");

	cl_mem dcracked = clCreateBuffer(dev.context, CL_MEM_READ_WRITE, HASH_SIZE, NULL, &ret);
	if (ret)
		clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione della password in chiaro");

	cl_mem computed_hash = clCreateBuffer(dev.context, CL_MEM_READ_WRITE, HASH_SIZE * sizeof(unsigned char), NULL, &ret);
	if (ret)
		clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione della password in chiaro");


	/* ----------------------------------- Write memory buffers on the device ------------------------------------ */
	ret = clEnqueueWriteBuffer(dev.queue, dchunk, CL_TRUE, 0, sizeof(long), &chunk, 0, NULL, NULL);
	if(ret)
	   clut_panic(ret, "Fallita la scrittura del chunk sul buffer di memoria del device");

	ret = clEnqueueWriteBuffer(dev.queue, dhash, CL_TRUE, 0, HASH_SIZE * sizeof(unsigned char), (int *)bin_hash, 0, NULL, NULL);
	if(ret)
	   clut_panic(ret, "Fallita la scrittura dell'hash sul buffer di memoria del device");

	ret = clEnqueueWriteBuffer(dev.queue, charset, CL_TRUE, 0, cs_len * sizeof(char), cs, 0, NULL, NULL);
	if(ret)
	   clut_panic(ret, "Fallita la scrittura del charset sul buffer di memoria del device");

	ret = clEnqueueWriteBuffer(dev.queue, charset_size, CL_TRUE, 0, sizeof(int), &cs_len, 0, NULL, NULL);
	if(ret)
	   clut_panic(ret, "Fallita la scrittura della taglia del charset sul buffer di memoria del device");

	ret = clEnqueueWriteBuffer(dev.queue, dpasslen, CL_TRUE, 0, sizeof(int), &passlen, 0, NULL, NULL);
	if(ret)
	   clut_panic(ret, "Fallita la scrittura della taglia del charset sul buffer di memoria del device");

	//ret = clEnqueueWriteBuffer(dev.queue, sync, CL_TRUE, 0, AVAILABLE_CORES * sizeof(int), &sync_flag, 0, NULL, NULL);
	ret = clEnqueueWriteBuffer(dev.queue, sync, CL_TRUE, 0, sizeof(int), &sync_flag, 0, NULL, NULL);
		if(ret)
		   clut_panic(ret, "Fallita la scrittura della taglia del charset sul buffer di memoria del device");

	/* --------------------------------- Set the arguments to our compute kernel --------------------------------- */
	ret  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &dchunk);
	ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &dhash);
	ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &charset);
	ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &charset_size);
	ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &dpasslen);
	ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &sync);
	ret |= clSetKernelArg(kernel, 6, sizeof(cl_mem), &dcracked);
	ret |= clSetKernelArg(kernel, 7, sizeof(cl_mem), &computed_hash);
	clut_check_err(ret, "Fallito il setting degli argomenti del kernel");


	/* ---------------------------------------- Execute the OpenCL kernel ---------------------------------------- */
	size_t global_dim[] = { AVAILABLE_THREADS };
	ret = clEnqueueNDRangeKernel(dev.queue, kernel, 1, NULL, global_dim, NULL, 0, NULL, &evt);
	if(ret)
	   clut_check_err(ret, "Fallita l'esecuzione del kernel");


	/* -------------------------- Read the device memory buffer to the local variable ---------------------------- */
	//int found[80];
	int found;
	int digest[HASH_SIZE/sizeof(int)];
	char *password = (char *) malloc(passlen * sizeof(char) + 1);
	memset(password, 0, passlen * sizeof(char) + 1);
	//memset(found, 0, AVAILABLE_CORES * sizeof(int));

	//ret = clEnqueueReadBuffer(dev.queue, sync, CL_TRUE, 0, AVAILABLE_CORES * sizeof(int), found, 0, NULL, NULL);
	ret = clEnqueueReadBuffer(dev.queue, sync, CL_TRUE, 0, sizeof(int), &found, 0, NULL, NULL);
	if(ret)
	   clut_check_err(ret, "Fallimento nel leggere se la password e' stata trovata con successo");
	debug("HOST", "La password e' stata trovata dal kernel OpenCL? ");

	/*int i;
	for(i=0; i<AVAILABLE_CORES; i++){
		printf(" %d ", found[i]);
	}
	printf("\n");*/

	if(found){
	   ret = clEnqueueReadBuffer(dev.queue, dcracked, CL_TRUE, 0, HASH_SIZE, digest, 0, NULL, NULL);
	   if(ret)
	      clut_check_err(ret, "Fallimento nel leggere la password");
	   printf("Si. Password: %s\n", (char *)digest);
	}
	else
		printf("No.\n");

	/* ------------------------------------- Return kernel execution time ---------------------------------------- */
	td = clut_get_duration(evt);
	debug("HOST","Kernel duration: %f secs\n", td);

	/* ----------------------------------------------- Clean up -------------------------------------------------- */
	ret  = clReleaseKernel(kernel);
	ret |= clReleaseMemObject(dchunk);
	ret |= clReleaseMemObject(dhash);
	ret |= clReleaseMemObject(charset);
	ret |= clReleaseMemObject(charset_size);
	ret |= clReleaseMemObject(dpasslen);
	ret |= clReleaseMemObject(sync);
	ret |= clReleaseMemObject(dcracked);
	ret |= clReleaseMemObject(computed_hash);
	clut_check_err(ret, "Rilascio di risorse fallito");

	clFinish(dev.queue);

	clut_close_device(&dev);

	return 0;
}
Ejemplo n.º 14
0
int main(int argc, char** argv)
{
  cl_platform_id pf[MAX_PLATFORMS];
  cl_uint nb_platforms = 0;
  cl_int err;                            // error code returned from api calls
  cl_device_type device_type = CL_DEVICE_TYPE_ALL;

  // Filter args
  //
  argv++;
  while (argc > 1) {
    if(!strcmp(*argv, "-g") || !strcmp(*argv, "--gpu-only")) {
      if(device_type != CL_DEVICE_TYPE_ALL)
	error("--gpu-only and --cpu-only can not be specified at the same time\n");
      device_type = CL_DEVICE_TYPE_GPU;
    } else if(!strcmp(*argv, "-c") || !strcmp(*argv, "--cpu-only")) {
      if(device_type != CL_DEVICE_TYPE_ALL)
	error("--gpu-only and --cpu-only can not be specified at the same time\n");
      device_type = CL_DEVICE_TYPE_CPU;
    } else if(!strcmp(*argv, "-s") || !strcmp(*argv, "--size")) {
      unsigned i;
      int r;
      char c;

      r = sscanf(argv[1], "%u%[mMkK]", &SIZE, &c);

      if (r == 2) {
	if (c == 'k' || c == 'K')
	  SIZE *= 1024;
	else if (c == 'm' || c == 'M')
	  SIZE *= 1024 * 1024;
      }

      argc--; argv++;
    } else
      break;
    argc--; argv++;
  }

  if(argc > 1)
    TILE = atoi(*argv);

  // Get list of OpenCL platforms detected
  //
  err = clGetPlatformIDs(3, pf, &nb_platforms);
  check(err, "Failed to get platform IDs");

  printf("%d OpenCL platforms detected\n", nb_platforms);

  // For each platform do
  //
  for (cl_int p = 0; p < nb_platforms; p++) {
    cl_uint num;
    int platform_valid = 1;
    char name[1024], vendor[1024];
    cl_device_id devices[MAX_DEVICES];
    cl_uint nb_devices = 0;
    cl_context context;                 // compute context
    cl_program program;                 // compute program
    cl_kernel kernel;

    err = clGetPlatformInfo(pf[p], CL_PLATFORM_NAME, 1024, name, NULL);
    check(err, "Failed to get Platform Info");

    err = clGetPlatformInfo(pf[p], CL_PLATFORM_VENDOR, 1024, vendor, NULL);
    check(err, "Failed to get Platform Info");

    printf("Platform %d: %s - %s\n", p, name, vendor);

    // Get list of devices
    //
    err = clGetDeviceIDs(pf[p], device_type, MAX_DEVICES, devices, &nb_devices);
    printf("nb devices = %d\n", nb_devices);

    if(nb_devices == 0)
      continue;

    // Create compute context with "device_type" devices
    //
    context = clCreateContext (0, nb_devices, devices, NULL, NULL, &err);
    check(err, "Failed to create compute context");

    // Load program source into memory
    //
    const char	*opencl_prog;
    opencl_prog = file_load(KERNEL_FILE);

    // Attach program source to context
    //
    program = clCreateProgramWithSource(context, 1, &opencl_prog, NULL, &err);
    check(err, "Failed to create program");

    // Compile program
    //
    {
      char flags[1024];

      sprintf (flags,
	       "-cl-mad-enable -cl-fast-relaxed-math -DSIZE=%d -DTILE=%d -DTYPE=%s",
	       SIZE, TILE, "float");

      err = clBuildProgram (program, 0, NULL, flags, NULL, NULL);
      if(err != CL_SUCCESS) {
	size_t len;

	// Display compiler log
	//
	clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
	{
	  char buffer[len+1];

	  fprintf(stderr, "--- Compiler log ---\n");
	  clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
	  fprintf(stderr, "%s\n", buffer);
	  fprintf(stderr, "--------------------\n");
	}
	if(err != CL_SUCCESS)
	  error("Failed to build program!\n");
      }
    }

    // Create the compute kernel in the program we wish to run
    //
    kernel = clCreateKernel(program, KERNEL_NAME, &err);
    check(err, "Failed to create compute kernel");

    // Allocate and initialize input data
    //
    alloc_buffers_and_user_data(context);

    // Iterate over devices
    //
    for(cl_int dev = 0; dev < nb_devices; dev++) {
      cl_command_queue queue;

      char name[1024];
      cl_device_type dtype;

      err = clGetDeviceInfo(devices[dev], CL_DEVICE_NAME, 1024, name, NULL);
      check(err, "Cannot get type of device");
      err = clGetDeviceInfo(devices[dev], CL_DEVICE_TYPE, sizeof(cl_device_type), &dtype, NULL);
      check(err, "Cannot get type of device");

      printf("\tDevice %d : %s [%s]\n", dev, (dtype == CL_DEVICE_TYPE_GPU) ? "GPU" : "CPU", name);

      // Create a command queue
      //
      queue = clCreateCommandQueue(context, devices[dev], CL_QUEUE_PROFILING_ENABLE, &err);
      check(err,"Failed to create command queue");

      // Write our data set into device buffer
      //
      send_input(queue);

      // Execute kernel
      //
      {
	cl_event prof_event;
	cl_ulong start, end;
	struct timeval t1,t2;
	double timeInMicroseconds;
	size_t global[2] = { SIZE, SIZE };  // global domain size for our calculation
	size_t local[2]  = { TILE, TILE };   // local domain size for our calculation

	printf("\t%dx%d Threads in workgroups of %dx%d\n", global[0], global[1], local[0], local[1]);

	// Set kernel arguments
	//
	err = 0;
	err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_buffer);
	err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_buffer);
	check(err, "Failed to set kernel arguments");

	gettimeofday (&t1, NULL);

	for (unsigned iter = 0; iter < ITERATIONS; iter++) {
	  err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local,
				       0, NULL, &prof_event);
	  check(err, "Failed to execute kernel");
	}

	// Wait for the command commands to get serviced before reading back results
	//
	clFinish(queue);

	gettimeofday (&t2,NULL);

	// Check performance
	//
	timeInMicroseconds = (double)TIME_DIFF(t1, t2) / ITERATIONS;

	printf("\tComputation performed in %lf µs over device #%d\n",
	       timeInMicroseconds,
	       dev);

	clReleaseEvent(prof_event);
      }

      // Read back the results from the device to verify the output
      //
      retrieve_output(queue);

      // Validate computation
      //
      check_output_data();

      clReleaseCommandQueue(queue);
    }

    // Cleanup
    //
    free_buffers_and_user_data();

    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseContext(context);
  }


  return 0;
}
Ejemplo n.º 15
0
int main(int argc, const char * argv[])
{
	
	 //First we set the variables for measuring performance.
	 
	 struct timeval tim1, tim2;	                
	 uint64_t time;
	 
	 //Calling the function "gettimeofday" to measure the time before the program executes.
	 gettimeofday(&tim1, NULL);
		                

	/*
	 * These are the declarations of the OpenCL structures are described below:
	 * cl_platform-id - Stores the types of platforms installed on the host.
	 * cl_device_id - Stores the type of the device (GPU, CPU, etc.)
	 * cl_context - Stores the context in which a command queue can be created.
	 * cl_command_queue - Stores the command queue which governs how the GPU will
	 *                    will execute the kernel.
	 * cl_program - Stores the kernel code (which can be comprised of several kernels). Is compiled later its
	 * 				functions get packaged into kernels.
	 * cl_kernel - The OpenCL data structure that represents kernels.
	 */

    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;
	cl_program program;
    cl_kernel kernel;
	
	//A cl_int used to store error flags that are returned if OpenCL function does not execute properly. 
	cl_int err;

    /*
	 * A file object and buffers used to store the input kernel code as well as allocate the memory for the kernel code 
	 * and the output log from the compiler during the compilation of the kernel code.
	 */
    
	FILE *program_handle;
    char *program_buffer, *program_log;
	size_t program_size, log_size;

	//The number of work items in each dimension of the data.
    size_t work_units_per_kernel;
    
	//This value determines the size of the nxn (square) array.
    int n = 1000;
    
	//Allocating the memory for the nxn arrays of floats.
    float **h_xx = (float**)malloc(sizeof(float*)*n);
    float **h_yy = (float**)malloc(sizeof(float*)*n);
    float **h_zz = (float**)malloc(sizeof(float*)*n);
    
    for(int i = 0; i<n; i++){
        h_xx[i] = (float*)malloc(sizeof(float)*n);
        h_yy[i] = (float*)malloc(sizeof(float)*n);
        h_zz[i] = (float*)malloc(sizeof(float)*n);
        
		//Initializing the arrays.
        for(int j = 0; j<n; j++){
            
            h_xx[i][j] = i+j;
            h_yy[i][j] = i+j;
            
        }
       
    }
    	
	/*
	 * These three variables of the type cl_mem (memory object) are used as buffers and hold the data which will
	 * be sent to the device and then once calculated sent back to the host.
	 */

    cl_mem d_xx;
    cl_mem d_yy;
    cl_mem d_zz;

	
    
    // Obtains the Platform information installed on the host and stores into the memory location of the variable "platform"
    err = clGetPlatformIDs(1, &platform, NULL);
    if(err != CL_SUCCESS){
        
        std::cout << "Error: Failed to locate Platform." << std::endl;
        exit(1);
    }
    
	// Obtains the device information (looking for specifically GPU devices) and stores it into the memory location of the variable "device"
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if(err != CL_SUCCESS){
        printf("Error: Failed to locate Device.");
        exit(1);
    }
    
    // Creates a context on the device and stores it into the "context" variable.  
	context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if(err != CL_SUCCESS){
        std::cout << "Error: Could not create context." << std::endl;
        exit(1);
    }
    
    /*
	 * The following code stores the file "arraySum.cl" into the FILE object "program_handle". It then determines the size
	 * of the file and reads the content into the variable "program_buffer".
	 */

	program_handle = fopen("flopstestloop.cl", "r");
    if(!program_handle){
        std::cout << "Error: Failed to Load Kernel" << std::endl;
        exit(1);
    }
    fseek(program_handle, 0, SEEK_END);
    program_size = ftell(program_handle);
    rewind(program_handle);
    program_buffer = (char*)malloc(program_size + 1);
    program_buffer[program_size] = '\0';
    fread(program_buffer, sizeof(char), program_size, program_handle);
    fclose(program_handle);
    
    // Stores the kernel code into a program and stores it into the "program" variable.
	program = clCreateProgramWithSource(context, 1, (const char **)&program_buffer, (const size_t *)&program_size, &err);
    if(err != CL_SUCCESS){
        std::cout << "Error: Could not create the program" << std::endl;
        exit(1);
    }
    
    free(program_buffer);

    //Compiles the program and stores the compiled code into the argument "program"
	err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    if(err != CL_SUCCESS){
        std::cout << "Error: Could not compile the program" << std::endl;

		/*
		 * The following code first allocates the correct amount of memory in order to store the output of the compilers
		 * build log and then it stores this log into the buffer "program_log". Finally it prints this buffer to the
		 * screen.
		 */

        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        program_log = (char*)malloc(log_size+1);
        program_log[log_size] = '\0';
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL);
        printf("%s\n", program_log);
        free(program_log);
        exit(1);
    }
    
	//From the compiled code in the program creates a kernel called "arraysum"
    kernel = clCreateKernel(program, "arraysum", &err);
    if(err != CL_SUCCESS){
        std::cout << "Error: Could not create the kernel" << std::endl;
        exit(1);
    }
    
	//Creates a command queue and stores it into the variable "queue".
    queue = clCreateCommandQueue(context, device, 0, &err);
    if(err != CL_SUCCESS){
        std::cout << "Error: Could not create the queue" << std::endl;
        exit(1);
    }
    
    //Creating the Device memory buffers. These will be used to transfer data from the host to the device and vice versa.
    d_xx = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*n, NULL, &err);
    if(err != CL_SUCCESS){
        std::cout << "Error: Could not create the buffer d_xx" << std::endl;
        exit(1);
    }
    
    d_yy = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*n, NULL, &err);
    if(err != CL_SUCCESS){
        std::cout << "Error: Could not create the buffer d_yy" << std::endl;
        exit(1);
    }
    
    d_zz = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*n, NULL, &err);
    if(err != CL_SUCCESS){
        std::cout << "Error: Could not create the buffer d_zz" << std::endl;
        exit(1);
    }
    
   /*
	* This for loop loops over the each row in the matrices x and y first writes the row to the device memory where 
	* the kernel arguments are then set and then then passed to the compiled kernel code already located on the device. 
	* Once executed, the results are then stored in the d_zz buffer and are read back to the host.
	*/

    for(int i = 0; i<n; i++)
    {
        //Writing the data from the host to the device
        err = clEnqueueWriteBuffer(queue, d_xx, CL_TRUE, 0, sizeof(float)*n, h_xx[i], 0, NULL, NULL);
        if(err != CL_SUCCESS){
            std::cout << "Error: Could not write to buffer d_xx" << std::endl;
            exit(1);
        }
        
        err = clEnqueueWriteBuffer(queue, d_yy, CL_TRUE, 0, sizeof(float)*n, h_yy[i], 0, NULL, NULL);
        if(err != CL_SUCCESS){
            std::cout << "Error: Could not write to buffer d_yy" << std::endl;
            exit(1);
        }
    
        //Setting the Kernel Arguments
        err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_xx);
        if(err != CL_SUCCESS){
            std::cout << "Error: Could not set kernel argument h_xx." << std::endl;
            exit(1);
        }
    
        err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_yy);
        if(err != CL_SUCCESS){
            std::cout << "Error: Could not set kernel argument h_yy." << std::endl;
            exit(1);
        }
    
        err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_zz);
        if(err != CL_SUCCESS){
            std::cout << "Error: Could not set kernel argument h_zz." << std::endl;
        }
    
        work_units_per_kernel = n;
    
        //Executing the Kernel
        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units_per_kernel, NULL, 0, NULL, NULL);
        if(err != CL_SUCCESS){
            std::cout << "Error: Could not execute kernel." << std::endl;
            exit(1);
        }
    
        //Reading the Data from the Kernel
        err = clEnqueueReadBuffer(queue, d_zz, CL_TRUE, 0, n*(sizeof(float)), h_zz[i], 0, NULL, NULL);
        if(err != CL_SUCCESS){
            std::cout << "Error: Could not read data from kernel." << std::endl;
            exit(1);
        }
        
    }

    //Measuring the time after the OpenCL code has executed and has been copied back to the host.
	gettimeofday(&tim2, NULL);
	//Finding the difference between the two measured times.
	time = tim2.tv_sec - tim1.tv_sec;
	//Displaying the elapsed time in seconds.
	std::cout << time + (tim2.tv_usec - tim1.tv_usec)/1000000.00 << std::endl;

    //The previously allocated memory is freed.
    clReleaseMemObject(d_xx);
    clReleaseMemObject(d_yy);
    clReleaseMemObject(d_zz);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseProgram(program);
    clReleaseContext(context);
    
    return 0;
}
Ejemplo n.º 16
0
int main()
{
	// Initiating opencl
	cl_device_id device_id;
	cl_int err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
        std::cout<<"Error in device."<<std::endl;
        return EXIT_FAILURE;
    }
    cl_context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
  	if (!context)
    {
        std::cout<<"Error in context."<<std::endl;
        return EXIT_FAILURE;
    }
    cl_command_queue commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        std::cout<<"Error in command queue."<<std::endl;
        return EXIT_FAILURE;
    }
	std::ifstream in("transpMatrix.cl");
	std::string contents((std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>());
    const char* kernelSource = contents.c_str();
    cl_program program = clCreateProgramWithSource(context, 1, &kernelSource, NULL, &err);
    if (!program)
    {
        std::cout<<"Error in program."<<std::endl;
        return EXIT_FAILURE;
    }
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];
        std::cout<<"Error in compiling the opencl program."<<std::endl;
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        std::cout<<buffer<<std::endl;
        return EXIT_FAILURE;
    }
    cl_kernel kernel = clCreateKernel(program, "simplecl", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        std::cout<<"Error in kernel "<<err<<std::endl;
        return EXIT_FAILURE;
    }

    // Data to compute
    float* data =  new float[count*count];

    for(int i = 0; i < count; ++i)
    {
        for(int j = 0; j < count; ++j)
        {
            data[i*count+j] = rand()%10;
            std::cout<<data[i*count+j]<<" ";
        }
        std::cout<<std::endl;
    }
    std::cout<<std::endl;
    // Creating communication buffers
    cl_mem input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * count*count, NULL, NULL);
    cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count*count, NULL, NULL);

	if (!input || !output)
    {
        std::cout<<"Error in allocation."<<std::endl;
        return EXIT_FAILURE;
    }  

    // Copy data to input buffer
    err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count*count, data, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        std::cout<<"Error in copy."<<std::endl;
        return EXIT_FAILURE;
    }

 	err = 0;
    err  = clSetKernelArg(kernel, 0, sizeof(int), &count);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &input);
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);
    if (err != CL_SUCCESS)
    {
        std::cout<<"Error in argument."<<std::endl;
        return EXIT_FAILURE;
    }
    size_t local[] = {1,1};
    size_t global[] = {10,10};
    // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    // if (err != CL_SUCCESS)
    // {
    //     std::cout<<"Error in getting loal."<<std::endl;
    //     return EXIT_FAILURE;
    // }
    err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, global, local, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        std::cout<<"Error in pushing to queue "<<err<<std::endl;
        return EXIT_FAILURE;
    }
    clFinish(commands);
    // Is done now

    err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count*count, data, 0, NULL, NULL );  
    if (err != CL_SUCCESS)
    {
       std::cout<<"Error in reading back."<<std::endl;
       return EXIT_FAILURE;
    }

    for(int i = 0; i < count; ++i)
    {
        for(int j = 0; j < count; ++j)
        {
            std::cout<<data[i*count+j]<<" ";
        }
        std::cout<<std::endl;
    }
    std::cout<<std::endl;
	return 0;
}
Ejemplo n.º 17
0
void    vectorVectorAdditionGMDP (cl_uint numDevices,cl_device_id *devices, cl_program program,cl_context context,double * h_VectA,double *h_VectB, double *h_Output,int vectSize)
{
	cl_event                gpuExec[1];
        cl_int err;	
	cl_command_queue cmdQueue;   //holds command queue object
	cl_kernel kernel;		//holds kernel object
	cl_mem d_VectA,d_VectB,d_Output;		//holds device input output buffer
	 cl_event                events;        // events
	size_t globalWorkSize[2]={vectSize,vectSize}; //holds global group size

	 double                  gflops=0.0;             //holds total achieved gflops
        cl_ulong startTime, endTime,elapsedTime;        //holds time
        float executionTimeInSeconds;                    //holds total execution time

	  
    	/*create command queue*/
        cmdQueue = clCreateCommandQueue(context, devices[0],  CL_QUEUE_PROFILING_ENABLE, &err);
        if( err != CL_SUCCESS ||  cmdQueue == 0)
         {
               printf("\n\t Failed to create command queue  \n" );
               exit (-1);
         }
        
	/*create kernel object*/
        kernel = clCreateKernel(program,"VectVectAddDPKernel",&err);
        OPENCL_CHECK_STATUS("error while creating kernel",err);
        
	/*create buffer*/
       d_VectA=clCreateBuffer(context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(double)*vectSize,h_VectA,&err);
        OPENCL_CHECK_STATUS("error while creating buffer for input",err);
        
       d_VectB=clCreateBuffer(context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(double)*vectSize,h_VectB,&err);
        OPENCL_CHECK_STATUS("error while creating buffer for input",err);
        
	d_Output=clCreateBuffer(context,CL_MEM_WRITE_ONLY,sizeof(double)*vectSize,NULL,&err);
        OPENCL_CHECK_STATUS("error while creating buffer for d_Output",err);
        
	/*set kernel arg*/
        err=clSetKernelArg(kernel,0,sizeof(cl_mem),&d_VectA);
        OPENCL_CHECK_STATUS("error while setting arg 0",err);
        
	err=clSetKernelArg(kernel,1,sizeof(cl_mem),&d_VectB);
        OPENCL_CHECK_STATUS("error while setting arg 1",err);
        
	err=clSetKernelArg(kernel,2,sizeof(cl_mem),&d_Output);
        OPENCL_CHECK_STATUS("error while setting arg 2",err);
        
	/*load kernel*/
        err = clEnqueueNDRangeKernel(cmdQueue,kernel,2,NULL,globalWorkSize,NULL,0,NULL,&gpuExec[0]);
        OPENCL_CHECK_STATUS("error while creating ND range",err);
        
	//completion of all commands to command queue
        err = clFinish(cmdQueue);
        OPENCL_CHECK_STATUS("clFinish",err);

	/* calculate start time and end time*/
        clGetEventProfilingInfo(gpuExec[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL);
        clGetEventProfilingInfo(gpuExec[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);


	/* total elapsed time*/
        elapsedTime = endTime-startTime;

	/*total execution time*/
        executionTimeInSeconds = (float)(1.0e-9 * elapsedTime);


	
	/* reading buffer object*/
        err = clEnqueueReadBuffer(cmdQueue,d_Output,CL_TRUE,0,sizeof(cl_double)*vectSize,h_Output,0,0,&events);
        OPENCL_CHECK_STATUS("error while reading buffer",err);
	
	
	/* calculate total gflops*/
         gflops= (1.0e-9 * (( vectSize) / executionTimeInSeconds));


        // Print the gflops on the screen
         print_on_screen("Vector Vector Addition double precision using global memory",executionTimeInSeconds,vectSize,gflops,1);


	//check results 
	vectVectAddCheckResultGMDP(h_VectA,h_VectB,h_Output,vectSize);

	//release opencl objects
	clReleaseMemObject(d_VectA);
	clReleaseMemObject(d_VectB);
	clReleaseMemObject(d_Output);
	clReleaseProgram(program);
	clReleaseKernel(kernel);
	clReleaseCommandQueue(cmdQueue);
	clReleaseContext(context);
}
Ejemplo n.º 18
0
int main(int argc, char **argv)
{
	cl_platform_id platforms[100];
	cl_uint platforms_n = 0;
	CL_CHECK(clGetPlatformIDs(100, platforms, &platforms_n));

	printf("=== %d OpenCL platform(s) found: ===\n", platforms_n);
	for (int i=0; i<platforms_n; i++)
	{
		char buffer[10240];
		printf("  -- %d --\n", i);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL));
		printf("  PROFILE = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL));
		printf("  VERSION = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL));
		printf("  NAME = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL));
		printf("  VENDOR = %s\n", buffer);
		CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL));
		printf("  EXTENSIONS = %s\n", buffer);
	}

	cl_device_id devices[100];
	cl_uint devices_n = 0;
	// CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 100, devices, &devices_n));
	CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 100, devices, &devices_n));

	printf("=== %d OpenCL device(s) found on platform:\n", platforms_n);
	for (int i=0; i<devices_n; i++)
	{
		char buffer[10240];
		cl_uint buf_uint;
		cl_ulong buf_ulong;
		printf("  -- %d --\n", i);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_NAME = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_VENDOR = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL));
		printf("  DEVICE_VERSION = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL));
		printf("  DRIVER_VERSION = %s\n", buffer);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL));
		printf("  DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL));
		printf("  DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint);
		CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL));
		printf("  DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong);
	}

	if (devices_n == 0)
		return 1;

	cl_context context;
	context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices, &pfn_notify, NULL, &_err));

	const char *program_source[] = {
		"__kernel void simple_demo(__global int *src, __global int *dst, int factor)\n",
		"{\n",
		"	int i = get_global_id(0);\n",
		"	dst[i] = src[i] * factor;\n",
		"}\n"
	};

	cl_program program;
	program = CL_CHECK_ERR(clCreateProgramWithSource(context, sizeof(program_source)/sizeof(*program_source), program_source, NULL, &_err));
	if (clBuildProgram(program, 1, devices, "", NULL, NULL) != CL_SUCCESS) {
		char buffer[10240];
		clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
		fprintf(stderr, "CL Compilation failed:\n%s", buffer);
		abort();
	}
	CL_CHECK(clUnloadCompiler());

	cl_mem input_buffer;
	input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*NUM_DATA, NULL, &_err));

	cl_mem output_buffer;
	output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int)*NUM_DATA, NULL, &_err));

	int factor = 2;

	cl_kernel kernel;
	kernel = CL_CHECK_ERR(clCreateKernel(program, "simple_demo", &_err));
	CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer));
	CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer));
	CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor));

	cl_command_queue queue;
	queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[0], 0, &_err));

	for (int i=0; i<NUM_DATA; i++) {
		CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, i*sizeof(int), sizeof(int), &i, 0, NULL, NULL));
	}

	cl_event kernel_completion;
	size_t global_work_size[1] = { NUM_DATA };
	CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &kernel_completion));
	CL_CHECK(clWaitForEvents(1, &kernel_completion));
	CL_CHECK(clReleaseEvent(kernel_completion));

	printf("Result:");
	for (int i=0; i<NUM_DATA; i++) {
		int data;
		CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(int), sizeof(int), &data, 0, NULL, NULL));
		printf(" %d", data);
	}
	printf("\n");

	CL_CHECK(clReleaseMemObject(input_buffer));
	CL_CHECK(clReleaseMemObject(output_buffer));

	CL_CHECK(clReleaseKernel(kernel));
	CL_CHECK(clReleaseProgram(program));
	CL_CHECK(clReleaseContext(context));

	return 0;
}
Ejemplo n.º 19
0
void OpenCLExecuter::ocl_filter(int src_chan)
{
	cl_int err;							// debugging variables
	size_t szParmDataBytes;				// Byte size of context information        
	cl_mem src_buffer;					// OpenCL device source buffer
	cl_mem dst_buffer;					// OpenCL device source buffer
	size_t szGlobalWorkSize;			// 1D var for Total # of work items
	size_t szLocalWorkSize;				// 1D var for # of work items in the work group
	cl_kernel ckKernel;					// OpenCL kernel

	int iNumElements = volobj->texwidth*volobj->texheight*volobj->texdepth; // Length of float arrays to process

	//temp array
	unsigned char* data = new unsigned char[iNumElements];

	// set Local work size dimensions
	//szLocalWorkSize = 256;
	// set Global work size dimensions
	//szGlobalWorkSize = roundup((int) iNumElements/szLocalWorkSize, 0)*szLocalWorkSize;  
	//szGlobalWorkSize = iNumElements;
//	printf("OPENCL: number of elements: %d\n", (int)iNumElements);
//	printf("OPENCL: local worksize: %d\n", (int)szLocalWorkSize);
//	printf("OPENCL: global worksize: %d\n", (int)szGlobalWorkSize);
//	printf("OPENCL: work groups: %d\n", (int)((float)szGlobalWorkSize/(float)szLocalWorkSize));

	size_t global_threads[3] ={volobj->texwidth, volobj->texheight, volobj->texdepth};

	// allocate the source buffer memory object
	src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY,  sizeof(unsigned char) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
		
	// allocate the destination buffer memory object
	dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_WRITE_ONLY,  sizeof(unsigned char) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));

    // Create the kernel
	ckKernel = clCreateKernel (cpProgram, "myFunc", &err);
	printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
  
	// Set the Argument values
	err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 2, sizeof(int), (void*)&volobj->texwidth);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 3, sizeof(int), (void*)&volobj->texheight);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&volobj->texdepth);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));

	size_t local;
	err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL);
	printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err));
	printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local);

	// Copy input data to GPU, compute, copy results back
	// Runs asynchronous to host, up until blocking read at end

	//Prepare data to upload
	for(int j=0; j<iNumElements; j++)
		data[j] = volobj->texture3d[3*j+src_chan];

	// Write data from host to GPU
	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, data, 0, NULL, NULL);
	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));
	
	// Write data from host to GPU
//	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, volobj->texture3d, 0, NULL, NULL);
//	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));

	// Launch kernel 
	err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL);
	printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));

	// Blocking read of results from GPU to Host
//	err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, volobj->texture3d, 0, NULL, NULL);
//	printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));
		// Blocking read of results from GPU to Host

	// Blocking read of results from GPU to Host
	err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, data, 0, NULL,  NULL);
	printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));

	//read data back
	for(int i=0; i<iNumElements; i++)
	{
			if(volobj->is_greyscale)
				volobj->texture3d[3*i+0] = volobj->texture3d[3*i+1] = volobj->texture3d[3*i+2] = data[i];
			else
				volobj->texture3d[3*i+src_chan] = data[i];
	}

	// Cleanup allocated objects
	printf("OPENCL: Releasing kernel memory\n");
    if(ckKernel)clReleaseKernel(ckKernel); 
   
    //need to release any other OpenCL memory objects here
    if(dst_buffer)clReleaseMemObject(dst_buffer);
    if(src_buffer)clReleaseMemObject(src_buffer);

	delete[] data;
}
Ejemplo n.º 20
0
int Parallel::setup() {
    /**
     * OpenCL initialization.
     */
    cl_int status = Simulator::setup();
    CheckStatus(status, "Simulator::setup() failed.");
    
    cl_uint numPlatforms;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);
    CheckStatus(status, "clGetPlatformIDs, fetching number");
    DEBUG_STDOUT("Number of platforms: " << numPlatforms);
    
    cl_platform_id platform = NULL;
    if (numPlatforms > 0) {
        std::unique_ptr<cl_platform_id[]> platforms (new cl_platform_id[numPlatforms]);
        status = clGetPlatformIDs(numPlatforms, platforms.get(), NULL);
        CheckStatus(status, "clGetPlatformIDs, fetching platforms");
        
        for (unsigned i = 0; i < numPlatforms; ++i) {
            char pbuf[100];
            status = clGetPlatformInfo(platforms[i],
                                       CL_PLATFORM_VENDOR,
                                       sizeof(pbuf),
                                       pbuf,
                                       NULL);
            CheckStatus(status, "clGetPlatformInfo");
        }
        
        // Just grab the first platform.
        platform = platforms[0];
    }
    CheckConditional(platform != NULL, "platform == NULL");
    
    cl_uint numDevices;
    status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
    CheckStatus(status, "clGetDeviceIDs: fetching number");
    DEBUG_STDOUT("Number of devices: " << numDevices);
    
    cl_device_id *devices = new cl_device_id[numDevices];
    status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
    CheckStatus(status, "clGetDeviceIDs: fetching devices");
    
    int deviceIndex = 0;
    for (unsigned i = 0; i < numDevices; ++i) {
        char pbuf[100];
        status = clGetDeviceInfo(devices[i],
                                 CL_DEVICE_NAME,
                                 sizeof(pbuf),
                                 pbuf,
                                 NULL);
        if (!strncmp(pbuf, "ATI", 3)) {
            deviceIndex = i;
        }
    }
    
    /* Create the context. */
    context = clCreateContext(0, numDevices, devices, NULL, NULL, &status);
    CheckConditional(context != NULL, "clCreateContextFromType");
    
    /* Create command queue */
    cl_command_queue_properties prop = CL_QUEUE_PROFILING_ENABLE;
    commandQueue = clCreateCommandQueue(context, devices[deviceIndex], prop, &status);
    CheckStatus(status, "clCreateCommandQueue");
    
    /* Create a CL program using the kernel source */
    SDKFile kernelFile;
    std::string kernelPath = getenv("HOME") + std::string("/md-simulator/src/TestKernel.cl");
    if(!kernelFile.open(kernelPath.c_str())) {
        DEBUG_STDERR("Failed to load kernel file : " << kernelPath);
        return MD_FAILURE;
    }
    
    const char *source = kernelFile.source().c_str();
    size_t sourceSize[] = {strlen(source)};
    program = clCreateProgramWithSource(context,
                                        1,
                                        &source,
                                        sourceSize,
                                        &status);
    CheckStatus(status, "clCreateProgramWithSource");
    
    /* Create a cl program executable for all the devices specified */
    status = clBuildProgram(program,
                            numDevices,
                            devices,
                            NULL,
                            NULL,
                            NULL);
    
    if (status != CL_SUCCESS) {
        if (status == CL_BUILD_PROGRAM_FAILURE) {
            cl_int logStatus;
            std::unique_ptr<char[]> buildLog (nullptr);
            //char *buildLog = NULL;
            size_t buildLogSize = 0;
            logStatus = clGetProgramBuildInfo(program,
                                              devices[deviceIndex],
                                              CL_PROGRAM_BUILD_LOG,
                                              buildLogSize,
                                              buildLog.get(),
                                              &buildLogSize);
            CheckStatus(logStatus, "clGetProgramBuildInfo");
            
            buildLog = std::unique_ptr<char[]>(new char[buildLogSize]);
            if(!buildLog) {
                return MD_FAILURE;
            }
            std::fill_n(buildLog.get(), buildLogSize, 0);
            
            logStatus = clGetProgramBuildInfo(program,
                                              devices[deviceIndex],
                                              CL_PROGRAM_BUILD_LOG,
                                              buildLogSize,
                                              buildLog.get(),
                                              NULL);
            CheckStatus(logStatus, "clGetProgramBuildInfo (2)");
            
            DEBUG_STDERR("\n\t\t\tBUILD LOG\n");
            DEBUG_STDERR("************************************************\n");
            DEBUG_STDERR(buildLog.get());
            DEBUG_STDERR("************************************************\n");
        }
    }
    CheckStatus(status, "clBuildProgram");
    
    /* Get a kernel object handle for a kernel with the given name */
    kernel = clCreateKernel(program, "computeAccelerations", &status);
    CheckStatus(status, "clCreateKernel");
    
    /* Check group size against group size returned by kernel */
    status = clGetKernelWorkGroupInfo(kernel,
                                      devices[deviceIndex],
                                      CL_KERNEL_WORK_GROUP_SIZE,
                                      sizeof(size_t),
                                      &kernelWorkGroupSize,
                                      0);
    CheckStatus(status, "clGetKernelWorkGroupInfo");
    DEBUG_STDOUT("kernelWorkGroupSize: " << kernelWorkGroupSize);
    
    /**
     * Initialize some simulator data structures.
     */
    global = particleCount * particleCount;
    local = particleCount;
    
    if (global * local > kernelWorkGroupSize) {
        DEBUG_STDERR("WARNING - global * local > kernelWorkGroupSize; global: " << global << ", local: " << local << ", kernelWorkGroupSize: " << kernelWorkGroupSize);
        return MD_FAILURE;
    }
    
    // Data holds the molecule positions.
    data = std::unique_ptr<float[]> (new float[particleCount * 3]);
    
    // Constants holds simulator constants.
    constants = std::unique_ptr<float[]> (new float[NUM_CONSTANTS]);
    
    // Copy constants to buffer;
    constants[0] = epsilon;
    constants[1] = sigma;
    constants[2] = negForceCutoffMinusHalf;
    constants[3] = forceCutoffMinusHalf;
    constants[4] = wallStiffness;
    
    // Results holds pairwise forces.
    results = std::unique_ptr<float[]> (new float[particleCount * particleCount * 3]);
    
    return MD_SUCCESS;
}
Ejemplo n.º 21
0
void OpenCLExecuter::ocl_filterPeronaMalik(float lambda, float dT, unsigned char* src_array, unsigned char* dst_array, int w, int h, int d)
{
	float lambda2 = lambda*lambda;

	cl_int err;							// debugging variables
	size_t szParmDataBytes;				// Byte size of context information        
	cl_mem src_buffer;					// OpenCL device source buffer
	cl_mem dst_buffer;					// OpenCL device source buffer
	size_t szGlobalWorkSize;			// 1D var for Total # of work items
	size_t szLocalWorkSize;				// 1D var for # of work items in the work group
	cl_kernel ckKernel;					// OpenCL kernel

	int iNumElements = w*h*d; // Length of float arrays to process

	size_t global_threads[3] ={w,h,d};

	// allocate the source buffer memory object
	src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY,  sizeof(unsigned char) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
		
	// allocate the destination buffer memory object
	dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_WRITE_ONLY,  sizeof(unsigned char) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));

    // Create the kernel
	ckKernel = clCreateKernel (cpProgram, "myFunc", &err);
	printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
  
	// Set the Argument values
	err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 2, sizeof(float), (void*)&lambda2);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 3, sizeof(float), (void*)&dT);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&volobj->texwidth);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 5, sizeof(int), (void*)&volobj->texheight);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 6, sizeof(int), (void*)&volobj->texdepth);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));

	size_t local;
	err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL);
	printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err));
	printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local);

	// Copy input data to GPU, compute, copy results back
	// Runs asynchronous to host, up until blocking read at end

	// Write data from host to GPU
	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, src_array, 0, NULL, NULL);
	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));

	// Launch kernel 
	err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL);
	printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));

	// Blocking read of results from GPU to Host
	err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, dst_array, 0, NULL,  NULL);
	printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));

	// Cleanup allocated objects
	printf("OPENCL: Releasing kernel memory\n");
    if(ckKernel)clReleaseKernel(ckKernel); 
   
    //need to release any other OpenCL memory objects here
    if(dst_buffer)clReleaseMemObject(dst_buffer);
    if(src_buffer)clReleaseMemObject(src_buffer);
}
int main(void) {
//time meassuring
  	struct timeval tvs;
  	struct timeval tve;
    	float elapsedTime;

	int	  Nx;
	int 	  Ny;
	int 	  Nz;
	int	  N;
	int 	  plotnum=0;
	int	  Tmax=0;
	int 	  plottime=0;
	int	  plotgap=0;
	float	  Lx,Ly,Lz;
	float	  dt=0.0;	
	float	  A=0.0;
	float	  B=0.0;
	float	  Du=0.0;
	float	  Dv=0.0;
	float	  a[2]={1.0,0.0};	
	float 	  b[2]={0.5,0.0};
	float*	  x,*y,*z ;
	float*	  u[2],*v[2];
//openCL variables
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_context context = NULL;
    cl_command_queue command_queue = NULL;
    cl_mem cl_u[2] = {NULL,NULL};
    cl_mem cl_v[2] = {NULL,NULL};
    cl_mem cl_uhat[2] = {NULL,NULL};
    cl_mem cl_vhat[2] = {NULL,NULL};
    cl_mem cl_x = NULL;
    cl_mem cl_y = NULL;
    cl_mem cl_z = NULL;
    cl_mem cl_kx = NULL;
    cl_mem cl_ky = NULL;
    cl_mem cl_kz = NULL;
    cl_program p_grid = NULL,p_frequencies = NULL,p_initialdata = NULL,p_linearpart=NULL,p_nonlinearpart=NULL;
    cl_kernel grid = NULL,frequencies = NULL,initialdata = NULL,linearpart=NULL,nonlinearpart=NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret;
	ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    	ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices);
	context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
	command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
    	size_t source_size;
    	char *source_str;
//end opencl
	int  i,n;
int status=0;	
 	//int  start, finish, count_rate, ind, numthreads
	char	nameconfig[100]="";
//Read infutfile
	char	InputFileName[]="./INPUTFILE";
	FILE*fp;
	fp=fopen(InputFileName,"r");
   	 if(!fp) {fprintf(stderr, "Failed to load IPUTFILE.\n");exit(1);}	 
	int ierr=fscanf(fp, "%d %d %d %d %d %f %f %f %f %f %f %f %f", &Nx,&Ny,&Nz,&Tmax,&plotgap,&Lx,&Ly,&Lz,&dt,&Du,&Dv,&A,&B);
	if(ierr!=13){fprintf(stderr, "INPUTFILE corrupted.\n");exit(1);}	
	fclose(fp);
	printf("NX %d\n",Nx); 
	printf("NY %d\n",Ny); 
	printf("NZ %d\n",Nz); 
	printf("Tmax %d\n",Tmax);
	printf("plotgap %d\n",plotgap);
	printf("Lx %f\n",Lx);
	printf("Ly %f\n",Ly);
	printf("Lz %f\n",Lz);
	printf("dt %f\n",dt);		
	printf("Du %f\n",Du);
	printf("Dv %f\n",Dv);
	printf("F %f\n",A);
	printf("k %f\n",B);
	printf("Read inputfile\n");
	N=Nx*Ny*Nz;
	plottime=plotgap;
	B=A+B;
//ALLocate the memory
	u[0]=(float*) malloc(N*sizeof(float));
	v[0]=(float*) malloc(N*sizeof(float));
	x=(float*) malloc(Nx*sizeof(float));
	y=(float*) malloc(Ny*sizeof(float));
	z=(float*) malloc(Nz*sizeof(float));

//allocate gpu mem
	cl_u[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret);
	cl_v[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret);
	cl_u[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret);
	cl_v[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret);
	cl_uhat[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret);
	cl_vhat[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret);
	cl_uhat[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret);
	cl_vhat[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret);
	printf("allocated space\n");

	// FFT library realted declarations. 
	clfftPlanHandle planHandle;
	clfftDim dim = CLFFT_3D;
	size_t clLengths[3] = {Nx, Ny, Nz};
	// Setup clFFT. 
	clfftSetupData fftSetup;
	ret = clfftInitSetupData(&fftSetup);
	ret = clfftSetup(&fftSetup);
	// Create a default plan for a complex FFT. 
	ret = clfftCreateDefaultPlan(&planHandle, context, dim, clLengths);
	// Set plan parameters. 
	ret = clfftSetPlanPrecision(planHandle, CLFFT_SINGLE);
	ret = clfftSetLayout(planHandle, CLFFT_COMPLEX_PLANAR, CLFFT_COMPLEX_PLANAR);
	ret = clfftSetResultLocation(planHandle, CLFFT_OUTOFPLACE);
	// Bake the plan. 
	ret = clfftBakePlan(planHandle, 1, &command_queue, NULL, NULL);
	// Create temporary buffer. 
	cl_mem tmpBufferu = 0;
	cl_mem tmpBufferv = 0;
	// Size of temp buffer. 
	size_t tmpBufferSize = 0;
	status = clfftGetTmpBufSize(planHandle, &tmpBufferSize);
	if ((status == 0) && (tmpBufferSize > 0)) {
		tmpBufferu = clCreateBuffer(context, CL_MEM_READ_WRITE, tmpBufferSize, NULL, &ret);
		tmpBufferv = clCreateBuffer(context, CL_MEM_READ_WRITE, tmpBufferSize, NULL, &ret);
		if (ret != CL_SUCCESS)
			printf("Error with tmpBuffer clCreateBuffer\n");
	}
//kernel grid
    	fp = fopen("./grid.cl", "r");
    	if (!fp) {fprintf(stderr, "Failed to load grid.\n"); exit(1); }
    	source_str = (char *)malloc(MAX_SOURCE_SIZE);
   	source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp );
    	fclose( fp );
	
	p_grid = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
        ret = clBuildProgram(p_grid, 1, &device_id, NULL, NULL, NULL);
        grid = clCreateKernel(p_grid, "grid", &ret);
//first x
	cl_x = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(float), NULL, &ret);
        ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_x);
	ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Lx);
	ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Nx);
	size_t global_work_size_x[3] = {Nx, 0, 0};
        ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_x, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
        ret = clEnqueueReadBuffer(command_queue, cl_x, CL_TRUE, 0, Nx * sizeof(float), x, 0, NULL, NULL);
	ret = clFinish(command_queue);
//then y
	cl_y = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(float), NULL, &ret);	
	ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_y);
	ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Ly);
	ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Ny);
	size_t global_work_size_y[3] = {Ny, 0, 0};

	ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_y, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
        ret = clEnqueueReadBuffer(command_queue, cl_y, CL_TRUE, 0, Ny * sizeof(float), y, 0, NULL, NULL);
	ret = clFinish(command_queue);

//last z
	cl_z = clCreateBuffer(context, CL_MEM_READ_WRITE, Nz * sizeof(float), NULL, &ret);
	ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_z);
	ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Lz);
	ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Nz);
	size_t global_work_size_z[3] = {Nz, 0, 0};
	ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_z, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
	ret = clEnqueueReadBuffer(command_queue, cl_z, CL_TRUE, 0, Nz * sizeof(float), z, 0, NULL, NULL);
	ret = clFinish(command_queue);
    	ret = clReleaseKernel(grid); ret = clReleaseProgram(p_grid);

//kernel initial data
    	fp = fopen("./initialdata.cl", "r");
    	if (!fp) {fprintf(stderr, "Failed to load initialdata.\n"); exit(1); }
	free(source_str);    	
	source_str = (char *)malloc(MAX_SOURCE_SIZE);
   	source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp );
    	fclose( fp );

	p_initialdata = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
        ret = clBuildProgram(p_initialdata, 1, &device_id, NULL, NULL, NULL);
        initialdata = clCreateKernel(p_initialdata, "initialdata", &ret);


        ret = clSetKernelArg(initialdata, 0, sizeof(cl_mem),(void *)&cl_u[0]);
	ret = clSetKernelArg(initialdata, 1, sizeof(cl_mem),(void* )&cl_v[0]);
        ret = clSetKernelArg(initialdata, 2, sizeof(cl_mem),(void *)&cl_u[1]);
	ret = clSetKernelArg(initialdata, 3, sizeof(cl_mem),(void* )&cl_v[1]);
	ret = clSetKernelArg(initialdata, 4, sizeof(cl_mem),(void* )&cl_x);
	ret = clSetKernelArg(initialdata, 5, sizeof(cl_mem),(void* )&cl_y);
	ret = clSetKernelArg(initialdata, 6, sizeof(cl_mem),(void* )&cl_z);
	ret = clSetKernelArg(initialdata, 7, sizeof(int),(void* )&Nx);
	ret = clSetKernelArg(initialdata, 8, sizeof(int),(void* )&Ny);
	ret = clSetKernelArg(initialdata, 9, sizeof(int),(void* )&Nz);
	size_t global_work_size[3] = {N, 0, 0};
        ret = clEnqueueNDRangeKernel(command_queue, initialdata, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
	ret = clReleaseKernel(initialdata); ret = clReleaseProgram(p_initialdata);
        ret = clEnqueueReadBuffer(command_queue, cl_u[0], CL_TRUE, 0, N * sizeof(float), u[0], 0, NULL, NULL);
	ret = clFinish(command_queue);
        ret = clEnqueueReadBuffer(command_queue, cl_v[0], CL_TRUE, 0, N * sizeof(float), v[0], 0, NULL, NULL);
	ret = clFinish(command_queue);
	ret = clReleaseMemObject(cl_x);
	ret = clReleaseMemObject(cl_y);
	ret = clReleaseMemObject(cl_z);
//write to disk
	fp=fopen("./data/xcoord.dat","w");
    	if (!fp) {fprintf(stderr, "Failed to write xcoord.dat.\n"); exit(1); }
	for(i=0;i<Nx;i++){fprintf(fp,"%f\n",x[i]);}
    	fclose( fp );
	fp=fopen("./data/ycoord.dat","w");
    	if (!fp) {fprintf(stderr, "Failed to write ycoord.dat.\n"); exit(1); }
	for(i=0;i<Ny;i++){fprintf(fp,"%f\n",y[i]);}
    	fclose( fp );
	fp=fopen("./data/zcoord.dat","w");
    	if (!fp) {fprintf(stderr, "Failed to write zcoord.dat.\n"); exit(1); }
	for(i=0;i<Nz;i++){fprintf(fp,"%f\n",z[i]);}
    	fclose( fp );
	free(x); free(y); free(z);
	n=0;
	plotnum=0;
//output of initial data U
	char tmp_str[10];
	strcpy(nameconfig,"./data/u");
	sprintf(tmp_str,"%d",10000000+plotnum);
	strcat(nameconfig,tmp_str);
	strcat(nameconfig,".datbin");
	fp=fopen(nameconfig,"wb");
    	if (!fp) {fprintf(stderr, "Failed to write initialdata.\n"); exit(1); }
	for(i=0;i<N;i++){fwrite(&u[0][i], sizeof(float), 1, fp);}
    	fclose( fp );	
//V
	strcpy(nameconfig,"./data/v");
	sprintf(tmp_str,"%d",10000000+plotnum);
	strcat(nameconfig,tmp_str);
	strcat(nameconfig,".datbin");
	fp=fopen(nameconfig,"wb");
    	if (!fp) {fprintf(stderr, "Failed to write initialdata.\n"); exit(1); }
	for(i=0;i<N;i++){fwrite(&v[0][i], sizeof(float), 1, fp);}
    	fclose( fp );


//frequencies kernel

    	fp = fopen("./frequencies.cl", "r");
    	if (!fp) {fprintf(stderr, "Failed to load frequencies.\n"); exit(1); }
	free(source_str);
    	source_str = (char *)malloc(MAX_SOURCE_SIZE);
   	source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp );
    	fclose( fp );
	
	p_frequencies = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
        ret = clBuildProgram(p_frequencies, 1, &device_id, NULL, NULL, NULL);
        frequencies = clCreateKernel(p_frequencies, "frequencies", &ret);
//get frequencies first x
	cl_kx = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(float), NULL, &ret);
        ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem), (void *)&cl_kx);
	ret = clSetKernelArg(frequencies, 1, sizeof(float),(void*)&Lx);
	ret = clSetKernelArg(frequencies, 2, sizeof(int),(void*)&Nx);
        ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_x, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
//then y
	cl_ky = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(float), NULL, &ret);	
	ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem), (void *)&cl_ky);
	ret = clSetKernelArg(frequencies, 1, sizeof(float),(void*)&Ly);
	ret = clSetKernelArg(frequencies, 2, sizeof(int),(void*)&Ny);
	ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_y, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);
//last z
	cl_kz = clCreateBuffer(context, CL_MEM_READ_WRITE, Nz * sizeof(float), NULL, &ret);
	ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem), (void *)&cl_kz);
	ret = clSetKernelArg(frequencies, 1, sizeof(float),(void*)&Lz);
	ret = clSetKernelArg(frequencies, 2, sizeof(int),(void*)&Nz);
	ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_z, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);

	printf("Setup grid, fourier frequencies and initialcondition\n");
//load the rest of the kernels
//linearpart kernel
    	fp = fopen("./linearpart.cl", "r");
    	if (!fp) {fprintf(stderr, "Failed to load linearpart.\n"); exit(1); }
	free(source_str);    	
	source_str = (char *)malloc(MAX_SOURCE_SIZE);
   	source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp );
    	fclose( fp );

	p_linearpart = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
        ret = clBuildProgram(p_linearpart, 1, &device_id, NULL, NULL, NULL);
        linearpart = clCreateKernel(p_linearpart, "linearpart", &ret);

//kernel nonlinear
    	fp = fopen("./nonlinearpart.cl", "r");
    	if (!fp) {fprintf(stderr, "Failed to load nonlinearpart.\n"); exit(1); }
	free(source_str);    	
	source_str = (char *)malloc(MAX_SOURCE_SIZE);
   	source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp );
    	fclose( fp );

	p_nonlinearpart = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
        ret = clBuildProgram(p_nonlinearpart, 1, &device_id, NULL, NULL, NULL);
        nonlinearpart = clCreateKernel(p_nonlinearpart, "nonlinearpart", &ret);

	printf("Got initial data, starting timestepping\n");
  gettimeofday(&tvs, NULL); 
	for(n=0;n<=Tmax;n++){
//linear
	ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_u, cl_uhat, tmpBufferu);
	ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_v, cl_vhat, tmpBufferv);
	ret = clFinish(command_queue);

        ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat[0]);
        ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_uhat[1]);
        ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void *)&cl_vhat[0]);
        ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void *)&cl_vhat[1]);
	ret = clSetKernelArg(linearpart, 4, sizeof(cl_mem),(void* )&cl_kx);
	ret = clSetKernelArg(linearpart, 5, sizeof(cl_mem),(void* )&cl_ky);
	ret = clSetKernelArg(linearpart, 6, sizeof(cl_mem),(void* )&cl_kz);
	ret = clSetKernelArg(linearpart, 7, sizeof(float),(void* )&dt);
	ret = clSetKernelArg(linearpart, 8, sizeof(float),(void* )&Du);
	ret = clSetKernelArg(linearpart, 9, sizeof(float),(void* )&Dv);
	ret = clSetKernelArg(linearpart, 10, sizeof(float),(void* )&A);
	ret = clSetKernelArg(linearpart, 11, sizeof(float),(void* )&B);
	ret = clSetKernelArg(linearpart, 12, sizeof(float),(void* )&b[0]);
	ret = clSetKernelArg(linearpart, 13, sizeof(float),(void* )&b[1]);
	ret = clSetKernelArg(linearpart, 14, sizeof(int),(void* )&Nx);
	ret = clSetKernelArg(linearpart, 15, sizeof(int),(void* )&Ny);
	ret = clSetKernelArg(linearpart, 16, sizeof(int),(void* )&Nz);
        ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);

	ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_uhat, cl_u, tmpBufferu);
	ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_vhat, cl_v, tmpBufferv);
	ret = clFinish(command_queue);    
//nonlinearpart
        ret = clSetKernelArg(nonlinearpart, 0, sizeof(cl_mem),(void *)&cl_u[0]);
        ret = clSetKernelArg(nonlinearpart, 1, sizeof(cl_mem),(void *)&cl_u[1]);
	ret = clSetKernelArg(nonlinearpart, 2, sizeof(cl_mem),(void* )&cl_v[0]);
	ret = clSetKernelArg(nonlinearpart, 3, sizeof(cl_mem),(void* )&cl_v[1]);
	ret = clSetKernelArg(nonlinearpart, 4, sizeof(float),(void* )&dt);
	ret = clSetKernelArg(nonlinearpart, 5, sizeof(float),(void* )&a[0]);
	ret = clSetKernelArg(nonlinearpart, 6, sizeof(float),(void* )&a[1]);
        ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);		
// linear part
	ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_u, cl_uhat, tmpBufferu);
	ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_v, cl_vhat, tmpBufferv);	
	ret = clFinish(command_queue);

        ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat[0]);
        ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_uhat[1]);
        ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void *)&cl_vhat[0]);
        ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void *)&cl_vhat[1]);
	ret = clSetKernelArg(linearpart, 4, sizeof(cl_mem),(void* )&cl_kx);
	ret = clSetKernelArg(linearpart, 5, sizeof(cl_mem),(void* )&cl_ky);
	ret = clSetKernelArg(linearpart, 6, sizeof(cl_mem),(void* )&cl_kz);
	ret = clSetKernelArg(linearpart, 7, sizeof(float),(void* )&dt);
	ret = clSetKernelArg(linearpart, 8, sizeof(float),(void* )&Du);
	ret = clSetKernelArg(linearpart, 9, sizeof(float),(void* )&Dv);
	ret = clSetKernelArg(linearpart, 10, sizeof(float),(void* )&A);
	ret = clSetKernelArg(linearpart, 11, sizeof(float),(void* )&B);
	ret = clSetKernelArg(linearpart, 12, sizeof(float),(void* )&b[0]);
	ret = clSetKernelArg(linearpart, 13, sizeof(float),(void* )&b[1]);
	ret = clSetKernelArg(linearpart, 14, sizeof(int),(void* )&Nx);
	ret = clSetKernelArg(linearpart, 15, sizeof(int),(void* )&Ny);
	ret = clSetKernelArg(linearpart, 16, sizeof(int),(void* )&Nz);
        ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
	ret = clFinish(command_queue);

	ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_uhat, cl_u, tmpBufferu);
	ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_vhat, cl_v, tmpBufferv);
	ret = clFinish(command_queue);
// done
	if(n==plottime){
		printf("time:%f, step:%d,%d\n",n*dt,n,plotnum);
		plottime=plottime+plotgap;
		plotnum=plotnum+1;
        ret = clEnqueueReadBuffer(command_queue, cl_u[0], CL_TRUE, 0, N * sizeof(float), u[0], 0, NULL, NULL);
        ret = clEnqueueReadBuffer(command_queue, cl_v[0], CL_TRUE, 0, N * sizeof(float), v[0], 0, NULL, NULL);
	ret = clFinish(command_queue);
//output of data U
	char tmp_str[10];
	strcpy(nameconfig,"./data/u");
	sprintf(tmp_str,"%d",10000000+plotnum);
	strcat(nameconfig,tmp_str);
	strcat(nameconfig,".datbin");
	fp=fopen(nameconfig,"wb");
    	if (!fp) {fprintf(stderr, "Failed to write u-data.\n"); exit(1); }
	for(i=0;i<N;i++){fwrite(&u[0][i], sizeof(float), 1, fp);}
    	fclose( fp );	
//V
	strcpy(nameconfig,"./data/v");
	sprintf(tmp_str,"%d",10000000+plotnum);
	strcat(nameconfig,tmp_str);
	strcat(nameconfig,".datbin");
	fp=fopen(nameconfig,"wb");
    	if (!fp) {fprintf(stderr, "Failed to write v-data.\n"); exit(1); }
	for(i=0;i<N;i++){fwrite(&v[0][i], sizeof(float), 1, fp);}
    	fclose( fp );
}
	}
 	gettimeofday(&tve, NULL); 
	printf("Finished time stepping\n");
 	elapsedTime = (tve.tv_sec - tvs.tv_sec) * 1000.0;      // sec to ms
    	elapsedTime += (tve.tv_usec - tvs.tv_usec) / 1000.0;   // us to ms
   	printf("%f,",elapsedTime);



	clReleaseMemObject(cl_u[0]);
	clReleaseMemObject(cl_u[1]);
	clReleaseMemObject(cl_v[0]);
	clReleaseMemObject(cl_v[1]);
	clReleaseMemObject(cl_uhat[0]);
	clReleaseMemObject(cl_uhat[1]);
	clReleaseMemObject(cl_vhat[0]);
	clReleaseMemObject(cl_vhat[1]);
	clReleaseMemObject(cl_kx);
	clReleaseMemObject(cl_ky);
	clReleaseMemObject(cl_kz);
    	ret = clReleaseKernel(frequencies); ret = clReleaseProgram(p_frequencies);
    	ret = clReleaseKernel(linearpart); ret = clReleaseProgram(p_linearpart);
    	ret = clReleaseKernel(nonlinearpart); ret = clReleaseProgram(p_nonlinearpart);
	free(u[0]);
	free(v[0]);
	clReleaseMemObject(tmpBufferu);
	clReleaseMemObject(tmpBufferv);
	/* Release the plan. */
	ret = clfftDestroyPlan(&planHandle);
	/* Release clFFT library. */
	clfftTeardown();

	ret = clReleaseCommandQueue(command_queue);
     	ret = clReleaseContext(context);	
	printf("Program execution complete\n");

	return 0;
}
Ejemplo n.º 23
0
void OpenCLExecuter::ocl_filterGaussian(unsigned char* src_array, unsigned char* dst_array, int w, int h, int d)
{
//	printf("gaussian_sum: %f\n", gaussian_sum);
	printf("gaussian_width: %d\n", filter_width);
	printf("gaussian_mask size: %d\n", filter_kernel.size());

	cl_int err;							// debugging variables
	size_t szParmDataBytes;				// Byte size of context information        
	cl_mem src_buffer;					// OpenCL device source buffer
	cl_mem gauss_buffer;				// OpenCL device source buffer
	cl_mem dst_buffer;					// OpenCL device source buffer
	size_t szGlobalWorkSize;			// 1D var for Total # of work items
	size_t szLocalWorkSize;				// 1D var for # of work items in the work group
	cl_kernel ckKernel;					// OpenCL kernel

	int iNumElements = w*h*d; // Length of float arrays to process

	size_t global_threads[3] ={w,h,d};

	// allocate the source buffer memory object
	src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE,  sizeof(unsigned char) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
		
	gauss_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY,  sizeof(float) * filter_kernel.size(), NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));

	// allocate the destination buffer memory object
	dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE,  sizeof(unsigned char) * iNumElements, NULL, &err);
	printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
 
	//==================================================
	// X axis 
	//==================================================

	// Create the kernel
	ckKernel = clCreateKernel (cpProgram, "gaussianX", &err);
	printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
  
	// Set the Argument values
	err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 2, sizeof(cl_mem), (void*)&gauss_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 3, sizeof(int), (void*)&filter_width);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&w);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 5, sizeof(int), (void*)&h);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 6, sizeof(int), (void*)&d);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));

	/*size_t local;
	err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL);
	printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err));
	printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local);*/

	// Copy input data to GPU, compute, copy results back
	// Runs asynchronous to host, up until blocking read at end

	// Write data from host to GPU
	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, src_array, 0, NULL, NULL);
	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));
	
	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, gauss_buffer, CL_FALSE, 0, sizeof(float) * filter_kernel.size(), &filter_kernel[0], 0, NULL, NULL);
	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));

	// Launch kernel 
	err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL);
	printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));

	// Blocking read of results from GPU to Host
	//err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, dst_array, 0, NULL,  NULL);
	//printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));

	// Cleanup allocated objects
	printf("OPENCL: Releasing kernel memory\n");
    if(ckKernel)clReleaseKernel(ckKernel); 
   
	//==================================================
	// Y axis 
	//==================================================

	// Create the kernel
	ckKernel = clCreateKernel (cpProgram, "gaussianY", &err);
	printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
  
	// Set the Argument values
	err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&dst_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&src_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 2, sizeof(cl_mem), (void*)&gauss_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 3, sizeof(int), (void*)&filter_width);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&w);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 5, sizeof(int), (void*)&h);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 6, sizeof(int), (void*)&d);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));

/*	size_t local;
	err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL);
	printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err));
	printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local);
*/
	// Copy input data to GPU, compute, copy results back
	// Runs asynchronous to host, up until blocking read at end

	// Write data from host to GPU
	//err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, src_array, 0, NULL, NULL);
	//printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));
	
	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, gauss_buffer, CL_FALSE, 0, sizeof(float) * filter_kernel.size(), &filter_kernel[0], 0, NULL, NULL);
	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));

	// Launch kernel 
	err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL);
	printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));

	// Blocking read of results from GPU to Host
	//err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, dst_array, 0, NULL,  NULL);
	//printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));

	// Cleanup allocated objects
	printf("OPENCL: Releasing kernel memory\n");
    if(ckKernel)clReleaseKernel(ckKernel); 
   
	//==================================================
	// Z axis 
	//==================================================

	// Create the kernel
	ckKernel = clCreateKernel (cpProgram, "gaussianZ", &err);
	printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
  
	// Set the Argument values
	err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 2, sizeof(cl_mem), (void*)&gauss_buffer);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 3, sizeof(int), (void*)&filter_width);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&w);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 5, sizeof(int), (void*)&h);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
	err = clSetKernelArg (ckKernel, 6, sizeof(int), (void*)&d);
	printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));

	/*size_t local;
	err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL);
	printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err));
	printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local);
	*/

	// Copy input data to GPU, compute, copy results back
	// Runs asynchronous to host, up until blocking read at end

	//Prepare data to upload
	//for(int j=0; j<iNumElements; j++)
	//	data[j] = volobj->texture3d[3*j+0];

	// Write data from host to GPU
	//err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, src_array, 0, NULL, NULL);
	//printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));
	
	err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, gauss_buffer, CL_FALSE, 0, sizeof(float) * filter_kernel.size(), &filter_kernel[0], 0, NULL, NULL);
	printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));

	// Launch kernel 
	err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL);
	printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));

	// Blocking read of results from GPU to Host
	err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, dst_array, 0, NULL,  NULL);
	printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));

	// Cleanup allocated objects
	printf("OPENCL: Releasing kernel memory\n");
    if(ckKernel)clReleaseKernel(ckKernel); 
   
    //need to release any other OpenCL memory objects here
    if(dst_buffer)clReleaseMemObject(dst_buffer);
    if(src_buffer)clReleaseMemObject(src_buffer);
    if(gauss_buffer)clReleaseMemObject(gauss_buffer);
}
Ejemplo n.º 24
0
void
DarkenManager::setupCLprog()
{
    cl_int errNum;
    std::string fileName("darken.cl");
    fileName = CL_LOC + fileName;
    std::ifstream file(fileName.c_str() );
    if (!file)
    {
	std::stringstream s;
	s << "error opening " << fileName << " in " 
	  << __FILE__ << " at " << __LINE__ << std::endl;
	throw std::runtime_error(s.str());
    }
    std::string source = std::string(std::istreambuf_iterator<char>(file),
				     std::istreambuf_iterator<char>());
    const char *souce_sting=source.c_str();
    
    m_darken_program=clCreateProgramWithSource(m_context,
					       1,
					       &souce_sting, NULL, &errNum);
    ASSERT_CL(errNum);
    errNum=clBuildProgram(m_darken_program, 0, NULL, "-cl-fast-relaxed-math",
                          NULL, NULL);
    if (errNum==CL_BUILD_PROGRAM_FAILURE)
    {
	size_t logSize;
	errNum = clGetProgramBuildInfo(m_darken_program, m_id,
				       CL_PROGRAM_BUILD_LOG,
				       0, NULL, &logSize);
	ASSERT_CL(errNum);
	std::vector<char> log_vec;
	log_vec.resize(logSize);
	errNum = clGetProgramBuildInfo(m_darken_program, m_id,
				       CL_PROGRAM_BUILD_LOG,
				       logSize, &log_vec[0], &logSize);
	ASSERT_CL(errNum);
	std::string log(&log_vec[0]);
	std::string errMessage("Error compiling: ");
	errMessage += fileName;
	errMessage += "\n";
	errMessage += log;
	std::cout << errMessage << std::endl;
	throw std::runtime_error(errMessage);
    }
    ASSERT_CL(errNum);
    // m_darken_kernel = clCreateKernel(m_darken_program, "darken", &errNum);
    m_darken_kernel = clCreateKernel(m_darken_program, "average_NO_LOCAL", &errNum);
    // m_darken_kernel = clCreateKernel(m_darken_program, "average", &errNum);
    ASSERT_CL(errNum);
    errNum = clSetKernelArg(m_darken_kernel, 0, sizeof(m_cl_src_buffer), &m_cl_src_buffer);
    errNum |= clSetKernelArg(m_darken_kernel, 1, sizeof(m_cl_dst_buffer), &m_cl_dst_buffer);
    cl_int w = WINDOW_SIZE_WIDTH;
    cl_int h = WINDOW_SIZE_HEIGHT;
    errNum |= clSetKernelArg(m_darken_kernel, 2, sizeof(cl_int), &w);
    errNum |= clSetKernelArg(m_darken_kernel, 3, sizeof(cl_int), &h);
    errNum |= clSetKernelArg(m_darken_kernel, 4, 
			     (WORKGROUP_DIM_X+2)*(WORKGROUP_DIM_Y+2)*4, NULL);
    ASSERT_CL(errNum);


}
Ejemplo n.º 25
0
   int main() {

         // Create the variables for the time measure
         int starttime, stoptime;

         //Get initial time
         starttime = GetTimeMs();

         // This code executes on the OpenCL host

         // Host data
         float *A=NULL; // Input array
         float *B=NULL; // Input array
         float *C=NULL; // Output array

         // Elements in each array
          const int elements=2048;
         // Compute the size of the data
         size_t datasize=sizeof(int)*elements;

         // Allocate space for input/output data
         A=(float*)malloc(datasize);
         B=(float*)malloc(datasize);
         C=(float*)malloc(datasize);

         // Initialize the input data
         A[0]=2.2;
         A[1]=1.3;
         B[0]=3.7;
         B[1]=5.4;


         // Load the kernel source code into the array programSource
	     FILE *fp;
	     char *programSource;
	     size_t programSize;
	 
	     fp = fopen("fplos_kernels.cl", "r");
	     if (!fp) {
	         fprintf(stderr, "Failed to load kernel.\n");
	         exit(1);
	     }
	     programSource = (char*)malloc(MAX_SOURCE_SIZE);
	     fclose( fp );

         // Use this to check the output of each API call
         cl_int status;

         // Retrieve the number of platforms
         cl_uint numPlatforms=0;
         status=clGetPlatformIDs(0, NULL,&numPlatforms);

         // Allocate enough space for each platform
         cl_platform_id *platforms=NULL;
         platforms=(cl_platform_id*)malloc(
              numPlatforms*sizeof(cl_platform_id));

         // Fill in the platforms
         status = clGetPlatformIDs(numPlatforms, platforms, NULL);

         // Retrieve the number of devices
         cl_uint numDevices=0;
         status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0,
               NULL,&numDevices);

         // Allocate enough space for each device
         cl_device_id *devices;
         devices = (cl_device_id*)malloc(
               numDevices*sizeof(cl_device_id));

         // Fill in the devices
         status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL,
              numDevices, devices, NULL);

         // Create a context and associate it with the devices
         cl_context context;
         context = clCreateContext(NULL, numDevices, devices, NULL,
             NULL, &status);

         // Create a command queue and associate it with the device
         cl_command_queue cmdQueue;
         cmdQueue = clCreateCommandQueue(context, devices[0], 0,
            &status);

         // Create a buffer object that will contain the data
         // from the host array A
         cl_mem bufA;
         bufA = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize,
             NULL, &status);

         // Create a buffer object that will contain the data
         // from the host array B
         cl_mem bufB;
         bufB = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize,
            NULL, &status);

         // Create a buffer object that will hold the output data
         cl_mem bufC;
         bufC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, datasize,
            NULL, &status);

         // Write input array A to the device buffer bufferA
         status = clEnqueueWriteBuffer(cmdQueue, bufA, CL_FALSE,
            0, datasize, A, 0, NULL, NULL);

         // Write input array B to the device buffer bufferB
         status = clEnqueueWriteBuffer(cmdQueue, bufB, CL_FALSE,
            0, datasize, B, 0, NULL, NULL);

         // Create a program with source code
         cl_program program=clCreateProgramWithSource(context, 1,
            (const char**)&programSource, NULL, &status);

         // Build (compile) the program for the device
         status=clBuildProgram(program, numDevices, devices,
            NULL, NULL, NULL);

         // Create the vector addition kernel
         cl_kernel kernel;
         kernel=clCreateKernel(program, "floatadd", &status);

         // Associate the input and output buffers with the kernel
         status=clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufA);
         status=clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufB);
         status=clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufC);

         // Define an index space (global work size) of work
         // items for execution. A workgroup size (local work size)
         // is not required, but can be used.
         size_t globalWorkSize[1];

         // There are 'elements' work-items
         globalWorkSize[0]=elements;

         // Execute the kernel for execution
         status=clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL,
            globalWorkSize, NULL, 0, NULL, NULL);

         // Read the device output buffer to the host output array
         clEnqueueReadBuffer(cmdQueue, bufC, CL_TRUE, 0,
            datasize, C, 0, NULL, NULL);

           printf("Output = %.1f\n", C[0]);
           printf("Output = %.1f\n", C[1]);

        // Free OpenCL resources
         clReleaseKernel(kernel);
         clReleaseProgram(program);
         clReleaseCommandQueue(cmdQueue);
         clReleaseMemObject(bufA);
         clReleaseMemObject(bufB);
         clReleaseMemObject(bufC);
         clReleaseContext(context);

         // Free host resources
         free(A);
         free(B);
         free(C);
         free(platforms);
         free(devices);

         //Get initial time
         stoptime = GetTimeMs();

         printf("Duration= %d ms\n", stoptime - starttime);
            
         return 0;
   }
Ejemplo n.º 26
0
int main(void)
{
    float *h_psum;              // vector to hold partial sum
    int in_nsteps = INSTEPS;    // default number of steps (updated later to device preferable)
    int niters = ITERS;         // number of iterations
    int nsteps;
    float step_size;
    size_t nwork_groups;
    size_t max_size, work_group_size = 8;
    float pi_res;

    cl_mem d_partial_sums;

    char *kernelsource = getKernelSource("../pi_ocl.cl");             // Kernel source

    cl_int err;
    cl_device_id     device_id;     // compute device id 
    cl_context       context;       // compute context
    cl_command_queue commands;      // compute command queue
    cl_program       program;       // compute program
    cl_kernel        kernel_pi;     // compute kernel

    // Set up OpenCL context. queue, kernel, etc.
    cl_uint numPlatforms;
    // Find number of platforms
    err = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (err != CL_SUCCESS || numPlatforms <= 0)
    {
        printf("Error: Failed to find a platform!\n%s\n",err_code(err));
        return EXIT_FAILURE;
    }
    // Get all platforms
    cl_platform_id Platform[numPlatforms];
    err = clGetPlatformIDs(numPlatforms, Platform, NULL);
    if (err != CL_SUCCESS || numPlatforms <= 0)
    {
        printf("Error: Failed to get the platform!\n%s\n",err_code(err));
        return EXIT_FAILURE;
    }
    // Secure a device
    for (int i = 0; i < numPlatforms; i++)
    {
        err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL);
        if (err == CL_SUCCESS)
            break;
    }
    if (device_id == NULL)
    {
        printf("Error: Failed to create a device group!\n%s\n",err_code(err));
        return EXIT_FAILURE;
    }
    // Output information
    err = output_device_info(device_id);
    // Create a compute context
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }
    // Create a command queue
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }
    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }
    // Build the program  
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n%s\n", err_code(err));
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        return EXIT_FAILURE;
    }
    // Create the compute kernel from the program 
    kernel_pi = clCreateKernel(program, "pi", &err);
    if (!kernel_pi || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }

    // Find kernel work-group size
    err = clGetKernelWorkGroupInfo (kernel_pi, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &work_group_size, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to get kernel work-group info\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }
    // Now that we know the size of the work-groups, we can set the number of
    // work-groups, the actual number of steps, and the step size
    nwork_groups = in_nsteps/(work_group_size*niters);

    if (nwork_groups < 1)
    {
        err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), &nwork_groups, NULL);
        work_group_size = in_nsteps / (nwork_groups * niters);
    }

    nsteps = work_group_size * niters * nwork_groups;
    step_size = 1.0f/(float)nsteps;
    h_psum = calloc(sizeof(float), nwork_groups);
    if (!h_psum)
    {
        printf("Error: could not allocate host memory for h_psum\n");
        return EXIT_FAILURE;
    }

    printf(" %ld work-groups of size %ld. %d Integration steps\n",
            nwork_groups,
            work_group_size,
            nsteps);

    d_partial_sums = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * nwork_groups, NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create buffer\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }

    // Set kernel arguments
    err  = clSetKernelArg(kernel_pi, 0, sizeof(int), &niters);
    err |= clSetKernelArg(kernel_pi, 1, sizeof(float), &step_size);
    err |= clSetKernelArg(kernel_pi, 2, sizeof(float) * work_group_size, NULL);
    err |= clSetKernelArg(kernel_pi, 3, sizeof(cl_mem), &d_partial_sums);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments!\n");
        return EXIT_FAILURE;
    }

    // Execute the kernel over the entire range of our 1D input data set
    // using the maximum number of work items for this device
    size_t global = nwork_groups * work_group_size;
    size_t local = work_group_size;
    double rtime = wtime();
    err = clEnqueueNDRangeKernel(
        commands,
        kernel_pi,
        1, NULL,
        &global,
        &local,
        0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to execute kernel\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }


    err = clEnqueueReadBuffer(
        commands,
        d_partial_sums,
        CL_TRUE,
        0,
        sizeof(float) * nwork_groups,
        h_psum,
        0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read buffer\n%s\n", err_code(err));
        return EXIT_FAILURE;
    }

    // complete the sum and compute the final integral value on the host
    pi_res = 0.0f;
    for (unsigned int i = 0; i < nwork_groups; i++)
    {
        pi_res += h_psum[i];
    }
    pi_res *= step_size;

    rtime = wtime() - rtime;

    printf("\nThe calculation ran in %lf seconds\n", rtime);
    printf(" pi = %f for %d steps\n", pi_res, nsteps);

    // clean up
    clReleaseMemObject(d_partial_sums);
    clReleaseProgram(program);
    clReleaseKernel(kernel_pi);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);
    free(kernelsource);
    free(h_psum);
}
Ejemplo n.º 27
0
void spmv_csr_cpu(const csr_matrix* csr,const float* x,const float* y,float* out) {
    int num_rows = csr->num_rows;

    int sourcesize = 1024*1024;
	char * source = (char *)calloc(sourcesize, sizeof(char));
	if(!source) { fprintf(stderr, "ERROR: calloc(%d) failed\n", sourcesize); return -1; }

	// read the kernel core source
	char * kernel_csr_src  = "csr_ocl";
	char * tempchar = "./spmv_kernel.cl";
	FILE * fp = fopen(tempchar, "rb");
	if(!fp) { fprintf(stderr, "ERROR: unable to open '%s'\n", tempchar); return -1; }
	fread(source + strlen(source), sourcesize, 1, fp);
	fclose(fp);

	int use_gpu = 1;
	if(initialize(use_gpu)) return -1;

	// compile kernel
	cl_int err = 0;
	const char * slist[2] = { source, 0 };
	cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateProgramWithSource() => %d\n", err); return -1; }
	err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clBuildProgram() => %d\n", err); return -1; }

	cl_kernel kernel_csr;
	kernel_csr = clCreateKernel(prog, kernel_csr_src, &err);
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateKernel() 0 => %d\n", err); return -1; }
	clReleaseProgram(prog);

	cl_mem memAp;
	cl_mem memAj;
	cl_mem memAx;
	cl_mem memx;
	cl_mem memy;

	memAp = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*(csr.num_rows+1), NULL, &err);
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateBuffer\n"); return -1;}
	memAj = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*csr.num_nonzeros, NULL, &err );
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateBuffer\n"); return -1;}
	memAx = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_nonzeros, NULL, &err );
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateBuffer\n"); return -1;}
	memx = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_cols, NULL, &err );
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateBuffer\n"); return -1;}
	memy = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*csr.num_rows, NULL, &err );
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateBuffer\n"); return -1;}

	//write buffers
	err = clEnqueueWriteBuffer(cmd_queue, memAp, CL_FALSE, 0, sizeof(unsigned int)*csr.num_rows+4, csr->Ap, 0, NULL, NULL);
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clEnqueueWriteBuffer\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, memAj, CL_FALSE, 0, sizeof(unsigned int)*csr.num_nonzeros, csr->Aj, 0, NULL, NULL);
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clEnqueueWriteBuffer\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, memAx, CL_FALSE, 0, sizeof(float)*csr.num_nonzeros, csr->Ax, 0, NULL, NULL);
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clEnqueueWriteBuffer\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, memx, CL_FALSE, 0, sizeof(float)*csr.num_cols, x, 0, NULL, NULL);
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clEnqueueWriteBuffer\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, memy, CL_FALSE, 0, sizeof(float)*csr.num_rows, y, 0, NULL, NULL);
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clEnqueueWriteBuffer\n"); return -1; }

	clSetKernelArg(kernel_csr, 0, sizeof(unsigned int *), (unsigned int *) &csr->num_rows);
	clSetKernelArg(kernel_csr, 1, sizeof(void *), (void*) &memAp);
	clSetKernelArg(kernel_csr, 2, sizeof(void *), (void*) &memAj);
	clSetKernelArg(kernel_csr, 3, sizeof(void *), (void*) &memAx);
	clSetKernelArg(kernel_csr, 2, sizeof(void *), (void*) &memx);
	clSetKernelArg(kernel_csr, 3, sizeof(void *), (void*) &memy);

	err = clEnqueueNDRangeKernel(cmd_queue, kernel_csr, 2, NULL, global_work, local_work, 0, 0, 0);
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }

	err = clEnqueueReadBuffer(cmd_queue, memy, 1, 0, sizeof(float)*csr.num_rows, out, 0, 0, 0);
	if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: 1  clEnqueueReadBuffer: out\n"); return -1; }

	clReleaseMemObject(memAp);
	clReleaseMemObject(memAj);
	clReleaseMemObject(memAx);
	clReleaseMemObject(memx);
	clReleaseMemObject(memy);
}
void buildOpenCLKernels_update_halo_kernel1_fr2(int xdim0, int ydim0, int xdim1,
                                                int ydim1, int xdim2, int ydim2,
                                                int xdim3, int ydim3, int xdim4,
                                                int ydim4, int xdim5, int ydim5,
                                                int xdim6, int ydim6) {

  // int ocl_fma = OCL_FMA;
  if (!isbuilt_update_halo_kernel1_fr2) {
    buildOpenCLKernels();
    // clSafeCall( clUnloadCompiler() );
    cl_int ret;
    char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_fr2.cl"};

    // Load the kernel source code into the array source_str
    FILE *fid;
    char *source_str[1];
    size_t source_size[1];

    for (int i = 0; i < 1; i++) {
      fid = fopen(source_filename[i], "r");
      if (!fid) {
        fprintf(stderr, "Can't open the kernel source file!\n");
        exit(1);
      }

      source_str[i] = (char *)malloc(4 * 0x1000000);
      source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid);
      if (source_size[i] != 4 * 0x1000000) {
        if (ferror(fid)) {
          printf("Error while reading kernel source file %s\n",
                 source_filename[i]);
          exit(-1);
        }
        if (feof(fid))
          printf("Kernel source file %s succesfuly read.\n",
                 source_filename[i]);
        // printf("%s\n",source_str[i]);
      }
      fclose(fid);
    }

    printf("Compiling update_halo_kernel1_fr2 %d source -- start \n", OCL_FMA);

    // Create a program from the source
    OPS_opencl_core.program = clCreateProgramWithSource(
        OPS_opencl_core.context, 1, (const char **)&source_str,
        (const size_t *)&source_size, &ret);
    clSafeCall(ret);

    // Build the program
    char buildOpts[255 * 8];
    char *pPath = NULL;
    pPath = getenv("OPS_INSTALL_PATH");
    if (pPath != NULL)
      if (OCL_FMA)
        sprintf(buildOpts, "-cl-mad-enable -DOCL_FMA -I%s/c/include "
                           "-DOPS_WARPSIZE=%d  "
                           "-Dxdim0_update_halo_kernel1_fr2=%d  "
                           "-Dydim0_update_halo_kernel1_fr2=%d  "
                           "-Dxdim1_update_halo_kernel1_fr2=%d  "
                           "-Dydim1_update_halo_kernel1_fr2=%d  "
                           "-Dxdim2_update_halo_kernel1_fr2=%d  "
                           "-Dydim2_update_halo_kernel1_fr2=%d  "
                           "-Dxdim3_update_halo_kernel1_fr2=%d  "
                           "-Dydim3_update_halo_kernel1_fr2=%d  "
                           "-Dxdim4_update_halo_kernel1_fr2=%d  "
                           "-Dydim4_update_halo_kernel1_fr2=%d  "
                           "-Dxdim5_update_halo_kernel1_fr2=%d  "
                           "-Dydim5_update_halo_kernel1_fr2=%d  "
                           "-Dxdim6_update_halo_kernel1_fr2=%d  "
                           "-Dydim6_update_halo_kernel1_fr2=%d ",
                pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3,
                ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6);
      else
        sprintf(buildOpts, "-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d  "
                           "-Dxdim0_update_halo_kernel1_fr2=%d  "
                           "-Dydim0_update_halo_kernel1_fr2=%d  "
                           "-Dxdim1_update_halo_kernel1_fr2=%d  "
                           "-Dydim1_update_halo_kernel1_fr2=%d  "
                           "-Dxdim2_update_halo_kernel1_fr2=%d  "
                           "-Dydim2_update_halo_kernel1_fr2=%d  "
                           "-Dxdim3_update_halo_kernel1_fr2=%d  "
                           "-Dydim3_update_halo_kernel1_fr2=%d  "
                           "-Dxdim4_update_halo_kernel1_fr2=%d  "
                           "-Dydim4_update_halo_kernel1_fr2=%d  "
                           "-Dxdim5_update_halo_kernel1_fr2=%d  "
                           "-Dydim5_update_halo_kernel1_fr2=%d  "
                           "-Dxdim6_update_halo_kernel1_fr2=%d  "
                           "-Dydim6_update_halo_kernel1_fr2=%d ",
                pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3,
                ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6);
    else {
      sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath);
      exit(EXIT_FAILURE);
    }

    ret = clBuildProgram(OPS_opencl_core.program, 1, &OPS_opencl_core.device_id,
                         buildOpts, NULL, NULL);

    if (ret != CL_SUCCESS) {
      char *build_log;
      size_t log_size;
      clSafeCall(clGetProgramBuildInfo(
          OPS_opencl_core.program, OPS_opencl_core.device_id,
          CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size));
      build_log = (char *)malloc(log_size + 1);
      clSafeCall(clGetProgramBuildInfo(
          OPS_opencl_core.program, OPS_opencl_core.device_id,
          CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL));
      build_log[log_size] = '\0';
      fprintf(
          stderr,
          "=============== OpenCL Program Build Info ================\n\n%s",
          build_log);
      fprintf(stderr,
              "\n========================================================= \n");
      free(build_log);
      exit(EXIT_FAILURE);
    }
    printf("compiling update_halo_kernel1_fr2 -- done\n");

    // Create the OpenCL kernel
    OPS_opencl_core.kernel[22] = clCreateKernel(
        OPS_opencl_core.program, "ops_update_halo_kernel1_fr2", &ret);
    clSafeCall(ret);

    isbuilt_update_halo_kernel1_fr2 = true;
  }
}
void 
kernel_gpu_opencl_wrapper_2(knode *knodes,
							long knodes_elem,
							long knodes_mem,

							int order,
							long maxheight,
							int count,

							long *currKnode,
							long *offset,
							long *lastKnode,
							long *offset_2,
							int *start,
							int *end,
							int *recstart,
							int *reclength)
{

	//======================================================================================================================================================150
	//	CPU VARIABLES
	//======================================================================================================================================================150

	// timer
	long long time0;
	long long time1;
	long long time2;
	long long time3;
	long long time4;
	long long time5;
	long long time6;

	time0 = get_time();

	//======================================================================================================================================================150
	//	GPU SETUP
	//======================================================================================================================================================150

	//====================================================================================================100
	//	INITIAL DRIVER OVERHEAD
	//====================================================================================================100

	// cudaThreadSynchronize();

	//====================================================================================================100
	//	COMMON VARIABLES
	//====================================================================================================100

	// common variables
	cl_int error;

	//====================================================================================================100
	//	GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE
	//====================================================================================================100

	// Get the number of available platforms
	cl_uint num_platforms;
	error = clGetPlatformIDs(	0, 
								NULL, 
								&num_platforms);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Get the list of available platforms
	cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
	error = clGetPlatformIDs(	num_platforms, 
								platforms, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Select the 1st platform
	cl_platform_id platform = platforms[0];

	// Get the name of the selected platform and print it (if there are multiple platforms, choose the first one)
	char pbuf[100];
	error = clGetPlatformInfo(	platform, 
								CL_PLATFORM_VENDOR, 
								sizeof(pbuf), 
								pbuf, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	printf("Platform: %s\n", pbuf);

	//====================================================================================================100
	//	CREATE CONTEXT FOR THE PLATFORM
	//====================================================================================================100

	// Create context properties for selected platform
	cl_context_properties context_properties[3] = {	CL_CONTEXT_PLATFORM, 
													(cl_context_properties) platform, 
													0};

	// Create context for selected platform being GPU
	cl_context context;
	context = clCreateContextFromType(	context_properties, 
										CL_DEVICE_TYPE_GPU, 
										NULL, 
										NULL, 
										&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE
	//====================================================================================================100

	// Get the number of devices (previousely selected for the context)
	size_t devices_size;
	error = clGetContextInfo(	context, 
								CL_CONTEXT_DEVICES, 
								0, 
								NULL, 
								&devices_size);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Get the list of devices (previousely selected for the context)
	cl_device_id *devices = (cl_device_id *) malloc(devices_size);
	error = clGetContextInfo(	context, 
								CL_CONTEXT_DEVICES, 
								devices_size, 
								devices, 
								NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Select the first device (previousely selected for the context) (if there are multiple devices, choose the first one)
	cl_device_id device;
	device = devices[0];

	// Get the name of the selected device (previousely selected for the context) and print it
	error = clGetDeviceInfo(device, 
							CL_DEVICE_NAME, 
							sizeof(pbuf), 
							pbuf, 
							NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);
	printf("Device: %s\n", pbuf);

	//====================================================================================================100
	//	CREATE COMMAND QUEUE FOR THE DEVICE
	//====================================================================================================100

	// Create a command queue
	cl_command_queue command_queue;
	command_queue = clCreateCommandQueue(	context, 
											device, 
											0, 
											&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//====================================================================================================100
	//	CREATE PROGRAM, COMPILE IT
	//====================================================================================================100

	// Load kernel source code from file
	const char *source = load_kernel_source("./kernel/kernel_gpu_opencl_2.cl");
	size_t sourceSize = strlen(source);

	// Create the program
	cl_program program = clCreateProgramWithSource(	context, 
													1, 
													&source, 
													&sourceSize, 
													&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	char clOptions[110];
	//  sprintf(clOptions,"-I../../src");                                                                                 
	sprintf(clOptions,"-I./../");

#ifdef DEFAULT_ORDER_2
	sprintf(clOptions + strlen(clOptions), " -DDEFAULT_ORDER_2=%d", DEFAULT_ORDER_2);
#endif

	// Compile the program
	error = clBuildProgram(	program, 
							1, 
							&device, 
							clOptions, 
							NULL, 
							NULL);
	// Print warnings and errors from compilation
	static char log[65536]; 
	memset(log, 0, sizeof(log));
	clGetProgramBuildInfo(	program, 
							device, 
							CL_PROGRAM_BUILD_LOG, 
							sizeof(log)-1, 
							log, 
							NULL);
	printf("-----OpenCL Compiler Output-----\n");
	if (strstr(log,"warning:") || strstr(log, "error:")) 
		printf("<<<<\n%s\n>>>>\n", log);
	printf("--------------------------------\n");
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Create kernel
	cl_kernel kernel;
	kernel = clCreateKernel(program, 
							"findRangeK", 
							&error);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	time1 = get_time();

	//====================================================================================================100
	//	END
	//====================================================================================================100

	//======================================================================================================================================================150
	//	GPU MEMORY				MALLOC
	//======================================================================================================================================================150

	//====================================================================================================100
	//	DEVICE IN
	//====================================================================================================100

	//==================================================50
	//	knodesD
	//==================================================50

	cl_mem knodesD;
	knodesD = clCreateBuffer(	context, 
								CL_MEM_READ_WRITE, 
								knodes_mem, 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	currKnodeD
	//==================================================50

	cl_mem currKnodeD;
	currKnodeD = clCreateBuffer(context, 
								CL_MEM_READ_WRITE, 
								count*sizeof(long), 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	offsetD
	//==================================================50

	cl_mem offsetD;
	offsetD = clCreateBuffer(	context, 
								CL_MEM_READ_WRITE, 
								count*sizeof(long), 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	lastKnodeD
	//==================================================50

	cl_mem lastKnodeD;
	lastKnodeD = clCreateBuffer(context, 
								CL_MEM_READ_WRITE, 
								count*sizeof(long), 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	offset_2D
	//==================================================50

	cl_mem offset_2D;
	offset_2D = clCreateBuffer(context, 
								CL_MEM_READ_WRITE, 
								count*sizeof(long), 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	startD
	//==================================================50

	cl_mem startD;
	startD = clCreateBuffer(context, 
								CL_MEM_READ_WRITE, 
								count*sizeof(int), 
								NULL, 
								&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	endD
	//==================================================50

	cl_mem endD;
	endD = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							count*sizeof(int), 
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	END
	//==================================================50

	//====================================================================================================100
	//	DEVICE IN/OUT
	//====================================================================================================100

	//==================================================50
	//	ansDStart
	//==================================================50

	cl_mem ansDStart;
	ansDStart = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							count*sizeof(int), 
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	ansDLength
	//==================================================50

	cl_mem ansDLength;
	ansDLength = clCreateBuffer(	context, 
							CL_MEM_READ_WRITE, 
							count*sizeof(int), 
							NULL, 
							&error );
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	time2 = get_time();

	//==================================================50
	//	END
	//==================================================50

	//====================================================================================================100
	//	END
	//====================================================================================================100

	//======================================================================================================================================================150
	//	GPU MEMORY			COPY
	//======================================================================================================================================================150

	//====================================================================================================100
	//	DEVICE IN
	//====================================================================================================100

	//==================================================50
	//	knodesD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									knodesD,				// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									knodes_mem,				// size to be copied
									knodes,					// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	currKnodeD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									currKnodeD,				// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(long),		// size to be copied
									currKnode,				// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	offsetD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									offsetD,				// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(long),		// size to be copied
									offset,					// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	lastKnodeD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									lastKnodeD,				// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(long),		// size to be copied
									lastKnode,				// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	offset_2D
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									offset_2D,				// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(long),		// size to be copied
									offset_2,				// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	startD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									startD,					// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(int),		// size to be copied
									start,					// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	endD
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									endD,					// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(int),		// size to be copied
									end,					// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	END
	//==================================================50

	//====================================================================================================100
	//	DEVICE IN/OUT
	//====================================================================================================100

	//==================================================50
	//	ansDStart
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									endD,					// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(int),		// size to be copied
									end,					// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	ansDLength
	//==================================================50

	error = clEnqueueWriteBuffer(	command_queue,			// command queue
									ansDLength,					// destination
									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
									0,						// offset in destination to write to
									count*sizeof(int),		// size to be copied
									reclength,					// source
									0,						// # of events in the list of events to wait for
									NULL,					// list of events to wait for
									NULL);					// ID of this operation to be used by waiting operations
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	time3 = get_time();

	//==================================================50
	//	END
	//==================================================50

	//======================================================================================================================================================150
	//	KERNEL
	//======================================================================================================================================================150

	//====================================================================================================100
	//	Execution Parameters
	//====================================================================================================100

	size_t local_work_size[1];
	local_work_size[0] = order < 1024 ? order : 1024;
	size_t global_work_size[1];
	global_work_size[0] = count * local_work_size[0];

	printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]);

	//====================================================================================================100
	//	Kernel Arguments
	//====================================================================================================100

	clSetKernelArg(	kernel, 
					0, 
					sizeof(long), 
					(void *) &maxheight);
	clSetKernelArg(	kernel, 
					1, 
					sizeof(cl_mem), 
					(void *) &knodesD);
	clSetKernelArg(	kernel, 
					2, 
					sizeof(long), 
					(void *) &knodes_elem);

	clSetKernelArg(	kernel, 
					3, 
					sizeof(cl_mem), 
					(void *) &currKnodeD);
	clSetKernelArg(	kernel, 
					4, 
					sizeof(cl_mem), 
					(void *) &offsetD);
	clSetKernelArg(	kernel, 
					5, 
					sizeof(cl_mem), 
					(void *) &lastKnodeD);
	clSetKernelArg(	kernel, 
					6, 
					sizeof(cl_mem), 
					(void *) &offset_2D);
	clSetKernelArg(	kernel, 
					7, 
					sizeof(cl_mem), 
					(void *) &startD);
	clSetKernelArg(	kernel, 
					8, 
					sizeof(cl_mem), 
					(void *) &endD);
	clSetKernelArg(	kernel, 
					9, 
					sizeof(cl_mem), 
					(void *) &ansDStart);
	clSetKernelArg(	kernel, 
					10, 
					sizeof(cl_mem), 
					(void *) &ansDLength);

	//====================================================================================================100
	//	Kernel
	//====================================================================================================100

	error = clEnqueueNDRangeKernel(	command_queue, 
									kernel, 
									1, 
									NULL, 
									global_work_size, 
									local_work_size, 
									0, 
									NULL, 
									NULL);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// Wait for all operations to finish NOT SURE WHERE THIS SHOULD GO
	error = clFinish(command_queue);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	time4 = get_time();

	//====================================================================================================100
	//	END
	//====================================================================================================100

	//======================================================================================================================================================150
	//	GPU MEMORY			COPY (CONTD.)
	//======================================================================================================================================================150

	//====================================================================================================100
	//	DEVICE IN/OUT
	//====================================================================================================100

	//==================================================50
	//	ansDStart
	//==================================================50

	error = clEnqueueReadBuffer(command_queue,				// The command queue.
								ansDStart,					// The image on the device.
								CL_TRUE,					// Blocking? (ie. Wait at this line until read has finished?)
								0,							// Offset. None in this case.
								count*sizeof(int),			// Size to copy.
								recstart,					// The pointer to the image on the host.
								0,							// Number of events in wait list. Not used.
								NULL,						// Event wait list. Not used.
								NULL);						// Event object for determining status. Not used.
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	//==================================================50
	//	ansDLength
	//==================================================50

	error = clEnqueueReadBuffer(command_queue,				// The command queue.
								ansDLength,					// The image on the device.
								CL_TRUE,					// Blocking? (ie. Wait at this line until read has finished?)
								0,							// Offset. None in this case.
								count*sizeof(int),			// Size to copy.
								reclength,					// The pointer to the image on the host.
								0,							// Number of events in wait list. Not used.
								NULL,						// Event wait list. Not used.
								NULL);						// Event object for determining status. Not used.
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	time5 = get_time();

	//==================================================50
	//	END
	//==================================================50

	//====================================================================================================100
	//	END
	//====================================================================================================100

	//======================================================================================================================================================150
	//	GPU MEMORY DEALLOCATION
	//======================================================================================================================================================150

	// Release kernels...
	clReleaseKernel(kernel);

	// Now the program...
	clReleaseProgram(program);

	// Clean up the device memory...
	clReleaseMemObject(knodesD);

	clReleaseMemObject(currKnodeD);
	clReleaseMemObject(offsetD);
	clReleaseMemObject(lastKnodeD);
	clReleaseMemObject(offset_2D);
	clReleaseMemObject(startD);
	clReleaseMemObject(endD);
	clReleaseMemObject(ansDStart);
	clReleaseMemObject(ansDLength);

	// Flush the queue
	error = clFlush(command_queue);
	if (error != CL_SUCCESS) 
		fatal_CL(error, __LINE__);

	// ...and finally, the queue and context.
	clReleaseCommandQueue(command_queue);

	// ???
	clReleaseContext(context);

	time6 = get_time();

	//======================================================================================================================================================150
	//	DISPLAY TIMING
	//======================================================================================================================================================150

	printf("Time spent in different stages of GPU_CUDA KERNEL:\n");

	printf("%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\n",	(float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100);
	printf("%15.12f s, %15.12f % : GPU MEM: ALO\n", 					(float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100);
	printf("%15.12f s, %15.12f % : GPU MEM: COPY IN\n",					(float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100);

	printf("%15.12f s, %15.12f % : GPU: KERNEL\n",						(float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100);

	printf("%15.12f s, %15.12f % : GPU MEM: COPY OUT\n",				(float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100);
	printf("%15.12f s, %15.12f % : GPU MEM: FRE\n", 					(float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100);

	printf("Total time:\n");
	printf("%.12f s\n", 												(float) (time6-time0) / 1000000);

	//======================================================================================================================================================150
	//	END
	//======================================================================================================================================================150

}
Ejemplo n.º 30
0
int main(int argc, char** argv)
{
  cl_device_type    DEV_TYPE = CL_DEVICE_TYPE_GPU;
  cl_platform_id    platform;
  cl_device_id      device;
  cl_context        context;
  cl_command_queue  command_queue;
  cl_program        program;
  cl_kernel         kernel;
  cl_mem            buffer_src;
  cl_mem            buffer_dst;
  cl_int            err;

  size_t local      = 4;
  size_t global     = local * 8;
  size_t SIZE       = global;

  err = clGetPlatformIDs(1, &platform, NULL);
  if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]\n", __FILE__, __LINE__, err); exit(EXIT_FAILURE); }

  cl_uint num_dev = 1;
  err = clGetDeviceIDs(platform, DEV_TYPE, num_dev, &device, &num_dev);
  //printf("clGetDeviceIDs : device = 0x%x\n", device);
  if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]\n", __FILE__, __LINE__, err); exit(EXIT_FAILURE); }

  if (num_dev < 1) exit(EXIT_FAILURE);

  int* host_src = (int*) calloc(SIZE, sizeof(int));
  for (int i = 0; i < SIZE; i++) {
    host_src[i] = -666; //i * 10;
  }

  int* host_dst = (int*) calloc(SIZE, sizeof(int));

  context = clCreateContext(0, num_dev, &device, NULL, NULL, &err);
  if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); }

  command_queue = clCreateCommandQueue(context, device, 0, &err);
  if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); }

  buffer_src = clCreateBuffer(context, CL_MEM_READ_ONLY, SIZE * sizeof(int), NULL, &err);
  if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); }

  buffer_dst = clCreateBuffer(context, CL_MEM_WRITE_ONLY, SIZE * sizeof(int), NULL, &err);
  if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); }

  size_t kernel_src_len = strlen(kernel_src);
  program = clCreateProgramWithSource(context, 1, (const char**) &kernel_src, &kernel_src_len, &err);
  if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); }

  err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
  if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); }

  /*fprintf(stderr, "%s(%d) : size = %d\n", "main", __LINE__, SIZE * sizeof(int));
  for(int i=0; i<16; i++){
    fprintf(stderr, "%s(%d) : ptr[%d] = %d\n", "main", __LINE__, i, *((int *)host_src + i));
  }
  for(int i=0; i<16; i++){
    fprintf(stderr, "%s(%d) : ptr[%d] = %d\n", "main", __LINE__, i, *((int *)host_src + 16 + i));
  }*/
  err = clEnqueueWriteBuffer(command_queue, buffer_src, CL_TRUE, 0, SIZE * sizeof(int), host_src, 0, NULL, NULL);
  if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); }

  //err = clEnqueueWriteBuffer(command_queue, buffer_dst, CL_TRUE, 0, SIZE * sizeof(int), host_dst, 0, NULL, NULL);
  //if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); }

  kernel = clCreateKernel(program, "sample", &err);
  if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); }

  err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*) &buffer_dst);
  if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); }

  err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &buffer_src);
  if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); }

  int offset = 100;
  err = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*) &offset);
  if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); }
  
  //global = global/2;
  err = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
  if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); }

  //size_t gwo = global;
  //err = clEnqueueNDRangeKernel(command_queue, kernel, 1, &gwo, &global, &local, 0, NULL, NULL);
  //if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); }

  //err = clFinish(command_queue);
  //if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); }

  //err = clEnqueueReadBuffer(command_queue, buffer_dst, CL_TRUE, 0, SIZE * sizeof(int), host_dst, 0, NULL, NULL);
  //if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); }

  err = clEnqueueReadBuffer(command_queue, buffer_src, CL_TRUE, 0, SIZE * sizeof(int), host_dst, 0, NULL, NULL);
  if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); }

  for (int i = 0; i < SIZE; i++) printf("[%2d] %d\n", i, host_dst[i]);

  //memset(host_src, 0, SIZE*sizeof(int));

  //err = clEnqueueWriteBuffer(command_queue, buffer_src, CL_TRUE, 0, SIZE * sizeof(int), host_src, 0, NULL, NULL);
  //if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); }

  //err = clEnqueueReadBuffer(command_queue, buffer_src, CL_TRUE, 0, SIZE * sizeof(int), host_dst, 0, NULL, NULL);
  //if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); }

  //for (int i = 0; i < SIZE; i++) printf("[%2d] %d\n", i, host_dst[i]);

  //err = clEnqueueReadBuffer(command_queue, buffer_src, CL_TRUE, 0, SIZE * sizeof(int), host_dst, 0, NULL, NULL);
  //if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); }

  //for (int i = 0; i < SIZE; i++) printf("[%2d] %d\n", i, host_dst[i]);

  free(host_src);
  free(host_dst);

  return 0;
}