cl_int buildCachedProgram(ocl_constructs * occs, char * filebase, const char * options){ cl_int error = 0; if(occs->program != NULL) return error; char binaryName[256]; binaryName[0] = '\0'; strcat(binaryName, filebase); strcat(binaryName, ".bin"); occs->program = CreateProgramFromBinary(occs->context, occs->deviceId, binaryName, options); if(occs->program == NULL){ occs->program = CreateProgram(occs->context, occs->deviceId, filebase, options); if(occs->program == NULL){ //Cleanup(context, commandQueue, program, kernel, memObjects); fprintf(stderr, "cachedProgram.c::buildCachedProgram failed to build program\n"); return 1; } if(SaveProgramBinary(occs->program, occs->deviceId, binaryName)){ fprintf(stderr, "cachedProgram.c::buildCachedProgram failed to save program binary\n"); //CLeanup return 2; } } return error; }
int main(int argc, char **argv) { printf("enter demo main\n"); fflush(stdout); putenv("POCL_VERBOSE=1"); putenv("POCL_DEVICES=basic"); putenv("POCL_LEAVE_TEMP_DIRS=1"); putenv("POCL_LEAVE_KERNEL_COMPILER_TEMP_FILES=1"); putenv("POCL_TEMP_DIR=pocl"); putenv("POCL_CACHE_DIR=pocl"); putenv("POCL_WORK_GROUP_METHOD=spmd"); if(argc >= 2){ printf("argv[1]:%s:\n",argv[1]); if(!strcmp(argv[1], "h")) putenv("POCL_WORK_GROUP_METHOD=spmd"); if(!strcmp(argv[1], "c")) putenv("POCL_CROSS_COMPILE=1"); } if(argc >= 3){ printf("argv[2]:%s:\n",argv[2]); if(!strcmp(argv[2], "h")) putenv("POCL_WORK_GROUP_METHOD=spmd"); if(!strcmp(argv[2], "c")) putenv("POCL_CROSS_COMPILE=1"); } //putenv("LD_LIBRARY_PATH=/scratch/colins/build/linux/fs/lib"); //putenv("LTDL_LIBRARY_PATH=/scratch/colins/build/linux/fs/lib"); //lt_dlsetsearchpath("/scratch/colins/build/linux/fs/lib"); //printf("SEARCH_PATH:%s\n",lt_dlgetsearchpath()); cl_platform_id platforms[100]; cl_uint platforms_n = 0; CL_CHECK(clGetPlatformIDs(100, platforms, &platforms_n)); printf("=== %d OpenCL platform(s) found: ===\n", platforms_n); for (int i=0; i<platforms_n; i++) { char buffer[10240]; printf(" -- %d --\n", i); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL)); printf(" PROFILE = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL)); printf(" VERSION = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL)); printf(" NAME = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL)); printf(" VENDOR = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL)); printf(" EXTENSIONS = %s\n", buffer); } if (platforms_n == 0) return 1; cl_device_id devices[100]; cl_uint devices_n = 0; // CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 100, devices, &devices_n)); CL_CHECK(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 100, devices, &devices_n)); printf("=== %d OpenCL device(s) found on platform:\n", platforms_n); for (int i=0; i<devices_n; i++) { char buffer[10240]; cl_uint buf_uint; cl_ulong buf_ulong; printf(" -- %d --\n", i); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL)); printf(" DEVICE_NAME = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VENDOR = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL)); printf(" DRIVER_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL)); printf(" DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong); } if (devices_n == 0) return 1; cl_context context; context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices+1, &pfn_notify, NULL, &_err)); cl_command_queue queue; queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[1], CL_QUEUE_PROFILING_ENABLE, &_err)); cl_kernel kernel = 0; cl_mem memObjects[3] = {0,0,0}; // Create OpenCL program - first attempt to load cached binary. // If that is not available, then create the program from source // and store the binary for future use. std::cout << "Attempting to create program from binary..." << std::endl; cl_program program = CreateProgramFromBinary(context, devices[1], "kernel.cl.bin"); if (program == NULL) { std::cout << "Binary not loaded, create from source..." << std::endl; program = CreateProgram(context, devices[1], "kernel.cl"); if (program == NULL) { Cleanup(context, queue, program, kernel, memObjects); return 1; } std::cout << "Save program binary for future run..." << std::endl; if (SaveProgramBinary(program, devices[1], "kernel.cl.bin") == false) { std::cerr << "Failed to write program binary" << std::endl; Cleanup(context, queue, program, kernel, memObjects); return 1; } } else { std::cout << "Read program from binary." << std::endl; } printf("attempting to create input buffer\n"); fflush(stdout); cl_mem input_bufferA; input_bufferA = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*NUM_DATA*NUM_DATA, NULL, &_err)); cl_mem input_bufferB; input_bufferB = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*NUM_DATA*NUM_DATA, NULL, &_err)); printf("attempting to create output buffer\n"); fflush(stdout); cl_mem output_buffer; output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float)*NUM_DATA*NUM_DATA, NULL, &_err)); memObjects[0] = input_bufferA; memObjects[1] = input_bufferB; memObjects[2] = output_buffer; size_t width = NUM_DATA; printf("attempting to create kernel\n"); fflush(stdout); kernel = CL_CHECK_ERR(clCreateKernel(program, "sgemm_single", &_err)); CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_bufferA), &input_bufferA)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(input_bufferB), &input_bufferB)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(output_buffer), &output_buffer)); CL_CHECK(clSetKernelArg(kernel, 3, sizeof(width), &width)); printf("attempting to enqueue write buffer\n"); fflush(stdout); for (int i=0; i<NUM_DATA*NUM_DATA; i++) { float in = ((float)rand()/(float)(RAND_MAX)) * 100.0; CL_CHECK(clEnqueueWriteBuffer(queue, input_bufferA, CL_TRUE, i*sizeof(float), 4, &in, 0, NULL, NULL)); in = ((float)rand()/(float)(RAND_MAX)) * 100.0; CL_CHECK(clEnqueueWriteBuffer(queue, input_bufferB, CL_TRUE, i*sizeof(float), 4, &in, 0, NULL, NULL)); } cl_event kernel_completion; const size_t local_work_size[3] = { 64, 1, 1}; // a_offset size_t global_work_size[3] = { NUM_DATA, NUM_DATA, NUM_DATA }; printf("attempting to enqueue kernel\n"); fflush(stdout); CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &kernel_completion)); printf("Enqueue'd kerenel\n"); fflush(stdout); cl_ulong time_start, time_end; CL_CHECK(clWaitForEvents(1, &kernel_completion)); CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL)); CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL)); double elapsed = time_end - time_start; printf("time(ns):%lg\n",elapsed); CL_CHECK(clReleaseEvent(kernel_completion)); printf("Result:"); for (int i=0; i<NUM_DATA*NUM_DATA; i++) { float data; CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(float), 4, &data, 0, NULL, NULL)); //printf(" %f", data); } printf("\n"); CL_CHECK(clReleaseMemObject(memObjects[0])); CL_CHECK(clReleaseMemObject(memObjects[1])); CL_CHECK(clReleaseMemObject(memObjects[2])); CL_CHECK(clReleaseKernel(kernel)); CL_CHECK(clReleaseProgram(program)); CL_CHECK(clReleaseContext(context)); return 0; }
int main(int argc, char *argv[]){ if (MODE == 5){ printf("---OpenCL Test Code---\n\n"); cl_int errNum; cl_uint numPlatforms; cl_platform_id *platforms = NULL; cl_uint numDevices; cl_device_id *devices = NULL; //platform info fields char vendor[1024], name[1024], version[1024]; //device info fields size_t MAX_WORK_GROUP_SIZE; cl_ulong GLOBAL_MEM_CACHE_SIZE, GLOBAL_MEM_SIZE, LOCAL_MEM_SIZE, GLOBAL_MEM_CACHELINE_SIZE; cl_uint MAX_COMPUTE_UNITS, MAX_WORK_ITEM_DIMENSIONS; size_t MAX_WORK_ITEM_SIZES[3]; char DEVICE_NAME[1024], DEVICE_VENDOR[1024], DEVICE_VERSION[1024], DRIVER_VERSION[1024], EXTENSIONS[2048]; cl_device_mem_cache_type GLOBAL_MEM_CACHE_TYPE; //printf("Getting number of OpenCL Platforms...\n"); errNum = clGetPlatformIDs(0, NULL, &numPlatforms); if (errNum != CL_SUCCESS) { printf("Failed to get number of OpenCL platforms.\n"); return 0; } else { //printf("found %d.\n", numPlatforms); } //printf("Allocating space for the platform info...\n"); platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id)); printf("---Platform Info---\n"); errNum = clGetPlatformIDs(numPlatforms, platforms, NULL); if (errNum != CL_SUCCESS) { printf("Failed to get platform info.\n"); return 0; } else { clGetPlatformInfo (platforms[0], CL_PLATFORM_VENDOR, sizeof(vendor), vendor, NULL); clGetPlatformInfo (platforms[0], CL_PLATFORM_NAME, sizeof(name), name, NULL); clGetPlatformInfo (platforms[0], CL_PLATFORM_VERSION, sizeof(version), version, NULL); //printf("Got platform info.\n"); printf("Vendor: \t%s\n", vendor); printf("Name: \t%s\n", name); printf("Version:\t%s\n", version); } //printf("Getting number of devices...\n"); errNum = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); if (errNum != CL_SUCCESS) { printf("Failed to get number of devices.\n"); return 0; } else { //printf("Found %d.\n", numDevices); } //printf("Allocating space for device info...\n"); devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id)); printf("\n---Device Info---"); errNum = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); if (errNum != CL_SUCCESS) { printf("Failed to get device info.\n"); return 0; } else { int i, j = 0; for (i = 0; i < numDevices; i++ ) { printf("\nDevice ID: %d\n", i+1); clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(DEVICE_NAME), DEVICE_NAME, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(DEVICE_VENDOR), DEVICE_VENDOR, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(DEVICE_VERSION), DEVICE_VERSION, NULL); clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(DRIVER_VERSION), DRIVER_VERSION, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_EXTENSIONS, sizeof(EXTENSIONS), EXTENSIONS, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(MAX_COMPUTE_UNITS), &MAX_COMPUTE_UNITS, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(GLOBAL_MEM_SIZE), &GLOBAL_MEM_SIZE, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(LOCAL_MEM_SIZE), &LOCAL_MEM_SIZE, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(MAX_WORK_ITEM_DIMENSIONS), &MAX_WORK_ITEM_DIMENSIONS, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(MAX_WORK_ITEM_SIZES), MAX_WORK_ITEM_SIZES, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(MAX_WORK_GROUP_SIZE), &MAX_WORK_GROUP_SIZE, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(GLOBAL_MEM_CACHE_SIZE), &GLOBAL_MEM_CACHE_SIZE, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(GLOBAL_MEM_CACHELINE_SIZE), &GLOBAL_MEM_CACHELINE_SIZE, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(GLOBAL_MEM_CACHE_TYPE), &GLOBAL_MEM_CACHE_TYPE, NULL); printf("Device Name:\t%s\n", DEVICE_NAME); printf("Device Vendor:\t%s\n", DEVICE_VENDOR); printf("Device Version:\t%s\n", DEVICE_VERSION); printf("Driver Version:\t%s\n", DRIVER_VERSION); printf("EXTENSIONS:\t%s\n", EXTENSIONS); printf("Number of CUs:\t%d\n", MAX_COMPUTE_UNITS); printf("GMem:\t\t%lld (Bytes)\n", (long long) GLOBAL_MEM_SIZE); printf("GMem $ Size:\t%lld (Bytes)\n", (long long) GLOBAL_MEM_CACHE_SIZE); printf("GMem $ Line:\t%lld (Bytes)\n", (long long) GLOBAL_MEM_CACHELINE_SIZE); if(GLOBAL_MEM_CACHE_TYPE == CL_NONE) { printf("GMem $ Type:\tCL_NONE\n"); } else if(GLOBAL_MEM_CACHE_TYPE == CL_READ_ONLY_CACHE) { printf("GMem $ Type:\tCL_READ_ONLY_CACHE\n"); } else if(GLOBAL_MEM_CACHE_TYPE == CL_READ_WRITE_CACHE) { printf("GMem $ Type:\tCL_READ_WRITE_CACHE\n"); } printf("LMem:\t\t%lld (Bytes)\n", (long long) LOCAL_MEM_SIZE); printf("Work Group Size:%d (Max)\n", (int) MAX_WORK_GROUP_SIZE); printf("Work Item Dim:\t%d (Max)\n", MAX_WORK_ITEM_DIMENSIONS); printf("Work Item Size:\t"); for(j = 0; j < MAX_WORK_ITEM_DIMENSIONS; j ++) { if (j != (MAX_WORK_ITEM_DIMENSIONS -1)) printf("%d, ", (int) MAX_WORK_ITEM_SIZES[j]); if (j == (MAX_WORK_ITEM_DIMENSIONS -1)) printf("%d ", (int) MAX_WORK_ITEM_SIZES[j]); } printf("(Max)\n"); } //printf("Got device info.\n"); } } else if (MODE == 4){ cl_context context = 0; cl_command_queue commandQueue = 0; cl_program program = 0; cl_device_id device = 0; //Create an OpenCL context on first available platform context = CreateContext(); if (context == NULL) { printf("Failed to create OpenCL context.\n"); return 1; } //Create a command-queue on the first device available on the created context commandQueue = CreateCommandQueue(context, &device); if (commandQueue == NULL) { printf("Failed to create commandQueue.\n"); Cleanup(context, commandQueue, program, NULL); return 1; } // Create OpenCL program and store the binary for future use. printf("Attempting to create kernel binary from source.\n"); program = CreateProgram(context, device, KERNELPATHIN); if (program == NULL) { printf("Failed to create Program"); Cleanup(context, commandQueue, program, NULL); return 1; } printf("Kernel is saved.\n"); if (SaveProgramBinary(program, device, KERNELPATHOUT) == false) { printf("Failed to write program binary.\n"); Cleanup(context, commandQueue, program, NULL); return 1; } //printf("---Done---"); //return 1; } else if (MODE == 3){ //todo free remaining objects not passed to cleanup //profiling int write_bytes = 0; int read_bytes = 0; /*unsigned long long start_cycles, stop_cycles; unsigned long long start_setup, stop_setup; unsigned long long start_write, stop_write; unsigned long long start_read, stop_read; unsigned long long start_finalize, stop_finalize; struct timespec start_time_t, stop_time_t;*/ printf("Stream Mode\n\n"); //clock_gettime(CLOCK_MONOTONIC, &start_time_t); //start_cycles = rdtsc(); int i; time_t t; srand((unsigned) time(&t)); // Create the two input vectors printf("\nHostside malloc(s)\n"); fflush(stdout); int *A = (int*)malloc(sizeof(int)*(SIZE*SIZE)); int *B = (int*)malloc(sizeof(int)*(SIZE*SIZE)); int *C = (int*)malloc(sizeof(int)*(SIZE*SIZE)); //profile //bytes += 3 * sizeof(int)*(SIZE*SIZE); printf("\nHostside mat init\n"); fflush(stdout); for(i = 0; i < (SIZE*SIZE); i++) { A[i] = B[i] = rand() % 10 + 1;; } //print matrix printf("Matrix A[%d][%d]:\n", SIZE, SIZE); for(i = 0; i < (SIZE*SIZE); i++) { printf("%3d ", A[i]); if(((i + 1) % SIZE) == 0) printf("\n"); } //print matrix printf("\nMatrix B[%d][%d]:\n", SIZE, SIZE); for(i = 0; i < (SIZE*SIZE); i++) { printf("%3d ", B[i]); if(((i + 1) % SIZE) == 0) printf("\n"); } //syscall(STATS_RESET); //Get platform and device information cl_context context = 0; cl_command_queue commandQueue = 0; cl_program program = 0; cl_device_id device = 0; cl_kernel kernel = 0; cl_uint err = 0; //char *filepath = NULL; //Create the context printf("\nCreateContext\n"); fflush(stdout); context = CreateContext(); if (context == NULL) { printf("Failed to create OpenCL context.\n"); return 1; } /* printf("\nEnd CreateContext\n"); fflush(stdout);*/ //Create a command-queue on the first device available on the created context printf("\nCreateCommandQueue\n"); fflush(stdout); commandQueue = CreateCommandQueue(context, &device); if (commandQueue == NULL) { printf("Failed to create command queue.\n"); Cleanup(context, commandQueue, program, NULL); return 1; } //create the program from the binary //program = CreateProgramFromBinary(context, device, "/home/stardica/Desktop/Kernels/vector.cl.bin.GPU"); //strcat(KERNELPATHOUT, ".GPU") printf("\nCreateProgramFromBinary\n"); fflush(stdout); program = CreateProgramFromBinary(context, device, KERNEL); if (program == NULL) { printf("Failed to load kernel binary,\n"); Cleanup(context, commandQueue, program, NULL); return 1; } // Create OpenCL kernel printf("\nclCreateKernel\n"); fflush(stdout); kernel = clCreateKernel(program, "Matrix", NULL); if (kernel == NULL) { printf("Failed to create kernel.\n"); Cleanup(context, commandQueue, program, NULL); return 1; } cl_mem a_mem_obj = 0; cl_mem b_mem_obj = 0; cl_mem c_mem_obj = 0; //Create memory buffers on the device for each vector printf("\nclCreateBuffer(s)\n"); fflush(stdout); if(LOCALMEM == 1 && CACHEDMEM == 0) { //this creates uncached buffers in the GPU's local memory #if M2S_CGM_OCL_SIM { a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); } #else { a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); } #endif } if(SYSMEM == 1 && CACHEDMEM == 0) { //this creates uncached buffers in the system memory #if M2S_CGM_OCL_SIM { a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); b_mem_obj = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); } #else { a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); b_mem_obj = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); } #endif } if(SYSMEM == 1 && CACHEDMEM == 1) { //this creates cached buffers in the system memory. #if M2S_CGM_OCL_SIM { a_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); b_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); c_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); } #else { a_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); b_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); c_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); } #endif } if (a_mem_obj == NULL || b_mem_obj == NULL || c_mem_obj == NULL) { printf("Failed to create memory objects.\n"); Cleanup(context, commandQueue, program, kernel); return 1; } //Copy the lists A and B to their respective memory buffers printf("\nclEnqueueWriteBuffer(s)\n"); fflush(stdout); write_bytes += 2 * sizeof(int)*(SIZE*SIZE); // start_write = rdtsc(); clEnqueueWriteBuffer(commandQueue, a_mem_obj, CL_TRUE, 0, (sizeof(int)*(SIZE*SIZE)), A, 0, NULL, NULL); clEnqueueWriteBuffer(commandQueue, b_mem_obj, CL_TRUE, 0, (sizeof(int)*(SIZE*SIZE)), B, 0, NULL, NULL); // stop_write = rdtsc(); // Set the arguments of the kernel int *size = (int *)SIZE; printf("\nclSetKernelArg(s)\n"); fflush(stdout); err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&c_mem_obj); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&a_mem_obj); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&b_mem_obj); err = clSetKernelArg(kernel, 3, sizeof(int), (void *)&size); if (err != CL_SUCCESS) { printf("Kernel args not set.\n"); return 1; } // Execute the OpenCL kernel on the list size_t GlobalWorkSize[2], LocalWorkSize[2]; //Rember that in OpenCL we need to express the globalWorkSize in //terms of the total number of threads. The underlying OpenCL API //will look at the globalWorkSize and divide by the localWorkSize //to arrive at a 64 by 64 NDRange of 16 by 16 work groups. GlobalWorkSize[0] = GWS_0;//SIZE*SIZE*SIZE; // Process the entire lists GlobalWorkSize[1] = GWS_1;//SIZE*SIZE*SIZE; // Process the entire lists LocalWorkSize[0] = LWS_0; //SIZE Divide work items into groups of 64 LocalWorkSize[1] = LWS_1; //SIZE Divide work items into groups of 64 //used null for local, lets OpenCL determine the best local size. //err = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL); printf("\nclEnqueueNDRangeKernel\n"); fflush(stdout); err = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("ND range not enqueued. Code: %d\n", err); return 1; } //Read the memory buffer C on the device to the local variable C printf("\nclEnqueueReadBuffer\n"); fflush(stdout); read_bytes += sizeof(int)*(SIZE*SIZE); //start_read = rdtsc(); err = clEnqueueReadBuffer(commandQueue, c_mem_obj, CL_TRUE, 0, (sizeof(int)*(SIZE*SIZE)), C, 0, NULL, NULL); // stop_read = rdtsc(); if (err != CL_SUCCESS) { printf("Buffer not returned.\n"); return 1; } //syscall(STATS_STOP); //print matrix printf("\nMatrix C[%d][%d] = A[%d][%d]*B[%d][%d]:\n", SIZE, SIZE, SIZE, SIZE, SIZE, SIZE); for(i = 0; i < (SIZE*SIZE); i++) { printf("%3d ", C[i]); if(((i + 1) % SIZE) == 0) printf("\n"); } printf("\nHostside clean up\n"); fflush(stdout); err = clFlush(commandQueue); err = clFinish(commandQueue); Cleanup(context, commandQueue, program, kernel); err = clReleaseMemObject(a_mem_obj); err = clReleaseMemObject(b_mem_obj); err = clReleaseMemObject(c_mem_obj); free(A); free(B); free(C); //printf("---Done---"); /*stop_cycles = rdtsc(); clock_gettime(CLOCK_MONOTONIC, &stop_time_t); printf("Total cycles = %llu\n", (stop_cycles - start_cycles)); long int time_s = stop_time_t.tv_nsec - start_time_t.tv_nsec; printf("Approximate runtime (check) = %ld ms\n", (time_s/1000000)); printf("Bytes written %d\n", write_bytes); printf("transfer cycles = %llu\n", (stop_write - start_write)); printf("start at = %llu\n", (start_write - start_cycles)); printf("Bytes read %d\n", read_bytes); printf("transfer cycles = %llu\n", (stop_read - start_read)); printf("start at = %llu\n", (start_read - start_cycles));*/ } else if (MODE == 2){ printf("Multi Thread Mode\n"); //cal this: //assignToThisCore(0);//assign to core 0,1,2,... unsigned long long a, b; int i = 0; int j = 0; int k = 0; LoadMatrices(); pthread_t tid[SIZE*SIZE]; //printf("waiting\n"); //start our threads a = rdtsc(); syscall(BEGIN_PARALLEL_SECTION); for(i=0;i<SIZE;i++){ for(j=0;j<SIZE;j++){ struct RowColumnData *RCData = (struct RowColumnData *) malloc(sizeof(struct RowColumnData)); RCData->RowNum = i; RCData->ColumnNum = j; //printf("Thread create %d Row %d Col %d\n", k, RCData->RowNum, RCData->ColumnNum); pthread_create(&tid[k], NULL, RowColumnMultiply, RCData); k++; } } //Join threads//////////////////////////// for (i=0;i<NUM_THREADS;i++) { pthread_join(tid[i], NULL); } syscall(END_PARALLEL_SECTION); b = rdtsc(); PrintMatrices(); //printf("\nend clock Cycles: %llu\n", b); printf("\nDone. Number of clock Cycles: %llu\n", b-a); } else if (MODE == 1) { printf("Single Thread Mode\n\n"); //unsigned long long a, b; //a = rdtsc(); //time_t t; int i,j,k; //srand((unsigned) time(&t)); LoadMatrices(); //multiply mats///////////////////////// for (i=0;i<SIZE;i++){ for(j=0;j<SIZE;j++){ for(k=0;k<SIZE;k++){ matC[i][j] = matC[i][j] + (matA[i][k] * matB[k][j]); } } } PrintMatrices(); //b = rdtsc(); //printf("\nDone. Number of clock Cycles: %llu\n", b-a); } else if (MODE == 0) { printf("---Misc Tests---\n\n"); printf("size of long long is %d\n", (int) sizeof(long long)); printf("size of long is %d\n", (int) sizeof(long)); printf("size of int is %d\n", (int) sizeof(int)); printf("size of short is %d\n", (int) sizeof(short)); printf("size of char * %d\n", (int) sizeof(char *)); printf("size of unsigned int (word) %d\n", (int) sizeof(unsigned int)); char *string = "test string"; printf("Here is the string 1: \"%s\"\n", string); //Using the struct //set string variable and point to print_me. object.string = strdup(string); object.print_me = (void (*)(void *)) print_me; //use of print_me object.print_me(object.string); //pointer fun struct Object *ptr = &object; printf("this is the value of the pointer to struct object: %p\n", ptr); object.next=&object; printf("this is the value of the pointer to struct object: %p\n", object.next); object_ptr = &object; object_ptr->next = &object; printf("this is the value of the pointer to struct object: %p\n", object_ptr->next); //Macro fun PRINT(ptr, ptr); PRINT(object.next, object.next); PRINT(object_ptr->next, object_ptr->next); int mmu_page_size = 1 << 12; printf("mmu_papge_size = %d\n", mmu_page_size); //setjmp and longjmp fun /*jmp_buf environment; int i; i = setjmp(environment); printf("\n\nsetjmp returned = %d\n", i); printf("Env 1:\n"); int x = 0; for(x = 0; x < 6; x++) { printf(" %x\n", environment[x]); } if (i < 3) { longjmp(environment, 3); } printf("longjmp finished with i = %d\n", i);*/ } else { printf("---Invalid Mode Set---\n\n"); } printf("\n---Done---\n"); return 1; }