/*! \brief Creates cl_command_queue for this Queue. * * This is only needed when the Queue is not instantiated with a Device and a Queue. * * \param ctxt Context for which this Queue will run. If not specified the set context will be taken. */ void ocl::Queue::create(ocl::Context * ctxt) { if(ctxt == 0){ if(_context == nullptr) throw std::runtime_error("this queue must have a valid context"); } else { if(_context != ctxt && _context != nullptr) throw std::runtime_error("cannot have different contexts for the same program"); _context = ctxt; } cl_int status; #if CL_VERSION_2_0 if ( supportsAtLeast2Point0( device().platform() ) ) { cl_queue_properties propties[] = { CL_QUEUE_PROPERTIES, this->properties(), 0 }; _id = clCreateCommandQueueWithProperties( this->context().id(), this->device().id(), propties, &status ); } else #endif { _id = clCreateCommandQueue(this->context().id(), this->device().id(), this->properties(), &status); } OPENCL_SAFE_CALL(status); if(_id == nullptr) throw std::runtime_error("could not create command queue"); _context->insert(this); }
EasyOpenCL<T>::EasyOpenCL(bool printData) { info = printData; cl_uint numPlatforms; //the NO. of platforms // Fetch the different platforms on which we can run our kernel cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); checkError("clGetPlatformIDs"); // Take the first platform available if (numPlatforms > 0) { cl_platform_id* platforms = (cl_platform_id*) malloc(numPlatforms * sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); platform = platforms[0]; free(platforms); } // Get the devices which are available on said platform cl_uint numDevices = 0; status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); if (numDevices) { //Use the first GPU available devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id)); status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); } else { // If there is no GPU support, fall back to the CPU if(info) { std::cout << "No supported GPU device available." << std::endl; std::cout << "Falling back to using the CPU." << std::endl; std::cout << std::endl; } status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices); devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id)); status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, numDevices, devices, NULL); } //Print the data of the selected device if (info) { printDeviceProperty(*devices); } //Create an OpenCL context and a command queue context = clCreateContext(NULL, 1, devices, NULL, NULL, &status); checkError("clCreateContext"); commandQueue = clCreateCommandQueueWithProperties(context, devices[0], 0, &status); checkError("clCreateCommandQueueWithProperties"); }
EXTERN_C_ENTER JNIEXPORT jlong JNICALL Java_org_lwjgl_opencl_CL20_nclCreateCommandQueueWithProperties(JNIEnv *__env, jclass clazz, jlong contextAddress, jlong deviceAddress, jlong propertiesAddress, jlong errcode_retAddress, jlong __functionAddress) { cl_context context = (cl_context)(intptr_t)contextAddress; cl_device_id device = (cl_device_id)(intptr_t)deviceAddress; const cl_command_queue_properties *properties = (const cl_command_queue_properties *)(intptr_t)propertiesAddress; cl_int *errcode_ret = (cl_int *)(intptr_t)errcode_retAddress; clCreateCommandQueueWithPropertiesPROC clCreateCommandQueueWithProperties = (clCreateCommandQueueWithPropertiesPROC)(intptr_t)__functionAddress; UNUSED_PARAMS(__env, clazz) return (jlong)(intptr_t)clCreateCommandQueueWithProperties(context, device, properties, errcode_ret); }
static cl_command_queue skc_runtime_cl_12_create_cq(struct skc_runtime * const runtime, struct skc_cq_pool * const pool) { cl_command_queue cq; #if 1 // // <= OpenCL 1.2 // cl_int cl_err; cq = clCreateCommandQueue(runtime->cl.context, runtime->cl.device_id, pool->cq_props, &cl_err); cl_ok(cl_err); #else if (runtime_cl->version.major < 2) { // // <= OpenCL 1.2 // cl_int cl_err; cq = clCreateCommandQueue(runtime_cl->context, runtime_cl->device_id, (cl_command_queue_properties)type, &cl_err); cl_ok(cl_err); } else { // // >= OpenCL 2.0 // cl_int cl_err; cl_queue_properties const queue_properties[] = { CL_QUEUE_PROPERTIES,(cl_queue_properties)type,0 }; cq = clCreateCommandQueueWithProperties(runtime_cl->context, runtime_cl->device_id, queue_properties, &cl_err); cl_ok(cl_err); } #endif return cq; }
void LSHReservoirSampler::clCommandQueue() { // Create command queue.Properties(2): CL_QUEUE_PROFILING_ENABLE, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE. #ifdef OPENCL_2XX command_queue_gpu = clCreateCommandQueueWithProperties(context_gpu, devices_gpu[CL_DEVICE_ID], NULL, &_err); clCheckError(_err, "[OpenCL] Couldn't create command queue for GPU."); //command_queue_cpu = clCreateCommandQueueWithProperties(context_cpu, devices_cpu[CL_CPU_DEVICE], NULL, &_err); //clCheckError(_err, "[OpenCL] Couldn't create command queue for CPU."); #else command_queue_gpu = clCreateCommandQueue(context_gpu, devices_gpu[CL_DEVICE_ID], NULL, &_err); clCheckError(_err, "[OpenCL] Couldn't create command queue for GPU."); //command_queue_cpu = clCreateCommandQueue(context_cpu, devices_cpu[CL_CPU_DEVICE], NULL, &_err); //clCheckError(_err, "[OpenCL] Couldn't create command queue for CPU."); #endif }
OpenCLFramework<T>::OpenCLFramework(bool printData) { info = printData; cl_uint numPlatforms; //the NO. of platforms cl_platform_id platform = NULL; //the chosen platform status = clGetPlatformIDs(0, NULL, &numPlatforms); checkError("clGetPlatformIDs"); //Just take the first platform available if (numPlatforms > 0) { cl_platform_id* platforms = (cl_platform_id*)malloc(numPlatforms* sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); platform = platforms[0]; free(platforms); } //Try to get the GPU, if not available, take the CPU cl_uint numDevices = 0; status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); checkError("clGetDeviceIDs"); if (numDevices == 0) //no GPU available. { std::cout << "No GPU device available." << std::endl; std::cout << "Choose CPU as default device." << std::endl; status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices); devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id)); status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, numDevices, devices, NULL); } else { //Pick the GPU devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id)); status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); } //Print the data about the picked device if (info) { printDeviceProperty(*devices); } //Create an OpenCL context and a command queue context = clCreateContext(NULL, 1, devices, NULL, NULL, &status); checkError("clCreateContext"); commandQueue = clCreateCommandQueueWithProperties(context, devices[0], 0, &status); checkError("clCreateCommandQueueWithProperties"); }
cl_int set_kernel(int did, cl_prop *prop) { cl_int status; prop->context = clCreateContext(0, prop->num_devices, (const cl_device_id *)prop->devices, NULL, NULL, &status); prop->queue = clCreateCommandQueueWithProperties(prop->context, prop->devices[did], 0, &status); prop->program = clCreateProgramWithSource(prop->context, prop->kcode.count, (const char **)prop->kcode.codes, NULL, &status); const char *options = "-I./include"; status = clBuildProgram(prop->program, prop->num_devices, (const cl_device_id *)prop->devices, options, NULL, NULL); if(status != CL_SUCCESS) { printf("%s[Build Error Log]%s\n", ERR_STR, CLR_STR); } else { printf("%s[Build Log]%s\n", WHT_STR, CLR_STR); } print_build_log(did, prop); if(status != CL_SUCCESS) getchar(); prop->gabor = clCreateKernel(prop->program, (const char *)"enable_gabor", NULL); prop->pooling = clCreateKernel(prop->program, (const char *)"enable_pooling", NULL); prop->feature = clCreateKernel(prop->program, (const char *)"feature_rfcn", NULL); prop->cls = clCreateKernel(prop->program, (const char *)"class_rfcn", NULL); return status; }
int main(int argc, char *argv[]) { cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue command_queue; cl_program program; cl_kernel kernel; cl_mem buffer; cl_int error; cl_event event; cl_ulong startTime, endTime; size_t globalSize[1], localSize[1], warpSize; FILE* fptr; unsigned long long start, end; void* hostData = NULL; /* Parse options */ CommandParser(argc, argv); HostDataCreation(hostData); GetPlatformAndDevice(platform, device); fptr = fopen(g_opencl_ctrl.powerFile, "a"); /* Create context */ context = clCreateContext(NULL, 1, &device, NULL, NULL, &error); CHECK_CL_ERROR(error); /* Create command queue */ #ifdef USE_CL_2_0_API { cl_queue_properties property[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; command_queue = clCreateCommandQueueWithProperties(context, device, property, &error); } #else { command_queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &error); } #endif CHECK_CL_ERROR(error); /* Create program */ CreateAndBuildProgram(program, context, device, strdup(g_opencl_ctrl.fileName)); /* Create kernels */ kernel = clCreateKernel(program, g_opencl_ctrl.kernelName, &error); CHECK_CL_ERROR(error); error = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &warpSize, NULL); CHECK_CL_ERROR(error); fprintf(stderr, "Preferred work group size: %lu\n", warpSize); #if 0 fprintf(stderr, "\nData before process:\n"); switch (g_opencl_ctrl.dataType) { case TYPE_INT: { int *intptr = (int *)(hostData); for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++) fprintf(stderr, "%d ", intptr[i]); fprintf(stderr, "\n"); } break; case TYPE_FLOAT: { float *fltptr = (float *)(hostData); for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++) fprintf(stderr, "%f ", fltptr[i]); fprintf(stderr, "\n"); } break; case TYPE_DOUBLE: { double *dblptr = (double *)(hostData); for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++) fprintf(stderr, "%lf ", dblptr[i]); fprintf(stderr, "\n"); } break; } #endif /* Create buffers */ buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, g_opencl_ctrl.dataByte, hostData, &error); CHECK_CL_ERROR(error); /* Execute kernels */ error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer); CHECK_CL_ERROR(error); error = clSetKernelArg(kernel, 1, sizeof(long), &g_opencl_ctrl.iteration); CHECK_CL_ERROR(error); error = clSetKernelArg(kernel, 2, sizeof(int), &g_opencl_ctrl.interval); CHECK_CL_ERROR(error); start = PrintTimingInfo(fptr); globalSize[0] = g_opencl_ctrl.global_size; localSize[0] = g_opencl_ctrl.local_size; error = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, globalSize, localSize, 0, NULL, &event); CHECK_CL_ERROR(error); error = clFinish(command_queue); CHECK_CL_ERROR(error); end = PrintTimingInfo(fptr); fclose(fptr); error = clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, g_opencl_ctrl.dataByte, hostData, 0, NULL, NULL); CHECK_CL_ERROR(error); #if 0 fprintf(stderr, "\nData after process:\n"); switch (g_opencl_ctrl.dataType) { case TYPE_INT: { int *intptr = (int *)(hostData); for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++) fprintf(stderr, "%d ", intptr[i]); fprintf(stderr, "\n"); } break; case TYPE_FLOAT: { float *fltptr = (float *)(hostData); for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++) fprintf(stderr, "%f ", fltptr[i]); fprintf(stderr, "\n"); } break; case TYPE_DOUBLE: { double *dblptr = (double *)(hostData); for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++) fprintf(stderr, "%lf ", dblptr[i]); fprintf(stderr, "\n"); } break; } #endif /* Event profiling */ error = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL); CHECK_CL_ERROR(error); error = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(endTime), &endTime, NULL); CHECK_CL_ERROR(error); fprintf(stderr, "\n['%s' execution time] %llu ns\n", g_opencl_ctrl.kernelName, (end - start) * 1000); fprintf(stdout, "%llu\n", (end - start) * 1000); /* Read the output */ /* Release object */ clReleaseKernel(kernel); clReleaseMemObject(buffer); clReleaseEvent(event); clReleaseProgram(program); clReleaseCommandQueue(command_queue); clReleaseContext(context); free(hostData); return 0; }
/** * initialize OpenCL device */ int cl_init(int num_values, mvalue_ptr *values, int num_members, member *members, int metric_type) { int i, j; #ifdef _VERBOSE char string_one[128]; char string_two[128]; char string[256]; #endif // _VERBOSE int platform_index = 0; int device_index = 0; const char *source = NULL; population = num_members; segments = num_values; act_metric = metric_type; cl_int err; cl_uint platformCount; cl_uint deviceCount; cl_context_properties properties[3]; // Probe platforms clGetPlatformIDs(0, NULL, &platformCount); platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * platformCount); clGetPlatformIDs(platformCount, platforms, NULL); #ifdef _VERBOSE for (i = 0; i < platformCount; i++) { printf("platform %d\n", i); // get all devices clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount); clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL); for (j = 0; j < deviceCount; j++) { clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 128, string_one, NULL); clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 128, string_two, NULL); sprintf(string, "%s (version %s)", string_one, string_two); printf(" device %d: %s\n", j, string); } free(devices); } #endif // _VERBOSE if (platformCount == 0) { fprintf(stderr, "OpenCL platform not found\n"); return OPENCL_ERROR; } // ASK user do { #ifdef _VERBOSE puts("platform number: "); fgets((char *) string, 7, stdin); i = strtol(string, NULL, 10); #else i = 0; #endif } while (i >= platformCount); platform_index = i; // get all devices clGetDeviceIDs(platforms[platform_index], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount); clGetDeviceIDs(platforms[platform_index], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL); do { #ifdef _VERBOSE puts("device number: "); fgets((char *) string, 7, stdin); j = strtol(string, NULL, 10); #else j = 0; #endif } while (j >= deviceCount); device_index = j; // load values to dynamic memory for (i = 0; i < segments; i++) max_seg_vals = max_seg_vals > values[i].cvals ? max_seg_vals : values[i].cvals; mvalue *seg_vals = (mvalue *) malloc(sizeof(mvalue) * max_seg_vals * segments); memset(seg_vals, 0, sizeof(mvalue) * max_seg_vals * segments); // initialize for (i = 0; i < segments; i++) memcpy(seg_vals + i * max_seg_vals, values[i].vals, sizeof(mvalue) * values[i].cvals); // create lenghts array int *lenghts = (int *) malloc(sizeof(int) * segments); for (i = 0; i < segments; i++) lenghts[i] = values[i].cvals; // read kernels source = read_source_file("fitness.cl"); // context properties list - must be terminated with 0 properties[0]= CL_CONTEXT_PLATFORM; // specifies the platform to use properties[1]= (cl_context_properties) platforms[platform_index]; properties[2]= 0; // create context context = clCreateContext(properties,deviceCount,devices,NULL,NULL,&err); if (err != CL_SUCCESS) { printf("chyba ve vytváření kontextu %d\n", err); } // create command queue command_queue = clCreateCommandQueueWithProperties(context, devices[device_index], 0, &err); if (err != CL_SUCCESS) { printf("chyba ve vytváření fronty úloh %d\n", err); } program = clCreateProgramWithSource(context, 1, &source, 0, &err); err = clBuildProgram(program, 1, devices + device_index, "-I.", NULL, NULL); if (err != CL_SUCCESS) { // Determine the size of the log size_t log_size; clGetProgramBuildInfo(program, devices[device_index], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); // Allocate memory for the log char *log = (char *) malloc(log_size); // Get the log clGetProgramBuildInfo(program, devices[device_index], CL_PROGRAM_BUILD_LOG, log_size, log, NULL); // Print the log printf("%s\n", log); free(log); clReleaseCommandQueue(command_queue); clReleaseContext(context); free(devices); free(platforms); return 1; } // specify which kernel from the program to execute kernel_population = clCreateKernel(program, "kernel_population", &err); kernel_equation = clCreateKernel(program, "solve_equation", &err); kernel_avg = clCreateKernel(program, "solve_avg", &err); free((void *) source); buf_seg_vals = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(mvalue) * max_seg_vals * segments, seg_vals, NULL); buf_lenghts = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * segments, lenghts, NULL); buf_members = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_float16) * population, members, NULL); buf_members_new = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_float16) * population, members, NULL); buf_seg_vals_res = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * max_seg_vals * segments * population, NULL, NULL); free(seg_vals); free(lenghts); // set the argument list for the kernel command clSetKernelArg(kernel_population, 0, sizeof(cl_mem), &buf_members); clSetKernelArg(kernel_population, 1, sizeof(cl_mem), &buf_members_new); clSetKernelArg(kernel_equation, 0, sizeof(int), &segments); clSetKernelArg(kernel_equation, 1, sizeof(cl_mem), &buf_seg_vals); clSetKernelArg(kernel_equation, 2, sizeof(cl_mem), &buf_lenghts); clSetKernelArg(kernel_equation, 3, sizeof(int), &population); clSetKernelArg(kernel_equation, 4, sizeof(cl_mem), &buf_members_new); clSetKernelArg(kernel_equation, 5, sizeof(cl_mem), &buf_seg_vals_res); clSetKernelArg(kernel_equation, 6, sizeof(char), &act_metric); clSetKernelArg(kernel_avg, 0, sizeof(int), &max_seg_vals); clSetKernelArg(kernel_avg, 1, sizeof(int), &segments); clSetKernelArg(kernel_avg, 2, sizeof(cl_mem), &buf_seg_vals_res); clSetKernelArg(kernel_avg, 3, sizeof(cl_mem), &buf_lenghts); clSetKernelArg(kernel_avg, 4, sizeof(cl_mem), &buf_members); clSetKernelArg(kernel_avg, 5, sizeof(cl_mem), &buf_members_new); clSetKernelArg(kernel_avg, 6, sizeof(char), &act_metric); three_dim[0] = max_seg_vals; three_dim[1] = segments; three_dim[2] = population; one_dim[0] = population; return 0; }
ErrorStatus gemm_clblas(cl_device_id device, const void *inMatrixA, int nrowA, int ncolA, bool transposeA, const void *inMatrixB, int nrowB, int ncolB, bool transposeB, double alpha, double beta, void *outMatrix, bool use_float) { std::stringstream result; float *input_matrixA_f = (float *)inMatrixA; float *input_matrixB_f = (float *)inMatrixB; float *output_matrix_f = (float *)outMatrix; double *input_matrixA_d = (double *)inMatrixA; double *input_matrixB_d = (double *)inMatrixB; double *output_matrix_d = (double *)outMatrix; if (debug) { result << "gemm_clblas( " << (use_float ? "FLOAT" : "DOUBLE") << ")" << std::endl << std::endl; } cl_int err = CL_SUCCESS; clblasStatus status = clblasSetup(); if (status != CL_SUCCESS) { if (debug) { result << "clblasSetup: " << clblasErrorToString(status) << std::endl; } err = CL_INVALID_OPERATION; } // get first platform cl_platform_id platform = NULL; if (err == CL_SUCCESS) { err = clGetPlatformIDs(1, &platform, NULL); } if (debug && err == CL_SUCCESS) { result << "Platform: " << getPlatformInfoString(platform, CL_PLATFORM_NAME) << std::endl; result << "Device: " << getDeviceInfoString(device, CL_DEVICE_NAME) << std::endl; } // context cl_context context = NULL; if (err == CL_SUCCESS) { if (debug) { result << "clCreateContext:" << std::endl; } context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); } // queue cl_command_queue queue = NULL; if (err == CL_SUCCESS) { #ifdef CL_VERSION_2_0 if (debug) { result << "clCreateCommandQueueWithProperties:" << std::endl; } queue = clCreateCommandQueueWithProperties(context, device, NULL, &err); #else if (debug) { result << "clCreateCommandQueue:" << std::endl; } queue = clCreateCommandQueue(context, device, 0, &err); #endif } // buffers cl_mem cl_input_matrixA = NULL; if (err == CL_SUCCESS) { if (debug) { result << "clCreateBuffer cl_input_matrixA:" << std::endl; } if (use_float) { cl_input_matrixA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nrowA * ncolA * sizeof(float), input_matrixA_f, &err); } else { cl_input_matrixA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nrowA * ncolA * sizeof(double), input_matrixA_d, &err); } } cl_mem cl_input_matrixB = NULL; if (err == CL_SUCCESS) { if (debug) { result << "clCreateBuffer cl_input_matrixB:" << std::endl; } if (use_float) { cl_input_matrixB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nrowB * ncolB * sizeof(float), input_matrixB_f, &err); } else { cl_input_matrixB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nrowB * ncolB * sizeof(double), input_matrixB_d, &err); } } int nrowC = transposeA ? ncolA : nrowA; int ncolC = transposeB ? nrowB : ncolB; cl_mem cl_output_matrix = NULL; if (err == CL_SUCCESS) { if (debug) { result << "clCreateBuffer cl_output_vector:" << std::endl; } if (use_float) { cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, nrowC * ncolC * sizeof(float), output_matrix_f, &err); } else { cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, nrowC * ncolC * sizeof(double), output_matrix_d, &err); } } // ++++++++++++ const int lda = nrowA; // first dimension of A (rows), before any transpose const int ldb = nrowB; // first dimension of B (rows), before any transpose const int ldc = nrowC; // first dimension of C (rows) const int M = transposeA ? ncolA : nrowA; // rows in A (after transpose, if any) and C const int N = transposeB ? nrowB : ncolB; // cols in B (after transpose, if any) and C const int K = transposeA ? nrowA : ncolA; // cols in A and rows in B (after transposes, if any) const clblasOrder order = clblasColumnMajor; const clblasTranspose transA = transposeA ? clblasTrans : clblasNoTrans; const clblasTranspose transB = transposeB ? clblasTrans : clblasNoTrans; cl_event event = NULL; if (err == CL_SUCCESS) { if (use_float) { if (debug) { result << "clblasSgemm:" << std::endl; } status = clblasSgemm(order, transA, transB, M, N, K, alpha, cl_input_matrixA, 0, lda, cl_input_matrixB, 0, ldb, beta, cl_output_matrix, 0, ldc, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS && debug) { result << "clblasSgemm error:" << clblasErrorToString(status) << std::endl; } } else { if (debug) { result << "clblasDgemm:" << std::endl; } status = clblasDgemm(order, transA, transB, M, N, K, alpha, cl_input_matrixA, 0, lda, cl_input_matrixB, 0, ldb, beta, cl_output_matrix, 0, ldc, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { if (debug) { result << "clblasDgemm error:" << clblasErrorToString(status) << std::endl; } err = status; } } } if (err == CL_SUCCESS) { /* Wait for calculations to be finished. */ if (debug) { result << "clWaitForEvents:" << std::endl; } err = clWaitForEvents(1, &event); } // retrieve result if (err == CL_SUCCESS) { if (debug) { result << "Retrieve result:" << std::endl; } if (use_float) { clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, nrowC * ncolC * sizeof(float), output_matrix_f, 0, NULL, NULL); } else { clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, nrowC * ncolC * sizeof(double), output_matrix_d, 0, NULL, NULL); } } std::string err_str = clErrorToString(err); result << std::endl << err_str << std::endl; // cleanup clReleaseMemObject(cl_output_matrix); cl_output_matrix = NULL; clReleaseMemObject(cl_input_matrixA); cl_input_matrixA = NULL; clReleaseMemObject(cl_input_matrixB); cl_input_matrixB = NULL; clReleaseCommandQueue(queue); queue = NULL; clReleaseContext(context); context = NULL; if (debug) { CERR << result.str(); } ErrorStatus errorStatus = { err, status }; return errorStatus; }
enum piglit_result piglit_cl_test(const int argc, const char** argv, const struct piglit_cl_api_test_config* config, const struct piglit_cl_api_test_env* env) { enum piglit_result result = PIGLIT_PASS; int i; int mask; cl_int errNo; cl_context cl_ctx; cl_command_queue command_queue; cl_uint num_devices; cl_device_id* devices; cl_command_queue_properties mixed_command_queue_properties[4] = {CL_QUEUE_PROPERTIES, 0, 0, 0}; cl_context_properties context_properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)env->platform_id, 0 }; int num_command_queue_properties = PIGLIT_CL_ENUM_NUM(cl_command_queue_properties, env->version); const cl_command_queue_properties* command_queue_properties = PIGLIT_CL_ENUM_ARRAY(cl_command_queue_properties); /*** Normal usage ***/ /* create context */ cl_ctx = clCreateContext(context_properties, 1, &env->device_id, NULL, NULL, &errNo); if(errNo == CL_DEVICE_NOT_FOUND) { fprintf(stderr, "No available devices.\n"); return PIGLIT_SKIP; } if(!piglit_cl_check_error(errNo, CL_SUCCESS)) { fprintf(stderr, "Failed (error code: %s): Create context.\n", piglit_cl_get_error_name(errNo)); return PIGLIT_FAIL; } /* * For each command queue properties mix. * There are 2^(num_command_queue_properties)-1 possible options. */ for(mask = 0; mask < (1 << num_command_queue_properties); mask++) { mixed_command_queue_properties[1] = get_mixed_command_queue_properties(mask, command_queue_properties); if (properties_forbidden(mixed_command_queue_properties[1], env)) continue; #if defined CL_VERSION_2_0 if (env->version >= 20) { command_queue = clCreateCommandQueueWithProperties( cl_ctx, env->device_id, mixed_command_queue_properties, &errNo); } else #endif //CL_VERSION_2_0 { command_queue = clCreateCommandQueue(cl_ctx, env->device_id, mixed_command_queue_properties[1], &errNo); } if(errNo != CL_SUCCESS && errNo != CL_INVALID_QUEUE_PROPERTIES) { piglit_cl_check_error(errNo, CL_SUCCESS); fprintf(stderr, "Failed (error code: %s): Create command queue using 0x%X as command queue properties.\n", piglit_cl_get_error_name(errNo), (unsigned int)mixed_command_queue_properties[1]); piglit_merge_result(&result, PIGLIT_FAIL); } clReleaseCommandQueue(command_queue); } /*** Errors ***/ /* * CL_INVALID_CONTEXT if context is not a valid context. */ clCreateCommandQueue(NULL, env->device_id, 0, &errNo); if(!piglit_cl_check_error(errNo, CL_INVALID_CONTEXT)) { fprintf(stderr, "Failed (error code: %s): Trigger CL_INVALID_CONTEXT if contest is not a valid context.\n", piglit_cl_get_error_name(errNo)); piglit_merge_result(&result, PIGLIT_FAIL); } /* * CL_INVALID_DEVICE if device is not a valid device or is * not associated with context. */ clCreateCommandQueue(cl_ctx, NULL, 0, &errNo); if(!piglit_cl_check_error(errNo, CL_INVALID_DEVICE)) { fprintf(stderr, "Failed (error code: %s): Trigger CL_INVALID_DEVICE if device is not a valid device.\n", piglit_cl_get_error_name(errNo)); piglit_merge_result(&result, PIGLIT_FAIL); } num_devices = piglit_cl_get_device_ids(env->platform_id, CL_DEVICE_TYPE_ALL, &devices); for(i = 0; i < num_devices; i++) { if(devices[i] != env->device_id) { clCreateCommandQueue(cl_ctx, devices[i], 0, &errNo); if(!piglit_cl_check_error(errNo, CL_INVALID_DEVICE)) { fprintf(stderr, "Failed (error code: %s): Trigger CL_INVALID_DEVICE if device that is not associated with context.\n", piglit_cl_get_error_name(errNo)); piglit_merge_result(&result, PIGLIT_FAIL); } } } free(devices); /* * CL_INVALID_VALUE if values specified in properties are not valid. */ clCreateCommandQueue(cl_ctx, env->device_id, 0XFFFFFFFF, &errNo); if(!piglit_cl_check_error(errNo, CL_INVALID_VALUE)) { fprintf(stderr, "Failed (error code: %s): Trigger CL_INVALID_VALUE if values specified in properties are not valid.\n", piglit_cl_get_error_name(errNo)); piglit_merge_result(&result, PIGLIT_FAIL); } /* * CL_INVALID_QUEUE_PROPERTIES if values specified in properties * are valid but are not supported by the device. * * Note: already tested in 'normal usage' section */ clReleaseContext(cl_ctx); return result; }
/* * This function picks/creates necessary OpenCL objects which are needed. * The objects are: * OpenCL platform, device, context, and command queue. * * All these steps are needed to be performed once in a regular OpenCL application. * This happens before actual compute kernels calls are performed. * * For convenience, in this application you store all those basic OpenCL objects in structure ocl_args_d_t, * so this function populates fields of this structure, which is passed as parameter ocl. * Please, consider reviewing the fields before going further. * The structure definition is right in the beginning of this file. */ int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType) { // The following variable stores return codes for all OpenCL calls. cl_int err = CL_SUCCESS; // Query for all available OpenCL platforms on the system // Here you enumerate all platforms and pick one which name has preferredPlatform as a sub-string cl_platform_id platformId = FindOpenCLPlatform("Intel", deviceType); if (NULL == platformId) { LogError("Error: Failed to find OpenCL platform.\n"); return CL_INVALID_VALUE; } // Create context with device of specified type. // Required device type is passed as function argument deviceType. // So you may use this function to create context for any CPU or GPU OpenCL device. // The creation is synchronized (pfn_notify is NULL) and NULL user_data cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformId, 0 }; ocl->context = clCreateContextFromType(contextProperties, deviceType, NULL, NULL, &err); if ((CL_SUCCESS != err) || (NULL == ocl->context)) { LogError("Couldn't create a context, clCreateContextFromType() returned '%s'.\n", TranslateOpenCLError(err)); return err; } // Query for OpenCL device which was used for context creation err = clGetContextInfo(ocl->context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &ocl->device, NULL); if (CL_SUCCESS != err) { LogError("Error: clGetContextInfo() to get list of devices returned %s.\n", TranslateOpenCLError(err)); return err; } // Read the OpenCL platform's version and the device OpenCL and OpenCL C versions GetPlatformAndDeviceVersion(platformId, ocl); // Create command queue. // OpenCL kernels are enqueued for execution to a particular device through special objects called command queues. // Command queue guarantees some ordering between calls and other OpenCL commands. // Here you create a simple in-order OpenCL command queue that doesn't allow execution of two kernels in parallel on a target device. #ifdef CL_VERSION_2_0 if (OPENCL_VERSION_2_0 == ocl->deviceVersion) { const cl_command_queue_properties properties[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0 }; ocl->commandQueue = clCreateCommandQueueWithProperties(ocl->context, ocl->device, properties, &err); } else { // default behavior: OpenCL 1.2 cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE; ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device, properties, &err); } #else // default behavior: OpenCL 1.2 cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE; ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device, properties, &err); #endif if (CL_SUCCESS != err) { LogError("Error: clCreateCommandQueue() returned %s.\n", TranslateOpenCLError(err)); return err; } return CL_SUCCESS; }
int main(int argc, char *argv[]) { int i; int n = 5; int outSize = 7; /* A, B, C, D, E */ float p0[n], p1[n]; int np[n]; float out[outSize]; if (argc != 21) { fprintf(stderr, "Usage: %s M0 H0 T0 TAU A0 A1 NA B0 B1 NB " "C0 C1 NC D0 D1 ND E0 E1 NE INPUT\n", argv[0]); exit(1); } float m0 = atof(argv[1]); float h0 = atof(argv[2]); float t0 = atof(argv[3]); float tau = strtof(argv[4], NULL); /* p0 is where the search starts, p1 is where the search ends and np is the * number of points in between p0 and p1 to do the search */ for (i = 0; i < 5; i++) { p0[i] = atof(argv[5 + 3*i]); p1[i] = atof(argv[5 + 3*i + 1]); np[i] = atoi(argv[5 + 3*i + 2]); } /* Load the traces from the file */ char *path = argv[20]; FILE *fp = fopen(path, "r"); if (!fp) { fprintf(stderr, "Failed to open prestack file '%s'!\n", path); return 1; } su_trace_t tr; vector_t(su_trace_t) traces; vector_init(traces); while (su_fgettr(fp, &tr)) { vector_push(traces, tr); } /* Construct the aperture structure from the traces, which is a vector * containing pointers to traces */ aperture_t ap; ap.ap_m = 0; ap.ap_h = 0; ap.ap_t = tau; vector_init(ap.traces); for (int i = 0; i < traces.len; i++) vector_push(ap.traces, &vector_get(traces, i)); my_aperture_t my_ap = transform(ap); //puts("fim transform\n"); /*-------------------------------------------------------------------------*/ char *kernelSource = (char *) malloc(MAXSOURCE * sizeof(char)); FILE * file = fopen("kernel.cl", "r"); if(file == NULL) { printf("Error: open the kernel file (kernel.cl)\n"); exit(1); } // Read kernel code size_t source_size = fread(kernelSource, 1, MAXSOURCE, file); //Device input buffers cl_mem d_my_ap; cl_mem d_p0, d_p1, d_np, d_aopt, d_bopt, d_copt, d_dopt, d_eopt, d_stack, d_smax; //Device output buffer cl_mem d_out; cl_int err; char deviceName[MAX_DEVICE_NAME_SIZE]; cl_platform_id cpPlatform; cl_device_id device_id; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_platform_id *platforms; cl_uint platformCount; //Tamanho em bytes de cada vetor size_t bytes_my_ap = sizeof(my_aperture_t); size_t bytes_p0 = sizeof(float) * n; size_t bytes_p1 = sizeof(float) * n; size_t bytes_np = sizeof(int) * n; size_t bytes_opt = sizeof(float) * np[0]; size_t bytes_out = sizeof(float) * outSize; //Numero de workitems em cada local work group (local size) // size_t localSize[3] = {LOCALSIZE, LOCALSIZE, LOCALSIZE}; // // size_t globalSize[3] = { // ceil((float)np[0] / (float)localSize[0]), // ceil((float)np[1] / (float)localSize[1]), // ceil((float)np[2] / (float)localSize[2]) // }; size_t localSize[3] = {2,2,2}; size_t globalSize[3] = {20,20,20}; // Bind to platforms clGetPlatformIDs(0, NULL, &platformCount); if (platformCount == 0) { printf("Error, cound not find any OpenCL platforms on the system.\n"); exit (2); } platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount); clGetPlatformIDs(platformCount,platforms, NULL); // Find first device that works err = 1; for (i = 0; i < platformCount && err !=CL_SUCCESS; i++) { // Get ID for the device (CL_DEVICE_TYPE_ALL, CL_DEVICE_TYPE_GPU, ...) err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); } checkError(err, "get device"); if (err !=CL_SUCCESS) { printf("Error, could not find a valid device."); exit (3); } err = clGetDeviceInfo(device_id, CL_DEVICE_NAME,MAX_DEVICE_NAME_SIZE, deviceName, NULL); printf("Device: %s \n",deviceName); if (err !=CL_SUCCESS) { printf("Error, could not read the info for device."); exit (4); } // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (err !=CL_SUCCESS) { printf("Error, could not create the context."); exit (5); } // Create a command queue queue = clCreateCommandQueueWithProperties(context, device_id, 0, &err); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelSource,(const size_t *) &source_size, &err); if (err !=CL_SUCCESS) { printf("Error, could not create program with source."); exit (6); } // Build the program executable " --disable-multilib " err = clBuildProgram(program, 0,NULL, NULL, NULL, NULL); if (err == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char* buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo (program, device_id, CL_PROGRAM_BUILD_LOG, buildLogSize, NULL, &buildLogSize); buildLog = (char*)malloc(buildLogSize); memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo (program, device_id, CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); printf("ERROR %d (logsz = %d): [[%s]]\n", err, buildLogSize, buildLog); free(buildLog); return err; } else if (err!=0) { printf("Error, could not build program.\n"); exit (7); } // Create the compute kernel in the program we wish to run kernel = clCreateKernel(program, "calculate", &err); if (err !=CL_SUCCESS) { printf("Error, could not create the kernel."); exit (6); } float smax[np[0]]; for(int i = 0; i < np[0]; i++){ smax[i] = -1.0; } size_t bytes_smax = sizeof(float) * np[0]; // Create the input and output arrays in device memory for our calculation d_my_ap = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes_my_ap, NULL, NULL); d_p0 = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes_p0, NULL, NULL); d_p1 = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes_p1, NULL, NULL); d_np = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes_np, NULL, NULL); d_out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes_out, NULL, NULL); d_aopt = clCreateBuffer(context, CL_MEM_READ_WRITE , bytes_smax, NULL, NULL); d_bopt = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes_smax, NULL, NULL); d_copt = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes_smax, NULL, NULL); d_dopt = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes_smax, NULL, NULL); d_eopt = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes_smax, NULL, NULL); d_stack = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes_smax, NULL, NULL); d_smax = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes_smax, NULL, NULL); // Write our data set into the input array in device memory err = clEnqueueWriteBuffer(queue, d_my_ap, CL_TRUE, 0, bytes_my_ap, (const void*)&my_ap, 0, NULL, NULL); err |= clEnqueueWriteBuffer(queue, d_p0, CL_TRUE, 0, bytes_p0, p0, 0, NULL, NULL); err |= clEnqueueWriteBuffer(queue, d_p1, CL_TRUE, 0, bytes_p1, p1, 0, NULL, NULL); err |= clEnqueueWriteBuffer(queue, d_np, CL_TRUE, 0, bytes_np, np, 0, NULL, NULL); err |= clEnqueueWriteBuffer(queue, d_smax, CL_TRUE, 0, bytes_smax, smax, 0, NULL, NULL); // Set the arguments to our compute kernel err |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_my_ap); err |= clSetKernelArg(kernel, 1, sizeof(float), &m0); err |= clSetKernelArg(kernel, 2, sizeof(float), &h0); err |= clSetKernelArg(kernel, 3, sizeof(float), &t0); err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_p0); err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &d_p1); err |= clSetKernelArg(kernel, 6, sizeof(cl_mem), &d_np); err |= clSetKernelArg(kernel, 7, sizeof(cl_mem), &d_out); err |= clSetKernelArg(kernel, 8, np[0] * sizeof(cl_float), &d_aopt);//_Aopt err |= clSetKernelArg(kernel, 9, np[0] * sizeof(cl_float), &d_bopt);//_Bopt err |= clSetKernelArg(kernel, 10, np[0] * sizeof(cl_float), &d_copt);//_Copt err |= clSetKernelArg(kernel, 11, np[0] * sizeof(cl_float), &d_dopt);//_Dopt err |= clSetKernelArg(kernel, 12, np[0] * sizeof(cl_float), &d_eopt);//_Eopt err |= clSetKernelArg(kernel, 13, np[0] * sizeof(cl_float), &d_stack);//_stack err |= clSetKernelArg(kernel, 14, np[0] * sizeof(cl_float), &d_smax);//smax if (err !=CL_SUCCESS) { printf("Error, could not set kernel args."); exit (7); } err = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, (const size_t *)globalSize, (const size_t *)localSize, 0, NULL, NULL); // Execute the kernel over the entire range of the data set if (err !=CL_SUCCESS) { printf("Error, could not enqueue commands. %d\n", err); exit (8); } // Wait for the command queue to get serviced before reading back results clFinish(queue); // Read the results from the device clEnqueueReadBuffer(queue, d_out, CL_TRUE, 0, bytes_out, out, 0, NULL, NULL ); /*-------------------------------------------------------------------------*/ printf("A=%g\n", out[0]); printf("B=%g\n", out[1]); printf("C=%g\n", out[2]); printf("D=%g\n", out[3]); printf("E=%g\n", out[4]); printf("Stack=%g\n", out[5]); printf("Semblance=%g\n", out[6]); printf("\n"); return 0; }
int main(int argc, char** argv) { // beginning of the verbose OpenCL allocation cl_platform_id platform_id = NULL; cl_uint ret_num_platforms = 0; cl_uint ret_num_devices = 0; cl_int ret = 0; // the output from opencl kernel float *c_inputs = malloc(ARRAY_SIZE*sizeof(float)); float *c_outputs = malloc(ARRAY_SIZE*sizeof(float)); cl_float *cl_inputs = malloc(ARRAY_SIZE*sizeof(cl_float)); cl_float *cl_outputs = malloc(ARRAY_SIZE*sizeof(cl_float)); // get random numbers via Rmath set_seed(atoi(argv[1]), 197414); float tmp_in = 0.0; #pragma omp parallel for for (long i = 0; i < ARRAY_SIZE; i++) { tmp_in = rnorm(0, 1); c_inputs[i] = tmp_in; cl_inputs[i] = (cl_float) tmp_in; } // measure time elapse clock_t start = clock(); #pragma omp parallel for for (long i = 0; i < ARRAY_SIZE; i++) { c_outputs[i] = expf(c_inputs[i]); } printf("CPU time for %d exp operation: %d\n", ARRAY_SIZE, (int) (clock() - start)); // read kernel source FILE *fp; char filename[] = "./hello_log.cl"; char *source_str; size_t source_size; fp = fopen(filename, "r"); source_str = (char*) malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); // get platform and device info ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); cl_device_id device_ids[2]; ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 2, device_ids, &ret_num_devices); printf("Number of devices: %5d\n", ret_num_devices); // print device name char bdname[100]; clGetDeviceInfo(device_ids[1], CL_DEVICE_NAME, 100, bdname, NULL); printf("Used device: %s\n", bdname); // use second GPU cl_device_id device_id = device_ids[1]; // create opencl context cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); // create command queue cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, device_id, 0, &ret); // create memory buffer for input cl_mem memobj_in = clCreateBuffer(context, CL_MEM_READ_WRITE, ARRAY_SIZE*sizeof(cl_float), NULL, &ret); // create memory buffer for output cl_mem memobj_out = clCreateBuffer(context, CL_MEM_READ_WRITE, ARRAY_SIZE*sizeof(cl_float), NULL, &ret); // create kernel program cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); // build program ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); printf("build program successfully\n"); // create opencl kernel cl_kernel kernel = clCreateKernel(program, "hello_exp", &ret); // set opencl parameters for inputs ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj_in); // set opencl parameters for inputs ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobj_out); // execute opencl kernel size_t global_item_size = ARRAY_SIZE/32; size_t local_item_size = 32; // measure time start = clock(); ret = clEnqueueWriteBuffer(command_queue, memobj_in, CL_TRUE, 0, ARRAY_SIZE*sizeof(cl_float), cl_inputs, 0, NULL, NULL); // run it ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); // copy results from the memory buffer ret = clEnqueueReadBuffer(command_queue, memobj_out, CL_TRUE, 0, ARRAY_SIZE*sizeof(cl_float), cl_outputs, 0, NULL, NULL); printf("GPU time (with PCI-E overhead): %d\n", (int) (clock() - start)); printf("inputs: %3.7f %3.7f\n", c_inputs[150000], cl_inputs[150000]); printf("outputs: %3.7f %3.7f\n", c_outputs[150000], (float) cl_outputs[150000]); // finalization ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(memobj_in); ret = clReleaseMemObject(memobj_out); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); free(source_str); return 0; }
int main(int argc, char **argv) { if (find_option(argc, argv, "-h") >= 0) { printf("Options:\n"); printf("-h to see this help\n"); printf("-n <int> to set the number of particles\n"); printf("-o <filename> to specify the output file name\n"); printf("-s <filename> to specify the summary output file name\n"); return 0; } int n = read_int(argc, argv, "-n", 1000); char *savename = read_string(argc, argv, "-o", NULL); char *sumname = read_string(argc, argv, "-s", NULL); // For return values. cl_int ret; // OpenCL stuff. // Loading kernel files. FILE *kernelFile; char *kernelSource; size_t kernelSize; kernelFile = fopen("simulationKernel.cl", "r"); if (!kernelFile) { fprintf(stderr, "No file named simulationKernel.cl was found\n"); exit(-1); } kernelSource = (char*)malloc(MAX_SOURCE_SIZE); kernelSize = fread(kernelSource, 1, MAX_SOURCE_SIZE, kernelFile); fclose(kernelFile); // Getting platform and device information cl_platform_id platformId = NULL; cl_device_id deviceID = NULL; cl_uint retNumDevices; cl_uint retNumPlatforms; ret = clGetPlatformIDs(1, &platformId, &retNumPlatforms); // Different types of devices to pick from. At the moment picks the default opencl device. //CL_DEVICE_TYPE_GPU //CL_DEVICE_TYPE_ACCELERATOR //CL_DEVICE_TYPE_DEFAULT //CL_DEVICE_TYPE_CPU ret = clGetDeviceIDs(platformId, CL_DEVICE_TYPE_ACCELERATOR, 1, &deviceID, &retNumDevices); // Max workgroup size size_t max_available_local_wg_size; ret = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_available_local_wg_size, NULL); // Creating context. cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &ret); // Creating command queue cl_command_queue commandQueue = clCreateCommandQueueWithProperties (context, deviceID, 0, &ret); // Build program cl_program program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, (const size_t *)&kernelSize, &ret); // printf("program = ret %i \n", ret); ret = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL); // printf("clBuildProgram: ret %i \n", ret); // Create kernels cl_kernel forceKernel = clCreateKernel(program, "compute_forces_gpu", &ret); cl_kernel moveKernel = clCreateKernel(program, "move_gpu", &ret); cl_kernel binInitKernel = clCreateKernel(program, "bin_init_gpu", &ret); cl_kernel binKernel = clCreateKernel(program, "bin_gpu", &ret); FILE *fsave = savename ? fopen(savename, "w") : NULL; FILE *fsum = sumname ? fopen(sumname, "a") : NULL; particle_t *particles = (particle_t*)malloc(n * sizeof(particle_t)); // GPU particle data structure cl_mem d_particles = clCreateBuffer(context, CL_MEM_READ_WRITE, n * sizeof(particle_t), NULL, &ret); // Set size set_size(n); init_particles(n, particles); double copy_time = read_timer(); // Copy particles to device. ret = clEnqueueWriteBuffer(commandQueue, d_particles, CL_TRUE, 0, n * sizeof(particle_t), particles, 0, NULL, NULL); copy_time = read_timer() - copy_time; // Calculating thread and thread block counts. // sizes size_t globalItemSize; size_t localItemSize; // Global item size if (n <= NUM_THREADS) { globalItemSize = NUM_THREADS; localItemSize = 16; } else if (n % NUM_THREADS != 0) { globalItemSize = (n / NUM_THREADS + 1) * NUM_THREADS; } else { globalItemSize = n; } // Local item size localItemSize = globalItemSize / NUM_THREADS; // Bins and bin sizes. // Because of uniform distribution we will know that bins size is amortized. Therefore I picked the value of 10. // There will never be 10 particles in one bin. int maxParticles = 10; // Calculating the number of bins. int numberOfBins = (int)ceil(size/(2*cutoff)) + 2; // Bins will only exist on the device. particle_t* bins; // How many particles are there in each bin - also only exists on the device. volatile int* binSizes; // Number of bins to be initialized. size_t clearAmt = numberOfBins*numberOfBins; // Allocate memory for bins on the device. cl_mem d_binSizes = clCreateBuffer(context, CL_MEM_READ_WRITE, numberOfBins * numberOfBins * sizeof(volatile int), NULL, &ret); cl_mem d_bins = clCreateBuffer(context, CL_MEM_READ_WRITE, numberOfBins * numberOfBins * maxParticles * sizeof(particle_t), NULL, &ret); // SETTING ARGUMENTS FOR THE KERNELS // Set arguments for the init / clear kernel ret = clSetKernelArg(binInitKernel, 0, sizeof(cl_mem), (void *)&d_binSizes); ret = clSetKernelArg(binInitKernel, 1, sizeof(int), &numberOfBins); // Set arguments for the binning kernel ret = clSetKernelArg(binKernel, 0, sizeof(cl_mem), (void *)&d_particles); ret = clSetKernelArg(binKernel, 1, sizeof(int), &n); ret = clSetKernelArg(binKernel, 2, sizeof(cl_mem), (void *)&d_bins); ret = clSetKernelArg(binKernel, 3, sizeof(cl_mem), (void *)&d_binSizes); ret = clSetKernelArg(binKernel, 4, sizeof(int), &numberOfBins); // Set arguments for force kernel. ret = clSetKernelArg(forceKernel, 0, sizeof(cl_mem), (void *)&d_particles); ret = clSetKernelArg(forceKernel, 1, sizeof(int), &n); ret = clSetKernelArg(forceKernel, 2, sizeof(cl_mem), (void *)&d_bins); ret = clSetKernelArg(forceKernel, 3, sizeof(cl_mem), (void *)&d_binSizes); ret = clSetKernelArg(forceKernel, 4, sizeof(int), &numberOfBins); // Set arguments for move kernel ret = clSetKernelArg(moveKernel, 0, sizeof(cl_mem), (void *)&d_particles); ret = clSetKernelArg(moveKernel, 1, sizeof(int), &n); ret = clSetKernelArg(moveKernel, 2, sizeof(double), &size); // Variable to check if kernel execution is done. cl_event kernelDone; double simulation_time = read_timer(); int step = 0; for (step = 0; step < NSTEPS; step++) { // Execute bin initialization (clearing after first iteration) ret = clEnqueueNDRangeKernel(commandQueue, binInitKernel, 1, NULL, &clearAmt, NULL, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); // Execute binning kernel ret = clEnqueueNDRangeKernel(commandQueue, binKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); // ret = clEnqueueNDRangeKernel(commandQueue, binKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); // Execute force kernel ret = clEnqueueNDRangeKernel(commandQueue, forceKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); // Execute move kernel ret = clEnqueueNDRangeKernel(commandQueue, moveKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); if (fsave && (step%SAVEFREQ) == 0) { // Copy the particles back to the CPU ret = clEnqueueReadBuffer(commandQueue, d_particles, CL_TRUE, 0, n * sizeof(particle_t), particles, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); save(fsave, n, particles); } } simulation_time = read_timer() - simulation_time; printf("CPU-GPU copy time = %g seconds\n", copy_time); printf("n = %d, simulation time = %g seconds\n", n, simulation_time); if (fsum) fprintf(fsum, "%d %lf \n", n, simulation_time); if (fsum) fclose(fsum); free(particles); if (fsave) fclose(fsave); ret = clFlush(commandQueue); ret = clFinish(commandQueue); ret = clReleaseCommandQueue(commandQueue); ret = clReleaseKernel(forceKernel); ret = clReleaseKernel(moveKernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(d_particles); ret = clReleaseContext(context); return 0; }
ErrorStatus crossprod_clblas(cl_device_id device, void *inMatrix, void *outMatrix, int nrow, int ncol, bool use_float) { std::stringstream result; float *input_matrix_f = (float *)inMatrix; float *output_matrix_f = (float *)outMatrix; double *input_matrix_d = (double *)inMatrix; double *output_matrix_d = (double *)outMatrix; if (debug) { result << "crossprod_clblas( " << (use_float ? "FLOAT" : "DOUBLE") << ", nrow = " << nrow << ", ncol = " << ncol << ")" << std::endl << std::endl; } cl_int err = CL_SUCCESS; clblasStatus status = clblasSetup(); if (status != CL_SUCCESS) { if (debug) { result << "clblasSetup: " << clblasErrorToString(status) << std::endl; } err = CL_INVALID_OPERATION; } // get first platform cl_platform_id platform = NULL; if (err == CL_SUCCESS) { err = clGetPlatformIDs(1, &platform, NULL); } if (debug && err == CL_SUCCESS) { result << "Platform: " << getPlatformInfoString(platform, CL_PLATFORM_NAME) << std::endl; result << "Device: " << getDeviceInfoString(device, CL_DEVICE_NAME) << std::endl; } // context cl_context context = NULL; if (err == CL_SUCCESS) { if (debug) { result << "clCreateContext:" << std::endl; } context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); } // queue cl_command_queue queue = NULL; if (err == CL_SUCCESS) { #ifdef CL_VERSION_2_0 if (debug) { result << "clCreateCommandQueueWithProperties:" << std::endl; } queue = clCreateCommandQueueWithProperties(context, device, NULL, &err); #else if (debug) { result << "clCreateCommandQueue:" << std::endl; } queue = clCreateCommandQueue(context, device, 0, &err); #endif } // buffers cl_mem cl_input_matrix = NULL; if (err == CL_SUCCESS) { if (debug) { result << "clCreateBuffer cl_input_matrix:" << std::endl; } if (use_float) { cl_input_matrix = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nrow * ncol * sizeof(float), input_matrix_f, &err); } else { cl_input_matrix = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nrow * ncol * sizeof(double), input_matrix_d, &err); } } cl_mem cl_output_matrix = NULL; if (err == CL_SUCCESS) { if (debug) { result << "clCreateBuffer cl_output_vector:" << std::endl; } if (use_float) { cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, ncol * ncol * sizeof(float), output_matrix_f, &err); } else { cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, ncol * ncol * sizeof(double), output_matrix_d, &err); } } // ++++++++++++ const clblasOrder order = clblasColumnMajor; const clblasTranspose transA = clblasTrans; const size_t lda = nrow; const size_t ldc = ncol; const cl_float alpha = 1.0; clblasUplo uplo = clblasUpper; cl_event event = NULL; if (err == CL_SUCCESS) { if (use_float) { if (debug) { result << "clblasSsyrk:" << std::endl; } status = clblasSsyrk(order, uplo, transA, ncol, nrow, alpha, cl_input_matrix, 0, lda, 0.0, cl_output_matrix, 0, ldc, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS && debug) { result << "clblasSgemm error:" << clblasErrorToString(status) << std::endl; } } else { if (debug) { result << "clblasDsyrk:" << std::endl; } status = clblasDsyrk(order, uplo, transA, ncol, nrow, alpha, cl_input_matrix, 0, lda, 0.0, cl_output_matrix, 0, ldc, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { if (debug) { result << "clblasDgemm error:" << clblasErrorToString(status) << std::endl; } err = status; } } } if (err == CL_SUCCESS) { /* Wait for calculations to be finished. */ if (debug) { result << "clWaitForEvents:" << std::endl; } err = clWaitForEvents(1, &event); } // retrieve result if (err == CL_SUCCESS) { if (debug) { result << "Retrieve result:" << std::endl; } if (use_float) { clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, ncol * ncol * sizeof(float), output_matrix_f, 0, NULL, NULL); symmetrizeSquare_f(output_matrix_f, ncol); } else { clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, ncol * ncol * sizeof(double), output_matrix_d, 0, NULL, NULL); symmetrizeSquare_d(output_matrix_d, ncol); } } std::string err_str = clErrorToString(err); result << std::endl << err_str << std::endl; // cleanup clReleaseMemObject(cl_output_matrix); cl_output_matrix = NULL; clReleaseMemObject(cl_input_matrix); cl_input_matrix = NULL; clReleaseCommandQueue(queue); queue = NULL; clReleaseContext(context); context = NULL; if (debug) { CERR << result.str(); } ErrorStatus errorStatus = { err, status }; // return status != CL_SUCCESS ? clblasErrorToString(status) : clErrorToString(err); return errorStatus; }
int main(int argc, char const * argv[]) { char const * const target_platform_substring = "Intel"; char const * const target_device_substring = "Graphics"; // // find platform and device ids // cl_platform_id platform_id; cl_device_id device_id; #define HS_DEVICE_NAME_SIZE 64 char device_name[HS_DEVICE_NAME_SIZE]; size_t device_name_size; cl(FindIdsByName(target_platform_substring, target_device_substring, &platform_id, &device_id, HS_DEVICE_NAME_SIZE, device_name, &device_name_size, true)); // // create context // cl_context_properties context_properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, 0 }; cl_int cl_err; cl_context context = clCreateContext(context_properties, 1, &device_id, NULL, NULL, &cl_err); cl_ok(cl_err); // // create command queue // #if 0 // OPENCL 2.0 cl_queue_properties props[] = { CL_QUEUE_PROPERTIES, (cl_queue_properties)CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, #ifndef NDEBUG (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE, #endif 0 }; cl_queue_properties props_profile[] = { CL_QUEUE_PROPERTIES, (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE, 0 }; cl_command_queue cq = clCreateCommandQueueWithProperties(context, device_id, props, &cl_err); cl_ok(cl_err); cl_command_queue cq_profile = clCreateCommandQueueWithProperties(context, device_id, props_profile, &cl_err); cl_ok(cl_err); #else // OPENCL 1.2 cl_command_queue cq = clCreateCommandQueue(context, device_id, #ifndef NDEBUG CL_QUEUE_PROFILING_ENABLE | #endif CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &cl_err); cl_ok(cl_err); cl_command_queue cq_profile = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &cl_err); cl_ok(cl_err); #endif // // Intel GEN workaround -- create dummy kernel for semi-accurate // profiling on an out-of-order queue. // hs_dummy_kernel_create(context,device_id); // // select the target // uint32_t const key_val_words = (argc == 1) ? 2 : strtoul(argv[1],NULL,0); struct hs_cl_target const * hs_target; if (key_val_words == 1) hs_target = &hs_intel_gen8_u32; else hs_target = &hs_intel_gen8_u64; // // create kernels // fprintf(stdout,"Creating... "); struct hs_cl * const hs = hs_cl_create(hs_target,context,device_id); fprintf(stdout,"done.\n"); // // // #ifdef NDEBUG #define HS_BENCH_LOOPS 100 #define HS_BENCH_WARMUP 100 #else #define HS_BENCH_LOOPS 1 #define HS_BENCH_WARMUP 0 #endif // // sort sizes and loops // uint32_t const kpb = hs_target->config.slab.height << hs_target->config.slab.width_log2; uint32_t const count_lo = (argc <= 2) ? kpb : strtoul(argv[2],NULL,0); uint32_t const count_hi = (argc <= 3) ? count_lo : strtoul(argv[3],NULL,0); uint32_t const count_step = (argc <= 4) ? count_lo : strtoul(argv[4],NULL,0); uint32_t const loops = (argc <= 5) ? HS_BENCH_LOOPS : strtoul(argv[5],NULL,0); uint32_t const warmup = (argc <= 6) ? HS_BENCH_WARMUP : strtoul(argv[6],NULL,0); bool const linearize = (argc <= 7) ? true : strtoul(argv[7],NULL,0); // // labels // fprintf(stdout, "Device, " "Driver, " "Type, " "Slab/Linear, " "Verified?, " "Keys, " "Keys Padded In, " "Keys Padded Out, " "CPU Algorithm, " "CPU Msecs, " "CPU Mkeys/s, " "Trials, " "Avg. Msecs, " "Min Msecs, " "Max Msecs, " "Avg. Mkeys/s, " "Max. Mkeys/s\n"); // // we want to track driver versions // size_t driver_version_size; cl(GetDeviceInfo(device_id, CL_DRIVER_VERSION, 0, NULL, &driver_version_size)); char * const driver_version = ALLOCA_MACRO(driver_version_size); cl(GetDeviceInfo(device_id, CL_DRIVER_VERSION, driver_version_size, driver_version, NULL)); // // benchmark // hs_bench(context, cq,cq_profile, device_name, driver_version, hs_target->config.words.key + hs_target->config.words.val, 1 << hs_target->config.slab.width_log2, hs_target->config.slab.height, hs, count_lo, count_hi, count_step, loops, warmup, linearize); // // release everything // hs_cl_release(hs); hs_dummy_kernel_release(); cl(ReleaseCommandQueue(cq)); cl(ReleaseCommandQueue(cq_profile)); cl(ReleaseContext(context)); return 0; }
int main(void) { cl_context context = 0; cl_command_queue command_waiting_line = 0; cl_program program = 0; cl_device_id device_id = 0; cl_kernel kernel = 0; // int numberOfMemoryObjects = 3; cl_mem memoryObjects[3] = {0, 0, 0}; cl_platform_id platform_id = NULL; cl_uint ret_num_devices; cl_int errorNumber; cl_int ret; /* Load the source code containing the kernel*/ char fileName[] = "source/parallel/composition_population.cl"; FILE *fp; char *source_str; size_t source_size; fp = fopen(fileName, "r"); cl_uint ret_num_platforms; if (!fp) { fprintf(stderr, "Failed to load kernel %s:%d.\n", __FILE__, __LINE__); exit(1); } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); // printf("file: %s :file", source_str); getInfo(); ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); if (!success_verification(ret)) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed to get platform id's. %s:%d\n", __FILE__, __LINE__); return 1; } ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); if (!success_verification(ret)) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed to get OpenCL devices. %s:%d\n", __FILE__, __LINE__); return 1; } context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); if (!success_verification(ret)) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed to create an OpenCL context. %s:%d\n", __FILE__, __LINE__); return 1; } #ifdef CL_VERSION_2_0 command_waiting_line = clCreateCommandQueueWithProperties(context, device_id, 0, &ret); #else command_waiting_line = clCreateCommandQueue(context, device_id, 0, &ret); #endif if (!success_verification(ret)) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed to create the OpenCL command queue. %s:%d\n", __FILE__, __LINE__); return 1; } /* create program */ program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); if (!success_verification(ret)) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed to create OpenCL program. %s:%d\n", __FILE__, __LINE__); return 1; } /* Build Kernel Program */ ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); if (!success_verification(ret)) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed to build OpenCL program. %s:%d\n", __FILE__, __LINE__); return 1; } kernel = clCreateKernel(program, "composition_population", &errorNumber); if (!success_verification(errorNumber)) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed to create OpenCL kernel. %s:%d\n", __FILE__, __LINE__); return 1; } /* [Setup memory] */ /* Number of elements in the arrays of input and output data. */ /* The buffers are the size of the arrays. */ uint16_t activity_atom_size = MAX_INDEPENDENTCLAUSE_TABLET * 1; uint8_t program_size = 1; uint8_t population_size = 4; size_t activity_atom_byte_size = activity_atom_size * sizeof(v16us); uint16_t population_byte_size = (uint16_t)(program_size * (uint16_t)(population_size * sizeof(v16us))); /* * Ask the OpenCL implementation to allocate buffers for the data. * We ask the OpenCL implemenation to allocate memory rather than allocating * it on the CPU to avoid having to copy the data later. * The read/write flags relate to accesses to the memory from within the * kernel. */ int createMemoryObjectsSuccess = TRUE; memoryObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, activity_atom_byte_size, NULL, &errorNumber); createMemoryObjectsSuccess &= success_verification(errorNumber); memoryObjects[1] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, population_byte_size, NULL, &errorNumber); createMemoryObjectsSuccess &= success_verification(errorNumber); memoryObjects[2] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, newspaper_byte_size, NULL, &errorNumber); createMemoryObjectsSuccess &= success_verification(errorNumber); if (!createMemoryObjectsSuccess) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed to create OpenCL buffer. %s:%d\n", __FILE__, __LINE__); return 1; } /* [Setup memory] */ /* [Map the buffers to pointers] */ /* Map the memory buffers created by the OpenCL implementation to pointers so * we can access them on the CPU. */ int mapMemoryObjectsSuccess = TRUE; v16us *activity_atom = (v16us *)clEnqueueMapBuffer( command_waiting_line, memoryObjects[0], CL_TRUE, CL_MAP_WRITE, 0, activity_atom_byte_size, 0, NULL, NULL, &errorNumber); mapMemoryObjectsSuccess &= success_verification(errorNumber); // cl_int *inputB = (cl_int *)clEnqueueMapBuffer( // command_waiting_line, memoryObjects[1], CL_TRUE, CL_MAP_WRITE, 0, // bufferSize, 0, // NULL, NULL, &errorNumber); // mapMemoryObjectsSuccess &= success_verification(errorNumber); if (!mapMemoryObjectsSuccess) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed to map buffer. %s:%d\n", __FILE__, __LINE__); return 1; } /* [Map the buffers to pointers] */ /* [Initialize the input data] */ const char *activity_atom_text = "nyistu htoftu hnattu hnamtu"; const uint16_t activity_atom_text_size = (uint16_t)(strlen(activity_atom_text)); const char *quiz_independentClause_list_text = "zrundoka hwindocayu hwindokali" "hwindoka tyutdocayu tyindokali" "tyutdoka tyutdocayu hfutdokali" "tyindoka fwandocayu nyatdokali"; //"bu.hnac.2.hnac.buka bu.hnac.2.hnac.buca yu " //"bu.hnac.4.hnac.bukali"; const uint16_t quiz_independentClause_list_text_size = (uint16_t)strlen(quiz_independentClause_list_text); uint16_t quiz_independentClause_list_size = 4; v16us quiz_independentClause_list[8]; uint16_t text_remainder = 0; // uint16_t program_worth = 0; uint64_t random_seed = 0x0123456789ABCDEF; uint16_t tablet_indexFinger = 0; // uint8_t champion = 0; // uint16_t champion_worth = 0; // v16us program_; // v16us population[4]; memset(quiz_independentClause_list, 0, (size_t)(quiz_independentClause_list_size * TABLET_LONG * WORD_THICK)); text_code(activity_atom_text_size, activity_atom_text, &activity_atom_size, activity_atom, &text_remainder); assert(text_remainder == 0); text_code(quiz_independentClause_list_text_size, quiz_independentClause_list_text, &quiz_independentClause_list_size, quiz_independentClause_list, &text_remainder); /* [Initialize the input data] */ /* [Un-map the buffers] */ /* * Unmap the memory objects as we have finished using them from the CPU side. * We unmap the memory because otherwise: * - reads and writes to that memory from inside a kernel on the OpenCL side * are undefined. * - the OpenCL implementation cannot free the memory when it is finished. */ if (!success_verification( clEnqueueUnmapMemObject(command_waiting_line, memoryObjects[0], activity_atom, 0, NULL, NULL))) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Unmapping memory objects failed %s:%d\n", __FILE__, __LINE__); return 1; } // if (!success_verification(clEnqueueUnmapMemObject(command_waiting_line, // memoryObjects[1], // inputB, 0, NULL, NULL))) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); // cerr << "Unmapping memory objects failed " << __FILE__ << ":" << __LINE__ // << endl; // return 1; //} /* [Un-map the buffers] */ /* [Set the kernel arguments] */ int setKernelArgumentsSuccess = TRUE; printf("arg0\n"); setKernelArgumentsSuccess &= success_verification(clSetKernelArg( kernel, 0, sizeof(uint8_t), (uint8_t *)&activity_atom_size)); printf("arg1\n"); setKernelArgumentsSuccess &= success_verification( clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjects[0])); printf("arg2\n"); setKernelArgumentsSuccess &= success_verification( clSetKernelArg(kernel, 2, sizeof(uint16_t), (uint16_t *)&program_size)); printf("arg3\n"); setKernelArgumentsSuccess &= success_verification( clSetKernelArg(kernel, 3, sizeof(uint8_t), (uint8_t *)&population_size)); printf("arg4\n"); setKernelArgumentsSuccess &= success_verification( clSetKernelArg(kernel, 4, sizeof(uint64_t), (uint64_t *)&random_seed)); printf("arg5\n"); setKernelArgumentsSuccess &= success_verification(clSetKernelArg(kernel, 5, sizeof(uint64_t *), NULL)); printf("arg6\n"); setKernelArgumentsSuccess &= success_verification( clSetKernelArg(kernel, 6, sizeof(cl_mem), &memoryObjects[1])); printf("arg7\n"); setKernelArgumentsSuccess &= success_verification(clSetKernelArg(kernel, 7, sizeof(uint8_t *), NULL)); printf("arg8\n"); setKernelArgumentsSuccess &= success_verification( clSetKernelArg(kernel, 8, sizeof(cl_mem), &memoryObjects[2])); if (!setKernelArgumentsSuccess) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed setting OpenCL kernel arguments. %s:%d\n", __FILE__, __LINE__); return 1; } /* [Set the kernel arguments] */ /* An event to associate with the Kernel. Allows us to retrieve profiling * information later. */ cl_event event = 0; /* [Global work size] */ /* * Each instance of our OpenCL kernel operates on a single element of each * array so the number of * instances needed is the number of elements in the array. */ size_t globalWorksize[1] = {population_size}; size_t localWorksize[1] = {2}; /* Enqueue the kernel */ if (!success_verification(clEnqueueNDRangeKernel( command_waiting_line, kernel, 1, NULL, globalWorksize, localWorksize, 0, NULL, &event))) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed enqueuing the kernel. %s:%d\n", __FILE__, __LINE__); return 1; } /* [Global work size] */ /* Wait for kernel execution completion. */ if (!success_verification(clFinish(command_waiting_line))) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed waiting for kernel execution to finish. %s:%d\n", __FILE__, __LINE__); return 1; } /* Print the profiling information for the event. */ // printProfilingInfo(event); /* Release the event object. */ if (!success_verification(clReleaseEvent(event))) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed releasing the event object. %s:%d\n", __FILE__, __LINE__); return 1; } /* Get a pointer to the output data. */ printf("clOut\n"); v16us *output = (v16us *)clEnqueueMapBuffer( command_waiting_line, memoryObjects[1], CL_TRUE, CL_MAP_READ, 0, population_byte_size, 0, NULL, NULL, &errorNumber); v16us *newspaper = (v16us *)clEnqueueMapBuffer( command_waiting_line, memoryObjects[2], CL_TRUE, CL_MAP_READ, 0, newspaper_byte_size, 0, NULL, NULL, &errorNumber); if (!success_verification(errorNumber)) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed to map buffer. %s:%d\n", __FILE__, __LINE__); return 1; } /* [Output the results] */ /* Uncomment the following block to print results. */ for (tablet_indexFinger = 0; tablet_indexFinger < (population_size * TABLET_LONG); ++tablet_indexFinger) { if (tablet_indexFinger % 0x10 == 0) printf("\n"); printf("%04X ", (uint)((uint16_t *)output)[tablet_indexFinger]); } printf("\n"); // printf("program %04X \n", (uint)*((uint16_t *)&(output[1]))); printf("newspaper \n"); for (tablet_indexFinger = 0; tablet_indexFinger < (NEWSPAPER_LONG * TABLET_LONG); ++tablet_indexFinger) { if (tablet_indexFinger % 0x10 == 0) printf("\n"); printf("%04X ", (uint)((uint16_t *)newspaper)[tablet_indexFinger]); } printf("\n"); /* [Output the results] */ /* Unmap the memory object as we are finished using them from the CPU side. */ if (!success_verification(clEnqueueUnmapMemObject( command_waiting_line, memoryObjects[1], output, 0, NULL, NULL))) { printf("unmapping\n"); // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Unmapping memory objects failed %s:%d\n", __FILE__, __LINE__); return 1; } if (!success_verification(clEnqueueUnmapMemObject( command_waiting_line, memoryObjects[2], newspaper, 0, NULL, NULL))) { printf("unmapping\n"); // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Unmapping memory objects failed %s:%d\n", __FILE__, __LINE__); return 1; } printf("releasing\n"); /* Release OpenCL objects. */ // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); }