void QueryOpenCLEnvironmentInfo( OpenCLEnvironmentInfo *opencl_env ) { cl_platform_id *platform_ids = NULL; opencl_env->platforms = NULL; // Detects the number of available OpenCL platforms CL_CHECK( clGetPlatformIDs( 1, NULL, &opencl_env->num_platforms ) ); if ( opencl_env->num_platforms > 0 ) { platform_ids = ( cl_platform_id* ) malloc( sizeof( cl_platform_id ) * opencl_env->num_platforms ); CL_CHECK( clGetPlatformIDs( opencl_env->num_platforms, platform_ids, NULL ) ); opencl_env->platforms = ( Platform* ) malloc( sizeof( Platform ) * opencl_env->num_platforms ); // fill in the OpenCL environment data structures for ( cl_uint i = 0; i < opencl_env->num_platforms; i++ ) QueryPlatformInfo( &platform_ids[i], &opencl_env->platforms[i] ); if ( platform_ids ) { free( platform_ids ); platform_ids = NULL; } } else { fprintf( stderr, "No OpenCL platforms found. Exiting...\n" ); exit( EXIT_FAILURE ); } int opencl_devices_found = 0; // false for ( cl_uint i = 0; i < opencl_env->num_platforms; i++ ) opencl_devices_found = ( opencl_env->platforms[i].num_devices ) ? 1 : 0; if ( !opencl_devices_found ) { fprintf( stderr, "No OpenCL devices found. Exiting...\n" ); exit( EXIT_FAILURE ); } }
void OpenCLMemory::free() { OpenCLDevice& current_device = OpenCLManager::CurrentPlatform()->CurrentDevice(); cl_int err; if (this->ptr_device_mem_ != NULL) { if (this->hasEvent()) { CL_CHECK( clWaitForEvents( 1, &this->memoryEvent)); this->resetEvent(); } err = clReleaseMemObject( this->ptr_device_mem_); if (err != CL_SUCCESS) { std::ostringstream oss; oss << current_device.name() << "> failed to call clReleaseMemObject(" << this->ptr_device_mem << ")."; LOG(ERROR)<< oss.str().c_str(); throw OpenCLMemoryException( oss.str()); } this->ptr_device_mem_ = NULL; DLOG(INFO)<< current_device.name() << "> clReleaseMemObject(" << this->ptr_device_mem << ") succeeded."; numCallsFree++; logStatistics(); } }
clblasStatus SGEMM_mod1024( clblasTranspose transA, clblasTranspose transB, cl_uint M, cl_uint N, cl_uint K, float alpha, cl_mem A, cl_uint offA, cl_uint lda, cl_mem B, cl_uint offB, cl_uint ldb, float beta, cl_mem C, cl_uint offC, cl_uint ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events, bool &specialCaseHandled) { const char *tileKernelSource = NULL; cl_kernel *tileClKernel = NULL; size_t tileKernelBinarySize = 0; cl_int err; const unsigned char *tileKernelBinary = NULL; clblasStatus status; //split the kernel calls to handle sgemm NT perf drop at big multiples of 1024 if ((lda % 1024 == 0) && (ldb % 1024 == 0) && (K > lda / 4)) { if ((lda == ldb) && (lda >= 4096) && (lda <= 8192)) // between 4096 and 8192 for now { if (lda != 6144)// 6144 is handled by 96 x 96 kernel { // we are going to call 16 GEMMs with M=M/2, N=N/2, K=K/4 // each GEMM requires M%128 == 0, N%128 == 0, K%16 == 0 if (M % 256 == 0 && N % 256 == 0 && K % 64 == 0) { if (!((transA == clblasNoTrans) && (transB == clblasTrans))) return clblasNotImplemented; specialCaseHandled = true; unsigned int M_split_factor; unsigned int N_split_factor; unsigned int K_split_factor; if (lda < 7168) { M_split_factor = 1; N_split_factor = 1; K_split_factor = 1; } else { //7168, 8192 M_split_factor = 2; N_split_factor = 2; K_split_factor = 4; } tileKernelSource = sgemm_Col_NT_B1_MX128_NX128_KX16_src; tileClKernel = &sgemm_Col_NT_B1_MX128_NX128_KX16_clKernel; tileKernelBinary = sgemm_Col_NT_B1_MX128_NX128_KX16_bin; tileKernelBinarySize = sgemm_Col_NT_B1_MX128_NX128_KX16_binSize; makeGemmKernel(tileClKernel, commandQueues[0], tileKernelSource, User_srcBuildOptions, &tileKernelBinary, &tileKernelBinarySize, User_binBuildOptions); err = clSetKernelArg(*tileClKernel, 0, sizeof(cl_mem), &A); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 1, sizeof(cl_mem), &B); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 2, sizeof(cl_mem), &C); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 3, sizeof(cl_float), &alpha); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 4, sizeof(cl_float), &beta); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 5, sizeof(cl_uint), &M); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 6, sizeof(cl_uint), &N); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 7, sizeof(cl_uint), &K); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 8, sizeof(cl_uint), &lda); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 9, sizeof(cl_uint), &ldb); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 10, sizeof(cl_uint), &ldc); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 11, sizeof(cl_uint), &offA); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 12, sizeof(cl_uint), &offB); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 13, sizeof(cl_uint), &offC); CL_CHECK(err); status = GEMM_SPLIT_CALLS( tileClKernel, clblasColumnMajor, 128, 16, M_split_factor, N_split_factor, K_split_factor, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return status; } } else { // lda == ldb == 6144 // we are going to call 4 GEMMs each with K = K/4 if (M % 96 == 0 && N % 96 == 0 && K % 64 == 0) { if (!((transA == clblasNoTrans) && (transB == clblasTrans))) return clblasNotImplemented; specialCaseHandled = true; unsigned int M_split_factor = 1; unsigned int N_split_factor = 1; unsigned int K_split_factor = 4; tileKernelSource = sgemm_Col_NT_B1_MX096_NX096_KX16_src; tileClKernel = &sgemm_Col_NT_B1_MX096_NX096_KX16_clKernel; tileKernelBinary = sgemm_Col_NT_B1_MX096_NX096_KX16_bin; tileKernelBinarySize = sgemm_Col_NT_B1_MX096_NX096_KX16_binSize; makeGemmKernel(tileClKernel, commandQueues[0], tileKernelSource, User_srcBuildOptions, &tileKernelBinary, &tileKernelBinarySize, User_binBuildOptions); err = clSetKernelArg(*tileClKernel, 0, sizeof(cl_mem), &A); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 1, sizeof(cl_mem), &B); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 2, sizeof(cl_mem), &C); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 3, sizeof(cl_float), &alpha); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 4, sizeof(cl_float), &beta); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 5, sizeof(cl_uint), &M); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 6, sizeof(cl_uint), &N); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 7, sizeof(cl_uint), &K); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 8, sizeof(cl_uint), &lda); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 9, sizeof(cl_uint), &ldb); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 10, sizeof(cl_uint), &ldc); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 11, sizeof(cl_uint), &offA); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 12, sizeof(cl_uint), &offB); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 13, sizeof(cl_uint), &offC); CL_CHECK(err); status = GEMM_SPLIT_CALLS( tileClKernel, clblasColumnMajor, 96, 16, M_split_factor, N_split_factor, K_split_factor, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return status; } } } } return clblasNotImplemented; }
int main(int argc, char **argv) { printf("enter demo main\n"); fflush(stdout); putenv("POCL_VERBOSE=1"); putenv("POCL_DEVICES=basic"); putenv("POCL_LEAVE_TEMP_DIRS=1"); putenv("POCL_LEAVE_KERNEL_COMPILER_TEMP_FILES=1"); putenv("POCL_TEMP_DIR=pocl"); putenv("POCL_CACHE_DIR=pocl"); putenv("POCL_WORK_GROUP_METHOD=spmd"); if(argc >= 2){ printf("argv[1]:%s:\n",argv[1]); if(!strcmp(argv[1], "h")) putenv("POCL_WORK_GROUP_METHOD=spmd"); if(!strcmp(argv[1], "c")) putenv("POCL_CROSS_COMPILE=1"); } if(argc >= 3){ printf("argv[2]:%s:\n",argv[2]); if(!strcmp(argv[2], "h")) putenv("POCL_WORK_GROUP_METHOD=spmd"); if(!strcmp(argv[2], "c")) putenv("POCL_CROSS_COMPILE=1"); } //putenv("LD_LIBRARY_PATH=/scratch/colins/build/linux/fs/lib"); //putenv("LTDL_LIBRARY_PATH=/scratch/colins/build/linux/fs/lib"); //lt_dlsetsearchpath("/scratch/colins/build/linux/fs/lib"); //printf("SEARCH_PATH:%s\n",lt_dlgetsearchpath()); cl_platform_id platforms[100]; cl_uint platforms_n = 0; CL_CHECK(clGetPlatformIDs(100, platforms, &platforms_n)); printf("=== %d OpenCL platform(s) found: ===\n", platforms_n); for (int i=0; i<platforms_n; i++) { char buffer[10240]; printf(" -- %d --\n", i); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL)); printf(" PROFILE = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL)); printf(" VERSION = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL)); printf(" NAME = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL)); printf(" VENDOR = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL)); printf(" EXTENSIONS = %s\n", buffer); } if (platforms_n == 0) return 1; cl_device_id devices[100]; cl_uint devices_n = 0; // CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 100, devices, &devices_n)); CL_CHECK(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 100, devices, &devices_n)); printf("=== %d OpenCL device(s) found on platform:\n", platforms_n); for (int i=0; i<devices_n; i++) { char buffer[10240]; cl_uint buf_uint; cl_ulong buf_ulong; printf(" -- %d --\n", i); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL)); printf(" DEVICE_NAME = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VENDOR = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL)); printf(" DRIVER_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL)); printf(" DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong); } if (devices_n == 0) return 1; cl_context context; context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices+1, &pfn_notify, NULL, &_err)); cl_command_queue queue; queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[1], CL_QUEUE_PROFILING_ENABLE, &_err)); cl_kernel kernel = 0; cl_mem memObjects[3] = {0,0,0}; // Create OpenCL program - first attempt to load cached binary. // If that is not available, then create the program from source // and store the binary for future use. std::cout << "Attempting to create program from binary..." << std::endl; cl_program program = CreateProgramFromBinary(context, devices[1], "kernel.cl.bin"); if (program == NULL) { std::cout << "Binary not loaded, create from source..." << std::endl; program = CreateProgram(context, devices[1], "kernel.cl"); if (program == NULL) { Cleanup(context, queue, program, kernel, memObjects); return 1; } std::cout << "Save program binary for future run..." << std::endl; if (SaveProgramBinary(program, devices[1], "kernel.cl.bin") == false) { std::cerr << "Failed to write program binary" << std::endl; Cleanup(context, queue, program, kernel, memObjects); return 1; } } else { std::cout << "Read program from binary." << std::endl; } printf("attempting to create input buffer\n"); fflush(stdout); cl_mem input_bufferA; input_bufferA = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*NUM_DATA*NUM_DATA, NULL, &_err)); cl_mem input_bufferB; input_bufferB = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*NUM_DATA*NUM_DATA, NULL, &_err)); printf("attempting to create output buffer\n"); fflush(stdout); cl_mem output_buffer; output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float)*NUM_DATA*NUM_DATA, NULL, &_err)); memObjects[0] = input_bufferA; memObjects[1] = input_bufferB; memObjects[2] = output_buffer; size_t width = NUM_DATA; printf("attempting to create kernel\n"); fflush(stdout); kernel = CL_CHECK_ERR(clCreateKernel(program, "sgemm_single", &_err)); CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_bufferA), &input_bufferA)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(input_bufferB), &input_bufferB)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(output_buffer), &output_buffer)); CL_CHECK(clSetKernelArg(kernel, 3, sizeof(width), &width)); printf("attempting to enqueue write buffer\n"); fflush(stdout); for (int i=0; i<NUM_DATA*NUM_DATA; i++) { float in = ((float)rand()/(float)(RAND_MAX)) * 100.0; CL_CHECK(clEnqueueWriteBuffer(queue, input_bufferA, CL_TRUE, i*sizeof(float), 4, &in, 0, NULL, NULL)); in = ((float)rand()/(float)(RAND_MAX)) * 100.0; CL_CHECK(clEnqueueWriteBuffer(queue, input_bufferB, CL_TRUE, i*sizeof(float), 4, &in, 0, NULL, NULL)); } cl_event kernel_completion; const size_t local_work_size[3] = { 64, 1, 1}; // a_offset size_t global_work_size[3] = { NUM_DATA, NUM_DATA, NUM_DATA }; printf("attempting to enqueue kernel\n"); fflush(stdout); CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &kernel_completion)); printf("Enqueue'd kerenel\n"); fflush(stdout); cl_ulong time_start, time_end; CL_CHECK(clWaitForEvents(1, &kernel_completion)); CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL)); CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL)); double elapsed = time_end - time_start; printf("time(ns):%lg\n",elapsed); CL_CHECK(clReleaseEvent(kernel_completion)); printf("Result:"); for (int i=0; i<NUM_DATA*NUM_DATA; i++) { float data; CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(float), 4, &data, 0, NULL, NULL)); //printf(" %f", data); } printf("\n"); CL_CHECK(clReleaseMemObject(memObjects[0])); CL_CHECK(clReleaseMemObject(memObjects[1])); CL_CHECK(clReleaseMemObject(memObjects[2])); CL_CHECK(clReleaseKernel(kernel)); CL_CHECK(clReleaseProgram(program)); CL_CHECK(clReleaseContext(context)); return 0; }
int fftCore(cl_mem dst, cl_mem src, cl_mem spin, cl_int m, enum Mode direction) { cl_int ret; cl_int iter; cl_uint flag; cl_int n = 1 << m; cl_event kernelDone; cl_kernel brev = NULL; cl_kernel bfly = NULL; cl_kernel norm = NULL; brev = clCreateKernel(program, "bitReverse", &ret); CL_CHECK(ret); bfly = clCreateKernel(program, "butterfly", &ret); CL_CHECK(ret); norm = clCreateKernel(program, "norm", &ret); CL_CHECK(ret); size_t gws[2]; size_t lws[2]; switch (direction) { case forward: flag = 0x00000000; break; case inverse: flag = 0x80000000; break; } CL_CHECK(ret = clSetKernelArg(brev, 0, sizeof(cl_mem), (void *)&dst)); CL_CHECK(ret = clSetKernelArg(brev, 1, sizeof(cl_mem), (void *)&src)); CL_CHECK(ret = clSetKernelArg(brev, 2, sizeof(cl_int), (void *)&m)); CL_CHECK(ret = clSetKernelArg(brev, 3, sizeof(cl_int), (void *)&n)); CL_CHECK(ret = clSetKernelArg(bfly, 0, sizeof(cl_mem), (void *)&dst)); CL_CHECK(ret = clSetKernelArg(bfly, 1, sizeof(cl_mem), (void *)&spin)); CL_CHECK(ret = clSetKernelArg(bfly, 2, sizeof(cl_int), (void *)&m)); CL_CHECK(ret = clSetKernelArg(bfly, 3, sizeof(cl_int), (void *)&n)); CL_CHECK(ret = clSetKernelArg(bfly, 5, sizeof(cl_uint), (void *)&flag)); CL_CHECK(ret = clSetKernelArg(norm, 0, sizeof(cl_mem), (void *)&dst)); CL_CHECK(ret = clSetKernelArg(norm, 1, sizeof(cl_int), (void *)&n)); setWorkSize(gws, lws, n, n); CL_CHECK(ret = clEnqueueNDRangeKernel(queue, brev, 2, NULL, gws, lws, 0, NULL, NULL)); setWorkSize(gws, lws, n/2, n); for(iter = 1; iter <= m; iter++) { CL_CHECK(ret = clSetKernelArg(bfly, 4, sizeof(cl_int), (void *)&iter)); CL_CHECK(ret = clEnqueueNDRangeKernel(queue, bfly, 2, NULL, gws, lws, 0, NULL, &kernelDone)); CL_CHECK(ret = clWaitForEvents(1, &kernelDone)); } if(direction == inverse) { setWorkSize(gws, lws, n, n); CL_CHECK(ret = clEnqueueNDRangeKernel(queue, norm, 2, NULL, gws, lws, 0, NULL, &kernelDone)); CL_CHECK(ret = clWaitForEvents(1, &kernelDone)); } CL_CHECK(ret = clReleaseKernel(bfly)); CL_CHECK(ret = clReleaseKernel(brev)); CL_CHECK(ret = clReleaseKernel(norm)); return 0; }
int main(int argc, char *argv[]) { //fprintf(stderr, "[%s:%d:%s()] FFT!\n", __FILE__, __LINE__, __func__); LOG("FFT Start\n"); cl_mem xmobj = NULL; cl_mem rmobj = NULL; cl_mem wmobj = NULL; cl_kernel sfac = NULL; cl_kernel trns = NULL; cl_kernel hpfl = NULL; cl_uint ret_num_platforms; cl_uint ret_num_devices; cl_int ret; cl_float2 *xm; cl_float2 *rm; cl_float2 *wm; pgm_t ipgm; pgm_t opgm; FILE *fp; const char fileName[] = "./fft.cl"; size_t source_size; char *source_str; cl_int i, j; cl_int n; cl_int m; size_t gws[2]; size_t lws[2]; fp = fopen(fileName, "r"); if(!fp) { fprintf(stderr, "[%s:%d:%s()] ERROR, Failed to load kernel source.\n", __FILE__, __LINE__, __func__); return 1; } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); readPGM(&ipgm, "./lena.pgm"); n = ipgm.width; m = (cl_int)(log((double)n)/log(2.0)); LOG("n = %d, m = %d.\n", m, n); xm = (cl_float2*)malloc(n*n*sizeof(cl_float2)); rm = (cl_float2*)malloc(n*n*sizeof(cl_float2)); wm = (cl_float2*)malloc(n/2 *sizeof(cl_float2)); for( i = 0; i < n; i++) { for(j = 0; j < n; j++) { ((float*)xm)[2*(n*j + i) + 0] = (float)ipgm.buf[n*j + i]; ((float*)xm)[2*(n*j + i) + 1] = (float)0; } } CL_CHECK(ret = clGetPlatformIDs(MAX_PLATFORM_IDS, platform_ids, &ret_num_platforms)); platform_id = platform_ids[0]; CL_CHECK(ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices)); LOG("platform_id = %p, device_id = %p\n", platform_id, device_id); context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); CL_CHECK(ret); queue = clCreateCommandQueue(context, device_id, 0, &ret); xmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret); CL_CHECK(ret); rmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret); CL_CHECK(ret); wmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret); CL_CHECK(ret); CL_CHECK(ret = clEnqueueWriteBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL)); program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); CL_CHECK(ret); CL_CHECK(ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL)); sfac = clCreateKernel(program, "spinFact", &ret); CL_CHECK(ret); trns = clCreateKernel(program, "transpose", &ret); CL_CHECK(ret); hpfl = clCreateKernel(program, "highPassFilter", &ret); CL_CHECK(ret); CL_CHECK(ret = clSetKernelArg(sfac, 0, sizeof(cl_mem), (void *)&wmobj)); CL_CHECK(ret = clSetKernelArg(sfac, 1, sizeof(cl_int), (void *)&n)); setWorkSize(gws, lws, n/2, 1); CL_CHECK(ret = clEnqueueNDRangeKernel(queue, sfac, 1, NULL, gws, lws, 0, NULL, NULL)); fftCore(rmobj, xmobj, wmobj, m, forward); CL_CHECK(ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&xmobj)); CL_CHECK(ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&rmobj)); CL_CHECK(ret = clSetKernelArg(trns, 2, sizeof(cl_int), (void *)&n)); setWorkSize(gws, lws, n, n); CL_CHECK(ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL)); fftCore(rmobj, xmobj, wmobj, m, forward); #if 1 //FILTER cl_int radius = n>>4; CL_CHECK(ret = clSetKernelArg(hpfl, 0, sizeof(cl_mem), (void *)&rmobj)); CL_CHECK(ret = clSetKernelArg(hpfl, 1, sizeof(cl_int), (void *)&n)); CL_CHECK(ret = clSetKernelArg(hpfl, 2, sizeof(cl_int), (void *)&radius)); setWorkSize(gws, lws, n, n); CL_CHECK(ret = clEnqueueNDRangeKernel(queue, hpfl, 2, NULL, gws, lws, 0, NULL, NULL)); #endif #if 1 /* Inverse FFT */ fftCore(xmobj, rmobj, wmobj, m, inverse); CL_CHECK(ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&rmobj)); CL_CHECK(ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&xmobj)); CL_CHECK(ret = clSetKernelArg(trns, 2, sizeof(cl_int), (void *)&n)); setWorkSize(gws, lws, n, n); CL_CHECK(ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL)); fftCore(xmobj, rmobj, wmobj, m, inverse); #endif CL_CHECK(ret = clEnqueueReadBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL)); float *ampd; ampd = (float*)malloc(n*n*sizeof(float)); for(i = 0; i < n; i++) { for(j = 0; j < n; j++) { ampd[n*i + j] = AMP( ((float*)xm)[2*(n*i + j)], ((float*)xm)[2*(n*i + j) + 1] ); // fprintf(stderr, "%d ", (int)ampd[n*i + j]); } // fprintf(stderr, "\n"); } opgm.width = n; opgm.height = n; normalizeF2PGM(&opgm, ampd); free(ampd); writePGM(&opgm, "output.pgm"); /* Termination */ CL_CHECK(ret = clFlush(queue)); CL_CHECK(ret = clFinish(queue)); CL_CHECK(ret = clReleaseKernel(hpfl)); CL_CHECK(ret = clReleaseKernel(trns)); CL_CHECK(ret = clReleaseKernel(sfac)); CL_CHECK(ret = clReleaseProgram(program)); CL_CHECK(ret = clReleaseMemObject(xmobj)); CL_CHECK(ret = clReleaseMemObject(rmobj)); CL_CHECK(ret = clReleaseMemObject(wmobj)); CL_CHECK(ret = clReleaseCommandQueue(queue)); CL_CHECK(ret = clReleaseContext(context)); destroyPGM(&ipgm); destroyPGM(&opgm); free(source_str); free(wm); free(rm); free(xm); return 0; }
int fft_main(cl_mem dst, cl_mem src, cl_mem twiddles, cl_int m, enum Tipo direcao, struct event_in_fft_t *fft_event) { cl_int ret_code; cl_int iter; cl_uint flag; size_t global_wg[2]; size_t local_wg[2]; cl_int n = 1 << m; cl_kernel kernel_bits_rev = NULL; cl_kernel kernel_butterfly_op = NULL; cl_kernel kernel_normalize = NULL; kernel_bits_rev = clCreateKernel(program, "bits_reverse", &ret_code); kernel_butterfly_op = clCreateKernel(program, "butterfly_operation", &ret_code); kernel_normalize = clCreateKernel(program, "normalizar", &ret_code); switch (direcao) { case direta:flag = 0x00000000; break; case inversa:flag = 0x80000000; break; } CL_CHECK(clSetKernelArg(kernel_bits_rev, 0, sizeof(cl_mem), (void *)&dst)); CL_CHECK(clSetKernelArg(kernel_bits_rev, 1, sizeof(cl_mem), (void *)&src)); CL_CHECK(clSetKernelArg(kernel_bits_rev, 2, sizeof(cl_int), (void *)&m)); CL_CHECK(clSetKernelArg(kernel_bits_rev, 3, sizeof(cl_int), (void *)&n)); CL_CHECK(clSetKernelArg(kernel_butterfly_op, 0, sizeof(cl_mem), (void *)&dst)); CL_CHECK(clSetKernelArg(kernel_butterfly_op, 1, sizeof(cl_mem), (void *)&twiddles)); CL_CHECK(clSetKernelArg(kernel_butterfly_op, 2, sizeof(cl_int), (void *)&m)); CL_CHECK(clSetKernelArg(kernel_butterfly_op, 3, sizeof(cl_int), (void *)&n)); CL_CHECK(clSetKernelArg(kernel_butterfly_op, 5, sizeof(cl_uint), (void *)&flag)); CL_CHECK(clSetKernelArg(kernel_normalize, 0, sizeof(cl_mem), (void *)&dst)); CL_CHECK(clSetKernelArg(kernel_normalize, 1, sizeof(cl_int), (void *)&n)); config_workgroup_size(global_wg, local_wg, n, n); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_bits_rev, 2, NULL, global_wg, local_wg, 0, NULL, &fft_event->kernel_bitsrev)); config_workgroup_size(global_wg, local_wg, n/2, n); for (iter = 1; iter <= m; iter++) { CL_CHECK(clSetKernelArg(kernel_butterfly_op, 4, sizeof(cl_int), (void *)&iter)); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_butterfly_op, 2, NULL, global_wg, local_wg, 0, NULL, &kernel_butter_events[butter_event_it])); butter_event_it++; } fft_event->kernel_normalize = NULL; if (direcao == inversa) { config_workgroup_size(global_wg, local_wg, n, n); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_normalize, 2, NULL, global_wg, local_wg, 0, NULL, &fft_event->kernel_normalize)); } clReleaseKernel(kernel_bits_rev); clReleaseKernel(kernel_butterfly_op); clReleaseKernel(kernel_normalize); return 0; }
clblasStatus DGEMM_BIG_MOD48( clblasTranspose transA, clblasTranspose transB, cl_uint M, cl_uint N, cl_uint K, double alpha, cl_mem A, cl_uint offA, cl_uint lda, cl_mem B, cl_uint offB, cl_uint ldb, double beta, cl_mem C, cl_uint offC, cl_uint ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events, bool &specialCaseHandled) { const char *tileKernelSource = NULL; cl_kernel *tileClKernel = NULL; size_t tileKernelBinarySize = 0; cl_int err; const unsigned char *tileKernelBinary = NULL; clblasStatus status; //split the kernel calls to handle dgemm NT perf drop when matrix sizes are big if ((lda == ldb) && (lda >= 18000) && (lda <= 36000)) // between 18000 and 36000 for now { if (!((transA == clblasNoTrans) && (transB == clblasTrans))) return clblasNotImplemented; unsigned int M_split_factor; unsigned int N_split_factor; unsigned int K_split_factor; if ((M % 192 == 0) && (N % 192 == 0) && (K % 192 == 0) && (K > lda / 4)) { M_split_factor = 4; N_split_factor = 4; K_split_factor = 4; } else if ((M % 96 == 0) && (N % 96 == 0) && (K % 96 == 0) && (K > lda / 4)) { M_split_factor = 2; N_split_factor = 2; K_split_factor = 2; } else { return clblasNotImplemented; } tileKernelSource = dgemm_Col_NT_B1_MX048_NX048_KX08_src; tileClKernel = &dgemm_Col_NT_B1_MX048_NX048_KX08_clKernel; tileKernelBinary = dgemm_Col_NT_B1_MX048_NX048_KX08_bin; tileKernelBinarySize = dgemm_Col_NT_B1_MX048_NX048_KX08_binSize; makeGemmKernel(tileClKernel, commandQueues[0], tileKernelSource, User_srcBuildOptions, &tileKernelBinary, &tileKernelBinarySize, User_binBuildOptions); err = clSetKernelArg(*tileClKernel, 0, sizeof(cl_mem), &A); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 1, sizeof(cl_mem), &B); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 2, sizeof(cl_mem), &C); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 3, sizeof(cl_double), &alpha); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 4, sizeof(cl_double), &beta); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 5, sizeof(cl_uint), &M); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 6, sizeof(cl_uint), &N); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 7, sizeof(cl_uint), &K); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 8, sizeof(cl_uint), &lda); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 9, sizeof(cl_uint), &ldb); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 10, sizeof(cl_uint), &ldc); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 11, sizeof(cl_uint), &offA); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 12, sizeof(cl_uint), &offB); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 13, sizeof(cl_uint), &offC); CL_CHECK(err); status = GEMM_SPLIT_CALLS( tileClKernel, clblasColumnMajor, 48, 8, M_split_factor, N_split_factor, K_split_factor, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); if (status == clblasSuccess) specialCaseHandled = true; return status; } return clblasNotImplemented; }
int main(int argc, char *argv[]) { /* Variaveis obrigatorias do openCL pdccpk*/ cl_platform_id platform_ids[2]; cl_device_id device_id; cl_context context; cl_command_queue commands; cl_program program; cl_kernel kernel_sobel; cl_int ret_code; cl_uint ret_num_devices; cl_uint ret_num_platforms; // cl_event kernel_event; cl_ulong kernel_start_time = (cl_ulong) 0; cl_ulong kernel_end_time = (cl_ulong) 0; cl_ulong kernel_run_time = (cl_ulong) 0; cl_event write_host_dev_event; cl_ulong write_host_dev_start_time = (cl_ulong) 0; cl_ulong write_host_dev_end_time = (cl_ulong) 0; cl_ulong write_host_dev_run_time = (cl_ulong) 0; cl_event read_dev_host_event; cl_ulong read_dev_host_start_time = (cl_ulong) 0; cl_ulong read_dev_host_end_time = (cl_ulong) 0; cl_ulong read_dev_host_run_time = (cl_ulong) 0; unsigned __int64 image_tam; const unsigned __int64 MEGA_BYTES = 1048576; // 1024*1024 double image_tam_MB; double tempo_total; /* objetos que serao armazenados na memoria da GPU */ cl_mem image_in_mem, image_out_mem; /* objetos que serao armazenados na memoria local (host) */ unsigned char *image_in_host, *image_out_host; unsigned int image_width, image_height; size_t image_size; /*IMPORTANTE: dimensionamento dos compute units para exec do kernel*/ size_t work_global[C_NUM_DIMENSOES]; size_t work_local[C_NUM_DIMENSOES]; /*Setup dos nomes de arquivos*/ const char *kernel_filename = C_NOME_ARQ_KERNEL; pgm_t ipgm, opgm; /* Codigo fonte do kernel dever ser aberto como uma cadeia de caracteres*/ image_file_t* image_filename; char* output_filename; FILE *fp; size_t source_size; char *source_str; /* Timer count start */ timer_reset(); timer_start(); if (argc < 2) { printf("**Erro: A imagem de entrada é necessaria.\n"); exit(EXIT_FAILURE); } //=================================================================================================== image_filename = (image_file_t *) malloc(sizeof(image_file_t)); split_image_filename(image_filename, argv[1]); output_filename = (char *) malloc(40*sizeof(char)); sprintf(output_filename, "%d.%d.%s.%s.%s", image_filename->res, image_filename->num, ENV_TYPE, APP_TYPE, EXTENSAO); //=================================================================================================== fp = fopen(kernel_filename, "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char *) malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); //=================================================================================================== // Abrindo imagem do arquivo para objeto de memoria local if( ler_pgm(&ipgm, argv[1]) == -1) exit(EXIT_FAILURE); image_in_host = ipgm.buf; image_width = ipgm.width; image_height = ipgm.height; image_size = (int) (image_width * image_height) * sizeof(unsigned char); image_tam = image_size; /* Alocando memoria para a imagem de saida apos o processamento*/ image_out_host = (unsigned char *) malloc(image_size); //=================================================================================================== /* Recebe um vetor de platform_id e retorna sucesso * se encontrar plataformas OpenCL no sistema, inseridos * essas plataformas no vetor com no maximo MAX_PLATFORM_ID * entradas, caso contrario retorna codigo de erro. * CL_CHECK é um macro para retornar o titulo do erro * a partir de uma funcao que retorne um codigo de erro ***************************************************/ CL_CHECK(clGetPlatformIDs(MAX_PLATFORM_ID, platform_ids, &ret_num_platforms)); if (ret_num_platforms == 0) { fprintf(stderr, "[Erro] Não existem plataformas OpenCL\n"); exit(2); } //=================================================================================================== /* Recebe uma platform_id e retorna sucesso * se obter um device do tipo GPU dessa plataforma OpenCL * caso contrario retorna codigo de erro. ***************************************************/ CL_CHECK(clGetDeviceIDs(platform_ids[1], CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices)); //print_platform_info(&platform_ids[0]); //system("pause"); //exit(0); //=================================================================================================== /* Retorna sucesso se consegui criar um contexto para * o device id escolhido, caso contrario retorna codigo de erro. ***************************************************/ context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret_code); //=================================================================================================== commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret_code); //=================================================================================================== program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, NULL); //=================================================================================================== ret_code = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (ret_code != CL_SUCCESS) { char build_str[4096]; fprintf(stderr, "[ERRO] clBuildProgram '%s' (code: %d)\n", error_cl_str(ret_code), ret_code ); clGetProgramBuildInfo( program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(build_str), build_str, NULL); fprintf(stderr, "[ERRO] log: '%s'\n", build_str); system("pause"); exit(4); } //=================================================================================================== kernel_sobel = clCreateKernel(program, "sobel_kernel", NULL); image_in_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, image_size, NULL, NULL); image_out_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, image_size , NULL, NULL); //=================================================================================================== CL_CHECK(clEnqueueWriteBuffer(commands, image_in_mem, CL_TRUE, 0, image_size, image_in_host, 0, NULL, &write_host_dev_event)); CL_CHECK(clSetKernelArg(kernel_sobel, 0, sizeof(cl_mem), &image_in_mem)); CL_CHECK(clSetKernelArg(kernel_sobel, 1, sizeof(cl_mem), &image_out_mem)); //=================================================================================================== work_global[0] = image_width; work_global[1] = image_height; work_local[0] = MAX_WORK_GROUP_ITEM_SIZE_DIM_1; work_local[1] = MAX_WORK_GROUP_ITEM_SIZE_DIM_2; //=================================================================================================== CL_CHECK(clEnqueueNDRangeKernel(commands, kernel_sobel, 2, NULL, work_global, work_local, 0, NULL, &kernel_event) ); // CL_CHECK(clFinish(commands)); // CL_CHECK( clWaitForEvents(1 , &kernel_event) ); //=================================================================================================== CL_CHECK(clEnqueueReadBuffer(commands, image_out_mem, CL_TRUE, 0, image_size, image_out_host, 0, NULL, &read_dev_host_event)); //== Total time elapsed ============================================================================= timer_stop(); tempo_total = get_elapsed_time(); //=================================================================================================== //====== Get time of Profile Info =================================================================== // kernel sobel time CL_CHECK(clGetEventProfilingInfo(kernel_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(kernel_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); // Write data time CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &write_host_dev_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &write_host_dev_end_time, NULL)); // Read data time CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &read_dev_host_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &read_dev_host_end_time, NULL)); //=================================================================================================== write_host_dev_run_time = write_host_dev_end_time - write_host_dev_start_time; read_dev_host_run_time = read_dev_host_end_time - read_dev_host_start_time; kernel_run_time = kernel_end_time - kernel_start_time; image_tam_MB = (double) (((double) image_tam)/(double) MEGA_BYTES); //=================================================================================================== save_log_gpu(image_filename, kernel_run_time, (double) (image_tam_MB/( (double) read_dev_host_run_time/(double) NANOSECONDS)), (double) (image_tam_MB/ ((double) write_host_dev_run_time/ (double) NANOSECONDS)), tempo_total, LOG_NAME); //=================================================================================================== opgm.width = image_width; opgm.height = image_height; opgm.buf = image_out_host; escrever_pgm(&opgm, output_filename); //=================================================================================================== CL_CHECK(clReleaseMemObject(image_in_mem)); CL_CHECK(clReleaseEvent(kernel_event)); CL_CHECK(clReleaseEvent(read_dev_host_event)); CL_CHECK(clReleaseEvent(write_host_dev_event)); CL_CHECK(clReleaseMemObject(image_out_mem)); CL_CHECK(clReleaseProgram(program)); CL_CHECK(clReleaseKernel(kernel_sobel)); CL_CHECK(clReleaseCommandQueue(commands)); CL_CHECK(clReleaseContext(context)); destruir_pgm(&ipgm); destruir_pgm(&opgm); free(source_str); free(image_filename); free(output_filename); //_CrtDumpMemoryLeaks(); return 0; }
bool OpenCLPlatform::Query() { Check(); // get the name cl_int err = 0; name_ = platform_.getInfo<CL_PLATFORM_NAME>( &err); if (!CL_CHECK( err)) { name_ = "Failed to get platform name."; } // get the vendor vendor_ = platform_.getInfo<CL_PLATFORM_VENDOR>( &err); if (!CL_CHECK( err)) { vendor_ = "Failed to get platform vendor."; } version_ = platform_.getInfo<CL_PLATFORM_VERSION>( &err); if (!CL_CHECK( err)) { version_ = "Failed to get platform version."; } extensions_ = platform_.getInfo<CL_PLATFORM_EXTENSIONS>( &err); if (!CL_CHECK( err)) { extensions_ = "failed to get platform extensions."; } profile_ = platform_.getInfo<CL_PLATFORM_PROFILE>( &err); if (!CL_CHECK( err)) { profile_ = "Failed to get platform profile."; } // CPU & GPU devices if (!CL_CHECK( clGetDeviceIDs(platform_(), CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU, 0, NULL, &(numDevices)))) { return false; } devicePtr = reinterpret_cast<cl_device_id*>(malloc( numDevices * sizeof(cl_device_id))); if (!CL_CHECK( clGetDeviceIDs(platform_(), CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU, numDevices, devicePtr, NULL))) { return false; } // CPU devices if (clGetDeviceIDs( platform_(), CL_DEVICE_TYPE_CPU, 0, NULL, &(numCPUDevices)) == CL_SUCCESS) { cpuDevicePtr = reinterpret_cast<cl_device_id*>(malloc( numCPUDevices * sizeof(cl_device_id))); if (!CL_CHECK(clGetDeviceIDs(platform_(), CL_DEVICE_TYPE_CPU, numCPUDevices, cpuDevicePtr, NULL))) { return false; } for (int i = 0; i < numCPUDevices; i++) { OpenCLDevice d( platform_(), cpuDevicePtr[i]); d.query(); devices.push_back( d); } } // GPU DEVICES if (!CL_CHECK(clGetDeviceIDs(platform_(), CL_DEVICE_TYPE_GPU, 0, NULL, &(numGPUDevices)))) { return false; } gpuDevicePtr = reinterpret_cast<cl_device_id*>(malloc( numGPUDevices * sizeof(cl_device_id))); if (!CL_CHECK(clGetDeviceIDs(platform_(), CL_DEVICE_TYPE_GPU, numGPUDevices, gpuDevicePtr, NULL))) { return false; } for (int i = 0; i < numGPUDevices; i++) { OpenCLDevice d( platform_(), gpuDevicePtr[i]); d.query(); devices.push_back( d); } return true; }
static void init_helmholtzbem3d_opencl(pcbem3d bem) { uint num_kernels; const char *kernel_names[] = { "assemble_slp_cc_list_0", "assemble_slp_cc_list_1", "assemble_slp_cc_list_2", "assemble_slp_cc_list_3", "assemble_dlp_cc_list_0", "assemble_dlp_cc_list_1", "assemble_dlp_cc_list_2", "assemble_dlp_cc_list_3" }; cl_uint num_devices = ocl_system.num_devices; cl_uint num_queues = ocl_system.queues_per_device; cl_uint nthreads = num_devices * num_queues; cl_int res; cl_mem_flags mem_rflags, mem_wflags; uint i, j; real *gr_x; uint *gr_t; real *gr_n; real *q_xw, *x, *w; uint q, v, t; num_kernels = sizeof(kernel_names) / sizeof(kernel_names[0]); mem_rflags = CL_MEM_READ_ONLY; mem_wflags = CL_MEM_WRITE_ONLY; if (ocl_bem3d.num_kernels == 0) { /**************************************************** * Setup all necessary kernels ****************************************************/ setup_kernels(helmholtzbem3d_ocl_src, num_kernels, kernel_names, &ocl_bem3d.kernels); ocl_bem3d.num_kernels = num_kernels; /**************************************************** * Create buffers for matrix chunks ****************************************************/ ocl_bem3d.mem_N = (cl_mem *) allocmem(nthreads * sizeof(cl_mem)); ocl_bem3d.mem_ridx = (cl_mem *) allocmem(nthreads * sizeof(cl_mem)); ocl_bem3d.mem_cidx = (cl_mem *) allocmem(nthreads * sizeof(cl_mem)); for (i = 0; i < num_devices; ++i) { for (j = 0; j < num_queues; ++j) { ocl_bem3d.mem_N[j + i * num_queues] = clCreateBuffer(ocl_system.contexts[i], mem_wflags, ocl_system.max_package_size, NULL, &res); CL_CHECK(res) ocl_bem3d.mem_ridx[j + i * num_queues] = clCreateBuffer(ocl_system.contexts[i], mem_rflags, ocl_system.max_package_size / sizeof(field) * sizeof(uint), NULL, &res); CL_CHECK(res) ocl_bem3d.mem_cidx[j + i * num_queues] = clCreateBuffer(ocl_system.contexts[i], mem_rflags, ocl_system.max_package_size / sizeof(field) * sizeof(uint), NULL, &res); CL_CHECK(res) } } /**************************************************** * Create buffer for non singular quadrature rules * and copy the contents ****************************************************/ q = bem->sq->q; x = allocreal(q); w = allocreal(q); q_xw = allocreal(2 * q); assemble_gauss(q, x, w); for (i = 0; i < q; ++i) { q_xw[2 * i] = 0.5 + 0.5 * x[i]; q_xw[2 * i + 1] = 0.5 * w[i]; } ocl_bem3d.mem_q_xw = (cl_mem *) allocmem(num_devices * sizeof(cl_mem)); for (i = 0; i < num_devices; ++i) { ocl_bem3d.mem_q_xw[i] = clCreateBuffer(ocl_system.contexts[i], mem_rflags, 2 * q * sizeof(real), NULL, &res); CL_CHECK(res) res = clEnqueueWriteBuffer(ocl_system.queues[i * num_queues], ocl_bem3d.mem_q_xw[i], CL_TRUE, 0, 2 * q * sizeof(real), q_xw, 0, NULL, NULL); CL_CHECK(res); } ocl_bem3d.nq = 2 * q; freemem(x); freemem(w); freemem(q_xw); /**************************************************** * Create buffer for singular quadrature rules * and copy the contents ****************************************************/ q = bem->sq->q2; x = allocreal(q); w = allocreal(q); q_xw = allocreal(2 * q); assemble_gauss(q, x, w); for (i = 0; i < q; ++i) { q_xw[2 * i] = 0.5 + 0.5 * x[i]; q_xw[2 * i + 1] = 0.5 * w[i]; } ocl_bem3d.mem_q2_xw = (cl_mem *) allocmem(num_devices * sizeof(cl_mem)); for (i = 0; i < num_devices; ++i) { ocl_bem3d.mem_q2_xw[i] = clCreateBuffer(ocl_system.contexts[i], mem_rflags, 2 * q * sizeof(real), NULL, &res); CL_CHECK(res) res = clEnqueueWriteBuffer(ocl_system.queues[i * num_queues], ocl_bem3d.mem_q2_xw[i], CL_TRUE, 0, 2 * q * sizeof(real), q_xw, 0, NULL, NULL); CL_CHECK(res); } ocl_bem3d.nq2 = 2 * q; freemem(x); freemem(w); freemem(q_xw); /**************************************************** * Create buffers for geometry data and copy the contents ****************************************************/ v = bem->gr->vertices; t = bem->gr->triangles; gr_x = allocreal(3 * v); gr_t = allocuint(3 * t); gr_n = allocreal(3 * t); for (i = 0; i < v; ++i) { gr_x[3 * i + 0] = bem->gr->x[i][0]; gr_x[3 * i + 1] = bem->gr->x[i][1]; gr_x[3 * i + 2] = bem->gr->x[i][2]; } for (i = 0; i < t; ++i) { gr_t[i + 0 * t] = bem->gr->t[i][0]; gr_t[i + 1 * t] = bem->gr->t[i][1]; gr_t[i + 2 * t] = bem->gr->t[i][2]; } for (i = 0; i < t; ++i) { gr_n[3 * i + 0] = bem->gr->n[i][0]; gr_n[3 * i + 1] = bem->gr->n[i][1]; gr_n[3 * i + 2] = bem->gr->n[i][2]; } ocl_bem3d.mem_gr_t = (cl_mem *) allocmem(num_devices * sizeof(cl_mem)); ocl_bem3d.mem_gr_x = (cl_mem *) allocmem(num_devices * sizeof(cl_mem)); for (i = 0; i < num_devices; ++i) { ocl_bem3d.mem_gr_x[i] = clCreateBuffer(ocl_system.contexts[i], mem_rflags, 3 * v * sizeof(real), NULL, &res); CL_CHECK(res) ocl_bem3d.mem_gr_t[i] = clCreateBuffer(ocl_system.contexts[i], mem_rflags, 3 * t * sizeof(uint), NULL, &res); CL_CHECK(res) res = clEnqueueWriteBuffer(ocl_system.queues[i * num_queues], ocl_bem3d.mem_gr_x[i], CL_TRUE, 0, 3 * v * sizeof(real), gr_x, 0, NULL, NULL); CL_CHECK(res); res = clEnqueueWriteBuffer(ocl_system.queues[i * num_queues], ocl_bem3d.mem_gr_t[i], CL_TRUE, 0, 3 * t * sizeof(uint), gr_t, 0, NULL, NULL); CL_CHECK(res); } ocl_bem3d.triangles = t; freemem(gr_x); freemem(gr_t); freemem(gr_n); omp_init_lock(&nf_lock); omp_init_lock(&nf_dist_lock); omp_init_lock(&nf_vert_lock); omp_init_lock(&nf_edge_lock); omp_init_lock(&nf_iden_lock); }
static void fill_cc_ocl_gpu_wrapper_helmholtzbem3d(void *data, uint kernel) { merge_data_nf *mdata = (merge_data_nf *) data; cl_int res; cl_uint nq2; cl_uint triangles; uint current_device; uint current_queue; uint current_kernel_id; uint current_thread; cl_uint num_devices = ocl_system.num_devices; cl_uint num_queues = ocl_system.queues_per_device; cl_event h2d[2], calc; cl_kernel oclkernel; cl_command_queue queue; size_t global_off[] = { 0 }; size_t local_size[] = { 128 }; size_t global_size[] = { ((mdata->pos + local_size[0] - 1) / local_size[0]) * local_size[0] }; field k = mdata->bem->k; field alpha = mdata->bem->alpha; /**************************************************** * Determine device, queue and kernel. ****************************************************/ current_device = omp_get_thread_num() / num_queues; current_queue = omp_get_thread_num() % num_queues; current_thread = current_queue + current_device * num_queues; current_kernel_id = current_queue + current_device * num_queues + kernel * num_devices * num_queues; oclkernel = ocl_bem3d.kernels[current_kernel_id]; queue = ocl_system.queues[current_thread]; /**************************************************** * Transfer input data. ****************************************************/ res = clEnqueueWriteBuffer(queue, ocl_bem3d.mem_ridx[current_thread], CL_FALSE, 0, mdata->pos * sizeof(uint), mdata->ridx, 0, NULL, h2d + 0); CL_CHECK(res) res = clEnqueueWriteBuffer(queue, ocl_bem3d.mem_cidx[current_thread], CL_FALSE, 0, mdata->pos * sizeof(uint), mdata->cidx, 0, NULL, h2d + 1); CL_CHECK(res) /**************************************************** * Setup kernel arguments for 'assemble_xxx_cc_list_z' ****************************************************/ triangles = ocl_bem3d.triangles; if (kernel == 0 || kernel == 4) { nq2 = ocl_bem3d.nq; res = clSetKernelArg(oclkernel, 0, sizeof(cl_mem), &ocl_bem3d.mem_q_xw[current_device]); CL_CHECK(res) res = clSetKernelArg(oclkernel, 1, sizeof(cl_uint), &nq2); } else { nq2 = ocl_bem3d.nq2; res = clSetKernelArg(oclkernel, 0, sizeof(cl_mem), &ocl_bem3d.mem_q2_xw[current_device]); CL_CHECK(res) res = clSetKernelArg(oclkernel, 1, sizeof(cl_uint), &nq2); } CL_CHECK(res) res = clSetKernelArg(oclkernel, 2, sizeof(cl_mem), &ocl_bem3d.mem_gr_t[current_device]); CL_CHECK(res) res = clSetKernelArg(oclkernel, 3, sizeof(cl_mem), &ocl_bem3d.mem_gr_x[current_device]); CL_CHECK(res) res = clSetKernelArg(oclkernel, 4, sizeof(cl_uint), &triangles); CL_CHECK(res) res = clSetKernelArg(oclkernel, 5, sizeof(cl_mem), &ocl_bem3d.mem_ridx[current_thread]); CL_CHECK(res) res = clSetKernelArg(oclkernel, 6, sizeof(cl_mem), &ocl_bem3d.mem_cidx[current_thread]); CL_CHECK(res) res = clSetKernelArg(oclkernel, 7, sizeof(cl_mem), &ocl_bem3d.mem_N[current_thread]); CL_CHECK(res) res = clSetKernelArg(oclkernel, 8, sizeof(cl_uint), &mdata->pos); CL_CHECK(res) res = clSetKernelArg(oclkernel, 9, sizeof(field), &k); CL_CHECK(res) res = clSetKernelArg(oclkernel, 10, sizeof(field), &alpha); CL_CHECK(res) /**************************************************** * Invoke the kernel 'assemble_xxx_cc_list_z' on the GPU ****************************************************/ res = clEnqueueNDRangeKernel(queue, oclkernel, 1, global_off, global_size, local_size, 2, h2d, &calc); CL_CHECK(res) /**************************************************** * Copy results back. ****************************************************/ res = clEnqueueReadBuffer(queue, ocl_bem3d.mem_N[current_thread], CL_TRUE, 0, mdata->pos * sizeof(field), mdata->N, 1, &calc, NULL); CL_CHECK(res) clReleaseEvent(h2d[0]); clReleaseEvent(h2d[1]); clReleaseEvent(calc); }
void clean_all(void) { printf("Cleaning Variables ... \n\n"); // Opencl environment variables clReleaseCommandQueue(command_queue); clReleaseContext(context); // Release all memory allocated if (Data_MeshType == UNSTRUCTURED) { // Mesh Variables free(MeshElementArray.Node1); free(MeshElementArray.Node2); free(MeshElementArray.Node3); free(MeshElementArray.Node4); free(MeshNodeArray_double.x); free(MeshNodeArray_double.y); free(MeshNodeArray_double.z); free(MeshElementArray.Neighborindex1); free(MeshElementArray.Neighborindex2); free(MeshElementArray.Neighborindex3); free(MeshElementArray.Neighborindex4); clReleaseMemObject(Mesh_Node_x); clReleaseMemObject(Mesh_Node_y); clReleaseMemObject(Mesh_Node_z); clReleaseMemObject(Mesh_Element_Node1); clReleaseMemObject(Mesh_Element_Node2); clReleaseMemObject(Mesh_Element_Node3); clReleaseMemObject(Mesh_Element_Node4); clReleaseMemObject(Mesh_Element_Neighborindex1); clReleaseMemObject(Mesh_Element_Neighborindex2); clReleaseMemObject(Mesh_Element_Neighborindex3); clReleaseMemObject(Mesh_Element_Neighborindex4); clReleaseMemObject(r); clReleaseMemObject(s); clReleaseMemObject(t); clReleaseMemObject(eid); } // Cleaning Velocity variables free(velocity.u0); free(velocity.v0); free(velocity.w0); free(velocity.u1); free(velocity.v1); free(velocity.w1); free(velocity.time0); free(velocity.time1); free(Tracer.x); Tracer.x = NULL; free(Tracer.y); Tracer.y = NULL; free(Tracer.z); Tracer.z = NULL; free(Tracer.ElementIndex); Tracer.ElementIndex = NULL; free(Tracer.Start_time); Tracer.Start_time = NULL; free(Tracer.Stop_time); Tracer.Stop_time = NULL; free(Tracer.LeftDomain); Tracer.LeftDomain = NULL; if (Trace_ReleaseStrategy == 1) { free(Tracer1.x); Tracer1.x = NULL; free(Tracer1.y); Tracer1.y = NULL; free(Tracer1.z); Tracer1.z = NULL; free(Tracer1.ElementIndex); Tracer1.ElementIndex = NULL; free(Tracer1.Start_time); Tracer1.Start_time = NULL; free(Tracer1.Stop_time); Tracer1.Stop_time = NULL; free(Tracer1.LeftDomain); Tracer1.LeftDomain = NULL; free(index1); index1 = NULL; free(Tracer.Status); Tracer.Status = NULL; } free(DataTime1); free(Output_time); free(Launch_time); clReleaseMemObject(Vel_U0); clReleaseMemObject(Vel_U1); clReleaseMemObject(Vel_V0); clReleaseMemObject(Vel_V1); clReleaseMemObject(Vel_W0); clReleaseMemObject(Vel_W1); clReleaseMemObject(x_dev); clReleaseMemObject(y_dev); clReleaseMemObject(posx); clReleaseMemObject(posy); clReleaseMemObject(xn0); clReleaseMemObject(xn1); clReleaseMemObject(integrate); if (Dimensions == 3) { clReleaseMemObject(z_dev); clReleaseMemObject(posz); clReleaseMemObject(xn2); } clReleaseMemObject(Start_time_dev); clReleaseMemObject(Stop_time_dev); clReleaseMemObject(ElementIndex_dev); clReleaseMemObject(LeftDomain_dev); // Remove Temp file containing tracer release information if (!Keep_Tempfile) { char BinFile[LONGSTRING]; sprintf(BinFile, "%s%s.bin", Path_Output, Temp_OutFilePrefix); if(remove(BinFile)) fprintf(stderr, "Warning: Could not delete file %s\n", BinFile); } CL_CHECK(clReleaseKernel(kernel1)); CL_CHECK(clReleaseKernel(kernel2)); CL_CHECK(clReleaseKernel(kernel3)); CL_CHECK(clReleaseKernel(kernel4)); CL_CHECK(clReleaseKernel(kernel5)); CL_CHECK(clReleaseProgram(program)); printf("Cleaning Successfull \n\n"); }
void initopencl(void) { int i; // Get Platform and Device Info CL_CHECK(clGetPlatformIDs(1, &platform_id, &num_platforms)); // Currently this program only runs on a SINGLE GPU. CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &num_devices)); printf("=== %d OpenCL platform(s) found: ===\n", num_platforms); printf("=== %d OpenCL device(s) found on platform:\n", num_devices); char buffer[10240]; cl_uint buf_uint; cl_ulong buf_ulong; printf(" -- %d --\n", i); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), buffer, NULL)); printf(" DEVICE_NAME = %s\n", buffer); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VENDOR = %s\n", buffer); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL)); printf(" DRIVER_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL)); printf(" DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong); if (num_devices == 0) { fprintf(stderr, "No Devices found that can run OpenCL."); exit(0); } // Create OpenCL context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating context: Function returned %d \n\n", ret); exit(1); } // Create Command Queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating command Queue: Function returned %d \n\n", ret); exit(1); } // Load the kernel source code into the array source_str FILE *fp; char *source_str; size_t source_size; fp = fopen("integrate.cl", "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char*)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); fclose( fp ); // Create a program from the kernel source program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a program for integration3D. %d \n\n", (int)ret); exit(1); } // Build the program ret = clBuildProgram(program, 1, &device_id, "-DUSE_DOUBLE=1", NULL, NULL); if (ret != CL_SUCCESS) { size_t length; char buffer[10240]; clGetProgramBuildInfo(program, 1, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &length); fprintf(stderr, "Error returned %d. \n\n", (int)ret); printf("Error Log: \n\n %s \n\n", buffer); exit(0); } /* // Create the OpenCL kernel (compute_points_Unstructure3D_1) kernel1 = clCreateKernel(program, "compute_points_Unstructure3D_1", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for compute_points_Unstructure3D_1. \n\n"); exit(1); } */ // Create the OpenCL kernel (check_int) kernel2 = clCreateKernel(program, "check_int", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for check_int. %d \n\n", (int)ret); exit(1); } // Create the OpenCL kernel (compute_points_Unstructure3D_1) kernel1 = clCreateKernel(program, "compute_points_Unstructure3D_1", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for compute_points_Unstructure3D_1. \n\n"); exit(1); } // Create the OpenCL kernel (initialize_timestep3D) kernel3 = clCreateKernel(program, "initialize_timestep3D", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for initialize_timestep3D. \n\n"); exit(1); } // Create the OpenCL kernel (initialize_timestep3D) kernel4 = clCreateKernel(program, "LocalSearch3D", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for LocalSearch3D. \n\n"); exit(1); } // Create the OpenCL kernel (initialize_timestep3D) kernel5 = clCreateKernel(program, "compute_points_Unstructure3D_2", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for LocalSearch3D. \n\n"); exit(1); } printf("\n\n"); }
clblasStatus SGEMM_SPLIT64_32( clblasTranspose transA, clblasTranspose transB, cl_uint M, cl_uint N, cl_uint K, float alpha, cl_mem A, cl_uint offA, cl_uint lda, cl_mem B, cl_uint offB, cl_uint ldb, float beta, cl_mem C, cl_uint offC, cl_uint ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events, bool &specialCaseHandled) { //all the mod32 sizes that is not mod64 or mod96 ranging from 1184 to 3872 //non mod32 cases are not implemented in this approach and are of less interest const char *tileKernelSource = NULL; const char *rowKernelSource = NULL; const char *columnKernelSource = NULL; const char *singleKernelSource = NULL; cl_kernel *tileClKernel = NULL; cl_kernel *rowClKernel = NULL; cl_kernel *columnClKernel = NULL; cl_kernel *singleClKernel = NULL; const unsigned char *tileKernelBinary = NULL; const unsigned char *rowKernelBinary = NULL; const unsigned char *columnKernelBinary = NULL; const unsigned char *singleKernelBinary = NULL; size_t tileKernelBinarySize = 0; size_t rowKernelBinarySize = 0; size_t columnKernelBinarySize = 0; size_t singleKernelBinarySize = 0; cl_int err; if ((M >= 1184 && N >= 1184) && (M <= 3872 && N <= 3872) && (M % 64 != 0 && N % 64 != 0) && (M % 96 != 0 && N % 96 != 0) && (K % 16 == 0)) { if ((M % 32 == 0 && N % 32 == 0) && (transA == clblasNoTrans && transB == clblasTrans)) { specialCaseHandled = true; //execute the kernels //GlobalX = ((Mvalue - 1) / 64) * 16 //GlobalY = ((Nvalue - 1) / 64) * 16 size_t GlobalX = ((M - 1) / 64) * 16; size_t GlobalY = ((N - 1) / 64) * 16; size_t gs[2] = { GlobalX, GlobalY }; size_t wgsize[2] = { 16, 16 }; tileKernelSource = sgemm_Col_NT_B1_MX064_NX064_KX16_src; tileClKernel = &sgemm_Col_NT_B1_MX064_NX064_KX16_clKernel; tileKernelBinary = sgemm_Col_NT_B1_MX064_NX064_KX16_bin; tileKernelBinarySize = sgemm_Col_NT_B1_MX064_NX064_KX16_binSize; rowKernelSource = sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src; rowClKernel = &sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_clKernel; rowKernelBinary = sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_bin; rowKernelBinarySize = sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_binSize; columnKernelSource = sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_src; columnClKernel = &sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_clKernel; columnKernelBinary = sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_bin; columnKernelBinarySize = sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_binSize; singleKernelSource = sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src; singleClKernel = &sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_clKernel; singleKernelBinary = sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_bin; singleKernelBinarySize = sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_binSize; cl_kernel * Kernels[4] = { tileClKernel, rowClKernel, columnClKernel, singleClKernel }; makeGemmKernel(tileClKernel, commandQueues[0], tileKernelSource, User_srcBuildOptions, &tileKernelBinary, &tileKernelBinarySize, User_binBuildOptions); makeGemmKernel(rowClKernel, commandQueues[0], rowKernelSource, User_srcBuildOptions, &rowKernelBinary, &rowKernelBinarySize, User_binBuildOptions); makeGemmKernel(columnClKernel, commandQueues[0], columnKernelSource, User_srcBuildOptions, &columnKernelBinary, &columnKernelBinarySize, User_binBuildOptions); makeGemmKernel(singleClKernel, commandQueues[0], singleKernelSource, User_srcBuildOptions, &singleKernelBinary, &singleKernelBinarySize, User_binBuildOptions); for (int i = 0; i < 4; i++) { err = clSetKernelArg(*Kernels[i], 0, sizeof(cl_mem), &A); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 1, sizeof(cl_mem), &B); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 2, sizeof(cl_mem), &C); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 3, sizeof(cl_float), &alpha); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 4, sizeof(cl_float), &beta); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 5, sizeof(cl_uint), &M); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 6, sizeof(cl_uint), &N); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 7, sizeof(cl_uint), &K); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 8, sizeof(cl_uint), &lda); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 9, sizeof(cl_uint), &ldb); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 10, sizeof(cl_uint), &ldc); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 11, sizeof(cl_uint), &offA); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 12, sizeof(cl_uint), &offB); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 13, sizeof(cl_uint), &offC); CL_CHECK(err); } err = clEnqueueNDRangeKernel(commandQueues[0], *Kernels[0], 2, NULL, gs, wgsize, numEventsInWaitList, eventWaitList, NULL); gs[0] = 16; err |= clEnqueueNDRangeKernel(commandQueues[0], *Kernels[1], 2, NULL, gs, wgsize, 0, NULL, NULL); gs[1] = 16; gs[0] = GlobalX; err |= clEnqueueNDRangeKernel(commandQueues[0], *Kernels[2], 2, NULL, gs, wgsize, 0, NULL, NULL); gs[0] = 16; gs[1] = 16; err |= clEnqueueNDRangeKernel(commandQueues[0], *Kernels[3], 2, NULL, gs, wgsize, 0, NULL, events); if (err == 0) return clblasSuccess; } } return clblasNotImplemented; }
void clPrintDevices() { cl_int err; // Enumerate platforms. cl_platform_id platforms[8]; cl_uint numPlatforms; CL_CHECK(clGetPlatformIDs(8, platforms, &numPlatforms)); for (cl_uint ii = 0; ii < numPlatforms; ++ii) { // Get platform vendor. char vendor[256]; CL_CHECK(clGetPlatformInfo(platforms[ii], CL_PLATFORM_VENDOR, 256, vendor, NULL)); // Check for known vendors and save vendor str for later printing. char platformOutputStr[32]; if (NULL != bx::stristr(vendor, "advanced micro devices", 256)) { dm::strscpya(platformOutputStr, "amd"); } else if (NULL != bx::stristr(vendor, "intel", 256)) { dm::strscpya(platformOutputStr, "intel"); } else if (NULL != bx::stristr(vendor, "nvidia", 256)) { dm::strscpya(platformOutputStr, "nvidia"); } else { dm::strscpya(platformOutputStr, dm::trim(vendor)); } // Enumerate current platform devices. cl_device_id devices[8]; cl_uint numDevices; err = clGetDeviceIDs(platforms[ii], CL_DEVICE_TYPE_ALL, 8, devices, &numDevices); if (CL_SUCCESS == err) { for (cl_uint jj = 0; jj < numDevices; ++jj) { // Get device name. char deviceName[128]; CL_CHECK(clGetDeviceInfo(devices[jj], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL)); // Get device type. cl_device_type deviceType; CL_CHECK(clGetDeviceInfo(devices[jj], CL_DEVICE_TYPE, sizeof(deviceType), &deviceType, NULL)); // Get device type str. char deviceTypeOutputStr[16]; if (CMFT_CL_DEVICE_TYPE_GPU == deviceType) { dm::strscpya(deviceTypeOutputStr, "gpu"); } else if (CMFT_CL_DEVICE_TYPE_CPU == deviceType) { dm::strscpya(deviceTypeOutputStr, "cpu"); } else if (CMFT_CL_DEVICE_TYPE_ACCELERATOR == deviceType) { dm::strscpya(deviceTypeOutputStr, "accelerator"); } else //if (CMFT_CL_DEVICE_TYPE_DEFAULT == deviceType) { dm::strscpya(deviceTypeOutputStr, "default"); } // Print device info. INFO("%-40s --clVendor %-6s --deviceIndex %u --deviceType %s" , dm::trim(deviceName) , platformOutputStr , uint32_t(jj) , deviceTypeOutputStr ); } } } }
clblasStatus SGEMM_BRANCH_32( clblasTranspose transA, clblasTranspose transB, cl_uint M, cl_uint N, cl_uint K, float alpha, cl_mem A, cl_uint offA, cl_uint lda, cl_mem B, cl_uint offB, cl_uint ldb, float beta, cl_mem C, cl_uint offC, cl_uint ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events, bool &specialCaseHandled) { const char *tileKernelSource = NULL; cl_kernel *tileClKernel = NULL; size_t tileKernelBinarySize = 0; cl_int err; const unsigned char *tileKernelBinary = NULL; clblasStatus status; if ((M * N < 1080 * 1080) && (M % 32 != 0 || N % 32 != 0) && (K%16==0)) { // ((Mvalue - 1) / 32 + 1) * 16 size_t GlobalX = ((M - 1) / 32 + 1) * 16; size_t GlobalY = ((N - 1) / 32 + 1) * 16; size_t gs[2] = { GlobalX, GlobalY }; size_t wgsize[2] = { 16, 16 }; if (transA == clblasNoTrans && transB == clblasNoTrans) { specialCaseHandled = true; tileKernelSource = sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src; tileClKernel = &sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_clKernel; tileKernelBinary = sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_bin; tileKernelBinarySize = sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_binSize; makeGemmKernel(tileClKernel, commandQueues[0], tileKernelSource, User_srcBuildOptions, &tileKernelBinary, &tileKernelBinarySize, User_binBuildOptions); err = clSetKernelArg(*tileClKernel, 0, sizeof(cl_mem), &A); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 1, sizeof(cl_mem), &B); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 2, sizeof(cl_mem), &C); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 3, sizeof(cl_float), &alpha); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 4, sizeof(cl_float), &beta); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 5, sizeof(cl_uint), &M); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 6, sizeof(cl_uint), &N); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 7, sizeof(cl_uint), &K); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 8, sizeof(cl_uint), &lda); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 9, sizeof(cl_uint), &ldb); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 10, sizeof(cl_uint), &ldc); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 11, sizeof(cl_uint), &offA); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 12, sizeof(cl_uint), &offB); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 13, sizeof(cl_uint), &offC); CL_CHECK(err); err = clEnqueueNDRangeKernel(commandQueues[0], *tileClKernel, 2, NULL, gs, wgsize, numEventsInWaitList, eventWaitList, &events[0]); if (err == 0) return clblasSuccess; } if (transA == clblasNoTrans && transB == clblasTrans) { specialCaseHandled = true; tileKernelSource = sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src; tileClKernel = &sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_clKernel; tileKernelBinary = sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_bin; tileKernelBinarySize = sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_binSize; makeGemmKernel(tileClKernel, commandQueues[0], tileKernelSource, User_srcBuildOptions, &tileKernelBinary, &tileKernelBinarySize, User_binBuildOptions); err = clSetKernelArg(*tileClKernel, 0, sizeof(cl_mem), &A); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 1, sizeof(cl_mem), &B); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 2, sizeof(cl_mem), &C); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 3, sizeof(cl_float), &alpha); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 4, sizeof(cl_float), &beta); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 5, sizeof(cl_uint), &M); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 6, sizeof(cl_uint), &N); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 7, sizeof(cl_uint), &K); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 8, sizeof(cl_uint), &lda); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 9, sizeof(cl_uint), &ldb); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 10, sizeof(cl_uint), &ldc); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 11, sizeof(cl_uint), &offA); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 12, sizeof(cl_uint), &offB); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 13, sizeof(cl_uint), &offC); CL_CHECK(err); err = clEnqueueNDRangeKernel(commandQueues[0], *tileClKernel, 2, NULL, gs, wgsize, numEventsInWaitList, eventWaitList, &events[0]); if (err == 0) return clblasSuccess; } if (transA == clblasTrans && transB == clblasNoTrans) { specialCaseHandled = true; tileKernelSource = sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src; tileClKernel = &sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_clKernel; tileKernelBinary = sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_bin; tileKernelBinarySize = sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_binSize; makeGemmKernel(tileClKernel, commandQueues[0], tileKernelSource, User_srcBuildOptions, &tileKernelBinary, &tileKernelBinarySize, User_binBuildOptions); err = clSetKernelArg(*tileClKernel, 0, sizeof(cl_mem), &A); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 1, sizeof(cl_mem), &B); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 2, sizeof(cl_mem), &C); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 3, sizeof(cl_float), &alpha); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 4, sizeof(cl_float), &beta); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 5, sizeof(cl_uint), &M); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 6, sizeof(cl_uint), &N); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 7, sizeof(cl_uint), &K); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 8, sizeof(cl_uint), &lda); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 9, sizeof(cl_uint), &ldb); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 10, sizeof(cl_uint), &ldc); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 11, sizeof(cl_uint), &offA); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 12, sizeof(cl_uint), &offB); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 13, sizeof(cl_uint), &offC); CL_CHECK(err); err = clEnqueueNDRangeKernel(commandQueues[0], *tileClKernel, 2, NULL, gs, wgsize, numEventsInWaitList, eventWaitList, &events[0]); if (err == 0) return clblasSuccess; } } return clblasNotImplemented; }
bool ClContext::init(uint8_t _vendor , cl_device_type _preferredDeviceType , cl_uint _preferredDeviceIdx , char* _vendorStrPart ) { cl_int err = CL_SUCCESS; // Enumerate platforms. cl_platform_id platforms[8]; cl_uint numPlatforms; CL_CHECK(clGetPlatformIDs(8, platforms, &numPlatforms)); // Choose preferred platform. cl_platform_id choosenPlatform = platforms[0]; if (NULL != _vendorStrPart) { char buffer[256]; for (cl_uint ii = 0; ii < numPlatforms; ++ii) { // Get platform vendor. CL_CHECK(clGetPlatformInfo(platforms[ii], CL_PLATFORM_VENDOR, 256, buffer, NULL)); if (_vendor&CMFT_CL_VENDOR_OTHER) { // If specific vendor is requested, check for it. const char* searchSpecific = bx::stristr(buffer, _vendorStrPart, 256); if (NULL != searchSpecific) { choosenPlatform = platforms[ii]; break; } } else { bool found = false; // Check for predefined vendors. if (_vendor&CMFT_CL_VENDOR_AMD) { const char* searchAmd = bx::stristr(buffer, "advanced micro devices", 256); found |= (NULL != searchAmd); } if (_vendor&CMFT_CL_VENDOR_INTEL) { const char* searchIntel = bx::stristr(buffer, "intel", 256); found |= (NULL != searchIntel); } if (_vendor&CMFT_CL_VENDOR_NVIDIA) { const char* searchNvidia = bx::stristr(buffer, "nvidia", 256); found |= (NULL != searchNvidia); } if (found) { choosenPlatform = platforms[ii]; break; } } } } // Enumerate devices. cl_device_id devices[8]; cl_uint numDevices = 0; // First try to get preferred device type. for (cl_uint ii = 0; ii < numPlatforms; ++ii) { err = clGetDeviceIDs(platforms[ii], _preferredDeviceType, 8, devices, &numDevices); if (CL_SUCCESS == err) { choosenPlatform = platforms[ii]; break; } } // If failed, just get anything there is. if (CL_SUCCESS != err) { for (cl_uint ii = 0; ii < numPlatforms; ++ii) { err = clGetDeviceIDs(platforms[ii], CL_DEVICE_TYPE_ALL, 8, devices, &numDevices); if (CL_SUCCESS == err) { choosenPlatform = platforms[ii]; break; } } } if (CL_SUCCESS != err) { WARN("OpenCL context initialization failed!"); return false; } // Choose preferred device and create context. cl_uint preferredDeviceIdx = (_preferredDeviceIdx < numDevices) ? _preferredDeviceIdx : 0; cl_device_id chosenDevice = devices[preferredDeviceIdx]; cl_context context = clCreateContext(NULL, 1, &chosenDevice, NULL, NULL, &err); if (CL_SUCCESS != err) { chosenDevice = devices[0]; context = clCreateContext(NULL, 1, &chosenDevice, NULL, NULL, &err); if (CL_SUCCESS != err) { WARN("OpenCL context initialization failed!"); return false; } } // Get device name, vendor and type. char deviceVendor[128]; CL_CHECK(clGetPlatformInfo(choosenPlatform, CL_PLATFORM_VENDOR, sizeof(deviceVendor), deviceVendor, NULL)); char deviceName[128]; CL_CHECK(clGetDeviceInfo(chosenDevice, CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL)); CL_CHECK(clGetDeviceInfo(chosenDevice, CL_DEVICE_TYPE, sizeof(m_deviceType), &m_deviceType, NULL)); // Create command queue cl_command_queue commandQueue; commandQueue = clCreateCommandQueue(context, chosenDevice, 0, &err); if (CL_SUCCESS != err) { WARN("OpenCL context initialization failed!"); return false; } // Fill structure. dm::strscpya(m_deviceVendor, dm::trim(deviceVendor)); dm::strscpya(m_deviceName, dm::trim(deviceName)); m_device = chosenDevice; m_context = context; m_commandQueue = commandQueue; return true; }
int main(int argc, char **argv) { cl_platform_id platforms[100]; cl_uint platforms_n = 0; CL_CHECK(clGetPlatformIDs(100, platforms, &platforms_n)); printf("=== %d OpenCL platform(s) found: ===\n", platforms_n); for (int i=0; i<platforms_n; i++) { char buffer[10240]; printf(" -- %d --\n", i); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL)); printf(" PROFILE = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL)); printf(" VERSION = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL)); printf(" NAME = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL)); printf(" VENDOR = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL)); printf(" EXTENSIONS = %s\n", buffer); } if (platforms_n == 0) return 1; cl_device_id devices[100]; cl_uint devices_n = 0; // CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 100, devices, &devices_n)); CL_CHECK(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 100, devices, &devices_n)); printf("=== %d OpenCL device(s) found on platform:\n", platforms_n); for (int i=0; i<devices_n; i++) { char buffer[10240]; cl_uint buf_uint; cl_ulong buf_ulong; printf(" -- %d --\n", i); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL)); printf(" DEVICE_NAME = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VENDOR = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL)); printf(" DRIVER_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL)); printf(" DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong); } if (devices_n == 0) return 1; cl_context context; context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices, &pfn_notify, NULL, &_err)); const char *program_source[] = { "__kernel void simple_demo(__global int *src, __global int *dst, int factor)\n", "{\n", " int i = get_global_id(0);\n", " dst[i] = src[i] * factor;\n", "}\n" }; cl_program program; program = CL_CHECK_ERR(clCreateProgramWithSource(context, sizeof(program_source)/sizeof(*program_source), program_source, NULL, &_err)); if (clBuildProgram(program, 1, devices, "", NULL, NULL) != CL_SUCCESS) { char buffer[10240]; clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL); fprintf(stderr, "CL Compilation failed:\n%s", buffer); abort(); } CL_CHECK(clUnloadCompiler()); cl_mem input_buffer; input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*NUM_DATA, NULL, &_err)); cl_mem output_buffer; output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int)*NUM_DATA, NULL, &_err)); int factor = 2; cl_kernel kernel; kernel = CL_CHECK_ERR(clCreateKernel(program, "simple_demo", &_err)); CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor)); cl_command_queue queue; queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[0], 0, &_err)); for (int i=0; i<NUM_DATA; i++) { CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, i*sizeof(int), sizeof(int), &i, 0, NULL, NULL)); } cl_event kernel_completion; size_t global_work_size[1] = { NUM_DATA }; CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &kernel_completion)); CL_CHECK(clWaitForEvents(1, &kernel_completion)); CL_CHECK(clReleaseEvent(kernel_completion)); printf("Result:"); for (int i=0; i<NUM_DATA; i++) { int data; CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(int), sizeof(int), &data, 0, NULL, NULL)); printf(" %d", data); } printf("\n"); CL_CHECK(clReleaseMemObject(input_buffer)); CL_CHECK(clReleaseMemObject(output_buffer)); CL_CHECK(clReleaseKernel(kernel)); CL_CHECK(clReleaseProgram(program)); CL_CHECK(clReleaseContext(context)); return 0; }
int main(int argc, char *argv[]) { //FILE *fp; cl_platform_id platform_id[2]; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret_code; cl_mem image_in_mem = NULL; cl_mem image_out_mem = NULL; cl_mem twiddle_factors_mem = NULL; cl_float2 *image_in_host; cl_float2 *twiddle_factors_host; cl_kernel kernel_twiddle_factors; cl_kernel kernel_matriz_transpose; cl_kernel kernel_lowpass_filter; pgm_t ipgm; pgm_t opgm; image_file_t *image_filename; char *output_filename; FILE *fp; const char *kernel_filename = C_NOME_ARQ_KERNEL; size_t source_size; char *source_str; cl_int i, j,n ,m; cl_int raio = 0; size_t global_wg[2]; size_t local_wg[2]; float *image_amplitudes; size_t log_size; char *log_file; cl_event kernels_events_out_fft[4]; cl_ulong kernel_runtime = (cl_ulong) 0; cl_ulong kernel_start_time = (cl_ulong) 0; cl_ulong kernel_end_time = (cl_ulong) 0; cl_event write_host_dev_event; cl_ulong write_host_dev_start_time = (cl_ulong) 0; cl_ulong write_host_dev_end_time = (cl_ulong) 0; cl_ulong write_host_dev_run_time = (cl_ulong) 0; cl_event read_dev_host_event; cl_ulong read_dev_host_start_time = (cl_ulong) 0; cl_ulong read_dev_host_end_time = (cl_ulong) 0; cl_ulong read_dev_host_run_time = (cl_ulong) 0; unsigned __int64 image_tam; unsigned __int64 MEGA_BYTES = 1048576; // 1024*1024 double image_tam_MB; double tempo_total; struct event_in_fft_t *fft_events; //=== Timer count start ============================================================================== timer_reset(); timer_start(); //=================================================================================================== if (argc < 2) { printf("**Erro: O arquivo de entrada eh necessario.\n"); exit(EXIT_FAILURE); } image_filename = (image_file_t *) malloc(sizeof(image_file_t)); split_image_filename(image_filename, argv[1]); output_filename = (char *) malloc(40*sizeof(char)); sprintf(output_filename, "%d.%d.%s.%s.%s", image_filename->res, image_filename->num, ENV_TYPE, APP_TYPE, EXTENSAO); fp = fopen(kernel_filename, "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(EXIT_FAILURE); } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose( fp ); //=================================================================================================== /* Abrindo imagem do arquivo para objeto de memoria local*/ if( ler_pgm(&ipgm, argv[1]) == -1) exit(EXIT_FAILURE); n = ipgm.width; raio = n/8; m = (cl_int)(log((double)n)/log(2.0)); image_in_host = (cl_float2 *)malloc((n*n)*sizeof(cl_float2)); twiddle_factors_host = (cl_float2 *)malloc(n / 2 * sizeof(cl_float2)); for (i = 0; i < n; i++) { for (j = 0; j < n; j++) { image_in_host[n*i + j].s[0] = (float)ipgm.buf[n*i + j]; image_in_host[n*i + j].s[1] = (float)0; } } fft_events = (struct event_in_fft_t *)malloc(MAX_CALL_FFT*sizeof(struct event_in_fft_t)); kernel_butter_events = (cl_event *)malloc(MAX_CALL_FFT*m*sizeof(cl_event)); //=================================================================================================== CL_CHECK(clGetPlatformIDs(MAX_PLATFORM_ID, platform_id, &ret_num_platforms)); if (ret_num_platforms == 0 ) { fprintf(stderr,"[Erro] Não existem plataformas OpenCL\n"); exit(2); } //=================================================================================================== CL_CHECK(clGetDeviceIDs( platform_id[0], CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices)); //print_platform_info(&platform_id[1]); //=================================================================================================== context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret_code); //=================================================================================================== cmd_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret_code); //=================================================================================================== image_in_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret_code); image_out_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret_code); twiddle_factors_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, (n/2)*sizeof(cl_float2), NULL, &ret_code); //=================================================================================================== /* Transfer data to memory buffer */ CL_CHECK(clEnqueueWriteBuffer(cmd_queue, image_in_mem, CL_TRUE, 0, n*n*sizeof(cl_float2), image_in_host, 0, NULL, &write_host_dev_event)); image_tam = n*n*sizeof(cl_float2); //=================================================================================================== program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret_code); //=================================================================================================== ret_code = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); //=================================================================================================== if (ret_code != CL_SUCCESS) { // Determine the size of the log clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); //=================================================================================================== // Allocate memory for the log log_file = (char *) malloc(log_size); // Get the log clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log_file, NULL); printf("%s\n", log_file); system("pause"); exit(0); } kernel_twiddle_factors = clCreateKernel(program, "twiddle_factors", &ret_code); kernel_matriz_transpose = clCreateKernel(program, "matrix_trasponse", &ret_code); kernel_lowpass_filter = clCreateKernel(program, "lowpass_filter", &ret_code); /* Processa os fatores Wn*/ //=================================================================================================== CL_CHECK(clSetKernelArg(kernel_twiddle_factors, 0, sizeof(cl_mem), (void *)&twiddle_factors_mem)); CL_CHECK(clSetKernelArg(kernel_twiddle_factors, 1, sizeof(cl_int), (void *)&n)); config_workgroup_size(global_wg, local_wg, n/2, 1); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_twiddle_factors, 1, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[0])); //=================================================================================================== /* Executa a FFT em N/2 */ fft_main(image_out_mem, image_in_mem, twiddle_factors_mem, m, direta, &fft_events[0]); //=================================================================================================== /* Realiza a transposta da Matriz (imagem) */ CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 0, sizeof(cl_mem), (void *)&image_in_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 1, sizeof(cl_mem), (void *)&image_out_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 2, sizeof(cl_int), (void *)&n)); config_workgroup_size(global_wg, local_wg, n, n); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_matriz_transpose, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[1])); //=================================================================================================== /* Executa a FFT N/2 */ fft_main(image_out_mem, image_in_mem, twiddle_factors_mem, m, direta, &fft_events[1]); //=================================================================================================== /* Processa o filtro passa baixa */ CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 0, sizeof(cl_mem), (void *)&image_out_mem)); CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 1, sizeof(cl_int), (void *)&n)); CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 2, sizeof(cl_int), (void *)&raio)); config_workgroup_size(global_wg, local_wg, n, n); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_lowpass_filter, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[2])); //=================================================================================================== /* Obtem a FFT inversa*/ fft_main(image_in_mem, image_out_mem, twiddle_factors_mem, m, inversa, &fft_events[2]); //=================================================================================================== /* Realiza a transposta da Matriz (imagem) */ CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 0, sizeof(cl_mem), (void *)&image_out_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 1, sizeof(cl_mem), (void *)&image_in_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 2, sizeof(cl_int), (void *)&n)); config_workgroup_size(global_wg, local_wg, n, n); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_matriz_transpose, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[3])); //=================================================================================================== fft_main(image_in_mem, image_out_mem, twiddle_factors_mem, m, inversa, &fft_events[3]); //=================================================================================================== CL_CHECK(clEnqueueReadBuffer(cmd_queue, image_in_mem, CL_TRUE, 0, n*n*sizeof(cl_float2), image_in_host, 0, NULL, &read_dev_host_event)); //=================================================================================================== //== Total time elapsed ============================================================================ timer_stop(); tempo_total = get_elapsed_time(); //================================================================================================== //====== Get time of Profile Info ================================================================== // Write data time CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &write_host_dev_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &write_host_dev_end_time, NULL)); // Read data time CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &read_dev_host_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &read_dev_host_end_time, NULL)); for (i = 0; i < MAX_CALL_FFT; i++) { kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; CL_CHECK(clGetEventProfilingInfo(kernels_events_out_fft[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(kernels_events_out_fft[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_bitsrev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_bitsrev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; if (fft_events[i].kernel_normalize != NULL) { CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_normalize, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_normalize, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); } } for (j=0; j < MAX_CALL_FFT*m; j++){ kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; CL_CHECK(clGetEventProfilingInfo(kernel_butter_events[j], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(kernel_butter_events[j], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); } write_host_dev_run_time = write_host_dev_end_time - write_host_dev_start_time; read_dev_host_run_time = read_dev_host_end_time - read_dev_host_start_time; /* save_log_debug(write_host_dev_run_time,fp); save_log_debug(read_dev_host_run_time,fp); close_log_debug(fp); */ image_tam_MB = (double) (((double) image_tam)/(double) MEGA_BYTES); //================================================================================================== save_log_gpu(image_filename, kernel_runtime, (double) (image_tam_MB/( (double) read_dev_host_run_time/(double) NANOSECONDS)), (double) (image_tam_MB/ ((double) write_host_dev_run_time/ (double) NANOSECONDS)), tempo_total, LOG_NAME); //=================================================================================================== image_amplitudes = (float*)malloc(n*n*sizeof(float)); for (i=0; i < n; i++) { for (j=0; j < n; j++) { image_amplitudes[n*j + i] = (float) (AMP(((float*)image_in_host)[(2*n*j)+2*i], ((float*)image_in_host)[(2*n*j)+2*i+1])); } } //clFlush(cmd_queue); //clFinish(cmd_queue); opgm.width = n; opgm.height = n; normalizar_pgm(&opgm, image_amplitudes); escrever_pgm(&opgm, output_filename); //=================================================================================================== clFinish(cmd_queue); clReleaseKernel(kernel_twiddle_factors); clReleaseKernel(kernel_matriz_transpose); clReleaseKernel(kernel_lowpass_filter); clReleaseProgram(program); clReleaseMemObject(image_in_mem); clReleaseMemObject(image_out_mem); clReleaseMemObject(twiddle_factors_mem); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); clReleaseEvent(read_dev_host_event); clReleaseEvent(write_host_dev_event); clReleaseEvent(kernels_events_out_fft[0]); clReleaseEvent(kernels_events_out_fft[1]); clReleaseEvent(kernels_events_out_fft[2]); clReleaseEvent(kernels_events_out_fft[3]); destruir_pgm(&ipgm); destruir_pgm(&opgm); free(image_amplitudes); free(source_str); free(image_in_host); free(image_filename); free(twiddle_factors_host); free(output_filename); free(fft_events); free(kernel_butter_events); //_CrtDumpMemoryLeaks(); return 0; }