void ocl_real_arr_arr_sca(const char* kernel_name, modelica_real* src_1, modelica_real* src_2, modelica_real* dest, int size_){ cl_program OpenCLProgram; cl_kernel OpenCLfunction; clock_t c0, c1; cl_int err; cl_int inc1, inc2; inc1=1; inc2=1; size_t WorkSize[1] = {static_cast<size_t>(size_)}; size_t localWorkSize[1] = {32}; // one dimensional Range if (!device_comm_queue) ocl_initialize(); //This can be moved out. left here hoping that similar ops will be called //sequentialy. If we kept them in one .cl file we dont have to build again OpenCLProgram = ocl_build_p_from_src("real_ar_ar_sca.cl", true); OpenCLfunction = clCreateKernel(OpenCLProgram, kernel_name, &err); ocl_error_check(OCL_CREATE_KERNEL, err); cl_mem device_array_1 = ocl_alloc_init_real_arr(src_1, size_); cl_mem device_array_2 = ocl_alloc_init_real_arr(src_2, size_); cl_mem result = ocl_alloc_init_real_arr(NULL, 1); err = clSetKernelArg(OpenCLfunction, 0, sizeof(cl_int),(void*)&size_); err |= clSetKernelArg(OpenCLfunction, 1, sizeof(cl_mem), (void*)&device_array_1); err |= clSetKernelArg(OpenCLfunction, 2, sizeof(cl_int), (void*)&inc1); err |= clSetKernelArg(OpenCLfunction, 3, sizeof(cl_mem), (void*)&device_array_2); err |= clSetKernelArg(OpenCLfunction, 4, sizeof(cl_int), (void*)&inc2); err |= clSetKernelArg(OpenCLfunction, 5, sizeof(cl_mem), (void*)&result); err |= clSetKernelArg(OpenCLfunction, 6, sizeof(modelica_real)*localWorkSize[0], NULL); ocl_error_check(OCL_SET_KER_ARGS, err); c0 = clock(); err = clEnqueueNDRangeKernel(device_comm_queue, OpenCLfunction, 1, NULL, WorkSize, localWorkSize, 0, NULL, NULL); ocl_error_check(OCL_ENQUE_ND_RANGE_KERNEL, err); clFinish(device_comm_queue); c1 = clock(); printf ("\telapsed CPU CLOCKS: %f sec\n", (float) (c1-c0)/1000); ocl_copy_back_to_host_real(result, dest, 1); clReleaseMemObject(device_array_1); clReleaseMemObject(device_array_2); clReleaseMemObject(result); clReleaseKernel(OpenCLfunction); clReleaseProgram(OpenCLProgram); }
void ocl_real_matrix_matrix_matrix(const char* kernel_name, modelica_real* src_1, int M, modelica_real* src_2, int N, modelica_real* dest, int K){ cl_program OpenCLProgram; cl_kernel OpenCLfunction; clock_t c0, c1; cl_int err; size_t WorkSize[2] = {static_cast<size_t>(M), static_cast<size_t>(N)}; size_t localWorkSize[2] = {16,16}; if (!device_comm_queue){ printf("------------------------------Initizlizing---------------------\n"); ocl_initialize(); } //This can be moved out. left here hoping that similar ops will be called //sequentialy. If we kept them in one .cl file we dont have to build again OpenCLProgram = ocl_build_p_from_src("matrix.cl", true); OpenCLfunction = clCreateKernel(OpenCLProgram, kernel_name, &err); ocl_error_check(OCL_CREATE_KERNEL, err); cl_mem device_array_1 = ocl_alloc_init_real_arr(src_1, M*K); cl_mem device_array_2 = ocl_alloc_init_real_arr(src_2, K*N); cl_mem result = ocl_alloc_init_real_arr(NULL, M*N); err = clSetKernelArg(OpenCLfunction, 0, sizeof(cl_mem), (void*)&result); err |= clSetKernelArg(OpenCLfunction, 1, sizeof(cl_mem), (void*)&device_array_1); err |= clSetKernelArg(OpenCLfunction, 2, sizeof(cl_mem), (void*)&device_array_2); err |= clSetKernelArg(OpenCLfunction, 3, sizeof(cl_int), (void*)&K); err |= clSetKernelArg(OpenCLfunction, 4, sizeof(cl_int), (void*)&N); ocl_error_check(OCL_SET_KER_ARGS, err); c0 = clock(); err = clEnqueueNDRangeKernel(device_comm_queue, OpenCLfunction, 2, NULL, WorkSize, localWorkSize, 0, NULL, NULL); ocl_error_check(OCL_ENQUE_ND_RANGE_KERNEL, err); clFinish(device_comm_queue); c1 = clock(); printf ("\telapsed CPU CLOCKS: %f sec\n", (float) (c1-c0)/1000); ocl_copy_back_to_host_real(result, dest, M*N); clReleaseMemObject(device_array_1); clReleaseMemObject(device_array_2); clReleaseMemObject(result); clReleaseKernel(OpenCLfunction); clReleaseProgram(OpenCLProgram); }
/**
 * Sets up the OpenCL state used by the wrappers in this file and prints how
 * long the setup took, in milliseconds.
 *
 * When no command queue exists yet, this selects a device (unless one is
 * already set), calls ocl_create_context_and_comm_queue() and then
 * ocl_build_p_from_src(). When a queue already exists only the timing line
 * is printed.
 *
 * The environment variable CUDA_CACHE_DISABLE is forced to "1" on every call.
 */
void ocl_initialize(){
    // Export the variable before any OpenCL work is done in this function so
    // that the context creation and program build below can observe it;
    // setting it only after the build is too late to affect that build.
    // NOTE(review): if the vendor driver was already loaded by an earlier
    // call (e.g. ocl_get_device elsewhere), it may have sampled the
    // environment already - confirm against the driver documentation.
    setenv("CUDA_CACHE_DISABLE", "1", 1);

    timeval t1, t2;
    gettimeofday(&t1, NULL);

    if (!device_comm_queue){
        if (!ocl_device){
            ocl_get_device();
        }
        ocl_create_context_and_comm_queue();
        ocl_build_p_from_src();
    }

    gettimeofday(&t2, NULL);

    double elapsedTime = (t2.tv_sec - t1.tv_sec) * 1000.0;   // sec to ms
    elapsedTime += (t2.tv_usec - t1.tv_usec) / 1000.0;       // us to ms
    printf ("\tOpenCL initialization : %lf ms\n", elapsedTime);
}