Complex HighPrecisionComplexPolynom::evaluateAt(Complex z) { cln::cl_N precZ = complex(cl_float(z.x,clnDIGIT), cl_float(z.y,clnDIGIT)); cln::cl_N precRes = evaluateAt(precZ); Complex res(double_approx(realpart(precRes)), double_approx(imagpart(precRes))); return res; }
void HighPrecisionComplexPolynom::ini(int len, int digit) { DIGIT = digit; clnDIGIT = cln::float_format(DIGIT); char* xxxStr = new char[1000]; snprintf(xxxStr,1000,"1.0e+0_%d",DIGIT); if (LogLevel>3) printf("Initializing ONE with: %s\n",xxxStr); ONE = xxxStr; snprintf(xxxStr,1000,"2.0e+0_%d",DIGIT); if (LogLevel>3) printf("Initializing TWO with: %s\n",xxxStr); TWO = xxxStr; snprintf(xxxStr,1000,"0.0e+0_%d",DIGIT); if (LogLevel>3) printf("Initializing ZERO with: %s\n",xxxStr); ZERO = xxxStr; snprintf(xxxStr,1000,"0.5e+0_%d",DIGIT); if (LogLevel>3) printf("Initializing HALF with: %s\n",xxxStr); HALF = xxxStr; delete[] xxxStr; length = len; coeff = new cln::cl_N[len]; for (int I=0; I<len; I++) { coeff[I] = complex(cl_float(0,clnDIGIT), cl_float(0,clnDIGIT)); } }
ifft2d_hermitian_inplace::ifft2d_hermitian_inplace( gpu::compute::command_queue queue, const math::ivec2 &size, size_t num_batches) { static detail::fft_api fft_api; size_t N = size.x; size_t M = size.y; size_t lenghts[] = { N, M }; size_t in_stride[] = { 1, N / 2 + 1 }; size_t out_stride[] = { 1, N + 2 }; auto context = queue.getInfo<CL_QUEUE_CONTEXT>(); CLFFT_CHECK(clfftCreateDefaultPlan(&fft_plan, context(), CLFFT_2D, lenghts)); CLFFT_CHECK(clfftSetPlanBatchSize(fft_plan, num_batches)); CLFFT_CHECK(clfftSetPlanPrecision(fft_plan, detail::clfft_precision<math::real>::value)); CLFFT_CHECK(clfftSetResultLocation(fft_plan, CLFFT_INPLACE)); CLFFT_CHECK(clfftSetLayout(fft_plan, CLFFT_HERMITIAN_INTERLEAVED, CLFFT_REAL)); CLFFT_CHECK(clfftSetPlanInStride(fft_plan, CLFFT_2D, in_stride)); CLFFT_CHECK(clfftSetPlanOutStride(fft_plan, CLFFT_2D, out_stride)); CLFFT_CHECK(clfftSetPlanScale(fft_plan, CLFFT_BACKWARD, cl_float(1))); CLFFT_CHECK(clfftSetPlanDistance(fft_plan, M * (N / 2 + 1), M * (N + 2))); CLFFT_CHECK(clfftBakePlan(fft_plan, 1, &queue(), nullptr, nullptr)); size_t tmp_sz; CLFFT_CHECK(clfftGetTmpBufSize(fft_plan, &tmp_sz)); tmp_buf = gpu::compute::buffer(context, CL_MEM_READ_WRITE, tmp_sz); }
Id cl_atom(Id token) { CL_ACQUIRE_STR_D(dt, token, clNil); char *ep; long l = strtol(dt.s, &ep, 10); if (ep && *ep == '\0') return cl_int((int)l); float f = strtof(dt.s, &ep); if (ep && *ep == '\0') return cl_float(f); return cl_intern(token); }
void HighPrecisionComplex::ini(int digit, double vx, double vy) { DIGIT = digit; clnDIGIT = cln::float_format(DIGIT); char* xxxStr = new char[1000]; snprintf(xxxStr,1000,"1.0e+0_%d",DIGIT); if (LogLevel>4) printf("Initializing ONE with: %s\n",xxxStr); ONE = xxxStr; snprintf(xxxStr,1000,"2.0e+0_%d",DIGIT); if (LogLevel>4) printf("Initializing TWO with: %s\n",xxxStr); TWO = xxxStr; snprintf(xxxStr,1000,"0.0e+0_%d",DIGIT); if (LogLevel>4) printf("Initializing ZERO with: %s\n",xxxStr); ZERO = xxxStr; snprintf(xxxStr,1000,"0.5e+0_%d",DIGIT); if (LogLevel>4) printf("Initializing HALF with: %s\n",xxxStr); HALF = xxxStr; delete[] xxxStr; z = complex(cl_float(vx,clnDIGIT), cl_float(vy,clnDIGIT)); }
// Main function // ********************************************************************* int main(int argc, char **argv) { void *srcA, *srcB, *dst; // Host buffers for OpenCL test cl_context cxGPUContext; // OpenCL context cl_command_queue cqCommandQue; // OpenCL command que cl_device_id* cdDevices; // OpenCL device list cl_program cpProgram; // OpenCL program cl_kernel ckKernel; // OpenCL kernel cl_mem cmMemObjs[3]; // OpenCL memory buffer objects: 3 for device size_t szGlobalWorkSize[1]; // 1D var for Total # of work items size_t szLocalWorkSize[1]; // 1D var for # of work items in the work group size_t szParmDataBytes; // Byte size of context information cl_int ciErr1, ciErr2; // Error code var int iTestN = 100000 * 8; // Size of Vectors to process int actualGlobalSize = iTestN>>3; // set Global and Local work size dimensions szGlobalWorkSize[0] = iTestN >> 3; // do 8 computations per work item szLocalWorkSize[0]= iTestN>>3; // Allocate and initialize host arrays srcA = (void *)malloc (sizeof(cl_float) * iTestN); srcB = (void *)malloc (sizeof(cl_float) * iTestN); dst = (void *)malloc (sizeof(cl_float) * iTestN); int i; // Initialize arrays with some values for (i=0;i<iTestN;i++) { ((cl_float*)srcA)[i] = cl_float(i); ((cl_float*)srcB)[i] = 2; ((cl_float*)dst)[i]=-1; } cl_uint numPlatforms; cl_platform_id platform = NULL; cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms); if (0 < numPlatforms) { cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); platform = platforms[i]; if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { break; } } delete[] platforms; } cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; // Create OpenCL context & context cxGPUContext = clCreateContextFromType(cps, CL_DEVICE_TYPE_ALL, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU // Query all devices available to the context ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes); cdDevices = (cl_device_id*)malloc(szParmDataBytes); ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL); if (cdDevices) { printDevInfo(cdDevices[0]); } // Create a command queue for first device the context reported cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr2); ciErr1 |= ciErr2; // Allocate the OpenCL source and result buffer memory objects on the device GMEM cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcA, &ciErr2); ciErr1 |= ciErr2; cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcB, &ciErr2); ciErr1 |= ciErr2; cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float8) * szGlobalWorkSize[0], NULL, &ciErr2); ciErr1 |= ciErr2; ///create kernels from binary int numDevices = 1; ::size_t* lengths = (::size_t*) malloc(numDevices * sizeof(::size_t)); const unsigned char** images = (const unsigned char**) malloc(numDevices * sizeof(const void*)); for (i = 0; i < numDevices; ++i) { images[i] = 0; lengths[i] = 0; } // Read the OpenCL kernel in from source file const char* cSourceFile = "VectorAddKernels.cl"; printf("loadProgSource (%s)...\n", cSourceFile); const char* cPathAndName = cSourceFile; #ifdef LOAD_FROM_FILE size_t szKernelLength; const char* cSourceCL = loadProgSource(cPathAndName, "", &szKernelLength); #else const char* cSourceCL = stringifiedSourceCL; size_t szKernelLength = strlen(stringifiedSourceCL); #endif //LOAD_FROM_FILE // Create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1); printf("clCreateProgramWithSource...\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__); exit(0); } // Build the program with 'mad' Optimization option #ifdef MAC char* flags = "-cl-mad-enable -DMAC -DGUID_ARG"; #else const char* flags = "-DGUID_ARG="; #endif ciErr1 = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL); printf("clBuildProgram...\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__); exit(0); } // Create the kernel ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1); printf("clCreateKernel (VectorAdd)...\n"); if (ciErr1 != CL_SUCCESS) { printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); exit(0); } cl_int ciErrNum; ciErrNum = clGetKernelWorkGroupInfo(ckKernel, cdDevices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL); if (ciErrNum != CL_SUCCESS) { printf("cannot get workgroup size\n"); exit(0); } // Set the Argument values ciErr1 |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmMemObjs[0]); ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmMemObjs[1]); ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmMemObjs[2]); int workgroupSize = wgSize; if(workgroupSize <= 0) { // let OpenCL library calculate workgroup size size_t globalWorkSize[2]; globalWorkSize[0] = actualGlobalSize; globalWorkSize[1] = 1; // Copy input data from host to GPU and launch kernel ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalWorkSize, NULL, 0,0,0 ); } else { size_t localWorkSize[2], globalWorkSize[2]; workgroupSize = btMin(workgroupSize, actualGlobalSize); int num_t = actualGlobalSize / workgroupSize; int num_g = num_t * workgroupSize; if(num_g < actualGlobalSize) { num_t++; //this can cause problems -> processing outside of the buffer //make sure to check kernel } size_t globalThreads[] = {num_t * workgroupSize}; size_t localThreads[] = {workgroupSize}; localWorkSize[0] = workgroupSize; globalWorkSize[0] = num_t * workgroupSize; localWorkSize[1] = 1; globalWorkSize[1] = 1; // Copy input data from host to GPU and launch kernel ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL); } if (ciErrNum != CL_SUCCESS) { printf("cannot clEnqueueNDRangeKernel\n"); exit(0); } clFinish(cqCommandQue); // Read back results and check accumulated errors ciErr1 |= clEnqueueReadBuffer(cqCommandQue, cmMemObjs[2], CL_TRUE, 0, sizeof(cl_float8) * szGlobalWorkSize[0], dst, 0, NULL, NULL); // Release kernel, program, and memory objects // NOTE: Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity. free(cdDevices); clReleaseKernel(ckKernel); clReleaseProgram(cpProgram); clReleaseCommandQueue(cqCommandQue); clReleaseContext(cxGPUContext); // print the results int iErrorCount = 0; for (i = 0; i < iTestN; i++) { if (((float*)dst)[i] != ((float*)srcA)[i]+((float*)srcB)[i]) iErrorCount++; } if (iErrorCount) { printf("MiniCL validation FAILED\n"); } else { printf("MiniCL validation SUCCESSFULL\n"); } // Free host memory, close log and return success for (i = 0; i < 3; i++) { clReleaseMemObject(cmMemObjs[i]); } free(srcA); free(srcB); free (dst); printf("Press ENTER to quit\n"); getchar(); }
cl_object cl_float_sign(cl_narg narg, ...) { #line 245 // ------------------------------2 #line 245 const cl_env_ptr the_env = ecl_process_env(); #line 245 cl_object y; #line 245 bool yp; #line 245 va_list ARGS; va_start(ARGS, narg); cl_object x = va_arg(ARGS,cl_object); #line 245 // ------------------------------3 int negativep; #line 248 // ------------------------------4 #line 248 #line 248 if (ecl_unlikely(narg < 1|| narg > 2)) FEwrong_num_arguments(ecl_make_fixnum(378)); #line 248 if (narg > 1) { #line 248 y = va_arg(ARGS,cl_object); #line 248 yp = TRUE; #line 248 } else { #line 248 y = x; #line 248 yp = FALSE; #line 248 } #line 248 // ------------------------------5 if (!yp) { y = cl_float(2, ecl_make_fixnum(1), x); } negativep = ecl_signbit(x); switch (ecl_t_of(y)) { case t_singlefloat: { float f = ecl_single_float(y); if (signbit(f) != negativep) y = ecl_make_single_float(-f); break; } case t_doublefloat: { double f = ecl_double_float(y); if (signbit(f) != negativep) y = ecl_make_double_float(-f); break; } #ifdef ECL_LONG_FLOAT case t_longfloat: { long double f = ecl_long_float(y); if (signbit(f) != negativep) y = ecl_make_long_float(-f); break; } #endif default: FEwrong_type_nth_arg(ecl_make_fixnum(/*FLOAT-SIGN*/378),2,y,ecl_make_fixnum(/*FLOAT*/374)); } { #line 273 #line 273 cl_object __value0 = y; #line 273 the_env->nvalues = 1; #line 273 return __value0; #line 273 } ; }
cln::cl_N HighPrecisionComplexPolynom::getCoeff(int p) { if ((p<0) || (p>=length)) return complex(cl_float(0,clnDIGIT), cl_float(0,clnDIGIT)); return coeff[p]; }
void HighPrecisionComplexPolynom::addCoeff(int p, Complex val) { if ((p<0) || (p>=length)) return; coeff[p] = coeff[p] + complex(cl_float(val.x,clnDIGIT), cl_float(val.y,clnDIGIT)); }
void HighPrecisionComplex::ini(int digit, double vx) { ini(digit, 0, 0); z = complex(cl_float(vx,clnDIGIT), ZERO); }
// Main function // ********************************************************************* int main(int argc, char **argv) { void *srcA, *srcB, *dst; // Host buffers for OpenCL test cl_context cxGPUContext; // OpenCL context cl_command_queue cqCommandQue; // OpenCL command que cl_device_id* cdDevices; // OpenCL device list cl_program cpProgram; // OpenCL program cl_kernel ckKernel; // OpenCL kernel cl_mem cmMemObjs[3]; // OpenCL memory buffer objects: 3 for device size_t szGlobalWorkSize[1]; // 1D var for Total # of work items size_t szLocalWorkSize[1]; // 1D var for # of work items in the work group size_t szParmDataBytes; // Byte size of context information cl_int ciErr1, ciErr2; // Error code var int iTestN = 100000 * 8; // Size of Vectors to process // set Global and Local work size dimensions szGlobalWorkSize[0] = iTestN >> 3; // do 8 computations per work item szLocalWorkSize[0]= iTestN>>3; // Allocate and initialize host arrays srcA = (void *)malloc (sizeof(cl_float) * iTestN); srcB = (void *)malloc (sizeof(cl_float) * iTestN); dst = (void *)malloc (sizeof(cl_float) * iTestN); int i; // Initialize arrays with some values for (i=0;i<iTestN;i++) { ((cl_float*)srcA)[i] = cl_float(i); ((cl_float*)srcB)[i] = 2; ((cl_float*)dst)[i]=-1; } // Create OpenCL context & context cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_CPU, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU // Query all devices available to the context ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes); cdDevices = (cl_device_id*)malloc(szParmDataBytes); ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL); if (cdDevices) { printDevInfo(cdDevices[0]); } // Create a command queue for first device the context reported cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr2); ciErr1 |= ciErr2; // Allocate the OpenCL source and result buffer memory objects on the device GMEM cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcA, &ciErr2); ciErr1 |= ciErr2; cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcB, &ciErr2); ciErr1 |= ciErr2; cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float8) * szGlobalWorkSize[0], NULL, &ciErr2); ciErr1 |= ciErr2; ///create kernels from binary int numDevices = 1; cl_int err; ::size_t* lengths = (::size_t*) malloc(numDevices * sizeof(::size_t)); const unsigned char** images = (const unsigned char**) malloc(numDevices * sizeof(const void*)); for (i = 0; i < numDevices; ++i) { images[i] = 0; lengths[i] = 0; } cpProgram = clCreateProgramWithBinary(cxGPUContext, numDevices,cdDevices,lengths, images, 0, &err); // Build the executable program from a binary ciErr1 |= clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL); // Create the kernel ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1); // Set the Argument values ciErr1 |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmMemObjs[0]); ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmMemObjs[1]); ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmMemObjs[2]); // Copy input data from host to GPU and launch kernel ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL); // Read back results and check accumulated errors ciErr1 |= clEnqueueReadBuffer(cqCommandQue, cmMemObjs[2], CL_TRUE, 0, sizeof(cl_float8) * szGlobalWorkSize[0], dst, 0, NULL, NULL); // Release kernel, program, and memory objects // NOTE: Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity. free(cdDevices); clReleaseKernel(ckKernel); clReleaseProgram(cpProgram); clReleaseCommandQueue(cqCommandQue); clReleaseContext(cxGPUContext); // print the results int iErrorCount = 0; for (i = 0; i < iTestN; i++) { if (((float*)dst)[i] != ((float*)srcA)[i]+((float*)srcB)[i]) iErrorCount++; } if (iErrorCount) { printf("MiniCL validation FAILED\n"); } else { printf("MiniCL validation SUCCESSFULL\n"); } // Free host memory, close log and return success for (i = 0; i < 3; i++) { clReleaseMemObject(cmMemObjs[i]); } free(srcA); free(srcB); free (dst); }
static void dump(viennacl::backend::mem_handle const & buff, uniform_tag tag, cl_uint start, cl_uint size){ viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::rand<ScalarType,1>::program_name(),"dump_uniform"); k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(size,k.local_work_size(0))); viennacl::ocl::enqueue(k(buff.opencl_handle(), start, size, cl_float(tag.a), cl_float(tag.b) , cl_uint(time(0)))); }