Complex HighPrecisionComplexPolynom::evaluateAt(Complex z) {
  cln::cl_N precZ = complex(cl_float(z.x,clnDIGIT), cl_float(z.y,clnDIGIT));
  cln::cl_N precRes = evaluateAt(precZ);
  Complex res(double_approx(realpart(precRes)), double_approx(imagpart(precRes)));

  return res;
}
void HighPrecisionComplexPolynom::ini(int len, int digit) {
  DIGIT = digit;
  clnDIGIT = cln::float_format(DIGIT);  
  
  char* xxxStr = new char[1000];
  snprintf(xxxStr,1000,"1.0e+0_%d",DIGIT);
  if (LogLevel>3) printf("Initializing ONE with: %s\n",xxxStr);
  ONE = xxxStr;

  snprintf(xxxStr,1000,"2.0e+0_%d",DIGIT);
  if (LogLevel>3) printf("Initializing TWO with: %s\n",xxxStr);
  TWO = xxxStr;

  snprintf(xxxStr,1000,"0.0e+0_%d",DIGIT);
  if (LogLevel>3) printf("Initializing ZERO with: %s\n",xxxStr);
  ZERO = xxxStr;

  snprintf(xxxStr,1000,"0.5e+0_%d",DIGIT);
  if (LogLevel>3) printf("Initializing HALF with: %s\n",xxxStr);
  HALF = xxxStr;
  delete[] xxxStr;
  
  length = len;
  coeff = new cln::cl_N[len];
  for (int I=0; I<len; I++) {
    coeff[I] = complex(cl_float(0,clnDIGIT), cl_float(0,clnDIGIT));
  }
}
Пример #3
0
ifft2d_hermitian_inplace::ifft2d_hermitian_inplace(
    gpu::compute::command_queue queue,
    const math::ivec2 &size,
    size_t num_batches)
{
    static detail::fft_api fft_api;
    size_t N = size.x;
    size_t M = size.y;
    size_t lenghts[] = { N, M };
    size_t in_stride[] = { 1, N / 2 + 1 };
    size_t out_stride[] = { 1, N + 2 };

    auto context = queue.getInfo<CL_QUEUE_CONTEXT>();
    CLFFT_CHECK(clfftCreateDefaultPlan(&fft_plan, context(), CLFFT_2D, lenghts));
    CLFFT_CHECK(clfftSetPlanBatchSize(fft_plan, num_batches));
    CLFFT_CHECK(clfftSetPlanPrecision(fft_plan, detail::clfft_precision<math::real>::value));
    CLFFT_CHECK(clfftSetResultLocation(fft_plan, CLFFT_INPLACE));
    CLFFT_CHECK(clfftSetLayout(fft_plan, CLFFT_HERMITIAN_INTERLEAVED, CLFFT_REAL));
    CLFFT_CHECK(clfftSetPlanInStride(fft_plan, CLFFT_2D, in_stride));
    CLFFT_CHECK(clfftSetPlanOutStride(fft_plan, CLFFT_2D, out_stride));
    CLFFT_CHECK(clfftSetPlanScale(fft_plan, CLFFT_BACKWARD, cl_float(1)));
    CLFFT_CHECK(clfftSetPlanDistance(fft_plan, M * (N / 2 + 1), M * (N + 2)));

    CLFFT_CHECK(clfftBakePlan(fft_plan, 1, &queue(), nullptr, nullptr));

    size_t tmp_sz;
    CLFFT_CHECK(clfftGetTmpBufSize(fft_plan, &tmp_sz));
    tmp_buf = gpu::compute::buffer(context, CL_MEM_READ_WRITE, tmp_sz);
}
Пример #4
0
Id cl_atom(Id token) {
  CL_ACQUIRE_STR_D(dt, token, clNil);
  char *ep;
  long l = strtol(dt.s, &ep, 10);
  if (ep && *ep == '\0') return cl_int((int)l);
  float f = strtof(dt.s, &ep);
  if (ep && *ep == '\0') return cl_float(f);
  return cl_intern(token);
}
Пример #5
0
void HighPrecisionComplex::ini(int digit, double vx, double vy) {
  DIGIT = digit;
  clnDIGIT = cln::float_format(DIGIT);  
  
  char* xxxStr = new char[1000];
  snprintf(xxxStr,1000,"1.0e+0_%d",DIGIT);
  if (LogLevel>4) printf("Initializing ONE with: %s\n",xxxStr);
  ONE = xxxStr;

  snprintf(xxxStr,1000,"2.0e+0_%d",DIGIT);
  if (LogLevel>4) printf("Initializing TWO with: %s\n",xxxStr);
  TWO = xxxStr;

  snprintf(xxxStr,1000,"0.0e+0_%d",DIGIT);
  if (LogLevel>4) printf("Initializing ZERO with: %s\n",xxxStr);
  ZERO = xxxStr;

  snprintf(xxxStr,1000,"0.5e+0_%d",DIGIT);
  if (LogLevel>4) printf("Initializing HALF with: %s\n",xxxStr);
  HALF = xxxStr;
  delete[] xxxStr;
  
  z = complex(cl_float(vx,clnDIGIT), cl_float(vy,clnDIGIT));
}
Пример #6
0
// Main function 
// *********************************************************************
int main(int argc, char **argv)
{
	void *srcA, *srcB, *dst;        // Host buffers for OpenCL test
    cl_context cxGPUContext;       // OpenCL context
    cl_command_queue cqCommandQue;  // OpenCL command que
    cl_device_id* cdDevices;        // OpenCL device list    
    cl_program cpProgram;           // OpenCL program
    cl_kernel ckKernel;             // OpenCL kernel
    cl_mem cmMemObjs[3];            // OpenCL memory buffer objects:  3 for device
    size_t szGlobalWorkSize[1];     // 1D var for Total # of work items
    size_t szLocalWorkSize[1];		// 1D var for # of work items in the work group	
    size_t szParmDataBytes;			// Byte size of context information
    cl_int ciErr1, ciErr2;			// Error code var
    int iTestN = 100000 * 8;		// Size of Vectors to process

	int actualGlobalSize = iTestN>>3;
	
    // set Global and Local work size dimensions
    szGlobalWorkSize[0] = iTestN >> 3;  // do 8 computations per work item
    szLocalWorkSize[0]= iTestN>>3;
	
	
    // Allocate and initialize host arrays
    srcA = (void *)malloc (sizeof(cl_float) * iTestN);
    srcB = (void *)malloc (sizeof(cl_float) * iTestN);
    dst = (void *)malloc (sizeof(cl_float) * iTestN);

	int i;

	// Initialize arrays with some values
	for (i=0;i<iTestN;i++)
	{
		((cl_float*)srcA)[i] = cl_float(i);
		((cl_float*)srcB)[i] = 2;
		((cl_float*)dst)[i]=-1;
	}


	 cl_uint numPlatforms;
    cl_platform_id platform = NULL;
    cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);

    if (0 < numPlatforms) 
    {
        cl_platform_id* platforms = new cl_platform_id[numPlatforms];
        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
        
        for (unsigned i = 0; i < numPlatforms; ++i) 
        {
            char pbuf[100];
            status = clGetPlatformInfo(platforms[i],
                                       CL_PLATFORM_VENDOR,
                                       sizeof(pbuf),
                                       pbuf,
                                       NULL);

            platform = platforms[i];
            if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) 
            {
                break;
            }
        }
        delete[] platforms;
    }

	cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    // Create OpenCL context & context
    cxGPUContext = clCreateContextFromType(cps, CL_DEVICE_TYPE_ALL, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU
	
    // Query all devices available to the context
    ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
    cdDevices = (cl_device_id*)malloc(szParmDataBytes);
    ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
	if (cdDevices)
	{
		printDevInfo(cdDevices[0]);
	}

    // Create a command queue for first device the context reported
    cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr2);
    ciErr1 |= ciErr2; 

    // Allocate the OpenCL source and result buffer memory objects on the device GMEM
    cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcA, &ciErr2);
    ciErr1 |= ciErr2;
    cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcB, &ciErr2);
    ciErr1 |= ciErr2;
    cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float8) * szGlobalWorkSize[0], NULL, &ciErr2);
    ciErr1 |= ciErr2;

///create kernels from binary
	int numDevices = 1;
	::size_t* lengths = (::size_t*) malloc(numDevices * sizeof(::size_t));
	const unsigned char** images = (const unsigned char**) malloc(numDevices * sizeof(const void*));

	for (i = 0; i < numDevices; ++i) {
		images[i] = 0;
		lengths[i] = 0;
	}

	
	// Read the OpenCL kernel in from source file
	const char* cSourceFile = "VectorAddKernels.cl";
	
    printf("loadProgSource (%s)...\n", cSourceFile); 
    const char* cPathAndName = cSourceFile;
#ifdef LOAD_FROM_FILE
	size_t szKernelLength;
    const char* cSourceCL = loadProgSource(cPathAndName, "", &szKernelLength);
#else
	const char* cSourceCL = stringifiedSourceCL;
	size_t szKernelLength = strlen(stringifiedSourceCL);
#endif //LOAD_FROM_FILE


	
    // Create the program
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
    printf("clCreateProgramWithSource...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        exit(0);
    }
	
    // Build the program with 'mad' Optimization option
#ifdef MAC
	char* flags = "-cl-mad-enable -DMAC -DGUID_ARG";
#else
	const char* flags = "-DGUID_ARG=";
#endif
    ciErr1 = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL);
    printf("clBuildProgram...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
        exit(0);
    }
	
    // Create the kernel
    ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1);
    printf("clCreateKernel (VectorAdd)...\n"); 
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
		exit(0);
    }
	
	
	cl_int ciErrNum;
	
	ciErrNum = clGetKernelWorkGroupInfo(ckKernel, cdDevices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
	if (ciErrNum != CL_SUCCESS)
	{
		printf("cannot get workgroup size\n");
		exit(0);
	}

	

   
    // Set the Argument values
    ciErr1 |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmMemObjs[0]);
    ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmMemObjs[1]);
    ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmMemObjs[2]);

	
	
	int workgroupSize = wgSize;
	if(workgroupSize <= 0)
	{ // let OpenCL library calculate workgroup size
		size_t globalWorkSize[2];
		globalWorkSize[0] = actualGlobalSize;
		globalWorkSize[1] = 1;
	
		// Copy input data from host to GPU and launch kernel 
		ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalWorkSize, NULL, 0,0,0 );

	}
	else
	{
		size_t localWorkSize[2], globalWorkSize[2];
		workgroupSize = btMin(workgroupSize, actualGlobalSize);
		int num_t = actualGlobalSize / workgroupSize;
		int num_g = num_t * workgroupSize;
		if(num_g < actualGlobalSize)
		{
			num_t++;
			//this can cause problems -> processing outside of the buffer
			//make sure to check kernel
		}

		size_t globalThreads[] = {num_t * workgroupSize};
		size_t localThreads[] = {workgroupSize};


		localWorkSize[0]  = workgroupSize;
		globalWorkSize[0] = num_t * workgroupSize;
		localWorkSize[1] = 1;
		globalWorkSize[1] = 1;

		// Copy input data from host to GPU and launch kernel 
		ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL);

	}
	
	if (ciErrNum != CL_SUCCESS)
	{
		printf("cannot clEnqueueNDRangeKernel\n");
		exit(0);
	}
	
	clFinish(cqCommandQue);
    // Read back results and check accumulated errors
    ciErr1 |= clEnqueueReadBuffer(cqCommandQue, cmMemObjs[2], CL_TRUE, 0, sizeof(cl_float8) * szGlobalWorkSize[0], dst, 0, NULL, NULL);

    // Release kernel, program, and memory objects
	// NOTE:  Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity.
    free(cdDevices);
	clReleaseKernel(ckKernel);  
    clReleaseProgram(cpProgram);
    clReleaseCommandQueue(cqCommandQue);
    clReleaseContext(cxGPUContext);


    // print the results
    int iErrorCount = 0;
    for (i = 0; i < iTestN; i++) 
    {
		if (((float*)dst)[i] != ((float*)srcA)[i]+((float*)srcB)[i])
			iErrorCount++;
    }
	
	if (iErrorCount)
	{
		printf("MiniCL validation FAILED\n");
	} else
	{
		printf("MiniCL validation SUCCESSFULL\n");
	}
    // Free host memory, close log and return success
	for (i = 0; i < 3; i++)
    {
        clReleaseMemObject(cmMemObjs[i]);
    }

    free(srcA); 
    free(srcB);
    free (dst);
	printf("Press ENTER to quit\n");
	getchar();
}
Пример #7
0
cl_object cl_float_sign(cl_narg narg, ...)
{
#line 245
// ------------------------------2
#line 245
	const cl_env_ptr the_env = ecl_process_env();
#line 245
	cl_object y;
#line 245
	bool yp;
#line 245
	va_list ARGS;
	va_start(ARGS, narg);
	cl_object x = va_arg(ARGS,cl_object);  
#line 245
// ------------------------------3

	int negativep;
#line 248
// ------------------------------4
#line 248
#line 248
	if (ecl_unlikely(narg < 1|| narg > 2)) FEwrong_num_arguments(ecl_make_fixnum(378));
#line 248
	if (narg > 1) {
#line 248
		y = va_arg(ARGS,cl_object);  
#line 248
		yp = TRUE;
#line 248
	} else {
#line 248
		y = x;
#line 248
		yp = FALSE;
#line 248
	}
#line 248
// ------------------------------5
	if (!yp) {
		y = cl_float(2, ecl_make_fixnum(1), x);
	}
	negativep = ecl_signbit(x);
	switch (ecl_t_of(y)) {
	case t_singlefloat: {
		float f = ecl_single_float(y);
                if (signbit(f) != negativep) y = ecl_make_single_float(-f);
		break;
	}
	case t_doublefloat: {
		double f = ecl_double_float(y);
                if (signbit(f) != negativep) y = ecl_make_double_float(-f);
		break;
	}
#ifdef ECL_LONG_FLOAT
	case t_longfloat: {
		long double f = ecl_long_float(y);
                if (signbit(f) != negativep) y = ecl_make_long_float(-f);
		break;
	}
#endif
	default:
                FEwrong_type_nth_arg(ecl_make_fixnum(/*FLOAT-SIGN*/378),2,y,ecl_make_fixnum(/*FLOAT*/374));
	}
	{
#line 273
		#line 273
		cl_object __value0 = y;
#line 273
		the_env->nvalues = 1;
#line 273
		return __value0;
#line 273
	}
;
}
cln::cl_N HighPrecisionComplexPolynom::getCoeff(int p) {
  if ((p<0) || (p>=length)) return complex(cl_float(0,clnDIGIT), cl_float(0,clnDIGIT));
  return coeff[p];
}
void HighPrecisionComplexPolynom::addCoeff(int p, Complex val) {
  if ((p<0) || (p>=length)) return;
  
  coeff[p] = coeff[p] + complex(cl_float(val.x,clnDIGIT), cl_float(val.y,clnDIGIT));
}
Пример #10
0
void HighPrecisionComplex::ini(int digit, double vx) {
  ini(digit, 0, 0);  
  z = complex(cl_float(vx,clnDIGIT), ZERO);
}
Пример #11
0
// Main function 
// *********************************************************************
int main(int argc, char **argv)
{
	void *srcA, *srcB, *dst;        // Host buffers for OpenCL test
    cl_context cxGPUContext;       // OpenCL context
    cl_command_queue cqCommandQue;  // OpenCL command que
    cl_device_id* cdDevices;        // OpenCL device list    
    cl_program cpProgram;           // OpenCL program
    cl_kernel ckKernel;             // OpenCL kernel
    cl_mem cmMemObjs[3];            // OpenCL memory buffer objects:  3 for device
    size_t szGlobalWorkSize[1];     // 1D var for Total # of work items
    size_t szLocalWorkSize[1];		// 1D var for # of work items in the work group	
    size_t szParmDataBytes;			// Byte size of context information
    cl_int ciErr1, ciErr2;			// Error code var
    int iTestN = 100000 * 8;		// Size of Vectors to process

    // set Global and Local work size dimensions
    szGlobalWorkSize[0] = iTestN >> 3;  // do 8 computations per work item
    szLocalWorkSize[0]= iTestN>>3;


    // Allocate and initialize host arrays
    srcA = (void *)malloc (sizeof(cl_float) * iTestN);
    srcB = (void *)malloc (sizeof(cl_float) * iTestN);
    dst = (void *)malloc (sizeof(cl_float) * iTestN);

	int i;

	// Initialize arrays with some values
	for (i=0;i<iTestN;i++)
	{
		((cl_float*)srcA)[i] = cl_float(i);
		((cl_float*)srcB)[i] = 2;
		((cl_float*)dst)[i]=-1;
	}

    // Create OpenCL context & context
    cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_CPU, NULL, NULL, &ciErr1); //could also be CL_DEVICE_TYPE_GPU
	
    // Query all devices available to the context
    ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
    cdDevices = (cl_device_id*)malloc(szParmDataBytes);
    ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
	if (cdDevices)
	{
		printDevInfo(cdDevices[0]);
	}

    // Create a command queue for first device the context reported
    cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr2);
    ciErr1 |= ciErr2; 

    // Allocate the OpenCL source and result buffer memory objects on the device GMEM
    cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcA, &ciErr2);
    ciErr1 |= ciErr2;
    cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * szGlobalWorkSize[0], srcB, &ciErr2);
    ciErr1 |= ciErr2;
    cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float8) * szGlobalWorkSize[0], NULL, &ciErr2);
    ciErr1 |= ciErr2;

///create kernels from binary
	int numDevices = 1;
	cl_int err;
	::size_t* lengths = (::size_t*) malloc(numDevices * sizeof(::size_t));
	const unsigned char** images = (const unsigned char**) malloc(numDevices * sizeof(const void*));

	for (i = 0; i < numDevices; ++i) {
		images[i] = 0;
		lengths[i] = 0;
	}

	cpProgram = clCreateProgramWithBinary(cxGPUContext, numDevices,cdDevices,lengths, images, 0, &err);

	// Build the executable program from a binary
	ciErr1 |= clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);

    // Create the kernel
    ckKernel = clCreateKernel(cpProgram, "VectorAdd", &ciErr1);
    
    // Set the Argument values
    ciErr1 |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmMemObjs[0]);
    ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmMemObjs[1]);
    ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmMemObjs[2]);

    // Copy input data from host to GPU and launch kernel 
    ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);

    // Read back results and check accumulated errors
    ciErr1 |= clEnqueueReadBuffer(cqCommandQue, cmMemObjs[2], CL_TRUE, 0, sizeof(cl_float8) * szGlobalWorkSize[0], dst, 0, NULL, NULL);

    // Release kernel, program, and memory objects
	// NOTE:  Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity.
    free(cdDevices);
	clReleaseKernel(ckKernel);  
    clReleaseProgram(cpProgram);
    clReleaseCommandQueue(cqCommandQue);
    clReleaseContext(cxGPUContext);


    // print the results
    int iErrorCount = 0;
    for (i = 0; i < iTestN; i++) 
    {
		if (((float*)dst)[i] != ((float*)srcA)[i]+((float*)srcB)[i])
			iErrorCount++;
    }
	
	if (iErrorCount)
	{
		printf("MiniCL validation FAILED\n");
	} else
	{
		printf("MiniCL validation SUCCESSFULL\n");
	}
    // Free host memory, close log and return success
	for (i = 0; i < 3; i++)
    {
        clReleaseMemObject(cmMemObjs[i]);
    }

    free(srcA); 
    free(srcB);
    free (dst);
}
Пример #12
0
 static void dump(viennacl::backend::mem_handle const & buff, uniform_tag tag, cl_uint start, cl_uint size){
   viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::rand<ScalarType,1>::program_name(),"dump_uniform");
   k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(size,k.local_work_size(0)));
   viennacl::ocl::enqueue(k(buff.opencl_handle(), start, size, cl_float(tag.a), cl_float(tag.b) , cl_uint(time(0))));
 }