示例#1
0
文件: bluestein.cpp 项目: rzel/clfft
void 
allocateHostMemoryBluesteins(const unsigned n, const unsigned m)
{
    
  h_Hreal = (float *) malloc(sizeof(float) * m);
  checkError((h_Hreal != NULL), shrTRUE, "Could not allocate memory");
    
  h_Himag = (float *) malloc(sizeof(float) * m);
  checkError((h_Himag != NULL), shrTRUE, "Could not allocate memory");
  
  h_Yreal = (float *) malloc(sizeof(float) * m);
  checkError((h_Yreal != NULL), shrTRUE, "Could not allocate memory");
    
  h_Yimag = (float *) malloc(sizeof(float) * m);
  checkError((h_Yimag != NULL), shrTRUE, "Could not allocate memory");
  
  h_Zreal = (float *) malloc(sizeof(float) * m);
  checkError((h_Zreal != NULL), shrTRUE, "Could not allocate memory");
    
  h_Zimag = (float *) malloc(sizeof(float) * m);
  checkError((h_Zimag != NULL), shrTRUE, "Could not allocate memory");
  
  h_Xreal = (float *) malloc(sizeof(float) * m);
  checkError((h_Xreal != NULL), shrTRUE, "Could not allocate memory");
    
  h_Ximag = (float *) malloc(sizeof(float) * m);
  checkError((h_Ximag != NULL), shrTRUE, "Could not allocate memory");


  for(unsigned i =0; i< m ; i++)
    {
      h_Xreal[i]=0;
      h_Ximag[i]=0;
    }
  h_Xreal[0]=1;
  h_Ximag[0]=1;

  //Precomputation. 

  const float TWOPI = 2*3.14159265358979323846;
  const float theta =  TWOPI / (2 * n);

  for(int l = 0; l < n; l++)
    {
      float c = cos( -1 *theta * l *l);
      float s = sin( -1 *theta * l*l);

      //Toeplitz matrix
      h_Hreal[l] = c;
      h_Himag[l] = s;

      //Y_l Since W_n^-l*l/2
      h_Yreal[l] = h_Xreal[l] * c + h_Ximag[l] * s;
      h_Yimag[l] = h_Ximag[l] *c -  h_Xreal[l] * s;
    }

  for(int i=n; i< m -n +1 ; i++)
    {
      h_Hreal[i] = 0; 
      h_Himag[i] = 0;
      h_Yreal[i] = 0;
      h_Yimag[i] = 0;
    }

  for(int i = m -n +2 ; i < m ; i++)
    {
      h_Hreal[i] = h_Hreal[m-i]; 
      h_Himag[i] = h_Himag[m-i];
      h_Yreal[i] = 0;
      h_Yimag[i] = 0;
    }
}
示例#2
0
//! glBlendEquationSeparatei wrapper. May throw.
inline void blendEquationSeparatei(GLuint const buf, GLenum const mode_rgb,
                                   GLenum const mode_alpha)
{
  glBlendEquationSeparatei(buf, mode_rgb, mode_alpha);
  checkError("glBlendEquationSeparatei");
}
示例#3
0
inline void cullFace(GLenum const mode)
{
  glCullFace(mode);
  checkError("glCullFace");
}
示例#4
0
//! glClearBufferuiv wrapper. May throw.
inline void clearBufferuiv(GLenum const buf,
                           GLint const drawbuffer,
                           GLuint const* value) {
  glClearBufferuiv(buf, drawbuffer, value);
  checkError("glClearBufferuiv");
}
示例#5
0
//! glReadBuffer wrapper. May throw.
inline void readBuffer(GLenum const mode)
{
  glReadBuffer(mode);
  checkError("glReadBuffer");
}
示例#6
0
//! glBlendEquationSeparate wrapper. May throw.
inline void blendEquationSeparate(GLenum const mode_rgb,
                                  GLenum const mode_alpha)
{
  glBlendEquationSeparate(mode_rgb, mode_alpha);
  checkError("glBlendEquationSeparate");
}
示例#7
0
//! glClearDepthf wrapper. May throw. 
inline void clearDepthf(GLclampf const d) {
  glClearDepthf(d);
  checkError("glClearDepthf");
}
示例#8
0
//! glIsEnabled wrapper. May throw. 
inline GLboolean isEnabled(GLenum const cap) {
  const GLboolean enabled = glIsEnabled(cap);
  checkError("glIsEnabled");
  return enabled;
}
示例#9
0
//! glGetString wrapper. May throw.
inline const GLubyte* getString(GLenum const name) {
  GLubyte const* str = glGetString(name);
  checkError("glGetString");
  return str;
}
示例#10
0
//! glGetFloatv wrapper. May throw. 
inline void getFloatv(GLenum const pname, GLfloat* data) {
  glGetFloatv(pname, data);
  checkError("glGetFloatv");
}
示例#11
0
//! glGetDoublev wrapper. May throw.
inline void getDoublev(GLenum const pname, GLdouble* data) {
  glGetDoublev(pname, data);
  checkError("glGetDoublev");
}
示例#12
0
//! glGetInteger64v wrapper. May throw.
inline void getInteger64v(GLenum const pname, GLint64* data) {
  glGetInteger64v(pname, data);
  checkError("glGetInteger64v");
}
示例#13
0
//! glGetBooleanv wrapper. May throw.
inline void getBooleanv(GLenum const pname, GLboolean* data) {
  glGetBooleanv(pname, data);
  checkError("glGetBooleanv");
}
示例#14
0
int main(int argc, char** argv)
{
    cl_int          err;               // error code returned from OpenCL calls

    size_t dataSize = sizeof(float) * LENGTH;
    float*       h_a = (float *)malloc(dataSize);       // a vector
    float*       h_b = (float *)malloc(dataSize);       // b vector
    float*       h_c = (float *)malloc(dataSize);       // c vector (result)
    float*       h_d = (float *)malloc(dataSize);       // d vector (result)
    float*       h_e = (float *)malloc(dataSize);       // e vector
    float*       h_f = (float *)malloc(dataSize);       // f vector (result)
    float*       h_g = (float *)malloc(dataSize);       // g vector
    unsigned int correct;           // number of correct results

    size_t global;                  // global domain size

    cl_device_id     device_id;     // compute device id
    cl_context       context;       // compute context
    cl_command_queue commands;      // compute command queue
    cl_program       program;       // compute program
    cl_kernel        ko_vadd;       // compute kernel

    cl_mem d_a;                     // device memory used for the input  a vector
    cl_mem d_b;                     // device memory used for the input  b vector
    cl_mem d_c;                     // device memory used for the output c vector
    cl_mem d_d;                     // device memory used for the output d vector
    cl_mem d_e;                     // device memory used for the input e vector
    cl_mem d_f;                     // device memory used for the output f vector
    cl_mem d_g;                     // device memory used for the input g vector

    // Fill vectors a and b with random float values
    int i = 0;
    for(i = 0; i < LENGTH; i++){
        h_a[i] = rand() / (float)RAND_MAX;
        h_b[i] = rand() / (float)RAND_MAX;
        h_e[i] = rand() / (float)RAND_MAX;
        h_g[i] = rand() / (float)RAND_MAX;
    }

    // Set up platform and GPU device

    cl_uint numPlatforms;

    // Find number of platforms
    err = clGetPlatformIDs(0, NULL, &numPlatforms);
    checkError(err, "Finding platforms");
    if (numPlatforms == 0)
    {
        printf("Found 0 platforms!\n");
        return EXIT_FAILURE;
    }

    // Get all platforms
    cl_platform_id Platform[numPlatforms];
    err = clGetPlatformIDs(numPlatforms, Platform, NULL);
    checkError(err, "Getting platforms");

    // Secure a GPU
    for (i = 0; i < numPlatforms; i++)
    {
        err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL);
        if (err == CL_SUCCESS)
        {
            break;
        }
    }

    if (device_id == NULL)
        checkError(err, "Getting device");

    err = output_device_info(device_id);
    checkError(err, "Outputting device info");
  
    // Create a compute context 
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    checkError(err, "Creating context");

    // Create a command queue
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    checkError(err, "Creating command queue");

    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    checkError(err, "Creating program");

    // Build the program  
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n%s\n", err_code(err));
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        return EXIT_FAILURE;
    }

    // Create the compute kernel from the program 
    ko_vadd = clCreateKernel(program, "vadd", &err);
    checkError(err, "Creating kernel");

    // Create the input (a, b, e, g) arrays in device memory
    // NB: we copy the host pointers here too
    d_a  = clCreateBuffer(context,  CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,  dataSize, h_a, &err);
    checkError(err, "Creating buffer d_a");
    d_b  = clCreateBuffer(context,  CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,  dataSize, h_b, &err);
    checkError(err, "Creating buffer d_b");
    d_e  = clCreateBuffer(context,  CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,  dataSize, h_e, &err);
    checkError(err, "Creating buffer d_e");
    d_g  = clCreateBuffer(context,  CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,  dataSize, h_g, &err);
    checkError(err, "Creating buffer d_g");
    
    // Create the output arrays in device memory
    d_c  = clCreateBuffer(context,  CL_MEM_READ_WRITE, dataSize, NULL, &err);
    checkError(err, "Creating buffer d_c");
    d_d  = clCreateBuffer(context,  CL_MEM_READ_WRITE, dataSize, NULL, &err);
    checkError(err, "Creating buffer d_d");
    d_f  = clCreateBuffer(context,  CL_MEM_WRITE_ONLY, dataSize, NULL, &err);
    checkError(err, "Creating buffer d_f"); 

    const int count = LENGTH;

    // Enqueue kernel - first time
    // Set the arguments to our compute kernel
    err  = clSetKernelArg(ko_vadd, 0, sizeof(cl_mem), &d_a);
    err |= clSetKernelArg(ko_vadd, 1, sizeof(cl_mem), &d_b);
    err |= clSetKernelArg(ko_vadd, 2, sizeof(cl_mem), &d_c);
    err |= clSetKernelArg(ko_vadd, 3, sizeof(unsigned int), &count);
    checkError(err, "Setting kernel arguments"); 
	
    // Execute the kernel over the entire range of our 1d input data set
    // letting the OpenCL runtime choose the work-group size
    global = count;
    err = clEnqueueNDRangeKernel(commands, ko_vadd, 1, NULL, &global, NULL, 0, NULL, NULL);
    checkError(err, "Enqueueing kernel 1st time");

    // Enqueue kernel - second time
    // Set different arguments to our compute kernel
    err  = clSetKernelArg(ko_vadd, 0, sizeof(cl_mem), &d_e);
    err |= clSetKernelArg(ko_vadd, 1, sizeof(cl_mem), &d_c);
    err |= clSetKernelArg(ko_vadd, 2, sizeof(cl_mem), &d_d);
    checkError(err, "Setting kernel arguments");
    
    // Enqueue the kernel again    
    err = clEnqueueNDRangeKernel(commands, ko_vadd, 1, NULL, &global, NULL, 0, NULL, NULL);
    checkError(err, "Enqueueing kernel 2nd time");

    // Enqueue kernel - third time
    // Set different (again) arguments to our compute kernel
    err  = clSetKernelArg(ko_vadd, 0, sizeof(cl_mem), &d_g);
    err |= clSetKernelArg(ko_vadd, 1, sizeof(cl_mem), &d_d);
    err |= clSetKernelArg(ko_vadd, 2, sizeof(cl_mem), &d_f);
    checkError(err, "Setting kernel arguments");

    // Enqueue the kernel again    
    err = clEnqueueNDRangeKernel(commands, ko_vadd, 1, NULL, &global, NULL, 0, NULL, NULL);
    checkError(err, "Enqueueing kernel 3rd time");

    // Read back the result from the compute device
    err = clEnqueueReadBuffer( commands, d_f, CL_TRUE, 0, sizeof(float) * count, h_f, 0, NULL, NULL );  
    checkError(err, "Reading back d_f");
    
    // Test the results
    correct = 0;
    float tmp;
    
    for(i = 0; i < count; i++)
    {
        tmp = h_a[i] + h_b[i] + h_e[i] + h_g[i];     // assign element i of a+b+e+g to tmp
        tmp -= h_f[i];                               // compute deviation of expected and output result
        if(tmp*tmp < TOL*TOL)                        // correct if square deviation is less than tolerance squared
            correct++;
        else {
            printf(" tmp %f h_a %f h_b %f h_e %f h_g %f h_f %f\n",tmp, h_a[i], h_b[i], h_e[i], h_g[i], h_f[i]);
        }
    }

    // summarize results
    printf("C = A+B+E+G:  %d out of %d results were correct.\n", correct, count);

    // cleanup then shutdown
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
    clReleaseMemObject(d_d);
    clReleaseMemObject(d_e);
    clReleaseMemObject(d_f);
    clReleaseMemObject(d_g);
    clReleaseProgram(program);
    clReleaseKernel(ko_vadd);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    free(h_a);
    free(h_b);
    free(h_c);
    free(h_d);
    free(h_e);
    free(h_f);
    free(h_g);

    return 0;
}
示例#15
0
//! glStencilMaskSeparate wrapper. May throw.
inline void stencilMaskSeparate(GLenum const face, GLuint const mask) {
  glStencilMaskSeparate(face, mask);
  checkError("glStencilMaskSeparate");
}
示例#16
0
//! glVertexAttribDivisor wrapper. May throw.
inline void vertexAttribDivisor(GLuint const index, GLuint const divisor) {
  glVertexAttribDivisor(index, divisor);
  checkError("glVertexAttribDivisor");
}
示例#17
0
//! glClear wrapper. May throw.
inline void clear(GLbitfield const buf) {
  glClear(buf);
  checkError("glClear");        
}
示例#18
0
//! glDepthRange wrapper. May throw.
inline void depthRange(GLclampd const n, GLclampd const f) {
  glDepthRange(n, f);
  checkError("glDepthRange");
}
示例#19
0
//! glClearDepth wrapper. May throw.
inline void clearDepth(GLclampd const d) {
  glClearDepth(d);
  checkError("glClearDepth");
}
示例#20
0
//! glDrawBuffer wrapper. May throw.
inline void drawBuffer(GLenum const buf)
{
  glDrawBuffer(buf);
  checkError("glDrawBuffer");
} 
示例#21
0
//! glClearStencil wrapper. May throw.
inline void clearStencil(GLint const s) {
  glClearStencil(s);
  checkError("glClearStencil");
}
示例#22
0
//! glDrawBuffers wrapper. May throw.
inline void drawBuffers(GLsizei const n, GLenum const* bufs)
{
  glDrawBuffers(n, bufs);
  checkError("glDrawBuffers");
}
示例#23
0
//! glClearBufferfi wrapper. May throw.
inline void clearBufferfi(GLenum const buf, GLint const drawbuffer,
                          GLfloat const depth, GLint const stencil) {
  glClearBufferfi(buf, drawbuffer, depth, stencil);
  checkError("glClearBufferfi");
}
示例#24
0
//! glBlendEquationi wrapper. May throw.
inline void blendEquationi(GLuint const buf, GLenum const mode) {
  glBlendEquationi(buf, mode);
  checkError("glBlendEquationi");
}
示例#25
0
//! glNamedFramebufferReadBuffer. May throw.
inline void namedFramebufferReadBuffer(GLuint const framebuffer,
                                       GLenum const mode)
{
  glNamedFramebufferReadBufferEXT(framebuffer, mode);
  checkError("glNamedFramebufferReadBufferEXT");
}
示例#26
0
//! glDepthMask wrapper. May throw.
inline void depthMask(GLboolean const mask) {
  glDepthMask(mask);
  checkError("glColorMask");
}
示例#27
0
inline void lineWidth(GLfloat const width)
{
  glLineWidth(width);
  checkError("glLineWidth");
}
示例#28
0
//! glStencilMask wrapper. May throw.
inline void stencilMask(GLuint const mask) {
  glStencilMask(mask);
  checkError("glStencilMask");
}
示例#29
0
inline void frontFace(GLenum const mode)
{
  glFrontFace(mode);
  checkError("glFrontFace");
}
示例#30
0
文件: bluestein.cpp 项目: rzel/clfft
void
bluesteinsFFTGpu(const char* const argv[],const unsigned n, 
		 const unsigned orign,const unsigned size)
{
  const unsigned powM = (unsigned) log2(n);
  printf("Compiling Bluesteins Program..\n");

  compileProgram(argv, "fft.h", "kernels/bluesteins.cl");

    printf("Creating Kernel\n");
    for (unsigned i = 0; i < deviceCount; ++i) {
        createKernel(i, "bluesteins");
    }

    const unsigned sizePerGPU = size / deviceCount;
    for (unsigned i = 0; i < deviceCount; ++i) {
        workSize[i] = (i != (deviceCount - 1)) ? sizePerGPU 
                                               : (size - workOffset[i]);       
        
        allocateDeviceMemoryBS(i , workSize[i], workOffset[i]);
        
        clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void*) &d_Hreal[i]);
        clSetKernelArg(kernel[i], 1, sizeof(cl_mem), (void*) &d_Himag[i]);
	clSetKernelArg(kernel[i], 2, sizeof(cl_mem), (void*) &d_Yreal[i]);
        clSetKernelArg(kernel[i], 3, sizeof(cl_mem), (void*) &d_Yimag[i]);
	clSetKernelArg(kernel[i], 4, sizeof(cl_mem), (void*) &d_Zreal[i]);
        clSetKernelArg(kernel[i], 5, sizeof(cl_mem), (void*) &d_Zimag[i]);
	clSetKernelArg(kernel[i], 6, sizeof(unsigned), &n);
	clSetKernelArg(kernel[i], 7, sizeof(unsigned), &orign);
	clSetKernelArg(kernel[i], 8, sizeof(unsigned), &powM);
	clSetKernelArg(kernel[i], 9, sizeof(unsigned), &blockSize);
       

        if ((i + 1) < deviceCount) {
            workOffset[i + 1] = workOffset[i] + workSize[i];
        } 

    }

    size_t localWorkSize[] = {blockSize};
    for (unsigned i = 0; i < deviceCount; ++i) {
        size_t globalWorkSize[] = {shrRoundUp(blockSize, workSize[i])}; 
        // kernel non blocking execution 
        runKernel(i, localWorkSize, globalWorkSize);
    }

    h_Rreal = h_Hreal;
    h_Rimag = h_Himag;
    
    for (unsigned i = 0; i < deviceCount; ++i) {
        copyFromDevice(i, d_Hreal[i], h_Rreal + workOffset[i],
                                                workSize[i]); 
        copyFromDevice(i, d_Himag[i], h_Rimag + workOffset[i],
                                                 workSize[i]);
    }

    // wait for copy event
    const cl_int ciErrNum = clWaitForEvents(deviceCount, gpuDone);
    checkError(ciErrNum, CL_SUCCESS, "clWaitForEvents");
    printGpuTime();
}