void allocateHostMemoryBluesteins(const unsigned n, const unsigned m) { h_Hreal = (float *) malloc(sizeof(float) * m); checkError((h_Hreal != NULL), shrTRUE, "Could not allocate memory"); h_Himag = (float *) malloc(sizeof(float) * m); checkError((h_Himag != NULL), shrTRUE, "Could not allocate memory"); h_Yreal = (float *) malloc(sizeof(float) * m); checkError((h_Yreal != NULL), shrTRUE, "Could not allocate memory"); h_Yimag = (float *) malloc(sizeof(float) * m); checkError((h_Yimag != NULL), shrTRUE, "Could not allocate memory"); h_Zreal = (float *) malloc(sizeof(float) * m); checkError((h_Zreal != NULL), shrTRUE, "Could not allocate memory"); h_Zimag = (float *) malloc(sizeof(float) * m); checkError((h_Zimag != NULL), shrTRUE, "Could not allocate memory"); h_Xreal = (float *) malloc(sizeof(float) * m); checkError((h_Xreal != NULL), shrTRUE, "Could not allocate memory"); h_Ximag = (float *) malloc(sizeof(float) * m); checkError((h_Ximag != NULL), shrTRUE, "Could not allocate memory"); for(unsigned i =0; i< m ; i++) { h_Xreal[i]=0; h_Ximag[i]=0; } h_Xreal[0]=1; h_Ximag[0]=1; //Precomputation. const float TWOPI = 2*3.14159265358979323846; const float theta = TWOPI / (2 * n); for(int l = 0; l < n; l++) { float c = cos( -1 *theta * l *l); float s = sin( -1 *theta * l*l); //Toeplitz matrix h_Hreal[l] = c; h_Himag[l] = s; //Y_l Since W_n^-l*l/2 h_Yreal[l] = h_Xreal[l] * c + h_Ximag[l] * s; h_Yimag[l] = h_Ximag[l] *c - h_Xreal[l] * s; } for(int i=n; i< m -n +1 ; i++) { h_Hreal[i] = 0; h_Himag[i] = 0; h_Yreal[i] = 0; h_Yimag[i] = 0; } for(int i = m -n +2 ; i < m ; i++) { h_Hreal[i] = h_Hreal[m-i]; h_Himag[i] = h_Himag[m-i]; h_Yreal[i] = 0; h_Yimag[i] = 0; } }
//! glBlendEquationSeparatei wrapper. May throw. inline void blendEquationSeparatei(GLuint const buf, GLenum const mode_rgb, GLenum const mode_alpha) { glBlendEquationSeparatei(buf, mode_rgb, mode_alpha); checkError("glBlendEquationSeparatei"); }
inline void cullFace(GLenum const mode) { glCullFace(mode); checkError("glCullFace"); }
//! glClearBufferuiv wrapper. May throw. inline void clearBufferuiv(GLenum const buf, GLint const drawbuffer, GLuint const* value) { glClearBufferuiv(buf, drawbuffer, value); checkError("glClearBufferuiv"); }
//! glReadBuffer wrapper. May throw. inline void readBuffer(GLenum const mode) { glReadBuffer(mode); checkError("glReadBuffer"); }
//! glBlendEquationSeparate wrapper. May throw. inline void blendEquationSeparate(GLenum const mode_rgb, GLenum const mode_alpha) { glBlendEquationSeparate(mode_rgb, mode_alpha); checkError("glBlendEquationSeparate"); }
//! glClearDepthf wrapper. May throw. inline void clearDepthf(GLclampf const d) { glClearDepthf(d); checkError("glClearDepthf"); }
//! glIsEnabled wrapper. May throw. inline GLboolean isEnabled(GLenum const cap) { const GLboolean enabled = glIsEnabled(cap); checkError("glIsEnabled"); return enabled; }
//! glGetString wrapper. May throw. inline const GLubyte* getString(GLenum const name) { GLubyte const* str = glGetString(name); checkError("glGetString"); return str; }
//! glGetFloatv wrapper. May throw. inline void getFloatv(GLenum const pname, GLfloat* data) { glGetFloatv(pname, data); checkError("glGetFloatv"); }
//! glGetDoublev wrapper. May throw. inline void getDoublev(GLenum const pname, GLdouble* data) { glGetDoublev(pname, data); checkError("glGetDoublev"); }
//! glGetInteger64v wrapper. May throw. inline void getInteger64v(GLenum const pname, GLint64* data) { glGetInteger64v(pname, data); checkError("glGetInteger64v"); }
//! glGetBooleanv wrapper. May throw. inline void getBooleanv(GLenum const pname, GLboolean* data) { glGetBooleanv(pname, data); checkError("glGetBooleanv"); }
int main(int argc, char** argv) { cl_int err; // error code returned from OpenCL calls size_t dataSize = sizeof(float) * LENGTH; float* h_a = (float *)malloc(dataSize); // a vector float* h_b = (float *)malloc(dataSize); // b vector float* h_c = (float *)malloc(dataSize); // c vector (result) float* h_d = (float *)malloc(dataSize); // d vector (result) float* h_e = (float *)malloc(dataSize); // e vector float* h_f = (float *)malloc(dataSize); // f vector (result) float* h_g = (float *)malloc(dataSize); // g vector unsigned int correct; // number of correct results size_t global; // global domain size cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel ko_vadd; // compute kernel cl_mem d_a; // device memory used for the input a vector cl_mem d_b; // device memory used for the input b vector cl_mem d_c; // device memory used for the output c vector cl_mem d_d; // device memory used for the output d vector cl_mem d_e; // device memory used for the input e vector cl_mem d_f; // device memory used for the output f vector cl_mem d_g; // device memory used for the input g vector // Fill vectors a and b with random float values int i = 0; for(i = 0; i < LENGTH; i++){ h_a[i] = rand() / (float)RAND_MAX; h_b[i] = rand() / (float)RAND_MAX; h_e[i] = rand() / (float)RAND_MAX; h_g[i] = rand() / (float)RAND_MAX; } // Set up platform and GPU device cl_uint numPlatforms; // Find number of platforms err = clGetPlatformIDs(0, NULL, &numPlatforms); checkError(err, "Finding platforms"); if (numPlatforms == 0) { printf("Found 0 platforms!\n"); return EXIT_FAILURE; } // Get all platforms cl_platform_id Platform[numPlatforms]; err = clGetPlatformIDs(numPlatforms, Platform, NULL); checkError(err, "Getting platforms"); // Secure a GPU for (i = 0; i < numPlatforms; i++) { err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL); if (err == CL_SUCCESS) { break; } } if (device_id == NULL) checkError(err, "Getting device"); err = output_device_info(device_id); checkError(err, "Outputting device info"); // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); checkError(err, "Creating context"); // Create a command queue commands = clCreateCommandQueue(context, device_id, 0, &err); checkError(err, "Creating command queue"); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); checkError(err, "Creating program"); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program ko_vadd = clCreateKernel(program, "vadd", &err); checkError(err, "Creating kernel"); // Create the input (a, b, e, g) arrays in device memory // NB: we copy the host pointers here too d_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, dataSize, h_a, &err); checkError(err, "Creating buffer d_a"); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, dataSize, h_b, &err); checkError(err, "Creating buffer d_b"); d_e = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, dataSize, h_e, &err); checkError(err, "Creating buffer d_e"); d_g = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, dataSize, h_g, &err); checkError(err, "Creating buffer d_g"); // Create the output arrays in device memory d_c = clCreateBuffer(context, CL_MEM_READ_WRITE, dataSize, NULL, &err); checkError(err, "Creating buffer d_c"); d_d = clCreateBuffer(context, CL_MEM_READ_WRITE, dataSize, NULL, &err); checkError(err, "Creating buffer d_d"); d_f = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSize, NULL, &err); checkError(err, "Creating buffer d_f"); const int count = LENGTH; // Enqueue kernel - first time // Set the arguments to our compute kernel err = clSetKernelArg(ko_vadd, 0, sizeof(cl_mem), &d_a); err |= clSetKernelArg(ko_vadd, 1, sizeof(cl_mem), &d_b); err |= clSetKernelArg(ko_vadd, 2, sizeof(cl_mem), &d_c); err |= clSetKernelArg(ko_vadd, 3, sizeof(unsigned int), &count); checkError(err, "Setting kernel arguments"); // Execute the kernel over the entire range of our 1d input data set // letting the OpenCL runtime choose the work-group size global = count; err = clEnqueueNDRangeKernel(commands, ko_vadd, 1, NULL, &global, NULL, 0, NULL, NULL); checkError(err, "Enqueueing kernel 1st time"); // Enqueue kernel - second time // Set different arguments to our compute kernel err = clSetKernelArg(ko_vadd, 0, sizeof(cl_mem), &d_e); err |= clSetKernelArg(ko_vadd, 1, sizeof(cl_mem), &d_c); err |= clSetKernelArg(ko_vadd, 2, sizeof(cl_mem), &d_d); checkError(err, "Setting kernel arguments"); // Enqueue the kernel again err = clEnqueueNDRangeKernel(commands, ko_vadd, 1, NULL, &global, NULL, 0, NULL, NULL); checkError(err, "Enqueueing kernel 2nd time"); // Enqueue kernel - third time // Set different (again) arguments to our compute kernel err = clSetKernelArg(ko_vadd, 0, sizeof(cl_mem), &d_g); err |= clSetKernelArg(ko_vadd, 1, sizeof(cl_mem), &d_d); err |= clSetKernelArg(ko_vadd, 2, sizeof(cl_mem), &d_f); checkError(err, "Setting kernel arguments"); // Enqueue the kernel again err = clEnqueueNDRangeKernel(commands, ko_vadd, 1, NULL, &global, NULL, 0, NULL, NULL); checkError(err, "Enqueueing kernel 3rd time"); // Read back the result from the compute device err = clEnqueueReadBuffer( commands, d_f, CL_TRUE, 0, sizeof(float) * count, h_f, 0, NULL, NULL ); checkError(err, "Reading back d_f"); // Test the results correct = 0; float tmp; for(i = 0; i < count; i++) { tmp = h_a[i] + h_b[i] + h_e[i] + h_g[i]; // assign element i of a+b+e+g to tmp tmp -= h_f[i]; // compute deviation of expected and output result if(tmp*tmp < TOL*TOL) // correct if square deviation is less than tolerance squared correct++; else { printf(" tmp %f h_a %f h_b %f h_e %f h_g %f h_f %f\n",tmp, h_a[i], h_b[i], h_e[i], h_g[i], h_f[i]); } } // summarize results printf("C = A+B+E+G: %d out of %d results were correct.\n", correct, count); // cleanup then shutdown clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseMemObject(d_d); clReleaseMemObject(d_e); clReleaseMemObject(d_f); clReleaseMemObject(d_g); clReleaseProgram(program); clReleaseKernel(ko_vadd); clReleaseCommandQueue(commands); clReleaseContext(context); free(h_a); free(h_b); free(h_c); free(h_d); free(h_e); free(h_f); free(h_g); return 0; }
//! glStencilMaskSeparate wrapper. May throw. inline void stencilMaskSeparate(GLenum const face, GLuint const mask) { glStencilMaskSeparate(face, mask); checkError("glStencilMaskSeparate"); }
//! glVertexAttribDivisor wrapper. May throw. inline void vertexAttribDivisor(GLuint const index, GLuint const divisor) { glVertexAttribDivisor(index, divisor); checkError("glVertexAttribDivisor"); }
//! glClear wrapper. May throw. inline void clear(GLbitfield const buf) { glClear(buf); checkError("glClear"); }
//! glDepthRange wrapper. May throw. inline void depthRange(GLclampd const n, GLclampd const f) { glDepthRange(n, f); checkError("glDepthRange"); }
//! glClearDepth wrapper. May throw. inline void clearDepth(GLclampd const d) { glClearDepth(d); checkError("glClearDepth"); }
//! glDrawBuffer wrapper. May throw. inline void drawBuffer(GLenum const buf) { glDrawBuffer(buf); checkError("glDrawBuffer"); }
//! glClearStencil wrapper. May throw. inline void clearStencil(GLint const s) { glClearStencil(s); checkError("glClearStencil"); }
//! glDrawBuffers wrapper. May throw. inline void drawBuffers(GLsizei const n, GLenum const* bufs) { glDrawBuffers(n, bufs); checkError("glDrawBuffers"); }
//! glClearBufferfi wrapper. May throw. inline void clearBufferfi(GLenum const buf, GLint const drawbuffer, GLfloat const depth, GLint const stencil) { glClearBufferfi(buf, drawbuffer, depth, stencil); checkError("glClearBufferfi"); }
//! glBlendEquationi wrapper. May throw. inline void blendEquationi(GLuint const buf, GLenum const mode) { glBlendEquationi(buf, mode); checkError("glBlendEquationi"); }
//! glNamedFramebufferReadBuffer. May throw. inline void namedFramebufferReadBuffer(GLuint const framebuffer, GLenum const mode) { glNamedFramebufferReadBufferEXT(framebuffer, mode); checkError("glNamedFramebufferReadBufferEXT"); }
//! glDepthMask wrapper. May throw. inline void depthMask(GLboolean const mask) { glDepthMask(mask); checkError("glColorMask"); }
inline void lineWidth(GLfloat const width) { glLineWidth(width); checkError("glLineWidth"); }
//! glStencilMask wrapper. May throw. inline void stencilMask(GLuint const mask) { glStencilMask(mask); checkError("glStencilMask"); }
inline void frontFace(GLenum const mode) { glFrontFace(mode); checkError("glFrontFace"); }
void bluesteinsFFTGpu(const char* const argv[],const unsigned n, const unsigned orign,const unsigned size) { const unsigned powM = (unsigned) log2(n); printf("Compiling Bluesteins Program..\n"); compileProgram(argv, "fft.h", "kernels/bluesteins.cl"); printf("Creating Kernel\n"); for (unsigned i = 0; i < deviceCount; ++i) { createKernel(i, "bluesteins"); } const unsigned sizePerGPU = size / deviceCount; for (unsigned i = 0; i < deviceCount; ++i) { workSize[i] = (i != (deviceCount - 1)) ? sizePerGPU : (size - workOffset[i]); allocateDeviceMemoryBS(i , workSize[i], workOffset[i]); clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void*) &d_Hreal[i]); clSetKernelArg(kernel[i], 1, sizeof(cl_mem), (void*) &d_Himag[i]); clSetKernelArg(kernel[i], 2, sizeof(cl_mem), (void*) &d_Yreal[i]); clSetKernelArg(kernel[i], 3, sizeof(cl_mem), (void*) &d_Yimag[i]); clSetKernelArg(kernel[i], 4, sizeof(cl_mem), (void*) &d_Zreal[i]); clSetKernelArg(kernel[i], 5, sizeof(cl_mem), (void*) &d_Zimag[i]); clSetKernelArg(kernel[i], 6, sizeof(unsigned), &n); clSetKernelArg(kernel[i], 7, sizeof(unsigned), &orign); clSetKernelArg(kernel[i], 8, sizeof(unsigned), &powM); clSetKernelArg(kernel[i], 9, sizeof(unsigned), &blockSize); if ((i + 1) < deviceCount) { workOffset[i + 1] = workOffset[i] + workSize[i]; } } size_t localWorkSize[] = {blockSize}; for (unsigned i = 0; i < deviceCount; ++i) { size_t globalWorkSize[] = {shrRoundUp(blockSize, workSize[i])}; // kernel non blocking execution runKernel(i, localWorkSize, globalWorkSize); } h_Rreal = h_Hreal; h_Rimag = h_Himag; for (unsigned i = 0; i < deviceCount; ++i) { copyFromDevice(i, d_Hreal[i], h_Rreal + workOffset[i], workSize[i]); copyFromDevice(i, d_Himag[i], h_Rimag + workOffset[i], workSize[i]); } // wait for copy event const cl_int ciErrNum = clWaitForEvents(deviceCount, gpuDone); checkError(ciErrNum, CL_SUCCESS, "clWaitForEvents"); printGpuTime(); }