int main() { srand (time(0)); int temp; for (int i =0;i<SIZE;i++){ temp=rand() % 10000 + 1; data[i]=temp; data2[i]=temp; } ResetMilli(); bitonic_cpu(data, SIZE); printf("CPU:%f\n", GetSeconds()); ResetMilli(); #ifdef GPU bitonic_gpu(data2, SIZE); printf("GPU:%f\n", GetSeconds()); for (int i=0;i<SIZE;i++) if (data[i] != data2[i]) { printf("Error at %d ", i); return(1); } #endif // Print result if (SIZE <= MAXPRINTSIZE) for (int i=0;i<SIZE;i++) printf("%d ", data[i]); printf("\nYour sorting looks correct!\n"); }
/*
 * Benchmark driver for add_matrix().
 *
 * Allocates three N*N float matrices (N = 1024), fills a and b with
 * a[i+j*N] = 10 + i and b[i+j*N] = j / N, calls add_matrix(a, b, c, N)
 * and prints the value reported by GetMicroseconds() for that call.
 */
int main()
{
	const int N = 1024;

	float *a = new float[N*N];
	float *b = new float[N*N];
	float *c = new float[N*N];

	for (int i = 0; i < N; i++)
	{
		for (int j = 0; j < N; j++)
		{
			a[i+j*N] = 10 + i;
			b[i+j*N] = (float)j / N;
		}
	}

	ResetMilli();
	add_matrix(a, b, c, N);
	// The value comes from GetMicroseconds(), so label it as microseconds
	// (the previous label claimed milliseconds).
	// NOTE(review): %d assumes GetMicroseconds() returns int - confirm.
	printf("Total time in microseconds: %d\n", GetMicroseconds());

	// Debug dump of the result matrix, left disabled:
	// for (int i = 0; i < N; i++)
	// {
	// 	for (int j = 0; j < N; j++)
	// 	{
	// 		printf("%0.2f ", c[i+j*N]);
	// 	}
	// 	printf("\n");
	// }

	// Release the matrices; they were previously leaked.
	delete[] a;
	delete[] b;
	delete[] c;

	return 0;
}
//////////////////////////////////////////////////////////////////////////////// // main computation function //////////////////////////////////////////////////////////////////////////////// void computeImages() { //read in full size of memory image = readppm("maskros512.ppm", &n, &m); out = (unsigned char*) malloc(n*m*3); cl_mem in_data, out_data; cl_int ciErrNum = CL_SUCCESS; // Create space for data and copy image to device (note that we could also use clEnqueueWriteBuffer to upload) in_data = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, 3*n*m * sizeof(unsigned char), image, &ciErrNum); printCLError(ciErrNum,6); out_data = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, 3*n*m * sizeof(unsigned char), NULL, &ciErrNum); printCLError(ciErrNum,7); // set the args values ciErrNum = clSetKernelArg(theKernel, 0, sizeof(cl_mem), (void *) &in_data); ciErrNum |= clSetKernelArg(theKernel, 1, sizeof(cl_mem), (void *) &out_data); ciErrNum |= clSetKernelArg(theKernel, 2, sizeof(cl_uint), (void *) &n); ciErrNum |= clSetKernelArg(theKernel, 3, sizeof(cl_uint), (void *) &m); printCLError(ciErrNum,8); // Computing arrangement //size_t localWorkSize, globalWorkSize; size_t globalWorkSize[3] = {512, 512, 1}; size_t localWorkSize[3] = {16, 16, 1}; //256 threads in each block // 32*32 (1024) blocks in total to have 512*512 threads in total printf("Startup time %lf\n", GetSeconds()); // Compute! cl_event event; ResetMilli(); ciErrNum = clEnqueueNDRangeKernel(commandQueue, theKernel, 2, NULL, &globalWorkSize, &localWorkSize, 0, NULL, &event); printCLError(ciErrNum,9); ciErrNum = clWaitForEvents(1, &event); // Synch printCLError(ciErrNum,10); printf("time %lf\n", GetSeconds()); ciErrNum = clEnqueueReadBuffer(commandQueue, out_data, CL_TRUE, 0, 3*n*m * sizeof(unsigned char), out, 0, NULL, &event); printCLError(ciErrNum,11); clWaitForEvents(1, &event); // Synch printCLError(ciErrNum,10); clReleaseMemObject(in_data); clReleaseMemObject(out_data); return; }
int main() { ResetMilli(); bitonic_cpu(data, SIZE); printf("%f\n", GetSeconds()); ResetMilli(); bitonic_gpu(data2, SIZE); printf("%f\n", GetSeconds()); for (int i=0;i<SIZE;i++) if (data[i] != data2[i]) { printf("Error at %d ", i); return(1); } // Print result if (SIZE <= MAXPRINTSIZE) for (int i=0;i<SIZE;i++) printf("%d ", data[i]); printf("\nYour sorting looks correct!\n"); }
// Main program, inits int main( int argc, char** argv) { glutInit(&argc, argv); glutInitDisplayMode( GLUT_SINGLE | GLUT_RGBA ); glutInitWindowSize( 1024, 512 ); glutCreateWindow("CUDA on live GL"); glutDisplayFunc(Draw); ResetMilli(); if (init_OpenCL()<0) { printf("OpenCL could not be initialized!\n"); close_OpenCL(); return; } readAndBuildKernel("filter.cl"); computeImages(); close_OpenCL(); glutMainLoop(); }